From df09eca1250e6baee04599aa7bb07d5db582ed4a Mon Sep 17 00:00:00 2001
From: Tor Andersson <tor.andersson@artifex.com>
Date: Thu, 25 Feb 2016 14:28:14 +0100
Subject: Simplify ToUnicode CMap creation.

Only allocate one scratch buffer, and no larger than it needs to be.

Detect and warn when we can't create a ToUnicode CMap.
---
 source/pdf/pdf-font.c | 277 ++++++++++++++++++++++----------------------------
 1 file changed, 123 insertions(+), 154 deletions(-)

(limited to 'source/pdf/pdf-font.c')
diff --git a/source/pdf/pdf-font.c b/source/pdf/pdf-font.c
index d6f79e72..0b1cbb43 100644
--- a/source/pdf/pdf-font.c
+++ b/source/pdf/pdf-font.c
@@ -1742,208 +1742,176 @@ pdf_add_descendant_font(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontd
 	return fref;
 }
 
+static int next_range(int *table, int size, int k) /* length of the run of consecutive mappings that starts at index k */
+{
+	int n; /* candidate run length; table[k] .. table[k+n-1] are known to be in the run */
+	for (n = 1; k + n < size; ++n) /* never look past the end of the table */
+	{
+		if ((k & 0xFF00) != ((k+n) & 0xFF00)) /* high byte changes */
+			break;
+		if (table[k] + n != table[k+n]) /* mapped value is no longer table[k] plus the offset */
+			break;
+	}
+	return n; /* always >= 1, so callers stepping by n make progress */
+}
+
 /* Create the ToUnicode CMap. */
 static pdf_obj*
-pdf_add_tounicode(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc)
+pdf_add_to_unicode(fz_context *ctx, pdf_document *doc, fz_font *font)
 {
-	fz_buffer *fzbuf = NULL;
+	FT_Face face = font->ft_face;
 	pdf_obj *fref = NULL;
 	pdf_obj *fobj = NULL;
-	FT_Face face = fontdesc->font->ft_face;
-	FT_UInt glyph_index;
-	char hex_glyph[7];
-	char hex_unicode[7];
-	char listinfo[19];
-	char entry[22];
-	unsigned short *table, *seq;
-	int k;
-	int has_lock = 0;
-	int count;
-	int temp_count;
-	int pos;
+	fz_buffer *buf;
+
+	int *table;
+	int num_seq = 0;
+	int num_chr = 0;
+	int n, k;
 
-	fz_var(has_lock);
-	fz_var(fzbuf);
 	fz_var(fref);
 	fz_var(fobj);
 
-	fz_try(ctx)
+	/* Populate reverse cmap table */
 	{
-		fzbuf = fz_new_buffer(ctx, 0);
-
-		/* Boiler plate */
-		fz_write_buffer(ctx, fzbuf, "/CIDInit /ProcSet findresource begin\n", strlen("/CIDInit /ProcSet findresource begin\n"));
-		fz_write_buffer(ctx, fzbuf, "12 dict begin\n", strlen("12 dict begin\n"));
-		fz_write_buffer(ctx, fzbuf, "begincmap\n", strlen("begincmap\n"));
-		fz_write_buffer(ctx, fzbuf, "/CIDSystemInfo\n", strlen("/CIDSystemInfo\n"));
-		fz_write_buffer(ctx, fzbuf, "<</Registry(Adobe)\n", strlen("<</Registry(Adobe)\n"));
-		fz_write_buffer(ctx, fzbuf, "/Ordering(UCS) /Supplement 0>> def\n", strlen("/Ordering(UCS) /Supplement 0>> def\n"));
-		fz_write_buffer(ctx, fzbuf, "/CMapName /Adobe-Identity-UCS def\n", strlen("/CMapName /Adobe-Identity-UCS def\n"));
-		fz_write_buffer(ctx, fzbuf, "/CMapType 2 def\n", strlen("/CMapType 2 def\n"));
-		fz_write_buffer(ctx, fzbuf, "1 begincodespacerange\n", strlen("1 begincodespacerange\n"));
-		fz_write_buffer(ctx, fzbuf, "<0000> <FFFF>\n", strlen("<0000> <FFFF>\n"));
-		fz_write_buffer(ctx, fzbuf, "endcodespacerange\n", strlen("endcodespacerange\n"));
-
-		/* Sort via populating */
-		table = fz_calloc(ctx, 65536, sizeof(unsigned short));
-		count = 0;
+		FT_ULong ucs;
+		FT_UInt gid;
+
+		table = fz_calloc(ctx, face->num_glyphs, sizeof *table);
 		fz_lock(ctx, FZ_LOCK_FREETYPE);
-		has_lock = 1;
-		for (k = 0; k < 65536; k++)
+		ucs = FT_Get_First_Char(face, &gid);
+		while (gid > 0)
 		{
-			glyph_index = FT_Get_Char_Index(face, k);
-			if (glyph_index > 0 && glyph_index < 65536)
-			{
-				if (table[glyph_index] == 0)
-				{
-					count++;
-					table[glyph_index] = k;
-				}
-			}
+			if (gid < face->num_glyphs)
+				table[gid] = ucs;
+			ucs = FT_Get_Next_Char(face, ucs, &gid);
 		}
 		fz_unlock(ctx, FZ_LOCK_FREETYPE);
-		has_lock = 0;
+	}
+
+	for (k = 0; k < face->num_glyphs; k += n)
+	{
+		n = next_range(table, face->num_glyphs, k);
+		if (n > 1)
+			++num_seq;
+		else if (table[k] > 0)
+			++num_chr;
+	}
+
+	if (num_seq + num_chr == 0) /* No mappings available... */
+	{
+		fz_warn(ctx, "cannot create ToUnicode mapping for %s", font->name);
+		fz_free(ctx, table); /* the fz_always cleanup below is never reached on this path */
+		return NULL;
+	}
+
+	buf = fz_new_buffer(ctx, 0);
+	fz_try(ctx)
+	{
+		/* Header boiler plate */
+		fz_buffer_printf(ctx, buf, "/CIDInit /ProcSet findresource begin\n");
+		fz_buffer_printf(ctx, buf, "12 dict begin\n");
+		fz_buffer_printf(ctx, buf, "begincmap\n");
+		fz_buffer_printf(ctx, buf, "/CIDSystemInfo <</Registry(Adobe)/Ordering(UCS)/Supplement 0>> def\n");
+		fz_buffer_printf(ctx, buf, "/CMapName /Adobe-Identity-UCS def\n");
+		fz_buffer_printf(ctx, buf, "/CMapType 2 def\n");
+		fz_buffer_printf(ctx, buf, "1 begincodespacerange\n");
+		fz_buffer_printf(ctx, buf, "<0000> <FFFF>\n");
+		fz_buffer_printf(ctx, buf, "endcodespacerange\n");
 
-		/* Now output non-zero entries. */
 		/* Note to have a valid CMap, the number of entries in table set can
 		 * not exceed 100, so we have to break into multipe tables. Also, note
 		 * that to reduce the file size we should be looking for sequential
 		 * ranges. Per Adobe technical note #5411, we can't have a range
 		 * cross a boundary where the high order byte changes */
+
 		/* First the ranges */
+		if (num_seq > 0)
 		{
-			seq = fz_calloc(ctx, 65536, sizeof(unsigned short));
-			int num_seq = 0;
-			int k_start = -1;
-			int k_end = -1;
-			int match = 0;
-			char hex_start[7];
-			char hex_end[7];
-			char hex_value_start[7];
-			int j;
-
-			k = 0;
-
-			/* First find the ranges */
-			while (1)
+			int count = 0;
+			if (num_seq > 100)
 			{
-				if (k == 65535)
-					break;
-				if (table[k] + 1 == table[k + 1])
-					match = 1;
-				else
-					match = 0;
-
-				/* End any sequences across upper byte boundary changes */
-				if ((k & 0xff00) != ((k + 1) & 0xff00))
-					match = 0;
-
-				/* Start of a sequence */
-				if (k_start == -1 && match)
-				{
-					k_start = k;
-					k_end = k + 1;
-				}
-
-				/* In a sequence */
-				if (k_start != -1 && match)
-				{
-					k_end = k + 1;
-				}
-
-				/* Done with a sequence */
-				if (k_start != -1 && !match)
-				{
-					seq[num_seq * 2] = k_start;
-					seq[num_seq * 2 + 1] = k_end;
-					num_seq = num_seq + 1;
-					k_start = -1;
-				}
-				k = k + 1;
+				fz_buffer_printf(ctx, buf, "100 beginbfrange\n");
+				num_seq -= 100;
 			}
-
-			/* Now output the ranges, with the 100 max limit enforced */
-			if (num_seq > 0)
+			else
+				fz_buffer_printf(ctx, buf, "%d beginbfrange\n", num_seq);
+			for (k = 0; k < face->num_glyphs; k += n)
 			{
-				pos = 0;
-				while (num_seq > 0)
+				n = next_range(table, face->num_glyphs, k);
+				if (n > 1)
 				{
-					if (num_seq > 100)
-						temp_count = 100;
-					else
-						temp_count = num_seq;
-					num_seq = num_seq - temp_count;
-					sprintf(&listinfo[0], "%d beginbfrange\n", temp_count);
-					fz_write_buffer(ctx, fzbuf, listinfo, strlen(listinfo));
-					k = 0;
-					while (k < temp_count)
+					if (count == 100)
 					{
-						k = k + 1;
-						k_start = seq[pos * 2];
-						k_end = seq[pos * 2 + 1];
-						sprintf(&hex_start[0], "%04x", k_start);
-						sprintf(&hex_end[0], "%04x", k_end);
-						sprintf(&hex_value_start[0], "%04x", table[k_start]);
-						sprintf(&entry[0], "<%s> <%s> <%s>\n", hex_start, hex_end, hex_value_start);
-						fz_write_buffer(ctx, fzbuf, entry, strlen(entry));
-
-						/* Clear out these values from the table so they are not
-						 * used in the single entry */
-						count = count - (k_end - k_start + 1);
-						for (j = k_start; j < k_end + 1; j++)
-							table[j] = 0;
-						pos = pos + 1;
+						fz_buffer_printf(ctx, buf, "endbfrange\n");
+						if (num_seq > 100)
+						{
+							fz_buffer_printf(ctx, buf, "100 beginbfrange\n");
+							num_seq -= 100;
+						}
+						else
+							fz_buffer_printf(ctx, buf, "%d beginbfrange\n", num_seq);
+						count = 0;
 					}
-					fz_write_buffer(ctx, fzbuf, "endbfrange\n", strlen("endbfrange\n"));
+					fz_buffer_printf(ctx, buf, "<%04x> <%04x> <%04x>\n", k, k+n-1, table[k]);
+					++count;
 				}
 			}
+			fz_buffer_printf(ctx, buf, "endbfrange\n");
 		}
 
-		/* The rest of the values need to be output as individuals */
-		pos = 0;
-		while (count > 0)
+		/* Then the singles */
+		if (num_chr > 0)
 		{
-			if (count > 100)
-				temp_count = 100;
+			int count = 0;
+			if (num_chr > 100)
+			{
+				fz_buffer_printf(ctx, buf, "100 beginbfchar\n");
+				num_chr -= 100;
+			}
 			else
-				temp_count = count;
-			count = count - temp_count;
-			sprintf(&listinfo[0], "%d beginbfchar\n", temp_count);
-			fz_write_buffer(ctx, fzbuf, listinfo, strlen(listinfo));
-			k = 0;
-			while (k < temp_count)
+				fz_buffer_printf(ctx, buf, "%d beginbfchar\n", num_chr);
+			for (k = 0; k < face->num_glyphs; k += n)
 			{
-				if (table[pos] > 0)
+				n = next_range(table, face->num_glyphs, k);
+				if (n == 1 && table[k] > 0)
 				{
-					k = k + 1;
-					sprintf(&hex_glyph[0], "%04x", pos);
-					sprintf(&hex_unicode[0], "%04x", table[pos]);
-					sprintf(&entry[0], "<%s> <%s>\n", hex_glyph, hex_unicode);
-					fz_write_buffer(ctx, fzbuf, entry, strlen(entry));
+					if (count == 100)
+					{
+						fz_buffer_printf(ctx, buf, "endbfchar\n");
+						if (num_chr > 100)
+						{
+							fz_buffer_printf(ctx, buf, "100 beginbfchar\n");
+							num_chr -= 100;
+						}
+						else
+							fz_buffer_printf(ctx, buf, "%d beginbfchar\n", num_chr);
+						count = 0;
+					}
+					fz_buffer_printf(ctx, buf, "<%04x> <%04x>\n", k, table[k]);
+					++count;
 				}
-				pos = pos + 1;
 			}
-			fz_write_buffer(ctx, fzbuf, "endbfchar\n", strlen("endbfchar\n"));
+			fz_buffer_printf(ctx, buf, "endbfchar\n");
 		}
-		fz_write_buffer(ctx, fzbuf, "endcmap\n", strlen("endcmap\n"));
-		fz_write_buffer(ctx, fzbuf, "CMapName currentdict /CMap defineresource pop\n", strlen("CMapName currentdict /CMap defineresource pop\n"));
-		fz_write_buffer(ctx, fzbuf, "end\nend\n", strlen("end\nend\n"));
+
+		/* Trailer boiler plate */
+		fz_buffer_printf(ctx, buf, "endcmap\n");
+		fz_buffer_printf(ctx, buf, "CMapName currentdict /CMap defineresource pop\n");
+		fz_buffer_printf(ctx, buf, "end\nend\n");
 
 		fobj = pdf_new_dict(ctx, doc, 3);
 		fref = pdf_new_ref(ctx, doc, fobj);
-		pdf_update_stream(ctx, doc, fref, fzbuf, 0);
+		pdf_update_stream(ctx, doc, fref, buf, 0);
 	}
 	fz_always(ctx)
 	{
 		fz_free(ctx, table);
-		fz_free(ctx, seq);
-		fz_drop_buffer(ctx, fzbuf);
+		fz_drop_buffer(ctx, buf);
 		pdf_drop_obj(ctx, fobj);
 	}
 	fz_catch(ctx)
 	{
-		if (has_lock)
-			fz_unlock(ctx, FZ_LOCK_FREETYPE);
 		pdf_drop_obj(ctx, fref);
 		fz_rethrow(ctx);
 	}
@@ -1991,7 +1959,7 @@ pdf_add_cid_font(fz_context *ctx, pdf_document *doc, fz_font *font)
 
 			/* Get the descendant font and the tounicode references */
 			obj_desc_ref = pdf_add_descendant_font(ctx, doc, fontdesc);
-			obj_tounicode_ref = pdf_add_tounicode(ctx, doc, fontdesc);
+			obj_tounicode_ref = pdf_add_to_unicode(ctx, doc, font);
 
 			/* And now the font */
 			fobj = pdf_new_dict(ctx, doc, 10);
@@ -2003,7 +1971,8 @@ pdf_add_cid_font(fz_context *ctx, pdf_document *doc, fz_font *font)
 			obj_array = pdf_new_array(ctx, doc, 3);
 			pdf_array_insert(ctx, obj_array, obj_desc_ref, 0);
 			pdf_dict_put(ctx, fobj, PDF_NAME_DescendantFonts, obj_array);
-			pdf_dict_put(ctx, fobj, PDF_NAME_ToUnicode, obj_tounicode_ref);
+			if (obj_tounicode_ref)
+				pdf_dict_put(ctx, fobj, PDF_NAME_ToUnicode, obj_tounicode_ref);
 			fref = pdf_new_ref(ctx, doc, fobj);
 
 			/* Add ref to our font resource hash table. */
-- 
cgit v1.2.3