Add CMap processing scripts, and turn cmapdump into mutool.

This adds a dumping script written in Python, a flattening script written in Python (for easier editing), and a subsetting script written in shell to minimize CMaps by reusing subsets. Use 'mutool cmapdump' to bootstrap or verify cmap dumps.
author: Tor Andersson <tor.andersson@artifex.com> 2018-04-04 16:36:21 +0200
committer: Tor Andersson <tor.andersson@artifex.com> 2018-04-25 12:26:32 +0200
commit: a6d083bb776ecd498e57450ef84c20e39ae604cf (patch)
tree: a1f4399f011eb2b59e21f0588d322690a6ab14f4 /scripts
parent: 84cf672da90dfdaa2dfd2742cc69fa0bad268081 (diff)
download: mupdf-a6d083bb776ecd498e57450ef84c20e39ae604cf.tar.xz
7 files changed, 647 insertions, 267 deletions
diff --git a/scripts/cmapclean.py b/scripts/cmapclean.py
new file mode 100644
index 00000000..f41f8afe
--- /dev/null
+++ b/scripts/cmapclean.py
@@ -0,0 +1,204 @@
+# Parse a CMap file and dump it back out.
+
+import sys
+
+# Decode a subset of CMap syntax (only what is needed for our built-in resources)
+# We require that tokens are whitespace separated.
+
+def cleancmap(filename):
+	codespacerange = []
+	usecmap = ""
+	cmapname = ""
+	cmapversion = "1.0"
+	csi_registry = "(Adobe)"
+	csi_ordering = "(Unknown)"
+	csi_supplement = 1
+	wmode = 0
+	isbf = False
+
+	map = {}
+
+	def tocode(s):
+		if s[0] == '<' and s[-1] == '>':
+			return int(s[1:-1], 16)
+		return int(s, 10)
+
+	def map_cidchar(lo, v):
+		map[lo] = v
+
+	def map_cidrange(lo, hi, v):
+		while lo <= hi:
+			map[lo] = v
+			lo = lo + 1
+			v = v + 1
+
+	def add_bf(lo, v):
+		# Decode unicode surrogate pairs
+		if len(v) == 2 and v[0] >= 0xd800 and v[0] <= 0xdbff and v[1] >= 0xdc00 and v[1] <= 0xdfff:
+			map[lo] = ((v[0] - 0xd800) << 10) + (v[1] - 0xdc00) + 0x10000
+		elif len(v) == 1:
+			map[lo] = v[0]
+		elif len(v) <= 8:
+			map[lo] = v[:]
+		else:
+			print "/* warning: too long one-to-many mapping: %s */" % (v)
+
+	def map_bfchar(lo, bf):
+		bf = bf[1:-1] # drop < >
+		v = [int(bf[i:i+4],16) for i in xrange(0, len(bf), 4)]
+		add_bf(lo, v)
+
+	def map_bfrange(lo, hi, bf):
+		bf = bf[1:-1] # drop < >
+		v = [int(bf[i:i+4],16) for i in xrange(0, len(bf), 4)]
+		while lo <= hi:
+			add_bf(lo, v)
+			lo = lo + 1
+			v[-1] = v[-1] + 1
+
+	current = None
+	for line in open(filename, "r").readlines():
+		if line[0] == '%':
+			continue
+		line = line.strip().split()
+		if len(line) == 0:
+			continue
+		if line[0] == '/CMapVersion': cmapversion = line[1]
+		elif line[0] == '/CMapName': cmapname = line[1][1:]
+		elif line[0] == '/WMode': wmode = int(line[1])
+		elif line[0] == '/Registry': csi_registry = line[1]
+		elif line[0] == '/Ordering': csi_ordering = line[1]
+		elif line[0] == '/Supplement': csi_supplement = line[1]
+		elif len(line) > 1 and line[1] == 'usecmap': usecmap = line[0][1:]
+		elif len(line) > 1 and line[1] == 'begincodespacerange': current = 'codespacerange'
+		elif len(line) > 1 and line[1] == 'begincidrange': current = 'cidrange'
+		elif len(line) > 1 and line[1] == 'beginbfrange': current = 'bfrange'; isbf = True
+		elif len(line) > 1 and line[1] == 'begincidchar': current = 'cidchar'
+		elif len(line) > 1 and line[1] == 'beginbfchar': current = 'bfchar'; isbf = True
+		elif line[0] == 'begincodespacerange': current = 'codespacerange'
+		elif line[0] == 'begincidrange': current = 'cidrange'
+		elif line[0] == 'beginbfrange': current = 'bfrange'; isbf = True
+		elif line[0] == 'begincidchar': current = 'cidchar'
+		elif line[0] == 'beginbfchar': current = 'bfchar'; isbf = True
+		elif line[0].startswith("end"):
+			current = None
+		elif current == 'codespacerange' and len(line) == 2:
+			n, a, b = (len(line[0])-2)/2, tocode(line[0]), tocode(line[1])
+			codespacerange.append((n, a, b))
+		elif current == 'cidrange' and len(line) == 3:
+			a, b, c = tocode(line[0]), tocode(line[1]), tocode(line[2])
+			map_cidrange(a, b, c)
+		elif current == 'cidchar' and len(line) == 2:
+			a, b = tocode(line[0]), tocode(line[1])
+			map_cidchar(a, b)
+		elif current == 'bfchar' and len(line) == 2:
+			a, b = tocode(line[0]), line[1]
+			map_bfchar(a, b)
+		elif current == 'bfrange' and len(line) == 3:
+			a, b, c = tocode(line[0]), tocode(line[1]), line[2]
+			map_bfrange(a, b, c)
+
+	# Create ranges
+
+	singles = []
+	ranges = []
+	mranges = []
+
+	out_lo = -100
+	out_hi = -100
+	out_v_lo = 0
+	out_v_hi = 0
+
+	def flush_range():
+		if out_lo >= 0:
+			if out_lo == out_hi:
+				singles.append((out_lo, out_v_lo))
+			else:
+				ranges.append((out_lo, out_hi, out_v_lo))
+
+	keys = map.keys()
+	keys.sort()
+	for code in keys:
+		v = map[code]
+		if type(v) is not int:
+			flush_range()
+			out_lo = out_hi = -100
+			mranges.append((code, v))
+		else:
+			if code != out_hi + 1 or v != out_v_hi + 1:
+				flush_range()
+				out_lo = out_hi = code
+				out_v_lo = out_v_hi = v
+			else:
+				out_hi = out_hi + 1
+				out_v_hi = out_v_hi + 1
+	flush_range()
+
+	# Print CMap file
+
+	print "%!PS-Adobe-3.0 Resource-CMap"
+	print "%%DocumentNeededResources: procset (CIDInit)"
+	print "%%IncludeResource: procset (CIDInit)"
+	print "%%%%BeginResource: CMap (%s)" % cmapname
+	print "%%%%Version: %s" % cmapversion
+	print "%%EndComments"
+	print "/CIDInit /ProcSet findresource begin"
+	print "12 dict begin"
+	print "begincmap"
+	if usecmap: print "/%s usecmap" % usecmap
+	print "/CIDSystemInfo 3 dict dup begin"
+	print "  /Registry %s def" % csi_registry
+	print "  /Ordering %s def" % csi_ordering
+	print "  /Supplement %s def" % csi_supplement
+	print "end def"
+	print "/CMapName /%s def" % cmapname
+	print "/CMapVersion %s def" % cmapversion
+	print "/CMapType 1 def"
+	print "/WMode %d def" % wmode
+
+	if len(codespacerange):
+		print "%d begincodespacerange" % len(codespacerange)
+		for r in codespacerange:
+			fmt = "<%%0%dx> <%%0%dx>" % (r[0]*2, r[0]*2)
+			print fmt % (r[1], r[2])
+		print "endcodespacerange"
+
+	if len(singles) > 0:
+		if isbf:
+			print "%d beginbfchar" % len(singles)
+			for s in singles:
+				print "<%04x> <%04x>" % s
+			print "endbfchar"
+		else:
+			print "%d begincidchar" % len(singles)
+			for s in singles:
+				print "<%04x> %d" % s
+			print "endcidchar"
+
+	if len(ranges) > 0:
+		if isbf:
+			print "%d beginbfrange" % len(ranges)
+			for r in ranges:
+				print "<%04x> <%04x> <%04x>" % r
+			print "endbfrange"
+		else:
+			print "%d begincidrange" % len(ranges)
+			for r in ranges:
+				print "<%04x> <%04x> %d" % r
+			print "endcidrange"
+
+	if len(mranges) > 0:
+		print "%d beginbfchar" % len(mranges)
+		for cid, v in mranges:
+			print "<%04x> <%s>" % (cid, "".join(["%04x" % ch for ch in v]))
+		print "endbfchar"
+
+	print "endcmap"
+	print "CMapName currentdict /CMap defineresource pop"
+	print "end"
+	print "end"
+	print "%%EndResource"
+	print "%%EOF"
+
+for arg in sys.argv[1:]:
+	cleancmap(arg)
diff --git a/scripts/cmapcleanx.c b/scripts/cmapcleanx.c
index a433c8a4..6900d7b1 100644
--- a/scripts/cmapcleanx.c
+++ b/scripts/cmapcleanx.c
@@ -1,36 +1,10 @@
 /* cmapclean.c -- parse a CMap file and write it back out */
 
-/* We never want to build memento versions of the cmapdump util */
-#undef MEMENTO
-
-/* We never want large file access here */
-#undef FZ_LARGEFILE
-
 #include <stdio.h>
 #include <string.h>
 
 #include "mupdf/pdf.h"
 
-#include "../source/fitz/context.c"
-#include "../source/fitz/error.c"
-#include "../source/fitz/memory.c"
-#include "../source/fitz/output.c"
-#include "../source/fitz/string.c"
-#include "../source/fitz/buffer.c"
-#include "../source/fitz/stream-open.c"
-#include "../source/fitz/stream-read.c"
-#include "../source/fitz/strtod.c"
-#include "../source/fitz/strtof.c"
-#include "../source/fitz/ftoa.c"
-#include "../source/fitz/printf.c"
-#ifdef _WIN32
-#include "../source/fitz/time.c"
-#endif
-
-#include "../source/pdf/pdf-lex.c"
-#include "../source/pdf/pdf-cmap.c"
-#include "../source/pdf/pdf-cmap-parse.c"
-
 struct cidrange {
 	unsigned int lo, hi, v;
 };
@@ -74,7 +48,7 @@ main(int argc, char **argv)
 
 	fi = fz_open_file(ctx, argv[1]);
 	cmap = pdf_load_cmap(ctx, fi);
-	fz_close(fi);
+	fz_drop_stream(ctx, fi);
 
 	printf("begincmap\n");
 	printf("/CMapName /%s def\n", cmap->cmap_name);
@@ -102,7 +76,7 @@ main(int argc, char **argv)
 	}
 
 	n = cmap->rlen + cmap->xlen;
-	r = malloc(n * sizeof *r);
+	r = fz_malloc(ctx, n * sizeof *r);
 	i = 0;
 
 	for (k = 0; k < cmap->rlen; k++) {
@@ -135,6 +109,7 @@ main(int argc, char **argv)
 		printf("endcidchar\n");
 	}
 
+#if 0
 	if (cmap->mlen > 0)
 	{
 		printf("beginbfchar\n");
@@ -149,113 +124,10 @@ main(int argc, char **argv)
 		}
 		printf("endbfchar\n");
 	}
+#endif
 
 	printf("endcmap\n");
 
-	fz_free_context(ctx);
-	return 0;
-}
-
-void fz_new_font_context(fz_context *ctx)
-{
-}
-
-void fz_drop_font_context(fz_context *ctx)
-{
-}
-
-fz_font_context *fz_keep_font_context(fz_context *ctx)
-{
-	return NULL;
-}
-
-void fz_new_colorspace_context(fz_context *ctx)
-{
-}
-
-void fz_drop_colorspace_context(fz_context *ctx)
-{
-}
-
-fz_colorspace_context *fz_keep_colorspace_context(fz_context *ctx)
-{
-	return NULL;
-}
-
-void fz_new_aa_context(fz_context *ctx)
-{
-}
-
-void fz_drop_aa_context(fz_context *ctx)
-{
-}
-
-void fz_copy_aa_context(fz_context *dst, fz_context *src)
-{
-}
-
-void *fz_keep_storable(fz_context *ctx, const fz_storable *sc)
-{
-	fz_storable *s = (fz_storable *)sc;
-	return fz_keep_imp(ctx, s, &s->refs);
-}
-
-void fz_drop_storable(fz_context *ctx, const fz_storable *sc)
-{
-	fz_storable *s = (fz_storable *)sc;
-	if (fz_drop_imp(ctx, s, &s->refs))
-		s->drop(ctx, s);
-}
-
-void fz_new_store_context(fz_context *ctx, size_t max)
-{
-}
-
-void fz_drop_store_context(fz_context *ctx)
-{
-}
-
-fz_store *fz_keep_store_context(fz_context *ctx)
-{
-	return NULL;
-}
-
-int fz_store_scavenge(fz_context *ctx, size_t size, int *phase)
-{
-	return 0;
-}
-
-void fz_new_glyph_cache_context(fz_context *ctx)
-{
-}
-
-void fz_drop_glyph_cache_context(fz_context *ctx)
-{
-}
-
-fz_glyph_cache *fz_keep_glyph_cache(fz_context *ctx)
-{
-	return NULL;
-}
-
-void fz_new_document_handler_context(fz_context *ctx)
-{
-}
-
-void fz_drop_document_handler_context(fz_context *ctx)
-{
-}
-
-fz_document_handler_context *fz_keep_document_handler_context(fz_context *ctx)
-{
-	return NULL;
-}
-
-void fz_default_image_decode(void *arg, int w, int h, int l2factor, fz_irect *irect)
-{
-}
-
-int fz_default_image_scale(void *arg, int w, int h, int src_w, int src_h)
-{
+	fz_drop_context(ctx);
 	return 0;
 }
diff --git a/scripts/cmapcleanz.c b/scripts/cmapcleanz.c
index 07373f61..dee04044 100644
--- a/scripts/cmapcleanz.c
+++ b/scripts/cmapcleanz.c
@@ -1,36 +1,10 @@
 /* cmapclean.c -- parse a CMap file and write it back out */
 
-/* We never want to build memento versions of the cmapdump util */
-#undef MEMENTO
-
-/* We never want large file access here */
-#undef FZ_LARGEFILE
-
 #include <stdio.h>
 #include <string.h>
 
 #include "mupdf/pdf.h"
 
-#include "../source/fitz/context.c"
-#include "../source/fitz/error.c"
-#include "../source/fitz/memory.c"
-#include "../source/fitz/output.c"
-#include "../source/fitz/string.c"
-#include "../source/fitz/buffer.c"
-#include "../source/fitz/stream-open.c"
-#include "../source/fitz/stream-read.c"
-#include "../source/fitz/strtod.c"
-#include "../source/fitz/strtof.c"
-#include "../source/fitz/ftoa.c"
-#include "../source/fitz/printf.c"
-#ifdef _WIN32
-#include "../source/fitz/time.c"
-#endif
-
-#include "../source/pdf/pdf-lex.c"
-#include "../source/pdf/pdf-cmap.c"
-#include "../source/pdf/pdf-cmap-parse.c"
-
 void pc(unsigned int c)
 {
 	if (c <= 0xff) printf("<%02x>", c);
@@ -63,7 +37,7 @@ main(int argc, char **argv)
 
 	fi = fz_open_file(ctx, argv[1]);
 	cmap = pdf_load_cmap(ctx, fi);
-	fz_close(fi);
+	fz_drop_stream(ctx, fi);
 
 	printf("begincmap\n");
 	printf("/CMapName /%s def\n", cmap->cmap_name);
@@ -105,7 +79,7 @@ main(int argc, char **argv)
 		for (k = 0; k < cmap->rlen; k++) {
 			if (cmap->ranges[k].high - cmap->ranges[k].low == 0) {
 				pc(cmap->ranges[k].low);
-				printf("%u\n", cmap->ranges[k].out);
+				printf(" %u\n", cmap->ranges[k].out);
 			}
 		}
 		printf("endcidchar\n");
@@ -117,8 +91,9 @@ main(int argc, char **argv)
 		for (k = 0; k < cmap->rlen; k++) {
 			if (cmap->ranges[k].high - cmap->ranges[k].low > 0) {
 				pc(cmap->ranges[k].low);
+				putchar(' ');
 				pc(cmap->ranges[k].high);
-				printf("%u\n", cmap->ranges[k].out);
+				printf(" %u\n", cmap->ranges[k].out);
 			}
 		}
 		printf("endcidrange\n");
@@ -160,6 +135,7 @@ main(int argc, char **argv)
 
 	/* 1-to-many */
 
+#if 0
 	if (cmap->mlen > 0)
 	{
 		printf("beginbfchar\n");
@@ -173,113 +149,10 @@ main(int argc, char **argv)
 		}
 		printf("endbfchar\n");
 	}
+#endif
 
 	printf("endcmap\n");
 
-	fz_free_context(ctx);
-	return 0;
-}
-
-void fz_new_font_context(fz_context *ctx)
-{
-}
-
-void fz_drop_font_context(fz_context *ctx)
-{
-}
-
-fz_font_context *fz_keep_font_context(fz_context *ctx)
-{
-	return NULL;
-}
-
-void fz_new_colorspace_context(fz_context *ctx)
-{
-}
-
-void fz_drop_colorspace_context(fz_context *ctx)
-{
-}
-
-fz_colorspace_context *fz_keep_colorspace_context(fz_context *ctx)
-{
-	return NULL;
-}
-
-void fz_new_aa_context(fz_context *ctx)
-{
-}
-
-void fz_drop_aa_context(fz_context *ctx)
-{
-}
-
-void fz_copy_aa_context(fz_context *dst, fz_context *src)
-{
-}
-
-void *fz_keep_storable(fz_context *ctx, const fz_storable *sc)
-{
-	fz_storable *s = (fz_storable *)sc;
-	return fz_keep_imp(ctx, s, &s->refs);
-}
-
-void fz_drop_storable(fz_context *ctx, const fz_storable *sc)
-{
-	fz_storable *s = (fz_storable *)sc;
-	if (fz_drop_imp(ctx, s, &s->refs))
-		s->drop(ctx, s);
-}
-
-void fz_new_store_context(fz_context *ctx, size_t max)
-{
-}
-
-void fz_drop_store_context(fz_context *ctx)
-{
-}
-
-fz_store *fz_keep_store_context(fz_context *ctx)
-{
-	return NULL;
-}
-
-int fz_store_scavenge(fz_context *ctx, size_t size, int *phase)
-{
-	return 0;
-}
-
-void fz_new_glyph_cache_context(fz_context *ctx)
-{
-}
-
-void fz_drop_glyph_cache_context(fz_context *ctx)
-{
-}
-
-fz_glyph_cache *fz_keep_glyph_cache(fz_context *ctx)
-{
-	return NULL;
-}
-
-void fz_new_document_handler_context(fz_context *ctx)
-{
-}
-
-void fz_drop_document_handler_context(fz_context *ctx)
-{
-}
-
-fz_document_handler_context *fz_keep_document_handler_context(fz_context *ctx)
-{
-	return NULL;
-}
-
-void fz_default_image_decode(void *arg, int w, int h, int l2factor, fz_irect *irect)
-{
-}
-
-int fz_default_image_scale(void *arg, int w, int h, int src_w, int src_h)
-{
+	fz_drop_context(ctx);
 	return 0;
 }
diff --git a/scripts/cmapdump.py b/scripts/cmapdump.py
new file mode 100644
index 00000000..d92da4e8
--- /dev/null
+++ b/scripts/cmapdump.py
@@ -0,0 +1,218 @@
+# Parse a CMap file and dump it as a C struct.
+
+import sys
+
+# Decode a subset of CMap syntax (only what is needed for our built-in resources)
+# We require that tokens are whitespace separated.
+
+def dumpcmap(filename):
+	codespacerange = []
+	usecmap = ""
+	cmapname = ""
+	wmode = 0
+
+	map = {}
+
+	def tocode(s):
+		if s[0] == '<' and s[-1] == '>':
+			return int(s[1:-1], 16)
+		return int(s, 10)
+
+	def map_cidchar(lo, v):
+		map[lo] = v
+
+	def map_cidrange(lo, hi, v):
+		while lo <= hi:
+			map[lo] = v
+			lo = lo + 1
+			v = v + 1
+
+	def add_bf(lo, v):
+		# Decode unicode surrogate pairs
+		if len(v) == 2 and v[0] >= 0xd800 and v[0] <= 0xdbff and v[1] >= 0xdc00 and v[1] <= 0xdfff:
+			map[lo] = ((v[0] - 0xd800) << 10) + (v[1] - 0xdc00) + 0x10000
+		elif len(v) == 1:
+			map[lo] = v[0]
+		elif len(v) <= 8:
+			map[lo] = v[:]
+		else:
+			print "/* warning: too long one-to-many mapping: %s */" % (v)
+
+	def map_bfchar(lo, bf):
+		bf = bf[1:-1] # drop < >
+		v = [int(bf[i:i+4],16) for i in xrange(0, len(bf), 4)]
+		add_bf(lo, v)
+
+	def map_bfrange(lo, hi, bf):
+		bf = bf[1:-1] # drop < >
+		v = [int(bf[i:i+4],16) for i in xrange(0, len(bf), 4)]
+		while lo <= hi:
+			add_bf(lo, v)
+			lo = lo + 1
+			v[-1] = v[-1] + 1
+
+	current = None
+	for line in open(filename, "r").readlines():
+		if line[0] == '%':
+			continue
+		line = line.strip().split()
+		if len(line) == 0:
+			continue
+		if line[0] == '/CMapName':
+			cmapname = line[1][1:]
+		elif line[0] == '/WMode':
+			wmode = int(line[1])
+		elif len(line) > 1 and line[1] == 'usecmap':
+			usecmap = line[0][1:]
+		elif len(line) > 1 and line[1] == 'begincodespacerange': current = 'codespacerange'
+		elif len(line) > 1 and line[1] == 'begincidrange': current = 'cidrange'
+		elif len(line) > 1 and line[1] == 'beginbfrange': current = 'bfrange'
+		elif len(line) > 1 and line[1] == 'begincidchar': current = 'cidchar'
+		elif len(line) > 1 and line[1] == 'beginbfchar': current = 'bfchar'
+		elif line[0] == 'begincodespacerange': current = 'codespacerange'
+		elif line[0] == 'begincidrange': current = 'cidrange'
+		elif line[0] == 'beginbfrange': current = 'bfrange'
+		elif line[0] == 'begincidchar': current = 'cidchar'
+		elif line[0] == 'beginbfchar': current = 'bfchar'
+		elif line[0].startswith("end"):
+			current = None
+		elif current == 'codespacerange' and len(line) == 2:
+			n, a, b = (len(line[0])-2)/2, tocode(line[0]), tocode(line[1])
+			codespacerange.append((n, a, b))
+		elif current == 'cidrange' and len(line) == 3:
+			a, b, c = tocode(line[0]), tocode(line[1]), tocode(line[2])
+			map_cidrange(a, b, c)
+		elif current == 'cidchar' and len(line) == 2:
+			a, b = tocode(line[0]), tocode(line[1])
+			map_cidchar(a, b)
+		elif current == 'bfchar' and len(line) == 2:
+			a, b = tocode(line[0]), line[1]
+			map_bfchar(a, b)
+		elif current == 'bfrange' and len(line) == 3:
+			a, b, c = tocode(line[0]), tocode(line[1]), line[2]
+			map_bfrange(a, b, c)
+
+	# Create ranges
+
+	ranges = []
+	xranges = []
+	mranges = []
+	mdata = []
+
+	out_lo = -100
+	out_hi = -100
+	out_v_lo = 0
+	out_v_hi = 0
+
+	def flush_range():
+		if out_lo >= 0:
+			if out_lo > 0xffff or out_hi > 0xffff or out_v_lo > 0xffff:
+				xranges.append((out_lo, out_hi, out_v_lo))
+			else:
+				ranges.append((out_lo, out_hi, out_v_lo))
+
+	keys = map.keys()
+	keys.sort()
+	for code in keys:
+		v = map[code]
+		if type(v) is not int:
+			flush_range()
+			out_lo = out_hi = -100
+			mranges.append((code, len(mdata)))
+			mdata.append(len(v))
+			mdata.extend(v)
+		else:
+			if code != out_hi + 1 or v != out_v_hi + 1:
+				flush_range()
+				out_lo = out_hi = code
+				out_v_lo = out_v_hi = v
+			else:
+				out_hi = out_hi + 1
+				out_v_hi = out_v_hi + 1
+	flush_range()
+
+	# Print C file
+
+	cname = cmapname.replace('-', '_')
+
+	print
+	print "/*", cmapname, "*/"
+	print
+
+	if len(ranges) > 0:
+		print "static const pdf_range cmap_%s_ranges[] = {" % cname
+		for r in ranges:
+			print "{%d,%d,%d}," % r
+		print "};"
+		print
+	if len(xranges) > 0:
+		print "static const pdf_xrange cmap_%s_xranges[] = {" % cname
+		for r in xranges:
+			print "{%d,%d,%d}," % r
+		print "};"
+		print
+	if len(mranges) > 0:
+		print "static const pdf_mrange cmap_%s_mranges[] = {" % cname
+		for r in mranges:
+			print "{%d,%d}," % r
+		print "};"
+		print
+		print "static const int cmap_%s_table[] = {" % cname
+		n = mdata[0]
+		i = 0
+		for r in mdata:
+			if i <= n:
+				sys.stdout.write("%d," % r)
+				i = i + 1
+			else:
+				sys.stdout.write("\n%d," % r)
+				i = 1
+				n = r
+		sys.stdout.write("\n")
+		print "};"
+		print
+
+	print "pdf_cmap pdf_cmap_%s = {" % cname
+	print "\t{ -1, pdf_drop_cmap_imp },"
+	print "\t/* cmapname */ \"%s\"," % cmapname
+	print "\t/* usecmap */ \"%s\", NULL," % usecmap
+	print "\t/* wmode */ %d," % wmode
+	print "\t/* codespaces */ %d, {" % len(codespacerange)
+	if len(codespacerange) > 0:
+		for codespace in codespacerange:
+			fmt = "\t\t{ %%d, 0x%%0%dx, 0x%%0%dx }," % (codespace[0]*2, codespace[0]*2)
+			print fmt % codespace
+	else:
+			print "\t\t{ 0, 0, 0 },"
+	print "\t},"
+
+	if len(ranges) > 0:
+		print "\t%d, %d, (pdf_range*)cmap_%s_ranges," % (len(ranges),len(ranges),cname)
+	else:
+		print "\t0, 0, NULL, /* ranges */"
+
+	if len(xranges) > 0:
+		print "\t%d, %d, (pdf_xrange*)cmap_%s_xranges," % (len(xranges),len(xranges),cname)
+	else:
+		print "\t0, 0, NULL, /* xranges */"
+
+	if len(mranges) > 0:
+		print "\t%d, %d, (pdf_mrange*)cmap_%s_mranges," % (len(mranges),len(mranges),cname)
+	else:
+		print "\t0, 0, NULL, /* mranges */"
+
+	if len(mdata) > 0:
+		print "\t%d, %d, (int*)cmap_%s_table," % (len(mdata),len(mdata),cname)
+	else:
+		print "\t0, 0, NULL, /* table */"
+
+	print "\t0, 0, 0, NULL /* splay tree */"
+	print "};"
+
+print "/* This is an automatically generated file. Do not edit. */"
+print
+print '#include "mupdf/fitz.h"'
+print '#include "mupdf/pdf.h"'
+
+for arg in sys.argv[1:]:
+	dumpcmap(arg)
diff --git a/scripts/cmapflatten.py b/scripts/cmapflatten.py
new file mode 100644
index 00000000..8bb2193a
--- /dev/null
+++ b/scripts/cmapflatten.py
@@ -0,0 +1,108 @@
+# Parse a Uni* CMap file and flatten it.
+#
+# The Uni* CMap files only have 'cidchar' and 'cidrange' sections, never
+# 'bfchar' or 'bfrange'.
+
+import sys
+
+def flattencmap(filename):
+	codespacerange = []
+	usecmap = ""
+	cmapname = ""
+	cmapversion = "1.0"
+	csi_registry = "(Adobe)"
+	csi_ordering = "(Unknown)"
+	csi_supplement = 1
+	wmode = 0
+
+	map = {}
+
+	def tocode(s):
+		if s[0] == '<' and s[-1] == '>':
+			return int(s[1:-1], 16)
+		return int(s, 10)
+
+	def map_cidchar(lo, v):
+		map[lo] = v
+
+	def map_cidrange(lo, hi, v):
+		while lo <= hi:
+			map[lo] = v
+			lo = lo + 1
+			v = v + 1
+
+	current = None
+	for line in open(filename, "r").readlines():
+		if line[0] == '%':
+			continue
+		line = line.strip().split()
+		if len(line) == 0:
+			continue
+		if line[0] == '/CMapVersion': cmapversion = line[1]
+		elif line[0] == '/CMapName': cmapname = line[1][1:]
+		elif line[0] == '/WMode': wmode = int(line[1])
+		elif line[0] == '/Registry': csi_registry = line[1]
+		elif line[0] == '/Ordering': csi_ordering = line[1]
+		elif line[0] == '/Supplement': csi_supplement = line[1]
+		elif len(line) > 1 and line[1] == 'usecmap': usecmap = line[0][1:]
+		elif len(line) > 1 and line[1] == 'begincodespacerange': current = 'codespacerange'
+		elif len(line) > 1 and line[1] == 'begincidrange': current = 'cidrange'
+		elif len(line) > 1 and line[1] == 'begincidchar': current = 'cidchar'
+		elif line[0].startswith("end"):
+			current = None
+		elif current == 'codespacerange' and len(line) == 2:
+			n, a, b = (len(line[0])-2)/2, tocode(line[0]), tocode(line[1])
+			codespacerange.append((n, a, b))
+		elif current == 'cidrange' and len(line) == 3:
+			a, b, c = tocode(line[0]), tocode(line[1]), tocode(line[2])
+			map_cidrange(a, b, c)
+		elif current == 'cidchar' and len(line) == 2:
+			a, b = tocode(line[0]), tocode(line[1])
+			map_cidchar(a, b)
+
+	# Print flattened CMap file
+
+	print "%!PS-Adobe-3.0 Resource-CMap"
+	print "%%DocumentNeededResources: procset (CIDInit)"
+	print "%%IncludeResource: procset (CIDInit)"
+	print "%%%%BeginResource: CMap (%s)" % cmapname
+	print "%%%%Version: %s" % cmapversion
+	print "%%EndComments"
+	print "/CIDInit /ProcSet findresource begin"
+	print "12 dict begin"
+	print "begincmap"
+	if usecmap: print "/%s usecmap" % usecmap
+	print "/CIDSystemInfo 3 dict dup begin"
+	print "  /Registry %s def" % csi_registry
+	print "  /Ordering %s def" % csi_ordering
+	print "  /Supplement %s def" % csi_supplement
+	print "end def"
+	print "/CMapName /%s def" % cmapname
+	print "/CMapVersion %s def" % cmapversion
+	print "/CMapType 1 def"
+	print "/WMode %d def" % wmode
+
+	if len(codespacerange):
+		print "%d begincodespacerange" % len(codespacerange)
+		for r in codespacerange:
+			fmt = "<%%0%dx> <%%0%dx>" % (r[0]*2, r[0]*2)
+			print fmt % (r[1], r[2])
+		print "endcodespacerange"
+
+	keys = map.keys()
+	keys.sort()
+	print "%d begincidchar" % len(keys)
+	for code in keys:
+		v = map[code]
+		print "<%04x> %d" % (code, v)
+	print "endcidchar"
+
+	print "endcmap"
+	print "CMapName currentdict /CMap defineresource pop"
+	print "end"
+	print "end"
+	print "%%EndResource"
+	print "%%EOF"
+
+for arg in sys.argv[1:]:
+	flattencmap(arg)
diff --git a/scripts/cmapshare.py b/scripts/cmapshare.py
new file mode 100644
index 00000000..f5e62d4e
--- /dev/null
+++ b/scripts/cmapshare.py
@@ -0,0 +1,57 @@
+# Find and extract common CMap subsets.
+# Takes flattened CMaps as input, using only the 'cidchar' sections.
+# The outputs are truncated, so use 'cmapflatten.py' to clean them up.
+
+import sys, os
+
+def load_cmap_set(filename):
+	cmap = set()
+	active = False
+	for line in open(filename).readlines():
+		line = line.strip()
+		if line.endswith("endcidchar"): active = False
+		if active: cmap.add(line)
+		if line.endswith("begincidchar"): active = True
+	return cmap
+
+def load_cmap_prologue(filename):
+	prologue = []
+	for line in open(filename).readlines():
+		line = line.strip()
+		if line.endswith("begincidchar"):
+			break
+		prologue.append(line)
+	return prologue
+
+epilogue = [
+	'endcidchar',
+]
+
+common_name = os.path.basename(sys.argv[1])
+
+# First find the common subset
+common = load_cmap_set(sys.argv[2])
+for f in sys.argv[3:]:
+	common &= load_cmap_set(f)
+
+def print_cmap(filename, prologue, cmap):
+	out = open(filename, "w")
+	for line in prologue:
+		if not line.endswith("usecmap"):
+			print >>out, line
+		if line == 'begincmap':
+			print >>out, "/"+common_name, "usecmap"
+	print >>out, len(cmap), "begincidchar"
+	for line in sorted(cmap):
+		print >>out, line
+	for line in epilogue:
+		print >>out, line
+
+# Print common subset
+print_cmap(sys.argv[1], ["/CMapName /%s" % common_name], common)
+
+# Now find unique bits
+for f in sys.argv[2:]:
+	cmap = load_cmap_set(f) - common
+	prologue = load_cmap_prologue(f)
+	print_cmap(f+".shared", prologue, cmap)
diff --git a/scripts/runcmapshare.sh b/scripts/runcmapshare.sh
new file mode 100644
index 00000000..7a167c1e
--- /dev/null
+++ b/scripts/runcmapshare.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# This script expects to find the original CMap resources in thirdparty/cmap-resources.
+#
+
+rm -f build/cmaps/*
+mkdir -p build/cmaps
+
+function flatten {
+	for DIR in $(echo thirdparty/cmap-resources/Adobe-*)
+	do
+		if [ -f $DIR/CMap/$1 ]
+		then
+			echo $DIR/CMap/$1
+			python scripts/cmapflatten.py $DIR/CMap/$1 > build/cmaps/$1
+		fi
+	done
+}
+
+flatten GBK-EUC-H
+flatten GBK2K-H
+flatten GBKp-EUC-H
+flatten UniCNS-UCS2-H
+flatten UniCNS-UTF16-H
+flatten UniGB-UCS2-H
+flatten UniGB-UTF16-H
+flatten UniJIS-UCS2-H
+flatten UniJIS-UTF16-H
+flatten UniKS-UCS2-H
+flatten UniKS-UTF16-H
+
+python scripts/cmapshare.py build/cmaps/GBK-X build/cmaps/GB*-H
+python scripts/cmapshare.py build/cmaps/UniCNS-X build/cmaps/UniCNS-*-H
+python scripts/cmapshare.py build/cmaps/UniGB-X build/cmaps/UniGB-*-H
+python scripts/cmapshare.py build/cmaps/UniJIS-X build/cmaps/UniJIS-*-H
+python scripts/cmapshare.py build/cmaps/UniKS-X build/cmaps/UniKS-*-H
+
+for F in build/cmaps/*-X
+do
+	B=$(basename $F)
+	python scripts/cmapclean.py $F > resources/cmaps/$B
+done
+
+for F in build/cmaps/*.shared
+do
+	B=$(basename $F .shared)
+	python scripts/cmapclean.py $F > resources/cmaps/$B
+done
author	Tor Andersson <tor.andersson@artifex.com>	2018-04-04 16:36:21 +0200
committer	Tor Andersson <tor.andersson@artifex.com>	2018-04-25 12:26:32 +0200
commit	a6d083bb776ecd498e57450ef84c20e39ae604cf (patch)
tree	a1f4399f011eb2b59e21f0588d322690a6ab14f4 /scripts
parent	84cf672da90dfdaa2dfd2742cc69fa0bad268081 (diff)
download	mupdf-a6d083bb776ecd498e57450ef84c20e39ae604cf.tar.xz