From a6d083bb776ecd498e57450ef84c20e39ae604cf Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Wed, 4 Apr 2018 16:36:21 +0200 Subject: Add CMap processing scripts, and turn cmapdump into mutool. A dumping script written in python. A flattening script written in python (for easier editing). A subsetting script written in shell to minimize CMaps by reusing subsets. Use 'mutool cmapdump' to bootstrap or verify cmap dumps. --- scripts/cmapclean.py | 204 ++++++++++++++++++++++++++++++++++++++++++++ scripts/cmapcleanx.c | 138 ++---------------------------- scripts/cmapcleanz.c | 141 ++----------------------------- scripts/cmapdump.py | 218 ++++++++++++++++++++++++++++++++++++++++++++++++ scripts/cmapflatten.py | 108 ++++++++++++++++++++++++ scripts/cmapshare.py | 57 +++++++++++++ scripts/runcmapshare.sh | 48 +++++++++++ 7 files changed, 647 insertions(+), 267 deletions(-) create mode 100644 scripts/cmapclean.py create mode 100644 scripts/cmapdump.py create mode 100644 scripts/cmapflatten.py create mode 100644 scripts/cmapshare.py create mode 100644 scripts/runcmapshare.sh (limited to 'scripts') diff --git a/scripts/cmapclean.py b/scripts/cmapclean.py new file mode 100644 index 00000000..f41f8afe --- /dev/null +++ b/scripts/cmapclean.py @@ -0,0 +1,204 @@ +# Parse a CMap file and dump it back out. + +import sys + +# Decode a subset of CMap syntax (only what is needed for our built-in resources) +# We require that tokens are whitespace separated. 
+ +def cleancmap(filename): + codespacerange = [] + usecmap = "" + cmapname = "" + cmapversion = "1.0" + csi_registry = "(Adobe)" + csi_ordering = "(Unknown)" + csi_supplement = 1 + wmode = 0 + isbf = False + + map = {} + + def tocode(s): + if s[0] == '<' and s[-1] == '>': + return int(s[1:-1], 16) + return int(s, 10) + + def map_cidchar(lo, v): + map[lo] = v + + def map_cidrange(lo, hi, v): + while lo <= hi: + map[lo] = v + lo = lo + 1 + v = v + 1 + + def add_bf(lo, v): + # Decode unicode surrogate pairs + if len(v) == 2 and v[0] >= 0xd800 and v[0] <= 0xdbff and v[1] >= 0xdc00 and v[1] <= 0xdfff: + map[lo] = ((v[0] - 0xd800) << 10) + (v[1] - 0xdc00) + 0x10000 + elif len(v) == 1: + map[lo] = v[0] + elif len(v) <= 8: + map[lo] = v[:] + else: + print "/* warning: too long one-to-many mapping: %s */" % (v) + + def map_bfchar(lo, bf): + bf = bf[1:-1] # drop < > + v = [int(bf[i:i+4],16) for i in xrange(0, len(bf), 4)] + add_bf(lo, v) + + def map_bfrange(lo, hi, bf): + bf = bf[1:-1] # drop < > + v = [int(bf[i:i+4],16) for i in xrange(0, len(bf), 4)] + while lo <= hi: + add_bf(lo, v) + lo = lo + 1 + v[-1] = v[-1] + 1 + + current = None + for line in open(filename, "r").readlines(): + if line[0] == '%': + continue + line = line.strip().split() + if len(line) == 0: + continue + if line[0] == '/CMapVersion': cmapversion = line[1] + elif line[0] == '/CMapName': cmapname = line[1][1:] + elif line[0] == '/WMode': wmode = int(line[1]) + elif line[0] == '/Registry': csi_registry = line[1] + elif line[0] == '/Ordering': csi_ordering = line[1] + elif line[0] == '/Supplement': csi_supplement = line[1] + elif len(line) > 1 and line[1] == 'usecmap': usecmap = line[0][1:] + elif len(line) > 1 and line[1] == 'begincodespacerange': current = 'codespacerange' + elif len(line) > 1 and line[1] == 'begincidrange': current = 'cidrange' + elif len(line) > 1 and line[1] == 'beginbfrange': current = 'bfrange'; isbf = True + elif len(line) > 1 and line[1] == 'begincidchar': current = 'cidchar' + 
elif len(line) > 1 and line[1] == 'beginbfchar': current = 'bfchar'; isbf = True + elif line[0] == 'begincodespacerange': current = 'codespacerange' + elif line[0] == 'begincidrange': current = 'cidrange' + elif line[0] == 'beginbfrange': current = 'bfrange'; isbf = True + elif line[0] == 'begincidchar': current = 'cidchar' + elif line[0] == 'beginbfchar': current = 'bfchar'; isbf = True + elif line[0].startswith("end"): + current = None + elif current == 'codespacerange' and len(line) == 2: + n, a, b = (len(line[0])-2)/2, tocode(line[0]), tocode(line[1]) + codespacerange.append((n, a, b)) + elif current == 'cidrange' and len(line) == 3: + a, b, c = tocode(line[0]), tocode(line[1]), tocode(line[2]) + map_cidrange(a, b, c) + elif current == 'cidchar' and len(line) == 2: + a, b = tocode(line[0]), tocode(line[1]) + map_cidchar(a, b) + elif current == 'bfchar' and len(line) == 2: + a, b = tocode(line[0]), line[1] + map_bfchar(a, b) + elif current == 'bfrange' and len(line) == 3: + a, b, c = tocode(line[0]), tocode(line[1]), line[2] + map_bfrange(a, b, c) + + # Create ranges + + singles = [] + ranges = [] + mranges = [] + + out_lo = -100 + out_hi = -100 + out_v_lo = 0 + out_v_hi = 0 + + def flush_range(): + if out_lo >= 0: + if out_lo == out_hi: + singles.append((out_lo, out_v_lo)) + else: + ranges.append((out_lo, out_hi, out_v_lo)) + + keys = map.keys() + keys.sort() + for code in keys: + v = map[code] + if type(v) is not int: + flush_range() + out_lo = out_hi = -100 + mranges.append((code, v)) + else: + if code != out_hi + 1 or v != out_v_hi + 1: + flush_range() + out_lo = out_hi = code + out_v_lo = out_v_hi = v + else: + out_hi = out_hi + 1 + out_v_hi = out_v_hi + 1 + flush_range() + + # Print CMap file + + print "%!PS-Adobe-3.0 Resource-CMap" + print "%%DocumentNeededResources: procset (CIDInit)" + print "%%IncludeResource: procset (CIDInit)" + print "%%%%BeginResource: CMap (%s)" % cmapname + print "%%%%Version: %s" % cmapversion + print "%%EndComments" + print 
"/CIDInit /ProcSet findresource begin" + print "12 dict begin" + print "begincmap" + if usecmap: print "/%s usecmap" % usecmap + print "/CIDSystemInfo 3 dict dup begin" + print " /Registry %s def" % csi_registry + print " /Ordering %s def" % csi_ordering + print " /Supplement %s def" % csi_supplement + print "end def" + print "/CMapName /%s def" % cmapname + print "/CMapVersion %s def" % cmapversion + print "/CMapType 1 def" + print "/WMode %d def" % wmode + + if len(codespacerange): + print "%d begincodespacerange" % len(codespacerange) + for r in codespacerange: + fmt = "<%%0%dx> <%%0%dx>" % (r[0]*2, r[0]*2) + print fmt % (r[1], r[2]) + print "endcodespacerange" + + if len(singles) > 0: + if isbf: + print "%d beginbfchar" % len(singles) + for s in singles: + print "<%04x> <%04x>" % s + print "endbfchar" + else: + print "%d begincidchar" % len(singles) + for s in singles: + print "<%04x> %d" % s + print "endcidchar" + + if len(ranges) > 0: + if isbf: + print "%d beginbfrange" % len(ranges) + for r in ranges: + print "<%04x> <%04x> <%04x>" % r + print "endbfrange" + else: + print "%d begincidrange" % len(ranges) + for r in ranges: + print "<%04x> <%04x> %d" % r + print "endcidrange" + + if len(mranges) > 0: + print "%d beginbfchar" % len(mranges) + for cid, v in mranges: + print "<%04x> <%s>" % (cid, "".join(["%04x" % ch for ch in v])) + print "endbfchar" + + print "endcmap" + print "CMapName currentdict /CMap defineresource pop" + print "end" + print "end" + print "%%EndResource" + print "%%EOF" + +for arg in sys.argv[1:]: + cleancmap(arg) diff --git a/scripts/cmapcleanx.c b/scripts/cmapcleanx.c index a433c8a4..6900d7b1 100644 --- a/scripts/cmapcleanx.c +++ b/scripts/cmapcleanx.c @@ -1,36 +1,10 @@ /* cmapclean.c -- parse a CMap file and write it back out */ -/* We never want to build memento versions of the cmapdump util */ -#undef MEMENTO - -/* We never want large file access here */ -#undef FZ_LARGEFILE - #include #include #include "mupdf/pdf.h" -#include 
"../source/fitz/context.c" -#include "../source/fitz/error.c" -#include "../source/fitz/memory.c" -#include "../source/fitz/output.c" -#include "../source/fitz/string.c" -#include "../source/fitz/buffer.c" -#include "../source/fitz/stream-open.c" -#include "../source/fitz/stream-read.c" -#include "../source/fitz/strtod.c" -#include "../source/fitz/strtof.c" -#include "../source/fitz/ftoa.c" -#include "../source/fitz/printf.c" -#ifdef _WIN32 -#include "../source/fitz/time.c" -#endif - -#include "../source/pdf/pdf-lex.c" -#include "../source/pdf/pdf-cmap.c" -#include "../source/pdf/pdf-cmap-parse.c" - struct cidrange { unsigned int lo, hi, v; }; @@ -74,7 +48,7 @@ main(int argc, char **argv) fi = fz_open_file(ctx, argv[1]); cmap = pdf_load_cmap(ctx, fi); - fz_close(fi); + fz_drop_stream(ctx, fi); printf("begincmap\n"); printf("/CMapName /%s def\n", cmap->cmap_name); @@ -102,7 +76,7 @@ main(int argc, char **argv) } n = cmap->rlen + cmap->xlen; - r = malloc(n * sizeof *r); + r = fz_malloc(ctx, n * sizeof *r); i = 0; for (k = 0; k < cmap->rlen; k++) { @@ -135,6 +109,7 @@ main(int argc, char **argv) printf("endcidchar\n"); } +#if 0 if (cmap->mlen > 0) { printf("beginbfchar\n"); @@ -149,113 +124,10 @@ main(int argc, char **argv) } printf("endbfchar\n"); } +#endif printf("endcmap\n"); - fz_free_context(ctx); - return 0; -} - -void fz_new_font_context(fz_context *ctx) -{ -} - -void fz_drop_font_context(fz_context *ctx) -{ -} - -fz_font_context *fz_keep_font_context(fz_context *ctx) -{ - return NULL; -} - -void fz_new_colorspace_context(fz_context *ctx) -{ -} - -void fz_drop_colorspace_context(fz_context *ctx) -{ -} - -fz_colorspace_context *fz_keep_colorspace_context(fz_context *ctx) -{ - return NULL; -} - -void fz_new_aa_context(fz_context *ctx) -{ -} - -void fz_drop_aa_context(fz_context *ctx) -{ -} - -void fz_copy_aa_context(fz_context *dst, fz_context *src) -{ -} - -void *fz_keep_storable(fz_context *ctx, const fz_storable *sc) -{ - fz_storable *s = (fz_storable *)sc; - 
return fz_keep_imp(ctx, s, &s->refs); -} - -void fz_drop_storable(fz_context *ctx, const fz_storable *sc) -{ - fz_storable *s = (fz_storable *)sc; - if (fz_drop_imp(ctx, s, &s->refs)) - s->drop(ctx, s); -} - -void fz_new_store_context(fz_context *ctx, size_t max) -{ -} - -void fz_drop_store_context(fz_context *ctx) -{ -} - -fz_store *fz_keep_store_context(fz_context *ctx) -{ - return NULL; -} - -int fz_store_scavenge(fz_context *ctx, size_t size, int *phase) -{ - return 0; -} - -void fz_new_glyph_cache_context(fz_context *ctx) -{ -} - -void fz_drop_glyph_cache_context(fz_context *ctx) -{ -} - -fz_glyph_cache *fz_keep_glyph_cache(fz_context *ctx) -{ - return NULL; -} - -void fz_new_document_handler_context(fz_context *ctx) -{ -} - -void fz_drop_document_handler_context(fz_context *ctx) -{ -} - -fz_document_handler_context *fz_keep_document_handler_context(fz_context *ctx) -{ - return NULL; -} - -void fz_default_image_decode(void *arg, int w, int h, int l2factor, fz_irect *irect) -{ -} - -int fz_default_image_scale(void *arg, int w, int h, int src_w, int src_h) -{ + fz_drop_context(ctx); return 0; } diff --git a/scripts/cmapcleanz.c b/scripts/cmapcleanz.c index 07373f61..dee04044 100644 --- a/scripts/cmapcleanz.c +++ b/scripts/cmapcleanz.c @@ -1,36 +1,10 @@ /* cmapclean.c -- parse a CMap file and write it back out */ -/* We never want to build memento versions of the cmapdump util */ -#undef MEMENTO - -/* We never want large file access here */ -#undef FZ_LARGEFILE - #include #include #include "mupdf/pdf.h" -#include "../source/fitz/context.c" -#include "../source/fitz/error.c" -#include "../source/fitz/memory.c" -#include "../source/fitz/output.c" -#include "../source/fitz/string.c" -#include "../source/fitz/buffer.c" -#include "../source/fitz/stream-open.c" -#include "../source/fitz/stream-read.c" -#include "../source/fitz/strtod.c" -#include "../source/fitz/strtof.c" -#include "../source/fitz/ftoa.c" -#include "../source/fitz/printf.c" -#ifdef _WIN32 -#include 
"../source/fitz/time.c" -#endif - -#include "../source/pdf/pdf-lex.c" -#include "../source/pdf/pdf-cmap.c" -#include "../source/pdf/pdf-cmap-parse.c" - void pc(unsigned int c) { if (c <= 0xff) printf("<%02x>", c); @@ -63,7 +37,7 @@ main(int argc, char **argv) fi = fz_open_file(ctx, argv[1]); cmap = pdf_load_cmap(ctx, fi); - fz_close(fi); + fz_drop_stream(ctx, fi); printf("begincmap\n"); printf("/CMapName /%s def\n", cmap->cmap_name); @@ -105,7 +79,7 @@ main(int argc, char **argv) for (k = 0; k < cmap->rlen; k++) { if (cmap->ranges[k].high - cmap->ranges[k].low == 0) { pc(cmap->ranges[k].low); - printf("%u\n", cmap->ranges[k].out); + printf(" %u\n", cmap->ranges[k].out); } } printf("endcidchar\n"); @@ -117,8 +91,9 @@ main(int argc, char **argv) for (k = 0; k < cmap->rlen; k++) { if (cmap->ranges[k].high - cmap->ranges[k].low > 0) { pc(cmap->ranges[k].low); + putchar(' '); pc(cmap->ranges[k].high); - printf("%u\n", cmap->ranges[k].out); + printf(" %u\n", cmap->ranges[k].out); } } printf("endcidrange\n"); @@ -160,6 +135,7 @@ main(int argc, char **argv) /* 1-to-many */ +#if 0 if (cmap->mlen > 0) { printf("beginbfchar\n"); @@ -173,113 +149,10 @@ main(int argc, char **argv) } printf("endbfchar\n"); } +#endif printf("endcmap\n"); - fz_free_context(ctx); - return 0; -} - -void fz_new_font_context(fz_context *ctx) -{ -} - -void fz_drop_font_context(fz_context *ctx) -{ -} - -fz_font_context *fz_keep_font_context(fz_context *ctx) -{ - return NULL; -} - -void fz_new_colorspace_context(fz_context *ctx) -{ -} - -void fz_drop_colorspace_context(fz_context *ctx) -{ -} - -fz_colorspace_context *fz_keep_colorspace_context(fz_context *ctx) -{ - return NULL; -} - -void fz_new_aa_context(fz_context *ctx) -{ -} - -void fz_drop_aa_context(fz_context *ctx) -{ -} - -void fz_copy_aa_context(fz_context *dst, fz_context *src) -{ -} - -void *fz_keep_storable(fz_context *ctx, const fz_storable *sc) -{ - fz_storable *s = (fz_storable *)sc; - return fz_keep_imp(ctx, s, &s->refs); -} - -void 
fz_drop_storable(fz_context *ctx, const fz_storable *sc) -{ - fz_storable *s = (fz_storable *)sc; - if (fz_drop_imp(ctx, s, &s->refs)) - s->drop(ctx, s); -} - -void fz_new_store_context(fz_context *ctx, size_t max) -{ -} - -void fz_drop_store_context(fz_context *ctx) -{ -} - -fz_store *fz_keep_store_context(fz_context *ctx) -{ - return NULL; -} - -int fz_store_scavenge(fz_context *ctx, size_t size, int *phase) -{ - return 0; -} - -void fz_new_glyph_cache_context(fz_context *ctx) -{ -} - -void fz_drop_glyph_cache_context(fz_context *ctx) -{ -} - -fz_glyph_cache *fz_keep_glyph_cache(fz_context *ctx) -{ - return NULL; -} - -void fz_new_document_handler_context(fz_context *ctx) -{ -} - -void fz_drop_document_handler_context(fz_context *ctx) -{ -} - -fz_document_handler_context *fz_keep_document_handler_context(fz_context *ctx) -{ - return NULL; -} - -void fz_default_image_decode(void *arg, int w, int h, int l2factor, fz_irect *irect) -{ -} - -int fz_default_image_scale(void *arg, int w, int h, int src_w, int src_h) -{ + fz_drop_context(ctx); return 0; } diff --git a/scripts/cmapdump.py b/scripts/cmapdump.py new file mode 100644 index 00000000..d92da4e8 --- /dev/null +++ b/scripts/cmapdump.py @@ -0,0 +1,218 @@ +# Parse a CMap file and dump it as a C struct. + +import sys + +# Decode a subset of CMap syntax (only what is needed for our built-in resources) +# We require that tokens are whitespace separated. 
+ +def dumpcmap(filename): + codespacerange = [] + usecmap = "" + cmapname = "" + wmode = 0 + + map = {} + + def tocode(s): + if s[0] == '<' and s[-1] == '>': + return int(s[1:-1], 16) + return int(s, 10) + + def map_cidchar(lo, v): + map[lo] = v + + def map_cidrange(lo, hi, v): + while lo <= hi: + map[lo] = v + lo = lo + 1 + v = v + 1 + + def add_bf(lo, v): + # Decode unicode surrogate pairs + if len(v) == 2 and v[0] >= 0xd800 and v[0] <= 0xdbff and v[1] >= 0xdc00 and v[1] <= 0xdfff: + map[lo] = ((v[0] - 0xd800) << 10) + (v[1] - 0xdc00) + 0x10000 + elif len(v) == 1: + map[lo] = v[0] + elif len(v) <= 8: + map[lo] = v[:] + else: + print "/* warning: too long one-to-many mapping: %s */" % (v) + + def map_bfchar(lo, bf): + bf = bf[1:-1] # drop < > + v = [int(bf[i:i+4],16) for i in xrange(0, len(bf), 4)] + add_bf(lo, v) + + def map_bfrange(lo, hi, bf): + bf = bf[1:-1] # drop < > + v = [int(bf[i:i+4],16) for i in xrange(0, len(bf), 4)] + while lo <= hi: + add_bf(lo, v) + lo = lo + 1 + v[-1] = v[-1] + 1 + + current = None + for line in open(filename, "r").readlines(): + if line[0] == '%': + continue + line = line.strip().split() + if len(line) == 0: + continue + if line[0] == '/CMapName': + cmapname = line[1][1:] + elif line[0] == '/WMode': + wmode = int(line[1]) + elif len(line) > 1 and line[1] == 'usecmap': + usecmap = line[0][1:] + elif len(line) > 1 and line[1] == 'begincodespacerange': current = 'codespacerange' + elif len(line) > 1 and line[1] == 'begincidrange': current = 'cidrange' + elif len(line) > 1 and line[1] == 'beginbfrange': current = 'bfrange' + elif len(line) > 1 and line[1] == 'begincidchar': current = 'cidchar' + elif len(line) > 1 and line[1] == 'beginbfchar': current = 'bfchar' + elif line[0] == 'begincodespacerange': current = 'codespacerange' + elif line[0] == 'begincidrange': current = 'cidrange' + elif line[0] == 'beginbfrange': current = 'bfrange' + elif line[0] == 'begincidchar': current = 'cidchar' + elif line[0] == 'beginbfchar': current = 
'bfchar' + elif line[0].startswith("end"): + current = None + elif current == 'codespacerange' and len(line) == 2: + n, a, b = (len(line[0])-2)/2, tocode(line[0]), tocode(line[1]) + codespacerange.append((n, a, b)) + elif current == 'cidrange' and len(line) == 3: + a, b, c = tocode(line[0]), tocode(line[1]), tocode(line[2]) + map_cidrange(a, b, c) + elif current == 'cidchar' and len(line) == 2: + a, b = tocode(line[0]), tocode(line[1]) + map_cidchar(a, b) + elif current == 'bfchar' and len(line) == 2: + a, b = tocode(line[0]), line[1] + map_bfchar(a, b) + elif current == 'bfrange' and len(line) == 3: + a, b, c = tocode(line[0]), tocode(line[1]), line[2] + map_bfrange(a, b, c) + + # Create ranges + + ranges = [] + xranges = [] + mranges = [] + mdata = [] + + out_lo = -100 + out_hi = -100 + out_v_lo = 0 + out_v_hi = 0 + + def flush_range(): + if out_lo >= 0: + if out_lo > 0xffff or out_hi > 0xffff or out_v_lo > 0xffff: + xranges.append((out_lo, out_hi, out_v_lo)) + else: + ranges.append((out_lo, out_hi, out_v_lo)) + + keys = map.keys() + keys.sort() + for code in keys: + v = map[code] + if type(v) is not int: + flush_range() + out_lo = out_hi = -100 + mranges.append((code, len(mdata))) + mdata.append(len(v)) + mdata.extend(v) + else: + if code != out_hi + 1 or v != out_v_hi + 1: + flush_range() + out_lo = out_hi = code + out_v_lo = out_v_hi = v + else: + out_hi = out_hi + 1 + out_v_hi = out_v_hi + 1 + flush_range() + + # Print C file + + cname = cmapname.replace('-', '_') + + print + print "/*", cmapname, "*/" + print + + if len(ranges) > 0: + print "static const pdf_range cmap_%s_ranges[] = {" % cname + for r in ranges: + print "{%d,%d,%d}," % r + print "};" + print + if len(xranges) > 0: + print "static const pdf_xrange cmap_%s_xranges[] = {" % cname + for r in xranges: + print "{%d,%d,%d}," % r + print "};" + print + if len(mranges) > 0: + print "static const pdf_mrange cmap_%s_mranges[] = {" % cname + for r in mranges: + print "{%d,%d}," % r + print "};" + print 
+ print "static const int cmap_%s_table[] = {" % cname + n = mdata[0] + i = 0 + for r in mdata: + if i <= n: + sys.stdout.write("%d," % r) + i = i + 1 + else: + sys.stdout.write("\n%d," % r) + i = 1 + n = r + sys.stdout.write("\n") + print "};" + print + + print "pdf_cmap pdf_cmap_%s = {" % cname + print "\t{ -1, pdf_drop_cmap_imp }," + print "\t/* cmapname */ \"%s\"," % cmapname + print "\t/* usecmap */ \"%s\", NULL," % usecmap + print "\t/* wmode */ %d," % wmode + print "\t/* codespaces */ %d, {" % len(codespacerange) + if len(codespacerange) > 0: + for codespace in codespacerange: + fmt = "\t\t{ %%d, 0x%%0%dx, 0x%%0%dx }," % (codespace[0]*2, codespace[0]*2) + print fmt % codespace + else: + print "\t\t{ 0, 0, 0 }," + print "\t}," + + if len(ranges) > 0: + print "\t%d, %d, (pdf_range*)cmap_%s_ranges," % (len(ranges),len(ranges),cname) + else: + print "\t0, 0, NULL, /* ranges */" + + if len(xranges) > 0: + print "\t%d, %d, (pdf_xrange*)cmap_%s_xranges," % (len(xranges),len(xranges),cname) + else: + print "\t0, 0, NULL, /* xranges */" + + if len(mranges) > 0: + print "\t%d, %d, (pdf_mrange*)cmap_%s_mranges," % (len(mranges),len(mranges),cname) + else: + print "\t0, 0, NULL, /* mranges */" + + if len(mdata) > 0: + print "\t%d, %d, (int*)cmap_%s_table," % (len(mdata),len(mdata),cname) + else: + print "\t0, 0, NULL, /* table */" + + print "\t0, 0, 0, NULL /* splay tree */" + print "};" + +print "/* This is an automatically generated file. Do not edit. */" +print +print '#include "mupdf/fitz.h"' +print '#include "mupdf/pdf.h"' + +for arg in sys.argv[1:]: + dumpcmap(arg) diff --git a/scripts/cmapflatten.py b/scripts/cmapflatten.py new file mode 100644 index 00000000..8bb2193a --- /dev/null +++ b/scripts/cmapflatten.py @@ -0,0 +1,108 @@ +# Parse a Uni* CMap file and flatten it. +# +# The Uni* CMap files only have 'cidchar' and 'cidrange' sections, never +# 'bfchar' or 'bfrange'. 
+ +import sys + +def flattencmap(filename): + codespacerange = [] + usecmap = "" + cmapname = "" + cmapversion = "1.0" + csi_registry = "(Adobe)" + csi_ordering = "(Unknown)" + csi_supplement = 1 + wmode = 0 + + map = {} + + def tocode(s): + if s[0] == '<' and s[-1] == '>': + return int(s[1:-1], 16) + return int(s, 10) + + def map_cidchar(lo, v): + map[lo] = v + + def map_cidrange(lo, hi, v): + while lo <= hi: + map[lo] = v + lo = lo + 1 + v = v + 1 + + current = None + for line in open(filename, "r").readlines(): + if line[0] == '%': + continue + line = line.strip().split() + if len(line) == 0: + continue + if line[0] == '/CMapVersion': cmapversion = line[1] + elif line[0] == '/CMapName': cmapname = line[1][1:] + elif line[0] == '/WMode': wmode = int(line[1]) + elif line[0] == '/Registry': csi_registry = line[1] + elif line[0] == '/Ordering': csi_ordering = line[1] + elif line[0] == '/Supplement': csi_supplement = line[1] + elif len(line) > 1 and line[1] == 'usecmap': usecmap = line[0][1:] + elif len(line) > 1 and line[1] == 'begincodespacerange': current = 'codespacerange' + elif len(line) > 1 and line[1] == 'begincidrange': current = 'cidrange' + elif len(line) > 1 and line[1] == 'begincidchar': current = 'cidchar' + elif line[0].startswith("end"): + current = None + elif current == 'codespacerange' and len(line) == 2: + n, a, b = (len(line[0])-2)/2, tocode(line[0]), tocode(line[1]) + codespacerange.append((n, a, b)) + elif current == 'cidrange' and len(line) == 3: + a, b, c = tocode(line[0]), tocode(line[1]), tocode(line[2]) + map_cidrange(a, b, c) + elif current == 'cidchar' and len(line) == 2: + a, b = tocode(line[0]), tocode(line[1]) + map_cidchar(a, b) + + # Print flattened CMap file + + print "%!PS-Adobe-3.0 Resource-CMap" + print "%%DocumentNeededResources: procset (CIDInit)" + print "%%IncludeResource: procset (CIDInit)" + print "%%%%BeginResource: CMap (%s)" % cmapname + print "%%%%Version: %s" % cmapversion + print "%%EndComments" + print "/CIDInit 
/ProcSet findresource begin" + print "12 dict begin" + print "begincmap" + if usecmap: print "/%s usecmap" % usecmap + print "/CIDSystemInfo 3 dict dup begin" + print " /Registry %s def" % csi_registry + print " /Ordering %s def" % csi_ordering + print " /Supplement %s def" % csi_supplement + print "end def" + print "/CMapName /%s def" % cmapname + print "/CMapVersion %s def" % cmapversion + print "/CMapType 1 def" + print "/WMode %d def" % wmode + + if len(codespacerange): + print "%d begincodespacerange" % len(codespacerange) + for r in codespacerange: + fmt = "<%%0%dx> <%%0%dx>" % (r[0]*2, r[0]*2) + print fmt % (r[1], r[2]) + print "endcodespacerange" + + keys = map.keys() + keys.sort() + print "%d begincidchar" % len(keys) + for code in keys: + v = map[code] + print "<%04x> %d" % (code, v) + print "endcidchar" + + print "endcmap" + print "CMapName currentdict /CMap defineresource pop" + print "end" + print "end" + print "%%EndResource" + print "%%EOF" + +for arg in sys.argv[1:]: + flattencmap(arg) diff --git a/scripts/cmapshare.py b/scripts/cmapshare.py new file mode 100644 index 00000000..f5e62d4e --- /dev/null +++ b/scripts/cmapshare.py @@ -0,0 +1,57 @@ +# Find and extract common CMap subsets. +# Takes flattened CMaps as input, using only the 'cidchar' sections. +# The outputs are truncated; so use 'cmapflatten.py' to clean them up. 
+ +import sys, os + +def load_cmap_set(filename): + cmap = set() + active = False + for line in open(filename).readlines(): + line = line.strip() + if line.endswith("endcidchar"): active = False + if active: cmap.add(line) + if line.endswith("begincidchar"): active = True + return cmap + +def load_cmap_prologue(filename): + prologue = [] + for line in open(filename).readlines(): + line = line.strip() + if line.endswith("begincidchar"): + break + prologue.append(line) + return prologue + +epilogue = [ + 'endcidchar', +] + +common_name = os.path.basename(sys.argv[1]) + +# First find the common subset +common = load_cmap_set(sys.argv[2]) +for f in sys.argv[3:]: + common &= load_cmap_set(f) + +def print_cmap(filename, prologue, cmap): + out = open(filename, "w") + for line in prologue: + if not line.endswith("usecmap"): + print >>out, line + if line == 'begincmap': + print >>out, "/"+common_name, "usecmap" + print >>out, len(cmap), "begincidchar" + for line in sorted(cmap): + print >>out, line + for line in epilogue: + print >>out, line + +# Print common subset +print_cmap(sys.argv[1], ["/CMapName /%s" % common_name], common) + +# Now find unique bits +for f in sys.argv[2:]: + cmap = load_cmap_set(f) - common + prologue = load_cmap_prologue(f) + print_cmap(f+".shared", prologue, cmap) diff --git a/scripts/runcmapshare.sh b/scripts/runcmapshare.sh new file mode 100644 index 00000000..7a167c1e --- /dev/null +++ b/scripts/runcmapshare.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# This script expects to find the original CMap resources in thirdparty/cmap-resources. 
+# + +rm -f build/cmaps/* +mkdir -p build/cmaps + +function flatten { + for DIR in $(echo thirdparty/cmap-resources/Adobe-*) + do + if [ -f $DIR/CMap/$1 ] + then + echo $DIR/CMap/$1 + python scripts/cmapflatten.py $DIR/CMap/$1 > build/cmaps/$1 + fi + done +} + +flatten GBK-EUC-H +flatten GBK2K-H +flatten GBKp-EUC-H +flatten UniCNS-UCS2-H +flatten UniCNS-UTF16-H +flatten UniGB-UCS2-H +flatten UniGB-UTF16-H +flatten UniJIS-UCS2-H +flatten UniJIS-UTF16-H +flatten UniKS-UCS2-H +flatten UniKS-UTF16-H + +python scripts/cmapshare.py build/cmaps/GBK-X build/cmaps/GB*-H +python scripts/cmapshare.py build/cmaps/UniCNS-X build/cmaps/UniCNS-*-H +python scripts/cmapshare.py build/cmaps/UniGB-X build/cmaps/UniGB-*-H +python scripts/cmapshare.py build/cmaps/UniJIS-X build/cmaps/UniJIS-*-H +python scripts/cmapshare.py build/cmaps/UniKS-X build/cmaps/UniKS-*-H + +for F in build/cmaps/*-X +do + B=$(basename $F) + python scripts/cmapclean.py $F > resources/cmaps/$B +done + +for F in build/cmaps/*.shared +do + B=$(basename $F .shared) + python scripts/cmapclean.py $F > resources/cmaps/$B +done -- cgit v1.2.3