diff options
Diffstat (limited to 'scripts/cmapdump.py')
-rw-r--r-- | scripts/cmapdump.py | 218 |
1 files changed, 218 insertions, 0 deletions
# Parse a CMap file and dump it as a C struct.
#
# Usage: python cmapdump.py CMAPFILE... > output.c
# Runs unchanged under Python 2 and Python 3.

import sys

# Decode a subset of CMap syntax (only what is needed for our built-in resources)
# We require that tokens are whitespace separated.

# Keyword that opens a section -> name of the section whose entries follow.
_SECTION_KEYWORDS = {
    'begincodespacerange': 'codespacerange',
    'begincidrange': 'cidrange',
    'beginbfrange': 'bfrange',
    'begincidchar': 'cidchar',
    'beginbfchar': 'bfchar',
}

# Longest one-to-many mapping that is kept; longer ones are reported and dropped.
_MAX_ONE_TO_MANY = 8


def _tocode(token):
    """Return the integer value of a ``<hex>`` token or a plain decimal token."""
    if token[0] == '<' and token[-1] == '>':
        return int(token[1:-1], 16)
    return int(token, 10)


def _parse_bf(token):
    """Split a ``<hhhh...>`` token into a list of integers, 4 hex digits each."""
    digits = token[1:-1]  # drop < >
    return [int(digits[i:i + 4], 16) for i in range(0, len(digits), 4)]


def _add_bf(mapping, lo, units, out):
    """Store the mapping of code ``lo`` to the value list ``units``.

    A surrogate pair is folded into one code point, a single value is stored
    as an int, and up to ``_MAX_ONE_TO_MANY`` values are stored as a copied
    list.  Anything longer is not stored; a C comment warning is written to
    ``out`` instead.
    """
    if (len(units) == 2
            and 0xd800 <= units[0] <= 0xdbff
            and 0xdc00 <= units[1] <= 0xdfff):
        # Decode unicode surrogate pairs
        mapping[lo] = ((units[0] - 0xd800) << 10) + (units[1] - 0xdc00) + 0x10000
    elif len(units) == 1:
        mapping[lo] = units[0]
    elif len(units) <= _MAX_ONE_TO_MANY:
        # Copy: the caller keeps mutating ``units`` while walking a bfrange.
        mapping[lo] = units[:]
    else:
        out.write("/* warning: too long one-to-many mapping: %s */\n" % (units,))


def _parse_cmap(filename, out):
    """Read ``filename`` and return its decoded contents.

    Returns a tuple ``(cmapname, usecmap, wmode, codespacerange, mapping)``
    where ``codespacerange`` is a list of ``(nbytes, lo, hi)`` tuples and
    ``mapping`` maps each input code to either an int or a list of ints
    (one-to-many mapping).  Warnings are written to ``out`` as they occur.
    """
    cmapname = ""
    usecmap = ""
    wmode = 0
    codespacerange = []
    mapping = {}
    current = None  # section currently being read, or None outside sections

    # The context manager closes the file even if a line fails to parse.
    with open(filename, "r") as stream:
        for raw in stream:
            if raw[0] == '%':
                continue
            tokens = raw.strip().split()
            if not tokens:
                continue
            if tokens[0] == '/CMapName':
                cmapname = tokens[1][1:]  # drop the leading '/'
            elif tokens[0] == '/WMode':
                wmode = int(tokens[1])
            elif len(tokens) > 1 and tokens[1] == 'usecmap':
                usecmap = tokens[0][1:]  # drop the leading '/'
            elif len(tokens) > 1 and tokens[1] in _SECTION_KEYWORDS:
                # "<count> beginXXX" form
                current = _SECTION_KEYWORDS[tokens[1]]
            elif tokens[0] in _SECTION_KEYWORDS:
                # bare "beginXXX" form
                current = _SECTION_KEYWORDS[tokens[0]]
            elif tokens[0].startswith("end"):
                current = None
            elif current == 'codespacerange' and len(tokens) == 2:
                # Two hex digits per byte, minus the enclosing < >.
                nbytes = (len(tokens[0]) - 2) // 2
                codespacerange.append(
                    (nbytes, _tocode(tokens[0]), _tocode(tokens[1])))
            elif current == 'cidrange' and len(tokens) == 3:
                lo, hi, cid = [_tocode(t) for t in tokens]
                while lo <= hi:
                    mapping[lo] = cid
                    lo += 1
                    cid += 1
            elif current == 'cidchar' and len(tokens) == 2:
                mapping[_tocode(tokens[0])] = _tocode(tokens[1])
            elif current == 'bfchar' and len(tokens) == 2:
                _add_bf(mapping, _tocode(tokens[0]), _parse_bf(tokens[1]), out)
            elif current == 'bfrange' and len(tokens) == 3:
                lo, hi = _tocode(tokens[0]), _tocode(tokens[1])
                units = _parse_bf(tokens[2])
                while lo <= hi:
                    _add_bf(mapping, lo, units, out)
                    lo += 1
                    units[-1] += 1  # only the last value advances

    return cmapname, usecmap, wmode, codespacerange, mapping


def _build_ranges(mapping):
    """Compress ``mapping`` into the tables that get emitted.

    Returns ``(ranges, xranges, mranges, mdata)``:

    * ``ranges`` / ``xranges`` hold ``(lo, hi, first_value)`` runs of
      consecutive codes mapping to consecutive values; a run goes into
      ``xranges`` when ``lo``, ``hi`` or ``first_value`` exceeds 0xffff.
    * ``mranges`` holds ``(code, offset)`` for one-to-many mappings, where
      ``offset`` indexes ``mdata``.
    * ``mdata`` is a flat list: for each one-to-many mapping, its length
      followed by its values.
    """
    ranges, xranges, mranges, mdata = [], [], [], []

    def flush(lo, hi, v_lo):
        # A negative ``lo`` means there is no open run to record.
        if lo >= 0:
            if lo > 0xffff or hi > 0xffff or v_lo > 0xffff:
                xranges.append((lo, hi, v_lo))
            else:
                ranges.append((lo, hi, v_lo))

    lo = hi = -100
    v_lo = v_hi = 0
    for code in sorted(mapping):
        value = mapping[code]
        if isinstance(value, list):
            # One-to-many mapping: it terminates any run in progress.
            flush(lo, hi, v_lo)
            lo = hi = -100
            mranges.append((code, len(mdata)))
            mdata.append(len(value))
            mdata.extend(value)
        elif code != hi + 1 or value != v_hi + 1:
            # Not contiguous with the open run: close it and start a new one.
            flush(lo, hi, v_lo)
            lo = hi = code
            v_lo = v_hi = value
        else:
            hi += 1
            v_hi += 1
    flush(lo, hi, v_lo)

    return ranges, xranges, mranges, mdata


def dumpcmap(filename, out=None):
    """Parse the CMap file ``filename`` and write it out as C source.

    Emits the static range/table arrays that are non-empty, followed by a
    ``pdf_cmap pdf_cmap_<name>`` initializer.  ``<name>`` is the CMap name
    with ``-`` replaced by ``_``.

    ``out`` is any object with a ``write`` method; it defaults to
    ``sys.stdout`` (looked up at call time).
    """
    if out is None:
        out = sys.stdout

    def emit(text=""):
        out.write(text + "\n")

    cmapname, usecmap, wmode, codespacerange, mapping = _parse_cmap(filename, out)
    ranges, xranges, mranges, mdata = _build_ranges(mapping)

    # Print C file

    cname = cmapname.replace('-', '_')

    emit()
    emit("/* %s */" % cmapname)
    emit()

    if ranges:
        emit("static const pdf_range cmap_%s_ranges[] = {" % cname)
        for entry in ranges:
            emit("{%d,%d,%d}," % entry)
        emit("};")
        emit()
    if xranges:
        emit("static const pdf_xrange cmap_%s_xranges[] = {" % cname)
        for entry in xranges:
            emit("{%d,%d,%d}," % entry)
        emit("};")
        emit()
    if mranges:
        emit("static const pdf_mrange cmap_%s_mranges[] = {" % cname)
        for entry in mranges:
            emit("{%d,%d}," % entry)
        emit("};")
        emit()
        emit("static const int cmap_%s_table[] = {" % cname)
        # One text row per one-to-many mapping: its length, then its values.
        for _, offset in mranges:
            row = mdata[offset:offset + 1 + mdata[offset]]
            emit("".join("%d," % item for item in row))
        emit("};")
        emit()

    emit("pdf_cmap pdf_cmap_%s = {" % cname)
    emit("\t{ -1, pdf_drop_cmap_imp },")
    emit("\t/* cmapname */ \"%s\"," % cmapname)
    emit("\t/* usecmap */ \"%s\", NULL," % usecmap)
    emit("\t/* wmode */ %d," % wmode)
    emit("\t/* codespaces */ %d, {" % len(codespacerange))
    if codespacerange:
        for codespace in codespacerange:
            # Zero-pad the bounds to two hex digits per codespace byte.
            fmt = "\t\t{ %%d, 0x%%0%dx, 0x%%0%dx }," % (codespace[0] * 2, codespace[0] * 2)
            emit(fmt % codespace)
    else:
        emit("\t\t{ 0, 0, 0 },")
    emit("\t},")

    if ranges:
        emit("\t%d, %d, (pdf_range*)cmap_%s_ranges," % (len(ranges), len(ranges), cname))
    else:
        emit("\t0, 0, NULL, /* ranges */")

    if xranges:
        emit("\t%d, %d, (pdf_xrange*)cmap_%s_xranges," % (len(xranges), len(xranges), cname))
    else:
        emit("\t0, 0, NULL, /* xranges */")

    if mranges:
        emit("\t%d, %d, (pdf_mrange*)cmap_%s_mranges," % (len(mranges), len(mranges), cname))
    else:
        emit("\t0, 0, NULL, /* mranges */")

    if mdata:
        emit("\t%d, %d, (int*)cmap_%s_table," % (len(mdata), len(mdata), cname))
    else:
        emit("\t0, 0, NULL, /* table */")

    emit("\t0, 0, 0, NULL /* splay tree */")
    emit("};")


def main(argv=None):
    """Write the generated-file header, then dump every CMap file in ``argv``.

    ``argv`` defaults to the command-line arguments after the script name.
    """
    if argv is None:
        argv = sys.argv[1:]

    out = sys.stdout
    out.write("/* This is an automatically generated file. Do not edit. */\n")
    out.write("\n")
    out.write('#include "mupdf/fitz.h"\n')
    out.write('#include "mupdf/pdf.h"\n')

    for arg in argv:
        dumpcmap(arg, out)


if __name__ == "__main__":
    main()