summaryrefslogtreecommitdiff
path: root/scripts/cmapshare.py
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2018-04-04 16:36:21 +0200
committerTor Andersson <tor.andersson@artifex.com>2018-04-25 12:26:32 +0200
commita6d083bb776ecd498e57450ef84c20e39ae604cf (patch)
treea1f4399f011eb2b59e21f0588d322690a6ab14f4 /scripts/cmapshare.py
parent84cf672da90dfdaa2dfd2742cc69fa0bad268081 (diff)
downloadmupdf-a6d083bb776ecd498e57450ef84c20e39ae604cf.tar.xz
Add CMap processing scripts, and turn cmapdump into mutool.
A dumping script written in python. A flattening script written in python (for easier editing). A subsetting script written in shell to minimize CMaps by reusing subsets. Use 'mutool cmapdump' to bootstrap or verify cmap dumps.
Diffstat (limited to 'scripts/cmapshare.py')
-rw-r--r--scripts/cmapshare.py57
1 files changed, 57 insertions, 0 deletions
diff --git a/scripts/cmapshare.py b/scripts/cmapshare.py
new file mode 100644
index 00000000..f5e62d4e
--- /dev/null
+++ b/scripts/cmapshare.py
@@ -0,0 +1,57 @@
+# Find and extract common CMap subsets.
+# Takes flattened CMaps as input, using only the 'cidchar' sections.
+# The outputs are truncated, so use 'cmapflatten.py' to clean them up.
+
+import sys, os
+
def load_cmap_set(filename):
    """Return the set of 'cidchar' entry lines found in a CMap file.

    Each line is stripped of surrounding whitespace. Lines lying between a
    line that ends with "begincidchar" and the next line that ends with
    "endcidchar" are collected; the two delimiter lines are not.

    filename -- path of the CMap file to read.
    Returns a set of stripped strings (empty if no section is found).
    """
    cmap = set()
    active = False
    # Use a context manager so the handle is closed deterministically, and
    # iterate the file lazily instead of materialising it with readlines().
    with open(filename) as f:
        for line in f:
            line = line.strip()
            # Check the end marker first so it is never recorded as an entry.
            if line.endswith("endcidchar"):
                active = False
            if active:
                cmap.add(line)
            # Check the begin marker last so it is not recorded either.
            if line.endswith("begincidchar"):
                active = True
    return cmap
+
def load_cmap_prologue(filename):
    """Return the lines of a CMap file that precede its first cidchar section.

    Lines are stripped of surrounding whitespace and returned in file order.
    Reading stops at the first line that ends with "begincidchar"; that line
    is not included. If no such line exists, every line is returned.

    filename -- path of the CMap file to read.
    Returns a list of stripped strings.
    """
    prologue = []
    # Use a context manager so the handle is closed even though the loop
    # usually exits early via break.
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line.endswith("begincidchar"):
                break
            prologue.append(line)
    return prologue
+
# Trailer lines written after the cidchar entries of every generated CMap.
epilogue = ['endcidchar']

# The shared CMap is named after the output path given as the first argument.
common_name = os.path.basename(sys.argv[1])

# First find the common subset: start from the entries of the first input
# CMap and keep only those that also occur in each remaining input.
common = load_cmap_set(sys.argv[2])
for f in sys.argv[3:]:
    common = common & load_cmap_set(f)
+
def print_cmap(filename, prologue, cmap):
    """Write a CMap consisting of a prologue and a single cidchar section.

    filename -- path of the file to create (overwritten if it exists).
    prologue -- iterable of header lines; any line ending with "usecmap" is
                dropped, and a "/<common_name> usecmap" line is inserted
                directly after a line equal to 'begincmap'.
    cmap     -- collection of cidchar entry lines; written in sorted order,
                preceded by "<count> begincidchar".

    Reads the module-level names 'common_name' and 'epilogue'; the lines of
    'epilogue' are written after the entries.
    """
    # The context manager guarantees the output is flushed and closed.
    # out.write() is used instead of the Python-2-only 'print >>out' form so
    # the script produces the same bytes under both Python 2 and Python 3.
    with open(filename, "w") as out:
        for line in prologue:
            if not line.endswith("usecmap"):
                out.write(line + "\n")
            if line == 'begincmap':
                out.write("/%s usecmap\n" % common_name)
        out.write("%d begincidchar\n" % len(cmap))
        for line in sorted(cmap):
            out.write(line + "\n")
        for line in epilogue:
            out.write(line + "\n")
# Print common subset: its only prologue line is the CMapName declaration.
shared_header = ["/CMapName /%s" % common_name]
print_cmap(sys.argv[1], shared_header, common)

# Now find unique bits: for every input CMap, write "<input>.shared" holding
# the entries that are not part of the common subset.
for f in sys.argv[2:]:
    unique_entries = load_cmap_set(f) - common
    header_lines = load_cmap_prologue(f)
    print_cmap(f + ".shared", header_lines, unique_entries)