summaryrefslogtreecommitdiff
path: root/source
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2014-05-10 00:32:33 +0200
committerTor Andersson <tor.andersson@artifex.com>2014-05-10 00:32:33 +0200
commitb0bfcf253e40e3f7927df8646cea3f7039200045 (patch)
tree986bf52fd462de4fc468349c2df80a7511f13667 /source
parent50a667517ad74b40cd7fced57b2fa68cea3e8afa (diff)
downloadmupdf-b0bfcf253e40e3f7927df8646cea3f7039200045.tar.xz
Fix 694698: Support 32-bit values in CMaps.
Increasing the existing data structure to 32-bit values would bloat the data tables too much. Simplify the data structure and use three separate range tables for lookups -- one with small 16-bit to 16-bit range lookups, one with 32-bit range lookups, and a final one for one-to-many lookups. This loses the range-to-table optimization we had before, but even with the extra ranges this necessitates, the total size of the compiled binary CMap data is smaller than if we were to extend the previous scheme to 32 bits.
Diffstat (limited to 'source')
-rw-r--r--source/pdf/pdf-cmap-load.c5
-rw-r--r--source/pdf/pdf-cmap.c482
2 files changed, 175 insertions, 312 deletions
diff --git a/source/pdf/pdf-cmap-load.c b/source/pdf/pdf-cmap-load.c
index 08c4a80a..d3f17c35 100644
--- a/source/pdf/pdf-cmap-load.c
+++ b/source/pdf/pdf-cmap-load.c
@@ -8,7 +8,10 @@ pdf_cmap_size(fz_context *ctx, pdf_cmap *cmap)
if (cmap->storable.refs < 0)
return 0;
- return cmap->rcap * sizeof(pdf_range) + cmap->tcap * sizeof(short) + pdf_cmap_size(ctx, cmap->usecmap);
+ return pdf_cmap_size(ctx, cmap->usecmap) +
+ cmap->rcap * sizeof *cmap->ranges +
+ cmap->xcap * sizeof *cmap->xranges +
+ cmap->mcap * sizeof *cmap->mranges;
}
/*
diff --git a/source/pdf/pdf-cmap.c b/source/pdf/pdf-cmap.c
index 5bd8c7fb..db628f23 100644
--- a/source/pdf/pdf-cmap.c
+++ b/source/pdf/pdf-cmap.c
@@ -1,30 +1,5 @@
-/*
- * The CMap data structure here is constructed on the fly by
- * adding simple range-to-range mappings. Then the data structure
- * is optimized to contain both range-to-range and range-to-table
- * lookups.
- *
- * Any one-to-many mappings are inserted as one-to-table
- * lookups in the beginning, and are not affected by the optimization
- * stage.
- *
- * There is a special function to add a 256-length range-to-table mapping.
- * The ranges do not have to be added in order.
- *
- * This code can be a lot simpler if we don't care about wasting memory,
- * or can trust the parser to give us optimal mappings.
- */
-
#include "mupdf/pdf.h"
-/* Macros for accessing the combined extent_flags field */
-#define pdf_range_high(r) ((r)->low + ((r)->extent_flags >> 2))
-#define pdf_range_flags(r) ((r)->extent_flags & 3)
-#define pdf_range_set_high(r, h) \
- ((r)->extent_flags = (((r)->extent_flags & 3) | ((h - (r)->low) << 2)))
-#define pdf_range_set_flags(r, f) \
- ((r)->extent_flags = (((r)->extent_flags & ~3) | f))
-
/*
* Allocate, destroy and simple parameters.
*/
@@ -36,32 +11,16 @@ pdf_free_cmap_imp(fz_context *ctx, fz_storable *cmap_)
if (cmap->usecmap)
pdf_drop_cmap(ctx, cmap->usecmap);
fz_free(ctx, cmap->ranges);
- fz_free(ctx, cmap->table);
+ fz_free(ctx, cmap->xranges);
+ fz_free(ctx, cmap->mranges);
fz_free(ctx, cmap);
}
pdf_cmap *
pdf_new_cmap(fz_context *ctx)
{
- pdf_cmap *cmap;
-
- cmap = fz_malloc_struct(ctx, pdf_cmap);
+ pdf_cmap *cmap = fz_malloc_struct(ctx, pdf_cmap);
FZ_INIT_STORABLE(cmap, 1, pdf_free_cmap_imp);
-
- strcpy(cmap->cmap_name, "");
- strcpy(cmap->usecmap_name, "");
- cmap->usecmap = NULL;
- cmap->wmode = 0;
- cmap->codespace_len = 0;
-
- cmap->rlen = 0;
- cmap->rcap = 0;
- cmap->ranges = NULL;
-
- cmap->tlen = 0;
- cmap->tcap = 0;
- cmap->table = NULL;
-
return cmap;
}
@@ -108,55 +67,6 @@ pdf_set_cmap_wmode(fz_context *ctx, pdf_cmap *cmap, int wmode)
cmap->wmode = wmode;
}
-#ifndef NDEBUG
-void
-pdf_print_cmap(fz_context *ctx, pdf_cmap *cmap)
-{
- int i, k, n;
-
- printf("cmap $%p /%s {\n", (void *) cmap, cmap->cmap_name);
-
- if (cmap->usecmap_name[0])
- printf("\tusecmap /%s\n", cmap->usecmap_name);
- if (cmap->usecmap)
- printf("\tusecmap $%p\n", (void *) cmap->usecmap);
-
- printf("\twmode %d\n", cmap->wmode);
-
- printf("\tcodespaces {\n");
- for (i = 0; i < cmap->codespace_len; i++)
- {
- printf("\t\t<%x> <%x>\n", cmap->codespace[i].low, cmap->codespace[i].high);
- }
- printf("\t}\n");
-
- printf("\tranges (%d,%d) {\n", cmap->rlen, cmap->tlen);
- for (i = 0; i < cmap->rlen; i++)
- {
- pdf_range *r = &cmap->ranges[i];
- printf("\t\t<%04x> <%04x> ", r->low, pdf_range_high(r));
- if (pdf_range_flags(r) == PDF_CMAP_TABLE)
- {
- printf("[ ");
- for (k = 0; k < pdf_range_high(r) - r->low + 1; k++)
- printf("%d ", cmap->table[r->offset + k]);
- printf("]\n");
- }
- else if (pdf_range_flags(r) == PDF_CMAP_MULTI)
- {
- printf("< ");
- n = cmap->table[r->offset];
- for (k = 0; k < n; k++)
- printf("%04x ", cmap->table[r->offset + 1 + k]);
- printf(">\n");
- }
- else
- printf("%d\n", r->offset);
- }
- printf("\t}\n}\n");
-}
-#endif
-
/*
* Add a codespacerange section.
* These ranges are used by pdf_decode_cmap to decode
@@ -178,56 +88,65 @@ pdf_add_codespace(fz_context *ctx, pdf_cmap *cmap, int low, int high, int n)
}
/*
- * Add an integer to the table.
+ * Add a range.
*/
-static int
-add_table(fz_context *ctx, pdf_cmap *cmap, int value)
+static void
+add_range(fz_context *ctx, pdf_cmap *cmap, unsigned int low, unsigned int high, unsigned int out)
{
- if (cmap->tlen >= USHRT_MAX + 1)
+ if (low > high)
{
- fz_warn(ctx, "cmap table is full; ignoring additional entries");
- return 1;
+ fz_warn(ctx, "range limits out of range in cmap %s", cmap->cmap_name);
+ return;
}
- if (cmap->tlen + 1 > cmap->tcap)
+
+ if (low <= 0xFFFF && high <= 0xFFFF && out <= 0xFFFF)
{
- int new_cap = cmap->tcap > 1 ? (cmap->tcap * 3) / 2 : 256;
- cmap->table = fz_resize_array(ctx, cmap->table, new_cap, sizeof(unsigned short));
- cmap->tcap = new_cap;
+ if (cmap->rlen + 1 > cmap->rcap)
+ {
+ int new_cap = cmap->rcap ? cmap->rcap * 2 : 256;
+ cmap->ranges = fz_resize_array(ctx, cmap->ranges, new_cap, sizeof *cmap->ranges);
+ cmap->rcap = new_cap;
+ }
+ cmap->ranges[cmap->rlen].low = low;
+ cmap->ranges[cmap->rlen].high = high;
+ cmap->ranges[cmap->rlen].out = out;
+ cmap->rlen++;
+ }
+ else
+ {
+ if (cmap->xlen + 1 > cmap->xcap)
+ {
+ int new_cap = cmap->xcap ? cmap->xcap * 2 : 256;
+ cmap->xranges = fz_resize_array(ctx, cmap->xranges, new_cap, sizeof *cmap->xranges);
+ cmap->xcap = new_cap;
+ }
+ cmap->xranges[cmap->xlen].low = low;
+ cmap->xranges[cmap->xlen].high = high;
+ cmap->xranges[cmap->xlen].out = out;
+ cmap->xlen++;
}
- cmap->table[cmap->tlen++] = value;
- return 0;
}
/*
- * Add a range.
+ * Add a one-to-many mapping.
*/
static void
-add_range(fz_context *ctx, pdf_cmap *cmap, int low, int high, int flag, int offset)
+add_mrange(fz_context *ctx, pdf_cmap *cmap, unsigned int low, int *out, int len)
{
- /* Sanity check ranges */
- if (low < 0 || low > 65535 || high < 0 || high > 65535 || low > high)
- {
- fz_warn(ctx, "range limits out of range in cmap %s", cmap->cmap_name);
- return;
- }
- /* If the range is too large to be represented, split it */
- if (high - low > 0x3fff)
- {
- add_range(ctx, cmap, low, low+0x3fff, flag, offset);
- add_range(ctx, cmap, low+0x3fff, high, flag, offset+0x3fff);
- return;
- }
- if (cmap->rlen + 1 > cmap->rcap)
+ int i;
+ if (cmap->mlen + 1 > cmap->mcap)
{
- int new_cap = cmap->rcap > 1 ? (cmap->rcap * 3) / 2 : 256;
- cmap->ranges = fz_resize_array(ctx, cmap->ranges, new_cap, sizeof(pdf_range));
- cmap->rcap = new_cap;
+ int new_cap = cmap->mcap ? cmap->mcap * 2 : 256;
+ cmap->mranges = fz_resize_array(ctx, cmap->mranges, new_cap, sizeof *cmap->mranges);
+ cmap->mcap = new_cap;
}
- cmap->ranges[cmap->rlen].low = low;
- pdf_range_set_high(&cmap->ranges[cmap->rlen], high);
- pdf_range_set_flags(&cmap->ranges[cmap->rlen], flag);
- cmap->ranges[cmap->rlen].offset = offset;
- cmap->rlen ++;
+ cmap->mranges[cmap->mlen].low = low;
+ cmap->mranges[cmap->mlen].len = len;
+ for (i = 0; i < len; ++i)
+ cmap->mranges[cmap->mlen].out[i] = out[i];
+ for (; i < PDF_MRANGE_CAP; ++i)
+ cmap->mranges[cmap->mlen].out[i] = 0;
+ cmap->mlen++;
}
/*
@@ -237,30 +156,17 @@ void
pdf_map_range_to_table(fz_context *ctx, pdf_cmap *cmap, int low, int *table, int len)
{
int i;
- int high = low + len;
- int offset = cmap->tlen;
- if (cmap->tlen + len >= USHRT_MAX + 1)
- {
- /* no space in the table; emit as a set of single lookups instead */
- for (i = 0; i < len; i++)
- add_range(ctx, cmap, low + i, low + i, PDF_CMAP_SINGLE, table[i]);
- }
- else
- {
- /* add table cannot fail here, we already checked that it will fit */
- for (i = 0; i < len; i++)
- add_table(ctx, cmap, table[i]);
- add_range(ctx, cmap, low, high, PDF_CMAP_TABLE, offset);
- }
+ for (i = 0; i < len; i++)
+ add_range(ctx, cmap, low + i, low + i, table[i]);
}
/*
* Add a range of contiguous one-to-one mappings (ie 1..5 maps to 21..25)
*/
void
-pdf_map_range_to_range(fz_context *ctx, pdf_cmap *cmap, int low, int high, int offset)
+pdf_map_range_to_range(fz_context *ctx, pdf_cmap *cmap, int low, int high, int out)
{
- add_range(ctx, cmap, low, high, high - low == 0 ? PDF_CMAP_SINGLE : PDF_CMAP_RANGE, offset);
+ add_range(ctx, cmap, low, high, out);
}
/*
@@ -269,197 +175,132 @@ pdf_map_range_to_range(fz_context *ctx, pdf_cmap *cmap, int low, int high, int o
void
pdf_map_one_to_many(fz_context *ctx, pdf_cmap *cmap, int low, int *values, int len)
{
- int offset, i;
-
if (len == 1)
{
- add_range(ctx, cmap, low, low, PDF_CMAP_SINGLE, values[0]);
+ add_range(ctx, cmap, low, low, values[0]);
return;
}
- if (len > 8)
- {
- fz_warn(ctx, "one to many mapping is too large (%d); truncating", len);
- len = 8;
- }
-
+ /* Decode unicode surrogate pairs. */
+ /* Only the *-UCS2 CMaps use one-to-many mappings, so assuming unicode should be safe. */
if (len == 2 &&
values[0] >= 0xD800 && values[0] <= 0xDBFF &&
values[1] >= 0xDC00 && values[1] <= 0xDFFF)
{
- fz_warn(ctx, "ignoring surrogate pair mapping in cmap %s", cmap->cmap_name);
+ int rune = ((values[0] - 0xD800) << 10) + (values[1] - 0xDC00) + 0x10000;
+ add_range(ctx, cmap, low, low, rune);
return;
}
- if (cmap->tlen + len + 1 >= USHRT_MAX + 1)
- fz_warn(ctx, "cannot map one to many; table is full");
- else
+ if (len > PDF_MRANGE_CAP)
{
- int fail;
- offset = cmap->tlen;
- fail = add_table(ctx, cmap, len);
- for (i = 0; i < len; i++)
- fail |= add_table(ctx, cmap, values[i]);
- if (!fail)
- add_range(ctx, cmap, low, low, PDF_CMAP_MULTI, offset);
- else
- cmap->tlen = offset; /* ignore one-to-many mappings when the table is full */
+ fz_warn(ctx, "ignoring one to many mapping in cmap %s", cmap->cmap_name);
+ return;
}
+
+ add_mrange(ctx, cmap, low, values, len);
}
/*
* Sort the input ranges.
- * Merge contiguous input ranges to range-to-range if the output is contiguous.
- * Merge contiguous input ranges to range-to-table if the output is random.
+ * Merge contiguous ranges.
*/
static int cmprange(const void *va, const void *vb)
{
- return ((const pdf_range*)va)->low - ((const pdf_range*)vb)->low;
+ unsigned int a = ((const pdf_range*)va)->low;
+ unsigned int b = ((const pdf_range*)vb)->low;
+ return a < b ? -1 : a > b ? 1 : 0;
}
-void
-pdf_sort_cmap(fz_context *ctx, pdf_cmap *cmap)
+static int cmpxrange(const void *va, const void *vb)
{
- pdf_range *a; /* last written range on output */
- pdf_range *b; /* current range examined on input */
-
- if (cmap->rlen == 0)
- return;
-
- qsort(cmap->ranges, cmap->rlen, sizeof(pdf_range), cmprange);
+ unsigned int a = ((const pdf_xrange*)va)->low;
+ unsigned int b = ((const pdf_xrange*)vb)->low;
+ return a < b ? -1 : a > b ? 1 : 0;
+}
- if (cmap->tlen >= USHRT_MAX + 1)
- {
- fz_warn(ctx, "cmap table is full; will not combine ranges");
- return;
- }
+static int cmpmrange(const void *va, const void *vb)
+{
+ unsigned int a = ((const pdf_mrange*)va)->low;
+ unsigned int b = ((const pdf_mrange*)vb)->low;
+ return a < b ? -1 : a > b ? 1 : 0;
+}
- a = cmap->ranges;
- b = cmap->ranges + 1;
+void
+pdf_sort_cmap(fz_context *ctx, pdf_cmap *cmap)
+{
+ pdf_range *a, *b;
+ pdf_xrange *x, *y;
- while (b < cmap->ranges + cmap->rlen)
+ if (cmap->rlen)
{
- /* ignore one-to-many mappings */
- if (pdf_range_flags(b) == PDF_CMAP_MULTI)
- {
- *(++a) = *b;
- }
-
- /* input contiguous */
- else if (pdf_range_high(a) + 1 == b->low)
+ qsort(cmap->ranges, cmap->rlen, sizeof *cmap->ranges, cmprange);
+ a = cmap->ranges;
+ for (b = a + 1; b < cmap->ranges + cmap->rlen; ++b)
{
- /* output contiguous */
- if (pdf_range_high(a) - a->low + a->offset + 1 == b->offset)
- {
- /* SR -> R and SS -> R and RR -> R and RS -> R */
- if ((pdf_range_flags(a) == PDF_CMAP_SINGLE || pdf_range_flags(a) == PDF_CMAP_RANGE) && (pdf_range_high(b) - a->low <= 0x3fff))
- {
- pdf_range_set_flags(a, PDF_CMAP_RANGE);
- pdf_range_set_high(a, pdf_range_high(b));
- }
-
- /* LS -> L */
- else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_SINGLE && (pdf_range_high(b) - a->low <= 0x3fff))
- {
- if (!add_table(ctx, cmap, b->offset))
- pdf_range_set_high(a, pdf_range_high(b));
- else
- *(++a) = *b;
- }
-
- /* LR -> LR */
- else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_RANGE)
- {
- *(++a) = *b;
- }
-
- /* XX -> XX */
- else
- {
- *(++a) = *b;
- }
- }
-
- /* output separated */
+ if (b->low == a->high + 1 && b->out == a->out + (a->high - a->low) + 1)
+ a->high = b->high;
else
- {
- /* SS -> L */
- if (pdf_range_flags(a) == PDF_CMAP_SINGLE && pdf_range_flags(b) == PDF_CMAP_SINGLE)
- {
- int offset = cmap->tlen;
- int fail = add_table(ctx, cmap, a->offset);
- fail |= add_table(ctx, cmap, b->offset);
- if (!fail)
- {
- pdf_range_set_flags(a, PDF_CMAP_TABLE);
- pdf_range_set_high(a, pdf_range_high(b));
- a->offset = cmap->tlen - 2;
- } else {
- cmap->tlen = offset;
- *(++a) = *b;
- }
- }
-
- /* LS -> L */
- else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_SINGLE && (pdf_range_high(b) - a->low <= 0x3fff))
- {
- if (!add_table(ctx, cmap, b->offset))
- {
- pdf_range_set_high(a, pdf_range_high(b));
- }
- else
- {
- *(++a) = *b;
- }
- }
-
- /* XX -> XX */
- else
- {
- *(++a) = *b;
- }
- }
+ *(++a) = *b;
}
+ cmap->rlen = a - cmap->ranges + 1;
+ }
- /* input separated: XX -> XX */
- else
+ if (cmap->xlen)
+ {
+ qsort(cmap->xranges, cmap->xlen, sizeof *cmap->xranges, cmpxrange);
+ x = cmap->xranges;
+ for (y = x + 1; y < cmap->xranges + cmap->xlen; ++y)
{
- *(++a) = *b;
+ if (y->low == x->high + 1 && y->out == x->out + (x->high - x->low) + 1)
+ x->high = y->high;
+ else
+ *(++x) = *y;
}
-
- b ++;
+ cmap->xlen = x - cmap->xranges + 1;
}
- cmap->rlen = a - cmap->ranges + 1;
+ if (cmap->mlen)
+ {
+ qsort(cmap->mranges, cmap->mlen, sizeof *cmap->mranges, cmpmrange);
+ }
}
/*
* Lookup the mapping of a codepoint.
*/
int
-pdf_lookup_cmap(pdf_cmap *cmap, int cpt)
+pdf_lookup_cmap(pdf_cmap *cmap, unsigned int cpt)
{
- int l = 0;
- int r = cmap->rlen - 1;
- int m;
+ pdf_range *ranges = cmap->ranges;
+ pdf_xrange *xranges = cmap->xranges;
+ int l, r, m;
+ l = 0;
+ r = cmap->rlen - 1;
while (l <= r)
{
m = (l + r) >> 1;
- if (cpt < cmap->ranges[m].low)
+ if (cpt < ranges[m].low)
r = m - 1;
- else if (cpt > pdf_range_high(&cmap->ranges[m]))
+ else if (cpt > ranges[m].high)
l = m + 1;
else
- {
- int i = cpt - cmap->ranges[m].low + cmap->ranges[m].offset;
- if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_TABLE)
- return cmap->table[i];
- if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_MULTI)
- return -1; /* should use lookup_cmap_full */
- return i;
- }
+ return cpt - ranges[m].low + ranges[m].out;
+ }
+
+ l = 0;
+ r = cmap->xlen - 1;
+ while (l <= r)
+ {
+ m = (l + r) >> 1;
+ if (cpt < xranges[m].low)
+ r = m - 1;
+ else if (cpt > xranges[m].high)
+ l = m + 1;
+ else
+ return cpt - xranges[m].low + xranges[m].out;
}
if (cmap->usecmap)
@@ -469,40 +310,59 @@ pdf_lookup_cmap(pdf_cmap *cmap, int cpt)
}
int
-pdf_lookup_cmap_full(pdf_cmap *cmap, int cpt, int *out)
+pdf_lookup_cmap_full(pdf_cmap *cmap, unsigned int cpt, int *out)
{
- int i, k, n;
- int l = 0;
- int r = cmap->rlen - 1;
- int m;
+ pdf_range *ranges = cmap->ranges;
+ pdf_xrange *xranges = cmap->xranges;
+ pdf_mrange *mranges = cmap->mranges;
+ int l, r, m, i;
+ l = 0;
+ r = cmap->rlen - 1;
while (l <= r)
{
m = (l + r) >> 1;
- if (cpt < cmap->ranges[m].low)
+ if (cpt < ranges[m].low)
r = m - 1;
- else if (cpt > pdf_range_high(&cmap->ranges[m]))
+ else if (cpt > ranges[m].high)
l = m + 1;
else
{
- k = cpt - cmap->ranges[m].low + cmap->ranges[m].offset;
- if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_TABLE)
- {
- out[0] = cmap->table[k];
- return 1;
- }
- else if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_MULTI)
- {
- n = cmap->ranges[m].offset;
- for (i = 0; i < cmap->table[n]; i++)
- out[i] = cmap->table[n + i + 1];
- return cmap->table[n];
- }
- else
- {
- out[0] = k;
- return 1;
- }
+ out[0] = cpt - ranges[m].low + ranges[m].out;
+ return 1;
+ }
+ }
+
+ l = 0;
+ r = cmap->xlen - 1;
+ while (l <= r)
+ {
+ m = (l + r) >> 1;
+ if (cpt < xranges[m].low)
+ r = m - 1;
+ else if (cpt > xranges[m].high)
+ l = m + 1;
+ else
+ {
+ out[0] = cpt - xranges[m].low + xranges[m].out;
+ return 1;
+ }
+ }
+
+ l = 0;
+ r = cmap->mlen - 1;
+ while (l <= r)
+ {
+ m = (l + r) >> 1;
+ if (cpt < mranges[m].low)
+ r = m - 1;
+ else if (cpt > mranges[m].low)
+ l = m + 1;
+ else
+ {
+ for (i = 0; i < mranges[m].len; ++i)
+ out[i] = mranges[m].out[i];
+ return mranges[m].len;
}
}