diff options
Diffstat (limited to 'source')
-rw-r--r-- | source/pdf/pdf-cmap-load.c | 5 | ||||
-rw-r--r-- | source/pdf/pdf-cmap.c | 482 |
2 files changed, 175 insertions, 312 deletions
diff --git a/source/pdf/pdf-cmap-load.c b/source/pdf/pdf-cmap-load.c index 08c4a80a..d3f17c35 100644 --- a/source/pdf/pdf-cmap-load.c +++ b/source/pdf/pdf-cmap-load.c @@ -8,7 +8,10 @@ pdf_cmap_size(fz_context *ctx, pdf_cmap *cmap) if (cmap->storable.refs < 0) return 0; - return cmap->rcap * sizeof(pdf_range) + cmap->tcap * sizeof(short) + pdf_cmap_size(ctx, cmap->usecmap); + return pdf_cmap_size(ctx, cmap->usecmap) + + cmap->rcap * sizeof *cmap->ranges + + cmap->xcap * sizeof *cmap->xranges + + cmap->mcap * sizeof *cmap->mranges; } /* diff --git a/source/pdf/pdf-cmap.c b/source/pdf/pdf-cmap.c index 5bd8c7fb..db628f23 100644 --- a/source/pdf/pdf-cmap.c +++ b/source/pdf/pdf-cmap.c @@ -1,30 +1,5 @@ -/* - * The CMap data structure here is constructed on the fly by - * adding simple range-to-range mappings. Then the data structure - * is optimized to contain both range-to-range and range-to-table - * lookups. - * - * Any one-to-many mappings are inserted as one-to-table - * lookups in the beginning, and are not affected by the optimization - * stage. - * - * There is a special function to add a 256-length range-to-table mapping. - * The ranges do not have to be added in order. - * - * This code can be a lot simpler if we don't care about wasting memory, - * or can trust the parser to give us optimal mappings. - */ - #include "mupdf/pdf.h" -/* Macros for accessing the combined extent_flags field */ -#define pdf_range_high(r) ((r)->low + ((r)->extent_flags >> 2)) -#define pdf_range_flags(r) ((r)->extent_flags & 3) -#define pdf_range_set_high(r, h) \ - ((r)->extent_flags = (((r)->extent_flags & 3) | ((h - (r)->low) << 2))) -#define pdf_range_set_flags(r, f) \ - ((r)->extent_flags = (((r)->extent_flags & ~3) | f)) - /* * Allocate, destroy and simple parameters. */ @@ -36,32 +11,16 @@ pdf_free_cmap_imp(fz_context *ctx, fz_storable *cmap_) if (cmap->usecmap) pdf_drop_cmap(ctx, cmap->usecmap); fz_free(ctx, cmap->ranges); - fz_free(ctx, cmap->table); + fz_free(ctx, cmap->xranges); + fz_free(ctx, cmap->mranges); fz_free(ctx, cmap); } pdf_cmap * pdf_new_cmap(fz_context *ctx) { - pdf_cmap *cmap; - - cmap = fz_malloc_struct(ctx, pdf_cmap); + pdf_cmap *cmap = fz_malloc_struct(ctx, pdf_cmap); FZ_INIT_STORABLE(cmap, 1, pdf_free_cmap_imp); - - strcpy(cmap->cmap_name, ""); - strcpy(cmap->usecmap_name, ""); - cmap->usecmap = NULL; - cmap->wmode = 0; - cmap->codespace_len = 0; - - cmap->rlen = 0; - cmap->rcap = 0; - cmap->ranges = NULL; - - cmap->tlen = 0; - cmap->tcap = 0; - cmap->table = NULL; - return cmap; } @@ -108,55 +67,6 @@ pdf_set_cmap_wmode(fz_context *ctx, pdf_cmap *cmap, int wmode) cmap->wmode = wmode; } -#ifndef NDEBUG -void -pdf_print_cmap(fz_context *ctx, pdf_cmap *cmap) -{ - int i, k, n; - - printf("cmap $%p /%s {\n", (void *) cmap, cmap->cmap_name); - - if (cmap->usecmap_name[0]) - printf("\tusecmap /%s\n", cmap->usecmap_name); - if (cmap->usecmap) - printf("\tusecmap $%p\n", (void *) cmap->usecmap); - - printf("\twmode %d\n", cmap->wmode); - - printf("\tcodespaces {\n"); - for (i = 0; i < cmap->codespace_len; i++) - { - printf("\t\t<%x> <%x>\n", cmap->codespace[i].low, cmap->codespace[i].high); - } - printf("\t}\n"); - - printf("\tranges (%d,%d) {\n", cmap->rlen, cmap->tlen); - for (i = 0; i < cmap->rlen; i++) - { - pdf_range *r = &cmap->ranges[i]; - printf("\t\t<%04x> <%04x> ", r->low, pdf_range_high(r)); - if (pdf_range_flags(r) == PDF_CMAP_TABLE) - { - printf("[ "); - for (k = 0; k < pdf_range_high(r) - r->low + 1; k++) - printf("%d ", cmap->table[r->offset + k]); - printf("]\n"); - } - else if (pdf_range_flags(r) == PDF_CMAP_MULTI) - { - printf("< "); - n = cmap->table[r->offset]; - for (k = 0; k < n; k++) - printf("%04x ", cmap->table[r->offset + 1 + k]); - printf(">\n"); - } - else - printf("%d\n", r->offset); - } - printf("\t}\n}\n"); -} -#endif - /* * Add a codespacerange section. * These ranges are used by pdf_decode_cmap to decode @@ -178,56 +88,65 @@ pdf_add_codespace(fz_context *ctx, pdf_cmap *cmap, int low, int high, int n) } /* - * Add an integer to the table. + * Add a range. */ -static int -add_table(fz_context *ctx, pdf_cmap *cmap, int value) +static void +add_range(fz_context *ctx, pdf_cmap *cmap, unsigned int low, unsigned int high, unsigned int out) { - if (cmap->tlen >= USHRT_MAX + 1) + if (low > high) { - fz_warn(ctx, "cmap table is full; ignoring additional entries"); - return 1; + fz_warn(ctx, "range limits out of range in cmap %s", cmap->cmap_name); + return; } - if (cmap->tlen + 1 > cmap->tcap) + + if (low <= 0xFFFF && high <= 0xFFFF && out <= 0xFFFF) { - int new_cap = cmap->tcap > 1 ? (cmap->tcap * 3) / 2 : 256; - cmap->table = fz_resize_array(ctx, cmap->table, new_cap, sizeof(unsigned short)); - cmap->tcap = new_cap; + if (cmap->rlen + 1 > cmap->rcap) + { + int new_cap = cmap->rcap ? cmap->rcap * 2 : 256; + cmap->ranges = fz_resize_array(ctx, cmap->ranges, new_cap, sizeof *cmap->ranges); + cmap->rcap = new_cap; + } + cmap->ranges[cmap->rlen].low = low; + cmap->ranges[cmap->rlen].high = high; + cmap->ranges[cmap->rlen].out = out; + cmap->rlen++; + } + else + { + if (cmap->xlen + 1 > cmap->xcap) + { + int new_cap = cmap->xcap ? cmap->xcap * 2 : 256; + cmap->xranges = fz_resize_array(ctx, cmap->xranges, new_cap, sizeof *cmap->xranges); + cmap->xcap = new_cap; + } + cmap->xranges[cmap->xlen].low = low; + cmap->xranges[cmap->xlen].high = high; + cmap->xranges[cmap->xlen].out = out; + cmap->xlen++; } - cmap->table[cmap->tlen++] = value; - return 0; } /* - * Add a range. + * Add a one-to-many mapping. */ static void -add_range(fz_context *ctx, pdf_cmap *cmap, int low, int high, int flag, int offset) +add_mrange(fz_context *ctx, pdf_cmap *cmap, unsigned int low, int *out, int len) { - /* Sanity check ranges */ - if (low < 0 || low > 65535 || high < 0 || high > 65535 || low > high) - { - fz_warn(ctx, "range limits out of range in cmap %s", cmap->cmap_name); - return; - } - /* If the range is too large to be represented, split it */ - if (high - low > 0x3fff) - { - add_range(ctx, cmap, low, low+0x3fff, flag, offset); - add_range(ctx, cmap, low+0x3fff, high, flag, offset+0x3fff); - return; - } - if (cmap->rlen + 1 > cmap->rcap) + int i; + if (cmap->mlen + 1 > cmap->mcap) { - int new_cap = cmap->rcap > 1 ? (cmap->rcap * 3) / 2 : 256; - cmap->ranges = fz_resize_array(ctx, cmap->ranges, new_cap, sizeof(pdf_range)); - cmap->rcap = new_cap; + int new_cap = cmap->mcap ? cmap->mcap * 2 : 256; + cmap->mranges = fz_resize_array(ctx, cmap->mranges, new_cap, sizeof *cmap->mranges); + cmap->mcap = new_cap; } - cmap->ranges[cmap->rlen].low = low; - pdf_range_set_high(&cmap->ranges[cmap->rlen], high); - pdf_range_set_flags(&cmap->ranges[cmap->rlen], flag); - cmap->ranges[cmap->rlen].offset = offset; - cmap->rlen ++; + cmap->mranges[cmap->mlen].low = low; + cmap->mranges[cmap->mlen].len = len; + for (i = 0; i < len; ++i) + cmap->mranges[cmap->mlen].out[i] = out[i]; + for (; i < PDF_MRANGE_CAP; ++i) + cmap->mranges[cmap->mlen].out[i] = 0; + cmap->mlen++; } /* @@ -237,30 +156,17 @@ void pdf_map_range_to_table(fz_context *ctx, pdf_cmap *cmap, int low, int *table, int len) { int i; - int high = low + len; - int offset = cmap->tlen; - if (cmap->tlen + len >= USHRT_MAX + 1) - { - /* no space in the table; emit as a set of single lookups instead */ - for (i = 0; i < len; i++) - add_range(ctx, cmap, low + i, low + i, PDF_CMAP_SINGLE, table[i]); - } - else - { - /* add table cannot fail here, we already checked that it will fit */ - for (i = 0; i < len; i++) - add_table(ctx, cmap, table[i]); - add_range(ctx, cmap, low, high, PDF_CMAP_TABLE, offset); - } + for (i = 0; i < len; i++) + add_range(ctx, cmap, low + i, low + i, table[i]); } /* * Add a range of contiguous one-to-one mappings (ie 1..5 maps to 21..25) */ void -pdf_map_range_to_range(fz_context *ctx, pdf_cmap *cmap, int low, int high, int offset) +pdf_map_range_to_range(fz_context *ctx, pdf_cmap *cmap, int low, int high, int out) { - add_range(ctx, cmap, low, high, high - low == 0 ? PDF_CMAP_SINGLE : PDF_CMAP_RANGE, offset); + add_range(ctx, cmap, low, high, out); } /* @@ -269,197 +175,132 @@ pdf_map_range_to_range(fz_context *ctx, pdf_cmap *cmap, int low, int high, int o void pdf_map_one_to_many(fz_context *ctx, pdf_cmap *cmap, int low, int *values, int len) { - int offset, i; - if (len == 1) { - add_range(ctx, cmap, low, low, PDF_CMAP_SINGLE, values[0]); + add_range(ctx, cmap, low, low, values[0]); return; } - if (len > 8) - { - fz_warn(ctx, "one to many mapping is too large (%d); truncating", len); - len = 8; - } - + /* Decode unicode surrogate pairs. */ + /* Only the *-UCS2 CMaps use one-to-many mappings, so assuming unicode should be safe. */ if (len == 2 && values[0] >= 0xD800 && values[0] <= 0xDBFF && values[1] >= 0xDC00 && values[1] <= 0xDFFF) { - fz_warn(ctx, "ignoring surrogate pair mapping in cmap %s", cmap->cmap_name); + int rune = ((values[0] - 0xD800) << 10) + (values[1] - 0xDC00) + 0x10000; + add_range(ctx, cmap, low, low, rune); return; } - if (cmap->tlen + len + 1 >= USHRT_MAX + 1) - fz_warn(ctx, "cannot map one to many; table is full"); - else + if (len > PDF_MRANGE_CAP) { - int fail; - offset = cmap->tlen; - fail = add_table(ctx, cmap, len); - for (i = 0; i < len; i++) - fail |= add_table(ctx, cmap, values[i]); - if (!fail) - add_range(ctx, cmap, low, low, PDF_CMAP_MULTI, offset); - else - cmap->tlen = offset; /* ignore one-to-many mappings when the table is full */ + fz_warn(ctx, "ignoring one to many mapping in cmap %s", cmap->cmap_name); + return; } + + add_mrange(ctx, cmap, low, values, len); } /* * Sort the input ranges. - * Merge contiguous input ranges to range-to-range if the output is contiguous. - * Merge contiguous input ranges to range-to-table if the output is random. + * Merge contiguous ranges. */ static int cmprange(const void *va, const void *vb) { - return ((const pdf_range*)va)->low - ((const pdf_range*)vb)->low; + unsigned int a = ((const pdf_range*)va)->low; + unsigned int b = ((const pdf_range*)vb)->low; + return a < b ? -1 : a > b ? 1 : 0; } -void -pdf_sort_cmap(fz_context *ctx, pdf_cmap *cmap) +static int cmpxrange(const void *va, const void *vb) { - pdf_range *a; /* last written range on output */ - pdf_range *b; /* current range examined on input */ - - if (cmap->rlen == 0) - return; - - qsort(cmap->ranges, cmap->rlen, sizeof(pdf_range), cmprange); + unsigned int a = ((const pdf_xrange*)va)->low; + unsigned int b = ((const pdf_xrange*)vb)->low; + return a < b ? -1 : a > b ? 1 : 0; +} - if (cmap->tlen >= USHRT_MAX + 1) - { - fz_warn(ctx, "cmap table is full; will not combine ranges"); - return; - } +static int cmpmrange(const void *va, const void *vb) +{ + unsigned int a = ((const pdf_mrange*)va)->low; + unsigned int b = ((const pdf_mrange*)vb)->low; + return a < b ? -1 : a > b ? 1 : 0; +} - a = cmap->ranges; - b = cmap->ranges + 1; +void +pdf_sort_cmap(fz_context *ctx, pdf_cmap *cmap) +{ + pdf_range *a, *b; + pdf_xrange *x, *y; - while (b < cmap->ranges + cmap->rlen) + if (cmap->rlen) { - /* ignore one-to-many mappings */ - if (pdf_range_flags(b) == PDF_CMAP_MULTI) - { - *(++a) = *b; - } - - /* input contiguous */ - else if (pdf_range_high(a) + 1 == b->low) + qsort(cmap->ranges, cmap->rlen, sizeof *cmap->ranges, cmprange); + a = cmap->ranges; + for (b = a + 1; b < cmap->ranges + cmap->rlen; ++b) { - /* output contiguous */ - if (pdf_range_high(a) - a->low + a->offset + 1 == b->offset) - { - /* SR -> R and SS -> R and RR -> R and RS -> R */ - if ((pdf_range_flags(a) == PDF_CMAP_SINGLE || pdf_range_flags(a) == PDF_CMAP_RANGE) && (pdf_range_high(b) - a->low <= 0x3fff)) - { - pdf_range_set_flags(a, PDF_CMAP_RANGE); - pdf_range_set_high(a, pdf_range_high(b)); - } - - /* LS -> L */ - else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_SINGLE && (pdf_range_high(b) - a->low <= 0x3fff)) - { - if (!add_table(ctx, cmap, b->offset)) - pdf_range_set_high(a, pdf_range_high(b)); - else - *(++a) = *b; - } - - /* LR -> LR */ - else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_RANGE) - { - *(++a) = *b; - } - - /* XX -> XX */ - else - { - *(++a) = *b; - } - } - - /* output separated */ + if (b->low == a->high + 1 && b->out == a->out + (a->high - a->low) + 1) + a->high = b->high; else - { - /* SS -> L */ - if (pdf_range_flags(a) == PDF_CMAP_SINGLE && pdf_range_flags(b) == PDF_CMAP_SINGLE) - { - int offset = cmap->tlen; - int fail = add_table(ctx, cmap, a->offset); - fail |= add_table(ctx, cmap, b->offset); - if (!fail) - { - pdf_range_set_flags(a, PDF_CMAP_TABLE); - pdf_range_set_high(a, pdf_range_high(b)); - a->offset = cmap->tlen - 2; - } else { - cmap->tlen = offset; - *(++a) = *b; - } - } - - /* LS -> L */ - else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_SINGLE && (pdf_range_high(b) - a->low <= 0x3fff)) - { - if (!add_table(ctx, cmap, b->offset)) - { - pdf_range_set_high(a, pdf_range_high(b)); - } - else - { - *(++a) = *b; - } - } - - /* XX -> XX */ - else - { - *(++a) = *b; - } - } + *(++a) = *b; } + cmap->rlen = a - cmap->ranges + 1; + } - /* input separated: XX -> XX */ - else + if (cmap->xlen) + { + qsort(cmap->xranges, cmap->xlen, sizeof *cmap->xranges, cmpxrange); + x = cmap->xranges; + for (y = x + 1; y < cmap->xranges + cmap->xlen; ++y) { - *(++a) = *b; + if (y->low == x->high + 1 && y->out == x->out + (x->high - x->low) + 1) + x->high = y->high; + else + *(++x) = *y; } - - b ++; + cmap->xlen = x - cmap->xranges + 1; } - cmap->rlen = a - cmap->ranges + 1; + if (cmap->mlen) + { + qsort(cmap->mranges, cmap->mlen, sizeof *cmap->mranges, cmpmrange); + } } /* * Lookup the mapping of a codepoint. */ int -pdf_lookup_cmap(pdf_cmap *cmap, int cpt) +pdf_lookup_cmap(pdf_cmap *cmap, unsigned int cpt) { - int l = 0; - int r = cmap->rlen - 1; - int m; + pdf_range *ranges = cmap->ranges; + pdf_xrange *xranges = cmap->xranges; + int l, r, m; + l = 0; + r = cmap->rlen - 1; while (l <= r) { m = (l + r) >> 1; - if (cpt < cmap->ranges[m].low) + if (cpt < ranges[m].low) r = m - 1; - else if (cpt > pdf_range_high(&cmap->ranges[m])) + else if (cpt > ranges[m].high) l = m + 1; else - { - int i = cpt - cmap->ranges[m].low + cmap->ranges[m].offset; - if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_TABLE) - return cmap->table[i]; - if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_MULTI) - return -1; /* should use lookup_cmap_full */ - return i; - } + return cpt - ranges[m].low + ranges[m].out; + } + + l = 0; + r = cmap->xlen - 1; + while (l <= r) + { + m = (l + r) >> 1; + if (cpt < xranges[m].low) + r = m - 1; + else if (cpt > xranges[m].high) + l = m + 1; + else + return cpt - xranges[m].low + xranges[m].out; } if (cmap->usecmap) @@ -469,40 +310,59 @@ pdf_lookup_cmap(pdf_cmap *cmap, int cpt) } int -pdf_lookup_cmap_full(pdf_cmap *cmap, int cpt, int *out) +pdf_lookup_cmap_full(pdf_cmap *cmap, unsigned int cpt, int *out) { - int i, k, n; - int l = 0; - int r = cmap->rlen - 1; - int m; + pdf_range *ranges = cmap->ranges; + pdf_xrange *xranges = cmap->xranges; + pdf_mrange *mranges = cmap->mranges; + int l, r, m, i; + l = 0; + r = cmap->rlen - 1; while (l <= r) { m = (l + r) >> 1; - if (cpt < cmap->ranges[m].low) + if (cpt < ranges[m].low) r = m - 1; - else if (cpt > pdf_range_high(&cmap->ranges[m])) + else if (cpt > ranges[m].high) l = m + 1; else { - k = cpt - cmap->ranges[m].low + cmap->ranges[m].offset; - if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_TABLE) - { - out[0] = cmap->table[k]; - return 1; - } - else if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_MULTI) - { - n = cmap->ranges[m].offset; - for (i = 0; i < cmap->table[n]; i++) - out[i] = cmap->table[n + i + 1]; - return cmap->table[n]; - } - else - { - out[0] = k; - return 1; - } + out[0] = cpt - ranges[m].low + ranges[m].out; + return 1; + } + } + + l = 0; + r = cmap->xlen - 1; + while (l <= r) + { + m = (l + r) >> 1; + if (cpt < xranges[m].low) + r = m - 1; + else if (cpt > xranges[m].high) + l = m + 1; + else + { + out[0] = cpt - xranges[m].low + xranges[m].out; + return 1; + } + } + + l = 0; + r = cmap->mlen - 1; + while (l <= r) + { + m = (l + r) >> 1; + if (cpt < mranges[m].low) + r = m - 1; + else if (cpt > mranges[m].low) + l = m + 1; + else + { + for (i = 0; i < mranges[m].len; ++i) + out[i] = mranges[m].out[i]; + return mranges[m].len; } } |