diff options
author | Robin Watts <robin.watts@artifex.com> | 2017-04-13 19:13:59 +0100 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2017-04-18 16:45:49 +0100 |
commit | 670fe1b085fde8355b6d1a70b143eab60e34b1b6 (patch) | |
tree | eca6b0c5e2561a2da269fb9abd1c8d7a9bdeffcb /source/pdf | |
parent | a742ac09981f7c681f63f88738e7eded95412599 (diff) | |
download | mupdf-670fe1b085fde8355b6d1a70b143eab60e34b1b6.tar.xz |
Use splay trees for loading/merging cmaps.
This allows for overlaps, merges adjacent (mergeable) ranges
and gets us properly searchable results.
This causes 1 diff in the test suites (Bug694353.pdf), which is
due to the fallback font not having a hypen present at UCS 0x2010.
Diffstat (limited to 'source/pdf')
-rw-r--r-- | source/pdf/pdf-cmap.c | 665 |
1 files changed, 576 insertions, 89 deletions
diff --git a/source/pdf/pdf-cmap.c b/source/pdf/pdf-cmap.c index 01826234..10e36ba2 100644 --- a/source/pdf/pdf-cmap.c +++ b/source/pdf/pdf-cmap.c @@ -1,5 +1,8 @@ #include "mupdf/pdf.h" +#undef CHECK_SPLAY +#undef DUMP_SPLAY + /* * Allocate, destroy and simple parameters. */ @@ -85,44 +88,525 @@ pdf_add_codespace(fz_context *ctx, pdf_cmap *cmap, unsigned int low, unsigned in cmap->codespace_len ++; } + +struct cmap_splay_s { + unsigned int low; + unsigned int high; + unsigned int out; + unsigned int left; + unsigned int right; + unsigned int parent : 31; + unsigned int many : 1; +}; + +#define EMPTY ((unsigned int)0x40000000) + +/* + The splaying steps used: + + Case 1: | z x + | y D => A y + | x C B z + | A B C D + + Case 2: | z x + | y D => y z + | A x A B C D + | B C + + Case 3: | y x + | x C => A y + | A B B C +*/ + +static void +move_to_root(cmap_splay *tree, unsigned int x) +{ + if (x == EMPTY) + return; + do + { + unsigned int z, zp; + unsigned int y = tree[x].parent; + if (y == EMPTY) + break; + z = tree[y].parent; + if (z == EMPTY) + { + /* Case 3 */ + tree[x].parent = EMPTY; + tree[y].parent = x; + if (tree[y].left == x) + { + /* Case 3 */ + tree[y].left = tree[x].right; + if (tree[y].left != EMPTY) + tree[tree[y].left].parent = y; + tree[x].right = y; + } + else + { + /* Case 3 - reflected */ + assert(tree[y].right == x); + tree[y].right = tree[x].left; + if (tree[y].right != EMPTY) + tree[tree[y].right].parent = y; + tree[x].left = y; + } + break; + } + + zp = tree[z].parent; + tree[x].parent = zp; + if (zp != EMPTY) { + if (tree[zp].left == z) + tree[zp].left = x; + else + { + assert(tree[zp].right == z); + tree[zp].right = x; + } + } + tree[y].parent = x; + if (tree[y].left == x) + { + tree[y].left = tree[x].right; + if (tree[y].left != EMPTY) + tree[tree[y].left].parent = y; + tree[x].right = y; + if (tree[z].left == y) + { + /* Case 1 */ + tree[z].parent = y; + tree[z].left = tree[y].right; + if (tree[z].left != EMPTY) + tree[tree[z].left].parent = z; + tree[y].right = z; + } + else + { + /* Case 2 - reflected */ + assert(tree[z].right == y); + tree[z].parent = x; + tree[z].right = tree[x].left; + if (tree[z].right != EMPTY) + tree[tree[z].right].parent = z; + tree[x].left = z; + } + } + else + { + assert(tree[y].right == x); + tree[y].right = tree[x].left; + if (tree[y].right != EMPTY) + tree[tree[y].right].parent = y; + tree[x].left = y; + if (tree[z].left == y) + { + /* Case 2 */ + tree[z].parent = x; + tree[z].left = tree[x].right; + if (tree[z].left != EMPTY) + tree[tree[z].left].parent = z; + tree[x].right = z; + } + else + { + /* Case 1 - reflected */ + assert(tree[z].right == y); + tree[z].parent = y; + tree[z].right = tree[y].left; + if (tree[z].right != EMPTY) + tree[tree[z].right].parent = z; + tree[y].left = z; + } + } + } while (1); +} + +static unsigned int delete_node(pdf_cmap *cmap, unsigned int current) +{ + cmap_splay *tree = cmap->tree; + unsigned int parent; + unsigned int replacement; + + assert(current != EMPTY); + + parent = tree[current].parent; + if (tree[current].right == EMPTY) + { + if (parent == EMPTY) + { + replacement = cmap->ttop = tree[current].left; + } + else if (tree[parent].left == current) + { + replacement = tree[parent].left = tree[current].left; + } + else + { + assert(tree[parent].right == current); + replacement = tree[parent].right = tree[current].left; + } + if (replacement != EMPTY) + tree[replacement].parent = parent; + else + replacement = parent; + } + else if (tree[current].left == EMPTY) + { + if (parent == EMPTY) + { + replacement = cmap->ttop = tree[current].right; + } + else if (tree[parent].left == current) + { + replacement = tree[parent].left = tree[current].right; + } + else + { + assert(tree[parent].right == current); + replacement = tree[parent].right = tree[current].right; + } + if (replacement != EMPTY) + tree[replacement].parent = parent; + else + replacement = parent; + } + else + { + /* Hard case, find the in-order predecessor of current */ + int amputee = current; + replacement = tree[current].left; + while (tree[replacement].right != EMPTY) { + amputee = replacement; + replacement = tree[replacement].right; + } + /* Remove replacement from the tree */ + if (amputee == current) + { + tree[amputee].left = tree[replacement].left; + if (tree[amputee].left != EMPTY) + tree[tree[amputee].left].parent = amputee; + } + else + { + tree[amputee].right = tree[replacement].left; + if (tree[amputee].right != EMPTY) + tree[tree[amputee].right].parent = amputee; + } + /* Insert replacement in place of current */ + tree[replacement].parent = parent; + if (parent == EMPTY) + { + tree[replacement].parent = EMPTY; + cmap->ttop = replacement; + } + else if (tree[parent].left == current) + tree[parent].left = replacement; + else + { + assert(tree[parent].right == current); + tree[parent].right = replacement; + } + tree[replacement].left = tree[current].left; + if (tree[replacement].left != EMPTY) + tree[tree[replacement].left].parent = replacement; + tree[replacement].right = tree[current].right; + if (tree[replacement].right != EMPTY) + tree[tree[replacement].right].parent = replacement; + } + + /* current is now unlinked. We need to remove it from our array. */ + cmap->tlen--; + if (current != cmap->tlen) + { + if (replacement == cmap->tlen) + replacement = current; + tree[current] = tree[cmap->tlen]; + parent = tree[current].parent; + if (parent == EMPTY) + cmap->ttop = current; + else if (tree[parent].left == cmap->tlen) + tree[parent].left = current; + else + { + assert(tree[parent].right == cmap->tlen); + tree[parent].right = current; + } + if (tree[current].left != EMPTY) + { + assert(tree[tree[current].left].parent == cmap->tlen); + tree[tree[current].left].parent = current; + } + if (tree[current].right != EMPTY) + { + assert(tree[tree[current].right].parent == cmap->tlen); + tree[tree[current].right].parent = current; + } + } + + /* Return the node that we should continue searching from */ + return replacement; +} + +#ifdef DUMP_SPLAY +static void +dump_splay(cmap_splay *tree, unsigned int node, int depth, const char *pre) +{ + int i; + + if (node == EMPTY) + return; + + for (i = 0; i < depth; i++) + fprintf(stderr, " "); + fprintf(stderr, "%s%d:", pre, node); + if (tree[node].parent == EMPTY) + fprintf(stderr, "^EMPTY"); + else + fprintf(stderr, "^%d", tree[node].parent); + if (tree[node].left == EMPTY) + fprintf(stderr, "<EMPTY"); + else + fprintf(stderr, "<%d", tree[node].left); + if (tree[node].right == EMPTY) + fprintf(stderr, ">EMPTY"); + else + fprintf(stderr, ">%d", tree[node].right); + fprintf(stderr, "(%x,%x,%x,%d)\n", tree[node].low, tree[node].high, tree[node].out, tree[node].many); + assert(tree[node].parent == EMPTY || depth); + assert(tree[node].left == EMPTY || tree[tree[node].left].parent == node); + assert(tree[node].right == EMPTY || tree[tree[node].right].parent == node); + dump_splay(tree, tree[node].left, depth+1, "L"); + dump_splay(tree, tree[node].right, depth+1, "R"); +} +#endif + +enum +{ + TOP = 0, + LEFT = 1, + RIGHT = 2 +}; + +static void walk_splay(cmap_splay *tree, unsigned int node, void (*fn)(cmap_splay *, void *), void *arg) +{ + int from = TOP; + + while (node != EMPTY) + { + switch (from) + { + case TOP: + if (tree[node].left != EMPTY) + { + node = tree[node].left; + from = TOP; + break; + } + /* fallthrough */ + case LEFT: + fn(&tree[node], arg); + if (tree[node].right != EMPTY) + { + node = tree[node].right; + from = TOP; + break; + } + /* fallthrough */ + case RIGHT: + { + unsigned int parent = tree[node].parent; + if (parent == EMPTY) + return; + if (tree[parent].left == node) + from = LEFT; + else + { + assert(tree[parent].right == node); + from = RIGHT; + } + node = parent; + } + } + } +} + +#ifdef CHECK_SPLAY +static void +do_check(cmap_splay *node, void *arg) +{ + cmap_splay *tree = arg; + unsigned int num = node - tree; + assert(node->left == EMPTY || tree[node->left].parent == num); + assert(node->right == EMPTY || tree[node->right].parent == num); +} + +static void +check_splay(cmap_splay *tree, unsigned int node, int depth) +{ + if (node == EMPTY) + return; + assert(tree[node].parent == EMPTY); + walk_splay(tree, node, do_check, tree); +} +#endif + /* * Add a range. */ static void -add_range(fz_context *ctx, pdf_cmap *cmap, unsigned int low, unsigned int high, unsigned int out) +add_range(fz_context *ctx, pdf_cmap *cmap, unsigned int low, unsigned int high, unsigned int out, int check_for_overlap, int many) { + int current; + cmap_splay *tree; + if (low > high) { fz_warn(ctx, "range limits out of range in cmap %s", cmap->cmap_name); return; } - if (low <= 0xFFFF && high <= 0xFFFF && out <= 0xFFFF) + tree = cmap->tree; + + if (cmap->tlen) { - if (cmap->rlen + 1 > cmap->rcap) + unsigned int move = cmap->ttop; + unsigned int gt = EMPTY; + unsigned int lt = EMPTY; + if (check_for_overlap) { - int new_cap = cmap->rcap ? cmap->rcap * 2 : 256; - cmap->ranges = fz_resize_array(ctx, cmap->ranges, new_cap, sizeof *cmap->ranges); - cmap->rcap = new_cap; + /* Check for collision with the current node */ + do + { + current = move; + /* Cases we might meet: + * tree[i]: <-----> + * case 0: <-> + * case 1: <-------> + * case 2: <-------------> + * case 3: <-> + * case 4: <-------> + * case 5: <-> + */ + if (low <= tree[current].low && tree[current].low <= high) + { + /* case 1, reduces to case 0 */ + /* or case 2, deleting the node */ + tree[current].out += high + 1 - tree[current].low; + tree[current].low = high + 1; + if (tree[current].low > tree[current].high) + { + move = delete_node(cmap, current); + current = EMPTY; + continue; + } + } + else if (low <= tree[current].high && tree[current].high <= high) + { + /* case 4, reduces to case 5 */ + tree[current].high = low - 1; + assert(tree[current].low <= tree[current].high); + } + else if (tree[current].low < low && high < tree[current].high) + { + /* case 3, reduces to case 5 */ + int new_high = tree[current].high; + tree[current].high = low-1; + add_range(ctx, cmap, high+1, new_high, tree[current].out + high + 1 - tree[current].low, 0, many); + } + /* Now look for where to move to next (left for case 0, right for case 5) */ + if (tree[current].low > high) { + move = tree[current].left; + gt = current; + } + else + { + move = tree[current].right; + lt = current; + } + } + while (move != EMPTY); + } + else + { + do + { + current = move; + if (tree[current].low > high) + { + move = tree[current].left; + gt = current; + } + else + { + move = tree[current].right; + lt = current; + } + } while (move != EMPTY); + } + /* current is now the node to which we would be adding the new node */ + /* lt is the last node we traversed which is lt the new node. */ + /* gt is the last node we traversed which is gt the new node. */ + + if (!many) + { + /* Check for the 'merge' cases. */ + if (lt != EMPTY && !tree[lt].many && tree[lt].high == low-1 && tree[lt].out - tree[lt].low == out - low) + { + tree[lt].high = high; + if (gt != EMPTY && !tree[gt].many && tree[gt].low == high+1 && tree[gt].out - tree[gt].low == out - low) + { + tree[lt].high = tree[gt].high; + delete_node(cmap, gt); + } + goto exit; + } + if (gt != EMPTY && !tree[gt].many && tree[gt].low == high+1 && tree[gt].out - tree[gt].low == out - low) + { + tree[gt].low = low; + tree[gt].out = out; + goto exit; + } } - cmap->ranges[cmap->rlen].low = low; - cmap->ranges[cmap->rlen].high = high; - cmap->ranges[cmap->rlen].out = out; - cmap->rlen++; } else + current = EMPTY; + + if (cmap->tlen == cmap->tcap) { - if (cmap->xlen + 1 > cmap->xcap) - { - int new_cap = cmap->xcap ? cmap->xcap * 2 : 256; - cmap->xranges = fz_resize_array(ctx, cmap->xranges, new_cap, sizeof *cmap->xranges); - cmap->xcap = new_cap; - } - cmap->xranges[cmap->xlen].low = low; - cmap->xranges[cmap->xlen].high = high; - cmap->xranges[cmap->xlen].out = out; - cmap->xlen++; + int new_cap = cmap->tcap ? cmap->tcap * 2 : 256; + tree = cmap->tree = fz_resize_array(ctx, cmap->tree, new_cap, sizeof *cmap->tree); + cmap->tcap = new_cap; + } + tree[cmap->tlen].low = low; + tree[cmap->tlen].high = high; + tree[cmap->tlen].out = out; + tree[cmap->tlen].parent = current; + tree[cmap->tlen].left = EMPTY; + tree[cmap->tlen].right = EMPTY; + tree[cmap->tlen].many = many; + cmap->tlen++; + if (current == EMPTY) + cmap->ttop = 0; + else if (tree[current].low > high) + tree[current].left = cmap->tlen-1; + else + { + assert(tree[current].high < low); + tree[current].right = cmap->tlen-1; } + move_to_root(tree, cmap->tlen-1); + cmap->ttop = cmap->tlen-1; +exit: + {} +#ifdef CHECK_SPLAY + check_splay(cmap->tree, cmap->ttop, 0); +#endif +#ifdef DUMP_SPLAY + dump_splay(cmap->tree, cmap->ttop, 0, ""); +#endif } /* @@ -131,20 +615,20 @@ add_range(fz_context *ctx, pdf_cmap *cmap, unsigned int low, unsigned int high, static void add_mrange(fz_context *ctx, pdf_cmap *cmap, unsigned int low, int *out, int len) { - int i; - if (cmap->mlen + 1 > cmap->mcap) + int out_pos; + + if (cmap->dlen + len + 1 > cmap->dcap) { - int new_cap = cmap->mcap ? cmap->mcap * 2 : 256; - cmap->mranges = fz_resize_array(ctx, cmap->mranges, new_cap, sizeof *cmap->mranges); - cmap->mcap = new_cap; + int new_cap = cmap->dcap ? cmap->dcap * 2 : 256; + cmap->dict = fz_resize_array(ctx, cmap->dict, new_cap, sizeof *cmap->dict); + cmap->dcap = new_cap; } - cmap->mranges[cmap->mlen].low = low; - cmap->mranges[cmap->mlen].len = len; - for (i = 0; i < len; ++i) - cmap->mranges[cmap->mlen].out[i] = out[i]; - for (; i < PDF_MRANGE_CAP; ++i) - cmap->mranges[cmap->mlen].out[i] = 0; - cmap->mlen++; + out_pos = cmap->dlen; + cmap->dict[out_pos] = len; + memcpy(&cmap->dict[out_pos+1], out, sizeof(int)*len); + cmap->dlen += len + 1; + + add_range(ctx, cmap, low, low, out_pos, 1, 1); } /* @@ -153,7 +637,7 @@ add_mrange(fz_context *ctx, pdf_cmap *cmap, unsigned int low, int *out, int len) void pdf_map_range_to_range(fz_context *ctx, pdf_cmap *cmap, unsigned int low, unsigned int high, int out) { - add_range(ctx, cmap, low, high, out); + add_range(ctx, cmap, low, high, out, 1, 0); } /* @@ -164,7 +648,7 @@ pdf_map_one_to_many(fz_context *ctx, pdf_cmap *cmap, unsigned int low, int *valu { if (len == 1) { - add_range(ctx, cmap, low, low, values[0]); + add_range(ctx, cmap, low, low, values[0], 1, 0); return; } @@ -175,7 +659,7 @@ pdf_map_one_to_many(fz_context *ctx, pdf_cmap *cmap, unsigned int low, int *valu values[1] >= 0xDC00 && values[1] <= 0xDFFF) { int rune = ((values[0] - 0xD800) << 10) + (values[1] - 0xDC00) + 0x10000; - add_range(ctx, cmap, low, low, rune); + add_range(ctx, cmap, low, low, rune, 1, 0); return; } @@ -188,70 +672,71 @@ pdf_map_one_to_many(fz_context *ctx, pdf_cmap *cmap, unsigned int low, int *valu add_mrange(ctx, cmap, low, values, len); } -/* - * Sort the input ranges. - * Merge contiguous ranges. - */ - -static int cmprange(const void *va, const void *vb) +static void +count_node_types(cmap_splay *node, void *arg) { - unsigned int a = ((const pdf_range*)va)->low; - unsigned int b = ((const pdf_range*)vb)->low; - return a < b ? -1 : a > b ? 1 : 0; -} + int *counts = (int *)arg; -static int cmpxrange(const void *va, const void *vb) -{ - unsigned int a = ((const pdf_xrange*)va)->low; - unsigned int b = ((const pdf_xrange*)vb)->low; - return a < b ? -1 : a > b ? 1 : 0; + if (node->many) + counts[2]++; + else if (node->low <= 0xffff && node->high <= 0xFFFF && node->out <= 0xFFFF) + counts[0]++; + else + counts[1]++; } -static int cmpmrange(const void *va, const void *vb) +static void +copy_node_types(cmap_splay *node, void *arg) { - unsigned int a = ((const pdf_mrange*)va)->low; - unsigned int b = ((const pdf_mrange*)vb)->low; - return a < b ? -1 : a > b ? 1 : 0; + pdf_cmap *cmap = (pdf_cmap *)arg; + + if (node->many) + { + assert(node->low == node->high); + cmap->mranges[cmap->mlen].low = node->low; + cmap->mranges[cmap->mlen].out = node->out; + cmap->mlen++; + } + else if (node->low <= 0xffff && node->high <= 0xFFFF && node->out <= 0xFFFF) + { + cmap->ranges[cmap->rlen].low = node->low; + cmap->ranges[cmap->rlen].high = node->high; + cmap->ranges[cmap->rlen].out = node->out; + cmap->rlen++; + } + else + { + cmap->xranges[cmap->xlen].low = node->low; + cmap->xranges[cmap->xlen].high = node->high; + cmap->xranges[cmap->xlen].out = node->out; + cmap->xlen++; + } } void pdf_sort_cmap(fz_context *ctx, pdf_cmap *cmap) { - pdf_range *a, *b; - pdf_xrange *x, *y; + int counts[3]; - if (cmap->rlen) - { - qsort(cmap->ranges, cmap->rlen, sizeof *cmap->ranges, cmprange); - a = cmap->ranges; - for (b = a + 1; b < cmap->ranges + cmap->rlen; ++b) - { - if (b->low == a->high + 1 && b->out == a->out + (a->high - a->low) + 1) - a->high = b->high; - else - *(++a) = *b; - } - cmap->rlen = a - cmap->ranges + 1; - } + if (cmap->tree == NULL) + return; - if (cmap->xlen) - { - qsort(cmap->xranges, cmap->xlen, sizeof *cmap->xranges, cmpxrange); - x = cmap->xranges; - for (y = x + 1; y < cmap->xranges + cmap->xlen; ++y) - { - if (y->low == x->high + 1 && y->out == x->out + (x->high - x->low) + 1) - x->high = y->high; - else - *(++x) = *y; - } - cmap->xlen = x - cmap->xranges + 1; - } + counts[0] = 0; + counts[1] = 0; + counts[2] = 0; + walk_splay(cmap->tree, cmap->ttop, count_node_types, &counts); - if (cmap->mlen) - { - qsort(cmap->mranges, cmap->mlen, sizeof *cmap->mranges, cmpmrange); - } + cmap->ranges = fz_malloc_array(ctx, counts[0], sizeof(*cmap->ranges)); + cmap->rcap = counts[0]; + cmap->xranges = fz_malloc_array(ctx, counts[1], sizeof(*cmap->xranges)); + cmap->xcap = counts[1]; + cmap->mranges = fz_malloc_array(ctx, counts[2], sizeof(*cmap->mranges)); + cmap->mcap = counts[2]; + + walk_splay(cmap->tree, cmap->ttop, copy_node_types, cmap); + + fz_free(ctx, cmap->tree); + cmap->tree = NULL; } /* @@ -348,9 +833,11 @@ pdf_lookup_cmap_full(pdf_cmap *cmap, unsigned int cpt, int *out) l = m + 1; else { - for (i = 0; i < mranges[m].len; ++i) - out[i] = mranges[m].out[i]; - return mranges[m].len; + int *ptr = &cmap->dict[cmap->mranges[m].out]; + unsigned int len = (unsigned int)*ptr++; + for (i = 0; i < len; ++i) + out[i] = *ptr++; + return len; } } |