From 632c9b6ad00a2109c513f3da10d1d56a97617df9 Mon Sep 17 00:00:00 2001
From: Tor Andersson
Date: Tue, 30 Sep 2014 13:25:00 +0200
Subject: Fix 695501: Intelligent cmap remapping.

Use the actual ranges from the cpt-to-gid cmap to optimize the remapping
of ToUnicode cmaps from cpt-to-unicode into gid-to-unicode format.
---
 source/pdf/pdf-unicode.c | 96 +++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 55 deletions(-)

(limited to 'source/pdf')

diff --git a/source/pdf/pdf-unicode.c b/source/pdf/pdf-unicode.c
index ffe5f6b7..6b893051 100644
--- a/source/pdf/pdf-unicode.c
+++ b/source/pdf/pdf-unicode.c
@@ -2,83 +2,69 @@
 
 /* Load or synthesize ToUnicode map for fonts */
 
-static void find_min_max_cpt(pdf_cmap *cmap, unsigned int *minp, unsigned int *maxp)
+static void
+pdf_remap_cmap_range(fz_context *ctx, pdf_cmap *ucs_from_gid,
+	unsigned int cpt, unsigned int gid, unsigned int n, pdf_cmap *ucs_from_cpt)
 {
-	unsigned int min = UINT_MAX;
-	unsigned int max = 0;
-	int i;
+	unsigned int k;
+	int ucsbuf[8];
+	int ucslen;
 
-	for (i = 0; i < cmap->rlen; ++i)
+	for (k = 0; k <= n; ++k)
 	{
-		if (cmap->ranges[i].low < min)
-			min = cmap->ranges[i].low;
-		if (cmap->ranges[i].high > max)
-			max = cmap->ranges[i].high;
+		ucslen = pdf_lookup_cmap_full(ucs_from_cpt, cpt + k, ucsbuf);
+		if (ucslen == 1)
+			pdf_map_range_to_range(ctx, ucs_from_gid, gid + k, gid + k, ucsbuf[0]);
+		else if (ucslen > 1)
+			pdf_map_one_to_many(ctx, ucs_from_gid, gid + k, ucsbuf, ucslen);
 	}
+}
+
+static pdf_cmap *
+pdf_remap_cmap(fz_context *ctx, pdf_cmap *gid_from_cpt, pdf_cmap *ucs_from_cpt)
+{
+	pdf_cmap *ucs_from_gid;
+	unsigned int i, a, b, x;
 
-	for (i = 0; i < cmap->xlen; ++i)
+	ucs_from_gid = pdf_new_cmap(ctx);
+
+	if (gid_from_cpt->usecmap)
+		ucs_from_gid->usecmap = pdf_remap_cmap(ctx, gid_from_cpt->usecmap, ucs_from_cpt);
+
+	for (i = 0; i < gid_from_cpt->rlen; ++i)
 	{
-		if (cmap->xranges[i].low < min)
-			min = cmap->xranges[i].low;
-		if (cmap->xranges[i].high > max)
-			max = cmap->xranges[i].high;
+		a = gid_from_cpt->ranges[i].low;
+		b = gid_from_cpt->ranges[i].high;
+		x = gid_from_cpt->ranges[i].out;
+		pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt);
 	}
 
-	for (i = 0; i < cmap->mlen; ++i)
+	for (i = 0; i < gid_from_cpt->xlen; ++i)
 	{
-		if (cmap->mranges[i].low < min)
-			min = cmap->mranges[i].low;
-		if (cmap->mranges[i].low > max)
-			max = cmap->mranges[i].low;
+		a = gid_from_cpt->xranges[i].low;
+		b = gid_from_cpt->xranges[i].high;
+		x = gid_from_cpt->xranges[i].out;
+		pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt);
 	}
 
-	*minp = min;
-	*maxp = max;
+	/* Font encoding CMaps don't have one-to-many mappings, so we can ignore the mranges. */
+
+	pdf_sort_cmap(ctx, ucs_from_gid);
+
+	return ucs_from_gid;
 }
 
 void
 pdf_load_to_unicode(pdf_document *doc, pdf_font_desc *font,
 	char **strings, char *collection, pdf_obj *cmapstm)
 {
-	unsigned int cpt, min, max;
-	int gid;
-	int ucsbuf[8];
-	int ucslen;
-	int i;
 	fz_context *ctx = doc->ctx;
+	unsigned int cpt;
 
 	if (pdf_is_stream(doc, pdf_to_num(cmapstm), pdf_to_gen(cmapstm)))
 	{
-		pdf_cmap *gid_from_cpt = font->encoding;
 		pdf_cmap *ucs_from_cpt = pdf_load_embedded_cmap(doc, cmapstm);
-
-		font->to_unicode = pdf_new_cmap(ctx);
-
-		/* in case the code space range is much larger than the actual number of characters */
-		find_min_max_cpt(gid_from_cpt, &min, &max);
-
-		for (i = 0; i < gid_from_cpt->codespace_len; ++i)
-		{
-			unsigned int l = gid_from_cpt->codespace[i].low;
-			unsigned int h = gid_from_cpt->codespace[i].high;
-			l = l < min ? min : l > max ? max : l;
-			h = h < min ? min : h > max ? max : h;
-			for (cpt = l; cpt <= h; ++cpt)
-			{
-				gid = pdf_lookup_cmap(gid_from_cpt, cpt);
-				if (gid >= 0)
-				{
-					ucslen = pdf_lookup_cmap_full(ucs_from_cpt, cpt, ucsbuf);
-					if (ucslen == 1)
-						pdf_map_range_to_range(ctx, font->to_unicode, gid, gid, ucsbuf[0]);
-					if (ucslen > 1)
-						pdf_map_one_to_many(ctx, font->to_unicode, gid, ucsbuf, ucslen);
-				}
-			}
-		}
-
-		pdf_sort_cmap(ctx, font->to_unicode);
-
+		font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt);
 		pdf_drop_cmap(ctx, ucs_from_cpt);
 		font->size += pdf_cmap_size(ctx, font->to_unicode);
 	}
-- 
cgit v1.2.3