From f0c292559fd1d484b0c71fb75d49f2c820234424 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Mon, 4 Apr 2011 02:27:31 +0200 Subject: pdf: Add workaround for S22PDF lying about encodings. A popular pdf creator from china has produced many files without embedded fonts, which state that they use WinAnsiEncoding, but in fact use cp936. This workaround detects some of these cases. Thanks to SumatraPDF for the patch. --- mupdf/pdf_font.c | 72 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/mupdf/pdf_font.c b/mupdf/pdf_font.c index 67af2b38..75bc141f 100644 --- a/mupdf/pdf_font.c +++ b/mupdf/pdf_font.c @@ -253,7 +253,6 @@ loadsimplefont(pdf_fontdesc **fontdescp, pdf_xref *xref, fz_obj *dict) fz_obj *widths; unsigned short *etable = nil; pdf_fontdesc *fontdesc; - fz_bbox bbox; FT_Face face; FT_CharMap cmap; int kind; @@ -284,25 +283,49 @@ loadsimplefont(pdf_fontdesc **fontdescp, pdf_xref *xref, fz_obj *dict) if (error) goto cleanup; - face = fontdesc->font->ftface; - kind = ftkind(face); - - pdf_logfont("ft name '%s' '%s'\n", face->family_name, face->style_name); - - bbox.x0 = (face->bbox.xMin * 1000) / face->units_per_EM; - bbox.y0 = (face->bbox.yMin * 1000) / face->units_per_EM; - bbox.x1 = (face->bbox.xMax * 1000) / face->units_per_EM; - bbox.y1 = (face->bbox.yMax * 1000) / face->units_per_EM; + /* Some chinese documents mistakenly consider WinAnsiEncoding to be codepage 936 */ + if (!*fontdesc->font->name && + !fz_dictgets(dict, "ToUnicode") && + !strcmp(fz_toname(fz_dictgets(dict, "Encoding")), "WinAnsiEncoding") && + fz_toint(fz_dictgets(descriptor, "Flags")) == 4) + { + /* note: without the comma, pdf_loadfontdescriptor would prefer /FontName over /BaseFont */ + char *cp936fonts[] = { + "\xCB\xCE\xCC\xE5", "SimSun,Regular", + "\xBA\xDA\xCC\xE5", "SimHei,Regular", + "\xBF\xAC\xCC\xE5_GB2312", "SimKai,Regular", + "\xB7\xC2\xCB\xCE_GB2312", "SimFang,Regular", + "\xC1\xA5\xCA\xE9", "SimLi,Regular", + NULL + }; + for (i = 0; cp936fonts[i]; i += 2) + if (!strcmp(basefont, cp936fonts[i])) + break; + if (cp936fonts[i]) + { + fz_warn("workaround for S22PDF lying about chinese font encodings"); + pdf_dropfont(fontdesc); + fontdesc = pdf_newfontdesc(); + error = pdf_loadfontdescriptor(fontdesc, xref, descriptor, "Adobe-GB1", cp936fonts[i+1]); + error |= pdf_loadsystemcmap(&fontdesc->encoding, "GBK-EUC-H"); + error |= pdf_loadsystemcmap(&fontdesc->tounicode, "Adobe-GB1-UCS2"); + error |= pdf_loadsystemcmap(&fontdesc->tottfcmap, "Adobe-GB1-UCS2"); + if (error) + return fz_rethrow(error, "cannot load font"); - pdf_logfont("ft bbox [%d %d %d %d]\n", bbox.x0, bbox.y0, bbox.x1, bbox.y1); + face = fontdesc->font->ftface; + kind = ftkind(face); + goto skip_encoding; + } + } - if (bbox.x0 == bbox.x1) - fz_setfontbbox(fontdesc->font, -1000, -1000, 2000, 2000); - else - fz_setfontbbox(fontdesc->font, bbox.x0, bbox.y0, bbox.x1, bbox.y1); + face = fontdesc->font->ftface; + kind = ftkind(face); /* Encoding */ + pdf_logfont("ft name '%s' '%s'\n", face->family_name, face->style_name); + symbolic = fontdesc->flags & 4; if (face->num_charmaps > 0) @@ -490,6 +513,8 @@ loadsimplefont(pdf_fontdesc **fontdescp, pdf_xref *xref, fz_obj *dict) if (error) fz_catch(error, "cannot load tounicode"); +skip_encoding: + /* Widths */ pdf_setdefaulthmtx(fontdesc, fontdesc->missingwidth); @@ -548,7 +573,6 @@ loadcidfont(pdf_fontdesc **fontdescp, pdf_xref *xref, fz_obj *dict, fz_obj *enco fz_obj *descriptor; pdf_fontdesc *fontdesc; FT_Face face; - fz_bbox bbox; int kind; char collection[256]; char *basefont; @@ -603,18 +627,6 @@ loadcidfont(pdf_fontdesc **fontdescp, pdf_xref *xref, fz_obj *dict, fz_obj *enco face = fontdesc->font->ftface; kind = ftkind(face); - bbox.x0 = (face->bbox.xMin * 1000) / face->units_per_EM; - bbox.y0 = (face->bbox.yMin * 1000) / face->units_per_EM; - bbox.x1 = (face->bbox.xMax * 1000) / face->units_per_EM; - bbox.y1 = (face->bbox.yMax * 1000) / face->units_per_EM; - - pdf_logfont("ft bbox [%d %d %d %d]\n", bbox.x0, bbox.y0, bbox.x1, bbox.y1); - - if (bbox.x0 == bbox.x1) - fz_setfontbbox(fontdesc->font, -1000, -1000, 2000, 2000); - else - fz_setfontbbox(fontdesc->font, bbox.x0, bbox.y0, bbox.x1, bbox.y1); - /* Check for DynaLab fonts that must use hinting */ if (kind == TRUETYPE) { @@ -855,7 +867,6 @@ pdf_loadfontdescriptor(pdf_fontdesc *fontdesc, pdf_xref *xref, fz_obj *dict, cha { fz_error error; fz_obj *obj1, *obj2, *obj3, *obj; - fz_rect bbox; char *fontname; char *origname; @@ -877,9 +888,6 @@ pdf_loadfontdescriptor(pdf_fontdesc *fontdesc, pdf_xref *xref, fz_obj *dict, cha fontdesc->xheight = fz_toreal(fz_dictgets(dict, "XHeight")); fontdesc->missingwidth = fz_toreal(fz_dictgets(dict, "MissingWidth")); - bbox = pdf_torect(fz_dictgets(dict, "FontBBox")); - pdf_logfont("bbox [%g %g %g %g]\n", bbox.x0, bbox.y0, bbox.x1, bbox.y1); - pdf_logfont("flags %d\n", fontdesc->flags); obj1 = fz_dictgets(dict, "FontFile"); -- cgit v1.2.3