diff options
-rw-r--r-- | Jamfile | 2 | ||||
-rw-r--r-- | TODO | 20 | ||||
-rw-r--r-- | base/rune.c | 168 | ||||
-rw-r--r-- | filter/faxd.c | 4 | ||||
-rw-r--r-- | include/fitz/base.h | 6 | ||||
-rw-r--r-- | include/mupdf/content.h | 3 | ||||
-rw-r--r-- | include/mupdf/page.h | 21 | ||||
-rw-r--r-- | include/mupdf/rsrc.h | 3 | ||||
-rw-r--r-- | mupdf/build.c | 126 | ||||
-rw-r--r-- | mupdf/font.c | 65 | ||||
-rw-r--r-- | mupdf/fontfile.c | 1 | ||||
-rw-r--r-- | mupdf/interpret.c | 59 | ||||
-rw-r--r-- | mupdf/parse.c | 3 | ||||
-rw-r--r-- | mupdf/type3.c | 53 | ||||
-rw-r--r-- | mupdf/unicode.c | 278 | ||||
-rw-r--r-- | render/renderimage.c | 46 | ||||
-rw-r--r-- | test/pdfrip.c | 113 | ||||
-rw-r--r-- | test/x11pdf.c | 35 |
18 files changed, 705 insertions, 301 deletions
@@ -18,6 +18,7 @@ Library libfitz : #util/strlcat.c # base runtime + base/rune.c base/error.c base/memory.c base/md5.c @@ -113,6 +114,7 @@ Library libmupdf : mupdf/fontagl.c mupdf/fontenc.c mupdf/fontfile.c + mupdf/unicode.c mupdf/font.c mupdf/type3.c mupdf/colorspace.c @@ -1,37 +1,36 @@ -colorspace conversions +colorspace conversions (v2) - cal* - iccbased - how to normalize Lab components to 0..1 - fast color cubes - how to cache colorspace cubes (what key?) -image rendering +image rendering (v2) - tiles - dct case - better filter than box + - lazy decoding shadings - ... jeong ... rendering - - save non-transformed bbox in nodes - - explicit mask field in fz_renderer .. general cleanup + - bbox culling (cache bbox in over node?) + - image mask + color case - merge gka optims - - optimize! optimize! optimize! (special case 1 and 4 channel cases) + - optimize inner rendering loops + - optimize image load/decode/scale + - special-case optims (1, 2 and 4) + - cpu-specific optims parser - - text clip mode - - split content streams (TJ objects) - resource dict generate fake ids - try to clean up colorspace/material handling in interpreter - - tounicode clean up - make source ansi c89 / pedantic - reference count everything - standard cleanup mechanism - - naming conventions (fz_new/renew) - - design by contract - split into private and public - comments and documentation @@ -39,5 +38,4 @@ clean up cache global cache for cmaps and fontfiles (emb+sys) render cache (link-nodes and scaled images) - profile font cache (esp with t3 fonts) diff --git a/base/rune.c b/base/rune.c new file mode 100644 index 00000000..8b886637 --- /dev/null +++ b/base/rune.c @@ -0,0 +1,168 @@ +enum +{ + UTFmax = 3, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0x80 /* decoding error in UTF */ +}; + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + Bad = Runeerror, +}; + +int +chartorune(int *rune, char *str) +{ + int c, c1, c2; + int l; + + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(unsigned char*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(unsigned char*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(unsigned char*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +runetochar(char *str, int *rune) +{ + int c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; +} + +int +runelen(int c) +{ + int rune; + char str[10]; + + rune = c; + return runetochar(str, &rune); +} + +int +runenlen(int *r, int nrune) +{ + int nb, c; + + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + nb += 3; + } + return nb; +} + +int +fullrune(char *str, int n) +{ + int c; + + if(n > 0) { + c = *(unsigned char*)str; + if(c < Tx) + return 1; + if(n > 1) + if(c < T3 || n > 2) + return 1; + } + return 0; +} + diff --git a/filter/faxd.c b/filter/faxd.c index 62793dab..d8686a08 100644 --- a/filter/faxd.c +++ b/filter/faxd.c @@ -431,6 +431,10 @@ eol: goto loop; rtc: + i = (32 - fax->bidx) / 8; + while (i-- && in->rp > in->bp) + in->rp --; + out->eof = 1; return fz_iodone; } diff --git a/include/fitz/base.h b/include/fitz/base.h index da0132d4..562bb1df 100644 --- a/include/fitz/base.h +++ b/include/fitz/base.h @@ -21,6 +21,12 @@ #define STRIDE(n, bcp) (((bpc) * (n) + 7) / 8) +int chartorune(int *rune, char *str); +int runetochar(char *str, int *rune); +int runelen(long c); +int runenlen(int *r, int nrune); +int fullrune(char *str, int n); + typedef struct fz_error_s fz_error; struct fz_error_s diff --git a/include/mupdf/content.h b/include/mupdf/content.h index 6dae61da..f891c3c7 100644 --- a/include/mupdf/content.h +++ b/include/mupdf/content.h @@ -66,15 +66,18 @@ struct pdf_csi_s fz_obj *stack[32]; int top; int xbalance; + fz_obj *array; /* path object state */ fz_pathnode *path; fz_pathnode *clip; /* text object state */ + fz_node *textclip; fz_textnode *text; fz_matrix tlm; fz_matrix tm; + int textmode; fz_tree *tree; }; diff --git a/include/mupdf/page.h b/include/mupdf/page.h index 0dd1001b..3bb27db6 100644 --- a/include/mupdf/page.h +++ b/include/mupdf/page.h @@ -7,6 +7,8 @@ typedef struct pdf_outline_s pdf_outline; typedef struct pdf_nametree_s pdf_nametree; typedef struct pdf_pagetree_s pdf_pagetree; typedef struct pdf_page_s pdf_page; +typedef struct pdf_textline_s pdf_textline; +typedef struct pdf_textchar_s pdf_textchar; struct pdf_outlinetree_s { @@ -48,6 +50,19 @@ struct pdf_page_s int rotate; fz_obj *resources; fz_tree *tree; + pdf_textline *text; +}; + +struct pdf_textchar_s +{ + int x, y, c; +}; + +struct pdf_textline_s +{ + int len, cap; + pdf_textchar *text; + pdf_textline *next; }; /* outline.c */ @@ -73,3 +88,9 @@ void pdf_droppagetree(pdf_pagetree *pages); fz_error *pdf_loadpage(pdf_page **pagep, pdf_xref *xref, fz_obj *ref); void pdf_droppage(pdf_page *page); +/* unicode.c */ +fz_error *pdf_loadtextfromtree(pdf_textline **linep, fz_tree *tree); +void pdf_debugtextline(pdf_textline *line); +fz_error *pdf_newtextline(pdf_textline **linep); +void pdf_droptextline(pdf_textline *line); + diff --git a/include/mupdf/rsrc.h b/include/mupdf/rsrc.h index bb29dc8a..eaf20fb2 100644 --- a/include/mupdf/rsrc.h +++ b/include/mupdf/rsrc.h @@ -161,6 +161,9 @@ fz_error *pdf_loadembeddedcmap(fz_cmap **cmapp, pdf_xref *xref, fz_obj *stmref); fz_error *pdf_loadsystemcmap(fz_cmap **cmapp, char *name); fz_error *pdf_makeidentitycmap(fz_cmap **cmapp, int wmode, int bytes); +/* unicode.c */ +fz_error *pdf_loadtounicode(pdf_font *font, pdf_xref *xref, char **strings, char *collection, fz_obj *cmapstm); + /* fontfile.c */ fz_error *pdf_loadbuiltinfont(pdf_font *font, char *basefont); fz_error *pdf_loadembeddedfont(pdf_font *font, pdf_xref *xref, fz_obj *stmref); diff --git a/mupdf/build.c b/mupdf/build.c index 3b602e89..3b65ff6a 100644 --- a/mupdf/build.c +++ b/mupdf/build.c @@ -195,6 +195,28 @@ addcolorshape(pdf_gstate *gs, fz_node *shape, fz_colorspace *cs, float *v) return nil; } +static fz_error * +addinvisibleshape(pdf_gstate *gs, fz_node *shape) +{ + fz_error *error; + fz_node *mask; + fz_pathnode *path; + + error = fz_newmasknode(&mask); + if (error) return error; + + error = fz_newpathnode(&path); + if (error) return error; + error = fz_endpath(path, FZ_FILL, nil, nil); + if (error) return error; + + fz_insertnode(mask, (fz_node*)path); + fz_insertnode(mask, shape); + fz_insertnode(gs->head, mask); + + return nil; +} + static fz_matrix getmatrix(fz_node *node) { if (node->parent) @@ -401,62 +423,6 @@ pdf_showimage(pdf_csi *csi, pdf_image *img) return nil; } -#if 0 - -BMC ... EMC object nesting can be completely fucked up -and out of sync with graphics object nesting. - -fz_error * -pdf_beginmarkedcontent(pdf_gstate *gs, fz_node *meta) -{ - fz_error *error; - fz_node *over; - - error = fz_newovernode(&over); - if (error) return error; - - fz_insertnode(gs->head, meta); - fz_insertnode(meta, over); - gs->head = over; - -printf("begin mc meta=%p over=%p\n", meta, over); -{ -fz_node *node = gs->head; - while (node) - { -printf(" node=%p ismeta=%d\n", node, fz_ismetanode(node)); - node = node->parent; - } -printf("okay.\n"); -} - - return nil; -} - -fz_error * -pdf_endmarkedcontent(pdf_gstate *gs) -{ - fz_node *node = gs->head; - -printf("end mc\n"); -printf(" node=%p ismeta=%d\n", node, fz_ismetanode(node)); - - while (node && !fz_ismetanode(node)) - { -printf(" node=%p ismeta=%d\n", node, fz_ismetanode(node)); - node = node->parent; - } - - if (node == nil) - return fz_throw("syntaxerror: unbalanced marked content"); - - gs->head = node->parent; - - return nil; -} - -#endif - fz_error * pdf_showpath(pdf_csi *csi, int doclose, int dofill, int dostroke, int evenodd) @@ -525,18 +491,40 @@ pdf_flushtext(pdf_csi *csi) pdf_gstate *gstate = csi->gstate + csi->gtop; fz_error *error; - /* invisible */ - if (gstate->render == 3) - return nil; - - else if (gstate->render != 0) - fz_warn("unimplemented text render mode: %d", gstate->render); - if (csi->text) { - error = pdf_addfillshape(gstate, (fz_node*)csi->text); - if (error) - return error; + + /* invisible */ + switch (csi->textmode) + { + case 0: /* fill */ + case 1: /* stroke */ + case 2: /* stroke + fill */ + error = pdf_addfillshape(gstate, (fz_node*)csi->text); + if (error) + return error; + break; + + case 3: /* invisible */ + error = addinvisibleshape(gstate, (fz_node*)csi->text); + if (error) + return error; + break; + + case 4: /* fill + clip */ + case 5: /* stroke + clip */ + case 6: /* stroke + fill + clip */ + case 7: /* invisible clip */ + if (!csi->textclip) + { + error = fz_newovernode(&csi->textclip); + if (error) + return error; + } + fz_insertnode(csi->textclip, (fz_node*)csi->text); + break; + } + csi->text = nil; } @@ -570,13 +558,14 @@ showglyph(pdf_csi *csi, int cid) trm = fz_concat(tsm, csi->tm); - /* flush buffered text if face or matrix has changed */ + /* flush buffered text if face or matrix or rendermode has changed */ if (!csi->text || ((fz_font*)font) != csi->text->font || fabs(trm.a - csi->text->trm.a) > FLT_EPSILON || fabs(trm.b - csi->text->trm.b) > FLT_EPSILON || fabs(trm.c - csi->text->trm.c) > FLT_EPSILON || - fabs(trm.d - csi->text->trm.d) > FLT_EPSILON) + fabs(trm.d - csi->text->trm.d) > FLT_EPSILON || + gstate->render != csi->textmode) { error = pdf_flushtext(csi); if (error) return error; @@ -587,6 +576,7 @@ showglyph(pdf_csi *csi, int cid) csi->text->trm = trm; csi->text->trm.e = 0; csi->text->trm.f = 0; + csi->textmode = gstate->render; } /* add glyph to textobject */ diff --git a/mupdf/font.c b/mupdf/font.c index d9683596..5f4f5046 100644 --- a/mupdf/font.c +++ b/mupdf/font.c @@ -1,11 +1,6 @@ #include <fitz.h> #include <mupdf.h> -/* - * TODO: substitution fonts when no exact match is found. - * base on a) cid system info and b) fontdescriptor flags - */ - #include <ft2build.h> #include FT_FREETYPE_H #include <freetype/internal/ftobjs.h> @@ -241,9 +236,7 @@ loadsimplefont(pdf_font **fontp, pdf_xref *xref, fz_obj *dict) fz_obj *descriptor = nil; fz_obj *encoding = nil; fz_obj *widths = nil; - fz_obj *tounicode = nil; unsigned short *etable = nil; - unsigned short *utable = nil; pdf_font *font; FT_Face face; FT_CharMap cmap; @@ -441,29 +434,11 @@ printf(" builtin encoding\n"); font->ncidtogid = 256; font->cidtogid = etable; - /* - * ToUnicode - */ - - utable = fz_malloc(sizeof(unsigned short) * 256); - if (!utable) + error = pdf_loadtounicode(font, xref, + estrings, nil, fz_dictgets(dict, "ToUnicode")); + if (error) goto cleanup; - for (i = 0; i < 256; i++) - if (estrings[i]) - utable[i] = pdf_lookupagl(estrings[i]); - else - utable[i] = i; - - tounicode = fz_dictgets(dict, "ToUnicode"); - if (fz_isindirect(tounicode)) - { -printf(" load tounicode cmap for simple font\n"); - } - - font->ncidtoucs = 256; - font->cidtoucs = utable; - /* * Widths */ @@ -520,7 +495,6 @@ printf("\n"); return nil; cleanup: - fz_free(utable); fz_free(etable); if (widths) fz_dropobj(widths); @@ -534,7 +508,7 @@ cleanup: */ static fz_error * -loadcidfont(pdf_font **fontp, pdf_xref *xref, fz_obj *dict, fz_obj *encoding) +loadcidfont(pdf_font **fontp, pdf_xref *xref, fz_obj *dict, fz_obj *encoding, fz_obj *tounicode) { fz_error *error; fz_obj *widths = nil; @@ -678,28 +652,7 @@ printf(" cidtogidmap %d\n", len / 2); /* win: 3,4 3,6 3,3 3,2 3,1 3,5 */ } - /* - * ToUnicode - */ - - if (fz_dictgets(dict, "ToUnicode")) - printf(" load tounicode for cid-font"); - - if (!strcmp(collection, "Adobe-CNS1")) - error = pdf_loadsystemcmap(&font->tounicode, "Adobe-CNS1-UCS2"); - else if (!strcmp(collection, "Adobe-GB1")) - error = pdf_loadsystemcmap(&font->tounicode, "Adobe-GB1-UCS2"); - else if (!strcmp(collection, "Adobe-Japan1")) - error = pdf_loadsystemcmap(&font->tounicode, "Adobe-Japan1-UCS2"); - else if (!strcmp(collection, "Adobe-Japan2")) - error = pdf_loadsystemcmap(&font->tounicode, "Adobe-Japan2-UCS2"); - else if (!strcmp(collection, "Adobe-Korea1")) - error = pdf_loadsystemcmap(&font->tounicode, "Adobe-Korea1-UCS2"); - else - { - printf(" unknown character collection\n"); - error = nil; - } + error = pdf_loadtounicode(font, xref, nil, collection, tounicode); if (error) goto cleanup; @@ -840,6 +793,7 @@ loadtype0(pdf_font **fontp, pdf_xref *xref, fz_obj *dict) fz_obj *dfont; fz_obj *subtype; fz_obj *encoding; + fz_obj *tounicode; dfonts = fz_dictgets(dict, "DescendantFonts"); error = pdf_resolve(&dfonts, xref); @@ -851,13 +805,14 @@ loadtype0(pdf_font **fontp, pdf_xref *xref, fz_obj *dict) if (error) return fz_dropobj(dfonts), error; - encoding = fz_dictgets(dict, "Encoding"); subtype = fz_dictgets(dfont, "Subtype"); + encoding = fz_dictgets(dict, "Encoding"); + tounicode = fz_dictgets(dict, "ToUnicode"); if (!strcmp(fz_toname(subtype), "CIDFontType0")) - error = loadcidfont(fontp, xref, dfont, encoding); + error = loadcidfont(fontp, xref, dfont, encoding, tounicode); else if (!strcmp(fz_toname(subtype), "CIDFontType2")) - error = loadcidfont(fontp, xref, dfont, encoding); + error = loadcidfont(fontp, xref, dfont, encoding, tounicode); else error = fz_throw("syntaxerror: unknown cid font type"); diff --git a/mupdf/fontfile.c b/mupdf/fontfile.c index a17bbfdf..6c525b74 100644 --- a/mupdf/fontfile.c +++ b/mupdf/fontfile.c @@ -5,7 +5,6 @@ #include <ft2build.h> #include FT_FREETYPE_H -#include <fontconfig/fontconfig.h> static FT_Library ftlib = nil; diff --git a/mupdf/interpret.c b/mupdf/interpret.c index e377c5b6..5e0e12a9 100644 --- a/mupdf/interpret.c +++ b/mupdf/interpret.c @@ -16,7 +16,7 @@ pdf_newcsi(pdf_csi **csip, int maskonly) csi->gtop = 0; csi->top = 0; - + csi->array = nil; csi->xbalance = 0; error = fz_newpathnode(&csi->path); @@ -44,6 +44,8 @@ pdf_newcsi(pdf_csi **csip, int maskonly) csi->clip = nil; + csi->textclip = nil; + csi->textmode = 0; csi->text = nil; csi->tm = fz_identity(); csi->tlm = fz_identity(); @@ -65,7 +67,9 @@ pdf_dropcsi(pdf_csi *csi) { if (csi->path) fz_dropnode((fz_node*)csi->path); if (csi->clip) fz_dropnode((fz_node*)csi->clip); + if (csi->textclip) fz_dropnode((fz_node*)csi->textclip); if (csi->text) fz_dropnode((fz_node*)csi->text); + if (csi->array) fz_dropobj(csi->array); clearstack(csi); fz_free(csi); } @@ -145,7 +149,7 @@ runinlineimage(pdf_csi *csi, pdf_xref *xref, fz_file *file, fz_obj *dict) token = pdf_lex(file, buf, sizeof buf, &len); if (token != PDF_TKEYWORD || strcmp("EI", buf)) - return fz_throw("syntaxerror: corrupt inline image"); + fz_warn("syntaxerror: corrupt inline image"); error = pdf_showimage(csi, img); if (error) @@ -550,9 +554,17 @@ Lsetcolor: { if (csi->top != 0) goto syntaxerror; + error = pdf_flushtext(csi); if (error) return error; + + if (csi->textclip) + { + error = pdf_addclipmask(gstate, csi->textclip); + if (error) return error; + csi->textclip = nil; + } } else if (!strcmp(buf, "Tc")) @@ -1003,6 +1015,7 @@ pdf_runcsi(pdf_csi *csi, pdf_xref *xref, fz_obj *rdb, fz_file *file) fz_error *error; char buf[65536]; int token, len; + fz_obj *obj; while (1) { @@ -1011,19 +1024,53 @@ pdf_runcsi(pdf_csi *csi, pdf_xref *xref, fz_obj *rdb, fz_file *file) token = pdf_lex(file, buf, sizeof buf, &len); - switch (token) + if (csi->array) + { + if (token == PDF_TCARRAY) + { + csi->stack[csi->top] = csi->array; + csi->array = nil; + csi->top ++; + } + else if (token == PDF_TINT || token == PDF_TREAL) + { + error = fz_newreal(&obj, atof(buf)); + if (error) return error; + error = fz_arraypush(csi->array, obj); + fz_dropobj(obj); + if (error) return error; + } + else if (token == PDF_TSTRING) + { + error = fz_newstring(&obj, buf, len); + if (error) return error; + error = fz_arraypush(csi->array, obj); + fz_dropobj(obj); + if (error) return error; + } + else if (token == PDF_TEOF) + { + return nil; + } + else + { + clearstack(csi); + return fz_throw("syntaxerror in content stream"); + } + } + + else switch (token) { case PDF_TEOF: return nil; - /* FIXME: need to make array parsing be able to span files for + /* we need to make array parsing be able to span files for those stupid pdf files that split TJ arrays across content streams... */ case PDF_TOARRAY: - error = pdf_parsearray(&csi->stack[csi->top], file, buf, sizeof buf); + error = fz_newarray(&csi->array, 8); if (error) return error; - csi->top ++; break; /* drop down to normal pdf object parsing for dictionaries, diff --git a/mupdf/parse.c b/mupdf/parse.c index 2eb3d234..06ae3954 100644 --- a/mupdf/parse.c +++ b/mupdf/parse.c @@ -147,7 +147,8 @@ skip: case PDF_TINT: a = atoi(buf); tok = pdf_lex(file, buf, cap, &len); - if (tok == PDF_TCDICT || tok == PDF_TNAME) + if (tok == PDF_TCDICT || tok == PDF_TNAME || + (tok == PDF_TKEYWORD && !strcmp(buf, "ID"))) { error = fz_newint(&val, a); if (error) goto cleanup; diff --git a/mupdf/type3.c b/mupdf/type3.c index 60d7c38c..81f22086 100644 --- a/mupdf/type3.c +++ b/mupdf/type3.c @@ -103,7 +103,6 @@ pdf_loadtype3font(pdf_font **fontp, pdf_xref *xref, fz_obj *dict) { fz_error *error; char buf[256]; - unsigned short *utable; char *estrings[256]; pdf_font *font; fz_obj *encoding; @@ -202,27 +201,10 @@ printf(" matrix [%g %g %g %g %g %g]\n", if (error) goto cleanup; - /* - * ToUnicode - */ - - utable = fz_malloc(sizeof(unsigned short) * 256); - if (!utable) - goto cleanup; - - for (i = 0; i < 256; i++) - if (estrings[i]) - utable[i] = pdf_lookupagl(estrings[i]); - else - utable[i] = i; - - if (fz_dictgets(dict, "ToUnicode")) - { -printf(" load tounicode cmap for type3 font\n"); - } - - font->ncidtoucs = 256; - font->cidtoucs = utable; + error = pdf_loadtounicode(font, xref, + estrings, nil, fz_dictgets(dict, "ToUnicode")); + if (error) + goto cleanup; /* * Widths @@ -264,22 +246,22 @@ printf(" load tounicode cmap for type3 font\n"); * Resources */ - obj = fz_dictgets(dict, "Resources"); - if (!obj) { - error = fz_throw("syntaxerror: Type3 font missing Resources"); - goto cleanup; - } + resources = nil; - error = pdf_resolve(&obj, xref); - if (error) - goto cleanup; + obj = fz_dictgets(dict, "Resources"); + if (obj) + { + error = pdf_resolve(&obj, xref); + if (error) + goto cleanup; - error = pdf_loadresources(&resources, xref, obj); + error = pdf_loadresources(&resources, xref, obj); - fz_dropobj(obj); + fz_dropobj(obj); - if (error) - goto cleanup; + if (error) + goto cleanup; + } /* * CharProcs @@ -311,7 +293,8 @@ printf(" load tounicode cmap for type3 font\n"); } fz_dropobj(charprocs); - fz_dropobj(resources); + if (resources) + fz_dropobj(resources); *fontp = font; return nil; diff --git a/mupdf/unicode.c b/mupdf/unicode.c new file mode 100644 index 00000000..42fba1c0 --- /dev/null +++ b/mupdf/unicode.c @@ -0,0 +1,278 @@ +#include <fitz.h> +#include <mupdf.h> + +/* + * ToUnicode map for fonts + */ + +fz_error * +pdf_loadtounicode(pdf_font *font, pdf_xref *xref, + char **strings, char *collection, fz_obj *cmapstm) +{ + fz_error *error; + fz_cmap *cmap; + int cid; + int ucs; + int i; + + if (fz_isindirect(cmapstm)) + { + error = pdf_loadembeddedcmap(&cmap, xref, cmapstm); + if (error) + return error; + + error = fz_newcmap(&font->tounicode); + if (error) + goto cleanup; + + for (i = 0; i < (strings ? 256 : 65536); i++) + { + cid = fz_lookupcid(font->encoding, i); + if (cid > 0) + { + ucs = fz_lookupcid(cmap, i); + error = fz_addcidrange(font->tounicode, cid, cid, ucs); + if (error) + goto cleanup; + } + } + + error = fz_endcidrange(font->tounicode); + if (error) + goto cleanup; + + cleanup: + fz_dropcmap(cmap); + return error; + } + + if (collection) + { + if (!strcmp(collection, "Adobe-CNS1")) + return pdf_loadsystemcmap(&font->tounicode, "Adobe-CNS1-UCS2"); + else if (!strcmp(collection, "Adobe-GB1")) + return pdf_loadsystemcmap(&font->tounicode, "Adobe-GB1-UCS2"); + else if (!strcmp(collection, "Adobe-Japan1")) + return pdf_loadsystemcmap(&font->tounicode, "Adobe-Japan1-UCS2"); + else if (!strcmp(collection, "Adobe-Japan2")) + return pdf_loadsystemcmap(&font->tounicode, "Adobe-Japan2-UCS2"); + else if (!strcmp(collection, "Adobe-Korea1")) + return pdf_loadsystemcmap(&font->tounicode, "Adobe-Korea1-UCS2"); + } + + if (strings) + { + font->ncidtoucs = 256; + font->cidtoucs = fz_malloc(256 * sizeof(unsigned short)); + if (!font->cidtoucs) + return fz_outofmem; + + for (i = 0; i < 256; i++) + { + if (strings[i]) + font->cidtoucs[i] = pdf_lookupagl(strings[i]); + else + font->cidtoucs[i] = 0; + } + + return nil; + } + + fz_warn("font: cannot create unicode conversion\n", collection); + return nil; +} + +/* + * Extract lines of text from display tree + */ + +fz_error * +pdf_newtextline(pdf_textline **linep) +{ + pdf_textline *line; + line = *linep = fz_malloc(sizeof(pdf_textline)); + if (!line) + return fz_outofmem; + line->len = 0; + line->cap = 0; + line->text = nil; + line->next = nil; + return nil; +} + +void +pdf_droptextline(pdf_textline *line) +{ + if (line->next) + pdf_droptextline(line->next); + fz_free(line->text); + fz_free(line); +} + +static fz_error * +addtextchar(pdf_textline *line, int x, int y, int c) +{ + pdf_textchar *newtext; + int newcap; + + if (line->len + 1 >= line->cap) + { + newcap = line->cap ? line->cap * 2 : 80; + newtext = fz_realloc(line->text, sizeof(pdf_textchar) * newcap); + if (!newtext) + return fz_outofmem; + line->cap = newcap; + line->text = newtext; + } + + line->text[line->len].x = x; + line->text[line->len].y = y; + line->text[line->len].c = c; + line->len ++; + + return nil; +} + +/* XXX global! not reentrant! */ +static fz_point oldpt = { 0, 0 }; + +static fz_error * +findtext(pdf_textline **line, fz_node *node, fz_matrix ctm) +{ + fz_error *error; + + if (fz_istextnode(node)) + { + fz_textnode *text = (fz_textnode*)node; + pdf_font *font = (pdf_font*)text->font; + fz_matrix inv = fz_invertmatrix(text->trm); + fz_matrix trm = fz_concat(text->trm, ctm); + float dx, dy, t; + fz_point p; + fz_vmtx v; + fz_hmtx h; + int i, g, x, y; + int c; + + for (i = 0; i < text->len; i++) + { + g = text->els[i].cid; + + p.x = text->els[i].x; + p.y = text->els[i].y; + p = fz_transformpoint(trm, p); + x = p.x; + y = p.y; + + p.x = text->els[i].x; + p.y = text->els[i].y; + p = fz_transformpoint(inv, p); + dx = oldpt.x - p.x; + dy = oldpt.y - p.y; + oldpt = p; + + if (text->font->wmode == 0) + { + h = fz_gethmtx(text->font, g); + oldpt.x += h.w * 0.001; + } + else + { + v = fz_getvmtx(text->font, g); + oldpt.y += v.w; + t = dy; dy = dx; dx = t; + } + + if (fabs(dy) > 0.2) + { + pdf_textline *newline; + error = pdf_newtextline(&newline); + if (error) + return error; + (*line)->next = newline; + *line = newline; + } + else if (fabs(dx) > 0.2) + { + error = addtextchar(*line, x, y, ' '); + if (error) + return error; + } + + if (font->tounicode) + c = fz_lookupcid(font->tounicode, g); + else if (g < font->ncidtoucs) + c = font->cidtoucs[g]; + else + c = g; + + error = addtextchar(*line, x, y, c); + if (error) + return error; + } + } + + if (fz_istransformnode(node)) + ctm = fz_concat(((fz_transformnode*)node)->m, ctm); + + for (node = node->first; node; node = node->next) + { + error = findtext(line, node, ctm); + if (error) + return error; + } + + return nil; +} + +fz_error * +pdf_loadtextfromtree(pdf_textline **outp, fz_tree *tree) +{ + pdf_textline *root; + pdf_textline *line; + fz_error *error; + + oldpt.x = -1; + oldpt.y = -1; + + error = pdf_newtextline(&root); + if (error) + return error; + + line = root; + + error = findtext(&line, tree->root, fz_identity()); + if (error) + { + pdf_droptextline(root); + return error; + } + + *outp = root; + return nil; +} + +void +pdf_debugtextline(pdf_textline *line) +{ + char buf[10]; + int c, n, k, i; + + for (i = 0; i < line->len; i++) + { + c = line->text[i].c; + if (c < 128) + putchar(c); + else + { + n = runetochar(buf, &c); + for (k = 0; k < n; k++) + putchar(buf[k]); + } + } + putchar('\n'); + + if (line->next) + pdf_debugtextline(line->next); +} + diff --git a/render/renderimage.c b/render/renderimage.c index a94aa66c..733261aa 100644 --- a/render/renderimage.c +++ b/render/renderimage.c @@ -1,9 +1,5 @@ #include <fitz.h> -#define GAMMA 1.8 - -void fz_gammapixmap(fz_pixmap *pix, float gamma); - #define LERP(a,b,t) (a + (((b - a) * t) >> 16)) static inline int getcomp(fz_pixmap *pix, int u, int v, int k) @@ -100,6 +96,30 @@ overscanrgb(fz_matrix *invmat, fz_pixmap *dst, fz_pixmap *src, int y, int x0, in } } +static inline void +overscanmask(fz_matrix *invmat, fz_pixmap *dst, fz_pixmap *src, int y, int x0, int x1) +{ + int x; + + int u = (invmat->a * (x0+0.5) + invmat->c * (y+0.5) + invmat->e) * 65536; + int v = (invmat->b * (x0+0.5) + invmat->d * (y+0.5) + invmat->f) * 65536; + int du = invmat->a * 65536; + int dv = invmat->b * 65536; + + u -= 0.5 * 65536; + v -= 0.5 * 65536; + + for (x = x0; x <= x1; x++) + { + int sa = sampleimage(src, u, v, 0); + int da = dst->samples[ (y-dst->y) * dst->w + x-dst->x ]; + da = sa + fz_mul255(da, 255 - sa); + dst->samples[ (y-dst->y) * dst->w + x-dst->x ] = da; + u += du; + v += dv; + } +} + static fz_error * drawtile(fz_renderer *gc, fz_pixmap *out, fz_pixmap *tile, fz_matrix ctm, int over) { @@ -135,6 +155,8 @@ drawtile(fz_renderer *gc, fz_pixmap *out, fz_pixmap *tile, fz_matrix ctm, int ov { if (over && tile->n == 4) overscanrgb(&invmat, out, tile, y, x0, x1); + else if (over && tile->n == 1) + overscanmask(&invmat, out, tile, y, x0, x1); else drawscan(&invmat, out, tile, y, x0, x1); } @@ -180,9 +202,7 @@ printf(" load tile %d x %d\n", w, h); if (dx != 1 || dy != 1) { printf(" scale tile 1/%d x 1/%d\n", dx, dy); -/* fz_gammapixmap(tile1, 1.0 / GAMMA); */ error = fz_scalepixmap(&tile2, tile1, dx, dy); -/* fz_gammapixmap(tile2, GAMMA); */ fz_droppixmap(tile1); } else @@ -198,10 +218,18 @@ printf(" scale tile 1/%d x 1/%d\n", dx, dy); /* render image mask */ if (n == 0 && a == 1) { + if (gc->acc && !gc->model) + { +printf(" draw image mask over\n"); + error = drawtile(gc, gc->acc, tile2, ctm, 1); + } + else + { printf(" draw image mask\n"); - error = fz_newpixmap(&gc->tmp, r.min.x, r.min.y, r.max.x - r.min.x, r.max.y - r.min.y, 1); - fz_clearpixmap(gc->tmp); - error = drawtile(gc, gc->tmp, tile2, ctm, 0); + error = fz_newpixmap(&gc->tmp, r.min.x, r.min.y, r.max.x - r.min.x, r.max.y - r.min.y, 1); + fz_clearpixmap(gc->tmp); + error = drawtile(gc, gc->tmp, tile2, ctm, 0); + } } /* render rgb over */ diff --git a/test/pdfrip.c b/test/pdfrip.c index 0837e969..5d4aba39 100644 --- a/test/pdfrip.c +++ b/test/pdfrip.c @@ -2,110 +2,14 @@ #include <mupdf.h> int showtree = 0; -int showtext = 0; float zoom = 1.0; void usage() { - fprintf(stderr, "usage: pdfrip [-dt] [-p password] [-z zoom] file.pdf [pages...]\n"); + fprintf(stderr, "usage: pdfrip [-d] [-p password] [-z zoom] file.pdf [pages...]\n"); exit(1); } -enum -{ - Bit1 = 7, - Bitx = 6, - Bit2 = 5, - Bit3 = 4, - Bit4 = 3, - - T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ - Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ - T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ - T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ - T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ - - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ - - Maskx = (1<<Bitx)-1, /* 0011 1111 */ - Testx = Maskx ^ 0xFF, /* 1100 0000 */ -}; - -void putrune(int c) -{ - if (c <= Rune1) - { - putchar(c); - return; - } - - if (c <= Rune2) - { - putchar(T2 | (c >> 1*Bitx)); - putchar(Tx | (c & Maskx)); - return; - } - - putchar(T3 | (c >> 2*Bitx)); - putchar(Tx | ((c >> 1*Bitx) & Maskx)); - putchar(Tx | (c & Maskx)); -} - -/* - * Dump text nodes as unicode - */ -void dumptext(fz_node *node) -{ - int i, cid, ucs; - static fz_point old = { 0, 0 }; - fz_point p; - float dx, dy; - fz_vmtx v; - fz_hmtx h; - - if (fz_istextnode(node)) - { - fz_textnode *text = (fz_textnode*)node; - pdf_font *font = (pdf_font*)text->font; - fz_matrix invtrm = fz_invertmatrix(text->trm); - - for (i = 0; i < text->len; i++) - { - cid = text->els[i].cid; - p.x = text->els[i].x; - p.y = text->els[i].y; - p = fz_transformpoint(invtrm, p); - dx = old.x - p.x; - dy = old.y - p.y; - old = p; - - if (fabs(dy) > 1.6) - puts("\n"); - else if (fabs(dy) > 0.2) - putchar('\n'); - else if (fabs(dx) > 0.2) - putchar(' '); - - h = fz_gethmtx(text->font, cid); - old.x += h.w / 1000.0; - - if (font->tounicode) - ucs = fz_lookupcid(font->tounicode, cid); - else if (font->ncidtoucs) - ucs = font->cidtoucs[cid]; - else - ucs = cid; - - putrune(ucs); - } - } - - for (node = node->first; node; node = node->next) - dumptext(node); -} - /* * Draw page */ @@ -139,14 +43,6 @@ void showpage(pdf_xref *xref, fz_obj *pageobj) printf("endtree\n"); } - if (showtext) - { - printf("---begin text dump---\n"); - dumptext(page->tree->root); - printf("\n---end text dump---\n"); - } - - else { fz_pixmap *pix; fz_renderer *gc; @@ -191,14 +87,13 @@ int main(int argc, char **argv) char *password = ""; - while ((c = getopt(argc, argv, "dtz:p:")) != -1) + while ((c = getopt(argc, argv, "dz:p:")) != -1) { switch (c) { case 'p': password = optarg; break; case 'z': zoom = atof(optarg); break; case 'd': ++showtree; break; - case 't': ++showtext; break; default: usage(); } } @@ -226,8 +121,8 @@ int main(int argc, char **argv) if (error) fz_abort(error); outlines = nil; - error = pdf_loadoutlinetree(&outlines, xref); - if (error) { fz_warn(error->msg); fz_droperror(error); } +// error = pdf_loadoutlinetree(&outlines, xref); +// if (error) { fz_warn(error->msg); fz_droperror(error); } if (optind == argc) { diff --git a/test/x11pdf.c b/test/x11pdf.c index 58655f8a..3d424b1c 100644 --- a/test/x11pdf.c +++ b/test/x11pdf.c @@ -29,6 +29,9 @@ static int rotate = 0; static int pageno = 1; static int count = 0; +static pdf_page *page = nil; +static fz_obj *pageobj = nil; + static int hist[256]; static int histlen = 0; @@ -126,16 +129,13 @@ static void xtitle(char *s) static void showpage(void) { fz_error *error; - pdf_page *page; fz_matrix ctm; fz_rect bbox; + fz_obj *obj; char s[256]; - fz_obj *pageobj; assert(pageno > 0 && pageno <= pdf_getpagecount(pages)); - pageobj = pdf_getpageobject(pages, pageno - 1); - XDefineCursor(xdpy, xwin, xcwait); XFlush(xdpy); @@ -143,6 +143,14 @@ static void showpage(void) fz_droppixmap(image); image = nil; + obj = pdf_getpageobject(pages, pageno - 1); + if (obj == pageobj) + goto Lskipload; + pageobj = obj; + + if (page) + pdf_droppage(page); + sprintf(s, "Loading page %d", pageno); XSetForeground(xdpy, xgc, BlackPixel(xdpy, xscr)); XDrawString(xdpy, xwin, xgc, 10, 20, s, strlen(s)); @@ -152,6 +160,8 @@ static void showpage(void) if (error) fz_abort(error); +Lskipload: + sprintf(s, "Rendering..."); XSetForeground(xdpy, xgc, BlackPixel(xdpy, xscr)); XDrawString(xdpy, xwin, xgc, 10, 30, s, strlen(s)); @@ -168,8 +178,6 @@ static void showpage(void) if (error) fz_abort(error); - pdf_droppage(page); - XDefineCursor(xdpy, xwin, xcarrow); XFlush(xdpy); @@ -206,6 +214,20 @@ static void pdfopen(char *filename, char *password) image = nil; } +static void dumptext() +{ + fz_error *error; + pdf_textline *line; + + error = pdf_loadtextfromtree(&line, page->tree); + if (error) + fz_abort(error); + + pdf_debugtextline(line); + + pdf_droptextline(line); +} + static void handlekey(int c) { int oldpage = pageno; @@ -223,6 +245,7 @@ static void handlekey(int c) case 'd': fz_debugglyphcache(rast->cache); break; case 'a': rotate -= 5; break; case 's': rotate += 5; break; + case 'x': dumptext(); break; case 'b': pageno--; |