diff options
author | Tor Andersson <tor@ghostscript.com> | 2004-11-16 04:59:21 +0100 |
---|---|---|
committer | Tor Andersson <tor@ghostscript.com> | 2004-11-16 04:59:21 +0100 |
commit | 49132f70ac40b2dc7b9a0e22b33a3964af687874 (patch) | |
tree | 6d583fe99c77665b92b3b0d88d0a1f71889b0343 | |
parent | 01a774064e13cebc9a75e44432226d845fa46f9a (diff) | |
download | mupdf-49132f70ac40b2dc7b9a0e22b33a3964af687874.tar.xz |
text extraction
-rw-r--r-- | mupdf/font.c | 30 | ||||
-rw-r--r-- | test/pdfrip.c | 107 |
2 files changed, 135 insertions, 2 deletions
diff --git a/mupdf/font.c b/mupdf/font.c
index ddd5dec2..0e216ea4 100644
--- a/mupdf/font.c
+++ b/mupdf/font.c
@@ -257,6 +257,10 @@ pdf_newfont(char *name)
 	font->ncidtogid = 0;
 	font->cidtogid = nil;

+	font->tounicode = nil;
+	font->ncidtoucs = 0;
+	font->cidtoucs = nil;
+
+	font->filename = nil;
 	font->fontdata = nil;
@@ -277,7 +281,9 @@ loadsimplefont(pdf_font **fontp, pdf_xref *xref, fz_obj *dict)
 	fz_obj *descriptor = nil;
 	fz_obj *encoding = nil;
 	fz_obj *widths = nil;
+	fz_obj *tounicode = nil;
 	unsigned short *etable = nil;
+	unsigned short *utable = nil;
 	pdf_font *font;
 	FT_Face face;
 	FT_CharMap cmap;
@@ -476,6 +482,29 @@ printf(" builtin encoding\n");
 	font->cidtogid = etable;

 	/*
+	 * ToUnicode
+	 */
+
+	utable = fz_malloc(sizeof(unsigned short) * 256);
+	if (!utable)
+		goto cleanup;
+
+	for (i = 0; i < 256; i++)
+		if (estrings[i])
+			utable[i] = aglcode(estrings[i]);
+		else
+			utable[i] = i;
+
+	tounicode = fz_dictgets(dict, "ToUnicode");
+	if (fz_isindirect(tounicode))
+	{
+printf(" load tounicode cmap for simple font\n");
+	}
+
+	font->ncidtoucs = 256;
+	font->cidtoucs = utable;
+
+	/*
 	 * Widths
 	 */
@@ -531,6 +560,7 @@ printf("\n");
 	return nil;

 cleanup:
+	fz_free(utable);
 	fz_free(etable);
 	if (widths)
 		fz_dropobj(widths);
diff --git a/test/pdfrip.c b/test/pdfrip.c
index daca3ba9..a4bce202 100644
--- a/test/pdfrip.c
+++ b/test/pdfrip.c
@@ -2,14 +2,108 @@
 #include <mupdf.h>

 int showtree = 0;
+int showtext = 0;
 float zoom = 1.0;

 void usage()
 {
-	fprintf(stderr, "usage: pdfrip [-d] [-p password] [-z zoom] file.pdf [pages...]\n");
+	fprintf(stderr, "usage: pdfrip [-dt] [-p password] [-z zoom] file.pdf [pages...]\n");
 	exit(1);
 }

+enum
+{
+	Bit1 = 7,
+	Bitx = 6,
+	Bit2 = 5,
+	Bit3 = 4,
+	Bit4 = 3,
+
+	T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
+	Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
+	T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
+	T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
+	T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+
+	Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
+	Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
+	Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+
+	Maskx = (1<<Bitx)-1, /* 0011 1111 */
+	Testx = Maskx ^ 0xFF, /* 1100 0000 */
+};
+
+void putrune(int c)
+{
+	if (c <= Rune1)
+	{
+		putchar(c);
+		return;
+	}
+
+	if (c <= Rune2)
+	{
+		putchar(T2 | (c >> 1*Bitx));
+		putchar(Tx | (c & Maskx));
+		return;
+	}
+
+	putchar(T3 | (c >> 2*Bitx));
+	putchar(Tx | ((c >> 1*Bitx) & Maskx));
+	putchar(Tx | (c & Maskx));
+}
+
+/*
+ * Dump text nodes as unicode
+ */
+void dumptext(fz_node *node)
+{
+	int i, cid, ucs;
+	static fz_point old = { 0, 0 };
+	fz_point p;
+	float dx, dy;
+	fz_vmtx v;
+	fz_hmtx h;
+
+	if (fz_istextnode(node))
+	{
+		fz_textnode *text = (fz_textnode*)node;
+		pdf_font *font = (pdf_font*)text->font;
+		fz_matrix invtrm = fz_invertmatrix(text->trm);
+
+		for (i = 0; i < text->len; i++)
+		{
+			cid = text->els[i].cid;
+			p.x = text->els[i].x;
+			p.y = text->els[i].y;
+			p = fz_transformpoint(invtrm, p);
+			dx = old.x - p.x;
+			dy = old.y - p.y;
+			old = p;
+
+			if (fabs(dy) > 1.3)
+				puts("\n");
+			else if (fabs(dy) > 0.1)
+				putchar('\n');
+			else if (fabs(dx) > 0.1)
+				putchar(' ');
+
+			h = fz_gethmtx(text->font, cid);
+			old.x += h.w / 1000.0;
+
+			if (font->ncidtoucs)
+				ucs = font->cidtoucs[cid];
+			else
+				ucs = cid;
+
+			putrune(ucs);
+		}
+	}
+
+	for (node = node->child; node; node = node->next)
+		dumptext(node);
+}
+
 /*
  * Draw page
  */
@@ -43,6 +137,14 @@ void showpage(pdf_xref *xref, fz_obj *pageobj)
 		printf("endtree\n");
 	}

+	if (showtext)
+	{
+		printf("---begin text dump---\n");
+		dumptext(page->tree->root);
+		printf("\n---end text dump---\n");
+	}
+
+	else
 	{
 		fz_pixmap *pix;
 		fz_renderer *gc;
@@ -87,13 +189,14 @@ int main(int argc, char **argv)
 	char *password = "";

-	while ((c = getopt(argc, argv, "dz:p:")) != -1)
+	while ((c = getopt(argc, argv, "dtz:p:")) != -1)
 	{
 		switch (c)
 		{
 		case 'p': password = optarg; break;
 		case 'z': zoom = atof(optarg); break;
 		case 'd': ++showtree; break;
+		case 't': ++showtext; break;
 		default: usage();
 		}
 	}