summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Tor Andersson <tor@ghostscript.com> 2004-11-16 04:59:21 +0100
committer Tor Andersson <tor@ghostscript.com> 2004-11-16 04:59:21 +0100
commit 49132f70ac40b2dc7b9a0e22b33a3964af687874 (patch)
tree 6d583fe99c77665b92b3b0d88d0a1f71889b0343
parent 01a774064e13cebc9a75e44432226d845fa46f9a (diff)
download mupdf-49132f70ac40b2dc7b9a0e22b33a3964af687874.tar.xz
text extraction
-rw-r--r-- mupdf/font.c 30
-rw-r--r-- test/pdfrip.c 107
2 files changed, 135 insertions, 2 deletions
diff --git a/mupdf/font.c b/mupdf/font.c
index ddd5dec2..0e216ea4 100644
--- a/mupdf/font.c
+++ b/mupdf/font.c
@@ -257,6 +257,10 @@ pdf_newfont(char *name)
font->ncidtogid = 0;
font->cidtogid = nil;
+ font->tounicode = nil;
+ font->ncidtoucs = 0;
+ font->cidtoucs = nil;
+
font->filename = nil;
font->fontdata = nil;
@@ -277,7 +281,9 @@ loadsimplefont(pdf_font **fontp, pdf_xref *xref, fz_obj *dict)
fz_obj *descriptor = nil;
fz_obj *encoding = nil;
fz_obj *widths = nil;
+ fz_obj *tounicode = nil;
unsigned short *etable = nil;
+ unsigned short *utable = nil;
pdf_font *font;
FT_Face face;
FT_CharMap cmap;
@@ -476,6 +482,29 @@ printf(" builtin encoding\n");
font->cidtogid = etable;
/*
+ * ToUnicode
+ */
+
+ utable = fz_malloc(sizeof(unsigned short) * 256);
+ if (!utable)
+ goto cleanup;
+
+ for (i = 0; i < 256; i++)
+ if (estrings[i])
+ utable[i] = aglcode(estrings[i]);
+ else
+ utable[i] = i;
+
+ tounicode = fz_dictgets(dict, "ToUnicode");
+ if (fz_isindirect(tounicode))
+ {
+printf(" load tounicode cmap for simple font\n");
+ }
+
+ font->ncidtoucs = 256;
+ font->cidtoucs = utable;
+
+ /*
* Widths
*/
@@ -531,6 +560,7 @@ printf("\n");
return nil;
cleanup:
+ fz_free(utable);
fz_free(etable);
if (widths)
fz_dropobj(widths);
diff --git a/test/pdfrip.c b/test/pdfrip.c
index daca3ba9..a4bce202 100644
--- a/test/pdfrip.c
+++ b/test/pdfrip.c
@@ -2,14 +2,108 @@
#include <mupdf.h>
int showtree = 0;
+int showtext = 0;
float zoom = 1.0;
void usage() /* Print the command-line synopsis to stderr and terminate the process with exit status 1. */
{
- fprintf(stderr, "usage: pdfrip [-d] [-p password] [-z zoom] file.pdf [pages...]\n");
+ fprintf(stderr, "usage: pdfrip [-dt] [-p password] [-z zoom] file.pdf [pages...]\n");
exit(1);
}
+enum /* UTF-8 encoding constants consumed by putrune(): payload bit counts, byte tag patterns and per-length value limits. */
+{
+ Bit1 = 7, /* payload bits carried by a 1-byte sequence */
+ Bitx = 6, /* payload bits carried by each continuation byte */
+ Bit2 = 5, /* payload bits in the lead byte of a 2-byte sequence */
+ Bit3 = 4, /* payload bits in the lead byte of a 3-byte sequence */
+ Bit4 = 3, /* payload bits in the lead byte of a 4-byte sequence; no 4-byte encoder is visible in this patch */
+
+ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
+ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
+ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
+ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
+ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+
+ Maskx = (1<<Bitx)-1, /* 0011 1111 */
+ Testx = Maskx ^ 0xFF, /* 1100 0000 */
+};
+
+void putrune(int c) /* Write the code point c to stdout as a 1-, 2- or 3-byte UTF-8 sequence using putchar(). */
+{
+ if (c <= Rune1) /* fits in 7 bits: emitted unchanged as one byte; NOTE(review): a negative c also takes this path */
+ {
+ putchar(c);
+ return;
+ }
+
+ if (c <= Rune2) /* fits in 11 bits: lead byte tagged T2 followed by one continuation byte */
+ {
+ putchar(T2 | (c >> 1*Bitx));
+ putchar(Tx | (c & Maskx));
+ return;
+ }
+
+ putchar(T3 | (c >> 2*Bitx)); /* all remaining values use the 3-byte form; NOTE(review): there is no 4-byte branch, so c above Rune3 (0xFFFF) is not encoded correctly */
+ putchar(Tx | ((c >> 1*Bitx) & Maskx)); /* middle 6 bits */
+ putchar(Tx | (c & Maskx)); /* low 6 bits */
+}
+
+/*
+ * Dump text nodes as unicode
+ *
+ * Walks the display tree rooted at 'node' depth-first. For every text
+ * node, each glyph element is mapped back through the inverse of the
+ * node's text matrix (text->trm) and compared with the position where
+ * the previous glyph was expected to end:
+ *   - a vertical jump above 1.3 prints a blank line (puts() appends its
+ *     own newline to the "\n" it is given, so two newlines are written),
+ *   - a vertical jump above 0.1 prints a single newline,
+ *   - a horizontal gap above 0.1 prints a space.
+ * The glyph's cid is then translated through font->cidtoucs when the font
+ * has such a table (ncidtoucs != 0), otherwise the cid itself is used, and
+ * the result is written to stdout with putrune().
+ *
+ * The expected position is kept in a function-local static, so it carries
+ * over between text nodes and between separate calls.
+ *
+ * NOTE(review): cid is used to index cidtoucs without being compared to
+ * ncidtoucs -- confirm that callers can never produce a cid outside the
+ * table. The locals 'v' and the dx/dy signs are otherwise unused
+ * ('v' is never referenced; only fabs() of dx and dy is tested).
+ */
+void dumptext(fz_node *node)
+{
+ int i, cid, ucs;
+ static fz_point old = { 0, 0 }; /* expected pen position after the previously printed glyph */
+ fz_point p;
+ float dx, dy;
+ fz_vmtx v;
+ fz_hmtx h;
+
+ if (fz_istextnode(node))
+ {
+ fz_textnode *text = (fz_textnode*)node;
+ pdf_font *font = (pdf_font*)text->font;
+ fz_matrix invtrm = fz_invertmatrix(text->trm);
+
+ for (i = 0; i < text->len; i++)
+ {
+ cid = text->els[i].cid;
+ p.x = text->els[i].x;
+ p.y = text->els[i].y;
+ p = fz_transformpoint(invtrm, p); /* undo the text matrix so the thresholds below are in text-space units */
+ dx = old.x - p.x;
+ dy = old.y - p.y;
+ old = p;
+
+ if (fabs(dy) > 1.3)
+ puts("\n");
+ else if (fabs(dy) > 0.1)
+ putchar('\n');
+ else if (fabs(dx) > 0.1)
+ putchar(' ');
+
+ h = fz_gethmtx(text->font, cid);
+ old.x += h.w / 1000.0; /* advance the expected position by this glyph's width; presumably h.w is in 1/1000 text-space units -- confirm */
+
+ if (font->ncidtoucs)
+ ucs = font->cidtoucs[cid];
+ else
+ ucs = cid;
+
+ putrune(ucs);
+ }
+ }
+
+ for (node = node->child; node; node = node->next) /* recurse into every child, whether or not this node was a text node */
+ dumptext(node);
+}
+
/*
* Draw page
*/
@@ -43,6 +137,14 @@ void showpage(pdf_xref *xref, fz_obj *pageobj)
printf("endtree\n");
}
+ if (showtext)
+ {
+ printf("---begin text dump---\n");
+ dumptext(page->tree->root);
+ printf("\n---end text dump---\n");
+ }
+
+ else
{
fz_pixmap *pix;
fz_renderer *gc;
@@ -87,13 +189,14 @@ int main(int argc, char **argv)
char *password = "";
- while ((c = getopt(argc, argv, "dz:p:")) != -1)
+ while ((c = getopt(argc, argv, "dtz:p:")) != -1)
{
switch (c)
{
case 'p': password = optarg; break;
case 'z': zoom = atof(optarg); break;
case 'd': ++showtree; break;
+ case 't': ++showtext; break;
default: usage();
}
}