diff options
author | Tor Andersson <tor@ghostscript.com> | 2010-03-09 01:18:29 +0100 |
---|---|---|
committer | Tor Andersson <tor@ghostscript.com> | 2010-03-09 01:18:29 +0100 |
commit | 9737242baff66bed4937f9efede3befbaee8e792 (patch) | |
tree | cbe8f8f92536ed51562c72e78e968a6066400d67 | |
parent | c96f530870d86d552007ca2e124fc3aa1f0824ac (diff) | |
download | mupdf-9737242baff66bed4937f9efede3befbaee8e792.tar.xz |
Add text extraction device.
-rw-r--r-- | apps/pdfapp.c | 2 | ||||
-rw-r--r-- | apps/pdfdraw.c | 37 | ||||
-rw-r--r-- | fitz/Jamfile | 3 | ||||
-rw-r--r-- | fitz/dev_draw.c | 10 | ||||
-rw-r--r-- | fitz/dev_null.c | 40 | ||||
-rw-r--r-- | fitz/dev_text.c | 198 | ||||
-rw-r--r-- | fitz/fitz_draw.h | 21 | ||||
-rw-r--r-- | fitz/fitz_res.h | 44 | ||||
-rw-r--r-- | mupdf/mupdf.h | 20 | ||||
-rw-r--r-- | mupdf/pdf_build.c | 7 | ||||
-rw-r--r-- | mupdf/pdf_unicode.c | 163 |
11 files changed, 322 insertions, 223 deletions
diff --git a/apps/pdfapp.c b/apps/pdfapp.c index 1cdf425b..cbc5b984 100644 --- a/apps/pdfapp.c +++ b/apps/pdfapp.c @@ -255,7 +255,7 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage) app->image = fz_newpixmapwithrect(pdf_devicergb, bbox); fz_clearpixmap(app->image, 0xFF); app->page->contents->rp = app->page->contents->bp; - dev = fz_newdrawdevice(pdf_devicergb, app->image); + dev = fz_newdrawdevice(app->image); error = pdf_runcontentstream(dev, ctm, 0, app->xref, app->page->resources, app->page->contents); fz_freedrawdevice(dev); if (error) diff --git a/apps/pdfdraw.c b/apps/pdfdraw.c index 44957dc2..2855227a 100644 --- a/apps/pdfdraw.c +++ b/apps/pdfdraw.c @@ -231,7 +231,7 @@ static void drawpnm(int pagenum, struct benchmark *loadtimes, struct benchmark * // drawpage->contents->rp = drawpage->contents->bp; // pdf_runcontentstream(dev, ctm, 0, xref, drawpage->resources, drawpage->contents); - dev = fz_newdrawdevice(pdf_devicergb, pix); + dev = fz_newdrawdevice(pix); drawpage->contents->rp = drawpage->contents->bp; error = pdf_runcontentstream(dev, ctm, 0, xref, drawpage->resources, drawpage->contents); if (error) @@ -303,6 +303,37 @@ static void drawpnm(int pagenum, struct benchmark *loadtimes, struct benchmark * fprintf(stdout, "\n"); } +static void drawtxt(int pagenum) +{ + fz_error error; + fz_matrix ctm; + fz_obj *pageobj; + fz_textline *text; + fz_device *dev; + + pageobj = pdf_getpageobject(xref, pagenum); + error = pdf_loadpage(&drawpage, xref, pageobj); + if (error) + die(error); + + ctm = fz_identity(); + + text = fz_newtextline(); + dev = fz_newtextdevice(text); + + drawpage->contents->rp = drawpage->contents->bp; + error = pdf_runcontentstream(dev, ctm, 0, xref, drawpage->resources, drawpage->contents); + if (error) + die(error); + + printf("[Page %d]\n", pagenum); + fz_debugtextline(text); + printf("\n"); + + fz_freetextdevice(dev); + fz_freetextline(text); +} + static void drawpages(char *pagelist) { int page, spage, epage; @@ -352,7 +383,7 @@ static void drawpages(char *pagelist) switch (drawmode) { case DRAWPNM: drawpnm(page, &loadtimes, &drawtimes); break; -// case DRAWTXT: drawtxt(page); break; + case DRAWTXT: drawtxt(page); break; // case DRAWXML: drawxml(page); break; } } @@ -417,8 +448,6 @@ int main(int argc, char **argv) closexref(); -//XXX fz_newrenderer(&drawgc, pdf_devicergb, 0, 1024 * 512); - openxref(argv[fz_optind], password, 0); state = NO_PAGES_DRAWN; } diff --git a/fitz/Jamfile b/fitz/Jamfile index 91e03d61..0942ea02 100644 --- a/fitz/Jamfile +++ b/fitz/Jamfile @@ -77,8 +77,9 @@ Library libfitz : res_colorspace.c res_font.c res_shade.c + dev_null.c dev_trace.c dev_draw.c - # dev_text.c + dev_text.c # dev_ghost.c ; diff --git a/fitz/dev_draw.c b/fitz/dev_draw.c index 9a84faa2..f04d9874 100644 --- a/fitz/dev_draw.c +++ b/fitz/dev_draw.c @@ -492,21 +492,17 @@ void fz_drawdrawimage(void *user, fz_pixmap *image, fz_matrix ctm) // else XXX } -fz_device *fz_newdrawdevice(fz_colorspace *colorspace, fz_pixmap *dest) +fz_device *fz_newdrawdevice(fz_pixmap *dest) { fz_drawdevice *ddev = fz_malloc(sizeof(fz_drawdevice)); - ddev->model = fz_keepcolorspace(colorspace); + ddev->model = fz_keepcolorspace(dest->colorspace); ddev->cache = fz_newglyphcache(512, 512 * 512); ddev->gel = fz_newgel(); ddev->ael = fz_newael(); ddev->dest = dest; ddev->cliptop = 0; - fz_device *dev = fz_malloc(sizeof(fz_device)); - memset(dev, 0, sizeof(fz_device)); - - dev->user = ddev; - + fz_device *dev = fz_newdevice(ddev); dev->fillpath = fz_drawfillpath; dev->strokepath = fz_drawstrokepath; dev->clippath = fz_drawclippath; diff --git a/fitz/dev_null.c b/fitz/dev_null.c new file mode 100644 index 00000000..4ad2e815 --- /dev/null +++ b/fitz/dev_null.c @@ -0,0 +1,40 @@ +#include "fitz.h" + +void fz_nullfillpath(void *user, fz_path *path, fz_colorspace *colorspace, float *color, float alpha) {} +void fz_nullstrokepath(void *user, fz_path *path, fz_colorspace *colorspace, float *color, float alpha) {} +void fz_nullclippath(void *user, fz_path *path) {} +void fz_nullfilltext(void *user, fz_text *text, fz_colorspace *colorspace, float *color, float alpha) {} +void fz_nullstroketext(void *user, fz_text *text, fz_colorspace *colorspace, float *color, float alpha) {} +void fz_nullcliptext(void *user, fz_text *text) {} +void fz_nullignoretext(void *user, fz_text *text) {} +void fz_nullpopclip(void *user) {} +void fz_nulldrawshade(void *user, fz_shade *shade, fz_matrix ctm) {} +void fz_nulldrawimage(void *user, fz_pixmap *image, fz_matrix ctm) {} +void fz_nullfillimagemask(void *user, fz_pixmap *image, fz_matrix ctm, fz_colorspace *colorspace, float *color, float alpha) {} +void fz_nullclipimagemask(void *user, fz_pixmap *image, fz_matrix ctm) {} + +fz_device *fz_newdevice(void *user) +{ + fz_device *dev = fz_malloc(sizeof(fz_device)); + memset(dev, 0, sizeof(fz_device)); + + dev->user = user; + + dev->fillpath = fz_nullfillpath; + dev->strokepath = fz_nullstrokepath; + dev->clippath = fz_nullclippath; + + dev->filltext = fz_nullfilltext; + dev->stroketext = fz_nullstroketext; + dev->cliptext = fz_nullcliptext; + dev->ignoretext = fz_nullignoretext; + + dev->fillimagemask = fz_nullfillimagemask; + dev->clipimagemask = fz_nullclipimagemask; + dev->drawimage = fz_nulldrawimage; + dev->drawshade = fz_nulldrawshade; + + dev->popclip = fz_nullpopclip; + + return dev; +} diff --git a/fitz/dev_text.c b/fitz/dev_text.c new file mode 100644 index 00000000..8bcdea7f --- /dev/null +++ b/fitz/dev_text.c @@ -0,0 +1,198 @@ +#include "fitz.h" + +#include <ft2build.h> +#include FT_FREETYPE_H + +#if ((FREETYPE_MAJOR == 2) && (FREETYPE_MINOR == 1)) || \ + ((FREETYPE_MAJOR == 2) && (FREETYPE_MINOR == 2)) || \ + ((FREETYPE_MAJOR == 2) && (FREETYPE_MINOR == 3) && (FREETYPE_PATCH < 8)) + +int FT_Get_Advance(FT_Face face, int gid, int masks, FT_Fixed *out) +{ + int fterr; + fterr = FT_Load_Glyph(face, gid, masks | FT_LOAD_IGNORE_TRANSFORM); + if (fterr) + return fterr; + *out = face->glyph->advance.x * 1024; + return 0; +} + +#else + +#include FT_ADVANCES_H + +#endif + +typedef struct fz_textdevice_s fz_textdevice; + +struct fz_textdevice_s +{ + fz_point point; + fz_textline *line; +}; + +fz_textline * +fz_newtextline(void) +{ + fz_textline *line; + line = fz_malloc(sizeof(fz_textline)); + line->len = 0; + line->cap = 0; + line->text = nil; + line->next = nil; + return line; +} + +void +fz_freetextline(fz_textline *line) +{ + if (line->next) + fz_freetextline(line->next); + fz_free(line->text); + fz_free(line); +} + +static void +fz_addtextchar(fz_textline *line, int x, int y, int c) +{ + if (line->len + 1 >= line->cap) + { + line->cap = line->cap ? (line->cap * 3) / 2 : 80; + line->text = fz_realloc(line->text, sizeof(fz_textchar) * line->cap); + } + line->text[line->len].x = x; + line->text[line->len].y = y; + line->text[line->len].c = c; + line->len ++; +} + +void +fz_debugtextline(fz_textline *line) +{ + char buf[10]; + int c, n, k, i; + + for (i = 0; i < line->len; i++) + { + c = line->text[i].c; + if (c < 128) + putchar(c); + else + { + n = runetochar(buf, &c); + for (k = 0; k < n; k++) + putchar(buf[k]); + } + } + putchar('\n'); + + if (line->next) + fz_debugtextline(line->next); +} + +static void +fz_textextractline(fz_textline **line, fz_text *text, fz_point *oldpt) +{ + fz_font *font = text->font; + fz_matrix ctm = text->ctm; + fz_matrix tm = text->trm; + fz_matrix inv = fz_invertmatrix(text->trm); + fz_matrix trm; + float dx, dy; + fz_point p; + float adv; + int i, x, y, fterr; + + if (font->ftface) + { + FT_Set_Transform(font->ftface, NULL, NULL); + fterr = FT_Set_Char_Size(font->ftface, 64, 64, 72, 72); + if (fterr) + fz_warn("freetype set character size: %s", ft_errorstring(fterr)); + } + + for (i = 0; i < text->len; i++) + { + tm.e = text->els[i].x; + tm.f = text->els[i].y; + trm = fz_concat(tm, ctm); + x = trm.e; + y = trm.f; + trm.e = 0; + trm.f = 0; + + p.x = text->els[i].x; + p.y = text->els[i].y; + p = fz_transformpoint(inv, p); + dx = oldpt->x - p.x; + dy = oldpt->y - p.y; + *oldpt = p; + + /* TODO: flip advance and test for vertical writing */ + + if (font->ftface) + { + FT_Fixed ftadv; + fterr = FT_Get_Advance(font->ftface, text->els[i].gid, + FT_LOAD_NO_BITMAP | FT_LOAD_NO_HINTING, + &ftadv); + if (fterr) + fz_warn("freetype get advance (gid %d): %s", text->els[i].gid, ft_errorstring(fterr)); + adv = ftadv / 65536.0; + oldpt->x += adv; + } + else + { + adv = font->t3widths[text->els[i].gid]; + oldpt->x += adv; + } + + if (fabs(dy) > 0.2) + { + fz_textline *newline; + newline = fz_newtextline(); + (*line)->next = newline; + *line = newline; + } + else if (fabs(dx) > 0.2) + { + fz_addtextchar(*line, x, y, ' '); + } + + fz_addtextchar(*line, x, y, text->els[i].ucs); + } +} + +void fz_textfilltext(void *user, fz_text *text, fz_colorspace *colorspace, float *color, float alpha) +{ + fz_textdevice *tdev = user; + fz_textextractline(&tdev->line, text, &tdev->point); +} + +void fz_textignoretext(void *user, fz_text *text) +{ + fz_textdevice *tdev = user; + fz_textextractline(&tdev->line, text, &tdev->point); +} + +fz_device *fz_newtextdevice(fz_textline *root) +{ + fz_textdevice *tdev = fz_malloc(sizeof(fz_textdevice)); + tdev->line = root; + tdev->point.x = -1; + tdev->point.y = -1; + + fz_device *dev = fz_newdevice(tdev); + + dev->filltext = fz_textfilltext; + dev->ignoretext = fz_textignoretext; + + return dev; +} + +void fz_freetextdevice(fz_device *dev) +{ + fz_textdevice *tdev = dev->user; + fz_free(tdev); + fz_free(dev); +}
\ No newline at end of file diff --git a/fitz/fitz_draw.h b/fitz/fitz_draw.h index f106f792..0bda294c 100644 --- a/fitz/fitz_draw.h +++ b/fitz/fitz_draw.h @@ -12,9 +12,6 @@ typedef struct fz_renderer_s fz_renderer; typedef struct fz_glyph_s fz_glyph; typedef struct fz_glyphcache_s fz_glyphcache; -fz_device *fz_newdrawdevice(fz_colorspace *colorspace, fz_pixmap *dest); -void fz_freedrawdevice(fz_device *dev); - fz_glyphcache * fz_newglyphcache(int slots, int size); void fz_renderftglyph(fz_glyph *glyph, fz_font *font, int cid, fz_matrix trm); void fz_rendert3glyph(fz_glyph *glyph, fz_font *font, int cid, fz_matrix trm); @@ -117,22 +114,4 @@ extern void (*fz_scol5)(FZ_BYTE *src, FZ_BYTE *dst, int w, int denom); #undef FZ_BYTE -struct fz_renderer_s -{ - int maskonly; - fz_colorspace *model; - fz_glyphcache *cache; - fz_gel *gel; - fz_ael *ael; - - fz_irect clip; - fz_pixmap *dest; - fz_pixmap *over; - unsigned char argb[7]; /* alpha, a*r, a*g, a*b, r, g, b */ - int flag; -}; - extern void fz_accelerate(void); - -fz_renderer * fz_newrenderer(fz_colorspace *pcm, int maskonly, int gcmem); -void fz_droprenderer(fz_renderer *gc); diff --git a/fitz/fitz_res.h b/fitz/fitz_res.h index 27612008..f029c1c2 100644 --- a/fitz/fitz_res.h +++ b/fitz/fitz_res.h @@ -22,7 +22,7 @@ struct fz_device_s void (*cliptext)(void *, fz_text *); void (*ignoretext)(void *, fz_text *); - void (*fillimagemask)(void *, fz_pixmap *img, fz_matrix ctm, fz_colorspace *, float *color); + void (*fillimagemask)(void *, fz_pixmap *img, fz_matrix ctm, fz_colorspace *, float *color, float alpha); void (*clipimagemask)(void *, fz_pixmap *img, fz_matrix ctm); void (*drawimage)(void *, fz_pixmap *img, fz_matrix ctm); void (*drawshade)(void *, fz_shade *shd, fz_matrix ctm); @@ -30,8 +30,50 @@ struct fz_device_s void (*popclip)(void *); }; +/* no-op device functions */ +fz_device *fz_newdevice(void *user); +void fz_initnulldevice(fz_device *dev); +void fz_nullfillpath(void *user, fz_path *path, fz_colorspace *colorspace, float *color, float alpha); +void fz_nullstrokepath(void *user, fz_path *path, fz_colorspace *colorspace, float *color, float alpha); +void fz_nullclippath(void *user, fz_path *path); +void fz_nullfilltext(void *user, fz_text *text, fz_colorspace *colorspace, float *color, float alpha); +void fz_nullstroketext(void *user, fz_text *text, fz_colorspace *colorspace, float *color, float alpha); +void fz_nullcliptext(void *user, fz_text *text); +void fz_nullignoretext(void *user, fz_text *text); +void fz_nullpopclip(void *user); +void fz_nulldrawshade(void *user, fz_shade *shade, fz_matrix ctm); +void fz_nulldrawimage(void *user, fz_pixmap *image, fz_matrix ctm); +void fz_nullfillimagemask(void *user, fz_pixmap *image, fz_matrix ctm, fz_colorspace *colorspace, float *color, float alpha); +void fz_nullclipimagemask(void *user, fz_pixmap *image, fz_matrix ctm); + fz_device *fz_newtracedevice(void); +fz_device *fz_newdrawdevice(fz_pixmap *dest); +void fz_freedrawdevice(fz_device *dev); + +typedef struct fz_textline_s fz_textline; +typedef struct fz_textchar_s fz_textchar; + +struct fz_textchar_s +{ + int x, y; + int c; +}; + +struct fz_textline_s +{ + int len, cap; + fz_textchar *text; + fz_textline *next; +}; + +fz_textline * fz_newtextline(void); +void fz_freetextline(fz_textline *line); +void fz_debugtextline(fz_textline *line); + +fz_device *fz_newtextdevice(fz_textline *text); +void fz_freetextdevice(fz_device *dev); + typedef enum fz_blendkind_e { /* PDF 1.4 -- standard separable */ diff --git a/mupdf/mupdf.h b/mupdf/mupdf.h index 74b21dd8..d76617b6 100644 --- a/mupdf/mupdf.h +++ b/mupdf/mupdf.h @@ -567,8 +567,6 @@ void pdf_loadannots(pdf_comment **, pdf_link **, pdf_xref *, fz_obj *annots); */ typedef struct pdf_page_s pdf_page; -typedef struct pdf_textline_s pdf_textline; -typedef struct pdf_textchar_s pdf_textchar; struct pdf_page_s { @@ -580,19 +578,6 @@ struct pdf_page_s pdf_link *links; }; -struct pdf_textchar_s -{ - int x, y; - int c; -}; - -struct pdf_textline_s -{ - int len, cap; - pdf_textchar *text; - pdf_textline *next; -}; - /* pagetree.c */ int pdf_getpagecount(pdf_xref *xref); fz_obj * pdf_getpageobject(pdf_xref *xref, int p); @@ -602,11 +587,6 @@ int pdf_findpageobject(pdf_xref *xref, fz_obj *pageobj); fz_error pdf_loadpage(pdf_page **pagep, pdf_xref *xref, fz_obj *ref); void pdf_droppage(pdf_page *page); -/* unicode.c */ -void pdf_debugtextline(pdf_textline *line); -pdf_textline * pdf_newtextline(void); -void pdf_droptextline(pdf_textline *line); - /* * content stream parsing */ diff --git a/mupdf/pdf_build.c b/mupdf/pdf_build.c index 8b065f21..8f51c744 100644 --- a/mupdf/pdf_build.c +++ b/mupdf/pdf_build.c @@ -290,13 +290,10 @@ pdf_showpath(pdf_csi *csi, int doclose, int dofill, int dostroke, int evenodd) gstate->stroke.cs, gstate->stroke.v, gstate->stroke.alpha); break; case PDF_MPATTERN: - fz_warn("pattern fills not supported yet"); + fz_warn("pattern strokes not supported yet"); break; case PDF_MSHADE: -// csi->dev->clipstrokepath(csi->dev->user, csi->path); -// csi->dev->clippath(csi->dev->user, csi->path); - // csi->dev->drawshade(csi->dev->user, gstate->stroke.shade, gstate->ctm); - // csi->dev->popclip(csi->dev->user); + fz_warn("shading strokes not supported yet"); break; } } diff --git a/mupdf/pdf_unicode.c b/mupdf/pdf_unicode.c index 4c0ce353..91f149c0 100644 --- a/mupdf/pdf_unicode.c +++ b/mupdf/pdf_unicode.c @@ -1,29 +1,6 @@ #include "fitz.h" #include "mupdf.h" -#include <ft2build.h> -#include FT_FREETYPE_H - -#if ((FREETYPE_MAJOR == 2) && (FREETYPE_MINOR == 1)) || \ - ((FREETYPE_MAJOR == 2) && (FREETYPE_MINOR == 2)) || \ - ((FREETYPE_MAJOR == 2) && (FREETYPE_MINOR == 3) && (FREETYPE_PATCH < 8)) - -int FT_Get_Advance(FT_Face face, int gid, int masks, FT_Fixed *out) -{ - int fterr; - fterr = FT_Load_Glyph(face, gid, masks | FT_LOAD_IGNORE_TRANSFORM); - if (fterr) - return fterr; - *out = face->glyph->advance.x * 1024; - return 0; -} - -#else - -#include FT_ADVANCES_H - -#endif - /* * ToUnicode map for fonts */ @@ -115,143 +92,3 @@ pdf_loadtounicode(pdf_fontdesc *font, pdf_xref *xref, return fz_okay; } - -/* - * Extract lines of text from display tree. - * - * This extraction needs to be rewritten for the new tree - * architecture where glyph index and unicode characters are both stored - * in the text objects. - */ - -pdf_textline * -pdf_newtextline(void) -{ - pdf_textline *line; - line = fz_malloc(sizeof(pdf_textline)); - line->len = 0; - line->cap = 0; - line->text = nil; - line->next = nil; - return line; -} - -void -pdf_droptextline(pdf_textline *line) -{ - if (line->next) - pdf_droptextline(line->next); - fz_free(line->text); - fz_free(line); -} - -static void -addtextchar(pdf_textline *line, int x, int y, int c) -{ - if (line->len + 1 >= line->cap) - { - line->cap = line->cap ? (line->cap * 3) / 2 : 80; - line->text = fz_realloc(line->text, sizeof(pdf_textchar) * line->cap); - } - line->text[line->len].x = x; - line->text[line->len].y = y; - line->text[line->len].c = c; - line->len ++; -} - -void -pdf_extracttextline(pdf_textline **line, fz_text *text, fz_matrix ctm, fz_point *oldpt) -{ - fz_font *font = text->font; - fz_matrix tm = text->trm; - fz_matrix inv = fz_invertmatrix(text->trm); - fz_matrix trm; - float dx, dy; - fz_point p; - float adv; - int i, x, y, fterr; - - if (font->ftface) - { - FT_Set_Transform(font->ftface, NULL, NULL); - fterr = FT_Set_Char_Size(font->ftface, 64, 64, 72, 72); - if (fterr) - fz_warn("freetype set character size: %s", ft_errorstring(fterr)); - } - - for (i = 0; i < text->len; i++) - { - tm.e = text->els[i].x; - tm.f = text->els[i].y; - trm = fz_concat(tm, ctm); - x = trm.e; - y = trm.f; - trm.e = 0; - trm.f = 0; - - p.x = text->els[i].x; - p.y = text->els[i].y; - p = fz_transformpoint(inv, p); - dx = oldpt->x - p.x; - dy = oldpt->y - p.y; - *oldpt = p; - - /* TODO: flip advance and test for vertical writing */ - - if (font->ftface) - { - FT_Fixed ftadv; - fterr = FT_Get_Advance(font->ftface, text->els[i].gid, - FT_LOAD_NO_BITMAP | FT_LOAD_NO_HINTING, - &ftadv); - if (fterr) - fz_warn("freetype get advance (gid %d): %s", text->els[i].gid, ft_errorstring(fterr)); - adv = ftadv / 65536.0; - oldpt->x += adv; - } - else - { - adv = font->t3widths[text->els[i].gid]; - oldpt->x += adv; - } - - if (fabs(dy) > 0.2) - { - pdf_textline *newline; - newline = pdf_newtextline(); - (*line)->next = newline; - *line = newline; - } - else if (fabs(dx) > 0.2) - { - addtextchar(*line, x, y, ' '); - } - - addtextchar(*line, x, y, text->els[i].ucs); - } -} - -void -pdf_debugtextline(pdf_textline *line) -{ - char buf[10]; - int c, n, k, i; - - for (i = 0; i < line->len; i++) - { - c = line->text[i].c; - if (c < 128) - putchar(c); - else - { - n = runetochar(buf, &c); - for (k = 0; k < n; k++) - putchar(buf[k]); - } - } - putchar('\n'); - - if (line->next) - pdf_debugtextline(line->next); -} - |