diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2012-03-06 17:41:04 +0100 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2012-03-12 18:43:02 +0100 |
commit | 0b0e2af392428b5dbc88d6fbd2f6b5181e85165f (patch) | |
tree | 9668c284b68b912c103f0778ff298aafb4d6453b /apps | |
parent | bc1c06ddd335f899025470dde7e839a82d792972 (diff) | |
download | mupdf-0b0e2af392428b5dbc88d6fbd2f6b5181e85165f.tar.xz |
Create style sheet and group extracted text into blocks, lines and spans.
Diffstat (limited to 'apps')
-rw-r--r-- | apps/mudraw.c | 58 | ||||
-rw-r--r-- | apps/pdfapp.c | 160 | ||||
-rw-r--r-- | apps/pdfapp.h | 3 | ||||
-rw-r--r-- | apps/x11_main.c | 2 |
4 files changed, 134 insertions, 89 deletions
diff --git a/apps/mudraw.c b/apps/mudraw.c index 19f359a1..31847665 100644 --- a/apps/mudraw.c +++ b/apps/mudraw.c @@ -10,6 +10,8 @@ #include <sys/time.h> #endif +enum { TEXT_PLAIN = 1, TEXT_HTML = 2, TEXT_XML = 3 }; + static char *output = NULL; static float resolution = 72; static float rotation = 0; @@ -28,6 +30,7 @@ static int width = 0; static int height = 0; static int fit = 0; +static fz_text_sheet *sheet = NULL; static fz_colorspace *colorspace; static char *filename; @@ -157,42 +160,43 @@ static void drawpage(fz_context *ctx, fz_document *doc, int pagenum) if (showtext) { - fz_text_span *text = NULL; + fz_text_page *text = NULL; fz_var(text); fz_try(ctx) { - text = fz_new_text_span(ctx); - dev = fz_new_text_device(ctx, text); + text = fz_new_text_page(ctx, fz_bound_page(doc, page)); + dev = fz_new_text_device(ctx, sheet, text); if (list) fz_run_display_list(list, dev, fz_identity, fz_infinite_bbox, NULL); else fz_run_page(doc, page, dev, fz_identity, NULL); fz_free_device(dev); dev = NULL; - if (showtext > 1) + if (showtext == TEXT_XML) { - printf("<page number=\"%d\">\n", pagenum); - fz_debug_text_span_xml(text); - printf("</page>\n"); + fz_print_text_page_xml(stdout, text); } - else + else if (showtext == TEXT_HTML) + { + fz_print_text_page_html(stdout, text); + } + else if (showtext == TEXT_PLAIN) { - printf("[Page %d]\n", pagenum); - fz_debug_text_span(text); + fz_print_text_page(stdout, text); + printf("\f\n"); } - printf("\n"); } fz_catch(ctx) { fz_free_device(dev); - fz_free_text_span(ctx, text); + fz_free_text_page(ctx, text); fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } - fz_free_text_span(ctx, text); + fz_free_text_page(ctx, text); } if (showmd5 || showtime) @@ -456,9 +460,23 @@ int main(int argc, char **argv) timing.minpage = 0; timing.maxpage = 0; - if (showxml || showtext > 1) + if (showxml || showtext == TEXT_XML) printf("<?xml version=\"1.0\"?>\n"); + if (showtext) + sheet = fz_new_text_sheet(ctx); + + if (showtext == TEXT_HTML) + { + printf("<style>\n"); + printf("body{background-color:gray;margin:12tp;}\n"); + printf("div.page{background-color:white;margin:6pt;padding:6pt;}\n"); + printf("div.block{border:1px solid gray;margin:6pt;padding:6pt;}\n"); + printf("p{margin:0;padding:0;}\n"); + printf("</style>\n"); + printf("<body>\n"); + } + fz_try(ctx) { while (fz_optind < argc) @@ -478,7 +496,7 @@ int main(int argc, char **argv) if (!fz_authenticate_password(doc, password)) fz_throw(ctx, "cannot authenticate password: %s", filename); - if (showxml || showtext > 1) + if (showxml || showtext == TEXT_XML) printf("<document name=\"%s\">\n", filename); if (showoutline) @@ -492,7 +510,7 @@ int main(int argc, char **argv) drawrange(ctx, doc, argv[fz_optind++]); } - if (showxml || showtext > 1) + if (showxml || showtext == TEXT_XML) printf("</document>\n"); fz_close_document(doc); @@ -504,6 +522,14 @@ int main(int argc, char **argv) fz_close_document(doc); } + if (showtext == TEXT_HTML) + { + printf("</body>\n"); + printf("<style>\n"); + fz_print_text_sheet(stdout, sheet); + printf("</style>\n"); + } + if (showtime) { printf("total %dms / %d pages for an average of %dms\n", diff --git a/apps/pdfapp.c b/apps/pdfapp.c index 4f093508..e5742645 100644 --- a/apps/pdfapp.c +++ b/apps/pdfapp.c @@ -157,7 +157,7 @@ void pdfapp_close(pdfapp_t *app) app->page_list = NULL; if (app->page_text) - fz_free_text_span(app->ctx, app->page_text); + fz_free_text_page(app->ctx, app->page_text); app->page_text = NULL; if (app->page_links) @@ -228,7 +228,7 @@ static void pdfapp_loadpage(pdfapp_t *app) if (app->page_list) fz_free_display_list(app->ctx, app->page_list); if (app->page_text) - fz_free_text_span(app->ctx, app->page_text); + fz_free_text_page(app->ctx, app->page_text); if (app->page_links) fz_drop_link(app->ctx, app->page_links); if (app->page) @@ -273,8 +273,9 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai app->hitlen = 0; /* Extract text */ - app->page_text = fz_new_text_span(app->ctx); - tdev = fz_new_text_device(app->ctx, app->page_text); + app->page_sheet = fz_new_text_sheet(app->ctx); + app->page_text = fz_new_text_page(app->ctx, app->page_bbox); + tdev = fz_new_text_device(app->ctx, app->page_sheet, app->page_text); fz_run_display_list(app->page_list, tdev, fz_identity, fz_infinite_bbox, NULL); fz_free_device(tdev); } @@ -354,23 +355,61 @@ static void pdfapp_gotopage(pdfapp_t *app, int number) pdfapp_showpage(app, 1, 1, 1); } -static inline fz_bbox bboxcharat(fz_text_span *span, int idx) +static fz_text_char textcharat(fz_text_page *page, int idx) { + static fz_text_char emptychar = { {0,0,0,0}, ' ' }; + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; int ofs = 0; - while (span) + for (block = page->blocks; block < page->blocks + page->len; block++) { - if (idx < ofs + span->len) - return span->text[idx - ofs].bbox; - if (span->eol) + for (line = block->lines; line < block->lines + block->len; line++) { - if (idx == ofs + span->len) - return fz_empty_bbox; - ofs ++; + for (span = line->spans; span < line->spans + line->len; span++) + { + if (idx < ofs + span->len) + return span->text[idx - ofs]; + /* pseudo-newline */ + if (span + 1 == line->spans + line->len) + { + if (idx == ofs + span->len) + return emptychar; + ofs++; + } + ofs += span->len; + } } - ofs += span->len; - span = span->next; } - return fz_empty_bbox; + return emptychar; +} + +static int textlen(fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + int len = 0; + for (block = page->blocks; block < page->blocks + page->len; block++) + { + for (line = block->lines; line < block->lines + block->len; line++) + { + for (span = line->spans; span < line->spans + line->len; span++) + len += span->len; + len++; /* pseudo-newline */ + } + } + return len; +} + +static inline int charat(fz_text_page *page, int idx) +{ + return textcharat(page, idx).c; +} + +static inline fz_bbox bboxcharat(fz_text_page *page, int idx) +{ + return fz_round_rect(textcharat(page, idx).bbox); } void pdfapp_inverthit(pdfapp_t *app) @@ -404,52 +443,20 @@ void pdfapp_inverthit(pdfapp_t *app) pdfapp_invert(app, fz_transform_bbox(ctm, hitbox)); } -static inline int charat(fz_text_span *span, int idx) -{ - int ofs = 0; - while (span) - { - if (idx < ofs + span->len) - return span->text[idx - ofs].c; - if (span->eol) - { - if (idx == ofs + span->len) - return ' '; - ofs ++; - } - ofs += span->len; - span = span->next; - } - return 0; -} - -static int textlen(fz_text_span *span) -{ - int len = 0; - while (span) - { - len += span->len; - if (span->eol) - len ++; - span = span->next; - } - return len; -} - -static int match(char *s, fz_text_span *span, int n) +static int match(char *s, fz_text_page *page, int n) { int orig = n; int c; while ((c = *s++)) { - if (c == ' ' && charat(span, n) == ' ') + if (c == ' ' && charat(page, n) == ' ') { - while (charat(span, n) == ' ') + while (charat(page, n) == ' ') n++; } else { - if (tolower(c) != tolower(charat(span, n))) + if (tolower(c) != tolower(charat(page, n))) return 0; n++; } @@ -1067,6 +1074,9 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen) { fz_bbox hitbox; fz_matrix ctm; + fz_text_page *page = app->page_text; + fz_text_block *block; + fz_text_line *line; fz_text_span *span; int c, i, p; int seen; @@ -1079,32 +1089,40 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen) ctm = pdfapp_viewctm(app); p = 0; - for (span = app->page_text; span; span = span->next) - { - seen = 0; - for (i = 0; i < span->len; i++) + for (block = page->blocks; block < page->blocks + page->len; block++) + { + for (line = block->lines; line < block->lines + block->len; line++) { - hitbox = fz_transform_bbox(ctm, span->text[i].bbox); - c = span->text[i].c; - if (c < 32) - c = '?'; - if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) + for (span = line->spans; span < line->spans + line->len; span++) { - if (p < ucslen - 1) - ucsbuf[p++] = c; - seen = 1; - } - } + seen = 0; - if (seen && span->eol) - { + for (i = 0; i < span->len; i++) + { + hitbox = fz_round_rect(span->text[i].bbox); + hitbox = fz_transform_bbox(ctm, hitbox); + c = span->text[i].c; + if (c < 32) + c = '?'; + if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) + { + if (p < ucslen - 1) + ucsbuf[p++] = c; + seen = 1; + } + } + + if (seen && span + 1 == line->spans + line->len) + { #ifdef _WIN32 - if (p < ucslen - 1) - ucsbuf[p++] = '\r'; + if (p < ucslen - 1) + ucsbuf[p++] = '\r'; #endif - if (p < ucslen - 1) - ucsbuf[p++] = '\n'; + if (p < ucslen - 1) + ucsbuf[p++] = '\n'; + } + } } } diff --git a/apps/pdfapp.h b/apps/pdfapp.h index 0c1b6ac4..db83335f 100644 --- a/apps/pdfapp.h +++ b/apps/pdfapp.h @@ -52,7 +52,8 @@ struct pdfapp_s fz_page *page; fz_rect page_bbox; fz_display_list *page_list; - fz_text_span *page_text; + fz_text_page *page_text; + fz_text_sheet *page_sheet; fz_link *page_links; /* snapback history */ diff --git a/apps/x11_main.c b/apps/x11_main.c index 091f0ec6..fe0196fc 100644 --- a/apps/x11_main.c +++ b/apps/x11_main.c @@ -466,7 +466,7 @@ void windocopy(pdfapp_t *app) { ucs = ucs2[0]; - utf8 += runetochar(utf8, &ucs); + utf8 += fz_runetochar(utf8, ucs); if (ucs < 256) *latin1++ = ucs; |