diff options
-rw-r--r-- | apps/mudraw.c | 58 | ||||
-rw-r--r-- | apps/pdfapp.c | 160 | ||||
-rw-r--r-- | apps/pdfapp.h | 3 | ||||
-rw-r--r-- | apps/x11_main.c | 2 | ||||
-rw-r--r-- | fitz/dev_text.c | 689 | ||||
-rw-r--r-- | fitz/fitz.h | 102 |
6 files changed, 716 insertions, 298 deletions
diff --git a/apps/mudraw.c b/apps/mudraw.c index 19f359a1..31847665 100644 --- a/apps/mudraw.c +++ b/apps/mudraw.c @@ -10,6 +10,8 @@ #include <sys/time.h> #endif +enum { TEXT_PLAIN = 1, TEXT_HTML = 2, TEXT_XML = 3 }; + static char *output = NULL; static float resolution = 72; static float rotation = 0; @@ -28,6 +30,7 @@ static int width = 0; static int height = 0; static int fit = 0; +static fz_text_sheet *sheet = NULL; static fz_colorspace *colorspace; static char *filename; @@ -157,42 +160,43 @@ static void drawpage(fz_context *ctx, fz_document *doc, int pagenum) if (showtext) { - fz_text_span *text = NULL; + fz_text_page *text = NULL; fz_var(text); fz_try(ctx) { - text = fz_new_text_span(ctx); - dev = fz_new_text_device(ctx, text); + text = fz_new_text_page(ctx, fz_bound_page(doc, page)); + dev = fz_new_text_device(ctx, sheet, text); if (list) fz_run_display_list(list, dev, fz_identity, fz_infinite_bbox, NULL); else fz_run_page(doc, page, dev, fz_identity, NULL); fz_free_device(dev); dev = NULL; - if (showtext > 1) + if (showtext == TEXT_XML) { - printf("<page number=\"%d\">\n", pagenum); - fz_debug_text_span_xml(text); - printf("</page>\n"); + fz_print_text_page_xml(stdout, text); } - else + else if (showtext == TEXT_HTML) + { + fz_print_text_page_html(stdout, text); + } + else if (showtext == TEXT_PLAIN) { - printf("[Page %d]\n", pagenum); - fz_debug_text_span(text); + fz_print_text_page(stdout, text); + printf("\f\n"); } - printf("\n"); } fz_catch(ctx) { fz_free_device(dev); - fz_free_text_span(ctx, text); + fz_free_text_page(ctx, text); fz_free_display_list(ctx, list); fz_free_page(doc, page); fz_rethrow(ctx); } - fz_free_text_span(ctx, text); + fz_free_text_page(ctx, text); } if (showmd5 || showtime) @@ -456,9 +460,23 @@ int main(int argc, char **argv) timing.minpage = 0; timing.maxpage = 0; - if (showxml || showtext > 1) + if (showxml || showtext == TEXT_XML) printf("<?xml version=\"1.0\"?>\n"); + if (showtext) + sheet = fz_new_text_sheet(ctx); + + if 
(showtext == TEXT_HTML) + { + printf("<style>\n"); + printf("body{background-color:gray;margin:12tp;}\n"); + printf("div.page{background-color:white;margin:6pt;padding:6pt;}\n"); + printf("div.block{border:1px solid gray;margin:6pt;padding:6pt;}\n"); + printf("p{margin:0;padding:0;}\n"); + printf("</style>\n"); + printf("<body>\n"); + } + fz_try(ctx) { while (fz_optind < argc) @@ -478,7 +496,7 @@ int main(int argc, char **argv) if (!fz_authenticate_password(doc, password)) fz_throw(ctx, "cannot authenticate password: %s", filename); - if (showxml || showtext > 1) + if (showxml || showtext == TEXT_XML) printf("<document name=\"%s\">\n", filename); if (showoutline) @@ -492,7 +510,7 @@ int main(int argc, char **argv) drawrange(ctx, doc, argv[fz_optind++]); } - if (showxml || showtext > 1) + if (showxml || showtext == TEXT_XML) printf("</document>\n"); fz_close_document(doc); @@ -504,6 +522,14 @@ int main(int argc, char **argv) fz_close_document(doc); } + if (showtext == TEXT_HTML) + { + printf("</body>\n"); + printf("<style>\n"); + fz_print_text_sheet(stdout, sheet); + printf("</style>\n"); + } + if (showtime) { printf("total %dms / %d pages for an average of %dms\n", diff --git a/apps/pdfapp.c b/apps/pdfapp.c index 4f093508..e5742645 100644 --- a/apps/pdfapp.c +++ b/apps/pdfapp.c @@ -157,7 +157,7 @@ void pdfapp_close(pdfapp_t *app) app->page_list = NULL; if (app->page_text) - fz_free_text_span(app->ctx, app->page_text); + fz_free_text_page(app->ctx, app->page_text); app->page_text = NULL; if (app->page_links) @@ -228,7 +228,7 @@ static void pdfapp_loadpage(pdfapp_t *app) if (app->page_list) fz_free_display_list(app->ctx, app->page_list); if (app->page_text) - fz_free_text_span(app->ctx, app->page_text); + fz_free_text_page(app->ctx, app->page_text); if (app->page_links) fz_drop_link(app->ctx, app->page_links); if (app->page) @@ -273,8 +273,9 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai app->hitlen = 0; /* Extract text */ - 
app->page_text = fz_new_text_span(app->ctx); - tdev = fz_new_text_device(app->ctx, app->page_text); + app->page_sheet = fz_new_text_sheet(app->ctx); + app->page_text = fz_new_text_page(app->ctx, app->page_bbox); + tdev = fz_new_text_device(app->ctx, app->page_sheet, app->page_text); fz_run_display_list(app->page_list, tdev, fz_identity, fz_infinite_bbox, NULL); fz_free_device(tdev); } @@ -354,23 +355,61 @@ static void pdfapp_gotopage(pdfapp_t *app, int number) pdfapp_showpage(app, 1, 1, 1); } -static inline fz_bbox bboxcharat(fz_text_span *span, int idx) +static fz_text_char textcharat(fz_text_page *page, int idx) { + static fz_text_char emptychar = { {0,0,0,0}, ' ' }; + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; int ofs = 0; - while (span) + for (block = page->blocks; block < page->blocks + page->len; block++) { - if (idx < ofs + span->len) - return span->text[idx - ofs].bbox; - if (span->eol) + for (line = block->lines; line < block->lines + block->len; line++) { - if (idx == ofs + span->len) - return fz_empty_bbox; - ofs ++; + for (span = line->spans; span < line->spans + line->len; span++) + { + if (idx < ofs + span->len) + return span->text[idx - ofs]; + /* pseudo-newline */ + if (span + 1 == line->spans + line->len) + { + if (idx == ofs + span->len) + return emptychar; + ofs++; + } + ofs += span->len; + } } - ofs += span->len; - span = span->next; } - return fz_empty_bbox; + return emptychar; +} + +static int textlen(fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + int len = 0; + for (block = page->blocks; block < page->blocks + page->len; block++) + { + for (line = block->lines; line < block->lines + block->len; line++) + { + for (span = line->spans; span < line->spans + line->len; span++) + len += span->len; + len++; /* pseudo-newline */ + } + } + return len; +} + +static inline int charat(fz_text_page *page, int idx) +{ + return textcharat(page, idx).c; +} + +static inline fz_bbox 
bboxcharat(fz_text_page *page, int idx) +{ + return fz_round_rect(textcharat(page, idx).bbox); } void pdfapp_inverthit(pdfapp_t *app) @@ -404,52 +443,20 @@ void pdfapp_inverthit(pdfapp_t *app) pdfapp_invert(app, fz_transform_bbox(ctm, hitbox)); } -static inline int charat(fz_text_span *span, int idx) -{ - int ofs = 0; - while (span) - { - if (idx < ofs + span->len) - return span->text[idx - ofs].c; - if (span->eol) - { - if (idx == ofs + span->len) - return ' '; - ofs ++; - } - ofs += span->len; - span = span->next; - } - return 0; -} - -static int textlen(fz_text_span *span) -{ - int len = 0; - while (span) - { - len += span->len; - if (span->eol) - len ++; - span = span->next; - } - return len; -} - -static int match(char *s, fz_text_span *span, int n) +static int match(char *s, fz_text_page *page, int n) { int orig = n; int c; while ((c = *s++)) { - if (c == ' ' && charat(span, n) == ' ') + if (c == ' ' && charat(page, n) == ' ') { - while (charat(span, n) == ' ') + while (charat(page, n) == ' ') n++; } else { - if (tolower(c) != tolower(charat(span, n))) + if (tolower(c) != tolower(charat(page, n))) return 0; n++; } @@ -1067,6 +1074,9 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen) { fz_bbox hitbox; fz_matrix ctm; + fz_text_page *page = app->page_text; + fz_text_block *block; + fz_text_line *line; fz_text_span *span; int c, i, p; int seen; @@ -1079,32 +1089,40 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen) ctm = pdfapp_viewctm(app); p = 0; - for (span = app->page_text; span; span = span->next) - { - seen = 0; - for (i = 0; i < span->len; i++) + for (block = page->blocks; block < page->blocks + page->len; block++) + { + for (line = block->lines; line < block->lines + block->len; line++) { - hitbox = fz_transform_bbox(ctm, span->text[i].bbox); - c = span->text[i].c; - if (c < 32) - c = '?'; - if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) + for (span = line->spans; span < 
line->spans + line->len; span++) { - if (p < ucslen - 1) - ucsbuf[p++] = c; - seen = 1; - } - } + seen = 0; - if (seen && span->eol) - { + for (i = 0; i < span->len; i++) + { + hitbox = fz_round_rect(span->text[i].bbox); + hitbox = fz_transform_bbox(ctm, hitbox); + c = span->text[i].c; + if (c < 32) + c = '?'; + if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) + { + if (p < ucslen - 1) + ucsbuf[p++] = c; + seen = 1; + } + } + + if (seen && span + 1 == line->spans + line->len) + { #ifdef _WIN32 - if (p < ucslen - 1) - ucsbuf[p++] = '\r'; + if (p < ucslen - 1) + ucsbuf[p++] = '\r'; #endif - if (p < ucslen - 1) - ucsbuf[p++] = '\n'; + if (p < ucslen - 1) + ucsbuf[p++] = '\n'; + } + } } } diff --git a/apps/pdfapp.h b/apps/pdfapp.h index 0c1b6ac4..db83335f 100644 --- a/apps/pdfapp.h +++ b/apps/pdfapp.h @@ -52,7 +52,8 @@ struct pdfapp_s fz_page *page; fz_rect page_bbox; fz_display_list *page_list; - fz_text_span *page_text; + fz_text_page *page_text; + fz_text_sheet *page_sheet; fz_link *page_links; /* snapback history */ diff --git a/apps/x11_main.c b/apps/x11_main.c index 091f0ec6..fe0196fc 100644 --- a/apps/x11_main.c +++ b/apps/x11_main.c @@ -466,7 +466,7 @@ void windocopy(pdfapp_t *app) { ucs = ucs2[0]; - utf8 += runetochar(utf8, &ucs); + utf8 += fz_runetochar(utf8, ucs); if (ucs < 256) *latin1++ = ucs; diff --git a/fitz/dev_text.c b/fitz/dev_text.c index 79d8c137..7a56b56c 100644 --- a/fitz/dev_text.c +++ b/fitz/dev_text.c @@ -2,6 +2,7 @@ #define LINE_DIST 0.9f #define SPACE_DIST 0.2f +#define PARAGRAPH_DIST 0.5f #include <ft2build.h> #include FT_FREETYPE_H @@ -11,60 +12,211 @@ typedef struct fz_text_device_s fz_text_device; struct fz_text_device_s { + fz_text_sheet *sheet; + fz_text_page *page; + fz_text_line cur_line; + fz_text_span cur_span; fz_point point; - fz_text_span *head; - fz_text_span *span; }; -fz_text_span * -fz_new_text_span(fz_context *ctx) +fz_text_sheet * +fz_new_text_sheet(fz_context *ctx) { - fz_text_span *span; - 
span = fz_malloc_struct(ctx, fz_text_span); - span->font = NULL; - span->wmode = 0; - span->size = 0; - span->len = 0; - span->cap = 0; - span->text = NULL; - span->next = NULL; - span->eol = 0; - return span; + fz_text_sheet *sheet = fz_malloc(ctx, sizeof *sheet); + sheet->maxid = 0; + sheet->style = NULL; + return sheet; } void -fz_free_text_span(fz_context *ctx, fz_text_span *span) +fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet) +{ + fz_text_style *style = sheet->style; + while (style) + { + fz_text_style *next = style->next; + fz_drop_font(ctx, style->font); + fz_free(ctx, style); + style = next; + } +} + +static fz_text_style * +fz_find_text_style_imp(fz_context *ctx, fz_text_sheet *sheet, + float size, fz_font *font, int wmode, int script) +{ + fz_text_style *style; + + for (style = sheet->style; style; style = style->next) + { + if (style->font == font && + style->size == size && + style->wmode == wmode && + style->script == script) /* FIXME: others */ + { + return style; + } + } + + /* Better make a new one and add it to our list */ + style = fz_malloc(ctx, sizeof *style); + style->id = sheet->maxid++; + style->font = fz_keep_font(ctx, font); + style->size = size; + style->wmode = wmode; + style->script = script; + style->next = sheet->style; + sheet->style = style; + return style; +} + +static fz_text_style * +fz_find_text_style(fz_context *ctx, fz_text_sheet *sheet, fz_text *text, fz_matrix *ctm, + fz_colorspace *colorspace, float *color, float alpha, fz_stroke_state *stroke) { - fz_text_span *next; + float size = 1.0f; + fz_font *font = text ? text->font : NULL; + int wmode = text ? 
text->wmode : 0; + if (ctm && text) + { + fz_matrix tm = text->trm; + fz_matrix trm; + tm.e = 0; + tm.f = 0; + trm = fz_concat(tm, *ctm); + size = fz_matrix_expansion(trm); + } + return fz_find_text_style_imp(ctx, sheet, size, font, wmode, 0); +} - while (span) +fz_text_page * +fz_new_text_page(fz_context *ctx, fz_rect mediabox) +{ + fz_text_page *page = fz_malloc(ctx, sizeof(*page)); + page->mediabox = mediabox; + page->len = 0; + page->cap = 0; + page->blocks = NULL; + return page; +} + +void +fz_free_text_page(fz_context *ctx, fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + for (block = page->blocks; block < page->blocks + page->len; block++) { - if (span->font) - fz_drop_font(ctx, span->font); - next = span->next; - fz_free(ctx, span->text); - fz_free(ctx, span); - span = next; + for (line = block->lines; line < block->lines + block->len; line++) + { + for (span = line->spans; span < line->spans + line->len; span++) + { + fz_free(ctx, span->text); + } + fz_free(ctx, line->spans); + } + fz_free(ctx, block->lines); } + fz_free(ctx, page->blocks); + fz_free(ctx, page); } static void -fz_add_text_char_imp(fz_context *ctx, fz_text_span *span, int c, fz_bbox bbox) +append_char(fz_context *ctx, fz_text_span *span, int c, fz_rect bbox) { - if (span->len + 1 >= span->cap) + if (span->len == span->cap) { - span->cap = span->cap > 1 ? 
(span->cap * 3) / 2 : 80; - span->text = fz_resize_array(ctx, span->text, span->cap, sizeof(fz_text_char)); + span->cap = MAX(64, span->cap * 2); + span->text = fz_resize_array(ctx, span->text, span->cap, sizeof(*span->text)); } + span->bbox = fz_union_rect(span->bbox, bbox); span->text[span->len].c = c; span->text[span->len].bbox = bbox; - span->len ++; + span->len++; +} + +static void +init_span(fz_context *ctx, fz_text_span *span, fz_text_style *style) +{ + span->style = style; + span->bbox = fz_empty_rect; + span->len = span->cap = 0; + span->text = NULL; +} + +static void +append_span(fz_context *ctx, fz_text_line *line, fz_text_span *span) +{ + if (line->len == line->cap) + { + line->cap = MAX(8, line->cap * 2); + line->spans = fz_resize_array(ctx, line->spans, line->cap, sizeof(*line->spans)); + } + line->bbox = fz_union_rect(line->bbox, span->bbox); + line->spans[line->len++] = *span; } -static fz_bbox -fz_split_bbox(fz_bbox bbox, int i, int n) +static void +init_line(fz_context *ctx, fz_text_line *line) +{ + line->bbox = fz_empty_rect; + line->len = line->cap = 0; + line->spans = NULL; +} + +static void +append_line(fz_context *ctx, fz_text_block *block, fz_text_line *line) { - float w = (float)(bbox.x1 - bbox.x0) / n; + if (block->len == block->cap) + { + block->cap = MAX(16, block->cap * 2); + block->lines = fz_resize_array(ctx, block->lines, block->cap, sizeof *block->lines); + } + block->bbox = fz_union_rect(block->bbox, line->bbox); + block->lines[block->len++] = *line; +} + +static fz_text_block * +find_block_for_line(fz_context *ctx, fz_text_page *page, fz_text_line *line) +{ + float size = line->len > 0 && line->spans[0].len > 0 ? 
line->spans[0].style->size : 1; + int i; + + for (i = 0; i < page->len; i++) + { + fz_text_block *block = page->blocks + i; + int w = block->bbox.x1 - block->bbox.x0; + if (block->bbox.y0 - line->bbox.y1 < size * PARAGRAPH_DIST) + if (line->bbox.x0 < block->bbox.x1 && line->bbox.x1 > block->bbox.x0) + if (ABS(line->bbox.x0 - block->bbox.x0) < w / 4) + return block; + } + + if (page->len == page->cap) + { + page->cap = MAX(16, page->cap * 2); + page->blocks = fz_resize_array(ctx, page->blocks, page->cap, sizeof(*page->blocks)); + } + + page->blocks[page->len].bbox = fz_empty_rect; + page->blocks[page->len].len = 0; + page->blocks[page->len].cap = 0; + page->blocks[page->len].lines = NULL; + + return &page->blocks[page->len++]; +} + +static void +insert_line(fz_context *ctx, fz_text_page *page, fz_text_line *line) +{ + append_line(ctx, find_block_for_line(ctx, page, line), line); +} + +static fz_rect +fz_split_bbox(fz_rect bbox, int i, int n) +{ + float w = (bbox.x1 - bbox.x0) / n; float x0 = bbox.x0; bbox.x0 = x0 + i * w; bbox.x1 = x0 + (i + 1) * w; @@ -72,154 +224,71 @@ fz_split_bbox(fz_bbox bbox, int i, int n) } static void -fz_add_text_char(fz_context *ctx, fz_text_span **last, fz_font *font, float size, int wmode, int c, fz_bbox bbox) +fz_flush_text_line(fz_context *ctx, fz_text_device *dev, fz_text_style *style) { - fz_text_span *span = *last; - - if (!span->font) - { - span->font = fz_keep_font(ctx, font); - span->size = size; - } + append_span(ctx, &dev->cur_line, &dev->cur_span); + insert_line(ctx, dev->page, &dev->cur_line); + init_span(ctx, &dev->cur_span, style); + init_line(ctx, &dev->cur_line); +} - if ((span->font != font || span->size != size || span->wmode != wmode) && c != 32) +static void +fz_add_text_char_imp(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_rect bbox) +{ + if (!dev->cur_span.style) + dev->cur_span.style = style; + if (style != dev->cur_span.style) { - span = fz_new_text_span(ctx); - span->font = 
fz_keep_font(ctx, font); - span->size = size; - span->wmode = wmode; - (*last)->next = span; - *last = span; + append_span(ctx, &dev->cur_line, &dev->cur_span); + init_span(ctx, &dev->cur_span, style); } + append_char(ctx, &dev->cur_span, c, bbox); +} +static void +fz_add_text_char(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_rect bbox) +{ switch (c) { case -1: /* ignore when one unicode character maps to multiple glyphs */ break; case 0xFB00: /* ff */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2)); - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 2)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 2)); break; case 0xFB01: /* fi */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2)); - fz_add_text_char_imp(ctx, span, 'i', fz_split_bbox(bbox, 1, 2)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2)); + fz_add_text_char_imp(ctx, dev, style, 'i', fz_split_bbox(bbox, 1, 2)); break; case 0xFB02: /* fl */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2)); - fz_add_text_char_imp(ctx, span, 'l', fz_split_bbox(bbox, 1, 2)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2)); + fz_add_text_char_imp(ctx, dev, style, 'l', fz_split_bbox(bbox, 1, 2)); break; case 0xFB03: /* ffi */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 3)); - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 3)); - fz_add_text_char_imp(ctx, span, 'i', fz_split_bbox(bbox, 2, 3)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 3)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 3)); + fz_add_text_char_imp(ctx, dev, style, 'i', fz_split_bbox(bbox, 2, 3)); break; case 0xFB04: /* ffl */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 3)); - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 3)); - 
fz_add_text_char_imp(ctx, span, 'l', fz_split_bbox(bbox, 2, 3)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 3)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 3)); + fz_add_text_char_imp(ctx, dev, style, 'l', fz_split_bbox(bbox, 2, 3)); break; case 0xFB05: /* long st */ case 0xFB06: /* st */ - fz_add_text_char_imp(ctx, span, 's', fz_split_bbox(bbox, 0, 2)); - fz_add_text_char_imp(ctx, span, 't', fz_split_bbox(bbox, 1, 2)); + fz_add_text_char_imp(ctx, dev, style, 's', fz_split_bbox(bbox, 0, 2)); + fz_add_text_char_imp(ctx, dev, style, 't', fz_split_bbox(bbox, 1, 2)); break; default: - fz_add_text_char_imp(ctx, span, c, bbox); + fz_add_text_char_imp(ctx, dev, style, c, bbox); break; } } static void -fz_divide_text_chars(fz_text_span **last, int n, fz_bbox bbox) -{ - fz_text_span *span = *last; - int i, x; - x = span->len - n; - if (x >= 0) - for (i = 0; i < n; i++) - span->text[x + i].bbox = fz_split_bbox(bbox, i, n); -} - -static void -fz_add_text_newline(fz_context *ctx, fz_text_span **last, fz_font *font, float size, int wmode) -{ - fz_text_span *span; - span = fz_new_text_span(ctx); - span->font = fz_keep_font(ctx, font); - span->size = size; - span->wmode = wmode; - (*last)->eol = 1; - (*last)->next = span; - *last = span; -} - -void -fz_debug_text_span_xml(fz_text_span *span) -{ - char buf[10]; - int c, n, k, i; - - while (span) - { - printf("<span font=\"%s\" size=\"%g\" wmode=\"%d\" eol=\"%d\">\n", - span->font ? 
span->font->name : "NULL", span->size, span->wmode, span->eol); - - for (i = 0; i < span->len; i++) - { - printf("\t<char ucs=\""); - c = span->text[i].c; - if (c < 128) - putchar(c); - else - { - n = fz_runetochar(buf, c); - for (k = 0; k < n; k++) - putchar(buf[k]); - } - printf("\" bbox=\"%d %d %d %d\" />\n", - span->text[i].bbox.x0, - span->text[i].bbox.y0, - span->text[i].bbox.x1, - span->text[i].bbox.y1); - } - - printf("</span>\n"); - - span = span->next; - } -} - -void -fz_debug_text_span(fz_text_span *span) -{ - char buf[10]; - int c, n, k, i; - - while (span) - { - for (i = 0; i < span->len; i++) - { - c = span->text[i].c; - if (c < 128) - putchar(c); - else - { - n = fz_runetochar(buf, c); - for (k = 0; k < n; k++) - putchar(buf[k]); - } - } - - if (span->eol) - putchar('\n'); - - span = span->next; - } -} - -static void -fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_matrix ctm, fz_point *pen) +fz_text_extract(fz_context *ctx, fz_text_device *dev, fz_text *text, fz_matrix ctm, fz_text_style *style) { + fz_point *pen = &dev->point; fz_font *font = text->font; FT_Face face = font->ft_face; fz_matrix tm = text->trm; @@ -233,19 +302,20 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat float ascender = 1; float descender = 0; int multi; - int i, err; + int i, j, err; if (text->len == 0) return; - fz_lock(ctx, FZ_LOCK_FREETYPE); if (font->ft_face) { + fz_lock(ctx, FZ_LOCK_FREETYPE); err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72); if (err) fz_warn(ctx, "freetype set character size: %s", ft_error_string(err)); ascender = (float)face->ascender / face->units_per_EM; descender = (float)face->descender / face->units_per_EM; + fz_unlock(ctx, FZ_LOCK_FREETYPE); } rect = fz_empty_rect; @@ -264,6 +334,7 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat tm.e = 0; tm.f = 0; trm = fz_concat(tm, ctm); + dir = fz_transform_vector(trm, dir); dist = sqrtf(dir.x * dir.x + 
dir.y * dir.y); ndir.x = dir.x / dist; @@ -271,19 +342,10 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat size = fz_matrix_expansion(trm); - multi = 1; + int lastchar = ' '; for (i = 0; i < text->len; i++) { - if (text->items[i].gid < 0) - { - fz_add_text_char(ctx, last, font, size, text->wmode, text->items[i].ucs, fz_round_rect(rect)); - multi ++; - fz_divide_text_chars(last, multi, fz_round_rect(rect)); - continue; - } - multi = 1; - /* Calculate new pen location and delta */ tm.e = text->items[i].x; tm.f = text->items[i].y; @@ -305,20 +367,19 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat if (dist > size * LINE_DIST) { - fz_add_text_newline(ctx, last, font, size, text->wmode); + fz_flush_text_line(ctx, dev, style); + lastchar = ' '; } - else if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST) + else if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST && lastchar != ' ') { - if ((*last)->len > 0 && (*last)->text[(*last)->len - 1].c != ' ') - { - fz_rect spacerect; - spacerect.x0 = -0.2f; - spacerect.y0 = 0; - spacerect.x1 = 0; - spacerect.y1 = 1; - spacerect = fz_transform_rect(trm, spacerect); - fz_add_text_char(ctx, last, font, size, text->wmode, ' ', fz_round_rect(spacerect)); - } + fz_rect spacerect; + spacerect.x0 = -0.2f; + spacerect.y0 = 0; + spacerect.x1 = 0; + spacerect.y1 = 1; + spacerect = fz_transform_rect(trm, spacerect); + fz_add_text_char(ctx, dev, style, ' ', spacerect); + lastchar = ' '; } } @@ -331,8 +392,13 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat /* TODO: freetype returns broken vertical metrics */ /* if (text->wmode) mask |= FT_LOAD_VERTICAL_LAYOUT; */ + fz_lock(ctx, FZ_LOCK_FREETYPE); + err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72); + if (err) + fz_warn(ctx, "freetype set character size: %s", ft_error_string(err)); FT_Get_Advance(font->ft_face, text->items[i].gid, mask, &ftadv); adv = ftadv / 65536.0f; + fz_unlock(ctx, 
FZ_LOCK_FREETYPE); rect.x0 = 0; rect.y0 = descender; @@ -352,9 +418,27 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat pen->x = trm.e + dir.x * adv; pen->y = trm.f + dir.y * adv; - fz_add_text_char(ctx, last, font, size, text->wmode, text->items[i].ucs, fz_round_rect(rect)); + /* Check for one glyph to many char mapping */ + for (j = i + 1; j < text->len; j++) + if (text->items[j].gid >= 0) + break; + multi = j - i; + + if (multi == 1) + { + fz_add_text_char(ctx, dev, style, text->items[i].ucs, rect); + } + else + { + for (j = 0; j < multi; j++) + { + fz_rect part = fz_split_bbox(rect, j, multi); + fz_add_text_char(ctx, dev, style, text->items[i].ucs, part); + } + } + + lastchar = text->items[i].ucs; } - fz_unlock(ctx, FZ_LOCK_FREETYPE); } static void @@ -362,7 +446,9 @@ fz_text_fill_text(fz_device *dev, fz_text *text, fz_matrix ctm, fz_colorspace *colorspace, float *color, float alpha) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, colorspace, color, alpha, NULL); + fz_text_extract(dev->ctx, tdev, text, ctm, style); } static void @@ -370,36 +456,57 @@ fz_text_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, fz_m fz_colorspace *colorspace, float *color, float alpha) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, colorspace, color, alpha, stroke); + fz_text_extract(dev->ctx, tdev, text, ctm, style); } static void fz_text_clip_text(fz_device *dev, fz_text *text, fz_matrix ctm, int accumulate) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, NULL); + 
fz_text_extract(dev->ctx, tdev, text, ctm, style); } static void fz_text_clip_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, fz_matrix ctm) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, stroke); + fz_text_extract(dev->ctx, tdev, text, ctm, style); } static void fz_text_ignore_text(fz_device *dev, fz_text *text, fz_matrix ctm) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, NULL); + fz_text_extract(dev->ctx, tdev, text, ctm, style); +} + +static int cmp_block(const void *av, const void *bv) +{ + const fz_text_block *a = av; + const fz_text_block *b = bv; + int x = a->bbox.x0 - b->bbox.x0; + if (x) return x; + return -(a->bbox.y0 - b->bbox.y0); } static void fz_text_free_user(fz_device *dev) { + fz_context *ctx = dev->ctx; fz_text_device *tdev = dev->user; - tdev->span->eol = 1; + append_span(ctx, &tdev->cur_line, &tdev->cur_span); + insert_line(ctx, tdev->page, &tdev->cur_line); + + qsort(tdev->page->blocks, tdev->page->len, sizeof *tdev->page->blocks, cmp_block); /* TODO: unicode NFC normalization */ /* TODO: bidi logical reordering */ @@ -408,15 +515,19 @@ fz_text_free_user(fz_device *dev) } fz_device * -fz_new_text_device(fz_context *ctx, fz_text_span *root) +fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page) { fz_device *dev; + fz_text_device *tdev = fz_malloc_struct(ctx, fz_text_device); - tdev->head = root; - tdev->span = root; + tdev->sheet = sheet; + tdev->page = page; tdev->point.x = -1; tdev->point.y = -1; + init_line(ctx, &tdev->cur_line); + init_span(ctx, &tdev->cur_span, NULL); + dev = fz_new_device(ctx, tdev); dev->hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE; dev->free_user = 
fz_text_free_user; @@ -427,3 +538,209 @@ fz_new_text_device(fz_context *ctx, fz_text_span *root) dev->ignore_text = fz_text_ignore_text; return dev; } + +/* XML, HTML and plain-text output */ + +static int font_is_bold(fz_font *font) +{ + FT_Face face = font->ft_face; + if (face && (face->style_flags & FT_STYLE_FLAG_BOLD)) + return 1; + if (strstr(font->name, "Bold")) + return 1; + return 0; +} + +static int font_is_italic(fz_font *font) +{ + FT_Face face = font->ft_face; + if (face && (face->style_flags & FT_STYLE_FLAG_ITALIC)) + return 1; + if (strstr(font->name, "Italic") || strstr(font->name, "Oblique")) + return 1; + return 0; +} + +static void +fz_print_style_begin(FILE *out, fz_text_style *style) +{ + int script = style->script; + fprintf(out, "<span class=\"s%d\">", style->id); + while (script-- > 0) + fprintf(out, "<sup>"); + while (++script < 0) + fprintf(out, "<sub>"); +} + +static void +fz_print_style_end(FILE *out, fz_text_style *style) +{ + int script = style->script; + while (script-- > 0) + fprintf(out, "</sup>"); + while (++script < 0) + fprintf(out, "</sub>"); + fprintf(out, "</span>"); +} + +static void +fz_print_style(FILE *out, fz_text_style *style) +{ + char *s = strchr(style->font->name, '+'); + s = s ? 
s + 1 : style->font->name; + fprintf(out, "span.s%d{font-family:\"%s\";font-size:%gpt;", + style->id, s, style->size); + if (font_is_italic(style->font)) + fprintf(out, "font-style:italic;"); + if (font_is_bold(style->font)) + fprintf(out, "font-weight:bold;"); + fprintf(out, "}\n"); +} + +void +fz_print_text_sheet(FILE *out, fz_text_sheet *sheet) +{ + fz_text_style *style; + for (style = sheet->style; style; style = style->next) + fz_print_style(out, style); +} + +void +fz_print_text_page_html(FILE *out, fz_text_page *page) +{ + int block_n, line_n, span_n, ch_n; + fz_text_style *style = NULL; + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + + fprintf(out, "<div class=\"page\">\n"); + + for (block_n = 0; block_n < page->len; block_n++) + { + block = &page->blocks[block_n]; + fprintf(out, "<div class=\"block\">\n"); + for (line_n = 0; line_n < block->len; line_n++) + { + line = &block->lines[line_n]; + fprintf(out, "<p>"); + style = NULL; + + for (span_n = 0; span_n < line->len; span_n++) + { + span = &line->spans[span_n]; + if (style != span->style) + { + if (style != NULL) + fz_print_style_end(out, style); + fz_print_style_begin(out, span->style); + style = span->style; + } + + for (ch_n = 0; ch_n < span->len; ch_n++) + { + fz_text_char *ch = &span->text[ch_n]; + if (ch->c == '<') + fprintf(out, "&lt;"); + else if (ch->c == '>') + fprintf(out, "&gt;"); + else if (ch->c == '&') + fprintf(out, "&amp;"); + else if (ch->c >= 32 && ch->c <= 127) + fprintf(out, "%c", ch->c); + else + fprintf(out, "&#x%x;", ch->c); + } + } + fz_print_style_end(out, style); + fprintf(out, "</p>\n"); + } + fprintf(out, "</div>\n"); + } + + fprintf(out, "</div>\n"); +} + +void +fz_print_text_page_xml(FILE *out, fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + fz_text_char *ch; + char *s; + + fprintf(out, "<page>\n"); + for (block = page->blocks; block < page->blocks + page->len; block++) + { + fprintf(out, "<block bbox=\"%g %g %g 
%g\">\n", + block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); + for (line = block->lines; line < block->lines + block->len; line++) + { + fprintf(out, "<line bbox=\"%g %g %g %g\">\n", + line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); + for (span = line->spans; span < line->spans + line->len; span++) + { + fz_text_style *style = span->style; + s = strchr(style->font->name, '+'); + s = s ? s + 1 : style->font->name; + fprintf(out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n", + span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1, + s, style->size); + for (ch = span->text; ch < span->text + span->len; ch++) + { + fprintf(out, "<char bbox=\"%g %g %g %g\" c=\"", + ch->bbox.x0, ch->bbox.y0, ch->bbox.x1, ch->bbox.y1); + switch (ch->c) + { + case '<': fprintf(out, "&lt;"); break; + case '>': fprintf(out, "&gt;"); break; + case '&': fprintf(out, "&amp;"); break; + case '"': fprintf(out, "&quot;"); break; + case '\'': fprintf(out, "&apos;"); break; + default: + if (ch->c >= 32 && ch->c <= 127) + fprintf(out, "%c", ch->c); + else + fprintf(out, "&#x%x;", ch->c); + break; + } + fprintf(out, "\"/>\n"); + } + fprintf(out, "</span>\n"); + } + fprintf(out, "</line>\n"); + } + fprintf(out, "</block>\n"); + } + fprintf(out, "</page>\n"); +} + +void +fz_print_text_page(FILE *out, fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + fz_text_char *ch; + char utf[10]; + int i, n; + + for (block = page->blocks; block < page->blocks + page->len; block++) + { + for (line = block->lines; line < block->lines + block->len; line++) + { + for (span = line->spans; span < line->spans + line->len; span++) + { + for (ch = span->text; ch < span->text + span->len; ch++) + { + n = fz_runetochar(utf, ch->c); + for (i = 0; i < n; i++) + putc(utf[i], out); + } + } + fprintf(out, "\n"); + } + fprintf(out, "\n"); + } +} diff --git a/fitz/fitz.h b/fitz/fitz.h index c01de8ac..8570f077 100644 --- a/fitz/fitz.h +++ b/fitz/fitz.h @@ -1424,46 
+1424,102 @@ fz_device *fz_new_bbox_device(fz_context *ctx, fz_bbox *bboxp); fz_device *fz_new_draw_device(fz_context *ctx, fz_pixmap *dest); /* - Text extraction device -*/ -typedef struct fz_text_span_s fz_text_span; + * Text extraction device + */ + +typedef struct fz_text_style_s fz_text_style; typedef struct fz_text_char_s fz_text_char; +typedef struct fz_text_span_s fz_text_span; +typedef struct fz_text_line_s fz_text_line; +typedef struct fz_text_block_s fz_text_block; + +typedef struct fz_text_sheet_s fz_text_sheet; +typedef struct fz_text_page_s fz_text_page; + +struct fz_text_style_s +{ + int id; + fz_font *font; + float size; + int wmode; + int script; + /* etc... */ + fz_text_style *next; +}; + +struct fz_text_sheet_s +{ + int maxid; + fz_text_style *style; +}; struct fz_text_char_s { + fz_rect bbox; int c; - fz_bbox bbox; }; struct fz_text_span_s { - fz_font *font; - float size; - int wmode; + fz_rect bbox; int len, cap; fz_text_char *text; - fz_text_span *next; - int eol; + fz_text_style *style; +}; + +struct fz_text_line_s +{ + fz_rect bbox; + int len, cap; + fz_text_span *spans; +}; + +struct fz_text_block_s +{ + fz_rect bbox; + int len, cap; + fz_text_line *lines; }; -fz_text_span *fz_new_text_span(fz_context *ctx); -void fz_free_text_span(fz_context *ctx, fz_text_span *line); -void fz_debug_text_span(fz_text_span *line); -void fz_debug_text_span_xml(fz_text_span *span); +struct fz_text_page_s +{ + fz_rect mediabox; + int len, cap; + fz_text_block *blocks; +}; /* - fz_new_text_device: Create a device to print the text on a - page in XML. + fz_new_text_device: Create a device to extract the text on a page. - The text on a page will be translated into a sequnce of XML - elements. For each text span the font, font size, writing mode - and end of line flag is printed. Since text can be placed at - arbitrary positions then heuristics must be used to try to - collect text spans together that are roughly located on the - same baseline. 
Each character in the text span will have its - UTF-8 character printed along with a bounding box containing it. + Gather and sort the text on a page into spans of uniform style, + arranged into lines and blocks by reading order. The reading order + is determined by various heuristics, so may not be accurate. */ -fz_device *fz_new_text_device(fz_context *ctx, fz_text_span *text); +fz_device *fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page); + +/* + fz_new_text_sheet: Create an empty style sheet. + + The style sheet is filled out by the text device, creating + one style for each unique font, color, size combination that + is used. +*/ +fz_text_sheet *fz_new_text_sheet(fz_context *ctx); +void fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet); + +/* + fz_new_text_page: Create an empty text page. + + The text page is filled out by the text device to contain the blocks, + lines and spans of text on the page. +*/ +fz_text_page *fz_new_text_page(fz_context *ctx, fz_rect mediabox); +void fz_free_text_page(fz_context *ctx, fz_text_page *page); + +void fz_print_text_sheet(FILE *out, fz_text_sheet *sheet); +void fz_print_text_page_html(FILE *out, fz_text_page *page); +void fz_print_text_page_xml(FILE *out, fz_text_page *page); +void fz_print_text_page(FILE *out, fz_text_page *page); /* * Cookie support - simple communication channel between app/library. |