diff options
Diffstat (limited to 'source/fitz/stext-output.c')
-rw-r--r-- | source/fitz/stext-output.c | 386 |
1 files changed, 163 insertions, 223 deletions
diff --git a/source/fitz/stext-output.c b/source/fitz/stext-output.c index 63124aa7..f5f72412 100644 --- a/source/fitz/stext-output.c +++ b/source/fitz/stext-output.c @@ -9,40 +9,28 @@ /* HTML output (visual formatting with preserved layout) */ static void -fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size) { - int is_bold = fz_font_is_bold(ctx, style->font); - int is_italic = fz_font_is_italic(ctx, style->font); - int is_serif = fz_font_is_serif(ctx, style->font); - int is_mono = fz_font_is_monospaced(ctx, style->font); - int script = style->script; + int is_bold = fz_font_is_bold(ctx, font); + int is_italic = fz_font_is_italic(ctx, font); + int is_serif = fz_font_is_serif(ctx, font); + int is_mono = fz_font_is_monospaced(ctx, font); - fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", style->size); + fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? 
"serif" : "sans-serif", size); if (is_mono) fz_write_string(ctx, out, "<tt>"); if (is_bold) fz_write_string(ctx, out, "<b>"); if (is_italic) fz_write_string(ctx, out, "<i>"); - - while (script-- > 0) - fz_write_string(ctx, out, "<sup>"); - while (++script < 0) - fz_write_string(ctx, out, "<sub>"); } static void -fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size) { - int is_mono = fz_font_is_monospaced(ctx, style->font); - int is_bold = fz_font_is_bold(ctx, style->font); - int is_italic = fz_font_is_italic(ctx, style->font); - int script = style->script; - - while (script-- > 0) - fz_write_string(ctx, out, "</sup>"); - while (++script < 0) - fz_write_string(ctx, out, "</sub>"); + int is_mono = fz_font_is_monospaced(ctx, font); + int is_bold = fz_font_is_bold(ctx,font); + int is_italic = fz_font_is_italic(ctx, font); if (is_italic) fz_write_string(ctx, out, "</i>"); @@ -54,7 +42,7 @@ fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style) } static void -fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *block) +fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) { int x = block->bbox.x0; int y = block->bbox.y0; @@ -62,90 +50,78 @@ fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *bl int h = block->bbox.y1 - block->bbox.y0; fz_write_printf(ctx, out, "<img style=\"top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"data:", y, x, w, h); - fz_write_image_as_data_uri(ctx, out, block->image); + fz_write_image_as_data_uri(ctx, out, block->u.i.image); fz_write_string(ctx, out, "\">\n"); } void fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) { - fz_stext_style *style = NULL; fz_stext_line *line; - fz_stext_span *span; fz_stext_char *ch; int x, y; - style = NULL; + fz_font *font = NULL; + float size = 0; - for (line = 
block->lines; line < block->lines + block->len; ++line) + for (line = block->u.t.first_line; line; line = line->next) { - for (span = line->first_span; span; span = span->next) + x = line->bbox.x0; + y = line->bbox.y0; + + fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x); + font = NULL; + + for (ch = line->first_char; ch; ch = ch->next) { - if (span == line->first_span || span->spacing > 1) + if (ch->font != font || ch->size != size) { - if (style) - { - fz_print_style_end_html(ctx, out, style); - fz_write_string(ctx, out, "</p>\n"); - style = NULL; - } - x = span->bbox.x0; - y = span->bbox.y0; - fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x); + if (font) + fz_print_style_end_html(ctx, out, font, size); + font = ch->font; + size = ch->size; + fz_print_style_begin_html(ctx, out, font, size); } - for (ch = span->text; ch < span->text + span->len; ++ch) + switch (ch->c) { - if (ch->style != style) - { - if (style) - fz_print_style_end_html(ctx, out, style); - style = ch->style; - fz_print_style_begin_html(ctx, out, style); - } - - switch (ch->c) - { - default: - if (ch->c >= 32 && ch->c <= 127) - fz_write_byte(ctx, out, ch->c); - else - fz_write_printf(ctx, out, "&#x%x;", ch->c); - break; - case '<': fz_write_string(ctx, out, "&lt;"); break; - case '>': fz_write_string(ctx, out, "&gt;"); break; - case '&': fz_write_string(ctx, out, "&amp;"); break; - case '"': fz_write_string(ctx, out, "&quot;"); break; - case '\'': fz_write_string(ctx, out, "&apos;"); break; - } + default: + if (ch->c >= 32 && ch->c <= 127) + fz_write_byte(ctx, out, ch->c); + else + fz_write_printf(ctx, out, "&#x%x;", ch->c); + break; + case '<': fz_write_string(ctx, out, "&lt;"); break; + case '>': fz_write_string(ctx, out, "&gt;"); break; + case '&': fz_write_string(ctx, out, "&amp;"); break; + case '"': fz_write_string(ctx, out, "&quot;"); break; + case '\'': fz_write_string(ctx, out, "&apos;"); break; } } - if (style) - { - fz_print_style_end_html(ctx, out, style); - fz_write_string(ctx, out, 
"</p>\n"); - style = NULL; - } + if (font) + fz_print_style_end_html(ctx, out, font, size); + + fz_write_string(ctx, out, "</p>\n"); } } void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page) { - fz_page_block *block; + fz_stext_block *block; int w = page->mediabox.x1 - page->mediabox.x0; int h = page->mediabox.y1 - page->mediabox.y0; fz_write_printf(ctx, out, "<div style=\"width:%dpt;height:%dpt\">\n", w, h); - for (block = page->blocks; block < page->blocks + page->len; ++block) + for (block = page->first_block; block; block = block->next) { - if (block->type == FZ_PAGE_BLOCK_IMAGE) - fz_print_stext_image_as_html(ctx, out, block->u.image); - else if (block->type == FZ_PAGE_BLOCK_TEXT) - fz_print_stext_block_as_html(ctx, out, block->u.text); + if (block->type == FZ_STEXT_BLOCK_IMAGE) + fz_print_stext_image_as_html(ctx, out, block); + else if (block->type == FZ_STEXT_BLOCK_TEXT) + fz_print_stext_block_as_html(ctx, out, block); } fz_write_string(ctx, out, "</div>\n"); @@ -177,23 +153,22 @@ fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out) /* XHTML output (semantic, little layout, suitable for reflow) */ static void -fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_image_block *block) +fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) { int w = block->bbox.x1 - block->bbox.x0; int h = block->bbox.y1 - block->bbox.y0; fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"data:", w, h); - fz_write_image_as_data_uri(ctx, out, block->image); + fz_write_image_as_data_uri(ctx, out, block->u.i.image); fz_write_string(ctx, out, "\"/></p>\n"); } static void -fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size) { - int is_mono = fz_font_is_monospaced(ctx, style->font); - int is_bold = fz_font_is_bold(ctx, style->font); - int is_italic = 
fz_font_is_italic(ctx, style->font); - int script = style->script; + int is_mono = fz_font_is_monospaced(ctx, font); + int is_bold = fz_font_is_bold(ctx, font); + int is_italic = fz_font_is_italic(ctx, font); if (is_mono) fz_write_string(ctx, out, "<tt>"); @@ -201,25 +176,14 @@ fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *styl fz_write_string(ctx, out, "<b>"); if (is_italic) fz_write_string(ctx, out, "<i>"); - - while (script-- > 0) - fz_write_string(ctx, out, "<sup>"); - while (++script < 0) - fz_write_string(ctx, out, "<sub>"); } static void -fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size) { - int is_mono = fz_font_is_monospaced(ctx, style->font); - int is_bold = fz_font_is_bold(ctx, style->font); - int is_italic = fz_font_is_italic(ctx, style->font); - int script = style->script; - - while (script-- > 0) - fz_write_string(ctx, out, "</sup>"); - while (++script < 0) - fz_write_string(ctx, out, "</sub>"); + int is_mono = fz_font_is_monospaced(ctx, font); + int is_bold = fz_font_is_bold(ctx, font); + int is_italic = fz_font_is_italic(ctx, font); if (is_italic) fz_write_string(ctx, out, "</i>"); @@ -232,68 +196,63 @@ fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) { fz_stext_line *line; - fz_stext_span *span; fz_stext_char *ch; - fz_stext_style *style; - style = NULL; - fz_write_string(ctx, out, "<p>\n"); + fz_font *font = NULL; + float size = 0; + + fz_write_string(ctx, out, "<p>"); - for (line = block->lines; line < block->lines + block->len; ++line) + for (line = block->u.t.first_line; line; line = line->next) { - if (line > block->lines) - fz_write_string(ctx, out, "<br/>\n"); - for (span = line->first_span; span; span = span->next) + if (line != block->u.t.first_line) + 
fz_write_string(ctx, out, "\n"); + for (ch = line->first_char; ch; ch = ch->next) { - if (span->spacing > 1) - fz_write_byte(ctx, out, ' '); - - for (ch = span->text; ch < span->text + span->len; ++ch) + if (ch->font != font || ch->size != size) { - if (ch->style != style) - { - if (style) - fz_print_style_end_xhtml(ctx, out, style); - style = ch->style; - fz_print_style_begin_xhtml(ctx, out, style); - } + if (font) + fz_print_style_end_xhtml(ctx, out, font, size); + font = ch->font; + size = ch->size; + fz_print_style_begin_xhtml(ctx, out, font, size); + } - switch (ch->c) - { - default: - if (ch->c >= 32 && ch->c <= 127) - fz_write_byte(ctx, out, ch->c); - else - fz_write_printf(ctx, out, "&#x%x;", ch->c); - break; - case '<': fz_write_string(ctx, out, "&lt;"); break; - case '>': fz_write_string(ctx, out, "&gt;"); break; - case '&': fz_write_string(ctx, out, "&amp;"); break; - case '"': fz_write_string(ctx, out, "&quot;"); break; - case '\'': fz_write_string(ctx, out, "&apos;"); break; - } + switch (ch->c) + { + default: + if (ch->c >= 32 && ch->c <= 127) + fz_write_byte(ctx, out, ch->c); + else + fz_write_printf(ctx, out, "&#x%x;", ch->c); + break; + case '<': fz_write_string(ctx, out, "&lt;"); break; + case '>': fz_write_string(ctx, out, "&gt;"); break; + case '&': fz_write_string(ctx, out, "&amp;"); break; + case '"': fz_write_string(ctx, out, "&quot;"); break; + case '\'': fz_write_string(ctx, out, "&apos;"); break; } } } - if (style) - fz_print_style_end_xhtml(ctx, out, style); - fz_write_string(ctx, out, "\n</p>\n"); + if (font) + fz_print_style_end_xhtml(ctx, out, font, size); + fz_write_string(ctx, out, "</p>\n"); } void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page) { - fz_page_block *block; + fz_stext_block *block; fz_write_string(ctx, out, "<div>\n"); - for (block = page->blocks; block < page->blocks + page->len; ++block) + for (block = page->first_block; block; block = block->next) { - if (block->type == FZ_PAGE_BLOCK_IMAGE) - 
fz_print_stext_image_as_xhtml(ctx, out, block->u.image); - else if (block->type == FZ_PAGE_BLOCK_TEXT) - fz_print_stext_block_as_xhtml(ctx, out, block->u.text); + if (block->type == FZ_STEXT_BLOCK_IMAGE) + fz_print_stext_image_as_xhtml(ctx, out, block); + else if (block->type == FZ_STEXT_BLOCK_TEXT) + fz_print_stext_block_as_xhtml(ctx, out, block); } fz_write_string(ctx, out, "</div>\n"); @@ -311,6 +270,7 @@ fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out) fz_write_string(ctx, out, "<style>\n"); fz_write_string(ctx, out, "body{background-color:gray}\n"); fz_write_string(ctx, out, "div{background-color:white;margin:1em;padding:1em}\n"); + fz_write_string(ctx, out, "p{white-space:pre-wrap}\n"); fz_write_string(ctx, out, "</style>\n"); fz_write_string(ctx, out, "</head>\n"); fz_write_string(ctx, out, "<body>\n"); @@ -328,87 +288,79 @@ fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out) void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) { - int block_n; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; fz_write_printf(ctx, out, "<page width=\"%g\" height=\"%g\">\n", page->mediabox.x1 - page->mediabox.x0, page->mediabox.y1 - page->mediabox.y0); - for (block_n = 0; block_n < page->len; block_n++) + for (block = page->first_block; block; block = block->next) { - switch (page->blocks[block_n].type) - { - case FZ_PAGE_BLOCK_TEXT: + switch (block->type) { - fz_stext_block *block = page->blocks[block_n].u.text; - fz_stext_line *line; - const char *s; - + case FZ_STEXT_BLOCK_TEXT: fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n", - block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); - for (line = block->lines; line < block->lines + block->len; line++) + block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); + for (line = block->u.t.first_line; line; line = line->next) { - fz_stext_span *span; + fz_font *font = NULL; + float size = 0; + const char *name = NULL; + 
const char *s; + fz_rect rect; + fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\">\n", - line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); - for (span = line->first_span; span; span = span->next) + line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); + + for (ch = line->first_char; ch; ch = ch->next) { - fz_stext_style *style = NULL; - const char *name = NULL; - int char_num; - for (char_num = 0; char_num < span->len; char_num++) + if (ch->font != font || ch->size != size) + { + if (font) + fz_write_string(ctx, out, "</font>\n"); + font = ch->font; + size = ch->size; + name = fz_font_name(ctx, font); + s = strchr(name, '+'); + s = s ? s + 1 : name; + fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", s, size); + } + fz_stext_char_bbox(ctx, &rect, line, ch); + fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"", + rect.x0, rect.y0, rect.x1, rect.y1, ch->origin.x, ch->origin.y); + switch (ch->c) { - fz_stext_char *ch = &span->text[char_num]; - if (ch->style != style) - { - if (style) - { - fz_write_string(ctx, out, "</span>\n"); - } - style = ch->style; - name = fz_font_name(ctx, style->font); - s = strchr(name, '+'); - s = s ? 
s + 1 : name; - fz_write_printf(ctx, out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n", - span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1, - s, style->size); - } - { - fz_rect rect; - fz_stext_char_bbox(ctx, &rect, span, char_num); - fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"", - rect.x0, rect.y0, rect.x1, rect.y1, ch->p.x, ch->p.y); - } - switch (ch->c) - { - case '<': fz_write_string(ctx, out, "&lt;"); break; - case '>': fz_write_string(ctx, out, "&gt;"); break; - case '&': fz_write_string(ctx, out, "&amp;"); break; - case '"': fz_write_string(ctx, out, "&quot;"); break; - case '\'': fz_write_string(ctx, out, "&apos;"); break; - default: - if (ch->c >= 32 && ch->c <= 127) - fz_write_printf(ctx, out, "%c", ch->c); - else - fz_write_printf(ctx, out, "&#x%x;", ch->c); - break; - } - fz_write_string(ctx, out, "\"/>\n"); + case '<': fz_write_string(ctx, out, "&lt;"); break; + case '>': fz_write_string(ctx, out, "&gt;"); break; + case '&': fz_write_string(ctx, out, "&amp;"); break; + case '"': fz_write_string(ctx, out, "&quot;"); break; + case '\'': fz_write_string(ctx, out, "&apos;"); break; + default: + if (ch->c >= 32 && ch->c <= 127) + fz_write_printf(ctx, out, "%c", ch->c); + else + fz_write_printf(ctx, out, "&#x%x;", ch->c); + break; } - if (style) - fz_write_string(ctx, out, "</span>\n"); + fz_write_string(ctx, out, "\"/>\n"); } + + if (font) + fz_write_string(ctx, out, "</font>\n"); + fz_write_string(ctx, out, "</line>\n"); } fz_write_string(ctx, out, "</block>\n"); break; - } - case FZ_PAGE_BLOCK_IMAGE: - { + + case FZ_STEXT_BLOCK_IMAGE: + fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n", + block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); break; } } - } fz_write_string(ctx, out, "</page>\n"); } @@ -417,31 +369,23 @@ fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) { - fz_page_block *pblock; + 
fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + char utf[10]; + int i, n; - for (pblock = page->blocks; pblock < page->blocks + page->len; ++pblock) + for (block = page->first_block; block; block = block->next) { - if (pblock->type == FZ_PAGE_BLOCK_TEXT) + if (block->type == FZ_STEXT_BLOCK_TEXT) { - fz_stext_block *block = pblock->u.text; - fz_stext_line *line; - fz_stext_char *ch; - char utf[10]; - int i, n; - - for (line = block->lines; line < block->lines + block->len; line++) + for (line = block->u.t.first_line; line; line = line->next) { - fz_stext_span *span; - for (span = line->first_span; span; span = span->next) + for (ch = line->first_char; ch; ch = ch->next) { - if (span->spacing > 1) - fz_write_byte(ctx, out, ' '); - for (ch = span->text; ch < span->text + span->len; ch++) - { - n = fz_runetochar(utf, ch->c); - for (i = 0; i < n; i++) - fz_write_byte(ctx, out, utf[i]); - } + n = fz_runetochar(utf, ch->c); + for (i = 0; i < n; i++) + fz_write_byte(ctx, out, utf[i]); } fz_write_string(ctx, out, "\n"); } @@ -466,7 +410,6 @@ struct fz_text_writer_s fz_document_writer super; int format; fz_stext_options opts; - fz_stext_sheet *sheet; fz_stext_page *page; fz_output *out; }; @@ -483,7 +426,7 @@ text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediab } wri->page = fz_new_stext_page(ctx, mediabox); - return fz_new_stext_device(ctx, wri->sheet, wri->page, &wri->opts); + return fz_new_stext_device(ctx, wri->page, &wri->opts); } static void @@ -537,7 +480,6 @@ text_drop_writer(fz_context *ctx, fz_document_writer *wri_) { fz_text_writer *wri = (fz_text_writer*)wri_; fz_drop_stext_page(ctx, wri->page); - fz_drop_stext_sheet(ctx, wri->sheet); fz_drop_output(ctx, wri->out); } @@ -561,7 +503,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const else if (!strcmp(format, "stext")) wri->format = FZ_FORMAT_STEXT; - wri->sheet = fz_new_stext_sheet(ctx); wri->out = fz_new_output_with_path(ctx, path ? 
path : "out.txt", 0); switch (wri->format) @@ -581,7 +522,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const fz_catch(ctx) { fz_drop_output(ctx, wri->out); - fz_drop_stext_sheet(ctx, wri->sheet); fz_free(ctx, wri); fz_rethrow(ctx); } |