From f7f89d9d0073e583bc78e6a5d4fc55a533caa566 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Thu, 27 Jul 2017 14:24:55 +0200 Subject: Add text format document writer. Improve HTML output using absolute positioning, and add XHTML output for simpler semantic html output. --- source/fitz/stext-output.c | 645 ++++++++++++++++++++++++++++++++------------- source/fitz/util.c | 2 + source/fitz/writer.c | 9 + 3 files changed, 468 insertions(+), 188 deletions(-) (limited to 'source/fitz') diff --git a/source/fitz/stext-output.c b/source/fitz/stext-output.c index d2fe2d69..8a769615 100644 --- a/source/fitz/stext-output.c +++ b/source/fitz/stext-output.c @@ -6,220 +6,352 @@ #include #include FT_FREETYPE_H -/* XML, HTML and plain-text output */ +/* HTML output (visual formatting with preserved layout) */ static void -fz_print_style_begin(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_stext_style *style) { + int is_bold = fz_font_is_bold(ctx, style->font); + int is_italic = fz_font_is_italic(ctx, style->font); + int is_serif = fz_font_is_serif(ctx, style->font); + int is_mono = fz_font_is_monospaced(ctx, style->font); int script = style->script; - fz_write_printf(ctx, out, "", style->id); + + fz_write_printf(ctx, out, "", is_serif ? "serif" : "sans-serif", style->size); + if (is_mono) + fz_write_string(ctx, out, ""); + if (is_bold) + fz_write_string(ctx, out, ""); + if (is_italic) + fz_write_string(ctx, out, ""); + while (script-- > 0) - fz_write_printf(ctx, out, ""); + fz_write_string(ctx, out, ""); while (++script < 0) - fz_write_printf(ctx, out, ""); + fz_write_string(ctx, out, ""); } static void -fz_print_style_end(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style) { + int is_mono = fz_font_is_monospaced(ctx, style->font); + int is_bold = fz_font_is_bold(ctx, style->font); + int is_italic = fz_font_is_italic(ctx, style->font); int script = style->script; + while (script-- > 0) - fz_write_printf(ctx, out, ""); + fz_write_string(ctx, out, ""); while (++script < 0) - fz_write_printf(ctx, out, ""); - fz_write_printf(ctx, out, ""); + fz_write_string(ctx, out, ""); + + if (is_italic) + fz_write_string(ctx, out, ""); + if (is_bold) + fz_write_string(ctx, out, ""); + if (is_mono) + fz_write_string(ctx, out, ""); + fz_write_string(ctx, out, ""); } static void -fz_print_style(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *block, int xhtml) { - const char *name = fz_font_name(ctx, style->font); - const char *s = strchr(name, '+'); - s = s ? s + 1 : name; - fz_write_printf(ctx, out, "span.s%d{font-family:\"%s\";font-size:%gpt;", - style->id, s, style->size); - if (fz_font_is_italic(ctx, style->font)) - fz_write_printf(ctx, out, "font-style:italic;"); - if (fz_font_is_bold(ctx, style->font)) - fz_write_printf(ctx, out, "font-weight:bold;"); - fz_write_printf(ctx, out, "}\n"); -} + fz_compressed_buffer *cbuf; + fz_buffer *buf; -void -fz_print_stext_sheet(fz_context *ctx, fz_output *out, fz_stext_sheet *sheet) -{ - fz_stext_style *style; - for (style = sheet->style; style; style = style->next) - fz_print_style(ctx, out, style); + int x = block->bbox.x0; + int y = block->bbox.y0; + int w = block->bbox.x1 - block->bbox.x0; + int h = block->bbox.y1 - block->bbox.y0; + + cbuf = fz_compressed_image_buffer(ctx, block->image); + + if (xhtml) + fz_write_printf(ctx, out, "params.type) + { + case FZ_IMAGE_JPEG: + fz_write_string(ctx, out, "image/jpeg;base64,"); + fz_write_base64_buffer(ctx, out, cbuf->buffer, 1); + break; + case FZ_IMAGE_PNG: + fz_write_string(ctx, out, "image/png;base64,"); + fz_write_base64_buffer(ctx, out, cbuf->buffer, 1); + break; + default: + buf = fz_new_buffer_from_image_as_png(ctx, block->image, NULL); + fz_try(ctx) + { + fz_write_string(ctx, out, "image/png;base64,"); + fz_write_base64_buffer(ctx, out, buf, 1); + } + fz_always(ctx) + fz_drop_buffer(ctx, buf); + fz_catch(ctx) + fz_rethrow(ctx); + break; + } + + if (xhtml) + fz_write_string(ctx, out, "\"/>\n"); + else + fz_write_string(ctx, out, "\">\n"); } void -fz_print_stext_page_html(fz_context *ctx, fz_output *out, fz_stext_page *page) +fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) { - int block_n, line_n, ch_n; fz_stext_style *style = NULL; fz_stext_line *line; fz_stext_span *span; - void *last_region = NULL; + fz_stext_char *ch; + int x, y; - fz_write_printf(ctx, out, "
\n"); + style = NULL; - for (block_n = 0; block_n < page->len; block_n++) + for (line = block->lines; line < block->lines + block->len; ++line) { - switch (page->blocks[block_n].type) + for (span = line->first_span; span; span = span->next) { - case FZ_PAGE_BLOCK_TEXT: - { - fz_stext_block * block = page->blocks[block_n].u.text; - fz_write_printf(ctx, out, "

\n"); - for (line_n = 0; line_n < block->len; line_n++) + if (span == line->first_span || span->spacing > 1) { - int lastcol=-1; - line = &block->lines[line_n]; - style = NULL; - - if (line->region != last_region) + if (style) { - if (last_region) - fz_write_printf(ctx, out, "

"); - fz_write_printf(ctx, out, "
"); - last_region = line->region; + fz_print_style_end_html(ctx, out, style); + fz_write_string(ctx, out, "

\n"); + style = NULL; } - fz_write_printf(ctx, out, "
region) - fz_write_printf(ctx, out, " region=\"%x\"", line->region); -#endif - fz_write_printf(ctx, out, ">"); - for (span = line->first_span; span; span = span->next) - { - float size = fz_matrix_expansion(&span->transform); - float base_offset = span->base_offset / size; - - if (lastcol != span->column) - { - if (lastcol >= 0) - { - fz_write_printf(ctx, out, "
"); - } - /* If we skipped any columns then output some spacer spans */ - while (lastcol < span->column-1) - { - fz_write_printf(ctx, out, "
"); - lastcol++; - } - lastcol++; - /* Now output the span to contain this entire column */ - fz_write_printf(ctx, out, "
next; sn; sn = sn->next) - { - if (sn->column != lastcol) - break; - } - fz_write_printf(ctx, out, "width:%g%%;align:%s", span->column_width, (span->align == 0 ? "left" : (span->align == 1 ? "center" : "right"))); - } - if (span->indent > 1) - fz_write_printf(ctx, out, ";padding-left:1em;text-indent:-1em"); - if (span->indent < -1) - fz_write_printf(ctx, out, ";text-indent:1em"); - fz_write_printf(ctx, out, "\">"); - } -#ifdef DEBUG_INTERNALS - fz_write_printf(ctx, out, "column) - fz_write_printf(ctx, out, " col=\"%x\"", span->column); - fz_write_printf(ctx, out, ">"); -#endif - if (span->spacing >= 1) - fz_write_printf(ctx, out, " "); - if (base_offset > SUBSCRIPT_OFFSET) - fz_write_printf(ctx, out, ""); - else if (base_offset < SUPERSCRIPT_OFFSET) - fz_write_printf(ctx, out, ""); - for (ch_n = 0; ch_n < span->len; ch_n++) - { - fz_stext_char *ch = &span->text[ch_n]; - if (style != ch->style) - { - if (style) - fz_print_style_end(ctx, out, style); - fz_print_style_begin(ctx, out, ch->style); - style = ch->style; - } + x = span->bbox.x0; + y = span->bbox.y0; + fz_write_printf(ctx, out, "

", y, x); + } - if (ch->c == '<') - fz_write_printf(ctx, out, "<"); - else if (ch->c == '>') - fz_write_printf(ctx, out, ">"); - else if (ch->c == '&') - fz_write_printf(ctx, out, "&"); - else if (ch->c >= 32 && ch->c <= 127) - fz_write_printf(ctx, out, "%c", ch->c); - else - fz_write_printf(ctx, out, "&#x%x;", ch->c); - } + for (ch = span->text; ch < span->text + span->len; ++ch) + { + if (ch->style != style) + { if (style) - { - fz_print_style_end(ctx, out, style); - style = NULL; - } - if (base_offset > SUBSCRIPT_OFFSET) - fz_write_printf(ctx, out, ""); - else if (base_offset < SUPERSCRIPT_OFFSET) - fz_write_printf(ctx, out, ""); -#ifdef DEBUG_INTERNALS - fz_write_printf(ctx, out, ""); -#endif + fz_print_style_end_html(ctx, out, style); + style = ch->style; + fz_print_style_begin_html(ctx, out, style); + } + + switch (ch->c) + { + default: + if (ch->c >= 32 && ch->c <= 127) + fz_write_byte(ctx, out, ch->c); + else + fz_write_printf(ctx, out, "&#x%x;", ch->c); + break; + case '<': fz_write_string(ctx, out, "<"); break; + case '>': fz_write_string(ctx, out, ">"); break; + case '&': fz_write_string(ctx, out, "&"); break; + case '"': fz_write_string(ctx, out, """); break; + case '\'': fz_write_string(ctx, out, "'"); break; } - /* Close our floating span */ - fz_write_printf(ctx, out, "

"); - /* Close the line */ - fz_write_printf(ctx, out, "
"); - fz_write_printf(ctx, out, "\n"); } - /* Close the metaline */ - fz_write_printf(ctx, out, "
"); - last_region = NULL; - fz_write_printf(ctx, out, "

\n"); - break; } - case FZ_PAGE_BLOCK_IMAGE: + + if (style) + { + fz_print_style_end_html(ctx, out, style); + fz_write_string(ctx, out, "

\n"); + style = NULL; + } + } +} + +void +fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page) +{ + fz_page_block *block; + + int w = page->mediabox.x1 - page->mediabox.x0; + int h = page->mediabox.y1 - page->mediabox.y0; + + fz_write_printf(ctx, out, "
\n", w, h); + + for (block = page->blocks; block < page->blocks + page->len; ++block) + { + if (block->type == FZ_PAGE_BLOCK_IMAGE) + fz_print_stext_image_as_html(ctx, out, block->u.image, 0); + else if (block->type == FZ_PAGE_BLOCK_TEXT) + fz_print_stext_block_as_html(ctx, out, block->u.text); + } + + fz_write_string(ctx, out, "
\n"); +} + +void +fz_print_stext_header_as_html(fz_context *ctx, fz_output *out) +{ + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); +} + +void +fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out) +{ + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); +} + +/* XHTML output (semantic, little layout, suitable for reflow) */ + +static void +fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) +{ + int is_mono = fz_font_is_monospaced(ctx, style->font); + int is_bold = fz_font_is_bold(ctx, style->font); + int is_italic = fz_font_is_italic(ctx, style->font); + int script = style->script; + + if (is_mono) + fz_write_string(ctx, out, ""); + if (is_bold) + fz_write_string(ctx, out, ""); + if (is_italic) + fz_write_string(ctx, out, ""); + + while (script-- > 0) + fz_write_string(ctx, out, ""); + while (++script < 0) + fz_write_string(ctx, out, ""); +} + +static void +fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) +{ + int is_mono = fz_font_is_monospaced(ctx, style->font); + int is_bold = fz_font_is_bold(ctx, style->font); + int is_italic = fz_font_is_italic(ctx, style->font); + int script = style->script; + + while (script-- > 0) + fz_write_string(ctx, out, ""); + while (++script < 0) + fz_write_string(ctx, out, ""); + + if (is_italic) + fz_write_string(ctx, out, ""); + if (is_bold) + fz_write_string(ctx, out, ""); + if (is_mono) + fz_write_string(ctx, out, ""); +} + +static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) +{ + fz_stext_line *line; + fz_stext_span *span; + fz_stext_char *ch; + fz_stext_style *style; + + style = NULL; + fz_write_string(ctx, out, "

\n"); + + for (line = block->lines; line < block->lines + block->len; ++line) + { + if (line > block->lines) + fz_write_string(ctx, out, "
\n"); + for (span = line->first_span; span; span = span->next) { - fz_image_block *image = page->blocks[block_n].u.image; - fz_compressed_buffer *buffer = fz_compressed_image_buffer(ctx, image->image); - fz_write_printf(ctx, out, "image->w, image->image->h); - switch (buffer == NULL ? FZ_IMAGE_JPX : buffer->params.type) + if (span->spacing > 1) + fz_write_byte(ctx, out, ' '); + + for (ch = span->text; ch < span->text + span->len; ++ch) { - case FZ_IMAGE_JPEG: - fz_write_printf(ctx, out, "image/jpeg;base64,"); - fz_write_base64_buffer(ctx, out, buffer->buffer, 1); - break; - case FZ_IMAGE_PNG: - fz_write_printf(ctx, out, "image/png;base64,"); - fz_write_base64_buffer(ctx, out, buffer->buffer, 1); - break; - default: + if (ch->style != style) { - fz_buffer *buf = fz_new_buffer_from_image_as_png(ctx, image->image, NULL); - fz_write_printf(ctx, out, "image/png;base64,"); - fz_write_base64_buffer(ctx, out, buf, 1); - fz_drop_buffer(ctx, buf); + if (style) + fz_print_style_end_xhtml(ctx, out, style); + style = ch->style; + fz_print_style_begin_xhtml(ctx, out, style); + } + + switch (ch->c) + { + default: + if (ch->c >= 32 && ch->c <= 127) + fz_write_byte(ctx, out, ch->c); + else + fz_write_printf(ctx, out, "&#x%x;", ch->c); break; + case '<': fz_write_string(ctx, out, "<"); break; + case '>': fz_write_string(ctx, out, ">"); break; + case '&': fz_write_string(ctx, out, "&"); break; + case '"': fz_write_string(ctx, out, """); break; + case '\'': fz_write_string(ctx, out, "'"); break; } } - fz_write_printf(ctx, out, "\">\n"); - break; - } } } - fz_write_printf(ctx, out, "\n"); + if (style) + fz_print_style_end_xhtml(ctx, out, style); + fz_write_string(ctx, out, "\n

\n"); } void -fz_print_stext_page_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) +fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page) +{ + fz_page_block *block; + + fz_write_string(ctx, out, "
\n"); + + for (block = page->blocks; block < page->blocks + page->len; ++block) + { + if (block->type == FZ_PAGE_BLOCK_IMAGE) + fz_print_stext_image_as_html(ctx, out, block->u.image, 1); + else if (block->type == FZ_PAGE_BLOCK_TEXT) + fz_print_stext_block_as_xhtml(ctx, out, block->u.text); + } + + fz_write_string(ctx, out, "
\n"); +} + +void +fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out) +{ + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); +} + +void +fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out) +{ + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); +} + +/* Detailed XML dump of the entire structured text data */ + +void +fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) { int block_n; @@ -256,7 +388,7 @@ fz_print_stext_page_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) { if (style) { - fz_write_printf(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); } style = ch->style; name = fz_font_name(ctx, style->font); @@ -274,11 +406,11 @@ fz_print_stext_page_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) } switch (ch->c) { - case '<': fz_write_printf(ctx, out, "<"); break; - case '>': fz_write_printf(ctx, out, ">"); break; - case '&': fz_write_printf(ctx, out, "&"); break; - case '"': fz_write_printf(ctx, out, """); break; - case '\'': fz_write_printf(ctx, out, "'"); break; + case '<': fz_write_string(ctx, out, "<"); break; + case '>': fz_write_string(ctx, out, ">"); break; + case '&': fz_write_string(ctx, out, "&"); break; + case '"': fz_write_string(ctx, out, """); break; + case '\'': fz_write_string(ctx, out, "'"); break; default: if (ch->c >= 32 && ch->c <= 127) fz_write_printf(ctx, out, "%c", ch->c); @@ -286,14 +418,14 @@ fz_print_stext_page_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) fz_write_printf(ctx, out, "&#x%x;", ch->c); break; } - fz_write_printf(ctx, out, "\"/>\n"); + fz_write_string(ctx, out, "\"/>\n"); } if (style) - fz_write_printf(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); } - fz_write_printf(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); } - fz_write_printf(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); break; } case FZ_PAGE_BLOCK_IMAGE: @@ -302,21 +434,21 @@ fz_print_stext_page_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) } } } - fz_write_printf(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); } +/* Plain text */ + void -fz_print_stext_page(fz_context *ctx, fz_output *out, fz_stext_page *page) +fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) { - int block_n; + fz_page_block *pblock; - for (block_n = 0; block_n < page->len; block_n++) + for (pblock = page->blocks; pblock < page->blocks + page->len; ++pblock) { - switch (page->blocks[block_n].type) + if (pblock->type == FZ_PAGE_BLOCK_TEXT) { - case FZ_PAGE_BLOCK_TEXT: - { - fz_stext_block *block = page->blocks[block_n].u.text; + fz_stext_block *block = pblock->u.text; fz_stext_line *line; fz_stext_char *ch; char utf[10]; @@ -327,20 +459,157 @@ fz_print_stext_page(fz_context *ctx, fz_output *out, fz_stext_page *page) fz_stext_span *span; for (span = line->first_span; span; span = span->next) { + if (span->spacing > 1) + fz_write_byte(ctx, out, ' '); for (ch = span->text; ch < span->text + span->len; ch++) { n = fz_runetochar(utf, ch->c); for (i = 0; i < n; i++) - fz_write_printf(ctx, out, "%c", utf[i]); + fz_write_byte(ctx, out, utf[i]); } } - fz_write_printf(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); } - fz_write_printf(ctx, out, "\n"); - break; + fz_write_string(ctx, out, "\n"); } - case FZ_PAGE_BLOCK_IMAGE: + } +} + +/* Text output writer */ + +enum { + FZ_FORMAT_TEXT, + FZ_FORMAT_HTML, + FZ_FORMAT_XHTML, + FZ_FORMAT_STEXT, +}; + +typedef struct fz_text_writer_s fz_text_writer; + +struct fz_text_writer_s +{ + fz_document_writer super; + int format; + fz_stext_options opts; + fz_stext_sheet *sheet; + fz_stext_page *page; + fz_output *out; +}; + +static fz_device * +text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediabox) +{ + fz_text_writer *wri = (fz_text_writer*)wri_; + + if (wri->page) + { + fz_drop_stext_page(ctx, wri->page); + wri->page = NULL; + } + + wri->page = fz_new_stext_page(ctx, mediabox); + return fz_new_stext_device(ctx, wri->sheet, wri->page, &wri->opts); +} + +static void +text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev) +{ + fz_text_writer *wri = (fz_text_writer*)wri_; + fz_close_device(ctx, dev); + fz_drop_device(ctx, dev); + + switch (wri->format) + { + default: + case FZ_FORMAT_TEXT: + fz_print_stext_page_as_text(ctx, wri->out, wri->page); + break; + case FZ_FORMAT_HTML: + fz_print_stext_page_as_html(ctx, wri->out, wri->page); + break; + case FZ_FORMAT_XHTML: + fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page); + break; + case FZ_FORMAT_STEXT: + fz_print_stext_page_as_xml(ctx, wri->out, wri->page); + break; + } + + fz_drop_stext_page(ctx, wri->page); + wri->page = NULL; +} + +static void +text_close_writer(fz_context *ctx, fz_document_writer *wri_) +{ + fz_text_writer *wri = (fz_text_writer*)wri_; + switch (wri->format) + { + case FZ_FORMAT_HTML: + fz_print_stext_trailer_as_html(ctx, wri->out); + break; + case FZ_FORMAT_XHTML: + fz_print_stext_trailer_as_xhtml(ctx, wri->out); + break; + case FZ_FORMAT_STEXT: + fz_write_string(ctx, wri->out, "\n"); + break; + } +} + +static void +text_drop_writer(fz_context *ctx, fz_document_writer *wri_) +{ + fz_text_writer *wri = (fz_text_writer*)wri_; + fz_drop_stext_page(ctx, wri->page); + fz_drop_stext_sheet(ctx, wri->sheet); + fz_drop_output(ctx, wri->out); +} + +fz_document_writer * +fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *args) +{ + fz_text_writer *wri; + + wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer); + fz_try(ctx) + { + fz_parse_stext_options(ctx, &wri->opts, args); + + wri->format = FZ_FORMAT_TEXT; + if (!strcmp(format, "text")) + wri->format = FZ_FORMAT_TEXT; + else if (!strcmp(format, "html")) + wri->format = FZ_FORMAT_HTML; + else if (!strcmp(format, "xhtml")) + wri->format = FZ_FORMAT_XHTML; + else if (!strcmp(format, "stext")) + wri->format = FZ_FORMAT_STEXT; + + wri->sheet = fz_new_stext_sheet(ctx); + wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0); + + switch (wri->format) + { + case FZ_FORMAT_HTML: + fz_print_stext_header_as_html(ctx, wri->out); + break; + case FZ_FORMAT_XHTML: + fz_print_stext_header_as_xhtml(ctx, wri->out); + break; + case FZ_FORMAT_STEXT: + fz_write_string(ctx, wri->out, "\n"); + fz_write_string(ctx, wri->out, "\n"); break; } } + fz_catch(ctx) + { + fz_drop_output(ctx, wri->out); + fz_drop_stext_sheet(ctx, wri->sheet); + fz_free(ctx, wri); + fz_rethrow(ctx); + } + + return (fz_document_writer*)wri; } diff --git a/source/fitz/util.c b/source/fitz/util.c index f4dee8a1..d532734a 100644 --- a/source/fitz/util.c +++ b/source/fitz/util.c @@ -453,6 +453,8 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec int saw_text = 0; for (span = line->first_span; span; span = span->next) { + if (span->spacing > 1) + fz_append_byte(ctx, buf, ' '); for (i = 0; i < span->len; i++) { int c; diff --git a/source/fitz/writer.c b/source/fitz/writer.c index cd71329a..4e436e08 100644 --- a/source/fitz/writer.c +++ b/source/fitz/writer.c @@ -140,6 +140,15 @@ fz_new_document_writer(fz_context *ctx, const char *path, const char *format, co if (!fz_strcasecmp(format, "pkm")) return fz_new_pkm_pixmap_writer(ctx, path, options); + if (!fz_strcasecmp(format, "txt") || !fz_strcasecmp(format, "text")) + return fz_new_text_writer(ctx, "text", path, options); + if (!fz_strcasecmp(format, "html")) + return fz_new_text_writer(ctx, format, path, options); + if (!fz_strcasecmp(format, "xhtml")) + return fz_new_text_writer(ctx, format, path, options); + if (!fz_strcasecmp(format, "stext")) + return fz_new_text_writer(ctx, format, path, options); + fz_throw(ctx, FZ_ERROR_GENERIC, "unknown output document format: %s", format); } -- cgit v1.2.3