/*
 * Structured-text output: prints an fz_stext_page as HTML, XHTML, XML or
 * plain text, and wraps those printers in an fz_document_writer.
 *
 * NOTE(review): several string literals in this file are empty, contain raw
 * line breaks, or run into the following statement, while the surrounding
 * calls still pass arguments for them. Presumably the markup inside the
 * literals was lost when this copy of the file was produced -- confirm
 * against the project's version control before building or editing.
 */
#include "fitz-imp.h"

/* Neither offset is referenced by the code visible in this file. */
#define SUBSCRIPT_OFFSET 0.2f
#define SUPERSCRIPT_OFFSET -0.2f

/* NOTE(review): the next directive has no header name -- presumably lost; confirm. */
#include
#include FT_FREETYPE_H

/* HTML output (visual formatting with preserved layout) */

/*
 * Write the opening output for one text style.
 *
 * Issues one formatted write that receives a font family ("serif" or
 * "sans-serif", chosen by fz_font_is_serif) and style->size, then one string
 * write each if the font is monospaced, bold and italic, then |style->script|
 * further string writes: the first loop runs for positive script values, the
 * second for negative ones.
 *
 * NOTE(review): every literal written here is empty in this copy, including
 * the format string that is handed two arguments -- confirm intended text.
 */
static void
fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
{
	int is_bold = fz_font_is_bold(ctx, style->font);
	int is_italic = fz_font_is_italic(ctx, style->font);
	int is_serif = fz_font_is_serif(ctx, style->font);
	int is_mono = fz_font_is_monospaced(ctx, style->font);
	int script = style->script;

	fz_write_printf(ctx, out, "", is_serif ? "serif" : "sans-serif", style->size);
	if (is_mono)
		fz_write_string(ctx, out, "");
	if (is_bold)
		fz_write_string(ctx, out, "");
	if (is_italic)
		fz_write_string(ctx, out, "");

	/* Runs script times when script > 0; the failed test leaves script one lower. */
	while (script-- > 0)
		fz_write_string(ctx, out, "");
	/* The pre-increment undoes that extra decrement, so this runs -script times when script < 0. */
	while (++script < 0)
		fz_write_string(ctx, out, "");
}

/*
 * Write the closing output for one text style, in the reverse order of
 * fz_print_style_begin_html: script levels first, then italic, bold,
 * monospaced, and finally one unconditional string write.
 *
 * NOTE(review): every literal written here is empty in this copy.
 */
static void
fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
{
	int is_mono = fz_font_is_monospaced(ctx, style->font);
	int is_bold = fz_font_is_bold(ctx, style->font);
	int is_italic = fz_font_is_italic(ctx, style->font);
	int script = style->script;

	/* Same counting scheme as in fz_print_style_begin_html. */
	while (script-- > 0)
		fz_write_string(ctx, out, "");
	while (++script < 0)
		fz_write_string(ctx, out, "");

	if (is_italic)
		fz_write_string(ctx, out, "");
	if (is_bold)
		fz_write_string(ctx, out, "");
	if (is_mono)
		fz_write_string(ctx, out, "");
	fz_write_string(ctx, out, "");
}

/*
 * Write one image block to the HTML output.
 *
 * x/y are taken from the top-left corner of block->bbox, w/h from its extent.
 *
 * NOTE(review): the write statement below is damaged in this copy -- the
 * format string runs straight into "image);" and none of x, y, w, h is used
 * in the surviving text. Presumably a format string, its arguments and a
 * call taking block->image were lost; confirm.
 */
static void
fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *block)
{
	int x = block->bbox.x0;
	int y = block->bbox.y0;
	int w = block->bbox.x1 - block->bbox.x0;
	int h = block->bbox.y1 - block->bbox.y0;

	fz_write_printf(ctx, out, "image); fz_write_string(ctx, out, "\">\n");
}

/*
 * Write one text block to the HTML output.
 *
 * Walks every line of the block and every span of each line. A new
 * positioned run is started (formatted write receiving the span's bbox y0
 * and x0) at the first span of a line and at any span whose spacing exceeds
 * 1; a run that is still open is closed first. Within a span, the style
 * begin/end helpers are called whenever the character's style pointer
 * changes. Any style still open at the end of a line is closed there.
 *
 * NOTE(review): the run open/close literals contain only line breaks in
 * this copy, and the escape strings for the special characters are damaged
 * (see the switch below) -- confirm intended text.
 */
void
fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
	fz_stext_style *style = NULL;
	fz_stext_line *line;
	fz_stext_span *span;
	fz_stext_char *ch;
	int x, y;

	style = NULL;

	for (line = block->lines; line < block->lines + block->len; ++line)
	{
		for (span = line->first_span; span; span = span->next)
		{
			if (span == line->first_span || span->spacing > 1)
			{
				/* Close the run left open by the previous span before starting a new one. */
				if (style)
				{
					fz_print_style_end_html(ctx, out, style);
					fz_write_string(ctx, out, "

\n");
					style = NULL;
				}
				x = span->bbox.x0;
				y = span->bbox.y0;
				fz_write_printf(ctx, out, "

", y, x);
			}

			for (ch = span->text; ch < span->text + span->len; ++ch)
			{
				/* Styles are compared by pointer; switch styles only on change. */
				if (ch->style != style)
				{
					if (style)
						fz_print_style_end_html(ctx, out, style);
					style = ch->style;
					fz_print_style_begin_html(ctx, out, style);
				}

				/*
				 * Code points 32..127 are written as a single byte; anything
				 * else is written as a hexadecimal numeric reference.
				 * NOTE(review): the five special cases below write the
				 * character itself, and the one for '"' is not a valid
				 * literal -- presumably escape names were lost; confirm.
				 */
				switch (ch->c)
				{
				default:
					if (ch->c >= 32 && ch->c <= 127)
						fz_write_byte(ctx, out, ch->c);
					else
						fz_write_printf(ctx, out, "&#x%x;", ch->c);
					break;
				case '<': fz_write_string(ctx, out, "<"); break;
				case '>': fz_write_string(ctx, out, ">"); break;
				case '&': fz_write_string(ctx, out, "&"); break;
				case '"': fz_write_string(ctx, out, """); break;
				case '\'': fz_write_string(ctx, out, "'"); break;
				}
			}
		}

		/* End of line: close whatever style/run is still open. */
		if (style)
		{
			fz_print_style_end_html(ctx, out, style);
			fz_write_string(ctx, out, "

\n");
			style = NULL;
		}
	}
}

/*
 * Write a whole page to the HTML output.
 *
 * Emits an opening write that receives the mediabox width and height,
 * dispatches each page block to the image or text printer according to its
 * type (other types are skipped), then emits a closing write.
 *
 * NOTE(review): the opening and closing literals hold only line breaks in
 * this copy -- confirm intended text.
 */
void
fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
	fz_page_block *block;

	int w = page->mediabox.x1 - page->mediabox.x0;
	int h = page->mediabox.y1 - page->mediabox.y0;

	fz_write_printf(ctx, out, "
\n", w, h);

	for (block = page->blocks; block < page->blocks + page->len; ++block)
	{
		if (block->type == FZ_PAGE_BLOCK_IMAGE)
			fz_print_stext_image_as_html(ctx, out, block->u.image);
		else if (block->type == FZ_PAGE_BLOCK_TEXT)
			fz_print_stext_block_as_html(ctx, out, block->u.text);
	}

	fz_write_string(ctx, out, "
\n");
}

/*
 * Write the HTML document header: six string writes.
 * NOTE(review): each literal is just a newline in this copy -- confirm.
 */
void
fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
{
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
}

/*
 * Write the HTML document trailer: two string writes.
 * NOTE(review): each literal is just a newline in this copy -- confirm.
 */
void
fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
{
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
}

/* XHTML output (semantic, little layout, suitable for reflow) */

/*
 * Write one image block to the XHTML output.
 *
 * w/h are the extent of block->bbox.
 *
 * NOTE(review): the write statement below is damaged in this copy in the
 * same way as in fz_print_stext_image_as_html; w and h are unused in the
 * surviving text. Confirm the intended statements.
 */
static void
fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_image_block *block)
{
	int w = block->bbox.x1 - block->bbox.x0;
	int h = block->bbox.y1 - block->bbox.y0;

	fz_write_printf(ctx, out, "

image); fz_write_string(ctx, out, "\"/>

\n");
}

/*
 * Write the opening output for one text style in XHTML: one string write
 * each for monospaced, bold and italic fonts, then |style->script| writes
 * for super/subscript levels (positive values use the first loop, negative
 * values the second).
 *
 * NOTE(review): every literal written here is empty in this copy.
 */
static void
fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
{
	int is_mono = fz_font_is_monospaced(ctx, style->font);
	int is_bold = fz_font_is_bold(ctx, style->font);
	int is_italic = fz_font_is_italic(ctx, style->font);
	int script = style->script;

	if (is_mono)
		fz_write_string(ctx, out, "");
	if (is_bold)
		fz_write_string(ctx, out, "");
	if (is_italic)
		fz_write_string(ctx, out, "");

	/* The post-decrement here is compensated by the pre-increment below. */
	while (script-- > 0)
		fz_write_string(ctx, out, "");
	while (++script < 0)
		fz_write_string(ctx, out, "");
}

/*
 * Write the closing output for one text style in XHTML, in the reverse
 * order of fz_print_style_begin_xhtml: script levels, italic, bold,
 * monospaced.
 *
 * NOTE(review): every literal written here is empty in this copy.
 */
static void
fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
{
	int is_mono = fz_font_is_monospaced(ctx, style->font);
	int is_bold = fz_font_is_bold(ctx, style->font);
	int is_italic = fz_font_is_italic(ctx, style->font);
	int script = style->script;

	while (script-- > 0)
		fz_write_string(ctx, out, "");
	while (++script < 0)
		fz_write_string(ctx, out, "");

	if (is_italic)
		fz_write_string(ctx, out, "");
	if (is_bold)
		fz_write_string(ctx, out, "");
	if (is_mono)
		fz_write_string(ctx, out, "");
}

/*
 * Write one text block to the XHTML output.
 *
 * Emits an opening write, then all lines of the block with a separator
 * write before every line except the first. A single space byte is written
 * before any span whose spacing exceeds 1. Unlike the HTML printer, the
 * current style is carried across spans and lines and is closed only once,
 * after the last line. A closing write ends the block.
 *
 * NOTE(review): the opening, separator and closing literals hold only line
 * breaks in this copy, and the escape strings in the switch are damaged --
 * confirm intended text.
 */
static void
fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
	fz_stext_line *line;
	fz_stext_span *span;
	fz_stext_char *ch;
	fz_stext_style *style;

	style = NULL;

	fz_write_string(ctx, out, "

\n");

	for (line = block->lines; line < block->lines + block->len; ++line)
	{
		/* Separator between consecutive lines; none before the first. */
		if (line > block->lines)
			fz_write_string(ctx, out, "
\n");

		for (span = line->first_span; span; span = span->next)
		{
			if (span->spacing > 1)
				fz_write_byte(ctx, out, ' ');

			for (ch = span->text; ch < span->text + span->len; ++ch)
			{
				if (ch->style != style)
				{
					if (style)
						fz_print_style_end_xhtml(ctx, out, style);
					style = ch->style;
					fz_print_style_begin_xhtml(ctx, out, style);
				}

				/*
				 * Code points 32..127 go out as one byte, everything else
				 * as a hexadecimal numeric reference.
				 * NOTE(review): the special-case strings below are damaged
				 * in this copy (the '"' case is not a valid literal).
				 */
				switch (ch->c)
				{
				default:
					if (ch->c >= 32 && ch->c <= 127)
						fz_write_byte(ctx, out, ch->c);
					else
						fz_write_printf(ctx, out, "&#x%x;", ch->c);
					break;
				case '<': fz_write_string(ctx, out, "<"); break;
				case '>': fz_write_string(ctx, out, ">"); break;
				case '&': fz_write_string(ctx, out, "&"); break;
				case '"': fz_write_string(ctx, out, """); break;
				case '\'': fz_write_string(ctx, out, "'"); break;
				}
			}
		}
	}

	if (style)
		fz_print_style_end_xhtml(ctx, out, style);

	fz_write_string(ctx, out, "\n

\n");
}

/*
 * Write a whole page to the XHTML output: an opening write, each image or
 * text block via the matching printer (other block types are skipped), and
 * a closing write.
 *
 * NOTE(review): the opening and closing literals hold only line breaks in
 * this copy -- confirm intended text.
 */
void
fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
	fz_page_block *block;

	fz_write_string(ctx, out, "
\n");

	for (block = page->blocks; block < page->blocks + page->len; ++block)
	{
		if (block->type == FZ_PAGE_BLOCK_IMAGE)
			fz_print_stext_image_as_xhtml(ctx, out, block->u.image);
		else if (block->type == FZ_PAGE_BLOCK_TEXT)
			fz_print_stext_block_as_xhtml(ctx, out, block->u.text);
	}

	fz_write_string(ctx, out, "
\n");
}

/*
 * Write the XHTML document header: seven string writes.
 * NOTE(review): each literal is just a newline in this copy -- confirm.
 */
void
fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
{
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
}

/*
 * Write the XHTML document trailer: two string writes.
 * NOTE(review): each literal is just a newline in this copy -- confirm.
 */
void
fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
{
	fz_write_string(ctx, out, "\n");
	fz_write_string(ctx, out, "\n");
}

/* Detailed XML dump of the entire structured text data */

/*
 * Dump a page in the detailed XML form.
 *
 * Writes are nested page > block > line > span > character. The page write
 * receives the mediabox width and height; block, line and span writes
 * receive their bbox corners; the span write additionally receives the font
 * name and style->size. A new span-level write is issued whenever the
 * character style pointer changes inside a span, and closed before the next
 * one and at the end of the span. Image blocks produce no output.
 *
 * NOTE(review): most literals here are reduced to a bare newline in this
 * copy although the calls still pass arguments, and the per-character write
 * is damaged -- confirm intended text.
 */
void
fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
	int block_n;

	fz_write_printf(ctx, out, "\n",
		page->mediabox.x1 - page->mediabox.x0,
		page->mediabox.y1 - page->mediabox.y0);

	for (block_n = 0; block_n < page->len; block_n++)
	{
		switch (page->blocks[block_n].type)
		{
		case FZ_PAGE_BLOCK_TEXT:
		{
			fz_stext_block *block = page->blocks[block_n].u.text;
			fz_stext_line *line;
			const char *s;

			fz_write_printf(ctx, out, "\n",
				block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
			for (line = block->lines; line < block->lines + block->len; line++)
			{
				fz_stext_span *span;
				fz_write_printf(ctx, out, "\n",
					line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
				for (span = line->first_span; span; span = span->next)
				{
					/* Style tracking restarts for every span. */
					fz_stext_style *style = NULL;
					const char *name = NULL;
					int char_num;
					for (char_num = 0; char_num < span->len; char_num++)
					{
						fz_stext_char *ch = &span->text[char_num];
						if (ch->style != style)
						{
							if (style)
							{
								fz_write_string(ctx, out, "\n");
							}
							style = ch->style;
							name = fz_font_name(ctx, style->font);
							/*
							 * Report the name from just after its first '+',
							 * or the whole name when there is none
							 * (presumably drops a subset-font prefix --
							 * confirm).
							 */
							s = strchr(name, '+');
							s = s ? s + 1 : name;
							fz_write_printf(ctx, out, "\n",
								span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1,
								s, style->size);
						}
						{
							/*
							 * NOTE(review): the write below is damaged in
							 * this copy; rect is filled in but does not
							 * appear in the surviving text. Confirm.
							 */
							fz_rect rect;
							fz_stext_char_bbox(ctx, &rect, span, char_num);
							fz_write_printf(ctx, out, "p.x, ch->p.y);
						}
						/*
						 * Code points 32..127 are printed with %c, all
						 * others as a hexadecimal numeric reference.
						 * NOTE(review): the special-case strings are
						 * damaged in this copy.
						 */
						switch (ch->c)
						{
						case '<': fz_write_string(ctx, out, "<"); break;
						case '>': fz_write_string(ctx, out, ">"); break;
						case '&': fz_write_string(ctx, out, "&"); break;
						case '"': fz_write_string(ctx, out, """); break;
						case '\'': fz_write_string(ctx, out, "'"); break;
						default:
							if (ch->c >= 32 && ch->c <= 127)
								fz_write_printf(ctx, out, "%c", ch->c);
							else
								fz_write_printf(ctx, out, "&#x%x;", ch->c);
							break;
						}
						fz_write_string(ctx, out, "\"/>\n");
					}
					/* Close the last style-level write opened in this span. */
					if (style)
						fz_write_string(ctx, out, "\n");
				}
				fz_write_string(ctx, out, "\n");
			}
			fz_write_string(ctx, out, "\n");
			break;
		}
		case FZ_PAGE_BLOCK_IMAGE:
		{
			/* Image blocks are not dumped. */
			break;
		}
		}
	}
	fz_write_string(ctx, out, "\n");
}

/* Plain text */

/*
 * Write the text blocks of a page as plain text.
 *
 * Each character is converted with fz_runetochar and the resulting bytes
 * are written out. A space byte precedes any span whose spacing exceeds 1,
 * a newline follows every line, and an extra newline follows every block.
 * Non-text blocks are skipped.
 */
void
fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
	fz_page_block *pblock;

	for (pblock = page->blocks; pblock < page->blocks + page->len; ++pblock)
	{
		if (pblock->type == FZ_PAGE_BLOCK_TEXT)
		{
			fz_stext_block *block = pblock->u.text;
			fz_stext_line *line;
			fz_stext_char *ch;
			char utf[10];
			int i, n;

			for (line = block->lines; line < block->lines + block->len; line++)
			{
				fz_stext_span *span;
				for (span = line->first_span; span; span = span->next)
				{
					if (span->spacing > 1)
						fz_write_byte(ctx, out, ' ');
					for (ch = span->text; ch < span->text + span->len; ch++)
					{
						/* n is the number of bytes fz_runetochar stored in utf. */
						n = fz_runetochar(utf, ch->c);
						for (i = 0; i < n; i++)
							fz_write_byte(ctx, out, utf[i]);
					}
				}
				fz_write_string(ctx, out, "\n");
			}
			fz_write_string(ctx, out, "\n");
		}
	}
}

/* Text output writer */

/* Output formats understood by the text writer; selected by name in fz_new_text_writer. */
enum {
	FZ_FORMAT_TEXT,
	FZ_FORMAT_HTML,
	FZ_FORMAT_XHTML,
	FZ_FORMAT_STEXT,
};

typedef struct fz_text_writer_s fz_text_writer;

/* Document writer that extracts structured text per page and prints it. */
struct fz_text_writer_s
{
	fz_document_writer super; /* first member; the callbacks cast the base pointer to fz_text_writer */
	int format; /* one of the FZ_FORMAT_* values */
	fz_stext_options opts; /* filled in by fz_parse_stext_options */
	fz_stext_sheet *sheet; /* passed to every stext device */
	fz_stext_page *page; /* page being built; NULL between pages */
	fz_output *out; /* destination for all printed text */
};

/*
 * Begin-page callback: drop any page left over from a previous call, create
 * a fresh stext page for the given mediabox, and return a new stext device
 * that fills it.
 */
static fz_device *
text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediabox)
{
	fz_text_writer *wri = (fz_text_writer*)wri_;

	if (wri->page)
	{
		fz_drop_stext_page(ctx, wri->page);
		wri->page = NULL;
	}

	wri->page = fz_new_stext_page(ctx, mediabox);
	return fz_new_stext_device(ctx, wri->sheet, wri->page, &wri->opts);
}

/*
 * End-page callback: close and drop the device, print the collected page in
 * the writer's format (any unlisted format value is printed as plain text),
 * then drop the page.
 */
static void
text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
{
	fz_text_writer *wri = (fz_text_writer*)wri_;

	fz_close_device(ctx, dev);
	fz_drop_device(ctx, dev);

	switch (wri->format)
	{
	default:
	case FZ_FORMAT_TEXT:
		fz_print_stext_page_as_text(ctx, wri->out, wri->page);
		break;
	case FZ_FORMAT_HTML:
		fz_print_stext_page_as_html(ctx, wri->out, wri->page);
		break;
	case FZ_FORMAT_XHTML:
		fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page);
		break;
	case FZ_FORMAT_STEXT:
		fz_print_stext_page_as_xml(ctx, wri->out, wri->page);
		break;
	}

	fz_drop_stext_page(ctx, wri->page);
	wri->page = NULL;
}

/*
 * Close callback: write the document trailer for the HTML, XHTML and STEXT
 * formats. Plain text has no trailer.
 *
 * NOTE(review): the STEXT trailer literal is a bare newline in this copy --
 * confirm intended text.
 */
static void
text_close_writer(fz_context *ctx, fz_document_writer *wri_)
{
	fz_text_writer *wri = (fz_text_writer*)wri_;
	switch (wri->format)
	{
	case FZ_FORMAT_HTML:
		fz_print_stext_trailer_as_html(ctx, wri->out);
		break;
	case FZ_FORMAT_XHTML:
		fz_print_stext_trailer_as_xhtml(ctx, wri->out);
		break;
	case FZ_FORMAT_STEXT:
		fz_write_string(ctx, wri->out, "\n");
		break;
	}
}

/* Drop callback: release the current page, the style sheet and the output. */
static void
text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
{
	fz_text_writer *wri = (fz_text_writer*)wri_;
	fz_drop_stext_page(ctx, wri->page);
	fz_drop_stext_sheet(ctx, wri->sheet);
	fz_drop_output(ctx, wri->out);
}

/*
 * Create a document writer that prints structured text.
 *
 * format: "text", "html", "xhtml" or "stext"; any other string selects
 *         plain text.
 * path:   output file name; "out.txt" is used when path is NULL.
 * args:   option string handed to fz_parse_stext_options.
 *
 * The format-specific document header is written before returning. If any
 * step inside the try block throws, the output and sheet are dropped, the
 * writer is freed and the error is rethrown.
 */
fz_document_writer *
fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *args)
{
	fz_text_writer *wri;

	wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
	fz_try(ctx)
	{
		fz_parse_stext_options(ctx, &wri->opts, args);

		/* Default first, so an unrecognised format name falls back to plain text. */
		wri->format = FZ_FORMAT_TEXT;
		if (!strcmp(format, "text"))
			wri->format = FZ_FORMAT_TEXT;
		else if (!strcmp(format, "html"))
			wri->format = FZ_FORMAT_HTML;
		else if (!strcmp(format, "xhtml"))
			wri->format = FZ_FORMAT_XHTML;
		else if (!strcmp(format, "stext"))
			wri->format = FZ_FORMAT_STEXT;

		wri->sheet = fz_new_stext_sheet(ctx);
		wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
		switch (wri->format)
		{
		case FZ_FORMAT_HTML:
			fz_print_stext_header_as_html(ctx, wri->out);
			break;
		case FZ_FORMAT_XHTML:
			fz_print_stext_header_as_xhtml(ctx, wri->out);
			break;
		case FZ_FORMAT_STEXT:
			/* NOTE(review): both header literals are bare newlines in this copy -- confirm. */
			fz_write_string(ctx, wri->out, "\n");
			fz_write_string(ctx, wri->out, "\n");
			break;
		}
	}
	fz_catch(ctx)
	{
		/* NOTE(review): relies on out/sheet being NULL if the failure came before they were set -- confirm the allocator zero-fills. */
		fz_drop_output(ctx, wri->out);
		fz_drop_stext_sheet(ctx, wri->sheet);
		fz_free(ctx, wri);
		fz_rethrow(ctx);
	}

	return (fz_document_writer*)wri;
}