summaryrefslogtreecommitdiff
path: root/source/fitz/stext-output.c
diff options
context:
space:
mode:
Diffstat (limited to 'source/fitz/stext-output.c')
-rw-r--r--source/fitz/stext-output.c386
1 files changed, 163 insertions, 223 deletions
diff --git a/source/fitz/stext-output.c b/source/fitz/stext-output.c
index 63124aa7..f5f72412 100644
--- a/source/fitz/stext-output.c
+++ b/source/fitz/stext-output.c
@@ -9,40 +9,28 @@
/* HTML output (visual formatting with preserved layout) */
static void
-fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
+fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size)
{
- int is_bold = fz_font_is_bold(ctx, style->font);
- int is_italic = fz_font_is_italic(ctx, style->font);
- int is_serif = fz_font_is_serif(ctx, style->font);
- int is_mono = fz_font_is_monospaced(ctx, style->font);
- int script = style->script;
+ int is_bold = fz_font_is_bold(ctx, font);
+ int is_italic = fz_font_is_italic(ctx, font);
+ int is_serif = fz_font_is_serif(ctx, font);
+ int is_mono = fz_font_is_monospaced(ctx, font);
- fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", style->size);
+ fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", size);
if (is_mono)
fz_write_string(ctx, out, "<tt>");
if (is_bold)
fz_write_string(ctx, out, "<b>");
if (is_italic)
fz_write_string(ctx, out, "<i>");
-
- while (script-- > 0)
- fz_write_string(ctx, out, "<sup>");
- while (++script < 0)
- fz_write_string(ctx, out, "<sub>");
}
static void
-fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
+fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size)
{
- int is_mono = fz_font_is_monospaced(ctx, style->font);
- int is_bold = fz_font_is_bold(ctx, style->font);
- int is_italic = fz_font_is_italic(ctx, style->font);
- int script = style->script;
-
- while (script-- > 0)
- fz_write_string(ctx, out, "</sup>");
- while (++script < 0)
- fz_write_string(ctx, out, "</sub>");
+ int is_mono = fz_font_is_monospaced(ctx, font);
+ int is_bold = fz_font_is_bold(ctx,font);
+ int is_italic = fz_font_is_italic(ctx, font);
if (is_italic)
fz_write_string(ctx, out, "</i>");
@@ -54,7 +42,7 @@ fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
}
static void
-fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *block)
+fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
int x = block->bbox.x0;
int y = block->bbox.y0;
@@ -62,90 +50,78 @@ fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *bl
int h = block->bbox.y1 - block->bbox.y0;
fz_write_printf(ctx, out, "<img style=\"top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"data:", y, x, w, h);
- fz_write_image_as_data_uri(ctx, out, block->image);
+ fz_write_image_as_data_uri(ctx, out, block->u.i.image);
fz_write_string(ctx, out, "\">\n");
}
void
fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
- fz_stext_style *style = NULL;
fz_stext_line *line;
- fz_stext_span *span;
fz_stext_char *ch;
int x, y;
- style = NULL;
+ fz_font *font = NULL;
+ float size = 0;
- for (line = block->lines; line < block->lines + block->len; ++line)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- for (span = line->first_span; span; span = span->next)
+ x = line->bbox.x0;
+ y = line->bbox.y0;
+
+ fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x);
+ font = NULL;
+
+ for (ch = line->first_char; ch; ch = ch->next)
{
- if (span == line->first_span || span->spacing > 1)
+ if (ch->font != font || ch->size != size)
{
- if (style)
- {
- fz_print_style_end_html(ctx, out, style);
- fz_write_string(ctx, out, "</p>\n");
- style = NULL;
- }
- x = span->bbox.x0;
- y = span->bbox.y0;
- fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x);
+ if (font)
+ fz_print_style_end_html(ctx, out, font, size);
+ font = ch->font;
+ size = ch->size;
+ fz_print_style_begin_html(ctx, out, font, size);
}
- for (ch = span->text; ch < span->text + span->len; ++ch)
+ switch (ch->c)
{
- if (ch->style != style)
- {
- if (style)
- fz_print_style_end_html(ctx, out, style);
- style = ch->style;
- fz_print_style_begin_html(ctx, out, style);
- }
-
- switch (ch->c)
- {
- default:
- if (ch->c >= 32 && ch->c <= 127)
- fz_write_byte(ctx, out, ch->c);
- else
- fz_write_printf(ctx, out, "&#x%x;", ch->c);
- break;
- case '<': fz_write_string(ctx, out, "&lt;"); break;
- case '>': fz_write_string(ctx, out, "&gt;"); break;
- case '&': fz_write_string(ctx, out, "&amp;"); break;
- case '"': fz_write_string(ctx, out, "&quot;"); break;
- case '\'': fz_write_string(ctx, out, "&apos;"); break;
- }
+ default:
+ if (ch->c >= 32 && ch->c <= 127)
+ fz_write_byte(ctx, out, ch->c);
+ else
+ fz_write_printf(ctx, out, "&#x%x;", ch->c);
+ break;
+ case '<': fz_write_string(ctx, out, "&lt;"); break;
+ case '>': fz_write_string(ctx, out, "&gt;"); break;
+ case '&': fz_write_string(ctx, out, "&amp;"); break;
+ case '"': fz_write_string(ctx, out, "&quot;"); break;
+ case '\'': fz_write_string(ctx, out, "&apos;"); break;
}
}
- if (style)
- {
- fz_print_style_end_html(ctx, out, style);
- fz_write_string(ctx, out, "</p>\n");
- style = NULL;
- }
+ if (font)
+ fz_print_style_end_html(ctx, out, font, size);
+
+ fz_write_string(ctx, out, "</p>\n");
}
}
void
fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
- fz_page_block *block;
+ fz_stext_block *block;
int w = page->mediabox.x1 - page->mediabox.x0;
int h = page->mediabox.y1 - page->mediabox.y0;
fz_write_printf(ctx, out, "<div style=\"width:%dpt;height:%dpt\">\n", w, h);
- for (block = page->blocks; block < page->blocks + page->len; ++block)
+ for (block = page->first_block; block; block = block->next)
{
- if (block->type == FZ_PAGE_BLOCK_IMAGE)
- fz_print_stext_image_as_html(ctx, out, block->u.image);
- else if (block->type == FZ_PAGE_BLOCK_TEXT)
- fz_print_stext_block_as_html(ctx, out, block->u.text);
+ if (block->type == FZ_STEXT_BLOCK_IMAGE)
+ fz_print_stext_image_as_html(ctx, out, block);
+ else if (block->type == FZ_STEXT_BLOCK_TEXT)
+ fz_print_stext_block_as_html(ctx, out, block);
}
fz_write_string(ctx, out, "</div>\n");
@@ -177,23 +153,22 @@ fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
/* XHTML output (semantic, little layout, suitable for reflow) */
static void
-fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_image_block *block)
+fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
int w = block->bbox.x1 - block->bbox.x0;
int h = block->bbox.y1 - block->bbox.y0;
fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"data:", w, h);
- fz_write_image_as_data_uri(ctx, out, block->image);
+ fz_write_image_as_data_uri(ctx, out, block->u.i.image);
fz_write_string(ctx, out, "\"/></p>\n");
}
static void
-fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
+fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size)
{
- int is_mono = fz_font_is_monospaced(ctx, style->font);
- int is_bold = fz_font_is_bold(ctx, style->font);
- int is_italic = fz_font_is_italic(ctx, style->font);
- int script = style->script;
+ int is_mono = fz_font_is_monospaced(ctx, font);
+ int is_bold = fz_font_is_bold(ctx, font);
+ int is_italic = fz_font_is_italic(ctx, font);
if (is_mono)
fz_write_string(ctx, out, "<tt>");
@@ -201,25 +176,14 @@ fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *styl
fz_write_string(ctx, out, "<b>");
if (is_italic)
fz_write_string(ctx, out, "<i>");
-
- while (script-- > 0)
- fz_write_string(ctx, out, "<sup>");
- while (++script < 0)
- fz_write_string(ctx, out, "<sub>");
}
static void
-fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
+fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size)
{
- int is_mono = fz_font_is_monospaced(ctx, style->font);
- int is_bold = fz_font_is_bold(ctx, style->font);
- int is_italic = fz_font_is_italic(ctx, style->font);
- int script = style->script;
-
- while (script-- > 0)
- fz_write_string(ctx, out, "</sup>");
- while (++script < 0)
- fz_write_string(ctx, out, "</sub>");
+ int is_mono = fz_font_is_monospaced(ctx, font);
+ int is_bold = fz_font_is_bold(ctx, font);
+ int is_italic = fz_font_is_italic(ctx, font);
if (is_italic)
fz_write_string(ctx, out, "</i>");
@@ -232,68 +196,63 @@ fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
fz_stext_line *line;
- fz_stext_span *span;
fz_stext_char *ch;
- fz_stext_style *style;
- style = NULL;
- fz_write_string(ctx, out, "<p>\n");
+ fz_font *font = NULL;
+ float size = 0;
+
+ fz_write_string(ctx, out, "<p>");
- for (line = block->lines; line < block->lines + block->len; ++line)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- if (line > block->lines)
- fz_write_string(ctx, out, "<br/>\n");
- for (span = line->first_span; span; span = span->next)
+ if (line != block->u.t.first_line)
+ fz_write_string(ctx, out, "\n");
+ for (ch = line->first_char; ch; ch = ch->next)
{
- if (span->spacing > 1)
- fz_write_byte(ctx, out, ' ');
-
- for (ch = span->text; ch < span->text + span->len; ++ch)
+ if (ch->font != font || ch->size != size)
{
- if (ch->style != style)
- {
- if (style)
- fz_print_style_end_xhtml(ctx, out, style);
- style = ch->style;
- fz_print_style_begin_xhtml(ctx, out, style);
- }
+ if (font)
+ fz_print_style_end_xhtml(ctx, out, font, size);
+ font = ch->font;
+ size = ch->size;
+ fz_print_style_begin_xhtml(ctx, out, font, size);
+ }
- switch (ch->c)
- {
- default:
- if (ch->c >= 32 && ch->c <= 127)
- fz_write_byte(ctx, out, ch->c);
- else
- fz_write_printf(ctx, out, "&#x%x;", ch->c);
- break;
- case '<': fz_write_string(ctx, out, "&lt;"); break;
- case '>': fz_write_string(ctx, out, "&gt;"); break;
- case '&': fz_write_string(ctx, out, "&amp;"); break;
- case '"': fz_write_string(ctx, out, "&quot;"); break;
- case '\'': fz_write_string(ctx, out, "&apos;"); break;
- }
+ switch (ch->c)
+ {
+ default:
+ if (ch->c >= 32 && ch->c <= 127)
+ fz_write_byte(ctx, out, ch->c);
+ else
+ fz_write_printf(ctx, out, "&#x%x;", ch->c);
+ break;
+ case '<': fz_write_string(ctx, out, "&lt;"); break;
+ case '>': fz_write_string(ctx, out, "&gt;"); break;
+ case '&': fz_write_string(ctx, out, "&amp;"); break;
+ case '"': fz_write_string(ctx, out, "&quot;"); break;
+ case '\'': fz_write_string(ctx, out, "&apos;"); break;
}
}
}
- if (style)
- fz_print_style_end_xhtml(ctx, out, style);
- fz_write_string(ctx, out, "\n</p>\n");
+ if (font)
+ fz_print_style_end_xhtml(ctx, out, font, size);
+ fz_write_string(ctx, out, "</p>\n");
}
void
fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
- fz_page_block *block;
+ fz_stext_block *block;
fz_write_string(ctx, out, "<div>\n");
- for (block = page->blocks; block < page->blocks + page->len; ++block)
+ for (block = page->first_block; block; block = block->next)
{
- if (block->type == FZ_PAGE_BLOCK_IMAGE)
- fz_print_stext_image_as_xhtml(ctx, out, block->u.image);
- else if (block->type == FZ_PAGE_BLOCK_TEXT)
- fz_print_stext_block_as_xhtml(ctx, out, block->u.text);
+ if (block->type == FZ_STEXT_BLOCK_IMAGE)
+ fz_print_stext_image_as_xhtml(ctx, out, block);
+ else if (block->type == FZ_STEXT_BLOCK_TEXT)
+ fz_print_stext_block_as_xhtml(ctx, out, block);
}
fz_write_string(ctx, out, "</div>\n");
@@ -311,6 +270,7 @@ fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
fz_write_string(ctx, out, "<style>\n");
fz_write_string(ctx, out, "body{background-color:gray}\n");
fz_write_string(ctx, out, "div{background-color:white;margin:1em;padding:1em}\n");
+ fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
fz_write_string(ctx, out, "</style>\n");
fz_write_string(ctx, out, "</head>\n");
fz_write_string(ctx, out, "<body>\n");
@@ -328,87 +288,79 @@ fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
void
fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
- int block_n;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
fz_write_printf(ctx, out, "<page width=\"%g\" height=\"%g\">\n",
page->mediabox.x1 - page->mediabox.x0,
page->mediabox.y1 - page->mediabox.y0);
- for (block_n = 0; block_n < page->len; block_n++)
+ for (block = page->first_block; block; block = block->next)
{
- switch (page->blocks[block_n].type)
- {
- case FZ_PAGE_BLOCK_TEXT:
+ switch (block->type)
{
- fz_stext_block *block = page->blocks[block_n].u.text;
- fz_stext_line *line;
- const char *s;
-
+ case FZ_STEXT_BLOCK_TEXT:
fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n",
- block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
- for (line = block->lines; line < block->lines + block->len; line++)
+ block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
+ for (line = block->u.t.first_line; line; line = line->next)
{
- fz_stext_span *span;
+ fz_font *font = NULL;
+ float size = 0;
+ const char *name = NULL;
+ const char *s;
+ fz_rect rect;
+
fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\">\n",
- line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
- for (span = line->first_span; span; span = span->next)
+ line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
+
+ for (ch = line->first_char; ch; ch = ch->next)
{
- fz_stext_style *style = NULL;
- const char *name = NULL;
- int char_num;
- for (char_num = 0; char_num < span->len; char_num++)
+ if (ch->font != font || ch->size != size)
+ {
+ if (font)
+ fz_write_string(ctx, out, "</font>\n");
+ font = ch->font;
+ size = ch->size;
+ name = fz_font_name(ctx, font);
+ s = strchr(name, '+');
+ s = s ? s + 1 : name;
+ fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", s, size);
+ }
+ fz_stext_char_bbox(ctx, &rect, line, ch);
+ fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"",
+ rect.x0, rect.y0, rect.x1, rect.y1, ch->origin.x, ch->origin.y);
+ switch (ch->c)
{
- fz_stext_char *ch = &span->text[char_num];
- if (ch->style != style)
- {
- if (style)
- {
- fz_write_string(ctx, out, "</span>\n");
- }
- style = ch->style;
- name = fz_font_name(ctx, style->font);
- s = strchr(name, '+');
- s = s ? s + 1 : name;
- fz_write_printf(ctx, out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n",
- span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1,
- s, style->size);
- }
- {
- fz_rect rect;
- fz_stext_char_bbox(ctx, &rect, span, char_num);
- fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"",
- rect.x0, rect.y0, rect.x1, rect.y1, ch->p.x, ch->p.y);
- }
- switch (ch->c)
- {
- case '<': fz_write_string(ctx, out, "&lt;"); break;
- case '>': fz_write_string(ctx, out, "&gt;"); break;
- case '&': fz_write_string(ctx, out, "&amp;"); break;
- case '"': fz_write_string(ctx, out, "&quot;"); break;
- case '\'': fz_write_string(ctx, out, "&apos;"); break;
- default:
- if (ch->c >= 32 && ch->c <= 127)
- fz_write_printf(ctx, out, "%c", ch->c);
- else
- fz_write_printf(ctx, out, "&#x%x;", ch->c);
- break;
- }
- fz_write_string(ctx, out, "\"/>\n");
+ case '<': fz_write_string(ctx, out, "&lt;"); break;
+ case '>': fz_write_string(ctx, out, "&gt;"); break;
+ case '&': fz_write_string(ctx, out, "&amp;"); break;
+ case '"': fz_write_string(ctx, out, "&quot;"); break;
+ case '\'': fz_write_string(ctx, out, "&apos;"); break;
+ default:
+ if (ch->c >= 32 && ch->c <= 127)
+ fz_write_printf(ctx, out, "%c", ch->c);
+ else
+ fz_write_printf(ctx, out, "&#x%x;", ch->c);
+ break;
}
- if (style)
- fz_write_string(ctx, out, "</span>\n");
+ fz_write_string(ctx, out, "\"/>\n");
}
+
+ if (font)
+ fz_write_string(ctx, out, "</font>\n");
+
fz_write_string(ctx, out, "</line>\n");
}
fz_write_string(ctx, out, "</block>\n");
break;
- }
- case FZ_PAGE_BLOCK_IMAGE:
- {
+
+ case FZ_STEXT_BLOCK_IMAGE:
+ fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
+ block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
break;
}
}
- }
fz_write_string(ctx, out, "</page>\n");
}
@@ -417,31 +369,23 @@ fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page)
void
fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
- fz_page_block *pblock;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
+ char utf[10];
+ int i, n;
- for (pblock = page->blocks; pblock < page->blocks + page->len; ++pblock)
+ for (block = page->first_block; block; block = block->next)
{
- if (pblock->type == FZ_PAGE_BLOCK_TEXT)
+ if (block->type == FZ_STEXT_BLOCK_TEXT)
{
- fz_stext_block *block = pblock->u.text;
- fz_stext_line *line;
- fz_stext_char *ch;
- char utf[10];
- int i, n;
-
- for (line = block->lines; line < block->lines + block->len; line++)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- fz_stext_span *span;
- for (span = line->first_span; span; span = span->next)
+ for (ch = line->first_char; ch; ch = ch->next)
{
- if (span->spacing > 1)
- fz_write_byte(ctx, out, ' ');
- for (ch = span->text; ch < span->text + span->len; ch++)
- {
- n = fz_runetochar(utf, ch->c);
- for (i = 0; i < n; i++)
- fz_write_byte(ctx, out, utf[i]);
- }
+ n = fz_runetochar(utf, ch->c);
+ for (i = 0; i < n; i++)
+ fz_write_byte(ctx, out, utf[i]);
}
fz_write_string(ctx, out, "\n");
}
@@ -466,7 +410,6 @@ struct fz_text_writer_s
fz_document_writer super;
int format;
fz_stext_options opts;
- fz_stext_sheet *sheet;
fz_stext_page *page;
fz_output *out;
};
@@ -483,7 +426,7 @@ text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediab
}
wri->page = fz_new_stext_page(ctx, mediabox);
- return fz_new_stext_device(ctx, wri->sheet, wri->page, &wri->opts);
+ return fz_new_stext_device(ctx, wri->page, &wri->opts);
}
static void
@@ -537,7 +480,6 @@ text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
{
fz_text_writer *wri = (fz_text_writer*)wri_;
fz_drop_stext_page(ctx, wri->page);
- fz_drop_stext_sheet(ctx, wri->sheet);
fz_drop_output(ctx, wri->out);
}
@@ -561,7 +503,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const
else if (!strcmp(format, "stext"))
wri->format = FZ_FORMAT_STEXT;
- wri->sheet = fz_new_stext_sheet(ctx);
wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
switch (wri->format)
@@ -581,7 +522,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const
fz_catch(ctx)
{
fz_drop_output(ctx, wri->out);
- fz_drop_stext_sheet(ctx, wri->sheet);
fz_free(ctx, wri);
fz_rethrow(ctx);
}