summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2012-03-06 17:41:04 +0100
committerTor Andersson <tor.andersson@artifex.com>2012-03-12 18:43:02 +0100
commit0b0e2af392428b5dbc88d6fbd2f6b5181e85165f (patch)
tree9668c284b68b912c103f0778ff298aafb4d6453b
parentbc1c06ddd335f899025470dde7e839a82d792972 (diff)
downloadmupdf-0b0e2af392428b5dbc88d6fbd2f6b5181e85165f.tar.xz
Create style sheet and group extracted text into blocks, lines and spans.
-rw-r--r--apps/mudraw.c58
-rw-r--r--apps/pdfapp.c160
-rw-r--r--apps/pdfapp.h3
-rw-r--r--apps/x11_main.c2
-rw-r--r--fitz/dev_text.c689
-rw-r--r--fitz/fitz.h102
6 files changed, 716 insertions, 298 deletions
diff --git a/apps/mudraw.c b/apps/mudraw.c
index 19f359a1..31847665 100644
--- a/apps/mudraw.c
+++ b/apps/mudraw.c
@@ -10,6 +10,8 @@
#include <sys/time.h>
#endif
+enum { TEXT_PLAIN = 1, TEXT_HTML = 2, TEXT_XML = 3 };
+
static char *output = NULL;
static float resolution = 72;
static float rotation = 0;
@@ -28,6 +30,7 @@ static int width = 0;
static int height = 0;
static int fit = 0;
+static fz_text_sheet *sheet = NULL;
static fz_colorspace *colorspace;
static char *filename;
@@ -157,42 +160,43 @@ static void drawpage(fz_context *ctx, fz_document *doc, int pagenum)
if (showtext)
{
- fz_text_span *text = NULL;
+ fz_text_page *text = NULL;
fz_var(text);
fz_try(ctx)
{
- text = fz_new_text_span(ctx);
- dev = fz_new_text_device(ctx, text);
+ text = fz_new_text_page(ctx, fz_bound_page(doc, page));
+ dev = fz_new_text_device(ctx, sheet, text);
if (list)
fz_run_display_list(list, dev, fz_identity, fz_infinite_bbox, NULL);
else
fz_run_page(doc, page, dev, fz_identity, NULL);
fz_free_device(dev);
dev = NULL;
- if (showtext > 1)
+ if (showtext == TEXT_XML)
{
- printf("<page number=\"%d\">\n", pagenum);
- fz_debug_text_span_xml(text);
- printf("</page>\n");
+ fz_print_text_page_xml(stdout, text);
}
- else
+ else if (showtext == TEXT_HTML)
+ {
+ fz_print_text_page_html(stdout, text);
+ }
+ else if (showtext == TEXT_PLAIN)
{
- printf("[Page %d]\n", pagenum);
- fz_debug_text_span(text);
+ fz_print_text_page(stdout, text);
+ printf("\f\n");
}
- printf("\n");
}
fz_catch(ctx)
{
fz_free_device(dev);
- fz_free_text_span(ctx, text);
+ fz_free_text_page(ctx, text);
fz_free_display_list(ctx, list);
fz_free_page(doc, page);
fz_rethrow(ctx);
}
- fz_free_text_span(ctx, text);
+ fz_free_text_page(ctx, text);
}
if (showmd5 || showtime)
@@ -456,9 +460,23 @@ int main(int argc, char **argv)
timing.minpage = 0;
timing.maxpage = 0;
- if (showxml || showtext > 1)
+ if (showxml || showtext == TEXT_XML)
printf("<?xml version=\"1.0\"?>\n");
+ if (showtext)
+ sheet = fz_new_text_sheet(ctx);
+
+	/* HTML output: emit the fixed page/block layout rules and open <body>
+	 * before any page is processed. The per-style span rules are printed
+	 * from the style sheet once all pages have been run. */
+	if (showtext == TEXT_HTML)
+	{
+		printf("<style>\n");
+		printf("body{background-color:gray;margin:12pt;}\n");
+		printf("div.page{background-color:white;margin:6pt;padding:6pt;}\n");
+		printf("div.block{border:1px solid gray;margin:6pt;padding:6pt;}\n");
+		printf("p{margin:0;padding:0;}\n");
+		printf("</style>\n");
+		printf("<body>\n");
+	}
+
fz_try(ctx)
{
while (fz_optind < argc)
@@ -478,7 +496,7 @@ int main(int argc, char **argv)
if (!fz_authenticate_password(doc, password))
fz_throw(ctx, "cannot authenticate password: %s", filename);
- if (showxml || showtext > 1)
+ if (showxml || showtext == TEXT_XML)
printf("<document name=\"%s\">\n", filename);
if (showoutline)
@@ -492,7 +510,7 @@ int main(int argc, char **argv)
drawrange(ctx, doc, argv[fz_optind++]);
}
- if (showxml || showtext > 1)
+ if (showxml || showtext == TEXT_XML)
printf("</document>\n");
fz_close_document(doc);
@@ -504,6 +522,14 @@ int main(int argc, char **argv)
fz_close_document(doc);
}
+ if (showtext == TEXT_HTML)
+ {
+ printf("</body>\n");
+ printf("<style>\n");
+ fz_print_text_sheet(stdout, sheet);
+ printf("</style>\n");
+ }
+
if (showtime)
{
printf("total %dms / %d pages for an average of %dms\n",
diff --git a/apps/pdfapp.c b/apps/pdfapp.c
index 4f093508..e5742645 100644
--- a/apps/pdfapp.c
+++ b/apps/pdfapp.c
@@ -157,7 +157,7 @@ void pdfapp_close(pdfapp_t *app)
app->page_list = NULL;
if (app->page_text)
- fz_free_text_span(app->ctx, app->page_text);
+ fz_free_text_page(app->ctx, app->page_text);
app->page_text = NULL;
if (app->page_links)
@@ -228,7 +228,7 @@ static void pdfapp_loadpage(pdfapp_t *app)
if (app->page_list)
fz_free_display_list(app->ctx, app->page_list);
if (app->page_text)
- fz_free_text_span(app->ctx, app->page_text);
+ fz_free_text_page(app->ctx, app->page_text);
if (app->page_links)
fz_drop_link(app->ctx, app->page_links);
if (app->page)
@@ -273,8 +273,9 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai
app->hitlen = 0;
/* Extract text */
- app->page_text = fz_new_text_span(app->ctx);
- tdev = fz_new_text_device(app->ctx, app->page_text);
+ app->page_sheet = fz_new_text_sheet(app->ctx);
+ app->page_text = fz_new_text_page(app->ctx, app->page_bbox);
+ tdev = fz_new_text_device(app->ctx, app->page_sheet, app->page_text);
fz_run_display_list(app->page_list, tdev, fz_identity, fz_infinite_bbox, NULL);
fz_free_device(tdev);
}
@@ -354,23 +355,61 @@ static void pdfapp_gotopage(pdfapp_t *app, int number)
pdfapp_showpage(app, 1, 1, 1);
}
-static inline fz_bbox bboxcharat(fz_text_span *span, int idx)
+static fz_text_char textcharat(fz_text_page *page, int idx)
{
+ static fz_text_char emptychar = { {0,0,0,0}, ' ' };
+ fz_text_block *block;
+ fz_text_line *line;
+ fz_text_span *span;
int ofs = 0;
- while (span)
+ for (block = page->blocks; block < page->blocks + page->len; block++)
{
- if (idx < ofs + span->len)
- return span->text[idx - ofs].bbox;
- if (span->eol)
+ for (line = block->lines; line < block->lines + block->len; line++)
{
- if (idx == ofs + span->len)
- return fz_empty_bbox;
- ofs ++;
+ for (span = line->spans; span < line->spans + line->len; span++)
+ {
+ if (idx < ofs + span->len)
+ return span->text[idx - ofs];
+ /* pseudo-newline */
+ if (span + 1 == line->spans + line->len)
+ {
+ if (idx == ofs + span->len)
+ return emptychar;
+ ofs++;
+ }
+ ofs += span->len;
+ }
}
- ofs += span->len;
- span = span->next;
}
- return fz_empty_bbox;
+ return emptychar;
+}
+
+/*
+ * Count the characters on a page, adding one pseudo-newline for every line.
+ * The result is the size of the flat index space used for text searching.
+ */
+static int textlen(fz_text_page *page)
+{
+	int total = 0;
+	int b, l, s;
+
+	for (b = 0; b < page->len; b++)
+	{
+		fz_text_block *block = &page->blocks[b];
+		for (l = 0; l < block->len; l++)
+		{
+			fz_text_line *line = &block->lines[l];
+			for (s = 0; s < line->len; s++)
+				total += line->spans[s].len;
+			total += 1; /* pseudo-newline */
+		}
+	}
+	return total;
+}
+
+/* Return the character code stored at flat text index idx of the page. */
+static inline int charat(fz_text_page *page, int idx)
+{
+	fz_text_char found = textcharat(page, idx);
+	return found.c;
+}
+
+/* Return the box of the character at flat text index idx, rounded with
+ * fz_round_rect to an integer bbox. */
+static inline fz_bbox bboxcharat(fz_text_page *page, int idx)
+{
+	fz_text_char found = textcharat(page, idx);
+	return fz_round_rect(found.bbox);
}
void pdfapp_inverthit(pdfapp_t *app)
@@ -404,52 +443,20 @@ void pdfapp_inverthit(pdfapp_t *app)
pdfapp_invert(app, fz_transform_bbox(ctm, hitbox));
}
-static inline int charat(fz_text_span *span, int idx)
-{
- int ofs = 0;
- while (span)
- {
- if (idx < ofs + span->len)
- return span->text[idx - ofs].c;
- if (span->eol)
- {
- if (idx == ofs + span->len)
- return ' ';
- ofs ++;
- }
- ofs += span->len;
- span = span->next;
- }
- return 0;
-}
-
-static int textlen(fz_text_span *span)
-{
- int len = 0;
- while (span)
- {
- len += span->len;
- if (span->eol)
- len ++;
- span = span->next;
- }
- return len;
-}
-
-static int match(char *s, fz_text_span *span, int n)
+static int match(char *s, fz_text_page *page, int n)
{
int orig = n;
int c;
while ((c = *s++))
{
- if (c == ' ' && charat(span, n) == ' ')
+ if (c == ' ' && charat(page, n) == ' ')
{
- while (charat(span, n) == ' ')
+ while (charat(page, n) == ' ')
n++;
}
else
{
- if (tolower(c) != tolower(charat(span, n)))
+ if (tolower(c) != tolower(charat(page, n)))
return 0;
n++;
}
@@ -1067,6 +1074,9 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
{
fz_bbox hitbox;
fz_matrix ctm;
+ fz_text_page *page = app->page_text;
+ fz_text_block *block;
+ fz_text_line *line;
fz_text_span *span;
int c, i, p;
int seen;
@@ -1079,32 +1089,40 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
ctm = pdfapp_viewctm(app);
p = 0;
- for (span = app->page_text; span; span = span->next)
- {
- seen = 0;
- for (i = 0; i < span->len; i++)
+ for (block = page->blocks; block < page->blocks + page->len; block++)
+ {
+ for (line = block->lines; line < block->lines + block->len; line++)
{
- hitbox = fz_transform_bbox(ctm, span->text[i].bbox);
- c = span->text[i].c;
- if (c < 32)
- c = '?';
- if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
+ for (span = line->spans; span < line->spans + line->len; span++)
{
- if (p < ucslen - 1)
- ucsbuf[p++] = c;
- seen = 1;
- }
- }
+ seen = 0;
- if (seen && span->eol)
- {
+ for (i = 0; i < span->len; i++)
+ {
+ hitbox = fz_round_rect(span->text[i].bbox);
+ hitbox = fz_transform_bbox(ctm, hitbox);
+ c = span->text[i].c;
+ if (c < 32)
+ c = '?';
+ if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
+ {
+ if (p < ucslen - 1)
+ ucsbuf[p++] = c;
+ seen = 1;
+ }
+ }
+
+ if (seen && span + 1 == line->spans + line->len)
+ {
#ifdef _WIN32
- if (p < ucslen - 1)
- ucsbuf[p++] = '\r';
+ if (p < ucslen - 1)
+ ucsbuf[p++] = '\r';
#endif
- if (p < ucslen - 1)
- ucsbuf[p++] = '\n';
+ if (p < ucslen - 1)
+ ucsbuf[p++] = '\n';
+ }
+ }
}
}
diff --git a/apps/pdfapp.h b/apps/pdfapp.h
index 0c1b6ac4..db83335f 100644
--- a/apps/pdfapp.h
+++ b/apps/pdfapp.h
@@ -52,7 +52,8 @@ struct pdfapp_s
fz_page *page;
fz_rect page_bbox;
fz_display_list *page_list;
- fz_text_span *page_text;
+ fz_text_page *page_text;
+ fz_text_sheet *page_sheet;
fz_link *page_links;
/* snapback history */
diff --git a/apps/x11_main.c b/apps/x11_main.c
index 091f0ec6..fe0196fc 100644
--- a/apps/x11_main.c
+++ b/apps/x11_main.c
@@ -466,7 +466,7 @@ void windocopy(pdfapp_t *app)
{
ucs = ucs2[0];
- utf8 += runetochar(utf8, &ucs);
+ utf8 += fz_runetochar(utf8, ucs);
if (ucs < 256)
*latin1++ = ucs;
diff --git a/fitz/dev_text.c b/fitz/dev_text.c
index 79d8c137..7a56b56c 100644
--- a/fitz/dev_text.c
+++ b/fitz/dev_text.c
@@ -2,6 +2,7 @@
#define LINE_DIST 0.9f
#define SPACE_DIST 0.2f
+#define PARAGRAPH_DIST 0.5f
#include <ft2build.h>
#include FT_FREETYPE_H
@@ -11,60 +12,211 @@ typedef struct fz_text_device_s fz_text_device;
struct fz_text_device_s
{
+ fz_text_sheet *sheet;
+ fz_text_page *page;
+ fz_text_line cur_line;
+ fz_text_span cur_span;
fz_point point;
- fz_text_span *head;
- fz_text_span *span;
};
-fz_text_span *
-fz_new_text_span(fz_context *ctx)
+fz_text_sheet *
+fz_new_text_sheet(fz_context *ctx)
{
- fz_text_span *span;
- span = fz_malloc_struct(ctx, fz_text_span);
- span->font = NULL;
- span->wmode = 0;
- span->size = 0;
- span->len = 0;
- span->cap = 0;
- span->text = NULL;
- span->next = NULL;
- span->eol = 0;
- return span;
+ fz_text_sheet *sheet = fz_malloc(ctx, sizeof *sheet);
+ sheet->maxid = 0;
+ sheet->style = NULL;
+ return sheet;
}
void
-fz_free_text_span(fz_context *ctx, fz_text_span *span)
+fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet)
+{
+	/*
+	 * Release a style sheet: every style in its list (dropping the font
+	 * reference each one holds) and then the sheet structure itself.
+	 * Passing NULL is a no-op.
+	 */
+	fz_text_style *style;
+
+	if (sheet == NULL)
+		return;
+
+	style = sheet->style;
+	while (style)
+	{
+		/* Read the link before the node is freed. */
+		fz_text_style *next = style->next;
+		fz_drop_font(ctx, style->font);
+		fz_free(ctx, style);
+		style = next;
+	}
+
+	/* The sheet was allocated with fz_malloc in fz_new_text_sheet;
+	 * without this the structure itself is leaked. */
+	fz_free(ctx, sheet);
+}
+
+/*
+ * Return the style in the sheet with exactly this font, size, writing mode
+ * and script level. When none exists a new style is allocated, given the
+ * next id from the sheet, and pushed onto the front of the sheet's list.
+ * The new style keeps its own reference to the font.
+ */
+static fz_text_style *
+fz_find_text_style_imp(fz_context *ctx, fz_text_sheet *sheet,
+	float size, fz_font *font, int wmode, int script)
+{
+	fz_text_style *entry = sheet->style;
+
+	while (entry)
+	{
+		/* FIXME: others */
+		if (entry->font == font &&
+			entry->size == size &&
+			entry->wmode == wmode &&
+			entry->script == script)
+			return entry;
+		entry = entry->next;
+	}
+
+	/* Nothing matched: create a style and link it in at the head. */
+	entry = fz_malloc(ctx, sizeof *entry);
+	entry->id = sheet->maxid++;
+	entry->font = fz_keep_font(ctx, font);
+	entry->size = size;
+	entry->wmode = wmode;
+	entry->script = script;
+	entry->next = sheet->style;
+	sheet->style = entry;
+	return entry;
+}
+
+static fz_text_style *
+fz_find_text_style(fz_context *ctx, fz_text_sheet *sheet, fz_text *text, fz_matrix *ctm,
+ fz_colorspace *colorspace, float *color, float alpha, fz_stroke_state *stroke)
{
- fz_text_span *next;
+ float size = 1.0f;
+ fz_font *font = text ? text->font : NULL;
+ int wmode = text ? text->wmode : 0;
+ if (ctm && text)
+ {
+ fz_matrix tm = text->trm;
+ fz_matrix trm;
+ tm.e = 0;
+ tm.f = 0;
+ trm = fz_concat(tm, *ctm);
+ size = fz_matrix_expansion(trm);
+ }
+ return fz_find_text_style_imp(ctx, sheet, size, font, wmode, 0);
+}
- while (span)
+/* Allocate an empty text page (no blocks yet) that records the given
+ * mediabox. */
+fz_text_page *
+fz_new_text_page(fz_context *ctx, fz_rect mediabox)
+{
+	fz_text_page *result = fz_malloc(ctx, sizeof *result);
+
+	result->blocks = NULL;
+	result->cap = 0;
+	result->len = 0;
+	result->mediabox = mediabox;
+	return result;
+}
+
+void
+fz_free_text_page(fz_context *ctx, fz_text_page *page)
+{
+ fz_text_block *block;
+ fz_text_line *line;
+ fz_text_span *span;
+ for (block = page->blocks; block < page->blocks + page->len; block++)
{
- if (span->font)
- fz_drop_font(ctx, span->font);
- next = span->next;
- fz_free(ctx, span->text);
- fz_free(ctx, span);
- span = next;
+ for (line = block->lines; line < block->lines + block->len; line++)
+ {
+ for (span = line->spans; span < line->spans + line->len; span++)
+ {
+ fz_free(ctx, span->text);
+ }
+ fz_free(ctx, line->spans);
+ }
+ fz_free(ctx, block->lines);
}
+ fz_free(ctx, page->blocks);
+ fz_free(ctx, page);
}
static void
-fz_add_text_char_imp(fz_context *ctx, fz_text_span *span, int c, fz_bbox bbox)
+append_char(fz_context *ctx, fz_text_span *span, int c, fz_rect bbox)
{
- if (span->len + 1 >= span->cap)
+ if (span->len == span->cap)
{
- span->cap = span->cap > 1 ? (span->cap * 3) / 2 : 80;
- span->text = fz_resize_array(ctx, span->text, span->cap, sizeof(fz_text_char));
+ span->cap = MAX(64, span->cap * 2);
+ span->text = fz_resize_array(ctx, span->text, span->cap, sizeof(*span->text));
}
+ span->bbox = fz_union_rect(span->bbox, bbox);
span->text[span->len].c = c;
span->text[span->len].bbox = bbox;
- span->len ++;
+ span->len++;
+}
+
+/*
+ * Reset a span to the empty state with the given style: no characters, no
+ * storage and an empty bounding box. Any previous span->text pointer is
+ * overwritten, not freed.
+ */
+static void
+init_span(fz_context *ctx, fz_text_span *span, fz_text_style *style)
+{
+	span->text = NULL;
+	span->cap = 0;
+	span->len = 0;
+	span->bbox = fz_empty_rect;
+	span->style = style;
+}
+
+static void
+append_span(fz_context *ctx, fz_text_line *line, fz_text_span *span)
+{
+ if (line->len == line->cap)
+ {
+ line->cap = MAX(8, line->cap * 2);
+ line->spans = fz_resize_array(ctx, line->spans, line->cap, sizeof(*line->spans));
+ }
+ line->bbox = fz_union_rect(line->bbox, span->bbox);
+ line->spans[line->len++] = *span;
}
-static fz_bbox
-fz_split_bbox(fz_bbox bbox, int i, int n)
+/*
+ * Reset a line to the empty state: no spans, no storage and an empty
+ * bounding box. Any previous line->spans pointer is overwritten, not freed.
+ */
+static void
+init_line(fz_context *ctx, fz_text_line *line)
+{
+	line->spans = NULL;
+	line->cap = 0;
+	line->len = 0;
+	line->bbox = fz_empty_rect;
+}
+
+static void
+append_line(fz_context *ctx, fz_text_block *block, fz_text_line *line)
{
- float w = (float)(bbox.x1 - bbox.x0) / n;
+ if (block->len == block->cap)
+ {
+ block->cap = MAX(16, block->cap * 2);
+ block->lines = fz_resize_array(ctx, block->lines, block->cap, sizeof *block->lines);
+ }
+ block->bbox = fz_union_rect(block->bbox, line->bbox);
+ block->lines[block->len++] = *line;
+}
+
+/*
+ * Choose the block a finished line should be appended to.
+ *
+ * An existing block is accepted when the line sits less than PARAGRAPH_DIST
+ * font sizes away from it vertically, overlaps it horizontally, and starts
+ * within a quarter of the block's width of the block's left edge. The font
+ * size is taken from the line's first span, or 1 when that span is empty.
+ *
+ * When no block fits, a new empty block is added at the end of the page.
+ * The returned pointer refers into page->blocks and is only valid until
+ * that array is next resized.
+ */
+static fz_text_block *
+find_block_for_line(fz_context *ctx, fz_text_page *page, fz_text_line *line)
+{
+	float size = line->len > 0 && line->spans[0].len > 0 ? line->spans[0].style->size : 1;
+	int i;
+
+	for (i = 0; i < page->len; i++)
+	{
+		fz_text_block *block = page->blocks + i;
+		/* Keep the width as a float: an int would truncate it and turn
+		 * w / 4 into integer division, so narrow blocks never match. */
+		float w = block->bbox.x1 - block->bbox.x0;
+		if (block->bbox.y0 - line->bbox.y1 < size * PARAGRAPH_DIST)
+			if (line->bbox.x0 < block->bbox.x1 && line->bbox.x1 > block->bbox.x0)
+				if (ABS(line->bbox.x0 - block->bbox.x0) < w / 4)
+					return block;
+	}
+
+	if (page->len == page->cap)
+	{
+		page->cap = MAX(16, page->cap * 2);
+		page->blocks = fz_resize_array(ctx, page->blocks, page->cap, sizeof(*page->blocks));
+	}
+
+	page->blocks[page->len].bbox = fz_empty_rect;
+	page->blocks[page->len].len = 0;
+	page->blocks[page->len].cap = 0;
+	page->blocks[page->len].lines = NULL;
+
+	return &page->blocks[page->len++];
+}
+
+/* Append a finished line to the page block selected for it by
+ * find_block_for_line (which creates a block when none fits). */
+static void
+insert_line(fz_context *ctx, fz_text_page *page, fz_text_line *line)
+{
+	fz_text_block *target = find_block_for_line(ctx, page, line);
+	append_line(ctx, target, line);
+}
+
+static fz_rect
+fz_split_bbox(fz_rect bbox, int i, int n)
+{
+ float w = (bbox.x1 - bbox.x0) / n;
float x0 = bbox.x0;
bbox.x0 = x0 + i * w;
bbox.x1 = x0 + (i + 1) * w;
@@ -72,154 +224,71 @@ fz_split_bbox(fz_bbox bbox, int i, int n)
}
static void
-fz_add_text_char(fz_context *ctx, fz_text_span **last, fz_font *font, float size, int wmode, int c, fz_bbox bbox)
+fz_flush_text_line(fz_context *ctx, fz_text_device *dev, fz_text_style *style)
{
- fz_text_span *span = *last;
-
- if (!span->font)
- {
- span->font = fz_keep_font(ctx, font);
- span->size = size;
- }
+ append_span(ctx, &dev->cur_line, &dev->cur_span);
+ insert_line(ctx, dev->page, &dev->cur_line);
+ init_span(ctx, &dev->cur_span, style);
+ init_line(ctx, &dev->cur_line);
+}
- if ((span->font != font || span->size != size || span->wmode != wmode) && c != 32)
+static void
+fz_add_text_char_imp(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_rect bbox)
+{
+ if (!dev->cur_span.style)
+ dev->cur_span.style = style;
+ if (style != dev->cur_span.style)
{
- span = fz_new_text_span(ctx);
- span->font = fz_keep_font(ctx, font);
- span->size = size;
- span->wmode = wmode;
- (*last)->next = span;
- *last = span;
+ append_span(ctx, &dev->cur_line, &dev->cur_span);
+ init_span(ctx, &dev->cur_span, style);
}
+ append_char(ctx, &dev->cur_span, c, bbox);
+}
+static void
+fz_add_text_char(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_rect bbox)
+{
switch (c)
{
case -1: /* ignore when one unicode character maps to multiple glyphs */
break;
case 0xFB00: /* ff */
- fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2));
- fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 2));
+ fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2));
+ fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 2));
break;
case 0xFB01: /* fi */
- fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2));
- fz_add_text_char_imp(ctx, span, 'i', fz_split_bbox(bbox, 1, 2));
+ fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2));
+ fz_add_text_char_imp(ctx, dev, style, 'i', fz_split_bbox(bbox, 1, 2));
break;
case 0xFB02: /* fl */
- fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2));
- fz_add_text_char_imp(ctx, span, 'l', fz_split_bbox(bbox, 1, 2));
+ fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2));
+ fz_add_text_char_imp(ctx, dev, style, 'l', fz_split_bbox(bbox, 1, 2));
break;
case 0xFB03: /* ffi */
- fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 3));
- fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 3));
- fz_add_text_char_imp(ctx, span, 'i', fz_split_bbox(bbox, 2, 3));
+ fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 3));
+ fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 3));
+ fz_add_text_char_imp(ctx, dev, style, 'i', fz_split_bbox(bbox, 2, 3));
break;
case 0xFB04: /* ffl */
- fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 3));
- fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 3));
- fz_add_text_char_imp(ctx, span, 'l', fz_split_bbox(bbox, 2, 3));
+ fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 3));
+ fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 3));
+ fz_add_text_char_imp(ctx, dev, style, 'l', fz_split_bbox(bbox, 2, 3));
break;
case 0xFB05: /* long st */
case 0xFB06: /* st */
- fz_add_text_char_imp(ctx, span, 's', fz_split_bbox(bbox, 0, 2));
- fz_add_text_char_imp(ctx, span, 't', fz_split_bbox(bbox, 1, 2));
+ fz_add_text_char_imp(ctx, dev, style, 's', fz_split_bbox(bbox, 0, 2));
+ fz_add_text_char_imp(ctx, dev, style, 't', fz_split_bbox(bbox, 1, 2));
break;
default:
- fz_add_text_char_imp(ctx, span, c, bbox);
+ fz_add_text_char_imp(ctx, dev, style, c, bbox);
break;
}
}
static void
-fz_divide_text_chars(fz_text_span **last, int n, fz_bbox bbox)
-{
- fz_text_span *span = *last;
- int i, x;
- x = span->len - n;
- if (x >= 0)
- for (i = 0; i < n; i++)
- span->text[x + i].bbox = fz_split_bbox(bbox, i, n);
-}
-
-static void
-fz_add_text_newline(fz_context *ctx, fz_text_span **last, fz_font *font, float size, int wmode)
-{
- fz_text_span *span;
- span = fz_new_text_span(ctx);
- span->font = fz_keep_font(ctx, font);
- span->size = size;
- span->wmode = wmode;
- (*last)->eol = 1;
- (*last)->next = span;
- *last = span;
-}
-
-void
-fz_debug_text_span_xml(fz_text_span *span)
-{
- char buf[10];
- int c, n, k, i;
-
- while (span)
- {
- printf("<span font=\"%s\" size=\"%g\" wmode=\"%d\" eol=\"%d\">\n",
- span->font ? span->font->name : "NULL", span->size, span->wmode, span->eol);
-
- for (i = 0; i < span->len; i++)
- {
- printf("\t<char ucs=\"");
- c = span->text[i].c;
- if (c < 128)
- putchar(c);
- else
- {
- n = fz_runetochar(buf, c);
- for (k = 0; k < n; k++)
- putchar(buf[k]);
- }
- printf("\" bbox=\"%d %d %d %d\" />\n",
- span->text[i].bbox.x0,
- span->text[i].bbox.y0,
- span->text[i].bbox.x1,
- span->text[i].bbox.y1);
- }
-
- printf("</span>\n");
-
- span = span->next;
- }
-}
-
-void
-fz_debug_text_span(fz_text_span *span)
-{
- char buf[10];
- int c, n, k, i;
-
- while (span)
- {
- for (i = 0; i < span->len; i++)
- {
- c = span->text[i].c;
- if (c < 128)
- putchar(c);
- else
- {
- n = fz_runetochar(buf, c);
- for (k = 0; k < n; k++)
- putchar(buf[k]);
- }
- }
-
- if (span->eol)
- putchar('\n');
-
- span = span->next;
- }
-}
-
-static void
-fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_matrix ctm, fz_point *pen)
+fz_text_extract(fz_context *ctx, fz_text_device *dev, fz_text *text, fz_matrix ctm, fz_text_style *style)
{
+ fz_point *pen = &dev->point;
fz_font *font = text->font;
FT_Face face = font->ft_face;
fz_matrix tm = text->trm;
@@ -233,19 +302,20 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat
float ascender = 1;
float descender = 0;
int multi;
- int i, err;
+ int i, j, err;
if (text->len == 0)
return;
- fz_lock(ctx, FZ_LOCK_FREETYPE);
if (font->ft_face)
{
+ fz_lock(ctx, FZ_LOCK_FREETYPE);
err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72);
if (err)
fz_warn(ctx, "freetype set character size: %s", ft_error_string(err));
ascender = (float)face->ascender / face->units_per_EM;
descender = (float)face->descender / face->units_per_EM;
+ fz_unlock(ctx, FZ_LOCK_FREETYPE);
}
rect = fz_empty_rect;
@@ -264,6 +334,7 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat
tm.e = 0;
tm.f = 0;
trm = fz_concat(tm, ctm);
+
dir = fz_transform_vector(trm, dir);
dist = sqrtf(dir.x * dir.x + dir.y * dir.y);
ndir.x = dir.x / dist;
@@ -271,19 +342,10 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat
size = fz_matrix_expansion(trm);
- multi = 1;
+ int lastchar = ' ';
for (i = 0; i < text->len; i++)
{
- if (text->items[i].gid < 0)
- {
- fz_add_text_char(ctx, last, font, size, text->wmode, text->items[i].ucs, fz_round_rect(rect));
- multi ++;
- fz_divide_text_chars(last, multi, fz_round_rect(rect));
- continue;
- }
- multi = 1;
-
/* Calculate new pen location and delta */
tm.e = text->items[i].x;
tm.f = text->items[i].y;
@@ -305,20 +367,19 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat
if (dist > size * LINE_DIST)
{
- fz_add_text_newline(ctx, last, font, size, text->wmode);
+ fz_flush_text_line(ctx, dev, style);
+ lastchar = ' ';
}
- else if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST)
+ else if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST && lastchar != ' ')
{
- if ((*last)->len > 0 && (*last)->text[(*last)->len - 1].c != ' ')
- {
- fz_rect spacerect;
- spacerect.x0 = -0.2f;
- spacerect.y0 = 0;
- spacerect.x1 = 0;
- spacerect.y1 = 1;
- spacerect = fz_transform_rect(trm, spacerect);
- fz_add_text_char(ctx, last, font, size, text->wmode, ' ', fz_round_rect(spacerect));
- }
+ fz_rect spacerect;
+ spacerect.x0 = -0.2f;
+ spacerect.y0 = 0;
+ spacerect.x1 = 0;
+ spacerect.y1 = 1;
+ spacerect = fz_transform_rect(trm, spacerect);
+ fz_add_text_char(ctx, dev, style, ' ', spacerect);
+ lastchar = ' ';
}
}
@@ -331,8 +392,13 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat
/* TODO: freetype returns broken vertical metrics */
/* if (text->wmode) mask |= FT_LOAD_VERTICAL_LAYOUT; */
+ fz_lock(ctx, FZ_LOCK_FREETYPE);
+ err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72);
+ if (err)
+ fz_warn(ctx, "freetype set character size: %s", ft_error_string(err));
FT_Get_Advance(font->ft_face, text->items[i].gid, mask, &ftadv);
adv = ftadv / 65536.0f;
+ fz_unlock(ctx, FZ_LOCK_FREETYPE);
rect.x0 = 0;
rect.y0 = descender;
@@ -352,9 +418,27 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat
pen->x = trm.e + dir.x * adv;
pen->y = trm.f + dir.y * adv;
- fz_add_text_char(ctx, last, font, size, text->wmode, text->items[i].ucs, fz_round_rect(rect));
+		/* Check for one glyph to many char mapping: count this item plus
+		 * the items after it, up to the next one with gid >= 0. */
+		for (j = i + 1; j < text->len; j++)
+			if (text->items[j].gid >= 0)
+				break;
+		multi = j - i;
+
+		if (multi == 1)
+		{
+			fz_add_text_char(ctx, dev, style, text->items[i].ucs, rect);
+		}
+		else
+		{
+			/* Each character of the cluster gets its own code point and
+			 * an equal horizontal slice of the glyph's box. */
+			for (j = 0; j < multi; j++)
+			{
+				fz_rect part = fz_split_bbox(rect, j, multi);
+				fz_add_text_char(ctx, dev, style, text->items[i + j].ucs, part);
+			}
+			/* Step over the items consumed above so the outer loop
+			 * does not process them a second time. */
+			i += multi - 1;
+		}
+
+		lastchar = text->items[i].ucs;
}
- fz_unlock(ctx, FZ_LOCK_FREETYPE);
}
static void
@@ -362,7 +446,9 @@ fz_text_fill_text(fz_device *dev, fz_text *text, fz_matrix ctm,
fz_colorspace *colorspace, float *color, float alpha)
{
fz_text_device *tdev = dev->user;
- fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point);
+ fz_text_style *style;
+ style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, colorspace, color, alpha, NULL);
+ fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
@@ -370,36 +456,57 @@ fz_text_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, fz_m
fz_colorspace *colorspace, float *color, float alpha)
{
fz_text_device *tdev = dev->user;
- fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point);
+ fz_text_style *style;
+ style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, colorspace, color, alpha, stroke);
+ fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
fz_text_clip_text(fz_device *dev, fz_text *text, fz_matrix ctm, int accumulate)
{
fz_text_device *tdev = dev->user;
- fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point);
+ fz_text_style *style;
+ style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, NULL);
+ fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
fz_text_clip_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, fz_matrix ctm)
{
fz_text_device *tdev = dev->user;
- fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point);
+ fz_text_style *style;
+ style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, stroke);
+ fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
fz_text_ignore_text(fz_device *dev, fz_text *text, fz_matrix ctm)
{
fz_text_device *tdev = dev->user;
- fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point);
+ fz_text_style *style;
+ style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, NULL);
+ fz_text_extract(dev->ctx, tdev, text, ctm, style);
+}
+
+/*
+ * qsort comparator for text blocks: ascending left edge (bbox.x0), with
+ * ties broken by descending bbox.y0.
+ *
+ * The coordinates are compared directly rather than by converting their
+ * float difference to int: truncation made sub-unit differences compare as
+ * equal, so the ordering was not transitive, and the conversion is out of
+ * range for very large differences.
+ */
+static int cmp_block(const void *av, const void *bv)
+{
+	const fz_text_block *a = av;
+	const fz_text_block *b = bv;
+
+	if (a->bbox.x0 != b->bbox.x0)
+		return a->bbox.x0 < b->bbox.x0 ? -1 : 1;
+	if (a->bbox.y0 != b->bbox.y0)
+		return a->bbox.y0 > b->bbox.y0 ? -1 : 1;
+	return 0;
}
static void
fz_text_free_user(fz_device *dev)
{
+ fz_context *ctx = dev->ctx;
fz_text_device *tdev = dev->user;
- tdev->span->eol = 1;
+ append_span(ctx, &tdev->cur_line, &tdev->cur_span);
+ insert_line(ctx, tdev->page, &tdev->cur_line);
+
+ qsort(tdev->page->blocks, tdev->page->len, sizeof *tdev->page->blocks, cmp_block);
/* TODO: unicode NFC normalization */
/* TODO: bidi logical reordering */
@@ -408,15 +515,19 @@ fz_text_free_user(fz_device *dev)
}
fz_device *
-fz_new_text_device(fz_context *ctx, fz_text_span *root)
+fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
{
fz_device *dev;
+
fz_text_device *tdev = fz_malloc_struct(ctx, fz_text_device);
- tdev->head = root;
- tdev->span = root;
+ tdev->sheet = sheet;
+ tdev->page = page;
tdev->point.x = -1;
tdev->point.y = -1;
+ init_line(ctx, &tdev->cur_line);
+ init_span(ctx, &tdev->cur_span, NULL);
+
dev = fz_new_device(ctx, tdev);
dev->hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE;
dev->free_user = fz_text_free_user;
@@ -427,3 +538,209 @@ fz_new_text_device(fz_context *ctx, fz_text_span *root)
dev->ignore_text = fz_text_ignore_text;
return dev;
}
+
+/* XML, HTML and plain-text output */
+
+/* Return 1 when the font looks bold: its FreeType face (if present) has the
+ * bold style flag set, or its name contains "Bold". Otherwise return 0. */
+static int font_is_bold(fz_font *font)
+{
+	FT_Face face = font->ft_face;
+
+	if (face != NULL && (face->style_flags & FT_STYLE_FLAG_BOLD) != 0)
+		return 1;
+	return strstr(font->name, "Bold") != NULL ? 1 : 0;
+}
+
+/* Return 1 when the font looks italic: its FreeType face (if present) has
+ * the italic style flag set, or its name contains "Italic" or "Oblique".
+ * Otherwise return 0. */
+static int font_is_italic(fz_font *font)
+{
+	FT_Face face = font->ft_face;
+
+	if (face != NULL && (face->style_flags & FT_STYLE_FLAG_ITALIC) != 0)
+		return 1;
+	if (strstr(font->name, "Italic") != NULL)
+		return 1;
+	return strstr(font->name, "Oblique") != NULL ? 1 : 0;
+}
+
+/*
+ * Write the opening HTML markup for a style: a <span> whose class is the
+ * style id, followed by one <sup> per positive script level or one <sub>
+ * per negative script level.
+ */
+static void
+fz_print_style_begin(FILE *out, fz_text_style *style)
+{
+	int level = style->script;
+	int i;
+
+	fprintf(out, "<span class=\"s%d\">", style->id);
+	for (i = 0; i < level; i++)
+		fprintf(out, "<sup>");
+	for (i = level; i < 0; i++)
+		fprintf(out, "<sub>");
+}
+
+/*
+ * Write the closing HTML markup for a style: one </sup> per positive script
+ * level or one </sub> per negative script level, then the closing </span>.
+ */
+static void
+fz_print_style_end(FILE *out, fz_text_style *style)
+{
+	int level = style->script;
+	int i;
+
+	for (i = 0; i < level; i++)
+		fprintf(out, "</sup>");
+	for (i = level; i < 0; i++)
+		fprintf(out, "</sub>");
+	fprintf(out, "</span>");
+}
+
+/*
+ * Write the CSS rule for one style: selector span.s<id>, the font family
+ * (the font name with everything up to and including the first '+'
+ * removed), the size in pt, and italic/bold declarations when the font
+ * looks italic/bold.
+ */
+static void
+fz_print_style(FILE *out, fz_text_style *style)
+{
+	char *family = style->font->name;
+	char *plus = strchr(family, '+');
+
+	if (plus)
+		family = plus + 1;
+
+	fprintf(out, "span.s%d{font-family:\"%s\";font-size:%gpt;",
+		style->id, family, style->size);
+	if (font_is_italic(style->font))
+		fputs("font-style:italic;", out);
+	if (font_is_bold(style->font))
+		fputs("font-weight:bold;", out);
+	fputs("}\n", out);
+}
+
+/* Write one CSS rule per style in the sheet, in list order. */
+void
+fz_print_text_sheet(FILE *out, fz_text_sheet *sheet)
+{
+	fz_text_style *style = sheet->style;
+
+	while (style != NULL)
+	{
+		fz_print_style(out, style);
+		style = style->next;
+	}
+}
+
+/*
+ * Write a text page as HTML: one <div class="page"> holding a
+ * <div class="block"> per block and a <p> per line. Runs of characters that
+ * share a style are wrapped by fz_print_style_begin/fz_print_style_end.
+ * '<', '>' and '&' are written as entities, characters 32..127 literally,
+ * and everything else as a hexadecimal character reference.
+ *
+ * A span may have no style: the text device starts with an unstyled current
+ * span and appends it when the device is freed, even if no character was
+ * ever added. Such spans are written without any style markup.
+ */
+void
+fz_print_text_page_html(FILE *out, fz_text_page *page)
+{
+	int block_n, line_n, span_n, ch_n;
+	fz_text_style *style = NULL;
+	fz_text_block *block;
+	fz_text_line *line;
+	fz_text_span *span;
+
+	fprintf(out, "<div class=\"page\">\n");
+
+	for (block_n = 0; block_n < page->len; block_n++)
+	{
+		block = &page->blocks[block_n];
+		fprintf(out, "<div class=\"block\">\n");
+		for (line_n = 0; line_n < block->len; line_n++)
+		{
+			line = &block->lines[line_n];
+			fprintf(out, "<p>");
+			style = NULL; /* no style markup is open at the start of a line */
+
+			for (span_n = 0; span_n < line->len; span_n++)
+			{
+				span = &line->spans[span_n];
+				if (style != span->style)
+				{
+					/* Close the open style, if any, and open the new
+					 * one, if any. */
+					if (style != NULL)
+						fz_print_style_end(out, style);
+					if (span->style != NULL)
+						fz_print_style_begin(out, span->style);
+					style = span->style;
+				}
+
+				for (ch_n = 0; ch_n < span->len; ch_n++)
+				{
+					fz_text_char *ch = &span->text[ch_n];
+					if (ch->c == '<')
+						fprintf(out, "&lt;");
+					else if (ch->c == '>')
+						fprintf(out, "&gt;");
+					else if (ch->c == '&')
+						fprintf(out, "&amp;");
+					else if (ch->c >= 32 && ch->c <= 127)
+						fprintf(out, "%c", ch->c);
+					else
+						fprintf(out, "&#x%x;", ch->c);
+				}
+			}
+			/* Only close a style if one is still open. */
+			if (style != NULL)
+				fz_print_style_end(out, style);
+			fprintf(out, "</p>\n");
+		}
+		fprintf(out, "</div>\n");
+	}
+
+	fprintf(out, "</div>\n");
+}
+
+/*
+ * Write a text page as XML: nested <page>, <block>, <line>, <span> and
+ * <char> elements, each (except <page>) carrying its bounding box. A span
+ * also carries its font name (with everything up to and including the first
+ * '+' removed) and size. The XML special characters are written as
+ * entities, characters 32..127 literally, and everything else as a
+ * hexadecimal character reference.
+ *
+ * A span may have no style: the text device starts with an unstyled current
+ * span and appends it when the device is freed, even if no character was
+ * ever added. Such spans are written with font "NULL" and size 0.
+ */
+void
+fz_print_text_page_xml(FILE *out, fz_text_page *page)
+{
+	fz_text_block *block;
+	fz_text_line *line;
+	fz_text_span *span;
+	fz_text_char *ch;
+	const char *s;
+	float size;
+
+	fprintf(out, "<page>\n");
+	for (block = page->blocks; block < page->blocks + page->len; block++)
+	{
+		fprintf(out, "<block bbox=\"%g %g %g %g\">\n",
+			block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
+		for (line = block->lines; line < block->lines + block->len; line++)
+		{
+			fprintf(out, "<line bbox=\"%g %g %g %g\">\n",
+				line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
+			for (span = line->spans; span < line->spans + line->len; span++)
+			{
+				fz_text_style *style = span->style;
+
+				/* Placeholders used when the span has no style or font. */
+				s = "NULL";
+				size = 0;
+				if (style != NULL)
+				{
+					size = style->size;
+					if (style->font != NULL)
+					{
+						s = strchr(style->font->name, '+');
+						s = s ? s + 1 : style->font->name;
+					}
+				}
+
+				fprintf(out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n",
+					span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1,
+					s, size);
+				for (ch = span->text; ch < span->text + span->len; ch++)
+				{
+					fprintf(out, "<char bbox=\"%g %g %g %g\" c=\"",
+						ch->bbox.x0, ch->bbox.y0, ch->bbox.x1, ch->bbox.y1);
+					switch (ch->c)
+					{
+					case '<': fprintf(out, "&lt;"); break;
+					case '>': fprintf(out, "&gt;"); break;
+					case '&': fprintf(out, "&amp;"); break;
+					case '"': fprintf(out, "&quot;"); break;
+					case '\'': fprintf(out, "&apos;"); break;
+					default:
+						if (ch->c >= 32 && ch->c <= 127)
+							fprintf(out, "%c", ch->c);
+						else
+							fprintf(out, "&#x%x;", ch->c);
+						break;
+					}
+					fprintf(out, "\"/>\n");
+				}
+				fprintf(out, "</span>\n");
+			}
+			fprintf(out, "</line>\n");
+		}
+		fprintf(out, "</block>\n");
+	}
+	fprintf(out, "</page>\n");
+}
+
+/*
+ * Write a text page as plain text: each character is encoded with
+ * fz_runetochar, each line ends with a newline, and each block is followed
+ * by one extra newline.
+ */
+void
+fz_print_text_page(FILE *out, fz_text_page *page)
+{
+	char utf[10];
+	int b, l, s, c, k, n;
+
+	for (b = 0; b < page->len; b++)
+	{
+		fz_text_block *block = &page->blocks[b];
+		for (l = 0; l < block->len; l++)
+		{
+			fz_text_line *line = &block->lines[l];
+			for (s = 0; s < line->len; s++)
+			{
+				fz_text_span *span = &line->spans[s];
+				for (c = 0; c < span->len; c++)
+				{
+					n = fz_runetochar(utf, span->text[c].c);
+					for (k = 0; k < n; k++)
+						putc(utf[k], out);
+				}
+			}
+			putc('\n', out);
+		}
+		putc('\n', out);
+	}
+}
diff --git a/fitz/fitz.h b/fitz/fitz.h
index c01de8ac..8570f077 100644
--- a/fitz/fitz.h
+++ b/fitz/fitz.h
@@ -1424,46 +1424,102 @@ fz_device *fz_new_bbox_device(fz_context *ctx, fz_bbox *bboxp);
fz_device *fz_new_draw_device(fz_context *ctx, fz_pixmap *dest);
/*
- Text extraction device
-*/
-typedef struct fz_text_span_s fz_text_span;
+ * Text extraction device
+ */
+
+typedef struct fz_text_style_s fz_text_style;
typedef struct fz_text_char_s fz_text_char;
+typedef struct fz_text_span_s fz_text_span;
+typedef struct fz_text_line_s fz_text_line;
+typedef struct fz_text_block_s fz_text_block;
+
+typedef struct fz_text_sheet_s fz_text_sheet;
+typedef struct fz_text_page_s fz_text_page;
+
+struct fz_text_style_s /* One text style; each fz_text_span refers to its style through fz_text_span.style. */
+{
+	int id; /* style identifier -- presumably unique within a sheet (cf. fz_text_sheet.maxid); confirm */
+	fz_font *font; /* font used by text in this style */
+	float size; /* font size */
+	int wmode; /* writing mode */
+	int script; /* NOTE(review): meaning not visible in this header -- confirm in dev_text.c */
+	/* etc... */
+	fz_text_style *next; /* next style -- presumably chains the styles belonging to one sheet; confirm */
+};
+
+struct fz_text_sheet_s /* Style sheet that the text device fills out with the styles it encounters. */
+{
+	int maxid; /* presumably the highest (or next) style id handed out so far -- confirm */
+	fz_text_style *style; /* styles of the sheet -- presumably the head of a list linked through fz_text_style.next; confirm */
+};
 struct fz_text_char_s
 {
+	fz_rect bbox; /* bounding box of the character (now a float rectangle, fz_rect, instead of fz_bbox) */
 	int c;
-	fz_bbox bbox;
 };
 struct fz_text_span_s
 {
-	fz_font *font;
-	float size;
-	int wmode;
+	fz_rect bbox; /* bounding box of the span */
 	int len, cap;
 	fz_text_char *text;
-	fz_text_span *next;
-	int eol;
+	fz_text_style *style; /* style shared by the characters of this span (font, size, ... now live in fz_text_style) */
+};
+
+struct fz_text_line_s /* One line of text, stored as an array of spans. */
+{
+	fz_rect bbox; /* bounding box of the line */
+	int len, cap; /* len: number of entries in spans; cap: presumably the allocated capacity of spans -- confirm */
+	fz_text_span *spans; /* array of len spans */
+};
+
+struct fz_text_block_s /* One block of text, stored as an array of lines. */
+{
+	fz_rect bbox; /* bounding box of the block */
+	int len, cap; /* len: number of entries in lines; cap: presumably the allocated capacity of lines -- confirm */
+	fz_text_line *lines; /* array of len lines */
 };
-fz_text_span *fz_new_text_span(fz_context *ctx);
-void fz_free_text_span(fz_context *ctx, fz_text_span *line);
-void fz_debug_text_span(fz_text_span *line);
-void fz_debug_text_span_xml(fz_text_span *span);
+struct fz_text_page_s /* Extracted text of one page, stored as an array of blocks. */
+{
+	fz_rect mediabox; /* presumably the rectangle handed to fz_new_text_page -- confirm */
+	int len, cap; /* len: number of entries in blocks; cap: presumably the allocated capacity of blocks -- confirm */
+	fz_text_block *blocks; /* array of len blocks */
+};
/*
- fz_new_text_device: Create a device to print the text on a
- page in XML.
+ fz_new_text_device: Create a device to extract the text on a page.
- The text on a page will be translated into a sequnce of XML
- elements. For each text span the font, font size, writing mode
- and end of line flag is printed. Since text can be placed at
- arbitrary positions then heuristics must be used to try to
- collect text spans together that are roughly located on the
- same baseline. Each character in the text span will have its
- UTF-8 character printed along with a bounding box containing it.
+ Gather and sort the text on a page into spans of uniform style,
+ arranged into lines and blocks by reading order. The reading order
+	is determined by various heuristics, so it may not be accurate.
*/
-fz_device *fz_new_text_device(fz_context *ctx, fz_text_span *text);
+fz_device *fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);
+
+/*
+ fz_new_text_sheet: Create an empty style sheet.
+
+ The style sheet is filled out by the text device, creating
+ one style for each unique font, color, size combination that
+ is used.
+*/
+fz_text_sheet *fz_new_text_sheet(fz_context *ctx);
+void fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet);
+
+/*
+ fz_new_text_page: Create an empty text page.
+
+ The text page is filled out by the text device to contain the blocks,
+ lines and spans of text on the page.
+*/
+fz_text_page *fz_new_text_page(fz_context *ctx, fz_rect mediabox);
+void fz_free_text_page(fz_context *ctx, fz_text_page *page);
+
+void fz_print_text_sheet(FILE *out, fz_text_sheet *sheet); /* print the style sheet to out -- output format not visible here; confirm in dev_text.c */
+void fz_print_text_page_html(FILE *out, fz_text_page *page); /* presumably prints the page as HTML -- confirm in dev_text.c */
+void fz_print_text_page_xml(FILE *out, fz_text_page *page); /* print the page as XML: nested block, line, span and char elements, each with a bbox attribute */
+void fz_print_text_page(FILE *out, fz_text_page *page); /* print the page as plain text: a newline after each line and an extra one after each block */
/*
* Cookie support - simple communication channel between app/library.