summary | refs | log | tree | commit | diff
path: root/fitz
diff options
context:
space:
mode:
author: Robin Watts <robin.watts@artifex.com> 2013-02-06 10:46:52 +0000
committer: Robin Watts <robin.watts@artifex.com> 2013-02-06 11:23:08 +0000
commit: 0399332d547b92c79bfea20982a3a1492f6df272 (patch)
tree: a412b7267a37fdc4307333d681d7cc392e5f9c2e /fitz
parent: 92f7f71619220210f43a1281d1abf6c9434e3c1b (diff)
download: mupdf-0399332d547b92c79bfea20982a3a1492f6df272.tar.xz
Tweak text extraction block creation.
Better tolerate long horizontal spaces without breaking lines.
Diffstat (limited to 'fitz')
-rw-r--r-- fitz/dev_text.c 258
1 files changed, 247 insertions, 11 deletions
diff --git a/fitz/dev_text.c b/fitz/dev_text.c
index 10096fcf..9748485c 100644
--- a/fitz/dev_text.c
+++ b/fitz/dev_text.c
@@ -2,6 +2,7 @@
#define LINE_DIST 0.9f
#define SPACE_DIST 0.2f
+#define SPACE_MAX_DIST 15.0f
#define PARAGRAPH_DIST 0.5f
#include <ft2build.h>
@@ -380,20 +381,23 @@ fz_text_extract(fz_context *ctx, fz_text_device *dev, fz_text *text, fz_matrix c
ndelta.y = delta.y / dist;
dot = ndelta.x * ndir.x + ndelta.y * ndir.y;
- if (dist > size * LINE_DIST)
+ if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST && dist < size * SPACE_MAX_DIST)
{
- fz_flush_text_line(ctx, dev, style);
- dev->lastchar = ' ';
+ if (dev->lastchar != ' ')
+ {
+ fz_rect spacerect;
+ spacerect.x0 = -0.2f;
+ spacerect.y0 = descender;
+ spacerect.x1 = 0;
+ spacerect.y1 = ascender;
+ spacerect = fz_transform_rect(trm, spacerect);
+ fz_add_text_char(ctx, dev, style, ' ', spacerect);
+ dev->lastchar = ' ';
+ }
}
- else if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST && dev->lastchar != ' ')
+ else if (dist > size * LINE_DIST)
{
- fz_rect spacerect;
- spacerect.x0 = -0.2f;
- spacerect.y0 = descender;
- spacerect.x1 = 0;
- spacerect.y1 = ascender;
- spacerect = fz_transform_rect(trm, spacerect);
- fz_add_text_char(ctx, dev, style, ' ', spacerect);
+ fz_flush_text_line(ctx, dev, style);
dev->lastchar = ' ';
}
}
@@ -752,3 +756,235 @@ fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page)
fz_printf(out, "\n");
}
}
+
+typedef struct line_height_s /* One candidate line spacing recorded for one text style */
+{
+ float height; /* Line step; insert_line_height keeps this as the mean of the samples merged into the entry */
+ int count; /* Number of samples merged into this entry */
+ fz_text_style *style; /* Style the entry belongs to; entries are matched by pointer equality */
+} line_height;
+
+typedef struct line_heights_s /* Growable array of line_height entries */
+{
+ fz_context *ctx; /* Context passed to fz_resize_array when the array grows */
+ int cap; /* Number of entries allocated in lh */
+ int len; /* Number of entries of lh currently in use */
+ line_height *lh; /* The entries themselves */
+} line_heights;
+
+/*
+ * Allocate a line_heights collection and remember the context it was
+ * created with, so later growth can allocate through the same context.
+ *
+ * NOTE(review): cap, len and lh are not set here; this relies on
+ * fz_malloc_struct handing back zeroed memory - confirm.
+ */
+static line_heights *
+new_line_heights(fz_context *ctx)
+{
+	line_heights *heights;
+
+	heights = fz_malloc_struct(ctx, line_heights);
+	heights->ctx = ctx;
+
+	return heights;
+}
+
+/*
+ * Record one observed line step for a style.
+ *
+ * lh: the collection to update.
+ * style: the style the step is attributed to (compared by pointer).
+ * height: the observed step.
+ *
+ * If an entry for the same style already exists within 5% of 'height',
+ * the sample is folded into that entry (its height becomes the running
+ * mean and its count is incremented). Otherwise a new entry with a count
+ * of 1 is appended, doubling the array (starting from 4 entries) when it
+ * is full.
+ */
+static void
+insert_line_height(line_heights *lh, fz_text_style *style, float height)
+{
+	int i;
+
+	/* If we have one already, add it in. Only the first 'len' entries
+	 * hold data; the slots between len and cap have never been written
+	 * and must not be examined. */
+	for (i = 0; i < lh->len; i++)
+	{
+		line_height *entry = &lh->lh[i];
+
+		/* Match if we are within 5% */
+		if (entry->style == style && entry->height * 0.95 <= height && entry->height * 1.05 >= height)
+		{
+			/* Ensure that the average height is correct */
+			entry->height = (entry->height * entry->count + height) / (entry->count+1);
+			entry->count++;
+			return;
+		}
+	}
+
+	/* Otherwise extend (if required) and add it */
+	if (lh->cap == lh->len)
+	{
+		int newcap = (lh->cap ? lh->cap * 2 : 4);
+		lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height));
+		lh->cap = newcap;
+	}
+
+	lh->lh[lh->len].count = 1;
+	lh->lh[lh->len].height = height;
+	lh->lh[lh->len].style = style;
+	lh->len++;
+}
+
+/*
+ * Reduce the collection to a single entry per style: the one with the
+ * highest sample count (the earliest such entry wins ties).
+ *
+ * On return every style that was present appears exactly once, and the
+ * relative order of the surviving styles is preserved.
+ */
+static void
+cull_line_heights(line_heights *lh)
+{
+	int i, j, k;
+
+	for (i = 0; i < lh->len; i++)
+	{
+		fz_text_style *style = lh->lh[i].style;
+		int count = lh->lh[i].count;
+		int max = i;
+
+		/* Find the max for this style */
+		for (j = i+1; j < lh->len; j++)
+		{
+			if (lh->lh[j].style == style && lh->lh[j].count > count)
+			{
+				max = j;
+				count = lh->lh[j].count;
+			}
+		}
+
+		/* Move the winner into slot i */
+		if (max != i)
+		{
+			lh->lh[i].count = count;
+			lh->lh[i].height = lh->lh[max].height;
+		}
+
+		/* Destroy all the other entries for this style by compacting
+		 * the tail of the array. Every entry must be visited exactly
+		 * once: advancing k a second time on a match would skip (and
+		 * so lose) the entry that follows a removed one. */
+		j = i+1;
+		for (k = j; k < lh->len; k++)
+		{
+			if (lh->lh[k].style != style)
+				lh->lh[j++] = lh->lh[k];
+		}
+		lh->len = j;
+	}
+}
+
+/*
+ * Look up the line height recorded for 'style'.
+ *
+ * Returns the height of the first entry whose style pointer equals
+ * 'style', or 0 when the collection holds no entry for it.
+ */
+static float
+line_height_for_style(line_heights *lh, fz_text_style *style)
+{
+	int idx = 0;
+
+	while (idx < lh->len)
+	{
+		line_height *entry = &lh->lh[idx];
+
+		if (entry->style == style)
+			return entry->height;
+		idx++;
+	}
+
+	/* No entry for this style */
+	return 0.0f;
+}
+
+/*
+ * Split block 'blocknum' of 'page' in two at line 'linenum'.
+ *
+ * Lines [0, linenum) stay in block 'blocknum'; lines [linenum, len) are
+ * copied into a newly inserted block at 'blocknum'+1, and all following
+ * blocks move up by one. page->blocks may be reallocated, so pointers
+ * into it held by the caller are invalid after this call.
+ */
+static void
+split_block(fz_context *ctx, fz_text_page *page, int blocknum, int linenum)
+{
+	int split_len;
+	fz_text_block *head;
+	fz_text_block *tail;
+
+	if (page->len == page->cap)
+	{
+		int new_cap = fz_maxi(16, page->cap * 2);
+		page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks));
+		page->cap = new_cap;
+	}
+
+	/* Open up a slot after blocknum (the slot starts life as a copy of
+	 * block blocknum itself). */
+	memmove(page->blocks+blocknum+1, page->blocks+blocknum, (page->len - blocknum)*sizeof(*page->blocks));
+	page->len++;
+
+	head = &page->blocks[blocknum];
+	tail = &page->blocks[blocknum+1];
+
+	split_len = head->len - linenum;
+	tail->bbox = head->bbox; /* FIXME! */
+
+	/* Make the new block empty before allocating, so that it never
+	 * aliases the original block's line array (presumably to keep the
+	 * page consistent should the allocation fail - confirm). */
+	tail->cap = 0;
+	tail->len = 0;
+	tail->lines = NULL;
+	tail->lines = fz_malloc_array(ctx, split_len, sizeof(fz_text_line));
+
+	/* The new array holds exactly split_len lines; cap must say so, or
+	 * it would claim room for 0 lines while len says split_len. */
+	tail->cap = split_len;
+	tail->len = split_len;
+	head->len = linenum;
+	memcpy(tail->lines, head->lines + linenum, split_len * sizeof(fz_text_line));
+}
+
+/*
+ * Return the span of 'line' with the greatest bbox height (the first one
+ * on ties), or NULL when the line has no spans.
+ */
+static fz_text_span *
+find_tallest_span(fz_text_line *line)
+{
+	fz_text_span *span;
+	fz_text_span *tallest_span = NULL;
+	float tallest = 0;
+
+	for (span = line->spans; span < line->spans + line->len; span++)
+	{
+		float span_height = span->bbox.y1 - span->bbox.y0;
+		if (tallest_span == NULL || span_height > tallest)
+		{
+			tallest_span = span;
+			tallest = span_height;
+		}
+	}
+	return tallest_span;
+}
+
+/*
+ * Break the blocks of 'page' into paragraphs based on line spacing.
+ *
+ * ctx: context used for allocation.
+ * sheet: not used by this function.
+ * page: the page whose blocks are analysed and split in place.
+ */
+void
+fz_text_analysis(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
+{
+	fz_text_block *block;
+	fz_text_line *line;
+	fz_text_line *prev_line;
+	fz_text_span *tallest_span;
+	line_heights *lh;
+	int blocknum;
+
+	/* Simple paragraph analysis; look for the most common 'inter line'
+	 * spacing for each style. This will be assumed to be our line
+	 * spacing. Any step that differs from it by more than 5% will be
+	 * assumed to be a paragraph break. */
+
+	/* Step 1: Gather the line height information */
+	lh = new_line_heights(ctx);
+	prev_line = NULL;
+	for (block = page->blocks; block < page->blocks + page->len; block++)
+	{
+		for (line = block->lines; line < block->lines + block->len; line++)
+		{
+			/* In a line made up of several spans, the tallest span
+			 * decides which style the line step is counted for.
+			 * A line without spans has no style, so it contributes
+			 * nothing. */
+			tallest_span = find_tallest_span(line);
+			if (prev_line && tallest_span)
+			{
+				/* Should really work on the baseline positions,
+				 * but we don't have that at this stage. */
+				float line_step = line->bbox.y1 - prev_line->bbox.y1;
+				if (line_step > 0)
+				{
+					insert_line_height(lh, tallest_span->style, line_step);
+				}
+			}
+			prev_line = line;
+		}
+	}
+
+	/* Step 2: Find the most popular line height for each style */
+	cull_line_heights(lh);
+
+	/* Step 3: Run through the blocks, breaking each block into two if
+	 * the line height isn't right. */
+	prev_line = NULL;
+	for (blocknum = 0; blocknum < page->len; blocknum++)
+	{
+		/* Re-fetch on every pass: split_block may move page->blocks */
+		block = &page->blocks[blocknum];
+		for (line = block->lines; line < block->lines + block->len; line++)
+		{
+			tallest_span = find_tallest_span(line);
+			if (prev_line && tallest_span)
+			{
+				float proper_step = line_height_for_style(lh, tallest_span->style);
+				float line_step = line->bbox.y1 - prev_line->bbox.y1;
+				int linenum = (int)(line - block->lines);
+
+				/* A mismatch on the first line of a block needs no
+				 * split: a block boundary is already there, and
+				 * splitting at line 0 would only leave an empty
+				 * block behind. */
+				if (linenum > 0 && (proper_step * 0.95 > line_step || line_step > proper_step * 1.05))
+				{
+					split_block(ctx, page, blocknum, linenum);
+					/* The remainder is now block blocknum+1;
+					 * carry on from its first line. */
+					prev_line = NULL;
+					break;
+				}
+			}
+			prev_line = line;
+		}
+	}
+
+	/* Release the temporary line height table */
+	fz_free(ctx, lh->lh);
+	fz_free(ctx, lh);
+}