diff options
author | Robin Watts <robin.watts@artifex.com> | 2013-02-06 10:46:52 +0000 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2013-02-06 11:23:08 +0000 |
commit | 0399332d547b92c79bfea20982a3a1492f6df272 (patch) | |
tree | a412b7267a37fdc4307333d681d7cc392e5f9c2e | |
parent | 92f7f71619220210f43a1281d1abf6c9434e3c1b (diff) | |
download | mupdf-0399332d547b92c79bfea20982a3a1492f6df272.tar.xz |
Tweak text extraction block creation.
Better tolerate long horizontal spaces without breaking lines.
-rw-r--r-- | fitz/dev_text.c | 258 |
1 files changed, 247 insertions, 11 deletions
diff --git a/fitz/dev_text.c b/fitz/dev_text.c index 10096fcf..9748485c 100644 --- a/fitz/dev_text.c +++ b/fitz/dev_text.c @@ -2,6 +2,7 @@ #define LINE_DIST 0.9f #define SPACE_DIST 0.2f +#define SPACE_MAX_DIST 15.0f #define PARAGRAPH_DIST 0.5f #include <ft2build.h> @@ -380,20 +381,23 @@ fz_text_extract(fz_context *ctx, fz_text_device *dev, fz_text *text, fz_matrix c ndelta.y = delta.y / dist; dot = ndelta.x * ndir.x + ndelta.y * ndir.y; - if (dist > size * LINE_DIST) + if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST && dist < size * SPACE_MAX_DIST) { - fz_flush_text_line(ctx, dev, style); - dev->lastchar = ' '; + if (dev->lastchar != ' ') + { + fz_rect spacerect; + spacerect.x0 = -0.2f; + spacerect.y0 = descender; + spacerect.x1 = 0; + spacerect.y1 = ascender; + spacerect = fz_transform_rect(trm, spacerect); + fz_add_text_char(ctx, dev, style, ' ', spacerect); + dev->lastchar = ' '; + } } - else if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST && dev->lastchar != ' ') + else if (dist > size * LINE_DIST) { - fz_rect spacerect; - spacerect.x0 = -0.2f; - spacerect.y0 = descender; - spacerect.x1 = 0; - spacerect.y1 = ascender; - spacerect = fz_transform_rect(trm, spacerect); - fz_add_text_char(ctx, dev, style, ' ', spacerect); + fz_flush_text_line(ctx, dev, style); dev->lastchar = ' '; } } @@ -752,3 +756,235 @@ fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page) fz_printf(out, "\n"); } } + +typedef struct line_height_s +{ + float height; + int count; + fz_text_style *style; +} line_height; + +typedef struct line_heights_s +{ + fz_context *ctx; + int cap; + int len; + line_height *lh; +} line_heights; + +static line_heights * +new_line_heights(fz_context *ctx) +{ + line_heights *lh = fz_malloc_struct(ctx, line_heights); + lh->ctx = ctx; + return lh; +} + +static void +insert_line_height(line_heights *lh, fz_text_style *style, float height) +{ + int i; + + /* If we have one already, add it in */ + for (i=0; i < lh->cap; i++) + { + /* Match if we are within 5% */ + if (lh->lh[i].style == style && lh->lh[i].height * 0.95 <= height && lh->lh[i].height * 1.05 >= height) + { + /* Ensure that the average height is correct */ + lh->lh[i].height = (lh->lh[i].height * lh->lh[i].count + height) / (lh->lh[i].count+1); + lh->lh[i].count++; + return; + } + } + + /* Otherwise extend (if required) and add it */ + if (lh->cap == lh->len) + { + int newcap = (lh->cap ? lh->cap * 2 : 4); + lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height)); + lh->cap = newcap; + } + + lh->lh[lh->len].count = 1; + lh->lh[lh->len].height = height; + lh->lh[lh->len].style = style; + lh->len++; +} + +static void +cull_line_heights(line_heights *lh) +{ + int i, j, k; + + for (i = 0; i < lh->len; i++) + { + fz_text_style *style = lh->lh[i].style; + int count = lh->lh[i].count; + int max = i; + + /* Find the max for this style */ + for (j = i+1; j < lh->len; j++) + { + if (lh->lh[j].style == style && lh->lh[j].count > count) + { + max = j; + count = lh->lh[j].count; + } + } + + /* Destroy all the ones other than the max */ + if (max != i) + { + lh->lh[i].count = count; + lh->lh[i].height = lh->lh[max].height; + lh->lh[max].count = 0; + } + j = i+1; + for (k = j; k < lh->len; k++) + { + if (lh->lh[k].style == style) + { + k++; + } + else + { + lh->lh[j++] = lh->lh[k]; + } + } + lh->len = j; + } +} + +static float +line_height_for_style(line_heights *lh, fz_text_style *style) +{ + int i; + + for (i=0; i < lh->len; i++) + { + if (lh->lh[i].style == style) + return lh->lh[i].height; + } + return 0.0; /* Never reached */ +} + +static void +split_block(fz_context *ctx, fz_text_page *page, int blocknum, int linenum) +{ + int split_len; + + if (page->len == page->cap) + { + int new_cap = fz_maxi(16, page->cap * 2); + page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks)); + page->cap = new_cap; + } + + memmove(page->blocks+blocknum+1, page->blocks+blocknum, (page->len - blocknum)*sizeof(*page->blocks)); + page->len++; + + split_len = page->blocks[blocknum].len - linenum; + page->blocks[blocknum+1].bbox = page->blocks[blocknum].bbox; /* FIXME! */ + page->blocks[blocknum+1].cap = 0; + page->blocks[blocknum+1].len = 0; + page->blocks[blocknum+1].lines = NULL; + page->blocks[blocknum+1].lines = fz_malloc_array(ctx, split_len, sizeof(fz_text_line)); + page->blocks[blocknum+1].cap = page->blocks[blocknum+1].len; + page->blocks[blocknum+1].len = split_len; + page->blocks[blocknum].len = linenum; + memcpy(page->blocks[blocknum+1].lines, page->blocks[blocknum].lines + linenum, split_len * sizeof(fz_text_line)); +} + +void +fz_text_analysis(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + fz_text_line *prev_line; + line_heights *lh; + int blocknum; + + /* Simple paragraph analysis; look for the most common 'inter line' + * spacing. This will be assumed to be our line spacing. Anything + * more than 25% wider than this will be assumed to be a paragraph + * space. */ + + /* Step 1: Gather the line height information */ + lh = new_line_heights(ctx); + prev_line = NULL; + for (block = page->blocks; block < page->blocks + page->len; block++) + { + for (line = block->lines; line < block->lines + block->len; line++) + { + /* In a line made up of several spans, find the tallest + * span. This line difference will count as being a + * difference in a line of that style. */ + fz_text_span *tallest_span = NULL; + float tallest = 0; + float span_height; + for (span = line->spans; span < line->spans + line->len; span++) + { + span_height = span->bbox.y1 - span->bbox.y0; + if (tallest_span == NULL || span_height > tallest) + { + tallest_span = span; + tallest = span_height; + } + } + if (prev_line) + { + /* Should really work on the baseline positions, + * but we don't have that at this stage. */ + float line_step = line->bbox.y1 - prev_line->bbox.y1; + if (line_step > 0) + { + insert_line_height(lh, tallest_span->style, line_step); + } + } + prev_line = line; + } + } + + /* Step 2: Find the most popular line height for each style */ + cull_line_heights(lh); + + /* Step 3: Run through the blocks, breaking each block into two if + * the line height isn't right. */ + prev_line = NULL; + for (blocknum = 0; blocknum < page->len; blocknum++) + { + block = &page->blocks[blocknum]; + for (line = block->lines; line < block->lines + block->len; line++) + { + /* In a line made up of several spans, find the tallest + * span. This line difference will count as being a + * difference in a line of that style. */ + fz_text_span *tallest_span = NULL; + float tallest = 0; + float span_height; + for (span = line->spans; span < line->spans + line->len; span++) + { + span_height = span->bbox.y1 - span->bbox.y0; + if (tallest_span == NULL || span_height > tallest) + { + tallest_span = span; + tallest = span_height; + } + } + if (prev_line) + { + float proper_step = line_height_for_style(lh, tallest_span->style); + float line_step = line->bbox.y1 - prev_line->bbox.y1; + if (proper_step * 0.95 > line_step || line_step > proper_step * 1.05) + { + split_block(ctx, page, block - page->blocks, line - block->lines); + prev_line = NULL; + break; + } + } + prev_line = line; + } + } +} |