diff options
author | Tor Andersson <tor@ghostscript.com> | 2010-06-21 19:03:08 +0200 |
---|---|---|
committer | Tor Andersson <tor@ghostscript.com> | 2010-06-21 19:03:08 +0200 |
commit | b8ff5fa4b93f0f6e7f5cd25290f7719d58930796 (patch) | |
tree | 6effbade74368a1dfc2c03eae1ac6dab180dc4f2 /fitz | |
parent | bc74252a06621e4f3af7d5201f92a3486e7664d2 (diff) | |
download | mupdf-b8ff5fa4b93f0f6e7f5cd25290f7719d58930796.tar.xz |
Improve newline detection heuristic in text extraction device.
Diffstat (limited to 'fitz')
-rw-r--r-- | fitz/dev_text.c | 49 |
1 files changed, 30 insertions, 19 deletions
diff --git a/fitz/dev_text.c b/fitz/dev_text.c
index d107c50a..3e0c063e 100644
--- a/fitz/dev_text.c
+++ b/fitz/dev_text.c
@@ -234,15 +234,15 @@ fz_textextractspan(fz_textspan **last, fz_text *text, fz_matrix ctm, fz_point *p
 	fz_matrix tm = text->trm;
 	fz_matrix trm;
 	float size;
-	float dx, dy;
 	float adv;
 	fz_rect rect;
-	fz_point dir;
-	int i, err;
-	float cross, dist2;
+	fz_point dir, ndir;
+	fz_point delta, ndelta;
+	float dist, dot;
 	float ascender = 1;
 	float descender = 0;
 	int multi;
+	int i, err;
 
 	if (text->len == 0)
 		return;
@@ -273,6 +273,10 @@ fz_textextractspan(fz_textspan **last, fz_text *text, fz_matrix ctm, fz_point *p
 	tm.f = 0;
 	trm = fz_concat(tm, ctm);
 	dir = fz_transformvector(trm, dir);
+	dist = sqrtf(dir.x * dir.x + dir.y * dir.y);
+	ndir.x = dir.x / dist;
+	ndir.y = dir.y / dist;
+
 	size = fz_matrixexpansion(trm);
 
 	multi = 1;
@@ -293,29 +297,36 @@ fz_textextractspan(fz_textspan **last, fz_text *text, fz_matrix ctm, fz_point *p
 		tm.f = text->els[i].y;
 		trm = fz_concat(tm, ctm);
 
-		dx = pen->x - trm.e;
-		dy = pen->y - trm.f;
+		delta.x = pen->x - trm.e;
+		delta.y = pen->y - trm.f;
 		if (pen->x == -1 && pen->y == -1)
-			dx = dy = 0;
-		cross = fabsf(dx * dir.y - dy * dir.x);
-		dist2 = dx * dx + dy * dy;
+			delta.x = delta.y = 0;
+
+		dist = sqrtf(delta.x * delta.x + delta.y * delta.y);
 
 		/* Add space and newlines based on pen movement */
-		if (dist2 > size * size * 0.04f)
+		if (dist > size * 0.1f)
 		{
-			if (cross > 0.1f)
+			ndelta.x = delta.x / dist;
+			ndelta.y = delta.y / dist;
+			dot = ndelta.x * ndir.x + ndelta.y * ndir.y;
+
+			if (dist > size * 0.9f)
 			{
 				fz_addtextnewline(last, font, size, text->wmode);
 			}
-			else if (cross < 0.1f && dist2 > size * size * 0.04f)
+			else if (fabsf(dot) > 0.95f && dist > size * 0.1f)
 			{
-				fz_rect spacerect;
-				spacerect.x0 = -0.2f;
-				spacerect.y0 = 0;
-				spacerect.x1 = 0;
-				spacerect.y1 = 1;
-				spacerect = fz_transformrect(trm, spacerect);
-				fz_addtextchar(last, font, size, text->wmode, ' ', fz_roundrect(spacerect));
+				if ((*last)->len == 0 || (*last)->text[(*last)->len - 1].c != ' ')
+				{
+					fz_rect spacerect;
+					spacerect.x0 = -0.2f;
+					spacerect.y0 = 0;
+					spacerect.x1 = 0;
+					spacerect.y1 = 1;
+					spacerect = fz_transformrect(trm, spacerect);
+					fz_addtextchar(last, font, size, text->wmode, ' ', fz_roundrect(spacerect));
+				}
 			}
 		}
 