diff options
author | Robin Watts <robin.watts@artifex.com> | 2013-02-05 20:14:15 +0000 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2013-03-26 12:29:59 +0000 |
commit | 0fcb8d9539931f1c0a5a19ff0eb1aca427856e18 (patch) | |
tree | 7083f7e3e16b94ef1c85e1032c7a761c3a9002de /android | |
parent | 62f1176aca1cf99e0d1135193c32b2e160aade8b (diff) | |
download | mupdf-0fcb8d9539931f1c0a5a19ff0eb1aca427856e18.tar.xz |
Rework text extraction structures.
Rework the text extraction structures - the broad strokes are similar
but we now hold more information at each stage to enable us to perform
more detailed analysis on the structure of the page.
We now hold:
fz_text_char's (the position, ucs value, and style of each char).
fz_text_span's (sets of chars that share the same baseline/transform,
with no more than an expected amount of whitespace between each char).
fz_text_line's (sets of spans that share the same baseline (more or
less, allowing for super/subscript), but possibly with a larger than
expected amount of whitespace).
fz_text_block's (sets of lines that follow one another).
After fz_text_analysis is called, we hope to have fz_text_blocks split
such that each block is a paragraph.
This new implementation has the same restrictions as the current
implementation it replaces, namely that chars are only considered for
addition onto the most recent span at the moment, but this revised form
is designed to allow easier extension, and for this restriction to
be lifted.
Also add simple paragraph splitting based on finding the most common
'line distance' in blocks.
When we add spans together to collate them into lines, we record the
'horizontal' and 'vertical' spacing between them. (Not actually
horizontal or vertical, so much as 'in the direction of writing' and
'perpendicular to the direction of writing').
The 'horizontal' value enables us to more correctly output spaces when
converting to (say) html later.
The 'vertical' value enables us to spot subscripts and superscripts etc,
as well as small changes in the baseline due to style changes. We are
careful to base the baseline comparison on the baseline for the line,
not the baseline for the previous span, as otherwise superscripts/
subscripts on the end of the line affect what we match next.
Also, we are less tolerant of vertical shifts after a large gap. This
avoids false positives where different columns just happen to almost
line up.
Diffstat (limited to 'android')
-rw-r--r-- | android/jni/mupdf.c | 50 |
1 files changed, 15 insertions, 35 deletions
diff --git a/android/jni/mupdf.c b/android/jni/mupdf.c index 75626133..122fc636 100644 --- a/android/jni/mupdf.c +++ b/android/jni/mupdf.c @@ -886,45 +886,18 @@ JNI_FN(MuPDFCore_updatePageInternal)(JNIEnv *env, jobject thiz, jobject bitmap, return 1; } -static fz_text_char textcharat(fz_text_page *page, int idx) -{ - static fz_text_char emptychar = { {0,0,0,0}, ' ' }; - fz_text_block *block; - fz_text_line *line; - fz_text_span *span; - int ofs = 0; - for (block = page->blocks; block < page->blocks + page->len; block++) - { - for (line = block->lines; line < block->lines + block->len; line++) - { - for (span = line->spans; span < line->spans + line->len; span++) - { - if (idx < ofs + span->len) - return span->text[idx - ofs]; - /* pseudo-newline */ - if (span + 1 == line->spans + line->len) - { - if (idx == ofs + span->len) - return emptychar; - ofs++; - } - ofs += span->len; - } - } - } - return emptychar; -} - static int charat(fz_text_page *page, int idx) { - return textcharat(page, idx).c; + fz_char_and_box cab; + return fz_text_char_at(&cab, page, idx)->c; } static fz_rect bboxcharat(fz_text_page *page, int idx) { - return textcharat(page, idx).bbox; + fz_char_and_box cab; + return fz_text_char_at(&cab, page, idx)->bbox; } static int @@ -932,14 +905,17 @@ textlen(fz_text_page *page) { fz_text_block *block; fz_text_line *line; - fz_text_span *span; int len = 0; for (block = page->blocks; block < page->blocks + page->len; block++) { for (line = block->lines; line < block->lines + block->len; line++) { - for (span = line->spans; span < line->spans + line->len; span++) + int span_num; + for (span_num = 0; span_num < line->len; span_num++) + { + fz_text_span *span = line->spans[span_num]; len += span->len; + } len++; /* pseudo-newline */ } } @@ -1250,14 +1226,16 @@ JNI_FN(MuPDFCore_text)(JNIEnv * env, jobject thiz) for (s = 0; s < line->len; s++) { - fz_text_span *span = &line->spans[s]; + fz_text_span *span = line->spans[s]; jobjectArray *carr = 
(*env)->NewObjectArray(env, span->len, textCharClass, NULL); if (carr == NULL) fz_throw(ctx, "NewObjectArray failed"); for (c = 0; c < span->len; c++) { fz_text_char *ch = &span->text[c]; - jobject cobj = (*env)->NewObject(env, textCharClass, ctor, ch->bbox.x0, ch->bbox.y0, ch->bbox.x1, ch->bbox.y1, ch->c); + fz_rect bbox; + fz_text_char_bbox(&bbox, span, c); + jobject cobj = (*env)->NewObject(env, textCharClass, ctor, bbox.x0, bbox.y0, bbox.x1, bbox.y1, ch->c); if (cobj == NULL) fz_throw(ctx, "NewObjectfailed"); (*env)->SetObjectArrayElement(env, carr, c, cobj); @@ -1329,6 +1307,8 @@ JNI_FN(MuPDFCore_textAsHtml)(JNIEnv * env, jobject thiz) fz_free_device(dev); dev = NULL; + fz_text_analysis(ctx, sheet, text); + buf = fz_new_buffer(ctx, 256); out = fz_new_output_buffer(ctx, buf); fz_printf(out, "<html>\n"); |