From 0fcb8d9539931f1c0a5a19ff0eb1aca427856e18 Mon Sep 17 00:00:00 2001 From: Robin Watts Date: Tue, 5 Feb 2013 20:14:15 +0000 Subject: Rework text extraction structures. Rework the text extraction structures - the broad strokes are similar but we now hold more information at each stage to enable us to perform more detailed analysis on the structure of the page. We now hold: fz_text_char's (the position, ucs value, and style of each char). fz_text_span's (sets of chars that share the same baseline/transform, with no more than an expected amount of whitespace between each char). fz_text_line's (sets of spans that share the same baseline (more or less, allowing for super/subscript), but possibly with a larger than expected amount of whitespace). fz_text_block's (sets of lines that follow one another). After fz_text_analysis is called, we hope to have fz_text_blocks split such that each block is a paragraph. This new implementation has the same restrictions as the current implementation it replaces, namely that chars are only considered for addition onto the most recent span at the moment, but this revised form is designed to allow easier extension, and for this restriction to be lifted. Also add simple paragraph splitting based on finding the most common 'line distance' in blocks. When we add spans together to collate them into lines, we record the 'horizontal' and 'vertical' spacing between them. (Not actually horizontal or vertical, so much as 'in the direction of writing' and 'perpendicular to the direction of writing'). The 'horizontal' value enables us to more correctly output spaces when converting to (say) html later. The 'vertical' value enables us to spot subscripts and superscripts etc, as well as small changes in the baseline due to style changes. We are careful to base the baseline comparison on the baseline for the line, not the baseline for the previous span, as otherwise superscripts/subscripts on the end of the line affect what we match next. 
Also, we are less tolerant of vertical shifts after a large gap. This avoids false positives where different columns just happen to almost line up. --- android/jni/mupdf.c | 50 +++++++++++++++----------------------------------- 1 file changed, 15 insertions(+), 35 deletions(-) (limited to 'android') diff --git a/android/jni/mupdf.c b/android/jni/mupdf.c index 75626133..122fc636 100644 --- a/android/jni/mupdf.c +++ b/android/jni/mupdf.c @@ -886,45 +886,18 @@ JNI_FN(MuPDFCore_updatePageInternal)(JNIEnv *env, jobject thiz, jobject bitmap, return 1; } -static fz_text_char textcharat(fz_text_page *page, int idx) -{ - static fz_text_char emptychar = { {0,0,0,0}, ' ' }; - fz_text_block *block; - fz_text_line *line; - fz_text_span *span; - int ofs = 0; - for (block = page->blocks; block < page->blocks + page->len; block++) - { - for (line = block->lines; line < block->lines + block->len; line++) - { - for (span = line->spans; span < line->spans + line->len; span++) - { - if (idx < ofs + span->len) - return span->text[idx - ofs]; - /* pseudo-newline */ - if (span + 1 == line->spans + line->len) - { - if (idx == ofs + span->len) - return emptychar; - ofs++; - } - ofs += span->len; - } - } - } - return emptychar; -} - static int charat(fz_text_page *page, int idx) { - return textcharat(page, idx).c; + fz_char_and_box cab; + return fz_text_char_at(&cab, page, idx)->c; } static fz_rect bboxcharat(fz_text_page *page, int idx) { - return textcharat(page, idx).bbox; + fz_char_and_box cab; + return fz_text_char_at(&cab, page, idx)->bbox; } static int @@ -932,14 +905,17 @@ textlen(fz_text_page *page) { fz_text_block *block; fz_text_line *line; - fz_text_span *span; int len = 0; for (block = page->blocks; block < page->blocks + page->len; block++) { for (line = block->lines; line < block->lines + block->len; line++) { - for (span = line->spans; span < line->spans + line->len; span++) + int span_num; + for (span_num = 0; span_num < line->len; span_num++) + { + fz_text_span *span = 
line->spans[span_num]; len += span->len; + } len++; /* pseudo-newline */ } } @@ -1250,14 +1226,16 @@ JNI_FN(MuPDFCore_text)(JNIEnv * env, jobject thiz) for (s = 0; s < line->len; s++) { - fz_text_span *span = &line->spans[s]; + fz_text_span *span = line->spans[s]; jobjectArray *carr = (*env)->NewObjectArray(env, span->len, textCharClass, NULL); if (carr == NULL) fz_throw(ctx, "NewObjectArray failed"); for (c = 0; c < span->len; c++) { fz_text_char *ch = &span->text[c]; - jobject cobj = (*env)->NewObject(env, textCharClass, ctor, ch->bbox.x0, ch->bbox.y0, ch->bbox.x1, ch->bbox.y1, ch->c); + fz_rect bbox; + fz_text_char_bbox(&bbox, span, c); + jobject cobj = (*env)->NewObject(env, textCharClass, ctor, bbox.x0, bbox.y0, bbox.x1, bbox.y1, ch->c); if (cobj == NULL) fz_throw(ctx, "NewObjectfailed"); (*env)->SetObjectArrayElement(env, carr, c, cobj); @@ -1329,6 +1307,8 @@ JNI_FN(MuPDFCore_textAsHtml)(JNIEnv * env, jobject thiz) fz_free_device(dev); dev = NULL; + fz_text_analysis(ctx, sheet, text); + buf = fz_new_buffer(ctx, 256); out = fz_new_output_buffer(ctx, buf); fz_printf(out, "\n"); -- cgit v1.2.3