From 0fcb8d9539931f1c0a5a19ff0eb1aca427856e18 Mon Sep 17 00:00:00 2001
From: Robin Watts <robin.watts@artifex.com>
Date: Tue, 5 Feb 2013 20:14:15 +0000
Subject: Rework text extraction structures.

Rework the text extraction structures - the broad strokes are similar
but we now hold more information at each stage to enable us to perform
more detailed analysis on the structure of the page.

We now hold:

  fz_text_char's (the position, ucs value, and style of each char).

  fz_text_span's (sets of chars that share the same baseline/transform,
  with no more than an expected amount of whitespace between each char).

  fz_text_line's (sets of spans that share the same baseline (more or
  less, allowing for super/subscript), but possibly with a larger than
  expected amount of whitespace).

  fz_text_block's (sets of lines that follow one another).

After fz_text_analysis is called, we hope to have fz_text_blocks split
such that each block is a paragraph.

This new implementation has the same restrictions as the current
implementation it replaces, namely that chars are only considered for
addition onto the most recent span at the moment, but this revised form
is designed to allow easier extension, and for this restriction to
be lifted.

Also add simple paragraph splitting based on finding the most common
'line distance' in blocks.

When we add spans together to collate them into lines, we record the
'horizontal' and 'vertical' spacing between them. (Not actually
horizontal or vertical, so much as 'in the direction of writing' and
'perpendicular to the direction of writing').

The 'horizontal' value enables us to more correctly output spaces when
converting to (say) html later.

The 'vertical' value enables us to spot subscripts and superscripts etc,
as well as small changes in the baseline due to style changes. We are
careful to base the baseline comparison on the baseline for the line,
not the baseline for the previous span, as otherwise superscripts/
subscripts on the end of the line affect what we match next.

Also, we are less tolerant of vertical shifts after a large gap. This
avoids false positives where different columns just happen to almost
line up.
---
 android/jni/mupdf.c | 50 +++++++++++++++-----------------------------------
 1 file changed, 15 insertions(+), 35 deletions(-)

(limited to 'android')

diff --git a/android/jni/mupdf.c b/android/jni/mupdf.c
index 75626133..122fc636 100644
--- a/android/jni/mupdf.c
+++ b/android/jni/mupdf.c
@@ -886,45 +886,18 @@ JNI_FN(MuPDFCore_updatePageInternal)(JNIEnv *env, jobject thiz, jobject bitmap,
 	return 1;
 }
 
-static fz_text_char textcharat(fz_text_page *page, int idx)
-{
-	static fz_text_char emptychar = { {0,0,0,0}, ' ' };
-	fz_text_block *block;
-	fz_text_line *line;
-	fz_text_span *span;
-	int ofs = 0;
-	for (block = page->blocks; block < page->blocks + page->len; block++)
-	{
-		for (line = block->lines; line < block->lines + block->len; line++)
-		{
-			for (span = line->spans; span < line->spans + line->len; span++)
-			{
-				if (idx < ofs + span->len)
-					return span->text[idx - ofs];
-				/* pseudo-newline */
-				if (span + 1 == line->spans + line->len)
-				{
-					if (idx == ofs + span->len)
-						return emptychar;
-					ofs++;
-				}
-				ofs += span->len;
-			}
-		}
-	}
-	return emptychar;
-}
-
 static int
 charat(fz_text_page *page, int idx)
 {
-	return textcharat(page, idx).c;
+	fz_char_and_box cab;
+	return fz_text_char_at(&cab, page, idx)->c;
 }
 
 static fz_rect
 bboxcharat(fz_text_page *page, int idx)
 {
-	return textcharat(page, idx).bbox;
+	fz_char_and_box cab;
+	return fz_text_char_at(&cab, page, idx)->bbox;
 }
 
 static int
@@ -932,14 +905,17 @@ textlen(fz_text_page *page)
 {
 	fz_text_block *block;
 	fz_text_line *line;
-	fz_text_span *span;
 	int len = 0;
 	for (block = page->blocks; block < page->blocks + page->len; block++)
 	{
 		for (line = block->lines; line < block->lines + block->len; line++)
 		{
-			for (span = line->spans; span < line->spans + line->len; span++)
+			int span_num;
+			for (span_num = 0; span_num < line->len; span_num++)
+			{
+				fz_text_span *span = line->spans[span_num];
 				len += span->len;
+			}
 			len++; /* pseudo-newline */
 		}
 	}
@@ -1250,14 +1226,16 @@ JNI_FN(MuPDFCore_text)(JNIEnv * env, jobject thiz)
 
 				for (s = 0; s < line->len; s++)
 				{
-					fz_text_span *span = &line->spans[s];
+					fz_text_span *span = line->spans[s];
 					jobjectArray *carr = (*env)->NewObjectArray(env, span->len, textCharClass, NULL);
 					if (carr == NULL) fz_throw(ctx, "NewObjectArray failed");
 
 					for (c = 0; c < span->len; c++)
 					{
 						fz_text_char *ch = &span->text[c];
-						jobject cobj = (*env)->NewObject(env, textCharClass, ctor, ch->bbox.x0, ch->bbox.y0, ch->bbox.x1, ch->bbox.y1, ch->c);
+						fz_rect bbox;
+						fz_text_char_bbox(&bbox, span, c);
+						jobject cobj = (*env)->NewObject(env, textCharClass, ctor, bbox.x0, bbox.y0, bbox.x1, bbox.y1, ch->c);
 						if (cobj == NULL) fz_throw(ctx, "NewObjectfailed");
 
 						(*env)->SetObjectArrayElement(env, carr, c, cobj);
@@ -1329,6 +1307,8 @@ JNI_FN(MuPDFCore_textAsHtml)(JNIEnv * env, jobject thiz)
 		fz_free_device(dev);
 		dev = NULL;
 
+		fz_text_analysis(ctx, sheet, text);
+
 		buf = fz_new_buffer(ctx, 256);
 		out = fz_new_output_buffer(ctx, buf);
 		fz_printf(out, "<html>\n");
-- 
cgit v1.2.3