Rework text extraction structures.

Rework the text extraction structures - the broad strokes are similar but we now hold more information at each stage to enable us to perform more detailed analysis on the structure of the page. We now hold: fz_text_char's (the position, ucs value, and style of each char). fz_text_span's (sets of chars that share the same baseline/transform, with no more than an expected amount of whitespace between each char). fz_text_line's (sets of spans that share the same baseline (more or less, allowing for super/subscript), but possibly with a larger than expected amount of whitespace). fz_text_block's (sets of lines that follow one another). After fz_text_analysis is called, we hope to have fz_text_blocks split such that each block is a paragraph. This new implementation has the same restrictions as the current implementation it replaces, namely that chars are only considered for addition onto the most recent span at the moment, but this revised form is designed to allow easier extension, and for this restriction to be lifted. Also add simple paragraph splitting based on finding the most common 'line distance' in blocks. When we add spans together to collate them into lines, we record the 'horizontal' and 'vertical' spacing between them. (Not actually horizontal or vertical, so much as 'in the direction of writing' and 'perpendicular to the direction of writing'.) The 'horizontal' value enables us to more correctly output spaces when converting to (say) html later. The 'vertical' value enables us to spot subscripts and superscripts etc, as well as small changes in the baseline due to style changes. We are careful to base the baseline comparison on the baseline for the line, not the baseline for the previous span, as otherwise superscripts/subscripts on the end of the line affect what we match next. Also, we are less tolerant of vertical shifts after a large gap. This avoids false positives where different columns just happen to almost line up.
author: Robin Watts <robin.watts@artifex.com> 2013-02-05 20:14:15 +0000
committer: Robin Watts <robin.watts@artifex.com> 2013-03-26 12:29:59 +0000
commit: 0fcb8d9539931f1c0a5a19ff0eb1aca427856e18 (patch)
tree: 7083f7e3e16b94ef1c85e1032c7a761c3a9002de /apps/pdfapp.c
parent: 62f1176aca1cf99e0d1135193c32b2e160aade8b (diff)
download: mupdf-0fcb8d9539931f1c0a5a19ff0eb1aca427856e18.tar.xz
1 files changed, 14 insertions, 37 deletions
diff --git a/apps/pdfapp.c b/apps/pdfapp.c
index cdb110c6..99cb386b 100644
--- a/apps/pdfapp.c
+++ b/apps/pdfapp.c
@@ -759,47 +759,21 @@ void pdfapp_gotopage(pdfapp_t *app, int number)
 	pdfapp_showpage(app, 1, 1, 1, 0);
 }
 
-static fz_text_char textcharat(fz_text_page *page, int idx)
-{
-	static fz_text_char emptychar = { {0,0,0,0}, ' ' };
-	fz_text_block *block;
-	fz_text_line *line;
-	fz_text_span *span;
-	int ofs = 0;
-	for (block = page->blocks; block < page->blocks + page->len; block++)
-	{
-		for (line = block->lines; line < block->lines + block->len; line++)
-		{
-			for (span = line->spans; span < line->spans + line->len; span++)
-			{
-				if (idx < ofs + span->len)
-					return span->text[idx - ofs];
-				/* pseudo-newline */
-				if (span + 1 == line->spans + line->len)
-				{
-					if (idx == ofs + span->len)
-						return emptychar;
-					ofs++;
-				}
-				ofs += span->len;
-			}
-		}
-	}
-	return emptychar;
-}
-
 static int textlen(fz_text_page *page)
 {
 	fz_text_block *block;
 	fz_text_line *line;
-	fz_text_span *span;
 	int len = 0;
 	for (block = page->blocks; block < page->blocks + page->len; block++)
 	{
 		for (line = block->lines; line < block->lines + block->len; line++)
 		{
-			for (span = line->spans; span < line->spans + line->len; span++)
+			int span_num;
+			for (span_num = 0; span_num < line->len; span_num++)
+			{
+				fz_text_span *span = line->spans[span_num];
 				len += span->len;
+			}
 			len++; /* pseudo-newline */
 		}
 	}
@@ -808,12 +782,14 @@ static int textlen(fz_text_page *page)
 
 static inline int charat(fz_text_page *page, int idx)
 {
-	return textcharat(page, idx).c;
+	fz_char_and_box cab;
+	return fz_text_char_at(&cab, page, idx)->c;
 }
 
 static inline fz_rect bboxcharat(fz_text_page *page, int idx)
 {
-	return textcharat(page, idx).bbox;
+	fz_char_and_box cab;
+	return fz_text_char_at(&cab, page, idx)->bbox;
 }
 
 void pdfapp_inverthit(pdfapp_t *app)
@@ -1619,7 +1595,6 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
 	fz_text_page *page = app->page_text;
 	fz_text_block *block;
 	fz_text_line *line;
-	fz_text_span *span;
 	int c, i, p;
 	int seen = 0;
 
@@ -1636,8 +1611,10 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
 	{
 		for (line = block->lines; line < block->lines + block->len; line++)
 		{
-			for (span = line->spans; span < line->spans + line->len; span++)
+			int span_num;
+			for (span_num = 0; span_num < line->len; span_num++)
 			{
+				fz_text_span *span = line->spans[span_num];
 				if (seen)
 				{
 #ifdef _WIN32
@@ -1652,7 +1629,7 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
 
 				for (i = 0; i < span->len; i++)
 				{
-					hitbox = span->text[i].bbox;
+					fz_text_char_bbox(&hitbox, span, i);
 					fz_transform_rect(&hitbox, &ctm);
 					c = span->text[i].c;
 					if (c < 32)
@@ -1665,7 +1642,7 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
 					}
 				}
 
-				seen = (seen && span + 1 == line->spans + line->len);
+				seen = (seen && span_num + 1 == line->len);
 			}
 		}
 	}
author	Robin Watts <robin.watts@artifex.com>	2013-02-05 20:14:15 +0000
committer	Robin Watts <robin.watts@artifex.com>	2013-03-26 12:29:59 +0000
commit	0fcb8d9539931f1c0a5a19ff0eb1aca427856e18 (patch)
tree	7083f7e3e16b94ef1c85e1032c7a761c3a9002de /apps/pdfapp.c
parent	62f1176aca1cf99e0d1135193c32b2e160aade8b (diff)
download	mupdf-0fcb8d9539931f1c0a5a19ff0eb1aca427856e18.tar.xz