Rejig Bidirectional and Text code.

We move to using bidirectional "levels" throughout. This should give us better behaviour vis-a-vis nested l2r/r2l text. This also allows us to carry xps levels throughout with no loss of information, and avoids the need to special-case numbers. We accordingly carry more information into fz_text: as well as wmode, we also hold additional details about the text spans. We now include the directionality of the bidi level text (either as derived from the bidi code, or from the original document, e.g. xps), the directionality of the text (as specified in the original document, e.g. html), and the language of the text (if specified in the original document).
author: Robin Watts <Robin.Watts@artifex.com> 2016-03-02 08:03:53 -0800
committer: Robin Watts <robin.watts@artifex.com> 2016-03-11 11:57:48 +0000
commit: a3785935df081674d048655048984bcba09f8387 (patch)
tree: 31f6a63292d9f3d11be9c4b7003001c700b9b3c5 /source/fitz/bidi.c
parent: c5b80367dfcd3d3df09068b7f31119a400cfe241 (diff)
download: mupdf-a3785935df081674d048655048984bcba09f8387.tar.xz
1 files changed, 9 insertions, 85 deletions
diff --git a/source/fitz/bidi.c b/source/fitz/bidi.c
index 74548d4d..979d2f1c 100644
--- a/source/fitz/bidi.c
+++ b/source/fitz/bidi.c
@@ -170,47 +170,11 @@ static fz_bidi_chartype class_from_ch_n(uint32_t ch)
 	return from_ch_ws;
 }
 
-static int
-is_european_number(const uint32_t *str, unsigned int len)
-{
-	const uint32_t *end = str + len;
-
-	for ( ; str != end; str++)
-	{
-		const uint32_t u = *str;
-		if ((u >= UNICODE_RTL_START && u < UNICODE_ARABIC_INDIC_DIGIT_ZERO) ||
-			(u > UNICODE_ARABIC_INDIC_DIGIT_NINE && u < UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO) ||
-			(u > UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE && u <= UNICODE_RTL_END))
-		{
-			/* This is just a normal RTL character or accent */
-			return FALSE;
-		}
-		else if (!((u >= UNICODE_DIGIT_ZERO && u <= UNICODE_DIGIT_NINE) ||
-			(u == UNICODE_SUPERSCRIPT_TWO) ||
-			(u == UNICODE_SUPERSCRIPT_THREE) ||
-			(u == UNICODE_SUPERSCRIPT_ONE) ||
-			(u >= UNICODE_ARABIC_INDIC_DIGIT_ZERO && u <= UNICODE_ARABIC_INDIC_DIGIT_NINE) ||
-			(u >= UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO && u <= UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE) ||
-			(u == UNICODE_SUPERSCRIPT_ZERO) ||
-			(u >= UNICODE_SUPERSCRIPT_FOUR && u <= UNICODE_SUPERSCRIPT_NINE) ||
-			(u >= UNICODE_SUBSCRIPT_ZERO && u <= UNICODE_SUBSCRIPT_NINE) ||
-			(u >= UNICODE_CIRCLED_DIGIT_ONE && u <= UNICODE_NUMBER_TWENTY_FULL_STOP) ||
-			(u == UNICODE_CIRCLED_DIGIT_ZERO) ||
-			(u >= UNICODE_FULLWIDTH_DIGIT_ZERO && u <= UNICODE_FULLWIDTH_DIGIT_NINE) ||
-			(u == UNICODE_ZERO_WIDTH_NON_JOINER)))
-		{
-			return FALSE;
-		}
-	}
-	return TRUE;
-}
-
 /* Split fragments into single scripts (or punctation + single script) */
 static void
 split_at_script(const uint32_t *fragment,
 		size_t fragment_len,
-		int block_r2l,
-		int char_r2l,
+		int level,
 		void *arg,
 		fz_bidi_fragment_callback *callback)
 {
@@ -237,53 +201,17 @@ split_at_script(const uint32_t *fragment,
 		else
 		{
 			/* Change of script. Break the fragment. */
-			(*callback)(&fragment[script_start], i - script_start, block_r2l, char_r2l, script, arg);
+			(*callback)(&fragment[script_start], i - script_start, level, script, arg);
 			script_start = i+1;
 			script = s;
 		}
 	}
 	if (script_start != fragment_len)
 	{
-		(*callback)(&fragment[script_start], fragment_len - script_start, block_r2l, char_r2l, script, arg);
+		(*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
 	}
 }
 
-static void
-detect_numbers(const uint32_t *fragment,
-		size_t fragment_len,
-		size_t start,
-		size_t end,
-		const fz_bidi_level *levels,
-		void *arg,
-		fz_bidi_fragment_callback *callback)
-{
-	int block_r2l = ODD(levels[start]);
-	int char_r2l = block_r2l;
-
-	/* Check to see if we've got a number. Numbers should
-	 * never be block_r2l, so we can avoid the test. */
-	if (block_r2l || !is_european_number(&fragment[start], end-start))
-	{
-		/* No number, just split as normal */
-		split_at_script(&fragment[start],
-				end-start,
-				block_r2l,
-				char_r2l,
-				arg,
-				callback);
-		return;
-	}
-
-	/* We have a number. We have to check to see whether this
-	 * should be handled as a block_r2l thing. */
-	if (start != 0)
-		block_r2l = ODD(levels[start-1]);
-	if (block_r2l && end != fragment_len)
-		block_r2l = ODD(levels[end]);
-
-	split_at_script(&fragment[start], end-start, block_r2l, char_r2l, arg, callback);
-}
-
 /* Determines the character classes for all following
  * passes of the algorithm. A character class is basically the type of Bidi
  * behaviour that the character exhibits.
@@ -614,11 +542,9 @@ void fz_bidi_fragment_text(fz_context *ctx,
 				 * Create a text object for it, then start
 				 * a new fragment.
 				 */
-				detect_numbers(text,
-						textlen,
-						startOfFragment,
-						i,
-						levels,
+				split_at_script(&text[startOfFragment],
+						i - startOfFragment,
+						levels[startOfFragment],
 						arg,
 						callback);
 				startOfFragment = i;
@@ -626,11 +552,9 @@ void fz_bidi_fragment_text(fz_context *ctx,
 		}
 		/* Now i == textlen. Deal with the final (or maybe only) fragment. */
 		/* otherwise create 1 fragment */
-		detect_numbers(text,
-				textlen,
-				startOfFragment,
-				i,
-				levels,
+		split_at_script(&text[startOfFragment],
+				i - startOfFragment,
+				levels[startOfFragment],
 				arg,
 				callback);
 	}
author	Robin Watts <Robin.Watts@artifex.com>	2016-03-02 08:03:53 -0800
committer	Robin Watts <robin.watts@artifex.com>	2016-03-11 11:57:48 +0000
commit	a3785935df081674d048655048984bcba09f8387 (patch)
tree	31f6a63292d9f3d11be9c4b7003001c700b9b3c5 /source/fitz/bidi.c
parent	c5b80367dfcd3d3df09068b7f31119a400cfe241 (diff)
download	mupdf-a3785935df081674d048655048984bcba09f8387.tar.xz