6 files changed, 522 insertions, 211 deletions
diff --git a/include/mupdf/fitz/bidi.h b/include/mupdf/fitz/bidi.h
index dfa439f3..8428ffc1 100644
--- a/include/mupdf/fitz/bidi.h
+++ b/include/mupdf/fitz/bidi.h
@@ -10,8 +10,6 @@
  * Processes Unicode text by arranging the characters into an order suitable
  * for display. E.g. Hebrew text will be arranged from right-to-left and
  * any English within the text will remain in the left-to-right order.
- * Characters such as parenthesis will be substituted for their mirrored
- * equivalents if they are part of text which must be reversed.
  *
  * This is an implementation of the Unicode Bidirectional Algorithm which
  * can be found here: http://www.unicode.org/reports/tr9/ and is based
@@ -55,14 +53,15 @@ enum
  *				as right-to-left
  * @param	char_r2l	true if characters within block should be laid out
  *				as right-to-left
- * @param	mirror		The mirror code of the fragment if it exists
+ * @param       script          the script in use for this fragment (other than common
+ *                              or inherited)
  * @param	arg		data from caller of Bidi_fragmentText
  */
 typedef void (fz_bidi_fragment_callback)(const uint32_t *fragment,
 					size_t fragmentLen,
 					int block_r2l,
 					int char_r2l,
-					uint32_t mirror,
+					int script,
 					void *arg);
 
 /**
diff --git a/include/mupdf/fitz/font.h b/include/mupdf/fitz/font.h
index 47866f9e..0de3e00e 100644
--- a/include/mupdf/fitz/font.h
+++ b/include/mupdf/fitz/font.h
@@ -68,6 +68,9 @@ struct fz_font_s
 
 	/* cached encoding lookup */
 	uint16_t *encoding_cache[256];
+
+	/* Shaping information */
+	void *shaper;
 };
 
 /* common CJK font collections */
diff --git a/include/mupdf/html.h b/include/mupdf/html.h
index 4969e15c..c734ea0d 100644
--- a/include/mupdf/html.h
+++ b/include/mupdf/html.h
@@ -221,11 +221,27 @@ enum
  */
 struct fz_html_flow_s
 {
+	/* What type of node */
 	unsigned int type : 2;
+
+	/* Whether this should expand during justification */
 	unsigned int expand : 1;
+
+	/* Whether the chars should be laid out r2l or l2r */
 	unsigned int char_r2l : 1;
+
+	/* Whether this block should stack with its neighbours r2l or l2r */
 	unsigned int block_r2l : 1;
-	unsigned int mirror : 1;
+
+	/* Whether the markup specifies a given direction. */
+	unsigned int markup_r2l : 2;
+
+	/* Whether the markup specifies a given language. */
+	unsigned int markup_lang : 8;
+
+	/* The script detected by the bidi code. */
+	unsigned int script : 8;
+
 	float x, y, w, h, em;
 	fz_css_style *style;
 	union {
diff --git a/source/fitz/bidi.c b/source/fitz/bidi.c
index e711e705..74548d4d 100644
--- a/source/fitz/bidi.c
+++ b/source/fitz/bidi.c
@@ -205,76 +205,83 @@ is_european_number(const uint32_t *str, unsigned int len)
 	return TRUE;
 }
 
+/* Split fragments into single scripts (or punctation + single script) */
 static void
-do_callback(const uint32_t *fragment,
+split_at_script(const uint32_t *fragment,
 		size_t fragment_len,
 		int block_r2l,
-		uint32_t mirror,
+		int char_r2l,
 		void *arg,
 		fz_bidi_fragment_callback *callback)
 {
-	char char_r2l = block_r2l;
+	int script = UCDN_SCRIPT_COMMON;
+	size_t script_start, i;
 
-	char_r2l = block_r2l && !is_european_number(fragment, fragment_len);
-
-	(*callback)(fragment, fragment_len, block_r2l, char_r2l, mirror, arg);
-}
-
-/* Searches a RTL fragment for a mirror character
- * When it finds one it creates a separate fragment for the
- * character and the surrounding fragments. It passes the mirrored
- * character back through the callback.
- */
-static void
-create_fragment_mirrors(const uint32_t *text,
-		int len,
-		fz_bidi_fragment_callback *callback,
-		void *arg)
-{
-	int i;
-	int lastPtr;
-	uint32_t mirror;
-
-	assert(text != NULL);
-	assert(len > 0);
-	lastPtr = 0;
-	for (i = 0; i < len; i ++)
+	script_start = 0;
+	for (i = 0; i < fragment_len; i++)
 	{
-		mirror = ucdn_mirror(text[i]);
-		if (mirror != UNICODE_EOS)
+		int s = ucdn_get_script(fragment[i]);
+		if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED)
 		{
-			/* create preceding fragment */
-			if (i > lastPtr)
-			{
-				do_callback(&text[lastPtr],
-						i - lastPtr,
-						TRUE,
-						UNICODE_EOS,
-						arg,
-						callback);
-				DBUGVF(("create mirror fragment for %x\n",(int)text[i]));
-			}
-			/* create mirror fragment */
-			do_callback(&text[i],
-					1,
-					TRUE,
-					mirror,
-					arg,
-					callback);
-			lastPtr = i + 1;
+			/* Punctuation etc. This is fine. */
+		}
+		else if (s == script)
+		{
+			/* Same script. Still fine. */
+		}
+		else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED)
+		{
+			/* First non punctuation thing. Set the script. */
+			script = s;
 		}
+		else
+		{
+			/* Change of script. Break the fragment. */
+			(*callback)(&fragment[script_start], i - script_start, block_r2l, char_r2l, script, arg);
+			script_start = i+1;
+			script = s;
+		}
+	}
+	if (script_start != fragment_len)
+	{
+		(*callback)(&fragment[script_start], fragment_len - script_start, block_r2l, char_r2l, script, arg);
 	}
+}
+
+static void
+detect_numbers(const uint32_t *fragment,
+		size_t fragment_len,
+		size_t start,
+		size_t end,
+		const fz_bidi_level *levels,
+		void *arg,
+		fz_bidi_fragment_callback *callback)
+{
+	int block_r2l = ODD(levels[start]);
+	int char_r2l = block_r2l;
 
-	if (lastPtr < len)
+	/* Check to see if we've got a number. Numbers should
+	 * never be block_r2l, so we can avoid the test. */
+	if (block_r2l || !is_european_number(&fragment[start], end-start))
 	{
-		/* create end fragment */
-		do_callback(&text[lastPtr],
-				len - lastPtr,
-				TRUE,
-				UNICODE_EOS,
+		/* No number, just split as normal */
+		split_at_script(&fragment[start],
+				end-start,
+				block_r2l,
+				char_r2l,
 				arg,
 				callback);
+		return;
 	}
+
+	/* We have a number. We have to check to see whether this
+	 * should be handled as a block_r2l thing. */
+	if (start != 0)
+		block_r2l = ODD(levels[start-1]);
+	if (block_r2l && end != fragment_len)
+		block_r2l = ODD(levels[end]);
+
+	split_at_script(&fragment[start], end-start, block_r2l, char_r2l, arg, callback);
 }
 
 /* Determines the character classes for all following
@@ -607,45 +614,25 @@ void fz_bidi_fragment_text(fz_context *ctx,
 				 * Create a text object for it, then start
 				 * a new fragment.
 				 */
-				if (ODD(levels[startOfFragment]) != 0)
-				{
-					/* if RTL check for mirrors and create sub-frags */
-					create_fragment_mirrors(&text[startOfFragment],
-							i - startOfFragment,
-							callback,
-							arg);
-				}
-				else
-				{
-					do_callback(&text[startOfFragment],
-							i - startOfFragment,
-							ODD(levels[startOfFragment]),
-							UNICODE_EOS,
-							arg,
-							callback);
-				}
+				detect_numbers(text,
+						textlen,
+						startOfFragment,
+						i,
+						levels,
+						arg,
+						callback);
 				startOfFragment = i;
 			}
 		}
 		/* Now i == textlen. Deal with the final (or maybe only) fragment. */
-		if (ODD(levels[startOfFragment]) != 0)
-		{
-			/* if RTL check for mirrors and create sub-frags */
-			create_fragment_mirrors(&text[startOfFragment],
-					i - startOfFragment,
-					callback,
-					arg);
-		}
-		else
-		{
-			/* otherwise create 1 fragment */
-			do_callback(&text[startOfFragment],
-					i - startOfFragment,
-					ODD(levels[startOfFragment]),
-					UNICODE_EOS,
-					arg,
-					callback);
-		}
+		/* otherwise create 1 fragment */
+		detect_numbers(text,
+				textlen,
+				startOfFragment,
+				i,
+				levels,
+				arg,
+				callback);
 	}
 	fz_always(ctx)
 	{
diff --git a/source/fitz/font.c b/source/fitz/font.c
index a7db05d4..e7dbc827 100644
--- a/source/fitz/font.c
+++ b/source/fitz/font.c
@@ -1,6 +1,9 @@
 #include "mupdf/fitz.h"
 
 #include <ft2build.h>
+#include "hb.h"
+#include "hb-ft.h"
+
 #include FT_FREETYPE_H
 #include FT_ADVANCES_H
 #include FT_STROKER_H
@@ -150,6 +153,7 @@ fz_drop_font(fz_context *ctx, fz_font *font)
 	fz_free(ctx, font->bbox_table);
 	fz_free(ctx, font->width_table);
 	fz_free(ctx, font->advance_cache);
+	hb_font_destroy(font->shaper);
 	fz_free(ctx, font);
 }
 
diff --git a/source/html/html-layout.c b/source/html/html-layout.c
index 3f6fe919..259dcbc5 100644
--- a/source/html/html-layout.c
+++ b/source/html/html-layout.c
@@ -1,5 +1,11 @@
 #include "mupdf/html.h"
 
+#include "hb.h"
+#include "hb-ft.h"
+#include <ft2build.h>
+
+#undef DEBUG_HARFBUZZ
+
 enum { T, R, B, L };
 
 static const char *default_css =
@@ -78,9 +84,9 @@ static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html *top, fz_c
 	fz_html_flow *flow = fz_pool_alloc(ctx, pool, sizeof *flow);
 	flow->type = type;
 	flow->expand = 0;
-	flow->char_r2l = 0;
-	flow->block_r2l = 0;
-	flow->mirror = 0;
+	flow->char_r2l = BIDI_LEFT_TO_RIGHT;
+	flow->block_r2l = BIDI_RIGHT_TO_LEFT;
+	flow->markup_r2l = BIDI_NEUTRAL;
 	flow->style = style;
 	*top->flow_tail = flow;
 	top->flow_tail = &flow->next;
@@ -536,27 +542,102 @@ static void measure_image(fz_context *ctx, fz_html_flow *node, float max_w, floa
 	node->h = node->content.image->h * s;
 }
 
-static void measure_word(fz_context *ctx, fz_html_flow *node, float em)
+static void measure_word(fz_context *ctx, fz_html_flow *node, float em, hb_buffer_t *hb_buf)
 {
-	fz_font *font;
-	const char *s;
-	int c, g;
-	float w;
+	fz_font *font, *next_font;
+	hb_glyph_position_t *glyph_pos;
+	unsigned int glyph_count, i;
+	int max_x, x;
+	const char *s, *start, *end;
+	FT_Face face;
+	int fterr;
+	int scale;
 
 	em = fz_from_css_number(node->style->font_size, em, em);
 	node->x = 0;
 	node->y = 0;
+	node->w = 0;
 	node->h = fz_from_css_number_scale(node->style->line_height, em, em, em);
 
-	w = 0;
-	s = node->content.text;
-	while (*s)
+	start = end = s = node->content.text;
+	font = NULL;
+	while (*start)
 	{
-		s += fz_chartorune(&c, s);
-		g = fz_encode_character_with_fallback(ctx, node->style->font, c, 0, &font);
-		w += fz_advance_glyph(ctx, font, g) * em;
+		/* Run through the string, encoding chars until we find one
+		 * that requires a different fallback font. */
+		while (*s)
+		{
+			int c;
+
+			s += fz_chartorune(&c, s);
+			(void)fz_encode_character_with_fallback(ctx, node->style->font, c, node->script, &next_font);
+			if (next_font != font)
+			{
+				if (font != NULL)
+					break;
+				font = next_font;
+			}
+			end = s;
+		}
+
+		fz_try(ctx)
+		{
+			hb_lock(ctx);
+
+			/* So, shape from start to end in font */
+			face = font->ft_face;
+			scale = face->units_per_EM;
+			fterr = FT_Set_Char_Size(face, scale, scale, 72, 72);
+			if (fterr)
+				fz_throw(ctx, FZ_ERROR_GENERIC, "Failure sizing font (%d)", fterr);
+
+			if (font->shaper == NULL)
+				font->shaper = (void *)hb_ft_font_create(face, NULL);
+
+			hb_buffer_clear_contents(hb_buf);
+			hb_buffer_set_direction(hb_buf, node->char_r2l ? HB_DIRECTION_RTL : HB_DIRECTION_LTR);
+			/* We don't know script or language, so leave them blank */
+			/* hb_buffer_set_script(hb_buf, HB_SCRIPT_LATIN); */
+			/* hb_buffer_set_language(hb_buf, hb_language_from_string("en", strlen("en"))); */
+
+			/* First put the text content into a harfbuzz buffer
+			 * labelled with the position within the word. */
+			hb_buffer_add_utf8(hb_buf, start, end - start, 0, -1);
+			hb_buffer_guess_segment_properties(hb_buf);
+
+			/* Now shape that buffer */
+			hb_shape(font->shaper, hb_buf, NULL, 0);
+
+			glyph_pos = hb_buffer_get_glyph_positions(hb_buf, &glyph_count);
+		}
+		fz_always(ctx)
+		{
+			hb_unlock(ctx);
+		}
+		fz_catch(ctx)
+		{
+			fz_rethrow(ctx);
+		}
+
+		max_x = 0;
+		x = 0;
+		for (i = 0; i < glyph_count; i++)
+		{
+			int lx;
+
+			x += glyph_pos[i].x_advance;
+			lx = x + glyph_pos[i].x_offset;
+			if (lx > max_x)
+				max_x = lx;
+		}
+
+		start = end;
+		end = s;
+		font = next_font;
+
+		node->w += max_x * em / scale;
 	}
-	node->w = w;
+
 	node->em = em;
 }
 
@@ -586,7 +667,7 @@ static float measure_line(fz_html_flow *node, fz_html_flow *end, float *baseline
 	return h;
 }
 
-static void layout_line(fz_context *ctx, float indent, float page_w, float line_w, int align, fz_html_flow *node, fz_html_flow *end, fz_html *box, float baseline)
+static void layout_line(fz_context *ctx, float indent, float page_w, float line_w, int align, fz_html_flow *start, fz_html_flow *end, fz_html *box, float baseline)
 {
 	float x = box->x + indent;
 	float y = box->y + box->h;
@@ -594,7 +675,8 @@ static void layout_line(fz_context *ctx, float indent, float page_w, float line_
 	float justify = 0;
 	float va;
 	int n = 0;
-	fz_html_flow *start, *mid;
+	fz_html_flow *node = start;
+	fz_html_flow *mid;
 
 	if (align == TA_JUSTIFY)
 	{
@@ -613,8 +695,7 @@ static void layout_line(fz_context *ctx, float indent, float page_w, float line_
 	/* We have the invariants that 1) start...mid are always laid out
 	 * correctly and 2) mid..node are the most recent set of right to left
 	 * blocks. */
-	start = node;
-	mid = node;
+	mid = start;
 	while (node != end)
 	{
 		float w = node->w + (node->type == FLOW_GLUE && node->expand ? justify : 0);
@@ -691,7 +772,7 @@ static void flush_line(fz_context *ctx, fz_html *box, float page_h, float page_w
 	box->h += line_h;
 }
 
-static void layout_flow(fz_context *ctx, fz_html *box, fz_html *top, float em, float page_h)
+static void layout_flow(fz_context *ctx, fz_html *box, fz_html *top, float em, float page_h, hb_buffer_t *hb_buf)
 {
 	fz_html_flow *node, *line, *mark;
 	float line_w;
@@ -729,7 +810,7 @@ static void layout_flow(fz_context *ctx, fz_html *box, fz_html *top, float em, f
 		}
 		else
 		{
-			measure_word(ctx, node, em);
+			measure_word(ctx, node, em, hb_buf);
 		}
 	}
 
@@ -793,7 +874,7 @@ static void layout_flow(fz_context *ctx, fz_html *box, fz_html *top, float em, f
 	}
 }
 
-static float layout_block(fz_context *ctx, fz_html *box, fz_html *top, float em, float page_h, float vertical)
+static float layout_block(fz_context *ctx, fz_html *box, fz_html *top, float em, float page_h, float vertical, hb_buffer_t *hb_buf)
 {
 	fz_html *child;
 	int first;
@@ -841,7 +922,7 @@ static float layout_block(fz_context *ctx, fz_html *box, fz_html *top, float em,
 	{
 		if (child->type == BOX_BLOCK)
 		{
-			vertical = layout_block(ctx, child, box, em, page_h, vertical);
+			vertical = layout_block(ctx, child, box, em, page_h, vertical, hb_buf);
 			if (first)
 			{
 				/* move collapsed parent/child top margins to parent */
@@ -863,7 +944,7 @@ static float layout_block(fz_context *ctx, fz_html *box, fz_html *top, float em,
 		}
 		else if (child->type == BOX_FLOW)
 		{
-			layout_flow(ctx, child, box, em, page_h);
+			layout_flow(ctx, child, box, em, page_h, hb_buf);
 			if (child->h > 0)
 			{
 				box->h += child->h;
@@ -897,15 +978,23 @@ static float layout_block(fz_context *ctx, fz_html *box, fz_html *top, float em,
 	return vertical;
 }
 
-static void draw_flow_box(fz_context *ctx, fz_html *box, float page_top, float page_bot, fz_device *dev, const fz_matrix *ctm)
+static void draw_flow_box(fz_context *ctx, fz_html *box, float page_top, float page_bot, fz_device *dev, const fz_matrix *ctm, hb_buffer_t *hb_buf)
 {
-	fz_font *font;
+	fz_font *font, *next_font;
 	fz_html_flow *node;
 	fz_text *text;
 	fz_matrix trm;
 	const char *s;
+	const char *t;
+	const char *start;
+	const char *end;
 	float color[3];
-	int c, g;
+	int c, scale, fterr;
+	float node_scale;
+	FT_Face face;
+	float w, lx, ly;
+
+	/* FIXME: HB_DIRECTION_TTB? */
 
 	for (node = box->flow_head; node; node = node->next)
 	{
@@ -922,6 +1011,15 @@ static void draw_flow_box(fz_context *ctx, fz_html *box, float page_top, float p
 
 		if (node->type == FLOW_WORD)
 		{
+			int idx;
+			unsigned int gp, glyph_count;
+			hb_glyph_info_t *glyph_info;
+			hb_glyph_position_t *glyph_pos;
+			float x, y;
+
+			if (node->content.text == NULL)
+				continue;
+
 			fz_scale(&trm, node->em, -node->em);
 
 			color[0] = node->style->color.r / 255.0f;
@@ -931,54 +1029,189 @@ static void draw_flow_box(fz_context *ctx, fz_html *box, float page_top, float p
 			/* TODO: reuse text object if color is unchanged */
 			text = fz_new_text(ctx);
 
-
-			trm.e = node->x;
-			trm.f = node->y;
-			s = node->content.text;
-			if (node->char_r2l)
+			x = node->x;
+			y = node->y;
+			w = node->w;
+			start = end = s = node->content.text;
+			font = NULL;
+			while (*start)
 			{
-				float w = 0;
-				const char *t = s;
+				/* Run through the string, encoding chars until we find one
+				 * that requires a different fallback font. */
+				while (*s)
+				{
+					int c;
 
-				while (*t)
+					s += fz_chartorune(&c, s);
+					(void)fz_encode_character_with_fallback(ctx, node->style->font, c, node->script, &next_font);
+					if (next_font != font)
+					{
+						if (font != NULL)
+							break;
+						font = next_font;
+					}
+					end = s;
+				}
+
+				fz_try(ctx)
+				{
+					hb_lock(ctx);
+
+					/* So, shape from start to end in font */
+					face = font->ft_face;
+					scale = face->units_per_EM;
+					fterr = FT_Set_Char_Size(face, scale, scale, 72, 72);
+					if (fterr)
+						fz_throw(ctx, FZ_ERROR_GENERIC, "Failure sizing font (%d)", fterr);
+
+					if (font->shaper == NULL)
+						font->shaper = (void *)hb_ft_font_create(face, NULL);
+
+					hb_buffer_clear_contents(hb_buf);
+					hb_buffer_set_direction(hb_buf, node->char_r2l ? HB_DIRECTION_RTL : HB_DIRECTION_LTR);
+					/* We don't know script or language, so leave them blank */
+					/* hb_buffer_set_script(hb_buf, HB_SCRIPT_LATIN); */
+					/* hb_buffer_set_language(hb_buf, hb_language_from_string("en", strlen("en"))); */
+
+					/* First put the text content into a harfbuzz buffer
+					 * labelled with the position within the word. */
+					hb_buffer_add_utf8(hb_buf, start, end - start, 0, -1);
+					hb_buffer_guess_segment_properties(hb_buf);
+
+					face = font->ft_face;
+					scale = face->units_per_EM;
+					fterr = FT_Set_Char_Size(face, scale, scale, 72, 72);
+					if (fterr)
+						fz_throw(ctx, FZ_ERROR_GENERIC, "Failure sizing font (%d)", fterr);
+
+					/* Now shape that buffer */
+					hb_shape(font->shaper, hb_buf, NULL, 0);
+
+					glyph_info = hb_buffer_get_glyph_infos(hb_buf, &glyph_count);
+					glyph_pos = hb_buffer_get_glyph_positions(hb_buf, &glyph_count);
+				}
+				fz_always(ctx)
+				{
+					hb_unlock(ctx);
+				}
+				fz_catch(ctx)
+				{
+					fz_rethrow(ctx);
+				}
+
+#ifdef DEBUG_HARFBUZZ
+				printf("fragment: ");
+				t = start;
+				while (t != end)
 				{
 					t += fz_chartorune(&c, t);
-					if (node->mirror)
-						c = ucdn_mirror(c);
-					g = fz_encode_character_with_fallback(ctx, node->style->font, c, 0, &font);
-					w += fz_advance_glyph(ctx, font, g) * node->em;
+					if (c >= 127)
+						printf("<%x>", c);
+					else
+						printf("%c", c);
+				}
+				printf("\n");
+#endif /* DEBUG_HARFBUZZ */
+
+				/* Now offset the glyph_info with the correct positions.
+				 * Harfbuzz always gives us the shaped glyphs for plotting in l2r
+				 * order. We however still want to send glyphs r2l rather than l2r
+				 * for r2l blocks so that text extraction works. So, regardless
+				 * of ordering we resolve the positions here. The nasty thing is
+				 * that we right the resolved positions back into the Harfbuzz
+				 * buffer with a change of type. */
+				node_scale = node->em / scale;
+
+				lx = 0;
+				ly = 0;
+				for (gp = 0; gp < glyph_count; gp++)
+				{
+					hb_glyph_position_t *p = &glyph_pos[gp];
+#ifdef DEBUG_HARFBUZZ
+					hb_glyph_info_t *g = &glyph_info[gp];
+
+					printf("glyph: %x(%d) @ %d %d + %d %d",
+						g->codepoint, g->cluster, p->x_offset, p->y_offset,
+						p->x_advance, p->y_advance);
+#endif /* DEBUG_HARFBUZZ */
+					*(float *)(&p->x_offset) = x + (lx + p->x_offset) * node_scale;
+					*(float *)(&p->y_offset) = y + (ly + p->y_offset) * node_scale;
+#ifdef DEBUG_HARFBUZZ
+					printf(" => %g %g\n", *(float *)(&p->x_offset), *(float *)(&p->y_offset));
+#endif /* DEBUG_HARFBUZZ */
+					lx += p->x_advance;
+					ly += p->y_advance;
 				}
 
-				trm.e += w;
-				while (*s)
+				if (node->char_r2l)
 				{
-					s += fz_chartorune(&c, s);
-					if (node->mirror)
-						c = ucdn_mirror(c);
-					g = fz_encode_character_with_fallback(ctx, node->style->font, c, 0, &font);
-					trm.e -= fz_advance_glyph(ctx, font, g) * node->em;
-					if (node->style->visibility == V_VISIBLE)
-						fz_add_text(ctx, text, font, 0, &trm, g, c);
+					w -= lx * node_scale;
+					for (gp = 0; gp < glyph_count; gp++)
+					{
+						hb_glyph_position_t *p = &glyph_pos[gp];
+						*(float *)(&p->x_offset) += w;
+					}
 				}
-				trm.e += w;
-			}
-			else
-			{
-				while (*s)
+				else
 				{
-					s += fz_chartorune(&c, s);
-					g = fz_encode_character_with_fallback(ctx, node->style->font, c, 0, &font);
-					if (node->style->visibility == V_VISIBLE)
-						fz_add_text(ctx, text, font, 0, &trm, g, c);
-					trm.e += fz_advance_glyph(ctx, font, g) * node->em;
+					x += node_scale * lx;
+					y += node_scale * ly;
 				}
-			}
 
-			if (text)
-			{
-				fz_fill_text(ctx, dev, text, ctm, fz_device_rgb(ctx), color, 1);
-				fz_drop_text(ctx, text);
+				/* Now read the data back out again, and turn it into
+				 * glyph/ucs pairs to go to fz_text */
+				idx = 0;
+				t = start;
+				if (node->style->visibility == V_VISIBLE)
+				{
+					while (*t)
+					{
+						int l = fz_chartorune(&c, t);
+						t += l;
+
+						for (gp = 0; gp < glyph_count; gp++)
+						{
+							hb_glyph_info_t *g = &glyph_info[gp];
+							hb_glyph_position_t *p = &glyph_pos[gp];
+							if (g->cluster != idx)
+								continue;
+							trm.e = *(float *)&p->x_offset;
+							trm.f = *(float *)&p->y_offset;
+							fz_add_text(ctx, text, font, 0, &trm, g->codepoint, c);
+							break;
+						}
+						if (gp == glyph_count)
+						{
+							/* We failed to find a glyph for this codepoint, presumably
+							 * because we've been shaped away into another. We can't afford
+							 * to just drop the codepoint as this will upset text extraction.
+							 */
+							fz_add_text(ctx, text, font, 0, &trm, -1, c);
+						}
+						else
+						{
+							/* We've send the codepoint and glyph. Make sure there aren't
+							 * more glyphs to come from the same codepoint. */
+							for (gp++ ;gp < glyph_count; gp++)
+							{
+								hb_glyph_info_t *g = &glyph_info[gp];
+								hb_glyph_position_t *p = &glyph_pos[gp];
+								if (g->cluster != idx)
+									continue;
+								trm.e = *(float *)&p->x_offset;
+								trm.f = *(float *)&p->y_offset;
+								fz_add_text(ctx, text, font, 0, &trm, g->codepoint, -1);
+							}
+						}
+						idx += l;
+					}
+				}
+				start = end;
+				end = s;
+				font = next_font;
 			}
+			fz_fill_text(ctx, dev, text, ctm, fz_device_rgb(ctx), color, 1);
+			fz_drop_text(ctx, text);
 		}
 		else if (node->type == FLOW_IMAGE)
 		{
@@ -1179,7 +1412,7 @@ static void draw_list_mark(fz_context *ctx, fz_html *box, float page_top, float
 	fz_drop_text(ctx, text);
 }
 
-static void draw_block_box(fz_context *ctx, fz_html *box, float page_top, float page_bot, fz_device *dev, const fz_matrix *ctm)
+static void draw_block_box(fz_context *ctx, fz_html *box, float page_top, float page_bot, fz_device *dev, const fz_matrix *ctm, hb_buffer_t *hb_buf)
 {
 	float x0, y0, x1, y1;
 
@@ -1215,8 +1448,8 @@ static void draw_block_box(fz_context *ctx, fz_html *box, float page_top, float
 	{
 		switch (box->type)
 		{
-		case BOX_BLOCK: draw_block_box(ctx, box, page_top, page_bot, dev, ctm); break;
-		case BOX_FLOW: draw_flow_box(ctx, box, page_top, page_bot, dev, ctm); break;
+		case BOX_BLOCK: draw_block_box(ctx, box, page_top, page_bot, dev, ctm, hb_buf); break;
+		case BOX_FLOW: draw_flow_box(ctx, box, page_top, page_bot, dev, ctm, hb_buf); break;
 		}
 	}
 }
@@ -1225,8 +1458,33 @@ void
 fz_draw_html(fz_context *ctx, fz_html *box, float page_top, float page_bot, fz_device *dev, const fz_matrix *inctm)
 {
 	fz_matrix ctm = *inctm;
-	fz_pre_translate(&ctm, 0, -page_top);
-	draw_block_box(ctx, box, page_top, page_bot, dev, &ctm);
+	hb_buffer_t *hb_buf = NULL;
+	int unlocked = 0;
+
+	fz_var(hb_buf);
+	fz_var(unlocked);
+
+	hb_lock(ctx);
+
+	fz_try(ctx)
+	{
+		hb_buf = hb_buffer_create();
+		hb_unlock(ctx);
+		unlocked = 1;
+		fz_pre_translate(&ctm, 0, -page_top);
+		draw_block_box(ctx, box, page_top, page_bot, dev, &ctm, hb_buf);
+	}
+	fz_always(ctx)
+	{
+		if (unlocked)
+			hb_lock(ctx);
+		hb_buffer_destroy(hb_buf);
+		hb_unlock(ctx);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
 }
 
 static char *concat_text(fz_context *ctx, fz_xml *root)
@@ -1413,12 +1671,36 @@ void
 fz_layout_html(fz_context *ctx, fz_html *box, float w, float h, float em)
 {
 	fz_html page_box;
+	hb_buffer_t *hb_buf = NULL;
+	int unlocked = 0;
+
+	fz_var(hb_buf);
+	fz_var(unlocked);
 
-	init_box(ctx, &page_box);
-	page_box.w = w;
-	page_box.h = 0;
+	hb_lock(ctx);
 
-	layout_block(ctx, box, &page_box, em, h, 0);
+	fz_try(ctx)
+	{
+		hb_buf = hb_buffer_create();
+		unlocked = 1;
+		hb_unlock(ctx);
+		init_box(ctx, &page_box);
+		page_box.w = w;
+		page_box.h = 0;
+
+		layout_block(ctx, box, &page_box, em, h, 0, hb_buf);
+	}
+	fz_always(ctx)
+	{
+		if (unlocked)
+			hb_lock(ctx);
+		hb_buffer_destroy(hb_buf);
+		hb_unlock(ctx);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
 }
 
 typedef struct
@@ -1453,7 +1735,7 @@ static void newFragCb(const uint32_t *fragment,
 			size_t fragment_len,
 			int block_r2l,
 			int char_r2l,
-			uint32_t mirror,
+			int script,
 			void *arg)
 {
 	bidi_data *data = (bidi_data *)arg;
@@ -1490,74 +1772,94 @@ static void newFragCb(const uint32_t *fragment,
 		/* This flow box is entirely contained within this fragment. */
 		data->flow->block_r2l = block_r2l;
 		data->flow->char_r2l = char_r2l;
-		if (mirror != 0)
-			data->flow->mirror = 1;
+		data->flow->script = script;
 		data->flow = data->flow->next;
 		fragment_offset += len;
 		fragment_len -= len;
 	}
 }
 
+static int
+dirn_matches(int dirn, int dirn2)
+{
+	return (dirn == BIDI_NEUTRAL || dirn2 == BIDI_NEUTRAL || dirn == dirn2);
+}
+
 static void
 detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction *baseDir, fz_html_flow *flow)
 {
 	fz_html_flow *end = flow;
 	const char *text;
 	bidi_data data;
+	fz_bidi_direction dirn;
 
-	/* Stage 1: Gather the text from the flow up into a single buffer */
-	buffer->len = 0;
 	while (end)
 	{
-		size_t len;
-		int broken = 0;
+		dirn = BIDI_NEUTRAL;
 
-		switch (end->type)
+		/* Gather the text from the flow up into a single buffer (at
+		 * least, as much of it as has the same direction markup). */
+		buffer->len = 0;
+		while (end && dirn_matches(dirn, end->markup_r2l))
 		{
-		case FLOW_WORD:
-			len = utf8len(end->content.text);
-			text = end->content.text;
-			break;
-		case FLOW_GLUE:
-			len = 1;
-			text = " ";
-			break;
-		case FLOW_BREAK:
-		case FLOW_IMAGE:
-			broken = 1;
-			break;
-		}
+			size_t len;
+			int broken = 0;
 
-		if (broken)
-			break;
+			dirn = end->markup_r2l;
 
-		/* Make sure the buffer is large enough */
-		if (buffer->len + len > buffer->cap)
-		{
-			size_t newcap = buffer->cap * 2;
-			if (newcap == 0)
-				newcap = 128; /* Sensible small default */
-			buffer->data = fz_resize_array(ctx, buffer->data, newcap, sizeof(uint32_t));
-			buffer->cap = newcap;
+			switch (end->type)
+			{
+			case FLOW_WORD:
+				len = utf8len(end->content.text);
+				text = end->content.text;
+				break;
+			case FLOW_GLUE:
+				len = 1;
+				text = " ";
+				break;
+			case FLOW_BREAK:
+			case FLOW_IMAGE:
+				broken = 1;
+				break;
+			}
+
+			if (broken)
+				break;
+
+			/* Make sure the buffer is large enough */
+			if (buffer->len + len > buffer->cap)
+			{
+				size_t newcap = buffer->cap * 2;
+				if (newcap == 0)
+					newcap = 128; /* Sensible small default */
+				buffer->data = fz_resize_array(ctx, buffer->data, newcap, sizeof(uint32_t));
+				buffer->cap = newcap;
+			}
+
+			/* Expand the utf8 text into Unicode and store it in the buffer */
+			while (*text)
+			{
+				int rune;
+				text += fz_chartorune(&rune, text);
+				buffer->data[buffer->len++] = rune;
+			}
+
+			end = end->next;
 		}
 
-		/* Expand the utf8 text into Unicode and store it in the buffer */
-		while (*text)
+		/* Detect directionality for the buffer */
+		data.ctx = ctx;
+		data.pool = pool;
+		data.flow = flow;
+		data.buffer = buffer;
+		fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &dirn, &newFragCb, &data, 0 /* Flags */);
+
+		/* Set the default flow of the box to be the first non NEUTRAL thing we find */
+		if (*baseDir == BIDI_NEUTRAL)
 		{
-			int rune;
-			text += fz_chartorune(&rune, text);
-			buffer->data[buffer->len++] = rune;
+			*baseDir = dirn;
 		}
-
-		end = end->next;
 	}
-
-	/* Detect directionality for the buffer */
-	data.ctx = ctx;
-	data.pool = pool;
-	data.flow = flow;
-	data.buffer = buffer;
-	fz_bidi_fragment_text(ctx, buffer->data, buffer->len, baseDir, &newFragCb, &data, 0 /* Flags */);
 }
 
 static void