epub: Use the language declared in the markup (lang attribute) when shaping and selecting fallback fonts.

author: Tor Andersson <tor.andersson@artifex.com> 2016-06-23 13:41:53 +0200
committer: Robin Watts <robin.watts@artifex.com> 2016-06-23 16:10:29 +0100
commit: 6e48c939dc9913a6af747d5b6961624551c8d90d (patch)
tree: 4b3f69c237dc7af00606c87ac94cd2c4893da628 /source
parent: cf7b2cbbfe6192fbf697237735ab45bc951304e4 (diff)
download: mupdf-6e48c939dc9913a6af747d5b6961624551c8d90d.tar.xz
4 files changed, 228 insertions, 169 deletions
diff --git a/source/fitz/font.c b/source/fitz/font.c
index 792e1875..b0248bd7 100644
--- a/source/fitz/font.c
+++ b/source/fitz/font.c
@@ -295,9 +295,10 @@ fz_font *fz_load_system_cjk_font(fz_context *ctx, const char *name, int ros, int
 	return font;
 }
 
-fz_font *fz_load_fallback_font(fz_context *ctx, int script, int serif, int bold, int italic)
+fz_font *fz_load_fallback_font(fz_context *ctx, int script, int language, int serif, int bold, int italic)
 {
 	const char *data;
+	int index;
 	int size;
 
 	if (script < 0 || script > nelem(ctx->font->fallback))
@@ -305,25 +306,42 @@ fz_font *fz_load_fallback_font(fz_context *ctx, int script, int serif, int bold,
 
 	/* TODO: bold and italic */
 
+	index = script;
+	if (script == UCDN_SCRIPT_HAN)
+	{
+		switch (language)
+		{
+		case FZ_LANG_ja: index = UCDN_LAST_SCRIPT + 1; break;
+		case FZ_LANG_ko: index = UCDN_LAST_SCRIPT + 2; break;
+		case FZ_LANG_zh_Hant: index = UCDN_LAST_SCRIPT + 3; break;
+		case FZ_LANG_zh_Hans: index = UCDN_LAST_SCRIPT + 4; break;
+		}
+	}
+	if (script == UCDN_SCRIPT_ARABIC)
+	{
+		if (language == FZ_LANG_ur || language == FZ_LANG_urd)
+			index = UCDN_LAST_SCRIPT + 5;
+	}
+
 	if (serif)
 	{
-		if (ctx->font->fallback[script].serif)
-			return ctx->font->fallback[script].serif;
-		data = fz_lookup_noto_font(ctx, script, 1, &size);
+		if (ctx->font->fallback[index].serif)
+			return ctx->font->fallback[index].serif;
+		data = fz_lookup_noto_font(ctx, script, language, 1, &size);
 		if (data)
 		{
-			ctx->font->fallback[script].serif = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0);
-			return ctx->font->fallback[script].serif;
+			ctx->font->fallback[index].serif = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0);
+			return ctx->font->fallback[index].serif;
 		}
 	}
 
-	if (ctx->font->fallback[script].sans)
-		return ctx->font->fallback[script].sans;
-	data = fz_lookup_noto_font(ctx, script, 0, &size);
+	if (ctx->font->fallback[index].sans)
+		return ctx->font->fallback[index].sans;
+	data = fz_lookup_noto_font(ctx, script, language, 0, &size);
 	if (data)
 	{
-		ctx->font->fallback[script].sans = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0);
-		return ctx->font->fallback[script].sans;
+		ctx->font->fallback[index].sans = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0);
+		return ctx->font->fallback[index].sans;
 	}
 
 	return NULL;
@@ -1488,7 +1506,7 @@ fz_encode_character(fz_context *ctx, fz_font *font, int ucs)
 /* FIXME: This should take language too eventually, to allow for fonts where we can select different
  * languages using opentype features. */
 int
-fz_encode_character_with_fallback(fz_context *ctx, fz_font *user_font, int unicode, int script, fz_font **out_font)
+fz_encode_character_with_fallback(fz_context *ctx, fz_font *user_font, int unicode, int script, int language, fz_font **out_font)
 {
 	fz_font *font;
 	int gid;
@@ -1500,7 +1518,7 @@ fz_encode_character_with_fallback(fz_context *ctx, fz_font *user_font, int unico
 	if (script == 0)
 		script = ucdn_get_script(unicode);
 
-	font = fz_load_fallback_font(ctx, script, user_font->is_serif, user_font->is_bold, user_font->is_italic);
+	font = fz_load_fallback_font(ctx, script, language, user_font->is_serif, user_font->is_bold, user_font->is_italic);
 	if (font)
 	{
 		gid = fz_encode_character(ctx, font, unicode);
diff --git a/source/fitz/noto.c b/source/fitz/noto.c
index 70f3bb8f..894d7bd6 100644
--- a/source/fitz/noto.c
+++ b/source/fitz/noto.c
@@ -6,7 +6,7 @@
 	DroidSansFallback from Android for CJK.
 	Charis SIL from SIL.
 
-	Define TOFU to skip all the Noto fonts except CJK.
+	Define TOFU to only include the Base14 and CJK fonts.
 
 	Define TOFU_CJK to skip CJK font.
 	Define TOFU_CJK_EXT to skip CJK Extension A support.
@@ -15,6 +15,8 @@
 	Define TOFU_HISTORIC to skip ancient/historic scripts.
 	Define TOFU_SYMBOL to skip symbol font.
 	Define TOFU_SIL to skip the SIL fonts.
+
+	Define TOFU_BASE14 to skip the Base 14 fonts (warning: makes PDF unusable).
 */
 
 #ifdef NOTO_SMALL
@@ -37,39 +39,39 @@
 #endif
 
 #define RETURN(NAME) \
-	do {\
+	do { \
 	extern const int fz_font_ ## NAME ## _size; \
 	extern const char fz_font_ ## NAME []; \
-	return *size = fz_font_ ## NAME ## _size, fz_font_ ## NAME;\
+	return *size = fz_font_ ## NAME ## _size, fz_font_ ## NAME; \
 	} while (0)
 
 const char *
 fz_lookup_base14_font(fz_context *ctx, const char *name, int *size)
 {
 #ifndef TOFU_BASE14
-	if (!strcmp(name, "Courier")) { RETURN(NimbusMonoPS_Regular_cff); }
-	if (!strcmp(name, "Courier-Oblique")) { RETURN(NimbusMonoPS_Italic_cff); }
-	if (!strcmp(name, "Courier-Bold")) { RETURN(NimbusMonoPS_Bold_cff); }
-	if (!strcmp(name, "Courier-BoldOblique")) { RETURN(NimbusMonoPS_BoldItalic_cff); }
-	if (!strcmp(name, "Helvetica")) { RETURN(NimbusSans_Regular_cff); }
-	if (!strcmp(name, "Helvetica-Oblique")) { RETURN(NimbusSans_Oblique_cff); }
-	if (!strcmp(name, "Helvetica-Bold")) { RETURN(NimbusSans_Bold_cff); }
-	if (!strcmp(name, "Helvetica-BoldOblique")) { RETURN(NimbusSans_BoldOblique_cff); }
-	if (!strcmp(name, "Times-Roman")) { RETURN(NimbusRoman_Regular_cff); }
-	if (!strcmp(name, "Times-Italic")) { RETURN(NimbusRoman_Italic_cff); }
-	if (!strcmp(name, "Times-Bold")) { RETURN(NimbusRoman_Bold_cff); }
-	if (!strcmp(name, "Times-BoldItalic")) { RETURN(NimbusRoman_BoldItalic_cff); }
-	if (!strcmp(name, "Symbol")) { RETURN(StandardSymbolsPS_cff); }
-	if (!strcmp(name, "ZapfDingbats")) { RETURN(Dingbats_cff); }
+	if (!strcmp(name, "Courier")) RETURN(NimbusMonoPS_Regular_cff);
+	if (!strcmp(name, "Courier-Oblique")) RETURN(NimbusMonoPS_Italic_cff);
+	if (!strcmp(name, "Courier-Bold")) RETURN(NimbusMonoPS_Bold_cff);
+	if (!strcmp(name, "Courier-BoldOblique")) RETURN(NimbusMonoPS_BoldItalic_cff);
+	if (!strcmp(name, "Helvetica")) RETURN(NimbusSans_Regular_cff);
+	if (!strcmp(name, "Helvetica-Oblique")) RETURN(NimbusSans_Oblique_cff);
+	if (!strcmp(name, "Helvetica-Bold")) RETURN(NimbusSans_Bold_cff);
+	if (!strcmp(name, "Helvetica-BoldOblique")) RETURN(NimbusSans_BoldOblique_cff);
+	if (!strcmp(name, "Times-Roman")) RETURN(NimbusRoman_Regular_cff);
+	if (!strcmp(name, "Times-Italic")) RETURN(NimbusRoman_Italic_cff);
+	if (!strcmp(name, "Times-Bold")) RETURN(NimbusRoman_Bold_cff);
+	if (!strcmp(name, "Times-BoldItalic")) RETURN(NimbusRoman_BoldItalic_cff);
+	if (!strcmp(name, "Symbol")) RETURN(StandardSymbolsPS_cff);
+	if (!strcmp(name, "ZapfDingbats")) RETURN(Dingbats_cff);
 #endif
 	return *size = 0, NULL;
 }
 
 #define FAMILY(R, I, B, BI) \
 	if (!is_bold) { \
-		if (!is_italic) { RETURN(R); } else { RETURN(I); } \
+		if (!is_italic) RETURN(R); else RETURN(I); \
 	} else { \
-		if (!is_italic) { RETURN(B); } else { RETURN(BI); } \
+		if (!is_italic) RETURN(B); else RETURN(BI); \
 	}
 
 const char *
@@ -140,20 +142,16 @@ fz_lookup_cjk_font(fz_context *ctx, int registry, int serif, int wmode, int *siz
 #endif
 }
 
-#define Noto(SANS) { RETURN(Noto ## SANS ## _Regular_ttf); } break
+#define Noto(SANS) RETURN(Noto ## SANS ## _Regular_ttf)
 
 #define Noto2(SANS,SERIF) \
-	if (serif) { RETURN(Noto ## SERIF ## _Regular_ttf); } \
-	else { RETURN(Noto ## SANS ## _Regular_ttf); } \
-	break
-
-#define Noto3(SANS,SERIF,UNUSED) \
-	Noto2(SANS,SERIF)
+	if (serif) { RETURN(Noto ## SERIF ## _Regular_ttf); } else { RETURN(Noto ## SANS ## _Regular_ttf); }
 
 const char *
-fz_lookup_noto_font(fz_context *ctx, int script, int serif, int *size)
+fz_lookup_noto_font(fz_context *ctx, int script, int language, int serif, int *size)
 {
-	/* Unused Noto fonts: NastaliqUrdu, SansSyriacEstrangela */
+	/* TODO: Noto(SansSyriacEstrangela); */
+	/* TODO: Noto(SansSyriacWestern); */
 
 	switch (script)
 	{
@@ -171,141 +169,152 @@ fz_lookup_noto_font(fz_context *ctx, int script, int serif, int *size)
 	case UCDN_SCRIPT_BOPOMOFO:
 		return fz_lookup_cjk_font(ctx, FZ_ADOBE_GB_1, serif, 0, size, NULL);
 	case UCDN_SCRIPT_HAN:
-		return fz_lookup_cjk_font(ctx, FZ_ADOBE_GB_1, serif, 0, size, NULL);
+		switch (language)
+		{
+		case FZ_LANG_ja: return fz_lookup_cjk_font(ctx, FZ_ADOBE_JAPAN_1, serif, 0, size, NULL);
+		case FZ_LANG_ko: return fz_lookup_cjk_font(ctx, FZ_ADOBE_KOREA_1, serif, 0, size, NULL);
+		case FZ_LANG_zh_Hant: return fz_lookup_cjk_font(ctx, FZ_ADOBE_CNS_1, serif, 0, size, NULL);
+		default:
+		case FZ_LANG_zh_Hans: return fz_lookup_cjk_font(ctx, FZ_ADOBE_GB_1, serif, 0, size, NULL);
+		}
 
 #ifndef TOFU
-
-#ifndef TOFU_HISTORIC
-	case UCDN_SCRIPT_IMPERIAL_ARAMAIC: Noto(SansImperialAramaic);
-	case UCDN_SCRIPT_AVESTAN: Noto(SansAvestan);
-	case UCDN_SCRIPT_CARIAN: Noto(SansCarian);
-	case UCDN_SCRIPT_CYPRIOT: Noto(SansCypriot);
-	case UCDN_SCRIPT_EGYPTIAN_HIEROGLYPHS: Noto(SansEgyptianHieroglyphs);
-	case UCDN_SCRIPT_GLAGOLITIC: Noto(SansGlagolitic);
-	case UCDN_SCRIPT_GOTHIC: Noto(SansGothic);
-	case UCDN_SCRIPT_OLD_ITALIC: Noto(SansOldItalic);
-	case UCDN_SCRIPT_KHAROSHTHI: Noto(SansKharoshthi);
-	case UCDN_SCRIPT_KAITHI: Noto(SansKaithi);
-	case UCDN_SCRIPT_LINEAR_B: Noto(SansLinearB);
-	case UCDN_SCRIPT_LYCIAN: Noto(SansLycian);
-	case UCDN_SCRIPT_LYDIAN: Noto(SansLydian);
-	case UCDN_SCRIPT_OGHAM: Noto(SansOgham);
-	case UCDN_SCRIPT_OLD_TURKIC: Noto(SansOldTurkic);
-	case UCDN_SCRIPT_PHAGS_PA: Noto(SansPhagsPa);
-	case UCDN_SCRIPT_INSCRIPTIONAL_PAHLAVI: Noto(SansInscriptionalPahlavi);
-	case UCDN_SCRIPT_INSCRIPTIONAL_PARTHIAN: Noto(SansInscriptionalParthian);
-	case UCDN_SCRIPT_RUNIC: Noto(SansRunic);
-	case UCDN_SCRIPT_OLD_SOUTH_ARABIAN: Noto(SansOldSouthArabian);
-	case UCDN_SCRIPT_UGARITIC: Noto(SansUgaritic);
-	case UCDN_SCRIPT_OLD_PERSIAN: Noto(SansOldPersian);
-	case UCDN_SCRIPT_CUNEIFORM: Noto(SansCuneiform);
-	case UCDN_SCRIPT_COPTIC: Noto(SansCoptic);
-#endif
-
 	case UCDN_SCRIPT_LATIN: Noto2(Sans, Serif);
 	case UCDN_SCRIPT_GREEK: Noto2(Sans, Serif);
 	case UCDN_SCRIPT_CYRILLIC: Noto2(Sans, Serif);
+
+	case UCDN_SCRIPT_ARABIC:
+		if (language == FZ_LANG_ur || language == FZ_LANG_urd)
+			Noto(NastaliqUrdu);
+		Noto2(KufiArabic, NaskhArabic);
+
 	case UCDN_SCRIPT_ARMENIAN: Noto2(SansArmenian, SerifArmenian);
-	case UCDN_SCRIPT_HEBREW: Noto(SansHebrew);
-	case UCDN_SCRIPT_ARABIC: Noto3(KufiArabic, NaskhArabic, NastaliqUrdu);
-	case UCDN_SCRIPT_SYRIAC: Noto3(SansSyriacEastern, SansSyriacWestern, SansSyriacEstrangela);
-	case UCDN_SCRIPT_THAANA: Noto(SansThaana);
-	case UCDN_SCRIPT_DEVANAGARI: Noto(SansDevanagari);
+	case UCDN_SCRIPT_BALINESE: Noto(SansBalinese);
+	case UCDN_SCRIPT_BAMUM: Noto(SansBamum);
+	case UCDN_SCRIPT_BATAK: Noto(SansBatak);
 	case UCDN_SCRIPT_BENGALI: Noto2(SansBengali, SerifBengali);
-	case UCDN_SCRIPT_GURMUKHI: Noto(SansGurmukhi);
+	case UCDN_SCRIPT_CANADIAN_ABORIGINAL: Noto(SansCanadianAboriginal);
+	case UCDN_SCRIPT_CHAM: Noto(SansCham);
+	case UCDN_SCRIPT_CHEROKEE: Noto(SansCherokee);
+	case UCDN_SCRIPT_DEVANAGARI: Noto(SansDevanagari);
+	case UCDN_SCRIPT_ETHIOPIC: Noto(SansEthiopic);
+	case UCDN_SCRIPT_GEORGIAN: Noto2(SansGeorgian, SerifGeorgian);
 	case UCDN_SCRIPT_GUJARATI: Noto2(SansGujarati, SerifGujarati);
-	case UCDN_SCRIPT_ORIYA: Noto(SansOriya);
-	case UCDN_SCRIPT_TAMIL: Noto2(SansTamil, SerifTamil);
-	case UCDN_SCRIPT_TELUGU: Noto2(SansTelugu, SerifTelugu);
+	case UCDN_SCRIPT_GURMUKHI: Noto(SansGurmukhi);
+	case UCDN_SCRIPT_HEBREW: Noto(SansHebrew);
+	case UCDN_SCRIPT_JAVANESE: Noto(SansJavanese);
 	case UCDN_SCRIPT_KANNADA: Noto2(SansKannada, SerifKannada);
+	case UCDN_SCRIPT_KAYAH_LI: Noto(SansKayahLi);
+	case UCDN_SCRIPT_KHMER: Noto2(SansKhmer, SerifKhmer);
+	case UCDN_SCRIPT_LAO: Noto2(SansLao, SerifLao);
+	case UCDN_SCRIPT_LEPCHA: Noto(SansLepcha);
+	case UCDN_SCRIPT_LIMBU: Noto(SansLimbu);
+	case UCDN_SCRIPT_LISU: Noto(SansLisu);
 	case UCDN_SCRIPT_MALAYALAM: Noto2(SansMalayalam, SerifMalayalam);
+	case UCDN_SCRIPT_MANDAIC: Noto(SansMandaic);
+	case UCDN_SCRIPT_MEETEI_MAYEK: Noto(SansMeeteiMayek);
+	case UCDN_SCRIPT_MONGOLIAN: Noto(SansMongolian);
+	case UCDN_SCRIPT_MYANMAR: Noto(SansMyanmar);
+	case UCDN_SCRIPT_NEW_TAI_LUE: Noto(SansNewTaiLue);
+	case UCDN_SCRIPT_NKO: Noto(SansNKo);
+	case UCDN_SCRIPT_OL_CHIKI: Noto(SansOlChiki);
+	case UCDN_SCRIPT_ORIYA: Noto(SansOriya);
+	case UCDN_SCRIPT_SAURASHTRA: Noto(SansSaurashtra);
 	case UCDN_SCRIPT_SINHALA: Noto(SansSinhala);
+	case UCDN_SCRIPT_SUNDANESE: Noto(SansSundanese);
+	case UCDN_SCRIPT_SYLOTI_NAGRI: Noto(SansSylotiNagri);
+	case UCDN_SCRIPT_SYRIAC: Noto(SansSyriacEastern);
+	case UCDN_SCRIPT_TAI_LE: Noto(SansTaiLe);
+	case UCDN_SCRIPT_TAI_THAM: Noto(SansTaiTham);
+	case UCDN_SCRIPT_TAI_VIET: Noto(SansTaiViet);
+	case UCDN_SCRIPT_TAMIL: Noto2(SansTamil, SerifTamil);
+	case UCDN_SCRIPT_TELUGU: Noto2(SansTelugu, SerifTelugu);
+	case UCDN_SCRIPT_THAANA: Noto(SansThaana);
 	case UCDN_SCRIPT_THAI: Noto2(SansThai, SerifThai);
-	case UCDN_SCRIPT_LAO: Noto2(SansLao, SerifLao);
 	case UCDN_SCRIPT_TIBETAN: Noto(SansTibetan);
-	case UCDN_SCRIPT_MYANMAR: Noto(SansMyanmar);
-	case UCDN_SCRIPT_GEORGIAN: Noto2(SansGeorgian, SerifGeorgian);
-	case UCDN_SCRIPT_ETHIOPIC: Noto(SansEthiopic);
-	case UCDN_SCRIPT_CHEROKEE: Noto(SansCherokee);
-	case UCDN_SCRIPT_CANADIAN_ABORIGINAL: Noto(SansCanadianAboriginal);
-	case UCDN_SCRIPT_KHMER: Noto2(SansKhmer, SerifKhmer);
-	case UCDN_SCRIPT_MONGOLIAN: Noto(SansMongolian);
+	case UCDN_SCRIPT_TIFINAGH: Noto(SansTifinagh);
+	case UCDN_SCRIPT_VAI: Noto(SansVai);
 	case UCDN_SCRIPT_YI: Noto(SansYi);
+
+#ifndef TOFU_HISTORIC
+	case UCDN_SCRIPT_AVESTAN: Noto(SansAvestan);
+	case UCDN_SCRIPT_BRAHMI: Noto(SansBrahmi);
+	case UCDN_SCRIPT_BUGINESE: Noto(SansBuginese);
+	case UCDN_SCRIPT_BUHID: Noto(SansBuhid);
+	case UCDN_SCRIPT_CARIAN: Noto(SansCarian);
+	case UCDN_SCRIPT_COPTIC: Noto(SansCoptic);
+	case UCDN_SCRIPT_CUNEIFORM: Noto(SansCuneiform);
+	case UCDN_SCRIPT_CYPRIOT: Noto(SansCypriot);
 	case UCDN_SCRIPT_DESERET: Noto(SansDeseret);
-	case UCDN_SCRIPT_TAGALOG: Noto(SansTagalog);
+	case UCDN_SCRIPT_EGYPTIAN_HIEROGLYPHS: Noto(SansEgyptianHieroglyphs);
+	case UCDN_SCRIPT_GLAGOLITIC: Noto(SansGlagolitic);
+	case UCDN_SCRIPT_GOTHIC: Noto(SansGothic);
 	case UCDN_SCRIPT_HANUNOO: Noto(SansHanunoo);
-	case UCDN_SCRIPT_BUHID: Noto(SansBuhid);
-	case UCDN_SCRIPT_TAGBANWA: Noto(SansTagbanwa);
-	case UCDN_SCRIPT_LIMBU: Noto(SansLimbu);
-	case UCDN_SCRIPT_TAI_LE: Noto(SansTaiLe);
-	case UCDN_SCRIPT_SHAVIAN: Noto(SansShavian);
+	case UCDN_SCRIPT_IMPERIAL_ARAMAIC: Noto(SansImperialAramaic);
+	case UCDN_SCRIPT_INSCRIPTIONAL_PAHLAVI: Noto(SansInscriptionalPahlavi);
+	case UCDN_SCRIPT_INSCRIPTIONAL_PARTHIAN: Noto(SansInscriptionalParthian);
+	case UCDN_SCRIPT_KAITHI: Noto(SansKaithi);
+	case UCDN_SCRIPT_KHAROSHTHI: Noto(SansKharoshthi);
+	case UCDN_SCRIPT_LINEAR_B: Noto(SansLinearB);
+	case UCDN_SCRIPT_LYCIAN: Noto(SansLycian);
+	case UCDN_SCRIPT_LYDIAN: Noto(SansLydian);
+	case UCDN_SCRIPT_OGHAM: Noto(SansOgham);
+	case UCDN_SCRIPT_OLD_ITALIC: Noto(SansOldItalic);
+	case UCDN_SCRIPT_OLD_PERSIAN: Noto(SansOldPersian);
+	case UCDN_SCRIPT_OLD_SOUTH_ARABIAN: Noto(SansOldSouthArabian);
+	case UCDN_SCRIPT_OLD_TURKIC: Noto(SansOldTurkic);
 	case UCDN_SCRIPT_OSMANYA: Noto(SansOsmanya);
-	case UCDN_SCRIPT_BUGINESE: Noto(SansBuginese);
-	case UCDN_SCRIPT_NEW_TAI_LUE: Noto(SansNewTaiLue);
-	case UCDN_SCRIPT_TIFINAGH: Noto(SansTifinagh);
-	case UCDN_SCRIPT_SYLOTI_NAGRI: Noto(SansSylotiNagri);
-	case UCDN_SCRIPT_BALINESE: Noto(SansBalinese);
+	case UCDN_SCRIPT_PHAGS_PA: Noto(SansPhagsPa);
 	case UCDN_SCRIPT_PHOENICIAN: Noto(SansPhoenician);
-	case UCDN_SCRIPT_NKO: Noto(SansNKo);
-	case UCDN_SCRIPT_SUNDANESE: Noto(SansSundanese);
-	case UCDN_SCRIPT_LEPCHA: Noto(SansLepcha);
-	case UCDN_SCRIPT_OL_CHIKI: Noto(SansOlChiki);
-	case UCDN_SCRIPT_VAI: Noto(SansVai);
-	case UCDN_SCRIPT_SAURASHTRA: Noto(SansSaurashtra);
-	case UCDN_SCRIPT_KAYAH_LI: Noto(SansKayahLi);
 	case UCDN_SCRIPT_REJANG: Noto(SansRejang);
-	case UCDN_SCRIPT_CHAM: Noto(SansCham);
-	case UCDN_SCRIPT_TAI_THAM: Noto(SansTaiTham);
-	case UCDN_SCRIPT_TAI_VIET: Noto(SansTaiViet);
+	case UCDN_SCRIPT_RUNIC: Noto(SansRunic);
 	case UCDN_SCRIPT_SAMARITAN: Noto(SansSamaritan);
-	case UCDN_SCRIPT_LISU: Noto(SansLisu);
-	case UCDN_SCRIPT_BAMUM: Noto(SansBamum);
-	case UCDN_SCRIPT_JAVANESE: Noto(SansJavanese);
-	case UCDN_SCRIPT_MEETEI_MAYEK: Noto(SansMeeteiMayek);
-	case UCDN_SCRIPT_BATAK: Noto(SansBatak);
-	case UCDN_SCRIPT_BRAHMI: Noto(SansBrahmi);
-	case UCDN_SCRIPT_MANDAIC: Noto(SansMandaic);
+	case UCDN_SCRIPT_SHAVIAN: Noto(SansShavian);
+	case UCDN_SCRIPT_TAGALOG: Noto(SansTagalog);
+	case UCDN_SCRIPT_TAGBANWA: Noto(SansTagbanwa);
+	case UCDN_SCRIPT_UGARITIC: Noto(SansUgaritic);
+#endif
 
 	/* No fonts available for these scripts: */
+	case UCDN_SCRIPT_BRAILLE: /* no dedicated font; fallback to NotoSansSymbols will cover this */
+	case UCDN_SCRIPT_CHAKMA: break;
+	case UCDN_SCRIPT_MIAO: break;
 #ifndef TOFU_HISTORIC
 	case UCDN_SCRIPT_AHOM: break;
+	case UCDN_SCRIPT_ANATOLIAN_HIEROGLYPHS: break;
 	case UCDN_SCRIPT_BASSA_VAH: break;
+	case UCDN_SCRIPT_CAUCASIAN_ALBANIAN: break;
+	case UCDN_SCRIPT_DUPLOYAN: break;
 	case UCDN_SCRIPT_ELBASAN: break;
 	case UCDN_SCRIPT_GRANTHA: break;
 	case UCDN_SCRIPT_HATRAN: break;
-	case UCDN_SCRIPT_ANATOLIAN_HIEROGLYPHS: break;
-	case UCDN_SCRIPT_OLD_HUNGARIAN: break;
 	case UCDN_SCRIPT_KHOJKI: break;
+	case UCDN_SCRIPT_KHUDAWADI: break;
 	case UCDN_SCRIPT_LINEAR_A: break;
 	case UCDN_SCRIPT_MAHAJANI: break;
 	case UCDN_SCRIPT_MANICHAEAN: break;
+	case UCDN_SCRIPT_MENDE_KIKAKUI: break;
 	case UCDN_SCRIPT_MEROITIC_CURSIVE: break;
 	case UCDN_SCRIPT_MEROITIC_HIEROGLYPHS: break;
 	case UCDN_SCRIPT_MODI: break;
+	case UCDN_SCRIPT_MRO: break;
 	case UCDN_SCRIPT_MULTANI: break;
-	case UCDN_SCRIPT_OLD_NORTH_ARABIAN: break;
 	case UCDN_SCRIPT_NABATAEAN: break;
-	case UCDN_SCRIPT_PALMYRENE: break;
+	case UCDN_SCRIPT_OLD_HUNGARIAN: break;
+	case UCDN_SCRIPT_OLD_NORTH_ARABIAN: break;
 	case UCDN_SCRIPT_OLD_PERMIC: break;
+	case UCDN_SCRIPT_PAHAWH_HMONG: break;
+	case UCDN_SCRIPT_PALMYRENE: break;
+	case UCDN_SCRIPT_PAU_CIN_HAU: break;
 	case UCDN_SCRIPT_PSALTER_PAHLAVI: break;
-	case UCDN_SCRIPT_SIDDHAM: break;
-#endif
-	case UCDN_SCRIPT_BRAILLE: break; /* no dedicated font */
-	case UCDN_SCRIPT_CHAKMA: break;
-	case UCDN_SCRIPT_MIAO: break;
 	case UCDN_SCRIPT_SHARADA: break;
+	case UCDN_SCRIPT_SIDDHAM: break;
+	case UCDN_SCRIPT_SIGNWRITING: break;
 	case UCDN_SCRIPT_SORA_SOMPENG: break;
 	case UCDN_SCRIPT_TAKRI: break;
-	case UCDN_SCRIPT_CAUCASIAN_ALBANIAN: break;
-	case UCDN_SCRIPT_DUPLOYAN: break;
-	case UCDN_SCRIPT_KHUDAWADI: break;
-	case UCDN_SCRIPT_MENDE_KIKAKUI: break;
-	case UCDN_SCRIPT_MRO: break;
-	case UCDN_SCRIPT_PAHAWH_HMONG: break;
-	case UCDN_SCRIPT_PAU_CIN_HAU: break;
 	case UCDN_SCRIPT_TIRHUTA: break;
 	case UCDN_SCRIPT_WARANG_CITI: break;
-	case UCDN_SCRIPT_SIGNWRITING: break;
+#endif
 
 #endif
 	}
diff --git a/source/fitz/text.c b/source/fitz/text.c
index 76838fe8..a0e2abc8 100644
--- a/source/fitz/text.c
+++ b/source/fitz/text.c
@@ -114,7 +114,7 @@ fz_show_string(fz_context *ctx, fz_text *text, fz_font *user_font, fz_matrix *tr
 	while (*s)
 	{
 		s += fz_chartorune(&ucs, s);
-		gid = fz_encode_character_with_fallback(ctx, user_font, ucs, 0, &font);
+		gid = fz_encode_character_with_fallback(ctx, user_font, ucs, 0, language, &font);
 		fz_show_glyph(ctx, text, font, trm, gid, ucs, wmode, bidi_level, markup_dir, language);
 		adv = fz_advance_glyph(ctx, font, gid, wmode);
 		if (wmode == 0)
@@ -227,6 +227,16 @@ fz_text_language fz_text_language_from_string(const char *str)
 	if (str == NULL)
 		return FZ_LANG_UNSET;
 
+	if (!strcmp(str, "zh-Hant") ||
+			!strcmp(str, "zh-HK") ||
+			!strcmp(str, "zh-MO") ||
+			!strcmp(str, "zh-SG") ||
+			!strcmp(str, "zh-TW"))
+		return FZ_LANG_zh_Hant;
+	if (!strcmp(str, "zh-Hans") ||
+			!strcmp(str, "zh-CN"))
+		return FZ_LANG_zh_Hans;
+
 	/* 1st char */
 	if (str[0] >= 'a' && str[0] <= 'z')
 		lang = str[0] - 'a' + 1;
@@ -254,23 +264,30 @@ fz_text_language fz_text_language_from_string(const char *str)
 	return lang;
 }
 
-char *fz_string_from_text_language(char str[4], fz_text_language lang)
+char *fz_string_from_text_language(char str[8], fz_text_language lang)
 {
 	int c;
 
-	/* str is supposed to be at least 4 chars in size */
+	/* str is supposed to be at least 8 chars in size */
 	if (str == NULL)
 		return NULL;
 
-	c = lang % 27;
-	lang = lang / 27;
-	str[0] = c == 0 ? 0 : c - 1 + 'a';
-	c = lang % 27;
-	lang = lang / 27;
-	str[1] = c == 0 ? 0 : c - 1 + 'a';
-	c = lang % 27;
-	str[2] = c == 0 ? 0 : c - 1 + 'a';
-	str[3] = 0;
+	if (lang == FZ_LANG_zh_Hant)
+		fz_strlcpy(str, "zh-Hant", 8);
+	else if (lang == FZ_LANG_zh_Hans)
+		fz_strlcpy(str, "zh-Hans", 8);
+	else
+	{
+		c = lang % 27;
+		lang = lang / 27;
+		str[0] = c == 0 ? 0 : c - 1 + 'a';
+		c = lang % 27;
+		lang = lang / 27;
+		str[1] = c == 0 ? 0 : c - 1 + 'a';
+		c = lang % 27;
+		str[2] = c == 0 ? 0 : c - 1 + 'a';
+		str[3] = 0;
+	}
 
 	return str;
 }
diff --git a/source/html/html-layout.c b/source/html/html-layout.c
index 95d6151a..e671c64d 100644
--- a/source/html/html-layout.c
+++ b/source/html/html-layout.c
@@ -154,6 +154,7 @@ static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html *top, fz_h
 	flow->type = type;
 	flow->expand = 0;
 	flow->bidi_level = 0;
+	flow->markup_lang = 0;
 	flow->breaks_line = 0;
 	flow->box = inline_box;
 	*top->flow_tail = flow;
@@ -182,12 +183,13 @@ static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html *top, fz_ht
 	(void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN);
 }
 
-static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html *top, fz_html *inline_box, const char *a, const char *b)
+static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html *top, fz_html *inline_box, const char *a, const char *b, int lang)
 {
 	fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD);
 	flow->content.text = fz_pool_alloc(ctx, pool, b - a + 1);
 	memcpy(flow->content.text, a, b - a);
 	flow->content.text[b - a] = 0;
+	flow->markup_lang = lang;
 }
 
 static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html *top, fz_html *inline_box, fz_image *img)
@@ -223,7 +225,7 @@ static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *fl
 	return new_flow;
 }
 
-static void flush_space(fz_context *ctx, fz_pool *pool, fz_html *flow, fz_html *inline_box, struct genstate *g)
+static void flush_space(fz_context *ctx, fz_pool *pool, fz_html *flow, fz_html *inline_box, int lang, struct genstate *g)
 {
 	static const char *space = " ";
 	int bsp = inline_box->style.white_space & WS_ALLOW_BREAK_SPACE;
@@ -234,7 +236,7 @@ static void flush_space(fz_context *ctx, fz_pool *pool, fz_html *flow, fz_html *
 			if (bsp)
 				add_flow_space(ctx, pool, flow, inline_box);
 			else
-				add_flow_word(ctx, pool, flow, inline_box, space, space+1);
+				add_flow_word(ctx, pool, flow, inline_box, space, space+1, lang);
 		}
 		g->emit_white = 0;
 	}
@@ -276,7 +278,7 @@ static const char *pairbrk[29] =
 	"_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */
 };
 
-static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const char *text, struct genstate *g)
+static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const char *text, int lang, struct genstate *g)
 {
 	fz_html *flow;
 
@@ -319,7 +321,7 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
 				if (bsp)
 					add_flow_space(ctx, pool, flow, box);
 				else
-					add_flow_word(ctx, pool, flow, box, space, space+1);
+					add_flow_word(ctx, pool, flow, box, space, space+1, lang);
 				++text;
 			}
 			g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
@@ -329,7 +331,7 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
 			const char *prev, *mark = text;
 			int c;
 
-			flush_space(ctx, pool, flow, box, g);
+			flush_space(ctx, pool, flow, box, lang, g);
 
 			if (g->at_bol)
 				g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;
@@ -341,7 +343,7 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
 				if (c == 0xAD) /* soft hyphen */
 				{
 					if (mark != prev)
-						add_flow_word(ctx, pool, flow, box, mark, prev);
+						add_flow_word(ctx, pool, flow, box, mark, prev, lang);
 					add_flow_shyphen(ctx, pool, flow, box);
 					mark = text;
 					g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
@@ -361,7 +363,7 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
 						if (brk == '_')
 						{
 							if (mark != prev)
-								add_flow_word(ctx, pool, flow, box, mark, prev);
+								add_flow_word(ctx, pool, flow, box, mark, prev, lang);
 							add_flow_sbreak(ctx, pool, flow, box);
 							mark = prev;
 						}
@@ -371,7 +373,7 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
 				}
 			}
 			if (mark != text)
-				add_flow_word(ctx, pool, flow, box, mark, text);
+				add_flow_word(ctx, pool, flow, box, mark, text, lang);
 
 			g->at_bol = 0;
 		}
@@ -420,12 +422,12 @@ static void generate_image(fz_context *ctx, fz_pool *pool, fz_html *box, fz_imag
 	while (flow->type != BOX_FLOW)
 		flow = flow->up;
 
-	flush_space(ctx, pool, flow, box, g);
+	flush_space(ctx, pool, flow, box, 0, g);
 
 	if (!img)
 	{
 		const char *alt = "[image]";
-		add_flow_word(ctx, pool, flow, box, alt, alt + 7);
+		add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0);
 	}
 	else
 	{
@@ -578,7 +580,7 @@ static void insert_inline_box(fz_context *ctx, fz_pool *pool, fz_html *box, fz_h
 }
 
 static void generate_boxes(fz_context *ctx, fz_xml *node, fz_html *top,
-		fz_css_match *up_match, int list_counter, int markup_dir, struct genstate *g)
+		fz_css_match *up_match, int list_counter, int markup_dir, int markup_lang, struct genstate *g)
 {
 	fz_css_match match;
 	fz_html *box;
@@ -656,9 +658,11 @@ static void generate_boxes(fz_context *ctx, fz_xml *node, fz_html *top,
 
 			else if (display != DIS_NONE)
 			{
+				const char *dir, *lang;
 				int child_dir = markup_dir;
+				int child_lang = markup_lang;
 
-				const char *dir = fz_xml_att(node, "dir");
+				dir = fz_xml_att(node, "dir");
 				if (dir)
 				{
 					if (!strcmp(dir, "auto"))
@@ -671,6 +675,10 @@ static void generate_boxes(fz_context *ctx, fz_xml *node, fz_html *top,
 						child_dir = DEFAULT_DIR;
 				}
 
+				lang = fz_xml_att(node, "lang");
+				if (lang)
+					child_lang = fz_text_language_from_string(lang);
+
 				box = new_box(ctx, g->pool, child_dir);
 				fz_apply_css_style(ctx, g->set, &box->style, &match);
 
@@ -698,7 +706,7 @@ static void generate_boxes(fz_context *ctx, fz_xml *node, fz_html *top,
 					int child_counter = list_counter;
 					if (!strcmp(tag, "ul") || !strcmp(tag, "ol"))
 						child_counter = 0;
-					generate_boxes(ctx, fz_xml_down(node), box, &match, child_counter, child_dir, g);
+					generate_boxes(ctx, fz_xml_down(node), box, &match, child_counter, child_dir, child_lang, g);
 				}
 			}
 		}
@@ -721,11 +729,11 @@ static void generate_boxes(fz_context *ctx, fz_xml *node, fz_html *top,
 					/* Make sure not to recursively multiply font sizes. */
 					box->style.font_size.value = 1;
 					box->style.font_size.unit = N_SCALE;
-					generate_text(ctx, g->pool, box, text, g);
+					generate_text(ctx, g->pool, box, text, markup_lang, g);
 				}
 				else
 				{
-					generate_text(ctx, g->pool, top, text, g);
+					generate_text(ctx, g->pool, top, text, markup_lang, g);
 				}
 			}
 		}
@@ -760,6 +768,7 @@ typedef struct string_walker
 	const char *s;
 	fz_font *base_font;
 	int script;
+	int language;
 	fz_font *font;
 	fz_font *next_font;
 	hb_glyph_position_t *glyph_pos;
@@ -813,7 +822,7 @@ static int quick_ligature(fz_context *ctx, string_walker *walker, unsigned int i
 	return walker->glyph_info[i].codepoint;
 }
 
-static void init_string_walker(fz_context *ctx, string_walker *walker, hb_buffer_t *hb_buf, int rtl, fz_font *font, int script, const char *text)
+static void init_string_walker(fz_context *ctx, string_walker *walker, hb_buffer_t *hb_buf, int rtl, fz_font *font, int script, int language, const char *text)
 {
 	walker->ctx = ctx;
 	walker->hb_buf = hb_buf;
@@ -823,6 +832,7 @@ static void init_string_walker(fz_context *ctx, string_walker *walker, hb_buffer
 	walker->s = text;
 	walker->base_font = font;
 	walker->script = script;
+	walker->language = language;
 	walker->font = NULL;
 	walker->next_font = NULL;
 }
@@ -835,6 +845,7 @@ static int walk_string(string_walker *walker)
 	FT_Face face;
 	int fterr;
 	int quickshape;
+	char lang[8];
 
 	walker->start = walker->end;
 	walker->end = walker->s;
@@ -850,7 +861,7 @@ static int walk_string(string_walker *walker)
 		int c;
 
 		walker->s += fz_chartorune(&c, walker->s);
-		(void)fz_encode_character_with_fallback(ctx, walker->base_font, c, walker->script, &walker->next_font);
+		(void)fz_encode_character_with_fallback(ctx, walker->base_font, c, walker->script, walker->language, &walker->next_font);
 		if (walker->next_font != walker->font)
 		{
 			if (walker->font != NULL)
@@ -876,8 +887,12 @@ static int walk_string(string_walker *walker)
 
 		hb_buffer_clear_contents(walker->hb_buf);
 		hb_buffer_set_direction(walker->hb_buf, walker->rtl ? HB_DIRECTION_RTL : HB_DIRECTION_LTR);
-		/* hb_buffer_set_script(hb_buf, hb_ucdn_script_translate(script)); */
-		/* hb_buffer_set_language(hb_buf, hb_language_from_string("en", strlen("en"))); */
+		/* hb_buffer_set_script(walker->hb_buf, hb_ucdn_script_translate(walker->script)); */
+		if (walker->language)
+		{
+			fz_string_from_text_language(lang, walker->language);
+			hb_buffer_set_language(walker->hb_buf, hb_language_from_string(lang, strlen(lang)));
+		}
 		/* hb_buffer_set_cluster_level(hb_buf, HB_BUFFER_CLUSTER_LEVEL_CHARACTERS); */
 
 		hb_buffer_add_utf8(walker->hb_buf, walker->start, walker->end - walker->start, 0, -1);
@@ -955,7 +970,7 @@ static void measure_string(fz_context *ctx, fz_html_flow *node, hb_buffer_t *hb_
 	node->h = fz_from_css_number_scale(node->box->style.line_height, em, em, em);
 
 	s = get_node_text(ctx, node);
-	init_string_walker(ctx, &walker, hb_buf, node->bidi_level & 1, node->box->style.font, node->script, s);
+	init_string_walker(ctx, &walker, hb_buf, node->bidi_level & 1, node->box->style.font, node->script, node->markup_lang, s);
 	while (walk_string(&walker))
 	{
 		int x = 0;
@@ -1488,7 +1503,7 @@ static void draw_flow_box(fz_context *ctx, fz_html *box, float page_top, float p
 			trm.f = y;
 
 			s = get_node_text(ctx, node);
-			init_string_walker(ctx, &walker, hb_buf, node->bidi_level & 1, style->font, node->script, s);
+			init_string_walker(ctx, &walker, hb_buf, node->bidi_level & 1, style->font, node->script, node->markup_lang, s);
 			while (walk_string(&walker))
 			{
 				float node_scale = node->box->em / walker.scale;
@@ -1726,7 +1741,7 @@ static void draw_list_mark(fz_context *ctx, fz_html *box, float page_top, float
 	while (*s)
 	{
 		s += fz_chartorune(&c, s);
-		g = fz_encode_character_with_fallback(ctx, box->style.font, c, UCDN_SCRIPT_LATIN, &font);
+		g = fz_encode_character_with_fallback(ctx, box->style.font, c, UCDN_SCRIPT_LATIN, FZ_LANG_UNSET, &font);
 		w += fz_advance_glyph(ctx, font, g, 0) * box->em;
 	}
 
@@ -1736,7 +1751,7 @@ static void draw_list_mark(fz_context *ctx, fz_html *box, float page_top, float
 	while (*s)
 	{
 		s += fz_chartorune(&c, s);
-		g = fz_encode_character_with_fallback(ctx, box->style.font, c, UCDN_SCRIPT_LATIN, &font);
+		g = fz_encode_character_with_fallback(ctx, box->style.font, c, UCDN_SCRIPT_LATIN, FZ_LANG_UNSET, &font);
 		fz_show_glyph(ctx, text, font, &trm, g, c, 0, 0, FZ_BIDI_NEUTRAL, FZ_LANG_UNSET);
 		trm.e += fz_advance_glyph(ctx, font, g, 0) * box->em;
 	}
@@ -2301,7 +2316,7 @@ fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const cha
 	fz_apply_css_style(ctx, g.set, &box->style, &match);
 	// TODO: transfer page margins out of this hacky box
 
-	generate_boxes(ctx, xml, box, &match, 0, DEFAULT_DIR, &g);
+	generate_boxes(ctx, xml, box, &match, 0, DEFAULT_DIR, FZ_LANG_UNSET, &g);
 
 	fz_drop_css(ctx, g.css);
 	fz_drop_xml(ctx, xml);
author	Tor Andersson <tor.andersson@artifex.com>	2016-06-23 13:41:53 +0200
committer	Robin Watts <robin.watts@artifex.com>	2016-06-23 16:10:29 +0100
commit	6e48c939dc9913a6af747d5b6961624551c8d90d (patch)
tree	4b3f69c237dc7af00606c87ac94cd2c4893da628 /source
parent	cf7b2cbbfe6192fbf697237735ab45bc951304e4 (diff)
download	mupdf-6e48c939dc9913a6af747d5b6961624551c8d90d.tar.xz