summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Tor Andersson <tor.andersson@artifex.com> 2018-11-13 23:03:25 +0100
committer: Tor Andersson <tor.andersson@artifex.com> 2018-11-13 23:45:37 +0100
commit: 955573c9792c6992398072f83a1ae0de1a29dcd9 (patch)
tree: 0e148527480321df31f51451bab50473ede2f76a
parent: 1fc1ef4d35c724b48428d9338b392a07d8150d58 (diff)
download: mupdf-955573c9792c6992398072f83a1ae0de1a29dcd9.tar.xz
Bug 697414: Allow more encodings in XML parser.
Support some common FB2 encodings as well as UTF-8 and UTF-16.
-rw-r--r-- source/fitz/xml.c 56
1 files changed, 55 insertions, 1 deletions
diff --git a/source/fitz/xml.c b/source/fitz/xml.c
index b7c89cf4..6b117479 100644
--- a/source/fitz/xml.c
+++ b/source/fitz/xml.c
@@ -581,8 +581,50 @@ parse_attribute_value:
return "end of data in attribute value";
}
-static char *convert_to_utf8(fz_context *doc, const unsigned char *s, size_t n, int *dofree)
+/* Return 1 when string a begins with prefix b (compared with fz_strncasecmp), else 0. */
+static int startswith(const char *a, const char *b)
+{
+	size_t prefix_len = strlen(b);
+
+	if (fz_strncasecmp(a, b, prefix_len) != 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * Look for an encoding="..." pseudo-attribute in a leading <?xml ...?>
+ * declaration and map its name to a byte-to-Unicode conversion table.
+ *
+ * s: NUL-terminated document text. The first '>' is temporarily replaced
+ *    by a terminator so the searches stay inside the first tag; it is put
+ *    back before returning.
+ *
+ * Returns the matching table, or NULL when s has no '>', no "<?xml"
+ * followed by a quoted "encoding=" value before that '>', or the name is
+ * not one of those listed below. Names are matched as prefixes by
+ * startswith().
+ */
+static const unsigned short *find_xml_encoding(char *s)
+{
+	const unsigned short *table = NULL;
+	char *end, *xml, *enc;
+
+	end = strchr(s, '>');
+	if (!end)
+		return NULL;
+
+	*end = 0;
+
+	xml = strstr(s, "<?xml");
+	enc = xml ? strstr(xml, "encoding=") : NULL;
+	if (enc)
+	{
+		enc += strlen("encoding=");
+
+		/*
+		 * Only step over a real opening quote. Skipping a character
+		 * blindly could jump over the temporary terminator and match
+		 * against text that lies outside the declaration.
+		 */
+		if (*enc == '"' || *enc == '\'')
+		{
+			++enc;
+			if (startswith(enc, "iso-8859-1") || startswith(enc, "latin1"))
+				table = fz_unicode_from_iso8859_1;
+			else if (startswith(enc, "iso-8859-7") || startswith(enc, "greek"))
+				table = fz_unicode_from_iso8859_7;
+			else if (startswith(enc, "koi8"))
+				table = fz_unicode_from_koi8u;
+			else if (startswith(enc, "windows-1250"))
+				table = fz_unicode_from_windows_1250;
+			else if (startswith(enc, "windows-1251"))
+				table = fz_unicode_from_windows_1251;
+			else if (startswith(enc, "windows-1252"))
+				table = fz_unicode_from_windows_1252;
+		}
+	}
+
+	/* Undo the temporary termination so the caller sees the text unchanged. */
+	*end = '>';
+
+	return table;
+}
+
+static char *convert_to_utf8(fz_context *doc, unsigned char *s, size_t n, int *dofree)
+{
+ const unsigned short *table;
const unsigned char *e = s + n;
char *dst, *d;
int c;
@@ -613,6 +655,18 @@ static char *convert_to_utf8(fz_context *doc, const unsigned char *s, size_t n,
return dst;
}
+ table = find_xml_encoding((char*)s);
+ if (table) {
+ dst = d = fz_malloc(doc, n * FZ_UTFMAX);
+ while (*s) {
+ c = table[*s++];
+ d += fz_runetochar(d, c);
+ }
+ *d = 0;
+ *dofree = 1;
+ return dst;
+ }
+
*dofree = 0;
if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)