diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2018-11-13 23:03:25 +0100 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2018-11-13 23:45:37 +0100 |
commit | 955573c9792c6992398072f83a1ae0de1a29dcd9 (patch) | |
tree | 0e148527480321df31f51451bab50473ede2f76a /source | |
parent | 1fc1ef4d35c724b48428d9338b392a07d8150d58 (diff) | |
download | mupdf-955573c9792c6992398072f83a1ae0de1a29dcd9.tar.xz |
Bug 697414: Allow more encodings in XML parser.
Support some common FB2 encodings as well as UTF-8 and UTF-16.
Diffstat (limited to 'source')
-rw-r--r-- | source/fitz/xml.c | 56 |
1 files changed, 55 insertions, 1 deletions
```diff
diff --git a/source/fitz/xml.c b/source/fitz/xml.c
index b7c89cf4..6b117479 100644
--- a/source/fitz/xml.c
+++ b/source/fitz/xml.c
@@ -581,8 +581,50 @@ parse_attribute_value:
 	return "end of data in attribute value";
 }
 
-static char *convert_to_utf8(fz_context *doc, const unsigned char *s, size_t n, int *dofree)
+static int startswith(const char *a, const char *b)
 {
+	return !fz_strncasecmp(a, b, strlen(b));
+}
+
+static const unsigned short *find_xml_encoding(char *s)
+{
+	const unsigned short *table = NULL;
+	char *end, *xml, *enc;
+
+	end = strchr(s, '>');
+	if (end)
+	{
+		*end = 0;
+		xml = strstr(s, "<?xml");
+		if (xml)
+		{
+			enc = strstr(xml, "encoding=");
+			if (enc)
+			{
+				enc += 10;
+				if (startswith(enc, "iso-8859-1") || startswith(enc, "latin1"))
+					table = fz_unicode_from_iso8859_1;
+				else if (startswith(enc, "iso-8859-7") || startswith(enc, "greek"))
+					table = fz_unicode_from_iso8859_7;
+				else if (startswith(enc, "koi8"))
+					table = fz_unicode_from_koi8u;
+				else if (startswith(enc, "windows-1250"))
+					table = fz_unicode_from_windows_1250;
+				else if (startswith(enc, "windows-1251"))
+					table = fz_unicode_from_windows_1251;
+				else if (startswith(enc, "windows-1252"))
+					table = fz_unicode_from_windows_1252;
+			}
+		}
+		*end = '>';
+	}
+
+	return table;
+}
+
+static char *convert_to_utf8(fz_context *doc, unsigned char *s, size_t n, int *dofree)
+{
+	const unsigned short *table;
 	const unsigned char *e = s + n;
 	char *dst, *d;
 	int c;
@@ -613,6 +655,18 @@ static char *convert_to_utf8(fz_context *doc, const unsigned char *s, size_t n,
 		return dst;
 	}
 
+	table = find_xml_encoding((char*)s);
+	if (table) {
+		dst = d = fz_malloc(doc, n * FZ_UTFMAX);
+		while (*s) {
+			c = table[*s++];
+			d += fz_runetochar(d, c);
+		}
+		*d = 0;
+		*dofree = 1;
+		return dst;
+	}
+
 	*dofree = 0;
 
 	if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
```