summaryrefslogtreecommitdiff
path: root/source
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2016-10-06 16:43:51 +0200
committerTor Andersson <tor.andersson@artifex.com>2016-10-07 17:52:50 +0200
commitb18795e1019b59e60241b8ebd1a80239ae8c984c (patch)
treef30081d90245b834a5c4700691f3938df8dc198d /source
parent420bd4b2232a7f7c02bc1550ea657a2cf9da6486 (diff)
downloadmupdf-b18795e1019b59e60241b8ebd1a80239ae8c984c.tar.xz
pdf: Support UTF-8 encoded text strings.
New in PDF 2.0.
Diffstat (limited to 'source')
-rw-r--r--source/pdf/pdf-parse.c85
1 files changed, 58 insertions, 27 deletions
diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c
index a6d341b3..5b77a466 100644
--- a/source/pdf/pdf-parse.c
+++ b/source/pdf/pdf-parse.c
@@ -55,13 +55,35 @@ rune_from_utf16be(int *out, unsigned char *s, unsigned char *end)
return 1;
}
+static size_t
+skip_language_code_utf16be(unsigned char *s, size_t n, size_t i)
+{
+ /* skip language escape codes */
+ if (i + 6 <= n && s[i+0] == 0 && s[i+1] == 27 && s[i+4] == 0 && s[i+5] == 27)
+ return 6;
+ else if (i + 8 <= n && s[i+0] == 0 && s[i+1] == 27 && s[i+6] == 0 && s[i+7] == 27)
+ return 8;
+ return 0;
+}
+
+static size_t
+skip_language_code_utf8(unsigned char *s, size_t n, size_t i)
+{
+ /* skip language escape codes */
+ if (i + 3 <= n && s[i] == 27 && s[i+3])
+ return 3;
+ else if (i + 5 <= n && s[i] == 27 && s[i+5] == 27)
+ return 5;
+ return 0;
+}
+
char *
pdf_to_utf8_imp(fz_context *ctx, unsigned char *srcptr, size_t srclen)
{
char *dstptr, *dst;
size_t dstlen = 0;
int ucs;
- size_t i;
+ size_t i, n;
/* UTF-16BE */
if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
@@ -69,19 +91,9 @@ pdf_to_utf8_imp(fz_context *ctx, unsigned char *srcptr, size_t srclen)
i = 2;
while (i + 2 <= srclen)
{
- /* skip language escape codes */
- if (i + 6 <= srclen &&
- srcptr[i+0] == 0 && srcptr[i+1] == 27 &&
- srcptr[i+4] == 0 && srcptr[i+5] == 27)
- {
- i += 6;
- }
- else if (i + 8 <= srclen &&
- srcptr[i+0] == 0 && srcptr[i+1] == 27 &&
- srcptr[i+6] == 0 && srcptr[i+7] == 27)
- {
- i += 8;
- }
+ n = skip_language_code_utf16be(srcptr, srclen, i);
+ if (n)
+ i += n;
else
{
i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
@@ -94,19 +106,9 @@ pdf_to_utf8_imp(fz_context *ctx, unsigned char *srcptr, size_t srclen)
i = 2;
while (i + 2 <= srclen)
{
- /* skip language escape codes */
- if (i + 6 <= srclen &&
- srcptr[i+0] == 0 && srcptr[i+1] == 27 &&
- srcptr[i+4] == 0 && srcptr[i+5] == 27)
- {
- i += 6;
- }
- else if (i + 8 <= srclen &&
- srcptr[i+0] == 0 && srcptr[i+1] == 27 &&
- srcptr[i+6] == 0 && srcptr[i+7] == 27)
- {
- i += 8;
- }
+ n = skip_language_code_utf16be(srcptr, srclen, i);
+ if (n)
+ i += n;
else
{
i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
@@ -115,6 +117,35 @@ pdf_to_utf8_imp(fz_context *ctx, unsigned char *srcptr, size_t srclen)
}
}
+ /* UTF-8 */
+ else if (srclen >= 3 && srcptr[0] == 239 && srcptr[1] == 187 && srcptr[2] == 191)
+ {
+ i = 3;
+ while (i < srclen)
+ {
+ n = skip_language_code_utf8(srcptr, srclen, i);
+ if (n)
+ i += n;
+ else
+ {
+ i += 1;
+ dstlen += 1;
+ }
+ }
+
+ dstptr = dst = fz_malloc(ctx, dstlen + 1);
+
+ i = 3;
+ while (i < srclen)
+ {
+ n = skip_language_code_utf8(srcptr, srclen, i);
+ if (n)
+ i += n;
+ else
+ *dstptr++ = srcptr[i++];
+ }
+ }
+
/* PDFDocEncoding */
else
{