summary | refs | log | tree | commit | diff
path: root/source/pdf
diff options
context:
space:
mode:
author: Tor Andersson <tor.andersson@artifex.com> 2016-10-06 12:54:21 +0200
committer: Tor Andersson <tor.andersson@artifex.com> 2016-10-07 17:22:59 +0200
commit: 5ae898c46a8c7c1a4612808619b5037e74c83622 (patch)
tree: c6a2c1d70114eab6f9a2905e1184a203f1afcce1 /source/pdf
parent: 85e0700f52581793de276ba4763de066082909c1 (diff)
download: mupdf-5ae898c46a8c7c1a4612808619b5037e74c83622.tar.xz
pdf: Separate functions to read text strings and text streams as UTF-8.
The stream loading is used only by the JS code loading.
Diffstat (limited to 'source/pdf')
-rw-r--r-- source/pdf/pdf-form.c | 2
-rw-r--r-- source/pdf/pdf-js.c | 2
-rw-r--r-- source/pdf/pdf-parse.c | 161
3 files changed, 87 insertions, 78 deletions
diff --git a/source/pdf/pdf-form.c b/source/pdf/pdf-form.c
index dfaefe98..4fbccfa2 100644
--- a/source/pdf/pdf-form.c
+++ b/source/pdf/pdf-form.c
@@ -340,7 +340,7 @@ static void execute_action(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf
pdf_obj *js = pdf_dict_get(ctx, a, PDF_NAME_JS);
if (js)
{
- char *code = pdf_to_utf8(ctx, js);
+ char *code = pdf_load_stream_or_string_as_utf8(ctx, js);
fz_try(ctx)
{
pdf_js_execute(doc->js, code);
diff --git a/source/pdf/pdf-js.c b/source/pdf/pdf-js.c
index bc74eedc..4fc6dc98 100644
--- a/source/pdf/pdf-js.c
+++ b/source/pdf/pdf-js.c
@@ -646,7 +646,7 @@ static void pdf_js_load_document_level(pdf_js *js)
{
pdf_obj *fragment = pdf_dict_get_val(ctx, javascript, i);
pdf_obj *code = pdf_dict_get(ctx, fragment, PDF_NAME_JS);
- char *codebuf = pdf_to_utf8(ctx, code);
+ char *codebuf = pdf_load_stream_or_string_as_utf8(ctx, code);
pdf_js_execute(js, codebuf);
fz_free(ctx, codebuf);
}
diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c
index 74990ead..a6d341b3 100644
--- a/source/pdf/pdf-parse.c
+++ b/source/pdf/pdf-parse.c
@@ -55,116 +55,125 @@ rune_from_utf16be(int *out, unsigned char *s, unsigned char *end)
return 1;
}
-/* Convert Unicode/PdfDocEncoding string into utf-8 */
char *
-pdf_to_utf8(fz_context *ctx, pdf_obj *src)
+pdf_to_utf8_imp(fz_context *ctx, unsigned char *srcptr, size_t srclen)
{
- fz_buffer *stmbuf = NULL;
- unsigned char *srcptr;
char *dstptr, *dst;
- size_t srclen;
size_t dstlen = 0;
int ucs;
size_t i;
- fz_var(stmbuf);
- fz_try(ctx)
+ /* UTF-16BE */
+ if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
{
- if (pdf_is_string(ctx, src))
- {
- srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
- srclen = pdf_to_str_len(ctx, src);
- }
- else if (pdf_is_stream(ctx, src))
- {
- stmbuf = pdf_load_stream(ctx, src);
- srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr);
- }
- else
+ i = 2;
+ while (i + 2 <= srclen)
{
- srclen = 0;
- }
-
- /* UTF-16BE */
- if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
- {
- i = 2;
- while (i + 2 <= srclen)
- {
- /* skip language escape codes */
- if (i + 6 <= srclen &&
+ /* skip language escape codes */
+ if (i + 6 <= srclen &&
srcptr[i+0] == 0 && srcptr[i+1] == 27 &&
srcptr[i+4] == 0 && srcptr[i+5] == 27)
- {
- i += 6;
- }
- else if (i + 8 <= srclen &&
+ {
+ i += 6;
+ }
+ else if (i + 8 <= srclen &&
srcptr[i+0] == 0 && srcptr[i+1] == 27 &&
srcptr[i+6] == 0 && srcptr[i+7] == 27)
- {
- i += 8;
- }
- else
- {
- i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
- dstlen += fz_runelen(ucs);
- }
+ {
+ i += 8;
+ }
+ else
+ {
+ i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
+ dstlen += fz_runelen(ucs);
}
+ }
- dstptr = dst = fz_malloc(ctx, dstlen + 1);
+ dstptr = dst = fz_malloc(ctx, dstlen + 1);
- i = 2;
- while (i + 2 <= srclen)
- {
- /* skip language escape codes */
- if (i + 6 <= srclen &&
+ i = 2;
+ while (i + 2 <= srclen)
+ {
+ /* skip language escape codes */
+ if (i + 6 <= srclen &&
srcptr[i+0] == 0 && srcptr[i+1] == 27 &&
srcptr[i+4] == 0 && srcptr[i+5] == 27)
- {
- i += 6;
- }
- else if (i + 8 <= srclen &&
+ {
+ i += 6;
+ }
+ else if (i + 8 <= srclen &&
srcptr[i+0] == 0 && srcptr[i+1] == 27 &&
srcptr[i+6] == 0 && srcptr[i+7] == 27)
- {
- i += 8;
- }
- else
- {
- i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
- dstptr += fz_runetochar(dstptr, ucs);
- }
+ {
+ i += 8;
+ }
+ else
+ {
+ i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
+ dstptr += fz_runetochar(dstptr, ucs);
}
}
+ }
- /* PDFDocEncoding */
- else
- {
- for (i = 0; i < srclen; i++)
- dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]);
+ /* PDFDocEncoding */
+ else
+ {
+ for (i = 0; i < srclen; i++)
+ dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]);
- dstptr = dst = fz_malloc(ctx, dstlen + 1);
+ dstptr = dst = fz_malloc(ctx, dstlen + 1);
- for (i = 0; i < srclen; i++)
- {
- ucs = pdf_doc_encoding[srcptr[i]];
- dstptr += fz_runetochar(dstptr, ucs);
- }
+ for (i = 0; i < srclen; i++)
+ {
+ ucs = pdf_doc_encoding[srcptr[i]];
+ dstptr += fz_runetochar(dstptr, ucs);
}
}
+
+ *dstptr = 0;
+ return dst;
+}
+
+/* Convert Unicode/PdfDocEncoding string into utf-8 */
+char *
+pdf_to_utf8(fz_context *ctx, pdf_obj *src)
+{
+ unsigned char *srcptr;
+ size_t srclen;
+ srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
+ srclen = pdf_to_str_len(ctx, src);
+ return pdf_to_utf8_imp(ctx, srcptr, srclen);
+}
+
+/* Load text stream and convert to UTF-8 */
+char *
+pdf_load_stream_as_utf8(fz_context *ctx, pdf_obj *src)
+{
+ fz_buffer *stmbuf;
+ unsigned char *srcptr;
+ size_t srclen;
+ char *dst;
+
+ stmbuf = pdf_load_stream(ctx, src);
+ srclen = fz_buffer_storage(ctx, stmbuf, &srcptr);
+ fz_try(ctx)
+ dst = pdf_to_utf8_imp(ctx, srcptr, srclen);
fz_always(ctx)
- {
fz_drop_buffer(ctx, stmbuf);
- }
fz_catch(ctx)
- {
fz_rethrow(ctx);
- }
-
- *dstptr = '\0';
return dst;
}
+/* Load text stream or text string and convert to UTF-8 */
+char *
+pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src)
+{
+ if (pdf_is_stream(ctx, src))
+ return pdf_load_stream_as_utf8(ctx, src);
+ return pdf_to_utf8(ctx, src);
+}
+
/* Convert Unicode/PdfDocEncoding string into ucs-2 */
unsigned short *
pdf_to_ucs2(fz_context *ctx, pdf_obj *src)