From 5ae898c46a8c7c1a4612808619b5037e74c83622 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Thu, 6 Oct 2016 12:54:21 +0200 Subject: pdf: Separate functions to read text strings and text streams as UTF-8. The stream loading is used only by the JS code loading. --- source/pdf/pdf-form.c | 2 +- source/pdf/pdf-js.c | 2 +- source/pdf/pdf-parse.c | 161 ++++++++++++++++++++++++++----------------------- 3 files changed, 87 insertions(+), 78 deletions(-) (limited to 'source/pdf') diff --git a/source/pdf/pdf-form.c b/source/pdf/pdf-form.c index dfaefe98..4fbccfa2 100644 --- a/source/pdf/pdf-form.c +++ b/source/pdf/pdf-form.c @@ -340,7 +340,7 @@ static void execute_action(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf pdf_obj *js = pdf_dict_get(ctx, a, PDF_NAME_JS); if (js) { - char *code = pdf_to_utf8(ctx, js); + char *code = pdf_load_stream_or_string_as_utf8(ctx, js); fz_try(ctx) { pdf_js_execute(doc->js, code); diff --git a/source/pdf/pdf-js.c b/source/pdf/pdf-js.c index bc74eedc..4fc6dc98 100644 --- a/source/pdf/pdf-js.c +++ b/source/pdf/pdf-js.c @@ -646,7 +646,7 @@ static void pdf_js_load_document_level(pdf_js *js) { pdf_obj *fragment = pdf_dict_get_val(ctx, javascript, i); pdf_obj *code = pdf_dict_get(ctx, fragment, PDF_NAME_JS); - char *codebuf = pdf_to_utf8(ctx, code); + char *codebuf = pdf_load_stream_or_string_as_utf8(ctx, code); pdf_js_execute(js, codebuf); fz_free(ctx, codebuf); } diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c index 74990ead..a6d341b3 100644 --- a/source/pdf/pdf-parse.c +++ b/source/pdf/pdf-parse.c @@ -55,116 +55,125 @@ rune_from_utf16be(int *out, unsigned char *s, unsigned char *end) return 1; } -/* Convert Unicode/PdfDocEncoding string into utf-8 */ char * -pdf_to_utf8(fz_context *ctx, pdf_obj *src) +pdf_to_utf8_imp(fz_context *ctx, unsigned char *srcptr, size_t srclen) { - fz_buffer *stmbuf = NULL; - unsigned char *srcptr; char *dstptr, *dst; - size_t srclen; size_t dstlen = 0; int ucs; size_t i; - fz_var(stmbuf); - fz_try(ctx) + /* UTF-16BE */ + if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) { - if (pdf_is_string(ctx, src)) - { - srcptr = (unsigned char *) pdf_to_str_buf(ctx, src); - srclen = pdf_to_str_len(ctx, src); - } - else if (pdf_is_stream(ctx, src)) - { - stmbuf = pdf_load_stream(ctx, src); - srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr); - } - else + i = 2; + while (i + 2 <= srclen) { - srclen = 0; - } - - /* UTF-16BE */ - if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) - { - i = 2; - while (i + 2 <= srclen) - { - /* skip language escape codes */ - if (i + 6 <= srclen && + /* skip language escape codes */ + if (i + 6 <= srclen && srcptr[i+0] == 0 && srcptr[i+1] == 27 && srcptr[i+4] == 0 && srcptr[i+5] == 27) - { - i += 6; - } - else if (i + 8 <= srclen && + { + i += 6; + } + else if (i + 8 <= srclen && srcptr[i+0] == 0 && srcptr[i+1] == 27 && srcptr[i+6] == 0 && srcptr[i+7] == 27) - { - i += 8; - } - else - { - i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen); - dstlen += fz_runelen(ucs); - } + { + i += 8; + } + else + { + i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen); + dstlen += fz_runelen(ucs); } + } - dstptr = dst = fz_malloc(ctx, dstlen + 1); + dstptr = dst = fz_malloc(ctx, dstlen + 1); - i = 2; - while (i + 2 <= srclen) - { - /* skip language escape codes */ - if (i + 6 <= srclen && + i = 2; + while (i + 2 <= srclen) + { + /* skip language escape codes */ + if (i + 6 <= srclen && srcptr[i+0] == 0 && srcptr[i+1] == 27 && srcptr[i+4] == 0 && srcptr[i+5] == 27) - { - i += 6; - } - else if (i + 8 <= srclen && + { + i += 6; + } + else if (i + 8 <= srclen && srcptr[i+0] == 0 && srcptr[i+1] == 27 && srcptr[i+6] == 0 && srcptr[i+7] == 27) - { - i += 8; - } - else - { - i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen); - dstptr += fz_runetochar(dstptr, ucs); - } + { + i += 8; + } + else + { + i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen); + dstptr += fz_runetochar(dstptr, ucs); } } + } - /* PDFDocEncoding */ - else - { - for (i = 0; i < srclen; i++) - dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]); + /* PDFDocEncoding */ + else + { + for (i = 0; i < srclen; i++) + dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]); - dstptr = dst = fz_malloc(ctx, dstlen + 1); + dstptr = dst = fz_malloc(ctx, dstlen + 1); - for (i = 0; i < srclen; i++) - { - ucs = pdf_doc_encoding[srcptr[i]]; - dstptr += fz_runetochar(dstptr, ucs); - } + for (i = 0; i < srclen; i++) + { + ucs = pdf_doc_encoding[srcptr[i]]; + dstptr += fz_runetochar(dstptr, ucs); } } + + *dstptr = 0; + return dst; +} + +/* Convert Unicode/PdfDocEncoding string into utf-8 */ +char * +pdf_to_utf8(fz_context *ctx, pdf_obj *src) +{ + unsigned char *srcptr; + size_t srclen; + srcptr = (unsigned char *) pdf_to_str_buf(ctx, src); + srclen = pdf_to_str_len(ctx, src); + return pdf_to_utf8_imp(ctx, srcptr, srclen); +} + +/* Load text stream and convert to UTF-8 */ +char * +pdf_load_stream_as_utf8(fz_context *ctx, pdf_obj *src) +{ + fz_buffer *stmbuf; + unsigned char *srcptr; + size_t srclen; + char *dst; + + stmbuf = pdf_load_stream(ctx, src); + srclen = fz_buffer_storage(ctx, stmbuf, &srcptr); + fz_try(ctx) + dst = pdf_to_utf8_imp(ctx, srcptr, srclen); fz_always(ctx) - { fz_drop_buffer(ctx, stmbuf); - } fz_catch(ctx) - { fz_rethrow(ctx); - } - - *dstptr = '\0'; return dst; } +/* Load text stream or text string and convert to UTF-8 */ +char * +pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src) +{ + if (pdf_is_stream(ctx, src)) + return pdf_load_stream_as_utf8(ctx, src); + return pdf_to_utf8(ctx, src); +} + /* Convert Unicode/PdfDocEncoding string into ucs-2 */ unsigned short * pdf_to_ucs2(fz_context *ctx, pdf_obj *src) -- cgit v1.2.3