summaryrefslogtreecommitdiff
path: root/source/pdf/pdf-parse.c
diff options
context:
space:
mode:
Diffstat (limited to 'source/pdf/pdf-parse.c')
-rw-r--r--source/pdf/pdf-parse.c611
1 files changed, 611 insertions, 0 deletions
diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c
new file mode 100644
index 00000000..18ab3113
--- /dev/null
+++ b/source/pdf/pdf-parse.c
@@ -0,0 +1,611 @@
+#include "mupdf/pdf.h"
+
+/*
+	Fill in a rectangle from entries 0..3 of a PDF array.
+
+	Entries 0/1 and 2/3 are treated as two corner points. For each
+	axis the smaller value is stored in x0/y0 and the larger one in
+	x1/y1, so the corners may be given in either order.
+
+	ctx is not used by this function.
+
+	Returns r.
+*/
+fz_rect *
+pdf_to_rect(fz_context *ctx, pdf_obj *array, fz_rect *r)
+{
+	float v[4];
+	int k;
+
+	/* Read all four numbers before touching the output rectangle. */
+	for (k = 0; k < 4; k++)
+		v[k] = pdf_to_real(pdf_array_get(array, k));
+
+	r->x0 = fz_min(v[0], v[2]);
+	r->y0 = fz_min(v[1], v[3]);
+	r->x1 = fz_max(v[0], v[2]);
+	r->y1 = fz_max(v[1], v[3]);
+
+	return r;
+}
+
+/*
+	Fill in a matrix from entries 0..5 of a PDF array.
+
+	The entries are stored, in order, into the a, b, c, d, e and f
+	members of m. ctx is not used by this function.
+
+	Returns m.
+*/
+fz_matrix *
+pdf_to_matrix(fz_context *ctx, pdf_obj *array, fz_matrix *m)
+{
+	float *coef[6] = { &m->a, &m->b, &m->c, &m->d, &m->e, &m->f };
+	int k;
+
+	/* Each coefficient is stored as soon as it has been read. */
+	for (k = 0; k < 6; k++)
+		*coef[k] = pdf_to_real(pdf_array_get(array, k));
+
+	return m;
+}
+
+/*
+	Decode the code point that starts at byte offset i of s.
+
+	unit is the number of bytes per code unit: 1 selects a lookup in
+	pdf_doc_encoding, 2 selects a 16 bit value whose byte order is
+	given by big_endian.
+*/
+static int
+pdf_text_rune_at(const unsigned char *s, int i, int unit, int big_endian)
+{
+	if (unit == 1)
+		return pdf_doc_encoding[s[i]];
+	if (big_endian)
+		return s[i] << 8 | s[i+1];
+	return s[i] | s[i+1] << 8;
+}
+
+/*
+	Convert Unicode/PdfDocEncoding string into utf-8.
+
+	src may be a string object, or an object for which pdf_is_stream
+	reports a stream (its contents are then loaded and converted).
+	Any other object yields an empty string.
+
+	Data starting with the bytes FE FF is read as big-endian 16 bit
+	units, data starting with FF FE as little-endian 16 bit units
+	(a trailing odd byte is ignored); everything else is mapped
+	through pdf_doc_encoding byte by byte.
+
+	Returns a zero terminated buffer obtained from fz_malloc; the
+	caller owns it (pdf_to_utf8_name releases it with fz_free).
+	Errors raised while loading or allocating are rethrown after the
+	stream buffer has been dropped.
+*/
+char *
+pdf_to_utf8(pdf_document *xref, pdf_obj *src)
+{
+	fz_context *ctx = xref->ctx;
+	fz_buffer *strmbuf = NULL;
+	unsigned char *srcptr = NULL;
+	char *dstptr = NULL, *dst = NULL;
+	int srclen = 0;
+	int dstlen = 0;
+	int unit = 1;		/* bytes per source code unit */
+	int big_endian = 0;
+	int first = 0;		/* offset of the first code unit (skips a BOM) */
+	int i;
+
+	fz_var(strmbuf);
+	fz_try(ctx)
+	{
+		if (pdf_is_string(src))
+		{
+			srcptr = (unsigned char *) pdf_to_str_buf(src);
+			srclen = pdf_to_str_len(src);
+		}
+		else if (pdf_is_stream(xref, pdf_to_num(src), pdf_to_gen(src)))
+		{
+			strmbuf = pdf_load_stream(xref, pdf_to_num(src), pdf_to_gen(src));
+			srclen = fz_buffer_storage(ctx, strmbuf, &srcptr);
+		}
+
+		/* Pick the decoding from the byte order mark, if there is one. */
+		if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
+		{
+			unit = 2;
+			big_endian = 1;
+			first = 2;
+		}
+		else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
+		{
+			unit = 2;
+			first = 2;
+		}
+
+		/* First pass: measure the utf-8 output. */
+		for (i = first; i + unit <= srclen; i += unit)
+			dstlen += fz_runelen(pdf_text_rune_at(srcptr, i, unit, big_endian));
+
+		dstptr = dst = fz_malloc(ctx, dstlen + 1);
+
+		/* Second pass: encode. */
+		for (i = first; i + unit <= srclen; i += unit)
+			dstptr += fz_runetochar(dstptr, pdf_text_rune_at(srcptr, i, unit, big_endian));
+
+		*dstptr = '\0';
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, strmbuf);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	return dst;
+}
+
+/*
+	Convert Unicode/PdfDocEncoding string into ucs-2.
+
+	A leading FE FF selects big-endian 16 bit input, a leading FF FE
+	little-endian 16 bit input; the mark itself is not copied and a
+	trailing odd byte is ignored. Any other input is mapped through
+	pdf_doc_encoding one byte at a time.
+
+	Returns a zero terminated array allocated with fz_malloc_array
+	(presumably to be released by the caller with fz_free).
+*/
+unsigned short *
+pdf_to_ucs2(pdf_document *xref, pdf_obj *src)
+{
+	fz_context *ctx = xref->ctx;
+	unsigned char *in = (unsigned char *) pdf_to_str_buf(src);
+	int inlen = pdf_to_str_len(src);
+	unsigned short *out;
+	int used = 0;
+	int k;
+
+	if (inlen >= 2 && in[0] == 254 && in[1] == 255)
+	{
+		/* big-endian, byte order mark skipped */
+		out = fz_malloc_array(ctx, (inlen - 2) / 2 + 1, sizeof(short));
+		for (k = 2; k + 1 < inlen; k += 2)
+			out[used++] = in[k] << 8 | in[k+1];
+	}
+	else if (inlen >= 2 && in[0] == 255 && in[1] == 254)
+	{
+		/* little-endian, byte order mark skipped */
+		out = fz_malloc_array(ctx, (inlen - 2) / 2 + 1, sizeof(short));
+		for (k = 2; k + 1 < inlen; k += 2)
+			out[used++] = in[k] | in[k+1] << 8;
+	}
+	else
+	{
+		out = fz_malloc_array(ctx, inlen + 1, sizeof(short));
+		for (k = 0; k < inlen; k++)
+			out[used++] = pdf_doc_encoding[in[k]];
+	}
+
+	out[used] = '\0';
+	return out;
+}
+
+/*
+	Convert a string to UCS-2 without needing an fz_context.
+
+	The result is written to buffer and zero terminated. The buffer
+	must be at least (pdf_to_str_len(src) + 1) * 2 bytes in size.
+
+	The input is decoded the same way as in pdf_to_ucs2: FE FF marks
+	big-endian and FF FE little-endian 16 bit data, anything else is
+	mapped through pdf_doc_encoding.
+*/
+void
+pdf_to_ucs2_buf(unsigned short *buffer, pdf_obj *src)
+{
+	unsigned char *in = (unsigned char *) pdf_to_str_buf(src);
+	int inlen = pdf_to_str_len(src);
+	int used = 0;
+	int k;
+
+	if (inlen >= 2 && in[0] == 254 && in[1] == 255)
+	{
+		for (k = 2; k + 1 < inlen; k += 2)
+			buffer[used++] = in[k] << 8 | in[k+1];
+	}
+	else if (inlen >= 2 && in[0] == 255 && in[1] == 254)
+	{
+		for (k = 2; k + 1 < inlen; k += 2)
+			buffer[used++] = in[k] | in[k+1] << 8;
+	}
+	else
+	{
+		for (k = 0; k < inlen; k++)
+			buffer[used++] = pdf_doc_encoding[in[k]];
+	}
+
+	buffer[used] = '\0';
+}
+
+/*
+	Convert UCS-2 string into PdfDocEncoding for authentication.
+
+	src is a zero terminated array of 16 bit code units.
+
+	Returns a zero terminated string allocated with fz_malloc, or
+	NULL (with nothing left allocated) if any character has no
+	usable code in pdf_doc_encoding.
+*/
+char *
+pdf_from_ucs2(pdf_document *xref, unsigned short *src)
+{
+	fz_context *ctx = xref->ctx;
+	int i, j, len;
+	char *docstr;
+
+	len = 0;
+	while (src[len])
+		len++;
+
+	docstr = fz_malloc(ctx, len + 1);
+
+	for (i = 0; i < len; i++)
+	{
+		/* shortcut: check if the character has the same code point in both encodings */
+		if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i])
+		{
+			docstr[i] = src[i];
+			continue;
+		}
+
+		/* search through pdf_doc_encoding for the character's code point */
+		for (j = 0; j < 256; j++)
+			if (pdf_doc_encoding[j] == src[i])
+				break;
+
+		/*
+			Fail if the character can't be encoded: either the search
+			ran off the end of the table, or the only match is code 0,
+			which would terminate the result early. Test the index
+			itself rather than relying on 256 narrowing to 0 in a char.
+		*/
+		if (j == 0 || j == 256)
+		{
+			fz_free(ctx, docstr);
+			return NULL;
+		}
+
+		docstr[i] = j;
+	}
+	docstr[len] = '\0';
+
+	return docstr;
+}
+
+/*
+	Create a name object from the utf-8 conversion of src.
+
+	src is converted with pdf_to_utf8 and the resulting text is
+	handed to pdf_new_name. The temporary utf-8 buffer is always
+	released, including when creating the name throws; in that case
+	the error is rethrown to the caller.
+
+	Returns the new name object.
+*/
+pdf_obj *
+pdf_to_utf8_name(pdf_document *xref, pdf_obj *src)
+{
+	fz_context *ctx = xref->ctx;
+	char *buf = pdf_to_utf8(xref, src);
+	pdf_obj *dst = NULL;
+
+	fz_try(ctx)
+	{
+		dst = pdf_new_name(ctx, buf);
+	}
+	fz_always(ctx)
+	{
+		/* Without this the buffer would leak if pdf_new_name throws. */
+		fz_free(ctx, buf);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+
+	return dst;
+}
+
+/*
+	Parse the remainder of an array, up to and including its closing
+	token, from file. The caller has already consumed the opening
+	token.
+
+	Integers are held back (at most two at a time) because "a b R"
+	forms an indirect reference: only once the following token is
+	known can they be emitted as plain integers or folded into a
+	reference.
+
+	Returns the new array. On error the partial array is dropped and
+	the error is rethrown with the message "cannot parse array".
+*/
+pdf_obj *
+pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
+{
+	fz_context *ctx = file->ctx;
+	pdf_obj *ary = NULL;
+	pdf_obj *obj = NULL;
+	pdf_token tok;
+	int first = 0, second = 0;	/* held back integers, oldest first */
+	int pending = 0;		/* how many integers are held back (0..2) */
+	int closed = 0;
+
+	fz_var(obj);
+
+	ary = pdf_new_array(ctx, 4);
+
+	fz_try(ctx)
+	{
+		while (!closed)
+		{
+			tok = pdf_lex(file, buf);
+
+			if (tok == PDF_TOK_INT)
+			{
+				if (pending == 2)
+				{
+					/* A third integer in a row: the oldest one is a plain value. */
+					obj = pdf_new_int(ctx, first);
+					pdf_array_push(ary, obj);
+					pdf_drop_obj(obj);
+					obj = NULL;
+					first = second;
+					pending = 1;
+				}
+				if (pending == 0)
+					first = buf->i;
+				else
+					second = buf->i;
+				pending++;
+				continue;
+			}
+
+			if (tok != PDF_TOK_R)
+			{
+				/* No reference follows, so flush the held back integers. */
+				if (pending > 0)
+				{
+					obj = pdf_new_int(ctx, first);
+					pdf_array_push(ary, obj);
+					pdf_drop_obj(obj);
+					obj = NULL;
+				}
+				if (pending > 1)
+				{
+					obj = pdf_new_int(ctx, second);
+					pdf_array_push(ary, obj);
+					pdf_drop_obj(obj);
+					obj = NULL;
+				}
+				pending = 0;
+			}
+
+			/* Build the element for this token; it is appended below. */
+			switch (tok)
+			{
+			case PDF_TOK_CLOSE_ARRAY:
+				closed = 1;
+				continue;
+
+			case PDF_TOK_R:
+				if (pending != 2)
+					fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse indirect reference in array");
+				obj = pdf_new_indirect(ctx, first, second, xref);
+				pending = 0;
+				break;
+
+			case PDF_TOK_OPEN_ARRAY:
+				obj = pdf_parse_array(xref, file, buf);
+				break;
+
+			case PDF_TOK_OPEN_DICT:
+				obj = pdf_parse_dict(xref, file, buf);
+				break;
+
+			case PDF_TOK_NAME:
+				obj = pdf_new_name(ctx, buf->scratch);
+				break;
+
+			case PDF_TOK_REAL:
+				obj = pdf_new_real(ctx, buf->f);
+				break;
+
+			case PDF_TOK_STRING:
+				obj = pdf_new_string(ctx, buf->scratch, buf->len);
+				break;
+
+			case PDF_TOK_TRUE:
+				obj = pdf_new_bool(ctx, 1);
+				break;
+
+			case PDF_TOK_FALSE:
+				obj = pdf_new_bool(ctx, 0);
+				break;
+
+			case PDF_TOK_NULL:
+				obj = pdf_new_null(ctx);
+				break;
+
+			default:
+				fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse token in array");
+			}
+
+			/* The array keeps its own reference; give ours up. */
+			pdf_array_push(ary, obj);
+			pdf_drop_obj(obj);
+			obj = NULL;
+		}
+	}
+	fz_catch(ctx)
+	{
+		pdf_drop_obj(obj);
+		pdf_drop_obj(ary);
+		fz_rethrow_message(ctx, "cannot parse array");
+	}
+
+	return ary;
+}
+
+/*
+	Parse the remainder of a dictionary from file. The caller has
+	already consumed the opening token.
+
+	Parsing stops at the closing dictionary token, or at the keyword
+	"ID" (for BI .. ID .. EI in content streams).
+
+	An integer value needs look-ahead: it is either a plain integer
+	or the start of an "a b R" indirect reference. When the token
+	after the integer turns out to be the next key or the end of the
+	dictionary, that token is kept and processed by the next round
+	of the loop instead of reading a new one.
+
+	Returns the new dictionary. On error the partial dictionary is
+	dropped and the error is rethrown with the message
+	"cannot parse dict".
+*/
+pdf_obj *
+pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
+{
+	fz_context *ctx = file->ctx;
+	pdf_obj *dict;
+	pdf_obj *key = NULL;
+	pdf_obj *val = NULL;
+	pdf_token tok;
+	int num, gen;
+	int reuse_tok = 0;	/* set when tok already holds the next token */
+
+	dict = pdf_new_dict(ctx, 8);
+
+	fz_var(key);
+	fz_var(val);
+
+	fz_try(ctx)
+	{
+		for (;;)
+		{
+			if (!reuse_tok)
+				tok = pdf_lex(file, buf);
+			reuse_tok = 0;
+
+			if (tok == PDF_TOK_CLOSE_DICT)
+				break;
+
+			/* for BI .. ID .. EI in content streams */
+			if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
+				break;
+
+			if (tok != PDF_TOK_NAME)
+				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict");
+
+			key = pdf_new_name(ctx, buf->scratch);
+
+			tok = pdf_lex(file, buf);
+
+			switch (tok)
+			{
+			case PDF_TOK_OPEN_ARRAY:
+				val = pdf_parse_array(xref, file, buf);
+				break;
+
+			case PDF_TOK_OPEN_DICT:
+				val = pdf_parse_dict(xref, file, buf);
+				break;
+
+			case PDF_TOK_NAME:
+				val = pdf_new_name(ctx, buf->scratch);
+				break;
+
+			case PDF_TOK_REAL:
+				val = pdf_new_real(ctx, buf->f);
+				break;
+
+			case PDF_TOK_STRING:
+				val = pdf_new_string(ctx, buf->scratch, buf->len);
+				break;
+
+			case PDF_TOK_TRUE:
+				val = pdf_new_bool(ctx, 1);
+				break;
+
+			case PDF_TOK_FALSE:
+				val = pdf_new_bool(ctx, 0);
+				break;
+
+			case PDF_TOK_NULL:
+				val = pdf_new_null(ctx);
+				break;
+
+			case PDF_TOK_INT:
+				num = buf->i;
+				tok = pdf_lex(file, buf);
+				if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
+					(tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
+				{
+					/* Plain integer; the token just read belongs to the next round. */
+					val = pdf_new_int(ctx, num);
+					reuse_tok = 1;
+					break;
+				}
+				if (tok == PDF_TOK_INT)
+				{
+					gen = buf->i;
+					tok = pdf_lex(file, buf);
+					if (tok == PDF_TOK_R)
+					{
+						val = pdf_new_indirect(ctx, num, gen, xref);
+						break;
+					}
+				}
+				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict");
+
+			default:
+				fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict");
+			}
+
+			/* Store the pair, then give up our own references. */
+			pdf_dict_put(dict, key, val);
+			pdf_drop_obj(val);
+			val = NULL;
+			pdf_drop_obj(key);
+			key = NULL;
+		}
+	}
+	fz_catch(ctx)
+	{
+		pdf_drop_obj(dict);
+		pdf_drop_obj(key);
+		pdf_drop_obj(val);
+		fz_rethrow_message(ctx, "cannot parse dict");
+	}
+
+	return dict;
+}
+
+/*
+	Parse a single object from file, as used for the contents of
+	object streams (see the error message below).
+
+	Exactly one token is read to decide what to build; arrays and
+	dictionaries are then parsed recursively. Indirect references
+	are not recognised here: an integer token always yields an
+	integer object.
+
+	Returns the new object; throws "unknown token in object stream"
+	for any other token.
+*/
+pdf_obj *
+pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
+{
+	fz_context *ctx = file->ctx;
+	pdf_token tok = pdf_lex(file, buf);
+
+	if (tok == PDF_TOK_OPEN_ARRAY)
+		return pdf_parse_array(xref, file, buf);
+	if (tok == PDF_TOK_OPEN_DICT)
+		return pdf_parse_dict(xref, file, buf);
+	if (tok == PDF_TOK_NAME)
+		return pdf_new_name(ctx, buf->scratch);
+	if (tok == PDF_TOK_REAL)
+		return pdf_new_real(ctx, buf->f);
+	if (tok == PDF_TOK_STRING)
+		return pdf_new_string(ctx, buf->scratch, buf->len);
+	if (tok == PDF_TOK_TRUE)
+		return pdf_new_bool(ctx, 1);
+	if (tok == PDF_TOK_FALSE)
+		return pdf_new_bool(ctx, 0);
+	if (tok == PDF_TOK_NULL)
+		return pdf_new_null(ctx);
+	if (tok == PDF_TOK_INT)
+		return pdf_new_int(ctx, buf->i);
+
+	fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream");
+
+	return NULL; /* not reached; keeps compilers that want a return happy */
+}
+
+/*
+	Parse an indirect object definition: "num gen obj <object>",
+	followed by either 'stream' or 'endobj'.
+
+	onum, ogen and ostmofs are optional outputs (each may be NULL):
+	the object number, the generation number, and the file position
+	(from fz_tell) just after the stream begin marker and its line
+	ending. ostmofs receives 0 when the object is not followed by
+	'stream'.
+
+	A missing 'endobj'/'stream' keyword and a missing line feed
+	after "stream\r" only produce warnings.
+
+	Returns the parsed object. Throws on a malformed header or an
+	unexpected token inside the object.
+*/
+pdf_obj *
+pdf_parse_ind_obj(pdf_document *xref,
+	fz_stream *file, pdf_lexbuf *buf,
+	int *onum, int *ogen, int *ostmofs)
+{
+	pdf_obj *obj = NULL;
+	int num = 0, gen = 0, stm_ofs;
+	pdf_token tok;
+	int ref_num, ref_gen;
+	int have_trailer = 0;	/* set when tok already holds the token after the object */
+	fz_context *ctx = file->ctx;
+
+	fz_var(obj);
+
+	/* Header: object number, generation number, 'obj'. */
+	tok = pdf_lex(file, buf);
+	if (tok != PDF_TOK_INT)
+		fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number");
+	num = buf->i;
+
+	tok = pdf_lex(file, buf);
+	if (tok != PDF_TOK_INT)
+		fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num);
+	gen = buf->i;
+
+	tok = pdf_lex(file, buf);
+	if (tok != PDF_TOK_OBJ)
+		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen);
+
+	/* The object itself. */
+	tok = pdf_lex(file, buf);
+
+	switch (tok)
+	{
+	case PDF_TOK_OPEN_ARRAY:
+		obj = pdf_parse_array(xref, file, buf);
+		break;
+
+	case PDF_TOK_OPEN_DICT:
+		obj = pdf_parse_dict(xref, file, buf);
+		break;
+
+	case PDF_TOK_NAME:
+		obj = pdf_new_name(ctx, buf->scratch);
+		break;
+
+	case PDF_TOK_REAL:
+		obj = pdf_new_real(ctx, buf->f);
+		break;
+
+	case PDF_TOK_STRING:
+		obj = pdf_new_string(ctx, buf->scratch, buf->len);
+		break;
+
+	case PDF_TOK_TRUE:
+		obj = pdf_new_bool(ctx, 1);
+		break;
+
+	case PDF_TOK_FALSE:
+		obj = pdf_new_bool(ctx, 0);
+		break;
+
+	case PDF_TOK_NULL:
+		obj = pdf_new_null(ctx);
+		break;
+
+	case PDF_TOK_INT:
+		/* Either a plain integer or the start of "a b R". */
+		ref_num = buf->i;
+		tok = pdf_lex(file, buf);
+
+		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
+		{
+			obj = pdf_new_int(ctx, ref_num);
+			have_trailer = 1;
+			break;
+		}
+		if (tok == PDF_TOK_INT)
+		{
+			ref_gen = buf->i;
+			tok = pdf_lex(file, buf);
+			if (tok == PDF_TOK_R)
+			{
+				obj = pdf_new_indirect(ctx, ref_num, ref_gen, xref);
+				break;
+			}
+		}
+		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen);
+
+	case PDF_TOK_ENDOBJ:
+		/* Empty definition: the object is null. */
+		obj = pdf_new_null(ctx);
+		have_trailer = 1;
+		break;
+
+	default:
+		fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen);
+	}
+
+	/* Read the token following the object unless it was already seen. */
+	if (!have_trailer)
+	{
+		fz_try(ctx)
+		{
+			tok = pdf_lex(file, buf);
+		}
+		fz_catch(ctx)
+		{
+			pdf_drop_obj(obj);
+			fz_rethrow_message(ctx, "cannot parse indirect object (%d %d R)", num, gen);
+		}
+	}
+
+	stm_ofs = 0;
+	if (tok == PDF_TOK_STREAM)
+	{
+		int c;
+
+		/* Skip spaces after the keyword, then an optional CR LF pair. */
+		do
+			c = fz_read_byte(file);
+		while (c == ' ');
+
+		if (c == '\r')
+		{
+			if (fz_peek_byte(file) == '\n')
+				fz_read_byte(file);
+			else
+				fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
+		}
+		stm_ofs = fz_tell(file);
+	}
+	else if (tok != PDF_TOK_ENDOBJ)
+	{
+		fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
+	}
+
+	if (onum) *onum = num;
+	if (ogen) *ogen = gen;
+	if (ostmofs) *ostmofs = stm_ofs;
+	return obj;
+}