summary refs log tree commit diff
path: root/pdf/pdf_parse.c
diff options
context:
space:
mode:
Diffstat (limited to 'pdf/pdf_parse.c')
-rw-r--r-- pdf/pdf_parse.c 611
1 files changed, 0 insertions, 611 deletions
diff --git a/pdf/pdf_parse.c b/pdf/pdf_parse.c
deleted file mode 100644
index 18ab3113..00000000
--- a/pdf/pdf_parse.c
+++ /dev/null
@@ -1,611 +0,0 @@
-#include "mupdf/pdf.h"
-
-/*
-	Build a rectangle from the first four entries of a PDF array.
-
-	Entries 0/1 and 2/3 are read as two corner points. For each axis
-	x0/y0 receive the fz_min and x1/y1 the fz_max of the two values, so
-	the corners may be given in either order. ctx is not used.
-
-	Writes the result to *r and returns r.
-*/
-fz_rect *
-pdf_to_rect(fz_context *ctx, pdf_obj *array, fz_rect *r)
-{
-	float v[4];
-	int i;
-
-	for (i = 0; i < 4; i++)
-		v[i] = pdf_to_real(pdf_array_get(array, i));
-
-	r->x0 = fz_min(v[0], v[2]);
-	r->y0 = fz_min(v[1], v[3]);
-	r->x1 = fz_max(v[0], v[2]);
-	r->y1 = fz_max(v[1], v[3]);
-
-	return r;
-}
-
-/*
-	Fill a matrix from the first six entries of a PDF array.
-
-	Entries 0..5 are stored, in order, into the a, b, c, d, e and f
-	members of *m. ctx is not used.
-
-	Returns m.
-*/
-fz_matrix *
-pdf_to_matrix(fz_context *ctx, pdf_obj *array, fz_matrix *m)
-{
-	int next = 0;	/* index of the array entry to read next */
-
-	m->a = pdf_to_real(pdf_array_get(array, next++));
-	m->b = pdf_to_real(pdf_array_get(array, next++));
-	m->c = pdf_to_real(pdf_array_get(array, next++));
-	m->d = pdf_to_real(pdf_array_get(array, next++));
-	m->e = pdf_to_real(pdf_array_get(array, next++));
-	m->f = pdf_to_real(pdf_array_get(array, next++));
-
-	return m;
-}
-
-/*
-	Convert a Unicode/PdfDocEncoding string into a newly allocated,
-	zero-terminated UTF-8 string.
-
-	src may be a string object, or a stream whose contents are loaded
-	with pdf_load_stream; any other object is treated as empty input.
-
-	Input starting with the bytes 254 255 is decoded as big-endian
-	16-bit units, input starting with 255 254 as little-endian 16-bit
-	units (the two marker bytes are skipped, a trailing odd byte is
-	ignored). All other input is mapped byte by byte through
-	pdf_doc_encoding.
-
-	The result comes from fz_malloc; release it with fz_free. Errors
-	raised inside the conversion are rethrown after the loaded stream
-	buffer has been dropped.
-*/
-char *
-pdf_to_utf8(pdf_document *xref, pdf_obj *src)
-{
-	fz_context *ctx = xref->ctx;
-	fz_buffer *strmbuf = NULL;
-	unsigned char *in = NULL;
-	char *out, *cursor;
-	int inlen = 0;
-	int outlen = 0;
-	int first = 0;		/* index of the first byte to decode */
-	int width = 1;		/* bytes per input unit: 1 or 2 */
-	int hi = 0, lo = 0;	/* offsets of the high/low byte within a 16-bit unit */
-	int rune;
-	int pos;
-
-	fz_var(strmbuf);
-	fz_try(ctx)
-	{
-		if (pdf_is_string(src))
-		{
-			in = (unsigned char *) pdf_to_str_buf(src);
-			inlen = pdf_to_str_len(src);
-		}
-		else if (pdf_is_stream(xref, pdf_to_num(src), pdf_to_gen(src)))
-		{
-			strmbuf = pdf_load_stream(xref, pdf_to_num(src), pdf_to_gen(src));
-			inlen = fz_buffer_storage(ctx, strmbuf, (unsigned char **)&in);
-		}
-
-		/* Pick the decoding parameters from the leading byte order mark, if any. */
-		if (inlen >= 2 && in[0] == 254 && in[1] == 255)
-		{
-			first = 2;
-			width = 2;
-			hi = 0;
-			lo = 1;
-		}
-		else if (inlen >= 2 && in[0] == 255 && in[1] == 254)
-		{
-			first = 2;
-			width = 2;
-			hi = 1;
-			lo = 0;
-		}
-
-		/* First pass: measure the encoded length. */
-		for (pos = first; pos + width <= inlen; pos += width)
-		{
-			if (width == 2)
-				rune = in[pos + hi] << 8 | in[pos + lo];
-			else
-				rune = pdf_doc_encoding[in[pos]];
-			outlen += fz_runelen(rune);
-		}
-
-		out = cursor = fz_malloc(ctx, outlen + 1);
-
-		/* Second pass: emit the UTF-8 bytes. */
-		for (pos = first; pos + width <= inlen; pos += width)
-		{
-			if (width == 2)
-				rune = in[pos + hi] << 8 | in[pos + lo];
-			else
-				rune = pdf_doc_encoding[in[pos]];
-			cursor += fz_runetochar(cursor, rune);
-		}
-	}
-	fz_always(ctx)
-	{
-		fz_drop_buffer(ctx, strmbuf);
-	}
-	fz_catch(ctx)
-	{
-		fz_rethrow(ctx);
-	}
-
-	*cursor = '\0';
-	return out;
-}
-
-/*
-	Convert a Unicode/PdfDocEncoding string object into a newly
-	allocated, zero-terminated array of 16-bit units.
-
-	Input starting with the bytes 254 255 is read as big-endian units,
-	input starting with 255 254 as little-endian units (marker skipped,
-	trailing odd byte ignored). Anything else is mapped byte by byte
-	through pdf_doc_encoding.
-
-	The array is obtained from fz_malloc_array.
-*/
-unsigned short *
-pdf_to_ucs2(pdf_document *xref, pdf_obj *src)
-{
-	fz_context *ctx = xref->ctx;
-	unsigned char *in = (unsigned char *) pdf_to_str_buf(src);
-	int inlen = pdf_to_str_len(src);
-	unsigned short *out, *cursor;
-	int first = 0;		/* index of the first byte to decode */
-	int width = 1;		/* bytes per input unit: 1 or 2 */
-	int hi = 0, lo = 0;	/* offsets of the high/low byte within a 16-bit unit */
-	int pos;
-
-	if (inlen >= 2 && in[0] == 254 && in[1] == 255)
-	{
-		first = 2;
-		width = 2;
-		hi = 0;
-		lo = 1;
-	}
-	else if (inlen >= 2 && in[0] == 255 && in[1] == 254)
-	{
-		first = 2;
-		width = 2;
-		hi = 1;
-		lo = 0;
-	}
-
-	/* One slot per input unit plus one for the terminator. */
-	out = cursor = fz_malloc_array(ctx, (inlen - first) / width + 1, sizeof(short));
-
-	for (pos = first; pos + width <= inlen; pos += width)
-	{
-		if (width == 2)
-			*cursor++ = in[pos + hi] << 8 | in[pos + lo];
-		else
-			*cursor++ = pdf_doc_encoding[in[pos]];
-	}
-
-	*cursor = 0;
-	return out;
-}
-
-/*
-	Convert a Unicode/PdfDocEncoding string object to 16-bit units
-	without needing an fz_context: the output goes into a buffer
-	supplied by the caller.
-
-	buffer must have room for at least pdf_to_str_len(src) + 1
-	unsigned shorts; the output is zero-terminated.
-
-	The byte order mark handling is the same as in pdf_to_ucs2.
-*/
-void
-pdf_to_ucs2_buf(unsigned short *buffer, pdf_obj *src)
-{
-	unsigned char *in = (unsigned char *) pdf_to_str_buf(src);
-	unsigned short *cursor = buffer;
-	int inlen = pdf_to_str_len(src);
-	int big_endian = (inlen >= 2 && in[0] == 254 && in[1] == 255);
-	int little_endian = (inlen >= 2 && in[0] == 255 && in[1] == 254);
-	int pos;
-
-	if (big_endian || little_endian)
-	{
-		/* Skip the two marker bytes; a trailing odd byte is ignored. */
-		for (pos = 2; pos + 1 < inlen; pos += 2)
-		{
-			if (big_endian)
-				*cursor++ = in[pos] << 8 | in[pos + 1];
-			else
-				*cursor++ = in[pos] | in[pos + 1] << 8;
-		}
-	}
-	else
-	{
-		for (pos = 0; pos < inlen; pos++)
-			*cursor++ = pdf_doc_encoding[in[pos]];
-	}
-
-	*cursor = 0;
-}
-
-/*
-	Convert a zero-terminated UCS-2 string into PdfDocEncoding for
-	authentication.
-
-	Returns a newly allocated (fz_malloc), zero-terminated byte string,
-	or NULL if any character of src cannot be encoded. When NULL is
-	returned the partially built string has already been freed.
-*/
-char *
-pdf_from_ucs2(pdf_document *xref, unsigned short *src)
-{
-	fz_context *ctx = xref->ctx;
-	int i, j, len;
-	char *docstr;
-
-	len = 0;
-	while (src[len])
-		len++;
-
-	docstr = fz_malloc(ctx, len + 1);
-
-	for (i = 0; i < len; i++)
-	{
-		/* shortcut: the character has the same code point in both encodings
-		 * (src[i] is never zero here because i < len) */
-		if (src[i] < 256 && pdf_doc_encoding[src[i]] == src[i])
-		{
-			docstr[i] = (char) src[i];
-			continue;
-		}
-
-		/* search through pdf_doc_encoding for the character's code point */
-		for (j = 0; j < 256; j++)
-			if (pdf_doc_encoding[j] == src[i])
-				break;
-
-		/* Fail if the character can't be encoded. Test the index itself
-		 * rather than storing 256 into a char and relying on how the
-		 * out-of-range value is narrowed. Code 0 is rejected as well,
-		 * since it would terminate the result early. */
-		if (j == 256 || j == 0)
-		{
-			fz_free(ctx, docstr);
-			return NULL;
-		}
-
-		docstr[i] = (char) j;
-	}
-	docstr[len] = '\0';
-
-	return docstr;
-}
-
-/*
-	Create a name object whose text is the UTF-8 conversion of src
-	(see pdf_to_utf8 for the accepted inputs).
-
-	The temporary UTF-8 buffer is freed on every path. pdf_new_name is
-	called under fz_try elsewhere in this file, so it is treated here
-	as able to throw; without the fz_always the buffer would leak in
-	that case. Any error is rethrown to the caller.
-*/
-pdf_obj *
-pdf_to_utf8_name(pdf_document *xref, pdf_obj *src)
-{
-	fz_context *ctx = xref->ctx;
-	char *buf = pdf_to_utf8(xref, src);
-	pdf_obj *dst = NULL;
-
-	fz_var(dst);
-
-	fz_try(ctx)
-	{
-		dst = pdf_new_name(ctx, buf);
-	}
-	fz_always(ctx)
-	{
-		fz_free(ctx, buf);
-	}
-	fz_catch(ctx)
-	{
-		fz_rethrow(ctx);
-	}
-
-	return dst;
-}
-
-/*
-	Parse array elements from the lexer up to and including the closing
-	array token, and return the new array. Callers in this file invoke
-	it after they have already read the opening array token.
-
-	Integers cannot be added immediately because "int int R" forms an
-	indirect reference. Up to two integers are held back; they are
-	pushed as plain integers once a following token shows that they
-	are not part of a reference.
-
-	On error the partial array and any element in flight are dropped and
-	the error is rethrown with the message "cannot parse array".
-*/
-pdf_obj *
-pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
-{
-	fz_context *ctx = file->ctx;
-	pdf_obj *ary = NULL;
-	pdf_obj *obj = NULL;
-	pdf_token tok;
-	int first = 0, second = 0;	/* held-back integers, oldest first */
-	int pending = 0;		/* how many integers are held back (0..2) */
-
-	fz_var(obj);
-
-	ary = pdf_new_array(ctx, 4);
-
-	fz_try(ctx)
-	{
-		for (;;)
-		{
-			tok = pdf_lex(file, buf);
-
-			if (tok == PDF_TOK_INT)
-			{
-				if (pending == 2)
-				{
-					/* A third integer in a row: the oldest one is a plain integer. */
-					obj = pdf_new_int(ctx, first);
-					pdf_array_push(ary, obj);
-					pdf_drop_obj(obj);
-					obj = NULL;
-					first = second;
-					pending = 1;
-				}
-				if (pending == 0)
-					first = buf->i;
-				else
-					second = buf->i;
-				pending++;
-				continue;
-			}
-
-			if (tok != PDF_TOK_R)
-			{
-				/* Not a reference after all: flush the held-back integers. */
-				if (pending > 0)
-				{
-					obj = pdf_new_int(ctx, first);
-					pdf_array_push(ary, obj);
-					pdf_drop_obj(obj);
-					obj = NULL;
-				}
-				if (pending > 1)
-				{
-					obj = pdf_new_int(ctx, second);
-					pdf_array_push(ary, obj);
-					pdf_drop_obj(obj);
-					obj = NULL;
-				}
-				pending = 0;
-			}
-
-			if (tok == PDF_TOK_CLOSE_ARRAY)
-				break;
-
-			/* Every remaining token produces exactly one element. */
-			switch (tok)
-			{
-			case PDF_TOK_R:
-				if (pending != 2)
-					fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse indirect reference in array");
-				obj = pdf_new_indirect(ctx, first, second, xref);
-				pending = 0;
-				break;
-
-			case PDF_TOK_OPEN_ARRAY:
-				obj = pdf_parse_array(xref, file, buf);
-				break;
-
-			case PDF_TOK_OPEN_DICT:
-				obj = pdf_parse_dict(xref, file, buf);
-				break;
-
-			case PDF_TOK_NAME:
-				obj = pdf_new_name(ctx, buf->scratch);
-				break;
-
-			case PDF_TOK_REAL:
-				obj = pdf_new_real(ctx, buf->f);
-				break;
-
-			case PDF_TOK_STRING:
-				obj = pdf_new_string(ctx, buf->scratch, buf->len);
-				break;
-
-			case PDF_TOK_TRUE:
-				obj = pdf_new_bool(ctx, 1);
-				break;
-
-			case PDF_TOK_FALSE:
-				obj = pdf_new_bool(ctx, 0);
-				break;
-
-			case PDF_TOK_NULL:
-				obj = pdf_new_null(ctx);
-				break;
-
-			default:
-				fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse token in array");
-			}
-
-			pdf_array_push(ary, obj);
-			pdf_drop_obj(obj);
-			obj = NULL;
-		}
-	}
-	fz_catch(ctx)
-	{
-		pdf_drop_obj(obj);
-		pdf_drop_obj(ary);
-		fz_rethrow_message(ctx, "cannot parse array");
-	}
-
-	return ary;
-}
-
-/*
-	Parse key/value pairs from the lexer until the closing dictionary
-	token (or the keyword "ID", for BI .. ID .. EI in content streams)
-	and return the new dictionary. Callers in this file invoke it
-	after they have already read the opening dictionary token.
-
-	Keys must be names. An integer value needs look-ahead because
-	"int int R" forms an indirect reference: if the token after the
-	integer already ends the pair (closing token, next key, or "ID"),
-	that token is kept and reused for the next round instead of being
-	read again.
-
-	On error the partial dictionary and any key/value in flight are
-	dropped and the error is rethrown with the message
-	"cannot parse dict".
-*/
-pdf_obj *
-pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
-{
-	fz_context *ctx = file->ctx;
-	pdf_obj *dict;
-	pdf_obj *key = NULL;
-	pdf_obj *val = NULL;
-	pdf_token tok;
-	int num, gen;
-	int have_tok = 0;	/* non-zero when tok already holds the next token */
-
-	dict = pdf_new_dict(ctx, 8);
-
-	fz_var(key);
-	fz_var(val);
-
-	fz_try(ctx)
-	{
-		for (;;)
-		{
-			if (!have_tok)
-				tok = pdf_lex(file, buf);
-			have_tok = 0;
-
-			if (tok == PDF_TOK_CLOSE_DICT)
-				break;
-
-			/* for BI .. ID .. EI in content streams */
-			if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
-				break;
-
-			if (tok != PDF_TOK_NAME)
-				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict");
-
-			key = pdf_new_name(ctx, buf->scratch);
-
-			tok = pdf_lex(file, buf);
-
-			switch (tok)
-			{
-			case PDF_TOK_OPEN_ARRAY:
-				val = pdf_parse_array(xref, file, buf);
-				break;
-
-			case PDF_TOK_OPEN_DICT:
-				val = pdf_parse_dict(xref, file, buf);
-				break;
-
-			case PDF_TOK_NAME:
-				val = pdf_new_name(ctx, buf->scratch);
-				break;
-
-			case PDF_TOK_REAL:
-				val = pdf_new_real(ctx, buf->f);
-				break;
-
-			case PDF_TOK_STRING:
-				val = pdf_new_string(ctx, buf->scratch, buf->len);
-				break;
-
-			case PDF_TOK_TRUE:
-				val = pdf_new_bool(ctx, 1);
-				break;
-
-			case PDF_TOK_FALSE:
-				val = pdf_new_bool(ctx, 0);
-				break;
-
-			case PDF_TOK_NULL:
-				val = pdf_new_null(ctx);
-				break;
-
-			case PDF_TOK_INT:
-				num = buf->i;
-				tok = pdf_lex(file, buf);
-				if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
-					(tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
-				{
-					/* Plain integer; the token just read belongs to the next round. */
-					val = pdf_new_int(ctx, num);
-					have_tok = 1;
-					break;
-				}
-				if (tok == PDF_TOK_INT)
-				{
-					gen = buf->i;
-					tok = pdf_lex(file, buf);
-					if (tok == PDF_TOK_R)
-					{
-						val = pdf_new_indirect(ctx, num, gen, xref);
-						break;
-					}
-				}
-				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict");
-
-			default:
-				fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict");
-			}
-
-			pdf_dict_put(dict, key, val);
-			pdf_drop_obj(val);
-			val = NULL;
-			pdf_drop_obj(key);
-			key = NULL;
-		}
-	}
-	fz_catch(ctx)
-	{
-		pdf_drop_obj(dict);
-		pdf_drop_obj(key);
-		pdf_drop_obj(val);
-		fz_rethrow_message(ctx, "cannot parse dict");
-	}
-
-	return dict;
-}
-
-/*
-	Read one token and build the corresponding object: an array, a
-	dictionary, or a name, real, string, boolean, null or integer.
-
-	An integer is always returned as a plain integer here; this
-	function never builds an indirect reference itself. Any other
-	token throws "unknown token in object stream".
-*/
-pdf_obj *
-pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
-{
-	fz_context *ctx = file->ctx;
-	pdf_obj *obj = NULL;
-	pdf_token tok = pdf_lex(file, buf);
-
-	switch (tok)
-	{
-	case PDF_TOK_OPEN_ARRAY:
-		obj = pdf_parse_array(xref, file, buf);
-		break;
-	case PDF_TOK_OPEN_DICT:
-		obj = pdf_parse_dict(xref, file, buf);
-		break;
-	case PDF_TOK_NAME:
-		obj = pdf_new_name(ctx, buf->scratch);
-		break;
-	case PDF_TOK_REAL:
-		obj = pdf_new_real(ctx, buf->f);
-		break;
-	case PDF_TOK_STRING:
-		obj = pdf_new_string(ctx, buf->scratch, buf->len);
-		break;
-	case PDF_TOK_TRUE:
-		obj = pdf_new_bool(ctx, 1);
-		break;
-	case PDF_TOK_FALSE:
-		obj = pdf_new_bool(ctx, 0);
-		break;
-	case PDF_TOK_NULL:
-		obj = pdf_new_null(ctx);
-		break;
-	case PDF_TOK_INT:
-		obj = pdf_new_int(ctx, buf->i);
-		break;
-	default:
-		fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream");
-	}
-
-	/* Single exit; obj stays NULL only if fz_throw were to return. */
-	return obj;
-}
-
-/*
-	Parse an indirect object definition of the form
-	"num gen obj <value> endobj" or "num gen obj <value> stream".
-
-	The value may be an array, dictionary, name, real, string, boolean,
-	null, integer or indirect reference; an immediate "endobj" yields
-	a null object.
-
-	onum, ogen and ostmofs may each be NULL. When given they receive
-	the object number, the generation number, and the position
-	reported by fz_tell just after the "stream" keyword and its line
-	ending (0 if the object is not followed by "stream").
-
-	A missing "endobj"/"stream" keyword after the value only produces a
-	warning; the parsed object is still returned.
-*/
-pdf_obj *
-pdf_parse_ind_obj(pdf_document *xref,
-	fz_stream *file, pdf_lexbuf *buf,
-	int *onum, int *ogen, int *ostmofs)
-{
-	fz_context *ctx = file->ctx;
-	pdf_obj *obj = NULL;
-	pdf_token tok;
-	int num = 0, gen = 0;
-	int stm_ofs = 0;
-	int first, second;
-	int lex_next = 1;	/* zero when tok already holds the token after the value */
-
-	fz_var(obj);
-
-	/* Header: "num gen obj" */
-	if (pdf_lex(file, buf) != PDF_TOK_INT)
-		fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number");
-	num = buf->i;
-
-	if (pdf_lex(file, buf) != PDF_TOK_INT)
-		fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num);
-	gen = buf->i;
-
-	if (pdf_lex(file, buf) != PDF_TOK_OBJ)
-		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen);
-
-	/* Value */
-	tok = pdf_lex(file, buf);
-
-	switch (tok)
-	{
-	case PDF_TOK_OPEN_ARRAY:
-		obj = pdf_parse_array(xref, file, buf);
-		break;
-
-	case PDF_TOK_OPEN_DICT:
-		obj = pdf_parse_dict(xref, file, buf);
-		break;
-
-	case PDF_TOK_NAME:
-		obj = pdf_new_name(ctx, buf->scratch);
-		break;
-
-	case PDF_TOK_REAL:
-		obj = pdf_new_real(ctx, buf->f);
-		break;
-
-	case PDF_TOK_STRING:
-		obj = pdf_new_string(ctx, buf->scratch, buf->len);
-		break;
-
-	case PDF_TOK_TRUE:
-		obj = pdf_new_bool(ctx, 1);
-		break;
-
-	case PDF_TOK_FALSE:
-		obj = pdf_new_bool(ctx, 0);
-		break;
-
-	case PDF_TOK_NULL:
-		obj = pdf_new_null(ctx);
-		break;
-
-	case PDF_TOK_INT:
-		first = buf->i;
-		tok = pdf_lex(file, buf);
-
-		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
-		{
-			/* Plain integer; the trailing keyword has already been read. */
-			obj = pdf_new_int(ctx, first);
-			lex_next = 0;
-			break;
-		}
-		if (tok == PDF_TOK_INT)
-		{
-			second = buf->i;
-			tok = pdf_lex(file, buf);
-			if (tok == PDF_TOK_R)
-			{
-				obj = pdf_new_indirect(ctx, first, second, xref);
-				break;
-			}
-		}
-		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen);
-
-	case PDF_TOK_ENDOBJ:
-		/* Empty definition: tok is already the trailing keyword. */
-		obj = pdf_new_null(ctx);
-		lex_next = 0;
-		break;
-
-	default:
-		fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen);
-	}
-
-	/* Trailing keyword, unless one of the cases above already read it. */
-	if (lex_next)
-	{
-		fz_try(ctx)
-		{
-			tok = pdf_lex(file, buf);
-		}
-		fz_catch(ctx)
-		{
-			pdf_drop_obj(obj);
-			fz_rethrow_message(ctx, "cannot parse indirect object (%d %d R)", num, gen);
-		}
-	}
-
-	if (tok == PDF_TOK_STREAM)
-	{
-		int c;
-
-		/* Skip spaces after the keyword, then an optional CR LF pair. */
-		do
-			c = fz_read_byte(file);
-		while (c == ' ');
-
-		if (c == '\r')
-		{
-			if (fz_peek_byte(file) == '\n')
-				fz_read_byte(file);
-			else
-				fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
-		}
-		stm_ofs = fz_tell(file);
-	}
-	else if (tok != PDF_TOK_ENDOBJ)
-	{
-		fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
-	}
-
-	if (onum)
-		*onum = num;
-	if (ogen)
-		*ogen = gen;
-	if (ostmofs)
-		*ostmofs = stm_ofs;
-
-	return obj;
-}