diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2013-06-19 15:29:44 +0200 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2013-06-20 16:45:35 +0200 |
commit | 0a927854a10e1e6b9770a81e2e1d9f3093631757 (patch) | |
tree | 3d65d820d9fdba2d0d394d99c36290c851b78ca0 /source/pdf/pdf-parse.c | |
parent | 1ae8f19179c5f0f8c6352b3c7855465325d5449a (diff) | |
download | mupdf-0a927854a10e1e6b9770a81e2e1d9f3093631757.tar.xz |
Rearrange source files.
Diffstat (limited to 'source/pdf/pdf-parse.c')
-rw-r--r-- | source/pdf/pdf-parse.c | 611 |
1 files changed, 611 insertions, 0 deletions
diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c new file mode 100644 index 00000000..18ab3113 --- /dev/null +++ b/source/pdf/pdf-parse.c @@ -0,0 +1,611 @@ +#include "mupdf/pdf.h" + +fz_rect * +pdf_to_rect(fz_context *ctx, pdf_obj *array, fz_rect *r) +{ + float a = pdf_to_real(pdf_array_get(array, 0)); + float b = pdf_to_real(pdf_array_get(array, 1)); + float c = pdf_to_real(pdf_array_get(array, 2)); + float d = pdf_to_real(pdf_array_get(array, 3)); + r->x0 = fz_min(a, c); + r->y0 = fz_min(b, d); + r->x1 = fz_max(a, c); + r->y1 = fz_max(b, d); + return r; +} + +fz_matrix * +pdf_to_matrix(fz_context *ctx, pdf_obj *array, fz_matrix *m) +{ + m->a = pdf_to_real(pdf_array_get(array, 0)); + m->b = pdf_to_real(pdf_array_get(array, 1)); + m->c = pdf_to_real(pdf_array_get(array, 2)); + m->d = pdf_to_real(pdf_array_get(array, 3)); + m->e = pdf_to_real(pdf_array_get(array, 4)); + m->f = pdf_to_real(pdf_array_get(array, 5)); + return m; +} + +/* Convert Unicode/PdfDocEncoding string into utf-8 */ +char * +pdf_to_utf8(pdf_document *xref, pdf_obj *src) +{ + fz_context *ctx = xref->ctx; + fz_buffer *strmbuf = NULL; + unsigned char *srcptr; + char *dstptr, *dst; + int srclen; + int dstlen = 0; + int ucs; + int i; + + fz_var(strmbuf); + fz_try(ctx) + { + if (pdf_is_string(src)) + { + srcptr = (unsigned char *) pdf_to_str_buf(src); + srclen = pdf_to_str_len(src); + } + else if (pdf_is_stream(xref, pdf_to_num(src), pdf_to_gen(src))) + { + strmbuf = pdf_load_stream(xref, pdf_to_num(src), pdf_to_gen(src)); + srclen = fz_buffer_storage(ctx, strmbuf, (unsigned char **)&srcptr); + } + else + { + srclen = 0; + } + + if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) + { + for (i = 2; i + 1 < srclen; i += 2) + { + ucs = srcptr[i] << 8 | srcptr[i+1]; + dstlen += fz_runelen(ucs); + } + + dstptr = dst = fz_malloc(ctx, dstlen + 1); + + for (i = 2; i + 1 < srclen; i += 2) + { + ucs = srcptr[i] << 8 | srcptr[i+1]; + dstptr += fz_runetochar(dstptr, ucs); + } + } + else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254) + { + for (i = 2; i + 1 < srclen; i += 2) + { + ucs = srcptr[i] | srcptr[i+1] << 8; + dstlen += fz_runelen(ucs); + } + + dstptr = dst = fz_malloc(ctx, dstlen + 1); + + for (i = 2; i + 1 < srclen; i += 2) + { + ucs = srcptr[i] | srcptr[i+1] << 8; + dstptr += fz_runetochar(dstptr, ucs); + } + } + else + { + for (i = 0; i < srclen; i++) + dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]); + + dstptr = dst = fz_malloc(ctx, dstlen + 1); + + for (i = 0; i < srclen; i++) + { + ucs = pdf_doc_encoding[srcptr[i]]; + dstptr += fz_runetochar(dstptr, ucs); + } + } + } + fz_always(ctx) + { + fz_drop_buffer(ctx, strmbuf); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } + + *dstptr = '\0'; + return dst; +} + +/* Convert Unicode/PdfDocEncoding string into ucs-2 */ +unsigned short * +pdf_to_ucs2(pdf_document *xref, pdf_obj *src) +{ + fz_context *ctx = xref->ctx; + unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(src); + unsigned short *dstptr, *dst; + int srclen = pdf_to_str_len(src); + int i; + + if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) + { + dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short)); + for (i = 2; i + 1 < srclen; i += 2) + *dstptr++ = srcptr[i] << 8 | srcptr[i+1]; + } + else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254) + { + dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short)); + for (i = 2; i + 1 < srclen; i += 2) + *dstptr++ = srcptr[i] | srcptr[i+1] << 8; + } + else + { + dstptr = dst = fz_malloc_array(ctx, srclen + 1, sizeof(short)); + for (i = 0; i < srclen; i++) + *dstptr++ = pdf_doc_encoding[srcptr[i]]; + } + + *dstptr = '\0'; + return dst; +} + +/* allow to convert to UCS-2 without the need for an fz_context */ +/* (buffer must be at least (fz_to_str_len(src) + 1) * 2 bytes in size) */ +void +pdf_to_ucs2_buf(unsigned short *buffer, pdf_obj *src) +{ + unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(src); + unsigned short *dstptr = buffer; + int srclen = pdf_to_str_len(src); + int i; + + if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) + { + for (i = 2; i + 1 < srclen; i += 2) + *dstptr++ = srcptr[i] << 8 | srcptr[i+1]; + } + else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254) + { + for (i = 2; i + 1 < srclen; i += 2) + *dstptr++ = srcptr[i] | srcptr[i+1] << 8; + } + else + { + for (i = 0; i < srclen; i++) + *dstptr++ = pdf_doc_encoding[srcptr[i]]; + } + + *dstptr = '\0'; +} + +/* Convert UCS-2 string into PdfDocEncoding for authentication */ +char * +pdf_from_ucs2(pdf_document *xref, unsigned short *src) +{ + fz_context *ctx = xref->ctx; + int i, j, len; + char *docstr; + + len = 0; + while (src[len]) + len++; + + docstr = fz_malloc(ctx, len + 1); + + for (i = 0; i < len; i++) + { + /* shortcut: check if the character has the same code point in both encodings */ + if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i]) { + docstr[i] = src[i]; + continue; + } + + /* search through pdf_docencoding for the character's code point */ + for (j = 0; j < 256; j++) + if (pdf_doc_encoding[j] == src[i]) + break; + docstr[i] = j; + + /* fail, if a character can't be encoded */ + if (!docstr[i]) + { + fz_free(ctx, docstr); + return NULL; + } + } + docstr[len] = '\0'; + + return docstr; +} + +pdf_obj * +pdf_to_utf8_name(pdf_document *xref, pdf_obj *src) +{ + char *buf = pdf_to_utf8(xref, src); + pdf_obj *dst = pdf_new_name(xref->ctx, buf); + fz_free(xref->ctx, buf); + return dst; +} + +pdf_obj * +pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf) +{ + pdf_obj *ary = NULL; + pdf_obj *obj = NULL; + int a = 0, b = 0, n = 0; + pdf_token tok; + fz_context *ctx = file->ctx; + pdf_obj *op; + + fz_var(obj); + + ary = pdf_new_array(ctx, 4); + + fz_try(ctx) + { + while (1) + { + tok = pdf_lex(file, buf); + + if (tok != PDF_TOK_INT && tok != PDF_TOK_R) + { + if (n > 0) + { + obj = pdf_new_int(ctx, a); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + } + if (n > 1) + { + obj = pdf_new_int(ctx, b); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + } + n = 0; + } + + if (tok == PDF_TOK_INT && n == 2) + { + obj = pdf_new_int(ctx, a); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + a = b; + n --; + } + + switch (tok) + { + case PDF_TOK_CLOSE_ARRAY: + op = ary; + goto end; + + case PDF_TOK_INT: + if (n == 0) + a = buf->i; + if (n == 1) + b = buf->i; + n ++; + break; + + case PDF_TOK_R: + if (n != 2) + fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse indirect reference in array"); + obj = pdf_new_indirect(ctx, a, b, xref); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + n = 0; + break; + + case PDF_TOK_OPEN_ARRAY: + obj = pdf_parse_array(xref, file, buf); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + break; + + case PDF_TOK_OPEN_DICT: + obj = pdf_parse_dict(xref, file, buf); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + break; + + case PDF_TOK_NAME: + obj = pdf_new_name(ctx, buf->scratch); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + break; + case PDF_TOK_REAL: + obj = pdf_new_real(ctx, buf->f); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + break; + case PDF_TOK_STRING: + obj = pdf_new_string(ctx, buf->scratch, buf->len); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + break; + case PDF_TOK_TRUE: + obj = pdf_new_bool(ctx, 1); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + break; + case PDF_TOK_FALSE: + obj = pdf_new_bool(ctx, 0); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + break; + case PDF_TOK_NULL: + obj = pdf_new_null(ctx); + pdf_array_push(ary, obj); + pdf_drop_obj(obj); + obj = NULL; + break; + + default: + fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse token in array"); + } + } +end: + {} + } + fz_catch(ctx) + { + pdf_drop_obj(obj); + pdf_drop_obj(ary); + fz_rethrow_message(ctx, "cannot parse array"); + } + return op; +} + +pdf_obj * +pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf) +{ + pdf_obj *dict; + pdf_obj *key = NULL; + pdf_obj *val = NULL; + pdf_token tok; + int a, b; + fz_context *ctx = file->ctx; + + dict = pdf_new_dict(ctx, 8); + + fz_var(key); + fz_var(val); + + fz_try(ctx) + { + while (1) + { + tok = pdf_lex(file, buf); + skip: + if (tok == PDF_TOK_CLOSE_DICT) + break; + + /* for BI .. ID .. EI in content streams */ + if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")) + break; + + if (tok != PDF_TOK_NAME) + fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict"); + + key = pdf_new_name(ctx, buf->scratch); + + tok = pdf_lex(file, buf); + + switch (tok) + { + case PDF_TOK_OPEN_ARRAY: + val = pdf_parse_array(xref, file, buf); + break; + + case PDF_TOK_OPEN_DICT: + val = pdf_parse_dict(xref, file, buf); + break; + + case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break; + case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break; + case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break; + case PDF_TOK_TRUE: val = pdf_new_bool(ctx, 1); break; + case PDF_TOK_FALSE: val = pdf_new_bool(ctx, 0); break; + case PDF_TOK_NULL: val = pdf_new_null(ctx); break; + + case PDF_TOK_INT: + /* 64-bit to allow for numbers > INT_MAX and overflow */ + a = buf->i; + tok = pdf_lex(file, buf); + if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME || + (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))) + { + val = pdf_new_int(ctx, a); + pdf_dict_put(dict, key, val); + pdf_drop_obj(val); + val = NULL; + pdf_drop_obj(key); + key = NULL; + goto skip; + } + if (tok == PDF_TOK_INT) + { + b = buf->i; + tok = pdf_lex(file, buf); + if (tok == PDF_TOK_R) + { + val = pdf_new_indirect(ctx, a, b, xref); + break; + } + } + fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict"); + + default: + fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict"); + } + + pdf_dict_put(dict, key, val); + pdf_drop_obj(val); + val = NULL; + pdf_drop_obj(key); + key = NULL; + } + } + fz_catch(ctx) + { + pdf_drop_obj(dict); + pdf_drop_obj(key); + pdf_drop_obj(val); + fz_rethrow_message(ctx, "cannot parse dict"); + } + return dict; +} + +pdf_obj * +pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf) +{ + pdf_token tok; + fz_context *ctx = file->ctx; + + tok = pdf_lex(file, buf); + + switch (tok) + { + case PDF_TOK_OPEN_ARRAY: + return pdf_parse_array(xref, file, buf); + case PDF_TOK_OPEN_DICT: + return pdf_parse_dict(xref, file, buf); + case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch); break; + case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f); break; + case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len); break; + case PDF_TOK_TRUE: return pdf_new_bool(ctx, 1); break; + case PDF_TOK_FALSE: return pdf_new_bool(ctx, 0); break; + case PDF_TOK_NULL: return pdf_new_null(ctx); break; + case PDF_TOK_INT: return pdf_new_int(ctx, buf->i); break; + default: fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream"); + } + return NULL; /* Stupid MSVC */ +} + +pdf_obj * +pdf_parse_ind_obj(pdf_document *xref, + fz_stream *file, pdf_lexbuf *buf, + int *onum, int *ogen, int *ostmofs) +{ + pdf_obj *obj = NULL; + int num = 0, gen = 0, stm_ofs; + pdf_token tok; + int a, b; + fz_context *ctx = file->ctx; + + fz_var(obj); + + tok = pdf_lex(file, buf); + if (tok != PDF_TOK_INT) + fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number"); + num = buf->i; + + tok = pdf_lex(file, buf); + if (tok != PDF_TOK_INT) + fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num); + gen = buf->i; + + tok = pdf_lex(file, buf); + if (tok != PDF_TOK_OBJ) + fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen); + + tok = pdf_lex(file, buf); + + switch (tok) + { + case PDF_TOK_OPEN_ARRAY: + obj = pdf_parse_array(xref, file, buf); + break; + + case PDF_TOK_OPEN_DICT: + obj = pdf_parse_dict(xref, file, buf); + break; + + case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break; + case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break; + case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break; + case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, 1); break; + case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, 0); break; + case PDF_TOK_NULL: obj = pdf_new_null(ctx); break; + + case PDF_TOK_INT: + a = buf->i; + tok = pdf_lex(file, buf); + + if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ) + { + obj = pdf_new_int(ctx, a); + goto skip; + } + if (tok == PDF_TOK_INT) + { + b = buf->i; + tok = pdf_lex(file, buf); + if (tok == PDF_TOK_R) + { + obj = pdf_new_indirect(ctx, a, b, xref); + break; + } + } + fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen); + + case PDF_TOK_ENDOBJ: + obj = pdf_new_null(ctx); + goto skip; + + default: + fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen); + } + + fz_try(ctx) + { + tok = pdf_lex(file, buf); + } + fz_catch(ctx) + { + pdf_drop_obj(obj); + fz_rethrow_message(ctx, "cannot parse indirect object (%d %d R)", num, gen); + } + +skip: + if (tok == PDF_TOK_STREAM) + { + int c = fz_read_byte(file); + while (c == ' ') + c = fz_read_byte(file); + if (c == '\r') + { + c = fz_peek_byte(file); + if (c != '\n') + fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen); + else + fz_read_byte(file); + } + stm_ofs = fz_tell(file); + } + else if (tok == PDF_TOK_ENDOBJ) + { + stm_ofs = 0; + } + else + { + fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen); + stm_ofs = 0; + } + + if (onum) *onum = num; + if (ogen) *ogen = gen; + if (ostmofs) *ostmofs = stm_ofs; + return obj; +} |