#include "fitz.h"
#include "mupdf.h"

/*
 * Build a rectangle from the first four numbers of an array object.
 *
 * The four values are read as two corner points (a, b) and (c, d).
 * MIN/MAX are applied per axis, so the corners may be supplied in
 * either order.
 */
fz_rect
pdf_to_rect(fz_context *ctx, fz_obj *array)
{
	fz_rect r;
	float a = fz_to_real(ctx, fz_array_get(ctx, array, 0));
	float b = fz_to_real(ctx, fz_array_get(ctx, array, 1));
	float c = fz_to_real(ctx, fz_array_get(ctx, array, 2));
	float d = fz_to_real(ctx, fz_array_get(ctx, array, 3));
	r.x0 = MIN(a, c);
	r.y0 = MIN(b, d);
	r.x1 = MAX(a, c);
	r.y1 = MAX(b, d);
	return r;
}

/*
 * Build a matrix from the first six numbers of an array object.
 * Elements 0..5 are stored in the fields a, b, c, d, e, f in that order.
 */
fz_matrix
pdf_to_matrix(fz_context *ctx, fz_obj *array)
{
	fz_matrix m;
	m.a = fz_to_real(ctx, fz_array_get(ctx, array, 0));
	m.b = fz_to_real(ctx, fz_array_get(ctx, array, 1));
	m.c = fz_to_real(ctx, fz_array_get(ctx, array, 2));
	m.d = fz_to_real(ctx, fz_array_get(ctx, array, 3));
	m.e = fz_to_real(ctx, fz_array_get(ctx, array, 4));
	m.f = fz_to_real(ctx, fz_array_get(ctx, array, 5));
	return m;
}

/*
 * Convert a Unicode/PdfDocEncoding string object into UTF-8.
 *
 * Three input forms are recognised:
 *   - leading bytes FE FF: 16-bit code units, high byte first
 *   - leading bytes FF FE: 16-bit code units, low byte first
 *   - anything else: every byte is mapped through pdf_doc_encoding
 *
 * Each form is walked twice: once to measure the output with runelen,
 * once to encode it with runetochar.
 *
 * Returns a NUL-terminated buffer obtained from fz_malloc; the caller
 * releases it with fz_free.
 */
char *
pdf_to_utf8(fz_context *ctx, fz_obj *src)
{
	unsigned char *srcptr = (unsigned char *) fz_to_str_buf(ctx, src);
	char *dstptr, *dst;
	int srclen = fz_to_str_len(ctx, src);
	int dstlen = 0;
	int ucs;
	int i;

	if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
	{
		/*
		 * The loop bound is i + 1 < srclen because each step reads two
		 * bytes: a dangling odd byte at the end is ignored instead of
		 * reading srcptr[srclen], which lies past the string data.
		 */
		for (i = 2; i + 1 < srclen; i += 2)
		{
			ucs = srcptr[i] << 8 | srcptr[i+1];
			dstlen += runelen(ucs);
		}

		dstptr = dst = fz_malloc(ctx, dstlen + 1);

		for (i = 2; i + 1 < srclen; i += 2)
		{
			ucs = srcptr[i] << 8 | srcptr[i+1];
			dstptr += runetochar(dstptr, &ucs);
		}
	}
	else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
	{
		for (i = 2; i + 1 < srclen; i += 2)
		{
			ucs = srcptr[i] | srcptr[i+1] << 8;
			dstlen += runelen(ucs);
		}

		dstptr = dst = fz_malloc(ctx, dstlen + 1);

		for (i = 2; i + 1 < srclen; i += 2)
		{
			ucs = srcptr[i] | srcptr[i+1] << 8;
			dstptr += runetochar(dstptr, &ucs);
		}
	}
	else
	{
		for (i = 0; i < srclen; i++)
			dstlen += runelen(pdf_doc_encoding[srcptr[i]]);

		dstptr = dst = fz_malloc(ctx, dstlen + 1);

		for (i = 0; i < srclen; i++)
		{
			ucs = pdf_doc_encoding[srcptr[i]];
			dstptr += runetochar(dstptr, &ucs);
		}
	}

	*dstptr = '\0';
	return dst;
}

/*
 * Convert a Unicode/PdfDocEncoding string object into UCS-2.
 *
 * The same three input forms as pdf_to_utf8 are recognised (FE FF
 * prefix, FF FE prefix, or plain bytes mapped through pdf_doc_encoding).
 *
 * Returns a zero-terminated array of 16-bit units obtained from
 * fz_calloc.
 */
unsigned short *
pdf_to_ucs2(fz_context *ctx, fz_obj *src)
{
	unsigned char *srcptr = (unsigned char *) fz_to_str_buf(ctx, src);
	unsigned short *dstptr, *dst;
	int srclen = fz_to_str_len(ctx, src);
	int i;

	if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
	{
		/* one slot per complete byte pair after the marker, plus terminator */
		dstptr = dst = fz_calloc(ctx, (srclen - 2) / 2 + 1, sizeof(short));
		for (i = 2; i + 1 < srclen; i += 2)
			*dstptr++ = srcptr[i] << 8 | srcptr[i+1];
	}
	else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
	{
		dstptr = dst = fz_calloc(ctx, (srclen - 2) / 2 + 1, sizeof(short));
		for (i = 2; i + 1 < srclen; i += 2)
			*dstptr++ = srcptr[i] | srcptr[i+1] << 8;
	}
	else
	{
		dstptr = dst = fz_calloc(ctx, srclen + 1, sizeof(short));
		for (i = 0; i < srclen; i++)
			*dstptr++ = pdf_doc_encoding[srcptr[i]];
	}

	*dstptr = '\0';
	return dst;
}

/*
 * Convert a zero-terminated UCS-2 string into PdfDocEncoding
 * (used for authentication).
 *
 * Returns a NUL-terminated buffer obtained from fz_malloc, or NULL
 * (after freeing the partial result) as soon as one character has no
 * usable PdfDocEncoding byte.
 */
char *
pdf_from_ucs2(fz_context *ctx, unsigned short *src)
{
	int i, j, len;
	char *docstr;

	len = 0;
	while (src[len])
		len++;

	docstr = fz_malloc(ctx, len + 1);

	for (i = 0; i < len; i++)
	{
		/* shortcut: check if the character has the same code point in both encodings */
		if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i])
		{
			docstr[i] = src[i];
			continue;
		}

		/* search through pdf_doc_encoding for the character's code point */
		for (j = 0; j < 256; j++)
			if (pdf_doc_encoding[j] == src[i])
				break;

		/*
		 * fail, if a character can't be encoded: j == 256 means the
		 * table has no entry for it, and j == 0 would store a NUL byte
		 * inside the result. The test is on j itself so that it does
		 * not depend on how 256 narrows when stored into a char.
		 */
		if (j == 0 || j == 256)
		{
			fz_free(ctx, docstr);
			return NULL;
		}
		docstr[i] = j;
	}
	docstr[len] = '\0';

	return docstr;
}

/*
 * Convert a string object to UTF-8 with pdf_to_utf8 and wrap the result
 * in a new name object. The temporary UTF-8 buffer is freed before
 * returning.
 */
fz_obj *
pdf_to_utf8_name(fz_context *ctx, fz_obj *src)
{
	char *buf = pdf_to_utf8(ctx, src);
	fz_obj *dst = fz_new_name(ctx, buf);
	fz_free(ctx, buf);
	return dst;
}

/*
 * Parse the remainder of an array; the caller has already consumed the
 * opening token.
 *
 * op    receives the new array on success
 * xref  passed to fz_new_indirect for "a b R" references
 * file  token source; its ctx is used for all allocations
 * buf   scratch buffer of cap bytes that pdf_lex fills with token text
 *
 * Integers cannot be pushed as soon as they are lexed, because
 * "int int R" forms an indirect reference. Up to two pending integers
 * are held in a and b, with n counting how many are pending.
 *
 * Returns fz_okay, or an error after dropping the partial array.
 */
fz_error
pdf_parse_array(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *ary = NULL;
	fz_obj *obj = NULL;
	int a = 0, b = 0, n = 0;
	int tok;
	int len;
	fz_context *ctx = file->ctx;

	ary = fz_new_array(ctx, 4);

	while (1)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_drop_obj(ctx, ary);
			return fz_error_note(error, "cannot parse array");
		}

		/* anything other than an int or R ends a possible reference: flush pending ints */
		if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
		{
			if (n > 0)
			{
				obj = fz_new_int(ctx, a);
				fz_array_push(ctx, ary, obj);
				fz_drop_obj(ctx, obj);
			}
			if (n > 1)
			{
				obj = fz_new_int(ctx, b);
				fz_array_push(ctx, ary, obj);
				fz_drop_obj(ctx, obj);
			}
			n = 0;
		}

		/* a third int in a row: the oldest one is a plain int, shift the window */
		if (tok == PDF_TOK_INT && n == 2)
		{
			obj = fz_new_int(ctx, a);
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			a = b;
			n --;
		}

		switch (tok)
		{
		case PDF_TOK_CLOSE_ARRAY:
			*op = ary;
			return fz_okay;

		case PDF_TOK_INT:
			if (n == 0)
				a = atoi(buf);
			if (n == 1)
				b = atoi(buf);
			n ++;
			break;

		case PDF_TOK_R:
			if (n != 2)
			{
				fz_drop_obj(ctx, ary);
				return fz_error_make("cannot parse indirect reference in array");
			}
			obj = fz_new_indirect(ctx, a, b, xref);
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			n = 0;
			break;

		case PDF_TOK_OPEN_ARRAY:
			error = pdf_parse_array(&obj, xref, file, buf, cap);
			if (error)
			{
				fz_drop_obj(ctx, ary);
				return fz_error_note(error, "cannot parse array");
			}
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			break;

		case PDF_TOK_OPEN_DICT:
			error = pdf_parse_dict(&obj, xref, file, buf, cap);
			if (error)
			{
				fz_drop_obj(ctx, ary);
				return fz_error_note(error, "cannot parse array");
			}
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			break;

		case PDF_TOK_NAME:
			obj = fz_new_name(ctx, buf);
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			break;

		case PDF_TOK_REAL:
			obj = fz_new_real(ctx, fz_atof(buf));
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			break;

		case PDF_TOK_STRING:
			obj = fz_new_string(ctx, buf, len);
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			break;

		case PDF_TOK_TRUE:
			obj = fz_new_bool(ctx, 1);
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			break;

		case PDF_TOK_FALSE:
			obj = fz_new_bool(ctx, 0);
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			break;

		case PDF_TOK_NULL:
			obj = fz_new_null(ctx);
			fz_array_push(ctx, ary, obj);
			fz_drop_obj(ctx, obj);
			break;

		default:
			fz_drop_obj(ctx, ary);
			return fz_error_make("cannot parse token in array");
		}
	}
}

/*
 * Parse the remainder of a dictionary; the caller has already consumed
 * the opening token.
 *
 * op    receives the new dictionary on success
 * xref  passed to fz_new_indirect for "a b R" references
 * file  token source; its ctx is used for all allocations
 * buf   scratch buffer of cap bytes that pdf_lex fills with token text
 *
 * Parsing stops successfully at the closing token, or at the keyword
 * "ID" (for BI .. ID .. EI in content streams).
 *
 * Returns fz_okay, or an error after dropping the partial dictionary
 * and any pending key.
 */
fz_error
pdf_parse_dict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error = fz_okay;
	fz_obj *dict = NULL;
	fz_obj *key = NULL;
	fz_obj *val = NULL;
	int tok;
	int len;
	int a, b;
	fz_context *ctx = file->ctx;

	dict = fz_new_dict(ctx, 8);

	while (1)
	{
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_drop_obj(ctx, dict);
			return fz_error_note(error, "cannot parse dict");
		}

skip:
		/* re-entered from the integer case below, with tok/buf already holding the next token */
		if (tok == PDF_TOK_CLOSE_DICT)
		{
			*op = dict;
			return fz_okay;
		}

		/* for BI .. ID .. EI in content streams */
		if (tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID"))
		{
			*op = dict;
			return fz_okay;
		}

		if (tok != PDF_TOK_NAME)
		{
			fz_drop_obj(ctx, dict);
			return fz_error_make("invalid key in dict");
		}

		key = fz_new_name(ctx, buf);

		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
		{
			fz_drop_obj(ctx, key);
			fz_drop_obj(ctx, dict);
			return fz_error_note(error, "cannot parse dict");
		}

		switch (tok)
		{
		case PDF_TOK_OPEN_ARRAY:
			error = pdf_parse_array(&val, xref, file, buf, cap);
			if (error)
			{
				fz_drop_obj(ctx, key);
				fz_drop_obj(ctx, dict);
				return fz_error_note(error, "cannot parse dict");
			}
			break;

		case PDF_TOK_OPEN_DICT:
			error = pdf_parse_dict(&val, xref, file, buf, cap);
			if (error)
			{
				fz_drop_obj(ctx, key);
				fz_drop_obj(ctx, dict);
				return fz_error_note(error, "cannot parse dict");
			}
			break;

		case PDF_TOK_NAME: val = fz_new_name(ctx, buf); break;
		case PDF_TOK_REAL: val = fz_new_real(ctx, fz_atof(buf)); break;
		case PDF_TOK_STRING: val = fz_new_string(ctx, buf, len); break;
		case PDF_TOK_TRUE: val = fz_new_bool(ctx, 1); break;
		case PDF_TOK_FALSE: val = fz_new_bool(ctx, 0); break;
		case PDF_TOK_NULL: val = fz_new_null(ctx); break;

		case PDF_TOK_INT:
			/* 64-bit to allow for numbers > INT_MAX and overflow */
			a = (int) strtoll(buf, 0, 10);
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
			{
				fz_drop_obj(ctx, key);
				fz_drop_obj(ctx, dict);
				return fz_error_note(error, "cannot parse dict");
			}
			/*
			 * The token after the int already belongs to the next
			 * entry (or ends the dict): store a plain int and jump
			 * back without lexing again.
			 */
			if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
				(tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID")))
			{
				val = fz_new_int(ctx, a);
				fz_dict_put(ctx, dict, key, val);
				fz_drop_obj(ctx, val);
				fz_drop_obj(ctx, key);
				goto skip;
			}
			if (tok == PDF_TOK_INT)
			{
				b = atoi(buf);
				error = pdf_lex(&tok, file, buf, cap, &len);
				if (error)
				{
					fz_drop_obj(ctx, key);
					fz_drop_obj(ctx, dict);
					return fz_error_note(error, "cannot parse dict");
				}
				if (tok == PDF_TOK_R)
				{
					val = fz_new_indirect(ctx, a, b, xref);
					break;
				}
			}
			fz_drop_obj(ctx, key);
			fz_drop_obj(ctx, dict);
			return fz_error_make("invalid indirect reference in dict");

		default:
			fz_drop_obj(ctx, key);
			fz_drop_obj(ctx, dict);
			return fz_error_make("unknown token in dict");
		}

		fz_dict_put(ctx, dict, key, val);
		fz_drop_obj(ctx, val);
		fz_drop_obj(ctx, key);
	}
}

/*
 * Parse a single object from an object stream.
 *
 * One token is lexed and turned into the matching object: array, dict,
 * name, real, string, boolean, null or integer. Indirect references are
 * not recognised here; an integer token always yields an integer object.
 *
 * op receives the new object on success.
 * Returns fz_okay or an error.
 */
fz_error
pdf_parse_stm_obj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
{
	fz_error error;
	int tok;
	int len;
	fz_context *ctx = file->ctx;

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_error_note(error, "cannot parse token in object stream");

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		error = pdf_parse_array(op, xref, file, buf, cap);
		if (error)
			return fz_error_note(error, "cannot parse object stream");
		break;
	case PDF_TOK_OPEN_DICT:
		error = pdf_parse_dict(op, xref, file, buf, cap);
		if (error)
			return fz_error_note(error, "cannot parse object stream");
		break;
	case PDF_TOK_NAME: *op = fz_new_name(ctx, buf); break;
	case PDF_TOK_REAL: *op = fz_new_real(ctx, fz_atof(buf)); break;
	case PDF_TOK_STRING: *op = fz_new_string(ctx, buf, len); break;
	case PDF_TOK_TRUE: *op = fz_new_bool(ctx, 1); break;
	case PDF_TOK_FALSE: *op = fz_new_bool(ctx, 0); break;
	case PDF_TOK_NULL: *op = fz_new_null(ctx); break;
	case PDF_TOK_INT: *op = fz_new_int(ctx, atoi(buf)); break;
	default: return fz_error_make("unknown token in object stream");
	}

	return fz_okay;
}

/*
 * Parse an indirect object definition: "num gen obj <object>" followed
 * by either the end-of-object keyword or the start of a stream.
 *
 * op       receives the parsed object on success
 * onum     if non-NULL, receives the object number
 * ogen     if non-NULL, receives the generation number
 * ostmofs  if non-NULL, receives the file position just after the
 *          stream keyword and its line ending, or 0 when the object
 *          has no stream
 *
 * A missing endobj/stream keyword after the object only produces a
 * warning; the object is still returned.
 *
 * Returns fz_okay or an error.
 */
fz_error
pdf_parse_ind_obj(fz_obj **op, pdf_xref *xref,
	fz_stream *file, char *buf, int cap,
	int *onum, int *ogen, int *ostmofs)
{
	fz_error error = fz_okay;
	fz_obj *obj = NULL;
	int num = 0, gen = 0, stm_ofs;
	int tok;
	int len;
	int a, b;
	fz_context *ctx = file->ctx;

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TOK_INT)
		return fz_error_make("expected object number (%d %d R)", num, gen);
	num = atoi(buf);

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TOK_INT)
		return fz_error_make("expected generation number (%d %d R)", num, gen);
	gen = atoi(buf);

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);
	if (tok != PDF_TOK_OBJ)
		return fz_error_make("expected 'obj' keyword (%d %d R)", num, gen);

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);

	switch (tok)
	{
	case PDF_TOK_OPEN_ARRAY:
		error = pdf_parse_array(&obj, xref, file, buf, cap);
		if (error)
			return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);
		break;

	case PDF_TOK_OPEN_DICT:
		error = pdf_parse_dict(&obj, xref, file, buf, cap);
		if (error)
			return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);
		break;

	case PDF_TOK_NAME: obj = fz_new_name(ctx, buf); break;
	case PDF_TOK_REAL: obj = fz_new_real(ctx, fz_atof(buf)); break;
	case PDF_TOK_STRING: obj = fz_new_string(ctx, buf, len); break;
	case PDF_TOK_TRUE: obj = fz_new_bool(ctx, 1); break;
	case PDF_TOK_FALSE: obj = fz_new_bool(ctx, 0); break;
	case PDF_TOK_NULL: obj = fz_new_null(ctx); break;

	case PDF_TOK_INT:
		a = atoi(buf);
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);
		/* the trailing keyword has already been lexed: skip the lex below */
		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
		{
			obj = fz_new_int(ctx, a);
			goto skip;
		}
		if (tok == PDF_TOK_INT)
		{
			b = atoi(buf);
			error = pdf_lex(&tok, file, buf, cap, &len);
			if (error)
				return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);
			if (tok == PDF_TOK_R)
			{
				obj = fz_new_indirect(ctx, a, b, xref);
				break;
			}
		}
		return fz_error_make("expected 'R' keyword (%d %d R)", num, gen);

	case PDF_TOK_ENDOBJ:
		/* empty object body: yield a null object */
		obj = fz_new_null(ctx);
		goto skip;

	default:
		return fz_error_make("syntax error in object (%d %d R)", num, gen);
	}

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
	{
		fz_drop_obj(ctx, obj);
		return fz_error_note(error, "cannot parse indirect object (%d %d R)", num, gen);
	}

skip:
	if (tok == PDF_TOK_STREAM)
	{
		/* consume spaces and the line ending after the keyword, so stm_ofs is the position after them */
		int c = fz_read_byte(file);
		while (c == ' ')
			c = fz_read_byte(file);
		if (c == '\r')
		{
			c = fz_peek_byte(file);
			if (c != '\n')
				fz_warn("line feed missing after stream begin marker (%d %d R)", num, gen);
			else
				fz_read_byte(file);
		}
		stm_ofs = fz_tell(file);
	}
	else if (tok == PDF_TOK_ENDOBJ)
	{
		stm_ofs = 0;
	}
	else
	{
		fz_warn("expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
		stm_ofs = 0;
	}

	if (onum) *onum = num;
	if (ogen) *ogen = gen;
	if (ostmofs) *ostmofs = stm_ofs;
	*op = obj;
	return fz_okay;
}