diff options
-rw-r--r-- | pdf/mupdf-internal.h | 1 | ||||
-rw-r--r-- | pdf/pdf_lex.c | 24 | ||||
-rw-r--r-- | pdf/pdf_repair.c | 39 | ||||
-rw-r--r-- | pdf/pdf_xref.c | 5 |
4 files changed, 56 insertions, 13 deletions
diff --git a/pdf/mupdf-internal.h b/pdf/mupdf-internal.h index 95931352..f6791335 100644 --- a/pdf/mupdf-internal.h +++ b/pdf/mupdf-internal.h @@ -58,6 +58,7 @@ void pdf_lexbuf_fin(pdf_lexbuf *lexbuf); ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lexbuf); pdf_token pdf_lex(fz_stream *f, pdf_lexbuf *lexbuf); +pdf_token pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf); pdf_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf); pdf_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf); diff --git a/pdf/pdf_lex.c b/pdf/pdf_lex.c index 6b03d8d9..8e179042 100644 --- a/pdf/pdf_lex.c +++ b/pdf/pdf_lex.c @@ -442,8 +442,8 @@ ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb) return lb->scratch - old; } -pdf_token -pdf_lex(fz_stream *f, pdf_lexbuf *buf) +static pdf_token +do_lex(fz_stream *f, pdf_lexbuf *buf, int repair) { while (1) { @@ -484,6 +484,7 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) return PDF_TOK_CLOSE_DICT; } fz_warn(f->ctx, "lexical error (unexpected '>')"); + fz_unread_byte(f); continue; case '[': return PDF_TOK_OPEN_ARRAY; @@ -494,6 +495,13 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) case '}': return PDF_TOK_CLOSE_BRACE; case IS_NUMBER: + /* In the 'repair' case, we want to stop before + * consuming the int. */ + if (repair) + { + fz_unread_byte(f); + return PDF_TOK_INT; + } return lex_number(f, buf, c); default: /* isregular: !isdelim && !iswhite && c != EOF */ fz_unread_byte(f); @@ -503,6 +511,18 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) } } +pdf_token +pdf_lex(fz_stream *f, pdf_lexbuf *buf) +{ + return do_lex(f, buf, 0); +} + +pdf_token +pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf) +{ + return do_lex(f, buf, 1); +} + void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf) { switch (tok) diff --git a/pdf/pdf_repair.c b/pdf/pdf_repair.c index e5a1a8a9..4240e305 100644 --- a/pdf/pdf_repair.c +++ b/pdf/pdf_repair.c @@ -15,8 +15,8 @@ struct entry int stm_len; }; -static void -pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id) +static int +pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, int *tmpofs) { pdf_token tok; int stm_len; @@ -28,6 +28,10 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf stm_len = 0; + /* On entry to this function, we know that we've just seen + * '<int> <int> obj'. We expect the next thing we see to be a + * pdf object. Regardless of the type of thing we meet next + * we only need to fully parse it if it is a dictionary. */ tok = pdf_lex(file, buf); if (tok == PDF_TOK_OPEN_DICT) @@ -79,15 +83,13 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf tok != PDF_TOK_EOF && tok != PDF_TOK_INT ) { + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); tok = pdf_lex(file, buf); } - if (tok == PDF_TOK_INT) - { - while (buf->len-- > 0) - fz_unread_byte(file); - } - else if (tok == PDF_TOK_STREAM) + if (tok == PDF_TOK_STREAM) { int c = fz_read_byte(file); if (c == '\r') { @@ -132,10 +134,22 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf *stmlenp = fz_tell(file) - *stmofsp - 9; atobjend: + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); tok = pdf_lex(file, buf); if (tok != PDF_TOK_ENDOBJ) fz_warn(ctx, "object missing 'endobj' token"); + else + { + /* Read another token as we always return the next one */ + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); + tok = pdf_lex(file, buf); + } } + return tok; } static void @@ -288,6 +302,11 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) break; } + /* If we have the next token already, then we'll jump + * back here, rather than going through the top of + * the loop. */ + have_next_token: + if (tok == PDF_TOK_INT) { numofs = genofs; @@ -300,7 +319,7 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) { fz_try(ctx) { - pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id); + tok = pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id, &tmpofs); } fz_catch(ctx) { @@ -341,6 +360,8 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) if (num > maxnum) maxnum = num; + + goto have_next_token; } /* trailer dictionary */ diff --git a/pdf/pdf_xref.c b/pdf/pdf_xref.c index 92f93352..332ddaf0 100644 --- a/pdf/pdf_xref.c +++ b/pdf/pdf_xref.c @@ -237,8 +237,9 @@ pdf_read_start_xref(pdf_document *xref) while (iswhite(buf[i]) && i < n) i ++; xref->startxref = atoi((char*)(buf + i)); - - return; + if (xref->startxref != 0) + return; + break; } } |