diff options
author | Robin Watts <robin.watts@artifex.com> | 2013-06-11 16:53:35 +0100 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2013-06-13 15:22:37 +0100 |
commit | 919bc6d91961caac1f1e4aeb95fb581f778a5542 (patch) | |
tree | 5938f9140e0a112918ec8a6292124890343fa26e /pdf | |
parent | f9ff77036a2a85ab990244191e1b7ba4b1a1665e (diff) | |
download | mupdf-919bc6d91961caac1f1e4aeb95fb581f778a5542.tar.xz |
Bug 694303: Fix various problems in pdf parsing
Treat a 'startxref' offset of 0 as an error; otherwise the code
falls through and we SEGV.
Also update the pdf_repair_obj function to cope better with the new
way we parse ints. Previously we parsed ints into the buffer and
atoi'd them there; stepping backwards over the int was therefore just
a matter of stepping backwards over the specified number of chars.
In the 'new' code (now quite old) we parse ints directly, hence we
cannot do this stepping back. Also, stepping backwards by more than
1 byte is risky anyway. We therefore adopt a smarter approach of
returning the next lexed token from pdf_repair_obj.
Thanks to zeniko for reporting these problems and providing a test
file.
Diffstat (limited to 'pdf')
-rw-r--r-- | pdf/mupdf-internal.h | 1 | ||||
-rw-r--r-- | pdf/pdf_lex.c | 24 | ||||
-rw-r--r-- | pdf/pdf_repair.c | 39 | ||||
-rw-r--r-- | pdf/pdf_xref.c | 5 |
4 files changed, 56 insertions, 13 deletions
diff --git a/pdf/mupdf-internal.h b/pdf/mupdf-internal.h index 95931352..f6791335 100644 --- a/pdf/mupdf-internal.h +++ b/pdf/mupdf-internal.h @@ -58,6 +58,7 @@ void pdf_lexbuf_fin(pdf_lexbuf *lexbuf); ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lexbuf); pdf_token pdf_lex(fz_stream *f, pdf_lexbuf *lexbuf); +pdf_token pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf); pdf_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf); pdf_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf); diff --git a/pdf/pdf_lex.c b/pdf/pdf_lex.c index 6b03d8d9..8e179042 100644 --- a/pdf/pdf_lex.c +++ b/pdf/pdf_lex.c @@ -442,8 +442,8 @@ ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb) return lb->scratch - old; } -pdf_token -pdf_lex(fz_stream *f, pdf_lexbuf *buf) +static pdf_token +do_lex(fz_stream *f, pdf_lexbuf *buf, int repair) { while (1) { @@ -484,6 +484,7 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) return PDF_TOK_CLOSE_DICT; } fz_warn(f->ctx, "lexical error (unexpected '>')"); + fz_unread_byte(f); continue; case '[': return PDF_TOK_OPEN_ARRAY; @@ -494,6 +495,13 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) case '}': return PDF_TOK_CLOSE_BRACE; case IS_NUMBER: + /* In the 'repair' case, we want to stop before + * consuming the int. 
*/ + if (repair) + { + fz_unread_byte(f); + return PDF_TOK_INT; + } return lex_number(f, buf, c); default: /* isregular: !isdelim && !iswhite && c != EOF */ fz_unread_byte(f); @@ -503,6 +511,18 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) } } +pdf_token +pdf_lex(fz_stream *f, pdf_lexbuf *buf) +{ + return do_lex(f, buf, 0); +} + +pdf_token +pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf) +{ + return do_lex(f, buf, 1); +} + void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf) { switch (tok) diff --git a/pdf/pdf_repair.c b/pdf/pdf_repair.c index e5a1a8a9..4240e305 100644 --- a/pdf/pdf_repair.c +++ b/pdf/pdf_repair.c @@ -15,8 +15,8 @@ struct entry int stm_len; }; -static void -pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id) +static int +pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, int *tmpofs) { pdf_token tok; int stm_len; @@ -28,6 +28,10 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf stm_len = 0; + /* On entry to this function, we know that we've just seen + * '<int> <int> obj'. We expect the next thing we see to be a + * pdf object. Regardless of the type of thing we meet next + * we only need to fully parse it if it is a dictionary. 
*/ tok = pdf_lex(file, buf); if (tok == PDF_TOK_OPEN_DICT) @@ -79,15 +83,13 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf tok != PDF_TOK_EOF && tok != PDF_TOK_INT ) { + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); tok = pdf_lex(file, buf); } - if (tok == PDF_TOK_INT) - { - while (buf->len-- > 0) - fz_unread_byte(file); - } - else if (tok == PDF_TOK_STREAM) + if (tok == PDF_TOK_STREAM) { int c = fz_read_byte(file); if (c == '\r') { @@ -132,10 +134,22 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf *stmlenp = fz_tell(file) - *stmofsp - 9; atobjend: + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); tok = pdf_lex(file, buf); if (tok != PDF_TOK_ENDOBJ) fz_warn(ctx, "object missing 'endobj' token"); + else + { + /* Read another token as we always return the next one */ + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); + tok = pdf_lex(file, buf); + } } + return tok; } static void @@ -288,6 +302,11 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) break; } + /* If we have the next token already, then we'll jump + * back here, rather than going through the top of + * the loop. 
*/ + have_next_token: + if (tok == PDF_TOK_INT) { numofs = genofs; @@ -300,7 +319,7 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) { fz_try(ctx) { - pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id); + tok = pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id, &tmpofs); } fz_catch(ctx) { @@ -341,6 +360,8 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) if (num > maxnum) maxnum = num; + + goto have_next_token; } /* trailer dictionary */ diff --git a/pdf/pdf_xref.c b/pdf/pdf_xref.c index 92f93352..332ddaf0 100644 --- a/pdf/pdf_xref.c +++ b/pdf/pdf_xref.c @@ -237,8 +237,9 @@ pdf_read_start_xref(pdf_document *xref) while (iswhite(buf[i]) && i < n) i ++; xref->startxref = atoi((char*)(buf + i)); - - return; + if (xref->startxref != 0) + return; + break; } } |