diff options
author | Robin Watts <robin.watts@artifex.com> | 2013-06-11 16:53:35 +0100 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2013-06-13 15:22:37 +0100 |
commit | 919bc6d91961caac1f1e4aeb95fb581f778a5542 (patch) | |
tree | 5938f9140e0a112918ec8a6292124890343fa26e /pdf | |
parent | f9ff77036a2a85ab990244191e1b7ba4b1a1665e (diff) | |
download | mupdf-919bc6d91961caac1f1e4aeb95fb581f778a5542.tar.xz |
Bug 694303: Fix various problems in pdf parsing
Treat a 'startxref' offset of 0 as an error; otherwise the code
falls through and we SEGV.
Also update the pdf_repair_obj function to cope better with the new
way we parse ints. Previously we parsed ints into the buffer and
atoi'd them there; stepping backwards over the int was therefore just
a matter of stepping backwards over the specified number of chars.
In the 'new' code (now quite old) we parse ints directly, hence we
cannot do this stepping back. Also, stepping backwards by more than
1 byte is risky anyway. We therefore adopt a smarter approach of
returning the next lexed token from pdf_repair_obj.
Thanks to zeniko for reporting these problems and providing a test
file.
Diffstat (limited to 'pdf')
-rw-r--r-- | pdf/mupdf-internal.h | 1 | ||||
-rw-r--r-- | pdf/pdf_lex.c | 24 | ||||
-rw-r--r-- | pdf/pdf_repair.c | 39 | ||||
-rw-r--r-- | pdf/pdf_xref.c | 5 |
4 files changed, 56 insertions, 13 deletions
diff --git a/pdf/mupdf-internal.h b/pdf/mupdf-internal.h index 95931352..f6791335 100644 --- a/pdf/mupdf-internal.h +++ b/pdf/mupdf-internal.h @@ -58,6 +58,7 @@ void pdf_lexbuf_fin(pdf_lexbuf *lexbuf); ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lexbuf); pdf_token pdf_lex(fz_stream *f, pdf_lexbuf *lexbuf); +pdf_token pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf); pdf_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf); pdf_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf); diff --git a/pdf/pdf_lex.c b/pdf/pdf_lex.c index 6b03d8d9..8e179042 100644 --- a/pdf/pdf_lex.c +++ b/pdf/pdf_lex.c @@ -442,8 +442,8 @@ ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb) return lb->scratch - old; } -pdf_token -pdf_lex(fz_stream *f, pdf_lexbuf *buf) +static pdf_token +do_lex(fz_stream *f, pdf_lexbuf *buf, int repair) { while (1) { @@ -484,6 +484,7 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) return PDF_TOK_CLOSE_DICT; } fz_warn(f->ctx, "lexical error (unexpected '>')"); + fz_unread_byte(f); continue; case '[': return PDF_TOK_OPEN_ARRAY; @@ -494,6 +495,13 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) case '}': return PDF_TOK_CLOSE_BRACE; case IS_NUMBER: + /* In the 'repair' case, we want to stop before + * consuming the int. 
*/ + if (repair) + { + fz_unread_byte(f); + return PDF_TOK_INT; + } return lex_number(f, buf, c); default: /* isregular: !isdelim && !iswhite && c != EOF */ fz_unread_byte(f); @@ -503,6 +511,18 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf) } } +pdf_token +pdf_lex(fz_stream *f, pdf_lexbuf *buf) +{ + return do_lex(f, buf, 0); +} + +pdf_token +pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf) +{ + return do_lex(f, buf, 1); +} + void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf) { switch (tok) diff --git a/pdf/pdf_repair.c b/pdf/pdf_repair.c index e5a1a8a9..4240e305 100644 --- a/pdf/pdf_repair.c +++ b/pdf/pdf_repair.c @@ -15,8 +15,8 @@ struct entry int stm_len; }; -static void -pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id) +static int +pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, int *tmpofs) { pdf_token tok; int stm_len; @@ -28,6 +28,10 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf stm_len = 0; + /* On entry to this function, we know that we've just seen + * '<int> <int> obj'. We expect the next thing we see to be a + * pdf object. Regardless of the type of thing we meet next + * we only need to fully parse it if it is a dictionary. 
*/ tok = pdf_lex(file, buf); if (tok == PDF_TOK_OPEN_DICT) @@ -79,15 +83,13 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf tok != PDF_TOK_EOF && tok != PDF_TOK_INT ) { + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); tok = pdf_lex(file, buf); } - if (tok == PDF_TOK_INT) - { - while (buf->len-- > 0) - fz_unread_byte(file); - } - else if (tok == PDF_TOK_STREAM) + if (tok == PDF_TOK_STREAM) { int c = fz_read_byte(file); if (c == '\r') { @@ -132,10 +134,22 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf *stmlenp = fz_tell(file) - *stmofsp - 9; atobjend: + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); tok = pdf_lex(file, buf); if (tok != PDF_TOK_ENDOBJ) fz_warn(ctx, "object missing 'endobj' token"); + else + { + /* Read another token as we always return the next one */ + *tmpofs = fz_tell(file); + if (*tmpofs < 0) + fz_throw(ctx, "cannot tell in file"); + tok = pdf_lex(file, buf); + } } + return tok; } static void @@ -288,6 +302,11 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) break; } + /* If we have the next token already, then we'll jump + * back here, rather than going through the top of + * the loop. 
*/ + have_next_token: + if (tok == PDF_TOK_INT) { numofs = genofs; @@ -300,7 +319,7 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) { fz_try(ctx) { - pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id); + tok = pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id, &tmpofs); } fz_catch(ctx) { @@ -341,6 +360,8 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf) if (num > maxnum) maxnum = num; + + goto have_next_token; } /* trailer dictionary */ diff --git a/pdf/pdf_xref.c b/pdf/pdf_xref.c index 92f93352..332ddaf0 100644 --- a/pdf/pdf_xref.c +++ b/pdf/pdf_xref.c @@ -237,8 +237,9 @@ pdf_read_start_xref(pdf_document *xref) while (iswhite(buf[i]) && i < n) i ++; xref->startxref = atoi((char*)(buf + i)); - - return; + if (xref->startxref != 0) + return; + break; } } |