4 files changed, 56 insertions, 13 deletions
diff --git a/pdf/mupdf-internal.h b/pdf/mupdf-internal.h
index 95931352..f6791335 100644
--- a/pdf/mupdf-internal.h
+++ b/pdf/mupdf-internal.h
@@ -58,6 +58,7 @@ void pdf_lexbuf_fin(pdf_lexbuf *lexbuf);
 ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lexbuf);
 
 pdf_token pdf_lex(fz_stream *f, pdf_lexbuf *lexbuf);
+pdf_token pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf);
 
 pdf_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
 pdf_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
diff --git a/pdf/pdf_lex.c b/pdf/pdf_lex.c
index 6b03d8d9..8e179042 100644
--- a/pdf/pdf_lex.c
+++ b/pdf/pdf_lex.c
@@ -442,8 +442,8 @@ ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb)
 	return lb->scratch - old;
 }
 
-pdf_token
-pdf_lex(fz_stream *f, pdf_lexbuf *buf)
+static pdf_token
+do_lex(fz_stream *f, pdf_lexbuf *buf, int repair)
 {
 	while (1)
 	{
@@ -484,6 +484,7 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
 				return PDF_TOK_CLOSE_DICT;
 			}
 			fz_warn(f->ctx, "lexical error (unexpected '>')");
+			fz_unread_byte(f);
 			continue;
 		case '[':
 			return PDF_TOK_OPEN_ARRAY;
@@ -494,6 +495,13 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
 		case '}':
 			return PDF_TOK_CLOSE_BRACE;
 		case IS_NUMBER:
+			/* In the 'repair' case, push the byte back and report
+			 * PDF_TOK_INT without consuming or parsing the number. */
+			if (repair)
+			{
+				fz_unread_byte(f);
+				return PDF_TOK_INT;
+			}
 			return lex_number(f, buf, c);
 		default: /* isregular: !isdelim && !iswhite && c != EOF */
 			fz_unread_byte(f);
@@ -503,6 +511,18 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
 	}
 }
 
+pdf_token /* Lex the next token from f in normal mode (do_lex with repair == 0). */
+pdf_lex(fz_stream *f, pdf_lexbuf *buf)
+{
+	return do_lex(f, buf, 0); /* repair off: numbers are parsed by lex_number() */
+}
+
+pdf_token /* Lex the next token from f in repair mode (do_lex with repair == 1). */
+pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf)
+{
+	return do_lex(f, buf, 1); /* repair on: a number yields PDF_TOK_INT but is left unread */
+}
+
 void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
 {
 	switch (tok)
diff --git a/pdf/pdf_repair.c b/pdf/pdf_repair.c
index e5a1a8a9..4240e305 100644
--- a/pdf/pdf_repair.c
+++ b/pdf/pdf_repair.c
@@ -15,8 +15,8 @@ struct entry
 	int stm_len;
 };
 
-static void
-pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id)
+static int
+pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, int *tmpofs)
 {
 	pdf_token tok;
 	int stm_len;
@@ -28,6 +28,10 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf
 
 	stm_len = 0;
 
+	/* On entry to this function, we know that we've just seen
+	 * '<int> <int> obj'. We expect the next thing we see to be a
+	 * pdf object. Regardless of the type of thing we meet next
+	 * we only need to fully parse it if it is a dictionary. */
 	tok = pdf_lex(file, buf);
 
 	if (tok == PDF_TOK_OPEN_DICT)
@@ -79,15 +83,13 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf
 		tok != PDF_TOK_EOF &&
 		tok != PDF_TOK_INT )
 	{
+		*tmpofs = fz_tell(file);
+		if (*tmpofs < 0)
+			fz_throw(ctx, "cannot tell in file");
 		tok = pdf_lex(file, buf);
 	}
 
-	if (tok == PDF_TOK_INT)
-	{
-		while (buf->len-- > 0)
-			fz_unread_byte(file);
-	}
-	else if (tok == PDF_TOK_STREAM)
+	if (tok == PDF_TOK_STREAM)
 	{
 		int c = fz_read_byte(file);
 		if (c == '\r') {
@@ -132,10 +134,22 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf
 		*stmlenp = fz_tell(file) - *stmofsp - 9;
 
 atobjend:
+		*tmpofs = fz_tell(file);
+		if (*tmpofs < 0)
+			fz_throw(ctx, "cannot tell in file");
 		tok = pdf_lex(file, buf);
 		if (tok != PDF_TOK_ENDOBJ)
 			fz_warn(ctx, "object missing 'endobj' token");
+		else
+		{
+			/* Read another token as we always return the next one */
+			*tmpofs = fz_tell(file);
+			if (*tmpofs < 0)
+				fz_throw(ctx, "cannot tell in file");
+			tok = pdf_lex(file, buf);
+		}
 	}
+	return tok;
 }
 
 static void
@@ -288,6 +302,11 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf)
 				break;
 			}
 
+			/* If we have the next token already, then we'll jump
+			 * back here, rather than going through the top of
+			 * the loop. */
+		have_next_token:
+
 			if (tok == PDF_TOK_INT)
 			{
 				numofs = genofs;
@@ -300,7 +319,7 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf)
 			{
 				fz_try(ctx)
 				{
-					pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id);
+					tok = pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id, &tmpofs);
 				}
 				fz_catch(ctx)
 				{
@@ -341,6 +360,8 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf)
 
 				if (num > maxnum)
 					maxnum = num;
+
+				goto have_next_token;
 			}
 
 			/* trailer dictionary */
diff --git a/pdf/pdf_xref.c b/pdf/pdf_xref.c
index 92f93352..332ddaf0 100644
--- a/pdf/pdf_xref.c
+++ b/pdf/pdf_xref.c
@@ -237,8 +237,9 @@ pdf_read_start_xref(pdf_document *xref)
 			while (iswhite(buf[i]) && i < n)
 				i ++;
 			xref->startxref = atoi((char*)(buf + i));
-
-			return;
+			if (xref->startxref != 0)
+				return;
+			break;
 		}
 	}