3 files changed, 87 insertions, 12 deletions
diff --git a/include/mupdf/pdf/parse.h b/include/mupdf/pdf/parse.h
index 0564a748..625ebfca 100644
--- a/include/mupdf/pdf/parse.h
+++ b/include/mupdf/pdf/parse.h
@@ -24,6 +24,7 @@ void pdf_lexbuf_fin(pdf_lexbuf *lexbuf);
 ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lexbuf);
 
 pdf_token pdf_lex(fz_stream *f, pdf_lexbuf *lexbuf);
+pdf_token pdf_lex_no_string(fz_stream *f, pdf_lexbuf *lexbuf);
 
 pdf_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
 pdf_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
diff --git a/source/pdf/pdf-lex.c b/source/pdf/pdf-lex.c
index 1be369eb..b019b7b2 100644
--- a/source/pdf/pdf-lex.c
+++ b/source/pdf/pdf-lex.c
@@ -507,6 +507,69 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
 	}
 }
 
+/* Lex the next token, never producing a string token: '(' and ')'
+ * are skipped, as is a '<' or '>' that is not doubled. Returns the
+ * token type; buf is handed to lex_name/lex_number to be filled in.
+ * pdf_repair_xref uses this in place of pdf_lex while scanning. */
+
+pdf_token
+pdf_lex_no_string(fz_stream *f, pdf_lexbuf *buf)
+{
+	while (1)
+	{
+		int c = fz_read_byte(f);
+		switch (c)
+		{
+		case EOF:
+			return PDF_TOK_EOF;
+		case IS_WHITE:
+			lex_white(f);
+			break;
+		case '%':
+			lex_comment(f);
+			break;
+		case '/':
+			lex_name(f, buf);
+			return PDF_TOK_NAME;
+		case '(':
+		case ')':
+			continue;
+		case '<':
+			c = fz_read_byte(f);
+			if (c == '<')
+				return PDF_TOK_OPEN_DICT;
+			if (c == EOF)
+				return PDF_TOK_EOF;
+			/* Lone '<': keep the byte after it for the next pass. */
+			fz_unread_byte(f);
+			continue;
+		case '>':
+			c = fz_read_byte(f);
+			if (c == '>')
+				return PDF_TOK_CLOSE_DICT;
+			if (c == EOF)
+				return PDF_TOK_EOF;
+			/* Lone '>': keep the byte after it for the next pass. */
+			fz_unread_byte(f);
+			continue;
+		case '[':
+			return PDF_TOK_OPEN_ARRAY;
+		case ']':
+			return PDF_TOK_CLOSE_ARRAY;
+		case '{':
+			return PDF_TOK_OPEN_BRACE;
+		case '}':
+			return PDF_TOK_CLOSE_BRACE;
+		case IS_NUMBER:
+			return lex_number(f, buf, c);
+		default: /* isregular: !isdelim && !iswhite && c != EOF */
+			fz_unread_byte(f);
+			lex_name(f, buf);
+			return pdf_token_from_keyword(buf->scratch);
+		}
+	}
+}
+
 void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
 {
 	switch (tok)
diff --git a/source/pdf/pdf-repair.c b/source/pdf/pdf-repair.c
index bf0e2d83..e7449de8 100644
--- a/source/pdf/pdf-repair.c
+++ b/source/pdf/pdf-repair.c
@@ -311,7 +311,7 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
 
 			fz_try(ctx)
 			{
-				tok = pdf_lex(doc->file, buf);
+				tok = pdf_lex_no_string(doc->file, buf);
 			}
 			fz_catch(ctx)
 			{
@@ -327,6 +327,12 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
 
 			if (tok == PDF_TOK_INT)
 			{
+				if (buf->i < 0)
+				{
+					num = 0;
+					gen = 0;
+					continue;
+				}
 				numofs = genofs;
 				num = gen;
 				genofs = tmpofs;
@@ -380,7 +386,9 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
 				goto have_next_token;
 			}
 
-			/* trailer dictionary */
+			/* If we find a dictionary it is probably the trailer,
+			 * but could be a stream (or bogus) dictionary caused
+			 * by a corrupt file. */
 			else if (tok == PDF_TOK_OPEN_DICT)
 			{
 				fz_try(ctx)
@@ -390,13 +398,11 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
 				fz_catch(ctx)
 				{
 					fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
-					/* If we haven't seen a root yet, there is nothing
-					 * we can do, but give up. Otherwise, we'll make
-					 * do. */
-					if (!root)
-						fz_rethrow(ctx);
-					fz_warn(ctx, "cannot parse trailer dictionary - ignoring rest of file");
-					break;
+					/* If this was the real trailer dict
+					 * it was broken, in which case we are
+					 * in trouble. Keep going though in
+					 * case this was just a bogus dict. */
+					continue;
 				}
 
 				obj = pdf_dict_gets(dict, "Encrypt");
@@ -431,11 +437,16 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
 				obj = NULL;
 			}
 
-			else if (tok == PDF_TOK_ERROR)
-				fz_read_byte(doc->file);
-
 			else if (tok == PDF_TOK_EOF)
 				break;
+			else
+			{
+				if (tok == PDF_TOK_ERROR)
+					fz_read_byte(doc->file);
+				num = 0;
+				gen = 0;
+			}
+
 		}
 
 		/* make xref reasonable */