Improve PDF repair logic.

When we meet a broken PDF file, we attempt to repair it. We do this by reading tokens from the file and attempting to interpret them as a normal PDF stream. Unfortunately, if the file is corrupt enough that we start reading from the middle of a stream and happen to hit a '(' character, we go into string reading mode. We can then end up skipping over vast swathes of the file that we could otherwise have repaired. We fix this here by using a new version of the pdf_lex function that refuses to ever return a string. This means we may spend more time skipping over data than we did before, but we are less likely to skip over repairable objects. We also tweak other parts of the pdf repair logic here: if we hit a badly formed piece of data, we clear the num/gen we have stored so that the next plausible piece we get is not assigned to a random object number.
author: Robin Watts <robin.watts@artifex.com> 2013-12-30 17:59:13 +0000
committer: Robin Watts <robin.watts@artifex.com> 2014-01-02 20:04:38 +0000
commit: bd7393d1be2e3905a6c3f1bb722198217c6195dc (patch)
tree: 558533819a1412513e8a7c97c9e08c2a2870dded /source/pdf/pdf-lex.c
parent: cf9a5e5e7af55d15a83f542041fc63c73ba57425 (diff)
download: mupdf-bd7393d1be2e3905a6c3f1bb722198217c6195dc.tar.xz
1 files changed, 63 insertions, 0 deletions
diff --git a/source/pdf/pdf-lex.c b/source/pdf/pdf-lex.c
index 1be369eb..b019b7b2 100644
--- a/source/pdf/pdf-lex.c
+++ b/source/pdf/pdf-lex.c
@@ -507,6 +507,69 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
 	}
 }
 
+/*
+ * Variant of pdf_lex for the repair pass: never returns a string token, so a
+ * stray '(' in corrupt data cannot swallow the rest of the file. '(' and ')'
+ * are skipped; a lone '<' or '>' is dropped and lexing resumes after it.
+ *
+ * f: input stream. buf: passed to lex_name/lex_number. Returns the next token or PDF_TOK_EOF.
+ */
+pdf_token
+pdf_lex_no_string(fz_stream *f, pdf_lexbuf *buf)
+{
+	while (1)
+	{
+		int c = fz_read_byte(f);
+		switch (c)
+		{
+		case EOF:
+			return PDF_TOK_EOF;
+		case IS_WHITE:
+			lex_white(f);
+			break;
+		case '%':
+			lex_comment(f);
+			break;
+		case '/':
+			lex_name(f, buf);
+			return PDF_TOK_NAME;
+		case '(':
+		case ')':
+			continue; /* string delimiters are ignored, never lexed */
+		case '<':
+			c = fz_read_byte(f);
+			if (c == '<')
+				return PDF_TOK_OPEN_DICT;
+			/* Lone '<': put the next byte back so it is lexed, not lost. */
+			if (c != EOF)
+				fz_unread_byte(f);
+			continue;
+		case '>':
+			c = fz_read_byte(f);
+			if (c == '>')
+				return PDF_TOK_CLOSE_DICT;
+			if (c == EOF)
+				return PDF_TOK_EOF;
+			fz_unread_byte(f);
+			continue;
+		case '[':
+			return PDF_TOK_OPEN_ARRAY;
+		case ']':
+			return PDF_TOK_CLOSE_ARRAY;
+		case '{':
+			return PDF_TOK_OPEN_BRACE;
+		case '}':
+			return PDF_TOK_CLOSE_BRACE;
+		case IS_NUMBER:
+			return lex_number(f, buf, c);
+		default: /* isregular: !isdelim && !iswhite && c != EOF */
+			fz_unread_byte(f);
+			lex_name(f, buf);
+			return pdf_token_from_keyword(buf->scratch);
+		}
+	}
+}
+
 void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
 {
 	switch (tok)
author	Robin Watts <robin.watts@artifex.com>	2013-12-30 17:59:13 +0000
committer	Robin Watts <robin.watts@artifex.com>	2014-01-02 20:04:38 +0000
commit	bd7393d1be2e3905a6c3f1bb722198217c6195dc (patch)
tree	558533819a1412513e8a7c97c9e08c2a2870dded /source/pdf/pdf-lex.c
parent	cf9a5e5e7af55d15a83f542041fc63c73ba57425 (diff)
download	mupdf-bd7393d1be2e3905a6c3f1bb722198217c6195dc.tar.xz