summaryrefslogtreecommitdiff
path: root/source/pdf/pdf-lex.c
diff options
context:
space:
mode:
authorRobin Watts <robin.watts@artifex.com>2013-12-30 17:59:13 +0000
committerRobin Watts <robin.watts@artifex.com>2014-01-02 20:04:38 +0000
commitbd7393d1be2e3905a6c3f1bb722198217c6195dc (patch)
tree558533819a1412513e8a7c97c9e08c2a2870dded /source/pdf/pdf-lex.c
parentcf9a5e5e7af55d15a83f542041fc63c73ba57425 (diff)
downloadmupdf-bd7393d1be2e3905a6c3f1bb722198217c6195dc.tar.xz
Improve PDF repair logic.
When we meet a broken PDF file, we attempt to repair it. We do this by reading tokens from the file and attempting to interpret them as a normal PDF stream. Unfortunately, if the file is corrupt enough so that we start to read from the middle of a stream, and we happen to hit an '(' character, we can go into string reading mode. We can then end up skipping over vast swathes of file that we could otherwise repair. We fix this here by using a new version of the pdf_lex function that refuses to ever return a string. This means we may take more time over skipping things than we did before, but are less likely to skip stuff. We also tweak other parts of the pdf repair logic here. If we hit a badly formed piece of data, clear the num/gen we have stored so that the next plausible piece we get does not get assigned to a random object number.
Diffstat (limited to 'source/pdf/pdf-lex.c')
-rw-r--r--source/pdf/pdf-lex.c63
1 files changed, 63 insertions, 0 deletions
diff --git a/source/pdf/pdf-lex.c b/source/pdf/pdf-lex.c
index 1be369eb..b019b7b2 100644
--- a/source/pdf/pdf-lex.c
+++ b/source/pdf/pdf-lex.c
@@ -507,6 +507,69 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
}
}
+pdf_token
+pdf_lex_no_string(fz_stream *f, pdf_lexbuf *buf)
+{
+ while (1)
+ {
+ int c = fz_read_byte(f);
+ switch (c)
+ {
+ case EOF:
+ return PDF_TOK_EOF;
+ case IS_WHITE:
+ lex_white(f);
+ break;
+ case '%':
+ lex_comment(f);
+ break;
+ case '/':
+ lex_name(f, buf);
+ return PDF_TOK_NAME;
+ case '(':
+ continue;
+ case ')':
+ continue;
+ case '<':
+ c = fz_read_byte(f);
+ if (c == '<')
+ {
+ return PDF_TOK_OPEN_DICT;
+ }
+ else
+ {
+ continue;
+ }
+ case '>':
+ c = fz_read_byte(f);
+ if (c == '>')
+ {
+ return PDF_TOK_CLOSE_DICT;
+ }
+ if (c == EOF)
+ {
+ return PDF_TOK_EOF;
+ }
+ fz_unread_byte(f);
+ continue;
+ case '[':
+ return PDF_TOK_OPEN_ARRAY;
+ case ']':
+ return PDF_TOK_CLOSE_ARRAY;
+ case '{':
+ return PDF_TOK_OPEN_BRACE;
+ case '}':
+ return PDF_TOK_CLOSE_BRACE;
+ case IS_NUMBER:
+ return lex_number(f, buf, c);
+ default: /* isregular: !isdelim && !iswhite && c != EOF */
+ fz_unread_byte(f);
+ lex_name(f, buf);
+ return pdf_token_from_keyword(buf->scratch);
+ }
+ }
+}
+
void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
switch (tok)