summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- include/mupdf/pdf/parse.h 1
-rw-r--r-- source/pdf/pdf-lex.c 63
-rw-r--r-- source/pdf/pdf-repair.c 35
3 files changed, 87 insertions, 12 deletions
diff --git a/include/mupdf/pdf/parse.h b/include/mupdf/pdf/parse.h
index 0564a748..625ebfca 100644
--- a/include/mupdf/pdf/parse.h
+++ b/include/mupdf/pdf/parse.h
@@ -24,6 +24,7 @@ void pdf_lexbuf_fin(pdf_lexbuf *lexbuf);
ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lexbuf);
pdf_token pdf_lex(fz_stream *f, pdf_lexbuf *lexbuf);
+pdf_token pdf_lex_no_string(fz_stream *f, pdf_lexbuf *lexbuf);
pdf_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
pdf_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
diff --git a/source/pdf/pdf-lex.c b/source/pdf/pdf-lex.c
index 1be369eb..b019b7b2 100644
--- a/source/pdf/pdf-lex.c
+++ b/source/pdf/pdf-lex.c
@@ -507,6 +507,69 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
}
}
+/* Lex the next token from f without lexing strings: '(' and ')' and any
+ * lone '<' or '>' are skipped and scanning resumes at the byte after them.
+ * Only "<<" and ">>" yield dictionary tokens. Returns the token found. */
+pdf_token
+pdf_lex_no_string(fz_stream *f, pdf_lexbuf *buf)
+{
+	while (1)
+	{
+		int c = fz_read_byte(f);
+		switch (c)
+		{
+		case EOF:
+			return PDF_TOK_EOF;
+		case IS_WHITE:
+			lex_white(f);
+			break;
+		case '%':
+			lex_comment(f);
+			break;
+		case '/':
+			lex_name(f, buf);
+			return PDF_TOK_NAME;
+		case '(':
+		case ')':
+			/* Literal string delimiters are ignored. */
+			continue;
+		case '<':
+			c = fz_read_byte(f);
+			if (c == '<')
+				return PDF_TOK_OPEN_DICT;
+			if (c == EOF)
+				return PDF_TOK_EOF;
+			/* Lone '<': push the following byte back so it is lexed
+			 * on its own instead of being lost (as the '>' case does). */
+			fz_unread_byte(f);
+			continue;
+		case '>':
+			c = fz_read_byte(f);
+			if (c == '>')
+				return PDF_TOK_CLOSE_DICT;
+			if (c == EOF)
+				return PDF_TOK_EOF;
+			/* Lone '>': ignore it and re-lex the byte after it. */
+			fz_unread_byte(f);
+			continue;
+		case '[':
+			return PDF_TOK_OPEN_ARRAY;
+		case ']':
+			return PDF_TOK_CLOSE_ARRAY;
+		case '{':
+			return PDF_TOK_OPEN_BRACE;
+		case '}':
+			return PDF_TOK_CLOSE_BRACE;
+		case IS_NUMBER:
+			return lex_number(f, buf, c);
+		default: /* isregular: !isdelim && !iswhite && c != EOF */
+			fz_unread_byte(f);
+			lex_name(f, buf);
+			return pdf_token_from_keyword(buf->scratch);
+		}
+	}
+}
+
void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
switch (tok)
diff --git a/source/pdf/pdf-repair.c b/source/pdf/pdf-repair.c
index bf0e2d83..e7449de8 100644
--- a/source/pdf/pdf-repair.c
+++ b/source/pdf/pdf-repair.c
@@ -311,7 +311,7 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
fz_try(ctx)
{
- tok = pdf_lex(doc->file, buf);
+ tok = pdf_lex_no_string(doc->file, buf);
}
fz_catch(ctx)
{
@@ -327,6 +327,12 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
if (tok == PDF_TOK_INT)
{
+ if (buf->i < 0)
+ {
+ num = 0;
+ gen = 0;
+ continue;
+ }
numofs = genofs;
num = gen;
genofs = tmpofs;
@@ -380,7 +386,9 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
goto have_next_token;
}
- /* trailer dictionary */
+ /* If we find a dictionary it is probably the trailer,
+ * but could be a stream (or bogus) dictionary caused
+ * by a corrupt file. */
else if (tok == PDF_TOK_OPEN_DICT)
{
fz_try(ctx)
@@ -390,13 +398,11 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
fz_catch(ctx)
{
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
- /* If we haven't seen a root yet, there is nothing
- * we can do, but give up. Otherwise, we'll make
- * do. */
- if (!root)
- fz_rethrow(ctx);
- fz_warn(ctx, "cannot parse trailer dictionary - ignoring rest of file");
- break;
+ /* If this was the real trailer dict
+ * it was broken, in which case we are
+ * in trouble. Keep going though in
+ * case this was just a bogus dict. */
+ continue;
}
obj = pdf_dict_gets(dict, "Encrypt");
@@ -431,11 +437,16 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
obj = NULL;
}
- else if (tok == PDF_TOK_ERROR)
- fz_read_byte(doc->file);
-
else if (tok == PDF_TOK_EOF)
break;
+ else
+ {
+ if (tok == PDF_TOK_ERROR)
+ fz_read_byte(doc->file);
+ num = 0;
+ gen = 0;
+ }
+
}
/* make xref reasonable */