summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- pdf/mupdf-internal.h 1
-rw-r--r-- pdf/pdf_lex.c 24
-rw-r--r-- pdf/pdf_repair.c 39
-rw-r--r-- pdf/pdf_xref.c 5
4 files changed, 56 insertions, 13 deletions
diff --git a/pdf/mupdf-internal.h b/pdf/mupdf-internal.h
index 95931352..f6791335 100644
--- a/pdf/mupdf-internal.h
+++ b/pdf/mupdf-internal.h
@@ -58,6 +58,7 @@ void pdf_lexbuf_fin(pdf_lexbuf *lexbuf);
ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lexbuf);
pdf_token pdf_lex(fz_stream *f, pdf_lexbuf *lexbuf);
+pdf_token pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf);
pdf_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
pdf_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
diff --git a/pdf/pdf_lex.c b/pdf/pdf_lex.c
index 6b03d8d9..8e179042 100644
--- a/pdf/pdf_lex.c
+++ b/pdf/pdf_lex.c
@@ -442,8 +442,8 @@ ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb)
return lb->scratch - old;
}
-pdf_token
-pdf_lex(fz_stream *f, pdf_lexbuf *buf)
+static pdf_token
+do_lex(fz_stream *f, pdf_lexbuf *buf, int repair)
{
while (1)
{
@@ -484,6 +484,7 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
return PDF_TOK_CLOSE_DICT;
}
fz_warn(f->ctx, "lexical error (unexpected '>')");
+ fz_unread_byte(f);
continue;
case '[':
return PDF_TOK_OPEN_ARRAY;
@@ -494,6 +495,13 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
case '}':
return PDF_TOK_CLOSE_BRACE;
case IS_NUMBER:
+ /* In the 'repair' case, we want to stop before
+ * consuming the int. */
+ if (repair)
+ {
+ fz_unread_byte(f);
+ return PDF_TOK_INT;
+ }
return lex_number(f, buf, c);
default: /* isregular: !isdelim && !iswhite && c != EOF */
fz_unread_byte(f);
@@ -503,6 +511,18 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
}
}
+pdf_token
+pdf_lex(fz_stream *f, pdf_lexbuf *buf) /* Lex the next token from f with repair mode off. */
+{
+ return do_lex(f, buf, 0); /* repair == 0: a number is lexed in full via lex_number() */
+}
+
+pdf_token
+pdf_lex_repair(fz_stream *f, pdf_lexbuf *buf) /* Lex the next token from f with repair mode on. */
+{
+ return do_lex(f, buf, 1); /* repair == 1: on a number, the byte is unread and PDF_TOK_INT is returned without consuming the int */
+}
+
void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
switch (tok)
diff --git a/pdf/pdf_repair.c b/pdf/pdf_repair.c
index e5a1a8a9..4240e305 100644
--- a/pdf/pdf_repair.c
+++ b/pdf/pdf_repair.c
@@ -15,8 +15,8 @@ struct entry
int stm_len;
};
-static void
-pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id)
+static int
+pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, int *tmpofs)
{
pdf_token tok;
int stm_len;
@@ -28,6 +28,10 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf
stm_len = 0;
+ /* On entry to this function, we know that we've just seen
+ * '<int> <int> obj'. We expect the next thing we see to be a
+ * pdf object. Regardless of the type of thing we meet next
+ * we only need to fully parse it if it is a dictionary. */
tok = pdf_lex(file, buf);
if (tok == PDF_TOK_OPEN_DICT)
@@ -79,15 +83,13 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf
tok != PDF_TOK_EOF &&
tok != PDF_TOK_INT )
{
+ *tmpofs = fz_tell(file);
+ if (*tmpofs < 0)
+ fz_throw(ctx, "cannot tell in file");
tok = pdf_lex(file, buf);
}
- if (tok == PDF_TOK_INT)
- {
- while (buf->len-- > 0)
- fz_unread_byte(file);
- }
- else if (tok == PDF_TOK_STREAM)
+ if (tok == PDF_TOK_STREAM)
{
int c = fz_read_byte(file);
if (c == '\r') {
@@ -132,10 +134,22 @@ pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf
*stmlenp = fz_tell(file) - *stmofsp - 9;
atobjend:
+ *tmpofs = fz_tell(file);
+ if (*tmpofs < 0)
+ fz_throw(ctx, "cannot tell in file");
tok = pdf_lex(file, buf);
if (tok != PDF_TOK_ENDOBJ)
fz_warn(ctx, "object missing 'endobj' token");
+ else
+ {
+ /* Read another token as we always return the next one */
+ *tmpofs = fz_tell(file);
+ if (*tmpofs < 0)
+ fz_throw(ctx, "cannot tell in file");
+ tok = pdf_lex(file, buf);
+ }
}
+ return tok;
}
static void
@@ -288,6 +302,11 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf)
break;
}
+ /* If we have the next token already, then we'll jump
+ * back here, rather than going through the top of
+ * the loop. */
+ have_next_token:
+
if (tok == PDF_TOK_INT)
{
numofs = genofs;
@@ -300,7 +319,7 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf)
{
fz_try(ctx)
{
- pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id);
+ tok = pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id, &tmpofs);
}
fz_catch(ctx)
{
@@ -341,6 +360,8 @@ pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf)
if (num > maxnum)
maxnum = num;
+
+ goto have_next_token;
}
/* trailer dictionary */
diff --git a/pdf/pdf_xref.c b/pdf/pdf_xref.c
index 92f93352..332ddaf0 100644
--- a/pdf/pdf_xref.c
+++ b/pdf/pdf_xref.c
@@ -237,8 +237,9 @@ pdf_read_start_xref(pdf_document *xref)
while (iswhite(buf[i]) && i < n)
i ++;
xref->startxref = atoi((char*)(buf + i));
-
- return;
+ if (xref->startxref != 0)
+ return;
+ break;
}
}