From 6b08b13fc4e95f9f0446a7199f1d9b5d9348d2f9 Mon Sep 17 00:00:00 2001
From: Robin Watts <robin.watts@artifex.com>
Date: Mon, 23 Dec 2013 11:54:49 +0000
Subject: Bug 694810: Implement late file repair for PDFs.

Currently, if we spot a bad xref as we are reading a PDF in, we can
repair that PDF by doing a long exhaustive read of the file. This
reconstructs the information that was in the xref, and the file can
be opened (and later saved) as normal.

If, however, we hit an object that is not in the expected place, we
cannot trigger a repair at that point - so xrefs containing bad offsets
(that still lie within the bounds of the file) will never be repaired.

This commit solves that by triggering a repair (just once) whenever
we fail to parse an object in the expected place.
---
 source/pdf/pdf-parse.c  | 14 +++++++++++++-
 source/pdf/pdf-repair.c |  5 ++++-
 source/pdf/pdf-xref.c   | 36 +++++++++++++++++++++++++++++-------
 3 files changed, 46 insertions(+), 9 deletions(-)

(limited to 'source')

diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c
index aaa45c0d..66e0fbe7 100644
--- a/source/pdf/pdf-parse.c
+++ b/source/pdf/pdf-parse.c
@@ -496,7 +496,7 @@ pdf_parse_stm_obj(pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
 pdf_obj *
 pdf_parse_ind_obj(pdf_document *doc,
 	fz_stream *file, pdf_lexbuf *buf,
-	int *onum, int *ogen, int *ostmofs)
+	int *onum, int *ogen, int *ostmofs, int *try_repair)
 {
 	pdf_obj *obj = NULL;
 	int num = 0, gen = 0, stm_ofs;
@@ -508,17 +508,29 @@ pdf_parse_ind_obj(pdf_document *doc,
 
 	tok = pdf_lex(file, buf);
 	if (tok != PDF_TOK_INT)
+	{
+		if (try_repair)
+			*try_repair = 1;
 		fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number");
+	}
 	num = buf->i;
 
 	tok = pdf_lex(file, buf);
 	if (tok != PDF_TOK_INT)
+	{
+		if (try_repair)
+			*try_repair = 1;
 		fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num);
+	}
 	gen = buf->i;
 
 	tok = pdf_lex(file, buf);
 	if (tok != PDF_TOK_OBJ)
+	{
+		if (try_repair)
+			*try_repair = 1;
 		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen);
+	}
 
 	tok = pdf_lex(file, buf);
 
diff --git a/source/pdf/pdf-repair.c b/source/pdf/pdf-repair.c
index c742714d..e9a60986 100644
--- a/source/pdf/pdf-repair.c
+++ b/source/pdf/pdf-repair.c
@@ -234,7 +234,6 @@ pdf_repair_obj_stm(pdf_document *doc, int num, int gen)
 	}
 }
 
-/* Entered with file locked, remains locked throughout. */
 void
 pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
 {
@@ -267,6 +266,10 @@ pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf)
 	fz_var(list);
 	fz_var(obj);
 
+	if (doc->repair_attempted)
+		fz_throw(doc->ctx, FZ_ERROR_GENERIC, "Repair failed already - not trying again");
+	doc->repair_attempted = 1;
+
 	doc->dirty = 1;
 	/* Can't support incremental update after repair */
 	doc->freeze_updates = 1;
diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c
index 338dec94..f3deeebf 100644
--- a/source/pdf/pdf-xref.c
+++ b/source/pdf/pdf-xref.c
@@ -562,7 +562,7 @@ pdf_read_new_xref(pdf_document *doc, pdf_lexbuf *buf)
 	{
 		pdf_xref_entry *entry;
 		int ofs = fz_tell(doc->file);
-		trailer = pdf_parse_ind_obj(doc, doc->file, buf, &num, &gen, &stm_ofs);
+		trailer = pdf_parse_ind_obj(doc, doc->file, buf, &num, &gen, &stm_ofs, NULL);
 		entry = pdf_get_populating_xref_entry(doc, num);
 		entry->ofs = ofs;
 		entry->gen = gen;
@@ -832,7 +832,7 @@ pdf_load_linear(pdf_document *doc)
 	{
 		pdf_xref_entry *entry;
 
-		dict = pdf_parse_ind_obj(doc, doc->file, &doc->lexbuf.base, &num, &gen, &stmofs);
+		dict = pdf_parse_ind_obj(doc, doc->file, &doc->lexbuf.base, &num, &gen, &stmofs, NULL);
 		if (!pdf_is_dict(dict))
 			fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary");
 		o = pdf_dict_gets(dict, "Linearized");
@@ -1611,13 +1611,18 @@ void
 pdf_cache_object(pdf_document *doc, int num, int gen)
 {
 	pdf_xref_entry *x;
-	int rnum, rgen;
+	int rnum, rgen, try_repair;
 	fz_context *ctx = doc->ctx;
 
+	fz_var(try_repair);
+
 	if (num < 0 || num >= pdf_xref_len(doc))
 		fz_throw(ctx, FZ_ERROR_GENERIC, "object out of range (%d %d R); xref size %d", num, gen, pdf_xref_len(doc));
 
 object_updated:
+	try_repair = 0;
+	rnum = num;
+
 	x = pdf_get_xref_entry(doc, num);
 
 	if (x->obj)
@@ -1634,18 +1639,35 @@ object_updated:
 		fz_try(ctx)
 		{
 			x->obj = pdf_parse_ind_obj(doc, doc->file, &doc->lexbuf.base,
-					&rnum, &rgen, &x->stm_ofs);
+					&rnum, &rgen, &x->stm_ofs, &try_repair);
 		}
 		fz_catch(ctx)
 		{
-			fz_rethrow_message(ctx, "cannot parse object (%d %d R)", num, gen);
+			if (!try_repair || fz_caught(ctx) == FZ_ERROR_TRYLATER)
+				fz_rethrow(ctx);
 		}
 
-		if (rnum != num)
+		if (!try_repair && rnum != num)
 		{
 			pdf_drop_obj(x->obj);
 			x->obj = NULL;
-			fz_rethrow_message(ctx, "found object (%d %d R) instead of (%d %d R)", rnum, rgen, num, gen);
+			try_repair = 1;
+		}
+
+		if (try_repair)
+		{
+			fz_try(ctx)
+			{
+				pdf_repair_xref(doc, &doc->lexbuf.base);
+			}
+			fz_catch(ctx)
+			{
+				if (rnum == num)
+					fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse object (%d %d R)", num, gen);
+				else
+					fz_throw(ctx, FZ_ERROR_GENERIC, "found object (%d %d R) instead of (%d %d R)", rnum, rgen, num, gen);
+			}
+			goto object_updated;
 		}
 
 		if (doc->crypt)
-- 
cgit v1.2.3