From 841398e7693a511df66dd0f1b33d9604371d53e0 Mon Sep 17 00:00:00 2001
From: Robin Watts <robin.watts@artifex.com>
Date: Wed, 30 Sep 2015 19:10:45 +0100
Subject: Bug 696146: Improve pdf_repair to find /Root in new style XRefs.

The current code never looks for /Root entries in the dictionaries
it parses during repair. This means that 'new style' files (those
whose trailer information lives in a /Type /XRef stream dictionary)
end up without any Roots after repair.

The new code therefore updates pdf_repair_obj to look for Root
objects in the same way it looks for encrypt and id objects.
These go into the list of found roots.

The Root object almost certainly has indirections within it, so
it is vital that the 'doc' pointer gets set. This means we have
to make a slight adjustment to pdf_repair_obj so that the dict
is parsed with a doc pointer. In turn this means we need to
manually ensure that none of the other information read from
the dict during the repair operation will cause indirections
to be resolved. This is achieved by checking for
!pdf_is_indirect at various points.
---
 source/pdf/pdf-repair.c | 89 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 63 insertions(+), 26 deletions(-)

(limited to 'source/pdf/pdf-repair.c')

diff --git a/source/pdf/pdf-repair.c b/source/pdf/pdf-repair.c
index 6e62bf00..fdd46483 100644
--- a/source/pdf/pdf-repair.c
+++ b/source/pdf/pdf-repair.c
@@ -14,8 +14,21 @@ struct entry
 	int stm_len;
 };
 
+static void add_root(fz_context *ctx, pdf_obj *obj, pdf_obj ***roots, int *num_roots, int *max_roots)
+{ /* Append obj to the growable array *roots; *num_roots is the used count and *max_roots the capacity, both updated in place. */
+	if (*num_roots == *max_roots) /* no spare slot left: grow before appending */
+	{
+		int new_max_roots = *max_roots * 2; /* double the current capacity... */
+		if (new_max_roots == 0)
+			new_max_roots = 4; /* ...or start with 4 slots when the capacity was 0 */
+		*roots = fz_resize_array(ctx, *roots, new_max_roots, sizeof(**roots)); /* NOTE(review): presumably throws via ctx on allocation failure - confirm */
+		*max_roots = new_max_roots; /* capacity is recorded only after the resize call has returned */
+	}
+	(*roots)[(*num_roots)++] = pdf_keep_obj(ctx, obj); /* stores the pdf_keep_obj result; presumably the array takes its own reference and the caller still owns obj - confirm */
+}
+
 int
-pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs)
+pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs, pdf_obj **root)
 {
 	fz_stream *file = doc->file;
 	pdf_token tok;
@@ -37,10 +50,9 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st
 	{
 		pdf_obj *dict, *obj;
 
-		/* Send NULL xref so we don't try to resolve references */
 		fz_try(ctx)
 		{
-			dict = pdf_parse_dict(ctx, NULL, file, buf);
+			dict = pdf_parse_dict(ctx, doc, file, buf);
 		}
 		fz_catch(ctx)
 		{
@@ -52,24 +64,39 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st
 			dict = pdf_new_dict(ctx, NULL, 2);
 		}
 
-		if (encrypt && id)
+		/* We must be careful not to try to resolve any indirections
+		 * here. We have just read dict, so we know it to be a non
+		 * indirected dictionary. Before we look at any values that
+		 * we get back from looking up in it, we need to check they
+		 * aren't indirected. */
+
+		if (encrypt || id || root)
 		{
 			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
-			if (pdf_name_eq(ctx, obj, PDF_NAME_XRef))
+			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_XRef))
 			{
-				obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
-				if (obj)
+				if (encrypt)
 				{
-					pdf_drop_obj(ctx, *encrypt);
-					*encrypt = pdf_keep_obj(ctx, obj);
+					obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
+					if (obj)
+					{
+						pdf_drop_obj(ctx, *encrypt);
+						*encrypt = pdf_keep_obj(ctx, obj);
+					}
 				}
 
-				obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
-				if (obj)
+				if (id)
 				{
-					pdf_drop_obj(ctx, *id);
-					*id = pdf_keep_obj(ctx, obj);
+					obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
+					if (obj)
+					{
+						pdf_drop_obj(ctx, *id);
+						*id = pdf_keep_obj(ctx, obj);
+					}
 				}
+
+				if (root)
+					*root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Root));
 			}
 		}
 
@@ -80,7 +107,7 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st
 		if (doc->file_reading_linearly && page)
 		{
 			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
-			if (pdf_name_eq(ctx, obj, PDF_NAME_Page))
+			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_Page))
 			{
 				pdf_drop_obj(ctx, *page);
 				*page = pdf_keep_obj(ctx, dict);
@@ -343,11 +370,19 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc)
 
 			else if (tok == PDF_TOK_OBJ)
 			{
+				pdf_obj *root = NULL;
+
 				fz_try(ctx)
 				{
 					stm_len = 0;
 					stm_ofs = 0;
-					tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs);
+					tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root);
+					if (root)
+						add_root(ctx, root, &roots, &num_roots, &max_roots);
+				}
+				fz_always(ctx)
+				{
+					pdf_drop_obj(ctx, root);
 				}
 				fz_catch(ctx)
 				{
@@ -423,17 +458,7 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc)
 
 				obj = pdf_dict_get(ctx, dict, PDF_NAME_Root);
 				if (obj)
-				{
-					if (num_roots == max_roots)
-					{
-						int new_max_roots = max_roots * 2;
-						if (new_max_roots == 0)
-							new_max_roots = 4;
-						roots = fz_resize_array(ctx, roots, new_max_roots, sizeof(*roots));
-						max_roots = new_max_roots;
-					}
-					roots[num_roots++] = pdf_keep_obj(ctx, obj);
-				}
+					add_root(ctx, obj, &roots, &num_roots, &max_roots);
 
 				obj = pdf_dict_get(ctx, dict, PDF_NAME_Info);
 				if (obj)
@@ -471,6 +496,18 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc)
 		 * 0 to maxnum. */
 		pdf_ensure_solid_xref(ctx, doc, maxnum);
 
+		for (i = 1; i < maxnum; i++)
+		{
+			entry = pdf_get_populating_xref_entry(ctx, doc, i);
+			if (entry->obj != NULL)
+				continue;
+			entry->type = 'f';
+			entry->ofs = 0;
+			entry->gen = 0;
+
+			entry->stm_ofs = 0;
+		}
+
 		for (i = 0; i < listlen; i++)
 		{
 			entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
-- 
cgit v1.2.3