Bug 696146: Improve pdf_repair to find /Root in new style XRefs.

The current code never looks for /Root entries in the dictionaries it parses during repair. This means that files using 'new style' XRefs (cross-reference streams) end up without any Roots after repair. The new code therefore updates pdf_repair_obj to look for Root objects in the same way that it looks for encrypt and id objects; any that are found go into the list of found roots. The Root object almost certainly has indirections within it, so it is vital that its 'doc' pointer gets set. This means we have to make a slight adjustment to pdf_repair_obj so that the dict is parsed with a doc pointer. In turn, this means we need to manually ensure that none of the other information read from the dict during the repair operation causes indirections to be resolved. This is achieved by checking !pdf_is_indirect at various points.
author: Robin Watts <robin.watts@artifex.com> 2015-09-30 19:10:45 +0100
committer: Robin Watts <robin.watts@artifex.com> 2015-10-01 11:11:10 +0100
commit: 841398e7693a511df66dd0f1b33d9604371d53e0 (patch)
tree: 9543c6591a5fd856da2b2c52d6c7ed970a697ca8
parent: e6a6cccd2295ad09fc569debda49ebcef8b2cd92 (diff)
download: mupdf-841398e7693a511df66dd0f1b33d9604371d53e0.tar.xz
3 files changed, 66 insertions, 29 deletions
diff --git a/include/mupdf/pdf/xref.h b/include/mupdf/pdf/xref.h
index 30b59222..c70dc67c 100644
--- a/include/mupdf/pdf/xref.h
+++ b/include/mupdf/pdf/xref.h
@@ -108,7 +108,7 @@ void pdf_mark_xref(fz_context *ctx, pdf_document *doc);
 void pdf_clear_xref(fz_context *ctx, pdf_document *doc);
 void pdf_clear_xref_to_mark(fz_context *ctx, pdf_document *doc);
 
-int pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs);
+int pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs, pdf_obj **root);
 
 pdf_obj *pdf_progressive_advance(fz_context *ctx, pdf_document *doc, int pagenum);
 
diff --git a/source/pdf/pdf-repair.c b/source/pdf/pdf-repair.c
index 6e62bf00..fdd46483 100644
--- a/source/pdf/pdf-repair.c
+++ b/source/pdf/pdf-repair.c
@@ -14,8 +14,21 @@ struct entry
 	int stm_len;
 };
 
+static void add_root(fz_context *ctx, pdf_obj *obj, pdf_obj ***roots, int *num_roots, int *max_roots)
+{
+	if (*num_roots == *max_roots)
+	{
+		int new_max_roots = *max_roots * 2;
+		if (new_max_roots == 0)
+			new_max_roots = 4;
+		*roots = fz_resize_array(ctx, *roots, new_max_roots, sizeof(**roots));
+		*max_roots = new_max_roots;
+	}
+	(*roots)[(*num_roots)++] = pdf_keep_obj(ctx, obj);
+}
+
 int
-pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs)
+pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs, pdf_obj **root)
 {
 	fz_stream *file = doc->file;
 	pdf_token tok;
@@ -37,10 +50,9 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st
 	{
 		pdf_obj *dict, *obj;
 
-		/* Send NULL xref so we don't try to resolve references */
 		fz_try(ctx)
 		{
-			dict = pdf_parse_dict(ctx, NULL, file, buf);
+			dict = pdf_parse_dict(ctx, doc, file, buf);
 		}
 		fz_catch(ctx)
 		{
@@ -52,24 +64,39 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st
 			dict = pdf_new_dict(ctx, NULL, 2);
 		}
 
-		if (encrypt && id)
+		/* We must be careful not to try to resolve any indirections
+		 * here. We have just read dict, so we know it to be a non
+		 * indirected dictionary. Before we look at any values that
+		 * we get back from looking up in it, we need to check they
+		 * aren't indirected. */
+
+		if (encrypt || id || root)
 		{
 			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
-			if (pdf_name_eq(ctx, obj, PDF_NAME_XRef))
+			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_XRef))
 			{
-				obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
-				if (obj)
+				if (encrypt)
 				{
-					pdf_drop_obj(ctx, *encrypt);
-					*encrypt = pdf_keep_obj(ctx, obj);
+					obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
+					if (obj)
+					{
+						pdf_drop_obj(ctx, *encrypt);
+						*encrypt = pdf_keep_obj(ctx, obj);
+					}
 				}
 
-				obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
-				if (obj)
+				if (id)
 				{
-					pdf_drop_obj(ctx, *id);
-					*id = pdf_keep_obj(ctx, obj);
+					obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
+					if (obj)
+					{
+						pdf_drop_obj(ctx, *id);
+						*id = pdf_keep_obj(ctx, obj);
+					}
 				}
+
+				if (root)
+					*root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Root));
 			}
 		}
 
@@ -80,7 +107,7 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st
 		if (doc->file_reading_linearly && page)
 		{
 			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
-			if (pdf_name_eq(ctx, obj, PDF_NAME_Page))
+			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_Page))
 			{
 				pdf_drop_obj(ctx, *page);
 				*page = pdf_keep_obj(ctx, dict);
@@ -343,11 +370,19 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc)
 
 			else if (tok == PDF_TOK_OBJ)
 			{
+				pdf_obj *root = NULL;
+
 				fz_try(ctx)
 				{
 					stm_len = 0;
 					stm_ofs = 0;
-					tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs);
+					tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root);
+					if (root)
+						add_root(ctx, root, &roots, &num_roots, &max_roots);
+				}
+				fz_always(ctx)
+				{
+					pdf_drop_obj(ctx, root);
 				}
 				fz_catch(ctx)
 				{
@@ -423,17 +458,7 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc)
 
 				obj = pdf_dict_get(ctx, dict, PDF_NAME_Root);
 				if (obj)
-				{
-					if (num_roots == max_roots)
-					{
-						int new_max_roots = max_roots * 2;
-						if (new_max_roots == 0)
-							new_max_roots = 4;
-						roots = fz_resize_array(ctx, roots, new_max_roots, sizeof(*roots));
-						max_roots = new_max_roots;
-					}
-					roots[num_roots++] = pdf_keep_obj(ctx, obj);
-				}
+					add_root(ctx, obj, &roots, &num_roots, &max_roots);
 
 				obj = pdf_dict_get(ctx, dict, PDF_NAME_Info);
 				if (obj)
@@ -471,6 +496,18 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc)
 		 * 0 to maxnum. */
 		pdf_ensure_solid_xref(ctx, doc, maxnum);
 
+		for (i = 1; i < maxnum; i++)
+		{
+			entry = pdf_get_populating_xref_entry(ctx, doc, i);
+			if (entry->obj != NULL)
+				continue;
+			entry->type = 'f';
+			entry->ofs = 0;
+			entry->gen = 0;
+
+			entry->stm_ofs = 0;
+		}
+
 		for (i = 0; i < listlen; i++)
 		{
 			entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c
index 3a9f70df..727dcda7 100644
--- a/source/pdf/pdf-xref.c
+++ b/source/pdf/pdf-xref.c
@@ -1807,7 +1807,7 @@ pdf_obj_read(fz_context *ctx, pdf_document *doc, fz_off_t *offset, int *nump, pd
 	 * whenever we read an object it should just go into the
 	 * previous xref.
 	 */
-	tok = pdf_repair_obj(ctx, doc, buf, &stmofs, NULL, NULL, NULL, page, &newtmpofs);
+	tok = pdf_repair_obj(ctx, doc, buf, &stmofs, NULL, NULL, NULL, page, &newtmpofs, NULL);
 
 	do /* So we can break out of it */
 	{
@@ -2600,7 +2600,7 @@ pdf_load_hint_object(fz_context *ctx, pdf_document *doc)
 			tok = pdf_lex(ctx, doc->file, buf);
 			if (tok != PDF_TOK_OBJ)
 				break;
-			(void)pdf_repair_obj(ctx, doc, buf, &tmpofs, NULL, NULL, NULL, &page, &tmpofs);
+			(void)pdf_repair_obj(ctx, doc, buf, &tmpofs, NULL, NULL, NULL, &page, &tmpofs, NULL);
 			pdf_load_hints(ctx, doc, num, gen);
 		}
 	}
author	Robin Watts <robin.watts@artifex.com>	2015-09-30 19:10:45 +0100
committer	Robin Watts <robin.watts@artifex.com>	2015-10-01 11:11:10 +0100
commit	841398e7693a511df66dd0f1b33d9604371d53e0 (patch)
tree	9543c6591a5fd856da2b2c52d6c7ed970a697ca8
parent	e6a6cccd2295ad09fc569debda49ebcef8b2cd92 (diff)
download	mupdf-841398e7693a511df66dd0f1b33d9604371d53e0.tar.xz