Strip outlines when cleaning a pdf file.

Arrange to keep only the Outline entries that still refer to valid pages. More specifically, leaf outline entries that refer to pages that have been elided are dropped. Non-leaf outline entries that refer to pages that have been elided are kept (for the sake of their children) but have NULL destinations.
author: Robin Watts <robin.watts@artifex.com> 2016-02-29 14:13:11 +0000
committer: Robin Watts <robin.watts@artifex.com> 2016-02-29 17:14:53 +0000
commit: dde049432d9f28d29aa4be6730e67ebc28415ef3 (patch)
tree: 7e9c29d40467c1ee6a3aa61eeed7d11bfa01c6f6
parent: d065aca45bce7ee3668f59ca13cda46b52bc9bd6 (diff)
download: mupdf-dde049432d9f28d29aa4be6730e67ebc28415ef3.tar.xz
2 files changed, 151 insertions, 30 deletions
diff --git a/resources/pdf/names.txt b/resources/pdf/names.txt
index a173b6bb..ff63d99d 100644
--- a/resources/pdf/names.txt
+++ b/resources/pdf/names.txt
@@ -198,6 +198,7 @@ LZ
 LZW
 LZWDecode
 Lab
+Last
 LastChar
 Launch
 Length
diff --git a/source/pdf/pdf-clean-file.c b/source/pdf/pdf-clean-file.c
index 29efbd5c..b766af89 100644
--- a/source/pdf/pdf-clean-file.c
+++ b/source/pdf/pdf-clean-file.c
@@ -36,47 +36,163 @@ static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_
 	pdf_array_push(ctx, kids, pageref);
 }
 
-static int dest_is_valid_page(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
+static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount) /* Returns 1 if obj's object number equals one of the first pagecount entries of page_object_nums; returns 0 if that number is 0 or no entry matches. */
 {
 	int i;
 	int num = pdf_to_num(ctx, obj);
-	int pagecount = pdf_count_pages(ctx, doc);
 
 	if (num == 0)
 		return 0;
 	for (i = 0; i < pagecount; i++)
 	{
-		pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
-
-		if (pdf_to_num(ctx, pageref) == num)
+		if (page_object_nums[i] == num) /* compare against the caller-supplied object numbers rather than looking each page up again */
 			return 1;
 	}
 	return 0;
 }
 
+static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list)
+{
+	pdf_obj *p;
+
+	p = pdf_dict_get(ctx, o, PDF_NAME_A);
+	if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo) &&
+		!string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list))
+		return 0;
+
+	p = pdf_dict_get(ctx, o, PDF_NAME_Dest);
+	if (p == NULL)
+	{}
+	else if (pdf_is_string(ctx, p))
+	{
+		return string_in_names_list(ctx, p, names_list);
+	}
+	else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count))
+		return 0;
+
+	return 1;
+}
+
+static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list);
+/* Walks the sibling chain that starts at 'outlines' (following Next). For each entry it first strips the entry's children via strip_outlines, then: unlinks the entry if its destination is invalid and no children remain; deletes Dest and A if its destination is invalid but children remain; otherwise keeps it. Stores the first and last entries that passed dest_is_valid in *pfirst and *plast (NULL if none) and returns how many entries passed. */
+static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast)
+{
+	pdf_obj *prev = NULL;
+	pdf_obj *first = NULL;
+	pdf_obj *current;
+	int count = 0;
+	/* No increment clause: every branch below advances 'current' itself. */
+	for (current = outlines; current != NULL; )
+	{
+		int nc;
+
+		/* Strip any children to start with. This takes care of
+		 * First/Last/Count for us. */
+		nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list);
+		/* nc is the number of children that strip_outlines reported as kept for 'current'. */
+		if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list))
+		{
+			if (nc == 0)
+			{
+				/* Outline with invalid dest and no children. Drop it by
+				 * pulling the next one in here. */
+				pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME_Next);
+				if (next == NULL)
+				{
+					/* There is no next one to pull in */
+					if (prev != NULL)
+						pdf_dict_del(ctx, prev, PDF_NAME_Next);
+				}
+				else if (prev != NULL)
+				{
+					pdf_dict_put(ctx, prev, PDF_NAME_Next, next);
+					pdf_dict_put(ctx, next, PDF_NAME_Prev, prev);
+				}
+				else
+				{
+					pdf_dict_del(ctx, next, PDF_NAME_Prev); /* no entry has been recorded in 'prev' yet, so 'next' loses its Prev link */
+				}
+				current = next;
+			}
+			else
+			{
+				/* Outline with invalid dest, but children. Just drop the dest. NOTE(review): this entry stays in the chain but is not counted and never becomes 'first' or 'prev', so the First/Last/Count written by the caller do not include it - confirm this is intended. */
+				pdf_dict_del(ctx, current, PDF_NAME_Dest);
+				pdf_dict_del(ctx, current, PDF_NAME_A);
+				current = pdf_dict_get(ctx, current, PDF_NAME_Next);
+			}
+		}
+		else
+		{
+			/* Keep this one */
+			if (first == NULL)
+				first = current;
+			prev = current;
+			current = pdf_dict_get(ctx, current, PDF_NAME_Next);
+			count++;
+		}
+	}
+	/* 'prev' now holds the last entry that passed dest_is_valid, or NULL if none did. */
+	*pfirst = first;
+	*plast = prev;
+
+	return count;
+}
+/* Strips the child chain hanging off 'outlines' (its First entry) with strip_outline, then rewrites First/Last/Count on 'outlines' to match the result: all three are deleted when no child is counted as kept, otherwise they are replaced. Returns the number of children counted as kept (0 when there was no First entry). */
+static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list)
+{
+	int nc;
+	pdf_obj *first;
+	pdf_obj *last;
+	/* 'first' is the head of the child chain on entry; strip_outline overwrites it (and sets 'last') with the surviving ends. */
+	first = pdf_dict_get(ctx, outlines, PDF_NAME_First);
+	if (first == NULL)
+		nc = 0;
+	else
+		nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last);
+	/* 'last' is only read in the nc != 0 branch, which is only reachable after strip_outline has set it. */
+	if (nc == 0)
+	{
+		pdf_dict_del(ctx, outlines, PDF_NAME_First);
+		pdf_dict_del(ctx, outlines, PDF_NAME_Last);
+		pdf_dict_del(ctx, outlines, PDF_NAME_Count);
+	}
+	else
+	{
+		int old_count = pdf_to_int(ctx, pdf_dict_get(ctx, outlines, PDF_NAME_Count));
+		pdf_dict_put(ctx, outlines, PDF_NAME_First, first);
+		pdf_dict_put(ctx, outlines, PDF_NAME_Last, last);
+		pdf_dict_put(ctx, outlines, PDF_NAME_Count, pdf_new_int(ctx, doc, old_count > 0 ? nc : -nc)); /* a positive old Count stays positive, anything else is written as negative; NOTE(review): presumably preserves the open/closed state that the sign of Count encodes in PDF outlines - confirm against the PDF spec */
+	}
+
+	return nc;
+}
+
 static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
 {
 	pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests;
 	pdf_document *doc = glo->doc;
 	int argidx = 0;
 	pdf_obj *names_list = NULL;
+	pdf_obj *outlines;
 	int pagecount;
 	int i;
+	int *page_object_nums;
 
 	/* Keep only pages/type and (reduced) dest entries to avoid
 	 * references to unretained pages */
 	oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root);
 	pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages);
 	olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests);
+	outlines = pdf_dict_get(ctx, oldroot, PDF_NAME_Outlines);
 
-	root = pdf_new_dict(ctx, doc, 2);
+	root = pdf_new_dict(ctx, doc, 3);
 	pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type));
 	pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages));
+	pdf_dict_put(ctx, root, PDF_NAME_Outlines, outlines);
 
 	pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root);
 
-	pdf_drop_obj(ctx, root);
-
 	/* Create a new kids array with only the pages we want to keep */
 	parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages));
 	kids = pdf_new_array(ctx, doc, 1);
@@ -132,7 +248,21 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
 	pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids);
 	pdf_drop_obj(ctx, kids);
 
-	/* Also preserve the (partial) Dests name tree */
+	/* Force the next call to pdf_count_pages to recount */
+	glo->doc->page_count = 0;
+
+	pagecount = pdf_count_pages(ctx, doc);
+	page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums));
+	for (i = 0; i < pagecount; i++)
+	{
+		pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
+		page_object_nums[i] = pdf_to_num(ctx, pageref);
+	}
+
+	/* If we had an old Dests tree (now reformed as an olddests
+	 * dictionary), keep any entries in there that point to
+	 * valid pages. This may mean we keep more than we need, but
+	 * it's safe at least. */
 	if (olddests)
 	{
 		pdf_obj *names = pdf_new_dict(ctx, doc, 1);
@@ -148,7 +278,7 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
 			pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D);
 
 			dest = pdf_array_get(ctx, dest ? dest : val, 0);
-			if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest))
+			if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount))
 			{
 				pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key)));
 				pdf_array_push(ctx, names_list, key_str);
@@ -157,23 +287,17 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
 			}
 		}
 
-		root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root);
 		pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list);
 		pdf_dict_put(ctx, names, PDF_NAME_Dests, dests);
 		pdf_dict_put(ctx, root, PDF_NAME_Names, names);
 
 		pdf_drop_obj(ctx, names);
 		pdf_drop_obj(ctx, dests);
-		pdf_drop_obj(ctx, names_list);
 		pdf_drop_obj(ctx, olddests);
 	}
 
-	/* Force the next call to pdf_count_pages to recount */
-	glo->doc->page_count = 0;
-
 	/* Edit each pages /Annot list to remove any links that point to
 	 * nowhere. */
-	pagecount = pdf_count_pages(ctx, doc);
 	for (i = 0; i < pagecount; i++)
 	{
 		pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
@@ -187,24 +311,11 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
 		for (j = 0; j < len; j++)
 		{
 			pdf_obj *o = pdf_array_get(ctx, annots, j);
-			pdf_obj *p;
-			int remove = 0;
 
 			if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link))
 				continue;
 
-			p = pdf_dict_get(ctx, o, PDF_NAME_A);
-			if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo) &&
-				!string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list))
-				remove = 1;
-
-			p = pdf_dict_get(ctx, 0, PDF_NAME_Dest);
-			if (!dest_is_valid_page(ctx, doc, pdf_array_get(ctx, p, 0)))
-				remove = 1;
-
-			/* FIXME: Should probably look at Next too */
-
-			if (remove)
+			if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list))
 			{
 				/* Remove this annotation */
 				pdf_array_delete(ctx, annots, j);
@@ -212,6 +323,15 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
 			}
 		}
 	}
+
+	if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list) == 0)
+	{
+		pdf_dict_del(ctx, root, PDF_NAME_Outlines);
+	}
+
+	fz_free(ctx, page_object_nums);
+	pdf_drop_obj(ctx, names_list);
+	pdf_drop_obj(ctx, root);
 }
 
 void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_write_options *opts, char *argv[], int argc)
author	Robin Watts <robin.watts@artifex.com>	2016-02-29 14:13:11 +0000
committer	Robin Watts <robin.watts@artifex.com>	2016-02-29 17:14:53 +0000
commit	dde049432d9f28d29aa4be6730e67ebc28415ef3 (patch)
tree	7e9c29d40467c1ee6a3aa61eeed7d11bfa01c6f6
parent	d065aca45bce7ee3668f59ca13cda46b52bc9bd6 (diff)
download	mupdf-dde049432d9f28d29aa4be6730e67ebc28415ef3.tar.xz