Move the guts of pdfclean into the lib.

Michael needs to be able to call pdfclean from gsview. At the moment he's having to do this by including the pdfclean.c file in the lib build, and then calling pdfclean_main with a faked-up command line. This isn't nice. pdfclean.c is implemented by pdfclean_main parsing the options/filenames out of argv and then passing the filenames/options on to a pdfclean_clean function. This seems like a much nicer API to offer to the world. We therefore pull the guts of pdfclean.c (pdfclean_clean and its subsidiary structures/functions) into pdf-clean-file.c and include this in the library build. This leaves pdfclean.c containing only the command-line parsing. This should not affect the size of any of the resulting binaries.
author: Robin Watts <robin.watts@artifex.com> 2015-04-02 17:04:08 +0100
committer: Robin Watts <robin.watts@artifex.com> 2015-04-06 19:30:55 +0100
commit: b4d256b9e9d2e9f3b5f3ca944f591ae6bb0c5d71 (patch)
tree: 8d82fc82c7586634b14518704def1e5011249ca9
parent: 95d746f13a86d914dd88310c994b27b08db4bb5b (diff)
download: mupdf-b4d256b9e9d2e9f3b5f3ca944f591ae6bb0c5d71.tar.xz
5 files changed, 239 insertions, 221 deletions
diff --git a/include/mupdf/pdf.h b/include/mupdf/pdf.h
index a6f812f3..4b1459b4 100644
--- a/include/mupdf/pdf.h
+++ b/include/mupdf/pdf.h
@@ -25,4 +25,6 @@
 
 #include "mupdf/pdf/output-pdf.h"
 
+#include "mupdf/pdf/clean.h"
+
 #endif
diff --git a/include/mupdf/pdf/clean.h b/include/mupdf/pdf/clean.h
new file mode 100644
index 00000000..11ebf0c7
--- /dev/null
+++ b/include/mupdf/pdf/clean.h
@@ -0,0 +1,7 @@
+#ifndef MUPDF_PDF_CLEAN_H
+#define MUPDF_PDF_CLEAN_H
+
+/* Read infile, and write selected pages to outfile with the given options. retainlist holds retainlen page-list strings (comma-separated pages or dash ranges, e.g. "1-3,5"); when retainlen is 0 no page filtering is applied. Failures are caught inside the function and are not propagated: they increment *opts->errors when opts and opts->errors are set. */
+void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *retainlist[], int retainlen);
+
+#endif
diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj
index c3dc0aa3..d7a55349 100644
--- a/platform/win32/libmupdf.vcproj
+++ b/platform/win32/libmupdf.vcproj
@@ -1066,6 +1066,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\source\pdf\pdf-clean-file.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\source\pdf\pdf-clean.c"
 				>
 			</File>
@@ -1293,6 +1297,10 @@
 					>
 				</File>
 				<File
+					RelativePath="..\..\include\mupdf\pdf\clean.h"
+					>
+				</File>
+				<File
 					RelativePath="..\..\include\mupdf\pdf\cmap.h"
 					>
 				</File>
diff --git a/source/pdf/pdf-clean-file.c b/source/pdf/pdf-clean-file.c
new file mode 100644
index 00000000..d224dd9a
--- /dev/null
+++ b/source/pdf/pdf-clean-file.c
@@ -0,0 +1,221 @@
+#include "mupdf/pdf.h"
+/* State that pdf_clean_file fills in and passes on to retainpages. */
+typedef struct globals_s
+{
+	pdf_document *doc; /* document being cleaned; opened and closed by pdf_clean_file */
+	fz_context *ctx; /* context given to pdf_clean_file; used there when throwing the authentication error */
+} globals;
+/* Return 1 if the string buffer of p equals the string at any even index of names_list, otherwise 0. */
+static int
+string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list)
+{
+	int n = pdf_array_len(ctx, names_list); /* NOTE(review): retainpages passes NULL here when there is no Dests name tree; presumably pdf_array_len yields 0 for NULL - confirm */
+	int i;
+	char *str = pdf_to_str_buf(ctx, p);
+
+	for (i = 0; i < n ; i += 2) /* step by 2: retainpages pushes (name string, value) pairs, so names sit at even indices */
+	{
+		if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Recreate page tree to only retain specified pages.
+ */
+/* Append page number `page` to kids and set that page's Parent entry to parent. */
+static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page)
+{
+	pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page-1); /* the lookup is given page-1, i.e. `page` is 1-based here */
+	pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref);
+
+	pdf_dict_put(ctx, pageobj, PDF_NAME_Parent, parent);
+
+	/* Store the page reference (pageref, not the resolved object) in the new kids array */
+	pdf_array_push(ctx, kids, pageref);
+}
+/* Rewrite glo->doc so that only the pages named by the argc page-list strings in argv remain: replaces the catalog with a Type/Pages-only dict, rebuilds Kids/Count, rebuilds the Dests name list, and drops GoTo link annotations whose D string is not a retained name. */
+static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
+{
+	pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests;
+	pdf_document *doc = glo->doc;
+	int argidx = 0;
+	pdf_obj *names_list = NULL; /* stays NULL unless a Dests name tree was loaded below */
+	int pagecount;
+	int i;
+
+	/* Keep only pages/type and (reduced) dest entries to avoid
+	 * references to unretained pages */
+	oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root);
+	pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages);
+	olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests); /* loaded before the catalog is replaced below */
+
+	root = pdf_new_dict(ctx, doc, 2);
+	pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type));
+	pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages));
+
+	pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); /* the new dict takes over the old root's object number */
+
+	pdf_drop_obj(ctx, root);
+
+	/* Create a new kids array with only the pages we want to keep */
+	parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages)); /* reference to the Pages object; becomes each retained page's Parent */
+	kids = pdf_new_array(ctx, doc, 1);
+
+	/* Retain pages specified */
+	while (argc - argidx) /* one iteration per page-list argument */
+	{
+		int page, spage, epage;
+		char *spec, *dash;
+		char *pagelist = argv[argidx];
+
+		pagecount = pdf_count_pages(ctx, doc);
+		spec = fz_strsep(&pagelist, ","); /* NOTE(review): presumably splits in place like strsep, modifying the caller's string - confirm */
+		while (spec)
+		{
+			dash = strchr(spec, '-');
+
+			if (dash == spec) /* spec begins with '-': start from the last page */
+				spage = epage = pagecount;
+			else
+				spage = epage = atoi(spec);
+
+			if (dash)
+			{
+				if (strlen(dash) > 1) /* text follows the dash: explicit end page */
+					epage = atoi(dash + 1);
+				else
+					epage = pagecount; /* trailing dash: run to the last page */
+			}
+
+			spage = fz_clampi(spage, 1, pagecount); /* both ends are clamped into 1..pagecount */
+			epage = fz_clampi(epage, 1, pagecount);
+
+			if (spage < epage)
+				for (page = spage; page <= epage; ++page)
+					retainpage(ctx, doc, parent, kids, page);
+			else /* spage >= epage: count downwards, so a spec like "5-2" retains pages in reverse order; a single page takes this branch too */
+				for (page = spage; page >= epage; --page)
+					retainpage(ctx, doc, parent, kids, page);
+
+			spec = fz_strsep(&pagelist, ",");
+		}
+
+		argidx++;
+	}
+
+	pdf_drop_obj(ctx, parent);
+
+	/* Update page count and kids array */
+	countobj = pdf_new_int(ctx, doc, pdf_array_len(ctx, kids));
+	pdf_dict_put(ctx, pages, PDF_NAME_Count, countobj);
+	pdf_drop_obj(ctx, countobj);
+	pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids);
+	pdf_drop_obj(ctx, kids);
+
+	/* Also preserve the (partial) Dests name tree */
+	if (olddests)
+	{
+		pdf_obj *names = pdf_new_dict(ctx, doc, 1);
+		pdf_obj *dests = pdf_new_dict(ctx, doc, 1);
+		int len = pdf_dict_len(ctx, olddests);
+
+		names_list = pdf_new_array(ctx, doc, 32);
+
+		for (i = 0; i < len; i++)
+		{
+			pdf_obj *key = pdf_dict_get_key(ctx, olddests, i);
+			pdf_obj *val = pdf_dict_get_val(ctx, olddests, i);
+			pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D);
+
+			dest = pdf_array_get(ctx, dest ? dest : val, 0); /* use val's D entry if present, else val itself; element 0 is what gets matched against the kids */
+			if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest))
+			{
+				pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); /* the name key is stored as a string object */
+				pdf_array_push(ctx, names_list, key_str);
+				pdf_array_push(ctx, names_list, val);
+				pdf_drop_obj(ctx, key_str);
+			}
+		}
+
+		root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); /* re-fetched after the pdf_update_object call above */
+		pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list);
+		pdf_dict_put(ctx, names, PDF_NAME_Dests, dests);
+		pdf_dict_put(ctx, root, PDF_NAME_Names, names);
+
+		pdf_drop_obj(ctx, names);
+		pdf_drop_obj(ctx, dests);
+		pdf_drop_obj(ctx, names_list); /* NOTE(review): names_list is still read in the annotation loop below; presumably the dicts above hold a reference that keeps it alive - confirm */
+		pdf_drop_obj(ctx, olddests);
+	}
+
+	/* Force the next call to pdf_count_pages to recount */
+	glo->doc->page_count = 0;
+
+	/* Edit each page's /Annots list to remove any links that point to
+	 * nowhere. */
+	pagecount = pdf_count_pages(ctx, doc);
+	for (i = 0; i < pagecount; i++)
+	{
+		pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
+		pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref);
+
+		pdf_obj *annots = pdf_dict_get(ctx, pageobj, PDF_NAME_Annots);
+
+		int len = pdf_array_len(ctx, annots);
+		int j;
+
+		for (j = 0; j < len; j++)
+		{
+			pdf_obj *o = pdf_array_get(ctx, annots, j);
+			pdf_obj *p;
+
+			if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link))
+				continue;
+
+			p = pdf_dict_get(ctx, o, PDF_NAME_A);
+			if (!pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo))
+				continue;
+/* Keep the link only when its D string is one of the retained names. NOTE(review): a GoTo link whose D is not found in names_list (including when names_list is NULL) falls through to deletion - confirm this is intended for non-name destinations. */
+			if (string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list))
+				continue;
+
+			/* FIXME: Should probably look at Next too */
+
+			/* Remove this annotation */
+			pdf_array_delete(ctx, annots, j); /* NOTE(review): len is not decremented, so later iterations index past the shortened array; presumably pdf_array_get returns NULL there and the entry is skipped - confirm */
+			j--; /* stay on index j for the next iteration */
+		}
+	}
+}
+/* Open infile (authenticating with password if the document needs one), keep only the pages listed in argv when argc is non-zero, and write the result to outfile with opts. Errors are caught here and counted in *opts->errors (when set); they are not rethrown. */
+void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *argv[], int argc)
+{
+	globals glo = { 0 }; /* glo.doc starts as 0 so the cleanup below sees a defined value if the open throws */
+
+	glo.ctx = ctx;
+
+	fz_try(ctx)
+	{
+		glo.doc = pdf_open_document_no_run(ctx, infile);
+		if (pdf_needs_password(ctx, glo.doc))
+			if (!pdf_authenticate_password(ctx, glo.doc, password))
+				fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);
+
+		/* Only retain the specified subset of the pages */
+		if (argc)
+			retainpages(ctx, &glo, argc, argv);
+
+		pdf_write_document(ctx, glo.doc, outfile, opts);
+	}
+	fz_always(ctx)
+	{
+		pdf_close_document(ctx, glo.doc); /* NOTE(review): glo.doc is still 0 if the open failed; presumably pdf_close_document tolerates NULL - confirm */
+	}
+	fz_catch(ctx)
+	{
+		if (opts && opts->errors) /* the error is swallowed; callers learn of failure only through this counter */
+			*opts->errors = *opts->errors+1;
+	}
+}
diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c
index 051ec4bd..9d38d552 100644
--- a/source/tools/pdfclean.c
+++ b/source/tools/pdfclean.c
@@ -11,12 +11,6 @@
 
 #include "mupdf/pdf.h"
 
-typedef struct globals_s
-{
-	pdf_document *doc;
-	fz_context *ctx;
-} globals;
-
 static void usage(void)
 {
 	fprintf(stderr,
@@ -36,220 +30,6 @@ static void usage(void)
 	exit(1);
 }
 
-static int
-string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list)
-{
-	int n = pdf_array_len(ctx, names_list);
-	int i;
-	char *str = pdf_to_str_buf(ctx, p);
-
-	for (i = 0; i < n ; i += 2)
-	{
-		if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str))
-			return 1;
-	}
-	return 0;
-}
-
-/*
- * Recreate page tree to only retain specified pages.
- */
-
-static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page)
-{
-	pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page-1);
-	pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref);
-
-	pdf_dict_put(ctx, pageobj, PDF_NAME_Parent, parent);
-
-	/* Store page object in new kids array */
-	pdf_array_push(ctx, kids, pageref);
-}
-
-static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
-{
-	pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests;
-	pdf_document *doc = glo->doc;
-	int argidx = 0;
-	pdf_obj *names_list = NULL;
-	int pagecount;
-	int i;
-
-	/* Keep only pages/type and (reduced) dest entries to avoid
-	 * references to unretained pages */
-	oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root);
-	pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages);
-	olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests);
-
-	root = pdf_new_dict(ctx, doc, 2);
-	pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type));
-	pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages));
-
-	pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root);
-
-	pdf_drop_obj(ctx, root);
-
-	/* Create a new kids array with only the pages we want to keep */
-	parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages));
-	kids = pdf_new_array(ctx, doc, 1);
-
-	/* Retain pages specified */
-	while (argc - argidx)
-	{
-		int page, spage, epage;
-		char *spec, *dash;
-		char *pagelist = argv[argidx];
-
-		pagecount = pdf_count_pages(ctx, doc);
-		spec = fz_strsep(&pagelist, ",");
-		while (spec)
-		{
-			dash = strchr(spec, '-');
-
-			if (dash == spec)
-				spage = epage = pagecount;
-			else
-				spage = epage = atoi(spec);
-
-			if (dash)
-			{
-				if (strlen(dash) > 1)
-					epage = atoi(dash + 1);
-				else
-					epage = pagecount;
-			}
-
-			spage = fz_clampi(spage, 1, pagecount);
-			epage = fz_clampi(epage, 1, pagecount);
-
-			if (spage < epage)
-				for (page = spage; page <= epage; ++page)
-					retainpage(ctx, doc, parent, kids, page);
-			else
-				for (page = spage; page >= epage; --page)
-					retainpage(ctx, doc, parent, kids, page);
-
-			spec = fz_strsep(&pagelist, ",");
-		}
-
-		argidx++;
-	}
-
-	pdf_drop_obj(ctx, parent);
-
-	/* Update page count and kids array */
-	countobj = pdf_new_int(ctx, doc, pdf_array_len(ctx, kids));
-	pdf_dict_put(ctx, pages, PDF_NAME_Count, countobj);
-	pdf_drop_obj(ctx, countobj);
-	pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids);
-	pdf_drop_obj(ctx, kids);
-
-	/* Also preserve the (partial) Dests name tree */
-	if (olddests)
-	{
-		pdf_obj *names = pdf_new_dict(ctx, doc, 1);
-		pdf_obj *dests = pdf_new_dict(ctx, doc, 1);
-		int len = pdf_dict_len(ctx, olddests);
-
-		names_list = pdf_new_array(ctx, doc, 32);
-
-		for (i = 0; i < len; i++)
-		{
-			pdf_obj *key = pdf_dict_get_key(ctx, olddests, i);
-			pdf_obj *val = pdf_dict_get_val(ctx, olddests, i);
-			pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D);
-
-			dest = pdf_array_get(ctx, dest ? dest : val, 0);
-			if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest))
-			{
-				pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key)));
-				pdf_array_push(ctx, names_list, key_str);
-				pdf_array_push(ctx, names_list, val);
-				pdf_drop_obj(ctx, key_str);
-			}
-		}
-
-		root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root);
-		pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list);
-		pdf_dict_put(ctx, names, PDF_NAME_Dests, dests);
-		pdf_dict_put(ctx, root, PDF_NAME_Names, names);
-
-		pdf_drop_obj(ctx, names);
-		pdf_drop_obj(ctx, dests);
-		pdf_drop_obj(ctx, names_list);
-		pdf_drop_obj(ctx, olddests);
-	}
-
-	/* Force the next call to pdf_count_pages to recount */
-	glo->doc->page_count = 0;
-
-	/* Edit each pages /Annot list to remove any links that point to
-	 * nowhere. */
-	pagecount = pdf_count_pages(ctx, doc);
-	for (i = 0; i < pagecount; i++)
-	{
-		pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
-		pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref);
-
-		pdf_obj *annots = pdf_dict_get(ctx, pageobj, PDF_NAME_Annots);
-
-		int len = pdf_array_len(ctx, annots);
-		int j;
-
-		for (j = 0; j < len; j++)
-		{
-			pdf_obj *o = pdf_array_get(ctx, annots, j);
-			pdf_obj *p;
-
-			if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link))
-				continue;
-
-			p = pdf_dict_get(ctx, o, PDF_NAME_A);
-			if (!pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo))
-				continue;
-
-			if (string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list))
-				continue;
-
-			/* FIXME: Should probably look at Next too */
-
-			/* Remove this annotation */
-			pdf_array_delete(ctx, annots, j);
-			j--;
-		}
-	}
-}
-
-void pdfclean_clean(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *argv[], int argc)
-{
-	globals glo = { 0 };
-
-	glo.ctx = ctx;
-
-	fz_try(ctx)
-	{
-		glo.doc = pdf_open_document_no_run(ctx, infile);
-		if (pdf_needs_password(ctx, glo.doc))
-			if (!pdf_authenticate_password(ctx, glo.doc, password))
-				fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);
-
-		/* Only retain the specified subset of the pages */
-		if (argc)
-			retainpages(ctx, &glo, argc, argv);
-
-		pdf_write_document(ctx, glo.doc, outfile, opts);
-	}
-	fz_always(ctx)
-	{
-		pdf_close_document(ctx, glo.doc);
-	}
-	fz_catch(ctx)
-	{
-		if (opts && opts->errors)
-			*opts->errors = *opts->errors+1;
-	}
-}
-
 int pdfclean_main(int argc, char **argv)
 {
 	char *infile;
@@ -305,7 +85,7 @@ int pdfclean_main(int argc, char **argv)
 
 	fz_try(ctx)
 	{
-		pdfclean_clean(ctx, infile, outfile, password, &opts, &argv[fz_optind], argc - fz_optind);
+		pdf_clean_file(ctx, infile, outfile, password, &opts, &argv[fz_optind], argc - fz_optind);
 	}
 	fz_catch(ctx)
 	{
author	Robin Watts <robin.watts@artifex.com>	2015-04-02 17:04:08 +0100
committer	Robin Watts <robin.watts@artifex.com>	2015-04-06 19:30:55 +0100
commit	b4d256b9e9d2e9f3b5f3ca944f591ae6bb0c5d71 (patch)
tree	8d82fc82c7586634b14518704def1e5011249ca9
parent	95d746f13a86d914dd88310c994b27b08db4bb5b (diff)
download	mupdf-b4d256b9e9d2e9f3b5f3ca944f591ae6bb0c5d71.tar.xz