1 files changed, 233 insertions, 0 deletions
diff --git a/apps/pdfclean.c b/apps/pdfclean.c
new file mode 100644
index 00000000..409d4fbb
--- /dev/null
+++ b/apps/pdfclean.c
@@ -0,0 +1,233 @@
+/*
+ * PDF cleaning tool: general purpose pdf syntax washer.
+ *
+ * Rewrite PDF with pretty printed objects.
+ * Garbage collect unreachable objects.
+ * Inflate compressed streams.
+ * Create subset documents.
+ *
+ * TODO: linearize document for fast web view
+ */
+
+#include "fitz.h"
+#include "mupdf-internal.h"
+
+static pdf_document *xref = NULL;
+static fz_context *ctx = NULL;
+
+/* Print the command line synopsis to stderr and terminate with status 1. */
+static void usage(void)
+{
+	fputs("usage: mutool clean [options] input.pdf [output.pdf] [pages]\n"
+		"\t-p -\tpassword\n"
+		"\t-g\tgarbage collect unused objects\n"
+		"\t-gg\tin addition to -g compact xref table\n"
+		"\t-ggg\tin addition to -gg merge duplicate objects\n"
+		"\t-d\tdecompress all streams\n"
+		"\t-l\tlinearize PDF\n"
+		"\t-i\ttoggle decompression of image streams\n"
+		"\t-f\ttoggle decompression of font streams\n"
+		"\t-a\tascii hex encode binary streams\n"
+		"\tpages\tcomma separated list of ranges\n", stderr);
+	exit(1);
+}
+
+/*
+ * Recreate page tree to only retain specified pages.
+ *
+ * The remaining arguments are comma separated page numbers and ranges; a
+ * number missing next to the dash means the last page. Named destinations
+ * are kept only when they point at a retained page.
+ */
+
+static void retainpages(int argc, char **argv)
+{
+	pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests;
+
+	/* Keep only pages/type and (reduced) dest entries to avoid
+	 * references to unretained pages */
+	oldroot = pdf_dict_gets(xref->trailer, "Root");
+	pages = pdf_dict_gets(oldroot, "Pages");
+	olddests = pdf_load_name_tree(xref, "Dests");
+
+	root = pdf_new_dict(ctx, 2);
+	pdf_dict_puts(root, "Type", pdf_dict_gets(oldroot, "Type"));
+	pdf_dict_puts(root, "Pages", pages);
+
+	pdf_update_object(xref, pdf_to_num(oldroot), root);
+
+	pdf_drop_obj(root);
+
+	/* Create a new kids array with only the pages we want to keep */
+	parent = pdf_new_indirect(ctx, pdf_to_num(pages), pdf_to_gen(pages), xref);
+	kids = pdf_new_array(ctx, 1);
+
+	/* Retain pages specified */
+	while (argc - fz_optind)
+	{
+		int page, spage, epage, pagecount;
+		char *spec, *dash;
+		char *pagelist = argv[fz_optind];
+
+		pagecount = pdf_count_pages(xref);
+		for (spec = fz_strsep(&pagelist, ","); spec; spec = fz_strsep(&pagelist, ","))
+		{
+			/* An empty document has no page_objs/page_refs entry to use */
+			if (pagecount < 1)
+				continue;
+
+			dash = strchr(spec, '-');
+			spage = epage = (dash == spec) ? pagecount : atoi(spec);
+			if (dash)
+				epage = dash[1] ? atoi(dash + 1) : pagecount;
+
+			if (spage > epage)
+				page = spage, spage = epage, epage = page;
+
+			spage = fz_clampi(spage, 1, pagecount);
+			epage = fz_clampi(epage, 1, pagecount);
+
+			for (page = spage; page <= epage; page++)
+			{
+				pdf_obj *pageobj = xref->page_objs[page-1];
+				pdf_obj *pageref = xref->page_refs[page-1];
+
+				pdf_dict_puts(pageobj, "Parent", parent);
+
+				/* Store page object in new kids array */
+				pdf_array_push(kids, pageref);
+			}
+		}
+
+		fz_optind++;
+	}
+
+	pdf_drop_obj(parent);
+
+	/* Update page count and kids array */
+	countobj = pdf_new_int(ctx, pdf_array_len(kids));
+	pdf_dict_puts(pages, "Count", countobj);
+	pdf_drop_obj(countobj);
+	pdf_dict_puts(pages, "Kids", kids);
+	pdf_drop_obj(kids);
+
+	/* Also preserve the (partial) Dests name tree */
+	if (olddests)
+	{
+		int i;
+		pdf_obj *names = pdf_new_dict(ctx, 1);
+		pdf_obj *dests = pdf_new_dict(ctx, 1);
+		pdf_obj *names_list = pdf_new_array(ctx, 32);
+		/* Fetched once: nothing in the loop below modifies pages */
+		pdf_obj *newkids = pdf_dict_gets(pages, "Kids");
+		int len = pdf_dict_len(olddests);
+
+		for (i = 0; i < len; i++)
+		{
+			pdf_obj *key = pdf_dict_get_key(olddests, i);
+			pdf_obj *val = pdf_dict_get_val(olddests, i);
+			pdf_obj *dest = pdf_dict_gets(val, "D");
+
+			/* Element 0 of the destination array is compared with Kids */
+			dest = pdf_array_get(dest ? dest : val, 0);
+			if (pdf_array_contains(newkids, dest))
+			{
+				/* Build the string key only for entries that are kept */
+				pdf_obj *key_str = pdf_new_string(ctx, pdf_to_name(key), strlen(pdf_to_name(key)));
+
+				pdf_array_push(names_list, key_str);
+				pdf_array_push(names_list, val);
+				pdf_drop_obj(key_str);
+			}
+		}
+
+		root = pdf_dict_gets(xref->trailer, "Root");
+		pdf_dict_puts(dests, "Names", names_list);
+		pdf_dict_puts(names, "Dests", dests);
+		pdf_dict_puts(root, "Names", names);
+
+		pdf_drop_obj(names);
+		pdf_drop_obj(dests);
+		pdf_drop_obj(names_list);
+		pdf_drop_obj(olddests);
+	}
+}
+
+/* "mutool clean" entry point: parse options, open the input, optionally
+ * subset its pages, write the output. Returns 1 if the try block threw. */
+int pdfclean_main(int argc, char **argv)
+{
+	char *infile;
+	char *outfile = "out.pdf";
+	char *password = "";
+	int c;
+	int subset;
+	/* Zero every field so none reaches the writer uninitialised */
+	fz_write_options opts = { 0 };
+	int write_failed = 0;
+
+	while ((c = fz_getopt(argc, argv, "adfgilp:")) != -1)
+	{
+		switch (c)
+		{
+		case 'p': password = fz_optarg; break;
+		case 'g': opts.do_garbage ++; break;
+		case 'd': opts.do_expand ^= fz_expand_all; break;
+		case 'f': opts.do_expand ^= fz_expand_fonts; break;
+		case 'i': opts.do_expand ^= fz_expand_images; break;
+		case 'l': opts.do_linear ++; break;
+		case 'a': opts.do_ascii ++; break;
+		default: usage(); break;
+		}
+	}
+
+	if (argc - fz_optind < 1)
+		usage();
+
+	infile = argv[fz_optind++];
+
+	/* An argument containing ".pdf" or ".PDF" names the output file */
+	if (argc - fz_optind > 0 &&
+		(strstr(argv[fz_optind], ".pdf") || strstr(argv[fz_optind], ".PDF")))
+	{
+		outfile = argv[fz_optind++];
+	}
+
+	subset = (argc - fz_optind > 0);
+
+	ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
+	if (!ctx)
+	{
+		fprintf(stderr, "cannot initialise context\n");
+		exit(1);
+	}
+
+	fz_try(ctx)
+	{
+		xref = pdf_open_document_no_run(ctx, infile);
+		if (pdf_needs_password(xref))
+			if (!pdf_authenticate_password(xref, password))
+				fz_throw(ctx, "cannot authenticate password: %s", infile);
+
+		/* Only retain the specified subset of the pages */
+		if (subset)
+			retainpages(argc, argv);
+
+		pdf_write_document(xref, outfile, &opts);
+	}
+	fz_always(ctx)
+	{
+		/* xref was never assigned if pdf_open_document_no_run threw */
+		if (xref)
+			pdf_close_document(xref);
+		xref = NULL;
+	}
+	fz_catch(ctx)
+	{
+		write_failed = 1;
+	}
+
+	fz_free_context(ctx);
+
+	return write_failed ? 1 : 0;
+}