10 files changed, 352 insertions, 8 deletions
diff --git a/include/mupdf/fitz/write-document.h b/include/mupdf/fitz/write-document.h
index 9fe27f5f..56b9ef76 100644
--- a/include/mupdf/fitz/write-document.h
+++ b/include/mupdf/fitz/write-document.h
@@ -21,6 +21,7 @@ struct fz_write_options_s
 	int do_garbage; /* If non-zero then attempt (where possible) to
 				garbage collect the file before writing. */
 	int do_linear; /* If non-zero then write linearised. */
+	int do_clean; /* If non-zero then clean contents */
 	int continue_on_error; /* If non-zero, errors are (optionally)
 					counted and writing continues. */
 	int *errors; /* Pointer to a place to store a count of errors */
diff --git a/include/mupdf/pdf/page.h b/include/mupdf/pdf/page.h
index e6c4f9b2..c1b690b6 100644
--- a/include/mupdf/pdf/page.h
+++ b/include/mupdf/pdf/page.h
@@ -50,6 +50,19 @@ void pdf_free_page(pdf_document *doc, pdf_page *page);
 */
 void pdf_run_page(pdf_document *doc, pdf_page *page, fz_device *dev, const fz_matrix *ctm, fz_cookie *cookie);
 
+/*
+	pdf_run_page_with_usage: Interpret a loaded page and render it on a device.
+
+	page: A page loaded by pdf_load_page.
+
+	dev: Device used for rendering, obtained from fz_new_*_device.
+
+	ctm: A transformation matrix applied to the objects on the page,
+	e.g. to scale or rotate the page contents as desired.
+
+	cookie: A pointer to an optional fz_cookie structure that can be used
+	to track progress, collect errors etc.
+*/
 void pdf_run_page_with_usage(pdf_document *doc, pdf_page *page, fz_device *dev, const fz_matrix *ctm, char *event, fz_cookie *cookie);
 
 /*
@@ -66,6 +79,22 @@ void pdf_run_page_with_usage(pdf_document *doc, pdf_page *page, fz_device *dev,
 void pdf_run_page_contents(pdf_document *doc, pdf_page *page, fz_device *dev, const fz_matrix *ctm, fz_cookie *cookie);
 
 /*
+	pdf_clean_page_contents: Clean a loaded page's rendering operations.
+	This involves filtering the PDF operators used to avoid (some cases
+	of) repetition, and leaves the page in a balanced state with an
+	unchanged top level matrix etc. Only the main page contents are
+	cleaned, not the annotations.
+
+	doc: The document that the page belongs to.
+
+	page: A page loaded by pdf_load_page.
+
+	cookie: A pointer to an optional fz_cookie structure that can be used
+	to track progress, collect errors etc.
+*/
+void pdf_clean_page_contents(pdf_document *doc, pdf_page *page, fz_cookie *cookie);
+
+/*
 	Presentation interface.
 */
 fz_transition *pdf_page_presentation(pdf_document *doc, pdf_page *page, float *duration);
diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj
index 3b30d923..40732f99 100644
--- a/platform/win32/libmupdf.vcproj
+++ b/platform/win32/libmupdf.vcproj
@@ -718,6 +718,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\source\pdf\pdf-clean.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\source\pdf\pdf-cmap-load.c"
 				>
 			</File>
diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c
index e9702125..1fcbbcf4 100644
--- a/source/pdf/pdf-annot.c
+++ b/source/pdf/pdf-annot.c
@@ -66,6 +66,8 @@ pdf_parse_link_dest(pdf_document *doc, fz_link_kind kind, pdf_obj *dest)
 	ld.ld.gotor.dest = NULL;
 
 	dest = resolve_dest(doc, dest, kind);
+	if (dest == NULL)
+		fz_throw(doc->ctx, FZ_ERROR_GENERIC, "Undefined link_dest");
 
 	if (pdf_is_name(dest))
 	{
diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c
new file mode 100644
index 00000000..51a8b6de
--- /dev/null
+++ b/source/pdf/pdf-clean.c
@@ -0,0 +1,283 @@
+#include "pdf-interpret-imp.h"
+
+/* Run the content stream 'obj' through the filter processor and store the
+ * output back as that object's stream. When 'own_res' is set, the object's
+ * own Resources (if any) are used in preference to 'orig_res' and are then
+ * replaced by a newly created dictionary. A NULL 'obj' is ignored. */
+static void
+pdf_clean_stream_object(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process filter, writer;
+	fz_buffer *out;
+	pdf_obj *new_res = NULL, *new_res_ref = NULL;
+
+	if (obj == NULL)
+		return;
+
+	fz_var(new_res);
+	fz_var(new_res_ref);
+
+	out = fz_new_buffer(ctx, 1024);
+
+	fz_try(ctx)
+	{
+		pdf_obj *local = own_res ? pdf_dict_gets(obj, "Resources") : NULL;
+
+		if (local != NULL)
+			orig_res = local;
+
+		/* The filter feeds a buffer writer; new_res is the resources
+		 * dictionary handed to the filter. */
+		new_res = pdf_new_dict(doc, 1);
+		pdf_process_buffer(&writer, ctx, out);
+		pdf_process_filter(&filter, ctx, &writer, new_res);
+		pdf_process_stream_object(doc, obj, &filter, orig_res, cookie);
+
+		/* Drop the Filter entry and install the new stream data. */
+		pdf_dict_dels(obj, "Filter");
+		pdf_update_stream(doc, pdf_to_num(obj), out);
+
+		if (own_res)
+		{
+			new_res_ref = pdf_new_ref(doc, new_res);
+			pdf_dict_puts(obj, "Resources", new_res_ref);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, out);
+		pdf_drop_obj(new_res);
+		pdf_drop_obj(new_res_ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning xobject");
+	}
+}
+
+/* Clean every CharProcs stream of the Type3 font 'obj' and point the font at
+ * a newly created Resources dictionary. The font's own Resources, when
+ * present, take precedence over 'orig_res' while processing the glyphs. */
+static void
+pdf_clean_type3(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	fz_buffer *buffer = NULL; /* fz_always drops this on every path */
+	int num, i, l;
+	pdf_obj *res = NULL, *ref = NULL;
+	pdf_obj *charprocs, *procset;
+
+	fz_var(buffer);
+	fz_var(res);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		pdf_obj *own_res = pdf_dict_gets(obj, "Resources");
+		if (own_res)
+			orig_res = own_res;
+
+		res = pdf_new_dict(doc, 1);
+		charprocs = pdf_dict_gets(obj, "CharProcs");
+		l = pdf_dict_len(charprocs);
+		for (i = 0; i < l; i++)
+		{
+			pdf_obj *key = pdf_dict_get_key(charprocs, i);
+			pdf_obj *val = pdf_dict_get_val(charprocs, i);
+
+			/* Filter the glyph into a fresh buffer and store it back. */
+			buffer = fz_new_buffer(ctx, 1024);
+			pdf_process_buffer(&process2, ctx, buffer);
+			pdf_process_filter(&process, ctx, &process2, res);
+			pdf_process_stream_object(doc, val, &process, orig_res, cookie);
+			num = pdf_to_num(val);
+			pdf_dict_dels(val, "Filter");
+			pdf_update_stream(doc, num, buffer);
+			pdf_dict_put(charprocs, key, val);
+			fz_drop_buffer(ctx, buffer);
+			buffer = NULL;
+		}
+
+		/* ProcSet - no cleaning possible. Inherit this from the old dict. */
+		procset = pdf_dict_gets(orig_res, "ProcSet");
+		if (procset)
+			pdf_dict_puts(res, "ProcSet", procset);
+		ref = pdf_new_ref(doc, res);
+		pdf_dict_puts(obj, "Resources", ref);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning xobject");
+	}
+}
+
+/* Rewrite the page's content stream through the filter processor, clean the
+ * soft-mask groups, PatternType 1 patterns, form XObjects and Type3 fonts
+ * found in the new resources, then install those resources on the page. */
+void pdf_clean_page_contents(pdf_document *doc, pdf_page *page, fz_cookie *cookie)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	fz_buffer *buffer = fz_new_buffer(ctx, 1024);
+	int num;
+	pdf_obj *contents;
+	pdf_obj *new_obj = NULL, *new_ref = NULL;
+	pdf_obj *res = NULL, *ref = NULL;
+	pdf_obj *obj;
+
+	fz_var(new_obj);
+	fz_var(new_ref);
+	fz_var(res);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		res = pdf_new_dict(doc, 1);
+		pdf_process_buffer(&process2, ctx, buffer);
+		pdf_process_filter(&process, ctx, &process2, res);
+		pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie);
+
+		contents = page->contents;
+		if (pdf_is_array(contents))
+		{
+			int n = pdf_array_len(contents);
+			int i;
+
+			for (i = n-1; i > 0; i--)
+				pdf_array_delete(contents, i);
+			/* We cannot rewrite the 0th entry of contents
+			 * directly as it may occur in other pages content
+			 * dictionaries too. We therefore clone it and make
+			 * a new object reference. */
+			new_obj = pdf_copy_dict(pdf_array_get(contents, 0));
+			new_ref = pdf_new_ref(doc, new_obj);
+			num = pdf_to_num(new_ref);
+			pdf_array_put(contents, 0, new_ref);
+			pdf_dict_dels(new_obj, "Filter");
+		}
+		else
+		{
+			num = pdf_to_num(contents);
+			pdf_dict_dels(contents, "Filter");
+		}
+		pdf_update_stream(doc, num, buffer);
+
+		/* Now deal with resources. The spec allows for Type3 fonts and form
+		 * XObjects to omit a resource dictionary and look in the parent.
+		 * Avoid that by flattening here as part of the cleaning. This could
+		 * conceivably cause changes in rendering, but we don't care. */
+
+		/* ExtGState */
+		obj = pdf_dict_gets(res, "ExtGState");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *o = pdf_dict_gets(pdf_dict_get_val(obj, i), "SMask");
+
+				if (!o)
+					continue;
+				o = pdf_dict_gets(o, "G");
+				if (!o)
+					continue;
+
+				/* Transparency group XObject */
+				pdf_clean_stream_object(doc, o, page->resources, cookie, 1);
+			}
+		}
+
+		/* ColorSpace - no cleaning possible */
+
+		/* Pattern */
+		obj = pdf_dict_gets(res, "Pattern");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *pat = pdf_dict_get_val(obj, i);
+
+				if (!pat)
+					continue;
+				if (pdf_to_int(pdf_dict_gets(pat, "PatternType")) == 1)
+					pdf_clean_stream_object(doc, pat, page->resources, cookie, 0);
+			}
+		}
+
+		/* Shading - no cleaning possible */
+
+		/* XObject */
+		obj = pdf_dict_gets(res, "XObject");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *xobj = pdf_dict_get_val(obj, i);
+
+				if (strcmp(pdf_to_name(pdf_dict_gets(xobj, "Subtype")), "Form"))
+					continue;
+
+				pdf_clean_stream_object(doc, xobj, page->resources, cookie, 1);
+			}
+		}
+
+		/* Font */
+		obj = pdf_dict_gets(res, "Font");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *o = pdf_dict_get_val(obj, i);
+
+				if (!strcmp(pdf_to_name(pdf_dict_gets(o, "Subtype")), "Type3"))
+				{
+					pdf_clean_type3(doc, o, page->resources, cookie);
+				}
+			}
+		}
+
+		/* ProcSet - no cleaning possible. Inherit this from the old dict. */
+		obj = pdf_dict_gets(page->resources, "ProcSet");
+		if (obj)
+			pdf_dict_puts(res, "ProcSet", obj);
+
+		/* Properties - no cleaning possible. */
+
+		/* Allocate first: a throw must not leave page->resources dangling. */
+		ref = pdf_new_ref(doc, res);
+		pdf_drop_obj(page->resources);
+		page->resources = pdf_keep_obj(ref);
+		pdf_dict_puts(page->me, "Resources", ref);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(new_obj);
+		pdf_drop_obj(new_ref);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning page");
+	}
+}
diff --git a/source/pdf/pdf-interpret-imp.h b/source/pdf/pdf-interpret-imp.h
index 69a40147..314e6d2b 100644
--- a/source/pdf/pdf-interpret-imp.h
+++ b/source/pdf/pdf-interpret-imp.h
@@ -137,6 +137,7 @@ static inline void pdf_process_op(pdf_csi *csi, int op, const pdf_process *proce
 	process->processor->op_table[op](csi, process->state);
 }
 
+/* Helper functions for the filter implementations to call */
 void pdf_process_contents_object(pdf_csi *csi, pdf_obj *rdb, pdf_obj *contents);
 void pdf_process_stream(pdf_csi *csi, pdf_lexbuf *buf);
 
@@ -145,10 +146,10 @@ pdf_process *pdf_process_run(pdf_process *process, fz_device *dev, const fz_matr
 pdf_process *pdf_process_buffer(pdf_process *process, fz_context *ctx, fz_buffer *buffer);
 pdf_process *pdf_process_filter(pdf_process *process, fz_context *ctx, pdf_process *underlying, pdf_obj *resources);
 
-/* Functions to actually use the pdf_process structures to process pages,
- * annotations and glyphs */
+/* Functions to actually use the pdf_process structures to process
+ * annotations, glyphs and general stream objects */
 void pdf_process_annot(pdf_document *doc, pdf_page *page, pdf_annot *annot, const pdf_process *process, fz_cookie *cookie);
-void pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie);
 void pdf_process_glyph(pdf_document *doc, pdf_obj *resources, fz_buffer *contents, pdf_process *process);
+void pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie);
 
 #endif
diff --git a/source/pdf/pdf-interpret.c b/source/pdf/pdf-interpret.c
index 525d2ead..507bf0fa 100644
--- a/source/pdf/pdf-interpret.c
+++ b/source/pdf/pdf-interpret.c
@@ -606,7 +606,7 @@ pdf_process_contents_buffer(pdf_csi *csi, pdf_obj *rdb, fz_buffer *contents)
 }
 
 void
-pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie)
+pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie)
 {
 	fz_context *ctx = doc->ctx;
 	pdf_csi *csi;
@@ -614,7 +614,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *
 	csi = pdf_new_csi(doc, cookie, process);
 	fz_try(ctx)
 	{
-		csi->process.processor->process_contents(csi, csi->process.state, page->resources, page->contents);
+		csi->process.processor->process_contents(csi, csi->process.state, res, obj);
 	}
 	fz_always(ctx)
 	{
@@ -622,7 +622,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *
 	}
 	fz_catch(ctx)
 	{
-		fz_rethrow_message(ctx, "cannot parse page content stream");
+		fz_rethrow_message(ctx, "cannot parse content stream");
 	}
 }
 
diff --git a/source/pdf/pdf-run.c b/source/pdf/pdf-run.c
index a77dd50e..d52d8282 100644
--- a/source/pdf/pdf-run.c
+++ b/source/pdf/pdf-run.c
@@ -28,7 +28,7 @@ static void pdf_run_page_contents_with_usage(pdf_document *doc, pdf_page *page,
 
 	pdf_process_run(&process, dev, &local_ctm, event, NULL, 0);
 
-	pdf_process_page_contents(doc, page, &process, cookie);
+	pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie);
 
 	if (page->transparency)
 		fz_end_group(dev);
diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c
index b5894421..f218429d 100644
--- a/source/pdf/pdf-write.c
+++ b/source/pdf/pdf-write.c
@@ -51,6 +51,7 @@ struct pdf_write_options_s
 	int do_expand;
 	int do_garbage;
 	int do_linear;
+	int do_clean;
 	int *use_list;
 	int *ofs_list;
 	int *gen_list;
@@ -2541,6 +2542,21 @@ static void complete_signatures(pdf_document *doc, pdf_write_options *opts, char
 	}
 }
 
+static void sanitise(pdf_document *doc)
+{
+	int n = pdf_count_pages(doc);
+	int i;
+	/* Load, clean and free each page of the document in turn. */
+	for (i = 0; i < n; i++)
+	{
+		pdf_page *page = pdf_load_page(doc, i);
+		/* No cookie is supplied for the cleaning pass. */
+		pdf_clean_page_contents(doc, page, NULL);
+		/* NOTE(review): skipped if the call above throws - page leaks? */
+		pdf_free_page(doc, page);
+	}
+}
+
 void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_opts)
 {
 	int lastfree;
@@ -2559,6 +2575,10 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_
 	doc->freeze_updates = 1;
 	ctx = doc->ctx;
 
+	/* Sanitise the operator streams */
+	if (fz_opts->do_clean)
+		sanitise(doc);
+
 	pdf_finish_edit(doc);
 	presize_unsaved_signature_byteranges(doc);
 
@@ -2585,6 +2605,7 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_
 		opts.do_garbage = fz_opts->do_garbage;
 		opts.do_ascii = fz_opts->do_ascii;
 		opts.do_linear = fz_opts->do_linear;
+		opts.do_clean = fz_opts->do_clean;
 		opts.start = 0;
 		opts.main_xref_offset = INT_MIN;
 		/* We deliberately make these arrays long enough to cope with
diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c
index 74b70394..d2ded0b7 100644
--- a/source/tools/pdfclean.c
+++ b/source/tools/pdfclean.c
@@ -25,6 +25,7 @@ static void usage(void)
 		"\t-g\tgarbage collect unused objects\n"
 		"\t-gg\tin addition to -g compact xref table\n"
 		"\t-ggg\tin addition to -gg merge duplicate objects\n"
+		"\t-s\tclean content streams\n"
 		"\t-d\tdecompress all streams\n"
 		"\t-l\tlinearize PDF\n"
 		"\t-i\ttoggle decompression of image streams\n"
@@ -260,8 +261,9 @@ int pdfclean_main(int argc, char **argv)
 	opts.do_linear = 0;
 	opts.continue_on_error = 1;
 	opts.errors = &errors;
+	opts.do_clean = 0;
 
-	while ((c = fz_getopt(argc, argv, "adfgilp:")) != -1)
+	while ((c = fz_getopt(argc, argv, "adfgilp:s")) != -1)
 	{
 		switch (c)
 		{
@@ -272,6 +274,7 @@ int pdfclean_main(int argc, char **argv)
 		case 'i': opts.do_expand ^= fz_expand_images; break;
 		case 'l': opts.do_linear ++; break;
 		case 'a': opts.do_ascii ++; break;
+		case 's': opts.do_clean ++; break;
 		default: usage(); break;
 		}
 	}