summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/mupdf/fitz/write-document.h1
-rw-r--r--include/mupdf/pdf/page.h29
-rw-r--r--platform/win32/libmupdf.vcproj4
-rw-r--r--source/pdf/pdf-annot.c2
-rw-r--r--source/pdf/pdf-clean.c283
-rw-r--r--source/pdf/pdf-interpret-imp.h7
-rw-r--r--source/pdf/pdf-interpret.c6
-rw-r--r--source/pdf/pdf-run.c2
-rw-r--r--source/pdf/pdf-write.c21
-rw-r--r--source/tools/pdfclean.c5
10 files changed, 352 insertions, 8 deletions
diff --git a/include/mupdf/fitz/write-document.h b/include/mupdf/fitz/write-document.h
index 9fe27f5f..56b9ef76 100644
--- a/include/mupdf/fitz/write-document.h
+++ b/include/mupdf/fitz/write-document.h
@@ -21,6 +21,7 @@ struct fz_write_options_s
int do_garbage; /* If non-zero then attempt (where possible) to
garbage collect the file before writing. */
int do_linear; /* If non-zero then write linearised. */
+ int do_clean; /* If non-zero then clean contents */
int continue_on_error; /* If non-zero, errors are (optionally)
counted and writing continues. */
int *errors; /* Pointer to a place to store a count of errors */
diff --git a/include/mupdf/pdf/page.h b/include/mupdf/pdf/page.h
index e6c4f9b2..c1b690b6 100644
--- a/include/mupdf/pdf/page.h
+++ b/include/mupdf/pdf/page.h
@@ -50,6 +50,19 @@ void pdf_free_page(pdf_document *doc, pdf_page *page);
*/
void pdf_run_page(pdf_document *doc, pdf_page *page, fz_device *dev, const fz_matrix *ctm, fz_cookie *cookie);
+/*
+ pdf_run_page_with_usage: Interpret a loaded page and render it on a device.
+
+ page: A page loaded by pdf_load_page.
+
+ dev: Device used for rendering, obtained from fz_new_*_device.
+
+ ctm: A transformation matrix applied to the objects on the page,
+ e.g. to scale or rotate the page contents as desired.
+
+ event: The usage event that the page is being run for.
+
+ cookie: A pointer to an optional fz_cookie structure that can be used
+ to track progress, collect errors etc.
+*/
void pdf_run_page_with_usage(pdf_document *doc, pdf_page *page, fz_device *dev, const fz_matrix *ctm, char *event, fz_cookie *cookie);
/*
@@ -66,6 +79,22 @@ void pdf_run_page_with_usage(pdf_document *doc, pdf_page *page, fz_device *dev,
void pdf_run_page_contents(pdf_document *doc, pdf_page *page, fz_device *dev, const fz_matrix *ctm, fz_cookie *cookie);
/*
+ pdf_clean_page_contents: Clean a loaded page's rendering operations.
+ This involves filtering the PDF operators used to avoid (some cases
+ of) repetition, and leaves the page in a balanced state with an
+ unchanged top level matrix etc. Only the main page contents are
+ cleaned, not the annotations.
+
+ doc: The document that the page belongs to.
+
+ page: A page loaded by pdf_load_page.
+
+ cookie: A pointer to an optional fz_cookie structure that can be used
+ to track progress, collect errors etc.
+*/
+void pdf_clean_page_contents(pdf_document *doc, pdf_page *page, fz_cookie *cookie);
+
+/*
Presentation interface.
*/
fz_transition *pdf_page_presentation(pdf_document *doc, pdf_page *page, float *duration);
diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj
index 3b30d923..40732f99 100644
--- a/platform/win32/libmupdf.vcproj
+++ b/platform/win32/libmupdf.vcproj
@@ -718,6 +718,10 @@
>
</File>
<File
+ RelativePath="..\..\source\pdf\pdf-clean.c"
+ >
+ </File>
+ <File
RelativePath="..\..\source\pdf\pdf-cmap-load.c"
>
</File>
diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c
index e9702125..1fcbbcf4 100644
--- a/source/pdf/pdf-annot.c
+++ b/source/pdf/pdf-annot.c
@@ -66,6 +66,8 @@ pdf_parse_link_dest(pdf_document *doc, fz_link_kind kind, pdf_obj *dest)
ld.ld.gotor.dest = NULL;
dest = resolve_dest(doc, dest, kind);
+ if (dest == NULL)
+ fz_throw(doc->ctx, FZ_ERROR_GENERIC, "Undefined link_dest");
if (pdf_is_name(dest))
{
diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c
new file mode 100644
index 00000000..51a8b6de
--- /dev/null
+++ b/source/pdf/pdf-clean.c
@@ -0,0 +1,283 @@
+#include "pdf-interpret-imp.h"
+
+/*
+	Run a stream object's content through the filter processor and
+	replace the stream's data with the filtered output.
+
+	obj: The stream object to clean. Nothing is done if it is NULL.
+
+	orig_res: Resources used while interpreting the stream. When
+	own_res is non-zero and obj carries its own "Resources" entry,
+	that entry is used instead.
+
+	cookie: Passed through to pdf_process_stream_object.
+
+	own_res: If non-zero, a reference to the newly built resource
+	dictionary is stored in obj under "Resources".
+*/
+static void
+pdf_clean_stream_object(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process filter, sink;
+	fz_buffer *out;
+	int stream_num;
+	pdf_obj *new_res = NULL;
+	pdf_obj *new_res_ref = NULL;
+
+	if (obj == NULL)
+		return;
+
+	/* Both are assigned inside the try and released in the always. */
+	fz_var(new_res);
+	fz_var(new_res_ref);
+
+	out = fz_new_buffer(ctx, 1024);
+
+	fz_try(ctx)
+	{
+		/* Prefer the stream's own resources over the ones handed in. */
+		if (own_res)
+		{
+			pdf_obj *local_res = pdf_dict_gets(obj, "Resources");
+			if (local_res != NULL)
+				orig_res = local_res;
+		}
+
+		new_res = pdf_new_dict(doc, 1);
+
+		/* The filter processor feeds the buffer processor. */
+		pdf_process_buffer(&sink, ctx, out);
+		pdf_process_filter(&filter, ctx, &sink, new_res);
+
+		pdf_process_stream_object(doc, obj, &filter, orig_res, cookie);
+
+		/* Swap the stream data for the filtered output; any old
+		 * "Filter" entry is removed first. */
+		stream_num = pdf_to_num(obj);
+		pdf_dict_dels(obj, "Filter");
+		pdf_update_stream(doc, stream_num, out);
+
+		if (own_res)
+		{
+			new_res_ref = pdf_new_ref(doc, new_res);
+			pdf_dict_puts(obj, "Resources", new_res_ref);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, out);
+		pdf_drop_obj(new_res);
+		pdf_drop_obj(new_res_ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning xobject");
+	}
+}
+
+/*
+	Clean every glyph procedure of a Type3 font and give the font a
+	freshly built resource dictionary.
+
+	obj: The Type3 font dictionary.
+
+	orig_res: Resources used while interpreting the glyph streams. If
+	the font has its own "Resources" entry, that entry is used instead.
+
+	cookie: Passed through to pdf_process_stream_object.
+*/
+static void
+pdf_clean_type3(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	/* Must start out NULL: it is dropped in fz_always even when an
+	 * exception is thrown before the first glyph buffer is created. */
+	fz_buffer *buffer = NULL;
+	int num, i, l;
+	pdf_obj *res = NULL;
+	pdf_obj *ref = NULL;
+	pdf_obj *charprocs;
+	pdf_obj *font_res;
+	pdf_obj *procset;
+
+	fz_var(buffer);
+	fz_var(res);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		/* Prefer the font's own resources over the ones handed in. */
+		font_res = pdf_dict_gets(obj, "Resources");
+		if (font_res)
+			orig_res = font_res;
+
+		res = pdf_new_dict(doc, 1);
+
+		charprocs = pdf_dict_gets(obj, "CharProcs");
+		l = pdf_dict_len(charprocs);
+
+		for (i = 0; i < l; i++)
+		{
+			pdf_obj *key = pdf_dict_get_key(charprocs, i);
+			pdf_obj *val = pdf_dict_get_val(charprocs, i);
+
+			/* Each glyph is filtered into its own buffer; all glyphs
+			 * share the single new resource dictionary. */
+			buffer = fz_new_buffer(ctx, 1024);
+			pdf_process_buffer(&process2, ctx, buffer);
+			pdf_process_filter(&process, ctx, &process2, res);
+
+			pdf_process_stream_object(doc, val, &process, orig_res, cookie);
+
+			num = pdf_to_num(val);
+			pdf_dict_dels(val, "Filter");
+			pdf_update_stream(doc, num, buffer);
+			pdf_dict_put(charprocs, key, val);
+			fz_drop_buffer(ctx, buffer);
+			buffer = NULL;
+		}
+
+		/* ProcSet - no cleaning possible. Inherit this from the old dict,
+		 * but only when the old dict actually has one. */
+		procset = pdf_dict_gets(orig_res, "ProcSet");
+		if (procset)
+			pdf_dict_puts(res, "ProcSet", procset);
+
+		ref = pdf_new_ref(doc, res);
+		pdf_dict_puts(obj, "Resources", ref);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning Type3 font");
+	}
+}
+
+/*
+	Clean the main content stream(s) of a loaded page, then clean the
+	form XObjects, tiling patterns, soft mask groups and Type3 fonts
+	found in the rebuilt resource dictionary. The page's resources are
+	replaced by a reference to the rebuilt dictionary.
+
+	doc: The document that the page belongs to.
+
+	page: The loaded page; its contents and resources members are read,
+	and resources is replaced.
+
+	cookie: Passed through to the stream processing calls.
+*/
+void pdf_clean_page_contents(pdf_document *doc, pdf_page *page, fz_cookie *cookie)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	fz_buffer *buffer = fz_new_buffer(ctx, 1024);
+	int num;
+	pdf_obj *contents;
+	pdf_obj *new_obj = NULL;
+	pdf_obj *new_ref = NULL;
+	pdf_obj *res = NULL;
+	pdf_obj *ref = NULL;
+	pdf_obj *obj;
+
+	fz_var(new_obj);
+	fz_var(new_ref);
+	fz_var(res);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		res = pdf_new_dict(doc, 1);
+
+		/* Filter the page contents into buffer. The filter is handed
+		 * res, which is inspected category by category below. */
+		pdf_process_buffer(&process2, ctx, buffer);
+		pdf_process_filter(&process, ctx, &process2, res);
+
+		pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie);
+
+		contents = page->contents;
+		if (pdf_is_array(contents))
+		{
+			int n = pdf_array_len(contents);
+			int i;
+
+			/* Collapse the array down to its first entry. */
+			for (i = n-1; i > 0; i--)
+				pdf_array_delete(contents, i);
+			/* We cannot rewrite the 0th entry of contents
+			 * directly as it may occur in other pages content
+			 * dictionaries too. We therefore clone it and make
+			 * a new object reference. */
+			new_obj = pdf_copy_dict(pdf_array_get(contents, 0));
+			new_ref = pdf_new_ref(doc, new_obj);
+			num = pdf_to_num(new_ref);
+			pdf_array_put(contents, 0, new_ref);
+			pdf_dict_dels(new_obj, "Filter");
+		}
+		else
+		{
+			num = pdf_to_num(contents);
+			pdf_dict_dels(contents, "Filter");
+		}
+		pdf_update_stream(doc, num, buffer);
+
+		/* Now deal with resources. The spec allows for Type3 fonts and form
+		 * XObjects to omit a resource dictionary and look in the parent.
+		 * Avoid that by flattening here as part of the cleaning. This could
+		 * conceivably cause changes in rendering, but we don't care. */
+
+		/* ExtGState */
+		obj = pdf_dict_gets(res, "ExtGState");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *o = pdf_dict_gets(pdf_dict_get_val(obj, i), "SMask");
+
+				if (!o)
+					continue;
+				o = pdf_dict_gets(o, "G");
+				if (!o)
+					continue;
+
+				/* Transparency group XObject */
+				pdf_clean_stream_object(doc, o, page->resources, cookie, 1);
+			}
+		}
+
+		/* ColorSpace - no cleaning possible */
+
+		/* Pattern */
+		obj = pdf_dict_gets(res, "Pattern");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *pat = pdf_dict_get_val(obj, i);
+
+				if (!pat)
+					continue;
+				/* Only patterns with PatternType 1 are cleaned. */
+				if (pdf_to_int(pdf_dict_gets(pat, "PatternType")) == 1)
+					pdf_clean_stream_object(doc, pat, page->resources, cookie, 0);
+			}
+		}
+
+		/* Shading - no cleaning possible */
+
+		/* XObject */
+		obj = pdf_dict_gets(res, "XObject");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *xobj = pdf_dict_get_val(obj, i);
+
+				/* Skip anything whose Subtype is not "Form". */
+				if (strcmp(pdf_to_name(pdf_dict_gets(xobj, "Subtype")), "Form"))
+					continue;
+
+				pdf_clean_stream_object(doc, xobj, page->resources, cookie, 1);
+			}
+		}
+
+		/* Font */
+		obj = pdf_dict_gets(res, "Font");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *o = pdf_dict_get_val(obj, i);
+
+				if (!strcmp(pdf_to_name(pdf_dict_gets(o, "Subtype")), "Type3"))
+				{
+					pdf_clean_type3(doc, o, page->resources, cookie);
+				}
+			}
+		}
+
+		/* ProcSet - no cleaning possible. Inherit this from the old dict. */
+		obj = pdf_dict_gets(page->resources, "ProcSet");
+		if (obj)
+			pdf_dict_puts(res, "ProcSet", obj);
+
+		/* Properties - no cleaning possible. */
+
+		/* Create the new reference before releasing the old resources,
+		 * so that page->resources never points at a dropped object if
+		 * pdf_new_ref throws. */
+		ref = pdf_new_ref(doc, res);
+		pdf_drop_obj(page->resources);
+		page->resources = pdf_keep_obj(ref);
+		pdf_dict_puts(page->me, "Resources", ref);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(new_obj);
+		pdf_drop_obj(new_ref);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning page");
+	}
+}
diff --git a/source/pdf/pdf-interpret-imp.h b/source/pdf/pdf-interpret-imp.h
index 69a40147..314e6d2b 100644
--- a/source/pdf/pdf-interpret-imp.h
+++ b/source/pdf/pdf-interpret-imp.h
@@ -137,6 +137,7 @@ static inline void pdf_process_op(pdf_csi *csi, int op, const pdf_process *proce
process->processor->op_table[op](csi, process->state);
}
+/* Helper functions for the filter implementations to call */
void pdf_process_contents_object(pdf_csi *csi, pdf_obj *rdb, pdf_obj *contents);
void pdf_process_stream(pdf_csi *csi, pdf_lexbuf *buf);
@@ -145,10 +146,10 @@ pdf_process *pdf_process_run(pdf_process *process, fz_device *dev, const fz_matr
pdf_process *pdf_process_buffer(pdf_process *process, fz_context *ctx, fz_buffer *buffer);
pdf_process *pdf_process_filter(pdf_process *process, fz_context *ctx, pdf_process *underlying, pdf_obj *resources);
-/* Functions to actually use the pdf_process structures to process pages,
- * annotations and glyphs */
+/* Functions to actually use the pdf_process structures to process
+ * annotations, glyphs and general stream objects */
void pdf_process_annot(pdf_document *doc, pdf_page *page, pdf_annot *annot, const pdf_process *process, fz_cookie *cookie);
-void pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie);
void pdf_process_glyph(pdf_document *doc, pdf_obj *resources, fz_buffer *contents, pdf_process *process);
+void pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie);
#endif
diff --git a/source/pdf/pdf-interpret.c b/source/pdf/pdf-interpret.c
index 525d2ead..507bf0fa 100644
--- a/source/pdf/pdf-interpret.c
+++ b/source/pdf/pdf-interpret.c
@@ -606,7 +606,7 @@ pdf_process_contents_buffer(pdf_csi *csi, pdf_obj *rdb, fz_buffer *contents)
}
void
-pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie)
+pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie)
{
fz_context *ctx = doc->ctx;
pdf_csi *csi;
@@ -614,7 +614,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *
csi = pdf_new_csi(doc, cookie, process);
fz_try(ctx)
{
- csi->process.processor->process_contents(csi, csi->process.state, page->resources, page->contents);
+ csi->process.processor->process_contents(csi, csi->process.state, res, obj);
}
fz_always(ctx)
{
@@ -622,7 +622,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *
}
fz_catch(ctx)
{
- fz_rethrow_message(ctx, "cannot parse page content stream");
+ fz_rethrow_message(ctx, "cannot parse content stream");
}
}
diff --git a/source/pdf/pdf-run.c b/source/pdf/pdf-run.c
index a77dd50e..d52d8282 100644
--- a/source/pdf/pdf-run.c
+++ b/source/pdf/pdf-run.c
@@ -28,7 +28,7 @@ static void pdf_run_page_contents_with_usage(pdf_document *doc, pdf_page *page,
pdf_process_run(&process, dev, &local_ctm, event, NULL, 0);
- pdf_process_page_contents(doc, page, &process, cookie);
+ pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie);
if (page->transparency)
fz_end_group(dev);
diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c
index b5894421..f218429d 100644
--- a/source/pdf/pdf-write.c
+++ b/source/pdf/pdf-write.c
@@ -51,6 +51,7 @@ struct pdf_write_options_s
int do_expand;
int do_garbage;
int do_linear;
+ int do_clean;
int *use_list;
int *ofs_list;
int *gen_list;
@@ -2541,6 +2542,21 @@ static void complete_signatures(pdf_document *doc, pdf_write_options *opts, char
}
}
+/*
+	Clean the content streams of every page in the document.
+
+	Each page is loaded, cleaned with a NULL cookie, and freed again.
+	Any exception from the cleaning is rethrown after the page has been
+	freed.
+*/
+static void sanitise(pdf_document *doc)
+{
+	fz_context *ctx = doc->ctx;
+	int n = pdf_count_pages(doc);
+	int i;
+
+	for (i = 0; i < n; i++)
+	{
+		pdf_page *page = pdf_load_page(doc, i);
+
+		fz_try(ctx)
+		{
+			pdf_clean_page_contents(doc, page, NULL);
+		}
+		fz_always(ctx)
+		{
+			/* Free the page even if cleaning throws, so it does not leak. */
+			pdf_free_page(doc, page);
+		}
+		fz_catch(ctx)
+		{
+			fz_rethrow(ctx);
+		}
+	}
+}
+
void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_opts)
{
int lastfree;
@@ -2559,6 +2575,10 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_
doc->freeze_updates = 1;
ctx = doc->ctx;
+ /* Sanitise the operator streams */
+ if (fz_opts->do_clean)
+ sanitise(doc);
+
pdf_finish_edit(doc);
presize_unsaved_signature_byteranges(doc);
@@ -2585,6 +2605,7 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_
opts.do_garbage = fz_opts->do_garbage;
opts.do_ascii = fz_opts->do_ascii;
opts.do_linear = fz_opts->do_linear;
+ opts.do_clean = fz_opts->do_clean;
opts.start = 0;
opts.main_xref_offset = INT_MIN;
/* We deliberately make these arrays long enough to cope with
diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c
index 74b70394..d2ded0b7 100644
--- a/source/tools/pdfclean.c
+++ b/source/tools/pdfclean.c
@@ -25,6 +25,7 @@ static void usage(void)
"\t-g\tgarbage collect unused objects\n"
"\t-gg\tin addition to -g compact xref table\n"
"\t-ggg\tin addition to -gg merge duplicate objects\n"
+ "\t-s\tclean content streams\n"
"\t-d\tdecompress all streams\n"
"\t-l\tlinearize PDF\n"
"\t-i\ttoggle decompression of image streams\n"
@@ -260,8 +261,9 @@ int pdfclean_main(int argc, char **argv)
opts.do_linear = 0;
opts.continue_on_error = 1;
opts.errors = &errors;
+ opts.do_clean = 0;
- while ((c = fz_getopt(argc, argv, "adfgilp:")) != -1)
+ while ((c = fz_getopt(argc, argv, "adfgilp:s")) != -1)
{
switch (c)
{
@@ -272,6 +274,7 @@ int pdfclean_main(int argc, char **argv)
case 'i': opts.do_expand ^= fz_expand_images; break;
case 'l': opts.do_linear ++; break;
case 'a': opts.do_ascii ++; break;
+ case 's': opts.do_clean ++; break;
default: usage(); break;
}
}