summaryrefslogtreecommitdiff
path: root/source/pdf
diff options
context:
space:
mode:
author Robin Watts <robin.watts@artifex.com> 2014-03-19 19:04:50 +0000
committer Robin Watts <robin.watts@artifex.com> 2014-03-19 19:04:50 +0000
commit e4d36ce68e0467ac4702f717386934a44970f4e5 (patch)
tree 758a9999434f03a4607d1f23e43d9a58828b6444 /source/pdf
parent 441954b6fb378e3af72500653be5636c7ade29ee (diff)
download mupdf-e4d36ce68e0467ac4702f717386934a44970f4e5.tar.xz
Add routine to clean pdf content streams for pages.
New routine to filter the content streams for pages, XObjects, Type3 charprocs, patterns, etc. The filtered streams are guaranteed to have properly matched q/Q operators and to leave the top-level CTM unchanged. Additionally, we remove (some) repeated settings of colors etc. This filtering can be extended to be smarter later. The idea is both to repair streams after editing and to leave them in a form that can easily be appended to. This is preparatory to work on Bates numbering and watermarking. Currently the streams produced are uncompressed.
Diffstat (limited to 'source/pdf')
-rw-r--r-- source/pdf/pdf-annot.c 2
-rw-r--r-- source/pdf/pdf-clean.c 283
-rw-r--r-- source/pdf/pdf-interpret-imp.h 7
-rw-r--r-- source/pdf/pdf-interpret.c 6
-rw-r--r-- source/pdf/pdf-run.c 2
-rw-r--r-- source/pdf/pdf-write.c 21
6 files changed, 314 insertions, 7 deletions
diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c
index e9702125..1fcbbcf4 100644
--- a/source/pdf/pdf-annot.c
+++ b/source/pdf/pdf-annot.c
@@ -66,6 +66,8 @@ pdf_parse_link_dest(pdf_document *doc, fz_link_kind kind, pdf_obj *dest)
ld.ld.gotor.dest = NULL;
dest = resolve_dest(doc, dest, kind);
+ if (dest == NULL)
+ fz_throw(doc->ctx, FZ_ERROR_GENERIC, "Undefined link_dest");
if (pdf_is_name(dest))
{
diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c
new file mode 100644
index 00000000..51a8b6de
--- /dev/null
+++ b/source/pdf/pdf-clean.c
@@ -0,0 +1,283 @@
+#include "pdf-interpret-imp.h"
+
+/*
+ * Run one stream object through the filter processor and replace its
+ * stream data with the filtered output.
+ *
+ * doc: document that owns the object.
+ * obj: stream object to rewrite; nothing is done when it is NULL.
+ * orig_res: resource dictionary handed to the interpreter while the
+ *	stream is processed.
+ * cookie: passed through to pdf_process_stream_object.
+ * own_res: when non-zero, the object's own "Resources" entry (if any)
+ *	is used in place of orig_res, and afterwards the object's
+ *	"Resources" entry is pointed at the newly built dictionary.
+ *
+ * Any error is rethrown with the message "Failed while cleaning xobject".
+ */
+static void
+pdf_clean_stream_object(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process filter, sink;
+	fz_buffer *out;
+	int objnum;
+	pdf_obj *new_res = NULL;
+	pdf_obj *new_res_ref = NULL;
+
+	if (obj == NULL)
+		return;
+
+	fz_var(new_res);
+	fz_var(new_res_ref);
+
+	out = fz_new_buffer(ctx, 1024);
+
+	fz_try(ctx)
+	{
+		/* An object that carries its own resources is interpreted
+		 * against those rather than the caller-supplied ones. */
+		pdf_obj *own = own_res ? pdf_dict_gets(obj, "Resources") : NULL;
+		if (own != NULL)
+			orig_res = own;
+
+		/* Dictionary handed to the filter alongside the output sink. */
+		new_res = pdf_new_dict(doc, 1);
+
+		/* Chain: filter -> buffer writer. */
+		pdf_process_buffer(&sink, ctx, out);
+		pdf_process_filter(&filter, ctx, &sink, new_res);
+
+		pdf_process_stream_object(doc, obj, &filter, orig_res, cookie);
+
+		/* The replacement data is stored without a Filter entry. */
+		objnum = pdf_to_num(obj);
+		pdf_dict_dels(obj, "Filter");
+		pdf_update_stream(doc, objnum, out);
+
+		if (own_res)
+		{
+			new_res_ref = pdf_new_ref(doc, new_res);
+			pdf_dict_puts(obj, "Resources", new_res_ref);
+		}
+	}
+	fz_always(ctx)
+	{
+		/* Release our local references on both the success and the
+		 * error path. */
+		fz_drop_buffer(ctx, out);
+		pdf_drop_obj(new_res);
+		pdf_drop_obj(new_res_ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning xobject");
+	}
+}
+
+/*
+ * Clean every stream in a Type3 font's "CharProcs" dictionary and give
+ * the font a freshly built "Resources" dictionary.
+ *
+ * doc: document that owns the font.
+ * obj: the Type3 font dictionary.
+ * orig_res: resources to interpret the charprocs against when the font
+ *	has no "Resources" entry of its own.
+ * cookie: passed through to pdf_process_stream_object.
+ *
+ * Any error is rethrown with the message "Failed while cleaning xobject".
+ */
+static void
+pdf_clean_type3(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	/* Must start out NULL: fz_always drops it even when we throw before
+	 * the first charproc buffer has been allocated. */
+	fz_buffer *buffer = NULL;
+	int num, i, l;
+	pdf_obj *res = NULL;
+	pdf_obj *ref = NULL;
+	pdf_obj *charprocs;
+	pdf_obj *procset;
+
+	fz_var(buffer);
+	fz_var(res);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		/* Prefer the font's own resources over the inherited ones. */
+		pdf_obj *font_res = pdf_dict_gets(obj, "Resources");
+		if (font_res)
+			orig_res = font_res;
+
+		/* One new resource dictionary is shared by all charprocs. */
+		res = pdf_new_dict(doc, 1);
+
+		charprocs = pdf_dict_gets(obj, "CharProcs");
+		l = pdf_dict_len(charprocs);
+
+		for (i = 0; i < l; i++)
+		{
+			pdf_obj *key = pdf_dict_get_key(charprocs, i);
+			pdf_obj *val = pdf_dict_get_val(charprocs, i);
+
+			/* Each charproc is filtered into its own buffer. */
+			buffer = fz_new_buffer(ctx, 1024);
+			pdf_process_buffer(&process2, ctx, buffer);
+			pdf_process_filter(&process, ctx, &process2, res);
+
+			pdf_process_stream_object(doc, val, &process, orig_res, cookie);
+
+			num = pdf_to_num(val);
+			pdf_dict_dels(val, "Filter");
+			pdf_update_stream(doc, num, buffer);
+			pdf_dict_put(charprocs, key, val);
+
+			/* Reset to NULL so the fz_always block cannot drop the
+			 * same buffer a second time. */
+			fz_drop_buffer(ctx, buffer);
+			buffer = NULL;
+		}
+
+		/* ProcSet - no cleaning possible. Inherit this from the old dict.
+		 * Only copy it when present, as pdf_clean_page_contents does. */
+		procset = pdf_dict_gets(orig_res, "ProcSet");
+		if (procset)
+			pdf_dict_puts(res, "ProcSet", procset);
+
+		ref = pdf_new_ref(doc, res);
+		pdf_dict_puts(obj, "Resources", ref);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning xobject");
+	}
+}
+
+/*
+ * Rewrite a page's content stream through the filter processor and
+ * replace the page's resource dictionary with a newly built one.
+ *
+ * The page contents are run through a filter that writes into a buffer
+ * and is handed a fresh dictionary to populate. The buffer then replaces
+ * the page's content stream. Soft-mask groups, patterns whose
+ * PatternType is 1, form XObjects and Type3 fonts found in the new
+ * dictionary are cleaned in turn, and finally the new dictionary is
+ * installed as the page's "Resources".
+ *
+ * doc: document that owns the page.
+ * page: page to clean; page->contents, page->resources and page->me
+ *	are used and page->resources is replaced.
+ * cookie: passed through to the stream processing calls.
+ *
+ * Any error is rethrown with the message "Failed while cleaning page".
+ */
+void pdf_clean_page_contents(pdf_document *doc, pdf_page *page, fz_cookie *cookie)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	fz_buffer *buffer = fz_new_buffer(ctx, 1024);
+	int num;
+	pdf_obj *contents;
+	pdf_obj *new_obj = NULL;
+	pdf_obj *new_ref = NULL;
+	pdf_obj *res = NULL;
+	pdf_obj *ref = NULL;
+	pdf_obj *obj;
+
+	fz_var(new_obj);
+	fz_var(new_ref);
+	fz_var(res);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		res = pdf_new_dict(doc, 1);
+
+		/* Chain: filter -> buffer writer. */
+		pdf_process_buffer(&process2, ctx, buffer);
+		pdf_process_filter(&process, ctx, &process2, res);
+
+		pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie);
+
+		contents = page->contents;
+		if (pdf_is_array(contents))
+		{
+			int n = pdf_array_len(contents);
+			int i;
+
+			/* Collapse the array down to a single entry; the whole
+			 * filtered output goes into that one stream. */
+			for (i = n-1; i > 0; i--)
+				pdf_array_delete(contents, i);
+			/* We cannot rewrite the 0th entry of contents
+			 * directly as it may occur in other pages content
+			 * dictionaries too. We therefore clone it and make
+			 * a new object reference. */
+			new_obj = pdf_copy_dict(pdf_array_get(contents, 0));
+			new_ref = pdf_new_ref(doc, new_obj);
+			num = pdf_to_num(new_ref);
+			pdf_array_put(contents, 0, new_ref);
+			pdf_dict_dels(new_obj, "Filter");
+		}
+		else
+		{
+			/* NOTE(review): a NULL page->contents also takes this
+			 * branch - confirm what pdf_to_num/pdf_update_stream
+			 * do in that case. */
+			num = pdf_to_num(contents);
+			pdf_dict_dels(contents, "Filter");
+		}
+		pdf_update_stream(doc, num, buffer);
+
+		/* Now deal with resources. The spec allows for Type3 fonts and form
+		 * XObjects to omit a resource dictionary and look in the parent.
+		 * Avoid that by flattening here as part of the cleaning. This could
+		 * conceivably cause changes in rendering, but we don't care. */
+
+		/* ExtGState */
+		obj = pdf_dict_gets(res, "ExtGState");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *o = pdf_dict_gets(pdf_dict_get_val(obj, i), "SMask");
+
+				if (!o)
+					continue;
+				o = pdf_dict_gets(o, "G");
+				if (!o)
+					continue;
+
+				/* Transparency group XObject */
+				pdf_clean_stream_object(doc, o, page->resources, cookie, 1);
+			}
+		}
+
+		/* ColorSpace - no cleaning possible */
+
+		/* Pattern */
+		obj = pdf_dict_gets(res, "Pattern");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *pat = pdf_dict_get_val(obj, i);
+
+				if (!pat)
+					continue;
+				/* Only PatternType 1 entries are cleaned here. */
+				if (pdf_to_int(pdf_dict_gets(pat, "PatternType")) == 1)
+					pdf_clean_stream_object(doc, pat, page->resources, cookie, 0);
+			}
+		}
+
+		/* Shading - no cleaning possible */
+
+		/* XObject */
+		obj = pdf_dict_gets(res, "XObject");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *xobj = pdf_dict_get_val(obj, i);
+
+				/* Skip anything that is not a form XObject. */
+				if (strcmp(pdf_to_name(pdf_dict_gets(xobj, "Subtype")), "Form"))
+					continue;
+
+				pdf_clean_stream_object(doc, xobj, page->resources, cookie, 1);
+			}
+		}
+
+		/* Font */
+		obj = pdf_dict_gets(res, "Font");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *o = pdf_dict_get_val(obj, i);
+
+				if (!strcmp(pdf_to_name(pdf_dict_gets(o, "Subtype")), "Type3"))
+				{
+					pdf_clean_type3(doc, o, page->resources, cookie);
+				}
+			}
+		}
+
+		/* ProcSet - no cleaning possible. Inherit this from the old dict. */
+		obj = pdf_dict_gets(page->resources, "ProcSet");
+		if (obj)
+			pdf_dict_puts(res, "ProcSet", obj);
+
+		/* Properties - no cleaning possible. */
+
+		/* Create the new reference before releasing the old resources,
+		 * so that page->resources never points at a dropped object if
+		 * pdf_new_ref throws. */
+		ref = pdf_new_ref(doc, res);
+		pdf_drop_obj(page->resources);
+		page->resources = pdf_keep_obj(ref);
+		pdf_dict_puts(page->me, "Resources", ref);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(new_obj);
+		pdf_drop_obj(new_ref);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning page");
+	}
+}
diff --git a/source/pdf/pdf-interpret-imp.h b/source/pdf/pdf-interpret-imp.h
index 69a40147..314e6d2b 100644
--- a/source/pdf/pdf-interpret-imp.h
+++ b/source/pdf/pdf-interpret-imp.h
@@ -137,6 +137,7 @@ static inline void pdf_process_op(pdf_csi *csi, int op, const pdf_process *proce
process->processor->op_table[op](csi, process->state);
}
+/* Helper functions for the filter implementations to call */
void pdf_process_contents_object(pdf_csi *csi, pdf_obj *rdb, pdf_obj *contents);
void pdf_process_stream(pdf_csi *csi, pdf_lexbuf *buf);
@@ -145,10 +146,10 @@ pdf_process *pdf_process_run(pdf_process *process, fz_device *dev, const fz_matr
pdf_process *pdf_process_buffer(pdf_process *process, fz_context *ctx, fz_buffer *buffer);
pdf_process *pdf_process_filter(pdf_process *process, fz_context *ctx, pdf_process *underlying, pdf_obj *resources);
-/* Functions to actually use the pdf_process structures to process pages,
- * annotations and glyphs */
+/* Functions to actually use the pdf_process structures to process
+ * annotations, glyphs and general stream objects */
void pdf_process_annot(pdf_document *doc, pdf_page *page, pdf_annot *annot, const pdf_process *process, fz_cookie *cookie);
-void pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie);
void pdf_process_glyph(pdf_document *doc, pdf_obj *resources, fz_buffer *contents, pdf_process *process);
+void pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie);
#endif
diff --git a/source/pdf/pdf-interpret.c b/source/pdf/pdf-interpret.c
index 525d2ead..507bf0fa 100644
--- a/source/pdf/pdf-interpret.c
+++ b/source/pdf/pdf-interpret.c
@@ -606,7 +606,7 @@ pdf_process_contents_buffer(pdf_csi *csi, pdf_obj *rdb, fz_buffer *contents)
}
void
-pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie)
+pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie)
{
fz_context *ctx = doc->ctx;
pdf_csi *csi;
@@ -614,7 +614,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *
csi = pdf_new_csi(doc, cookie, process);
fz_try(ctx)
{
- csi->process.processor->process_contents(csi, csi->process.state, page->resources, page->contents);
+ csi->process.processor->process_contents(csi, csi->process.state, res, obj);
}
fz_always(ctx)
{
@@ -622,7 +622,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *
}
fz_catch(ctx)
{
- fz_rethrow_message(ctx, "cannot parse page content stream");
+ fz_rethrow_message(ctx, "cannot parse content stream");
}
}
diff --git a/source/pdf/pdf-run.c b/source/pdf/pdf-run.c
index a77dd50e..d52d8282 100644
--- a/source/pdf/pdf-run.c
+++ b/source/pdf/pdf-run.c
@@ -28,7 +28,7 @@ static void pdf_run_page_contents_with_usage(pdf_document *doc, pdf_page *page,
pdf_process_run(&process, dev, &local_ctm, event, NULL, 0);
- pdf_process_page_contents(doc, page, &process, cookie);
+ pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie);
if (page->transparency)
fz_end_group(dev);
diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c
index b5894421..f218429d 100644
--- a/source/pdf/pdf-write.c
+++ b/source/pdf/pdf-write.c
@@ -51,6 +51,7 @@ struct pdf_write_options_s
int do_expand;
int do_garbage;
int do_linear;
+ int do_clean;
int *use_list;
int *ofs_list;
int *gen_list;
@@ -2541,6 +2542,21 @@ static void complete_signatures(pdf_document *doc, pdf_write_options *opts, char
}
}
+/*
+ * Clean the content streams of every page in the document.
+ *
+ * Each page is loaded, passed to pdf_clean_page_contents with no cookie,
+ * and freed again. The page is freed even when cleaning throws; the
+ * error is then propagated to the caller.
+ */
+static void sanitise(pdf_document *doc)
+{
+	fz_context *ctx = doc->ctx;
+	int n = pdf_count_pages(doc);
+	int i;
+
+	for (i = 0; i < n; i++)
+	{
+		pdf_page *page = pdf_load_page(doc, i);
+
+		fz_try(ctx)
+		{
+			pdf_clean_page_contents(doc, page, NULL);
+		}
+		fz_always(ctx)
+		{
+			/* Release the page on both the success and error paths
+			 * so that a failed clean does not leak it. */
+			pdf_free_page(doc, page);
+		}
+		fz_catch(ctx)
+		{
+			fz_rethrow(ctx);
+		}
+	}
+}
+
void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_opts)
{
int lastfree;
@@ -2559,6 +2575,10 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_
doc->freeze_updates = 1;
ctx = doc->ctx;
+ /* Sanitise the operator streams */
+ if (fz_opts->do_clean)
+ sanitise(doc);
+
pdf_finish_edit(doc);
presize_unsaved_signature_byteranges(doc);
@@ -2585,6 +2605,7 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_
opts.do_garbage = fz_opts->do_garbage;
opts.do_ascii = fz_opts->do_ascii;
opts.do_linear = fz_opts->do_linear;
+ opts.do_clean = fz_opts->do_clean;
opts.start = 0;
opts.main_xref_offset = INT_MIN;
/* We deliberately make these arrays long enough to cope with