From e4d36ce68e0467ac4702f717386934a44970f4e5 Mon Sep 17 00:00:00 2001
From: Robin Watts
Date: Wed, 19 Mar 2014 19:04:50 +0000
Subject: Add routine to clean pdf content streams for pages.

New routine to filter the content streams for pages, xobjects,
type3 charprocs, patterns etc. The filtered streams are guaranteed
to be properly matched with q/Q's, and to not have changed the top
level ctm. Additionally we remove (some) repeated settings of colors
etc. This filtering can be extended to be smarter later.

The idea of this is to both repair after editing, and to leave the
streams in a form that can be easily appended to.

This is preparatory to work on Bates numbering and Watermarking.

Currently the streams produced are uncompressed.
---
 source/pdf/pdf-annot.c         |   2 +
 source/pdf/pdf-clean.c         | 312 +++++++++++++++++++++++++++++++++++++++++
 source/pdf/pdf-interpret-imp.h |   7 +-
 source/pdf/pdf-interpret.c     |   6 +-
 source/pdf/pdf-run.c           |   2 +-
 source/pdf/pdf-write.c         |  33 ++++
 6 files changed, 355 insertions(+), 7 deletions(-)
 create mode 100644 source/pdf/pdf-clean.c

(limited to 'source/pdf')

diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c
index e9702125..1fcbbcf4 100644
--- a/source/pdf/pdf-annot.c
+++ b/source/pdf/pdf-annot.c
@@ -66,6 +66,8 @@ pdf_parse_link_dest(pdf_document *doc, fz_link_kind kind, pdf_obj *dest)
 	ld.ld.gotor.dest = NULL;
 
 	dest = resolve_dest(doc, dest, kind);
+	if (dest == NULL)
+		fz_throw(doc->ctx, FZ_ERROR_GENERIC, "Undefined link_dest");
 
 	if (pdf_is_name(dest))
 	{
diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c
new file mode 100644
index 00000000..51a8b6de
--- /dev/null
+++ b/source/pdf/pdf-clean.c
@@ -0,0 +1,312 @@
+#include "pdf-interpret-imp.h"
+
+/* Filter the content stream of a single stream object (a form XObject,
+ * transparency group or pattern) and store the filtered data as the
+ * object's new stream, removing any Filter entry. A NULL obj is ignored.
+ *
+ * The stream is interpreted against orig_res, unless own_res is non-zero
+ * and obj has a Resources entry of its own, which then takes precedence.
+ * With own_res set, obj's Resources are afterwards replaced by a
+ * reference to the dictionary that was handed to pdf_process_filter. */
+static void
+pdf_clean_stream_object(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	fz_buffer *buffer;
+	int num;
+	pdf_obj *res = NULL;
+	pdf_obj *ref = NULL;
+
+	if (!obj)
+		return;
+
+	fz_var(res);
+	fz_var(ref);
+
+	buffer = fz_new_buffer(ctx, 1024);
+
+	fz_try(ctx)
+	{
+		if (own_res)
+		{
+			pdf_obj *r = pdf_dict_gets(obj, "Resources");
+			if (r)
+				orig_res = r;
+		}
+
+		res = pdf_new_dict(doc, 1);
+
+		pdf_process_buffer(&process2, ctx, buffer);
+		pdf_process_filter(&process, ctx, &process2, res);
+
+		pdf_process_stream_object(doc, obj, &process, orig_res, cookie);
+
+		num = pdf_to_num(obj);
+		pdf_dict_dels(obj, "Filter");
+		pdf_update_stream(doc, num, buffer);
+
+		if (own_res)
+		{
+			ref = pdf_new_ref(doc, res);
+			pdf_dict_puts(obj, "Resources", ref);
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning xobject");
+	}
+}
+
+/* Filter every CharProcs stream of a Type3 font and point the font's
+ * Resources at the dictionary that was handed to pdf_process_filter.
+ * The streams are interpreted against the font's own Resources if it
+ * has any, and against orig_res otherwise. */
+static void
+pdf_clean_type3(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	fz_buffer *buffer = NULL;
+	int num, i, l;
+	pdf_obj *res = NULL;
+	pdf_obj *ref = NULL;
+	pdf_obj *charprocs;
+	pdf_obj *procset;
+
+	/* buffer is assigned inside the try block and dropped in the
+	 * always block, so it has to start out as NULL and be marked
+	 * with fz_var. Otherwise an early throw, or a font without any
+	 * CharProcs, would drop an uninitialised pointer. */
+	fz_var(buffer);
+	fz_var(res);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		res = pdf_dict_gets(obj, "Resources");
+		if (res)
+			orig_res = res;
+		res = NULL;
+
+		res = pdf_new_dict(doc, 1);
+
+		charprocs = pdf_dict_gets(obj, "CharProcs");
+		l = pdf_dict_len(charprocs);
+
+		for (i = 0; i < l; i++)
+		{
+			pdf_obj *key = pdf_dict_get_key(charprocs, i);
+			pdf_obj *val = pdf_dict_get_val(charprocs, i);
+
+			buffer = fz_new_buffer(ctx, 1024);
+			pdf_process_buffer(&process2, ctx, buffer);
+			pdf_process_filter(&process, ctx, &process2, res);
+
+			pdf_process_stream_object(doc, val, &process, orig_res, cookie);
+
+			num = pdf_to_num(val);
+			pdf_dict_dels(val, "Filter");
+			pdf_update_stream(doc, num, buffer);
+			pdf_dict_put(charprocs, key, val);
+			fz_drop_buffer(ctx, buffer);
+			buffer = NULL;
+		}
+
+		/* ProcSet - no cleaning possible. Inherit this from the old dict. */
+		procset = pdf_dict_gets(orig_res, "ProcSet");
+		if (procset)
+			pdf_dict_puts(res, "ProcSet", procset);
+
+		ref = pdf_new_ref(doc, res);
+		pdf_dict_puts(obj, "Resources", ref);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning type3 font");
+	}
+}
+
+/* Clean the content of a page. The page contents are filtered into a
+ * single stream without a Filter entry; the soft mask groups, type 1
+ * patterns, form XObjects and Type3 fonts listed in the filtered
+ * resources are then cleaned in turn; finally the page's Resources
+ * are replaced by a reference to the dictionary that was handed to
+ * pdf_process_filter. */
+void pdf_clean_page_contents(pdf_document *doc, pdf_page *page, fz_cookie *cookie)
+{
+	fz_context *ctx = doc->ctx;
+	pdf_process process, process2;
+	fz_buffer *buffer = fz_new_buffer(ctx, 1024);
+	int num;
+	pdf_obj *contents;
+	pdf_obj *new_obj = NULL;
+	pdf_obj *new_ref = NULL;
+	pdf_obj *res = NULL;
+	pdf_obj *ref = NULL;
+	pdf_obj *obj;
+
+	fz_var(new_obj);
+	fz_var(new_ref);
+	fz_var(res);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		res = pdf_new_dict(doc, 1);
+
+		pdf_process_buffer(&process2, ctx, buffer);
+		pdf_process_filter(&process, ctx, &process2, res);
+
+		pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie);
+
+		contents = page->contents;
+		if (pdf_is_array(contents))
+		{
+			int n = pdf_array_len(contents);
+			int i;
+
+			for (i = n-1; i > 0; i--)
+				pdf_array_delete(contents, i);
+			/* We cannot rewrite the 0th entry of contents
+			 * directly as it may occur in other pages content
+			 * dictionaries too. We therefore clone it and make
+			 * a new object reference. */
+			new_obj = pdf_copy_dict(pdf_array_get(contents, 0));
+			new_ref = pdf_new_ref(doc, new_obj);
+			num = pdf_to_num(new_ref);
+			pdf_array_put(contents, 0, new_ref);
+			pdf_dict_dels(new_obj, "Filter");
+		}
+		else
+		{
+			num = pdf_to_num(contents);
+			pdf_dict_dels(contents, "Filter");
+		}
+		pdf_update_stream(doc, num, buffer);
+
+		/* Now deal with resources. The spec allows for Type3 fonts and form
+		 * XObjects to omit a resource dictionary and look in the parent.
+		 * Avoid that by flattening here as part of the cleaning. This could
+		 * conceivably cause changes in rendering, but we don't care. */
+
+		/* ExtGState */
+		obj = pdf_dict_gets(res, "ExtGState");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *o = pdf_dict_gets(pdf_dict_get_val(obj, i), "SMask");
+
+				if (!o)
+					continue;
+				o = pdf_dict_gets(o, "G");
+				if (!o)
+					continue;
+
+				/* Transparency group XObject */
+				pdf_clean_stream_object(doc, o, page->resources, cookie, 1);
+			}
+		}
+
+		/* ColorSpace - no cleaning possible */
+
+		/* Pattern */
+		obj = pdf_dict_gets(res, "Pattern");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *pat = pdf_dict_get_val(obj, i);
+
+				if (!pat)
+					continue;
+				if (pdf_to_int(pdf_dict_gets(pat, "PatternType")) == 1)
+					pdf_clean_stream_object(doc, pat, page->resources, cookie, 0);
+			}
+		}
+
+		/* Shading - no cleaning possible */
+
+		/* XObject */
+		obj = pdf_dict_gets(res, "XObject");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *xobj = pdf_dict_get_val(obj, i);
+
+				if (strcmp(pdf_to_name(pdf_dict_gets(xobj, "Subtype")), "Form"))
+					continue;
+
+				pdf_clean_stream_object(doc, xobj, page->resources, cookie, 1);
+			}
+		}
+
+		/* Font */
+		obj = pdf_dict_gets(res, "Font");
+		if (obj)
+		{
+			int i, l;
+
+			l = pdf_dict_len(obj);
+			for (i = 0; i < l; i++)
+			{
+				pdf_obj *o = pdf_dict_get_val(obj, i);
+
+				if (!strcmp(pdf_to_name(pdf_dict_gets(o, "Subtype")), "Type3"))
+				{
+					pdf_clean_type3(doc, o, page->resources, cookie);
+				}
+			}
+		}
+
+		/* ProcSet - no cleaning possible. Inherit this from the old dict. */
+		obj = pdf_dict_gets(page->resources, "ProcSet");
+		if (obj)
+			pdf_dict_puts(res, "ProcSet", obj);
+
+		/* Properties - no cleaning possible. */
+
+		/* Create the new reference before letting go of the old
+		 * resources, so that a failure here cannot leave
+		 * page->resources pointing at a dropped object. */
+		ref = pdf_new_ref(doc, res);
+		pdf_drop_obj(page->resources);
+		page->resources = pdf_keep_obj(ref);
+		pdf_dict_puts(page->me, "Resources", ref);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		pdf_drop_obj(new_obj);
+		pdf_drop_obj(new_ref);
+		pdf_drop_obj(res);
+		pdf_drop_obj(ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow_message(ctx, "Failed while cleaning page");
+	}
+}
diff --git a/source/pdf/pdf-interpret-imp.h b/source/pdf/pdf-interpret-imp.h
index 69a40147..314e6d2b 100644
--- a/source/pdf/pdf-interpret-imp.h
+++ b/source/pdf/pdf-interpret-imp.h
@@ -137,6 +137,7 @@ static inline void pdf_process_op(pdf_csi *csi, int op, const pdf_process *proce
 	process->processor->op_table[op](csi, process->state);
 }
 
+/* Helper functions for the filter implementations to call */
 void pdf_process_contents_object(pdf_csi *csi, pdf_obj *rdb, pdf_obj *contents);
 void pdf_process_stream(pdf_csi *csi, pdf_lexbuf *buf);
 
@@ -145,10 +146,10 @@ pdf_process *pdf_process_run(pdf_process *process, fz_device *dev, const fz_matr
 pdf_process *pdf_process_buffer(pdf_process *process, fz_context *ctx, fz_buffer *buffer);
 pdf_process *pdf_process_filter(pdf_process *process, fz_context *ctx, pdf_process *underlying, pdf_obj *resources);
 
-/* Functions to actually use the pdf_process structures to process pages,
- * annotations and glyphs */
+/* Functions to actually use the pdf_process structures to process
+ * annotations, glyphs and general stream objects */
 void pdf_process_annot(pdf_document *doc, pdf_page *page, pdf_annot *annot, const pdf_process *process, fz_cookie *cookie);
-void pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie);
 void pdf_process_glyph(pdf_document *doc, pdf_obj *resources, fz_buffer *contents, pdf_process *process);
+void pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie);
 
 #endif
diff --git a/source/pdf/pdf-interpret.c b/source/pdf/pdf-interpret.c
index 525d2ead..507bf0fa 100644
--- a/source/pdf/pdf-interpret.c
+++ b/source/pdf/pdf-interpret.c
@@ -606,7 +606,7 @@ pdf_process_contents_buffer(pdf_csi *csi, pdf_obj *rdb, fz_buffer *contents)
 }
 
 void
-pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie)
+pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie)
 {
 	fz_context *ctx = doc->ctx;
 	pdf_csi *csi;
@@ -614,7 +614,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *
 	csi = pdf_new_csi(doc, cookie, process);
 	fz_try(ctx)
 	{
-		csi->process.processor->process_contents(csi, csi->process.state, page->resources, page->contents);
+		csi->process.processor->process_contents(csi, csi->process.state, res, obj);
 	}
 	fz_always(ctx)
 	{
@@ -622,7 +622,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *
 	}
 	fz_catch(ctx)
 	{
-		fz_rethrow_message(ctx, "cannot parse page content stream");
+		fz_rethrow_message(ctx, "cannot parse content stream");
 	}
 }
 
diff --git a/source/pdf/pdf-run.c b/source/pdf/pdf-run.c
index a77dd50e..d52d8282 100644
--- a/source/pdf/pdf-run.c
+++ b/source/pdf/pdf-run.c
@@ -28,7 +28,7 @@ static void pdf_run_page_contents_with_usage(pdf_document *doc, pdf_page *page,
 
 	pdf_process_run(&process, dev, &local_ctm, event, NULL, 0);
 
-	pdf_process_page_contents(doc, page, &process, cookie);
+	pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie);
 
 	if (page->transparency)
 		fz_end_group(dev);
diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c
index b5894421..f218429d 100644
--- a/source/pdf/pdf-write.c
+++ b/source/pdf/pdf-write.c
@@ -51,6 +51,7 @@ struct pdf_write_options_s
 	int do_expand;
 	int do_garbage;
 	int do_linear;
+	int do_clean;
 	int *use_list;
 	int *ofs_list;
 	int *gen_list;
@@ -2541,6 +2542,33 @@ static void complete_signatures(pdf_document *doc, pdf_write_options *opts, char
 	}
 }
 
+/* Run the content stream cleaner over every page of the document.
+ * Each page is freed again even if cleaning it throws. */
+static void sanitise(pdf_document *doc)
+{
+	fz_context *ctx = doc->ctx;
+	int n = pdf_count_pages(doc);
+	int i;
+
+	for (i = 0; i < n; i++)
+	{
+		pdf_page *page = pdf_load_page(doc, i);
+
+		fz_try(ctx)
+		{
+			pdf_clean_page_contents(doc, page, NULL);
+		}
+		fz_always(ctx)
+		{
+			pdf_free_page(doc, page);
+		}
+		fz_catch(ctx)
+		{
+			fz_rethrow(ctx);
+		}
+	}
+}
+
 void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_opts)
 {
 	int lastfree;
@@ -2559,6 +2587,10 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_
 	doc->freeze_updates = 1;
 	ctx = doc->ctx;
 
+	/* Sanitise the operator streams */
+	if (fz_opts->do_clean)
+		sanitise(doc);
+
 	pdf_finish_edit(doc);
 	presize_unsaved_signature_byteranges(doc);
 
@@ -2585,6 +2617,7 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_
 	opts.do_garbage = fz_opts->do_garbage;
 	opts.do_ascii = fz_opts->do_ascii;
 	opts.do_linear = fz_opts->do_linear;
+	opts.do_clean = fz_opts->do_clean;
 	opts.start = 0;
 	opts.main_xref_offset = INT_MIN;
 	/* We deliberately make these arrays long enough to cope with
-- 
cgit v1.2.3