diff options
author | Robin Watts <robin.watts@artifex.com> | 2014-03-19 19:04:50 +0000 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2014-03-19 19:04:50 +0000 |
commit | e4d36ce68e0467ac4702f717386934a44970f4e5 (patch) | |
tree | 758a9999434f03a4607d1f23e43d9a58828b6444 /source/pdf | |
parent | 441954b6fb378e3af72500653be5636c7ade29ee (diff) | |
download | mupdf-e4d36ce68e0467ac4702f717386934a44970f4e5.tar.xz |
Add routine to clean pdf content streams for pages.
New routine to filter the content streams for pages, XObjects,
Type 3 charprocs, patterns, etc. The filtered streams are guaranteed
to have properly matched q/Q operators, and to leave the top-level
CTM unchanged. Additionally, we remove (some) repeated settings of
colors etc. This filtering can be extended to be smarter later.
The aim is both to repair streams after editing and to leave them
in a form that can easily be appended to.
This is in preparation for work on Bates numbering and watermarking.
Currently the streams produced are uncompressed.
Diffstat (limited to 'source/pdf')
-rw-r--r-- | source/pdf/pdf-annot.c | 2 | ||||
-rw-r--r-- | source/pdf/pdf-clean.c | 283 | ||||
-rw-r--r-- | source/pdf/pdf-interpret-imp.h | 7 | ||||
-rw-r--r-- | source/pdf/pdf-interpret.c | 6 | ||||
-rw-r--r-- | source/pdf/pdf-run.c | 2 | ||||
-rw-r--r-- | source/pdf/pdf-write.c | 21 |
6 files changed, 314 insertions, 7 deletions
diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c index e9702125..1fcbbcf4 100644 --- a/source/pdf/pdf-annot.c +++ b/source/pdf/pdf-annot.c @@ -66,6 +66,8 @@ pdf_parse_link_dest(pdf_document *doc, fz_link_kind kind, pdf_obj *dest) ld.ld.gotor.dest = NULL; dest = resolve_dest(doc, dest, kind); + if (dest == NULL) + fz_throw(doc->ctx, FZ_ERROR_GENERIC, "Undefined link_dest"); if (pdf_is_name(dest)) { diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c new file mode 100644 index 00000000..51a8b6de --- /dev/null +++ b/source/pdf/pdf-clean.c @@ -0,0 +1,283 @@ +#include "pdf-interpret-imp.h" + +static void +pdf_clean_stream_object(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res) +{ + fz_context *ctx = doc->ctx; + pdf_process process, process2; + fz_buffer *buffer; + int num; + pdf_obj *res = NULL; + pdf_obj *ref = NULL; + + if (!obj) + return; + + fz_var(res); + fz_var(ref); + + buffer = fz_new_buffer(ctx, 1024); + + fz_try(ctx) + { + if (own_res) + { + pdf_obj *r = pdf_dict_gets(obj, "Resources"); + if (r) + orig_res = r; + } + + res = pdf_new_dict(doc, 1); + + pdf_process_buffer(&process2, ctx, buffer); + pdf_process_filter(&process, ctx, &process2, res); + + pdf_process_stream_object(doc, obj, &process, orig_res, cookie); + + num = pdf_to_num(obj); + pdf_dict_dels(obj, "Filter"); + pdf_update_stream(doc, num, buffer); + + if (own_res) + { + ref = pdf_new_ref(doc, res); + pdf_dict_puts(obj, "Resources", ref); + } + } + fz_always(ctx) + { + fz_drop_buffer(ctx, buffer); + pdf_drop_obj(res); + pdf_drop_obj(ref); + } + fz_catch(ctx) + { + fz_rethrow_message(ctx, "Failed while cleaning xobject"); + } +} + +static void +pdf_clean_type3(pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie) +{ + fz_context *ctx = doc->ctx; + pdf_process process, process2; + fz_buffer *buffer; + int num, i, l; + pdf_obj *res = NULL; + pdf_obj *ref = NULL; + pdf_obj *charprocs; + + fz_var(res); + fz_var(ref); + + fz_try(ctx) 
+ { + res = pdf_dict_gets(obj, "Resources"); + if (res) + orig_res = res; + res = NULL; + + res = pdf_new_dict(doc, 1); + + charprocs = pdf_dict_gets(obj, "CharProcs"); + l = pdf_dict_len(charprocs); + + for (i = 0; i < l; i++) + { + pdf_obj *key = pdf_dict_get_key(charprocs, i); + pdf_obj *val = pdf_dict_get_val(charprocs, i); + + buffer = fz_new_buffer(ctx, 1024); + pdf_process_buffer(&process2, ctx, buffer); + pdf_process_filter(&process, ctx, &process2, res); + + pdf_process_stream_object(doc, val, &process, orig_res, cookie); + + num = pdf_to_num(val); + pdf_dict_dels(val, "Filter"); + pdf_update_stream(doc, num, buffer); + pdf_dict_put(charprocs, key, val); + fz_drop_buffer(ctx, buffer); + buffer = NULL; + } + + /* ProcSet - no cleaning possible. Inherit this from the old dict. */ + pdf_dict_puts(res, "ProcSet", pdf_dict_gets(orig_res, "ProcSet")); + + ref = pdf_new_ref(doc, res); + pdf_dict_puts(obj, "Resources", ref); + } + fz_always(ctx) + { + fz_drop_buffer(ctx, buffer); + pdf_drop_obj(res); + pdf_drop_obj(ref); + } + fz_catch(ctx) + { + fz_rethrow_message(ctx, "Failed while cleaning xobject"); + } +} + +void pdf_clean_page_contents(pdf_document *doc, pdf_page *page, fz_cookie *cookie) +{ + fz_context *ctx = doc->ctx; + pdf_process process, process2; + fz_buffer *buffer = fz_new_buffer(ctx, 1024); + int num; + pdf_obj *contents; + pdf_obj *new_obj = NULL; + pdf_obj *new_ref = NULL; + pdf_obj *res = NULL; + pdf_obj *ref = NULL; + pdf_obj *obj; + + fz_var(new_obj); + fz_var(new_ref); + fz_var(res); + fz_var(ref); + + fz_try(ctx) + { + res = pdf_new_dict(doc, 1); + + pdf_process_buffer(&process2, ctx, buffer); + pdf_process_filter(&process, ctx, &process2, res); + + pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie); + + contents = page->contents; + if (pdf_is_array(contents)) + { + int n = pdf_array_len(contents); + int i; + + for (i = n-1; i > 0; i--) + pdf_array_delete(contents, i); + /* We cannot rewrite the 0th entry of 
contents + * directly as it may occur in other pages content + * dictionaries too. We therefore clone it and make + * a new object reference. */ + new_obj = pdf_copy_dict(pdf_array_get(contents, 0)); + new_ref = pdf_new_ref(doc, new_obj); + num = pdf_to_num(new_ref); + pdf_array_put(contents, 0, new_ref); + pdf_dict_dels(new_obj, "Filter"); + } + else + { + num = pdf_to_num(contents); + pdf_dict_dels(contents, "Filter"); + } + pdf_update_stream(doc, num, buffer); + + /* Now deal with resources. The spec allows for Type3 fonts and form + * XObjects to omit a resource dictionary and look in the parent. + * Avoid that by flattening here as part of the cleaning. This could + * conceivably cause changes in rendering, but we don't care. */ + + /* ExtGState */ + obj = pdf_dict_gets(res, "ExtGState"); + if (obj) + { + int i, l; + + l = pdf_dict_len(obj); + for (i = 0; i < l; i++) + { + pdf_obj *o = pdf_dict_gets(pdf_dict_get_val(obj, i), "SMask"); + + if (!o) + continue; + o = pdf_dict_gets(o, "G"); + if (!o) + continue; + + /* Transparency group XObject */ + pdf_clean_stream_object(doc, o, page->resources, cookie, 1); + } + } + + /* ColorSpace - no cleaning possible */ + + /* Pattern */ + obj = pdf_dict_gets(res, "Pattern"); + if (obj) + { + int i, l; + + l = pdf_dict_len(obj); + for (i = 0; i < l; i++) + { + pdf_obj *pat = pdf_dict_get_val(obj, i); + + if (!pat) + continue; + if (pdf_to_int(pdf_dict_gets(pat, "PatternType")) == 1) + pdf_clean_stream_object(doc, pat, page->resources, cookie, 0); + } + } + + /* Shading - no cleaning possible */ + + /* XObject */ + obj = pdf_dict_gets(res, "XObject"); + if (obj) + { + int i, l; + + l = pdf_dict_len(obj); + for (i = 0; i < l; i++) + { + pdf_obj *xobj = pdf_dict_get_val(obj, i); + + if (strcmp(pdf_to_name(pdf_dict_gets(xobj, "Subtype")), "Form")) + continue; + + pdf_clean_stream_object(doc, xobj, page->resources, cookie, 1); + } + } + + /* Font */ + obj = pdf_dict_gets(res, "Font"); + if (obj) + { + int i, l; + + l = 
pdf_dict_len(obj); + for (i = 0; i < l; i++) + { + pdf_obj *o = pdf_dict_get_val(obj, i); + + if (!strcmp(pdf_to_name(pdf_dict_gets(o, "Subtype")), "Type3")) + { + pdf_clean_type3(doc, o, page->resources, cookie); + } + } + } + + /* ProcSet - no cleaning possible. Inherit this from the old dict. */ + obj = pdf_dict_gets(page->resources, "ProcSet"); + if (obj) + pdf_dict_puts(res, "ProcSet", obj); + + /* Properties - no cleaning possible. */ + + pdf_drop_obj(page->resources); + ref = pdf_new_ref(doc, res); + page->resources = pdf_keep_obj(ref); + pdf_dict_puts(page->me, "Resources", ref); + } + fz_always(ctx) + { + fz_drop_buffer(ctx, buffer); + pdf_drop_obj(new_obj); + pdf_drop_obj(new_ref); + pdf_drop_obj(res); + pdf_drop_obj(ref); + } + fz_catch(ctx) + { + fz_rethrow_message(ctx, "Failed while cleaning page"); + } +} diff --git a/source/pdf/pdf-interpret-imp.h b/source/pdf/pdf-interpret-imp.h index 69a40147..314e6d2b 100644 --- a/source/pdf/pdf-interpret-imp.h +++ b/source/pdf/pdf-interpret-imp.h @@ -137,6 +137,7 @@ static inline void pdf_process_op(pdf_csi *csi, int op, const pdf_process *proce process->processor->op_table[op](csi, process->state); } +/* Helper functions for the filter implementations to call */ void pdf_process_contents_object(pdf_csi *csi, pdf_obj *rdb, pdf_obj *contents); void pdf_process_stream(pdf_csi *csi, pdf_lexbuf *buf); @@ -145,10 +146,10 @@ pdf_process *pdf_process_run(pdf_process *process, fz_device *dev, const fz_matr pdf_process *pdf_process_buffer(pdf_process *process, fz_context *ctx, fz_buffer *buffer); pdf_process *pdf_process_filter(pdf_process *process, fz_context *ctx, pdf_process *underlying, pdf_obj *resources); -/* Functions to actually use the pdf_process structures to process pages, - * annotations and glyphs */ +/* Functions to actually use the pdf_process structures to process + * annotations, glyphs and general stream objects */ void pdf_process_annot(pdf_document *doc, pdf_page *page, pdf_annot *annot, const 
pdf_process *process, fz_cookie *cookie); -void pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie); void pdf_process_glyph(pdf_document *doc, pdf_obj *resources, fz_buffer *contents, pdf_process *process); +void pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie); #endif diff --git a/source/pdf/pdf-interpret.c b/source/pdf/pdf-interpret.c index 525d2ead..507bf0fa 100644 --- a/source/pdf/pdf-interpret.c +++ b/source/pdf/pdf-interpret.c @@ -606,7 +606,7 @@ pdf_process_contents_buffer(pdf_csi *csi, pdf_obj *rdb, fz_buffer *contents) } void -pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie) +pdf_process_stream_object(pdf_document *doc, pdf_obj *obj, const pdf_process *process, pdf_obj *res, fz_cookie *cookie) { fz_context *ctx = doc->ctx; pdf_csi *csi; @@ -614,7 +614,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process * csi = pdf_new_csi(doc, cookie, process); fz_try(ctx) { - csi->process.processor->process_contents(csi, csi->process.state, page->resources, page->contents); + csi->process.processor->process_contents(csi, csi->process.state, res, obj); } fz_always(ctx) { @@ -622,7 +622,7 @@ pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process * } fz_catch(ctx) { - fz_rethrow_message(ctx, "cannot parse page content stream"); + fz_rethrow_message(ctx, "cannot parse content stream"); } } diff --git a/source/pdf/pdf-run.c b/source/pdf/pdf-run.c index a77dd50e..d52d8282 100644 --- a/source/pdf/pdf-run.c +++ b/source/pdf/pdf-run.c @@ -28,7 +28,7 @@ static void pdf_run_page_contents_with_usage(pdf_document *doc, pdf_page *page, pdf_process_run(&process, dev, &local_ctm, event, NULL, 0); - pdf_process_page_contents(doc, page, &process, cookie); + pdf_process_stream_object(doc, page->contents, &process, page->resources, cookie); if 
(page->transparency) fz_end_group(dev); diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c index b5894421..f218429d 100644 --- a/source/pdf/pdf-write.c +++ b/source/pdf/pdf-write.c @@ -51,6 +51,7 @@ struct pdf_write_options_s int do_expand; int do_garbage; int do_linear; + int do_clean; int *use_list; int *ofs_list; int *gen_list; @@ -2541,6 +2542,21 @@ static void complete_signatures(pdf_document *doc, pdf_write_options *opts, char } } +static void sanitise(pdf_document *doc) +{ + int n = pdf_count_pages(doc); + int i; + + for (i = 0; i < n; i++) + { + pdf_page *page = pdf_load_page(doc, i); + + pdf_clean_page_contents(doc, page, NULL); + + pdf_free_page(doc, page); + } +} + void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_opts) { int lastfree; @@ -2559,6 +2575,10 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_ doc->freeze_updates = 1; ctx = doc->ctx; + /* Sanitise the operator streams */ + if (fz_opts->do_clean) + sanitise(doc); + pdf_finish_edit(doc); presize_unsaved_signature_byteranges(doc); @@ -2585,6 +2605,7 @@ void pdf_write_document(pdf_document *doc, char *filename, fz_write_options *fz_ opts.do_garbage = fz_opts->do_garbage; opts.do_ascii = fz_opts->do_ascii; opts.do_linear = fz_opts->do_linear; + opts.do_clean = fz_opts->do_clean; opts.start = 0; opts.main_xref_offset = INT_MIN; /* We deliberately make these arrays long enough to cope with |