diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2017-12-01 15:28:32 +0100 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2017-12-13 15:01:05 +0100 |
commit | 634d64e47f82f23bc8cd7a05464a9defc66fee08 (patch) | |
tree | 94c3ad2ff718875af9a1798828e9adff2ad7c4ba | |
parent | c4d4d6cfcacdbeb277ab6624dc33d0b490024f9b (diff) | |
download | mupdf-634d64e47f82f23bc8cd7a05464a9defc66fee08.tar.xz |
Add 'clean' option to pdfclean to clean (but not sanitize) content streams.
This goes well with the 'mutool clean -d' decompression option to debug
content streams, without doing the sanitize optimization pass.
-rw-r--r-- | include/mupdf/pdf/document.h | 4 | ||||
-rw-r--r-- | include/mupdf/pdf/page.h | 10 | ||||
-rw-r--r-- | source/pdf/pdf-clean.c | 57 | ||||
-rw-r--r-- | source/pdf/pdf-write.c | 19 | ||||
-rw-r--r-- | source/tools/pdfclean.c | 8 |
5 files changed, 62 insertions, 36 deletions
diff --git a/include/mupdf/pdf/document.h b/include/mupdf/pdf/document.h index 30cfba48..63f94bcd 100644 --- a/include/mupdf/pdf/document.h +++ b/include/mupdf/pdf/document.h @@ -844,7 +844,8 @@ struct pdf_write_options_s int do_decompress; /* Decompress streams (except when compressing images/fonts). */ int do_garbage; /* Garbage collect objects before saving; 1=gc, 2=re-number, 3=de-duplicate. */ int do_linear; /* Write linearised. */ - int do_clean; /* Sanitize content streams. */ + int do_clean; /* Clean content streams. */ + int do_sanitize; /* Sanitize content streams. */ int continue_on_error; /* If set, errors are (optionally) counted and writing continues. */ int *errors; /* Pointer to a place to store a count of errors */ }; @@ -857,6 +858,7 @@ struct pdf_write_options_s l: linearize a: ascii hex encode z: deflate + c: clean content streams s: sanitize content streams */ pdf_write_options *pdf_parse_write_options(fz_context *ctx, pdf_write_options *opts, const char *args); diff --git a/include/mupdf/pdf/page.h b/include/mupdf/pdf/page.h index f904a7c1..bd0cec9d 100644 --- a/include/mupdf/pdf/page.h +++ b/include/mupdf/pdf/page.h @@ -141,7 +141,7 @@ typedef void (pdf_page_contents_process_fn)(fz_context *ctx, fz_buffer *buffer, to track progress, collect errors etc. */ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, - pdf_page_contents_process_fn *proc, void *proc_arg, int ascii); + pdf_page_contents_process_fn *proc, void *proc_arg, int sanitize, int ascii); /* pdf_clean_annot_contents: Clean a loaded annotations rendering operations, @@ -168,7 +168,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, to track progress, collect errors etc. 
*/ void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, - pdf_page_contents_process_fn *proc, void *proc_arg, int ascii); + pdf_page_contents_process_fn *proc, void *proc_arg, int sanitize, int ascii); /* pdf_filter_page_contents: Performs the same task as @@ -184,7 +184,8 @@ void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *ann arg: Opaque value to be passed to callback functions. */ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, - pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii); + pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, + int sanitize, int ascii); /* pdf_filter_annot_contents: Performs the same task as @@ -200,7 +201,8 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page arg: Opaque value to be passed to callback functions. */ void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, - pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii); + pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, + int sanitize, int ascii); /* Presentation interface. 
diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c index 7fe0d6fe..0c9cb974 100644 --- a/source/pdf/pdf-clean.c +++ b/source/pdf/pdf-clean.c @@ -2,7 +2,9 @@ #include "mupdf/pdf.h" static void -pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res, int ascii, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg) +pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res, + pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, + int sanitize, int ascii) { pdf_processor *proc_buffer = NULL; pdf_processor *proc_filter = NULL; @@ -59,7 +61,7 @@ pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_ob } static void -pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int ascii) +pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int sanitize, int ascii) { pdf_processor *proc_buffer = NULL; pdf_processor *proc_filter = NULL; @@ -91,10 +93,16 @@ pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_ fz_try(ctx) { proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii); - proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res); - - pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie); - pdf_close_processor(ctx, proc_filter); + if (sanitize) + { + proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res); + pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie); + pdf_close_processor(ctx, proc_filter); + } + else + { + pdf_process_contents(ctx, proc_buffer, doc, orig_res, val, cookie); /* no filter when not sanitizing; proc_filter is still NULL here */ + } pdf_close_processor(ctx, proc_buffer); pdf_update_stream(ctx, doc, val, buffer, 0); @@ -127,12 +135,14 @@ pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj 
*obj, pdf_obj *orig_ } } -void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int ascii) +void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int sanitize, int ascii) { - pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, ascii); + pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, sanitize, ascii); } -void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg, int ascii) +void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, + pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg, + int sanitize, int ascii) { pdf_processor *proc_buffer = NULL; pdf_processor *proc_filter = NULL; @@ -162,10 +172,17 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page resources = pdf_page_resources(ctx, page); proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii); - proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, proc_buffer, resources, res, text_filter, after_text, proc_arg); + if (sanitize) + { + proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, proc_buffer, resources, res, text_filter, after_text, proc_arg); - pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie); - pdf_close_processor(ctx, proc_filter); + pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie); + pdf_close_processor(ctx, proc_filter); + } + else + { + pdf_process_contents(ctx, proc_buffer, doc, resources, contents, cookie); + } pdf_close_processor(ctx, proc_buffer); /* Deal with page 
content stream. */ @@ -207,7 +224,7 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page if (!o) continue; /* Transparency group XObject */ - pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, ascii, text_filter, after_text, proc_arg); + pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii); } } @@ -223,7 +240,7 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page if (!pat) continue; if (pdf_to_int(ctx, pdf_dict_get(ctx, pat, PDF_NAME_PatternType)) == 1) - pdf_clean_stream_object(ctx, doc, pat, resources, cookie, 0, ascii, text_filter, after_text, proc_arg); + pdf_clean_stream_object(ctx, doc, pat, resources, cookie, 0, text_filter, after_text, proc_arg, sanitize, ascii); } } @@ -239,7 +256,7 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page if (!xobj) continue; if (pdf_name_eq(ctx, PDF_NAME_Form, pdf_dict_get(ctx, xobj, PDF_NAME_Subtype))) - pdf_clean_stream_object(ctx, doc, xobj, resources, cookie, 1, ascii, text_filter, after_text, proc_arg); + pdf_clean_stream_object(ctx, doc, xobj, resources, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii); } } @@ -255,7 +272,7 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page if (!o) continue; if (pdf_name_eq(ctx, PDF_NAME_Type3, pdf_dict_get(ctx, o, PDF_NAME_Subtype))) - pdf_clean_type3(ctx, doc, o, resources, cookie, ascii); + pdf_clean_type3(ctx, doc, o, resources, cookie, sanitize, ascii); } } @@ -290,13 +307,13 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page } } -void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int ascii) +void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void 
*proc_arg, int sanitize, int ascii) { - pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, ascii); + pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, sanitize, ascii); } void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, - pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii) + pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int sanitize, int ascii) { pdf_obj *ap; int i, n; @@ -313,6 +330,6 @@ void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *an if (v == NULL) continue; - pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, 1, text_filter, after_text, arg); + pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, text_filter, after_text, arg, sanitize, ascii); } } diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c index 154655e0..9fcdbf0a 100644 --- a/source/pdf/pdf-write.c +++ b/source/pdf/pdf-write.c @@ -2703,7 +2703,7 @@ static void complete_signatures(fz_context *ctx, pdf_document *doc, pdf_write_st } } -static void sanitize(fz_context *ctx, pdf_document *doc, int ascii) +static void clean_content_streams(fz_context *ctx, pdf_document *doc, int sanitize, int ascii) { int n = pdf_count_pages(ctx, doc); int i; @@ -2712,11 +2712,11 @@ static void sanitize(fz_context *ctx, pdf_document *doc, int ascii) { pdf_annot *annot; pdf_page *page = pdf_load_page(ctx, doc, i); - pdf_clean_page_contents(ctx, doc, page, NULL, NULL, NULL, ascii); + pdf_clean_page_contents(ctx, doc, page, NULL, NULL, NULL, sanitize, ascii); for (annot = pdf_first_annot(ctx, page); annot != NULL; annot = pdf_next_annot(ctx, annot)) { - pdf_clean_annot_contents(ctx, doc, annot, NULL, NULL, NULL, ascii); + pdf_clean_annot_contents(ctx, doc, annot, NULL, NULL, NULL, sanitize, ascii); } fz_drop_page(ctx, 
&page->super); @@ -2787,7 +2787,8 @@ const char *fz_pdf_write_options_usage = "\tascii: ASCII hex encode binary streams\n" "\tpretty: pretty-print objects with indentation\n" "\tlinearize: optimize for web browsers\n" - "\tsanitize: clean up graphics commands in content streams\n" + "\tclean: pretty-print graphics commands in content streams\n" + "\tsanitize: sanitize graphics commands in content streams\n" "\tgarbage: garbage collect unused objects\n" "\tincremental: write changes as incremental update\n" "\tcontinue-on-error: continue saving the document even if there is an error\n" @@ -2816,8 +2817,10 @@ pdf_parse_write_options(fz_context *ctx, pdf_write_options *opts, const char *ar opts->do_pretty = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "linearize", &val)) opts->do_linear = fz_option_eq(val, "yes"); - if (fz_has_option(ctx, args, "sanitize", &val)) + if (fz_has_option(ctx, args, "clean", &val)) opts->do_clean = fz_option_eq(val, "yes"); + if (fz_has_option(ctx, args, "sanitize", &val)) + opts->do_sanitize = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "incremental", &val)) opts->do_incremental = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "continue-on-error", &val)) @@ -2851,9 +2854,9 @@ prepare_for_save(fz_context *ctx, pdf_document *doc, pdf_write_options *in_opts) { doc->freeze_updates = 1; - /* Sanitize the operator streams */ - if (in_opts->do_clean) - sanitize(ctx, doc, in_opts->do_ascii); + /* Rewrite (and possibly sanitize) the operator streams */ + if (in_opts->do_clean || in_opts->do_sanitize) + clean_content_streams(ctx, doc, in_opts->do_sanitize, in_opts->do_ascii); pdf_finish_edit(ctx, doc); presize_unsaved_signature_byteranges(ctx, doc); diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c index ff60bbb5..6d5a69be 100644 --- a/source/tools/pdfclean.c +++ b/source/tools/pdfclean.c @@ -31,7 +31,8 @@ static void usage(void) "\t-z\tdeflate uncompressed streams\n" "\t-f\tcompress font streams\n" 
"\t-i\tcompress image streams\n" - "\t-s\tclean content streams\n" + "\t-c\tclean content streams\n" + "\t-s\tsanitize content streams\n" "\tpages\tcomma separated list of page numbers and ranges\n" ); exit(1); @@ -50,7 +51,7 @@ int pdfclean_main(int argc, char **argv) opts.continue_on_error = 1; opts.errors = &errors; - while ((c = fz_getopt(argc, argv, "adfgilp:sz")) != -1) + while ((c = fz_getopt(argc, argv, "adfgilp:scz")) != -1) { switch (c) { @@ -63,7 +64,8 @@ int pdfclean_main(int argc, char **argv) case 'a': opts.do_ascii += 1; break; case 'g': opts.do_garbage += 1; break; case 'l': opts.do_linear += 1; break; - case 's': opts.do_clean += 1; break; + case 'c': opts.do_clean += 1; break; + case 's': opts.do_sanitize += 1; break; default: usage(); break; } } |