summary | refs | log | tree | commit | diff
path: root/source
diff options
context:
space:
mode:
author Tor Andersson <tor.andersson@artifex.com> 2017-12-01 15:28:32 +0100
committer Tor Andersson <tor.andersson@artifex.com> 2017-12-13 15:01:05 +0100
commit 634d64e47f82f23bc8cd7a05464a9defc66fee08 (patch)
tree 94c3ad2ff718875af9a1798828e9adff2ad7c4ba /source
parent c4d4d6cfcacdbeb277ab6624dc33d0b490024f9b (diff)
download mupdf-634d64e47f82f23bc8cd7a05464a9defc66fee08.tar.xz
Add 'clean' option to pdfclean to clean (but not sanitize) content streams.
This goes well with the 'mutool clean -d' decompression option to debug content streams, without doing the sanitize optimization pass.
Diffstat (limited to 'source')
-rw-r--r-- source/pdf/pdf-clean.c 57
-rw-r--r-- source/pdf/pdf-write.c 19
-rw-r--r-- source/tools/pdfclean.c 8
3 files changed, 53 insertions, 31 deletions
diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c
index 7fe0d6fe..0c9cb974 100644
--- a/source/pdf/pdf-clean.c
+++ b/source/pdf/pdf-clean.c
@@ -2,7 +2,9 @@
#include "mupdf/pdf.h"
static void
-pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res, int ascii, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg)
+pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res,
+ pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg,
+ int sanitize, int ascii)
{
pdf_processor *proc_buffer = NULL;
pdf_processor *proc_filter = NULL;
@@ -59,7 +61,7 @@ pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_ob
}
static void
-pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int ascii)
+pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int sanitize, int ascii)
{
pdf_processor *proc_buffer = NULL;
pdf_processor *proc_filter = NULL;
@@ -91,10 +93,16 @@ pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_
fz_try(ctx)
{
proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
- proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res);
-
- pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie);
- pdf_close_processor(ctx, proc_filter);
+ if (sanitize)
+ {
+ proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res);
+ pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie);
+ pdf_close_processor(ctx, proc_filter);
+ }
+ else
+ {
+ pdf_process_contents(ctx, proc_buffer, doc, orig_res, val, cookie);
+ }
pdf_close_processor(ctx, proc_buffer);
pdf_update_stream(ctx, doc, val, buffer, 0);
@@ -127,12 +135,14 @@ pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_
}
}
-void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int ascii)
+void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int sanitize, int ascii)
{
- pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, ascii);
+ pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, sanitize, ascii);
}
-void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg, int ascii)
+void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie,
+ pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg,
+ int sanitize, int ascii)
{
pdf_processor *proc_buffer = NULL;
pdf_processor *proc_filter = NULL;
@@ -162,10 +172,17 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page
resources = pdf_page_resources(ctx, page);
proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
- proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, proc_buffer, resources, res, text_filter, after_text, proc_arg);
+ if (sanitize)
+ {
+ proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, proc_buffer, resources, res, text_filter, after_text, proc_arg);
- pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie);
- pdf_close_processor(ctx, proc_filter);
+ pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie);
+ pdf_close_processor(ctx, proc_filter);
+ }
+ else
+ {
+ pdf_process_contents(ctx, proc_buffer, doc, resources, contents, cookie);
+ }
pdf_close_processor(ctx, proc_buffer);
/* Deal with page content stream. */
@@ -207,7 +224,7 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page
if (!o)
continue;
/* Transparency group XObject */
- pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, ascii, text_filter, after_text, proc_arg);
+ pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii);
}
}
@@ -223,7 +240,7 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page
if (!pat)
continue;
if (pdf_to_int(ctx, pdf_dict_get(ctx, pat, PDF_NAME_PatternType)) == 1)
- pdf_clean_stream_object(ctx, doc, pat, resources, cookie, 0, ascii, text_filter, after_text, proc_arg);
+ pdf_clean_stream_object(ctx, doc, pat, resources, cookie, 0, text_filter, after_text, proc_arg, sanitize, ascii);
}
}
@@ -239,7 +256,7 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page
if (!xobj)
continue;
if (pdf_name_eq(ctx, PDF_NAME_Form, pdf_dict_get(ctx, xobj, PDF_NAME_Subtype)))
- pdf_clean_stream_object(ctx, doc, xobj, resources, cookie, 1, ascii, text_filter, after_text, proc_arg);
+ pdf_clean_stream_object(ctx, doc, xobj, resources, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii);
}
}
@@ -255,7 +272,7 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page
if (!o)
continue;
if (pdf_name_eq(ctx, PDF_NAME_Type3, pdf_dict_get(ctx, o, PDF_NAME_Subtype)))
- pdf_clean_type3(ctx, doc, o, resources, cookie, ascii);
+ pdf_clean_type3(ctx, doc, o, resources, cookie, sanitize, ascii);
}
}
@@ -290,13 +307,13 @@ void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page
}
}
-void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int ascii)
+void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int sanitize, int ascii)
{
- pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, ascii);
+ pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, sanitize, ascii);
}
void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie,
- pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii)
+ pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int sanitize, int ascii)
{
pdf_obj *ap;
int i, n;
@@ -313,6 +330,6 @@ void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *an
if (v == NULL)
continue;
- pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, 1, text_filter, after_text, arg);
+ pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, text_filter, after_text, arg, sanitize, ascii);
}
}
diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c
index 154655e0..9fcdbf0a 100644
--- a/source/pdf/pdf-write.c
+++ b/source/pdf/pdf-write.c
@@ -2703,7 +2703,7 @@ static void complete_signatures(fz_context *ctx, pdf_document *doc, pdf_write_st
}
}
-static void sanitize(fz_context *ctx, pdf_document *doc, int ascii)
+static void clean_content_streams(fz_context *ctx, pdf_document *doc, int sanitize, int ascii)
{
int n = pdf_count_pages(ctx, doc);
int i;
@@ -2712,11 +2712,11 @@ static void sanitize(fz_context *ctx, pdf_document *doc, int ascii)
{
pdf_annot *annot;
pdf_page *page = pdf_load_page(ctx, doc, i);
- pdf_clean_page_contents(ctx, doc, page, NULL, NULL, NULL, ascii);
+ pdf_clean_page_contents(ctx, doc, page, NULL, NULL, NULL, sanitize, ascii);
for (annot = pdf_first_annot(ctx, page); annot != NULL; annot = pdf_next_annot(ctx, annot))
{
- pdf_clean_annot_contents(ctx, doc, annot, NULL, NULL, NULL, ascii);
+ pdf_clean_annot_contents(ctx, doc, annot, NULL, NULL, NULL, sanitize, ascii);
}
fz_drop_page(ctx, &page->super);
@@ -2787,7 +2787,8 @@ const char *fz_pdf_write_options_usage =
"\tascii: ASCII hex encode binary streams\n"
"\tpretty: pretty-print objects with indentation\n"
"\tlinearize: optimize for web browsers\n"
- "\tsanitize: clean up graphics commands in content streams\n"
+ "\tclean: pretty-print graphics commands in content streams\n"
+ "\tsanitize: sanitize graphics commands in content streams\n"
"\tgarbage: garbage collect unused objects\n"
"\tincremental: write changes as incremental update\n"
"\tcontinue-on-error: continue saving the document even if there is an error\n"
@@ -2816,8 +2817,10 @@ pdf_parse_write_options(fz_context *ctx, pdf_write_options *opts, const char *ar
opts->do_pretty = fz_option_eq(val, "yes");
if (fz_has_option(ctx, args, "linearize", &val))
opts->do_linear = fz_option_eq(val, "yes");
- if (fz_has_option(ctx, args, "sanitize", &val))
+ if (fz_has_option(ctx, args, "clean", &val))
opts->do_clean = fz_option_eq(val, "yes");
+ if (fz_has_option(ctx, args, "sanitize", &val))
+ opts->do_sanitize = fz_option_eq(val, "yes");
if (fz_has_option(ctx, args, "incremental", &val))
opts->do_incremental = fz_option_eq(val, "yes");
if (fz_has_option(ctx, args, "continue-on-error", &val))
@@ -2851,9 +2854,9 @@ prepare_for_save(fz_context *ctx, pdf_document *doc, pdf_write_options *in_opts)
{
doc->freeze_updates = 1;
- /* Sanitize the operator streams */
- if (in_opts->do_clean)
- sanitize(ctx, doc, in_opts->do_ascii);
+ /* Rewrite (and possibly sanitize) the operator streams */
+ if (in_opts->do_clean || in_opts->do_sanitize)
+ clean_content_streams(ctx, doc, in_opts->do_sanitize, in_opts->do_ascii);
pdf_finish_edit(ctx, doc);
presize_unsaved_signature_byteranges(ctx, doc);
diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c
index ff60bbb5..6d5a69be 100644
--- a/source/tools/pdfclean.c
+++ b/source/tools/pdfclean.c
@@ -31,7 +31,8 @@ static void usage(void)
"\t-z\tdeflate uncompressed streams\n"
"\t-f\tcompress font streams\n"
"\t-i\tcompress image streams\n"
- "\t-s\tclean content streams\n"
+ "\t-c\tclean content streams\n"
+ "\t-s\tsanitize content streams\n"
"\tpages\tcomma separated list of page numbers and ranges\n"
);
exit(1);
@@ -50,7 +51,7 @@ int pdfclean_main(int argc, char **argv)
opts.continue_on_error = 1;
opts.errors = &errors;
- while ((c = fz_getopt(argc, argv, "adfgilp:sz")) != -1)
+ while ((c = fz_getopt(argc, argv, "adfgilp:scz")) != -1)
{
switch (c)
{
@@ -63,7 +64,8 @@ int pdfclean_main(int argc, char **argv)
case 'a': opts.do_ascii += 1; break;
case 'g': opts.do_garbage += 1; break;
case 'l': opts.do_linear += 1; break;
- case 's': opts.do_clean += 1; break;
+ case 'c': opts.do_clean += 1; break;
+ case 's': opts.do_sanitize += 1; break;
default: usage(); break;
}
}