From 90a289b18e2936bd2e585265964474df31c0dd5f Mon Sep 17 00:00:00 2001 From: Robin Watts Date: Wed, 9 May 2012 19:13:14 +0100 Subject: mupdfclean - update to allow renumbering of encrypted objects mupdfclean (or more correctly, the pdf_write function) currently has a limitation, in that we cannot renumber objects when encryption is being used. This is because the object/generation number is baked into the encryption of each stream, and renumbering the object causes the stream to become unreadable. The solution used here is to provide extended functions that take both the object/generation number and the original object/generation number. The original object numbers are only used for setting up the encryption. pdf_write now keeps track of the original object/generation number for each object. This fix is important if we ever want to output linearized PDF, as this requires us to be able to renumber objects into a very specific order. We also make a fix in removeduplicateobjs that should only matter in the case where we fail to read an object correctly. 
--- pdf/mupdf-internal.h | 7 +++++-- pdf/pdf_image.c | 4 +++- pdf/pdf_stream.c | 58 ++++++++++++++++++++++++++++++++++------------------ pdf/pdf_write.c | 56 +++++++++++++++++++++++++++++++++++++------------- 4 files changed, 88 insertions(+), 37 deletions(-) diff --git a/pdf/mupdf-internal.h b/pdf/mupdf-internal.h index 3396bb8a..e947cbbd 100644 --- a/pdf/mupdf-internal.h +++ b/pdf/mupdf-internal.h @@ -194,11 +194,14 @@ struct pdf_document_s void pdf_cache_object(pdf_document *doc, int num, int gen); fz_stream *pdf_open_inline_stream(pdf_document *doc, pdf_obj *stmobj, int length, fz_stream *chain, pdf_image_params *params); -fz_buffer *pdf_load_image_stream(pdf_document *doc, int num, int gen, pdf_image_params *params); -fz_stream *pdf_open_image_stream(pdf_document *doc, int num, int gen, pdf_image_params *params); +fz_buffer *pdf_load_image_stream(pdf_document *doc, int num, int gen, int orig_num, int orig_gen, pdf_image_params *params); +fz_stream *pdf_open_image_stream(pdf_document *doc, int num, int gen, int orig_num, int orig_gen, pdf_image_params *params); fz_stream *pdf_open_stream_with_offset(pdf_document *doc, int num, int gen, pdf_obj *dict, int stm_ofs); fz_stream *pdf_open_image_decomp_stream(fz_context *ctx, fz_buffer *, pdf_image_params *params, int *factor); fz_stream *pdf_open_contents_stream(pdf_document *xref, pdf_obj *obj); +fz_buffer *pdf_load_raw_renumbered_stream(pdf_document *doc, int num, int gen, int orig_num, int orig_gen); +fz_buffer *pdf_load_renumbered_stream(pdf_document *doc, int num, int gen, int orig_num, int orig_gen); +fz_stream *pdf_open_raw_renumbered_stream(pdf_document *doc, int num, int gen, int orig_num, int orig_gen); void pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf); void pdf_repair_obj_stms(pdf_document *doc); diff --git a/pdf/pdf_image.c b/pdf/pdf_image.c index f4ddbc72..b4571bbe 100644 --- a/pdf/pdf_image.c +++ b/pdf/pdf_image.c @@ -430,7 +430,9 @@ pdf_load_image_imp(pdf_document *xref, pdf_obj *rdb, pdf_obj 
*dict, fz_stream *c { /* Just load the compressed image data now and we can * decode it on demand. */ - image->buffer = pdf_load_image_stream(xref, pdf_to_num(dict), pdf_to_gen(dict), &image->params); + int num = pdf_to_num(dict); + int gen = pdf_to_gen(dict); + image->buffer = pdf_load_image_stream(xref, num, gen, num, gen, &image->params); break; /* Out of fz_try */ } diff --git a/pdf/pdf_stream.c b/pdf/pdf_stream.c index 3086fbc9..8cc755f2 100644 --- a/pdf/pdf_stream.c +++ b/pdf/pdf_stream.c @@ -222,8 +222,11 @@ build_filter_chain(fz_stream *chain, pdf_document *xref, pdf_obj *fs, pdf_obj *p /* * Build a filter for reading raw stream data. - * This is a null filter to constrain reading to the - * stream length, followed by a decryption filter. + * This is a null filter to constrain reading to the stream length (and to + * allow for other people accessing the file), followed by a decryption + * filter. + * + * num and gen are used purely to seed the encryption. */ static fz_stream * pdf_open_raw_filter(fz_stream *chain, pdf_document *xref, pdf_obj *stmobj, int num, int gen, int offset) @@ -302,14 +305,17 @@ pdf_open_inline_stream(pdf_document *xref, pdf_obj *stmobj, int length, fz_strea /* * Open a stream for reading the raw (compressed but decrypted) data. - * Using xref->file while this is open is a bad idea. 
*/ fz_stream * pdf_open_raw_stream(pdf_document *xref, int num, int gen) { - pdf_xref_entry *x; + return pdf_open_raw_renumbered_stream(xref, num, gen, num, gen); +} - fz_var(x); +fz_stream * +pdf_open_raw_renumbered_stream(pdf_document *xref, int num, int gen, int orig_num, int orig_gen) +{ + pdf_xref_entry *x; if (num < 0 || num >= xref->len) fz_throw(xref->ctx, "object id out of range (%d %d R)", num, gen); @@ -322,7 +328,7 @@ pdf_open_raw_stream(pdf_document *xref, int num, int gen) if (x->stm_ofs == 0) fz_throw(xref->ctx, "object is not a stream"); - return pdf_open_raw_filter(xref->file, xref, x->obj, num, gen, x->stm_ofs); + return pdf_open_raw_filter(xref->file, xref, x->obj, orig_num, orig_gen, x->stm_ofs); } /* @@ -333,11 +339,11 @@ pdf_open_raw_stream(pdf_document *xref, int num, int gen) fz_stream * pdf_open_stream(pdf_document *xref, int num, int gen) { - return pdf_open_image_stream(xref, num, gen, NULL); + return pdf_open_image_stream(xref, num, gen, num, gen, NULL); } fz_stream * -pdf_open_image_stream(pdf_document *xref, int num, int gen, pdf_image_params *params) +pdf_open_image_stream(pdf_document *xref, int num, int gen, int orig_num, int orig_gen, pdf_image_params *params) { pdf_xref_entry *x; @@ -352,7 +358,7 @@ pdf_open_image_stream(pdf_document *xref, int num, int gen, pdf_image_params *pa if (x->stm_ofs == 0) fz_throw(xref->ctx, "object is not a stream"); - return pdf_open_filter(xref->file, xref, x->obj, num, gen, x->stm_ofs, params); + return pdf_open_filter(xref->file, xref, x->obj, orig_num, orig_gen, x->stm_ofs, params); } fz_stream * @@ -413,6 +419,12 @@ pdf_open_stream_with_offset(pdf_document *xref, int num, int gen, pdf_obj *dict, */ fz_buffer * pdf_load_raw_stream(pdf_document *xref, int num, int gen) +{ + return pdf_load_raw_renumbered_stream(xref, num, gen, num, gen); +} + +fz_buffer * +pdf_load_raw_renumbered_stream(pdf_document *xref, int num, int gen, int orig_num, int orig_gen) { fz_stream *stm; pdf_obj *dict; @@ -426,7 
+438,7 @@ pdf_load_raw_stream(pdf_document *xref, int num, int gen) pdf_drop_obj(dict); - stm = pdf_open_raw_stream(xref, num, gen); + stm = pdf_open_raw_renumbered_stream(xref, num, gen, orig_num, orig_gen); /* RJW: "cannot open raw stream (%d %d R)", num, gen */ buf = fz_read_all(stm, len); @@ -458,11 +470,17 @@ pdf_guess_filter_length(int len, char *filter) fz_buffer * pdf_load_stream(pdf_document *xref, int num, int gen) { - return pdf_load_image_stream(xref, num, gen, NULL); + return pdf_load_image_stream(xref, num, gen, num, gen, NULL); } fz_buffer * -pdf_load_image_stream(pdf_document *xref, int num, int gen, pdf_image_params *params) +pdf_load_renumbered_stream(pdf_document *xref, int num, int gen, int orig_num, int orig_gen) +{ + return pdf_load_image_stream(xref, num, gen, orig_num, orig_gen, NULL); +} + +fz_buffer * +pdf_load_image_stream(pdf_document *xref, int num, int gen, int orig_num, int orig_gen, pdf_image_params *params) { fz_context *ctx = xref->ctx; fz_stream *stm = NULL; @@ -484,7 +502,7 @@ pdf_load_image_stream(pdf_document *xref, int num, int gen, pdf_image_params *pa pdf_drop_obj(dict); - stm = pdf_open_image_stream(xref, num, gen, params); + stm = pdf_open_image_stream(xref, num, gen, orig_num, orig_gen, params); /* RJW: "cannot open stream (%d %d R)", num, gen */ fz_try(ctx) @@ -535,16 +553,16 @@ fz_stream * pdf_open_contents_stream(pdf_document *xref, pdf_obj *obj) { fz_context *ctx = xref->ctx; + int num, gen; if (pdf_is_array(obj)) - { return pdf_open_object_array(xref, obj); - } - else if (pdf_is_stream(xref, pdf_to_num(obj), pdf_to_gen(obj))) - { - return pdf_open_image_stream(xref, pdf_to_num(obj), pdf_to_gen(obj), NULL); - } - fz_warn(ctx, "pdf object stream missing (%d %d R)", pdf_to_num(obj), pdf_to_gen(obj)); + num = pdf_to_num(obj); + gen = pdf_to_gen(obj); + if (pdf_is_stream(xref, num, gen)) + return pdf_open_image_stream(xref, num, gen, num, gen, NULL); + + fz_warn(ctx, "pdf object stream missing (%d %d R)", num, gen); 
return NULL; } diff --git a/pdf/pdf_write.c b/pdf/pdf_write.c index e14cfa28..e2086e5c 100644 --- a/pdf/pdf_write.c +++ b/pdf/pdf_write.c @@ -13,6 +13,8 @@ struct pdf_write_options_s int *ofslist; int *genlist; int *renumbermap; + int *revrenumbermap; + int *revgenlist; }; /* @@ -91,7 +93,7 @@ static void removeduplicateobjs(pdf_document *xref, pdf_write_options *opts) for (other = 1; other < num; other++) { pdf_obj *a, *b; - int match; + int differ, newnum; if (num == other || !opts->uselist[num] || !opts->uselist[other]) continue; @@ -104,14 +106,14 @@ static void removeduplicateobjs(pdf_document *xref, pdf_write_options *opts) */ fz_try(ctx) { - match = (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0)); + differ = (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0)); } fz_catch(ctx) { /* Assume different */ - match = 0; + differ = 1; } - if (match) + if (differ) continue; a = xref->table[num].obj; @@ -124,8 +126,10 @@ static void removeduplicateobjs(pdf_document *xref, pdf_write_options *opts) continue; /* Keep the lowest numbered object */ - opts->renumbermap[num] = MIN(num, other); - opts->renumbermap[other] = MIN(num, other); + newnum = MIN(num, other); + opts->renumbermap[num] = newnum; + opts->renumbermap[other] = newnum; + opts->revrenumbermap[newnum] = num; /* Either will do */ opts->uselist[MAX(num, other)] = 0; /* One duplicate was found, do not look for another */ @@ -136,6 +140,8 @@ static void removeduplicateobjs(pdf_document *xref, pdf_write_options *opts) /* * Renumber objects sequentially so the xref is more compact + * + * This code assumes that any opts->renumbermap[n] <= n for all n. 
*/ static void compactxref(pdf_document *xref, pdf_write_options *opts) @@ -152,10 +158,25 @@ static void compactxref(pdf_document *xref, pdf_write_options *opts) newnum = 1; for (num = 1; num < xref->len; num++) { - if (opts->uselist[num] && opts->renumbermap[num] == num) + /* If it's not used, map it to zero */ + if (!opts->uselist[num]) + { + opts->renumbermap[num] = 0; + } + /* If it's not moved, compact it. */ + else if (opts->renumbermap[num] == num) + { + opts->revrenumbermap[newnum] = opts->revrenumbermap[num]; + opts->revgenlist[newnum] = opts->revgenlist[num]; opts->renumbermap[num] = newnum++; - else if (opts->renumbermap[num] != num) + } + /* Otherwise it's used, and moved. We know that it must have + * moved down, so the place it's moved to will be in the right + * place already. */ + else + { opts->renumbermap[num] = opts->renumbermap[opts->renumbermap[num]]; + } } } @@ -384,8 +405,10 @@ static void copystream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj fz_buffer *buf, *tmp; pdf_obj *newlen; fz_context *ctx = xref->ctx; + int orig_num = opts->revrenumbermap[num]; + int orig_gen = opts->revgenlist[num]; - buf = pdf_load_raw_stream(xref, num, gen); + buf = pdf_load_raw_renumbered_stream(xref, num, gen, orig_num, orig_gen); if (opts->doascii && isbinarystream(buf)) { @@ -414,8 +437,10 @@ static void expandstream(pdf_document *xref, pdf_write_options *opts, pdf_obj *o fz_buffer *buf, *tmp; pdf_obj *newlen; fz_context *ctx = xref->ctx; + int orig_num = opts->revrenumbermap[num]; + int orig_gen = opts->revgenlist[num]; - buf = pdf_load_stream(xref, num, gen); + buf = pdf_load_renumbered_stream(xref, num, gen, orig_num, orig_gen); pdf_dict_dels(obj, "Filter"); pdf_dict_dels(obj, "DecodeParms"); @@ -580,6 +605,8 @@ void pdf_write(pdf_document *xref, char *filename, fz_write_options *fz_opts) opts.ofslist = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); opts.genlist = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); opts.renumbermap = 
fz_malloc_array(ctx, xref->len + 1, sizeof(int)); + opts.revrenumbermap = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); + opts.revgenlist = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); fprintf(opts.out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10); fprintf(opts.out, "%%\316\274\341\277\246\n\n"); @@ -589,6 +616,8 @@ void pdf_write(pdf_document *xref, char *filename, fz_write_options *fz_opts) opts.uselist[num] = 0; opts.ofslist[num] = 0; opts.renumbermap[num] = num; + opts.revrenumbermap[num] = num; + opts.revgenlist[num] = xref->table[num].gen; } /* Make sure any objects hidden in compressed streams have been loaded */ @@ -607,10 +636,7 @@ void pdf_write(pdf_document *xref, char *filename, fz_write_options *fz_opts) compactxref(xref, &opts); /* Make renumbering affect all indirect references and update xref */ - /* Do not renumber objects if encryption is in use, as the object - * numbers are baked into the streams/strings, and we can't currently - * cope with moving them. See bug 692627. */ - if (opts.dogarbage >= 2 && !xref->crypt) + if (opts.dogarbage >= 2) renumberobjs(xref, &opts); for (num = 0; num < xref->len; num++) @@ -653,6 +679,8 @@ void pdf_write(pdf_document *xref, char *filename, fz_write_options *fz_opts) fz_free(ctx, opts.ofslist); fz_free(ctx, opts.genlist); fz_free(ctx, opts.renumbermap); + fz_free(ctx, opts.revrenumbermap); + fz_free(ctx, opts.revgenlist); fclose(opts.out); } fz_catch(ctx) -- cgit v1.2.3