summaryrefslogtreecommitdiff
path: root/source/pdf/pdf-write.c
diff options
context:
space:
mode:
authorRobin Watts <robin.watts@artifex.com>2018-01-01 17:24:42 +0000
committerRobin Watts <robin.watts@artifex.com>2018-01-05 11:47:08 +0000
commit25593f4f9df0c4a9b9adaa84aaa33fe2a89087f6 (patch)
tree207c75e3a1bb4b05e83846762e3cf5fb030a0eed /source/pdf/pdf-write.c
parent1202a24a5b2729093545a89d013eaef1557a5fe9 (diff)
downloadmupdf-25593f4f9df0c4a9b9adaa84aaa33fe2a89087f6.tar.xz
Fix "being able to search for redacted text" bug.
A customer reports that even after text has been redacted, we can still search for the redacted text. The example file supplied had many instances of the word 'words', and 4 instances of 'apple'. The 'apple' instances were redacted, and the document was saved out. Two such instances were on the first page; when we searched for 'apple', Acrobat would find the word after the first removed instance of 'apple', then find the word two after the second removed instance of 'apple'. After much head scratching and cutting down of the file, it appears that the information genuinely isn't in the file; Acrobat is somehow remembering it. It appears to be doing this using the 'ID' entries in the trailer dict. My suspicion is that Acrobat has cached the text extraction from the original document, and is using this for all files that match the IDs. Change the IDs (or remove them) and the problem goes away. The spec says that the ID should be an array of two byte strings. The first is supposed to stay the same in all versions of a file (i.e. it identifies the *original* version of the file, and it is the one that is used by encryption). The second byte string is supposed to change more often, so here we simply generate a new random string each time the file is written.
Diffstat (limited to 'source/pdf/pdf-write.c')
-rw-r--r--source/pdf/pdf-write.c19
1 files changed, 19 insertions, 0 deletions
diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c
index 0580a4d2..f8b6a5d1 100644
--- a/source/pdf/pdf-write.c
+++ b/source/pdf/pdf-write.c
@@ -2864,6 +2864,23 @@ prepare_for_save(fz_context *ctx, pdf_document *doc, pdf_write_options *in_opts)
}
static void
+change_identity(fz_context *ctx, pdf_document *doc)
+{ /* Give the document a fresh second trailer ID string; the first ID string is left alone. */
+ pdf_obj *identity = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_ID);
+ pdf_obj *str;
+ unsigned char rnd[16];
+
+ if (pdf_array_len(ctx, identity) < 2) /* fewer than two ID entries: nothing to replace, trailer left untouched */
+ return;
+
+ /* Only the second ID string is replaced, using 16 bytes from fz_memrnd; the first is the one encryption uses, so it must stay stable. Properly recalculating the IDs is left for the future. */
+ fz_memrnd(ctx, rnd, 16);
+ str = pdf_new_string(ctx, doc, (char *)rnd, 16);
+ pdf_array_put_drop(ctx, identity, 1, str); /* index 1 is the second ID entry */
+
+}
+
+static void
do_pdf_save_document(fz_context *ctx, pdf_document *doc, pdf_write_state *opts, pdf_write_options *in_opts)
{
int lastfree;
@@ -2893,6 +2910,8 @@ do_pdf_save_document(fz_context *ctx, pdf_document *doc, pdf_write_state *opts,
{
pdf_ensure_solid_xref(ctx, doc, xref_len);
preloadobjstms(ctx, doc);
+
+ change_identity(ctx, doc);
}
/* Sweep & mark objects from the trailer */