diff options
author | Robin Watts <robin.watts@artifex.com> | 2015-09-30 19:10:45 +0100 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2015-10-01 11:11:10 +0100 |
commit | 841398e7693a511df66dd0f1b33d9604371d53e0 (patch) | |
tree | 9543c6591a5fd856da2b2c52d6c7ed970a697ca8 /source/pdf | |
parent | e6a6cccd2295ad09fc569debda49ebcef8b2cd92 (diff) | |
download | mupdf-841398e7693a511df66dd0f1b33d9604371d53e0.tar.xz |
Bug 696146: Improve pdf_repair to find /Root in new style XRefs.
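The current code never looks for /Root entries in the dictionaries
it parses during repair. This means that 'new style' files (those
whose trailer information lives in a /Type /XRef dictionary rather
than a classic trailer) end up without any Roots after repair.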
The new code therefore updates pdf_repair_obj to look for /Root
entries in the same way it looks for /Encrypt and /ID entries.
Any Root found this way is added to the list of found roots.
The Root object almost certainly has indirections within it, so
it is vital that the 'doc' pointer gets set on the parsed objects.
This means we have to make a slight adjustment to pdf_repair_obj
so that the dict is parsed with a doc pointer. In turn, this means
we need to manually ensure that none of the other information read
from the dict during the repair operation will cause indirections
to be resolved. This is achieved by checking !pdf_is_indirect
before inspecting values looked up from the dict.
Diffstat (limited to 'source/pdf')
-rw-r--r-- | source/pdf/pdf-repair.c | 89 | ||||
-rw-r--r-- | source/pdf/pdf-xref.c | 4 |
2 files changed, 65 insertions, 28 deletions
diff --git a/source/pdf/pdf-repair.c b/source/pdf/pdf-repair.c index 6e62bf00..fdd46483 100644 --- a/source/pdf/pdf-repair.c +++ b/source/pdf/pdf-repair.c @@ -14,8 +14,21 @@ struct entry int stm_len; }; +static void add_root(fz_context *ctx, pdf_obj *obj, pdf_obj ***roots, int *num_roots, int *max_roots) +{ + if (*num_roots == *max_roots) + { + int new_max_roots = *max_roots * 2; + if (new_max_roots == 0) + new_max_roots = 4; + *roots = fz_resize_array(ctx, *roots, new_max_roots, sizeof(**roots)); + *max_roots = new_max_roots; + } + (*roots)[(*num_roots)++] = pdf_keep_obj(ctx, obj); +} + int -pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs) +pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, fz_off_t *tmpofs, pdf_obj **root) { fz_stream *file = doc->file; pdf_token tok; @@ -37,10 +50,9 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st { pdf_obj *dict, *obj; - /* Send NULL xref so we don't try to resolve references */ fz_try(ctx) { - dict = pdf_parse_dict(ctx, NULL, file, buf); + dict = pdf_parse_dict(ctx, doc, file, buf); } fz_catch(ctx) { @@ -52,24 +64,39 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st dict = pdf_new_dict(ctx, NULL, 2); } - if (encrypt && id) + /* We must be careful not to try to resolve any indirections + * here. We have just read dict, so we know it to be a non + * indirected dictionary. Before we look at any values that + * we get back from looking up in it, we need to check they + * aren't indirected. 
*/ + + if (encrypt || id || root) { obj = pdf_dict_get(ctx, dict, PDF_NAME_Type); - if (pdf_name_eq(ctx, obj, PDF_NAME_XRef)) + if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_XRef)) { - obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt); - if (obj) + if (encrypt) { - pdf_drop_obj(ctx, *encrypt); - *encrypt = pdf_keep_obj(ctx, obj); + obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt); + if (obj) + { + pdf_drop_obj(ctx, *encrypt); + *encrypt = pdf_keep_obj(ctx, obj); + } } - obj = pdf_dict_get(ctx, dict, PDF_NAME_ID); - if (obj) + if (id) { - pdf_drop_obj(ctx, *id); - *id = pdf_keep_obj(ctx, obj); + obj = pdf_dict_get(ctx, dict, PDF_NAME_ID); + if (obj) + { + pdf_drop_obj(ctx, *id); + *id = pdf_keep_obj(ctx, obj); + } } + + if (root) + *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Root)); } } @@ -80,7 +107,7 @@ pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, fz_off_t *st if (doc->file_reading_linearly && page) { obj = pdf_dict_get(ctx, dict, PDF_NAME_Type); - if (pdf_name_eq(ctx, obj, PDF_NAME_Page)) + if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME_Page)) { pdf_drop_obj(ctx, *page); *page = pdf_keep_obj(ctx, dict); @@ -343,11 +370,19 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc) else if (tok == PDF_TOK_OBJ) { + pdf_obj *root = NULL; + fz_try(ctx) { stm_len = 0; stm_ofs = 0; - tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs); + tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root); + if (root) + add_root(ctx, root, &roots, &num_roots, &max_roots); + } + fz_always(ctx) + { + pdf_drop_obj(ctx, root); } fz_catch(ctx) { @@ -423,17 +458,7 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc) obj = pdf_dict_get(ctx, dict, PDF_NAME_Root); if (obj) - { - if (num_roots == max_roots) - { - int new_max_roots = max_roots * 2; - if (new_max_roots == 0) - new_max_roots = 4; - roots = fz_resize_array(ctx, roots, new_max_roots, 
sizeof(*roots)); - max_roots = new_max_roots; - } - roots[num_roots++] = pdf_keep_obj(ctx, obj); - } + add_root(ctx, obj, &roots, &num_roots, &max_roots); obj = pdf_dict_get(ctx, dict, PDF_NAME_Info); if (obj) @@ -471,6 +496,18 @@ pdf_repair_xref(fz_context *ctx, pdf_document *doc) * 0 to maxnum. */ pdf_ensure_solid_xref(ctx, doc, maxnum); + for (i = 1; i < maxnum; i++) + { + entry = pdf_get_populating_xref_entry(ctx, doc, i); + if (entry->obj != NULL) + continue; + entry->type = 'f'; + entry->ofs = 0; + entry->gen = 0; + + entry->stm_ofs = 0; + } + for (i = 0; i < listlen; i++) { entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num); diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c index 3a9f70df..727dcda7 100644 --- a/source/pdf/pdf-xref.c +++ b/source/pdf/pdf-xref.c @@ -1807,7 +1807,7 @@ pdf_obj_read(fz_context *ctx, pdf_document *doc, fz_off_t *offset, int *nump, pd * whenever we read an object it should just go into the * previous xref. */ - tok = pdf_repair_obj(ctx, doc, buf, &stmofs, NULL, NULL, NULL, page, &newtmpofs); + tok = pdf_repair_obj(ctx, doc, buf, &stmofs, NULL, NULL, NULL, page, &newtmpofs, NULL); do /* So we can break out of it */ { @@ -2600,7 +2600,7 @@ pdf_load_hint_object(fz_context *ctx, pdf_document *doc) tok = pdf_lex(ctx, doc->file, buf); if (tok != PDF_TOK_OBJ) break; - (void)pdf_repair_obj(ctx, doc, buf, &tmpofs, NULL, NULL, NULL, &page, &tmpofs); + (void)pdf_repair_obj(ctx, doc, buf, &tmpofs, NULL, NULL, NULL, &page, &tmpofs, NULL); pdf_load_hints(ctx, doc, num, gen); } } |