diff options
author | Robin Watts <robin.watts@artifex.com> | 2016-02-29 14:13:11 +0000 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2016-02-29 17:14:53 +0000 |
commit | dde049432d9f28d29aa4be6730e67ebc28415ef3 (patch) | |
tree | 7e9c29d40467c1ee6a3aa61eeed7d11bfa01c6f6 | |
parent | d065aca45bce7ee3668f59ca13cda46b52bc9bd6 (diff) | |
download | mupdf-dde049432d9f28d29aa4be6730e67ebc28415ef3.tar.xz |
Strip outlines when cleaning a pdf file.
Arrange to keep only the Outline entries that still refer to
valid pages.
More specifically, leaf outline entries that refer to pages that
have been elided are dropped. Non-leaf outline entries that refer
to pages that have been elided are kept (for the sake of their
children) but have their destinations (Dest and A entries) removed.
-rw-r--r-- | resources/pdf/names.txt | 1 | ||||
-rw-r--r-- | source/pdf/pdf-clean-file.c | 180 |
2 files changed, 151 insertions, 30 deletions
diff --git a/resources/pdf/names.txt b/resources/pdf/names.txt index a173b6bb..ff63d99d 100644 --- a/resources/pdf/names.txt +++ b/resources/pdf/names.txt @@ -198,6 +198,7 @@ LZ LZW LZWDecode Lab +Last LastChar Launch Length diff --git a/source/pdf/pdf-clean-file.c b/source/pdf/pdf-clean-file.c index 29efbd5c..b766af89 100644 --- a/source/pdf/pdf-clean-file.c +++ b/source/pdf/pdf-clean-file.c @@ -36,47 +36,163 @@ static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_ pdf_array_push(ctx, kids, pageref); } -static int dest_is_valid_page(fz_context *ctx, pdf_document *doc, pdf_obj *obj) +static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount) { int i; int num = pdf_to_num(ctx, obj); - int pagecount = pdf_count_pages(ctx, doc); if (num == 0) return 0; for (i = 0; i < pagecount; i++) { - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); - - if (pdf_to_num(ctx, pageref) == num) + if (page_object_nums[i] == num) return 1; } return 0; } +static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list) +{ + pdf_obj *p; + + p = pdf_dict_get(ctx, o, PDF_NAME_A); + if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo) && + !string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list)) + return 0; + + p = pdf_dict_get(ctx, o, PDF_NAME_Dest); + if (p == NULL) + {} + else if (pdf_is_string(ctx, p)) + { + return string_in_names_list(ctx, p, names_list); + } + else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count)) + return 0; + + return 1; +} + +static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list); + +static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast) +{ + pdf_obj *prev = NULL; + pdf_obj 
*first = NULL; + pdf_obj *current; + int count = 0; + + for (current = outlines; current != NULL; ) + { + int nc; + + /* Strip any children to start with. This takes care of + * First/Last/Count for us. */ + nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list); + + if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list)) + { + if (nc == 0) + { + /* Outline with invalid dest and no children. Drop it by + * pulling the next one in here. */ + pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME_Next); + if (next == NULL) + { + /* There is no next one to pull in */ + if (prev != NULL) + pdf_dict_del(ctx, prev, PDF_NAME_Next); + } + else if (prev != NULL) + { + pdf_dict_put(ctx, prev, PDF_NAME_Next, next); + pdf_dict_put(ctx, next, PDF_NAME_Prev, prev); + } + else + { + pdf_dict_del(ctx, next, PDF_NAME_Prev); + } + current = next; + } + else + { + /* Outline with invalid dest, but children. Just drop the dest. */ + pdf_dict_del(ctx, current, PDF_NAME_Dest); + pdf_dict_del(ctx, current, PDF_NAME_A); + current = pdf_dict_get(ctx, current, PDF_NAME_Next); + } + } + else + { + /* Keep this one */ + if (first == NULL) + first = current; + prev = current; + current = pdf_dict_get(ctx, current, PDF_NAME_Next); + count++; + } + } + + *pfirst = first; + *plast = prev; + + return count; +} + +static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list) +{ + int nc; + pdf_obj *first; + pdf_obj *last; + + first = pdf_dict_get(ctx, outlines, PDF_NAME_First); + if (first == NULL) + nc = 0; + else + nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last); + + if (nc == 0) + { + pdf_dict_del(ctx, outlines, PDF_NAME_First); + pdf_dict_del(ctx, outlines, PDF_NAME_Last); + pdf_dict_del(ctx, outlines, PDF_NAME_Count); + } + else + { + int old_count = pdf_to_int(ctx, pdf_dict_get(ctx, outlines, PDF_NAME_Count)); + 
pdf_dict_put(ctx, outlines, PDF_NAME_First, first); + pdf_dict_put(ctx, outlines, PDF_NAME_Last, last); + pdf_dict_put(ctx, outlines, PDF_NAME_Count, pdf_new_int(ctx, doc, old_count > 0 ? nc : -nc)); + } + + return nc; +} + static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) { pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests; pdf_document *doc = glo->doc; int argidx = 0; pdf_obj *names_list = NULL; + pdf_obj *outlines; int pagecount; int i; + int *page_object_nums; /* Keep only pages/type and (reduced) dest entries to avoid * references to unretained pages */ oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages); olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests); + outlines = pdf_dict_get(ctx, oldroot, PDF_NAME_Outlines); - root = pdf_new_dict(ctx, doc, 2); + root = pdf_new_dict(ctx, doc, 3); pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type)); pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages)); + pdf_dict_put(ctx, root, PDF_NAME_Outlines, outlines); pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); - pdf_drop_obj(ctx, root); - /* Create a new kids array with only the pages we want to keep */ parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages)); kids = pdf_new_array(ctx, doc, 1); @@ -132,7 +248,21 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids); pdf_drop_obj(ctx, kids); - /* Also preserve the (partial) Dests name tree */ + /* Force the next call to pdf_count_pages to recount */ + glo->doc->page_count = 0; + + pagecount = pdf_count_pages(ctx, doc); + page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums)); + for (i = 0; i < pagecount; i++) + { + pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); + page_object_nums[i] = pdf_to_num(ctx, pageref); + } + + 
/* If we had an old Dests tree (now reformed as an olddests + * dictionary), keep any entries in there that point to + * valid pages. This may mean we keep more than we need, but + * it's safe at least. */ if (olddests) { pdf_obj *names = pdf_new_dict(ctx, doc, 1); @@ -148,7 +278,7 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D); dest = pdf_array_get(ctx, dest ? dest : val, 0); - if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest)) + if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount)) { pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); pdf_array_push(ctx, names_list, key_str); @@ -157,23 +287,17 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) } } - root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list); pdf_dict_put(ctx, names, PDF_NAME_Dests, dests); pdf_dict_put(ctx, root, PDF_NAME_Names, names); pdf_drop_obj(ctx, names); pdf_drop_obj(ctx, dests); - pdf_drop_obj(ctx, names_list); pdf_drop_obj(ctx, olddests); } - /* Force the next call to pdf_count_pages to recount */ - glo->doc->page_count = 0; - /* Edit each pages /Annot list to remove any links that point to * nowhere. 
*/ - pagecount = pdf_count_pages(ctx, doc); for (i = 0; i < pagecount; i++) { pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); @@ -187,24 +311,11 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) for (j = 0; j < len; j++) { pdf_obj *o = pdf_array_get(ctx, annots, j); - pdf_obj *p; - int remove = 0; if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link)) continue; - p = pdf_dict_get(ctx, o, PDF_NAME_A); - if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo) && - !string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list)) - remove = 1; - - p = pdf_dict_get(ctx, 0, PDF_NAME_Dest); - if (!dest_is_valid_page(ctx, doc, pdf_array_get(ctx, p, 0))) - remove = 1; - - /* FIXME: Should probably look at Next too */ - - if (remove) + if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list)) { /* Remove this annotation */ pdf_array_delete(ctx, annots, j); @@ -212,6 +323,15 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) } } } + + if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list) == 0) + { + pdf_dict_del(ctx, root, PDF_NAME_Outlines); + } + + fz_free(ctx, page_object_nums); + pdf_drop_obj(ctx, names_list); + pdf_drop_obj(ctx, root); } void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_write_options *opts, char *argv[], int argc) |