diff options
-rw-r--r-- | resources/pdf/names.txt | 1 | ||||
-rw-r--r-- | source/pdf/pdf-clean-file.c | 180 |
2 files changed, 151 insertions, 30 deletions
diff --git a/resources/pdf/names.txt b/resources/pdf/names.txt index a173b6bb..ff63d99d 100644 --- a/resources/pdf/names.txt +++ b/resources/pdf/names.txt @@ -198,6 +198,7 @@ LZ LZW LZWDecode Lab +Last LastChar Launch Length diff --git a/source/pdf/pdf-clean-file.c b/source/pdf/pdf-clean-file.c index 29efbd5c..b766af89 100644 --- a/source/pdf/pdf-clean-file.c +++ b/source/pdf/pdf-clean-file.c @@ -36,47 +36,163 @@ static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_ pdf_array_push(ctx, kids, pageref); } -static int dest_is_valid_page(fz_context *ctx, pdf_document *doc, pdf_obj *obj) +static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount) { int i; int num = pdf_to_num(ctx, obj); - int pagecount = pdf_count_pages(ctx, doc); if (num == 0) return 0; for (i = 0; i < pagecount; i++) { - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); - - if (pdf_to_num(ctx, pageref) == num) + if (page_object_nums[i] == num) return 1; } return 0; } +static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list) +{ + pdf_obj *p; + + p = pdf_dict_get(ctx, o, PDF_NAME_A); + if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo) && + !string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list)) + return 0; + + p = pdf_dict_get(ctx, o, PDF_NAME_Dest); + if (p == NULL) + {} + else if (pdf_is_string(ctx, p)) + { + return string_in_names_list(ctx, p, names_list); + } + else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count)) + return 0; + + return 1; +} + +static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list); + +static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast) +{ + pdf_obj *prev = NULL; + pdf_obj *first = NULL; + pdf_obj *current; + int count = 0; + + for (current = outlines; current != NULL; ) + { + int nc; + + /* Strip any children to start with. This takes care of + * First/Last/Count for us. */ + nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list); + + if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list)) + { + if (nc == 0) + { + /* Outline with invalid dest and no children. Drop it by + * pulling the next one in here. */ + pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME_Next); + if (next == NULL) + { + /* There is no next one to pull in */ + if (prev != NULL) + pdf_dict_del(ctx, prev, PDF_NAME_Next); + } + else if (prev != NULL) + { + pdf_dict_put(ctx, prev, PDF_NAME_Next, next); + pdf_dict_put(ctx, next, PDF_NAME_Prev, prev); + } + else + { + pdf_dict_del(ctx, next, PDF_NAME_Prev); + } + current = next; + } + else + { + /* Outline with invalid dest, but children. Just drop the dest. */ + pdf_dict_del(ctx, current, PDF_NAME_Dest); + pdf_dict_del(ctx, current, PDF_NAME_A); + current = pdf_dict_get(ctx, current, PDF_NAME_Next); + } + } + else + { + /* Keep this one */ + if (first == NULL) + first = current; + prev = current; + current = pdf_dict_get(ctx, current, PDF_NAME_Next); + count++; + } + } + + *pfirst = first; + *plast = prev; + + return count; +} + +static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list) +{ + int nc; + pdf_obj *first; + pdf_obj *last; + + first = pdf_dict_get(ctx, outlines, PDF_NAME_First); + if (first == NULL) + nc = 0; + else + nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last); + + if (nc == 0) + { + pdf_dict_del(ctx, outlines, PDF_NAME_First); + pdf_dict_del(ctx, outlines, PDF_NAME_Last); + pdf_dict_del(ctx, outlines, PDF_NAME_Count); + } + else + { + int old_count = pdf_to_int(ctx, pdf_dict_get(ctx, outlines, PDF_NAME_Count)); + pdf_dict_put(ctx, outlines, PDF_NAME_First, first); + pdf_dict_put(ctx, outlines, PDF_NAME_Last, last); + pdf_dict_put(ctx, outlines, PDF_NAME_Count, pdf_new_int(ctx, doc, old_count > 0 ? nc : -nc)); + } + + return nc; +} + static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) { pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests; pdf_document *doc = glo->doc; int argidx = 0; pdf_obj *names_list = NULL; + pdf_obj *outlines; int pagecount; int i; + int *page_object_nums; /* Keep only pages/type and (reduced) dest entries to avoid * references to unretained pages */ oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages); olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests); + outlines = pdf_dict_get(ctx, oldroot, PDF_NAME_Outlines); - root = pdf_new_dict(ctx, doc, 2); + root = pdf_new_dict(ctx, doc, 3); pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type)); pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages)); + pdf_dict_put(ctx, root, PDF_NAME_Outlines, outlines); pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); - pdf_drop_obj(ctx, root); - /* Create a new kids array with only the pages we want to keep */ parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages)); kids = pdf_new_array(ctx, doc, 1); @@ -132,7 +248,21 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids); pdf_drop_obj(ctx, kids); - /* Also preserve the (partial) Dests name tree */ + /* Force the next call to pdf_count_pages to recount */ + glo->doc->page_count = 0; + + pagecount = pdf_count_pages(ctx, doc); + page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums)); + for (i = 0; i < pagecount; i++) + { + pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); + page_object_nums[i] = pdf_to_num(ctx, pageref); + } + + /* If we had an old Dests tree (now reformed as an olddests + * dictionary), keep any entries in there that point to + * valid pages. This may mean we keep more than we need, but + * it's safe at least. */ if (olddests) { pdf_obj *names = pdf_new_dict(ctx, doc, 1); @@ -148,7 +278,7 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D); dest = pdf_array_get(ctx, dest ? dest : val, 0); - if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest)) + if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount)) { pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); pdf_array_push(ctx, names_list, key_str); @@ -157,23 +287,17 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) } } - root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list); pdf_dict_put(ctx, names, PDF_NAME_Dests, dests); pdf_dict_put(ctx, root, PDF_NAME_Names, names); pdf_drop_obj(ctx, names); pdf_drop_obj(ctx, dests); - pdf_drop_obj(ctx, names_list); pdf_drop_obj(ctx, olddests); } - /* Force the next call to pdf_count_pages to recount */ - glo->doc->page_count = 0; - /* Edit each pages /Annot list to remove any links that point to * nowhere. */ - pagecount = pdf_count_pages(ctx, doc); for (i = 0; i < pagecount; i++) { pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); @@ -187,24 +311,11 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) for (j = 0; j < len; j++) { pdf_obj *o = pdf_array_get(ctx, annots, j); - pdf_obj *p; - int remove = 0; if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link)) continue; - p = pdf_dict_get(ctx, o, PDF_NAME_A); - if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo) && - !string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list)) - remove = 1; - - p = pdf_dict_get(ctx, 0, PDF_NAME_Dest); - if (!dest_is_valid_page(ctx, doc, pdf_array_get(ctx, p, 0))) - remove = 1; - - /* FIXME: Should probably look at Next too */ - - if (remove) + if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list)) { /* Remove this annotation */ pdf_array_delete(ctx, annots, j); @@ -212,6 +323,15 @@ static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) } } } + + if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list) == 0) + { + pdf_dict_del(ctx, root, PDF_NAME_Outlines); + } + + fz_free(ctx, page_object_nums); + pdf_drop_obj(ctx, names_list); + pdf_drop_obj(ctx, root); } void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_write_options *opts, char *argv[], int argc) |