diff options
-rw-r--r-- | include/mupdf/pdf.h | 2 | ||||
-rw-r--r-- | include/mupdf/pdf/clean.h | 7 | ||||
-rw-r--r-- | platform/win32/libmupdf.vcproj | 8 | ||||
-rw-r--r-- | source/pdf/pdf-clean-file.c | 221 | ||||
-rw-r--r-- | source/tools/pdfclean.c | 222 |
5 files changed, 239 insertions, 221 deletions
diff --git a/include/mupdf/pdf.h b/include/mupdf/pdf.h index a6f812f3..4b1459b4 100644 --- a/include/mupdf/pdf.h +++ b/include/mupdf/pdf.h @@ -25,4 +25,6 @@ #include "mupdf/pdf/output-pdf.h" +#include "mupdf/pdf/clean.h" + #endif diff --git a/include/mupdf/pdf/clean.h b/include/mupdf/pdf/clean.h new file mode 100644 index 00000000..11ebf0c7 --- /dev/null +++ b/include/mupdf/pdf/clean.h @@ -0,0 +1,7 @@ +#ifndef MUPDF_PDF_CLEAN_H +#define MUPDF_PDF_CLEAN_H + +/* Read infile, and write selected pages to outfile with the given options. */ +void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *retainlist[], int retainlen); + +#endif diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj index c3dc0aa3..d7a55349 100644 --- a/platform/win32/libmupdf.vcproj +++ b/platform/win32/libmupdf.vcproj @@ -1066,6 +1066,10 @@ > </File> <File + RelativePath="..\..\source\pdf\pdf-clean-file.c" + > + </File> + <File RelativePath="..\..\source\pdf\pdf-clean.c" > </File> @@ -1293,6 +1297,10 @@ > </File> <File + RelativePath="..\..\include\mupdf\pdf\clean.h" + > + </File> + <File RelativePath="..\..\include\mupdf\pdf\cmap.h" > </File> diff --git a/source/pdf/pdf-clean-file.c b/source/pdf/pdf-clean-file.c new file mode 100644 index 00000000..d224dd9a --- /dev/null +++ b/source/pdf/pdf-clean-file.c @@ -0,0 +1,221 @@ +#include "mupdf/pdf.h" + +typedef struct globals_s +{ + pdf_document *doc; + fz_context *ctx; +} globals; + +static int +string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list) +{ + int n = pdf_array_len(ctx, names_list); + int i; + char *str = pdf_to_str_buf(ctx, p); + + for (i = 0; i < n ; i += 2) + { + if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str)) + return 1; + } + return 0; +} + +/* + * Recreate page tree to only retain specified pages. + */ + +static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page) +{ + pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page-1); + pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref); + + pdf_dict_put(ctx, pageobj, PDF_NAME_Parent, parent); + + /* Store page object in new kids array */ + pdf_array_push(ctx, kids, pageref); +} + +static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) +{ + pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests; + pdf_document *doc = glo->doc; + int argidx = 0; + pdf_obj *names_list = NULL; + int pagecount; + int i; + + /* Keep only pages/type and (reduced) dest entries to avoid + * references to unretained pages */ + oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); + pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages); + olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests); + + root = pdf_new_dict(ctx, doc, 2); + pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type)); + pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages)); + + pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); + + pdf_drop_obj(ctx, root); + + /* Create a new kids array with only the pages we want to keep */ + parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages)); + kids = pdf_new_array(ctx, doc, 1); + + /* Retain pages specified */ + while (argc - argidx) + { + int page, spage, epage; + char *spec, *dash; + char *pagelist = argv[argidx]; + + pagecount = pdf_count_pages(ctx, doc); + spec = fz_strsep(&pagelist, ","); + while (spec) + { + dash = strchr(spec, '-'); + + if (dash == spec) + spage = epage = pagecount; + else + spage = epage = atoi(spec); + + if (dash) + { + if (strlen(dash) > 1) + epage = atoi(dash + 1); + else + epage = pagecount; + } + + spage = fz_clampi(spage, 1, pagecount); + epage = fz_clampi(epage, 1, pagecount); + + if (spage < epage) + for (page = spage; page <= epage; ++page) + retainpage(ctx, doc, parent, kids, page); + else + for (page = spage; page >= epage; --page) + retainpage(ctx, doc, parent, kids, page); + + spec = fz_strsep(&pagelist, ","); + } + + argidx++; + } + + pdf_drop_obj(ctx, parent); + + /* Update page count and kids array */ + countobj = pdf_new_int(ctx, doc, pdf_array_len(ctx, kids)); + pdf_dict_put(ctx, pages, PDF_NAME_Count, countobj); + pdf_drop_obj(ctx, countobj); + pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids); + pdf_drop_obj(ctx, kids); + + /* Also preserve the (partial) Dests name tree */ + if (olddests) + { + pdf_obj *names = pdf_new_dict(ctx, doc, 1); + pdf_obj *dests = pdf_new_dict(ctx, doc, 1); + int len = pdf_dict_len(ctx, olddests); + + names_list = pdf_new_array(ctx, doc, 32); + + for (i = 0; i < len; i++) + { + pdf_obj *key = pdf_dict_get_key(ctx, olddests, i); + pdf_obj *val = pdf_dict_get_val(ctx, olddests, i); + pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D); + + dest = pdf_array_get(ctx, dest ? dest : val, 0); + if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest)) + { + pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); + pdf_array_push(ctx, names_list, key_str); + pdf_array_push(ctx, names_list, val); + pdf_drop_obj(ctx, key_str); + } + } + + root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); + pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list); + pdf_dict_put(ctx, names, PDF_NAME_Dests, dests); + pdf_dict_put(ctx, root, PDF_NAME_Names, names); + + pdf_drop_obj(ctx, names); + pdf_drop_obj(ctx, dests); + pdf_drop_obj(ctx, names_list); + pdf_drop_obj(ctx, olddests); + } + + /* Force the next call to pdf_count_pages to recount */ + glo->doc->page_count = 0; + + /* Edit each pages /Annot list to remove any links that point to + * nowhere. */ + pagecount = pdf_count_pages(ctx, doc); + for (i = 0; i < pagecount; i++) + { + pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); + pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref); + + pdf_obj *annots = pdf_dict_get(ctx, pageobj, PDF_NAME_Annots); + + int len = pdf_array_len(ctx, annots); + int j; + + for (j = 0; j < len; j++) + { + pdf_obj *o = pdf_array_get(ctx, annots, j); + pdf_obj *p; + + if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link)) + continue; + + p = pdf_dict_get(ctx, o, PDF_NAME_A); + if (!pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo)) + continue; + + if (string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list)) + continue; + + /* FIXME: Should probably look at Next too */ + + /* Remove this annotation */ + pdf_array_delete(ctx, annots, j); + j--; + } + } +} + +void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *argv[], int argc) +{ + globals glo = { 0 }; + + glo.ctx = ctx; + + fz_try(ctx) + { + glo.doc = pdf_open_document_no_run(ctx, infile); + if (pdf_needs_password(ctx, glo.doc)) + if (!pdf_authenticate_password(ctx, glo.doc, password)) + fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile); + + /* Only retain the specified subset of the pages */ + if (argc) + retainpages(ctx, &glo, argc, argv); + + pdf_write_document(ctx, glo.doc, outfile, opts); + } + fz_always(ctx) + { + pdf_close_document(ctx, glo.doc); + } + fz_catch(ctx) + { + if (opts && opts->errors) + *opts->errors = *opts->errors+1; + } +} diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c index 051ec4bd..9d38d552 100644 --- a/source/tools/pdfclean.c +++ b/source/tools/pdfclean.c @@ -11,12 +11,6 @@ #include "mupdf/pdf.h" -typedef struct globals_s -{ - pdf_document *doc; - fz_context *ctx; -} globals; - static void usage(void) { fprintf(stderr, @@ -36,220 +30,6 @@ static void usage(void) exit(1); } -static int -string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list) -{ - int n = pdf_array_len(ctx, names_list); - int i; - char *str = pdf_to_str_buf(ctx, p); - - for (i = 0; i < n ; i += 2) - { - if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str)) - return 1; - } - return 0; -} - -/* - * Recreate page tree to only retain specified pages. - */ - -static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page) -{ - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page-1); - pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref); - - pdf_dict_put(ctx, pageobj, PDF_NAME_Parent, parent); - - /* Store page object in new kids array */ - pdf_array_push(ctx, kids, pageref); -} - -static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) -{ - pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests; - pdf_document *doc = glo->doc; - int argidx = 0; - pdf_obj *names_list = NULL; - int pagecount; - int i; - - /* Keep only pages/type and (reduced) dest entries to avoid - * references to unretained pages */ - oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); - pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages); - olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests); - - root = pdf_new_dict(ctx, doc, 2); - pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type)); - pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages)); - - pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); - - pdf_drop_obj(ctx, root); - - /* Create a new kids array with only the pages we want to keep */ - parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages)); - kids = pdf_new_array(ctx, doc, 1); - - /* Retain pages specified */ - while (argc - argidx) - { - int page, spage, epage; - char *spec, *dash; - char *pagelist = argv[argidx]; - - pagecount = pdf_count_pages(ctx, doc); - spec = fz_strsep(&pagelist, ","); - while (spec) - { - dash = strchr(spec, '-'); - - if (dash == spec) - spage = epage = pagecount; - else - spage = epage = atoi(spec); - - if (dash) - { - if (strlen(dash) > 1) - epage = atoi(dash + 1); - else - epage = pagecount; - } - - spage = fz_clampi(spage, 1, pagecount); - epage = fz_clampi(epage, 1, pagecount); - - if (spage < epage) - for (page = spage; page <= epage; ++page) - retainpage(ctx, doc, parent, kids, page); - else - for (page = spage; page >= epage; --page) - retainpage(ctx, doc, parent, kids, page); - - spec = fz_strsep(&pagelist, ","); - } - - argidx++; - } - - pdf_drop_obj(ctx, parent); - - /* Update page count and kids array */ - countobj = pdf_new_int(ctx, doc, pdf_array_len(ctx, kids)); - pdf_dict_put(ctx, pages, PDF_NAME_Count, countobj); - pdf_drop_obj(ctx, countobj); - pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids); - pdf_drop_obj(ctx, kids); - - /* Also preserve the (partial) Dests name tree */ - if (olddests) - { - pdf_obj *names = pdf_new_dict(ctx, doc, 1); - pdf_obj *dests = pdf_new_dict(ctx, doc, 1); - int len = pdf_dict_len(ctx, olddests); - - names_list = pdf_new_array(ctx, doc, 32); - - for (i = 0; i < len; i++) - { - pdf_obj *key = pdf_dict_get_key(ctx, olddests, i); - pdf_obj *val = pdf_dict_get_val(ctx, olddests, i); - pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D); - - dest = pdf_array_get(ctx, dest ? dest : val, 0); - if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest)) - { - pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); - pdf_array_push(ctx, names_list, key_str); - pdf_array_push(ctx, names_list, val); - pdf_drop_obj(ctx, key_str); - } - } - - root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); - pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list); - pdf_dict_put(ctx, names, PDF_NAME_Dests, dests); - pdf_dict_put(ctx, root, PDF_NAME_Names, names); - - pdf_drop_obj(ctx, names); - pdf_drop_obj(ctx, dests); - pdf_drop_obj(ctx, names_list); - pdf_drop_obj(ctx, olddests); - } - - /* Force the next call to pdf_count_pages to recount */ - glo->doc->page_count = 0; - - /* Edit each pages /Annot list to remove any links that point to - * nowhere. */ - pagecount = pdf_count_pages(ctx, doc); - for (i = 0; i < pagecount; i++) - { - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); - pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref); - - pdf_obj *annots = pdf_dict_get(ctx, pageobj, PDF_NAME_Annots); - - int len = pdf_array_len(ctx, annots); - int j; - - for (j = 0; j < len; j++) - { - pdf_obj *o = pdf_array_get(ctx, annots, j); - pdf_obj *p; - - if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link)) - continue; - - p = pdf_dict_get(ctx, o, PDF_NAME_A); - if (!pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo)) - continue; - - if (string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list)) - continue; - - /* FIXME: Should probably look at Next too */ - - /* Remove this annotation */ - pdf_array_delete(ctx, annots, j); - j--; - } - } -} - -void pdfclean_clean(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *argv[], int argc) -{ - globals glo = { 0 }; - - glo.ctx = ctx; - - fz_try(ctx) - { - glo.doc = pdf_open_document_no_run(ctx, infile); - if (pdf_needs_password(ctx, glo.doc)) - if (!pdf_authenticate_password(ctx, glo.doc, password)) - fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile); - - /* Only retain the specified subset of the pages */ - if (argc) - retainpages(ctx, &glo, argc, argv); - - pdf_write_document(ctx, glo.doc, outfile, opts); - } - fz_always(ctx) - { - pdf_close_document(ctx, glo.doc); - } - fz_catch(ctx) - { - if (opts && opts->errors) - *opts->errors = *opts->errors+1; - } -} - int pdfclean_main(int argc, char **argv) { char *infile; @@ -305,7 +85,7 @@ int pdfclean_main(int argc, char **argv) fz_try(ctx) { - pdfclean_clean(ctx, infile, outfile, password, &opts, &argv[fz_optind], argc - fz_optind); + pdf_clean_file(ctx, infile, outfile, password, &opts, &argv[fz_optind], argc - fz_optind); } fz_catch(ctx) { |