diff options
author | Robin Watts <robin.watts@artifex.com> | 2015-04-02 17:04:08 +0100 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2015-04-06 19:30:55 +0100 |
commit | b4d256b9e9d2e9f3b5f3ca944f591ae6bb0c5d71 (patch) | |
tree | 8d82fc82c7586634b14518704def1e5011249ca9 | |
parent | 95d746f13a86d914dd88310c994b27b08db4bb5b (diff) | |
download | mupdf-b4d256b9e9d2e9f3b5f3ca944f591ae6bb0c5d71.tar.xz |
Move the guts of pdfclean into the lib.
Michael needs to be able to call pdfclean from gsview. At the moment
he's having to do this by including the pdfclean.c file into the lib
build, and then calling pdfclean_main with a faked-up command line.
This isn't nice.
pdfclean.c is implemented by pdfclean_main parsing the options/filenames
out of argv and then passing the filenames/options on to a
pdfclean_clean function.
Exposing that function directly seems like a much nicer API to offer to the world.
We therefore pull the guts of pdfclean.c (pdfclean_clean and its
subsidiary structures/functions) into pdf-clean-file.c, rename the
entry point to pdf_clean_file, declare it in mupdf/pdf/clean.h, and
include this in the library build.
This leaves pdfclean.c just as the command line parsing.
This should not affect the size of any of the resulting binaries.
-rw-r--r-- | include/mupdf/pdf.h | 2 | ||||
-rw-r--r-- | include/mupdf/pdf/clean.h | 7 | ||||
-rw-r--r-- | platform/win32/libmupdf.vcproj | 8 | ||||
-rw-r--r-- | source/pdf/pdf-clean-file.c | 221 | ||||
-rw-r--r-- | source/tools/pdfclean.c | 222 |
5 files changed, 239 insertions, 221 deletions
diff --git a/include/mupdf/pdf.h b/include/mupdf/pdf.h index a6f812f3..4b1459b4 100644 --- a/include/mupdf/pdf.h +++ b/include/mupdf/pdf.h @@ -25,4 +25,6 @@ #include "mupdf/pdf/output-pdf.h" +#include "mupdf/pdf/clean.h" + #endif diff --git a/include/mupdf/pdf/clean.h b/include/mupdf/pdf/clean.h new file mode 100644 index 00000000..11ebf0c7 --- /dev/null +++ b/include/mupdf/pdf/clean.h @@ -0,0 +1,7 @@ +#ifndef MUPDF_PDF_CLEAN_H +#define MUPDF_PDF_CLEAN_H + +/* Read infile, and write selected pages to outfile with the given options. */ +void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *retainlist[], int retainlen); + +#endif diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj index c3dc0aa3..d7a55349 100644 --- a/platform/win32/libmupdf.vcproj +++ b/platform/win32/libmupdf.vcproj @@ -1066,6 +1066,10 @@ > </File> <File + RelativePath="..\..\source\pdf\pdf-clean-file.c" + > + </File> + <File RelativePath="..\..\source\pdf\pdf-clean.c" > </File> @@ -1293,6 +1297,10 @@ > </File> <File + RelativePath="..\..\include\mupdf\pdf\clean.h" + > + </File> + <File RelativePath="..\..\include\mupdf\pdf\cmap.h" > </File> diff --git a/source/pdf/pdf-clean-file.c b/source/pdf/pdf-clean-file.c new file mode 100644 index 00000000..d224dd9a --- /dev/null +++ b/source/pdf/pdf-clean-file.c @@ -0,0 +1,221 @@ +#include "mupdf/pdf.h" + +typedef struct globals_s +{ + pdf_document *doc; + fz_context *ctx; +} globals; + +static int +string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list) +{ + int n = pdf_array_len(ctx, names_list); + int i; + char *str = pdf_to_str_buf(ctx, p); + + for (i = 0; i < n ; i += 2) + { + if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str)) + return 1; + } + return 0; +} + +/* + * Recreate page tree to only retain specified pages. 
+ */ + +static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page) +{ + pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page-1); + pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref); + + pdf_dict_put(ctx, pageobj, PDF_NAME_Parent, parent); + + /* Store page object in new kids array */ + pdf_array_push(ctx, kids, pageref); +} + +static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) +{ + pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests; + pdf_document *doc = glo->doc; + int argidx = 0; + pdf_obj *names_list = NULL; + int pagecount; + int i; + + /* Keep only pages/type and (reduced) dest entries to avoid + * references to unretained pages */ + oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); + pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages); + olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests); + + root = pdf_new_dict(ctx, doc, 2); + pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type)); + pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages)); + + pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); + + pdf_drop_obj(ctx, root); + + /* Create a new kids array with only the pages we want to keep */ + parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages)); + kids = pdf_new_array(ctx, doc, 1); + + /* Retain pages specified */ + while (argc - argidx) + { + int page, spage, epage; + char *spec, *dash; + char *pagelist = argv[argidx]; + + pagecount = pdf_count_pages(ctx, doc); + spec = fz_strsep(&pagelist, ","); + while (spec) + { + dash = strchr(spec, '-'); + + if (dash == spec) + spage = epage = pagecount; + else + spage = epage = atoi(spec); + + if (dash) + { + if (strlen(dash) > 1) + epage = atoi(dash + 1); + else + epage = pagecount; + } + + spage = fz_clampi(spage, 1, pagecount); + epage = fz_clampi(epage, 1, pagecount); + + if (spage < epage) + for (page = 
spage; page <= epage; ++page) + retainpage(ctx, doc, parent, kids, page); + else + for (page = spage; page >= epage; --page) + retainpage(ctx, doc, parent, kids, page); + + spec = fz_strsep(&pagelist, ","); + } + + argidx++; + } + + pdf_drop_obj(ctx, parent); + + /* Update page count and kids array */ + countobj = pdf_new_int(ctx, doc, pdf_array_len(ctx, kids)); + pdf_dict_put(ctx, pages, PDF_NAME_Count, countobj); + pdf_drop_obj(ctx, countobj); + pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids); + pdf_drop_obj(ctx, kids); + + /* Also preserve the (partial) Dests name tree */ + if (olddests) + { + pdf_obj *names = pdf_new_dict(ctx, doc, 1); + pdf_obj *dests = pdf_new_dict(ctx, doc, 1); + int len = pdf_dict_len(ctx, olddests); + + names_list = pdf_new_array(ctx, doc, 32); + + for (i = 0; i < len; i++) + { + pdf_obj *key = pdf_dict_get_key(ctx, olddests, i); + pdf_obj *val = pdf_dict_get_val(ctx, olddests, i); + pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D); + + dest = pdf_array_get(ctx, dest ? dest : val, 0); + if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest)) + { + pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); + pdf_array_push(ctx, names_list, key_str); + pdf_array_push(ctx, names_list, val); + pdf_drop_obj(ctx, key_str); + } + } + + root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); + pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list); + pdf_dict_put(ctx, names, PDF_NAME_Dests, dests); + pdf_dict_put(ctx, root, PDF_NAME_Names, names); + + pdf_drop_obj(ctx, names); + pdf_drop_obj(ctx, dests); + pdf_drop_obj(ctx, names_list); + pdf_drop_obj(ctx, olddests); + } + + /* Force the next call to pdf_count_pages to recount */ + glo->doc->page_count = 0; + + /* Edit each pages /Annot list to remove any links that point to + * nowhere. 
*/ + pagecount = pdf_count_pages(ctx, doc); + for (i = 0; i < pagecount; i++) + { + pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); + pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref); + + pdf_obj *annots = pdf_dict_get(ctx, pageobj, PDF_NAME_Annots); + + int len = pdf_array_len(ctx, annots); + int j; + + for (j = 0; j < len; j++) + { + pdf_obj *o = pdf_array_get(ctx, annots, j); + pdf_obj *p; + + if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link)) + continue; + + p = pdf_dict_get(ctx, o, PDF_NAME_A); + if (!pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo)) + continue; + + if (string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list)) + continue; + + /* FIXME: Should probably look at Next too */ + + /* Remove this annotation */ + pdf_array_delete(ctx, annots, j); + j--; + } + } +} + +void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *argv[], int argc) +{ + globals glo = { 0 }; + + glo.ctx = ctx; + + fz_try(ctx) + { + glo.doc = pdf_open_document_no_run(ctx, infile); + if (pdf_needs_password(ctx, glo.doc)) + if (!pdf_authenticate_password(ctx, glo.doc, password)) + fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile); + + /* Only retain the specified subset of the pages */ + if (argc) + retainpages(ctx, &glo, argc, argv); + + pdf_write_document(ctx, glo.doc, outfile, opts); + } + fz_always(ctx) + { + pdf_close_document(ctx, glo.doc); + } + fz_catch(ctx) + { + if (opts && opts->errors) + *opts->errors = *opts->errors+1; + } +} diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c index 051ec4bd..9d38d552 100644 --- a/source/tools/pdfclean.c +++ b/source/tools/pdfclean.c @@ -11,12 +11,6 @@ #include "mupdf/pdf.h" -typedef struct globals_s -{ - pdf_document *doc; - fz_context *ctx; -} globals; - static void usage(void) { fprintf(stderr, @@ -36,220 +30,6 @@ static void usage(void) exit(1); } -static int 
-string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list) -{ - int n = pdf_array_len(ctx, names_list); - int i; - char *str = pdf_to_str_buf(ctx, p); - - for (i = 0; i < n ; i += 2) - { - if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str)) - return 1; - } - return 0; -} - -/* - * Recreate page tree to only retain specified pages. - */ - -static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page) -{ - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page-1); - pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref); - - pdf_dict_put(ctx, pageobj, PDF_NAME_Parent, parent); - - /* Store page object in new kids array */ - pdf_array_push(ctx, kids, pageref); -} - -static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) -{ - pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests; - pdf_document *doc = glo->doc; - int argidx = 0; - pdf_obj *names_list = NULL; - int pagecount; - int i; - - /* Keep only pages/type and (reduced) dest entries to avoid - * references to unretained pages */ - oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); - pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages); - olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests); - - root = pdf_new_dict(ctx, doc, 2); - pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type)); - pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages)); - - pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); - - pdf_drop_obj(ctx, root); - - /* Create a new kids array with only the pages we want to keep */ - parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages)); - kids = pdf_new_array(ctx, doc, 1); - - /* Retain pages specified */ - while (argc - argidx) - { - int page, spage, epage; - char *spec, *dash; - char *pagelist = argv[argidx]; - - pagecount = pdf_count_pages(ctx, doc); - spec = fz_strsep(&pagelist, 
","); - while (spec) - { - dash = strchr(spec, '-'); - - if (dash == spec) - spage = epage = pagecount; - else - spage = epage = atoi(spec); - - if (dash) - { - if (strlen(dash) > 1) - epage = atoi(dash + 1); - else - epage = pagecount; - } - - spage = fz_clampi(spage, 1, pagecount); - epage = fz_clampi(epage, 1, pagecount); - - if (spage < epage) - for (page = spage; page <= epage; ++page) - retainpage(ctx, doc, parent, kids, page); - else - for (page = spage; page >= epage; --page) - retainpage(ctx, doc, parent, kids, page); - - spec = fz_strsep(&pagelist, ","); - } - - argidx++; - } - - pdf_drop_obj(ctx, parent); - - /* Update page count and kids array */ - countobj = pdf_new_int(ctx, doc, pdf_array_len(ctx, kids)); - pdf_dict_put(ctx, pages, PDF_NAME_Count, countobj); - pdf_drop_obj(ctx, countobj); - pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids); - pdf_drop_obj(ctx, kids); - - /* Also preserve the (partial) Dests name tree */ - if (olddests) - { - pdf_obj *names = pdf_new_dict(ctx, doc, 1); - pdf_obj *dests = pdf_new_dict(ctx, doc, 1); - int len = pdf_dict_len(ctx, olddests); - - names_list = pdf_new_array(ctx, doc, 32); - - for (i = 0; i < len; i++) - { - pdf_obj *key = pdf_dict_get_key(ctx, olddests, i); - pdf_obj *val = pdf_dict_get_val(ctx, olddests, i); - pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D); - - dest = pdf_array_get(ctx, dest ? 
dest : val, 0); - if (pdf_array_contains(ctx, pdf_dict_get(ctx, pages, PDF_NAME_Kids), dest)) - { - pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); - pdf_array_push(ctx, names_list, key_str); - pdf_array_push(ctx, names_list, val); - pdf_drop_obj(ctx, key_str); - } - } - - root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); - pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list); - pdf_dict_put(ctx, names, PDF_NAME_Dests, dests); - pdf_dict_put(ctx, root, PDF_NAME_Names, names); - - pdf_drop_obj(ctx, names); - pdf_drop_obj(ctx, dests); - pdf_drop_obj(ctx, names_list); - pdf_drop_obj(ctx, olddests); - } - - /* Force the next call to pdf_count_pages to recount */ - glo->doc->page_count = 0; - - /* Edit each pages /Annot list to remove any links that point to - * nowhere. */ - pagecount = pdf_count_pages(ctx, doc); - for (i = 0; i < pagecount; i++) - { - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); - pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref); - - pdf_obj *annots = pdf_dict_get(ctx, pageobj, PDF_NAME_Annots); - - int len = pdf_array_len(ctx, annots); - int j; - - for (j = 0; j < len; j++) - { - pdf_obj *o = pdf_array_get(ctx, annots, j); - pdf_obj *p; - - if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link)) - continue; - - p = pdf_dict_get(ctx, o, PDF_NAME_A); - if (!pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo)) - continue; - - if (string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list)) - continue; - - /* FIXME: Should probably look at Next too */ - - /* Remove this annotation */ - pdf_array_delete(ctx, annots, j); - j--; - } - } -} - -void pdfclean_clean(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *argv[], int argc) -{ - globals glo = { 0 }; - - glo.ctx = ctx; - - fz_try(ctx) - { - glo.doc = pdf_open_document_no_run(ctx, infile); - if (pdf_needs_password(ctx, glo.doc)) - if 
(!pdf_authenticate_password(ctx, glo.doc, password)) - fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile); - - /* Only retain the specified subset of the pages */ - if (argc) - retainpages(ctx, &glo, argc, argv); - - pdf_write_document(ctx, glo.doc, outfile, opts); - } - fz_always(ctx) - { - pdf_close_document(ctx, glo.doc); - } - fz_catch(ctx) - { - if (opts && opts->errors) - *opts->errors = *opts->errors+1; - } -} - int pdfclean_main(int argc, char **argv) { char *infile; @@ -305,7 +85,7 @@ int pdfclean_main(int argc, char **argv) fz_try(ctx) { - pdfclean_clean(ctx, infile, outfile, password, &opts, &argv[fz_optind], argc - fz_optind); + pdf_clean_file(ctx, infile, outfile, password, &opts, &argv[fz_optind], argc - fz_optind); } fz_catch(ctx) { |