From 1bbde93ba39613877ded2015965aaab2341d3d36 Mon Sep 17 00:00:00 2001 From: Michael Vrhel Date: Tue, 15 Mar 2016 16:33:22 -0700 Subject: Add mutool merge This commit adds a page merging tool. The tool demonstrates the use of object grafting. The object grafting function recursively goes through the object to add all referenced objects. A map is maintained to ensure that objects that have already been copied are not copied again. --- Makefile | 2 +- include/mupdf/pdf.h | 1 + include/mupdf/pdf/graft.h | 17 +++ platform/win32/libmupdf.vcproj | 8 ++ platform/win32/mutool.vcproj | 4 + source/fitz/document.c | 3 +- source/pdf/pdf-graft.c | 163 +++++++++++++++++++++++++++++ source/tools/mutool.c | 2 + source/tools/pdfmerge.c | 228 +++++++++++++++++++++++++++++++++++++++++ 9 files changed, 426 insertions(+), 2 deletions(-) create mode 100644 include/mupdf/pdf/graft.h create mode 100644 source/pdf/pdf-graft.c create mode 100644 source/tools/pdfmerge.c diff --git a/Makefile b/Makefile index 5e1d2778..d0dba772 100644 --- a/Makefile +++ b/Makefile @@ -249,7 +249,7 @@ $(OUT)/cmapdump.o : include/mupdf/pdf/cmap.h source/pdf/pdf-cmap.c source/pdf/pd # --- Tools and Apps --- MUTOOL := $(addprefix $(OUT)/, mutool) -MUTOOL_OBJ := $(addprefix $(OUT)/tools/, mutool.o mudraw.o murun.o pdfclean.o pdfcreate.o pdfextract.o pdfinfo.o pdfposter.o pdfshow.o pdfpages.o) +MUTOOL_OBJ := $(addprefix $(OUT)/tools/, mutool.o mudraw.o murun.o pdfclean.o pdfcreate.o pdfextract.o pdfinfo.o pdfposter.o pdfshow.o pdfpages.o pdfmerge.o) $(MUTOOL_OBJ): $(FITZ_HDR) $(PDF_HDR) $(MUTOOL) : $(MUPDF_LIB) $(THIRD_LIB) $(MUTOOL) : $(MUTOOL_OBJ) diff --git a/include/mupdf/pdf.h b/include/mupdf/pdf.h index eab70ee9..cb4bbdd5 100644 --- a/include/mupdf/pdf.h +++ b/include/mupdf/pdf.h @@ -15,6 +15,7 @@ extern "C" { #include "mupdf/pdf/crypt.h" #include "mupdf/pdf/page.h" +#include "mupdf/pdf/graft.h" #include "mupdf/pdf/resource.h" #include "mupdf/pdf/cmap.h" #include "mupdf/pdf/font.h" diff --git a/include/mupdf/pdf/graft.h b/include/mupdf/pdf/graft.h new file mode 100644 index 00000000..0b2d8151 --- /dev/null +++ b/include/mupdf/pdf/graft.h @@ -0,0 +1,17 @@ +#ifndef MUPDF_PDF_GRAFT_H +#define MUPDF_PDF_GRAFT_H + +typedef struct pdf_graft_map_s pdf_graft_map; + +struct pdf_graft_map_s +{ + int refs; + int len; + int *dst_from_src; +}; + +pdf_graft_map *pdf_new_graft_map(fz_context *ctx, pdf_document *src); +void pdf_drop_graft_map(fz_context *ctx, pdf_graft_map *map); +pdf_obj *pdf_graft_object(fz_context *ctx, pdf_document *dst, pdf_document *src, pdf_obj *obj, pdf_graft_map *map); + +#endif diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj index 384e27cf..a760821d 100644 --- a/platform/win32/libmupdf.vcproj +++ b/platform/win32/libmupdf.vcproj @@ -1186,6 +1186,10 @@ RelativePath="..\..\source\pdf\pdf-glyphlist.h" > + + @@ -1373,6 +1377,10 @@ RelativePath="..\..\include\mupdf\pdf\font.h" > + + diff --git a/platform/win32/mutool.vcproj b/platform/win32/mutool.vcproj index 9d16ada5..1bf3e485 100644 --- a/platform/win32/mutool.vcproj +++ b/platform/win32/mutool.vcproj @@ -486,6 +486,10 @@ RelativePath="..\..\source\tools\pdfextract.c" > + + diff --git a/source/fitz/document.c b/source/fitz/document.c index 1719aaf6..7041616b 100644 --- a/source/fitz/document.c +++ b/source/fitz/document.c @@ -157,7 +157,8 @@ fz_new_document(fz_context *ctx, int size) fz_document * fz_keep_document(fz_context *ctx, fz_document *doc) { - ++doc->refs; + if (doc) + ++doc->refs; return doc; } diff --git a/source/pdf/pdf-graft.c b/source/pdf/pdf-graft.c new file mode 100644 index 00000000..bbe5e17b --- /dev/null +++ b/source/pdf/pdf-graft.c @@ -0,0 +1,163 @@ +#include "mupdf/pdf.h" + +pdf_graft_map * +pdf_new_graft_map(fz_context *ctx, pdf_document *src) +{ + pdf_graft_map *map = NULL; + + map = fz_malloc_struct(ctx, pdf_graft_map); + + fz_try(ctx) + { + map->len = pdf_xref_len(ctx, src); + map->dst_from_src = fz_calloc(ctx, map->len, sizeof(int)); + } + fz_catch(ctx) + { + fz_free(ctx, map); + fz_rethrow(ctx); + } + map->refs = 1; + return map; +} + +pdf_graft_map * +fz_keep_graft_map(fz_context *ctx, pdf_graft_map *map) +{ + if (map) + ++map->refs; + return map; +} + +void +pdf_drop_graft_map(fz_context *ctx, pdf_graft_map *map) +{ + if (map && --map->refs == 0) + { + fz_free(ctx, map->dst_from_src); + fz_free(ctx, map); + } +} + +/* Graft object from dst to source */ +pdf_obj * +pdf_graft_object(fz_context *ctx, pdf_document *dst, pdf_document *src, pdf_obj *obj_ref, pdf_graft_map *map) +{ + pdf_obj *val, *key; + pdf_obj *new_obj = NULL; + pdf_obj *new_dict = NULL; + pdf_obj *new_array = NULL; + pdf_obj *ref = NULL; + fz_buffer *buffer = NULL; + pdf_graft_map *drop_map = NULL; + int new_num, src_num, len, i; + + if (map == NULL) + drop_map = map = pdf_new_graft_map(ctx, src); + + if (pdf_is_indirect(ctx, obj_ref)) + { + src_num = pdf_to_num(ctx, obj_ref); + + /* Check if we have done this one. If yes, then drop map (if allocated) + * and return our indirect ref */ + if (map->dst_from_src[src_num] != 0) + { + int dest_num = map->dst_from_src[src_num]; + pdf_drop_graft_map(ctx, drop_map); + return pdf_new_indirect(ctx, dst, dest_num, 0); + } + + fz_var(buffer); + fz_var(ref); + + fz_try(ctx) + { + /* Create new slot for our src object, set the mapping and call again + * using the resolved indirect reference */ + new_num = pdf_create_object(ctx, dst); + map->dst_from_src[src_num] = new_num; + new_obj = pdf_graft_object(ctx, dst, src, pdf_resolve_indirect(ctx, obj_ref), map); + + /* Return a ref to the new_obj making sure to attach any stream */ + pdf_update_object(ctx, dst, new_num, new_obj); + pdf_drop_obj(ctx, new_obj); + ref = pdf_new_indirect(ctx, dst, new_num, 0); + if (pdf_is_stream(ctx, obj_ref)) + { + buffer = pdf_load_raw_stream(ctx, src, src_num, 0); + pdf_update_stream(ctx, dst, ref, buffer, 1); + } + } + fz_always(ctx) + { + fz_drop_buffer(ctx, buffer); + pdf_drop_graft_map(ctx, drop_map); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, ref); + fz_rethrow(ctx); + } + return ref; + } + else if (pdf_is_dict(ctx, obj_ref)) + { + fz_var(new_dict); + + fz_try(ctx) + { + len = pdf_dict_len(ctx, obj_ref); + new_dict = pdf_new_dict(ctx, dst, len); + + for (i = 0; i < len; i++) + { + key = pdf_dict_get_key(ctx, obj_ref, i); + val = pdf_dict_get_val(ctx, obj_ref, i); + pdf_dict_put_drop(ctx, new_dict, key, pdf_graft_object(ctx, dst, src, val, map)); + } + } + fz_always(ctx) + { + pdf_drop_graft_map(ctx, drop_map); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, new_dict); + fz_rethrow(ctx); + } + return new_dict; + } + else if (pdf_is_array(ctx, obj_ref)) + { + fz_var(new_array); + + fz_try(ctx) + { + /* Step through the array items handling indirect refs */ + len = pdf_array_len(ctx, obj_ref); + new_array = pdf_new_array(ctx, dst, len); + + for (i = 0; i < len; i++) + { + val = pdf_array_get(ctx, obj_ref, i); + pdf_array_push_drop(ctx, new_array, pdf_graft_object(ctx, dst, src, val, map)); + } + } + fz_always(ctx) + { + pdf_drop_graft_map(ctx, drop_map); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, new_array); + fz_rethrow(ctx); + } + return new_array; + } + else + { + pdf_drop_graft_map(ctx, drop_map); + return pdf_keep_obj(ctx, obj_ref);; + } +} diff --git a/source/tools/mutool.c b/source/tools/mutool.c index 1f163b21..fd38be4d 100644 --- a/source/tools/mutool.c +++ b/source/tools/mutool.c @@ -17,6 +17,7 @@ int pdfposter_main(int argc, char *argv[]); int pdfshow_main(int argc, char *argv[]); int pdfpages_main(int argc, char *argv[]); int pdfcreate_main(int argc, char *argv[]); +int pdfmerge_main(int argc, char *argv[]); static struct { int (*func)(int argc, char *argv[]); @@ -32,6 +33,7 @@ static struct { { pdfposter_main, "poster", "split large page into many tiles" }, { pdfshow_main, "show", "show internal pdf objects" }, { pdfcreate_main, "create", "create pdf document" }, + { pdfmerge_main, "merge", "merge pages from multiple pdf sources into a new pdf" }, }; static int diff --git a/source/tools/pdfmerge.c b/source/tools/pdfmerge.c new file mode 100644 index 00000000..27258a9e --- /dev/null +++ b/source/tools/pdfmerge.c @@ -0,0 +1,228 @@ +/* + * PDF merge tool: Tool for merging pdf content. + * + * Simple test bed to work with merging pages from multiple PDFs into a single PDF. + */ + +#include "mupdf/pdf.h" + +static void usage(void) +{ + fprintf(stderr, + "usage: mutool merge [-o output.pdf] [-adlsz] input.pdf [pages] [input2.pdf] [pages2] ...\n" + "\t-o\tname of PDF file to create\n" + "\t-a\tascii hex encode binary streams\n" + "\t-d\tdecompress all streams\n" + "\t-l\tlinearize PDF\n" + "\t-s\tclean content streams\n" + "\t-z\tdeflate uncompressed streams\n" + "\tinput.pdf name of first PDF file from which we are copying pages\n" + ); + exit(1); +} + +static fz_context *ctx = NULL; +static pdf_document *doc_des = NULL; +static pdf_document *doc_src = NULL; + +/* This isrange is a duplicate with mudraw.c Not sure how we want to organize or if + * we are fine with the small amount of code duplication */ +static int isrange(char *s) +{ + while (*s) + { + if ((*s < '0' || *s > '9') && *s != '-' && *s != ',') + return 0; + s++; + } + return 1; +} + +static void page_merge(int page_from, int page_to, pdf_graft_map *graft_map) +{ + pdf_obj *pageref = NULL; + pdf_obj *page_dict; + pdf_obj *obj = NULL, *ref = NULL; + /* Include minimal number of objects for page. Do not include items that + * reference other pages */ + pdf_obj *known_page_objs[] = { PDF_NAME_Contents, PDF_NAME_Resources, + PDF_NAME_MediaBox, PDF_NAME_CropBox, PDF_NAME_BleedBox, PDF_NAME_TrimBox, + PDF_NAME_ArtBox, PDF_NAME_Rotate, PDF_NAME_UserUnit}; + int n = nelem(known_page_objs); + int i; + int num; + + fz_var(obj); + fz_var(ref); + + fz_try(ctx) + { + pageref = pdf_lookup_page_obj(ctx, doc_src, page_from - 1); + + /* Make a new dictionary and copy over the items from the source object to + * the new dict that we want to deep copy. */ + page_dict = pdf_new_dict(ctx, doc_des, 4); + + pdf_dict_put_drop(ctx, page_dict, PDF_NAME_Type, PDF_NAME_Page); + + for (i = 0; i < n; i++) + { + obj = pdf_dict_get(ctx, pageref, known_page_objs[i]); + if (obj != NULL) + pdf_dict_put_drop(ctx, page_dict, known_page_objs[i], pdf_graft_object(ctx, doc_des, doc_src, obj, graft_map)); + } + + /* Add the dictionary */ + obj = pdf_add_object_drop(ctx, doc_des, page_dict); + + /* Get indirect ref */ + num = pdf_to_num(ctx, obj); + ref = pdf_new_indirect(ctx, doc_des, num, 0); + + /* Insert */ + pdf_insert_page(ctx, doc_des, page_to - 1, ref); + } + fz_always(ctx) + { + pdf_drop_obj(ctx, obj); + pdf_drop_obj(ctx, ref); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } +} + +static void merge_range(char *range) +{ + int page, spage, epage, src_pagecount, des_pagecount; + char *spec, *dash; + pdf_graft_map *graft_map; + + src_pagecount = fz_count_pages(ctx, (fz_document*) doc_src); + des_pagecount = fz_count_pages(ctx, (fz_document*) doc_des); + spec = fz_strsep(&range, ","); + graft_map = pdf_new_graft_map(ctx, doc_src); + + fz_try(ctx) + { + while (spec) + { + dash = strchr(spec, '-'); + + if (dash == spec) + spage = epage = src_pagecount; + else + spage = epage = atoi(spec); + + if (dash) + { + if (strlen(dash) > 1) + epage = atoi(dash + 1); + else + epage = src_pagecount; + } + + spage = fz_clampi(spage, 1, src_pagecount); + epage = fz_clampi(epage, 1, src_pagecount); + + if (spage < epage) + for (page = spage; page <= epage; page++, des_pagecount++) + page_merge(page, des_pagecount + 1, graft_map); + else + for (page = spage; page >= epage; page--, des_pagecount++) + page_merge(page, des_pagecount + 1, graft_map); + spec = fz_strsep(&range, ","); + } + } + fz_always(ctx) + { + pdf_drop_graft_map(ctx, graft_map); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } +} + +int pdfmerge_main(int argc, char **argv) +{ + pdf_write_options opts = { 0 }; + char *output = "out.pdf"; + char *infile_src; + int c; + + while ((c = fz_getopt(argc, argv, "adlszo:")) != -1) + { + switch (c) + { + case 'o': output = fz_optarg; break; + case 'a': opts.do_ascii ++; break; + case 'd': opts.do_expand ^= PDF_EXPAND_ALL; break; + case 'l': opts.do_linear ++; break; + case 's': opts.do_clean ++; break; + case 'z': opts.do_deflate ++; break; + default: usage(); break; + } + } + + if (fz_optind == argc) + usage(); + + ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); + if (!ctx) + { + fprintf(stderr, "Cannot initialise context\n"); + exit(1); + } + + fz_try(ctx) + { + doc_des = pdf_create_document(ctx); + } + fz_catch(ctx) + { + fprintf(stderr, "Failed to allocate destination document file %s\n", output); + exit(1); + } + + /* Step through the source files */ + while (fz_optind < argc) + { + fz_try(ctx) + { + infile_src = argv[fz_optind++]; + pdf_drop_document(ctx, doc_src); + doc_src = pdf_open_document(ctx, infile_src); + + if (fz_optind == argc || !isrange(argv[fz_optind])) + merge_range("1-"); + else + merge_range(argv[fz_optind++]); + } + fz_catch(ctx) + { + fprintf(stderr, "Failed merging document %s\n", infile_src); + exit(1); + } + } + + fz_try(ctx) + { + pdf_save_document(ctx, doc_des, output, &opts); + } + fz_always(ctx) + { + pdf_drop_document(ctx, doc_des); + pdf_drop_document(ctx, doc_src); + } + fz_catch(ctx) + { + fprintf(stderr, "Error encountered during file save.\n"); + exit(1); + } + fz_flush_warnings(ctx); + fz_drop_context(ctx); + + return 0; +} -- cgit v1.2.3