summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Vrhel <michael.vrhel@artifex.com>2016-03-15 16:33:22 -0700
committerRobin Watts <robin.watts@artifex.com>2016-04-19 18:30:40 +0100
commit1bbde93ba39613877ded2015965aaab2341d3d36 (patch)
treef2a4936de895c6ddc40deb7bc93ccfb66de514d3
parent335798189cbd40cd518ce49d1fa4c7eaa2811977 (diff)
downloadmupdf-1bbde93ba39613877ded2015965aaab2341d3d36.tar.xz
Add mutool merge
This commit adds a page merging tool. The tool demonstrates the use of object grafting. The object grafting function recursively goes through the object to add all referenced objects. A map is maintained to ensure that objects that have already been copied are not copied again.
-rw-r--r--Makefile2
-rw-r--r--include/mupdf/pdf.h1
-rw-r--r--include/mupdf/pdf/graft.h17
-rw-r--r--platform/win32/libmupdf.vcproj8
-rw-r--r--platform/win32/mutool.vcproj4
-rw-r--r--source/fitz/document.c3
-rw-r--r--source/pdf/pdf-graft.c163
-rw-r--r--source/tools/mutool.c2
-rw-r--r--source/tools/pdfmerge.c228
9 files changed, 426 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index 5e1d2778..d0dba772 100644
--- a/Makefile
+++ b/Makefile
@@ -249,7 +249,7 @@ $(OUT)/cmapdump.o : include/mupdf/pdf/cmap.h source/pdf/pdf-cmap.c source/pdf/pd
# --- Tools and Apps ---
MUTOOL := $(addprefix $(OUT)/, mutool)
-MUTOOL_OBJ := $(addprefix $(OUT)/tools/, mutool.o mudraw.o murun.o pdfclean.o pdfcreate.o pdfextract.o pdfinfo.o pdfposter.o pdfshow.o pdfpages.o)
+MUTOOL_OBJ := $(addprefix $(OUT)/tools/, mutool.o mudraw.o murun.o pdfclean.o pdfcreate.o pdfextract.o pdfinfo.o pdfposter.o pdfshow.o pdfpages.o pdfmerge.o)
$(MUTOOL_OBJ): $(FITZ_HDR) $(PDF_HDR)
$(MUTOOL) : $(MUPDF_LIB) $(THIRD_LIB)
$(MUTOOL) : $(MUTOOL_OBJ)
diff --git a/include/mupdf/pdf.h b/include/mupdf/pdf.h
index eab70ee9..cb4bbdd5 100644
--- a/include/mupdf/pdf.h
+++ b/include/mupdf/pdf.h
@@ -15,6 +15,7 @@ extern "C" {
#include "mupdf/pdf/crypt.h"
#include "mupdf/pdf/page.h"
+#include "mupdf/pdf/graft.h"
#include "mupdf/pdf/resource.h"
#include "mupdf/pdf/cmap.h"
#include "mupdf/pdf/font.h"
diff --git a/include/mupdf/pdf/graft.h b/include/mupdf/pdf/graft.h
new file mode 100644
index 00000000..0b2d8151
--- /dev/null
+++ b/include/mupdf/pdf/graft.h
@@ -0,0 +1,17 @@
+#ifndef MUPDF_PDF_GRAFT_H
+#define MUPDF_PDF_GRAFT_H
+
+typedef struct pdf_graft_map_s pdf_graft_map;
+
+/*
+	Bookkeeping used while copying ("grafting") objects from a source
+	document into a destination document.
+
+	refs: Reference count. pdf_new_graft_map sets it to 1 and
+	pdf_drop_graft_map frees the map when it reaches 0.
+
+	len: Number of entries in dst_from_src; taken from the xref length
+	of the source document when the map is created.
+
+	dst_from_src: Indexed by source object number. Holds the object
+	number that source object was given in the destination, or 0 if it
+	has not been copied yet.
+*/
+struct pdf_graft_map_s
+{
+	int refs;
+	int len;
+	int *dst_from_src;
+};
+
+/* Create a map sized for the objects of src. Throws on allocation failure. */
+pdf_graft_map *pdf_new_graft_map(fz_context *ctx, pdf_document *src);
+
+/* Take an additional reference to map (NULL is accepted). Returns map. */
+pdf_graft_map *fz_keep_graft_map(fz_context *ctx, pdf_graft_map *map);
+
+/* Release one reference to map (NULL is accepted); frees it on the last drop. */
+void pdf_drop_graft_map(fz_context *ctx, pdf_graft_map *map);
+
+/*
+	Copy obj, and everything it references, from src into dst. If map is
+	NULL a temporary map is used for the duration of the call. Returns a
+	new reference that the caller must drop.
+*/
+pdf_obj *pdf_graft_object(fz_context *ctx, pdf_document *dst, pdf_document *src, pdf_obj *obj, pdf_graft_map *map);
+
+#endif
diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj
index 384e27cf..a760821d 100644
--- a/platform/win32/libmupdf.vcproj
+++ b/platform/win32/libmupdf.vcproj
@@ -1186,6 +1186,10 @@
RelativePath="..\..\source\pdf\pdf-glyphlist.h"
>
</File>
+ <File
+ RelativePath="..\..\source\pdf\pdf-graft.c"
+ >
+ </File>
<File
RelativePath="..\..\source\pdf\pdf-image.c"
>
@@ -1373,6 +1377,10 @@
RelativePath="..\..\include\mupdf\pdf\font.h"
>
</File>
+ <File
+ RelativePath="..\..\include\mupdf\pdf\graft.h"
+ >
+ </File>
<File
RelativePath="..\..\include\mupdf\pdf\javascript.h"
>
diff --git a/platform/win32/mutool.vcproj b/platform/win32/mutool.vcproj
index 9d16ada5..1bf3e485 100644
--- a/platform/win32/mutool.vcproj
+++ b/platform/win32/mutool.vcproj
@@ -486,6 +486,10 @@
RelativePath="..\..\source\tools\pdfextract.c"
>
</File>
+ <File
+ RelativePath="..\..\source\tools\pdfmerge.c"
+ >
+ </File>
<File
RelativePath="..\..\source\tools\pdfinfo.c"
>
diff --git a/source/fitz/document.c b/source/fitz/document.c
index 1719aaf6..7041616b 100644
--- a/source/fitz/document.c
+++ b/source/fitz/document.c
@@ -157,7 +157,8 @@ fz_new_document(fz_context *ctx, int size)
fz_document *
fz_keep_document(fz_context *ctx, fz_document *doc)
{
- ++doc->refs;
+ if (doc)
+ ++doc->refs;
return doc;
}
diff --git a/source/pdf/pdf-graft.c b/source/pdf/pdf-graft.c
new file mode 100644
index 00000000..bbe5e17b
--- /dev/null
+++ b/source/pdf/pdf-graft.c
@@ -0,0 +1,163 @@
+#include "mupdf/pdf.h"
+
+/*
+	Allocate a graft map with one slot per xref entry of src.
+
+	The returned map starts with a reference count of 1. If sizing or
+	allocating the slot table throws, the partially built map is freed
+	and the exception is passed on.
+*/
+pdf_graft_map *
+pdf_new_graft_map(fz_context *ctx, pdf_document *src)
+{
+	pdf_graft_map *result = fz_malloc_struct(ctx, pdf_graft_map);
+
+	fz_try(ctx)
+	{
+		result->len = pdf_xref_len(ctx, src);
+		result->dst_from_src = fz_calloc(ctx, result->len, sizeof(int));
+	}
+	fz_catch(ctx)
+	{
+		fz_free(ctx, result);
+		fz_rethrow(ctx);
+	}
+
+	result->refs = 1;
+	return result;
+}
+
+/* Add one reference to map and return it; a NULL map is passed through untouched. */
+pdf_graft_map *
+fz_keep_graft_map(fz_context *ctx, pdf_graft_map *map)
+{
+	if (map != NULL)
+		map->refs += 1;
+	return map;
+}
+
+/*
+	Give up one reference to map. When the last reference goes away the
+	slot table and the map itself are freed. A NULL map is ignored.
+*/
+void
+pdf_drop_graft_map(fz_context *ctx, pdf_graft_map *map)
+{
+	if (map == NULL)
+		return;
+
+	map->refs -= 1;
+	if (map->refs != 0)
+		return;
+
+	fz_free(ctx, map->dst_from_src);
+	fz_free(ctx, map);
+}
+
+/*
+	Recursive worker for pdf_graft_object; map must not be NULL.
+
+	Indirect references are copied at most once: the first visit creates
+	a new object in dst, records the number in map, and copies the
+	resolved value (plus the raw stream, if there is one). Later visits
+	just return a new reference to that copy. Recording the mapping
+	before recursing is what stops reference cycles from looping forever.
+
+	Dictionaries and arrays are rebuilt element by element. Anything
+	else is returned as an additional reference to the same object.
+*/
+static pdf_obj *
+graft_mapped_object(fz_context *ctx, pdf_document *dst, pdf_document *src, pdf_obj *obj_ref, pdf_graft_map *map)
+{
+	pdf_obj *val, *key;
+	pdf_obj *new_obj = NULL;
+	pdf_obj *new_dict = NULL;
+	pdf_obj *new_array = NULL;
+	pdf_obj *ref = NULL;
+	fz_buffer *buffer = NULL;
+	int new_num, src_num, len, i;
+
+	if (pdf_is_indirect(ctx, obj_ref))
+	{
+		src_num = pdf_to_num(ctx, obj_ref);
+
+		/* The table only has map->len slots; refuse anything outside it
+		 * rather than reading or writing past the allocation. */
+		if (src_num < 0 || src_num >= map->len)
+			fz_throw(ctx, FZ_ERROR_GENERIC, "source object number out of range");
+
+		/* Already copied: return a new reference to the existing copy. */
+		if (map->dst_from_src[src_num] != 0)
+			return pdf_new_indirect(ctx, dst, map->dst_from_src[src_num], 0);
+
+		fz_var(new_obj);
+		fz_var(buffer);
+		fz_var(ref);
+
+		fz_try(ctx)
+		{
+			/* Create new slot for our src object, set the mapping and call again
+			 * using the resolved indirect reference */
+			new_num = pdf_create_object(ctx, dst);
+			map->dst_from_src[src_num] = new_num;
+			new_obj = graft_mapped_object(ctx, dst, src, pdf_resolve_indirect(ctx, obj_ref), map);
+
+			/* Return a ref to the new_obj making sure to attach any stream */
+			pdf_update_object(ctx, dst, new_num, new_obj);
+			ref = pdf_new_indirect(ctx, dst, new_num, 0);
+			if (pdf_is_stream(ctx, obj_ref))
+			{
+				buffer = pdf_load_raw_stream(ctx, src, src_num, 0);
+				pdf_update_stream(ctx, dst, ref, buffer, 1);
+			}
+		}
+		fz_always(ctx)
+		{
+			/* Dropped here so that new_obj is released even when
+			 * pdf_update_object throws. */
+			pdf_drop_obj(ctx, new_obj);
+			fz_drop_buffer(ctx, buffer);
+		}
+		fz_catch(ctx)
+		{
+			pdf_drop_obj(ctx, ref);
+			fz_rethrow(ctx);
+		}
+		return ref;
+	}
+
+	if (pdf_is_dict(ctx, obj_ref))
+	{
+		fz_var(new_dict);
+
+		fz_try(ctx)
+		{
+			len = pdf_dict_len(ctx, obj_ref);
+			new_dict = pdf_new_dict(ctx, dst, len);
+
+			for (i = 0; i < len; i++)
+			{
+				key = pdf_dict_get_key(ctx, obj_ref, i);
+				val = pdf_dict_get_val(ctx, obj_ref, i);
+				pdf_dict_put_drop(ctx, new_dict, key, graft_mapped_object(ctx, dst, src, val, map));
+			}
+		}
+		fz_catch(ctx)
+		{
+			pdf_drop_obj(ctx, new_dict);
+			fz_rethrow(ctx);
+		}
+		return new_dict;
+	}
+
+	if (pdf_is_array(ctx, obj_ref))
+	{
+		fz_var(new_array);
+
+		fz_try(ctx)
+		{
+			/* Step through the array items handling indirect refs */
+			len = pdf_array_len(ctx, obj_ref);
+			new_array = pdf_new_array(ctx, dst, len);
+
+			for (i = 0; i < len; i++)
+			{
+				val = pdf_array_get(ctx, obj_ref, i);
+				pdf_array_push_drop(ctx, new_array, graft_mapped_object(ctx, dst, src, val, map));
+			}
+		}
+		fz_catch(ctx)
+		{
+			pdf_drop_obj(ctx, new_array);
+			fz_rethrow(ctx);
+		}
+		return new_array;
+	}
+
+	/* Not a reference or a container: share the object itself. */
+	return pdf_keep_obj(ctx, obj_ref);
+}
+
+/*
+	Graft an object from the source document into the destination.
+
+	dst: Document that receives the copy.
+
+	src: Document that obj_ref belongs to.
+
+	obj_ref: Object to copy. Everything reachable from it through
+	dictionaries, arrays and indirect references is copied as well.
+
+	map: Records which source objects have already been copied, so that
+	repeated calls sharing one map do not duplicate objects. May be NULL,
+	in which case a temporary map is created for this call only.
+
+	Returns a new reference owned by the caller. Throws if the copy
+	fails or if an indirect reference has an object number outside the
+	range covered by the map.
+*/
+pdf_obj *
+pdf_graft_object(fz_context *ctx, pdf_document *dst, pdf_document *src, pdf_obj *obj_ref, pdf_graft_map *map)
+{
+	pdf_obj *result = NULL;
+
+	if (map != NULL)
+		return graft_mapped_object(ctx, dst, src, obj_ref, map);
+
+	/* No map supplied: use a private one and release it on every exit path. */
+	map = pdf_new_graft_map(ctx, src);
+
+	fz_var(result);
+
+	fz_try(ctx)
+	{
+		result = graft_mapped_object(ctx, dst, src, obj_ref, map);
+	}
+	fz_always(ctx)
+	{
+		pdf_drop_graft_map(ctx, map);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+	return result;
+}
diff --git a/source/tools/mutool.c b/source/tools/mutool.c
index 1f163b21..fd38be4d 100644
--- a/source/tools/mutool.c
+++ b/source/tools/mutool.c
@@ -17,6 +17,7 @@ int pdfposter_main(int argc, char *argv[]);
int pdfshow_main(int argc, char *argv[]);
int pdfpages_main(int argc, char *argv[]);
int pdfcreate_main(int argc, char *argv[]);
+int pdfmerge_main(int argc, char *argv[]);
static struct {
int (*func)(int argc, char *argv[]);
@@ -32,6 +33,7 @@ static struct {
{ pdfposter_main, "poster", "split large page into many tiles" },
{ pdfshow_main, "show", "show internal pdf objects" },
{ pdfcreate_main, "create", "create pdf document" },
+ { pdfmerge_main, "merge", "merge pages from multiple pdf sources into a new pdf" },
};
static int
diff --git a/source/tools/pdfmerge.c b/source/tools/pdfmerge.c
new file mode 100644
index 00000000..27258a9e
--- /dev/null
+++ b/source/tools/pdfmerge.c
@@ -0,0 +1,228 @@
+/*
+ * PDF merge tool: Tool for merging pdf content.
+ *
+ * Simple test bed to work with merging pages from multiple PDFs into a single PDF.
+ */
+
+#include "mupdf/pdf.h"
+
+/* Write the command line synopsis to stderr, then exit with status 1. */
+static void usage(void)
+{
+	static const char help_text[] =
+		"usage: mutool merge [-o output.pdf] [-adlsz] input.pdf [pages] [input2.pdf] [pages2] ...\n"
+		"\t-o\tname of PDF file to create\n"
+		"\t-a\tascii hex encode binary streams\n"
+		"\t-d\tdecompress all streams\n"
+		"\t-l\tlinearize PDF\n"
+		"\t-s\tclean content streams\n"
+		"\t-z\tdeflate uncompressed streams\n"
+		"\tinput.pdf name of first PDF file from which we are copying pages\n";
+
+	fputs(help_text, stderr);
+	exit(1);
+}
+
+/* State shared by the helpers in this file; all three are set in pdfmerge_main. */
+static fz_context *ctx = NULL;
+/* Document the merged pages are inserted into. */
+static pdf_document *doc_des = NULL;
+/* Input document currently being read; replaced for each input file. */
+static pdf_document *doc_src = NULL;
+
+/*
+ * Return 1 if s consists only of digits, '-' and ',' (an empty string
+ * qualifies), 0 otherwise.
+ *
+ * This duplicates isrange in mudraw.c. It is not yet decided whether the
+ * two should be merged or whether this small amount of duplication is fine.
+ */
+static int isrange(char *s)
+{
+	const char *p;
+
+	for (p = s; *p != '\0'; p++)
+	{
+		int is_digit = (*p >= '0' && *p <= '9');
+
+		if (!is_digit && *p != '-' && *p != ',')
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Copy one page of doc_src into doc_des.
+ *
+ * page_from: 1-based page number in the source document.
+ * page_to: 1-based position at which the page is inserted in the destination.
+ * graft_map: map shared across calls so that objects used by several pages
+ * are only copied once.
+ *
+ * Only the entries listed in known_page_objs are copied, each grafted with
+ * everything it references. Exceptions are passed on to the caller.
+ */
+static void page_merge(int page_from, int page_to, pdf_graft_map *graft_map)
+{
+	pdf_obj *pageref = NULL;
+	pdf_obj *page_dict = NULL;
+	pdf_obj *handoff;
+	pdf_obj *src_val;
+	pdf_obj *obj = NULL, *ref = NULL;
+	/* Include minimal number of objects for page. Do not include items that
+	 * reference other pages */
+	pdf_obj *known_page_objs[] = { PDF_NAME_Contents, PDF_NAME_Resources,
+		PDF_NAME_MediaBox, PDF_NAME_CropBox, PDF_NAME_BleedBox, PDF_NAME_TrimBox,
+		PDF_NAME_ArtBox, PDF_NAME_Rotate, PDF_NAME_UserUnit};
+	int n = nelem(known_page_objs);
+	int i;
+	int num;
+
+	fz_var(page_dict);
+	fz_var(obj);
+	fz_var(ref);
+
+	fz_try(ctx)
+	{
+		pageref = pdf_lookup_page_obj(ctx, doc_src, page_from - 1);
+
+		/* Make a new dictionary and copy over the items from the source object to
+		 * the new dict that we want to deep copy. */
+		page_dict = pdf_new_dict(ctx, doc_des, 4);
+
+		pdf_dict_put_drop(ctx, page_dict, PDF_NAME_Type, PDF_NAME_Page);
+
+		for (i = 0; i < n; i++)
+		{
+			/* src_val is borrowed from the source page and must never be
+			 * dropped here, so it is kept apart from the owned pointers
+			 * released in fz_always. */
+			src_val = pdf_dict_get(ctx, pageref, known_page_objs[i]);
+			if (src_val != NULL)
+				pdf_dict_put_drop(ctx, page_dict, known_page_objs[i], pdf_graft_object(ctx, doc_des, doc_src, src_val, graft_map));
+		}
+
+		/* Add the dictionary. Ownership passes to pdf_add_object_drop, so
+		 * forget our pointer first to avoid dropping it a second time below.
+		 * NOTE(review): assumes pdf_add_object_drop also drops on failure - confirm. */
+		handoff = page_dict;
+		page_dict = NULL;
+		obj = pdf_add_object_drop(ctx, doc_des, handoff);
+
+		/* Get indirect ref */
+		num = pdf_to_num(ctx, obj);
+		ref = pdf_new_indirect(ctx, doc_des, num, 0);
+
+		/* Insert */
+		pdf_insert_page(ctx, doc_des, page_to - 1, ref);
+	}
+	fz_always(ctx)
+	{
+		/* page_dict is only non-NULL here if we failed before handing it over. */
+		pdf_drop_obj(ctx, page_dict);
+		pdf_drop_obj(ctx, obj);
+		pdf_drop_obj(ctx, ref);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
+
+/*
+ * Append the pages of doc_src selected by range to the end of doc_des.
+ *
+ * range is a comma separated list of items, each either a single page
+ * number or "first-last". A missing number on either side of the dash
+ * stands for the last page of the source. Page numbers are clamped to
+ * 1..page count. If first is not below last the pages are added counting
+ * downwards. The string is split in place by fz_strsep.
+ *
+ * One graft map is used for the whole range and released before returning,
+ * also when an exception is passed on.
+ */
+static void merge_range(char *range)
+{
+	int first, last, cur, step, remaining;
+	int src_total, des_total;
+	char *item, *hyphen;
+	pdf_graft_map *graft_map;
+
+	src_total = fz_count_pages(ctx, (fz_document*) doc_src);
+	des_total = fz_count_pages(ctx, (fz_document*) doc_des);
+	item = fz_strsep(&range, ",");
+	graft_map = pdf_new_graft_map(ctx, doc_src);
+
+	fz_try(ctx)
+	{
+		for (; item != NULL; item = fz_strsep(&range, ","))
+		{
+			hyphen = strchr(item, '-');
+
+			/* Left-hand side: empty means the last source page. */
+			first = (hyphen == item) ? src_total : atoi(item);
+
+			/* Right-hand side: absent dash means a single page, a bare
+			 * trailing dash means "through the last source page". */
+			if (hyphen == NULL)
+				last = first;
+			else if (hyphen[1] != '\0')
+				last = atoi(hyphen + 1);
+			else
+				last = src_total;
+
+			first = fz_clampi(first, 1, src_total);
+			last = fz_clampi(last, 1, src_total);
+
+			/* Walk from first to last inclusive, in whichever direction. */
+			if (first < last)
+			{
+				step = 1;
+				remaining = last - first + 1;
+			}
+			else
+			{
+				step = -1;
+				remaining = first - last + 1;
+			}
+
+			for (cur = first; remaining > 0; remaining--, cur += step)
+			{
+				page_merge(cur, des_total + 1, graft_map);
+				des_total++;
+			}
+		}
+	}
+	fz_always(ctx)
+	{
+		pdf_drop_graft_map(ctx, graft_map);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
+
+/*
+ * Entry point for "mutool merge".
+ *
+ * Parses the options, creates an empty destination document, then walks the
+ * remaining arguments: each input file may be followed by a page range (as
+ * recognised by isrange); without one, all pages ("1-") are merged. The
+ * result is saved to the -o file name, "out.pdf" by default.
+ *
+ * Returns 0 on success. Any failure prints a message to stderr and
+ * terminates the process with exit status 1.
+ */
+int pdfmerge_main(int argc, char **argv)
+{
+	pdf_write_options opts = { 0 };
+	char *output = "out.pdf";
+	char *infile_src;
+	int c;
+
+	while ((c = fz_getopt(argc, argv, "adlszo:")) != -1)
+	{
+		switch (c)
+		{
+		case 'o': output = fz_optarg; break;
+		case 'a': opts.do_ascii ++; break;
+		case 'd': opts.do_expand ^= PDF_EXPAND_ALL; break;
+		case 'l': opts.do_linear ++; break;
+		case 's': opts.do_clean ++; break;
+		case 'z': opts.do_deflate ++; break;
+		default: usage(); break;
+		}
+	}
+
+	if (fz_optind == argc)
+		usage();
+
+	ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
+	if (!ctx)
+	{
+		fprintf(stderr, "Cannot initialise context\n");
+		exit(1);
+	}
+
+	fz_try(ctx)
+	{
+		doc_des = pdf_create_document(ctx);
+	}
+	fz_catch(ctx)
+	{
+		fprintf(stderr, "Failed to allocate destination document file %s\n", output);
+		exit(1);
+	}
+
+	/* Step through the source files */
+	while (fz_optind < argc)
+	{
+		/* Fetch the name before fz_try: the catch block prints it, and a
+		 * local assigned inside the try is not reliable after the jump
+		 * into fz_catch. */
+		infile_src = argv[fz_optind++];
+
+		fz_try(ctx)
+		{
+			/* Release the previous input; clear the pointer so it is not
+			 * left dangling if opening the next file throws. */
+			pdf_drop_document(ctx, doc_src);
+			doc_src = NULL;
+			doc_src = pdf_open_document(ctx, infile_src);
+
+			if (fz_optind == argc || !isrange(argv[fz_optind]))
+				merge_range("1-");
+			else
+				merge_range(argv[fz_optind++]);
+		}
+		fz_catch(ctx)
+		{
+			fprintf(stderr, "Failed merging document %s\n", infile_src);
+			exit(1);
+		}
+	}
+
+	fz_try(ctx)
+	{
+		pdf_save_document(ctx, doc_des, output, &opts);
+	}
+	fz_always(ctx)
+	{
+		pdf_drop_document(ctx, doc_des);
+		pdf_drop_document(ctx, doc_src);
+	}
+	fz_catch(ctx)
+	{
+		fprintf(stderr, "Error encountered during file save.\n");
+		exit(1);
+	}
+	fz_flush_warnings(ctx);
+	fz_drop_context(ctx);
+
+	return 0;
+}