From 927d36d58bf0896c2ab8b470f926e1ed1a736561 Mon Sep 17 00:00:00 2001
From: Michael Vrhel <michael.vrhel@artifex.com>
Date: Wed, 4 Nov 2015 15:05:22 -0800
Subject: Add mutool create tool, and PDF font and image resource creation.

Initial framework for creating pdfs

This adds a create option to mutool for us to use in working
on the API for creating content as well as adding content to
existing documents.

mutool create: Get page sizes and add them

Start parsing the contents.txt file, which may contain
information for multiple pages.  Add the pages at the proper sizes.

Further work on mutool create_pdf

Remove the calls that were being made to the pdf-write device.
Clean up several issues with the reading of the page contents.
Get the content streams for each page associated with the page->contents.
Temporarily created a pdf_create_page_contents procedure.  I will merge
this with pdf_create_page as there is significant overlap.

Next is to add in the font and image resources and indirect references.

Include pdfcreate in build

Merge pdf_create_page_contents and pdf_create_page

Add support for images in pdfcreate

This adds images to the pdf document using a function stolen from pdf-device (send_image).
This was renamed pdf_add_image_res and added to pdf-image.   Down the road, send-image will
be removed.  Prior to that, I need to work on making sure that multiple copies of the same
image do not end up in the document.

Code was also added to create the page resources to point to the proper image in the document.

Next fonts will be added in a similar manner, then I will work on computing the md5 sums of
image and fonts to ensure only one copy ends up in the document.  Then pdf-write will be
reworked to use the same code as opposed to its current list of md5 sums that are stored in
a device structure.

mutool pdfcreate: support for WinAnsiEncoded fonts

Added support for very simple fonts (WinAnsiEncoding).  Methods
added in pdf-font.c.  Added first_width and last_width to fz_font_s
and stem_v to pdf_font_desc_s.

Ran code through memento with simple test of 4 page document
creation including an image and a font.  Fixed several leaks
as well as buffer corruption issues (main changes in pdfcreate).
Thanks to Robin for the help with Memento in finding leaks.

Added StemV to pdf names as it was needed for the font descriptor creation.

Fix for pdf_write_document rename to pdf_save_document

Add resource_ids to pdf document structure

The purpose of this structure will be to allow the search
and reuse of resources when we attempt to add new ones
to the document.

Fix name changes from recent updates

pdf_create branch updated to work with recent changes in master

Initial use of hash table for resources

To avoid adding in the same resource this adds a
resource_tables member to pdf_document.  The
resource_tables structure consists of multiple
fz_hash_table entries, one for each resource type.
When an attempt is made to search for an existing
resource, the table will be initialized in a brute
force search for existing resources.  Currently this
is only set up for the image resources and accessed
through pdf_add_image_res.  If a match is found,
the reference object is returned.  If no match is found
NULL is returned and the ref object created in pdf_add_image_res
is added into the hash table.  In this case, a command line
such as
create -o output.pdf -f F0:font.ttf -i Im0:image.jpg -i Im1:image1.jpg \\
-i Im2:image.jpg contents.txt
will avoid the insertion of two copies of image.jpg into the
output PDF document.

CID Identity-H Font added for handling ttf

This adds a method for adding a ttf to a PDF as a
CID font with Identity-H mapping and a ToUnicode
entry that is created using FT_Get_Char_Index.

This takes much care in the creation of the ToUnicode
CMap to ensure that the minimum number of entries
are created in that we try to use beginbfrange as
much as possible before using beginbfchar.  The
code makes sure to limit the number of entries in
a group to 100 and to not cross first-byte boundaries
for the CID values as described in the Adobe
Technical note 5411.

Add missing file pdf-resources.c

pdf-resources.c was missing and should have been
committed earlier.  Added to windows project file.
Not sure where else it needs to be added for the
other platforms.

Clean up names and spacing

Make sure that the visible functions have the proper namespace (e.g. pdf_xxxx)
Also make sure we have a blank line prior to comment.
Be consistent with static function naming in pdf_resources.c

pdfwrite make use of image resource fz_hash_table

The pdfwrite device now shares the structure that stores the
resource images for pdfcreate.   With this fix, pdfwrite now
avoids duplicating the writing of the same images that are
shared across multiple pages.

Add missing file pdf-resources.c

Initial work toward having pdfwrite use Identity-H Type0 encoding for fonts

Finish of CID type0 Identity-H font for pdfwrite

This adds in the proper widths, which may have been stored in the source font
in the width table (parsed from the W entry in the pdf file); or, if the
FreeType structure has its own cmap, we can get the width from FreeType.
Widths are restructured into the format described in 5.6.3 of the PDF spec.

Fix issue from conflict merging and multiple define of structure

Clean up warnings and make mutool create use simple font
---
 source/tools/mudraw.c    |   2 +-
 source/tools/mutool.c    |   2 +
 source/tools/pdfcreate.c | 516 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 519 insertions(+), 1 deletion(-)
 create mode 100644 source/tools/pdfcreate.c

(limited to 'source/tools')

diff --git a/source/tools/mudraw.c b/source/tools/mudraw.c
index 47aab044..d63f936c 100644
--- a/source/tools/mudraw.c
+++ b/source/tools/mudraw.c
@@ -387,7 +387,7 @@ static void drawpage(fz_context *ctx, fz_document *doc, int pagenum)
 		tbounds = bounds;
 		fz_transform_rect(&tbounds, &ctm);
 
-		newpage = pdf_create_page(ctx, pdfout, bounds, 72, 0);
+		newpage = pdf_create_page(ctx, pdfout, bounds, NULL, 0);
 
 		fz_try(ctx)
 		{
diff --git a/source/tools/mutool.c b/source/tools/mutool.c
index 6b61dab4..c662ffef 100644
--- a/source/tools/mutool.c
+++ b/source/tools/mutool.c
@@ -15,6 +15,7 @@ int pdfinfo_main(int argc, char *argv[]);
 int pdfposter_main(int argc, char *argv[]);
 int pdfshow_main(int argc, char *argv[]);
 int pdfpages_main(int argc, char *argv[]);
+int pdfcreate_main(int argc, char *argv[]);
 
 static struct {
 	int (*func)(int argc, char *argv[]);
@@ -28,6 +29,7 @@ static struct {
 	{ pdfpages_main, "pages", "show information about pdf pages" },
 	{ pdfposter_main, "poster", "split large page into many tiles" },
 	{ pdfshow_main, "show", "show internal pdf objects" },
+	{ pdfcreate_main, "create", "create pdf document" },
 };
 
 static int
diff --git a/source/tools/pdfcreate.c b/source/tools/pdfcreate.c
new file mode 100644
index 00000000..4e4a622e
--- /dev/null
+++ b/source/tools/pdfcreate.c
@@ -0,0 +1,516 @@
+/*
+ * PDF creation tool: Tool for creating pdf content.
+ *
+ * Simple test bed to work with adding content and creating PDFs
+ *
+ */
+
+#include "mupdf/pdf.h"
+
+#define MAX_IMAGES 10
+#define MAX_FONTS 10
+#define MAX_REF_NAME 32
+
+enum { RES_FONT, RES_XOBJECT };
+
+/* Print the command-line synopsis to stderr and terminate with exit status 1. */
+static void usage(void)
+{
+	fputs("usage: mutool create [-o output.pdf] [fonts][images] contents\n"
+		"\t-f\tfont label:font file\n"
+		"\t-i\timage label:image file\n"
+		"\tcontents file, defines page size, graphics, references fonts and images\n",
+		stderr);
+	exit(1);
+}
+
+/* Simple structures to hold and manage contents */
+typedef struct doc_content_s doc_content;
+typedef struct resources_s resources;
+typedef struct page_resource_xref_s page_resource_xref;
+
+struct page_resource_xref_s /* one font or image used by one page */
+{
+	char *res_name; /* points at the name of the matching resources entry (not copied) */
+	char res_ref[MAX_REF_NAME]; /* label part of that name; used as the key in the page resource dictionary */
+	pdf_obj *obj; /* object stored under res_ref; NULL until update_res fills it in */
+	int type; /* RES_FONT or RES_XOBJECT */
+};
+
+struct resources_s /* one -f or -i command line entry */
+{
+	char *name; /* the option argument, in the form "label:file" */
+	int ref; /* set to 1 once the label has been found in the contents */
+};
+
+struct doc_content_s /* parsed state of the contents file */
+{
+	fz_stream *stm; /* the opened contents file */
+	int num_pages; /* page count taken from the /Pages line */
+	fz_point *page_sizes; /* per page: the [X Y] size from its /Page line */
+	fz_off_t *content_offsets; /* per page: stream offset just after its /Page line */
+	int *content_lengths; /* per page: number of bytes of content */
+	resources *fonts; /* caller's font entries (not owned) */
+	int num_fonts; /* number of entries in fonts */
+	resources *images; /* caller's image entries (not owned) */
+	int num_images; /* number of entries in images */
+	page_resource_xref **ref_im_resources; /* per page: array of image references */
+	page_resource_xref **ref_font_resources; /* per page: array of font references */
+	int *num_page_im_res; /* per page: used entries in ref_im_resources */
+	int *num_page_font_res; /* per page: used entries in ref_font_resources */
+};
+
+/* Look for image or font references in one line of page content.
+ *
+ * resource/num_res are the -f or -i entries ("label:file"), buffer is the
+ * content line and res_type is RES_FONT or RES_XOBJECT. An entry whose label
+ * occurs in buffer (plain substring search) gets ref = 1 and is recorded once
+ * in the font or image list of page page_num. Lines outside any page
+ * (page_num out of range) are ignored. Throws if an entry has no ':' or its
+ * label does not fit in MAX_REF_NAME. */
+static void check_for_reference(fz_context *ctx, resources *resource, int num_res, char *buffer, int res_type, doc_content *doc, int page_num)
+{
+	int i, k;
+	char *pch;
+	size_t length;
+	char name[MAX_REF_NAME];
+	page_resource_xref *refs;
+	int *count;
+
+	if (page_num < 0 || page_num >= doc->num_pages)
+		return;
+	refs = (res_type == RES_FONT) ? doc->ref_font_resources[page_num] : doc->ref_im_resources[page_num];
+	count = (res_type == RES_FONT) ? &doc->num_page_font_res[page_num] : &doc->num_page_im_res[page_num];
+
+	for (i = 0; i < num_res; i++)
+	{
+		pch = strchr(resource[i].name, ':');
+		if (pch == NULL)
+			fz_throw(ctx, FZ_ERROR_GENERIC, "Internal parsing error");
+		length = (size_t)(pch - resource[i].name);
+		if (length >= sizeof(name))
+			fz_throw(ctx, FZ_ERROR_GENERIC, "Image/Font indirect name too long");
+		memcpy(name, resource[i].name, length);
+		name[length] = 0;
+		if (strstr(buffer, name) == NULL)
+			continue;
+
+		/* Resource reference is in the content for this document. Mark
+		 * it as such so that we know to add it to the document */
+		resource[i].ref = 1;
+
+		/* Note it for this page too, unless an earlier line already did;
+		 * appending on every hit would overrun the per-page array. */
+		for (k = 0; k < *count; k++)
+			if (refs[k].res_name == resource[i].name)
+				break;
+		if (k < *count)
+			continue;
+		if (*count >= MAX_FONTS) /* both per-page arrays hold MAX_FONTS entries */
+			fz_throw(ctx, FZ_ERROR_GENERIC, "Too many resources on page");
+		refs[k].obj = NULL; /* Set later */
+		refs[k].res_name = resource[i].name;
+		refs[k].type = res_type;
+		/* Copy only the label and its terminator, not the unused tail of name. */
+		memcpy(refs[k].res_ref, name, length + 1);
+		*count += 1;
+	}
+}
+
+/* Drop the pdf_obj held by each of the first num_refs entries of refs. */
+static void drop_ref_objs(fz_context *ctx, page_resource_xref *refs, int num_refs)
+{
+	int idx = 0;
+	while (idx < num_refs)
+		pdf_drop_obj(ctx, refs[idx++].obj);
+}
+
+/* Free everything content owns and reset the freed members, so that a second
+ * call (init_parse_contents on error, then create_pdf) does not double free. */
+static void free_contents(fz_context *ctx, doc_content *content)
+{
+	int i;
+	for (i = 0; content->ref_font_resources && content->ref_im_resources && i < content->num_pages; i++)
+	{
+		drop_ref_objs(ctx, content->ref_font_resources[i], content->num_page_font_res[i]);
+		fz_free(ctx, content->ref_font_resources[i]);
+		drop_ref_objs(ctx, content->ref_im_resources[i], content->num_page_im_res[i]);
+		fz_free(ctx, content->ref_im_resources[i]);
+	}
+	fz_free(ctx, content->ref_font_resources);
+	fz_free(ctx, content->ref_im_resources);
+	fz_free(ctx, content->num_page_im_res);
+	fz_free(ctx, content->num_page_font_res);
+	fz_free(ctx, content->content_offsets);
+	fz_free(ctx, content->content_lengths);
+	fz_free(ctx, content->page_sizes);
+	fz_drop_stream(ctx, content->stm);
+	content->ref_font_resources = content->ref_im_resources = NULL;
+	content->num_page_im_res = content->num_page_font_res = content->content_lengths = NULL;
+	content->content_offsets = NULL;
+	content->page_sizes = NULL;
+	content->stm = NULL;
+}
+
+/* This is a VERY simple format to give us something to play with
+ * in terms of defining pages and content. Here we parse the contents
+ * defining our page sizes and the command locations for each page.
+ * The format is as follows:
+ * 1) Comment lines are preceded by %
+ * 2) Number of pages is indicated at the beginning with /Pages #
+ * 3) Each page is indicated by /Page # [X Y] (# is zero based)
+ * 4) The content is in the form of simple PDF content stream that
+ *    may include various drawing commands and reference the
+ *    image and font resources.
+ * Parsing ends at the first line read back empty; errors are rethrown after cleanup. */
+static void init_parse_contents(fz_context *ctx, char *content_fn, doc_content *content)
+{
+	fz_stream *stm = NULL; /* the catch block drops it even if the open failed */
+	char buf[1024];
+	int page_count = 0;
+	fz_off_t pre_off;
+	int i, num_pages;
+
+	fz_var(stm);
+
+	fz_try(ctx)
+	{
+		stm = fz_open_file(ctx, content_fn);
+		while (1)
+		{
+			pre_off = fz_tell(ctx, stm);
+			fz_read_line(ctx, stm, buf, sizeof buf);
+			if (buf[0] == '\0')
+			{
+				/* Need at least one page, and a /Page line for every declared page. */
+				if (page_count == 0 || content->num_pages > page_count)
+					fz_throw(ctx, FZ_ERROR_GENERIC, "Missing defined pages");
+				content->content_lengths[page_count - 1] =
+					(int)(fz_tell(ctx, stm) - content->content_offsets[page_count - 1]);
+				break;
+			}
+			if (buf[0] == '%')
+				continue;
+			if (strncmp(buf, "/Pages", strlen("/Pages")) == 0)
+			{
+				/* A second /Pages line would leak and resize the arrays below. */
+				num_pages = atoi(&(buf[strlen("/Pages")]));
+				if (num_pages <= 0 || content->num_pages != 0)
+					fz_throw(ctx, FZ_ERROR_GENERIC, "Page count invalid");
+				content->content_offsets = fz_malloc_array(ctx, num_pages, sizeof(fz_off_t));
+				content->content_lengths = fz_malloc_array(ctx, num_pages, sizeof(int));
+				content->page_sizes = fz_malloc_array(ctx, num_pages, sizeof(fz_point));
+				content->num_page_im_res = fz_calloc(ctx, num_pages, sizeof(int));
+				content->num_page_font_res = fz_calloc(ctx, num_pages, sizeof(int));
+				content->ref_font_resources = fz_calloc(ctx, num_pages, sizeof(page_resource_xref*));
+				content->ref_im_resources = fz_calloc(ctx, num_pages, sizeof(page_resource_xref*));
+				content->num_pages = num_pages; /* set only once the tables free_contents walks exist */
+				for (i = 0; i < num_pages; i++)
+				{
+					content->ref_font_resources[i] = fz_malloc_array(ctx, MAX_FONTS, sizeof(page_resource_xref));
+					content->ref_im_resources[i] = fz_malloc_array(ctx, MAX_FONTS, sizeof(page_resource_xref));
+				}
+			}
+			else if (strncmp(buf, "/Page", strlen("/Page")) == 0)
+			{
+				int page_num;
+				if (page_count >= content->num_pages) /* more pages than declared, or /Page before /Pages */
+					fz_throw(ctx, FZ_ERROR_GENERIC, "Page value invalid");
+				if (page_count > 0)
+					content->content_lengths[page_count - 1] =
+						(int)(pre_off - content->content_offsets[page_count - 1]);
+				if (sscanf(&(buf[strlen("/Page")]), "%d [%f %f]", &page_num,
+					&(content->page_sizes[page_count].x), &(content->page_sizes[page_count].y)) != 3)
+					fz_throw(ctx, FZ_ERROR_GENERIC, "Page size invalid");
+				if (page_num < 0 || page_num >= content->num_pages)
+					fz_throw(ctx, FZ_ERROR_GENERIC, "Page value invalid");
+				if (content->page_sizes[page_count].x < 0 || content->page_sizes[page_count].y < 0)
+					fz_throw(ctx, FZ_ERROR_GENERIC, "Page dimensions invalid");
+				content->content_offsets[page_count] = fz_tell(ctx, stm);
+				page_count++;
+			}
+			else if (page_count > 0) /* content before the first /Page belongs to no page */
+			{
+				check_for_reference(ctx, content->images, content->num_images, buf, RES_XOBJECT, content, page_count - 1);
+				check_for_reference(ctx, content->fonts, content->num_fonts, buf, RES_FONT, content, page_count - 1);
+			}
+		}
+	}
+	fz_catch(ctx)
+	{
+		free_contents(ctx, content);
+		fz_drop_stream(ctx, stm);
+		fz_rethrow(ctx);
+	}
+	content->stm = stm;
+}
+
+/* Return page_num's content length; if buffer is non-NULL also read the content into it. */
+static int get_page_contents(fz_context *ctx, int page_num, doc_content *content,
+	unsigned char *buffer)
+{
+	int nbytes = content->content_lengths[page_num];
+	if (buffer != NULL)
+	{
+		fz_seek(ctx, content->stm, content->content_offsets[page_num], SEEK_SET);
+		fz_read(ctx, content->stm, buffer, nbytes);
+	}
+	return nbytes;
+}
+
+/* Store obj in every per-page entry whose res_name equals res_name. */
+static void update_res(fz_context *ctx, int num_pages, int *num_res, page_resource_xref **page_resource,
+	char *res_name, pdf_obj *obj)
+{
+	int page, slot;
+	for (page = 0; page < num_pages; page++)
+		for (slot = 0; slot < num_res[page]; slot++)
+			if (!strcmp(page_resource[page][slot].res_name, res_name))
+				page_resource[page][slot].obj = obj;
+}
+
+/* Build one page's resource dictionary: each ref_res entry is stored with pdf_dict_putp
+ * under the path "<type>/<res_ref>". Returns a new dict (caller drops it) or NULL if num_items <= 0. */
+static pdf_obj* create_page_res_dict(fz_context *ctx, pdf_document *pdf,
+	page_resource_xref *ref_res, const char type[], int num_items)
+{
+	pdf_obj *dict = NULL;
+	char text[MAX_REF_NAME + 16]; /* "XObject/" plus the longest res_ref; 32 bytes cut long labels short */
+	int i;
+
+	if (num_items <= 0)
+		return NULL;
+	fz_var(dict);
+	fz_try(ctx)
+	{
+		dict = pdf_new_dict(ctx, pdf, num_items);
+		for (i = 0; i < num_items; i++)
+		{
+			snprintf(text, sizeof(text), "%s/%s", type, ref_res[i].res_ref);
+			pdf_dict_putp(ctx, dict, text, ref_res[i].obj);
+		}
+	}
+	fz_catch(ctx)
+	{
+		pdf_drop_obj(ctx, dict);
+		fz_rethrow(ctx);
+	}
+	return dict;
+}
+
+/* Create the PDF described by the contents file and save it as output.
+ *
+ * fonts/images are the -f/-i command line entries ("label:file"); only those
+ * whose label occurs in the contents are loaded and added to the document.
+ * Each page gets its content stream plus a single Resources dictionary.
+ * Returns 0; on error the local resources are released and it is rethrown. */
+static int create_pdf(fz_context *ctx, char *output, resources fonts[], int num_fonts,
+	resources images[], int num_images, char *contents)
+{
+	fz_rect bounds;
+	pdf_document *pdf = NULL;
+	pdf_page *newpage = NULL;
+	unsigned char *buffer = NULL;
+	fz_buffer *fz_buf = NULL;
+	fz_buffer *im_font_buff = NULL;
+	fz_image *image = NULL;
+	pdf_obj *font_dict = NULL;
+	pdf_obj *im_dict = NULL;
+	pdf_res *im_res = NULL;
+	pdf_res *font_res = NULL;
+	pdf_write_options opts = { 0 };
+	doc_content content = { 0 };
+	int k;
+	int length;
+	char *pch;
+
+	fz_var(pdf);
+	fz_var(newpage);
+	fz_var(buffer);
+	fz_var(fz_buf);
+	fz_var(im_font_buff);
+	fz_var(image);
+	fz_var(font_dict);
+	fz_var(im_dict);
+	fz_var(im_res);
+	fz_var(font_res);
+
+	fz_try(ctx)
+	{
+		pdf = pdf_create_document(ctx);
+		content.num_fonts = num_fonts;
+		content.num_images = num_images;
+		content.fonts = fonts;
+		content.images = images;
+		init_parse_contents(ctx, contents, &content);
+
+		/* Add the resources, getting the reference numbers in the process. */
+		for (k = 0; k < content.num_images; k++)
+		{
+			if (!content.images[k].ref)
+				continue;
+			/* Get the fz_image */
+			pch = strchr(content.images[k].name, ':');
+			if (pch == NULL)
+				fz_throw(ctx, FZ_ERROR_GENERIC, "Image indirect name too long");
+			im_font_buff = fz_read_file(ctx, &(pch[1]));
+			image = fz_new_image_from_buffer(ctx, im_font_buff);
+			fz_drop_buffer(ctx, im_font_buff);
+			im_font_buff = NULL;
+			im_res = pdf_add_image_res(ctx, pdf, image, 0);
+			fz_drop_image(ctx, image);
+			image = NULL;
+
+			/* Look through our image page resources and update the
+			 * indirect reference number. Here we don't use the numbers
+			 * set by the doc resource holder (i.e im_res->num) since we
+			 * are using our own content specified for pdfcreate */
+			update_res(ctx, content.num_pages, content.num_page_im_res,
+				content.ref_im_resources, content.images[k].name, im_res->obj);
+		}
+		for (k = 0; k < content.num_fonts; k++)
+		{
+			if (!content.fonts[k].ref)
+				continue;
+			pch = strchr(content.fonts[k].name, ':');
+			if (pch == NULL)
+				fz_throw(ctx, FZ_ERROR_GENERIC, "Font indirect name too long");
+			im_font_buff = fz_read_file(ctx, &(pch[1]));
+			font_res = pdf_add_simple_font_res(ctx, pdf, im_font_buff);
+			fz_drop_buffer(ctx, im_font_buff);
+			im_font_buff = NULL;
+
+			/* Look through our font page resources and update the indirect
+			 * reference number */
+			update_res(ctx, content.num_pages, content.num_page_font_res,
+				content.ref_font_resources, content.fonts[k].name, font_res->obj);
+		}
+
+		/* Now the page contents */
+		for (k = 0; k < content.num_pages; k++)
+		{
+			bounds.x0 = 0;
+			bounds.y0 = 0;
+			bounds.x1 = content.page_sizes[k].x;
+			bounds.y1 = content.page_sizes[k].y;
+
+			length = get_page_contents(ctx, k, &content, NULL);
+			if (length > 0)
+			{
+				buffer = fz_malloc(ctx, length);
+				length = get_page_contents(ctx, k, &content, buffer);
+				fz_buf = fz_new_buffer_from_data(ctx, buffer, length);
+				buffer = NULL;
+				newpage = pdf_create_page(ctx, pdf, bounds, fz_buf, 0);
+				/* Create the dicts for the page resources */
+				font_dict = create_page_res_dict(ctx, pdf, content.ref_font_resources[k],
+					"Font", content.num_page_font_res[k]);
+				im_dict = create_page_res_dict(ctx, pdf, content.ref_im_resources[k],
+					"XObject", content.num_page_im_res[k]);
+				/* The page has one Resources entry: writing both dicts to it would
+				 * discard the first, so move the Font part into the XObject dict. */
+				if (im_dict != NULL && font_dict != NULL)
+					pdf_dict_puts(ctx, im_dict, "Font", pdf_dict_gets(ctx, font_dict, "Font"));
+				if (im_dict != NULL || font_dict != NULL)
+					pdf_dict_puts(ctx, newpage->me, "Resources", im_dict != NULL ? im_dict : font_dict);
+				pdf_drop_obj(ctx, im_dict);
+				pdf_drop_obj(ctx, font_dict);
+				im_dict = font_dict = NULL;
+				fz_drop_buffer(ctx, fz_buf);
+				fz_buf = NULL;
+			}
+			else
+			{
+				newpage = pdf_create_page(ctx, pdf, bounds, NULL, 0);
+			}
+			pdf_insert_page(ctx, pdf, newpage, INT_MAX);
+			pdf_drop_page(ctx, newpage);
+			newpage = NULL;
+		}
+		pdf_save_document(ctx, pdf, output, &opts);
+	}
+	fz_always(ctx)
+	{
+		pdf_drop_page(ctx, newpage);
+		pdf_close_document(ctx, pdf);
+		free_contents(ctx, &content);
+		fz_free(ctx, buffer);
+		fz_drop_buffer(ctx, fz_buf);
+		fz_drop_buffer(ctx, im_font_buff);
+		fz_drop_image(ctx, image);
+		/* Still non-NULL only when an error interrupted the page loop. */
+		pdf_drop_obj(ctx, im_dict);
+		pdf_drop_obj(ctx, font_dict);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+	return 0;
+}
+
+/* Entry point of "mutool create": collects the -f, -i and -o options, takes
+ * the next argument as the contents file and runs create_pdf on them.
+ * Returns 0 on success, 1 if create_pdf threw. */
+int pdfcreate_main(int argc, char **argv)
+{
+	resources fonts[MAX_FONTS];
+	resources images[MAX_IMAGES];
+	char *outfile = "out.pdf";
+	char *contents;
+	int nfonts = 0, nimages = 0;
+	int opt, failed = 0;
+	fz_context *ctx;
+
+	while ((opt = fz_getopt(argc, argv, "f:i:o:")) != -1)
+	{
+		if (opt == 'f')
+		{
+			if (nfonts == MAX_FONTS)
+			{
+				fprintf(stderr, "max number of fonts exceeded\n");
+				exit(1);
+			}
+			fonts[nfonts].name = fz_optarg;
+			fonts[nfonts++].ref = 0;
+		}
+		else if (opt == 'i')
+		{
+			if (nimages == MAX_IMAGES)
+			{
+				fprintf(stderr, "max number of images exceeded\n");
+				exit(1);
+			}
+			images[nimages].name = fz_optarg;
+			images[nimages++].ref = 0;
+		}
+		else if (opt == 'o')
+			outfile = fz_optarg;
+		else
+			usage();
+	}
+
+	if (fz_optind == argc)
+		usage();
+
+	contents = argv[fz_optind++];
+
+	ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
+	if (!ctx)
+	{
+		fprintf(stderr, "cannot initialise context\n");
+		exit(1);
+	}
+
+	fz_try(ctx)
+	{
+		create_pdf(ctx, outfile, fonts, nfonts, images, nimages, contents);
+	}
+	fz_catch(ctx)
+	{
+		failed = 1;
+	}
+	fz_drop_context(ctx);
+
+	return failed;
+}
-- 
cgit v1.2.3