Move guts of pdfclean into new pdf_write function.

Expose pdf_write function through the document interface.
author: Robin Watts <robin.watts@artifex.com> 2012-04-25 20:40:29 +0100
committer: Robin Watts <robin.watts@artifex.com> 2012-04-28 09:44:49 +0100
commit: 4e3d351910f0927283fdc6911574ffe982ca678b (patch)
tree: 854e8367ae8906f957c3bb766ae578188e38cd61
parent: 463e9b6c69cf9232da5ae9659eaad6e7050b594d (diff)
download: mupdf-4e3d351910f0927283fdc6911574ffe982ca678b.tar.xz
7 files changed, 735 insertions, 643 deletions
diff --git a/apps/mupdfclean.c b/apps/mupdfclean.c
index 5b95fdae..f2676782 100644
--- a/apps/mupdfclean.c
+++ b/apps/mupdfclean.c
@@ -14,29 +14,13 @@
 
 static FILE *out = NULL;
 
-enum
-{
-	expand_images = 1,
-	expand_fonts = 2,
-	expand_all = -1
-};
-
-static char *uselist = NULL;
-static int *ofslist = NULL;
-static int *genlist = NULL;
-static int *renumbermap = NULL;
-
-static int dogarbage = 0;
-static int doexpand = 0;
-static int doascii = 0;
-
 static pdf_document *xref = NULL;
 static fz_context *ctx = NULL;
 
 static void usage(void)
 {
 	fprintf(stderr,
-		"usage: pdfclean [options] input.pdf [output.pdf] [pages]\n"
+		"usage: mupdfclean [options] input.pdf [output.pdf] [pages]\n"
 		"\t-p -\tpassword\n"
 		"\t-g\tgarbage collect unused objects\n"
 		"\t-gg\tin addition to -g compact xref table\n"
@@ -50,251 +34,6 @@ static void usage(void)
 }
 
 /*
- * Garbage collect objects not reachable from the trailer.
- */
-
-static pdf_obj *sweepref(pdf_obj *obj)
-{
-	int num = pdf_to_num(obj);
-	int gen = pdf_to_gen(obj);
-
-	if (num < 0 || num >= xref->len)
-		return NULL;
-	if (uselist[num])
-		return NULL;
-
-	uselist[num] = 1;
-
-	/* Bake in /Length in stream objects */
-	fz_try(ctx)
-	{
-		if (pdf_is_stream(xref, num, gen))
-		{
-			pdf_obj *len = pdf_dict_gets(obj, "Length");
-			if (pdf_is_indirect(len))
-			{
-				uselist[pdf_to_num(len)] = 0;
-				len = pdf_resolve_indirect(len);
-				pdf_dict_puts(obj, "Length", len);
-			}
-		}
-	}
-	fz_catch(ctx)
-	{
-		/* Leave broken */
-	}
-
-	return pdf_resolve_indirect(obj);
-}
-
-static void sweepobj(pdf_obj *obj)
-{
-	int i;
-
-	if (pdf_is_indirect(obj))
-		obj = sweepref(obj);
-
-	if (pdf_is_dict(obj))
-	{
-		int n = pdf_dict_len(obj);
-		for (i = 0; i < n; i++)
-			sweepobj(pdf_dict_get_val(obj, i));
-	}
-
-	else if (pdf_is_array(obj))
-	{
-		int n = pdf_array_len(obj);
-		for (i = 0; i < n; i++)
-			sweepobj(pdf_array_get(obj, i));
-	}
-}
-
-/*
- * Scan for and remove duplicate objects (slow)
- */
-
-static void removeduplicateobjs(void)
-{
-	int num, other;
-
-	for (num = 1; num < xref->len; num++)
-	{
-		/* Only compare an object to objects preceding it */
-		for (other = 1; other < num; other++)
-		{
-			pdf_obj *a, *b;
-
-			if (num == other || !uselist[num] || !uselist[other])
-				continue;
-
-			/*
-			 * Comparing stream objects data contents would take too long.
-			 *
-			 * pdf_is_stream calls pdf_cache_object and ensures
-			 * that the xref table has the objects loaded.
-			 */
-			fz_try(ctx)
-			{
-				if (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0))
-					continue;
-			}
-			fz_catch(ctx)
-			{
-				/* Assume different */
-			}
-
-			a = xref->table[num].obj;
-			b = xref->table[other].obj;
-
-			a = pdf_resolve_indirect(a);
-			b = pdf_resolve_indirect(b);
-
-			if (pdf_objcmp(a, b))
-				continue;
-
-			/* Keep the lowest numbered object */
-			renumbermap[num] = MIN(num, other);
-			renumbermap[other] = MIN(num, other);
-			uselist[MAX(num, other)] = 0;
-
-			/* One duplicate was found, do not look for another */
-			break;
-		}
-	}
-}
-
-/*
- * Renumber objects sequentially so the xref is more compact
- */
-
-static void compactxref(void)
-{
-	int num, newnum;
-
-	/*
-	 * Update renumbermap in-place, clustering all used
-	 * objects together at low object ids. Objects that
-	 * already should be renumbered will have their new
-	 * object ids be updated to reflect the compaction.
-	 */
-
-	newnum = 1;
-	for (num = 1; num < xref->len; num++)
-	{
-		if (uselist[num] && renumbermap[num] == num)
-			renumbermap[num] = newnum++;
-		else if (renumbermap[num] != num)
-			renumbermap[num] = renumbermap[renumbermap[num]];
-	}
-}
-
-/*
- * Update indirect objects according to renumbering established when
- * removing duplicate objects and compacting the xref.
- */
-
-static void renumberobj(pdf_obj *obj)
-{
-	int i;
-	fz_context *ctx = xref->ctx;
-
-	if (pdf_is_dict(obj))
-	{
-		int n = pdf_dict_len(obj);
-		for (i = 0; i < n; i++)
-		{
-			pdf_obj *key = pdf_dict_get_key(obj, i);
-			pdf_obj *val = pdf_dict_get_val(obj, i);
-			if (pdf_is_indirect(val))
-			{
-				val = pdf_new_indirect(ctx, renumbermap[pdf_to_num(val)], 0, xref);
-				fz_dict_put(obj, key, val);
-				pdf_drop_obj(val);
-			}
-			else
-			{
-				renumberobj(val);
-			}
-		}
-	}
-
-	else if (pdf_is_array(obj))
-	{
-		int n = pdf_array_len(obj);
-		for (i = 0; i < n; i++)
-		{
-			pdf_obj *val = pdf_array_get(obj, i);
-			if (pdf_is_indirect(val))
-			{
-				val = pdf_new_indirect(ctx, renumbermap[pdf_to_num(val)], 0, xref);
-				pdf_array_put(obj, i, val);
-				pdf_drop_obj(val);
-			}
-			else
-			{
-				renumberobj(val);
-			}
-		}
-	}
-}
-
-static void renumberobjs(void)
-{
-	pdf_xref_entry *oldxref;
-	int newlen;
-	int num;
-
-	/* Apply renumber map to indirect references in all objects in xref */
-	renumberobj(xref->trailer);
-	for (num = 0; num < xref->len; num++)
-	{
-		pdf_obj *obj = xref->table[num].obj;
-
-		if (pdf_is_indirect(obj))
-		{
-			obj = pdf_new_indirect(ctx, renumbermap[pdf_to_num(obj)], 0, xref);
-			pdf_update_object(xref, num, 0, obj);
-			pdf_drop_obj(obj);
-		}
-		else
-		{
-			renumberobj(obj);
-		}
-	}
-
-	/* Create new table for the reordered, compacted xref */
-	oldxref = xref->table;
-	xref->table = fz_malloc_array(xref->ctx, xref->len, sizeof(pdf_xref_entry));
-	xref->table[0] = oldxref[0];
-
-	/* Move used objects into the new compacted xref */
-	newlen = 0;
-	for (num = 1; num < xref->len; num++)
-	{
-		if (uselist[num])
-		{
-			if (newlen < renumbermap[num])
-				newlen = renumbermap[num];
-			xref->table[renumbermap[num]] = oldxref[num];
-		}
-		else
-		{
-			if (oldxref[num].obj)
-				pdf_drop_obj(oldxref[num].obj);
-		}
-	}
-
-	fz_free(xref->ctx, oldxref);
-
-	/* Update the used objects count in compacted xref */
-	xref->len = newlen + 1;
-
-	/* Update list of used objects to fit with compacted xref */
-	for (num = 1; num < xref->len; num++)
-		uselist[num] = 1;
-}
-
-/*
  * Recreate page tree to only retain specified pages.
  */
 
@@ -415,330 +154,6 @@ static void retainpages(int argc, char **argv)
 	}
 }
 
-/*
- * Make sure we have loaded objects from object streams.
- */
-
-static void preloadobjstms(void)
-{
-	pdf_obj *obj;
-	int num;
-
-	for (num = 0; num < xref->len; num++)
-	{
-		if (xref->table[num].type == 'o')
-		{
-			obj = pdf_load_object(xref, num, 0);
-			pdf_drop_obj(obj);
-		}
-	}
-}
-
-/*
- * Save streams and objects to the output
- */
-
-static inline int isbinary(int c)
-{
-	if (c == '\n' || c == '\r' || c == '\t')
-		return 0;
-	return c < 32 || c > 127;
-}
-
-static int isbinarystream(fz_buffer *buf)
-{
-	int i;
-	for (i = 0; i < buf->len; i++)
-		if (isbinary(buf->data[i]))
-			return 1;
-	return 0;
-}
-
-static fz_buffer *hexbuf(unsigned char *p, int n)
-{
-	static const char hex[16] = "0123456789abcdef";
-	fz_buffer *buf;
-	int x = 0;
-
-	buf = fz_new_buffer(ctx, n * 2 + (n / 32) + 2);
-
-	while (n--)
-	{
-		buf->data[buf->len++] = hex[*p >> 4];
-		buf->data[buf->len++] = hex[*p & 15];
-		if (++x == 32)
-		{
-			buf->data[buf->len++] = '\n';
-			x = 0;
-		}
-		p++;
-	}
-
-	buf->data[buf->len++] = '>';
-	buf->data[buf->len++] = '\n';
-
-	return buf;
-}
-
-static void addhexfilter(pdf_obj *dict)
-{
-	pdf_obj *f, *dp, *newf, *newdp;
-	pdf_obj *ahx, *nullobj;
-
-	ahx = fz_new_name(ctx, "ASCIIHexDecode");
-	nullobj = pdf_new_null(ctx);
-	newf = newdp = NULL;
-
-	f = pdf_dict_gets(dict, "Filter");
-	dp = pdf_dict_gets(dict, "DecodeParms");
-
-	if (pdf_is_name(f))
-	{
-		newf = pdf_new_array(ctx, 2);
-		pdf_array_push(newf, ahx);
-		pdf_array_push(newf, f);
-		f = newf;
-		if (pdf_is_dict(dp))
-		{
-			newdp = pdf_new_array(ctx, 2);
-			pdf_array_push(newdp, nullobj);
-			pdf_array_push(newdp, dp);
-			dp = newdp;
-		}
-	}
-	else if (pdf_is_array(f))
-	{
-		pdf_array_insert(f, ahx);
-		if (pdf_is_array(dp))
-			pdf_array_insert(dp, nullobj);
-	}
-	else
-		f = ahx;
-
-	pdf_dict_puts(dict, "Filter", f);
-	if (dp)
-		pdf_dict_puts(dict, "DecodeParms", dp);
-
-	pdf_drop_obj(ahx);
-	pdf_drop_obj(nullobj);
-	if (newf)
-		pdf_drop_obj(newf);
-	if (newdp)
-		pdf_drop_obj(newdp);
-}
-
-static void copystream(pdf_obj *obj, int num, int gen)
-{
-	fz_buffer *buf, *tmp;
-	pdf_obj *newlen;
-
-	buf = pdf_load_raw_stream(xref, num, gen);
-
-	if (doascii && isbinarystream(buf))
-	{
-		tmp = hexbuf(buf->data, buf->len);
-		fz_drop_buffer(ctx, buf);
-		buf = tmp;
-
-		addhexfilter(obj);
-
-		newlen = pdf_new_int(ctx, buf->len);
-		pdf_dict_puts(obj, "Length", newlen);
-		pdf_drop_obj(newlen);
-	}
-
-	fprintf(out, "%d %d obj\n", num, gen);
-	pdf_fprint_obj(out, obj, doexpand == 0);
-	fprintf(out, "stream\n");
-	fwrite(buf->data, 1, buf->len, out);
-	fprintf(out, "endstream\nendobj\n\n");
-
-	fz_drop_buffer(ctx, buf);
-}
-
-static void expandstream(pdf_obj *obj, int num, int gen)
-{
-	fz_buffer *buf, *tmp;
-	pdf_obj *newlen;
-
-	buf = pdf_load_stream(xref, num, gen);
-
-	pdf_dict_dels(obj, "Filter");
-	pdf_dict_dels(obj, "DecodeParms");
-
-	if (doascii && isbinarystream(buf))
-	{
-		tmp = hexbuf(buf->data, buf->len);
-		fz_drop_buffer(ctx, buf);
-		buf = tmp;
-
-		addhexfilter(obj);
-	}
-
-	newlen = pdf_new_int(ctx, buf->len);
-	pdf_dict_puts(obj, "Length", newlen);
-	pdf_drop_obj(newlen);
-
-	fprintf(out, "%d %d obj\n", num, gen);
-	pdf_fprint_obj(out, obj, doexpand == 0);
-	fprintf(out, "stream\n");
-	fwrite(buf->data, 1, buf->len, out);
-	fprintf(out, "endstream\nendobj\n\n");
-
-	fz_drop_buffer(ctx, buf);
-}
-
-static void writeobject(int num, int gen)
-{
-	pdf_obj *obj;
-	pdf_obj *type;
-
-	obj = pdf_load_object(xref, num, gen);
-
-	/* skip ObjStm and XRef objects */
-	if (pdf_is_dict(obj))
-	{
-		type = pdf_dict_gets(obj, "Type");
-		if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "ObjStm"))
-		{
-			uselist[num] = 0;
-			pdf_drop_obj(obj);
-			return;
-		}
-		if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "XRef"))
-		{
-			uselist[num] = 0;
-			pdf_drop_obj(obj);
-			return;
-		}
-	}
-
-	if (!pdf_is_stream(xref, num, gen))
-	{
-		fprintf(out, "%d %d obj\n", num, gen);
-		pdf_fprint_obj(out, obj, doexpand == 0);
-		fprintf(out, "endobj\n\n");
-	}
-	else
-	{
-		int dontexpand = 0;
-		if (doexpand != 0 && doexpand != expand_all)
-		{
-			pdf_obj *o;
-
-			if ((o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "XObject")) &&
-				(o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Image")))
-				dontexpand = !(doexpand & expand_images);
-			if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "Font"))
-				dontexpand = !(doexpand & expand_fonts);
-			if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "FontDescriptor"))
-				dontexpand = !(doexpand & expand_fonts);
-			if ((o = pdf_dict_gets(obj, "Length1")) != NULL)
-				dontexpand = !(doexpand & expand_fonts);
-			if ((o = pdf_dict_gets(obj, "Length2")) != NULL)
-				dontexpand = !(doexpand & expand_fonts);
-			if ((o = pdf_dict_gets(obj, "Length3")) != NULL)
-				dontexpand = !(doexpand & expand_fonts);
-			if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Type1C"))
-				dontexpand = !(doexpand & expand_fonts);
-			if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "CIDFontType0C"))
-				dontexpand = !(doexpand & expand_fonts);
-		}
-		if (doexpand && !dontexpand && !pdf_is_jpx_image(ctx, obj))
-			expandstream(obj, num, gen);
-		else
-			copystream(obj, num, gen);
-	}
-
-	pdf_drop_obj(obj);
-}
-
-static void writexref(void)
-{
-	pdf_obj *trailer;
-	pdf_obj *obj;
-	int startxref;
-	int num;
-
-	startxref = ftell(out);
-
-	fprintf(out, "xref\n0 %d\n", xref->len);
-	for (num = 0; num < xref->len; num++)
-	{
-		if (uselist[num])
-			fprintf(out, "%010d %05d n \n", ofslist[num], genlist[num]);
-		else
-			fprintf(out, "%010d %05d f \n", ofslist[num], genlist[num]);
-	}
-	fprintf(out, "\n");
-
-	trailer = pdf_new_dict(ctx, 5);
-
-	obj = pdf_new_int(ctx, xref->len);
-	pdf_dict_puts(trailer, "Size", obj);
-	pdf_drop_obj(obj);
-
-	obj = pdf_dict_gets(xref->trailer, "Info");
-	if (obj)
-		pdf_dict_puts(trailer, "Info", obj);
-
-	obj = pdf_dict_gets(xref->trailer, "Root");
-	if (obj)
-		pdf_dict_puts(trailer, "Root", obj);
-
-	obj = pdf_dict_gets(xref->trailer, "ID");
-	if (obj)
-		pdf_dict_puts(trailer, "ID", obj);
-
-	fprintf(out, "trailer\n");
-	pdf_fprint_obj(out, trailer, doexpand == 0);
-	fprintf(out, "\n");
-
-	pdf_drop_obj(trailer);
-
-	fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref);
-}
-
-static void writepdf(void)
-{
-	int lastfree;
-	int num;
-
-	for (num = 0; num < xref->len; num++)
-	{
-		if (xref->table[num].type == 'f')
-			genlist[num] = xref->table[num].gen;
-		if (xref->table[num].type == 'n')
-			genlist[num] = xref->table[num].gen;
-		if (xref->table[num].type == 'o')
-			genlist[num] = 0;
-
-		if (dogarbage && !uselist[num])
-			continue;
-
-		if (xref->table[num].type == 'n' || xref->table[num].type == 'o')
-		{
-			uselist[num] = 1;
-			ofslist[num] = ftell(out);
-			writeobject(num, genlist[num]);
-		}
-	}
-
-	/* Construct linked list of free object slots */
-	lastfree = 0;
-	for (num = 0; num < xref->len; num++)
-	{
-		if (!uselist[num])
-		{
-			genlist[num]++;
-			ofslist[lastfree] = num;
-			lastfree = num;
-		}
-	}
-
-	writexref();
-}
-
 #ifdef MUPDF_COMBINED_EXE
 int pdfclean_main(int argc, char **argv)
 #else
@@ -748,19 +163,24 @@ int main(int argc, char **argv)
 	char *infile;
 	char *outfile = "out.pdf";
 	char *password = "";
-	int c, num;
+	int c;
 	int subset;
+	fz_write_options opts;
+
+	opts.dogarbage = 0;
+	opts.doexpand = 0;
+	opts.doascii = 0;
 
 	while ((c = fz_getopt(argc, argv, "adfgip:")) != -1)
 	{
 		switch (c)
 		{
 		case 'p': password = fz_optarg; break;
-		case 'g': dogarbage ++; break;
-		case 'd': doexpand ^= expand_all; break;
-		case 'f': doexpand ^= expand_fonts; break;
-		case 'i': doexpand ^= expand_images; break;
-		case 'a': doascii ++; break;
+		case 'g': opts.dogarbage ++; break;
+		case 'd': opts.doexpand ^= fz_expand_all; break;
+		case 'f': opts.doexpand ^= fz_expand_fonts; break;
+		case 'i': opts.doexpand ^= fz_expand_images; break;
+		case 'a': opts.doascii ++; break;
 		default: usage(); break;
 		}
 	}
@@ -792,61 +212,11 @@ int main(int argc, char **argv)
 		if (!pdf_authenticate_password(xref, password))
 			fz_throw(ctx, "cannot authenticate password: %s", infile);
 
-	out = fopen(outfile, "wb");
-	if (!out)
-		fz_throw(ctx, "cannot open output file '%s'", outfile);
-
-	fprintf(out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10);
-	fprintf(out, "%%\316\274\341\277\246\n\n");
-
-	uselist = fz_malloc_array(ctx, xref->len + 1, sizeof(char));
-	ofslist = fz_malloc_array(ctx, xref->len + 1, sizeof(int));
-	genlist = fz_malloc_array(ctx, xref->len + 1, sizeof(int));
-	renumbermap = fz_malloc_array(ctx, xref->len + 1, sizeof(int));
-
-	for (num = 0; num < xref->len; num++)
-	{
-		uselist[num] = 0;
-		ofslist[num] = 0;
-		genlist[num] = 0;
-		renumbermap[num] = num;
-	}
-
-	/* Make sure any objects hidden in compressed streams have been loaded */
-	preloadobjstms();
-
 	/* Only retain the specified subset of the pages */
 	if (subset)
 		retainpages(argc, argv);
 
-	/* Sweep & mark objects from the trailer */
-	if (dogarbage >= 1)
-		sweepobj(xref->trailer);
-
-	/* Coalesce and renumber duplicate objects */
-	if (dogarbage >= 3)
-		removeduplicateobjs();
-
-	/* Compact xref by renumbering and removing unused objects */
-	if (dogarbage >= 2)
-		compactxref();
-
-	/* Make renumbering affect all indirect references and update xref */
-	/* Do not renumber objects if encryption is in use, as the object
-	 * numbers are baked into the streams/strings, and we can't currently
-	 * cope with moving them. See bug 692627. */
-	if (dogarbage >= 2 && !xref->crypt)
-		renumberobjs();
-
-	writepdf();
-
-	if (fclose(out))
-		fz_throw(ctx, "cannot close output file '%s'", outfile);
-
-	fz_free(xref->ctx, uselist);
-	fz_free(xref->ctx, ofslist);
-	fz_free(xref->ctx, genlist);
-	fz_free(xref->ctx, renumbermap);
+	pdf_write(xref, outfile, &opts);
 
 	pdf_close_document(xref);
 	fz_free_context(ctx);
diff --git a/fitz/doc_document.c b/fitz/doc_document.c
index 22229892..83dbe53f 100644
--- a/fitz/doc_document.c
+++ b/fitz/doc_document.c
@@ -129,3 +129,10 @@ fz_meta(fz_document *doc, int key, void *ptr, int size)
 		return doc->meta(doc, key, ptr, size);
 	return FZ_META_UNKNOWN_KEY;
 }
+
+void
+fz_write(fz_document *doc, char *filename, fz_write_options *opts)
+{
+	if (doc && doc->write) /* silently a no-op when the format has no writer */
+		doc->write(doc, filename, opts);
+}
diff --git a/fitz/fitz-internal.h b/fitz/fitz-internal.h
index bd19886c..1e54c900 100644
--- a/fitz/fitz-internal.h
+++ b/fitz/fitz-internal.h
@@ -1080,6 +1080,7 @@ struct fz_document_s
 	void (*run_page)(fz_document *doc, fz_page *page, fz_device *dev, fz_matrix transform, fz_cookie *cookie);
 	void (*free_page)(fz_document *doc, fz_page *page);
 	int (*meta)(fz_document *doc, int key, void *ptr, int size);
+	void (*write)(fz_document *doc, char *filename, fz_write_options *opts);
 };
 
 #endif
diff --git a/fitz/fitz.h b/fitz/fitz.h
index 04215c71..61f80756 100644
--- a/fitz/fitz.h
+++ b/fitz/fitz.h
@@ -2231,4 +2231,50 @@ enum
 	FZ_META_INFO = 4,
 };
 
+typedef struct fz_write_options_s fz_write_options;
+
+/*
+	In calls to fz_write, the following options structure can be used
+	to control aspects of the writing process. This structure may grow
+	in future, and should be zero-filled to allow forwards compatibility.
+*/
+struct fz_write_options_s
+{
+	int doascii;    /*	If non-zero then attempt (where possible) to
+				make the output ascii. */
+	int doexpand;	/*	Bitflags; each non zero bit indicates an aspect
+				of the file that should be 'expanded' on
+				writing. */
+	int dogarbage;	/*	If non-zero then attempt (where possible) to
+				garbage collect the file before writing. */
+};
+
+/*	An enumeration of bitflags to use in the above 'doexpand' field of
+	fz_write_options.
+*/
+enum
+{
+	fz_expand_images = 1,
+	fz_expand_fonts = 2,
+	fz_expand_all = -1
+};
+
+/*
+	fz_write: Write a document out.
+
+	(In development - Subject to change in future versions)
+
+	Save a copy of the current document in its original format.
+	Internally the document may change.
+
+	doc: The document to save.
+
+	filename: The filename to save to.
+
+	opts: NULL, or a pointer to an options structure.
+
+	May throw exceptions.
+*/
+void fz_write(fz_document *doc, char *filename, fz_write_options *opts);
+
 #endif
diff --git a/pdf/mupdf.h b/pdf/mupdf.h
index b88f7423..72c5c4cf 100644
--- a/pdf/mupdf.h
+++ b/pdf/mupdf.h
@@ -144,6 +144,8 @@ pdf_document *pdf_open_document_with_stream(fz_stream *file);
 */
 void pdf_close_document(pdf_document *doc);
 
+void pdf_write(pdf_document *doc, char *filename, fz_write_options *opts);
+
 int pdf_needs_password(pdf_document *doc);
 int pdf_authenticate_password(pdf_document *doc, char *pw);
 
diff --git a/pdf/pdf_write.c b/pdf/pdf_write.c
new file mode 100644
index 00000000..e14cfa28
--- /dev/null
+++ b/pdf/pdf_write.c
@@ -0,0 +1,662 @@
+#include "fitz.h"
+#include "mupdf-internal.h"
+
+typedef struct pdf_write_options_s pdf_write_options;
+
+struct pdf_write_options_s
+{
+	FILE *out;
+	int doascii;
+	int doexpand;
+	int dogarbage;
+	char *uselist;
+	int *ofslist;
+	int *genlist;
+	int *renumbermap;
+};
+
+/*
+ * Garbage collect objects not reachable from the trailer.
+ */
+
+/* Mark obj's target as used; returns it resolved, or NULL if out of range or already marked. */
+static pdf_obj *sweepref(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj)
+{
+	int num = pdf_to_num(obj);
+	int gen = pdf_to_gen(obj);
+	fz_context *ctx = xref->ctx;
+
+	if (num < 0 || num >= xref->len || opts->uselist[num])
+		return NULL;
+	opts->uselist[num] = 1;
+
+	/* Bake in /Length in stream objects */
+	fz_try(ctx)
+	{
+		if (pdf_is_stream(xref, num, gen))
+		{
+			pdf_obj *len = pdf_dict_gets(obj, "Length");
+			if (pdf_is_indirect(len))
+			{
+				int lnum = pdf_to_num(len); /* file-supplied, so range-check it */
+				if (lnum >= 0 && lnum < xref->len)
+					opts->uselist[lnum] = 0;
+				len = pdf_resolve_indirect(len);
+				pdf_dict_puts(obj, "Length", len);
+			}
+		}
+	}
+	fz_catch(ctx)
+	{
+		/* Leave broken */
+	}
+
+	return pdf_resolve_indirect(obj);
+}
+
+static void sweepobj(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj)
+{
+	int i;
+
+	if (pdf_is_indirect(obj))
+		obj = sweepref(xref, opts, obj);
+
+	if (pdf_is_dict(obj))
+	{
+		int n = pdf_dict_len(obj);
+		for (i = 0; i < n; i++)
+			sweepobj(xref, opts, pdf_dict_get_val(obj, i));
+	}
+
+	else if (pdf_is_array(obj))
+	{
+		int n = pdf_array_len(obj);
+		for (i = 0; i < n; i++)
+			sweepobj(xref, opts, pdf_array_get(obj, i));
+	}
+}
+
+/*
+ * Scan for and remove duplicate objects (slow)
+ */
+
+static void removeduplicateobjs(pdf_document *xref, pdf_write_options *opts)
+{
+	int num, other;
+	fz_context *ctx = xref->ctx;
+
+	for (num = 1; num < xref->len; num++)
+	{
+		/* Only compare an object to objects preceding it */
+		for (other = 1; other < num; other++)
+		{
+			pdf_obj *a, *b;
+			int match;
+
+			if (num == other || !opts->uselist[num] || !opts->uselist[other])
+				continue;
+
+			/*
+			 * Comparing stream objects data contents would take too long.
+			 *
+			 * pdf_is_stream calls pdf_cache_object and ensures
+			 * that the xref table has the objects loaded.
+			 */
+			fz_try(ctx)
+			{
+				match = (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0));
+			}
+			fz_catch(ctx)
+			{
+				/* Assume different */
+				match = 0;
+			}
+			if (match)
+				continue;
+
+			a = xref->table[num].obj;
+			b = xref->table[other].obj;
+
+			a = pdf_resolve_indirect(a);
+			b = pdf_resolve_indirect(b);
+
+			if (pdf_objcmp(a, b))
+				continue;
+
+			/* Keep the lowest numbered object */
+			opts->renumbermap[num] = MIN(num, other);
+			opts->renumbermap[other] = MIN(num, other);
+			opts->uselist[MAX(num, other)] = 0;
+
+			/* One duplicate was found, do not look for another */
+			break;
+		}
+	}
+}
+
+/*
+ * Renumber objects sequentially so the xref is more compact
+ */
+
+static void compactxref(pdf_document *xref, pdf_write_options *opts)
+{
+	int num, newnum;
+
+	/*
+	 * Update renumbermap in-place, clustering all used
+	 * objects together at low object ids. Objects that
+	 * already should be renumbered will have their new
+	 * object ids be updated to reflect the compaction.
+	 */
+
+	newnum = 1;
+	for (num = 1; num < xref->len; num++)
+	{
+		if (opts->uselist[num] && opts->renumbermap[num] == num)
+			opts->renumbermap[num] = newnum++;
+		else if (opts->renumbermap[num] != num)
+			opts->renumbermap[num] = opts->renumbermap[opts->renumbermap[num]];
+	}
+}
+
+/*
+ * Update indirect objects according to renumbering established when
+ * removing duplicate objects and compacting the xref.
+ */
+
+static void renumberobj(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj)
+{
+	int i;
+	fz_context *ctx = xref->ctx;
+
+	if (pdf_is_dict(obj))
+	{
+		int n = pdf_dict_len(obj);
+		for (i = 0; i < n; i++)
+		{
+			pdf_obj *key = pdf_dict_get_key(obj, i);
+			pdf_obj *val = pdf_dict_get_val(obj, i);
+			if (pdf_is_indirect(val))
+			{
+				val = pdf_new_indirect(ctx, opts->renumbermap[pdf_to_num(val)], 0, xref);
+				fz_dict_put(obj, key, val);
+				pdf_drop_obj(val);
+			}
+			else
+			{
+				renumberobj(xref, opts, val);
+			}
+		}
+	}
+
+	else if (pdf_is_array(obj))
+	{
+		int n = pdf_array_len(obj);
+		for (i = 0; i < n; i++)
+		{
+			pdf_obj *val = pdf_array_get(obj, i);
+			if (pdf_is_indirect(val))
+			{
+				val = pdf_new_indirect(ctx, opts->renumbermap[pdf_to_num(val)], 0, xref);
+				pdf_array_put(obj, i, val);
+				pdf_drop_obj(val);
+			}
+			else
+			{
+				renumberobj(xref, opts, val);
+			}
+		}
+	}
+}
+
+static void renumberobjs(pdf_document *xref, pdf_write_options *opts)
+{
+	pdf_xref_entry *oldxref;
+	int newlen;
+	int num;
+	fz_context *ctx = xref->ctx;
+
+	/* Apply renumber map to indirect references in all objects in xref */
+	renumberobj(xref, opts, xref->trailer);
+	for (num = 0; num < xref->len; num++)
+	{
+		pdf_obj *obj = xref->table[num].obj;
+
+		if (pdf_is_indirect(obj))
+		{
+			obj = pdf_new_indirect(ctx, opts->renumbermap[pdf_to_num(obj)], 0, xref);
+			pdf_update_object(xref, num, 0, obj);
+			pdf_drop_obj(obj);
+		}
+		else
+		{
+			renumberobj(xref, opts, obj);
+		}
+	}
+
+	/* Create new table for the reordered, compacted xref */
+	oldxref = xref->table;
+	xref->table = fz_malloc_array(xref->ctx, xref->len, sizeof(pdf_xref_entry));
+	xref->table[0] = oldxref[0];
+
+	/* Move used objects into the new compacted xref */
+	newlen = 0;
+	for (num = 1; num < xref->len; num++)
+	{
+		if (opts->uselist[num])
+		{
+			if (newlen < opts->renumbermap[num])
+				newlen = opts->renumbermap[num];
+			xref->table[opts->renumbermap[num]] = oldxref[num];
+		}
+		else
+		{
+			if (oldxref[num].obj)
+				pdf_drop_obj(oldxref[num].obj);
+		}
+	}
+
+	fz_free(xref->ctx, oldxref);
+
+	/* Update the used objects count in compacted xref */
+	xref->len = newlen + 1;
+
+	/* Update list of used objects to fit with compacted xref */
+	for (num = 1; num < xref->len; num++)
+		opts->uselist[num] = 1;
+}
+
+/*
+ * Make sure we have loaded objects from object streams.
+ */
+
+static void preloadobjstms(pdf_document *xref)
+{
+	pdf_obj *obj;
+	int num;
+
+	for (num = 0; num < xref->len; num++)
+	{
+		if (xref->table[num].type == 'o')
+		{
+			obj = pdf_load_object(xref, num, 0);
+			pdf_drop_obj(obj);
+		}
+	}
+}
+
+/*
+ * Save streams and objects to the output
+ */
+
+static inline int isbinary(int c)
+{
+	if (c == '\n' || c == '\r' || c == '\t')
+		return 0;
+	return c < 32 || c > 127;
+}
+
+static int isbinarystream(fz_buffer *buf)
+{
+	int i;
+	for (i = 0; i < buf->len; i++)
+		if (isbinary(buf->data[i]))
+			return 1;
+	return 0;
+}
+
+static fz_buffer *hexbuf(fz_context *ctx, unsigned char *p, int n)
+{
+	static const char hex[16] = "0123456789abcdef";
+	fz_buffer *buf;
+	int x = 0;
+
+	buf = fz_new_buffer(ctx, n * 2 + (n / 32) + 2);
+
+	while (n--)
+	{
+		buf->data[buf->len++] = hex[*p >> 4];
+		buf->data[buf->len++] = hex[*p & 15];
+		if (++x == 32)
+		{
+			buf->data[buf->len++] = '\n';
+			x = 0;
+		}
+		p++;
+	}
+
+	buf->data[buf->len++] = '>';
+	buf->data[buf->len++] = '\n';
+
+	return buf;
+}
+
+static void addhexfilter(pdf_document *xref, pdf_obj *dict)
+{
+	pdf_obj *f, *dp, *newf, *newdp;
+	pdf_obj *ahx, *nullobj;
+	fz_context *ctx = xref->ctx;
+
+	ahx = fz_new_name(ctx, "ASCIIHexDecode");
+	nullobj = pdf_new_null(ctx);
+	newf = newdp = NULL;
+
+	f = pdf_dict_gets(dict, "Filter");
+	dp = pdf_dict_gets(dict, "DecodeParms");
+
+	if (pdf_is_name(f))
+	{
+		newf = pdf_new_array(ctx, 2);
+		pdf_array_push(newf, ahx);
+		pdf_array_push(newf, f);
+		f = newf;
+		if (pdf_is_dict(dp))
+		{
+			newdp = pdf_new_array(ctx, 2);
+			pdf_array_push(newdp, nullobj);
+			pdf_array_push(newdp, dp);
+			dp = newdp;
+		}
+	}
+	else if (pdf_is_array(f))
+	{
+		pdf_array_insert(f, ahx);
+		if (pdf_is_array(dp))
+			pdf_array_insert(dp, nullobj);
+	}
+	else
+		f = ahx;
+
+	pdf_dict_puts(dict, "Filter", f);
+	if (dp)
+		pdf_dict_puts(dict, "DecodeParms", dp);
+
+	pdf_drop_obj(ahx);
+	pdf_drop_obj(nullobj);
+	if (newf)
+		pdf_drop_obj(newf);
+	if (newdp)
+		pdf_drop_obj(newdp);
+}
+
+static void copystream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj, int num, int gen)
+{
+	fz_buffer *buf, *tmp;
+	pdf_obj *newlen;
+	fz_context *ctx = xref->ctx;
+
+	buf = pdf_load_raw_stream(xref, num, gen);
+
+	if (opts->doascii && isbinarystream(buf))
+	{
+		tmp = hexbuf(ctx, buf->data, buf->len);
+		fz_drop_buffer(ctx, buf);
+		buf = tmp;
+
+		addhexfilter(xref, obj);
+
+		newlen = pdf_new_int(ctx, buf->len);
+		pdf_dict_puts(obj, "Length", newlen);
+		pdf_drop_obj(newlen);
+	}
+
+	fprintf(opts->out, "%d %d obj\n", num, gen);
+	pdf_fprint_obj(opts->out, obj, opts->doexpand == 0);
+	fprintf(opts->out, "stream\n");
+	fwrite(buf->data, 1, buf->len, opts->out);
+	fprintf(opts->out, "endstream\nendobj\n\n");
+
+	fz_drop_buffer(ctx, buf);
+}
+
+static void expandstream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj, int num, int gen)
+{
+	fz_buffer *buf, *tmp;
+	pdf_obj *newlen;
+	fz_context *ctx = xref->ctx;
+
+	buf = pdf_load_stream(xref, num, gen);
+
+	pdf_dict_dels(obj, "Filter");
+	pdf_dict_dels(obj, "DecodeParms");
+
+	if (opts->doascii && isbinarystream(buf))
+	{
+		tmp = hexbuf(ctx, buf->data, buf->len);
+		fz_drop_buffer(ctx, buf);
+		buf = tmp;
+
+		addhexfilter(xref, obj);
+	}
+
+	newlen = pdf_new_int(ctx, buf->len);
+	pdf_dict_puts(obj, "Length", newlen);
+	pdf_drop_obj(newlen);
+
+	fprintf(opts->out, "%d %d obj\n", num, gen);
+	pdf_fprint_obj(opts->out, obj, opts->doexpand == 0);
+	fprintf(opts->out, "stream\n");
+	fwrite(buf->data, 1, buf->len, opts->out);
+	fprintf(opts->out, "endstream\nendobj\n\n");
+
+	fz_drop_buffer(ctx, buf);
+}
+
+static void writeobject(pdf_document *xref, pdf_write_options *opts, int num, int gen)
+{
+	pdf_obj *obj;
+	pdf_obj *type;
+	fz_context *ctx = xref->ctx;
+
+	obj = pdf_load_object(xref, num, gen);
+
+	/* skip ObjStm and XRef objects */
+	if (pdf_is_dict(obj))
+	{
+		type = pdf_dict_gets(obj, "Type");
+		if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "ObjStm"))
+		{
+			opts->uselist[num] = 0;
+			pdf_drop_obj(obj);
+			return;
+		}
+		if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "XRef"))
+		{
+			opts->uselist[num] = 0;
+			pdf_drop_obj(obj);
+			return;
+		}
+	}
+
+	if (!pdf_is_stream(xref, num, gen))
+	{
+		fprintf(opts->out, "%d %d obj\n", num, gen);
+		pdf_fprint_obj(opts->out, obj, opts->doexpand == 0);
+		fprintf(opts->out, "endobj\n\n");
+	}
+	else
+	{
+		int dontexpand = 0;
+		if (opts->doexpand != 0 && opts->doexpand != fz_expand_all)
+		{
+			pdf_obj *o;
+
+			if ((o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "XObject")) &&
+				(o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Image")))
+				dontexpand = !(opts->doexpand & fz_expand_images);
+			if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "Font"))
+				dontexpand = !(opts->doexpand & fz_expand_fonts);
+			if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "FontDescriptor"))
+				dontexpand = !(opts->doexpand & fz_expand_fonts);
+			if ((o = pdf_dict_gets(obj, "Length1")) != NULL)
+				dontexpand = !(opts->doexpand & fz_expand_fonts);
+			if ((o = pdf_dict_gets(obj, "Length2")) != NULL)
+				dontexpand = !(opts->doexpand & fz_expand_fonts);
+			if ((o = pdf_dict_gets(obj, "Length3")) != NULL)
+				dontexpand = !(opts->doexpand & fz_expand_fonts);
+			if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Type1C"))
+				dontexpand = !(opts->doexpand & fz_expand_fonts);
+			if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "CIDFontType0C"))
+				dontexpand = !(opts->doexpand & fz_expand_fonts);
+		}
+		if (opts->doexpand && !dontexpand && !pdf_is_jpx_image(ctx, obj))
+			expandstream(xref, opts, obj, num, gen);
+		else
+			copystream(xref, opts, obj, num, gen);
+	}
+
+	pdf_drop_obj(obj);
+}
+
+static void writexref(pdf_document *xref, pdf_write_options *opts)
+{
+	pdf_obj *trailer;
+	pdf_obj *obj;
+	int startxref;
+	int num;
+	fz_context *ctx = xref->ctx;
+
+	startxref = ftell(opts->out);
+
+	fprintf(opts->out, "xref\n0 %d\n", xref->len);
+	for (num = 0; num < xref->len; num++)
+	{
+		if (opts->uselist[num])
+			fprintf(opts->out, "%010d %05d n \n", opts->ofslist[num], opts->genlist[num]);
+		else
+			fprintf(opts->out, "%010d %05d f \n", opts->ofslist[num], opts->genlist[num]);
+	}
+	fprintf(opts->out, "\n");
+
+	trailer = pdf_new_dict(ctx, 5);
+
+	obj = pdf_new_int(ctx, xref->len);
+	pdf_dict_puts(trailer, "Size", obj);
+	pdf_drop_obj(obj);
+
+	obj = pdf_dict_gets(xref->trailer, "Info");
+	if (obj)
+		pdf_dict_puts(trailer, "Info", obj);
+
+	obj = pdf_dict_gets(xref->trailer, "Root");
+	if (obj)
+		pdf_dict_puts(trailer, "Root", obj);
+
+	obj = pdf_dict_gets(xref->trailer, "ID");
+	if (obj)
+		pdf_dict_puts(trailer, "ID", obj);
+
+	fprintf(opts->out, "trailer\n");
+	pdf_fprint_obj(opts->out, trailer, opts->doexpand == 0);
+	fprintf(opts->out, "\n");
+
+	pdf_drop_obj(trailer);
+
+	fprintf(opts->out, "startxref\n%d\n%%%%EOF\n", startxref);
+}
+
+/* Write the document to 'filename', honouring the garbage collection,
+ * expansion and ascii settings in fz_opts (NULL selects all defaults). The
+ * document is modified in place; errors raised while writing are rethrown. */
+void pdf_write(pdf_document *xref, char *filename, fz_write_options *fz_opts)
+{
+	int lastfree, num;
+	pdf_write_options opts = { 0 };
+	fz_context *ctx;
+
+	if (!xref) /* fz_opts may legitimately be NULL; see defaults below */
+		return;
+
+	ctx = xref->ctx;
+
+	opts.out = fopen(filename, "wb");
+	if (!opts.out)
+		fz_throw(ctx, "cannot open output file '%s'", filename);
+
+	fz_try(ctx)
+	{
+		opts.doexpand = fz_opts ? fz_opts->doexpand : 0;
+		opts.dogarbage = fz_opts ? fz_opts->dogarbage : 0;
+		opts.doascii = fz_opts ? fz_opts->doascii: 0;
+		opts.uselist = fz_malloc_array(ctx, xref->len + 1, sizeof(char));
+		opts.ofslist = fz_malloc_array(ctx, xref->len + 1, sizeof(int));
+		opts.genlist = fz_malloc_array(ctx, xref->len + 1, sizeof(int));
+		opts.renumbermap = fz_malloc_array(ctx, xref->len + 1, sizeof(int));
+
+		fprintf(opts.out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10);
+		fprintf(opts.out, "%%\316\274\341\277\246\n\n");
+
+		for (num = 0; num < xref->len; num++)
+		{
+			opts.uselist[num] = 0;
+			opts.ofslist[num] = 0;
+			opts.genlist[num] = 0;
+			opts.renumbermap[num] = num;
+		}
+
+		/* Make sure any objects hidden in compressed streams have been loaded */
+		preloadobjstms(xref);
+
+		/* Sweep & mark objects from the trailer */
+		if (opts.dogarbage >= 1)
+			sweepobj(xref, &opts, xref->trailer);
+
+		/* Coalesce and renumber duplicate objects */
+		if (opts.dogarbage >= 3)
+			removeduplicateobjs(xref, &opts);
+
+		/* Compact xref by renumbering and removing unused objects */
+		if (opts.dogarbage >= 2)
+			compactxref(xref, &opts);
+
+		/* Make renumbering affect all indirect references and update xref.
+		 * Skip this if encryption is in use: object numbers are baked into
+		 * the streams/strings and cannot currently be moved. See bug 692627. */
+		if (opts.dogarbage >= 2 && !xref->crypt)
+			renumberobjs(xref, &opts);
+
+		for (num = 0; num < xref->len; num++)
+		{
+			if (xref->table[num].type == 'f' || xref->table[num].type == 'n')
+				opts.genlist[num] = xref->table[num].gen;
+			if (xref->table[num].type == 'o')
+				opts.genlist[num] = 0;
+
+			if (opts.dogarbage && !opts.uselist[num])
+				continue;
+
+			if (xref->table[num].type == 'n' || xref->table[num].type == 'o')
+			{
+				opts.uselist[num] = 1;
+				opts.ofslist[num] = ftell(opts.out);
+				writeobject(xref, &opts, num, opts.genlist[num]);
+			}
+		}
+
+		/* Construct linked list of free object slots */
+		lastfree = 0;
+		for (num = 0; num < xref->len; num++)
+		{
+			if (!opts.uselist[num])
+			{
+				opts.genlist[num]++;
+				opts.ofslist[lastfree] = num;
+				lastfree = num;
+			}
+		}
+
+		writexref(xref, &opts);
+	}
+	fz_always(ctx)
+	{
+		fz_free(ctx, opts.uselist);
+		fz_free(ctx, opts.ofslist);
+		fz_free(ctx, opts.genlist);
+		fz_free(ctx, opts.renumbermap);
+		fclose(opts.out); /* NOTE(review): close/flush failures are not reported */
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
diff --git a/win32/libmupdf.vcproj b/win32/libmupdf.vcproj
index b984cc8f..98b1aa5e 100644
--- a/win32/libmupdf.vcproj
+++ b/win32/libmupdf.vcproj
@@ -334,6 +334,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\pdf\pdf_write.c"
+				>
+			</File>
+			<File
 				RelativePath="..\pdf\pdf_xobject.c"
 				>
 			</File>
author	Robin Watts <robin.watts@artifex.com>	2012-04-25 20:40:29 +0100
committer	Robin Watts <robin.watts@artifex.com>	2012-04-28 09:44:49 +0100
commit	4e3d351910f0927283fdc6911574ffe982ca678b (patch)
tree	854e8367ae8906f957c3bb766ae578188e38cd61
parent	463e9b6c69cf9232da5ae9659eaad6e7050b594d (diff)
download	mupdf-4e3d351910f0927283fdc6911574ffe982ca678b.tar.xz