Add option to ascii hex encode binary streams in pdfclean.

author: Tor Andersson <tor@ghostscript.com> 2010-12-04 21:17:43 +0000
committer: Tor Andersson <tor@ghostscript.com> 2010-12-04 21:17:43 +0000
commit: cf4e4a89c9614198fc0e38dc2f740ca1c05ab867 (patch)
tree: 81461dcf04b38f06b797ab0f612071e4139bcfa7
parent: c8c6acfa08a280e6eabd89eb7a2fedd95b4d2f48 (diff)
download: mupdf-cf4e4a89c9614198fc0e38dc2f740ca1c05ab867.tar.xz
5 files changed, 155 insertions, 16 deletions
diff --git a/apps/pdfclean.c b/apps/pdfclean.c
index 87d6e9c3..c4d7b879 100644
--- a/apps/pdfclean.c
+++ b/apps/pdfclean.c
@@ -21,6 +21,7 @@ static int *renumbermap = NULL;
 
 static int dogarbage = 0;
 static int doexpand = 0;
+static int doascii = 0;
 
 static pdf_xref *xref = NULL;
 
@@ -41,6 +42,7 @@ static void usage(void)
 		"\t-gg\tin addition to -g compact xref table\n"
 		"\t-ggg\tin addition to -gg merge duplicate objects\n"
 		"\t-d\tdecompress streams\n"
+		"\t-a\tascii hex encode binary streams\n"
 		"\tpages\tcomma separated list of ranges\n");
 	exit(1);
 }
@@ -385,15 +387,118 @@ static void preloadobjstms(void)
  * Save streams and objects to the output
  */
 
+static inline int isbinary(int c) /* 1 when c is not treated as text, else 0 */
+{
+	int whitespace = (c == '\n') || (c == '\r') || (c == '\t');
+	int plain = (c >= 32) && (c <= 127); /* 127 is accepted as text here */
+	return !whitespace && !plain;
+}
+
+static int isbinarystream(fz_buffer *buf)
+{
+	/* Scan forward and stop at the first byte that isbinary() flags. */
+	int i, found = 0;
+	for (i = 0; i < buf->len && !found; i++)
+		found = isbinary(buf->data[i]);
+	return found;
+}
+
+static fz_buffer *hexbuf(unsigned char *p, int n)
+{
+	/*
+	 * Each input byte becomes two lowercase hex digits, a newline follows
+	 * every 32nd byte, and ">\n" is appended at the end; the size asked of
+	 * fz_newbuffer is exactly that total.
+	 */
+	static const char digits[16] = "0123456789abcdef";
+	fz_buffer *out = fz_newbuffer(n * 2 + (n / 32) + 2);
+	int col = 0;
+
+	for (; n--; p++)
+	{
+		out->data[out->len++] = digits[*p >> 4];
+		out->data[out->len++] = digits[*p & 15];
+		if (++col < 32)
+			continue;
+		out->data[out->len++] = '\n';
+		col = 0;
+	}
+
+	out->data[out->len++] = '>';
+	out->data[out->len++] = '\n';
+	return out;
+}
+
+static void addhexfilter(fz_obj *dict)
+{
+	/*
+	 * Prepend ASCIIHexDecode to dict's Filter and a matching null to its
+	 * DecodeParms. Only new objects are stored; f and dp are borrowed from
+	 * dict, and putting a value over itself could free it (TODO confirm).
+	 */
+	fz_obj *f, *dp, *ahx, *nullobj;
+	fz_obj *newf = nil, *newdp = nil;
+
+	ahx = fz_newname("ASCIIHexDecode");
+	nullobj = fz_newnull();
+
+	f = fz_dictgets(dict, "Filter");
+	dp = fz_dictgets(dict, "DecodeParms");
+
+	if (fz_isname(f))
+	{
+		newf = fz_newarray(2);
+		fz_arraypush(newf, ahx);
+		fz_arraypush(newf, f);
+		fz_dictputs(dict, "Filter", newf);
+		if (fz_isdict(dp))
+		{
+			newdp = fz_newarray(2);
+			fz_arraypush(newdp, nullobj);
+			fz_arraypush(newdp, dp);
+			fz_dictputs(dict, "DecodeParms", newdp);
+		}
+	}
+	else if (fz_isarray(f))
+	{
+		fz_arrayinsert(f, ahx); /* in place: dict already holds f */
+		if (fz_isarray(dp))
+			fz_arrayinsert(dp, nullobj);
+	}
+	else
+		fz_dictputs(dict, "Filter", ahx);
+
+	fz_dropobj(ahx);
+	fz_dropobj(nullobj);
+	if (newf)
+		fz_dropobj(newf);
+	if (newdp)
+		fz_dropobj(newdp);
+}
+
 static void copystream(fz_obj *obj, int num, int gen)
 {
 	fz_error error;
-	fz_buffer *buf;
+	fz_buffer *buf, *tmp;
+	fz_obj *newlen;
 
 	error = pdf_loadrawstream(&buf, xref, num, gen);
 	if (error)
 		die(error);
 
+	if (doascii && isbinarystream(buf))
+	{
+		tmp = hexbuf(buf->data, buf->len);
+		fz_dropbuffer(buf);
+		buf = tmp;
+
+		addhexfilter(obj);
+
+		newlen = fz_newint(buf->len);
+		fz_dictputs(obj, "Length", newlen);
+		fz_dropobj(newlen);
+	}
+
 	fprintf(out, "%d %d obj\n", num, gen);
 	fz_fprintobj(out, obj, !doexpand);
 	fprintf(out, "stream\n");
@@ -406,29 +511,35 @@ static void copystream(fz_obj *obj, int num, int gen)
 static void expandstream(fz_obj *obj, int num, int gen)
 {
 	fz_error error;
-	fz_buffer *buf;
-	fz_obj *newdict, *newlen;
+	fz_buffer *buf, *tmp;
+	fz_obj *newlen;
 
 	error = pdf_loadstream(&buf, xref, num, gen);
 	if (error)
 		die(error);
 
-	newdict = fz_copydict(obj);
-	fz_dictdels(newdict, "Filter");
-	fz_dictdels(newdict, "DecodeParms");
+	fz_dictdels(obj, "Filter"); /* obj itself is edited now; the removed lines worked on a copy */
+	fz_dictdels(obj, "DecodeParms");
+
+	if (doascii && isbinarystream(buf)) /* -a was given and buf holds a non-text byte */
+	{
+		tmp = hexbuf(buf->data, buf->len);
+		fz_dropbuffer(buf);
+		buf = tmp; /* from here on buf is the hex text */
+
+		addhexfilter(obj); /* list ASCIIHexDecode as the stream's filter */
+	}
 
 	newlen = fz_newint(buf->len);
-	fz_dictputs(newdict, "Length", newlen);
+	fz_dictputs(obj, "Length", newlen); /* Length matches the bytes written below */
 	fz_dropobj(newlen);
 
 	fprintf(out, "%d %d obj\n", num, gen);
-	fz_fprintobj(out, newdict, !doexpand);
+	fz_fprintobj(out, obj, !doexpand);
 	fprintf(out, "stream\n");
 	fwrite(buf->data, 1, buf->len, out);
 	fprintf(out, "endstream\nendobj\n\n");
 
-	fz_dropobj(newdict);
-
 	fz_dropbuffer(buf);
 }
 
@@ -468,7 +579,7 @@ static void writeobject(int num, int gen)
 	}
 	else
 	{
-		if (doexpand)
+		if (doexpand && !pdf_isjpximage(obj))
 			expandstream(obj, num, gen);
 		else
 			copystream(obj, num, gen);
@@ -572,13 +683,14 @@ int main(int argc, char **argv)
 	int c, num;
 	int subset;
 
-	while ((c = fz_getopt(argc, argv, "gdp:")) != -1)
+	while ((c = fz_getopt(argc, argv, "adgp:")) != -1)
 	{
 		switch (c)
 		{
 		case 'p': password = fz_optarg; break;
 		case 'g': dogarbage ++; break;
 		case 'd': doexpand ++; break;
+		case 'a': doascii ++; break;
 		default: usage(); break;
 		}
 	}
diff --git a/fitz/fitz.h b/fitz/fitz.h
index a2803899..99751b3e 100644
--- a/fitz/fitz.h
+++ b/fitz/fitz.h
@@ -424,6 +424,7 @@ int fz_arraylen(fz_obj *array);
 fz_obj *fz_arrayget(fz_obj *array, int i);
 void fz_arrayput(fz_obj *array, int i, fz_obj *obj);
 void fz_arraypush(fz_obj *array, fz_obj *obj);
+void fz_arrayinsert(fz_obj *array, fz_obj *obj);
 
 int fz_dictlen(fz_obj *dict);
 fz_obj *fz_dictgetkey(fz_obj *dict, int idx);
diff --git a/fitz/obj_array.c b/fitz/obj_array.c
index d7d2d2e7..75f4a6b5 100644
--- a/fitz/obj_array.c
+++ b/fitz/obj_array.c
@@ -104,6 +104,29 @@ fz_arraypush(fz_obj *obj, fz_obj *item)
 }
 
 void
+fz_arrayinsert(fz_obj *obj, fz_obj *item)
+{
+	/* Insert item (taking a new reference) at index 0, shifting the rest up. */
+	obj = fz_resolveindirect(obj);
+	if (!fz_isarray(obj))
+		fz_warn("assert: not an array (%s)", fz_objkindstr(obj));
+	else
+	{
+		if (obj->u.a.len + 1 > obj->u.a.cap)
+		{
+			int i;
+			obj->u.a.cap = (obj->u.a.cap * 3) / 2 + 1; /* + 1 so a cap of 0 or 1 still grows */
+			obj->u.a.items = fz_realloc(obj->u.a.items, sizeof (fz_obj*) * obj->u.a.cap);
+			for (i = obj->u.a.len ; i < obj->u.a.cap; i++)
+				obj->u.a.items[i] = nil;
+		}
+		memmove(obj->u.a.items + 1, obj->u.a.items, obj->u.a.len * sizeof(fz_obj*));
+		obj->u.a.items[0] = fz_keepobj(item);
+		obj->u.a.len++;
+	}
+}
+
+void
 fz_freearray(fz_obj *obj)
 {
 	int i;
diff --git a/mupdf/mupdf.h b/mupdf/mupdf.h
index 0fc0ec75..419a06db 100644
--- a/mupdf/mupdf.h
+++ b/mupdf/mupdf.h
@@ -260,6 +260,7 @@ void pdf_dropxobject(pdf_xobject *xobj);
 
 fz_error pdf_loadinlineimage(fz_pixmap **imgp, pdf_xref *xref, fz_obj *rdb, fz_obj *dict, fz_stream *file);
 fz_error pdf_loadimage(fz_pixmap **imgp, pdf_xref *xref, fz_obj *rdb, fz_obj *obj);
+int pdf_isjpximage(fz_obj *dict);
 
 /*
  * CMap
diff --git a/mupdf/pdf_image.c b/mupdf/pdf_image.c
index 6021da09..9eb1925e 100644
--- a/mupdf/pdf_image.c
+++ b/mupdf/pdf_image.c
@@ -251,10 +251,13 @@ pdf_loadinlineimage(fz_pixmap **pixp, pdf_xref *xref, fz_obj *rdb, fz_obj *dict,
 	return fz_okay;
 }
 
-static int
-pdf_isjpximage(fz_obj *filter)
+int
+pdf_isjpximage(fz_obj *dict)
 {
+	fz_obj *filter;
 	int i;
+
+	filter = fz_dictgets(dict, "Filter");
 	if (!strcmp(fz_toname(filter), "JPXDecode"))
 		return 1;
 	for (i = 0; i < fz_arraylen(filter); i++)
@@ -342,8 +345,7 @@ pdf_loadimage(fz_pixmap **pixp, pdf_xref *xref, fz_obj *rdb, fz_obj *dict)
 	pdf_logimage("load image (%d 0 R) {\n", fz_tonum(dict));
 
 	/* special case for JPEG2000 images */
-	obj = fz_dictgets(dict, "Filter");
-	if (pdf_isjpximage(obj))
+	if (pdf_isjpximage(dict))
 	{
 		error = pdf_loadjpximage(pixp, xref, rdb, dict);
 		if (error)
author	Tor Andersson <tor@ghostscript.com>	2010-12-04 21:17:43 +0000
committer	Tor Andersson <tor@ghostscript.com>	2010-12-04 21:17:43 +0000
commit	cf4e4a89c9614198fc0e38dc2f740ca1c05ab867 (patch)
tree	81461dcf04b38f06b797ab0f612071e4139bcfa7
parent	c8c6acfa08a280e6eabd89eb7a2fedd95b4d2f48 (diff)
download	mupdf-cf4e4a89c9614198fc0e38dc2f740ca1c05ab867.tar.xz