From cf4e4a89c9614198fc0e38dc2f740ca1c05ab867 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Sat, 4 Dec 2010 21:17:43 +0000 Subject: Add option to ascii hex encode binary streams in pdfclean. --- apps/pdfclean.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++----- fitz/fitz.h | 1 + fitz/obj_array.c | 23 +++++++++ mupdf/mupdf.h | 1 + mupdf/pdf_image.c | 10 ++-- 5 files changed, 155 insertions(+), 16 deletions(-) diff --git a/apps/pdfclean.c b/apps/pdfclean.c index 87d6e9c3..c4d7b879 100644 --- a/apps/pdfclean.c +++ b/apps/pdfclean.c @@ -21,6 +21,7 @@ static int *renumbermap = NULL; static int dogarbage = 0; static int doexpand = 0; +static int doascii = 0; static pdf_xref *xref = NULL; @@ -41,6 +42,7 @@ static void usage(void) "\t-gg\tin addition to -g compact xref table\n" "\t-ggg\tin addition to -gg merge duplicate objects\n" "\t-d\tdecompress streams\n" + "\t-a\tascii hex encode binary streams\n" "\tpages\tcomma separated list of ranges\n"); exit(1); } @@ -385,15 +387,118 @@ static void preloadobjstms(void) * Save streams and objects to the output */ +static inline int isbinary(int c) +{ + if (c == '\n' || c == '\r' || c == '\t') + return 0; + return c < 32 || c > 127; +} + +static int isbinarystream(fz_buffer *buf) +{ + int i; + for (i = 0; i < buf->len; i++) + if (isbinary(buf->data[i])) + return 1; + return 0; +} + +static fz_buffer *hexbuf(unsigned char *p, int n) +{ + static const char hex[16] = "0123456789abcdef"; + fz_buffer *buf; + int x = 0; + + buf = fz_newbuffer(n * 2 + (n / 32) + 2); + + while (n--) + { + buf->data[buf->len++] = hex[*p >> 4]; + buf->data[buf->len++] = hex[*p & 15]; + if (++x == 32) + { + buf->data[buf->len++] = '\n'; + x = 0; + } + p++; + } + + buf->data[buf->len++] = '>'; + buf->data[buf->len++] = '\n'; + + return buf; +} + +static void addhexfilter(fz_obj *dict) +{ + fz_obj *f, *dp, *newf, *newdp; + fz_obj *ahx, *nullobj; + + ahx = fz_newname("ASCIIHexDecode"); + nullobj = fz_newnull(); + newf = newdp = nil; + + f = fz_dictgets(dict, "Filter"); + dp = fz_dictgets(dict, "DecodeParms"); + + if (fz_isname(f)) + { + newf = fz_newarray(2); + fz_arraypush(newf, ahx); + fz_arraypush(newf, f); + f = newf; + if (fz_isdict(dp)) + { + newdp = fz_newarray(2); + fz_arraypush(newdp, nullobj); + fz_arraypush(newdp, dp); + dp = newdp; + } + } + else if (fz_isarray(f)) + { + fz_arrayinsert(f, ahx); + if (fz_isarray(dp)) + fz_arrayinsert(dp, nullobj); + } + else + f = ahx; + + fz_dictputs(dict, "Filter", f); + if (dp) + fz_dictputs(dict, "DecodeParms", dp); + + fz_dropobj(ahx); + fz_dropobj(nullobj); + if (newf) + fz_dropobj(newf); + if (newdp) + fz_dropobj(newdp); +} + static void copystream(fz_obj *obj, int num, int gen) { fz_error error; - fz_buffer *buf; + fz_buffer *buf, *tmp; + fz_obj *newlen; error = pdf_loadrawstream(&buf, xref, num, gen); if (error) die(error); + if (doascii && isbinarystream(buf)) + { + tmp = hexbuf(buf->data, buf->len); + fz_dropbuffer(buf); + buf = tmp; + + addhexfilter(obj); + + newlen = fz_newint(buf->len); + fz_dictputs(obj, "Length", newlen); + fz_dropobj(newlen); + } + fprintf(out, "%d %d obj\n", num, gen); fz_fprintobj(out, obj, !doexpand); fprintf(out, "stream\n"); @@ -406,29 +511,35 @@ static void copystream(fz_obj *obj, int num, int gen) static void expandstream(fz_obj *obj, int num, int gen) { fz_error error; - fz_buffer *buf; - fz_obj *newdict, *newlen; + fz_buffer *buf, *tmp; + fz_obj *newlen; error = pdf_loadstream(&buf, xref, num, gen); if (error) die(error); - newdict = fz_copydict(obj); - fz_dictdels(newdict, "Filter"); - fz_dictdels(newdict, "DecodeParms"); + fz_dictdels(obj, "Filter"); + fz_dictdels(obj, "DecodeParms"); + + if (doascii && isbinarystream(buf)) + { + tmp = hexbuf(buf->data, buf->len); + fz_dropbuffer(buf); + buf = tmp; + + addhexfilter(obj); + } newlen = fz_newint(buf->len); - fz_dictputs(newdict, "Length", newlen); + fz_dictputs(obj, "Length", newlen); fz_dropobj(newlen); fprintf(out, "%d %d obj\n", num, gen); - fz_fprintobj(out, newdict, !doexpand); + fz_fprintobj(out, obj, !doexpand); fprintf(out, "stream\n"); fwrite(buf->data, 1, buf->len, out); fprintf(out, "endstream\nendobj\n\n"); - fz_dropobj(newdict); - fz_dropbuffer(buf); } @@ -468,7 +579,7 @@ static void writeobject(int num, int gen) } else { - if (doexpand) + if (doexpand && !pdf_isjpximage(obj)) expandstream(obj, num, gen); else copystream(obj, num, gen); @@ -572,13 +683,14 @@ int main(int argc, char **argv) int c, num; int subset; - while ((c = fz_getopt(argc, argv, "gdp:")) != -1) + while ((c = fz_getopt(argc, argv, "adgp:")) != -1) { switch (c) { case 'p': password = fz_optarg; break; case 'g': dogarbage ++; break; case 'd': doexpand ++; break; + case 'a': doascii ++; break; default: usage(); break; } } diff --git a/fitz/fitz.h b/fitz/fitz.h index a2803899..99751b3e 100644 --- a/fitz/fitz.h +++ b/fitz/fitz.h @@ -424,6 +424,7 @@ int fz_arraylen(fz_obj *array); fz_obj *fz_arrayget(fz_obj *array, int i); void fz_arrayput(fz_obj *array, int i, fz_obj *obj); void fz_arraypush(fz_obj *array, fz_obj *obj); +void fz_arrayinsert(fz_obj *array, fz_obj *obj); int fz_dictlen(fz_obj *dict); fz_obj *fz_dictgetkey(fz_obj *dict, int idx); diff --git a/fitz/obj_array.c b/fitz/obj_array.c index d7d2d2e7..75f4a6b5 100644 --- a/fitz/obj_array.c +++ b/fitz/obj_array.c @@ -103,6 +103,29 @@ fz_arraypush(fz_obj *obj, fz_obj *item) } } +void +fz_arrayinsert(fz_obj *obj, fz_obj *item) +{ + obj = fz_resolveindirect(obj); + + if (!fz_isarray(obj)) + fz_warn("assert: not an array (%s)", fz_objkindstr(obj)); + else + { + if (obj->u.a.len + 1 > obj->u.a.cap) + { + int i; + obj->u.a.cap = (obj->u.a.cap * 3) / 2; + obj->u.a.items = fz_realloc(obj->u.a.items, sizeof (fz_obj*) * obj->u.a.cap); + for (i = obj->u.a.len ; i < obj->u.a.cap; i++) + obj->u.a.items[i] = nil; + } + memmove(obj->u.a.items + 1, obj->u.a.items, obj->u.a.len * sizeof(fz_obj*)); + obj->u.a.items[0] = fz_keepobj(item); + obj->u.a.len++; + } +} + void fz_freearray(fz_obj *obj) { diff --git a/mupdf/mupdf.h b/mupdf/mupdf.h index 0fc0ec75..419a06db 100644 --- a/mupdf/mupdf.h +++ b/mupdf/mupdf.h @@ -260,6 +260,7 @@ void pdf_dropxobject(pdf_xobject *xobj); fz_error pdf_loadinlineimage(fz_pixmap **imgp, pdf_xref *xref, fz_obj *rdb, fz_obj *dict, fz_stream *file); fz_error pdf_loadimage(fz_pixmap **imgp, pdf_xref *xref, fz_obj *rdb, fz_obj *obj); +int pdf_isjpximage(fz_obj *dict); /* * CMap diff --git a/mupdf/pdf_image.c b/mupdf/pdf_image.c index 6021da09..9eb1925e 100644 --- a/mupdf/pdf_image.c +++ b/mupdf/pdf_image.c @@ -251,10 +251,13 @@ pdf_loadinlineimage(fz_pixmap **pixp, pdf_xref *xref, fz_obj *rdb, fz_obj *dict, return fz_okay; } -static int -pdf_isjpximage(fz_obj *filter) +int +pdf_isjpximage(fz_obj *dict) { + fz_obj *filter; int i; + + filter = fz_dictgets(dict, "Filter"); if (!strcmp(fz_toname(filter), "JPXDecode")) return 1; for (i = 0; i < fz_arraylen(filter); i++) @@ -342,8 +345,7 @@ pdf_loadimage(fz_pixmap **pixp, pdf_xref *xref, fz_obj *rdb, fz_obj *dict) pdf_logimage("load image (%d 0 R) {\n", fz_tonum(dict)); /* special case for JPEG2000 images */ - obj = fz_dictgets(dict, "Filter"); - if (pdf_isjpximage(obj)) + if (pdf_isjpximage(dict)) { error = pdf_loadjpximage(pixp, xref, rdb, dict); if (error) -- cgit v1.2.3