diff options
Diffstat (limited to 'apps/pdfclean.c')
-rw-r--r-- | apps/pdfclean.c | 342 |
1 files changed, 189 insertions, 153 deletions
diff --git a/apps/pdfclean.c b/apps/pdfclean.c index ee6ca722..0ecbb98e 100644 --- a/apps/pdfclean.c +++ b/apps/pdfclean.c @@ -4,10 +4,13 @@ * Rewrite PDF with pretty printed objects. * Garbage collect unreachable objects. * Inflate compressed streams. - * Encrypt output. + * Create subset documents. + * + * TODO: linearize document for fast web view */ -#include "pdftool.h" +#include "fitz.h" +#include "mupdf.h" static FILE *out = NULL; @@ -20,6 +23,29 @@ static pdf_xrefentry *oldxreflist = NULL; static int dogarbage = 0; static int doexpand = 0; +static pdf_xref *xref = NULL; + +void die(fz_error error) +{ + fz_catch(error, "aborting"); + if (xref) + pdf_freexref(xref); + exit(1); +} + +static void usage(void) +{ + fprintf(stderr, + "usage: pdfclean [options] input.pdf [output.pdf] [pages]\n" + "\t-p -\tpassword\n" + "\t-g\tgarbage collect unused objects\n" + "\t-gg\tin addition to -g compact xref table\n" + "\t-ggg\tin addition to -gg merge duplicate objects\n" + "\t-x\texpand compressed streams\n" + "\tpages\tcomma separated list of ranges\n"); + exit(1); +} + /* * Garbage collect objects not reachable from the trailer. */ @@ -67,6 +93,10 @@ static void sweepref(fz_obj *obj) sweepobj(fz_resolveindirect(obj)); } +/* + * Renumber objects to compact the xref table + */ + static void renumberobj(fz_obj *obj) { int i; @@ -109,6 +139,53 @@ static void renumberobj(fz_obj *obj) } } +static void renumberxref(void) +{ + int num, newnum; + + newnumlist = fz_malloc(xref->len * sizeof(int)); + oldxreflist = fz_malloc(xref->len * sizeof(pdf_xrefentry)); + for (num = 0; num < xref->len; num++) + { + newnumlist[num] = -1; + oldxreflist[num] = xref->table[num]; + } + + newnum = 1; + for (num = 0; num < xref->len; num++) + { + if (xref->table[num].type == 'f') + uselist[num] = 0; + if (uselist[num]) + newnumlist[num] = newnum++; + } + + renumberobj(xref->trailer); + for (num = 0; num < xref->len; num++) + renumberobj(xref->table[num].obj); + + for (num = 0; num < xref->len; num++) + uselist[num] = 0; + + for (num = 0; num < xref->len; num++) + { + if (newnumlist[num] >= 0) + { + xref->table[newnumlist[num]] = oldxreflist[num]; + uselist[newnumlist[num]] = 1; + } + } + + fz_free(oldxreflist); + fz_free(newnumlist); + + xref->len = newnum; +} + +/* + * Scan and remove duplicate objects (slow) + */ + static void removeduplicateobjs(void) { int num, other; @@ -142,6 +219,99 @@ static void removeduplicateobjs(void) fz_free(newnumlist); } +/* + * Recreate page tree to only retain specified pages. + */ + +static void retainpages(int argc, char **argv) +{ + fz_obj *root, *pages, *kids; + int count; + + /* Snatch pages entry from root dict */ + root = fz_dictgets(xref->trailer, "Root"); + pages = fz_keepobj(fz_dictgets(root, "Pages")); + + /* Then empty the root dict */ + while (fz_dictlen(root) > 0) + { + fz_obj *key = fz_dictgetkey(root, 0); + fz_dictdel(root, key); + } + + /* And only retain pages and type entries */ + fz_dictputs(root, "Pages", pages); + fz_dictputs(root, "Type", fz_newname("Catalog")); + fz_dropobj(pages); + + /* Create a new kids array too add into pages dict + * since each element must be replaced to point to + * a retained page */ + kids = fz_newarray(1); + count = 0; + + /* Retain pages specified */ + while (argc - fz_optind) + { + int page, spage, epage; + char *spec, *dash; + char *pagelist = argv[fz_optind]; + + spec = fz_strsep(&pagelist, ","); + while (spec) + { + dash = strchr(spec, '-'); + + if (dash == spec) + spage = epage = 1; + else + spage = epage = atoi(spec); + + if (dash) + { + if (strlen(dash) > 1) + epage = atoi(dash + 1); + else + epage = pdf_getpagecount(xref); + } + + if (spage > epage) + page = spage, spage = epage, epage = page; + + if (spage < 1) + spage = 1; + if (epage > pdf_getpagecount(xref)) + epage = pdf_getpagecount(xref); + + for (page = spage; page <= epage; page++) + { + fz_obj *pageobj = pdf_getpageobject(xref, page); + fz_obj *pageref = pdf_getpageref(xref, page); + + /* Update parent reference */ + fz_dictputs(pageobj, "Parent", pages); + + /* Store page object in new kids array */ + fz_arraypush(kids, pageref); + count++; + + fz_dropobj(pageref); + } + + spec = fz_strsep(&pagelist, ","); + } + + fz_optind++; + } + + /* Update page count and kids array */ + fz_dictputs(pages, "Count", fz_newint(count)); + fz_dictputs(pages, "Kids", kids); +} + +/* + * Make sure we have loaded objects from object streams. + */ static void preloadobjstms(void) { @@ -161,6 +331,10 @@ static void preloadobjstms(void) } } +/* + * Save streams and objects to the output + */ + static void copystream(fz_obj *obj, int num, int gen) { fz_error error; @@ -208,7 +382,7 @@ static void expandstream(fz_obj *obj, int num, int gen) fz_dropbuffer(buf); } -static void saveobject(int num, int gen) +static void writeobject(int num, int gen) { fz_error error; fz_obj *obj; @@ -253,7 +427,7 @@ static void saveobject(int num, int gen) fz_dropobj(obj); } -static void savexref(void) +static void writexref(void) { fz_obj *trailer; fz_obj *obj; @@ -297,148 +471,7 @@ static void savexref(void) fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref); } -static void cleanusage(void) -{ - fprintf(stderr, - "usage: pdfclean [options] input.pdf [outfile.pdf] [pages]\n" - "\t-p -\tpassword for decryption\n" - "\t-g\tgarbage collect unused objects\n" - "\t-gg\tin addition to -g xref is compacted\n" - "\t-ggg\tin addition to -gg identical objects are garbage collected\n" - "\t-x\texpand compressed streams\n"); - exit(1); -} - -static void retainpages(int argc, char **argv) -{ - fz_obj *root, *pages, *kids; - int count; - - /* Snatch pages entry from root dict */ - root = fz_dictgets(xref->trailer, "Root"); - pages = fz_keepobj(fz_dictgets(root, "Pages")); - - /* Then empty the root dict */ - while (fz_dictlen(root) > 0) - { - fz_obj *key = fz_dictgetkey(root, 0); - fz_dictdel(root, key); - } - - /* And only retain pages and type entries */ - fz_dictputs(root, "Pages", pages); - fz_dictputs(root, "Type", fz_newname("Catalog")); - fz_dropobj(pages); - - /* Create a new kids array too add into pages dict - * since each element must be replaced to point to - * a retained page */ - kids = fz_newarray(1); - count = 0; - - /* Retain pages specified */ - while (argc - fz_optind) - { - int page, spage, epage; - char *spec, *dash; - char *pagelist = argv[fz_optind]; - - spec = fz_strsep(&pagelist, ","); - while (spec) - { - dash = strchr(spec, '-'); - - if (dash == spec) - spage = epage = 1; - else - spage = epage = atoi(spec); - - if (dash) - { - if (strlen(dash) > 1) - epage = atoi(dash + 1); - else - epage = pagecount; - } - - if (spage > epage) - page = spage, spage = epage, epage = page; - - if (spage < 1) - spage = 1; - if (epage > pagecount) - epage = pagecount; - - for (page = spage; page <= epage; page++) - { - fz_obj *pageobj = pdf_getpageobject(xref, page); - fz_obj *pageref = pdf_getpageref(xref, page); - - /* Update parent reference */ - fz_dictputs(pageobj, "Parent", pages); - - /* Store page object in new kids array */ - fz_arraypush(kids, pageref); - count++; - - fz_dropobj(pageref); - } - - spec = fz_strsep(&pagelist, ","); - } - - fz_optind++; - } - - /* Update page count and kids array */ - fz_dictputs(pages, "Count", fz_newint(count)); - fz_dictputs(pages, "Kids", kids); -} - -static void renumberxref(void) -{ - int num, newnum; - - newnumlist = fz_malloc(xref->len * sizeof(int)); - oldxreflist = fz_malloc(xref->len * sizeof(pdf_xrefentry)); - for (num = 0; num < xref->len; num++) - { - newnumlist[num] = -1; - oldxreflist[num] = xref->table[num]; - } - - newnum = 1; - for (num = 0; num < xref->len; num++) - { - if (xref->table[num].type == 'f') - uselist[num] = 0; - if (uselist[num]) - newnumlist[num] = newnum++; - } - - renumberobj(xref->trailer); - for (num = 0; num < xref->len; num++) - renumberobj(xref->table[num].obj); - - for (num = 0; num < xref->len; num++) - uselist[num] = 0; - - for (num = 0; num < xref->len; num++) - { - if (newnumlist[num] >= 0) - { - xref->table[newnumlist[num]] = oldxreflist[num]; - uselist[newnumlist[num]] = 1; - } - } - - fz_free(oldxreflist); - fz_free(newnumlist); - - xref->len = newnum; -} - -static void outputpdf(void) +static void writepdf(void) { int lastfree; int num; @@ -461,7 +494,7 @@ static void outputpdf(void) if (xref->table[num].type == 'n' || xref->table[num].type == 'o') { ofslist[num] = ftell(out); - saveobject(num, genlist[num]); + writeobject(num, genlist[num]); } } @@ -477,11 +510,12 @@ static void outputpdf(void) } } - savexref(); + writexref(); } int main(int argc, char **argv) { + fz_error error; char *infile; char *outfile = "out.pdf"; char *password = ""; @@ -495,12 +529,12 @@ int main(int argc, char **argv) case 'p': password = fz_optarg; break; case 'g': dogarbage ++; break; case 'x': doexpand ++; break; - default: cleanusage(); break; + default: usage(); break; } } if (argc - fz_optind < 1) - cleanusage(); + usage(); infile = argv[fz_optind++]; @@ -514,7 +548,9 @@ int main(int argc, char **argv) if (argc - fz_optind > 0) subset = 1; - openxref(infile, password, 0, subset); + error = pdf_openxref(&xref, infile, password); + if (error) + die(fz_rethrow(error, "cannot open input file '%s'", infile)); out = fopen(outfile, "wb"); if (!out) @@ -552,9 +588,9 @@ int main(int argc, char **argv) if (dogarbage >= 2) renumberxref(); - outputpdf(); + writepdf(); - closexref(); + pdf_freexref(xref); return 0; } |