summary refs log tree commit diff
path: root/apps/pdfclean.c
diff options
context:
space:
mode:
author: Tor Andersson <tor@ghostscript.com> 2010-07-15 23:25:00 +0000
committer: Tor Andersson <tor@ghostscript.com> 2010-07-15 23:25:00 +0000
commit: 22c78bd39b2d8316af4c7d20d7f322fa7d534102 (patch)
tree: 5e957a0c183049970f4eda05dc6d8f4a1692e835 /apps/pdfclean.c
parent: 3effc3ad56c9e90f39638a6edd14a9aa07d9dabe (diff)
download: mupdf-22c78bd39b2d8316af4c7d20d7f322fa7d534102.tar.xz
Refactored xref opening functions, command line tools, and rewrote pdfdraw to be more illustrative.
Diffstat (limited to 'apps/pdfclean.c')
-rw-r--r-- apps/pdfclean.c 342
1 files changed, 189 insertions, 153 deletions
diff --git a/apps/pdfclean.c b/apps/pdfclean.c
index ee6ca722..0ecbb98e 100644
--- a/apps/pdfclean.c
+++ b/apps/pdfclean.c
@@ -4,10 +4,13 @@
* Rewrite PDF with pretty printed objects.
* Garbage collect unreachable objects.
* Inflate compressed streams.
- * Encrypt output.
+ * Create subset documents.
+ *
+ * TODO: linearize document for fast web view
*/
-#include "pdftool.h"
+#include "fitz.h"
+#include "mupdf.h"
static FILE *out = NULL;
@@ -20,6 +23,29 @@ static pdf_xrefentry *oldxreflist = NULL;
static int dogarbage = 0;
static int doexpand = 0;
+static pdf_xref *xref = NULL;
+
+/*
+ * Fatal error exit: hands the error to fz_catch with the message
+ * "aborting", frees the global xref when one is open, then exits
+ * with status 1.
+ */
+void die(fz_error error)
+{
+	fz_catch(error, "aborting");
+
+	if (xref != NULL)
+	{
+		pdf_freexref(xref);
+	}
+
+	exit(1);
+}
+
+/*
+ * Print the command line synopsis to stderr and exit with status 1.
+ */
+static void usage(void)
+{
+	/* No printf conversions in this text, so it is written verbatim. */
+	static const char synopsis[] =
+		"usage: pdfclean [options] input.pdf [output.pdf] [pages]\n"
+		"\t-p -\tpassword\n"
+		"\t-g\tgarbage collect unused objects\n"
+		"\t-gg\tin addition to -g compact xref table\n"
+		"\t-ggg\tin addition to -gg merge duplicate objects\n"
+		"\t-x\texpand compressed streams\n"
+		"\tpages\tcomma separated list of ranges\n";
+
+	fputs(synopsis, stderr);
+	exit(1);
+}
+
/*
* Garbage collect objects not reachable from the trailer.
*/
@@ -67,6 +93,10 @@ static void sweepref(fz_obj *obj)
sweepobj(fz_resolveindirect(obj));
}
+/*
+ * Renumber objects to compact the xref table
+ */
+
static void renumberobj(fz_obj *obj)
{
int i;
@@ -109,6 +139,53 @@ static void renumberobj(fz_obj *obj)
}
}
+/*
+ * Give every xref entry still marked in uselist a new consecutive object
+ * number (starting at 1), pass the trailer and every table object through
+ * renumberobj, then move the surviving entries to their new slots and
+ * shrink xref->len to match.
+ */
+static void renumberxref(void)
+{
+	int oldnum;
+	int next = 1;
+
+	newnumlist = fz_malloc(xref->len * sizeof(int));
+	oldxreflist = fz_malloc(xref->len * sizeof *oldxreflist);
+
+	/* Snapshot the table and hand out the new numbers in one pass. */
+	for (oldnum = 0; oldnum < xref->len; oldnum++)
+	{
+		oldxreflist[oldnum] = xref->table[oldnum];
+
+		/* Free entries are never kept, whatever uselist said before. */
+		if (xref->table[oldnum].type == 'f')
+			uselist[oldnum] = 0;
+
+		newnumlist[oldnum] = uselist[oldnum] ? next++ : -1;
+	}
+
+	renumberobj(xref->trailer);
+	for (oldnum = 0; oldnum < xref->len; oldnum++)
+	{
+		renumberobj(xref->table[oldnum].obj);
+	}
+
+	/* Clear uselist completely before marking the new slots; the two
+	 * passes must stay separate because a new slot index can differ
+	 * from the index currently being visited. */
+	for (oldnum = 0; oldnum < xref->len; oldnum++)
+	{
+		uselist[oldnum] = 0;
+	}
+
+	for (oldnum = 0; oldnum < xref->len; oldnum++)
+	{
+		int slot = newnumlist[oldnum];
+
+		if (slot < 0)
+			continue;
+
+		xref->table[slot] = oldxreflist[oldnum];
+		uselist[slot] = 1;
+	}
+
+	fz_free(oldxreflist);
+	fz_free(newnumlist);
+
+	xref->len = next;
+}
+
+/*
+ * Scan and remove duplicate objects (slow)
+ */
+
static void removeduplicateobjs(void)
{
int num, other;
@@ -142,6 +219,99 @@ static void removeduplicateobjs(void)
fz_free(newnumlist);
}
+/*
+ * Recreate page tree to only retain specified pages.
+ *
+ * Every remaining argument argv[fz_optind..argc-1] is a comma separated
+ * page list. Each entry is a single page "N" or a range "N-M"; "-M" starts
+ * at page 1 and "N-" runs to the last page. Reversed ranges are swapped and
+ * ranges are clamped to 1..page count. fz_optind is advanced past every
+ * argument consumed.
+ *
+ * The root dict is reduced to its "Pages" and "Type" entries, and the pages
+ * dict receives a fresh "Kids" array and "Count" for the retained pages.
+ */
+
+static void retainpages(int argc, char **argv)
+{
+	fz_obj *root, *pages, *kids, *obj;
+	int pagecount;
+	int count;
+
+	/* Snatch pages entry from root dict */
+	root = fz_dictgets(xref->trailer, "Root");
+	pages = fz_keepobj(fz_dictgets(root, "Pages"));
+
+	/* Then empty the root dict */
+	while (fz_dictlen(root) > 0)
+	{
+		fz_obj *key = fz_dictgetkey(root, 0);
+		fz_dictdel(root, key);
+	}
+
+	/* And only retain pages and type entries. Objects created here are
+	 * dropped once stored, the same way pages and pageref are handled,
+	 * so that the container holds the only remaining reference. */
+	fz_dictputs(root, "Pages", pages);
+	obj = fz_newname("Catalog");
+	fz_dictputs(root, "Type", obj);
+	fz_dropobj(obj);
+	fz_dropobj(pages);
+
+	/* Create a new kids array to add into pages dict
+	 * since each element must be replaced to point to
+	 * a retained page */
+	kids = fz_newarray(1);
+	count = 0;
+
+	/* The page tree is not modified until after the loop, so the
+	 * page count only needs to be looked up once. */
+	pagecount = pdf_getpagecount(xref);
+
+	/* Retain pages specified */
+	while (argc - fz_optind)
+	{
+		int page, spage, epage;
+		char *spec, *dash;
+		char *pagelist = argv[fz_optind];
+
+		spec = fz_strsep(&pagelist, ",");
+		while (spec)
+		{
+			dash = strchr(spec, '-');
+
+			/* A leading dash means "from the first page" */
+			if (dash == spec)
+				spage = epage = 1;
+			else
+				spage = epage = atoi(spec);
+
+			if (dash)
+			{
+				/* A trailing dash means "to the last page" */
+				if (strlen(dash) > 1)
+					epage = atoi(dash + 1);
+				else
+					epage = pagecount;
+			}
+
+			if (spage > epage)
+				page = spage, spage = epage, epage = page;
+
+			if (spage < 1)
+				spage = 1;
+			if (epage > pagecount)
+				epage = pagecount;
+
+			for (page = spage; page <= epage; page++)
+			{
+				fz_obj *pageobj = pdf_getpageobject(xref, page);
+				fz_obj *pageref = pdf_getpageref(xref, page);
+
+				/* Update parent reference */
+				fz_dictputs(pageobj, "Parent", pages);
+
+				/* Store page object in new kids array */
+				fz_arraypush(kids, pageref);
+				count++;
+
+				fz_dropobj(pageref);
+			}
+
+			spec = fz_strsep(&pagelist, ",");
+		}
+
+		fz_optind++;
+	}
+
+	/* Update page count and kids array, releasing our own references
+	 * now that the pages dict holds them */
+	obj = fz_newint(count);
+	fz_dictputs(pages, "Count", obj);
+	fz_dropobj(obj);
+	fz_dictputs(pages, "Kids", kids);
+	fz_dropobj(kids);
+}
+
+/*
+ * Make sure we have loaded objects from object streams.
+ */
static void preloadobjstms(void)
{
@@ -161,6 +331,10 @@ static void preloadobjstms(void)
}
}
+/*
+ * Save streams and objects to the output
+ */
+
static void copystream(fz_obj *obj, int num, int gen)
{
fz_error error;
@@ -208,7 +382,7 @@ static void expandstream(fz_obj *obj, int num, int gen)
fz_dropbuffer(buf);
}
-static void saveobject(int num, int gen)
+static void writeobject(int num, int gen)
{
fz_error error;
fz_obj *obj;
@@ -253,7 +427,7 @@ static void saveobject(int num, int gen)
fz_dropobj(obj);
}
-static void savexref(void)
+static void writexref(void)
{
fz_obj *trailer;
fz_obj *obj;
@@ -297,148 +471,7 @@ static void savexref(void)
fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref);
}
-static void cleanusage(void)
-{
- fprintf(stderr,
- "usage: pdfclean [options] input.pdf [outfile.pdf] [pages]\n"
- "\t-p -\tpassword for decryption\n"
- "\t-g\tgarbage collect unused objects\n"
- "\t-gg\tin addition to -g xref is compacted\n"
- "\t-ggg\tin addition to -gg identical objects are garbage collected\n"
- "\t-x\texpand compressed streams\n");
- exit(1);
-}
-
-static void retainpages(int argc, char **argv)
-{
- fz_obj *root, *pages, *kids;
- int count;
-
- /* Snatch pages entry from root dict */
- root = fz_dictgets(xref->trailer, "Root");
- pages = fz_keepobj(fz_dictgets(root, "Pages"));
-
- /* Then empty the root dict */
- while (fz_dictlen(root) > 0)
- {
- fz_obj *key = fz_dictgetkey(root, 0);
- fz_dictdel(root, key);
- }
-
- /* And only retain pages and type entries */
- fz_dictputs(root, "Pages", pages);
- fz_dictputs(root, "Type", fz_newname("Catalog"));
- fz_dropobj(pages);
-
- /* Create a new kids array too add into pages dict
- * since each element must be replaced to point to
- * a retained page */
- kids = fz_newarray(1);
- count = 0;
-
- /* Retain pages specified */
- while (argc - fz_optind)
- {
- int page, spage, epage;
- char *spec, *dash;
- char *pagelist = argv[fz_optind];
-
- spec = fz_strsep(&pagelist, ",");
- while (spec)
- {
- dash = strchr(spec, '-');
-
- if (dash == spec)
- spage = epage = 1;
- else
- spage = epage = atoi(spec);
-
- if (dash)
- {
- if (strlen(dash) > 1)
- epage = atoi(dash + 1);
- else
- epage = pagecount;
- }
-
- if (spage > epage)
- page = spage, spage = epage, epage = page;
-
- if (spage < 1)
- spage = 1;
- if (epage > pagecount)
- epage = pagecount;
-
- for (page = spage; page <= epage; page++)
- {
- fz_obj *pageobj = pdf_getpageobject(xref, page);
- fz_obj *pageref = pdf_getpageref(xref, page);
-
- /* Update parent reference */
- fz_dictputs(pageobj, "Parent", pages);
-
- /* Store page object in new kids array */
- fz_arraypush(kids, pageref);
- count++;
-
- fz_dropobj(pageref);
- }
-
- spec = fz_strsep(&pagelist, ",");
- }
-
- fz_optind++;
- }
-
- /* Update page count and kids array */
- fz_dictputs(pages, "Count", fz_newint(count));
- fz_dictputs(pages, "Kids", kids);
-}
-
-static void renumberxref(void)
-{
- int num, newnum;
-
- newnumlist = fz_malloc(xref->len * sizeof(int));
- oldxreflist = fz_malloc(xref->len * sizeof(pdf_xrefentry));
- for (num = 0; num < xref->len; num++)
- {
- newnumlist[num] = -1;
- oldxreflist[num] = xref->table[num];
- }
-
- newnum = 1;
- for (num = 0; num < xref->len; num++)
- {
- if (xref->table[num].type == 'f')
- uselist[num] = 0;
- if (uselist[num])
- newnumlist[num] = newnum++;
- }
-
- renumberobj(xref->trailer);
- for (num = 0; num < xref->len; num++)
- renumberobj(xref->table[num].obj);
-
- for (num = 0; num < xref->len; num++)
- uselist[num] = 0;
-
- for (num = 0; num < xref->len; num++)
- {
- if (newnumlist[num] >= 0)
- {
- xref->table[newnumlist[num]] = oldxreflist[num];
- uselist[newnumlist[num]] = 1;
- }
- }
-
- fz_free(oldxreflist);
- fz_free(newnumlist);
-
- xref->len = newnum;
-}
-
-static void outputpdf(void)
+static void writepdf(void)
{
int lastfree;
int num;
@@ -461,7 +494,7 @@ static void outputpdf(void)
if (xref->table[num].type == 'n' || xref->table[num].type == 'o')
{
ofslist[num] = ftell(out);
- saveobject(num, genlist[num]);
+ writeobject(num, genlist[num]);
}
}
@@ -477,11 +510,12 @@ static void outputpdf(void)
}
}
- savexref();
+ writexref();
}
int main(int argc, char **argv)
{
+ fz_error error;
char *infile;
char *outfile = "out.pdf";
char *password = "";
@@ -495,12 +529,12 @@ int main(int argc, char **argv)
case 'p': password = fz_optarg; break;
case 'g': dogarbage ++; break;
case 'x': doexpand ++; break;
- default: cleanusage(); break;
+ default: usage(); break;
}
}
if (argc - fz_optind < 1)
- cleanusage();
+ usage();
infile = argv[fz_optind++];
@@ -514,7 +548,9 @@ int main(int argc, char **argv)
if (argc - fz_optind > 0)
subset = 1;
- openxref(infile, password, 0, subset);
+ error = pdf_openxref(&xref, infile, password);
+ if (error)
+ die(fz_rethrow(error, "cannot open input file '%s'", infile));
out = fopen(outfile, "wb");
if (!out)
@@ -552,9 +588,9 @@ int main(int argc, char **argv)
if (dogarbage >= 2)
renumberxref();
- outputpdf();
+ writepdf();
- closexref();
+ pdf_freexref(xref);
return 0;
}