summaryrefslogtreecommitdiff
path: root/test/pdfclean.c
diff options
context:
space:
mode:
Diffstat (limited to 'test/pdfclean.c')
-rw-r--r--test/pdfclean.c501
1 files changed, 501 insertions, 0 deletions
diff --git a/test/pdfclean.c b/test/pdfclean.c
new file mode 100644
index 00000000..00721842
--- /dev/null
+++ b/test/pdfclean.c
@@ -0,0 +1,501 @@
+#include <fitz.h>
+#include <mupdf.h>
+
+#define encrypt encrypt3am
+
+int (*printobj)(FILE*,fz_obj*) = fz_fprintobj;
+
+/*
+ * Rewrite PDF with cleaned up syntax, and consolidate the xref table.
+ * Remove encryption while we're at it :)
+ */
+
+static FILE *out;
+
+static pdf_xref *xt;
+
+static int *ofslist;
+static int *genlist;
+static int *uselist;
+
+static int dorebuild = 0;
+static int doexpand = 0;
+static pdf_crypt *encrypt = nil;
+static fz_obj *encryptdict = nil;
+static fz_obj *id = nil;
+static int dogc = 0;
+
+void usage()
+{
+ fprintf(stderr,
+ "usage: pdfclean [options] infile.pdf outfile.pdf\n"
+ " -r\treconstruct broken xref table\n"
+ " -g\tgarbage collect unused objects\n"
+ " -x\texpand compressed streams\n"
+ " -c\twrite compact objects\n"
+ " -d -\tset user password for decryption\n"
+ " -e\tencrypt outfile\n"
+ " -u -\tset user password for encryption\n"
+ " -o -\tset owner password\n"
+ " -p -\tset permissions\n"
+ " -n -\tkey length in bits: 40 <= n <= 128\n"
+ );
+ exit(1);
+}
+
+void garbagecollect(fz_obj *ref);
+
+void gc0(fz_obj *obj)
+{
+ int i;
+
+ if (fz_isdict(obj))
+ for (i = 0; i < fz_dictlen(obj); i++)
+ gc0(fz_dictgetval(obj, i));
+
+ if (fz_isarray(obj))
+ for (i = 0; i < fz_arraylen(obj); i++)
+ gc0(fz_arrayget(obj, i));
+
+ if (fz_isindirect(obj))
+ garbagecollect(obj);
+}
+
+void garbagecollect(fz_obj *ref)
+{
+ fz_obj *obj;
+ int stmofs;
+
+ if (uselist[fz_toobjid(ref)])
+ return;
+
+ uselist[fz_toobjid(ref)] = 1;
+
+ pdf_loadindirect(&obj, xt, ref, &stmofs);
+
+ if (stmofs != -1) {
+ fz_obj *len = fz_dictgets(obj, "Length");
+ if (fz_isindirect(len)) {
+ pdf_loadindirect(&len, xt, len, nil);
+ fz_dictputs(obj, "Length", len);
+ }
+ }
+
+ gc0(obj);
+}
+
+void decodestream(fz_obj *obj, int ofs, int oid, int gid)
+{
+ fz_error *error;
+ unsigned char buf[512];
+ fz_filter *filter;
+ fz_file *sf;
+ int n;
+ int len;
+ fz_obj *lenobj;
+ fz_obj *newdict;
+
+ /* count length of decoded data */
+ len = 0;
+
+ error = pdf_buildstream(&filter, xt, obj, oid, gid);
+ if (error) fz_abort(error);
+
+ n = fz_seek(xt->file, ofs);
+
+ fz_chainfile(&sf, xt->file, filter);
+
+ while (1) {
+ n = fz_read(sf, buf, sizeof buf);
+ if (n < 0)
+ fz_abort(fz_ferror(sf));
+ if (n == 0)
+ break;
+ len += n;
+ }
+
+ fz_unchainfile(sf);
+
+ /* change dictionary of object... */
+ fz_copydict(&newdict, obj);
+
+ fz_newint(&lenobj, len);
+ fz_dictputs(newdict, "Length", lenobj);
+ fz_dropobj(lenobj);
+
+ fz_dictdels(newdict, "Filter");
+ fz_dictdels(newdict, "DecodeParms");
+
+ /* save object */
+ fprintf(out, "%d %d obj\n", oid, gid);
+ printobj(out, newdict);
+ fprintf(out, "\n");
+ fprintf(out, "stream\n");
+
+ fz_dropobj(newdict);
+
+ /* now decode stream for real */
+ error = pdf_buildstream(&filter, xt, obj, oid, gid);
+ if (error) fz_abort(error);
+
+ fz_seek(xt->file, ofs);
+
+ if (encrypt) {
+ fz_filter *cf;
+ pdf_cryptstm(&cf, encrypt, oid, gid);
+ fz_newpipeline(&filter, filter, cf);
+ }
+
+ fz_chainfile(&sf, xt->file, filter);
+ while (1) {
+ n = fz_read(sf, buf, sizeof buf);
+ if (n < 0)
+ fz_abort(fz_ferror(sf));
+ if (n == 0)
+ break;
+ fwrite(buf, 1, n, out);
+ }
+ fz_unchainfile(sf);
+
+ /* the end */
+ fprintf(out, "endstream\nendobj\n\n");
+
+ return;
+}
+
+void savestream(fz_obj *obj, int ofs, int oid, int gid)
+{
+ unsigned char buf[512];
+ fz_filter *filter;
+ fz_file *sf;
+ int len;
+ int n;
+
+ /* save object */
+ fprintf(out, "%d %d obj\n", oid, gid);
+ printobj(out, obj);
+ fprintf(out, "\n");
+ fprintf(out, "stream\n");
+
+ /* copy stream */
+ obj = fz_dictgets(obj, "Length");
+ if (fz_isindirect(obj)) {
+ pdf_loadindirect(&obj, xt, obj, nil);
+ len = fz_toint(obj);
+ fz_dropobj(obj);
+ }
+ else {
+ len = fz_toint(obj);
+ }
+
+ fz_newnullfilter(&filter, len);
+
+ if (xt->crypt) {
+ fz_filter *cf;
+ pdf_cryptstm(&cf, xt->crypt, oid, gid);
+ fz_newpipeline(&filter, cf, filter);
+ }
+
+ if (encrypt) {
+ fz_filter *cf;
+ pdf_cryptstm(&cf, encrypt, oid, gid);
+ fz_newpipeline(&filter, filter, cf);
+ }
+
+ fz_seek(xt->file, ofs);
+ fz_chainfile(&sf, xt->file, filter);
+ while (1)
+ {
+ n = fz_read(sf, buf, sizeof buf);
+ if (n == 0)
+ break;
+ if (n < 0)
+ fz_abort(fz_ferror(sf));
+ fwrite(buf, 1, n, out);
+ }
+ fz_unchainfile(sf);
+
+ /* the end */
+ fprintf(out, "endstream\nendobj\n\n");
+}
+
+void deleteobject(int oid, int gid)
+{
+ uselist[oid] = 0;
+}
+
+void saveobject(int oid, int gid)
+{
+ fz_error *error;
+ fz_obj *obj;
+ fz_obj *t;
+ int stmofs;
+
+ error = pdf_loadobj(&obj, xt, oid, gid, &stmofs);
+ if (error) fz_abort(error);
+
+ /* trash ObjStm and XrefStm objects */
+ if (fz_isdict(obj)) {
+ t = fz_dictgets(obj, "Type");
+ if (fz_isname(t) && strcmp(fz_toname(t), "ObjStm") == 0) {
+ deleteobject(oid, gid);
+ fz_dropobj(obj);
+ return;
+ }
+ if (fz_isname(t) && strcmp(fz_toname(t), "XRef") == 0) {
+ deleteobject(oid, gid);
+ fz_dropobj(obj);
+ return;
+ }
+ }
+
+ if (encrypt)
+ pdf_cryptobj(encrypt, obj, oid, gid);
+
+ if (stmofs == -1) {
+ fprintf(out, "%d %d obj\n", oid, gid);
+ printobj(out, obj);
+ fprintf(out, "\nendobj\n\n");
+ }
+ else if (doexpand) {
+ decodestream(obj, stmofs, oid, gid);
+ }
+ else {
+ savestream(obj, stmofs, oid, gid);
+ }
+
+ fz_dropobj(obj);
+}
+
+void savexref(void)
+{
+ fz_obj *newtrailer;
+ fz_obj *obj;
+ int startxref;
+ int i;
+
+ startxref = ftell(out);
+
+ fprintf(out, "xref\n0 %d\n", xt->size);
+ for (i = 0; i < xt->size; i++) {
+ if (uselist[i])
+ fprintf(out, "%010d %05d n \n", ofslist[i], genlist[i]);
+ else
+ fprintf(out, "%010d %05d f \n", ofslist[i], genlist[i]);
+ }
+ fprintf(out, "\n");
+
+ fz_newdict(&newtrailer, 5);
+
+ fz_newint(&obj, xt->size);
+ fz_dictputs(newtrailer, "Size", obj);
+ fz_dropobj(obj);
+
+ obj = fz_dictgets(xt->trailer, "Info");
+ if (obj) fz_dictputs(newtrailer, "Info", obj);
+
+ obj = fz_dictgets(xt->trailer, "Root");
+ if (obj) fz_dictputs(newtrailer, "Root", obj);
+
+ fz_dictputs(newtrailer, "ID", id);
+
+ if (encryptdict) {
+ fz_newindirect(&obj, xt->size - 1, 0);
+ fz_dictputs(newtrailer, "Encrypt", obj);
+ fz_dropobj(obj);
+ }
+
+ fprintf(out, "trailer\n");
+ printobj(out, newtrailer);
+ fprintf(out, "\n\n");
+
+ fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref);
+}
+
+int main(int argc, char **argv)
+{
+ fz_error *error;
+ fz_obj *obj;
+ int lastfree;
+ char *filename;
+ int i;
+ int c;
+
+ int doencrypt = 0;
+ char *password = "";
+ char *userpw = "";
+ char *ownerpw = "";
+ int perms = -4; /* 0xfffffffc */
+ int keylen = 40;
+
+ while (1)
+ {
+ c = getopt(argc, argv, "rcxgeu:o:p:n:d:");
+
+ if (c == -1)
+ break;
+
+ switch (c)
+ {
+ case 'r':
+ dorebuild ++;
+ break;
+ case 'x':
+ doexpand ++;
+ break;
+ case 'g':
+ dogc ++;
+ break;
+ case 'c':
+ printobj = fz_fprintcobj;
+ break;
+ case 'd':
+ password = optarg;
+ break;
+ case 'e':
+ doencrypt ++;
+ break;
+ case 'u':
+ userpw = optarg;
+ break;
+ case 'o':
+ ownerpw = optarg;
+ break;
+ case 'p':
+ perms = atoi(optarg);
+ break;
+ case 'n':
+ keylen = atoi(optarg);
+ break;
+ default:
+ usage();
+ }
+ }
+
+ if (argc - optind != 2)
+ usage();
+
+ filename = argv[optind];
+
+ if (dorebuild)
+ error = pdf_rebuildxref(&xt, filename);
+ else
+ error = pdf_openxref(&xt, filename);
+ if (error) fz_abort(error);
+
+ if (doencrypt && keylen > 40 && xt->version < 1.4)
+ xt->version = 1.4;
+
+ id = fz_dictgets(xt->trailer, "ID");
+ obj = fz_dictgets(xt->trailer, "Encrypt");
+ if (fz_isindirect(obj)) {
+ pdf_loadindirect(&obj, xt, obj, nil);
+ }
+ if (obj && id) {
+ pdf_newdecrypt(&xt->crypt, obj, id);
+ error = pdf_setpassword(xt->crypt, password);
+ if (error) fz_abort(error);
+ }
+
+ id = fz_dictgets(xt->trailer, "ID");
+ if (!id)
+ fz_parseobj(&id, "[ (foobar) (foobar) ]");
+
+ if (doencrypt)
+ pdf_newencrypt(&encrypt, &encryptdict, userpw, ownerpw, perms, keylen, id);
+
+ out = fopen(argv[optind + 1], "w");
+ if (!out) {
+ fz_abort(fz_throw("open(%s): %s", argv[optind + 1], strerror(errno)));
+ }
+
+ fprintf(out, "%%PDF-%.1f\n\n", xt->version);
+
+ ofslist = fz_malloc(sizeof(int) * (xt->size + 1));
+ genlist = fz_malloc(sizeof(int) * (xt->size + 1));
+ uselist = fz_malloc(sizeof(int) * (xt->size + 1));
+
+ lastfree = 0;
+
+ ofslist[0] = 0;
+ genlist[0] = 65535;
+ uselist[0] = 0;
+
+ for (i = 1; i < xt->size; i++) {
+ ofslist[i] = 0;
+ genlist[i] = 0;
+ uselist[i] = 1;
+ }
+
+ /* garbage collect from roots in trailer */
+ if (dogc)
+ {
+ for (i = 1; i < xt->size; i++)
+ uselist[i] = 0;
+
+ obj = fz_dictgets(xt->trailer, "Info");
+ if (fz_isindirect(obj))
+ garbagecollect(obj);
+
+ obj = fz_dictgets(xt->trailer, "Root");
+ if (fz_isindirect(obj))
+ garbagecollect(obj);
+
+ obj = fz_dictgets(xt->trailer, "ID");
+ if (fz_isindirect(obj))
+ garbagecollect(obj);
+ }
+
+ /* pretty print objects */
+ for (i = 0; i < xt->size; i++)
+ {
+ if (xt->table[i].type == 0)
+ uselist[i] = 0;
+
+ if (xt->table[i].type == 0)
+ genlist[i] = xt->table[i].gen;
+ if (xt->table[i].type == 1)
+ genlist[i] = xt->table[i].gen;
+ if (xt->table[i].type == 2)
+ genlist[i] = 0;
+
+ if (dogc && !uselist[i])
+ continue;
+
+ if (xt->table[i].type == 1 || xt->table[i].type == 2)
+ {
+ ofslist[i] = ftell(out);
+ saveobject(i, genlist[i]);
+ }
+ }
+
+ /* add encryption dictionary if we crypted */
+ if (encryptdict) {
+ xt->size ++;
+ ofslist[xt->size - 1] = ftell(out);
+ genlist[xt->size - 1] = 0;
+ uselist[xt->size - 1] = 1;
+ fprintf(out, "%d %d obj\n", xt->size - 1, 0);
+ printobj(out, encryptdict);
+ fprintf(out, "\nendobj\n\n");
+ }
+
+ /* construct linked list of free object slots */
+ lastfree = 0;
+ for (i = 1; i < xt->size; i++) {
+ if (!uselist[i]) {
+ genlist[i] ++;
+ ofslist[lastfree] = i;
+ lastfree = i;
+ }
+ }
+
+ savexref();
+
+ pdf_closexref(xt);
+
+ fclose(out);
+
+ return 0;
+}
+