From d62f4a4cb145b5da1ae38bea340fbc9e903f6612 Mon Sep 17 00:00:00 2001
From: Tor Andersson
Date: Sat, 4 Dec 2010 22:54:08 +0000
Subject: Support repairing PDF documents with compressed object streams.

---
Review notes (not part of the commit; patch tools ignore the text between
the "---" line above and the first "diff --git" line):

* Formatting: this copy of the patch had been collapsed onto five physical
  lines. Line breaks, blank lines and tab indentation were reconstructed
  from the hunk headers; every hunk's old/new line count and the diffstat
  totals below are consistent with the reconstruction. The indentation
  depth of context lines is inferred, so compare against the upstream
  commit before applying.
* pdf_repairobjstm: the two "corrupt object stream" paths are taken when
  either pdf_lex() failed or the token is not PDF_TINT. In the second case
  `error` holds a non-error value that is still passed to fz_rethrow().
  TODO confirm what fz_rethrow() returns in that case; fz_throw() looks
  like the intended call.
* pdf_repairobjstm: `n = atoi(buf)` is only checked against the upper
  bound (`n >= xref->len`). If the lexer can return a negative PDF_TINT
  (not visible in this patch), `xref->table[n]` is written out of bounds.
  A very large value also drives an unbounded pdf_resizexref().
* pdf_repairobjstms: the results of pdf_loadobject() and
  pdf_repairobjstm() are discarded, and `dict` is dereferenced and dropped
  even when the load failed (it is uninitialised in that case). The
  function always returns fz_okay, so the error branch added to
  pdf_openxrefwithstream() can never be taken.
* mupdf.h declares pdf_resizexref(pdf_xref *xref, int newcap) while the
  definition in pdf_xref.c names the parameter `newlen`. Harmless to the
  compiler, but `cap` no longer exists after this patch.
* pdf_repairxref: the removed loop initialised entries 1..len-1 with
  type 'f'; pdf_resizexref() initialises new entries with type 0. Objects
  not found during repair therefore change from 'f' to 0 - confirm that
  readers of `type` treat both the same.
* pdf_readnewxref: the tolerance for generators whose xref is off by one
  (num == xref->len) is removed; such files now fail with "object id ...
  out of range" - confirm this is intended.
* pdf_resizexref: the fz_realloc() result overwrites xref->table directly
  and `newlen * sizeof(pdf_xrefentry)` is computed from an int taken from
  file data; presumably fz_realloc() aborts on failure - verify.

 apps/pdfclean.c    |  2 +-
 mupdf/mupdf.h      |  3 +-
 mupdf/pdf_repair.c | 87 +++++++++++++++++++++++++++++++++++++++++++-------
 mupdf/pdf_xref.c   | 93 ++++++++++++++++++++++--------------------------------
 4 files changed, 115 insertions(+), 70 deletions(-)

diff --git a/apps/pdfclean.c b/apps/pdfclean.c
index c4d7b879..0cbeb77e 100644
--- a/apps/pdfclean.c
+++ b/apps/pdfclean.c
@@ -241,7 +241,7 @@ static void renumberobjs(void)
 
 	/* Create new table for the reordered, compacted xref */
 	oldxref = xref->table;
-	xref->table = fz_malloc(xref->cap * sizeof (pdf_xrefentry));
+	xref->table = fz_malloc(xref->len * sizeof (pdf_xrefentry));
 	xref->table[0] = oldxref[0];
 
 	/* Move used objects into the new compacted xref */
diff --git a/mupdf/mupdf.h b/mupdf/mupdf.h
index 419a06db..52fe415b 100644
--- a/mupdf/mupdf.h
+++ b/mupdf/mupdf.h
@@ -128,7 +128,6 @@ struct pdf_xref_s
 	fz_obj *trailer;
 
 	int len;
-	int cap;
 	pdf_xrefentry *table;
 
 	int pagelen;
@@ -168,7 +167,9 @@ void pdf_freexref(pdf_xref *);
 
 /* private */
 fz_error pdf_repairxref(pdf_xref *xref, char *buf, int bufsize);
+fz_error pdf_repairobjstms(pdf_xref *xref);
 void pdf_debugxref(pdf_xref *);
+void pdf_resizexref(pdf_xref *xref, int newcap);
 
 /*
  * Resource store
diff --git a/mupdf/pdf_repair.c b/mupdf/pdf_repair.c
index 9f30a690..3b062e79 100644
--- a/mupdf/pdf_repair.c
+++ b/mupdf/pdf_repair.c
@@ -110,6 +110,59 @@ atobjend:
 	return fz_okay;
 }
 
+static fz_error
+pdf_repairobjstm(pdf_xref *xref, int num, int gen)
+{
+	fz_error error;
+	fz_obj *obj;
+	fz_stream *stm;
+	pdf_token_e tok;
+	int i, n, count;
+	char buf[256];
+
+	error = pdf_loadobject(&obj, xref, num, gen);
+	if (error)
+		return fz_rethrow(error, "cannot load object stream object (%d %d R)", num, gen);
+
+	count = fz_toint(fz_dictgets(obj, "N"));
+
+	fz_dropobj(obj);
+
+	error = pdf_openstream(&stm, xref, num, gen);
+	if (error)
+		return fz_rethrow(error, "cannot open object stream object (%d %d R)", num, gen);
+
+	for (i = 0; i < count; i++)
+	{
+		error = pdf_lex(&tok, stm, buf, sizeof buf, &n);
+		if (error || tok != PDF_TINT)
+		{
+			fz_close(stm);
+			return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen);
+		}
+
+		n = atoi(buf);
+		if (n >= xref->len)
+			pdf_resizexref(xref, n + 1);
+
+		xref->table[n].ofs = num;
+		xref->table[n].gen = i;
+		xref->table[n].stmofs = 0;
+		xref->table[n].obj = nil;
+		xref->table[n].type = 'o';
+
+		error = pdf_lex(&tok, stm, buf, sizeof buf, &n);
+		if (error || tok != PDF_TINT)
+		{
+			fz_close(stm);
+			return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen);
+		}
+	}
+
+	fz_close(stm);
+	return fz_okay;
+}
+
 fz_error
 pdf_repairxref(pdf_xref *xref, char *buf, int bufsize)
 {
@@ -268,9 +321,7 @@ pdf_repairxref(pdf_xref *xref, char *buf, int bufsize)
 		fz_dropobj(id);
 	}
 
-	xref->len = maxnum + 1;
-	xref->cap = xref->len;
-	xref->table = fz_malloc(xref->cap * sizeof(pdf_xrefentry));
+	pdf_resizexref(xref, maxnum + 1);
 
 	xref->table[0].type = 'f';
 	xref->table[0].ofs = 0;
@@ -278,15 +329,6 @@ pdf_repairxref(pdf_xref *xref, char *buf, int bufsize)
 	xref->table[0].stmofs = 0;
 	xref->table[0].obj = nil;
 
-	for (i = 1; i < xref->len; i++)
-	{
-		xref->table[i].type = 'f';
-		xref->table[i].ofs = 0;
-		xref->table[i].gen = 0;
-		xref->table[i].stmofs = 0;
-		xref->table[i].obj = nil;
-	}
-
 	for (i = 0; i < listlen; i++)
 	{
 		xref->table[list[i].num].type = 'n';
@@ -314,6 +356,7 @@ pdf_repairxref(pdf_xref *xref, char *buf, int bufsize)
 
 			fz_dropobj(dict);
 		}
+
 	}
 
 	next = 0;
@@ -335,3 +378,23 @@ cleanup:
 	fz_free(list);
 	return error; /* already rethrown */
 }
+
+fz_error
+pdf_repairobjstms(pdf_xref *xref)
+{
+	fz_obj *dict;
+	int i;
+
+	for (i = 0; i < xref->len; i++)
+	{
+		if (xref->table[i].stmofs)
+		{
+			pdf_loadobject(&dict, xref, i, 0);
+			if (!strcmp(fz_toname(fz_dictgets(dict, "Type")), "ObjStm"))
+				pdf_repairobjstm(xref, i, 0);
+			fz_dropobj(dict);
+		}
+	}
+
+	return fz_okay;
+}
diff --git a/mupdf/pdf_xref.c b/mupdf/pdf_xref.c
index 6bf1ae55..aece3cb1 100644
--- a/mupdf/pdf_xref.c
+++ b/mupdf/pdf_xref.c
@@ -175,6 +175,23 @@ pdf_readtrailer(pdf_xref *xref, char *buf, int cap)
  * xref tables
  */
 
+void
+pdf_resizexref(pdf_xref *xref, int newlen)
+{
+	int i;
+
+	xref->table = fz_realloc(xref->table, newlen * sizeof(pdf_xrefentry));
+	for (i = xref->len; i < newlen; i++)
+	{
+		xref->table[i].type = 0;
+		xref->table[i].ofs = 0;
+		xref->table[i].gen = 0;
+		xref->table[i].stmofs = 0;
+		xref->table[i].obj = nil;
+	}
+	xref->len = newlen;
+}
+
 static fz_error
 pdf_readoldxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
 {
@@ -211,24 +228,10 @@ pdf_readoldxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
 		}
 
 		/* broken pdfs where size in trailer undershoots entries in xref sections */
-		if (ofs + len > xref->cap)
+		if (ofs + len > xref->len)
 		{
 			fz_warn("broken xref section, proceeding anyway.");
-			xref->cap = ofs + len;
-			xref->table = fz_realloc(xref->table, xref->cap * sizeof(pdf_xrefentry));
-		}
-
-		if ((ofs + len) > xref->len)
-		{
-			for (i = xref->len; i < (ofs + len); i++)
-			{
-				xref->table[i].ofs = 0;
-				xref->table[i].gen = 0;
-				xref->table[i].stmofs = 0;
-				xref->table[i].obj = nil;
-				xref->table[i].type = 0;
-			}
-			xref->len = ofs + len;
+			pdf_resizexref(xref, ofs + len);
 		}
 
 		for (i = ofs; i < ofs + len; i++)
@@ -318,7 +321,6 @@ pdf_readnewxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
 	int num, gen, stmofs;
 	int size, w0, w1, w2;
 	int t;
-	int i;
 
 	pdf_logxref("load new xref format\n");
 
@@ -334,38 +336,15 @@ pdf_readnewxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
 	}
 
 	size = fz_toint(obj);
-	if (size >= xref->cap)
-	{
-		xref->cap = size + 1; /* for hack to allow broken pdf generators with off-by-one errors */
-		xref->table = fz_realloc(xref->table, xref->cap * sizeof(pdf_xrefentry));
-	}
-
 	if (size > xref->len)
 	{
-		for (i = xref->len; i < xref->cap; i++)
-		{
-			xref->table[i].ofs = 0;
-			xref->table[i].gen = 0;
-			xref->table[i].stmofs = 0;
-			xref->table[i].obj = nil;
-			xref->table[i].type = 0;
-		}
-		xref->len = size;
+		pdf_resizexref(xref, size);
 	}
 
 	if (num < 0 || num >= xref->len)
 	{
-		if (num == xref->len && num < xref->cap)
-		{
-			/* allow broken pdf files that have off-by-one errors in the xref */
-			fz_warn("object id (%d %d R) out of range (0..%d)", num, gen, xref->len - 1);
-			xref->len ++;
-		}
-		else
-		{
-			fz_dropobj(trailer);
-			return fz_throw("object id (%d %d R) out of range (0..%d)", num, gen, xref->len - 1);
-		}
+		fz_dropobj(trailer);
+		return fz_throw("object id (%d %d R) out of range (0..%d)", num, gen, xref->len - 1);
 	}
 
 	pdf_logxref("\tnum=%d gen=%d size=%d\n", num, gen, size);
@@ -523,17 +502,7 @@ pdf_loadxref(pdf_xref *xref, char *buf, int bufsize)
 
 	pdf_logxref("\tsize %d at %#x\n", fz_toint(size), xref->startxref);
 
-	xref->len = fz_toint(size);
-	xref->cap = xref->len + 1; /* for hack to allow broken pdf generators with off-by-one errors */
-	xref->table = fz_malloc(xref->cap * sizeof(pdf_xrefentry));
-	for (i = 0; i < xref->cap; i++)
-	{
-		xref->table[i].ofs = 0;
-		xref->table[i].gen = 0;
-		xref->table[i].stmofs = 0;
-		xref->table[i].obj = nil;
-		xref->table[i].type = 0;
-	}
+	pdf_resizexref(xref, fz_toint(size));
 
 	error = pdf_readxrefsections(xref, xref->startxref, buf, bufsize);
 	if (error)
@@ -547,7 +516,7 @@ pdf_loadxref(pdf_xref *xref, char *buf, int bufsize)
 	for (i = 0; i < xref->len; i++)
 		if (xref->table[i].type == 'n')
 			if (xref->table[i].ofs <= 0 || xref->table[i].ofs >= xref->filesize)
-				return fz_throw("object offset out of range: %d", xref->table[i].ofs);
+				return fz_throw("object offset out of range: %d (%d 0 R)", xref->table[i].ofs, i);
 
 	return fz_okay;
 }
@@ -565,6 +534,8 @@ pdf_openxrefwithstream(pdf_xref **xrefp, fz_stream *file, char *password)
 	fz_obj *encrypt;
 	fz_obj *id;
 
+	int repaired = 0;
+
 	xref = fz_malloc(sizeof(pdf_xref));
 
 	memset(xref, 0, sizeof(pdf_xref));
@@ -582,7 +553,6 @@ pdf_openxrefwithstream(pdf_xref **xrefp, fz_stream *file, char *password)
 			fz_free(xref->table);
 			xref->table = NULL;
 			xref->len = 0;
-			xref->cap = 0;
 		}
 		error = pdf_repairxref(xref, xref->scratch, sizeof xref->scratch);
 		if (error)
@@ -590,6 +560,7 @@ pdf_openxrefwithstream(pdf_xref **xrefp, fz_stream *file, char *password)
 			pdf_freexref(xref);
 			return fz_rethrow(error, "cannot repair document");
 		}
+		repaired = 1;
 	}
 
 	encrypt = fz_dictgets(xref->trailer, "Encrypt");
@@ -618,6 +589,16 @@ pdf_openxrefwithstream(pdf_xref **xrefp, fz_stream *file, char *password)
 		}
 	}
 
+	if (repaired)
+	{
+		error = pdf_repairobjstms(xref);
+		if (error)
+		{
+			pdf_freexref(xref);
+			return fz_rethrow(error, "cannot repair document");
+		}
+	}
+
 	*xrefp = xref;
 	return fz_okay;
 }
-- 
cgit v1.2.3