diff options
-rw-r--r-- | apps/pdfapp.c | 5 | ||||
-rw-r--r-- | apps/pdfclean.c | 2 | ||||
-rw-r--r-- | apps/pdfdraw.c | 4 | ||||
-rw-r--r-- | apps/pdfextract.c | 2 | ||||
-rw-r--r-- | apps/pdfinfo.c | 2 | ||||
-rw-r--r-- | apps/pdfshow.c | 2 | ||||
-rw-r--r-- | apps/pdftool.c | 11 | ||||
-rw-r--r-- | apps/pdftool.h | 2 | ||||
-rw-r--r-- | mupdf/mupdf.h | 6 | ||||
-rw-r--r-- | mupdf/pdf_pagetree.c | 400 |
10 files changed, 114 insertions, 322 deletions
diff --git a/apps/pdfapp.c b/apps/pdfapp.c index 75ec8368..c955d1a0 100644 --- a/apps/pdfapp.c +++ b/apps/pdfapp.c @@ -89,6 +89,7 @@ void pdfapp_invert(pdfapp_t *app, fz_bbox rect) void pdfapp_open(pdfapp_t *app, char *filename, int fd) { + fz_error error; fz_obj *obj; fz_obj *info; char *password = ""; @@ -147,6 +148,10 @@ void pdfapp_open(pdfapp_t *app, char *filename, int fd) * Start at first page */ + error = pdf_loadpagetree(app->xref); + if (error) + pdfapp_error(app, fz_rethrow(error, "cannot load page tree")); + app->pagecount = pdf_getpagecount(app->xref); app->shrinkwrap = 1; diff --git a/apps/pdfclean.c b/apps/pdfclean.c index a6cf402c..021eb2bc 100644 --- a/apps/pdfclean.c +++ b/apps/pdfclean.c @@ -295,7 +295,7 @@ int main(int argc, char **argv) if (argc - fz_optind > 0) subset = 1; - openxref(infile, password, 0); + openxref(infile, password, 0, 0); out = fopen(outfile, "wb"); if (!out) diff --git a/apps/pdfdraw.c b/apps/pdfdraw.c index 345d8c73..46925838 100644 --- a/apps/pdfdraw.c +++ b/apps/pdfdraw.c @@ -292,7 +292,7 @@ static void drawtxt(int pagenum, struct benchmark *loadtimes) fz_freedevice(dev); printf("[Page %d]\n", pagenum); - fz_debugtextspan(text); + fz_debugtextspanxml(text); printf("\n"); fz_freetextspan(text); @@ -446,7 +446,7 @@ int main(int argc, char **argv) drawcache = fz_newglyphcache(); - openxref(argv[fz_optind], password, 0); + openxref(argv[fz_optind], password, 0, 1); state = NO_PAGES_DRAWN; } else diff --git a/apps/pdfextract.c b/apps/pdfextract.c index 25b00e8e..297fdbe5 100644 --- a/apps/pdfextract.c +++ b/apps/pdfextract.c @@ -230,7 +230,7 @@ int main(int argc, char **argv) if (fz_optind == argc) showusage(); - openxref(argv[fz_optind++], password, 0); + openxref(argv[fz_optind++], password, 0, 0); if (fz_optind == argc) for (o = 0; o < xref->len; o++) diff --git a/apps/pdfinfo.c b/apps/pdfinfo.c index ceeb8ded..7702f25b 100644 --- a/apps/pdfinfo.c +++ b/apps/pdfinfo.c @@ -1021,7 +1021,7 @@ int main(int argc, char **argv) closexref(); filename = argv[fz_optind]; printf("%s:\n", filename); - openxref(filename, password, 0); + openxref(filename, password, 0, 1); gatherglobalinfo(); state = NO_INFO_GATHERED; } diff --git a/apps/pdfshow.c b/apps/pdfshow.c index 85b78d29..ba2d51f6 100644 --- a/apps/pdfshow.c +++ b/apps/pdfshow.c @@ -141,7 +141,7 @@ int main(int argc, char **argv) if (fz_optind == argc) showusage(); - openxref(argv[fz_optind++], password, 0); + openxref(argv[fz_optind++], password, 0, 0); if (fz_optind == argc) showtrailer(); diff --git a/apps/pdftool.c b/apps/pdftool.c index 42f712dd..5724e022 100644 --- a/apps/pdftool.c +++ b/apps/pdftool.c @@ -21,9 +21,10 @@ void setcleanup(void (*func)(void)) cleanup = func; } -void openxref(char *filename, char *password, int dieonbadpass) +void openxref(char *filename, char *password, int dieonbadpass, int loadpages) { fz_stream *file; + fz_error error; int okay; int fd; @@ -52,7 +53,13 @@ void openxref(char *filename, char *password, int dieonbadpass) die(fz_throw("invalid password")); } - pagecount = pdf_getpagecount(xref); + if (loadpages) + { + error = pdf_loadpagetree(xref); + if (error) + die(fz_rethrow(error, "cannot load page tree")); + pagecount = pdf_getpagecount(xref); + } } void flushxref(void) diff --git a/apps/pdftool.h b/apps/pdftool.h index 2bc39657..1560c225 100644 --- a/apps/pdftool.h +++ b/apps/pdftool.h @@ -8,7 +8,7 @@ extern int pagecount; void die(fz_error error); void setcleanup(void (*cleanup)(void)); -void openxref(char *filename, char *password, int dieonbadpass); +void openxref(char *filename, char *password, int dieonbadpass, int loadpages); void flushxref(void); void closexref(void); diff --git a/mupdf/mupdf.h b/mupdf/mupdf.h index a0b3155f..73610990 100644 --- a/mupdf/mupdf.h +++ b/mupdf/mupdf.h @@ -129,6 +129,11 @@ struct pdf_xref_s int cap; pdf_xrefentry *table; + int pagelen; + int pagecap; + fz_obj **pageobjs; + fz_obj **pagerefs; + struct pdf_store_s *store; struct pdf_outline_s *outlines; @@ -538,6 +543,7 @@ struct pdf_page_s }; /* pagetree.c */ +fz_error pdf_loadpagetree(pdf_xref *xref); int pdf_getpagecount(pdf_xref *xref); fz_obj * pdf_getpageobject(pdf_xref *xref, int p); int pdf_findpageobject(pdf_xref *xref, fz_obj *pageobj); diff --git a/mupdf/pdf_pagetree.c b/mupdf/pdf_pagetree.c index d9339712..086dec7e 100644 --- a/mupdf/pdf_pagetree.c +++ b/mupdf/pdf_pagetree.c @@ -1,7 +1,7 @@ #include "fitz.h" #include "mupdf.h" -struct stuff +struct info { fz_obj *resources; fz_obj *mediabox; @@ -9,343 +9,117 @@ struct stuff fz_obj *rotate; }; -static void -pdf_getpagecountimp(pdf_xref *xref, fz_obj *node, int *pagesp) -{ - fz_obj *type; - fz_obj *kids; - fz_obj *count; - char *typestr; - int pages = 0; - int i; - - if (!fz_isdict(node)) - { - fz_warn("pagetree node is missing, igoring missing pages..."); - return; - } - - type = fz_dictgets(node, "Type"); - kids = fz_dictgets(node, "Kids"); - count = fz_dictgets(node, "Count"); - - if (fz_isname(type)) - typestr = fz_toname(type); - else - { - fz_warn("pagetree node (%d %d R) lacks required type", fz_tonum(node), fz_togen(node)); - - kids = fz_dictgets(node, "Kids"); - if (kids) - { - fz_warn("guessing it may be a pagetree node, continuing..."); - typestr = "Pages"; - } - else - { - fz_warn("guessing it may be a page, continuing..."); - typestr = "Page"; - } - } - - if (!strcmp(typestr, "Page")) - (*pagesp)++; - - else if (!strcmp(typestr, "Pages")) - { - if (!fz_isarray(kids)) - fz_warn("page tree node contains no pages"); - - pdf_logpage("subtree (%d %d R) {\n", fz_tonum(node), fz_togen(node)); - - for (i = 0; i < fz_arraylen(kids); i++) - { - fz_obj *obj = fz_arrayget(kids, i); - - /* prevent infinite recursion possible in maliciously crafted PDFs */ - if (obj == node) - { - fz_warn("cyclic page tree"); - return; - } - - pdf_getpagecountimp(xref, obj, &pages); - } - - if (pages != fz_toint(count)) - { - fz_warn("page tree node contains incorrect number of pages, continuing..."); - count = fz_newint(pages); - fz_dictputs(node, "Count", count); - fz_dropobj(count); - } - - pdf_logpage("%d pages\n", pages); - - (*pagesp) += pages; - - pdf_logpage("}\n"); - } -} - int pdf_getpagecount(pdf_xref *xref) { - fz_obj *catalog; - fz_obj *pages; - int count; - - catalog = fz_dictgets(xref->trailer, "Root"); - - pages = fz_dictgets(catalog, "Pages"); - pdf_logpage("determining page count (%d %d R) {\n", fz_tonum(pages), fz_togen(pages)); - - count = 0; - pdf_getpagecountimp(xref, pages, &count); - - pdf_logpage("}\n"); - - return count; + return xref->pagelen; } -static void -pdf_getpageobjectimp(pdf_xref *xref, struct stuff inherit, fz_obj *node, int *pagesp, int pageno, fz_obj **pagep) +fz_obj * +pdf_getpageobject(pdf_xref *xref, int number) { - char *typestr; - fz_obj *type; - fz_obj *kids; - fz_obj *count; - fz_obj *inh; - int i; - - if (!fz_isdict(node)) - { - fz_warn("pagetree node is missing, ignoring missing pages..."); - *pagep = nil; - return; - } - - type = fz_dictgets(node, "Type"); - kids = fz_dictgets(node, "Kids"); - count = fz_dictgets(node, "Count"); - - if (fz_isname(type)) - typestr = fz_toname(type); - else - { - fz_warn("pagetree node (%d %d R) lacks required type", fz_tonum(node), fz_togen(node)); - - kids = fz_dictgets(node, "Kids"); - if (kids) - { - fz_warn("guessing it may be a pagetree node, continuing..."); - typestr = "Pages"; - } - else - { - fz_warn("guessing it may be a page, continuing..."); - typestr = "Page"; - } - } - - if (!strcmp(typestr, "Page")) - { - (*pagesp)++; - if (*pagesp == pageno) - { - pdf_logpage("page %d (%d %d R)\n", *pagesp, fz_tonum(node), fz_togen(node)); - - if (inherit.resources && !fz_dictgets(node, "Resources")) - { - pdf_logpage("inherited resources\n"); - fz_dictputs(node, "Resources", inherit.resources); - } - - if (inherit.mediabox && !fz_dictgets(node, "MediaBox")) - { - pdf_logpage("inherit mediabox\n"); - fz_dictputs(node, "MediaBox", inherit.mediabox); - } - - if (inherit.cropbox && !fz_dictgets(node, "CropBox")) - { - pdf_logpage("inherit cropbox\n"); - fz_dictputs(node, "CropBox", inherit.cropbox); - } - - if (inherit.rotate && !fz_dictgets(node, "Rotate")) - { - pdf_logpage("inherit rotate\n"); - fz_dictputs(node, "Rotate", inherit.rotate); - } - - *pagep = node; - } - } - - else if (!strcmp(typestr, "Pages")) - { - if (!fz_isarray(kids)) - fz_warn("page tree node contains no pages"); - - if (*pagesp + fz_toint(count) < pageno) - { - (*pagesp) += fz_toint(count); - return; - } - - inh = fz_dictgets(node, "Resources"); - if (inh) inherit.resources = inh; - - inh = fz_dictgets(node, "MediaBox"); - if (inh) inherit.mediabox = inh; - - inh = fz_dictgets(node, "CropBox"); - if (inh) inherit.cropbox = inh; - - inh = fz_dictgets(node, "Rotate"); - if (inh) inherit.rotate = inh; - - pdf_logpage("subtree (%d %d R) {\n", fz_tonum(node), fz_togen(node)); - - for (i = 0; !(*pagep) && i < fz_arraylen(kids); i++) - { - fz_obj *obj = fz_arrayget(kids, i); - - /* prevent infinite recursion possible in maliciously crafted PDFs */ - if (obj == node) - { - fz_warn("cyclic page tree"); - return; - } - - pdf_getpageobjectimp(xref, inherit, obj, pagesp, pageno, pagep); - } - - pdf_logpage("}\n"); - } + if (number > 0 && number <= xref->pagelen) + return xref->pageobjs[number - 1]; + return nil; } -fz_obj * -pdf_getpageobject(pdf_xref *xref, int pageno) +int +pdf_findpageobject(pdf_xref *xref, fz_obj *page) { - struct stuff inherit; - fz_obj *catalog; - fz_obj *pages; - fz_obj *page; - int count; - - inherit.resources = nil; - inherit.mediabox = nil; - inherit.cropbox = nil; - inherit.rotate = nil; - - catalog = fz_dictgets(xref->trailer, "Root"); - - pages = fz_dictgets(catalog, "Pages"); - pdf_logpage("get page %d (%d %d R) {\n", pageno, fz_tonum(pages), fz_togen(pages)); - - page = nil; - count = 0; - pdf_getpageobjectimp(xref, inherit, pages, &count, pageno, &page); - if (!page) - fz_warn("cannot find page %d", pageno); - - pdf_logpage("}\n"); - - return page; + int num = fz_tonum(page); + int gen = fz_togen(page); + int i; + for (i = 0; i < xref->pagelen; i++) + if (num == fz_tonum(xref->pagerefs[i]) && gen == fz_togen(xref->pagerefs[i])) + return i + 1; + return 0; } -static void -pdf_findpageobjectimp(pdf_xref *xref, fz_obj *node, fz_obj *page, int *pagenop, int *foundp) +void +pdf_loadpagetreenode(pdf_xref *xref, fz_obj *node, struct info info) { - char *typestr; - fz_obj *type; + fz_obj *dict; fz_obj *kids; - int i; - - if (!fz_isdict(node)) - return; + fz_obj *count; + fz_obj *obj; + int i, n; - type = fz_dictgets(node, "Type"); kids = fz_dictgets(node, "Kids"); + count = fz_dictgets(node, "Count"); - if (fz_isname(type)) - typestr = fz_toname(type); - else - { - fz_warn("pagetree node (%d %d R) lacks required type", fz_tonum(node), fz_togen(node)); - - kids = fz_dictgets(node, "Kids"); - if (kids) - { - fz_warn("guessing it may be a pagetree node, continuing..."); - typestr = "Pages"; - } - else - { - fz_warn("guessing it may be a page, continuing..."); - typestr = "Page"; - } - } - - if (!strcmp(typestr, "Page")) + if (fz_isarray(kids) && fz_isint(count)) { - (*pagenop)++; - if (fz_tonum(node) == fz_tonum(page)) + obj = fz_dictgets(node, "Resources"); + if (obj) + info.resources = obj; + obj = fz_dictgets(node, "MediaBox"); + if (obj) + info.mediabox = obj; + obj = fz_dictgets(node, "CropBox"); + if (obj) + info.cropbox = obj; + obj = fz_dictgets(node, "Rotate"); + if (obj) + info.rotate = obj; + + n = fz_arraylen(kids); + for (i = 0; i < n; i++) { - pdf_logpage("page %d (%d %d R)\n", *pagenop, fz_tonum(node), fz_togen(node)); - *foundp = 1; + obj = fz_arrayget(kids, i); + pdf_loadpagetreenode(xref, obj, info); } } - - else if (!strcmp(typestr, "Pages")) + else { - if (!fz_isarray(kids)) - fz_warn("page tree node contains no pages"); - - pdf_logpage("subtree (%d %d R) {\n", fz_tonum(node), fz_togen(node)); - - for (i = 0; !(*foundp) && i < fz_arraylen(kids); i++) + dict = fz_resolveindirect(node); + + if (info.resources && !fz_dictgets(dict, "Resources")) + fz_dictputs(dict, "Resources", info.resources); + if (info.mediabox && !fz_dictgets(dict, "MediaBox")) + fz_dictputs(dict, "MediaBox", info.mediabox); + if (info.cropbox && !fz_dictgets(dict, "CropBox")) + fz_dictputs(dict, "CropBox", info.cropbox); + if (info.rotate && !fz_dictgets(dict, "Rotate")) + fz_dictputs(dict, "Rotate", info.rotate); + + if (xref->pagelen == xref->pagecap) { - fz_obj *obj = fz_arrayget(kids, i); - - /* prevent infinite recursion possible in maliciously crafted PDFs */ - if (obj == node) - { - fz_warn("cyclic page tree"); - return; - } - - pdf_findpageobjectimp(xref, obj, page, pagenop, foundp); + fz_warn("found more pages than expected"); + xref->pagecap ++; + xref->pagerefs = fz_realloc(xref->pagerefs, sizeof(fz_obj*) * xref->pagecap); + xref->pageobjs = fz_realloc(xref->pageobjs, sizeof(fz_obj*) * xref->pagecap); } - pdf_logpage("}\n"); + xref->pagerefs[xref->pagelen] = fz_keepobj(node); + xref->pageobjs[xref->pagelen] = fz_keepobj(dict); + xref->pagelen ++; } } -int -pdf_findpageobject(pdf_xref *xref, fz_obj *page) +fz_error +pdf_loadpagetree(pdf_xref *xref) { - fz_obj *catalog; - fz_obj *pages; - int pageno; - int found; - - catalog = fz_dictgets(xref->trailer, "Root"); - - pages = fz_dictgets(catalog, "Pages"); - pdf_logpage("find page object (%d %d R) (%d %d R) {\n", fz_tonum(page), fz_togen(page), fz_tonum(pages), fz_togen(pages)); - - pageno = 0; - found = 0; - pdf_findpageobjectimp(xref, pages, page, &pageno, &found); - - pdf_logpage("}\n"); - - if (!found) - fz_warn("cannot find page object (%d %d R)", fz_tonum(page), fz_togen(page)); - - return pageno; -} - + struct info info; + fz_obj *catalog = fz_dictgets(xref->trailer, "Root"); + fz_obj *pages = fz_dictgets(catalog, "Pages"); + fz_obj *count = fz_dictgets(pages, "Count"); + + if (!fz_isdict(pages)) + return fz_throw("missing page tree"); + if (!fz_isint(count)) + return fz_throw("missing page count"); + + xref->pagecap = fz_toint(count); + xref->pagelen = 0; + xref->pagerefs = fz_malloc(sizeof(fz_obj*) * xref->pagecap); + xref->pageobjs = fz_malloc(sizeof(fz_obj*) * xref->pagecap); + + info.resources = nil; + info.mediabox = nil; + info.cropbox = nil; + info.rotate = nil; + + pdf_loadpagetreenode(xref, pages, info); + + return fz_okay; +}
\ No newline at end of file |