diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2017-02-23 15:11:08 +0100 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2017-03-01 15:22:46 +0100 |
commit | 53f816f2f06de5aeb13cac72232c4f52714201cb (patch) | |
tree | 924eccc6f31db7564ab434a3491b27c7d572eed3 | |
parent | 8c547d302fb23950bc3f928a37d4c8baf93838cc (diff) | |
download | mupdf-53f816f2f06de5aeb13cac72232c4f52714201cb.tar.xz |
Add page lookup cache for faster link destination lookups in outlines.
Loading outlines wants to look up all link destinations, and doing the
normal link destination lookups triggers loading all page objects used.
This means we need to parse a lot of objects, which can be quite slow.
We can load the page tree faster by only looking at intermediate page
tree nodes. If we load the page tree and create a reverse lookup
table for use when loading the outline, we can speed up the time to
run 'mutool show pdfref17.pdf outline' from 900ms to 100ms.
-rw-r--r-- | include/mupdf/pdf/document.h | 8 | ||||
-rw-r--r-- | include/mupdf/pdf/page.h | 2 | ||||
-rw-r--r-- | source/pdf/pdf-outline.c | 21 | ||||
-rw-r--r-- | source/pdf/pdf-page.c | 116 | ||||
-rw-r--r-- | source/pdf/pdf-xref.c | 2 |
5 files changed, 137 insertions, 12 deletions
diff --git a/include/mupdf/pdf/document.h b/include/mupdf/pdf/document.h index a84ee872..177f947c 100644 --- a/include/mupdf/pdf/document.h +++ b/include/mupdf/pdf/document.h @@ -549,6 +549,13 @@ struct pdf_unsaved_sig_s pdf_unsaved_sig *next; }; +typedef struct pdf_rev_page_map_s pdf_rev_page_map; +struct pdf_rev_page_map_s +{ + int page; + int object; +}; + struct pdf_document_s { fz_document super; @@ -576,6 +583,7 @@ struct pdf_document_s int has_xref_streams; int page_count; + pdf_rev_page_map *rev_page_map; int repair_attempted; diff --git a/include/mupdf/pdf/page.h b/include/mupdf/pdf/page.h index f86d54cc..52d2a4ad 100644 --- a/include/mupdf/pdf/page.h +++ b/include/mupdf/pdf/page.h @@ -4,6 +4,8 @@ int pdf_lookup_page_number(fz_context *ctx, pdf_document *doc, pdf_obj *pageobj); int pdf_count_pages(fz_context *ctx, pdf_document *doc); pdf_obj *pdf_lookup_page_obj(fz_context *ctx, pdf_document *doc, int needle); +void pdf_load_page_tree(fz_context *ctx, pdf_document *doc); +void pdf_drop_page_tree(fz_context *ctx, pdf_document *doc); /* pdf_lookup_anchor: Find the page number of a named destination. diff --git a/source/pdf/pdf-outline.c b/source/pdf/pdf-outline.c index c31a1954..7c8cfdaa 100644 --- a/source/pdf/pdf-outline.c +++ b/source/pdf/pdf-outline.c @@ -69,12 +69,21 @@ fz_outline * pdf_load_outline(fz_context *ctx, pdf_document *doc) { pdf_obj *root, *obj, *first; + fz_outline *outline = NULL; - root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); - obj = pdf_dict_get(ctx, root, PDF_NAME_Outlines); - first = pdf_dict_get(ctx, obj, PDF_NAME_First); - if (first) - return pdf_load_outline_imp(ctx, doc, first); + pdf_load_page_tree(ctx, doc); /* cache page tree for fast link destination lookups */ + fz_try(ctx) + { + root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); + obj = pdf_dict_get(ctx, root, PDF_NAME_Outlines); + first = pdf_dict_get(ctx, obj, PDF_NAME_First); + if (first) + outline = pdf_load_outline_imp(ctx, doc, first); + } + fz_always(ctx) + pdf_drop_page_tree(ctx, doc); + fz_catch(ctx) + fz_rethrow(ctx); - return NULL; + return outline; } diff --git a/source/pdf/pdf-page.c b/source/pdf/pdf-page.c index 44f652bd..0699c672 100644 --- a/source/pdf/pdf-page.c +++ b/source/pdf/pdf-page.c @@ -4,11 +4,87 @@ int pdf_count_pages(fz_context *ctx, pdf_document *doc) { if (doc->page_count == 0) + doc->page_count = pdf_to_int(ctx, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Pages/Count")); + return doc->page_count; +} + +static int +pdf_load_page_tree_imp(fz_context *ctx, pdf_document *doc, pdf_obj *node, int idx) +{ + pdf_obj *type = pdf_dict_get(ctx, node, PDF_NAME_Type); + if (pdf_name_eq(ctx, type, PDF_NAME_Pages)) { - pdf_obj *count = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Pages/Count"); - doc->page_count = pdf_to_int(ctx, count); + pdf_obj *kids = pdf_dict_get(ctx, node, PDF_NAME_Kids); + int count = pdf_to_int(ctx, pdf_dict_get(ctx, node, PDF_NAME_Count)); + int i, n = pdf_array_len(ctx, kids); + + /* if Kids length is same as Count, all children must be page objects */ + if (n == count) + { + for (i = 0; i < n; ++i) + { + if (idx >= doc->page_count) + fz_throw(ctx, FZ_ERROR_GENERIC, "too many kids in page tree"); + doc->rev_page_map[idx].page = idx; + doc->rev_page_map[idx].object = pdf_to_num(ctx, pdf_array_get(ctx, kids, i)); + ++idx; + } + } + + /* else Kids may contain intermediate nodes */ + else + { + if (pdf_mark_obj(ctx, node)) + fz_throw(ctx, FZ_ERROR_GENERIC, "cycle in page tree"); + fz_try(ctx) + for (i = 0; i < n; ++i) + idx = pdf_load_page_tree_imp(ctx, doc, pdf_array_get(ctx, kids, i), idx); + fz_always(ctx) + pdf_unmark_obj(ctx, node); + fz_catch(ctx) + fz_rethrow(ctx); + } + } + else if (pdf_name_eq(ctx, type, PDF_NAME_Page)) + { + if (idx >= doc->page_count) + fz_throw(ctx, FZ_ERROR_GENERIC, "too many kids in page tree"); + doc->rev_page_map[idx].page = idx; + doc->rev_page_map[idx].object = pdf_to_num(ctx, node); + ++idx; + } + else + { + fz_throw(ctx, FZ_ERROR_GENERIC, "non-page object in page tree"); + } + return idx; +} + +static int +cmp_rev_page_map(const void *va, const void *vb) +{ + const pdf_rev_page_map *a = va; + const pdf_rev_page_map *b = vb; + return a->object - b->object; +} + +void +pdf_load_page_tree(fz_context *ctx, pdf_document *doc) +{ + if (!doc->rev_page_map) + { + int n = pdf_count_pages(ctx, doc); + doc->rev_page_map = fz_malloc_array(ctx, n, sizeof *doc->rev_page_map); + pdf_load_page_tree_imp(ctx, doc, pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/Pages"), 0); + qsort(doc->rev_page_map, n, sizeof *doc->rev_page_map, cmp_rev_page_map); } - return doc->page_count; +} + +void +pdf_drop_page_tree(fz_context *ctx, pdf_document *doc) +{ + fz_free(ctx, doc->rev_page_map); + doc->rev_page_map = NULL; } enum @@ -40,7 +116,7 @@ pdf_lookup_page_loc_imp(fz_context *ctx, pdf_document *doc, pdf_obj *node, int * len = pdf_array_len(ctx, kids); if (len == 0) - fz_throw(ctx, FZ_ERROR_GENERIC, "Malformed pages tree"); + fz_throw(ctx, FZ_ERROR_GENERIC, "malformed page tree"); /* Every node we need to unmark goes into the stack */ if (stack_len == stack_max) @@ -158,8 +234,8 @@ pdf_count_pages_before_kid(fz_context *ctx, pdf_document *doc, pdf_obj *parent, fz_throw(ctx, FZ_ERROR_GENERIC, "kid not found in parent's kids array"); } -int -pdf_lookup_page_number(fz_context *ctx, pdf_document *doc, pdf_obj *node) +static int +pdf_lookup_page_number_slow(fz_context *ctx, pdf_document *doc, pdf_obj *node) { int needle = pdf_to_num(ctx, node); int total = 0; @@ -200,6 +276,34 @@ pdf_lookup_page_number(fz_context *ctx, pdf_document *doc, pdf_obj *node) return total; } +static int +pdf_lookup_page_number_fast(fz_context *ctx, pdf_document *doc, int needle) +{ + int l = 0; + int r = doc->page_count - 1; + while (l <= r) + { + int m = (l + r) >> 1; + int c = needle - doc->rev_page_map[m].object; + if (c < 0) + r = m - 1; + else if (c > 0) + l = m + 1; + else + return doc->rev_page_map[m].page; + } + return -1; +} + +int +pdf_lookup_page_number(fz_context *ctx, pdf_document *doc, pdf_obj *page) +{ + if (doc->rev_page_map) + return pdf_lookup_page_number_fast(ctx, doc, pdf_to_num(ctx, page)); + else + return pdf_lookup_page_number_slow(ctx, doc, page); +} + int pdf_lookup_anchor(fz_context *ctx, pdf_document *doc, const char *name, float *xp, float *yp) { diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c index 739a5693..e7278ae8 100644 --- a/source/pdf/pdf-xref.c +++ b/source/pdf/pdf-xref.c @@ -1528,6 +1528,8 @@ pdf_drop_document_imp(fz_context *ctx, pdf_document *doc) pdf_drop_obj(ctx, doc->orphans[i]); fz_free(ctx, doc->orphans); + + fz_free(ctx, doc->rev_page_map); } fz_always(ctx) { |