From 69875363f1766f95c35c1fe429dd85ac9a19add5 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Tue, 5 Apr 2011 01:18:03 +0200 Subject: Clean up xps and pdf page access functions. --- pdf/pdf_page.c | 230 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 180 insertions(+), 50 deletions(-) (limited to 'pdf/pdf_page.c') diff --git a/pdf/pdf_page.c b/pdf/pdf_page.c index 685ec3ae..39cae41d 100644 --- a/pdf/pdf_page.c +++ b/pdf/pdf_page.c @@ -1,67 +1,123 @@ #include "fitz.h" #include "mupdf.h" -/* we need to combine all sub-streams into one for the content stream interpreter */ +struct info +{ + fz_obj *resources; + fz_obj *mediabox; + fz_obj *cropbox; + fz_obj *rotate; +}; + +int +pdf_count_pages(pdf_xref *xref) +{ + return xref->page_len; +} -static fz_error -pdf_load_page_contents_array(fz_buffer **bigbufp, pdf_xref *xref, fz_obj *list) +int +pdf_find_page_number(pdf_xref *xref, fz_obj *page) { - fz_error error; - fz_buffer *big; - fz_buffer *one; - int i; + int i, num = fz_to_num(page); + for (i = 0; i < xref->page_len; i++) + if (num == fz_to_num(xref->page_refs[i])) + return i; + return -1; +} - pdf_log_page("multiple content streams: %d\n", fz_array_len(list)); +static void +pdf_load_page_tree_node(pdf_xref *xref, fz_obj *node, struct info info) +{ + fz_obj *dict, *kids, *count; + fz_obj *obj, *tmp; + int i, n; - /* TODO: openstream, read, close into big buffer at once */ + /* prevent infinite recursion */ + if (fz_dict_gets(node, ".seen")) + return; - big = fz_new_buffer(32 * 1024); + kids = fz_dict_gets(node, "Kids"); + count = fz_dict_gets(node, "Count"); - for (i = 0; i < fz_array_len(list); i++) + if (fz_is_array(kids) && fz_is_int(count)) { - fz_obj *stm = fz_array_get(list, i); - error = pdf_load_stream(&one, xref, fz_to_num(stm), fz_to_gen(stm)); - if (error) + obj = fz_dict_gets(node, "Resources"); + if (obj) + info.resources = obj; + obj = fz_dict_gets(node, "MediaBox"); + if (obj) + info.mediabox = obj; + obj = fz_dict_gets(node, "CropBox"); + if (obj) + info.cropbox = obj; + obj = fz_dict_gets(node, "Rotate"); + if (obj) + info.rotate = obj; + + tmp = fz_new_null(); + fz_dict_puts(node, ".seen", tmp); + fz_drop_obj(tmp); + + n = fz_array_len(kids); + for (i = 0; i < n; i++) { - fz_drop_buffer(big); - return fz_rethrow(error, "cannot load content stream part %d/%d (%d %d R)", i + 1, fz_array_len(list), fz_to_num(stm), fz_to_gen(stm)); + obj = fz_array_get(kids, i); + pdf_load_page_tree_node(xref, obj, info); } - if (big->len + one->len + 1 > big->cap) - fz_resize_buffer(big, big->len + one->len + 1); - memcpy(big->data + big->len, one->data, one->len); - big->data[big->len + one->len] = ' '; - big->len += one->len + 1; - - fz_drop_buffer(one); + fz_dict_dels(node, ".seen"); } + else + { + dict = fz_resolve_indirect(node); + + if (info.resources && !fz_dict_gets(dict, "Resources")) + fz_dict_puts(dict, "Resources", info.resources); + if (info.mediabox && !fz_dict_gets(dict, "MediaBox")) + fz_dict_puts(dict, "MediaBox", info.mediabox); + if (info.cropbox && !fz_dict_gets(dict, "CropBox")) + fz_dict_puts(dict, "CropBox", info.cropbox); + if (info.rotate && !fz_dict_gets(dict, "Rotate")) + fz_dict_puts(dict, "Rotate", info.rotate); + + if (xref->page_len == xref->page_cap) + { + fz_warn("found more pages than expected"); + xref->page_cap ++; + xref->page_refs = fz_realloc(xref->page_refs, xref->page_cap, sizeof(fz_obj*)); + xref->page_objs = fz_realloc(xref->page_objs, xref->page_cap, sizeof(fz_obj*)); + } - *bigbufp = big; - return fz_okay; + xref->page_refs[xref->page_len] = fz_keep_obj(node); + xref->page_objs[xref->page_len] = fz_keep_obj(dict); + xref->page_len ++; + } } -static fz_error -pdf_load_page_contents(fz_buffer **bufp, pdf_xref *xref, fz_obj *obj) +fz_error +pdf_load_page_tree(pdf_xref *xref) { - fz_error error; + struct info info; + fz_obj *catalog = fz_dict_gets(xref->trailer, "Root"); + fz_obj *pages = fz_dict_gets(catalog, "Pages"); + fz_obj *count = fz_dict_gets(pages, "Count"); - if (fz_is_array(obj)) - { - error = pdf_load_page_contents_array(bufp, xref, obj); - if (error) - return fz_rethrow(error, "cannot load content stream array (%d 0 R)", fz_to_num(obj)); - } - else if (pdf_is_stream(xref, fz_to_num(obj), fz_to_gen(obj))) - { - error = pdf_load_stream(bufp, xref, fz_to_num(obj), fz_to_gen(obj)); - if (error) - return fz_rethrow(error, "cannot load content stream (%d 0 R)", fz_to_num(obj)); - } - else - { - fz_warn("page contents missing, leaving page blank"); - *bufp = fz_new_buffer(0); - } + if (!fz_is_dict(pages)) + return fz_throw("missing page tree"); + if (!fz_is_int(count)) + return fz_throw("missing page count"); + + xref->page_cap = fz_to_int(count); + xref->page_len = 0; + xref->page_refs = fz_calloc(xref->page_cap, sizeof(fz_obj*)); + xref->page_objs = fz_calloc(xref->page_cap, sizeof(fz_obj*)); + + info.resources = NULL; + info.mediabox = NULL; + info.cropbox = NULL; + info.rotate = NULL; + + pdf_load_page_tree_node(xref, pages, info); return fz_okay; } @@ -149,21 +205,92 @@ found: return 1; } +/* we need to combine all sub-streams into one for the content stream interpreter */ + +static fz_error +pdf_load_page_contents_array(fz_buffer **bigbufp, pdf_xref *xref, fz_obj *list) +{ + fz_error error; + fz_buffer *big; + fz_buffer *one; + int i; + + pdf_log_page("multiple content streams: %d\n", fz_array_len(list)); + + /* TODO: openstream, read, close into big buffer at once */ + + big = fz_new_buffer(32 * 1024); + + for (i = 0; i < fz_array_len(list); i++) + { + fz_obj *stm = fz_array_get(list, i); + error = pdf_load_stream(&one, xref, fz_to_num(stm), fz_to_gen(stm)); + if (error) + { + fz_drop_buffer(big); + return fz_rethrow(error, "cannot load content stream part %d/%d (%d 0 R)", + i + 1, fz_array_len(list), fz_to_num(stm)); + } + + if (big->len + one->len + 1 > big->cap) + fz_resize_buffer(big, big->len + one->len + 1); + memcpy(big->data + big->len, one->data, one->len); + big->data[big->len + one->len] = ' '; + big->len += one->len + 1; + + fz_drop_buffer(one); + } + + *bigbufp = big; + return fz_okay; +} + +static fz_error +pdf_load_page_contents(fz_buffer **bufp, pdf_xref *xref, fz_obj *obj) +{ + fz_error error; + + if (fz_is_array(obj)) + { + error = pdf_load_page_contents_array(bufp, xref, obj); + if (error) + return fz_rethrow(error, "cannot load content stream array (%d 0 R)", fz_to_num(obj)); + } + else if (pdf_is_stream(xref, fz_to_num(obj), fz_to_gen(obj))) + { + error = pdf_load_stream(bufp, xref, fz_to_num(obj), fz_to_gen(obj)); + if (error) + return fz_rethrow(error, "cannot load content stream (%d 0 R)", fz_to_num(obj)); + } + else + { + fz_warn("page contents missing, leaving page blank"); + *bufp = fz_new_buffer(0); + } + + return fz_okay; +} + fz_error -pdf_load_page(pdf_page **pagep, pdf_xref *xref, fz_obj *dict) +pdf_load_page(pdf_page **pagep, pdf_xref *xref, int number) { fz_error error; pdf_page *page; + fz_obj *dict; fz_obj *obj; fz_bbox bbox; + if (number < 0 || number >= xref->page_len) + return fz_throw("cannot find page %d", number + 1); + pdf_log_page("load page {\n"); - // TODO: move this to a more appropriate place /* Ensure that we have a store for resource objects */ if (!xref->store) xref->store = pdf_new_store(); + dict = xref->page_objs[number]; + page = fz_malloc(sizeof(pdf_page)); page->resources = NULL; page->contents = NULL; @@ -175,7 +302,7 @@ pdf_load_page(pdf_page **pagep, pdf_xref *xref, fz_obj *dict) bbox = fz_round_rect(pdf_to_rect(obj)); if (fz_is_empty_rect(pdf_to_rect(obj))) { - fz_warn("cannot find page bounds, guessing page bounds."); + fz_warn("cannot find page size for page %d", number + 1); bbox.x0 = 0; bbox.y0 = 0; bbox.x1 = 612; @@ -195,7 +322,10 @@ pdf_load_page(pdf_page **pagep, pdf_xref *xref, fz_obj *dict) page->mediabox.y1 = MAX(bbox.y0, bbox.y1); if (page->mediabox.x1 - page->mediabox.x0 < 1 || page->mediabox.y1 - page->mediabox.y0 < 1) - return fz_throw("invalid page size"); + { + fz_warn("invalid page size in page %d", number + 1); + page->mediabox = fz_unit_rect; + } page->rotate = fz_to_int(fz_dict_gets(dict, "Rotate")); @@ -218,7 +348,7 @@ pdf_load_page(pdf_page **pagep, pdf_xref *xref, fz_obj *dict) if (error) { pdf_free_page(page); - return fz_rethrow(error, "cannot load page contents (%d %d R)", fz_to_num(obj), fz_to_gen(obj)); + return fz_rethrow(error, "cannot load page contents for page %d (%d 0 R)", number + 1, fz_to_num(obj)); } if (page->resources && pdf_resources_use_blending(page->resources)) -- cgit v1.2.3