#include "fitz-internal.h" #include "mupdf-internal.h" /* #define DEBUG_LINEARIZATION */ /* #define DEBUG_HEAP_SORT */ /* #define DEBUG_WRITING */ typedef struct pdf_write_options_s pdf_write_options; /* As part of linearization, we need to keep a list of what objects are used * by what page. We do this by recording the objects used in a given page * in a page_objects structure. We have a list of these structures (one per * page) in the page_objects_list structure. * * The page_objects structure maintains a heap in the object array, so * insertion takes log n time, and we can heapsort and dedupe at the end for * a total worse case n log n time. * * The magic heap invariant is that: * entry[n] >= entry[(n+1)*2-1] & entry[n] >= entry[(n+1)*2] * or equivalently: * entry[(n-1)>>1] >= entry[n] * * For a discussion of the heap data structure (and heapsort) see Kingston, * "Algorithms and Data Structures". */ typedef struct { int num_shared; int page_object_number; int num_objects; int min_ofs; int max_ofs; /* Extensible list of objects used on this page */ int cap; int len; int object[1]; } page_objects; typedef struct { int cap; int len; page_objects *page[1]; } page_objects_list; struct pdf_write_options_s { FILE *out; int do_ascii; int do_expand; int do_garbage; int do_linear; int *use_list; int *ofs_list; int *gen_list; int *renumber_map; /* The following extras are required for linearization */ int *rev_renumber_map; int *rev_gen_list; int start; int first_xref_offset; int main_xref_offset; int first_xref_entry_offset; int file_len; int hints_shared_offset; int hintstream_len; pdf_obj *linear_l; pdf_obj *linear_h0; pdf_obj *linear_h1; pdf_obj *linear_o; pdf_obj *linear_e; pdf_obj *linear_n; pdf_obj *linear_t; pdf_obj *hints_s; pdf_obj *hints_length; int page_count; page_objects_list *page_object_lists; }; /* * Constants for use with use_list. * * If use_list[num] = 0, then object num is unused. * If use_list[num] & PARAMS, then object num is the linearisation params obj. * If use_list[num] & CATALOGUE, then object num is used by the catalogue. * If use_list[num] & PAGE1, then object num is used by page 1. * If use_list[num] & SHARED, then object num is shared between pages. * If use_list[num] & PAGE_OBJECT then this must be the first object in a page. * Otherwise object num is used by page (use_list[num]>>USE_PAGE_SHIFT). */ enum { USE_CATALOGUE = 2, USE_PAGE1 = 4, USE_SHARED = 8, USE_PARAMS = 16, USE_HINTS = 32, USE_PAGE_OBJECT = 64, USE_PAGE_MASK = ~127, USE_PAGE_SHIFT = 7 }; /* * page_objects and page_object_list handling functions */ static page_objects_list * page_objects_list_create(fz_context *ctx) { page_objects_list *pol = fz_calloc(ctx, 1, sizeof(*pol)); pol->cap = 1; pol->len = 0; return pol; } static void page_objects_list_destroy(fz_context *ctx, page_objects_list *pol) { int i; if (!pol) return; for (i = 0; i < pol->len; i++) { fz_free(ctx, pol->page[i]); } fz_free(ctx, pol); } static void page_objects_list_ensure(fz_context *ctx, page_objects_list **pol, int newcap) { int oldcap = (*pol)->cap; if (newcap <= oldcap) return; *pol = fz_resize_array(ctx, *pol, 1, sizeof(page_objects_list) + (newcap-1)*sizeof(page_objects *)); memset(&(*pol)->page[oldcap], 0, (newcap-oldcap)*sizeof(page_objects *)); (*pol)->cap = newcap; } static page_objects * page_objects_create(fz_context *ctx) { int initial_cap = 8; page_objects *po = fz_calloc(ctx, 1, sizeof(*po) + (initial_cap-1) * sizeof(int)); po->cap = initial_cap; po->len = 0; return po; } static void page_objects_insert(fz_context *ctx, page_objects **ppo, int i) { page_objects *po; /* Make a page_objects if we don't have one */ if (*ppo == NULL) *ppo = page_objects_create(ctx); po = *ppo; /* page_objects insertion: extend the page_objects by 1, and put us on the end */ if (po->len == po->cap) { po = fz_resize_array(ctx, po, 1, sizeof(page_objects) + (po->cap*2 - 1)*sizeof(int)); po->cap *= 2; *ppo = po; } po->object[po->len++] = i; } static void page_objects_list_insert(fz_context *ctx, pdf_write_options *opts, int page, int object) { page_objects_list_ensure(ctx, &opts->page_object_lists, page+1); if (opts->page_object_lists->len < page+1) opts->page_object_lists->len = page+1; page_objects_insert(ctx, &opts->page_object_lists->page[page], object); } static void page_objects_list_set_page_object(fz_context *ctx, pdf_write_options *opts, int page, int object) { page_objects_list_ensure(ctx, &opts->page_object_lists, page+1); opts->page_object_lists->page[page]->page_object_number = object; } static void page_objects_sort(fz_context *ctx, page_objects *po) { int i, j; int n = po->len; /* Step 1: Make a heap */ /* Invariant: Valid heap in [0..i), unsorted elements in [i..n) */ for (i = 1; i < n; i++) { /* Now bubble backwards to maintain heap invariant */ j = i; while (j != 0) { int tmp; int k = (j-1)>>1; if (po->object[k] >= po->object[j]) break; tmp = po->object[k]; po->object[k] = po->object[j]; po->object[j] = tmp; j = k; } } /* Step 2: Heap sort */ /* Invariant: valid heap in [0..i), sorted list in [i..n) */ /* Initially: i = n */ for (i = n-1; i > 0; i--) { /* Swap the maximum (0th) element from the page_objects into its place * in the sorted list (position i). */ int tmp = po->object[0]; po->object[0] = po->object[i]; po->object[i] = tmp; /* Now, the page_objects is invalid because the 0th element is out * of place. Bubble it until the page_objects is valid. */ j = 0; while (1) { /* Children are k and k+1 */ int k = (j+1)*2-1; /* If both children out of the page_objects, we're done */ if (k > i-1) break; /* If both are in the page_objects, pick the larger one */ if (k < i-1 && po->object[k] < po->object[k+1]) k++; /* If j is bigger than k (i.e. both of it's children), * we're done */ if (po->object[j] > po->object[k]) break; tmp = po->object[k]; po->object[k] = po->object[j]; po->object[j] = tmp; j = k; } } } static int order_ge(int ui, int uj) { /* For linearization, we need to order the sections as follows: * Remaining pages * Shared objects * Objects not associated with any page * (Linearization params) * Catalogue (and other document level objects) * First page * (Primary Hint stream) (*) * Any free objects * Note, this is NOT the same order they appear in * the final file! * * The PDF reference gives us the option of putting the hint stream * after the first page, and we take it, for simplicity. */ /* If the 2 objects are in the same section, then page object comes * first. */ if (((ui ^ uj) & ~USE_PAGE_OBJECT) == 0) return ((ui & USE_PAGE_OBJECT) == 0); /* Put unused objects last */ else if (ui == 0) return 1; else if (uj == 0) return 0; /* Put the hint stream before that... */ else if (ui & USE_HINTS) return 1; else if (uj & USE_HINTS) return 0; /* Put page 1 before that... */ else if (ui & USE_PAGE1) return 1; else if (uj & USE_PAGE1) return 0; /* Put the calagoue before that... */ else if (ui & USE_CATALOGUE) return 1; else if (uj & USE_CATALOGUE) return 0; /* Put the linearization params before that... */ else if (ui & USE_PARAMS) return 1; else if (uj & USE_PARAMS) return 0; /* Put objects not associated with any page (anything * not touched by the catalogue) before that... */ else if (ui == 0) return 1; else if (uj == 0) return 0; /* Put shared objects before that... */ else if (ui & USE_SHARED) return 1; else if (uj & USE_SHARED) return 0; /* And otherwise, order by the page number on which * they are used. */ return (ui>>USE_PAGE_SHIFT) >= (uj>>USE_PAGE_SHIFT); } static void heap_sort(int *list, int n, const int *val, int (*ge)(int, int)) { int i, j; #ifdef DEBUG_HEAP_SORT fprintf(stderr, "Initially:\n"); for (i=0; i < n; i++) { fprintf(stderr, "%d: %d %x\n", i, list[i], val[list[i]]); } #endif /* Step 1: Make a heap */ /* Invariant: Valid heap in [0..i), unsorted elements in [i..n) */ for (i = 1; i < n; i++) { /* Now bubble backwards to maintain heap invariant */ j = i; while (j != 0) { int tmp; int k = (j-1)>>1; if (ge(val[list[k]], val[list[j]])) break; tmp = list[k]; list[k] = list[j]; list[j] = tmp; j = k; } } #ifdef DEBUG_HEAP_SORT fprintf(stderr, "Valid heap:\n"); for (i=0; i < n; i++) { int k; fprintf(stderr, "%d: %d %x ", i, list[i], val[list[i]]); k = (i+1)*2-1; if (k < n) { if (ge(val[list[i]], val[list[k]])) fprintf(stderr, "OK "); else fprintf(stderr, "BAD "); } if (k+1 < n) { if (ge(val[list[i]], val[list[k+1]])) fprintf(stderr, "OK\n"); else fprintf(stderr, "BAD\n"); } else fprintf(stderr, "\n"); } #endif /* Step 2: Heap sort */ /* Invariant: valid heap in [0..i), sorted list in [i..n) */ /* Initially: i = n */ for (i = n-1; i > 0; i--) { /* Swap the maximum (0th) element from the page_objects into its place * in the sorted list (position i). */ int tmp = list[0]; list[0] = list[i]; list[i] = tmp; /* Now, the page_objects is invalid because the 0th element is out * of place. Bubble it until the page_objects is valid. */ j = 0; while (1) { /* Children are k and k+1 */ int k = (j+1)*2-1; /* If both children out of the page_objects, we're done */ if (k > i-1) break; /* If both are in the page_objects, pick the larger one */ if (k < i-1 && ge(val[list[k+1]], val[list[k]])) k++; /* If j is bigger than k (i.e. both of it's children), * we're done */ if (ge(val[list[j]], val[list[k]])) break; tmp = list[k]; list[k] = list[j]; list[j] = tmp; j = k; } } #ifdef DEBUG_HEAP_SORT fprintf(stderr, "Sorted:\n"); for (i=0; i < n; i++) { fprintf(stderr, "%d: %d %x ", i, list[i], val[list[i]]); if (i+1 < n) { if (ge(val[list[i+1]], val[list[i]])) fprintf(stderr, "OK"); else fprintf(stderr, "BAD"); } fprintf(stderr, "\n"); } #endif } static void page_objects_dedupe(fz_context *ctx, page_objects *po) { int i, j; int n = po->len-1; for (i = 0; i < n; i++) { if (po->object[i] == po->object[i+1]) break; } j = i; /* j points to the last valid one */ i++; /* i points to the first one we haven't looked at */ for (; i < n; i++) { if (po->object[j] != po->object[i]) po->object[++j] = po->object[i]; } po->len = j+1; } static void page_objects_list_sort_and_dedupe(fz_context *ctx, page_objects_list *pol) { int i; int n = pol->len; for (i = 0; i < n; i++) { page_objects_sort(ctx, pol->page[i]); page_objects_dedupe(ctx, pol->page[i]); } } #ifdef DEBUG_LINEARIZATION static void page_objects_dump(pdf_write_options *opts) { page_objects_list *pol = opts->page_object_lists; int i, j; for (i = 0; i < pol->len; i++) { page_objects *p = pol->page[i]; fprintf(stderr, "Page %d\n", i+1); for (j = 0; j < p->len; j++) { int o = p->object[j]; fprintf(stderr, " Object %d: use=%x\n", o, opts->use_list[o]); } fprintf(stderr, "Byte range=%d->%d\n", p->min_ofs, p->max_ofs); fprintf(stderr, "Number of objects=%d, Number of shared objects=%d\n", p->num_objects, p->num_shared); fprintf(stderr, "Page object number=%d\n", p->page_object_number); } } static void objects_dump(pdf_document *xref, pdf_write_options *opts) { int i; for (i=0; i < xref->len; i++) { fprintf(stderr, "Object %d use=%x offset=%d\n", i, opts->use_list[i], opts->ofs_list[i]); } } #endif /* * Garbage collect objects not reachable from the trailer. */ static pdf_obj *sweepref(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj) { int num = pdf_to_num(obj); int gen = pdf_to_gen(obj); fz_context *ctx = xref->ctx; if (num < 0 || num >= xref->len) return NULL; if (opts->use_list[num]) return NULL; opts->use_list[num] = 1; /* Bake in /Length in stream objects */ fz_try(ctx) { if (pdf_is_stream(xref, num, gen)) { pdf_obj *len = pdf_dict_gets(obj, "Length"); if (pdf_is_indirect(len)) { opts->use_list[pdf_to_num(len)] = 0; len = pdf_resolve_indirect(len); pdf_dict_puts(obj, "Length", len); } } } fz_catch(ctx) { /* Leave broken */ } return pdf_resolve_indirect(obj); } static void sweepobj(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj) { int i; if (pdf_is_indirect(obj)) obj = sweepref(xref, opts, obj); if (pdf_is_dict(obj)) { int n = pdf_dict_len(obj); for (i = 0; i < n; i++) sweepobj(xref, opts, pdf_dict_get_val(obj, i)); } else if (pdf_is_array(obj)) { int n = pdf_array_len(obj); for (i = 0; i < n; i++) sweepobj(xref, opts, pdf_array_get(obj, i)); } } /* * Scan for and remove duplicate objects (slow) */ static void removeduplicateobjs(pdf_document *xref, pdf_write_options *opts) { int num, other; fz_context *ctx = xref->ctx; for (num = 1; num < xref->len; num++) { /* Only compare an object to objects preceding it */ for (other = 1; other < num; other++) { pdf_obj *a, *b; int differ, newnum; if (num == other || !opts->use_list[num] || !opts->use_list[other]) continue; /* * Comparing stream objects data contents would take too long. * * pdf_is_stream calls pdf_cache_object and ensures * that the xref table has the objects loaded. */ fz_try(ctx) { differ = (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0)); } fz_catch(ctx) { /* Assume different */ differ = 1; } if (differ) continue; a = xref->table[num].obj; b = xref->table[other].obj; a = pdf_resolve_indirect(a); b = pdf_resolve_indirect(b); if (pdf_objcmp(a, b)) continue; /* Keep the lowest numbered object */ newnum = fz_mini(num, other); opts->renumber_map[num] = newnum; opts->renumber_map[other] = newnum; opts->rev_renumber_map[newnum] = num; /* Either will do */ opts->use_list[fz_maxi(num, other)] = 0; /* One duplicate was found, do not look for another */ break; } } } /* * Renumber objects sequentially so the xref is more compact * * This code assumes that any opts->renumber_map[n] <= n for all n. */ static void compactxref(pdf_document *xref, pdf_write_options *opts) { int num, newnum; /* * Update renumber_map in-place, clustering all used * objects together at low object ids. Objects that * already should be renumbered will have their new * object ids be updated to reflect the compaction. */ newnum = 1; for (num = 1; num < xref->len; num++) { /* If it's not used, map it to zero */ if (!opts->use_list[opts->renumber_map[num]]) { opts->renumber_map[num] = 0; } /* If it's not moved, compact it. */ else if (opts->renumber_map[num] == num) { opts->rev_renumber_map[newnum] = opts->rev_renumber_map[num]; opts->rev_gen_list[newnum] = opts->rev_gen_list[num]; opts->renumber_map[num] = newnum++; } /* Otherwise it's used, and moved. We know that it must have * moved down, so the place it's moved to will be in the right * place already. */ else { opts->renumber_map[num] = opts->renumber_map[opts->renumber_map[num]]; } } } /* * Update indirect objects according to renumbering established when * removing duplicate objects and compacting the xref. */ static void renumberobj(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj) { int i; fz_context *ctx = xref->ctx; if (pdf_is_dict(obj)) { int n = pdf_dict_len(obj); for (i = 0; i < n; i++) { pdf_obj *key = pdf_dict_get_key(obj, i); pdf_obj *val = pdf_dict_get_val(obj, i); if (pdf_is_indirect(val)) { val = pdf_new_indirect(ctx, opts->renumber_map[pdf_to_num(val)], 0, xref); pdf_dict_put(obj, key, val); pdf_drop_obj(val); } else { renumberobj(xref, opts, val); } } } else if (pdf_is_array(obj)) { int n = pdf_array_len(obj); for (i = 0; i < n; i++) { pdf_obj *val = pdf_array_get(obj, i); if (pdf_is_indirect(val)) { val = pdf_new_indirect(ctx, opts->renumber_map[pdf_to_num(val)], 0, xref); pdf_array_put(obj, i, val); pdf_drop_obj(val); } else { renumberobj(xref, opts, val); } } } } static void renumberobjs(pdf_document *xref, pdf_write_options *opts) { pdf_xref_entry *oldxref; int newlen; int num; fz_context *ctx = xref->ctx; int *new_use_list; new_use_list = fz_calloc(ctx, xref->len+3, sizeof(int)); fz_try(ctx) { /* Apply renumber map to indirect references in all objects in xref */ renumberobj(xref, opts, xref->trailer); for (num = 0; num < xref->len; num++) { pdf_obj *obj = xref->table[num].obj; if (pdf_is_indirect(obj)) { obj = pdf_new_indirect(ctx, opts->renumber_map[pdf_to_num(obj)], 0, xref); pdf_update_object(xref, num, obj); pdf_drop_obj(obj); } else { renumberobj(xref, opts, obj); } } /* Create new table for the reordered, compacted xref */ oldxref = xref->table; xref->table = fz_malloc_array(ctx, xref->len + 3, sizeof(pdf_xref_entry)); xref->table[0] = oldxref[0]; /* Move used objects into the new compacted xref */ newlen = 0; for (num = 1; num < xref->len; num++) { if (opts->use_list[num]) { if (newlen < opts->renumber_map[num]) newlen = opts->renumber_map[num]; xref->table[opts->renumber_map[num]] = oldxref[num]; new_use_list[opts->renumber_map[num]] = opts->use_list[num]; } else { pdf_drop_obj(oldxref[num].obj); } } } fz_catch(ctx) { fz_free(ctx, new_use_list); fz_rethrow(ctx); } fz_free(ctx, oldxref); fz_free(ctx, opts->use_list); opts->use_list = new_use_list; /* Update the used objects count in compacted xref */ xref->len = newlen + 1; for (num = 1; num < xref->len; num++) { opts->renumber_map[num] = num; } } static void page_objects_list_renumber(pdf_write_options *opts) { int i, j; for (i = 0; i < opts->page_object_lists->len; i++) { page_objects *po = opts->page_object_lists->page[i]; for (j = 0; j < po->len; j++) { po->object[j] = opts->renumber_map[po->object[j]]; } po->page_object_number = opts->renumber_map[po->page_object_number]; } } static void mark_all(pdf_document *xref, pdf_write_options *opts, pdf_obj *val, int flag, int page) { fz_context *ctx = xref->ctx; if (pdf_dict_mark(val)) return; fz_try(ctx) { if (pdf_is_indirect(val)) { int num = pdf_to_num(val); if (flag >= 16 && (opts->use_list[num] & USE_PAGE_MASK)) /* Already used */ opts->use_list[num] |= USE_SHARED; else opts->use_list[num] |= flag; if (page >= 0) page_objects_list_insert(ctx, opts, page, num); } if (pdf_is_dict(val)) { int i, n = pdf_dict_len(val); for (i = 0; i < n; i++) { mark_all(xref, opts, pdf_dict_get_val(val, i), flag, page); } } else if (pdf_is_array(val)) { int i, n = pdf_array_len(val); for (i = 0; i < n; i++) { mark_all(xref, opts, pdf_array_get(val, i), flag, page); } } } fz_always(ctx) { pdf_dict_unmark(val); } fz_catch(ctx) { fz_rethrow(ctx); } } static int mark_pages(pdf_document *xref, pdf_write_options *opts, pdf_obj *val, int pagenum) { fz_context *ctx = xref->ctx; if (pdf_dict_mark(val)) return pagenum; fz_try(ctx) { if (pdf_is_dict(val)) { if (!strcmp("Page", pdf_to_name(pdf_dict_gets(val, "Type")))) { int num = pdf_to_num(val); pdf_dict_unmark(val); mark_all(xref, opts, val, pagenum == 0 ? USE_PAGE1 : (pagenum<use_list[num] |= USE_PAGE_OBJECT; } else { int i, n = pdf_dict_len(val); for (i = 0; i < n; i++) { pdf_obj *key = pdf_dict_get_key(val, i); pdf_obj *obj = pdf_dict_get_val(val, i); if (!strcmp("Kids", pdf_to_name(key))) pagenum = mark_pages(xref, opts, obj, pagenum); else mark_all(xref, opts, obj, USE_CATALOGUE, -1); } if (pdf_is_indirect(val)) { int num = pdf_to_num(val); opts->use_list[num] |= USE_CATALOGUE; } } } else if (pdf_is_array(val)) { int i, n = pdf_array_len(val); for (i = 0; i < n; i++) { pagenum = mark_pages(xref, opts, pdf_array_get(val, i), pagenum); } if (pdf_is_indirect(val)) { int num = pdf_to_num(val); opts->use_list[num] |= USE_CATALOGUE; } } } fz_always(ctx) { pdf_dict_unmark(val); } fz_catch(ctx) { fz_rethrow(ctx); } return pagenum; } static void mark_root(pdf_document *xref, pdf_write_options *opts, pdf_obj *dict) { fz_context *ctx = xref->ctx; int i, n = pdf_dict_len(dict); if (pdf_dict_mark(dict)) return; fz_try(ctx) { if (pdf_is_indirect(dict)) { int num = pdf_to_num(dict); opts->use_list[num] |= USE_CATALOGUE; } for (i = 0; i < n; i++) { char *key = pdf_to_name(pdf_dict_get_key(dict, i)); pdf_obj *val = pdf_dict_get_val(dict, i); if (!strcmp("Pages", key)) opts->page_count = mark_pages(xref, opts, val, 0); else if (!strcmp("Outlines", key)) { /* FIXME: Look at PageMode to decide whether to * USE_OTHERPAGES or USE_PAGE1 here. */ if (0 /* PageMode == "Outlines" */) mark_all(xref, opts, val, USE_PAGE1, -1); } else mark_all(xref, opts, val, USE_CATALOGUE, -1); } } fz_always(ctx) { pdf_dict_unmark(dict); } fz_catch(ctx) { fz_rethrow(ctx); } } static void mark_trailer(pdf_document *xref, pdf_write_options *opts, pdf_obj *dict) { fz_context *ctx = xref->ctx; int i, n = pdf_dict_len(dict); if (pdf_dict_mark(dict)) return; fz_try(ctx) { for (i = 0; i < n; i++) { char *key = pdf_to_name(pdf_dict_get_key(dict, i)); pdf_obj *val = pdf_dict_get_val(dict, i); if (!strcmp("Root", key)) mark_root(xref, opts, val); else mark_all(xref, opts, val, USE_CATALOGUE, -1); } } fz_always(ctx) { pdf_dict_unmark(dict); } fz_catch(ctx) { fz_rethrow(ctx); } } static void add_linearization_objs(pdf_document *xref, pdf_write_options *opts) { pdf_obj *params_obj = NULL; pdf_obj *params_ref = NULL; pdf_obj *hint_obj = NULL; pdf_obj *hint_ref = NULL; pdf_obj *o = NULL; int params_num, hint_num; fz_context *ctx = xref->ctx; fz_var(params_obj); fz_var(params_ref); fz_var(hint_obj); fz_var(hint_ref); fz_var(o); fz_try(ctx) { /* Linearization params */ params_obj = pdf_new_dict(ctx, 10); params_ref = pdf_new_ref(xref, params_obj); params_num = pdf_to_num(params_ref); opts->use_list[params_num] = USE_PARAMS; opts->renumber_map[params_num] = params_num; opts->rev_renumber_map[params_num] = params_num; opts->gen_list[params_num] = 0; opts->rev_gen_list[params_num] = 0; o = pdf_new_real(ctx, 1.0); pdf_dict_puts(params_obj, "Linearized", o); pdf_drop_obj(o); o = NULL; opts->linear_l = pdf_new_int(ctx, INT_MIN); pdf_dict_puts(params_obj, "L", opts->linear_l); opts->linear_h0 = pdf_new_int(ctx, INT_MIN); o = pdf_new_array(ctx, 2); pdf_array_push(o, opts->linear_h0); opts->linear_h1 = pdf_new_int(ctx, INT_MIN); pdf_array_push(o, opts->linear_h1); pdf_dict_puts(params_obj, "H", o); pdf_drop_obj(o); o = NULL; opts->linear_o = pdf_new_int(ctx, INT_MIN); pdf_dict_puts(params_obj, "O", opts->linear_o); opts->linear_e = pdf_new_int(ctx, INT_MIN); pdf_dict_puts(params_obj, "E", opts->linear_e); opts->linear_n = pdf_new_int(ctx, INT_MIN); pdf_dict_puts(params_obj, "N", opts->linear_n); opts->linear_t = pdf_new_int(ctx, INT_MIN); pdf_dict_puts(params_obj, "T", opts->linear_t); /* Primary hint stream */ hint_obj = pdf_new_dict(ctx, 10); hint_ref = pdf_new_ref(xref, hint_obj); hint_num = pdf_to_num(hint_ref); opts->use_list[hint_num] = USE_HINTS; opts->renumber_map[hint_num] = hint_num; opts->rev_renumber_map[hint_num] = hint_num; opts->gen_list[hint_num] = 0; opts->rev_gen_list[hint_num] = 0; o = pdf_new_int(ctx, 0); pdf_dict_puts(hint_obj, "P", o); pdf_drop_obj(o); o = NULL; opts->hints_s = pdf_new_int(ctx, INT_MIN); pdf_dict_puts(hint_obj, "S", opts->hints_s); /* FIXME: Do we have thumbnails? Do a T entry */ /* FIXME: Do we have outlines? Do an O entry */ /* FIXME: Do we have article threads? Do an A entry */ /* FIXME: Do we have named destinations? Do a E entry */ /* FIXME: Do we have interactive forms? Do a V entry */ /* FIXME: Do we have document information? Do an I entry */ /* FIXME: Do we have logical structure heirarchy? Do a C entry */ /* FIXME: Do L, Page Label hint table */ o = pdf_new_name(ctx, "FlateDecode"); pdf_dict_puts(hint_obj, "Filter", o); pdf_drop_obj(o); o = NULL; opts->hints_length = pdf_new_int(ctx, INT_MIN); pdf_dict_puts(hint_obj, "Length", opts->hints_length); xref->table[hint_num].stm_ofs = -1; } fz_always(ctx) { pdf_drop_obj(params_obj); pdf_drop_obj(params_ref); pdf_drop_obj(hint_ref); pdf_drop_obj(hint_obj); pdf_drop_obj(o); } fz_catch(ctx) { fz_rethrow(ctx); } } static void lpr_inherit_res_contents(fz_context *ctx, pdf_obj *res, pdf_obj *dict, char *text) { pdf_obj *o, *r; int i, n; /* If the parent node doesn't have an entry of this type, give up. */ o = pdf_dict_gets(dict, text); if (!o) return; /* If the resources dict we are building doesn't have an entry of this * type yet, then just copy it (ensuring it's not a reference) */ r = pdf_dict_gets(res, text); if (r == NULL) { o = pdf_resolve_indirect(o); if (pdf_is_dict(o)) o = pdf_copy_dict(ctx, o); else if (pdf_is_array(o)) o = pdf_copy_array(ctx, o); else o = NULL; if (o) pdf_dict_puts(res, text, o); return; } /* Otherwise we need to merge o into r */ if (pdf_is_dict(o)) { n = pdf_dict_len(o); for (i = 0; i < n; i++) { pdf_obj *key = pdf_dict_get_key(o, i); pdf_obj *val = pdf_dict_get_val(o, i); if (pdf_dict_gets(res, pdf_to_name(key))) continue; pdf_dict_puts(res, pdf_to_name(key), val); } } } static void lpr_inherit_res(fz_context *ctx, pdf_obj *node, int depth, pdf_obj *dict) { while (1) { pdf_obj *o; node = pdf_dict_gets(node, "Parent"); depth--; if (!node || depth < 0) break; o = pdf_dict_gets(node, "Resources"); if (o) { lpr_inherit_res_contents(ctx, dict, o, "ExtGState"); lpr_inherit_res_contents(ctx, dict, o, "ColorSpace"); lpr_inherit_res_contents(ctx, dict, o, "Pattern"); lpr_inherit_res_contents(ctx, dict, o, "Shading"); lpr_inherit_res_contents(ctx, dict, o, "XObject"); lpr_inherit_res_contents(ctx, dict, o, "Font"); lpr_inherit_res_contents(ctx, dict, o, "ProcSet"); lpr_inherit_res_contents(ctx, dict, o, "Properties"); } } } static pdf_obj * lpr_inherit(fz_context *ctx, pdf_obj *node, char *text, int depth) { do { pdf_obj *o = pdf_dict_gets(node, text); if (o) return pdf_resolve_indirect(o); node = pdf_dict_gets(node, "Parent"); depth--; } while (depth >= 0 && node); return NULL; } static int lpr(fz_context *ctx, pdf_write_options *opts, pdf_obj *node, int depth, int page) { pdf_obj *kids; pdf_obj *o = NULL; int i, n; if (pdf_dict_mark(node)) return page; fz_var(o); fz_try(ctx) { if (!strcmp("Page", pdf_to_name(pdf_dict_gets(node, "Type")))) { pdf_obj *r; /* r is deliberately not cleaned up */ /* Copy resources down to the child */ o = pdf_keep_obj(pdf_dict_gets(node, "Resources")); if (!o) { o = pdf_keep_obj(pdf_new_dict(ctx, 2)); pdf_dict_puts(node, "Resources", o); } lpr_inherit_res(ctx, node, depth, o); r = lpr_inherit(ctx, node, "MediaBox", depth); if (r) pdf_dict_puts(node, "MediaBox", r); r = lpr_inherit(ctx, node, "CropBox", depth); if (r) pdf_dict_puts(node, "CropBox", r); r = lpr_inherit(ctx, node, "BleedBox", depth); if (r) pdf_dict_puts(node, "BleedBox", r); r = lpr_inherit(ctx, node, "TrimBox", depth); if (r) pdf_dict_puts(node, "TrimBox", r); r = lpr_inherit(ctx, node, "ArtBox", depth); if (r) pdf_dict_puts(node, "ArtBox", r); r = lpr_inherit(ctx, node, "Rotate", depth); if (r) pdf_dict_puts(node, "Rotate", r); page++; } else { kids = pdf_dict_gets(node, "Kids"); n = pdf_array_len(kids); for(i = 0; i < n; i++) { page = lpr(ctx, opts, pdf_array_get(kids, i), depth+1, page); } pdf_dict_dels(node, "Resources"); pdf_dict_dels(node, "MediaBox"); pdf_dict_dels(node, "CropBox"); pdf_dict_dels(node, "BleedBox"); pdf_dict_dels(node, "TrimBox"); pdf_dict_dels(node, "ArtBox"); pdf_dict_dels(node, "Rotate"); } } fz_always(ctx) { pdf_drop_obj(o); } fz_catch(ctx) { fz_rethrow(ctx); } pdf_dict_unmark(node); return page; } static void linearize_page_resources(pdf_document *xref, pdf_write_options *opts) { fz_context *ctx = xref->ctx; lpr(ctx, opts, pdf_dict_gets(pdf_dict_gets(xref->trailer, "Root"), "Pages"), 0, 0); } static void linearize(pdf_document *xref, pdf_write_options *opts) { int i; int n = xref->len + 2; int *reorder; int *rev_renumber_map; int *rev_gen_list; fz_context *ctx = xref->ctx; opts->page_object_lists = page_objects_list_create(ctx); /* Ensure that every page has local references of its resources */ /* FIXME: We could 'thin' the resources according to what is actually * required for each page, but this would require us to run the page * content streams. */ linearize_page_resources(xref, opts); /* Walk the objects for each page, marking which ones are used, where */ memset(opts->use_list, 0, n * sizeof(int)); mark_trailer(xref, opts, xref->trailer); /* Add new objects required for linearization */ add_linearization_objs(xref, opts); #ifdef DEBUG_WRITING fprintf(stderr, "Usage calculated:\n"); for (i=0; i < xref->len; i++) { fprintf(stderr, "%d: use=%d\n", i, opts->use_list[i]); } #endif /* Allocate/init the structures used for renumbering the objects */ reorder = fz_calloc(ctx, n, sizeof(int)); rev_renumber_map = fz_calloc(ctx, n, sizeof(int)); rev_gen_list = fz_calloc(ctx, n, sizeof(int)); for (i = 0; i < n; i++) { reorder[i] = i; } /* Heap sort the reordering */ heap_sort(reorder+1, n-1, opts->use_list, &order_ge); #ifdef DEBUG_WRITING fprintf(stderr, "Reordered:\n"); for (i=1; i < xref->len; i++) { fprintf(stderr, "%d: use=%d\n", i, opts->use_list[reorder[i]]); } #endif /* Find the split point */ for (i = 1; (opts->use_list[reorder[i]] & USE_PARAMS) == 0; i++); opts->start = i; /* Roll the reordering into the renumber_map */ for (i = 0; i < n; i++) { opts->renumber_map[reorder[i]] = i; rev_renumber_map[i] = opts->rev_renumber_map[reorder[i]]; rev_gen_list[i] = opts->rev_gen_list[reorder[i]]; } fz_free(ctx, opts->rev_renumber_map); fz_free(ctx, opts->rev_gen_list); opts->rev_renumber_map = rev_renumber_map; opts->rev_gen_list = rev_gen_list; fz_free(ctx, reorder); /* Apply the renumber_map */ page_objects_list_renumber(opts); renumberobjs(xref, opts); page_objects_list_sort_and_dedupe(ctx, opts->page_object_lists); } static void update_linearization_params(pdf_document *xref, pdf_write_options *opts) { int offset; pdf_set_int(opts->linear_l, opts->file_len); /* Primary hint stream offset (of object, not stream!) */ pdf_set_int(opts->linear_h0, opts->ofs_list[xref->len-1]); /* Primary hint stream length (of object, not stream!) */ offset = (opts->start == 1 ? opts->main_xref_offset : opts->ofs_list[1] + opts->hintstream_len); pdf_set_int(opts->linear_h1, offset - opts->ofs_list[xref->len-1]); /* Object number of first pages page object (the first object of page 0) */ pdf_set_int(opts->linear_o, opts->page_object_lists->page[0]->object[0]); /* Offset of end of first page (first page is followed by primary * hint stream (object n-1) then remaining pages (object 1...). The * primary hint stream counts as part of the first pages data, I think. */ offset = (opts->start == 1 ? opts->main_xref_offset : opts->ofs_list[1] + opts->hintstream_len); pdf_set_int(opts->linear_e, offset); /* Number of pages in document */ pdf_set_int(opts->linear_n, opts->page_count); /* Offset of first entry in main xref table */ pdf_set_int(opts->linear_t, opts->first_xref_entry_offset + opts->hintstream_len); /* Offset of shared objects hint table in the primary hint stream */ pdf_set_int(opts->hints_s, opts->hints_shared_offset); /* Primary hint stream length */ pdf_set_int(opts->hints_length, opts->hintstream_len); } /* * Make sure we have loaded objects from object streams. */ static void preloadobjstms(pdf_document *xref) { pdf_obj *obj; int num; for (num = 0; num < xref->len; num++) { if (xref->table[num].type == 'o') { obj = pdf_load_object(xref, num, 0); pdf_drop_obj(obj); } } } /* * Save streams and objects to the output */ static inline int isbinary(int c) { if (c == '\n' || c == '\r' || c == '\t') return 0; return c < 32 || c > 127; } static int isbinarystream(fz_buffer *buf) { int i; for (i = 0; i < buf->len; i++) if (isbinary(buf->data[i])) return 1; return 0; } static fz_buffer *hexbuf(fz_context *ctx, unsigned char *p, int n) { static const char hex[16] = "0123456789abcdef"; fz_buffer *buf; int x = 0; buf = fz_new_buffer(ctx, n * 2 + (n / 32) + 2); while (n--) { buf->data[buf->len++] = hex[*p >> 4]; buf->data[buf->len++] = hex[*p & 15]; if (++x == 32) { buf->data[buf->len++] = '\n'; x = 0; } p++; } buf->data[buf->len++] = '>'; buf->data[buf->len++] = '\n'; return buf; } static void addhexfilter(pdf_document *xref, pdf_obj *dict) { pdf_obj *f, *dp, *newf, *newdp; pdf_obj *ahx, *nullobj; fz_context *ctx = xref->ctx; ahx = pdf_new_name(ctx, "ASCIIHexDecode"); nullobj = pdf_new_null(ctx); newf = newdp = NULL; f = pdf_dict_gets(dict, "Filter"); dp = pdf_dict_gets(dict, "DecodeParms"); if (pdf_is_name(f)) { newf = pdf_new_array(ctx, 2); pdf_array_push(newf, ahx); pdf_array_push(newf, f); f = newf; if (pdf_is_dict(dp)) { newdp = pdf_new_array(ctx, 2); pdf_array_push(newdp, nullobj); pdf_array_push(newdp, dp); dp = newdp; } } else if (pdf_is_array(f)) { pdf_array_insert(f, ahx); if (pdf_is_array(dp)) pdf_array_insert(dp, nullobj); } else f = ahx; pdf_dict_puts(dict, "Filter", f); if (dp) pdf_dict_puts(dict, "DecodeParms", dp); pdf_drop_obj(ahx); pdf_drop_obj(nullobj); pdf_drop_obj(newf); pdf_drop_obj(newdp); } static void copystream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj_orig, int num, int gen) { fz_buffer *buf, *tmp; pdf_obj *newlen; pdf_obj *obj; fz_context *ctx = xref->ctx; int orig_num = opts->rev_renumber_map[num]; int orig_gen = opts->rev_gen_list[num]; buf = pdf_load_raw_renumbered_stream(xref, num, gen, orig_num, orig_gen); obj = pdf_copy_dict(ctx, obj_orig); if (opts->do_ascii && isbinarystream(buf)) { tmp = hexbuf(ctx, buf->data, buf->len); fz_drop_buffer(ctx, buf); buf = tmp; addhexfilter(xref, obj); newlen = pdf_new_int(ctx, buf->len); pdf_dict_puts(obj, "Length", newlen); pdf_drop_obj(newlen); } fprintf(opts->out, "%d %d obj\n", num, gen); pdf_fprint_obj(opts->out, obj, opts->do_expand == 0); fprintf(opts->out, "stream\n"); fwrite(buf->data, 1, buf->len, opts->out); fprintf(opts->out, "endstream\nendobj\n\n"); fz_drop_buffer(ctx, buf); pdf_drop_obj(obj); } static void expandstream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj_orig, int num, int gen) { fz_buffer *buf, *tmp; pdf_obj *newlen; pdf_obj *obj; fz_context *ctx = xref->ctx; int orig_num = opts->rev_renumber_map[num]; int orig_gen = opts->rev_gen_list[num]; buf = pdf_load_renumbered_stream(xref, num, gen, orig_num, orig_gen); obj = pdf_copy_dict(ctx, obj_orig); pdf_dict_dels(obj, "Filter"); pdf_dict_dels(obj, "DecodeParms"); if (opts->do_ascii && isbinarystream(buf)) { tmp = hexbuf(ctx, buf->data, buf->len); fz_drop_buffer(ctx, buf); buf = tmp; addhexfilter(xref, obj); } newlen = pdf_new_int(ctx, buf->len); pdf_dict_puts(obj, "Length", newlen); pdf_drop_obj(newlen); fprintf(opts->out, "%d %d obj\n", num, gen); pdf_fprint_obj(opts->out, obj, opts->do_expand == 0); fprintf(opts->out, "stream\n"); fwrite(buf->data, 1, buf->len, opts->out); fprintf(opts->out, "endstream\nendobj\n\n"); fz_drop_buffer(ctx, buf); pdf_drop_obj(obj); } static int is_image_filter(char *s) { if ( !strcmp(s, "CCITTFaxDecode") || !strcmp(s, "CCF") || !strcmp(s, "DCTDecode") || !strcmp(s, "DCT") || !strcmp(s, "RunLengthDecode") || !strcmp(s, "RL") || !strcmp(s, "JBIG2Decode") || !strcmp(s, "JPXDecode")) return 1; return 0; } static int filter_implies_image(pdf_document *xref, pdf_obj *o) { if (!o) return 0; if (pdf_is_name(o)) return is_image_filter(pdf_to_name(o)); if (pdf_is_array(o)) { int i, len; len = pdf_array_len(o); for (i = 0; i < len; i++) if (is_image_filter(pdf_to_name(pdf_array_get(o, i)))) return 1; } return 0; } static void writeobject(pdf_document *xref, pdf_write_options *opts, int num, int gen) { pdf_obj *obj; pdf_obj *type; fz_context *ctx = xref->ctx; obj = pdf_load_object(xref, num, gen); /* skip ObjStm and XRef objects */ if (pdf_is_dict(obj)) { type = pdf_dict_gets(obj, "Type"); if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "ObjStm")) { opts->use_list[num] = 0; pdf_drop_obj(obj); return; } if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "XRef")) { opts->use_list[num] = 0; pdf_drop_obj(obj); return; } } if (!pdf_is_stream(xref, num, gen)) { fprintf(opts->out, "%d %d obj\n", num, gen); pdf_fprint_obj(opts->out, obj, opts->do_expand == 0); fprintf(opts->out, "endobj\n\n"); } else if (xref->table[num].stm_ofs < 0 && xref->table[num].stm_buf == NULL) { fprintf(opts->out, "%d %d obj\n", num, gen); pdf_fprint_obj(opts->out, obj, opts->do_expand == 0); fprintf(opts->out, "stream\nendstream\nendobj\n\n"); } else { int dontexpand = 0; if (opts->do_expand != 0 && opts->do_expand != fz_expand_all) { pdf_obj *o; if ((o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "XObject")) && (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Image"))) dontexpand = !(opts->do_expand & fz_expand_images); if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "Font")) dontexpand = !(opts->do_expand & fz_expand_fonts); if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "FontDescriptor")) dontexpand = !(opts->do_expand & fz_expand_fonts); if ((o = pdf_dict_gets(obj, "Length1")) != NULL) dontexpand = !(opts->do_expand & fz_expand_fonts); if ((o = pdf_dict_gets(obj, "Length2")) != NULL) dontexpand = !(opts->do_expand & fz_expand_fonts); if ((o = pdf_dict_gets(obj, "Length3")) != NULL) dontexpand = !(opts->do_expand & fz_expand_fonts); if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Type1C")) dontexpand = !(opts->do_expand & fz_expand_fonts); if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "CIDFontType0C")) dontexpand = !(opts->do_expand & fz_expand_fonts); if (o = pdf_dict_gets(obj, "Filter"), filter_implies_image(xref, o)) dontexpand = !(opts->do_expand & fz_expand_images); if (pdf_dict_gets(obj, "Width") != NULL && pdf_dict_gets(obj, "Height") != NULL) dontexpand = !(opts->do_expand & fz_expand_images); } if (opts->do_expand && !dontexpand && !pdf_is_jpx_image(ctx, obj)) expandstream(xref, opts, obj, num, gen); else copystream(xref, opts, obj, num, gen); } pdf_drop_obj(obj); } static void writexref(pdf_document *xref, pdf_write_options *opts, int from, int to, int first, int main_xref_offset, int startxref) { pdf_obj *trailer = NULL; pdf_obj *obj; pdf_obj *nobj = NULL; int num; fz_context *ctx = xref->ctx; fprintf(opts->out, "xref\n%d %d\n", from, to - from); opts->first_xref_entry_offset = ftell(opts->out); for (num = from; num < to; num++) { if (opts->use_list[num]) fprintf(opts->out, "%010d %05d n \n", opts->ofs_list[num], opts->gen_list[num]); else fprintf(opts->out, "%010d %05d f \n", opts->ofs_list[num], opts->gen_list[num]); } fprintf(opts->out, "\n"); fz_var(trailer); fz_var(nobj); fz_try(ctx) { trailer = pdf_new_dict(ctx, 5); nobj = pdf_new_int(ctx, to); pdf_dict_puts(trailer, "Size", nobj); pdf_drop_obj(nobj); nobj = NULL; if (first) { obj = pdf_dict_gets(xref->trailer, "Info"); if (obj) pdf_dict_puts(trailer, "Info", obj); obj = pdf_dict_gets(xref->trailer, "Root"); if (obj) pdf_dict_puts(trailer, "Root", obj); obj = pdf_dict_gets(xref->trailer, "ID"); if (obj) pdf_dict_puts(trailer, "ID", obj); } if (main_xref_offset != 0) { nobj = pdf_new_int(ctx, main_xref_offset); pdf_dict_puts(trailer, "Prev", nobj); pdf_drop_obj(nobj); nobj = NULL; } } fz_always(ctx) { pdf_drop_obj(nobj); } fz_catch(ctx) { fz_rethrow(ctx); } fprintf(opts->out, "trailer\n"); pdf_fprint_obj(opts->out, trailer, opts->do_expand == 0); fprintf(opts->out, "\n"); pdf_drop_obj(trailer); fprintf(opts->out, "startxref\n%d\n%%%%EOF\n", startxref); } static void padto(FILE *file, int target) { int pos = ftell(file); assert(pos <= target); while (pos < target) { fputc('\n', file); pos++; } } static void dowriteobject(pdf_document *xref, pdf_write_options *opts, int num, int pass) { if (xref->table[num].type == 'f') opts->gen_list[num] = xref->table[num].gen; if (xref->table[num].type == 'n') opts->gen_list[num] = xref->table[num].gen; if (xref->table[num].type == 'o') opts->gen_list[num] = 0; if (opts->do_garbage && !opts->use_list[num]) return; if (xref->table[num].type == 'n' || xref->table[num].type == 'o') { if (pass > 0) padto(opts->out, opts->ofs_list[num]); opts->ofs_list[num] = ftell(opts->out); writeobject(xref, opts, num, opts->gen_list[num]); } else opts->use_list[num] = 0; } static void writeobjects(pdf_document *xref, pdf_write_options *opts, int pass) { int num; fprintf(opts->out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10); fprintf(opts->out, "%%\316\274\341\277\246\n\n"); dowriteobject(xref, opts, opts->start, pass); if (opts->do_linear) { /* Write first xref */ if (pass == 0) opts->first_xref_offset = ftell(opts->out); else padto(opts->out, opts->first_xref_offset); writexref(xref, opts, opts->start, xref->len, 1, opts->main_xref_offset, 0); } for (num = opts->start+1; num < xref->len; num++) dowriteobject(xref, opts, num, pass); if (opts->do_linear && pass == 1) { int offset = (opts->start == 1 ? opts->main_xref_offset : opts->ofs_list[1] + opts->hintstream_len); padto(opts->out, offset); } for (num = 1; num < opts->start; num++) { if (pass == 1) opts->ofs_list[num] += opts->hintstream_len; dowriteobject(xref, opts, num, pass); } } static int my_log2(int x) { int i = 0; if (x <= 0) return 0; while ((1< 0) i++; if ((1<ctx; int i, j; int min_objs_per_page, max_objs_per_page; int min_page_length, max_page_length; int objs_per_page_bits; int min_shared_object, max_shared_object; int max_shared_object_refs; int min_shared_length, max_shared_length; page_objects **pop = &opts->page_object_lists->page[0]; int page_len_bits, shared_object_bits, shared_object_id_bits; int shared_length_bits; min_shared_object = xref->len; max_shared_object = 1; min_shared_length = opts->file_len; max_shared_length = 0; for (i=1; i < xref->len; i++) { int min, max, page; min = opts->ofs_list[i]; if (i == opts->start-1 || (opts->start == 1 && i == xref->len-1)) max = opts->main_xref_offset; else if (i == xref->len-1) max = opts->ofs_list[1]; else max = opts->ofs_list[i+1]; assert(max > min); if (opts->use_list[i] & USE_SHARED) { page = -1; if (i < min_shared_object) min_shared_object = i; if (i > max_shared_object) max_shared_object = i; if (min_shared_length > max - min) min_shared_length = max - min; if (max_shared_length < max - min) max_shared_length = max - min; } else if (opts->use_list[i] & (USE_CATALOGUE | USE_HINTS | USE_PARAMS)) page = -1; else if (opts->use_list[i] & USE_PAGE1) { page = 0; if (min_shared_length > max - min) min_shared_length = max - min; if (max_shared_length < max - min) max_shared_length = max - min; } else if (opts->use_list[i] == 0) page = -1; else page = opts->use_list[i]>>USE_PAGE_SHIFT; if (page >= 0) { pop[page]->num_objects++; if (pop[page]->min_ofs > min) pop[page]->min_ofs = min; if (pop[page]->max_ofs < max) pop[page]->max_ofs = max; } } min_objs_per_page = max_objs_per_page = pop[0]->num_objects; min_page_length = max_page_length = pop[0]->max_ofs - pop[0]->min_ofs; for (i=1; i < opts->page_count; i++) { int tmp; if (min_objs_per_page > pop[i]->num_objects) min_objs_per_page = pop[i]->num_objects; if (max_objs_per_page < pop[i]->num_objects) max_objs_per_page = pop[i]->num_objects; tmp = pop[i]->max_ofs - pop[i]->min_ofs; if (tmp < min_page_length) min_page_length = tmp; if (tmp > max_page_length) max_page_length = tmp; } for (i=0; i < opts->page_count; i++) { int count = 0; int j; page_objects *po = opts->page_object_lists->page[i]; for (j = 0; j < po->len; j++) { if (i == 0 && opts->use_list[po->object[j]] & USE_PAGE1) count++; else if (i != 0 && opts->use_list[po->object[j]] & USE_SHARED) count++; } po->num_shared = count; if (i == 0 || count > max_shared_object_refs) max_shared_object_refs = count; } if (min_shared_object > max_shared_object) min_shared_object = max_shared_object = 0; /* Table F.3 - Header */ /* Header Item 1: Least number of objects in a page */ fz_write_buffer_bits(ctx, buf, min_objs_per_page, 32); /* Header Item 2: Location of first pages page object */ fz_write_buffer_bits(ctx, buf, opts->ofs_list[pop[0]->page_object_number], 32); /* Header Item 3: Number of bits required to represent the difference * between the greatest and least number of objects in a page. */ objs_per_page_bits = my_log2(max_objs_per_page - min_objs_per_page); fz_write_buffer_bits(ctx, buf, objs_per_page_bits, 16); /* Header Item 4: Least length of a page. */ fz_write_buffer_bits(ctx, buf, min_page_length, 32); /* Header Item 5: Number of bits needed to represent the difference * between the greatest and least length of a page. */ page_len_bits = my_log2(max_page_length - min_page_length); fz_write_buffer_bits(ctx, buf, page_len_bits, 16); /* Header Item 6: Least offset to start of content stream (Acrobat * sets this to always be 0) */ fz_write_buffer_bits(ctx, buf, 0, 32); /* Header Item 7: Number of bits needed to represent the difference * between the greatest and least offset to content stream (Acrobat * sets this to always be 0) */ fz_write_buffer_bits(ctx, buf, 0, 16); /* Header Item 8: Least content stream length. (Acrobat * sets this to always be 0) */ fz_write_buffer_bits(ctx, buf, 0, 32); /* Header Item 9: Number of bits needed to represent the difference * between the greatest and least content stream length (Acrobat * sets this to always be the same as item 5) */ fz_write_buffer_bits(ctx, buf, page_len_bits, 16); /* Header Item 10: Number of bits needed to represent the greatest * number of shared object references. */ shared_object_bits = my_log2(max_shared_object_refs); fz_write_buffer_bits(ctx, buf, shared_object_bits, 16); /* Header Item 11: Number of bits needed to represent the greatest * shared object identifier. */ shared_object_id_bits = my_log2(max_shared_object - min_shared_object + pop[0]->num_shared); fz_write_buffer_bits(ctx, buf, shared_object_id_bits, 16); /* Header Item 12: Number of bits needed to represent the numerator * of the fractions. We always send 0. */ fz_write_buffer_bits(ctx, buf, 0, 16); /* Header Item 13: Number of bits needed to represent the denominator * of the fractions. We always send 0. */ fz_write_buffer_bits(ctx, buf, 0, 16); /* Table F.4 - Page offset hint table (per page) */ /* Item 1: A number that, when added to the least number of objects * on a page, gives the number of objects in the page. */ for (i = 0; i < opts->page_count; i++) { fz_write_buffer_bits(ctx, buf, pop[i]->num_objects - min_objs_per_page, objs_per_page_bits); } fz_write_buffer_pad(ctx, buf); /* Item 2: A number that, when added to the least page length, gives * the length of the page in bytes. */ for (i = 0; i < opts->page_count; i++) { fz_write_buffer_bits(ctx, buf, pop[i]->max_ofs - pop[i]->min_ofs - min_page_length, page_len_bits); } fz_write_buffer_pad(ctx, buf); /* Item 3: The number of shared objects referenced from the page. */ for (i = 0; i < opts->page_count; i++) { fz_write_buffer_bits(ctx, buf, pop[i]->num_shared, shared_object_bits); } fz_write_buffer_pad(ctx, buf); /* Item 4: Shared object id for each shared object ref in every page. * Spec says "not for page 1", but acrobat does send page 1's - all * as zeros. */ for (i = 0; i < opts->page_count; i++) { for (j = 0; j < pop[i]->len; j++) { int o = pop[i]->object[j]; if (i == 0 && opts->use_list[o] & USE_PAGE1) fz_write_buffer_bits(ctx, buf, 0 /* o - pop[0]->page_object_number */, shared_object_id_bits); if (i != 0 && opts->use_list[o] & USE_SHARED) fz_write_buffer_bits(ctx, buf, o - min_shared_object + pop[0]->num_shared, shared_object_id_bits); } } fz_write_buffer_pad(ctx, buf); /* Item 5: Numerator of fractional position for each shared object reference. */ /* We always send 0 in 0 bits */ /* Item 6: A number that, when added to the least offset to the start * of the content stream (F.3 Item 6), gives the offset in bytes of * start of the pages content stream object relative to the beginning * of the page. Always 0 in 0 bits. */ /* Item 7: A number that, when added to the least content stream length * (F.3 Item 8), gives the length of the pages content stream object. * Always == Item 2 as least content stream length = least page stream * length. */ for (i = 0; i < opts->page_count; i++) { fz_write_buffer_bits(ctx, buf, pop[i]->max_ofs - pop[i]->min_ofs - min_page_length, page_len_bits); } /* Pad, and then do shared object hint table */ fz_write_buffer_pad(ctx, buf); opts->hints_shared_offset = buf->len; /* Table F.5: */ /* Header Item 1: Object number of the first object in the shared * objects section. */ fz_write_buffer_bits(ctx, buf, min_shared_object, 32); /* Header Item 2: Location of first object in the shared objects * section. */ fz_write_buffer_bits(ctx, buf, opts->ofs_list[min_shared_object], 32); /* Header Item 3: The number of shared object entries for the first * page. */ fz_write_buffer_bits(ctx, buf, pop[0]->num_shared, 32); /* Header Item 4: The number of shared object entries for the shared * objects section + first page. */ fz_write_buffer_bits(ctx, buf, max_shared_object - min_shared_object + pop[0]->num_shared, 32); /* Header Item 5: The number of bits needed to represent the greatest * number of objects in a shared object group (Always 0). */ fz_write_buffer_bits(ctx, buf, 0, 16); /* Header Item 6: The least length of a shared object group in bytes. */ fz_write_buffer_bits(ctx, buf, min_shared_length, 32); /* Header Item 7: The number of bits required to represent the * difference between the greatest and least length of a shared object * group. */ shared_length_bits = my_log2(max_shared_length - min_shared_length); fz_write_buffer_bits(ctx, buf, shared_length_bits, 16); /* Table F.6 */ /* Item 1: Shared object group length (page 1 objects) */ for (j = 0; j < pop[0]->len; j++) { int o = pop[0]->object[j]; int min, max; min = opts->ofs_list[o]; if (o == opts->start-1) max = opts->main_xref_offset; else if (o < xref->len-1) max = opts->ofs_list[o+1]; else max = opts->ofs_list[1]; if (opts->use_list[o] & USE_PAGE1) fz_write_buffer_bits(ctx, buf, max - min - min_shared_length, shared_length_bits); } /* Item 1: Shared object group length (shared objects) */ for (i = min_shared_object; i <= max_shared_object; i++) { int min, max; min = opts->ofs_list[i]; if (i == opts->start-1) max = opts->main_xref_offset; else if (i < xref->len-1) max = opts->ofs_list[i+1]; else max = opts->ofs_list[1]; fz_write_buffer_bits(ctx, buf, max - min - min_shared_length, shared_length_bits); } fz_write_buffer_pad(ctx, buf); /* Item 2: MD5 presence flags */ for (i = max_shared_object - min_shared_object + pop[0]->num_shared; i > 0; i--) { fz_write_buffer_bits(ctx, buf, 0, 1); } fz_write_buffer_pad(ctx, buf); /* Item 3: MD5 sums (not present) */ fz_write_buffer_pad(ctx, buf); /* Item 4: Number of objects in the group (not present) */ } static void make_hint_stream(pdf_document *xref, pdf_write_options *opts) { fz_context *ctx = xref->ctx; fz_buffer *buf = fz_new_buffer(ctx, 100); fz_try(ctx) { make_page_offset_hints(xref, opts, buf); pdf_update_stream(xref, xref->len-1, buf); opts->hintstream_len = buf->len; fz_drop_buffer(ctx, buf); } fz_catch(ctx) { fz_drop_buffer(ctx, buf); fz_rethrow(ctx); } } #ifdef DEBUG_WRITING static void dump_object_details(pdf_document *xref, pdf_write_options *opts) { int i; for (i = 0; i < xref->len; i++) { fprintf(stderr, "%d@%d: use=%d\n", i, opts->ofs_list[i], opts->use_list[i]); } } #endif void pdf_write_document(pdf_document *xref, char *filename, fz_write_options *fz_opts) { int lastfree; int num; pdf_write_options opts = { 0 }; fz_context *ctx; if (!xref || !fz_opts) return; ctx = xref->ctx; opts.out = fopen(filename, "wb"); if (!opts.out) fz_throw(ctx, "cannot open output file '%s'", filename); fz_try(ctx) { opts.do_expand = fz_opts ? fz_opts->do_expand : 0; opts.do_garbage = fz_opts ? fz_opts->do_garbage : 0; opts.do_ascii = fz_opts ? fz_opts->do_ascii: 0; opts.do_linear = fz_opts ? fz_opts->do_linear: 0; opts.start = 0; opts.main_xref_offset = INT_MIN; /* We deliberately make these arrays long enough to cope with * 1 to n access rather than 0..n-1, and add space for 2 new * extra entries that may be required for linearization. */ opts.use_list = fz_malloc_array(ctx, xref->len + 3, sizeof(int)); opts.ofs_list = fz_malloc_array(ctx, xref->len + 3, sizeof(int)); opts.gen_list = fz_calloc(ctx, xref->len + 3, sizeof(int)); opts.renumber_map = fz_malloc_array(ctx, xref->len + 3, sizeof(int)); opts.rev_renumber_map = fz_malloc_array(ctx, xref->len + 3, sizeof(int)); opts.rev_gen_list = fz_malloc_array(ctx, xref->len + 3, sizeof(int)); for (num = 0; num < xref->len; num++) { opts.use_list[num] = 0; opts.ofs_list[num] = 0; opts.renumber_map[num] = num; opts.rev_renumber_map[num] = num; opts.rev_gen_list[num] = xref->table[num].gen; } /* Make sure any objects hidden in compressed streams have been loaded */ preloadobjstms(xref); /* Sweep & mark objects from the trailer */ if (opts.do_garbage >= 1) sweepobj(xref, &opts, xref->trailer); else for (num = 0; num < xref->len; num++) opts.use_list[num] = 1; /* Coalesce and renumber duplicate objects */ if (opts.do_garbage >= 3) removeduplicateobjs(xref, &opts); /* Compact xref by renumbering and removing unused objects */ if (opts.do_garbage >= 2 || opts.do_linear) compactxref(xref, &opts); /* Make renumbering affect all indirect references and update xref */ if (opts.do_garbage >= 2 || opts.do_linear) renumberobjs(xref, &opts); if (opts.do_linear) { linearize(xref, &opts); } writeobjects(xref, &opts, 0); #ifdef DEBUG_WRITING dump_object_details(xref, &opts); #endif /* Construct linked list of free object slots */ lastfree = 0; for (num = 0; num < xref->len; num++) { if (!opts.use_list[num]) { opts.gen_list[num]++; opts.ofs_list[lastfree] = num; lastfree = num; } } if (opts.do_linear) { opts.main_xref_offset = ftell(opts.out); writexref(xref, &opts, 0, opts.start, 0, 0, opts.first_xref_offset); opts.file_len = ftell(opts.out); make_hint_stream(xref, &opts); opts.file_len += opts.hintstream_len; opts.main_xref_offset += opts.hintstream_len; update_linearization_params(xref, &opts); fseek(opts.out, 0, 0); writeobjects(xref, &opts, 1); padto(opts.out, opts.main_xref_offset); writexref(xref, &opts, 0, opts.start, 0, 0, opts.first_xref_offset); } else { opts.first_xref_offset = ftell(opts.out); writexref(xref, &opts, 0, xref->len, 1, 0, opts.first_xref_offset); } xref->dirty = 0; } fz_always(ctx) { #ifdef DEBUG_LINEARIZATION page_objects_dump(&opts); objects_dump(xref, &opts); #endif fz_free(ctx, opts.use_list); fz_free(ctx, opts.ofs_list); fz_free(ctx, opts.gen_list); fz_free(ctx, opts.renumber_map); fz_free(ctx, opts.rev_renumber_map); fz_free(ctx, opts.rev_gen_list); pdf_drop_obj(opts.linear_l); pdf_drop_obj(opts.linear_h0); pdf_drop_obj(opts.linear_h1); pdf_drop_obj(opts.linear_o); pdf_drop_obj(opts.linear_e); pdf_drop_obj(opts.linear_n); pdf_drop_obj(opts.linear_t); pdf_drop_obj(opts.hints_s); pdf_drop_obj(opts.hints_length); page_objects_list_destroy(ctx, opts.page_object_lists); fclose(opts.out); } fz_catch(ctx) { fz_rethrow(ctx); } }