diff options
author | Robin Watts <robin.watts@artifex.com> | 2012-04-25 20:40:29 +0100 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2012-04-28 09:44:49 +0100 |
commit | 4e3d351910f0927283fdc6911574ffe982ca678b (patch) | |
tree | 854e8367ae8906f957c3bb766ae578188e38cd61 | |
parent | 463e9b6c69cf9232da5ae9659eaad6e7050b594d (diff) | |
download | mupdf-4e3d351910f0927283fdc6911574ffe982ca678b.tar.xz |
Move guts of pdfclean into new pdf_write function.
Expose pdf_write function through the document interface.
-rw-r--r-- | apps/mupdfclean.c | 656 | ||||
-rw-r--r-- | fitz/doc_document.c | 7 | ||||
-rw-r--r-- | fitz/fitz-internal.h | 1 | ||||
-rw-r--r-- | fitz/fitz.h | 46 | ||||
-rw-r--r-- | pdf/mupdf.h | 2 | ||||
-rw-r--r-- | pdf/pdf_write.c | 662 | ||||
-rw-r--r-- | win32/libmupdf.vcproj | 4 |
7 files changed, 735 insertions, 643 deletions
diff --git a/apps/mupdfclean.c b/apps/mupdfclean.c index 5b95fdae..f2676782 100644 --- a/apps/mupdfclean.c +++ b/apps/mupdfclean.c @@ -14,29 +14,13 @@ static FILE *out = NULL; -enum -{ - expand_images = 1, - expand_fonts = 2, - expand_all = -1 -}; - -static char *uselist = NULL; -static int *ofslist = NULL; -static int *genlist = NULL; -static int *renumbermap = NULL; - -static int dogarbage = 0; -static int doexpand = 0; -static int doascii = 0; - static pdf_document *xref = NULL; static fz_context *ctx = NULL; static void usage(void) { fprintf(stderr, - "usage: pdfclean [options] input.pdf [output.pdf] [pages]\n" + "usage: mupdfclean [options] input.pdf [output.pdf] [pages]\n" "\t-p -\tpassword\n" "\t-g\tgarbage collect unused objects\n" "\t-gg\tin addition to -g compact xref table\n" @@ -50,251 +34,6 @@ static void usage(void) } /* - * Garbage collect objects not reachable from the trailer. - */ - -static pdf_obj *sweepref(pdf_obj *obj) -{ - int num = pdf_to_num(obj); - int gen = pdf_to_gen(obj); - - if (num < 0 || num >= xref->len) - return NULL; - if (uselist[num]) - return NULL; - - uselist[num] = 1; - - /* Bake in /Length in stream objects */ - fz_try(ctx) - { - if (pdf_is_stream(xref, num, gen)) - { - pdf_obj *len = pdf_dict_gets(obj, "Length"); - if (pdf_is_indirect(len)) - { - uselist[pdf_to_num(len)] = 0; - len = pdf_resolve_indirect(len); - pdf_dict_puts(obj, "Length", len); - } - } - } - fz_catch(ctx) - { - /* Leave broken */ - } - - return pdf_resolve_indirect(obj); -} - -static void sweepobj(pdf_obj *obj) -{ - int i; - - if (pdf_is_indirect(obj)) - obj = sweepref(obj); - - if (pdf_is_dict(obj)) - { - int n = pdf_dict_len(obj); - for (i = 0; i < n; i++) - sweepobj(pdf_dict_get_val(obj, i)); - } - - else if (pdf_is_array(obj)) - { - int n = pdf_array_len(obj); - for (i = 0; i < n; i++) - sweepobj(pdf_array_get(obj, i)); - } -} - -/* - * Scan for and remove duplicate objects (slow) - */ - -static void removeduplicateobjs(void) -{ - int num, other; - - 
for (num = 1; num < xref->len; num++) - { - /* Only compare an object to objects preceding it */ - for (other = 1; other < num; other++) - { - pdf_obj *a, *b; - - if (num == other || !uselist[num] || !uselist[other]) - continue; - - /* - * Comparing stream objects data contents would take too long. - * - * pdf_is_stream calls pdf_cache_object and ensures - * that the xref table has the objects loaded. - */ - fz_try(ctx) - { - if (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0)) - continue; - } - fz_catch(ctx) - { - /* Assume different */ - } - - a = xref->table[num].obj; - b = xref->table[other].obj; - - a = pdf_resolve_indirect(a); - b = pdf_resolve_indirect(b); - - if (pdf_objcmp(a, b)) - continue; - - /* Keep the lowest numbered object */ - renumbermap[num] = MIN(num, other); - renumbermap[other] = MIN(num, other); - uselist[MAX(num, other)] = 0; - - /* One duplicate was found, do not look for another */ - break; - } - } -} - -/* - * Renumber objects sequentially so the xref is more compact - */ - -static void compactxref(void) -{ - int num, newnum; - - /* - * Update renumbermap in-place, clustering all used - * objects together at low object ids. Objects that - * already should be renumbered will have their new - * object ids be updated to reflect the compaction. - */ - - newnum = 1; - for (num = 1; num < xref->len; num++) - { - if (uselist[num] && renumbermap[num] == num) - renumbermap[num] = newnum++; - else if (renumbermap[num] != num) - renumbermap[num] = renumbermap[renumbermap[num]]; - } -} - -/* - * Update indirect objects according to renumbering established when - * removing duplicate objects and compacting the xref. 
- */ - -static void renumberobj(pdf_obj *obj) -{ - int i; - fz_context *ctx = xref->ctx; - - if (pdf_is_dict(obj)) - { - int n = pdf_dict_len(obj); - for (i = 0; i < n; i++) - { - pdf_obj *key = pdf_dict_get_key(obj, i); - pdf_obj *val = pdf_dict_get_val(obj, i); - if (pdf_is_indirect(val)) - { - val = pdf_new_indirect(ctx, renumbermap[pdf_to_num(val)], 0, xref); - fz_dict_put(obj, key, val); - pdf_drop_obj(val); - } - else - { - renumberobj(val); - } - } - } - - else if (pdf_is_array(obj)) - { - int n = pdf_array_len(obj); - for (i = 0; i < n; i++) - { - pdf_obj *val = pdf_array_get(obj, i); - if (pdf_is_indirect(val)) - { - val = pdf_new_indirect(ctx, renumbermap[pdf_to_num(val)], 0, xref); - pdf_array_put(obj, i, val); - pdf_drop_obj(val); - } - else - { - renumberobj(val); - } - } - } -} - -static void renumberobjs(void) -{ - pdf_xref_entry *oldxref; - int newlen; - int num; - - /* Apply renumber map to indirect references in all objects in xref */ - renumberobj(xref->trailer); - for (num = 0; num < xref->len; num++) - { - pdf_obj *obj = xref->table[num].obj; - - if (pdf_is_indirect(obj)) - { - obj = pdf_new_indirect(ctx, renumbermap[pdf_to_num(obj)], 0, xref); - pdf_update_object(xref, num, 0, obj); - pdf_drop_obj(obj); - } - else - { - renumberobj(obj); - } - } - - /* Create new table for the reordered, compacted xref */ - oldxref = xref->table; - xref->table = fz_malloc_array(xref->ctx, xref->len, sizeof(pdf_xref_entry)); - xref->table[0] = oldxref[0]; - - /* Move used objects into the new compacted xref */ - newlen = 0; - for (num = 1; num < xref->len; num++) - { - if (uselist[num]) - { - if (newlen < renumbermap[num]) - newlen = renumbermap[num]; - xref->table[renumbermap[num]] = oldxref[num]; - } - else - { - if (oldxref[num].obj) - pdf_drop_obj(oldxref[num].obj); - } - } - - fz_free(xref->ctx, oldxref); - - /* Update the used objects count in compacted xref */ - xref->len = newlen + 1; - - /* Update list of used objects to fit with compacted xref */ - 
for (num = 1; num < xref->len; num++) - uselist[num] = 1; -} - -/* * Recreate page tree to only retain specified pages. */ @@ -415,330 +154,6 @@ static void retainpages(int argc, char **argv) } } -/* - * Make sure we have loaded objects from object streams. - */ - -static void preloadobjstms(void) -{ - pdf_obj *obj; - int num; - - for (num = 0; num < xref->len; num++) - { - if (xref->table[num].type == 'o') - { - obj = pdf_load_object(xref, num, 0); - pdf_drop_obj(obj); - } - } -} - -/* - * Save streams and objects to the output - */ - -static inline int isbinary(int c) -{ - if (c == '\n' || c == '\r' || c == '\t') - return 0; - return c < 32 || c > 127; -} - -static int isbinarystream(fz_buffer *buf) -{ - int i; - for (i = 0; i < buf->len; i++) - if (isbinary(buf->data[i])) - return 1; - return 0; -} - -static fz_buffer *hexbuf(unsigned char *p, int n) -{ - static const char hex[16] = "0123456789abcdef"; - fz_buffer *buf; - int x = 0; - - buf = fz_new_buffer(ctx, n * 2 + (n / 32) + 2); - - while (n--) - { - buf->data[buf->len++] = hex[*p >> 4]; - buf->data[buf->len++] = hex[*p & 15]; - if (++x == 32) - { - buf->data[buf->len++] = '\n'; - x = 0; - } - p++; - } - - buf->data[buf->len++] = '>'; - buf->data[buf->len++] = '\n'; - - return buf; -} - -static void addhexfilter(pdf_obj *dict) -{ - pdf_obj *f, *dp, *newf, *newdp; - pdf_obj *ahx, *nullobj; - - ahx = fz_new_name(ctx, "ASCIIHexDecode"); - nullobj = pdf_new_null(ctx); - newf = newdp = NULL; - - f = pdf_dict_gets(dict, "Filter"); - dp = pdf_dict_gets(dict, "DecodeParms"); - - if (pdf_is_name(f)) - { - newf = pdf_new_array(ctx, 2); - pdf_array_push(newf, ahx); - pdf_array_push(newf, f); - f = newf; - if (pdf_is_dict(dp)) - { - newdp = pdf_new_array(ctx, 2); - pdf_array_push(newdp, nullobj); - pdf_array_push(newdp, dp); - dp = newdp; - } - } - else if (pdf_is_array(f)) - { - pdf_array_insert(f, ahx); - if (pdf_is_array(dp)) - pdf_array_insert(dp, nullobj); - } - else - f = ahx; - - pdf_dict_puts(dict, "Filter", 
f); - if (dp) - pdf_dict_puts(dict, "DecodeParms", dp); - - pdf_drop_obj(ahx); - pdf_drop_obj(nullobj); - if (newf) - pdf_drop_obj(newf); - if (newdp) - pdf_drop_obj(newdp); -} - -static void copystream(pdf_obj *obj, int num, int gen) -{ - fz_buffer *buf, *tmp; - pdf_obj *newlen; - - buf = pdf_load_raw_stream(xref, num, gen); - - if (doascii && isbinarystream(buf)) - { - tmp = hexbuf(buf->data, buf->len); - fz_drop_buffer(ctx, buf); - buf = tmp; - - addhexfilter(obj); - - newlen = pdf_new_int(ctx, buf->len); - pdf_dict_puts(obj, "Length", newlen); - pdf_drop_obj(newlen); - } - - fprintf(out, "%d %d obj\n", num, gen); - pdf_fprint_obj(out, obj, doexpand == 0); - fprintf(out, "stream\n"); - fwrite(buf->data, 1, buf->len, out); - fprintf(out, "endstream\nendobj\n\n"); - - fz_drop_buffer(ctx, buf); -} - -static void expandstream(pdf_obj *obj, int num, int gen) -{ - fz_buffer *buf, *tmp; - pdf_obj *newlen; - - buf = pdf_load_stream(xref, num, gen); - - pdf_dict_dels(obj, "Filter"); - pdf_dict_dels(obj, "DecodeParms"); - - if (doascii && isbinarystream(buf)) - { - tmp = hexbuf(buf->data, buf->len); - fz_drop_buffer(ctx, buf); - buf = tmp; - - addhexfilter(obj); - } - - newlen = pdf_new_int(ctx, buf->len); - pdf_dict_puts(obj, "Length", newlen); - pdf_drop_obj(newlen); - - fprintf(out, "%d %d obj\n", num, gen); - pdf_fprint_obj(out, obj, doexpand == 0); - fprintf(out, "stream\n"); - fwrite(buf->data, 1, buf->len, out); - fprintf(out, "endstream\nendobj\n\n"); - - fz_drop_buffer(ctx, buf); -} - -static void writeobject(int num, int gen) -{ - pdf_obj *obj; - pdf_obj *type; - - obj = pdf_load_object(xref, num, gen); - - /* skip ObjStm and XRef objects */ - if (pdf_is_dict(obj)) - { - type = pdf_dict_gets(obj, "Type"); - if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "ObjStm")) - { - uselist[num] = 0; - pdf_drop_obj(obj); - return; - } - if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "XRef")) - { - uselist[num] = 0; - pdf_drop_obj(obj); - return; - } - } - - if 
(!pdf_is_stream(xref, num, gen)) - { - fprintf(out, "%d %d obj\n", num, gen); - pdf_fprint_obj(out, obj, doexpand == 0); - fprintf(out, "endobj\n\n"); - } - else - { - int dontexpand = 0; - if (doexpand != 0 && doexpand != expand_all) - { - pdf_obj *o; - - if ((o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "XObject")) && - (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Image"))) - dontexpand = !(doexpand & expand_images); - if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "Font")) - dontexpand = !(doexpand & expand_fonts); - if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "FontDescriptor")) - dontexpand = !(doexpand & expand_fonts); - if ((o = pdf_dict_gets(obj, "Length1")) != NULL) - dontexpand = !(doexpand & expand_fonts); - if ((o = pdf_dict_gets(obj, "Length2")) != NULL) - dontexpand = !(doexpand & expand_fonts); - if ((o = pdf_dict_gets(obj, "Length3")) != NULL) - dontexpand = !(doexpand & expand_fonts); - if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Type1C")) - dontexpand = !(doexpand & expand_fonts); - if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "CIDFontType0C")) - dontexpand = !(doexpand & expand_fonts); - } - if (doexpand && !dontexpand && !pdf_is_jpx_image(ctx, obj)) - expandstream(obj, num, gen); - else - copystream(obj, num, gen); - } - - pdf_drop_obj(obj); -} - -static void writexref(void) -{ - pdf_obj *trailer; - pdf_obj *obj; - int startxref; - int num; - - startxref = ftell(out); - - fprintf(out, "xref\n0 %d\n", xref->len); - for (num = 0; num < xref->len; num++) - { - if (uselist[num]) - fprintf(out, "%010d %05d n \n", ofslist[num], genlist[num]); - else - fprintf(out, "%010d %05d f \n", ofslist[num], genlist[num]); - } - fprintf(out, "\n"); - - trailer = pdf_new_dict(ctx, 5); - - obj = pdf_new_int(ctx, xref->len); - pdf_dict_puts(trailer, "Size", obj); - pdf_drop_obj(obj); - - obj = pdf_dict_gets(xref->trailer, "Info"); - if (obj) - pdf_dict_puts(trailer, "Info", 
obj); - - obj = pdf_dict_gets(xref->trailer, "Root"); - if (obj) - pdf_dict_puts(trailer, "Root", obj); - - obj = pdf_dict_gets(xref->trailer, "ID"); - if (obj) - pdf_dict_puts(trailer, "ID", obj); - - fprintf(out, "trailer\n"); - pdf_fprint_obj(out, trailer, doexpand == 0); - fprintf(out, "\n"); - - pdf_drop_obj(trailer); - - fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref); -} - -static void writepdf(void) -{ - int lastfree; - int num; - - for (num = 0; num < xref->len; num++) - { - if (xref->table[num].type == 'f') - genlist[num] = xref->table[num].gen; - if (xref->table[num].type == 'n') - genlist[num] = xref->table[num].gen; - if (xref->table[num].type == 'o') - genlist[num] = 0; - - if (dogarbage && !uselist[num]) - continue; - - if (xref->table[num].type == 'n' || xref->table[num].type == 'o') - { - uselist[num] = 1; - ofslist[num] = ftell(out); - writeobject(num, genlist[num]); - } - } - - /* Construct linked list of free object slots */ - lastfree = 0; - for (num = 0; num < xref->len; num++) - { - if (!uselist[num]) - { - genlist[num]++; - ofslist[lastfree] = num; - lastfree = num; - } - } - - writexref(); -} - #ifdef MUPDF_COMBINED_EXE int pdfclean_main(int argc, char **argv) #else @@ -748,19 +163,24 @@ int main(int argc, char **argv) char *infile; char *outfile = "out.pdf"; char *password = ""; - int c, num; + int c; int subset; + fz_write_options opts; + + opts.dogarbage = 0; + opts.doexpand = 0; + opts.doascii = 0; while ((c = fz_getopt(argc, argv, "adfgip:")) != -1) { switch (c) { case 'p': password = fz_optarg; break; - case 'g': dogarbage ++; break; - case 'd': doexpand ^= expand_all; break; - case 'f': doexpand ^= expand_fonts; break; - case 'i': doexpand ^= expand_images; break; - case 'a': doascii ++; break; + case 'g': opts.dogarbage ++; break; + case 'd': opts.doexpand ^= fz_expand_all; break; + case 'f': opts.doexpand ^= fz_expand_fonts; break; + case 'i': opts.doexpand ^= fz_expand_images; break; + case 'a': opts.doascii ++; break; 
default: usage(); break; } } @@ -792,61 +212,11 @@ int main(int argc, char **argv) if (!pdf_authenticate_password(xref, password)) fz_throw(ctx, "cannot authenticate password: %s", infile); - out = fopen(outfile, "wb"); - if (!out) - fz_throw(ctx, "cannot open output file '%s'", outfile); - - fprintf(out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10); - fprintf(out, "%%\316\274\341\277\246\n\n"); - - uselist = fz_malloc_array(ctx, xref->len + 1, sizeof(char)); - ofslist = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); - genlist = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); - renumbermap = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); - - for (num = 0; num < xref->len; num++) - { - uselist[num] = 0; - ofslist[num] = 0; - genlist[num] = 0; - renumbermap[num] = num; - } - - /* Make sure any objects hidden in compressed streams have been loaded */ - preloadobjstms(); - /* Only retain the specified subset of the pages */ if (subset) retainpages(argc, argv); - /* Sweep & mark objects from the trailer */ - if (dogarbage >= 1) - sweepobj(xref->trailer); - - /* Coalesce and renumber duplicate objects */ - if (dogarbage >= 3) - removeduplicateobjs(); - - /* Compact xref by renumbering and removing unused objects */ - if (dogarbage >= 2) - compactxref(); - - /* Make renumbering affect all indirect references and update xref */ - /* Do not renumber objects if encryption is in use, as the object - * numbers are baked into the streams/strings, and we can't currently - * cope with moving them. See bug 692627. 
*/ - if (dogarbage >= 2 && !xref->crypt) - renumberobjs(); - - writepdf(); - - if (fclose(out)) - fz_throw(ctx, "cannot close output file '%s'", outfile); - - fz_free(xref->ctx, uselist); - fz_free(xref->ctx, ofslist); - fz_free(xref->ctx, genlist); - fz_free(xref->ctx, renumbermap); + pdf_write(xref, outfile, &opts); pdf_close_document(xref); fz_free_context(ctx); diff --git a/fitz/doc_document.c b/fitz/doc_document.c index 22229892..83dbe53f 100644 --- a/fitz/doc_document.c +++ b/fitz/doc_document.c @@ -129,3 +129,10 @@ fz_meta(fz_document *doc, int key, void *ptr, int size) return doc->meta(doc, key, ptr, size); return FZ_META_UNKNOWN_KEY; } + +void +fz_write(fz_document *doc, char *filename, fz_write_opts *opts) +{ + if (doc && doc->write) + doc->write(doc, filename, opts); +}
\ No newline at end of file diff --git a/fitz/fitz-internal.h b/fitz/fitz-internal.h index bd19886c..1e54c900 100644 --- a/fitz/fitz-internal.h +++ b/fitz/fitz-internal.h @@ -1080,6 +1080,7 @@ struct fz_document_s void (*run_page)(fz_document *doc, fz_page *page, fz_device *dev, fz_matrix transform, fz_cookie *cookie); void (*free_page)(fz_document *doc, fz_page *page); int (*meta)(fz_document *doc, int key, void *ptr, int size); + void (*write)(fz_document *doc, char *filename, fz_write_options *opts); }; #endif diff --git a/fitz/fitz.h b/fitz/fitz.h index 04215c71..61f80756 100644 --- a/fitz/fitz.h +++ b/fitz/fitz.h @@ -2231,4 +2231,50 @@ enum FZ_META_INFO = 4, }; +typedef struct fz_write_options_s fz_write_options; + +/* + In calls to fz_write, the following options structure can be used + to control aspects of the writing process. This structure may grow + in future, and should be zero-filled to allow forwards compatibility. +*/ +struct fz_write_options_s +{ + int doascii; /* If non-zero then attempt (where possible) to + make the output ascii. */ + int doexpand; /* Bitflags; each non zero bit indicates an aspect + of the file that should be 'expanded' on + writing. */ + int dogarbage; /* If non-zero then attempt (where possible) to + garbage collect the file before writing. */ +}; + +/* An enumeration of bitflags to use in the above 'doexpand' field of + fz_write_options. +*/ +enum +{ + fz_expand_images = 1, + fz_expand_fonts = 2, + fz_expand_all = -1 +}; + +/* + fz_write: Write a document out. + + (In development - Subject to change in future versions) + + Save a copy of the current document in its original format. + Internally the document may change. + + doc: The document to save. + + filename: The filename to save to. + + opts: NULL, or a pointer to an options structure. + + May throw exceptions. 
+*/ +void fz_write(fz_document *doc, char *filename, fz_write_options *opts); + #endif diff --git a/pdf/mupdf.h b/pdf/mupdf.h index b88f7423..72c5c4cf 100644 --- a/pdf/mupdf.h +++ b/pdf/mupdf.h @@ -144,6 +144,8 @@ pdf_document *pdf_open_document_with_stream(fz_stream *file); */ void pdf_close_document(pdf_document *doc); +void pdf_write(pdf_document *doc, char *filename, fz_write_options *opts); + int pdf_needs_password(pdf_document *doc); int pdf_authenticate_password(pdf_document *doc, char *pw); diff --git a/pdf/pdf_write.c b/pdf/pdf_write.c new file mode 100644 index 00000000..e14cfa28 --- /dev/null +++ b/pdf/pdf_write.c @@ -0,0 +1,662 @@ +#include "fitz.h" +#include "mupdf-internal.h" + +typedef struct pdf_write_options_s pdf_write_options; + +struct pdf_write_options_s +{ + FILE *out; + int doascii; + int doexpand; + int dogarbage; + char *uselist; + int *ofslist; + int *genlist; + int *renumbermap; +}; + +/* + * Garbage collect objects not reachable from the trailer. + */ + +static pdf_obj *sweepref(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj) +{ + int num = pdf_to_num(obj); + int gen = pdf_to_gen(obj); + fz_context *ctx = xref->ctx; + + if (num < 0 || num >= xref->len) + return NULL; + if (opts->uselist[num]) + return NULL; + + opts->uselist[num] = 1; + + /* Bake in /Length in stream objects */ + fz_try(ctx) + { + if (pdf_is_stream(xref, num, gen)) + { + pdf_obj *len = pdf_dict_gets(obj, "Length"); + if (pdf_is_indirect(len)) + { + opts->uselist[pdf_to_num(len)] = 0; + len = pdf_resolve_indirect(len); + pdf_dict_puts(obj, "Length", len); + } + } + } + fz_catch(ctx) + { + /* Leave broken */ + } + + return pdf_resolve_indirect(obj); +} + +static void sweepobj(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj) +{ + int i; + + if (pdf_is_indirect(obj)) + obj = sweepref(xref, opts, obj); + + if (pdf_is_dict(obj)) + { + int n = pdf_dict_len(obj); + for (i = 0; i < n; i++) + sweepobj(xref, opts, pdf_dict_get_val(obj, i)); + } + + else if 
(pdf_is_array(obj)) + { + int n = pdf_array_len(obj); + for (i = 0; i < n; i++) + sweepobj(xref, opts, pdf_array_get(obj, i)); + } +} + +/* + * Scan for and remove duplicate objects (slow) + */ + +static void removeduplicateobjs(pdf_document *xref, pdf_write_options *opts) +{ + int num, other; + fz_context *ctx = xref->ctx; + + for (num = 1; num < xref->len; num++) + { + /* Only compare an object to objects preceding it */ + for (other = 1; other < num; other++) + { + pdf_obj *a, *b; + int match; + + if (num == other || !opts->uselist[num] || !opts->uselist[other]) + continue; + + /* + * Comparing stream objects data contents would take too long. + * + * pdf_is_stream calls pdf_cache_object and ensures + * that the xref table has the objects loaded. + */ + fz_try(ctx) + { + match = (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0)); + } + fz_catch(ctx) + { + /* Assume different */ + match = 0; + } + if (match) + continue; + + a = xref->table[num].obj; + b = xref->table[other].obj; + + a = pdf_resolve_indirect(a); + b = pdf_resolve_indirect(b); + + if (pdf_objcmp(a, b)) + continue; + + /* Keep the lowest numbered object */ + opts->renumbermap[num] = MIN(num, other); + opts->renumbermap[other] = MIN(num, other); + opts->uselist[MAX(num, other)] = 0; + + /* One duplicate was found, do not look for another */ + break; + } + } +} + +/* + * Renumber objects sequentially so the xref is more compact + */ + +static void compactxref(pdf_document *xref, pdf_write_options *opts) +{ + int num, newnum; + + /* + * Update renumbermap in-place, clustering all used + * objects together at low object ids. Objects that + * already should be renumbered will have their new + * object ids be updated to reflect the compaction. 
+ */ + + newnum = 1; + for (num = 1; num < xref->len; num++) + { + if (opts->uselist[num] && opts->renumbermap[num] == num) + opts->renumbermap[num] = newnum++; + else if (opts->renumbermap[num] != num) + opts->renumbermap[num] = opts->renumbermap[opts->renumbermap[num]]; + } +} + +/* + * Update indirect objects according to renumbering established when + * removing duplicate objects and compacting the xref. + */ + +static void renumberobj(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj) +{ + int i; + fz_context *ctx = xref->ctx; + + if (pdf_is_dict(obj)) + { + int n = pdf_dict_len(obj); + for (i = 0; i < n; i++) + { + pdf_obj *key = pdf_dict_get_key(obj, i); + pdf_obj *val = pdf_dict_get_val(obj, i); + if (pdf_is_indirect(val)) + { + val = pdf_new_indirect(ctx, opts->renumbermap[pdf_to_num(val)], 0, xref); + fz_dict_put(obj, key, val); + pdf_drop_obj(val); + } + else + { + renumberobj(xref, opts, val); + } + } + } + + else if (pdf_is_array(obj)) + { + int n = pdf_array_len(obj); + for (i = 0; i < n; i++) + { + pdf_obj *val = pdf_array_get(obj, i); + if (pdf_is_indirect(val)) + { + val = pdf_new_indirect(ctx, opts->renumbermap[pdf_to_num(val)], 0, xref); + pdf_array_put(obj, i, val); + pdf_drop_obj(val); + } + else + { + renumberobj(xref, opts, val); + } + } + } +} + +static void renumberobjs(pdf_document *xref, pdf_write_options *opts) +{ + pdf_xref_entry *oldxref; + int newlen; + int num; + fz_context *ctx = xref->ctx; + + /* Apply renumber map to indirect references in all objects in xref */ + renumberobj(xref, opts, xref->trailer); + for (num = 0; num < xref->len; num++) + { + pdf_obj *obj = xref->table[num].obj; + + if (pdf_is_indirect(obj)) + { + obj = pdf_new_indirect(ctx, opts->renumbermap[pdf_to_num(obj)], 0, xref); + pdf_update_object(xref, num, 0, obj); + pdf_drop_obj(obj); + } + else + { + renumberobj(xref, opts, obj); + } + } + + /* Create new table for the reordered, compacted xref */ + oldxref = xref->table; + xref->table = 
fz_malloc_array(xref->ctx, xref->len, sizeof(pdf_xref_entry)); + xref->table[0] = oldxref[0]; + + /* Move used objects into the new compacted xref */ + newlen = 0; + for (num = 1; num < xref->len; num++) + { + if (opts->uselist[num]) + { + if (newlen < opts->renumbermap[num]) + newlen = opts->renumbermap[num]; + xref->table[opts->renumbermap[num]] = oldxref[num]; + } + else + { + if (oldxref[num].obj) + pdf_drop_obj(oldxref[num].obj); + } + } + + fz_free(xref->ctx, oldxref); + + /* Update the used objects count in compacted xref */ + xref->len = newlen + 1; + + /* Update list of used objects to fit with compacted xref */ + for (num = 1; num < xref->len; num++) + opts->uselist[num] = 1; +} + +/* + * Make sure we have loaded objects from object streams. + */ + +static void preloadobjstms(pdf_document *xref) +{ + pdf_obj *obj; + int num; + + for (num = 0; num < xref->len; num++) + { + if (xref->table[num].type == 'o') + { + obj = pdf_load_object(xref, num, 0); + pdf_drop_obj(obj); + } + } +} + +/* + * Save streams and objects to the output + */ + +static inline int isbinary(int c) +{ + if (c == '\n' || c == '\r' || c == '\t') + return 0; + return c < 32 || c > 127; +} + +static int isbinarystream(fz_buffer *buf) +{ + int i; + for (i = 0; i < buf->len; i++) + if (isbinary(buf->data[i])) + return 1; + return 0; +} + +static fz_buffer *hexbuf(fz_context *ctx, unsigned char *p, int n) +{ + static const char hex[16] = "0123456789abcdef"; + fz_buffer *buf; + int x = 0; + + buf = fz_new_buffer(ctx, n * 2 + (n / 32) + 2); + + while (n--) + { + buf->data[buf->len++] = hex[*p >> 4]; + buf->data[buf->len++] = hex[*p & 15]; + if (++x == 32) + { + buf->data[buf->len++] = '\n'; + x = 0; + } + p++; + } + + buf->data[buf->len++] = '>'; + buf->data[buf->len++] = '\n'; + + return buf; +} + +static void addhexfilter(pdf_document *xref, pdf_obj *dict) +{ + pdf_obj *f, *dp, *newf, *newdp; + pdf_obj *ahx, *nullobj; + fz_context *ctx = xref->ctx; + + ahx = fz_new_name(ctx, 
"ASCIIHexDecode"); + nullobj = pdf_new_null(ctx); + newf = newdp = NULL; + + f = pdf_dict_gets(dict, "Filter"); + dp = pdf_dict_gets(dict, "DecodeParms"); + + if (pdf_is_name(f)) + { + newf = pdf_new_array(ctx, 2); + pdf_array_push(newf, ahx); + pdf_array_push(newf, f); + f = newf; + if (pdf_is_dict(dp)) + { + newdp = pdf_new_array(ctx, 2); + pdf_array_push(newdp, nullobj); + pdf_array_push(newdp, dp); + dp = newdp; + } + } + else if (pdf_is_array(f)) + { + pdf_array_insert(f, ahx); + if (pdf_is_array(dp)) + pdf_array_insert(dp, nullobj); + } + else + f = ahx; + + pdf_dict_puts(dict, "Filter", f); + if (dp) + pdf_dict_puts(dict, "DecodeParms", dp); + + pdf_drop_obj(ahx); + pdf_drop_obj(nullobj); + if (newf) + pdf_drop_obj(newf); + if (newdp) + pdf_drop_obj(newdp); +} + +static void copystream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj, int num, int gen) +{ + fz_buffer *buf, *tmp; + pdf_obj *newlen; + fz_context *ctx = xref->ctx; + + buf = pdf_load_raw_stream(xref, num, gen); + + if (opts->doascii && isbinarystream(buf)) + { + tmp = hexbuf(ctx, buf->data, buf->len); + fz_drop_buffer(ctx, buf); + buf = tmp; + + addhexfilter(xref, obj); + + newlen = pdf_new_int(ctx, buf->len); + pdf_dict_puts(obj, "Length", newlen); + pdf_drop_obj(newlen); + } + + fprintf(opts->out, "%d %d obj\n", num, gen); + pdf_fprint_obj(opts->out, obj, opts->doexpand == 0); + fprintf(opts->out, "stream\n"); + fwrite(buf->data, 1, buf->len, opts->out); + fprintf(opts->out, "endstream\nendobj\n\n"); + + fz_drop_buffer(ctx, buf); +} + +static void expandstream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj, int num, int gen) +{ + fz_buffer *buf, *tmp; + pdf_obj *newlen; + fz_context *ctx = xref->ctx; + + buf = pdf_load_stream(xref, num, gen); + + pdf_dict_dels(obj, "Filter"); + pdf_dict_dels(obj, "DecodeParms"); + + if (opts->doascii && isbinarystream(buf)) + { + tmp = hexbuf(ctx, buf->data, buf->len); + fz_drop_buffer(ctx, buf); + buf = tmp; + + addhexfilter(xref, obj); + 
} + + newlen = pdf_new_int(ctx, buf->len); + pdf_dict_puts(obj, "Length", newlen); + pdf_drop_obj(newlen); + + fprintf(opts->out, "%d %d obj\n", num, gen); + pdf_fprint_obj(opts->out, obj, opts->doexpand == 0); + fprintf(opts->out, "stream\n"); + fwrite(buf->data, 1, buf->len, opts->out); + fprintf(opts->out, "endstream\nendobj\n\n"); + + fz_drop_buffer(ctx, buf); +} + +static void writeobject(pdf_document *xref, pdf_write_options *opts, int num, int gen) +{ + pdf_obj *obj; + pdf_obj *type; + fz_context *ctx = xref->ctx; + + obj = pdf_load_object(xref, num, gen); + + /* skip ObjStm and XRef objects */ + if (pdf_is_dict(obj)) + { + type = pdf_dict_gets(obj, "Type"); + if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "ObjStm")) + { + opts->uselist[num] = 0; + pdf_drop_obj(obj); + return; + } + if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "XRef")) + { + opts->uselist[num] = 0; + pdf_drop_obj(obj); + return; + } + } + + if (!pdf_is_stream(xref, num, gen)) + { + fprintf(opts->out, "%d %d obj\n", num, gen); + pdf_fprint_obj(opts->out, obj, opts->doexpand == 0); + fprintf(opts->out, "endobj\n\n"); + } + else + { + int dontexpand = 0; + if (opts->doexpand != 0 && opts->doexpand != fz_expand_all) + { + pdf_obj *o; + + if ((o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "XObject")) && + (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Image"))) + dontexpand = !(opts->doexpand & fz_expand_images); + if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "Font")) + dontexpand = !(opts->doexpand & fz_expand_fonts); + if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "FontDescriptor")) + dontexpand = !(opts->doexpand & fz_expand_fonts); + if ((o = pdf_dict_gets(obj, "Length1")) != NULL) + dontexpand = !(opts->doexpand & fz_expand_fonts); + if ((o = pdf_dict_gets(obj, "Length2")) != NULL) + dontexpand = !(opts->doexpand & fz_expand_fonts); + if ((o = pdf_dict_gets(obj, "Length3")) != NULL) + dontexpand = !(opts->doexpand & 
fz_expand_fonts); + if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Type1C")) + dontexpand = !(opts->doexpand & fz_expand_fonts); + if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "CIDFontType0C")) + dontexpand = !(opts->doexpand & fz_expand_fonts); + } + if (opts->doexpand && !dontexpand && !pdf_is_jpx_image(ctx, obj)) + expandstream(xref, opts, obj, num, gen); + else + copystream(xref, opts, obj, num, gen); + } + + pdf_drop_obj(obj); +} + +static void writexref(pdf_document *xref, pdf_write_options *opts) +{ + pdf_obj *trailer; + pdf_obj *obj; + int startxref; + int num; + fz_context *ctx = xref->ctx; + + startxref = ftell(opts->out); + + fprintf(opts->out, "xref\n0 %d\n", xref->len); + for (num = 0; num < xref->len; num++) + { + if (opts->uselist[num]) + fprintf(opts->out, "%010d %05d n \n", opts->ofslist[num], opts->genlist[num]); + else + fprintf(opts->out, "%010d %05d f \n", opts->ofslist[num], opts->genlist[num]); + } + fprintf(opts->out, "\n"); + + trailer = pdf_new_dict(ctx, 5); + + obj = pdf_new_int(ctx, xref->len); + pdf_dict_puts(trailer, "Size", obj); + pdf_drop_obj(obj); + + obj = pdf_dict_gets(xref->trailer, "Info"); + if (obj) + pdf_dict_puts(trailer, "Info", obj); + + obj = pdf_dict_gets(xref->trailer, "Root"); + if (obj) + pdf_dict_puts(trailer, "Root", obj); + + obj = pdf_dict_gets(xref->trailer, "ID"); + if (obj) + pdf_dict_puts(trailer, "ID", obj); + + fprintf(opts->out, "trailer\n"); + pdf_fprint_obj(opts->out, trailer, opts->doexpand == 0); + fprintf(opts->out, "\n"); + + pdf_drop_obj(trailer); + + fprintf(opts->out, "startxref\n%d\n%%%%EOF\n", startxref); +} + +void pdf_write(pdf_document *xref, char *filename, fz_write_options *fz_opts) +{ + int lastfree; + int num; + pdf_write_options opts = { 0 }; + fz_context *ctx; + + if (!xref || !fz_opts) + return; + + ctx = xref->ctx; + + opts.out = fopen(filename, "wb"); + if (!opts.out) + fz_throw(ctx, "cannot open output file '%s'", filename); + + fz_try(ctx) + { + 
opts.doexpand = fz_opts ? fz_opts->doexpand : 0; + opts.dogarbage = fz_opts ? fz_opts->dogarbage : 0; + opts.doascii = fz_opts ? fz_opts->doascii: 0; + opts.uselist = fz_malloc_array(ctx, xref->len + 1, sizeof(char)); + opts.ofslist = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); + opts.genlist = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); + opts.renumbermap = fz_malloc_array(ctx, xref->len + 1, sizeof(int)); + + fprintf(opts.out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10); + fprintf(opts.out, "%%\316\274\341\277\246\n\n"); + + for (num = 0; num < xref->len; num++) + { + opts.uselist[num] = 0; + opts.ofslist[num] = 0; + opts.renumbermap[num] = num; + } + + /* Make sure any objects hidden in compressed streams have been loaded */ + preloadobjstms(xref); + + /* Sweep & mark objects from the trailer */ + if (opts.dogarbage >= 1) + sweepobj(xref, &opts, xref->trailer); + + /* Coalesce and renumber duplicate objects */ + if (opts.dogarbage >= 3) + removeduplicateobjs(xref, &opts); + + /* Compact xref by renumbering and removing unused objects */ + if (opts.dogarbage >= 2) + compactxref(xref, &opts); + + /* Make renumbering affect all indirect references and update xref */ + /* Do not renumber objects if encryption is in use, as the object + * numbers are baked into the streams/strings, and we can't currently + * cope with moving them. See bug 692627. 
*/ + if (opts.dogarbage >= 2 && !xref->crypt) + renumberobjs(xref, &opts); + + for (num = 0; num < xref->len; num++) + { + if (xref->table[num].type == 'f') + opts.genlist[num] = xref->table[num].gen; + if (xref->table[num].type == 'n') + opts.genlist[num] = xref->table[num].gen; + if (xref->table[num].type == 'o') + opts.genlist[num] = 0; + + if (opts.dogarbage && !opts.uselist[num]) + continue; + + if (xref->table[num].type == 'n' || xref->table[num].type == 'o') + { + opts.uselist[num] = 1; + opts.ofslist[num] = ftell(opts.out); + writeobject(xref, &opts, num, opts.genlist[num]); + } + } + + /* Construct linked list of free object slots */ + lastfree = 0; + for (num = 0; num < xref->len; num++) + { + if (!opts.uselist[num]) + { + opts.genlist[num]++; + opts.ofslist[lastfree] = num; + lastfree = num; + } + } + + writexref(xref, &opts); + } + fz_always(ctx) + { + fz_free(ctx, opts.uselist); + fz_free(ctx, opts.ofslist); + fz_free(ctx, opts.genlist); + fz_free(ctx, opts.renumbermap); + fclose(opts.out); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } +} diff --git a/win32/libmupdf.vcproj b/win32/libmupdf.vcproj index b984cc8f..98b1aa5e 100644 --- a/win32/libmupdf.vcproj +++ b/win32/libmupdf.vcproj @@ -334,6 +334,10 @@ > </File> <File + RelativePath="..\pdf\pdf_write.c" + > + </File> + <File RelativePath="..\pdf\pdf_xobject.c" > </File> |