diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2013-06-19 15:29:44 +0200 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2013-06-20 16:45:35 +0200 |
commit | 0a927854a10e1e6b9770a81e2e1d9f3093631757 (patch) | |
tree | 3d65d820d9fdba2d0d394d99c36290c851b78ca0 /source/tools/pdfextract.c | |
parent | 1ae8f19179c5f0f8c6352b3c7855465325d5449a (diff) | |
download | mupdf-0a927854a10e1e6b9770a81e2e1d9f3093631757.tar.xz |
Rearrange source files.
Diffstat (limited to 'source/tools/pdfextract.c')
-rw-r--r-- | source/tools/pdfextract.c | 231 |
1 files changed, 231 insertions, 0 deletions
diff --git a/source/tools/pdfextract.c b/source/tools/pdfextract.c new file mode 100644 index 00000000..6e8e4aec --- /dev/null +++ b/source/tools/pdfextract.c @@ -0,0 +1,231 @@ +/* + * pdfextract -- the ultimate way to extract images and fonts from pdfs + */ + +#include "mupdf/pdf.h" + +static pdf_document *doc = NULL; +static fz_context *ctx = NULL; +static int dorgb = 0; + +static void usage(void) +{ + fprintf(stderr, "usage: mutool extract [options] file.pdf [object numbers]\n"); + fprintf(stderr, "\t-p\tpassword\n"); + fprintf(stderr, "\t-r\tconvert images to rgb\n"); + exit(1); +} + +static int isimage(pdf_obj *obj) +{ + pdf_obj *type = pdf_dict_gets(obj, "Subtype"); + return pdf_is_name(type) && !strcmp(pdf_to_name(type), "Image"); +} + +static int isfontdesc(pdf_obj *obj) +{ + pdf_obj *type = pdf_dict_gets(obj, "Type"); + return pdf_is_name(type) && !strcmp(pdf_to_name(type), "FontDescriptor"); +} + +static void writepixmap(fz_context *ctx, fz_pixmap *pix, char *file, int rgb) +{ + char name[1024]; + fz_pixmap *converted = NULL; + + if (!pix) + return; + + if (rgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx)) + { + fz_irect bbox; + converted = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), fz_pixmap_bbox(ctx, pix, &bbox)); + fz_convert_pixmap(ctx, converted, pix); + pix = converted; + } + + if (pix->n <= 4) + { + sprintf(name, "%s.png", file); + printf("extracting image %s\n", name); + fz_write_png(ctx, pix, name, 0); + } + else + { + sprintf(name, "%s.pam", file); + printf("extracting image %s\n", name); + fz_write_pam(ctx, pix, name, 0); + } + + fz_drop_pixmap(ctx, converted); +} + +static void saveimage(int num) +{ + fz_image *image; + fz_pixmap *pix; + pdf_obj *ref; + char name[32]; + + ref = pdf_new_indirect(ctx, num, 0, doc); + + /* TODO: detect DCTD and save as jpeg */ + + image = pdf_load_image(doc, ref); + pix = fz_image_to_pixmap(ctx, image, 0, 0); + fz_drop_image(ctx, image); + + sprintf(name, "img-%04d", num); + writepixmap(ctx, pix, name, dorgb); + + fz_drop_pixmap(ctx, pix); + pdf_drop_obj(ref); +} + +static void savefont(pdf_obj *dict, int num) +{ + char name[1024]; + char *subtype; + fz_buffer *buf; + pdf_obj *stream = NULL; + pdf_obj *obj; + char *ext = ""; + FILE *f; + char *fontname = "font"; + int n, len; + unsigned char *data; + + obj = pdf_dict_gets(dict, "FontName"); + if (obj) + fontname = pdf_to_name(obj); + + obj = pdf_dict_gets(dict, "FontFile"); + if (obj) + { + stream = obj; + ext = "pfa"; + } + + obj = pdf_dict_gets(dict, "FontFile2"); + if (obj) + { + stream = obj; + ext = "ttf"; + } + + obj = pdf_dict_gets(dict, "FontFile3"); + if (obj) + { + stream = obj; + + obj = pdf_dict_gets(obj, "Subtype"); + if (obj && !pdf_is_name(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "Invalid font descriptor subtype"); + + subtype = pdf_to_name(obj); + if (!strcmp(subtype, "Type1C")) + ext = "cff"; + else if (!strcmp(subtype, "CIDFontType0C")) + ext = "cid"; + else if (!strcmp(subtype, "OpenType")) + ext = "otf"; + else + fz_throw(ctx, FZ_ERROR_GENERIC, "Unhandled font type '%s'", subtype); + } + + if (!stream) + { + fz_warn(ctx, "Unhandled font type"); + return; + } + + buf = pdf_load_stream(doc, pdf_to_num(stream), pdf_to_gen(stream)); + + sprintf(name, "%s-%04d.%s", fontname, num, ext); + printf("extracting font %s\n", name); + + f = fopen(name, "wb"); + if (!f) + fz_throw(ctx, FZ_ERROR_GENERIC, "Error creating font file"); + + len = fz_buffer_storage(ctx, buf, &data); + n = fwrite(data, 1, len, f); + if (n < len) + fz_throw(ctx, FZ_ERROR_GENERIC, "Error writing font file"); + + if (fclose(f) < 0) + fz_throw(ctx, FZ_ERROR_GENERIC, "Error closing font file"); + + fz_drop_buffer(ctx, buf); +} + +static void showobject(int num) +{ + pdf_obj *obj; + + if (!doc) + fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified"); + + obj = pdf_load_object(doc, num, 0); + + if (isimage(obj)) + saveimage(num); + else if (isfontdesc(obj)) + savefont(obj, num); + + pdf_drop_obj(obj); +} + +int pdfextract_main(int argc, char **argv) +{ + char *infile; + char *password = ""; + int c, o; + + while ((c = fz_getopt(argc, argv, "p:r")) != -1) + { + switch (c) + { + case 'p': password = fz_optarg; break; + case 'r': dorgb++; break; + default: usage(); break; + } + } + + if (fz_optind == argc) + usage(); + + infile = argv[fz_optind++]; + + ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); + if (!ctx) + { + fprintf(stderr, "cannot initialise context\n"); + exit(1); + } + + doc = pdf_open_document_no_run(ctx, infile); + if (pdf_needs_password(doc)) + if (!pdf_authenticate_password(doc, password)) + fz_throw(ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile); + + if (fz_optind == argc) + { + int len = pdf_count_objects(doc); + for (o = 0; o < len; o++) + showobject(o); + } + else + { + while (fz_optind < argc) + { + showobject(atoi(argv[fz_optind])); + fz_optind++; + } + } + + pdf_close_document(doc); + fz_flush_warnings(ctx); + fz_free_context(ctx); + return 0; +} |