summary | refs | log | tree | commit | diff
path: root/source/tools/pdfextract.c
diff options
context:
space:
mode:
author: Tor Andersson <tor.andersson@artifex.com> 2013-06-19 15:29:44 +0200
committer: Tor Andersson <tor.andersson@artifex.com> 2013-06-20 16:45:35 +0200
commit: 0a927854a10e1e6b9770a81e2e1d9f3093631757 (patch)
tree: 3d65d820d9fdba2d0d394d99c36290c851b78ca0 /source/tools/pdfextract.c
parent: 1ae8f19179c5f0f8c6352b3c7855465325d5449a (diff)
download: mupdf-0a927854a10e1e6b9770a81e2e1d9f3093631757.tar.xz
Rearrange source files.
Diffstat (limited to 'source/tools/pdfextract.c')
-rw-r--r-- source/tools/pdfextract.c 231
1 files changed, 231 insertions, 0 deletions
diff --git a/source/tools/pdfextract.c b/source/tools/pdfextract.c
new file mode 100644
index 00000000..6e8e4aec
--- /dev/null
+++ b/source/tools/pdfextract.c
@@ -0,0 +1,231 @@
+/*
+ * pdfextract -- the ultimate way to extract images and fonts from pdfs
+ */
+
+#include "mupdf/pdf.h"
+
+/* Document being processed; assigned in pdfextract_main and read by the helpers below. */
+static pdf_document *doc = NULL;
+/* MuPDF context; created in pdfextract_main and shared by every helper in this file. */
+static fz_context *ctx = NULL;
+/* Incremented once per -r option; passed to writepixmap as its rgb flag. */
+static int dorgb = 0;
+
+/*
+ * Print the command-line synopsis to stderr and terminate the process
+ * with exit status 1. Never returns.
+ */
+static void usage(void)
+{
+	fputs("usage: mutool extract [options] file.pdf [object numbers]\n", stderr);
+	fputs("\t-p\tpassword\n", stderr);
+	fputs("\t-r\tconvert images to rgb\n", stderr);
+	exit(1);
+}
+
+/* Return non-zero when the "Subtype" entry of obj is the name "Image". */
+static int isimage(pdf_obj *obj)
+{
+	pdf_obj *subtype = pdf_dict_gets(obj, "Subtype");
+
+	if (!pdf_is_name(subtype))
+		return 0;
+	return strcmp(pdf_to_name(subtype), "Image") == 0;
+}
+
+/* Return non-zero when the "Type" entry of obj is the name "FontDescriptor". */
+static int isfontdesc(pdf_obj *obj)
+{
+	pdf_obj *kind = pdf_dict_gets(obj, "Type");
+
+	if (!pdf_is_name(kind))
+		return 0;
+	return strcmp(pdf_to_name(kind), "FontDescriptor") == 0;
+}
+
+/*
+ * Write pix to "<file>.png" when pix->n <= 4, otherwise to "<file>.pam",
+ * announcing the chosen file name on stdout.
+ *
+ * When rgb is non-zero and pix has a colorspace other than device RGB, a
+ * temporary device-RGB copy is created and written in its place. The pix
+ * passed in is never dropped here. A NULL pix is silently ignored.
+ */
+static void writepixmap(fz_context *ctx, fz_pixmap *pix, char *file, int rgb)
+{
+	char name[1024];
+	fz_pixmap *converted = NULL;
+	int use_png;
+
+	if (!pix)
+		return;
+
+	if (rgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx))
+	{
+		fz_irect bbox;
+		converted = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), fz_pixmap_bbox(ctx, pix, &bbox));
+		fz_convert_pixmap(ctx, converted, pix);
+		pix = converted;
+	}
+
+	/* Decide the format after any conversion, since conversion changes pix. */
+	use_png = (pix->n <= 4);
+
+	/* Bounded formatting: an over-long base name is truncated instead of overrunning name[]. */
+	snprintf(name, sizeof name, "%s.%s", file, use_png ? "png" : "pam");
+	printf("extracting image %s\n", name);
+
+	if (use_png)
+		fz_write_png(ctx, pix, name, 0);
+	else
+		fz_write_pam(ctx, pix, name, 0);
+
+	/* converted is still NULL when no conversion took place. */
+	fz_drop_pixmap(ctx, converted);
+}
+
+/*
+ * Load object number num (generation 0) as an image, convert it to a pixmap
+ * and pass it to writepixmap under the base name "img-NNNN". The global
+ * dorgb is forwarded as the rgb flag.
+ */
+static void saveimage(int num)
+{
+	char stem[32];
+	pdf_obj *imgref;
+	fz_image *img;
+	fz_pixmap *pixmap;
+
+	sprintf(stem, "img-%04d", num);
+
+	/* TODO: detect DCTD and save as jpeg */
+
+	imgref = pdf_new_indirect(ctx, num, 0, doc);
+	img = pdf_load_image(doc, imgref);
+	pixmap = fz_image_to_pixmap(ctx, img, 0, 0);
+	/* The image handle is released as soon as the pixmap has been obtained. */
+	fz_drop_image(ctx, img);
+
+	writepixmap(ctx, pixmap, stem, dorgb);
+
+	fz_drop_pixmap(ctx, pixmap);
+	pdf_drop_obj(imgref);
+}
+
+/*
+ * Write the font program referenced by the font descriptor dict to a file
+ * named "<FontName>-<num>.<ext>" in the current directory. FontName falls
+ * back to "font" when the descriptor has no such entry.
+ *
+ * The extension follows the key that holds the stream:
+ *   FontFile  -> pfa
+ *   FontFile2 -> ttf
+ *   FontFile3 -> cff, cid or otf, according to the stream's Subtype
+ * When several keys are present, the last one checked wins. A descriptor
+ * with none of them only produces a warning.
+ *
+ * Throws FZ_ERROR_GENERIC for a non-name or unhandled FontFile3 Subtype and
+ * for failures creating, writing or closing the output file. The stream
+ * buffer and the FILE are released before any of the file errors is thrown.
+ */
+static void savefont(pdf_obj *dict, int num)
+{
+	char name[1024];
+	char *subtype;
+	char *p;
+	fz_buffer *buf;
+	pdf_obj *stream = NULL;
+	pdf_obj *obj;
+	char *ext = "";
+	FILE *f;
+	char *fontname = "font";
+	int len;
+	int write_failed, close_failed;
+	unsigned char *data;
+
+	obj = pdf_dict_gets(dict, "FontName");
+	if (obj)
+		fontname = pdf_to_name(obj);
+
+	obj = pdf_dict_gets(dict, "FontFile");
+	if (obj)
+	{
+		stream = obj;
+		ext = "pfa";
+	}
+
+	obj = pdf_dict_gets(dict, "FontFile2");
+	if (obj)
+	{
+		stream = obj;
+		ext = "ttf";
+	}
+
+	obj = pdf_dict_gets(dict, "FontFile3");
+	if (obj)
+	{
+		stream = obj;
+
+		obj = pdf_dict_gets(obj, "Subtype");
+		if (obj && !pdf_is_name(obj))
+			fz_throw(ctx, FZ_ERROR_GENERIC, "Invalid font descriptor subtype");
+
+		subtype = pdf_to_name(obj);
+		if (!strcmp(subtype, "Type1C"))
+			ext = "cff";
+		else if (!strcmp(subtype, "CIDFontType0C"))
+			ext = "cid";
+		else if (!strcmp(subtype, "OpenType"))
+			ext = "otf";
+		else
+			fz_throw(ctx, FZ_ERROR_GENERIC, "Unhandled font type '%s'", subtype);
+	}
+
+	if (!stream)
+	{
+		fz_warn(ctx, "Unhandled font type");
+		return;
+	}
+
+	buf = pdf_load_stream(doc, pdf_to_num(stream), pdf_to_gen(stream));
+
+	/*
+	 * fontname is taken from the PDF, so treat it as untrusted: bound the
+	 * formatting to name[] and replace path separators so the output file
+	 * cannot be placed outside the current directory.
+	 */
+	snprintf(name, sizeof name, "%s-%04d.%s", fontname, num, ext);
+	for (p = name; *p; p++)
+		if (*p == '/' || *p == '\\')
+			*p = '_';
+	printf("extracting font %s\n", name);
+
+	f = fopen(name, "wb");
+	if (!f)
+	{
+		fz_drop_buffer(ctx, buf);
+		fz_throw(ctx, FZ_ERROR_GENERIC, "Error creating font file");
+	}
+
+	len = fz_buffer_storage(ctx, buf, &data);
+	write_failed = fwrite(data, 1, len, f) < (size_t)len;
+	close_failed = fclose(f) != 0;
+
+	/* Release everything first so that neither error path below leaks. */
+	fz_drop_buffer(ctx, buf);
+
+	if (write_failed)
+		fz_throw(ctx, FZ_ERROR_GENERIC, "Error writing font file");
+	if (close_failed)
+		fz_throw(ctx, FZ_ERROR_GENERIC, "Error closing font file");
+}
+
+/*
+ * Load object number num (generation 0) and extract it when it is an image
+ * (saveimage) or a font descriptor (savefont). Any other object is ignored.
+ * Throws FZ_ERROR_GENERIC when no document is open.
+ */
+static void showobject(int num)
+{
+	pdf_obj *loaded;
+
+	if (doc == NULL)
+		fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified");
+
+	loaded = pdf_load_object(doc, num, 0);
+
+	if (isimage(loaded))
+	{
+		saveimage(num);
+	}
+	else if (isfontdesc(loaded))
+	{
+		savefont(loaded, num);
+	}
+
+	pdf_drop_obj(loaded);
+}
+
+/*
+ * Entry point of "mutool extract".
+ *
+ * Parses -p <password> and -r, opens the PDF named by the first non-option
+ * argument, then calls showobject for every remaining argument (converted
+ * with atoi). When there are no further arguments it calls showobject for
+ * every object number from 0 up to pdf_count_objects(doc) - 1.
+ * Returns 0 once the document and context have been released.
+ */
+int pdfextract_main(int argc, char **argv)
+{
+	char *password = "";
+	char *infile;
+	int opt;
+
+	for (;;)
+	{
+		opt = fz_getopt(argc, argv, "p:r");
+		if (opt == -1)
+			break;
+
+		if (opt == 'p')
+			password = fz_optarg;
+		else if (opt == 'r')
+			dorgb++;
+		else
+			usage();
+	}
+
+	/* The input file name is mandatory. */
+	if (fz_optind == argc)
+		usage();
+
+	infile = argv[fz_optind++];
+
+	ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
+	if (ctx == NULL)
+	{
+		fprintf(stderr, "cannot initialise context\n");
+		exit(1);
+	}
+
+	doc = pdf_open_document_no_run(ctx, infile);
+	if (pdf_needs_password(doc) && !pdf_authenticate_password(doc, password))
+		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);
+
+	if (fz_optind != argc)
+	{
+		/* Explicit object numbers were given after the file name. */
+		for (; fz_optind < argc; fz_optind++)
+			showobject(atoi(argv[fz_optind]));
+	}
+	else
+	{
+		/* No object numbers: walk every object in the document. */
+		int count = pdf_count_objects(doc);
+		int num;
+
+		for (num = 0; num < count; num++)
+			showobject(num);
+	}
+
+	pdf_close_document(doc);
+	fz_flush_warnings(ctx);
+	fz_free_context(ctx);
+	return 0;
+}