summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Robin Watts <robin.watts@artifex.com> 2017-10-25 16:58:50 +0100
committer Robin Watts <robin.watts@artifex.com> 2017-11-06 11:41:14 +0000
commit c19e1ab67f06bf11e12dac08685a28753a3b276e (patch)
tree 54794b85da83e3c88320fa05eab5a465c1ca453c
parent f76bc6be7d3addfbd65fdac454b17911337ee2c6 (diff)
download mupdf-c19e1ab67f06bf11e12dac08685a28753a3b276e.tar.xz
Expose text filtering through pdf_clean interface.
-rw-r--r-- include/mupdf/pdf/font.h 2
-rw-r--r-- include/mupdf/pdf/interpret.h 3
-rw-r--r-- include/mupdf/pdf/page.h 34
-rw-r--r-- source/pdf/pdf-clean.c 27
4 files changed, 58 insertions, 8 deletions
diff --git a/include/mupdf/pdf/font.h b/include/mupdf/pdf/font.h
index 23bc77bf..10571f0b 100644
--- a/include/mupdf/pdf/font.h
+++ b/include/mupdf/pdf/font.h
@@ -1,6 +1,8 @@
#ifndef MUPDF_PDF_FONT_H
#define MUPDF_PDF_FONT_H
+#include "mupdf/pdf/cmap.h"
+
/*
* Font
*/
diff --git a/include/mupdf/pdf/interpret.h b/include/mupdf/pdf/interpret.h
index 68f1a71c..f7104499 100644
--- a/include/mupdf/pdf/interpret.h
+++ b/include/mupdf/pdf/interpret.h
@@ -1,6 +1,9 @@
#ifndef PDF_INTERPRET_H
#define PDF_INTERPRET_H
+#include "mupdf/pdf/font.h"
+#include "mupdf/pdf/resource.h"
+
typedef struct pdf_csi_s pdf_csi;
typedef struct pdf_gstate_s pdf_gstate;
typedef struct pdf_processor_s pdf_processor;
diff --git a/include/mupdf/pdf/page.h b/include/mupdf/pdf/page.h
index 969aedf0..f904a7c1 100644
--- a/include/mupdf/pdf/page.h
+++ b/include/mupdf/pdf/page.h
@@ -1,6 +1,8 @@
#ifndef MUPDF_PDF_PAGE_H
#define MUPDF_PDF_PAGE_H
+#include "mupdf/pdf/interpret.h"
+
int pdf_lookup_page_number(fz_context *ctx, pdf_document *doc, pdf_obj *pageobj);
int pdf_count_pages(fz_context *ctx, pdf_document *doc);
pdf_obj *pdf_lookup_page_obj(fz_context *ctx, pdf_document *doc, int needle);
@@ -169,6 +171,38 @@ void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *ann
pdf_page_contents_process_fn *proc, void *proc_arg, int ascii);
/*
+ pdf_filter_page_contents: Performs the same task as
+ pdf_clean_page_contents, but with an optional text filter
+ function.
+
+ text_filter: Function to assess whether a given character
+ should be kept (return 0) or removed (return 1).
+
+ after_text: Function called after each text object is closed
+ to allow other output to be sent.
+
+ arg: Opaque value to be passed to callback functions.
+*/
+void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie,
+ pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii);
+
+/*
+ pdf_filter_annot_contents: Performs the same task as
+ pdf_clean_annot_contents, but with an optional text filter
+ function.
+
+ text_filter: Function to assess whether a given character
+ should be kept (return 0) or removed (return 1).
+
+ after_text: Function called after each text object is closed
+ to allow other output to be sent.
+
+ arg: Opaque value to be passed to callback functions.
+*/
+void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie,
+ pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii);
+
+/*
Presentation interface.
*/
fz_transition *pdf_page_presentation(fz_context *ctx, pdf_page *page, fz_transition *transition, float *duration);
diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c
index 846188c6..7fe0d6fe 100644
--- a/source/pdf/pdf-clean.c
+++ b/source/pdf/pdf-clean.c
@@ -2,7 +2,7 @@
#include "mupdf/pdf.h"
static void
-pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res, int ascii)
+pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res, int ascii, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg)
{
pdf_processor *proc_buffer = NULL;
pdf_processor *proc_filter = NULL;
@@ -31,7 +31,7 @@ pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_ob
res = pdf_new_dict(ctx, doc, 1);
proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
- proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res);
+ proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, proc_buffer, orig_res, res, text_filter, after_text, arg);
pdf_process_contents(ctx, proc_filter, doc, orig_res, obj, cookie);
pdf_close_processor(ctx, proc_filter);
@@ -127,7 +127,12 @@ pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_
}
}
-void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int ascii)
+void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int ascii)
+{
+ pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, ascii);
+}
+
+void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg, int ascii)
{
pdf_processor *proc_buffer = NULL;
pdf_processor *proc_filter = NULL;
@@ -157,7 +162,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page,
resources = pdf_page_resources(ctx, page);
proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
- proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, resources, res);
+ proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, proc_buffer, resources, res, text_filter, after_text, proc_arg);
pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie);
pdf_close_processor(ctx, proc_filter);
@@ -202,7 +207,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page,
if (!o)
continue;
/* Transparency group XObject */
- pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, ascii);
+ pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, ascii, text_filter, after_text, proc_arg);
}
}
@@ -218,7 +223,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page,
if (!pat)
continue;
if (pdf_to_int(ctx, pdf_dict_get(ctx, pat, PDF_NAME_PatternType)) == 1)
- pdf_clean_stream_object(ctx, doc, pat, resources, cookie, 0, ascii);
+ pdf_clean_stream_object(ctx, doc, pat, resources, cookie, 0, ascii, text_filter, after_text, proc_arg);
}
}
@@ -234,7 +239,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page,
if (!xobj)
continue;
if (pdf_name_eq(ctx, PDF_NAME_Form, pdf_dict_get(ctx, xobj, PDF_NAME_Subtype)))
- pdf_clean_stream_object(ctx, doc, xobj, resources, cookie, 1, ascii);
+ pdf_clean_stream_object(ctx, doc, xobj, resources, cookie, 1, ascii, text_filter, after_text, proc_arg);
}
}
@@ -287,6 +292,12 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page,
void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int ascii)
{
+ pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, ascii);
+}
+
+void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie,
+ pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii)
+{
pdf_obj *ap;
int i, n;
@@ -302,6 +313,6 @@ void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *ann
if (v == NULL)
continue;
- pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, 1);
+ pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, 1, text_filter, after_text, arg);
}
}