diff options
-rw-r--r-- | include/mupdf/pdf/font.h | 2 | ||||
-rw-r--r-- | include/mupdf/pdf/interpret.h | 3 | ||||
-rw-r--r-- | include/mupdf/pdf/page.h | 34 | ||||
-rw-r--r-- | source/pdf/pdf-clean.c | 27 |
4 files changed, 58 insertions, 8 deletions
diff --git a/include/mupdf/pdf/font.h b/include/mupdf/pdf/font.h index 23bc77bf..10571f0b 100644 --- a/include/mupdf/pdf/font.h +++ b/include/mupdf/pdf/font.h @@ -1,6 +1,8 @@ #ifndef MUPDF_PDF_FONT_H #define MUPDF_PDF_FONT_H +#include "mupdf/pdf/cmap.h" + /* * Font */ diff --git a/include/mupdf/pdf/interpret.h b/include/mupdf/pdf/interpret.h index 68f1a71c..f7104499 100644 --- a/include/mupdf/pdf/interpret.h +++ b/include/mupdf/pdf/interpret.h @@ -1,6 +1,9 @@ #ifndef PDF_INTERPRET_H #define PDF_INTERPRET_H +#include "mupdf/pdf/font.h" +#include "mupdf/pdf/resource.h" + typedef struct pdf_csi_s pdf_csi; typedef struct pdf_gstate_s pdf_gstate; typedef struct pdf_processor_s pdf_processor; diff --git a/include/mupdf/pdf/page.h b/include/mupdf/pdf/page.h index 969aedf0..f904a7c1 100644 --- a/include/mupdf/pdf/page.h +++ b/include/mupdf/pdf/page.h @@ -1,6 +1,8 @@ #ifndef MUPDF_PDF_PAGE_H #define MUPDF_PDF_PAGE_H +#include "mupdf/pdf/interpret.h" + int pdf_lookup_page_number(fz_context *ctx, pdf_document *doc, pdf_obj *pageobj); int pdf_count_pages(fz_context *ctx, pdf_document *doc); pdf_obj *pdf_lookup_page_obj(fz_context *ctx, pdf_document *doc, int needle); @@ -169,6 +171,38 @@ void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *ann pdf_page_contents_process_fn *proc, void *proc_arg, int ascii); /* + pdf_filter_page_contents: Performs the same task as + pdf_clean_page_contents, but with an optional text filter + function. + + text_filter: Function to assess whether a given character + should be kept (return 0) or removed (return 1). + + after_text: Function called after each text object is closed + to allow other output to be sent. + + arg: Opaque value to be passed to callback functions. +*/ +void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, + pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii); + +/* + pdf_filter_annot_contents: Performs the same task as + pdf_clean_annot_contents, but with an optional text filter + function. + + text_filter: Function to assess whether a given character + should be kept (return 0) or removed (return 1). + + after_text: Function called after each text object is closed + to allow other output to be sent. + + arg: Opaque value to be passed to callback functions. +*/ +void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, + pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii); + +/* Presentation interface. */ fz_transition *pdf_page_presentation(fz_context *ctx, pdf_page *page, fz_transition *transition, float *duration); diff --git a/source/pdf/pdf-clean.c b/source/pdf/pdf-clean.c index 846188c6..7fe0d6fe 100644 --- a/source/pdf/pdf-clean.c +++ b/source/pdf/pdf-clean.c @@ -2,7 +2,7 @@ #include "mupdf/pdf.h" static void -pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res, int ascii) +pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res, int ascii, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg) { pdf_processor *proc_buffer = NULL; pdf_processor *proc_filter = NULL; @@ -31,7 +31,7 @@ pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_ob res = pdf_new_dict(ctx, doc, 1); proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii); - proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res); + proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, proc_buffer, orig_res, res, text_filter, after_text, arg); pdf_process_contents(ctx, proc_filter, doc, orig_res, obj, cookie); pdf_close_processor(ctx, proc_filter); @@ -127,7 +127,12 @@ pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_ } } -void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int ascii) +void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int ascii) +{ + pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, ascii); +} + +void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg, int ascii) { pdf_processor *proc_buffer = NULL; pdf_processor *proc_filter = NULL; @@ -157,7 +162,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, resources = pdf_page_resources(ctx, page); proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii); - proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, resources, res); + proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, proc_buffer, resources, res, text_filter, after_text, proc_arg); pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie); pdf_close_processor(ctx, proc_filter); @@ -202,7 +207,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, if (!o) continue; /* Transparency group XObject */ - pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, ascii); + pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, ascii, text_filter, after_text, proc_arg); } } @@ -218,7 +223,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, if (!pat) continue; if (pdf_to_int(ctx, pdf_dict_get(ctx, pat, PDF_NAME_PatternType)) == 1) - pdf_clean_stream_object(ctx, doc, pat, resources, cookie, 0, ascii); + pdf_clean_stream_object(ctx, doc, pat, resources, cookie, 0, ascii, text_filter, after_text, proc_arg); } } @@ -234,7 +239,7 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, if (!xobj) continue; if (pdf_name_eq(ctx, PDF_NAME_Form, pdf_dict_get(ctx, xobj, PDF_NAME_Subtype))) - pdf_clean_stream_object(ctx, doc, xobj, resources, cookie, 1, ascii); + pdf_clean_stream_object(ctx, doc, xobj, resources, cookie, 1, ascii, text_filter, after_text, proc_arg); } } @@ -287,6 +292,12 @@ void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int ascii) { + pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, ascii); +} + +void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, + pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int ascii) +{ pdf_obj *ap; int i, n; @@ -302,6 +313,6 @@ void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *ann if (v == NULL) continue; - pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, 1); + pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, 1, text_filter, after_text, arg); } } |