From b83d0c4c1ad97c0ff68fcbfaf4aceb5fc7e4e642 Mon Sep 17 00:00:00 2001 From: Robin Watts Date: Mon, 3 Mar 2014 18:07:02 +0000 Subject: Add pdf_process interface. Currently the only processing we can do of PDF pages is to run them through an fz_device. We introduce new "pdf_process" functionality here to enable us to do more things. We define a pdf_processor structure with a set of function pointers in, one per PDF operator, together with functions for processing xobjects etc. The guts of pdf_run_page_contents and pdf_run_annot operations are then extracted to give pdf_process_page_contents and pdf_process_annot, and the originals implemented in terms of these. This commit contains just one instance of a pdf_processor, namely the "run" processor, which contains the original code refactored. The graphical state (and device pointer) is now part of private data to the run operator set, rather than being in pdf_csi. --- platform/win32/libmupdf.vcproj | 12 + source/pdf/pdf-interpret-imp.h | 151 ++ source/pdf/pdf-interpret.c | 3062 +++------------------------------------- source/pdf/pdf-op-run.c | 2838 +++++++++++++++++++++++++++++++++++++ source/pdf/pdf-run.c | 105 ++ 5 files changed, 3311 insertions(+), 2857 deletions(-) create mode 100644 source/pdf/pdf-interpret-imp.h create mode 100644 source/pdf/pdf-op-run.c create mode 100644 source/pdf/pdf-run.c diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj index 2c80d712..f501cd9f 100644 --- a/platform/win32/libmupdf.vcproj +++ b/platform/win32/libmupdf.vcproj @@ -777,6 +777,10 @@ RelativePath="..\..\source\pdf\pdf-image.c" > + + @@ -797,6 +801,10 @@ RelativePath="..\..\source\pdf\pdf-object.c" > + + @@ -821,6 +829,10 @@ RelativePath="..\..\source\pdf\pdf-repair.c" > + + diff --git a/source/pdf/pdf-interpret-imp.h b/source/pdf/pdf-interpret-imp.h new file mode 100644 index 00000000..2aafb830 --- /dev/null +++ b/source/pdf/pdf-interpret-imp.h @@ -0,0 +1,151 @@ +#ifndef PDF_INTERPRET_IMP_H +#define PDF_INTERPRET_IMP_H + +#include "mupdf/pdf.h" + +typedef struct pdf_csi_s pdf_csi; +typedef struct pdf_gstate_s pdf_gstate; + +typedef void (*pdf_operator_fn)(pdf_csi *, void *user); +typedef void (*pdf_process_annot_fn)(pdf_csi *csi, void *user, pdf_obj *resources, pdf_annot *annot); +typedef void (*pdf_process_stream_fn)(pdf_csi *csi, void *user, pdf_lexbuf *buf); +typedef void (*pdf_process_contents_fn)(pdf_csi *csi, void *user, pdf_obj *resources, pdf_obj *contents); + +typedef enum { + /* The first section of op's all run without a try/catch */ + PDF_OP_dquote, + PDF_OP_squote, + PDF_OP_B, + PDF_OP_Bstar, + PDF_OP_BDC, + PDF_OP_BI, + PDF_OP_BMC, + PDF_OP_BT, + PDF_OP_BX, + PDF_OP_CS, + PDF_OP_DP, + PDF_OP_EMC, + PDF_OP_ET, + PDF_OP_EX, + PDF_OP_F, + PDF_OP_G, + PDF_OP_J, + PDF_OP_K, + PDF_OP_M, + PDF_OP_MP, + PDF_OP_Q, + PDF_OP_RG, + PDF_OP_S, + PDF_OP_SC, + PDF_OP_SCN, + PDF_OP_Tstar, + PDF_OP_TD, + PDF_OP_TJ, + PDF_OP_TL, + PDF_OP_Tc, + PDF_OP_Td, + PDF_OP_Tj, + PDF_OP_Tm, + PDF_OP_Tr, + PDF_OP_Ts, + PDF_OP_Tw, + PDF_OP_Tz, + PDF_OP_W, + PDF_OP_Wstar, + PDF_OP_b, + PDF_OP_bstar, + PDF_OP_c, + PDF_OP_cm, + PDF_OP_cs, + PDF_OP_d, + PDF_OP_d0, + PDF_OP_d1, + PDF_OP_f, + PDF_OP_fstar, + PDF_OP_g, + PDF_OP_h, + PDF_OP_i, + PDF_OP_j, + PDF_OP_k, + PDF_OP_l, + PDF_OP_m, + PDF_OP_n, + PDF_OP_q, + PDF_OP_re, + PDF_OP_rg, + PDF_OP_ri, + PDF_OP_s, + PDF_OP_sc, + PDF_OP_scn, + PDF_OP_v, + PDF_OP_w, + PDF_OP_y, + /* ops in this second section require additional try/catch handling */ + PDF_OP_Do, + PDF_OP_Tf, + PDF_OP_gs, + PDF_OP_sh, + /* END is used to signify end of stream (finalise and close down) */ + PDF_OP_END, + /* And finally we have a max */ + PDF_OP_MAX +} PDF_OP; + +typedef struct pdf_processor_s { + pdf_operator_fn op_table[PDF_OP_MAX]; + pdf_process_annot_fn process_annot; + pdf_process_stream_fn process_stream; + pdf_process_contents_fn process_contents; +} pdf_processor; + +typedef struct pdf_process_s +{ + const pdf_processor *processor; + void *state; +} pdf_process; + +struct pdf_csi_s +{ + pdf_document *doc; + + /* Current resource dict and file. These are in here to reduce param + * passing. */ + pdf_obj *rdb; + fz_stream *file; + + /* Operator table */ + pdf_process process; + + /* interpreter stack */ + pdf_obj *obj; + char name[256]; + unsigned char string[256]; + int string_len; + float stack[32]; + int top; + + int xbalance; + int in_text; + + /* cookie support */ + fz_cookie *cookie; +}; + +static inline void pdf_process_op(pdf_csi *csi, int op, const pdf_process *process) +{ + process->processor->op_table[op](csi, process->state); +} + +void pdf_process_contents_object(pdf_csi *csi, pdf_obj *rdb, pdf_obj *contents); +void pdf_process_stream(pdf_csi *csi, pdf_lexbuf *buf); + +/* Functions to set up pdf_process structures */ +pdf_process *pdf_process_run(pdf_process *process, fz_device *dev, const fz_matrix *ctm, const char *event, pdf_gstate *gstate, int nested); + +/* Functions to actually use the pdf_process structures to process pages, + * annotations and glyphs */ +void pdf_process_annot(pdf_document *doc, pdf_page *page, pdf_annot *annot, const pdf_process *process, fz_cookie *cookie); +void pdf_process_page_contents(pdf_document *doc, pdf_page *page, const pdf_process *process, fz_cookie *cookie); +void pdf_process_glyph(pdf_document *doc, pdf_obj *resources, fz_buffer *contents, pdf_process *process); + +#endif diff --git a/source/pdf/pdf-interpret.c b/source/pdf/pdf-interpret.c index c01d7ea5..1508e829 100644 --- a/source/pdf/pdf-interpret.c +++ b/source/pdf/pdf-interpret.c @@ -1,2634 +1,63 @@ -#include "mupdf/pdf.h" +#include "pdf-interpret-imp.h" -#define TILE - -typedef struct pdf_material_s pdf_material; -typedef struct pdf_gstate_s pdf_gstate; -typedef struct pdf_csi_s pdf_csi; - -enum -{ - PDF_FILL, - PDF_STROKE, -}; - -enum -{ - PDF_MAT_NONE, - PDF_MAT_COLOR, - PDF_MAT_PATTERN, - PDF_MAT_SHADE, -}; - -struct pdf_material_s -{ - int kind; - fz_colorspace *colorspace; - pdf_pattern *pattern; - fz_shade *shade; - int gstate_num; - float alpha; - float v[FZ_MAX_COLORS]; -}; - -struct pdf_gstate_s -{ - fz_matrix ctm; - int clip_depth; - - /* path stroking */ - fz_stroke_state *stroke_state; - - /* materials */ - pdf_material stroke; - pdf_material fill; - - /* text state */ - float char_space; - float word_space; - float scale; - float leading; - pdf_font_desc *font; - float size; - int render; - float rise; - - /* transparency */ - int blendmode; - pdf_xobject *softmask; - fz_matrix softmask_ctm; - float softmask_bc[FZ_MAX_COLORS]; - int luminosity; -}; - -struct pdf_csi_s -{ - fz_device *dev; - pdf_document *doc; - - int nested_depth; - - /* usage mode for optional content groups */ - char *event; /* "View", "Print", "Export" */ - - /* Current resource dict and file. These are in here to reduce param - * passing. */ - pdf_obj *rdb; - fz_stream *file; - - /* interpreter stack */ - pdf_obj *obj; - char name[256]; - unsigned char string[256]; - int string_len; - float stack[32]; - int top; - - int xbalance; - int in_text; - int in_hidden_ocg; - - /* path object state */ - fz_path *path; - int clip; - int clip_even_odd; - - /* text object state */ - fz_text *text; - fz_rect text_bbox; - fz_matrix tlm; - fz_matrix tm; - int text_mode; - int accumulate; - - /* graphics state */ - pdf_gstate *gstate; - int gcap; - int gtop; - int gbot; - int gparent; - - /* cookie support */ - fz_cookie *cookie; -}; - -static void pdf_run_contents_object(pdf_csi *csi, pdf_obj *rdb, pdf_obj *contents); -static void pdf_run_xobject(pdf_csi *csi, pdf_obj *resources, pdf_xobject *xobj, const fz_matrix *transform); -static void pdf_show_pattern(pdf_csi *csi, pdf_pattern *pat, pdf_gstate *pat_gstate, const fz_rect *area, int what); - -static int -ocg_intents_include(pdf_ocg_descriptor *desc, char *name) -{ - int i, len; - - if (strcmp(name, "All") == 0) - return 1; - - /* In the absence of a specified intent, it's 'View' */ - if (!desc->intent) - return (strcmp(name, "View") == 0); - - if (pdf_is_name(desc->intent)) - { - char *intent = pdf_to_name(desc->intent); - if (strcmp(intent, "All") == 0) - return 1; - return (strcmp(intent, name) == 0); - } - if (!pdf_is_array(desc->intent)) - return 0; - - len = pdf_array_len(desc->intent); - for (i=0; i < len; i++) - { - char *intent = pdf_to_name(pdf_array_get(desc->intent, i)); - if (strcmp(intent, "All") == 0) - return 1; - if (strcmp(intent, name) == 0) - return 1; - } - return 0; -} - -static int -pdf_is_hidden_ocg(pdf_obj *ocg, pdf_csi *csi, pdf_obj *rdb) -{ - char event_state[16]; - pdf_obj *obj, *obj2; - char *type; - pdf_ocg_descriptor *desc = csi->doc->ocg; - fz_context *ctx = csi->dev->ctx; - - /* Avoid infinite recursions */ - if (pdf_obj_marked(ocg)) - return 0; - - /* If no ocg descriptor, everything is visible */ - if (!desc) - return 0; - - /* If we've been handed a name, look it up in the properties. */ - if (pdf_is_name(ocg)) - { - ocg = pdf_dict_gets(pdf_dict_gets(rdb, "Properties"), pdf_to_name(ocg)); - } - /* If we haven't been given an ocg at all, then we're visible */ - if (!ocg) - return 0; - - fz_strlcpy(event_state, csi->event, sizeof event_state); - fz_strlcat(event_state, "State", sizeof event_state); - - type = pdf_to_name(pdf_dict_gets(ocg, "Type")); - - if (strcmp(type, "OCG") == 0) - { - /* An Optional Content Group */ - int default_value = 0; - int num = pdf_to_num(ocg); - int gen = pdf_to_gen(ocg); - int len = desc->len; - int i; - - /* by default an OCG is visible, unless it's explicitly hidden */ - for (i = 0; i < len; i++) - { - if (desc->ocgs[i].num == num && desc->ocgs[i].gen == gen) - { - default_value = desc->ocgs[i].state == 0; - break; - } - } - - /* Check Intents; if our intent is not part of the set given - * by the current config, we should ignore it. */ - obj = pdf_dict_gets(ocg, "Intent"); - if (pdf_is_name(obj)) - { - /* If it doesn't match, it's hidden */ - if (ocg_intents_include(desc, pdf_to_name(obj)) == 0) - return 1; - } - else if (pdf_is_array(obj)) - { - int match = 0; - len = pdf_array_len(obj); - for (i=0; i