From e7875fcd16a80d880c33b49c9142bce1d870e9a0 Mon Sep 17 00:00:00 2001 From: Sebastian Rasmussen Date: Mon, 24 Oct 2016 21:41:55 +0800 Subject: Introduce options for structured text. --- include/mupdf/fitz/structured-text.h | 22 ++++++++++++++++++---- include/mupdf/fitz/util.h | 12 ++++++------ source/fitz/stext-device.c | 32 +++++++++++++++++++++++++++----- source/fitz/util.c | 12 ++++++------ source/tools/muconvert.c | 1 + source/tools/murun.c | 12 ++++++++---- 6 files changed, 66 insertions(+), 25 deletions(-) diff --git a/include/mupdf/fitz/structured-text.h b/include/mupdf/fitz/structured-text.h index a12c3cc9..061fc83c 100644 --- a/include/mupdf/fitz/structured-text.h +++ b/include/mupdf/fitz/structured-text.h @@ -201,6 +201,8 @@ struct fz_char_and_box_s fz_rect bbox; }; +extern const char *fz_stext_options_usage; + fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx); /* @@ -285,6 +287,20 @@ int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, f */ char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect); +/* + struct fz_stext_options: Options for creating a structured text device. +*/ +typedef struct fz_stext_options_s fz_stext_options; + +struct fz_stext_options_s +{ + int flags; +}; +/* + fz_parse_stext_options: Parse stext device options from a comma separated key-value string. +*/ +fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string); + /* fz_new_stext_device: Create a device to extract the text on a page. @@ -302,10 +318,8 @@ char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect); containing data already (for example when merging multiple pages, or watermarking). - options: Mask of heuristic options to activate. If 0 is given the - default is to activate both FZ_STEXT_PRESERVE_LIGATURES and - FZ_STEXT_PRESERVE_WHITESPACE. + options: Options to configure the stext device. 
*/ -fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, int options); +fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *options); #endif diff --git a/include/mupdf/fitz/util.h b/include/mupdf/fitz/util.h index f8dec50a..a9a0d59d 100644 --- a/include/mupdf/fitz/util.h +++ b/include/mupdf/fitz/util.h @@ -38,9 +38,9 @@ fz_pixmap *fz_new_pixmap_from_annot(fz_context *ctx, fz_annot *annot, const fz_m /* fz_new_stext_page_from_page: Extract structured text from a page. The sheet must not be NULL. */ -fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, int options); -fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, int options); -fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, int options); +fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options); +fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options); +fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options); /* fz_new_buffer_from_stext_page: Convert structured text into plain text, cropped by the selection rectangle. @@ -48,9 +48,9 @@ fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_l otherwise '\n'. 
*/ fz_buffer *fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rect *sel, int crlf); -fz_buffer *fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, int options); -fz_buffer *fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, int options); -fz_buffer *fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, int options); +fz_buffer *fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options); +fz_buffer *fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, const fz_stext_options *options); +fz_buffer *fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options); /* fz_search_page: Search for the 'needle' text on the page. diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c index 588c1454..4019fb24 100644 --- a/source/fitz/stext-device.c +++ b/source/fitz/stext-device.c @@ -30,9 +30,15 @@ struct fz_stext_device_s span_soup *spans; fz_stext_span *cur_span; int lastchar; - int options; + int flags; }; +const char *fz_stext_options_usage = + "Structured text output options:\n" + "\tpreserve-ligatures: do not expand all ligatures into constituent characters\n" + "\tpreserve-whitespace: do not convert all whitespace characters into spaces\n" + "\n"; + static fz_rect * add_point_to_rect(fz_rect *a, const fz_point *p) { @@ -728,7 +734,7 @@ fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, if (c == -1) return; - if (!(dev->options & FZ_STEXT_PRESERVE_LIGATURES)) + if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) switch (c) { case 0xFB00: /* ff */ @@ -760,7 +766,7 @@ fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, return; } - if (!(dev->options & 
FZ_STEXT_PRESERVE_WHITESPACE)) + if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) switch (c) { case 0x0009: /* tab */ @@ -1074,8 +1080,23 @@ fz_stext_drop_device(fz_context *ctx, fz_device *dev) tdev->spans = NULL; } +fz_stext_options * +fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string) +{ + const char *val; + + memset(opts, 0, sizeof *opts); + + if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes")) + opts->flags |= FZ_STEXT_PRESERVE_LIGATURES; + if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes")) + opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE; + + return opts; +} + fz_device * -fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, int options) +fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *opts) { fz_stext_device *dev = fz_new_device(ctx, sizeof *dev); @@ -1097,7 +1118,8 @@ fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, dev->spans = NULL; dev->cur_span = NULL; dev->lastchar = ' '; - dev->options = options; + if (opts) + dev->flags = opts->flags; return (fz_device*)dev; } diff --git a/source/fitz/util.c b/source/fitz/util.c index 6e0982e7..be8502a5 100644 --- a/source/fitz/util.c +++ b/source/fitz/util.c @@ -265,7 +265,7 @@ fz_new_pixmap_from_page_number(fz_context *ctx, fz_document *doc, int number, co } fz_stext_page * -fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, int options) +fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options) { fz_stext_page *text; fz_device *dev; @@ -295,7 +295,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s } fz_stext_page * -fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, int options) 
+fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options) { fz_stext_page *text; fz_device *dev; @@ -325,7 +325,7 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee } fz_stext_page * -fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, int options) +fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options) { fz_page *page; fz_stext_page *text; @@ -478,7 +478,7 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec } fz_buffer * -fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, int options) +fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options) { fz_stext_sheet *sheet; fz_stext_page *text; @@ -499,7 +499,7 @@ fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz } fz_buffer * -fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, int options) +fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options) { fz_stext_sheet *sheet; fz_stext_page *text; @@ -520,7 +520,7 @@ fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int } fz_buffer * -fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, int options) +fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, const fz_stext_options *options) { fz_page *page; fz_buffer *buf; diff --git a/source/tools/muconvert.c b/source/tools/muconvert.c index 13afee5d..a62a0dd3 100644 --- a/source/tools/muconvert.c +++ b/source/tools/muconvert.c @@ -44,6 +44,7 @@ static void usage(void) "\n" ); 
fputs(fz_draw_options_usage, stderr); + fputs(fz_stext_options_usage, stderr); fputs(fz_cbz_write_options_usage, stderr); fputs(fz_png_write_options_usage, stderr); #if FZ_ENABLE_PDF diff --git a/source/tools/murun.c b/source/tools/murun.c index 898ed456..fc887105 100644 --- a/source/tools/murun.c +++ b/source/tools/murun.c @@ -1734,15 +1734,17 @@ static void ffi_Page_toStructuredText(js_State *J) { fz_context *ctx = js_getcontext(J); fz_page *page = ffi_topage(J, 0); - int options = js_tointeger(J, 1); + const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; fz_stext_sheet *sheet = NULL; + fz_stext_options so; fz_stext_page *text; fz_var(sheet); fz_try(ctx) { sheet = fz_new_stext_sheet(ctx); - text = fz_new_stext_page_from_page(ctx, page, sheet, options); + fz_parse_stext_options(ctx, &so, options); + text = fz_new_stext_page_from_page(ctx, page, sheet, &so); } fz_always(ctx) fz_drop_stext_sheet(ctx, sheet); @@ -2573,15 +2575,17 @@ static void ffi_DisplayList_toStructuredText(js_State *J) { fz_context *ctx = js_getcontext(J); fz_display_list *list = js_touserdata(J, 0, "fz_display_list"); - int options = js_tointeger(J, 1); + const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; fz_stext_sheet *sheet = NULL; + fz_stext_options so; fz_stext_page *text; fz_var(sheet); fz_try(ctx) { sheet = fz_new_stext_sheet(ctx); - text = fz_new_stext_page_from_display_list(ctx, list, sheet, options); + fz_parse_stext_options(ctx, &so, options); + text = fz_new_stext_page_from_display_list(ctx, list, sheet, &so); } fz_always(ctx) fz_drop_stext_sheet(ctx, sheet); -- cgit v1.2.3