From 0c61b5737fd5b8fc03ac1457b2dc85033677e8f0 Mon Sep 17 00:00:00 2001
From: Sebastian Rasmussen
Date: Wed, 3 Aug 2016 03:40:33 +0800
Subject: Add options to control heuristics in structured text.

fz_new_stext_device() and the stext/buffer helpers in util.c take an
"options" bitmask. FZ_STEXT_PRESERVE_LIGATURES keeps ligature code
points instead of expanding them into their component letters;
FZ_STEXT_PRESERVE_WHITESPACE keeps the various Unicode space characters
instead of replacing them with U+0020. Passing 0 enables both
heuristics, which is what the search helpers and mudraw pass.
---
 source/fitz/stext-device.c | 101 +++++++++++++++++++++++++++++---------------------
 source/fitz/util.c         |  28 ++++++------
 source/tools/mudraw.c      |   2 +-
 source/tools/murun.c       |   6 ++-
 4 files changed, 84 insertions(+), 53 deletions(-)

(limited to 'source')

diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c
index 607dcbab..909d0a46 100644
--- a/source/fitz/stext-device.c
+++ b/source/fitz/stext-device.c
@@ -30,6 +30,7 @@ struct fz_stext_device_s
 	span_soup *spans;
 	fz_stext_span *cur_span;
 	int lastchar;
+	int options;
 };
 
 static fz_rect *
@@ -715,41 +716,68 @@ no_glyph:
 static void
 fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode)
 {
-	switch (c)
-	{
-	case -1: /* ignore when one unicode character maps to multiple glyphs */
-		break;
-	case 0xFB00: /* ff */
-		fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
-		fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
-		break;
-	case 0xFB01: /* fi */
-		fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
-		fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
-		break;
-	case 0xFB02: /* fl */
-		fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
-		fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
-		break;
-	case 0xFB03: /* ffi */
-		fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
-		fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
-		fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
-		break;
-	case 0xFB04: /* ffl */
-		fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
-		fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
-		fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
-		break;
-	case 0xFB05: /* long st */
-	case 0xFB06: /* st */
-		fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode);
-		fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode);
-		break;
-	default:
-		fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode);
-		break;
-	}
+	/* ignore when one unicode character maps to multiple glyphs */
+	if (c == -1)
+		return;
+
+	if (!(dev->options & FZ_STEXT_PRESERVE_LIGATURES))
+		switch (c)
+		{
+		case 0xFB00: /* ff */
+			fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+			fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
+			return;
+		case 0xFB01: /* fi */
+			fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+			fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
+			return;
+		case 0xFB02: /* fl */
+			fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+			fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
+			return;
+		case 0xFB03: /* ffi */
+			fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+			fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
+			fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
+			return;
+		case 0xFB04: /* ffl */
+			fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+			fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
+			fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
+			return;
+		case 0xFB05: /* long st */
+		case 0xFB06: /* st */
+			fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode);
+			fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode);
+			return;
+		}
+
+	if (!(dev->options & FZ_STEXT_PRESERVE_WHITESPACE))
+		switch (c)
+		{
+		case 0x0009: /* tab */
+		case 0x0020: /* space */
+		case 0x00A0: /* no-break space */
+		case 0x1680: /* ogham space mark */
+		case 0x180E: /* mongolian vowel separator */
+		case 0x2000: /* en quad */
+		case 0x2001: /* em quad */
+		case 0x2002: /* en space */
+		case 0x2003: /* em space */
+		case 0x2004: /* three-per-em space */
+		case 0x2005: /* four-per-em space */
+		case 0x2006: /* six-per-em space */
+		case 0x2007: /* figure space */
+		case 0x2008: /* punctuation space */
+		case 0x2009: /* thin space */
+		case 0x200A: /* hair space */
+		case 0x202F: /* narrow no-break space */
+		case 0x205F: /* medium mathematical space */
+		case 0x3000: /* ideographic space */
+			c = ' '; /* normalize; emitted exactly once by the call below */
+		}
+
+	fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode);
 }
 
 static void
@@ -1039,7 +1067,7 @@ fz_stext_drop_device(fz_context *ctx, fz_device *dev)
 }
 
 fz_device *
-fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page)
+fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, int options)
 {
 	fz_stext_device *dev = fz_new_device(ctx, sizeof *dev);
 
@@ -1061,6 +1089,7 @@ fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page)
 	dev->spans = NULL;
 	dev->cur_span = NULL;
 	dev->lastchar = ' ';
+	dev->options = options; /* 0: expand ligatures and normalize whitespace */
 
 	return (fz_device*)dev;
 }
diff --git a/source/fitz/util.c b/source/fitz/util.c
index 4c7b3232..6e0982e7 100644
--- a/source/fitz/util.c
+++ b/source/fitz/util.c
@@ -265,7 +265,7 @@ fz_new_pixmap_from_page_number(fz_context *ctx, fz_document *doc, int number, co
 }
 
 fz_stext_page *
-fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet)
+fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, int options)
 {
 	fz_stext_page *text;
 	fz_device *dev;
@@ -277,7 +277,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
 	text = fz_new_stext_page(ctx, fz_bound_display_list(ctx, list, &mediabox));
 	fz_try(ctx)
 	{
-		dev = fz_new_stext_device(ctx, sheet, text);
+		dev = fz_new_stext_device(ctx, sheet, text, options);
 		fz_run_display_list(ctx, list, dev, &fz_identity, NULL, NULL);
 		fz_close_device(ctx, dev);
 	}
@@ -295,7 +295,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
 }
 
 fz_stext_page *
-fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet)
+fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, int options)
 {
 	fz_stext_page *text;
 	fz_device *dev;
@@ -307,7 +307,7 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
 	text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox));
 	fz_try(ctx)
 	{
-		dev = fz_new_stext_device(ctx, sheet, text);
+		dev = fz_new_stext_device(ctx, sheet, text, options);
 		fz_run_page(ctx, page, dev, &fz_identity, NULL);
 		fz_close_device(ctx, dev);
 	}
@@ -325,14 +325,14 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
 }
 
 fz_stext_page *
-fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet)
+fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, int options)
 {
 	fz_page *page;
 	fz_stext_page *text;
 
 	page = fz_load_page(ctx, doc, number);
 	fz_try(ctx)
-		text = fz_new_stext_page_from_page(ctx, page, sheet);
+		text = fz_new_stext_page_from_page(ctx, page, sheet, options);
 	fz_always(ctx)
 		fz_drop_page(ctx, page);
 	fz_catch(ctx)
@@ -350,7 +350,7 @@ fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needl
 	sheet = fz_new_stext_sheet(ctx);
 	fz_try(ctx)
 	{
-		text = fz_new_stext_page_from_display_list(ctx, list, sheet);
+		text = fz_new_stext_page_from_display_list(ctx, list, sheet, 0);
 		count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
 	}
 	fz_always(ctx)
@@ -371,7 +371,7 @@ fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_
 	sheet = fz_new_stext_sheet(ctx);
 	fz_try(ctx)
 	{
-		text = fz_new_stext_page_from_page(ctx, page, sheet);
+		text = fz_new_stext_page_from_page(ctx, page, sheet, 0);
 		count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
 	}
 	fz_always(ctx)
@@ -478,7 +478,7 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec
 }
 
 fz_buffer *
-fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf)
+fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, int options)
 {
 	fz_stext_sheet *sheet;
 	fz_stext_page *text;
@@ -487,7 +487,7 @@ fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz
 	sheet = fz_new_stext_sheet(ctx);
 	fz_try(ctx)
 	{
-		text = fz_new_stext_page_from_display_list(ctx, list, sheet);
+		text = fz_new_stext_page_from_display_list(ctx, list, sheet, options);
 		buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
 	}
 	fz_always(ctx)
@@ -499,7 +499,7 @@ fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz
 }
 
 fz_buffer *
-fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf)
+fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, int options)
 {
 	fz_stext_sheet *sheet;
 	fz_stext_page *text;
@@ -508,7 +508,7 @@ fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int
 	sheet = fz_new_stext_sheet(ctx);
 	fz_try(ctx)
 	{
-		text = fz_new_stext_page_from_page(ctx, page, sheet);
+		text = fz_new_stext_page_from_page(ctx, page, sheet, options);
 		buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
 	}
 	fz_always(ctx)
@@ -520,14 +520,14 @@ fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int
 }
 
 fz_buffer *
-fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf)
+fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, int options)
 {
 	fz_page *page;
 	fz_buffer *buf;
 
 	page = fz_load_page(ctx, doc, number);
 	fz_try(ctx)
-		buf = fz_new_buffer_from_page(ctx, page, sel, crlf);
+		buf = fz_new_buffer_from_page(ctx, page, sel, crlf, options);
 	fz_always(ctx)
 		fz_drop_page(ctx, page);
 	fz_catch(ctx)
diff --git a/source/tools/mudraw.c b/source/tools/mudraw.c
index e7dc629b..fde776d2 100644
--- a/source/tools/mudraw.c
+++ b/source/tools/mudraw.c
@@ -626,7 +626,7 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in
 			else
 				fz_bound_page(ctx, page, &mediabox);
 			text = fz_new_stext_page(ctx, &mediabox);
-			dev = fz_new_stext_device(ctx, sheet, text);
+			dev = fz_new_stext_device(ctx, sheet, text, 0);
 			if (lowmemory)
 				fz_enable_device_hints(ctx, dev, FZ_NO_CACHE);
 			if (output_format == OUT_HTML)
diff --git a/source/tools/murun.c b/source/tools/murun.c
index 5767d27d..3843ea1f 100644
--- a/source/tools/murun.c
+++ b/source/tools/murun.c
@@ -1627,6 +1627,7 @@ static void ffi_Page_toStructuredText(js_State *J)
 {
 	fz_context *ctx = js_getcontext(J);
 	fz_page *page = js_touserdata(J, 0, "fz_page");
+	int options = js_tointeger(J, 1);
 	fz_stext_sheet *sheet = NULL;
 	fz_stext_page *text;
 
@@ -1634,7 +1635,7 @@ static void ffi_Page_toStructuredText(js_State *J)
 	fz_try(ctx)
 	{
 		sheet = fz_new_stext_sheet(ctx);
-		text = fz_new_stext_page_from_page(ctx, page, sheet);
+		text = fz_new_stext_page_from_page(ctx, page, sheet, options);
 	}
 	fz_always(ctx)
 		fz_drop_stext_sheet(ctx, sheet);
@@ -2458,6 +2459,7 @@ static void ffi_DisplayList_toStructuredText(js_State *J)
 {
 	fz_context *ctx = js_getcontext(J);
 	fz_display_list *list = js_touserdata(J, 0, "fz_display_list");
+	int options = js_tointeger(J, 1);
 	fz_stext_sheet *sheet = NULL;
 	fz_stext_page *text;
 
@@ -2465,7 +2467,7 @@ static void ffi_DisplayList_toStructuredText(js_State *J)
 	fz_try(ctx)
 	{
 		sheet = fz_new_stext_sheet(ctx);
-		text = fz_new_stext_page_from_display_list(ctx, list, sheet);
+		text = fz_new_stext_page_from_display_list(ctx, list, sheet, options);
 	}
 	fz_always(ctx)
 		fz_drop_stext_sheet(ctx, sheet);
-- 
cgit v1.2.3