summary refs log tree commit diff
path: root/source
diff options
context:
space:
mode:
author: Sebastian Rasmussen <sebras@gmail.com> 2016-08-03 03:40:33 +0800
committer: Sebastian Rasmussen <sebras@gmail.com> 2016-09-08 18:53:00 +0800
commit: 0c61b5737fd5b8fc03ac1457b2dc85033677e8f0 (patch)
tree: 3576120d2ed62e5a8c581ace77e425c6098f5157 /source
parent: dc2c77351a2b3188c971551b1231cf480dad9986 (diff)
download: mupdf-0c61b5737fd5b8fc03ac1457b2dc85033677e8f0.tar.xz
Add options to control heuristics in structured text.
Diffstat (limited to 'source')
-rw-r--r-- source/fitz/stext-device.c | 103
-rw-r--r-- source/fitz/util.c | 28
-rw-r--r-- source/tools/mudraw.c | 2
-rw-r--r-- source/tools/murun.c | 6
4 files changed, 85 insertions, 54 deletions
diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c
index 607dcbab..909d0a46 100644
--- a/source/fitz/stext-device.c
+++ b/source/fitz/stext-device.c
@@ -30,6 +30,7 @@ struct fz_stext_device_s
span_soup *spans;
fz_stext_span *cur_span;
int lastchar;
+ int options;
};
static fz_rect *
@@ -715,41 +716,68 @@ no_glyph:
static void
fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode)
{
- switch (c)
- {
- case -1: /* ignore when one unicode character maps to multiple glyphs */
- break;
- case 0xFB00: /* ff */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
- break;
- case 0xFB01: /* fi */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
- break;
- case 0xFB02: /* fl */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
- break;
- case 0xFB03: /* ffi */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
- break;
- case 0xFB04: /* ffl */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
- break;
- case 0xFB05: /* long st */
- case 0xFB06: /* st */
- fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode);
- break;
- default:
- fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode);
- break;
- }
+ /* Add one character to the text device, applying the ligature and whitespace heuristics unless dev->options disables them. c == -1 is ignored: it marks one unicode character mapping to multiple glyphs. */
+ if (c == -1)
+ return;
+
+ if (!(dev->options & FZ_STEXT_PRESERVE_LIGATURES)) /* expand ligatures: the first letter keeps the glyph and advance, the remaining letters get glyph -1 and zero advance */
+ switch (c)
+ {
+ case 0xFB00: /* ff */
+ fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
+ return;
+ case 0xFB01: /* fi */
+ fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
+ return;
+ case 0xFB02: /* fl */
+ fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
+ return;
+ case 0xFB03: /* ffi */
+ fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
+ return;
+ case 0xFB04: /* ffl */
+ fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
+ return;
+ case 0xFB05: /* long st */
+ case 0xFB06: /* st */
+ fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode);
+ return;
+ }
+
+ if (!(dev->options & FZ_STEXT_PRESERVE_WHITESPACE)) /* fold every whitespace variant listed below into a plain ASCII space */
+ switch (c)
+ {
+ case 0x0009: /* tab */
+ case 0x0020: /* space */
+ case 0x00A0: /* no-break space */
+ case 0x1680: /* ogham space mark */
+ case 0x180E: /* mongolian vowel separator */
+ case 0x2000: /* en quad */
+ case 0x2001: /* em quad */
+ case 0x2002: /* en space */
+ case 0x2003: /* em space */
+ case 0x2004: /* three-per-em space */
+ case 0x2005: /* four-per-em space */
+ case 0x2006: /* six-per-em space */
+ case 0x2007: /* figure space */
+ case 0x2008: /* punctuation space */
+ case 0x2009: /* thin space */
+ case 0x200A: /* hair space */
+ case 0x202F: /* narrow no-break space */
+ case 0x205F: /* medium mathematical space */
+ case 0x3000: /* ideographic space */
+ c = ' '; /* only rewrite the code point; the single call below emits it exactly once */
+ }
+
+ fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode);
}
static void
@@ -1039,11 +1067,11 @@ fz_stext_drop_device(fz_context *ctx, fz_device *dev)
}
fz_device *
-fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page)
+fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, int options)
{
fz_stext_device *dev = fz_new_device(ctx, sizeof *dev);
- dev->super.hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE;
+ dev->super.hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE; /* device hints only; the FZ_STEXT_* flags are tested against dev->options, never hints */
dev->super.close_device = fz_stext_close_device;
dev->super.drop_device = fz_stext_drop_device;
@@ -1061,6 +1089,7 @@ fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page)
dev->spans = NULL;
dev->cur_span = NULL;
dev->lastchar = ' ';
+ dev->options = options; /* stored verbatim: 0 sets no FZ_STEXT_PRESERVE_* flag, so ligatures are expanded and whitespace is normalized */
return (fz_device*)dev;
}
diff --git a/source/fitz/util.c b/source/fitz/util.c
index 4c7b3232..6e0982e7 100644
--- a/source/fitz/util.c
+++ b/source/fitz/util.c
@@ -265,7 +265,7 @@ fz_new_pixmap_from_page_number(fz_context *ctx, fz_document *doc, int number, co
}
fz_stext_page *
-fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet)
+fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, int options)
{
fz_stext_page *text;
fz_device *dev;
@@ -277,7 +277,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
text = fz_new_stext_page(ctx, fz_bound_display_list(ctx, list, &mediabox));
fz_try(ctx)
{
- dev = fz_new_stext_device(ctx, sheet, text);
+ dev = fz_new_stext_device(ctx, sheet, text, options);
fz_run_display_list(ctx, list, dev, &fz_identity, NULL, NULL);
fz_close_device(ctx, dev);
}
@@ -295,7 +295,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
}
fz_stext_page *
-fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet)
+fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, int options)
{
fz_stext_page *text;
fz_device *dev;
@@ -307,7 +307,7 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox));
fz_try(ctx)
{
- dev = fz_new_stext_device(ctx, sheet, text);
+ dev = fz_new_stext_device(ctx, sheet, text, options);
fz_run_page(ctx, page, dev, &fz_identity, NULL);
fz_close_device(ctx, dev);
}
@@ -325,14 +325,14 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
}
fz_stext_page *
-fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet)
+fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, int options)
{
fz_page *page;
fz_stext_page *text;
page = fz_load_page(ctx, doc, number);
fz_try(ctx)
- text = fz_new_stext_page_from_page(ctx, page, sheet);
+ text = fz_new_stext_page_from_page(ctx, page, sheet, options);
fz_always(ctx)
fz_drop_page(ctx, page);
fz_catch(ctx)
@@ -350,7 +350,7 @@ fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needl
sheet = fz_new_stext_sheet(ctx);
fz_try(ctx)
{
- text = fz_new_stext_page_from_display_list(ctx, list, sheet);
+ text = fz_new_stext_page_from_display_list(ctx, list, sheet, 0);
count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
}
fz_always(ctx)
@@ -371,7 +371,7 @@ fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_
sheet = fz_new_stext_sheet(ctx);
fz_try(ctx)
{
- text = fz_new_stext_page_from_page(ctx, page, sheet);
+ text = fz_new_stext_page_from_page(ctx, page, sheet, 0);
count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
}
fz_always(ctx)
@@ -478,7 +478,7 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec
}
fz_buffer *
-fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf)
+fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, int options)
{
fz_stext_sheet *sheet;
fz_stext_page *text;
@@ -487,7 +487,7 @@ fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz
sheet = fz_new_stext_sheet(ctx);
fz_try(ctx)
{
- text = fz_new_stext_page_from_display_list(ctx, list, sheet);
+ text = fz_new_stext_page_from_display_list(ctx, list, sheet, options);
buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
}
fz_always(ctx)
@@ -499,7 +499,7 @@ fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz
}
fz_buffer *
-fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf)
+fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, int options)
{
fz_stext_sheet *sheet;
fz_stext_page *text;
@@ -508,7 +508,7 @@ fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int
sheet = fz_new_stext_sheet(ctx);
fz_try(ctx)
{
- text = fz_new_stext_page_from_page(ctx, page, sheet);
+ text = fz_new_stext_page_from_page(ctx, page, sheet, options);
buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
}
fz_always(ctx)
@@ -520,14 +520,14 @@ fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int
}
fz_buffer *
-fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf)
+fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, int options)
{
fz_page *page;
fz_buffer *buf;
page = fz_load_page(ctx, doc, number);
fz_try(ctx)
- buf = fz_new_buffer_from_page(ctx, page, sel, crlf);
+ buf = fz_new_buffer_from_page(ctx, page, sel, crlf, options);
fz_always(ctx)
fz_drop_page(ctx, page);
fz_catch(ctx)
diff --git a/source/tools/mudraw.c b/source/tools/mudraw.c
index e7dc629b..fde776d2 100644
--- a/source/tools/mudraw.c
+++ b/source/tools/mudraw.c
@@ -626,7 +626,7 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in
else
fz_bound_page(ctx, page, &mediabox);
text = fz_new_stext_page(ctx, &mediabox);
- dev = fz_new_stext_device(ctx, sheet, text);
+ dev = fz_new_stext_device(ctx, sheet, text, 0);
if (lowmemory)
fz_enable_device_hints(ctx, dev, FZ_NO_CACHE);
if (output_format == OUT_HTML)
diff --git a/source/tools/murun.c b/source/tools/murun.c
index 5767d27d..3843ea1f 100644
--- a/source/tools/murun.c
+++ b/source/tools/murun.c
@@ -1627,6 +1627,7 @@ static void ffi_Page_toStructuredText(js_State *J)
{
fz_context *ctx = js_getcontext(J);
fz_page *page = js_touserdata(J, 0, "fz_page");
+ int options = js_tointeger(J, 1); /* integer option flags, forwarded to the int options parameter of fz_new_stext_page_from_page */
fz_stext_sheet *sheet = NULL;
fz_stext_page *text;
@@ -1634,7 +1635,7 @@ static void ffi_Page_toStructuredText(js_State *J)
fz_try(ctx) {
sheet = fz_new_stext_sheet(ctx);
- text = fz_new_stext_page_from_page(ctx, page, sheet);
+ text = fz_new_stext_page_from_page(ctx, page, sheet, options);
}
fz_always(ctx)
fz_drop_stext_sheet(ctx, sheet);
@@ -2458,6 +2459,7 @@ static void ffi_DisplayList_toStructuredText(js_State *J)
{
fz_context *ctx = js_getcontext(J);
fz_display_list *list = js_touserdata(J, 0, "fz_display_list");
+ int options = js_tointeger(J, 1); /* integer option flags, forwarded to the int options parameter of fz_new_stext_page_from_display_list */
fz_stext_sheet *sheet = NULL;
fz_stext_page *text;
@@ -2465,7 +2467,7 @@ static void ffi_DisplayList_toStructuredText(js_State *J)
fz_try(ctx) {
sheet = fz_new_stext_sheet(ctx);
- text = fz_new_stext_page_from_display_list(ctx, list, sheet);
+ text = fz_new_stext_page_from_display_list(ctx, list, sheet, options);
}
fz_always(ctx)
fz_drop_stext_sheet(ctx, sheet);