summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/mupdf/fitz/structured-text.h22
-rw-r--r--include/mupdf/fitz/util.h12
-rw-r--r--source/fitz/stext-device.c32
-rw-r--r--source/fitz/util.c12
-rw-r--r--source/tools/muconvert.c1
-rw-r--r--source/tools/murun.c12
6 files changed, 66 insertions, 25 deletions
diff --git a/include/mupdf/fitz/structured-text.h b/include/mupdf/fitz/structured-text.h
index a12c3cc9..061fc83c 100644
--- a/include/mupdf/fitz/structured-text.h
+++ b/include/mupdf/fitz/structured-text.h
@@ -201,6 +201,8 @@ struct fz_char_and_box_s
fz_rect bbox;
};
+extern const char *fz_stext_options_usage;
+
fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx);
/*
@@ -286,6 +288,20 @@ int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, f
char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect);
/*
+ struct fz_stext_options: Options for creating a structured text device.
+*/
+typedef struct fz_stext_options_s fz_stext_options;
+
+struct fz_stext_options_s
+{
+ int flags;
+};
+/*
+ fz_parse_stext_options: Parse stext device options from a comma-separated key-value string.
+*/
+fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
+
+/*
fz_new_stext_device: Create a device to extract the text on a page.
Gather and sort the text on a page into spans of uniform style,
@@ -302,10 +318,8 @@ char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect);
containing data already (for example when merging multiple pages,
or watermarking).
- options: Mask of heuristic options to activate. If 0 is given the
- default is to activate both FZ_STEXT_PRESERVE_LIGATURES and
- FZ_STEXT_PRESERVE_WHITESPACE.
+ options: Options to configure the stext device.
*/
-fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, int options);
+fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *options);
#endif
diff --git a/include/mupdf/fitz/util.h b/include/mupdf/fitz/util.h
index f8dec50a..a9a0d59d 100644
--- a/include/mupdf/fitz/util.h
+++ b/include/mupdf/fitz/util.h
@@ -38,9 +38,9 @@ fz_pixmap *fz_new_pixmap_from_annot(fz_context *ctx, fz_annot *annot, const fz_m
/*
fz_new_stext_page_from_page: Extract structured text from a page. The sheet must not be NULL.
*/
-fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, int options);
-fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, int options);
-fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, int options);
+fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options);
+fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options);
+fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options);
/*
fz_new_buffer_from_stext_page: Convert structured text into plain text, cropped by the selection rectangle.
@@ -48,9 +48,9 @@ fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_l
otherwise '\n'.
*/
fz_buffer *fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rect *sel, int crlf);
-fz_buffer *fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, int options);
-fz_buffer *fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, int options);
-fz_buffer *fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, int options);
+fz_buffer *fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options);
+fz_buffer *fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, const fz_stext_options *options);
+fz_buffer *fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options);
/*
fz_search_page: Search for the 'needle' text on the page.
diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c
index 588c1454..4019fb24 100644
--- a/source/fitz/stext-device.c
+++ b/source/fitz/stext-device.c
@@ -30,9 +30,15 @@ struct fz_stext_device_s
span_soup *spans;
fz_stext_span *cur_span;
int lastchar;
- int options;
+ int flags;
};
+const char *fz_stext_options_usage =
+ "Structured text output options:\n"
+ "\tpreserve-ligatures: do not expand all ligatures into constituent characters\n"
+ "\tpreserve-whitespace: do not convert all whitespace characters into spaces\n"
+ "\n";
+
static fz_rect *
add_point_to_rect(fz_rect *a, const fz_point *p)
{
@@ -728,7 +734,7 @@ fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style,
if (c == -1)
return;
- if (!(dev->options & FZ_STEXT_PRESERVE_LIGATURES))
+ if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
switch (c)
{
case 0xFB00: /* ff */
@@ -760,7 +766,7 @@ fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style,
return;
}
- if (!(dev->options & FZ_STEXT_PRESERVE_WHITESPACE))
+ if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
switch (c)
{
case 0x0009: /* tab */
@@ -1074,8 +1080,23 @@ fz_stext_drop_device(fz_context *ctx, fz_device *dev)
tdev->spans = NULL;
}
+fz_stext_options *
+fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
+{
+ const char *val;
+
+ memset(opts, 0, sizeof *opts);
+
+ if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes"))
+ opts->flags |= FZ_STEXT_PRESERVE_LIGATURES;
+ if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes"))
+ opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE;
+
+ return opts;
+}
+
fz_device *
-fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, int options)
+fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *opts)
{
fz_stext_device *dev = fz_new_device(ctx, sizeof *dev);
@@ -1097,7 +1118,8 @@ fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page,
dev->spans = NULL;
dev->cur_span = NULL;
dev->lastchar = ' ';
- dev->options = options;
+ if (opts)
+ dev->flags = opts->flags;
return (fz_device*)dev;
}
diff --git a/source/fitz/util.c b/source/fitz/util.c
index 6e0982e7..be8502a5 100644
--- a/source/fitz/util.c
+++ b/source/fitz/util.c
@@ -265,7 +265,7 @@ fz_new_pixmap_from_page_number(fz_context *ctx, fz_document *doc, int number, co
}
fz_stext_page *
-fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, int options)
+fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options)
{
fz_stext_page *text;
fz_device *dev;
@@ -295,7 +295,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
}
fz_stext_page *
-fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, int options)
+fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options)
{
fz_stext_page *text;
fz_device *dev;
@@ -325,7 +325,7 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
}
fz_stext_page *
-fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, int options)
+fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options)
{
fz_page *page;
fz_stext_page *text;
@@ -478,7 +478,7 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec
}
fz_buffer *
-fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, int options)
+fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options)
{
fz_stext_sheet *sheet;
fz_stext_page *text;
@@ -499,7 +499,7 @@ fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz
}
fz_buffer *
-fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, int options)
+fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options)
{
fz_stext_sheet *sheet;
fz_stext_page *text;
@@ -520,7 +520,7 @@ fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int
}
fz_buffer *
-fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, int options)
+fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, const fz_stext_options *options)
{
fz_page *page;
fz_buffer *buf;
diff --git a/source/tools/muconvert.c b/source/tools/muconvert.c
index 13afee5d..a62a0dd3 100644
--- a/source/tools/muconvert.c
+++ b/source/tools/muconvert.c
@@ -44,6 +44,7 @@ static void usage(void)
"\n"
);
fputs(fz_draw_options_usage, stderr);
+ fputs(fz_stext_options_usage, stderr);
fputs(fz_cbz_write_options_usage, stderr);
fputs(fz_png_write_options_usage, stderr);
#if FZ_ENABLE_PDF
diff --git a/source/tools/murun.c b/source/tools/murun.c
index 898ed456..fc887105 100644
--- a/source/tools/murun.c
+++ b/source/tools/murun.c
@@ -1734,15 +1734,17 @@ static void ffi_Page_toStructuredText(js_State *J)
{
fz_context *ctx = js_getcontext(J);
fz_page *page = ffi_topage(J, 0);
- int options = js_tointeger(J, 1);
+ const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL;
fz_stext_sheet *sheet = NULL;
+ fz_stext_options so;
fz_stext_page *text;
fz_var(sheet);
fz_try(ctx) {
sheet = fz_new_stext_sheet(ctx);
- text = fz_new_stext_page_from_page(ctx, page, sheet, options);
+ fz_parse_stext_options(ctx, &so, options);
+ text = fz_new_stext_page_from_page(ctx, page, sheet, &so);
}
fz_always(ctx)
fz_drop_stext_sheet(ctx, sheet);
@@ -2573,15 +2575,17 @@ static void ffi_DisplayList_toStructuredText(js_State *J)
{
fz_context *ctx = js_getcontext(J);
fz_display_list *list = js_touserdata(J, 0, "fz_display_list");
- int options = js_tointeger(J, 1);
+ const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL;
fz_stext_sheet *sheet = NULL;
+ fz_stext_options so;
fz_stext_page *text;
fz_var(sheet);
fz_try(ctx) {
sheet = fz_new_stext_sheet(ctx);
- text = fz_new_stext_page_from_display_list(ctx, list, sheet, options);
+ fz_parse_stext_options(ctx, &so, options);
+ text = fz_new_stext_page_from_display_list(ctx, list, sheet, &so);
}
fz_always(ctx)
fz_drop_stext_sheet(ctx, sheet);