summaryrefslogtreecommitdiff
path: root/fitz/fitz.h
diff options
context:
space:
mode:
authorRobin Watts <robin.watts@artifex.com>2013-02-05 20:14:15 +0000
committerRobin Watts <robin.watts@artifex.com>2013-03-26 12:29:59 +0000
commit0fcb8d9539931f1c0a5a19ff0eb1aca427856e18 (patch)
tree7083f7e3e16b94ef1c85e1032c7a761c3a9002de /fitz/fitz.h
parent62f1176aca1cf99e0d1135193c32b2e160aade8b (diff)
downloadmupdf-0fcb8d9539931f1c0a5a19ff0eb1aca427856e18.tar.xz
Rework text extraction structures.
Rework the text extraction structures - the broad strokes are similar but we now hold more information at each stage to enable us to perform more detailed analysis on the structure of the page. We now hold: fz_text_char's (the position, ucs value, and style of each char). fz_text_span's (sets of chars that share the same baseline/transform, with no more than an expected amount of whitespace between each char). fz_text_line's (sets of spans that share the same baseline (more or less, allowing for super/subscript), but possibly with a larger than expected amount of whitespace). fz_text_block's (sets of lines that follow one another). After fz_text_analysis is called, we hope to have fz_text_blocks split such that each block is a paragraph. This new implementation has the same restrictions as the implementation it replaces, namely that chars are only considered for addition onto the most recent span at the moment, but this revised form is designed to allow easier extension, and for this restriction to be lifted. Also add simple paragraph splitting based on finding the most common 'line distance' in blocks. When we add spans together to collate them into lines, we record the 'horizontal' and 'vertical' spacing between them. (Not actually horizontal or vertical, so much as 'in the direction of writing' and 'perpendicular to the direction of writing'). The 'horizontal' value enables us to more correctly output spaces when converting to (say) html later. The 'vertical' value enables us to spot subscripts and superscripts etc, as well as small changes in the baseline due to style changes. We are careful to base the baseline comparison on the baseline for the line, not the baseline for the previous span, as otherwise superscripts/subscripts on the end of the line affect what we match next. Also, we are less tolerant of vertical shifts after a large gap. This avoids false positives where different columns just happen to almost line up.
Diffstat (limited to 'fitz/fitz.h')
-rw-r--r--fitz/fitz.h74
1 files changed, 58 insertions, 16 deletions
diff --git a/fitz/fitz.h b/fitz/fitz.h
index c614161f..b5db25c6 100644
--- a/fitz/fitz.h
+++ b/fitz/fitz.h
@@ -1869,6 +1869,8 @@ struct fz_text_style_s
float size;
int wmode;
int script;
+ float ascender;
+ float descender;
/* etc... */
};
@@ -1897,43 +1899,81 @@ struct fz_text_block_s
/*
fz_text_line: A text line is a list of text spans, with the same
- (or very similar) baseline. In typical cases this should correspond
- (as expected) to complete lines of text. A collection of lines makes
- up a block.
+ baseline. In typical cases this should correspond (as expected) to
+ complete lines of text. A collection of lines makes up a block.
*/
struct fz_text_line_s
{
- fz_rect bbox;
int len, cap;
- fz_text_span *spans;
+ fz_text_span **spans;
+
+ /* Cached information */
+ float distance; /* Perpendicular distance from previous line */
+ fz_rect bbox;
};
/*
- fz_text_span: A text span is a list of characters in the same style
- that share a common (or very similar) baseline. In typical cases
- (where only one font style is used in a line), a single span may be
- enough to represent a complete line. In cases where multiple
- font styles are used (for example italics), then a line will be
- broken down into a series of spans.
+ fz_text_span: A text span is a list of characters that share a common
+ baseline/transformation. In typical cases a single span may be enough
+ to represent a complete line. In cases where the text has big gaps in
+ it (perhaps as it crosses columns or tables), a line may be represented
+ by multiple spans.
*/
struct fz_text_span_s
{
- fz_rect bbox;
int len, cap;
fz_text_char *text;
- fz_text_style *style;
+ fz_point min; /* Device space */
+ fz_point max; /* Device space */
+ int wmode; /* 0 for horizontal, 1 for vertical */
+ fz_matrix transform; /* e and f are always 0 here */
+ float ascender_max; /* Document space */
+ float descender_min; /* Document space */
+ fz_rect bbox; /* Device space */
+
+ /* Cached information */
+ float base_offset; /* Perpendicular distance from baseline of line */
+ float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */
};
/*
- fz_text_char: A text char is a unicode character and the bounding
- box with which it appears on the page.
+ fz_text_char: A text char is a unicode character, the style in which
+ it appears, and the point at which it is positioned. Transform
+ (and hence bbox) information is given by the enclosing span.
*/
struct fz_text_char_s
{
- fz_rect bbox;
+ fz_point p; /* Device space */
int c;
+ fz_text_style *style;
};
+typedef struct fz_char_and_box_s fz_char_and_box;
+
+struct fz_char_and_box_s
+{
+ int c;
+ fz_rect bbox;
+};
+
+fz_char_and_box *fz_text_char_at(fz_char_and_box *cab, fz_text_page *page, int idx);
+
+/*
+ fz_text_char_bbox: Return the bbox of a text char. Calculated from
+ the supplied enclosing span.
+
+ bbox: A place to store the bbox
+
+ span: The enclosing span
+
+ idx: The index of the char within the span
+
+ Returns bbox (updated)
+
+ Does not throw exceptions
+*/
+fz_rect *fz_text_char_bbox(fz_rect *bbox, fz_text_span *span, int idx);
+
/*
fz_new_text_device: Create a device to extract the text on a page.
@@ -1972,6 +2012,8 @@ void fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet);
fz_text_page *fz_new_text_page(fz_context *ctx, const fz_rect *mediabox);
void fz_free_text_page(fz_context *ctx, fz_text_page *page);
+void fz_text_analysis(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);
+
typedef struct fz_output_s fz_output;
struct fz_output_s