summaryrefslogtreecommitdiff
path: root/include/mupdf/fitz/structured-text.h
blob: 7ec5772adeb1afaa4183243517e87db5c859f42d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
#ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
#define MUPDF_FITZ_STRUCTURED_TEXT_H

#include "mupdf/fitz/system.h"
#include "mupdf/fitz/context.h"
#include "mupdf/fitz/geometry.h"
#include "mupdf/fitz/font.h"
#include "mupdf/fitz/colorspace.h"
#include "mupdf/fitz/image.h"
#include "mupdf/fitz/output.h"
#include "mupdf/fitz/device.h"

/*
	Text extraction device: Used for searching, format conversion etc.

	(In development - Subject to change in future versions)
*/

/*
	Forward declarations of the structured text types, so that the
	structure definitions can refer to one another by their short
	names regardless of the order in which they are defined.
*/
typedef struct fz_stext_style_s fz_stext_style;
typedef struct fz_stext_char_s fz_stext_char;
typedef struct fz_stext_span_s fz_stext_span;
typedef struct fz_stext_line_s fz_stext_line;
typedef struct fz_stext_block_s fz_stext_block;
typedef struct fz_image_block_s fz_image_block;
typedef struct fz_page_block_s fz_page_block;

typedef struct fz_stext_sheet_s fz_stext_sheet;
typedef struct fz_stext_page_s fz_stext_page;

/*
	Option flags for structured text extraction. Each flag has a
	distinct bit value.

	FZ_STEXT_PRESERVE_LIGATURES: When this flag is set, ligatures are
	passed through to the application in their original form. When it
	is clear, ligatures are expanded into their constituent parts,
	e.g. the ligature ffi is expanded into three separate characters
	f, f and i.

	FZ_STEXT_PRESERVE_WHITESPACE: When this flag is set, whitespace is
	passed through to the application in its original form. When it
	is clear, any type of horizontal whitespace (including horizontal
	tabs) will be replaced with space characters of variable width.
*/
enum
{
	FZ_STEXT_PRESERVE_LIGATURES = 1 << 0,
	FZ_STEXT_PRESERVE_WHITESPACE = 1 << 1
};

/*
	fz_stext_sheet: A text sheet contains a list of distinct text styles
	used on a page (or a series of pages).
*/
struct fz_stext_sheet_s
{
	int maxid; /* NOTE(review): presumably the highest style id handed out so far - confirm against the implementation */
	fz_stext_style *style; /* Head of the list of styles (chained through fz_stext_style.next) */
};

/*
	fz_stext_style: A text style contains details of a distinct text style
	used on a page.
*/
struct fz_stext_style_s
{
	fz_stext_style *next; /* Next style in the list, or NULL */
	int id; /* NOTE(review): presumably an identifier unique within the owning sheet - confirm */
	fz_font *font; /* Font used by this style */
	float size; /* Font size; NOTE(review): units not stated in this header */
	int wmode; /* NOTE(review): presumably 0 for horizontal, 1 for vertical, as for fz_stext_span.wmode - confirm */
	int script; /* NOTE(review): presumably a super/subscript indicator - confirm against the implementation */
	/* Ascender and Descender only have the conventional sense in
	 * horizontal mode; in vertical mode they are rotated too - they are
	 * the maximum and minimum bounds respectively. */
	float ascender;
	float descender;
	/* etc... */
};

/*
	fz_stext_page: A text page is a list of page blocks, together with
	an overall bounding box.
*/
struct fz_stext_page_s
{
	fz_rect mediabox; /* Overall bounding box of the page */
	int len, cap; /* NOTE(review): presumably entries used / entries allocated in 'blocks' - confirm */
	fz_page_block *blocks; /* The page blocks */
	fz_stext_page *next; /* Link to another text page; NOTE(review): how pages are chained is not visible in this header */
};

/*
	fz_page_block: A page block is a typed block pointer.
*/
struct fz_page_block_s
{
	int type; /* Tag saying which member of 'u' is in use; NOTE(review): presumably FZ_PAGE_BLOCK_TEXT or FZ_PAGE_BLOCK_IMAGE - confirm */
	union
	{
		fz_stext_block *text; /* The block, when it is a text block */
		fz_image_block *image; /* The block, when it is an image block */
	} u;
};

/*
	Page block type tags.

	NOTE(review): presumably these are the values stored in
	fz_page_block.type to select between u.text and u.image - confirm.
*/
enum
{
	FZ_PAGE_BLOCK_TEXT = 0,
	FZ_PAGE_BLOCK_IMAGE = 1
};

/*
	fz_stext_block: A text block is a list of lines of text. In typical
	cases this may correspond to a paragraph or a column of text. A
	collection of blocks makes up a page.
*/
struct fz_stext_block_s
{
	fz_rect bbox; /* Bounding box of the block */
	int len, cap; /* NOTE(review): presumably entries used / entries allocated in 'lines' - confirm */
	fz_stext_line *lines; /* The lines of text in this block */
};

/*
	fz_image_block: An image block is an image, together with its
	bounding box, transformation matrix, colorspace and color values.
*/
struct fz_image_block_s
{
	fz_rect bbox; /* Bounding box of the image block */
	fz_matrix mat; /* Matrix associated with the image; NOTE(review): presumably the transform it was drawn with - confirm */
	fz_image *image; /* The image itself */
	fz_colorspace *cspace; /* Colorspace in which 'colors' is expressed; NOTE(review): presumably only meaningful for image masks - confirm */
	float colors[FZ_MAX_COLORS]; /* Color component values; room for FZ_MAX_COLORS components */
};

/*
	fz_stext_line: A text line is a list of text spans, with the same
	baseline. In typical cases this should correspond (as expected) to
	complete lines of text. A collection of lines makes up a block.
*/
struct fz_stext_line_s
{
	/* First and last spans of the line's span list (spans are chained
	 * through fz_stext_span.next). */
	fz_stext_span *first_span, *last_span;

	/* Cached information */
	float distance; /* Perpendicular distance from previous line */
	fz_rect bbox; /* Bounding box of the line */
	void *region; /* Opaque value for matching line masks */
};

/*
	fz_stext_span: A text span is a list of characters that share a common
	baseline/transformation. In typical cases a single span may be enough
	to represent a complete line. In cases where the text has big gaps in
	it (perhaps as it crosses columns or tables), a line may be represented
	by multiple spans.
*/
struct fz_stext_span_s
{
	int len, cap; /* NOTE(review): presumably entries used / entries allocated in 'text' - confirm */
	fz_stext_char *text; /* The characters in this span */
	fz_point min; /* Device space */
	fz_point max; /* Device space */
	int wmode; /* 0 for horizontal, 1 for vertical */
	fz_matrix transform; /* e and f are always 0 here */
	/* Ascender_max and Descender_min only have the conventional sense in
	 * horizontal mode; in vertical mode they are rotated too - they are
	 * the maximum and minimum bounds respectively. */
	float ascender_max; /* Document space */
	float descender_min; /* Document space */
	fz_rect bbox; /* Device space */

	/* Cached information */
	float base_offset; /* Perpendicular distance from baseline of line */
	float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */
	int column; /* If non zero, the column that it's in */
	float column_width; /* Percentage */
	int align; /* 0 = left, 1 = centre, 2 = right */
	float indent; /* The indent position for this column. */

	fz_stext_span *next; /* Next span in the list, or NULL */
};

/*
	fz_stext_char: A text char is a unicode character, the style in which
	it appears, and the point at which it is positioned. Transform
	(and hence bbox) information is given by the enclosing span.
*/
struct fz_stext_char_s
{
	fz_point p; /* Device space */
	int c; /* The unicode character */
	fz_stext_style *style; /* The style in which the character appears */
};

/*
	fz_char_and_box: A character code together with a bounding box.

	NOTE(review): presumably the result type filled in by
	fz_stext_char_at - confirm against the implementation.
*/
typedef struct fz_char_and_box_s fz_char_and_box;

struct fz_char_and_box_s
{
	int c; /* The character */
	fz_rect bbox; /* Bounding box associated with the character */
};

/*
	fz_stext_options_usage: String defined elsewhere in the library.

	NOTE(review): presumably a human readable description of the
	options accepted by fz_parse_stext_options - confirm.
*/
extern const char *fz_stext_options_usage;

/*
	fz_stext_char_at: Look up a character on a text page by index.

	cab: A place to store the character and its bbox.

	page: The text page to look in.

	idx: The index of the character.

	NOTE(review): how idx is counted across blocks, lines and spans,
	the behaviour for an out of range idx, and the return value
	(presumably cab) are not visible in this header - confirm against
	the implementation.
*/
fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx);

/*
	fz_stext_char_bbox: Return the bbox of a text char. Calculated from
	the supplied enclosing span.

	ctx: The context to operate in

	bbox: A place to store the bbox

	span: The enclosing span

	idx: The index of the char within the span

	Returns bbox (updated)

	Does not throw exceptions
*/
fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int idx);

/*
	fz_new_stext_sheet: Create an empty style sheet.

	The style sheet is filled out by the text device, creating
	one style for each unique font, color, size combination that
	is used.
*/
fz_stext_sheet *fz_new_stext_sheet(fz_context *ctx);

/*
	fz_drop_stext_sheet: Dispose of a style sheet.

	NOTE(review): presumably releases the sheet and every style it
	holds, and accepts sheets created by fz_new_stext_sheet - confirm
	against the implementation.
*/
void fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet);

/*
	fz_new_stext_page: Create an empty text page.

	The text page is filled out by the text device to contain the blocks,
	lines and spans of text on the page.

	mediabox: optional mediabox information.
*/
fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox);

/*
	fz_drop_stext_page: Dispose of a text page.

	NOTE(review): presumably releases the page together with the
	blocks, lines and spans it holds - confirm against the
	implementation.
*/
void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);

/*
	fz_analyze_text: Not documented in this header.

	NOTE(review): presumably performs layout analysis on 'page' using
	the styles in 'sheet' - confirm against the implementation.
*/
void fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page);

/*
	fz_print_stext_sheet: Output a text sheet as CSS.

	out: The output to write to.

	sheet: The text sheet to output.
*/
void fz_print_stext_sheet(fz_context *ctx, fz_output *out, fz_stext_sheet *sheet);

/*
	fz_print_stext_page_html: Output a page in HTML format.

	out: The output to write to.

	page: The text page to output.
*/
void fz_print_stext_page_html(fz_context *ctx, fz_output *out, fz_stext_page *page);

/*
	fz_print_stext_page_xml: Output a page in XML format.

	out: The output to write to.

	page: The text page to output.
*/
void fz_print_stext_page_xml(fz_context *ctx, fz_output *out, fz_stext_page *page);

/*
	fz_print_stext_page: Output a page in UTF-8 format.

	out: The output to write to.

	page: The text page to output.
*/
void fz_print_stext_page(fz_context *ctx, fz_output *out, fz_stext_page *page);

/*
	fz_search_stext_page: Search for occurrence of 'needle' in text page.

	Return the number of hits and store hit bboxes in the passed in array.

	text: The text page to search.

	needle: The string to search for.

	hit_bbox: The array in which to store the hit bboxes.

	hit_max: NOTE(review): presumably the number of entries available
	in hit_bbox - confirm.

	NOTE: This is an experimental interface and subject to change without notice.
*/
int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_rect *hit_bbox, int hit_max);

/*
	fz_highlight_selection: Return a list of rectangles to highlight given a selection rectangle.

	page: The text page the selection applies to.

	rect: The selection rectangle.

	hit_bbox: The array in which to store the rectangles.

	hit_max: NOTE(review): presumably the number of entries available
	in hit_bbox - confirm.

	NOTE(review): the return value is presumably the number of
	rectangles stored - confirm.

	NOTE: This is an experimental interface and subject to change without notice.
*/
int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max);

/*
	fz_copy_selection: Return a newly allocated UTF-8 string with the text for a given selection rectangle.

	page: The text page the selection applies to.

	rect: The selection rectangle.

	NOTE(review): the string is newly allocated, so the caller
	presumably owns it and must free it - confirm which function
	should be used to release it.

	NOTE: This is an experimental interface and subject to change without notice.
*/
char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect);

/*
	struct fz_stext_options: Options for creating a structured text
	device.
*/
typedef struct fz_stext_options_s fz_stext_options;

struct fz_stext_options_s
{
	int flags; /* NOTE(review): presumably a bitwise OR of FZ_STEXT_PRESERVE_* values - confirm */
};
/*
	fz_parse_stext_options: Parse stext device options from a comma separated key-value string.

	opts: The options structure to fill in.

	string: The comma separated key-value string to parse.

	NOTE(review): the return value is presumably opts - confirm
	against the implementation.
*/
fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);

/*
	fz_new_stext_device: Create a device to extract the text on a page.

	Gather and sort the text on a page into spans of uniform style,
	arranged into lines and blocks by reading order. The reading order
	is determined by various heuristics, so may not be accurate.

	sheet: The text sheet to which styles should be added. This can
	either be a newly created (empty) text sheet, or one containing
	styles from a previous text device. The same sheet cannot be used
	in multiple threads simultaneously.

	page: The text page to which content should be added. This will
	usually be a newly created (empty) text page, but it can be one
	containing data already (for example when merging multiple pages,
	or watermarking).

	options: Options to configure the stext device.

	Returns the newly created device.
*/
fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *options);

#endif