summaryrefslogtreecommitdiff
path: root/public/fpdf_text.h
blob: d37715f6b89161f7e032b23a58861801e8c4855a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
 
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

#ifndef _FPDFTEXT_H_
#define _FPDFTEXT_H_

#include "fpdfview.h"

// Exported Functions
#ifdef __cplusplus
extern "C" {
#endif

// Function: FPDFText_LoadPage
//          Build the information about every character in a page.
// Parameters:
//          page        -   Page handle, as returned by FPDF_LoadPage
//                          (FPDFVIEW module).
// Return value:
//          Handle to the text page information structure, or NULL if
//          something goes wrong.
// Comments:
//          The application must call FPDFText_ClosePage to release the
//          text page information.
//
DLLEXPORT FPDF_TEXTPAGE STDCALL
FPDFText_LoadPage(FPDF_PAGE page);

// Function: FPDFText_ClosePage
//          Free every resource held by a text page information structure.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
// Return value:
//          None.
//
DLLEXPORT void STDCALL
FPDFText_ClosePage(FPDF_TEXTPAGE text_page);

// Function: FPDFText_CountChars
//          Report how many characters a page contains.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
// Return value:
//          The character count of the page, or -1 on error.
//          Generated characters (for example additional spaces and new
//          lines) are included in the count.
// Comments:
//          The characters of a page form a "stream" in which every character
//          has an index. Many FPDFTEXT functions take such an index as a
//          parameter. The first character of the page has index zero.
//
DLLEXPORT int STDCALL
FPDFText_CountChars(FPDF_TEXTPAGE text_page);

// Function: FPDFText_GetUnicode
//          Fetch the Unicode value of one character in a page.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          index       -   Zero-based index of the character.
// Return value:
//          The Unicode value of the requested character.
//          Zero is returned when the character is not encoded in Unicode
//          and the engine cannot convert it to Unicode.
//
DLLEXPORT unsigned int STDCALL
FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index);

// Function: FPDFText_GetFontSize
//          Fetch the font size used by one character.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          index       -   Zero-based index of the character.
// Return value:
//          Font size of the character in points (about 1/72 inch). This is
//          the typographic size of the font, the so-called "em size".
//
DLLEXPORT double STDCALL
FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, int index);

// Function: FPDFText_GetCharBox
//          Fetch the bounding box of one character.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          index       -   Zero-based index of the character.
//          left        -   Receives the left position of the character box.
//          right       -   Receives the right position of the character box.
//          bottom      -   Receives the bottom position of the character box.
//          top         -   Receives the top position of the character box.
// Return value:
//          None.
// Comments:
//          Every position is expressed in PDF "user space".
//          Note the output order here: left, right, bottom, top.
//
DLLEXPORT void STDCALL
FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
                    int index,
                    double* left,
                    double* right,
                    double* bottom,
                    double* top);

// Function: FPDFText_GetCharIndexAtPos
//          Find the index of the character at, or near, a position on the page.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          x           -   X position in PDF "user space".
//          y           -   Y position in PDF "user space".
//          xTolerance  -   Tolerance along the x-axis for character hit
//                          detection, in points.
//          yTolerance  -   Tolerance along the y-axis for character hit
//                          detection, in points.
// Return value:
//          Zero-based index of the character at or near the point (x, y).
//          -1 if no character is at or near the point.
//          -3 if an error occurs.
//
DLLEXPORT int STDCALL
FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
                           double x,
                           double y,
                           double xTolerance,
                           double yTolerance);

// Function: FPDFText_GetText
//          Copy a run of the page's text out as a unicode string.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          start_index -   Index of the first character to extract.
//          count       -   Number of characters to extract.
//          result      -   Application-allocated buffer that receives the
//                          extracted unicodes. It must be large enough for
//                          the requested characters plus a terminator.
// Return value:
//          Number of characters written to the result buffer, counting the
//          trailing terminator.
// Comments:
//          Characters that carry no unicode information are ignored.
//
DLLEXPORT int STDCALL
FPDFText_GetText(FPDF_TEXTPAGE text_page,
                 int start_index,
                 int count,
                 unsigned short* result);

// Function: FPDFText_CountRects
//          Count the rectangular areas covered by a segment of text.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          start_index -   Index of the first character of the segment.
//          count       -   Number of characters in the segment.
// Return value:
//          Number of rectangles, or zero on error.
// Comments:
//          Together with FPDFText_GetRect, this lets an application locate a
//          text segment on the page, for example to highlight it.
//          FPDFTEXT automatically merges small character boxes into a bigger
//          one when the characters are on the same line and use the same
//          font settings.
//
DLLEXPORT int STDCALL
FPDFText_CountRects(FPDF_TEXTPAGE text_page, int start_index, int count);

// Function: FPDFText_GetRect
//          Fetch one rectangle from the result produced by FPDFText_CountRects.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          rect_index  -   Zero-based index of the rectangle.
//          left        -   Receives the left boundary of the rectangle.
//          top         -   Receives the top boundary of the rectangle.
//          right       -   Receives the right boundary of the rectangle.
//          bottom      -   Receives the bottom boundary of the rectangle.
// Return value:
//          None.
//
DLLEXPORT void STDCALL
FPDFText_GetRect(FPDF_TEXTPAGE text_page,
                 int rect_index,
                 double* left,
                 double* top,
                 double* right,
                 double* bottom);

// Function: FPDFText_GetBoundedText
//          Extract the unicode text lying inside a rectangle on the page.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          left        -   Left boundary.
//          top         -   Top boundary.
//          right       -   Right boundary.
//          bottom      -   Bottom boundary.
//          buffer      -   A unicode buffer.
//          buflen      -   Size of the buffer in characters (not bytes),
//                          excluding an additional terminator.
// Return value:
//          When buffer is NULL or buflen is zero: the number of characters
//          (not bytes) of text inside the rectangle, not counting a
//          terminating NUL. To receive a terminating NUL, pass a buffer at
//          least one character larger than that; the NUL is written
//          whenever there is room for it.
//          Otherwise: the number of characters copied into the buffer,
//          counting the terminating NUL when there was room for it.
// Comments:
//          If the buffer is too small, as much text as fits is copied into it.
//
DLLEXPORT int STDCALL
FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
                        double left,
                        double top,
                        double right,
                        double bottom,
                        unsigned short* buffer,
                        int buflen);


// Option flags for the `flags` argument of FPDFText_FindStart.

// When this bit is not set, the search does not match case (the default).
#define FPDF_MATCHCASE 0x00000001

// When this bit is not set, the search does not match whole words only
// (the default).
#define FPDF_MATCHWHOLEWORD 0x00000002

// Function: FPDFText_FindStart
//          Begin a search.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
//          findwhat    -   The unicode pattern to match.
//          flags       -   Option flags.
//          start_index -   Character index to start from; -1 means the end
//                          of the page.
// Return value:
//          Handle to the search context. Release it with FPDFText_FindClose.
//
DLLEXPORT FPDF_SCHHANDLE STDCALL
FPDFText_FindStart(FPDF_TEXTPAGE text_page,
                   FPDF_WIDESTRING findwhat,
                   unsigned long flags,
                   int start_index);

// Function: FPDFText_FindNext
//          Search forward, from the start of the page towards its end.
// Parameters:
//          handle      -   Search context handle, as returned by
//                          FPDFText_FindStart.
// Return value:
//          Whether a match was found.
//
DLLEXPORT FPDF_BOOL STDCALL
FPDFText_FindNext(FPDF_SCHHANDLE handle);

// Function: FPDFText_FindPrev
//          Search backward, from the end of the page towards its start.
// Parameters:
//          handle      -   Search context handle, as returned by
//                          FPDFText_FindStart.
// Return value:
//          Whether a match was found.
//
DLLEXPORT FPDF_BOOL STDCALL
FPDFText_FindPrev(FPDF_SCHHANDLE handle);

// Function: FPDFText_GetSchResultIndex
//          Fetch the index of the first character of the search result.
// Parameters:
//          handle      -   Search context handle, as returned by
//                          FPDFText_FindStart.
// Return value:
//          Index of the starting character.
//
DLLEXPORT int STDCALL
FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle);

// Function: FPDFText_GetSchCount
//          Fetch how many characters the search result matched.
// Parameters:
//          handle      -   Search context handle, as returned by
//                          FPDFText_FindStart.
// Return value:
//          Number of matched characters.
//
DLLEXPORT int STDCALL
FPDFText_GetSchCount(FPDF_SCHHANDLE handle);

// Function: FPDFText_FindClose
//          Free a search context.
// Parameters:
//          handle      -   Search context handle, as returned by
//                          FPDFText_FindStart.
// Return value:
//          None.
//
DLLEXPORT void STDCALL
FPDFText_FindClose(FPDF_SCHHANDLE handle);

// Function: FPDFLink_LoadWebLinks
//          Build the information about the weblinks in a page.
// Parameters:
//          text_page   -   Text page handle, as returned by FPDFText_LoadPage.
// Return value:
//          Handle to the page's links information structure, or NULL if
//          something goes wrong.
// Comments:
//          Weblinks are links implicitly embedded in the page contents. PDF
//          also has an annotation type called "link"; FPDFTEXT does not deal
//          with that kind of link.
//          The weblink feature is useful for automatically detecting links
//          in the page contents. Text such as "http://www.foxitsoftware.com"
//          is detected, so an application can let the user click on those
//          characters to activate the link even if the PDF does not come
//          with link annotations.
//
//          FPDFLink_CloseWebLinks must be called to release resources.
//
DLLEXPORT FPDF_PAGELINK STDCALL
FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page);

// Function: FPDFLink_CountWebLinks
//          Report how many web links were detected.
// Parameters:
//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
// Return value:
//          Number of detected web links.
//
DLLEXPORT int STDCALL
FPDFLink_CountWebLinks(FPDF_PAGELINK link_page);

// Function: FPDFLink_GetURL
//          Retrieve the URL information of a detected web link.
// Parameters:
//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
//          link_index  -   Zero-based index of the link.
//          buffer      -   A unicode buffer.
//          buflen      -   Size of the buffer in characters (not bytes),
//                          including an additional terminator.
// Return value:
//          When buffer is NULL or buflen is zero: the number of characters
//          needed (characters, not bytes; the additional terminator is
//          counted).
//          Otherwise: the number of characters copied into the buffer.
//
DLLEXPORT int STDCALL
FPDFLink_GetURL(FPDF_PAGELINK link_page,
                int link_index,
                unsigned short* buffer,
                int buflen);

// Function: FPDFLink_CountRects
//          Count the rectangular areas that make up a link.
// Parameters:
//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
//          link_index  -   Zero-based index of the link.
// Return value:
//          Number of rectangular areas for the link.
//
DLLEXPORT int STDCALL
FPDFLink_CountRects(FPDF_PAGELINK link_page, int link_index);

// Function: FPDFLink_GetRect
//          Retrieve the boundaries of one rectangle of a link.
// Parameters:
//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
//          link_index  -   Zero-based index of the link.
//          rect_index  -   Zero-based index of the rectangle.
//          left        -   Receives the left boundary of the rectangle.
//          top         -   Receives the top boundary of the rectangle.
//          right       -   Receives the right boundary of the rectangle.
//          bottom      -   Receives the bottom boundary of the rectangle.
// Return value:
//          None.
//
DLLEXPORT void STDCALL
FPDFLink_GetRect(FPDF_PAGELINK link_page,
                 int link_index,
                 int rect_index,
                 double* left,
                 double* top,
                 double* right,
                 double* bottom);

// Function: FPDFLink_CloseWebLinks
//          Free the resources used by the weblink feature.
// Parameters:
//          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
// Return value:
//          None.
//
DLLEXPORT void STDCALL
FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page);


// Close the extern "C" linkage block opened near the top of this header.
// The closing brace takes no semicolon: a ';' here is an extra empty
// declaration that some C++ compilers warn about in pedantic mode.
#ifdef __cplusplus
}  // extern "C"
#endif

#endif//_FPDFTEXT_H_