Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
7858 views
1
#ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
2
#define MUPDF_FITZ_STRUCTURED_TEXT_H
3
4
#include "mupdf/fitz/system.h"
5
#include "mupdf/fitz/context.h"
6
#include "mupdf/fitz/math.h"
7
#include "mupdf/fitz/font.h"
8
#include "mupdf/fitz/colorspace.h"
9
#include "mupdf/fitz/image.h"
10
#include "mupdf/fitz/output.h"
11
#include "mupdf/fitz/device.h"
12
13
/*
	Text extraction device: Used for searching, format conversion etc.

	(In development - Subject to change in future versions)
*/
18
19
/* Forward declarations of the structured-text types used in this header. */
typedef struct fz_text_style_s fz_text_style;
typedef struct fz_text_char_s fz_text_char;
typedef struct fz_text_span_s fz_text_span;
typedef struct fz_text_line_s fz_text_line;
typedef struct fz_text_block_s fz_text_block;
typedef struct fz_image_block_s fz_image_block;
typedef struct fz_page_block_s fz_page_block;

typedef struct fz_text_sheet_s fz_text_sheet;
typedef struct fz_text_page_s fz_text_page;
29
30
/*
31
fz_text_sheet: A text sheet contains a list of distinct text styles
32
used on a page (or a series of pages).
33
*/
34
struct fz_text_sheet_s
35
{
36
int maxid;
37
fz_text_style *style;
38
};
39
40
/*
41
fz_text_style: A text style contains details of a distinct text style
42
used on a page.
43
*/
44
struct fz_text_style_s
45
{
46
fz_text_style *next;
47
int id;
48
fz_font *font;
49
float size;
50
int wmode;
51
int script;
52
/* Ascender and Descender only have the conventional sense in
53
* horizontal mode; in vertical mode they are rotated too - they are
54
* the maximum and minimum bounds respectively. */
55
float ascender;
56
float descender;
57
/* etc... */
58
};
59
60
/*
61
fz_text_page: A text page is a list of page blocks, together with
62
an overall bounding box.
63
*/
64
struct fz_text_page_s
65
{
66
fz_rect mediabox;
67
int len, cap;
68
fz_page_block *blocks;
69
fz_text_page *next;
70
};
71
72
/*
73
fz_page_block: A page block is a typed block pointer.
74
*/
75
struct fz_page_block_s
76
{
77
int type;
78
union
79
{
80
fz_text_block *text;
81
fz_image_block *image;
82
} u;
83
};
84
85
/* Page block type tags: one for text blocks, one for image blocks. */
enum
{
	FZ_PAGE_BLOCK_TEXT = 0,
	FZ_PAGE_BLOCK_IMAGE = 1
};
90
91
/*
92
fz_text_block: A text block is a list of lines of text. In typical
93
cases this may correspond to a paragraph or a column of text. A
94
collection of blocks makes up a page.
95
*/
96
struct fz_text_block_s
97
{
98
fz_rect bbox;
99
int len, cap;
100
fz_text_line *lines;
101
};
102
103
/*
104
fz_image_block: An image block is an image, together with the list of lines of text. In typical
105
cases this may correspond to a paragraph or a column of text. A
106
collection of blocks makes up a page.
107
*/
108
struct fz_image_block_s
109
{
110
fz_rect bbox;
111
fz_matrix mat;
112
fz_image *image;
113
fz_colorspace *cspace;
114
float colors[FZ_MAX_COLORS];
115
};
116
117
/*
118
fz_text_line: A text line is a list of text spans, with the same
119
baseline. In typical cases this should correspond (as expected) to
120
complete lines of text. A collection of lines makes up a block.
121
*/
122
struct fz_text_line_s
123
{
124
fz_text_span *first_span, *last_span;
125
126
/* Cached information */
127
float distance; /* Perpendicular distance from previous line */
128
fz_rect bbox;
129
void *region; /* Opaque value for matching line masks */
130
};
131
132
/*
133
fz_text_span: A text span is a list of characters that share a common
134
baseline/transformation. In typical cases a single span may be enough
135
to represent a complete line. In cases where the text has big gaps in
136
it (perhaps as it crosses columns or tables), a line may be represented
137
by multiple spans.
138
*/
139
struct fz_text_span_s
140
{
141
int len, cap;
142
fz_text_char *text;
143
fz_point min; /* Device space */
144
fz_point max; /* Device space */
145
int wmode; /* 0 for horizontal, 1 for vertical */
146
fz_matrix transform; /* e and f are always 0 here */
147
/* Ascender_max and Descender_min only have the conventional sense in
148
* horizontal mode; in vertical mode they are rotated too - they are
149
* the maximum and minimum bounds respectively. */
150
float ascender_max; /* Document space */
151
float descender_min; /* Document space */
152
fz_rect bbox; /* Device space */
153
154
/* Cached information */
155
float base_offset; /* Perpendicular distance from baseline of line */
156
float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */
157
int column; /* If non zero, the column that it's in */
158
float column_width; /* Percentage */
159
int align; /* 0 = left, 1 = centre, 2 = right */
160
float indent; /* The indent position for this column. */
161
162
fz_text_span *next;
163
};
164
165
/*
166
fz_text_char: A text char is a unicode character, the style in which
167
is appears, and the point at which it is positioned. Transform
168
(and hence bbox) information is given by the enclosing span.
169
*/
170
struct fz_text_char_s
171
{
172
fz_point p; /* Device space */
173
int c;
174
fz_text_style *style;
175
};
176
177
typedef struct fz_char_and_box_s fz_char_and_box;
178
179
struct fz_char_and_box_s
180
{
181
int c;
182
fz_rect bbox;
183
};
184
185
fz_char_and_box *fz_text_char_at(fz_context *ctx, fz_char_and_box *cab, fz_text_page *page, int idx);
186
187
/*
188
fz_text_char_bbox: Return the bbox of a text char. Calculated from
189
the supplied enclosing span.
190
191
bbox: A place to store the bbox
192
193
span: The enclosing span
194
195
idx: The index of the char within the span
196
197
Returns bbox (updated)
198
199
Does not throw exceptions
200
*/
201
fz_rect *fz_text_char_bbox(fz_context *ctx, fz_rect *bbox, fz_text_span *span, int idx);
202
203
/*
204
fz_new_text_sheet: Create an empty style sheet.
205
206
The style sheet is filled out by the text device, creating
207
one style for each unique font, color, size combination that
208
is used.
209
*/
210
fz_text_sheet *fz_new_text_sheet(fz_context *ctx);
211
void fz_drop_text_sheet(fz_context *ctx, fz_text_sheet *sheet);
212
213
/*
214
fz_new_text_page: Create an empty text page.
215
216
The text page is filled out by the text device to contain the blocks,
217
lines and spans of text on the page.
218
*/
219
fz_text_page *fz_new_text_page(fz_context *ctx);
220
void fz_drop_text_page(fz_context *ctx, fz_text_page *page);
221
222
/*
	fz_analyze_text: NOTE(review): not documented in this header.
	Presumably runs layout analysis over 'page' using the styles in
	'sheet' - confirm against the implementation.
*/
void fz_analyze_text(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);
223
224
/*
225
fz_print_text_sheet: Output a text sheet to a file as CSS.
226
*/
227
void fz_print_text_sheet(fz_context *ctx, fz_output *out, fz_text_sheet *sheet);
228
229
/*
230
fz_print_text_page_html: Output a page to a file in HTML format.
231
*/
232
void fz_print_text_page_html(fz_context *ctx, fz_output *out, fz_text_page *page);
233
234
/*
235
fz_print_text_page_xml: Output a page to a file in XML format.
236
*/
237
void fz_print_text_page_xml(fz_context *ctx, fz_output *out, fz_text_page *page);
238
239
/*
240
fz_print_text_page: Output a page to a file in UTF-8 format.
241
*/
242
void fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page);
243
244
/*
245
fz_search_text_page: Search for occurrence of 'needle' in text page.
246
247
Return the number of hits and store hit bboxes in the passed in array.
248
249
NOTE: This is an experimental interface and subject to change without notice.
250
*/
251
int fz_search_text_page(fz_context *ctx, fz_text_page *text, const char *needle, fz_rect *hit_bbox, int hit_max);
252
253
/*
254
fz_highlight_selection: Return a list of rectangles to highlight given a selection rectangle.
255
256
NOTE: This is an experimental interface and subject to change without notice.
257
*/
258
int fz_highlight_selection(fz_context *ctx, fz_text_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max);
259
260
/*
261
fz_copy_selection: Return a newly allocated UTF-8 string with the text for a given selection rectangle.
262
263
NOTE: This is an experimental interface and subject to change without notice.
264
*/
265
char *fz_copy_selection(fz_context *ctx, fz_text_page *page, fz_rect rect);
266
267
/*
268
fz_new_text_device: Create a device to extract the text on a page.
269
270
Gather and sort the text on a page into spans of uniform style,
271
arranged into lines and blocks by reading order. The reading order
272
is determined by various heuristics, so may not be accurate.
273
274
sheet: The text sheet to which styles should be added. This can
275
either be a newly created (empty) text sheet, or one containing
276
styles from a previous text device. The same sheet cannot be used
277
in multiple threads simultaneously.
278
279
page: The text page to which content should be added. This will
280
usually be a newly created (empty) text page, but it can be one
281
containing data already (for example when merging multiple pages, or
282
watermarking).
283
*/
284
fz_device *fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);
285
286
#endif
287
288