#ifndef MUPDF_FITZ_STRUCTURED_TEXT_H1#define MUPDF_FITZ_STRUCTURED_TEXT_H23#include "mupdf/fitz/system.h"4#include "mupdf/fitz/context.h"5#include "mupdf/fitz/math.h"6#include "mupdf/fitz/font.h"7#include "mupdf/fitz/colorspace.h"8#include "mupdf/fitz/image.h"9#include "mupdf/fitz/output.h"10#include "mupdf/fitz/device.h"1112/*13Text extraction device: Used for searching, format conversion etc.1415(In development - Subject to change in future versions)16*/1718typedef struct fz_text_style_s fz_text_style;19typedef struct fz_text_char_s fz_text_char;20typedef struct fz_text_span_s fz_text_span;21typedef struct fz_text_line_s fz_text_line;22typedef struct fz_text_block_s fz_text_block;23typedef struct fz_image_block_s fz_image_block;24typedef struct fz_page_block_s fz_page_block;2526typedef struct fz_text_sheet_s fz_text_sheet;27typedef struct fz_text_page_s fz_text_page;2829/*30fz_text_sheet: A text sheet contains a list of distinct text styles31used on a page (or a series of pages).32*/33struct fz_text_sheet_s34{35int maxid;36fz_text_style *style;37};3839/*40fz_text_style: A text style contains details of a distinct text style41used on a page.42*/43struct fz_text_style_s44{45fz_text_style *next;46int id;47fz_font *font;48float size;49int wmode;50int script;51/* Ascender and Descender only have the conventional sense in52* horizontal mode; in vertical mode they are rotated too - they are53* the maximum and minimum bounds respectively. */54float ascender;55float descender;56/* etc... */57};5859/*60fz_text_page: A text page is a list of page blocks, together with61an overall bounding box.62*/63struct fz_text_page_s64{65fz_rect mediabox;66int len, cap;67fz_page_block *blocks;68fz_text_page *next;69};7071/*72fz_page_block: A page block is a typed block pointer.73*/74struct fz_page_block_s75{76int type;77union78{79fz_text_block *text;80fz_image_block *image;81} u;82};8384enum85{86FZ_PAGE_BLOCK_TEXT = 0,87FZ_PAGE_BLOCK_IMAGE = 188};8990/*91fz_text_block: A text block is a list of lines of text. In typical92cases this may correspond to a paragraph or a column of text. A93collection of blocks makes up a page.94*/95struct fz_text_block_s96{97fz_rect bbox;98int len, cap;99fz_text_line *lines;100};101102/*103fz_image_block: An image block is an image, together with the list of lines of text. In typical104cases this may correspond to a paragraph or a column of text. A105collection of blocks makes up a page.106*/107struct fz_image_block_s108{109fz_rect bbox;110fz_matrix mat;111fz_image *image;112fz_colorspace *cspace;113float colors[FZ_MAX_COLORS];114};115116/*117fz_text_line: A text line is a list of text spans, with the same118baseline. In typical cases this should correspond (as expected) to119complete lines of text. A collection of lines makes up a block.120*/121struct fz_text_line_s122{123fz_text_span *first_span, *last_span;124125/* Cached information */126float distance; /* Perpendicular distance from previous line */127fz_rect bbox;128void *region; /* Opaque value for matching line masks */129};130131/*132fz_text_span: A text span is a list of characters that share a common133baseline/transformation. In typical cases a single span may be enough134to represent a complete line. In cases where the text has big gaps in135it (perhaps as it crosses columns or tables), a line may be represented136by multiple spans.137*/138struct fz_text_span_s139{140int len, cap;141fz_text_char *text;142fz_point min; /* Device space */143fz_point max; /* Device space */144int wmode; /* 0 for horizontal, 1 for vertical */145fz_matrix transform; /* e and f are always 0 here */146/* Ascender_max and Descender_min only have the conventional sense in147* horizontal mode; in vertical mode they are rotated too - they are148* the maximum and minimum bounds respectively. */149float ascender_max; /* Document space */150float descender_min; /* Document space */151fz_rect bbox; /* Device space */152153/* Cached information */154float base_offset; /* Perpendicular distance from baseline of line */155float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */156int column; /* If non zero, the column that it's in */157float column_width; /* Percentage */158int align; /* 0 = left, 1 = centre, 2 = right */159float indent; /* The indent position for this column. */160161fz_text_span *next;162};163164/*165fz_text_char: A text char is a unicode character, the style in which166is appears, and the point at which it is positioned. Transform167(and hence bbox) information is given by the enclosing span.168*/169struct fz_text_char_s170{171fz_point p; /* Device space */172int c;173fz_text_style *style;174};175176typedef struct fz_char_and_box_s fz_char_and_box;177178struct fz_char_and_box_s179{180int c;181fz_rect bbox;182};183184fz_char_and_box *fz_text_char_at(fz_context *ctx, fz_char_and_box *cab, fz_text_page *page, int idx);185186/*187fz_text_char_bbox: Return the bbox of a text char. Calculated from188the supplied enclosing span.189190bbox: A place to store the bbox191192span: The enclosing span193194idx: The index of the char within the span195196Returns bbox (updated)197198Does not throw exceptions199*/200fz_rect *fz_text_char_bbox(fz_context *ctx, fz_rect *bbox, fz_text_span *span, int idx);201202/*203fz_new_text_sheet: Create an empty style sheet.204205The style sheet is filled out by the text device, creating206one style for each unique font, color, size combination that207is used.208*/209fz_text_sheet *fz_new_text_sheet(fz_context *ctx);210void fz_drop_text_sheet(fz_context *ctx, fz_text_sheet *sheet);211212/*213fz_new_text_page: Create an empty text page.214215The text page is filled out by the text device to contain the blocks,216lines and spans of text on the page.217*/218fz_text_page *fz_new_text_page(fz_context *ctx);219void fz_drop_text_page(fz_context *ctx, fz_text_page *page);220221void fz_analyze_text(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);222223/*224fz_print_text_sheet: Output a text sheet to a file as CSS.225*/226void fz_print_text_sheet(fz_context *ctx, fz_output *out, fz_text_sheet *sheet);227228/*229fz_print_text_page_html: Output a page to a file in HTML format.230*/231void fz_print_text_page_html(fz_context *ctx, fz_output *out, fz_text_page *page);232233/*234fz_print_text_page_xml: Output a page to a file in XML format.235*/236void fz_print_text_page_xml(fz_context *ctx, fz_output *out, fz_text_page *page);237238/*239fz_print_text_page: Output a page to a file in UTF-8 format.240*/241void fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page);242243/*244fz_search_text_page: Search for occurrence of 'needle' in text page.245246Return the number of hits and store hit bboxes in the passed in array.247248NOTE: This is an experimental interface and subject to change without notice.249*/250int fz_search_text_page(fz_context *ctx, fz_text_page *text, const char *needle, fz_rect *hit_bbox, int hit_max);251252/*253fz_highlight_selection: Return a list of rectangles to highlight given a selection rectangle.254255NOTE: This is an experimental interface and subject to change without notice.256*/257int fz_highlight_selection(fz_context *ctx, fz_text_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max);258259/*260fz_copy_selection: Return a newly allocated UTF-8 string with the text for a given selection rectangle.261262NOTE: This is an experimental interface and subject to change without notice.263*/264char *fz_copy_selection(fz_context *ctx, fz_text_page *page, fz_rect rect);265266/*267fz_new_text_device: Create a device to extract the text on a page.268269Gather and sort the text on a page into spans of uniform style,270arranged into lines and blocks by reading order. The reading order271is determined by various heuristics, so may not be accurate.272273sheet: The text sheet to which styles should be added. This can274either be a newly created (empty) text sheet, or one containing275styles from a previous text device. The same sheet cannot be used276in multiple threads simultaneously.277278page: The text page to which content should be added. This will279usually be a newly created (empty) text page, but it can be one280containing data already (for example when merging multiple pages, or281watermarking).282*/283fz_device *fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);284285#endif286287288