/*
 * Source: GitHub repository wine-mirror/wine,
 * path blob/master/libs/xml2/include/libxml/HTMLparser.h
 */
/*
 * Summary: interface for an HTML 4.0 non-verifying parser
 * Description: this module implements an HTML 4.0 non-verifying parser
 *              with API compatible with the XML parser ones. It should
 *              be able to parse "real world" HTML, even if severely
 *              broken from a specification point of view.
 *
 * Copy: See Copyright for the status of this software.
 *
 * Author: Daniel Veillard
 */

#ifndef __HTML_PARSER_H__
#define __HTML_PARSER_H__
#include <libxml/xmlversion.h>
#include <libxml/parser.h>

/*
 * The declarations below are only provided when HTML support is
 * enabled; the matching #else branch is at the end of this header.
 */
#ifdef LIBXML_HTML_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

/*
25
* Most of the back-end structures from XML and HTML are shared.
26
*/
27
typedef xmlParserCtxt htmlParserCtxt;
28
typedef xmlParserCtxtPtr htmlParserCtxtPtr;
29
typedef xmlParserNodeInfo htmlParserNodeInfo;
30
typedef xmlSAXHandler htmlSAXHandler;
31
typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
32
typedef xmlParserInput htmlParserInput;
33
typedef xmlParserInputPtr htmlParserInputPtr;
34
typedef xmlDocPtr htmlDocPtr;
35
typedef xmlNodePtr htmlNodePtr;
36
37
/*
 * Internal description of an HTML element, representing HTML 4.01
 * and XHTML 1.0 (which share the same structure).
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;       /* The tag name */
    char startTag;          /* Whether the start tag can be implied */
    char endTag;            /* Whether the end tag can be implied */
    char saveEndTag;        /* Whether the end tag should be saved */
    char empty;             /* Is this an empty element ? */
    char depr;              /* Is this a deprecated element ? */
    char dtd;               /* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;          /* is this a block 0 or inline 1 element */
    const char *desc;       /* the description */

    /* NRK Jan.2003
     * New fields encapsulating HTML structure
     *
     * Bugs:
     *	This is a very limited representation.  It fails to tell us when
     *	an element *requires* subelements (we only have whether they're
     *	allowed or not), and it doesn't tell us where CDATA and PCDATA
     *	are allowed.  Some element relationships are not fully represented:
     *	these are flagged with the word MODIFIER
     */
    const char **subelts;       /* allowed sub-elements of this element */
    const char *defaultsubelt;  /* subelement for suggested auto-repair
                                   if necessary or NULL */
    const char **attrs_opt;     /* Optional Attributes */
    const char **attrs_depr;    /* Additional deprecated attributes */
    const char **attrs_req;     /* Required attributes */
};

/*
 * Internal description of an HTML entity.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int value;     /* the UNICODE value for the character */
    const char *name;       /* The entity name */
    const char *desc;       /* the description */
};

/** DOC_DISABLE */
/*
 * XML_GLOBALS_HTML lists this module's globals as XML_OP() entries.
 * The list is empty unless LIBXML_SAX1_ENABLED is defined.
 */
#ifdef LIBXML_SAX1_ENABLED
  #define XML_GLOBALS_HTML \
    XML_OP(htmlDefaultSAXHandler, xmlSAXHandlerV1, XML_DEPRECATED)
#else
  #define XML_GLOBALS_HTML
#endif

/* Expand the list once, with XML_OP temporarily mapped to XML_DECLARE_GLOBAL. */
#define XML_OP XML_DECLARE_GLOBAL
XML_GLOBALS_HTML
#undef XML_OP

/* With threads enabled, the global's name is routed through XML_GLOBAL_MACRO. */
#if defined(LIBXML_THREAD_ENABLED) && !defined(XML_GLOBALS_NO_REDEFINITION)
  #define htmlDefaultSAXHandler XML_GLOBAL_MACRO(htmlDefaultSAXHandler)
#endif
/** DOC_ENABLE */

/*
101
* There is only few public functions.
102
*/
103
XML_DEPRECATED
104
XMLPUBFUN void
105
htmlInitAutoClose (void);
106
XMLPUBFUN const htmlElemDesc *
107
htmlTagLookup (const xmlChar *tag);
108
XMLPUBFUN const htmlEntityDesc *
109
htmlEntityLookup(const xmlChar *name);
110
XMLPUBFUN const htmlEntityDesc *
111
htmlEntityValueLookup(unsigned int value);
112
113
XMLPUBFUN int
114
htmlIsAutoClosed(htmlDocPtr doc,
115
htmlNodePtr elem);
116
XMLPUBFUN int
117
htmlAutoCloseTag(htmlDocPtr doc,
118
const xmlChar *name,
119
htmlNodePtr elem);
120
XML_DEPRECATED
121
XMLPUBFUN const htmlEntityDesc *
122
htmlParseEntityRef(htmlParserCtxtPtr ctxt,
123
const xmlChar **str);
124
XML_DEPRECATED
125
XMLPUBFUN int
126
htmlParseCharRef(htmlParserCtxtPtr ctxt);
127
XML_DEPRECATED
128
XMLPUBFUN void
129
htmlParseElement(htmlParserCtxtPtr ctxt);
130
131
XMLPUBFUN htmlParserCtxtPtr
132
htmlNewParserCtxt(void);
133
XMLPUBFUN htmlParserCtxtPtr
134
htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
135
void *userData);
136
137
XMLPUBFUN htmlParserCtxtPtr
138
htmlCreateMemoryParserCtxt(const char *buffer,
139
int size);
140
141
XMLPUBFUN int
142
htmlParseDocument(htmlParserCtxtPtr ctxt);
143
XML_DEPRECATED
144
XMLPUBFUN htmlDocPtr
145
htmlSAXParseDoc (const xmlChar *cur,
146
const char *encoding,
147
htmlSAXHandlerPtr sax,
148
void *userData);
149
XMLPUBFUN htmlDocPtr
150
htmlParseDoc (const xmlChar *cur,
151
const char *encoding);
152
XMLPUBFUN htmlParserCtxtPtr
153
htmlCreateFileParserCtxt(const char *filename,
154
const char *encoding);
155
XML_DEPRECATED
156
XMLPUBFUN htmlDocPtr
157
htmlSAXParseFile(const char *filename,
158
const char *encoding,
159
htmlSAXHandlerPtr sax,
160
void *userData);
161
XMLPUBFUN htmlDocPtr
162
htmlParseFile (const char *filename,
163
const char *encoding);
164
XMLPUBFUN int
165
UTF8ToHtml (unsigned char *out,
166
int *outlen,
167
const unsigned char *in,
168
int *inlen);
169
XMLPUBFUN int
170
htmlEncodeEntities(unsigned char *out,
171
int *outlen,
172
const unsigned char *in,
173
int *inlen, int quoteChar);
174
XMLPUBFUN int
175
htmlIsScriptAttribute(const xmlChar *name);
176
XMLPUBFUN int
177
htmlHandleOmittedElem(int val);
178
179
#ifdef LIBXML_PUSH_ENABLED
/**
 * Interfaces for the Push mode.
 * Only declared when LIBXML_PUSH_ENABLED is defined.
 */
XMLPUBFUN htmlParserCtxtPtr
    htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                             void *user_data,
                             const char *chunk,
                             int size,
                             const char *filename,
                             xmlCharEncoding enc);
XMLPUBFUN int
    htmlParseChunk(htmlParserCtxtPtr ctxt,
                   const char *chunk,
                   int size,
                   int terminate);
#endif /* LIBXML_PUSH_ENABLED */

XMLPUBFUN void
198
htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
199
200
/*
 * New set of simpler/more flexible APIs
 */
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls.
 *
 * Each option is a distinct bit so values can be OR-ed together into
 * the int "options" argument. The bit positions are sparse
 * (NOTE(review): presumably kept aligned with the XML parser's option
 * bits -- confirm against libxml/parser.h).
 */
typedef enum {
    HTML_PARSE_RECOVER    = 1<<0,   /* Relaxed parsing */
    HTML_PARSE_NODEFDTD   = 1<<2,   /* do not default a doctype if not found */
    HTML_PARSE_NOERROR    = 1<<5,   /* suppress error reports */
    HTML_PARSE_NOWARNING  = 1<<6,   /* suppress warning reports */
    HTML_PARSE_PEDANTIC   = 1<<7,   /* pedantic error reporting */
    HTML_PARSE_NOBLANKS   = 1<<8,   /* remove blank nodes */
    HTML_PARSE_NONET      = 1<<11,  /* Forbid network access */
    HTML_PARSE_NOIMPLIED  = 1<<13,  /* Do not add implied html/body... elements */
    HTML_PARSE_COMPACT    = 1<<16,  /* compact small text nodes */
    HTML_PARSE_IGNORE_ENC = 1<<21   /* ignore internal document encoding hint */
} htmlParserOption;

XMLPUBFUN void
223
htmlCtxtReset (htmlParserCtxtPtr ctxt);
224
XMLPUBFUN int
225
htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
226
int options);
227
XMLPUBFUN htmlDocPtr
228
htmlReadDoc (const xmlChar *cur,
229
const char *URL,
230
const char *encoding,
231
int options);
232
XMLPUBFUN htmlDocPtr
233
htmlReadFile (const char *URL,
234
const char *encoding,
235
int options);
236
XMLPUBFUN htmlDocPtr
237
htmlReadMemory (const char *buffer,
238
int size,
239
const char *URL,
240
const char *encoding,
241
int options);
242
XMLPUBFUN htmlDocPtr
243
htmlReadFd (int fd,
244
const char *URL,
245
const char *encoding,
246
int options);
247
XMLPUBFUN htmlDocPtr
248
htmlReadIO (xmlInputReadCallback ioread,
249
xmlInputCloseCallback ioclose,
250
void *ioctx,
251
const char *URL,
252
const char *encoding,
253
int options);
254
XMLPUBFUN htmlDocPtr
255
htmlCtxtReadDoc (xmlParserCtxtPtr ctxt,
256
const xmlChar *cur,
257
const char *URL,
258
const char *encoding,
259
int options);
260
XMLPUBFUN htmlDocPtr
261
htmlCtxtReadFile (xmlParserCtxtPtr ctxt,
262
const char *filename,
263
const char *encoding,
264
int options);
265
XMLPUBFUN htmlDocPtr
266
htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,
267
const char *buffer,
268
int size,
269
const char *URL,
270
const char *encoding,
271
int options);
272
XMLPUBFUN htmlDocPtr
273
htmlCtxtReadFd (xmlParserCtxtPtr ctxt,
274
int fd,
275
const char *URL,
276
const char *encoding,
277
int options);
278
XMLPUBFUN htmlDocPtr
279
htmlCtxtReadIO (xmlParserCtxtPtr ctxt,
280
xmlInputReadCallback ioread,
281
xmlInputCloseCallback ioclose,
282
void *ioctx,
283
const char *URL,
284
const char *encoding,
285
int options);
286
287
/* NRK/Jan2003: further knowledge of HTML structure
 *
 * Status values returned by the htmlStatus-typed checks declared in
 * this header. HTML_REQUIRED (0xc) includes the HTML_VALID bit (0x4),
 * so a required item also tests as valid with "& HTML_VALID".
 */
typedef enum {
    HTML_NA         = 0,    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0xc   /* VALID bit set so ( & HTML_VALID ) is TRUE */
} htmlStatus;

/* Using htmlElemDesc rather than name here, to emphasise the fact
298
that otherwise there's a lookup overhead
299
*/
300
XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
301
XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
302
XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
303
XMLPUBFUN htmlStatus htmlNodeStatus(const htmlNodePtr, int) ;
304
/**
 * htmlDefaultSubelement:
 * @elt: HTML element (pointer to an element description)
 *
 * Returns the default subelement for this element, i.e. the
 * defaultsubelt field of @elt.
 *
 * The argument and the whole expansion are parenthesised so that
 * expression arguments such as (p + 1) expand correctly.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: HTML element
 *
 * Checks whether an HTML element description may be a
 * direct child of the specified element.
 *
 * Expands to a call of htmlElementAllowedHere() with @parent and the
 * name field of @elt.
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
	htmlElementAllowedHere((parent), (elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: HTML element (pointer to an element description)
 *
 * Returns the attributes required for the specified element, i.e. the
 * attrs_req field of @elt.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)


#ifdef __cplusplus
}
#endif

#else /* LIBXML_HTML_ENABLED */

/* HTML support disabled: the module's globals list is defined empty. */
/** DOC_DISABLE */
#define XML_GLOBALS_HTML
/** DOC_ENABLE */

#endif /* LIBXML_HTML_ENABLED */
#endif /* __HTML_PARSER_H__ */