Path: blob/master/libs/xml2/include/libxml/HTMLparser.h
4394 views
/*1* Summary: interface for an HTML 4.0 non-verifying parser2* Description: this module implements an HTML 4.0 non-verifying parser3* with API compatible with the XML parser ones. It should4* be able to parse "real world" HTML, even if severely5* broken from a specification point of view.6*7* Copy: See Copyright for the status of this software.8*9* Author: Daniel Veillard10*/1112#ifndef __HTML_PARSER_H__13#define __HTML_PARSER_H__14#include <libxml/xmlversion.h>15#include <libxml/parser.h>1617#ifdef LIBXML_HTML_ENABLED1819#ifdef __cplusplus20extern "C" {21#endif2223/*24* Most of the back-end structures from XML and HTML are shared.25*/26typedef xmlParserCtxt htmlParserCtxt;27typedef xmlParserCtxtPtr htmlParserCtxtPtr;28typedef xmlParserNodeInfo htmlParserNodeInfo;29typedef xmlSAXHandler htmlSAXHandler;30typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;31typedef xmlParserInput htmlParserInput;32typedef xmlParserInputPtr htmlParserInputPtr;33typedef xmlDocPtr htmlDocPtr;34typedef xmlNodePtr htmlNodePtr;3536/*37* Internal description of an HTML element, representing HTML 4.0138* and XHTML 1.0 (which share the same structure).39*/40typedef struct _htmlElemDesc htmlElemDesc;41typedef htmlElemDesc *htmlElemDescPtr;42struct _htmlElemDesc {43const char *name; /* The tag name */44char startTag; /* Whether the start tag can be implied */45char endTag; /* Whether the end tag can be implied */46char saveEndTag; /* Whether the end tag should be saved */47char empty; /* Is this an empty element ? */48char depr; /* Is this a deprecated element ? */49char dtd; /* 1: only in Loose DTD, 2: only Frameset one */50char isinline; /* is this a block 0 or inline 1 element */51const char *desc; /* the description */5253/* NRK Jan.200354* New fields encapsulating HTML structure55*56* Bugs:57* This is a very limited representation. It fails to tell us when58* an element *requires* subelements (we only have whether they're59* allowed or not), and it doesn't tell us where CDATA and PCDATA60* are allowed. Some element relationships are not fully represented:61* these are flagged with the word MODIFIER62*/63const char** subelts; /* allowed sub-elements of this element */64const char* defaultsubelt; /* subelement for suggested auto-repair65if necessary or NULL */66const char** attrs_opt; /* Optional Attributes */67const char** attrs_depr; /* Additional deprecated attributes */68const char** attrs_req; /* Required attributes */69};7071/*72* Internal description of an HTML entity.73*/74typedef struct _htmlEntityDesc htmlEntityDesc;75typedef htmlEntityDesc *htmlEntityDescPtr;76struct _htmlEntityDesc {77unsigned int value; /* the UNICODE value for the character */78const char *name; /* The entity name */79const char *desc; /* the description */80};8182/** DOC_DISABLE */83#ifdef LIBXML_SAX1_ENABLED84#define XML_GLOBALS_HTML \85XML_OP(htmlDefaultSAXHandler, xmlSAXHandlerV1, XML_DEPRECATED)86#else87#define XML_GLOBALS_HTML88#endif8990#define XML_OP XML_DECLARE_GLOBAL91XML_GLOBALS_HTML92#undef XML_OP9394#if defined(LIBXML_THREAD_ENABLED) && !defined(XML_GLOBALS_NO_REDEFINITION)95#define htmlDefaultSAXHandler XML_GLOBAL_MACRO(htmlDefaultSAXHandler)96#endif97/** DOC_ENABLE */9899/*100* There is only few public functions.101*/102XML_DEPRECATED103XMLPUBFUN void104htmlInitAutoClose (void);105XMLPUBFUN const htmlElemDesc *106htmlTagLookup (const xmlChar *tag);107XMLPUBFUN const htmlEntityDesc *108htmlEntityLookup(const xmlChar *name);109XMLPUBFUN const htmlEntityDesc *110htmlEntityValueLookup(unsigned int value);111112XMLPUBFUN int113htmlIsAutoClosed(htmlDocPtr doc,114htmlNodePtr elem);115XMLPUBFUN int116htmlAutoCloseTag(htmlDocPtr doc,117const xmlChar *name,118htmlNodePtr elem);119XML_DEPRECATED120XMLPUBFUN const htmlEntityDesc *121htmlParseEntityRef(htmlParserCtxtPtr ctxt,122const xmlChar **str);123XML_DEPRECATED124XMLPUBFUN int125htmlParseCharRef(htmlParserCtxtPtr ctxt);126XML_DEPRECATED127XMLPUBFUN void128htmlParseElement(htmlParserCtxtPtr ctxt);129130XMLPUBFUN htmlParserCtxtPtr131htmlNewParserCtxt(void);132XMLPUBFUN htmlParserCtxtPtr133htmlNewSAXParserCtxt(const htmlSAXHandler *sax,134void *userData);135136XMLPUBFUN htmlParserCtxtPtr137htmlCreateMemoryParserCtxt(const char *buffer,138int size);139140XMLPUBFUN int141htmlParseDocument(htmlParserCtxtPtr ctxt);142XML_DEPRECATED143XMLPUBFUN htmlDocPtr144htmlSAXParseDoc (const xmlChar *cur,145const char *encoding,146htmlSAXHandlerPtr sax,147void *userData);148XMLPUBFUN htmlDocPtr149htmlParseDoc (const xmlChar *cur,150const char *encoding);151XMLPUBFUN htmlParserCtxtPtr152htmlCreateFileParserCtxt(const char *filename,153const char *encoding);154XML_DEPRECATED155XMLPUBFUN htmlDocPtr156htmlSAXParseFile(const char *filename,157const char *encoding,158htmlSAXHandlerPtr sax,159void *userData);160XMLPUBFUN htmlDocPtr161htmlParseFile (const char *filename,162const char *encoding);163XMLPUBFUN int164UTF8ToHtml (unsigned char *out,165int *outlen,166const unsigned char *in,167int *inlen);168XMLPUBFUN int169htmlEncodeEntities(unsigned char *out,170int *outlen,171const unsigned char *in,172int *inlen, int quoteChar);173XMLPUBFUN int174htmlIsScriptAttribute(const xmlChar *name);175XMLPUBFUN int176htmlHandleOmittedElem(int val);177178#ifdef LIBXML_PUSH_ENABLED179/**180* Interfaces for the Push mode.181*/182XMLPUBFUN htmlParserCtxtPtr183htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,184void *user_data,185const char *chunk,186int size,187const char *filename,188xmlCharEncoding enc);189XMLPUBFUN int190htmlParseChunk (htmlParserCtxtPtr ctxt,191const char *chunk,192int size,193int terminate);194#endif /* LIBXML_PUSH_ENABLED */195196XMLPUBFUN void197htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);198199/*200* New set of simpler/more flexible APIs201*/202/**203* xmlParserOption:204*205* This is the set of XML parser options that can be passed down206* to the xmlReadDoc() and similar calls.207*/208typedef enum {209HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */210HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */211HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */212HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */213HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */214HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */215HTML_PARSE_NONET = 1<<11,/* Forbid network access */216HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */217HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */218HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */219} htmlParserOption;220221XMLPUBFUN void222htmlCtxtReset (htmlParserCtxtPtr ctxt);223XMLPUBFUN int224htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,225int options);226XMLPUBFUN htmlDocPtr227htmlReadDoc (const xmlChar *cur,228const char *URL,229const char *encoding,230int options);231XMLPUBFUN htmlDocPtr232htmlReadFile (const char *URL,233const char *encoding,234int options);235XMLPUBFUN htmlDocPtr236htmlReadMemory (const char *buffer,237int size,238const char *URL,239const char *encoding,240int options);241XMLPUBFUN htmlDocPtr242htmlReadFd (int fd,243const char *URL,244const char *encoding,245int options);246XMLPUBFUN htmlDocPtr247htmlReadIO (xmlInputReadCallback ioread,248xmlInputCloseCallback ioclose,249void *ioctx,250const char *URL,251const char *encoding,252int options);253XMLPUBFUN htmlDocPtr254htmlCtxtReadDoc (xmlParserCtxtPtr ctxt,255const xmlChar *cur,256const char *URL,257const char *encoding,258int options);259XMLPUBFUN htmlDocPtr260htmlCtxtReadFile (xmlParserCtxtPtr ctxt,261const char *filename,262const char *encoding,263int options);264XMLPUBFUN htmlDocPtr265htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,266const char *buffer,267int size,268const char *URL,269const char *encoding,270int options);271XMLPUBFUN htmlDocPtr272htmlCtxtReadFd (xmlParserCtxtPtr ctxt,273int fd,274const char *URL,275const char *encoding,276int options);277XMLPUBFUN htmlDocPtr278htmlCtxtReadIO (xmlParserCtxtPtr ctxt,279xmlInputReadCallback ioread,280xmlInputCloseCallback ioclose,281void *ioctx,282const char *URL,283const char *encoding,284int options);285286/* NRK/Jan2003: further knowledge of HTML structure287*/288typedef enum {289HTML_NA = 0 , /* something we don't check at all */290HTML_INVALID = 0x1 ,291HTML_DEPRECATED = 0x2 ,292HTML_VALID = 0x4 ,293HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */294} htmlStatus ;295296/* Using htmlElemDesc rather than name here, to emphasise the fact297that otherwise there's a lookup overhead298*/299XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;300XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;301XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;302XMLPUBFUN htmlStatus htmlNodeStatus(const htmlNodePtr, int) ;303/**304* htmlDefaultSubelement:305* @elt: HTML element306*307* Returns the default subelement for this element308*/309#define htmlDefaultSubelement(elt) elt->defaultsubelt310/**311* htmlElementAllowedHereDesc:312* @parent: HTML parent element313* @elt: HTML element314*315* Checks whether an HTML element description may be a316* direct child of the specified element.317*318* Returns 1 if allowed; 0 otherwise.319*/320#define htmlElementAllowedHereDesc(parent,elt) \321htmlElementAllowedHere((parent), (elt)->name)322/**323* htmlRequiredAttrs:324* @elt: HTML element325*326* Returns the attributes required for the specified element.327*/328#define htmlRequiredAttrs(elt) (elt)->attrs_req329330331#ifdef __cplusplus332}333#endif334335#else /* LIBXML_HTML_ENABLED */336337/** DOC_DISABLE */338#define XML_GLOBALS_HTML339/** DOC_ENABLE */340341#endif /* LIBXML_HTML_ENABLED */342#endif /* __HTML_PARSER_H__ */343344345