/* Path: blob/master/venv/Lib/site-packages/lxml/includes/libxml/HTMLparser.h */
/* 811 views */
/*1* Summary: interface for an HTML 4.0 non-verifying parser2* Description: this module implements an HTML 4.0 non-verifying parser3* with API compatible with the XML parser ones. It should4* be able to parse "real world" HTML, even if severely5* broken from a specification point of view.6*7* Copy: See Copyright for the status of this software.8*9* Author: Daniel Veillard10*/1112#ifndef __HTML_PARSER_H__13#define __HTML_PARSER_H__14#include <libxml/xmlversion.h>15#include <libxml/parser.h>1617#ifdef LIBXML_HTML_ENABLED1819#ifdef __cplusplus20extern "C" {21#endif2223/*24* Most of the back-end structures from XML and HTML are shared.25*/26typedef xmlParserCtxt htmlParserCtxt;27typedef xmlParserCtxtPtr htmlParserCtxtPtr;28typedef xmlParserNodeInfo htmlParserNodeInfo;29typedef xmlSAXHandler htmlSAXHandler;30typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;31typedef xmlParserInput htmlParserInput;32typedef xmlParserInputPtr htmlParserInputPtr;33typedef xmlDocPtr htmlDocPtr;34typedef xmlNodePtr htmlNodePtr;3536/*37* Internal description of an HTML element, representing HTML 4.0138* and XHTML 1.0 (which share the same structure).39*/40typedef struct _htmlElemDesc htmlElemDesc;41typedef htmlElemDesc *htmlElemDescPtr;42struct _htmlElemDesc {43const char *name; /* The tag name */44char startTag; /* Whether the start tag can be implied */45char endTag; /* Whether the end tag can be implied */46char saveEndTag; /* Whether the end tag should be saved */47char empty; /* Is this an empty element ? */48char depr; /* Is this a deprecated element ? */49char dtd; /* 1: only in Loose DTD, 2: only Frameset one */50char isinline; /* is this a block 0 or inline 1 element */51const char *desc; /* the description */5253/* NRK Jan.200354* New fields encapsulating HTML structure55*56* Bugs:57* This is a very limited representation. 
It fails to tell us when58* an element *requires* subelements (we only have whether they're59* allowed or not), and it doesn't tell us where CDATA and PCDATA60* are allowed. Some element relationships are not fully represented:61* these are flagged with the word MODIFIER62*/63const char** subelts; /* allowed sub-elements of this element */64const char* defaultsubelt; /* subelement for suggested auto-repair65if necessary or NULL */66const char** attrs_opt; /* Optional Attributes */67const char** attrs_depr; /* Additional deprecated attributes */68const char** attrs_req; /* Required attributes */69};7071/*72* Internal description of an HTML entity.73*/74typedef struct _htmlEntityDesc htmlEntityDesc;75typedef htmlEntityDesc *htmlEntityDescPtr;76struct _htmlEntityDesc {77unsigned int value; /* the UNICODE value for the character */78const char *name; /* The entity name */79const char *desc; /* the description */80};8182/*83* There is only few public functions.84*/85XMLPUBFUN const htmlElemDesc * XMLCALL86htmlTagLookup (const xmlChar *tag);87XMLPUBFUN const htmlEntityDesc * XMLCALL88htmlEntityLookup(const xmlChar *name);89XMLPUBFUN const htmlEntityDesc * XMLCALL90htmlEntityValueLookup(unsigned int value);9192XMLPUBFUN int XMLCALL93htmlIsAutoClosed(htmlDocPtr doc,94htmlNodePtr elem);95XMLPUBFUN int XMLCALL96htmlAutoCloseTag(htmlDocPtr doc,97const xmlChar *name,98htmlNodePtr elem);99XMLPUBFUN const htmlEntityDesc * XMLCALL100htmlParseEntityRef(htmlParserCtxtPtr ctxt,101const xmlChar **str);102XMLPUBFUN int XMLCALL103htmlParseCharRef(htmlParserCtxtPtr ctxt);104XMLPUBFUN void XMLCALL105htmlParseElement(htmlParserCtxtPtr ctxt);106107XMLPUBFUN htmlParserCtxtPtr XMLCALL108htmlNewParserCtxt(void);109110XMLPUBFUN htmlParserCtxtPtr XMLCALL111htmlCreateMemoryParserCtxt(const char *buffer,112int size);113114XMLPUBFUN int XMLCALL115htmlParseDocument(htmlParserCtxtPtr ctxt);116XMLPUBFUN htmlDocPtr XMLCALL117htmlSAXParseDoc (const xmlChar *cur,118const char 
*encoding,119htmlSAXHandlerPtr sax,120void *userData);121XMLPUBFUN htmlDocPtr XMLCALL122htmlParseDoc (const xmlChar *cur,123const char *encoding);124XMLPUBFUN htmlDocPtr XMLCALL125htmlSAXParseFile(const char *filename,126const char *encoding,127htmlSAXHandlerPtr sax,128void *userData);129XMLPUBFUN htmlDocPtr XMLCALL130htmlParseFile (const char *filename,131const char *encoding);132XMLPUBFUN int XMLCALL133UTF8ToHtml (unsigned char *out,134int *outlen,135const unsigned char *in,136int *inlen);137XMLPUBFUN int XMLCALL138htmlEncodeEntities(unsigned char *out,139int *outlen,140const unsigned char *in,141int *inlen, int quoteChar);142XMLPUBFUN int XMLCALL143htmlIsScriptAttribute(const xmlChar *name);144XMLPUBFUN int XMLCALL145htmlHandleOmittedElem(int val);146147#ifdef LIBXML_PUSH_ENABLED148/**149* Interfaces for the Push mode.150*/151XMLPUBFUN htmlParserCtxtPtr XMLCALL152htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,153void *user_data,154const char *chunk,155int size,156const char *filename,157xmlCharEncoding enc);158XMLPUBFUN int XMLCALL159htmlParseChunk (htmlParserCtxtPtr ctxt,160const char *chunk,161int size,162int terminate);163#endif /* LIBXML_PUSH_ENABLED */164165XMLPUBFUN void XMLCALL166htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);167168/*169* New set of simpler/more flexible APIs170*/171/**172* xmlParserOption:173*174* This is the set of XML parser options that can be passed down175* to the xmlReadDoc() and similar calls.176*/177typedef enum {178HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */179HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */180HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */181HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */182HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */183HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */184HTML_PARSE_NONET = 1<<11,/* Forbid network access */185HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... 
elements */186HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */187HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */188} htmlParserOption;189190XMLPUBFUN void XMLCALL191htmlCtxtReset (htmlParserCtxtPtr ctxt);192XMLPUBFUN int XMLCALL193htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,194int options);195XMLPUBFUN htmlDocPtr XMLCALL196htmlReadDoc (const xmlChar *cur,197const char *URL,198const char *encoding,199int options);200XMLPUBFUN htmlDocPtr XMLCALL201htmlReadFile (const char *URL,202const char *encoding,203int options);204XMLPUBFUN htmlDocPtr XMLCALL205htmlReadMemory (const char *buffer,206int size,207const char *URL,208const char *encoding,209int options);210XMLPUBFUN htmlDocPtr XMLCALL211htmlReadFd (int fd,212const char *URL,213const char *encoding,214int options);215XMLPUBFUN htmlDocPtr XMLCALL216htmlReadIO (xmlInputReadCallback ioread,217xmlInputCloseCallback ioclose,218void *ioctx,219const char *URL,220const char *encoding,221int options);222XMLPUBFUN htmlDocPtr XMLCALL223htmlCtxtReadDoc (xmlParserCtxtPtr ctxt,224const xmlChar *cur,225const char *URL,226const char *encoding,227int options);228XMLPUBFUN htmlDocPtr XMLCALL229htmlCtxtReadFile (xmlParserCtxtPtr ctxt,230const char *filename,231const char *encoding,232int options);233XMLPUBFUN htmlDocPtr XMLCALL234htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,235const char *buffer,236int size,237const char *URL,238const char *encoding,239int options);240XMLPUBFUN htmlDocPtr XMLCALL241htmlCtxtReadFd (xmlParserCtxtPtr ctxt,242int fd,243const char *URL,244const char *encoding,245int options);246XMLPUBFUN htmlDocPtr XMLCALL247htmlCtxtReadIO (xmlParserCtxtPtr ctxt,248xmlInputReadCallback ioread,249xmlInputCloseCallback ioclose,250void *ioctx,251const char *URL,252const char *encoding,253int options);254255/* NRK/Jan2003: further knowledge of HTML structure256*/257typedef enum {258HTML_NA = 0 , /* something we don't check at all */259HTML_INVALID = 0x1 ,260HTML_DEPRECATED = 0x2 ,261HTML_VALID = 0x4 
,262HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */263} htmlStatus ;264265/* Using htmlElemDesc rather than name here, to emphasise the fact266that otherwise there's a lookup overhead267*/268XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;269XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;270XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;271XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ;272/**273* htmlDefaultSubelement:274* @elt: HTML element275*276* Returns the default subelement for this element277*/278#define htmlDefaultSubelement(elt) elt->defaultsubelt279/**280* htmlElementAllowedHereDesc:281* @parent: HTML parent element282* @elt: HTML element283*284* Checks whether an HTML element description may be a285* direct child of the specified element.286*287* Returns 1 if allowed; 0 otherwise.288*/289#define htmlElementAllowedHereDesc(parent,elt) \290htmlElementAllowedHere((parent), (elt)->name)291/**292* htmlRequiredAttrs:293* @elt: HTML element294*295* Returns the attributes required for the specified element.296*/297#define htmlRequiredAttrs(elt) (elt)->attrs_req298299300#ifdef __cplusplus301}302#endif303304#endif /* LIBXML_HTML_ENABLED */305#endif /* __HTML_PARSER_H__ */306307308