# Path: blob/master/venv/Lib/site-packages/lxml/html/html5parser.py
# 811 views
"""
An interface to html5lib that mimics the lxml.html interface.
"""
import sys
import string

from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
from lxml import etree
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag

# python3 compatibility
try:
    _strings = basestring
except NameError:
    _strings = (bytes, str)
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse


class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree."""

    def __init__(self, strict=False, **kwargs):
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)


# XHTMLParser is optional: it is only defined when the installed html5lib
# still exports it.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder,
                                  **kwargs)

    xhtml_parser = XHTMLParser()


def _find_tag(tree, tag):
    """Return the first child `tag` of `tree`, or None if there is none.

    The plain tag name is tried first, then the same name qualified with
    the XHTML namespace.
    """
    elem = tree.find(tag)
    if elem is not None:
        return elem
    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))


def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a whole document from a string and return its root element.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        guess_charset = True
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    return parser.parse(html, **options).getroot()


def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises TypeError if `html` is not a string, and etree.ParserError if
    `no_leading_text` is true and there is non-whitespace leading text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        guess_charset = False
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    children = parser.parseFragment(html, 'div', **options)
    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            # Whitespace-only leading text is silently dropped.
            del children[0]
    return children


def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises TypeError if `html` is not a string, and etree.ParserError if
    no single element can be returned.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        # Any true non-string value (e.g. True) means "use a <div>".
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    result.tail = None
    return result


def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        start = start.decode('ascii', 'replace')

    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if head is not None and len(head):
        return doc

    body = _find_tag(doc, 'body')
    if body is None:
        # _find_tag may return None; without a body there is nothing to
        # unwrap, so hand back the document as parsed.
        return doc

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body


def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    A file or URL handle opened by this function is closed before it
    returns; a file-like object passed in by the caller is left open.
    """
    if parser is None:
        parser = html_parser

    # Only close handles this function opened itself.
    opened_here = False
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often
            # than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        opened_here = True
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        opened_here = True
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset
    try:
        return parser.parse(fp, **options)
    finally:
        if opened_here:
            fp.close()


def _looks_like_url(str):
    """Return True if `str` has a URL scheme that is not a drive letter.

    On win32 a one-letter scheme such as ``c`` in ``c:\\path`` is treated
    as an absolute file path, not a URL.
    """
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    if (sys.platform == 'win32' and
            scheme in string.ascii_letters
            and len(scheme) == 1):
        # looks like a 'normal' absolute path
        return False
    return True


# Default parser shared by the module-level functions above.
html_parser = HTMLParser()