Path: blob/master/venv/Lib/site-packages/lxml/html/__init__.py
811 views
# Copyright (c) 2004 Ian Bicking. All rights reserved.1#2# Redistribution and use in source and binary forms, with or without3# modification, are permitted provided that the following conditions are4# met:5#6# 1. Redistributions of source code must retain the above copyright7# notice, this list of conditions and the following disclaimer.8#9# 2. Redistributions in binary form must reproduce the above copyright10# notice, this list of conditions and the following disclaimer in11# the documentation and/or other materials provided with the12# distribution.13#14# 3. Neither the name of Ian Bicking nor the names of its contributors may15# be used to endorse or promote products derived from this software16# without specific prior written permission.17#18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS19# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT20# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR21# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL IAN BICKING OR22# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,23# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,24# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR25# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF26# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING27# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS28# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.2930"""The ``lxml.html`` tool set for HTML handling.31"""3233from __future__ import absolute_import3435__all__ = [36'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',37'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',38'find_rel_links', 'find_class', 'make_links_absolute',39'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse']404142import copy43import sys44import re45from functools import partial4647try:48from collections.abc import MutableMapping, MutableSet49except ImportError:50from collections import MutableMapping, MutableSet5152from .. import etree53from . 
import defs54from ._setmixin import SetMixin5556try:57from urlparse import urljoin58except ImportError:59# Python 360from urllib.parse import urljoin6162try:63unicode64except NameError:65# Python 366unicode = str67try:68basestring69except NameError:70# Python 371basestring = (str, bytes)727374def __fix_docstring(s):75if not s:76return s77if sys.version_info[0] >= 3:78sub = re.compile(r"^(\s*)u'", re.M).sub79else:80sub = re.compile(r"^(\s*)b'", re.M).sub81return sub(r"\1'", s)828384XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"8586_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",87namespaces={'x':XHTML_NAMESPACE})88_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",89namespaces={'x':XHTML_NAMESPACE})90_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",91namespaces={'x':XHTML_NAMESPACE})92#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})93_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")94_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")95_collect_string_content = etree.XPath("string()")96_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer97_iter_css_imports = re.compile(r'@import "(.*?)"').finditer98_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",99namespaces={'x':XHTML_NAMESPACE})100_archive_re = re.compile(r'[^ ]+')101_parse_meta_refresh_url = re.compile(102r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search103104105def _unquote_match(s, pos):106if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":107return s[1:-1], pos+1108else:109return s,pos110111112def _transform_result(typ, result):113"""Convert the result back into the input type.114"""115if issubclass(typ, 
bytes):116return tostring(result, encoding='utf-8')117elif issubclass(typ, unicode):118return tostring(result, encoding='unicode')119else:120return result121122123def _nons(tag):124if isinstance(tag, basestring):125if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:126return tag.split('}')[-1]127return tag128129130class Classes(MutableSet):131"""Provides access to an element's class attribute as a set-like collection.132Usage::133134>>> el = fromstring('<p class="hidden large">Text</p>')135>>> classes = el.classes # or: classes = Classes(el.attrib)136>>> classes |= ['block', 'paragraph']137>>> el.get('class')138'hidden large block paragraph'139>>> classes.toggle('hidden')140False141>>> el.get('class')142'large block paragraph'143>>> classes -= ('some', 'classes', 'block')144>>> el.get('class')145'large paragraph'146"""147def __init__(self, attributes):148self._attributes = attributes149self._get_class_value = partial(attributes.get, 'class', '')150151def add(self, value):152"""153Add a class.154155This has no effect if the class is already present.156"""157if not value or re.search(r'\s', value):158raise ValueError("Invalid class name: %r" % value)159classes = self._get_class_value().split()160if value in classes:161return162classes.append(value)163self._attributes['class'] = ' '.join(classes)164165def discard(self, value):166"""167Remove a class if it is currently present.168169If the class is not present, do nothing.170"""171if not value or re.search(r'\s', value):172raise ValueError("Invalid class name: %r" % value)173classes = [name for name in self._get_class_value().split()174if name != value]175if classes:176self._attributes['class'] = ' '.join(classes)177elif 'class' in self._attributes:178del self._attributes['class']179180def remove(self, value):181"""182Remove a class; it must currently be present.183184If the class is not present, raise a KeyError.185"""186if not value or re.search(r'\s', value):187raise ValueError("Invalid class 
name: %r" % value)188super(Classes, self).remove(value)189190def __contains__(self, name):191classes = self._get_class_value()192return name in classes and name in classes.split()193194def __iter__(self):195return iter(self._get_class_value().split())196197def __len__(self):198return len(self._get_class_value().split())199200# non-standard methods201202def update(self, values):203"""204Add all names from 'values'.205"""206classes = self._get_class_value().split()207extended = False208for value in values:209if value not in classes:210classes.append(value)211extended = True212if extended:213self._attributes['class'] = ' '.join(classes)214215def toggle(self, value):216"""217Add a class name if it isn't there yet, or remove it if it exists.218219Returns true if the class was added (and is now enabled) and220false if it was removed (and is now disabled).221"""222if not value or re.search(r'\s', value):223raise ValueError("Invalid class name: %r" % value)224classes = self._get_class_value().split()225try:226classes.remove(value)227enabled = False228except ValueError:229classes.append(value)230enabled = True231if classes:232self._attributes['class'] = ' '.join(classes)233else:234del self._attributes['class']235return enabled236237238class HtmlMixin(object):239240def set(self, key, value=None):241"""set(self, key, value=None)242243Sets an element attribute. If no value is provided, or if the value is None,244creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"245for ``form.set('novalidate')``.246"""247super(HtmlElement, self).set(key, value)248249@property250def classes(self):251"""252A set-like wrapper around the 'class' attribute.253"""254return Classes(self.attrib)255256@classes.setter257def classes(self, classes):258assert isinstance(classes, Classes) # only allow "el.classes |= ..." 
etc.259value = classes._get_class_value()260if value:261self.set('class', value)262elif self.get('class') is not None:263del self.attrib['class']264265@property266def base_url(self):267"""268Returns the base URL, given when the page was parsed.269270Use with ``urlparse.urljoin(el.base_url, href)`` to get271absolute URLs.272"""273return self.getroottree().docinfo.URL274275@property276def forms(self):277"""278Return a list of all the forms279"""280return _forms_xpath(self)281282@property283def body(self):284"""285Return the <body> element. Can be called from a child element286to get the document's head.287"""288return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]289290@property291def head(self):292"""293Returns the <head> element. Can be called from a child294element to get the document's head.295"""296return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]297298@property299def label(self):300"""301Get or set any <label> element associated with this element.302"""303id = self.get('id')304if not id:305return None306result = _label_xpath(self, id=id)307if not result:308return None309else:310return result[0]311312@label.setter313def label(self, label):314id = self.get('id')315if not id:316raise TypeError(317"You cannot set a label for an element (%r) that has no id"318% self)319if _nons(label.tag) != 'label':320raise TypeError(321"You can only assign label to a label element (not %r)"322% label)323label.set('for', id)324325@label.deleter326def label(self):327label = self.label328if label is not None:329del label.attrib['for']330331def drop_tree(self):332"""333Removes this element from the tree, including its children and334text. 
The tail text is joined to the previous element or335parent.336"""337parent = self.getparent()338assert parent is not None339if self.tail:340previous = self.getprevious()341if previous is None:342parent.text = (parent.text or '') + self.tail343else:344previous.tail = (previous.tail or '') + self.tail345parent.remove(self)346347def drop_tag(self):348"""349Remove the tag, but not its children or text. The children and text350are merged into the parent.351352Example::353354>>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')355>>> h.find('.//b').drop_tag()356>>> print(tostring(h, encoding='unicode'))357<div>Hello World!</div>358"""359parent = self.getparent()360assert parent is not None361previous = self.getprevious()362if self.text and isinstance(self.tag, basestring):363# not a Comment, etc.364if previous is None:365parent.text = (parent.text or '') + self.text366else:367previous.tail = (previous.tail or '') + self.text368if self.tail:369if len(self):370last = self[-1]371last.tail = (last.tail or '') + self.tail372elif previous is None:373parent.text = (parent.text or '') + self.tail374else:375previous.tail = (previous.tail or '') + self.tail376index = parent.index(self)377parent[index:index+1] = self[:]378379def find_rel_links(self, rel):380"""381Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.382"""383rel = rel.lower()384return [el for el in _rel_links_xpath(self)385if el.get('rel').lower() == rel]386387def find_class(self, class_name):388"""389Find any elements with the given class name.390"""391return _class_xpath(self, class_name=class_name)392393def get_element_by_id(self, id, *default):394"""395Get the first element in a document with the given id. 
If none is396found, return the default argument if provided or raise KeyError397otherwise.398399Note that there can be more than one element with the same id,400and this isn't uncommon in HTML documents found in the wild.401Browsers return only the first match, and this function does402the same.403"""404try:405# FIXME: should this check for multiple matches?406# browsers just return the first one407return _id_xpath(self, id=id)[0]408except IndexError:409if default:410return default[0]411else:412raise KeyError(id)413414def text_content(self):415"""416Return the text content of the tag (and the text in any children).417"""418return _collect_string_content(self)419420def cssselect(self, expr, translator='html'):421"""422Run the CSS expression on this element and its children,423returning a list of the results.424425Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self)426-- note that pre-compiling the expression can provide a substantial427speedup.428"""429# Do the import here to make the dependency optional.430from lxml.cssselect import CSSSelector431return CSSSelector(expr, translator=translator)(self)432433########################################434## Link functions435########################################436437def make_links_absolute(self, base_url=None, resolve_base_href=True,438handle_failures=None):439"""440Make all links in the document absolute, given the441``base_url`` for the document (the full URL where the document442came from), or if no ``base_url`` is given, then the ``.base_url``443of the document.444445If ``resolve_base_href`` is true, then any ``<base href>``446tags in the document are used *and* removed from the document.447If it is false then any such tag is ignored.448449If ``handle_failures`` is None (default), a failure to process450a URL will abort the processing. If set to 'ignore', errors451are ignored. 
If set to 'discard', failing URLs will be removed.452"""453if base_url is None:454base_url = self.base_url455if base_url is None:456raise TypeError(457"No base_url given, and the document has no base_url")458if resolve_base_href:459self.resolve_base_href()460461if handle_failures == 'ignore':462def link_repl(href):463try:464return urljoin(base_url, href)465except ValueError:466return href467elif handle_failures == 'discard':468def link_repl(href):469try:470return urljoin(base_url, href)471except ValueError:472return None473elif handle_failures is None:474def link_repl(href):475return urljoin(base_url, href)476else:477raise ValueError(478"unexpected value for handle_failures: %r" % handle_failures)479480self.rewrite_links(link_repl)481482def resolve_base_href(self, handle_failures=None):483"""484Find any ``<base href>`` tag in the document, and apply its485values to all links found in the document. Also remove the486tag once it has been applied.487488If ``handle_failures`` is None (default), a failure to process489a URL will abort the processing. If set to 'ignore', errors490are ignored. If set to 'discard', failing URLs will be removed.491"""492base_href = None493basetags = self.xpath('//base[@href]|//x:base[@href]',494namespaces={'x': XHTML_NAMESPACE})495for b in basetags:496base_href = b.get('href')497b.drop_tree()498if not base_href:499return500self.make_links_absolute(base_href, resolve_base_href=False,501handle_failures=handle_failures)502503def iterlinks(self):504"""505Yield (element, attribute, link, pos), where attribute may be None506(indicating the link is in the text). ``pos`` is the position507where the link occurs; often 0, but sometimes something else in508the case of links in stylesheets or style tags.509510Note: <base href> is *not* taken into account in any way. The511link you get is exactly the link in the document.512513Note: multiple links inside of a single text string or514attribute value are returned in reversed order. 
This makes it515possible to replace or delete them from the text string value516based on their reported text positions. Otherwise, a517modification at one text position can change the positions of518links reported later on.519"""520link_attrs = defs.link_attrs521for el in self.iter(etree.Element):522attribs = el.attrib523tag = _nons(el.tag)524if tag == 'object':525codebase = None526## <object> tags have attributes that are relative to527## codebase528if 'codebase' in attribs:529codebase = el.get('codebase')530yield (el, 'codebase', codebase, 0)531for attrib in ('classid', 'data'):532if attrib in attribs:533value = el.get(attrib)534if codebase is not None:535value = urljoin(codebase, value)536yield (el, attrib, value, 0)537if 'archive' in attribs:538for match in _archive_re.finditer(el.get('archive')):539value = match.group(0)540if codebase is not None:541value = urljoin(codebase, value)542yield (el, 'archive', value, match.start())543else:544for attrib in link_attrs:545if attrib in attribs:546yield (el, attrib, attribs[attrib], 0)547if tag == 'meta':548http_equiv = attribs.get('http-equiv', '').lower()549if http_equiv == 'refresh':550content = attribs.get('content', '')551match = _parse_meta_refresh_url(content)552url = (match.group('url') if match else content).strip()553# unexpected content means the redirect won't work, but we might554# as well be permissive and return the entire string.555if url:556url, pos = _unquote_match(557url, match.start('url') if match else content.find(url))558yield (el, 'content', url, pos)559elif tag == 'param':560valuetype = el.get('valuetype') or ''561if valuetype.lower() == 'ref':562## FIXME: while it's fine we *find* this link,563## according to the spec we aren't supposed to564## actually change the value, including resolving565## it. 
It can also still be a link, even if it566## doesn't have a valuetype="ref" (which seems to be the norm)567## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype568yield (el, 'value', el.get('value'), 0)569elif tag == 'style' and el.text:570urls = [571# (start_pos, url)572_unquote_match(match.group(1), match.start(1))[::-1]573for match in _iter_css_urls(el.text)574] + [575(match.start(1), match.group(1))576for match in _iter_css_imports(el.text)577]578if urls:579# sort by start pos to bring both match sets back into order580# and reverse the list to report correct positions despite581# modifications582urls.sort(reverse=True)583for start, url in urls:584yield (el, None, url, start)585if 'style' in attribs:586urls = list(_iter_css_urls(attribs['style']))587if urls:588# return in reversed order to simplify in-place modifications589for match in urls[::-1]:590url, start = _unquote_match(match.group(1), match.start(1))591yield (el, 'style', url, start)592593def rewrite_links(self, link_repl_func, resolve_base_href=True,594base_href=None):595"""596Rewrite all the links in the document. For each link597``link_repl_func(link)`` will be called, and the return value598will replace the old link.599600Note that links may not be absolute (unless you first called601``make_links_absolute()``), and may be internal (e.g.,602``'#anchor'``). 
They can also be values like603``'mailto:email'`` or ``'javascript:expr'``.604605If you give ``base_href`` then all links passed to606``link_repl_func()`` will take that into account.607608If the ``link_repl_func`` returns None, the attribute or609tag text will be removed completely.610"""611if base_href is not None:612# FIXME: this can be done in one pass with a wrapper613# around link_repl_func614self.make_links_absolute(615base_href, resolve_base_href=resolve_base_href)616elif resolve_base_href:617self.resolve_base_href()618619for el, attrib, link, pos in self.iterlinks():620new_link = link_repl_func(link.strip())621if new_link == link:622continue623if new_link is None:624# Remove the attribute or element content625if attrib is None:626el.text = ''627else:628del el.attrib[attrib]629continue630631if attrib is None:632new = el.text[:pos] + new_link + el.text[pos+len(link):]633el.text = new634else:635cur = el.get(attrib)636if not pos and len(cur) == len(link):637new = new_link # most common case638else:639new = cur[:pos] + new_link + cur[pos+len(link):]640el.set(attrib, new)641642643class _MethodFunc(object):644"""645An object that represents a method on an element as a function;646the function takes either an element or an HTML string. 
It647returns whatever the function normally returns, or if the function648works in-place (and so returns None) it returns a serialized form649of the resulting document.650"""651def __init__(self, name, copy=False, source_class=HtmlMixin):652self.name = name653self.copy = copy654self.__doc__ = getattr(source_class, self.name).__doc__655def __call__(self, doc, *args, **kw):656result_type = type(doc)657if isinstance(doc, basestring):658if 'copy' in kw:659raise TypeError(660"The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)661doc = fromstring(doc, **kw)662else:663if 'copy' in kw:664make_a_copy = kw.pop('copy')665else:666make_a_copy = self.copy667if make_a_copy:668doc = copy.deepcopy(doc)669meth = getattr(doc, self.name)670result = meth(*args, **kw)671# FIXME: this None test is a bit sloppy672if result is None:673# Then return what we got in674return _transform_result(result_type, doc)675else:676return result677678679find_rel_links = _MethodFunc('find_rel_links', copy=False)680find_class = _MethodFunc('find_class', copy=False)681make_links_absolute = _MethodFunc('make_links_absolute', copy=True)682resolve_base_href = _MethodFunc('resolve_base_href', copy=True)683iterlinks = _MethodFunc('iterlinks', copy=False)684rewrite_links = _MethodFunc('rewrite_links', copy=True)685686687class HtmlComment(etree.CommentBase, HtmlMixin):688pass689690691class HtmlElement(etree.ElementBase, HtmlMixin):692# Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)693cssselect = HtmlMixin.cssselect694set = HtmlMixin.set695696697class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):698pass699700701class HtmlEntity(etree.EntityBase, HtmlMixin):702pass703704705class HtmlElementClassLookup(etree.CustomElementClassLookup):706"""A lookup scheme for HTML Element classes.707708To create a lookup instance with different Element classes, pass a tag709name mapping of Element classes in the ``classes`` keyword 
argument and/or710a tag name mapping of Mixin classes in the ``mixins`` keyword argument.711The special key '*' denotes a Mixin class that should be mixed into all712Element classes.713"""714_default_element_classes = {}715716def __init__(self, classes=None, mixins=None):717etree.CustomElementClassLookup.__init__(self)718if classes is None:719classes = self._default_element_classes.copy()720if mixins:721mixers = {}722for name, value in mixins:723if name == '*':724for n in classes.keys():725mixers.setdefault(n, []).append(value)726else:727mixers.setdefault(name, []).append(value)728for name, mix_bases in mixers.items():729cur = classes.get(name, HtmlElement)730bases = tuple(mix_bases + [cur])731classes[name] = type(cur.__name__, bases, {})732self._element_classes = classes733734def lookup(self, node_type, document, namespace, name):735if node_type == 'element':736return self._element_classes.get(name.lower(), HtmlElement)737elif node_type == 'comment':738return HtmlComment739elif node_type == 'PI':740return HtmlProcessingInstruction741elif node_type == 'entity':742return HtmlEntity743# Otherwise normal lookup744return None745746747################################################################################748# parsing749################################################################################750751_looks_like_full_html_unicode = re.compile(752unicode(r'^\s*<(?:html|!doctype)'), re.I).match753_looks_like_full_html_bytes = re.compile(754r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match755756757def document_fromstring(html, parser=None, ensure_head_body=False, **kw):758if parser is None:759parser = html_parser760value = etree.fromstring(html, parser, **kw)761if value is None:762raise etree.ParserError(763"Document is empty")764if ensure_head_body and value.find('head') is None:765value.insert(0, Element('head'))766if ensure_head_body and value.find('body') is None:767value.append(Element('body'))768return value769770771def fragments_fromstring(html, 
no_leading_text=False, base_url=None,772parser=None, **kw):773"""Parses several HTML elements, returning a list of elements.774775The first item in the list may be a string.776If no_leading_text is true, then it will be an error if there is777leading text, and it will always be a list of only elements.778779base_url will set the document's base_url attribute780(and the tree's docinfo.URL).781"""782if parser is None:783parser = html_parser784# FIXME: check what happens when you give html with a body, head, etc.785if isinstance(html, bytes):786if not _looks_like_full_html_bytes(html):787# can't use %-formatting in early Py3 versions788html = ('<html><body>'.encode('ascii') + html +789'</body></html>'.encode('ascii'))790else:791if not _looks_like_full_html_unicode(html):792html = '<html><body>%s</body></html>' % html793doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)794assert _nons(doc.tag) == 'html'795bodies = [e for e in doc if _nons(e.tag) == 'body']796assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))797body = bodies[0]798elements = []799if no_leading_text and body.text and body.text.strip():800raise etree.ParserError(801"There is leading text: %r" % body.text)802if body.text and body.text.strip():803elements.append(body.text)804elements.extend(body)805# FIXME: removing the reference to the parent artificial document806# would be nice807return elements808809810def fragment_fromstring(html, create_parent=False, base_url=None,811parser=None, **kw):812"""813Parses a single HTML element; it is an error if there is more than814one element, or if anything but whitespace precedes or follows the815element.816817If ``create_parent`` is true (or is a tag name) then a parent node818will be created to encapsulate the HTML in a single element. 
In this819case, leading or trailing text is also allowed, as are multiple elements820as result of the parsing.821822Passing a ``base_url`` will set the document's ``base_url`` attribute823(and the tree's docinfo.URL).824"""825if parser is None:826parser = html_parser827828accept_leading_text = bool(create_parent)829830elements = fragments_fromstring(831html, parser=parser, no_leading_text=not accept_leading_text,832base_url=base_url, **kw)833834if create_parent:835if not isinstance(create_parent, basestring):836create_parent = 'div'837new_root = Element(create_parent)838if elements:839if isinstance(elements[0], basestring):840new_root.text = elements[0]841del elements[0]842new_root.extend(elements)843return new_root844845if not elements:846raise etree.ParserError('No elements found')847if len(elements) > 1:848raise etree.ParserError(849"Multiple elements found (%s)"850% ', '.join([_element_name(e) for e in elements]))851el = elements[0]852if el.tail and el.tail.strip():853raise etree.ParserError(854"Element followed by text: %r" % el.tail)855el.tail = None856return el857858859def fromstring(html, base_url=None, parser=None, **kw):860"""861Parse the html, returning a single element/document.862863This tries to minimally parse the chunk of text, without knowing if it864is a fragment or a document.865866base_url will set the document's base_url attribute (and the tree's docinfo.URL)867"""868if parser is None:869parser = html_parser870if isinstance(html, bytes):871is_full_html = _looks_like_full_html_bytes(html)872else:873is_full_html = _looks_like_full_html_unicode(html)874doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)875if is_full_html:876return doc877# otherwise, lets parse it out...878bodies = doc.findall('body')879if not bodies:880bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)881if bodies:882body = bodies[0]883if len(bodies) > 1:884# Somehow there are multiple bodies, which is bad, but just885# smash them into one body886for other_body 
in bodies[1:]:887if other_body.text:888if len(body):889body[-1].tail = (body[-1].tail or '') + other_body.text890else:891body.text = (body.text or '') + other_body.text892body.extend(other_body)893# We'll ignore tail894# I guess we are ignoring attributes too895other_body.drop_tree()896else:897body = None898heads = doc.findall('head')899if not heads:900heads = doc.findall('{%s}head' % XHTML_NAMESPACE)901if heads:902# Well, we have some sort of structure, so lets keep it all903head = heads[0]904if len(heads) > 1:905for other_head in heads[1:]:906head.extend(other_head)907# We don't care about text or tail in a head908other_head.drop_tree()909return doc910if body is None:911return doc912if (len(body) == 1 and (not body.text or not body.text.strip())913and (not body[-1].tail or not body[-1].tail.strip())):914# The body has just one element, so it was probably a single915# element passed in916return body[0]917# Now we have a body which represents a bunch of tags which have the918# content that was passed in. We will create a fake container, which919# is the body tag, except <body> implies too much structure.920if _contains_block_level_tag(body):921body.tag = 'div'922else:923body.tag = 'span'924return body925926927def parse(filename_or_url, parser=None, base_url=None, **kw):928"""929Parse a filename, URL, or file-like object into an HTML document930tree. Note: this returns a tree, not an element. Use931``parse(...).getroot()`` to get the document root.932933You can override the base URL with the ``base_url`` keyword. 
def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it has a block-level tag."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))


def _element_name(el):
    """Return ``'comment'``, ``'string'`` or the namespace-free tag name."""
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)


################################################################################
# form handling
################################################################################

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Every name still in ``leftover`` after the first loop was not
        # mentioned in ``value`` and is reset to None.
        leftover = current.keys()
        for name, new_value in value.items():
            if name in leftover:
                leftover.remove(name)
            current[name] = new_value
        for name in leftover:
            # Case of an unnamed input; these aren't really
            # expressed in form_values() anyway.
            if name is not None:
                current[name] = None

    def _name(self):
        """Label used in reprs: the name, ``#id``, or index among the body's forms."""
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        find = self.body.iter
        # Fall back to XHTML-namespaced forms when no plain ones exist.
        forms = list(find('form')) or list(find('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.
        """
        pairs = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                pairs.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    pairs.extend((name, v) for v in value)
                elif value is not None:
                    pairs.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            # Buttons and file uploads are never part of the value list.
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = el.value
            if value is not None:
                pairs.append((name, value))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the element has a base URL, the action is joined onto it.
        """
        base_url = self.base_url
        target = self.get('action')
        if base_url and target is not None:
            return urljoin(base_url, target)
        return target

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement


def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        # Accept either a mapping or a sequence of (name, value) pairs.
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    opener = open_http_urllib if open_http is None else open_http
    url = form.action or form.base_url
    return opener(form.method, url, values)


def open_http_urllib(method, url, values):
    """Default ``open_http`` implementation for `submit_form`, based on urllib."""
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        # Values travel in the query string; extend an existing one if present.
        separator = '&' if '?' in url else '?'
        return urlopen(url + separator + encoded, None)
    # Any other method sends the values as an ASCII-encoded request body.
    if not isinstance(encoded, bytes):
        encoded = encoded.encode('ASCII')
    return urlopen(url, encoded)
class FieldsDict(MutableMapping):
    """
    Dictionary-like view of a form's fields, keyed by input name.

    Reading or assigning an item reads or assigns the ``.value`` of the
    matching entry of the wrapped `InputGetter`.  Keys cannot be removed.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count names rather than elements so that len() agrees with
        # keys() and iteration, as the mapping protocol requires.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())


class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the input named ``name``.

        Several radios or checkboxes sharing the name are returned as a
        `RadioGroup` / `CheckboxGroup`; otherwise the first matching
        element is returned.  Raises KeyError if nothing matches.
        """
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides how a group is wrapped.
        input_type = results[0].get('type')
        if len(results) > 1 and input_type in ('radio', 'checkbox'):
            group_class = RadioGroup if input_type == 'radio' else CheckboxGroup
            group = group_class(results)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """Return the unique field names as a list, in document order."""
        names = []
        seen = set()
        for el in self:
            name = el.name
            # Inputs without a ``name`` attribute have no key.
            if name is not None and name not in seen:
                seen.add(name)
                names.append(name)
        return names

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))

    def __len__(self):
        # Number of input elements, i.e. what iterating over self yields.
        return len(self._all_xpath(self.form))


class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attrib = self.attrib
        if 'name' in attrib:
            del attrib['name']

    def __repr__(self):
        # Only elements exposing a ``type`` attribute show it.
        type_name = getattr(self, 'type', None)
        if type_name:
            type_name = ' type=%r' % type_name
        else:
            type_name = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_name)
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)
        """
        content = self.text or ''
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        for el in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(
                el, method=serialisation_method, encoding='unicode')
        return content

    @value.setter
    def value(self, value):
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement


def _option_value(option):
    """Return an option's ``value`` attribute, or its stripped text if unset."""
    value = option.get('value')
    if value is None:
        value = (option.text or '').strip()
    return value


class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        try:
            # The last option marked ``selected`` wins ...
            selected_option = next(
                el for el in reversed(options)
                if el.get('selected') is not None)
        except StopIteration:
            try:
                # ... otherwise fall back to the first non-disabled option.
                selected_option = next(
                    el for el in options if el.get('disabled') is None)
            except StopIteration:
                return None
        return _option_value(selected_option)

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            values = self.value
            values.clear()
            values.update(value)
            return
        checked_option = None
        if value is not None:
            # Find the target first so a bad value leaves the selection alone.
            for el in _options_xpath(self):
                if _option_value(el) == value:
                    checked_option = el
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for el in _options_xpath(self):
            if 'selected' in el.attrib:
                del el.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        return [_option_value(el) for el in _options_xpath(self)]

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement


class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        for option in self.options:
            if 'selected' in option.attrib:
                yield _option_value(option)

    def add(self, item):
        """Select the first option whose value is ``item`` (ValueError if none)."""
        for option in self.options:
            if _option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """Unselect the first option whose value is ``item``.

        Raises ValueError if no option has that value or if the option
        is not currently selected.
        """
        for option in self.options:
            if _option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for el in self:
            if 'checked' in el.attrib:
                return el.get('value')
        return None

    @value.setter
    def value(self, value):
        checked_option = None
        if value is not None:
            # Find the target first so a bad value leaves the group unchanged.
            for el in self:
                if el.get('value') == value:
                    checked_option = el
                    break
            else:
                raise ValueError("There is no radio input with the value %r" % value)
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if checked_option is not None:
            checked_option.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__,
            list.__repr__(self))


class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not uncheck the boxes that are currently checked.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values so callers may mutate while iterating.
        checked_values = [
            box.get('value')
            for box in self.group
            if 'checked' in box.attrib]
        return iter(checked_values)

    def _find(self, value):
        """Return the first checkbox with this value, or raise KeyError."""
        for box in self.group:
            if box.get('value') == value:
                return box
        raise KeyError("No checkbox with value %r" % value)

    def add(self, value):
        """Check the first checkbox whose value is ``value``."""
        self._find(value).set('checked', '')

    def remove(self, value):
        """Uncheck the first checkbox whose value is ``value``.

        Raises KeyError if there is no such checkbox or if it is
        already unchecked.
        """
        box = self._find(value)
        if 'checked' not in box.attrib:
            raise KeyError(
                "The checkbox with value %r was already unchecked" % value)
        del box.attrib['checked']

    def __repr__(self):
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.group.name)


class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # A truthy non-string only checks the box; a string also becomes
        # the ``value`` attribute.
        if isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        return self.get('type', 'text').lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']
HtmlElementClassLookup._default_element_classes['input'] = InputElement


class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        id = self.get('for')
        if not id:
            return None
        return self.body.get_element_by_id(id)

    @for_element.setter
    def for_element(self, other):
        id = other.get('id')
        if not id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', id)

    @for_element.deleter
    def for_element(self):
        # The link to the target lives in the ``for`` attribute (see the
        # getter and setter); the label's own ``id`` must be left alone.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement


############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.
    """
    try:
        # Accept an ElementTree as well as a plain element.
        html = html.getroot()
    except AttributeError:
        pass
    prefix = "{%s}" % XHTML_NAMESPACE
    for el in html.iter(etree.Element):
        tag = el.tag
        # Tags that already carry a namespace are left untouched.
        if tag[0] != '{':
            el.tag = prefix + tag


def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.
    """
    try:
        # Accept an ElementTree as well as a plain element.
        xhtml = xhtml.getroot()
    except AttributeError:
        pass
    prefix = "{%s}" % XHTML_NAMESPACE
    prefix_len = len(prefix)
    for el in xhtml.iter(prefix + "*"):
        el.tag = el.tag[prefix_len:]


# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
# Bytes counterpart of the str pattern, for encoded output.
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail,
        doctype=doctype)
    if method != 'html' or include_meta_content_type:
        return serialised
    # Strip the Content-Type meta tag, matching the result's string type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)


def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as out:
        output_encoding = encoding or doc.docinfo.encoding or "UTF-8"
        doc.write(out, method="html", encoding=output_encoding)
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)


################################################################################
# configure Element class lookup
################################################################################

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)


class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are passed
    on to ``makeelement()`` of the module-level ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)


# Module-level default parsers.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()