# Path: blob/master/venv/Lib/site-packages/bs4/builder/_html5lib.py
# 811 views
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
import re
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import (
    NamespacedAttribute,
    nonwhitespace_re,
)
import html5lib
from html5lib.constants import (
    namespaces,
    prefixes,
    )
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

try:
    # Pre-0.99999999
    from html5lib.treebuilders import _base as treebuilder_base
    new_html5lib = False
except ImportError:
    # 0.99999999 and up
    from html5lib.treebuilders import base as treebuilder_base
    new_html5lib = True


class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree.

    Note that this TreeBuilder does not support some features common
    to HTML TreeBuilders. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    * This TreeBuilder doesn't use different subclasses of NavigableString
      based on the name of the tag in which the string was found.

    * You can't use a SoupStrainer to parse only part of a document.
    """

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    # html5lib can tell us which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Remember the requested encoding and yield the markup unchanged.

        :param markup: The document to parse (str or bytes).
        :param user_specified_encoding: Encoding requested by the caller;
            stored on the builder and handed to html5lib in `feed`.
        :param document_declared_encoding: Ignored by this builder.
        :param exclude_encodings: Not supported; a warning is issued if a
            truthy value is given.
        :yield: A single tuple ``(markup, None, None, False)``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib and record the detected encoding.

        The resulting document gets an ``original_encoding`` attribute:
        ``None`` for str input, otherwise the name of the encoding
        reported by html5lib's tokenizer stream.

        :param markup: The document to parse (str or bytes).
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = parser
        extra_kwargs = dict()
        if not isinstance(markup, str):
            # The keyword used to force an encoding was renamed between
            # html5lib releases.
            if new_html5lib:
                extra_kwargs['override_encoding'] = self.user_specified_encoding
            else:
                extra_kwargs['encoding'] = self.user_specified_encoding
        doc = parser.parse(markup, **extra_kwargs)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            original_encoding = parser.tokenizer.stream.charEncoding[0]
            if not isinstance(original_encoding, str):
                # In 0.99999999 and up, the encoding is an html5lib
                # Encoding object. We want to use a string for compatibility
                # with other tree builders.
                original_encoding = original_encoding.name
            doc.original_encoding = original_encoding
        # Drop the reference to the parser now that parsing is finished.
        self.underlying_builder.parser = None

    def create_treebuilder(self, namespaceHTMLElements):
        """Create, remember and return the html5lib-facing tree builder.

        Passed to ``html5lib.HTMLParser`` as its ``tree`` factory.
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><head></head><body>%s</body></html>' % fragment


class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """An html5lib TreeBuilder that builds a Beautiful Soup tree."""

    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Set up the builder around an existing or a fresh soup object.

        :param namespaceHTMLElements: Passed through to the html5lib
            base class constructor.
        :param soup: The BeautifulSoup object to populate. If falsy, an
            empty one is created.
        :param store_line_numbers: Whether `elementClass` records source
            line/position on new tags.
        :param kwargs: Forwarded to the BeautifulSoup constructor when a
            new soup has to be created.
        """
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from an html5lib doctype token and add it."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag and return it wrapped in an `Element`."""
        kwargs = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            kwargs['sourceline'] = sourceline
            kwargs['sourcepos'] = sourcepos-1
        tag = self.soup.new_tag(name, namespace, **kwargs)

        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap `data` in a Comment and return it as a `TextNode`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace the soup with a fresh fragment root and return it."""
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the wrapped element of `node` to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the soup object being built."""
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind the parsed fragment."""
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        """Serialize `element` and its descendants to an indented listing.

        Each node becomes one line starting with ``|``; children are
        indented two spaces deeper than their parent, and attributes are
        listed in sorted order below their tag.

        :return: The lines joined with newlines.
        """
        from bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            if isinstance(element, BeautifulSoup):
                # No special handling; falls through to the checks below.
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in list(element.attrs.items()):
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace], name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)


class AttrList(object):
    """Dict-like view of a tag's attributes.

    Reads are served from a copy of the element's attributes taken at
    construction time; writes go to the element itself.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        #
        # cdata_list_attributes may be None or lack the '*' wildcard key,
        # so fall back to empty containers instead of raising.
        list_attr = self.element.cdata_list_attributes or {}
        if (name in list_attr.get('*', [])
            or (self.element.name in list_attr
                and name in list_attr.get(self.element.name, []))):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = nonwhitespace_re.findall(value)
        self.element[name] = value

    def items(self):
        return list(self.attrs.items())

    def keys(self):
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs


class Element(treebuilder_base.Node):
    """html5lib Node wrapper around a Beautiful Soup element."""

    def __init__(self, element, soup, namespace):
        """
        :param element: The Beautiful Soup object being wrapped.
        :param soup: The BeautifulSoup object that owns the tree.
        :param namespace: The element's namespace, or None.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of this element.

        `node` may be a wrapper node, a bare Tag, or a plain string.
        A string appended right after an existing NavigableString is
        merged into it instead of becoming a separate node.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            node.element.extract()

        if (string_child is not None and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return the attributes as an `AttrList` ({} for a Comment)."""
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple keys are replaced, in the passed-in mapping itself, by
        NamespacedAttribute objects. Nothing happens when `attributes`
        is None or empty.
        """
        if attributes is not None and len(attributes) > 0:
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` or at the end."""
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the child `refNode`.

        If `node` is a plain string and the child just before `refNode`
        is also a plain string, the two are merged.
        """
        index = self.element.index(refNode.element)
        # Only look at the preceding sibling when there is one: with
        # index == 0, contents[index-1] would wrap around to the *last*
        # child and the text could be merged at the wrong end.
        if (index > 0 and node.element.__class__ == NavigableString
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the wrapped element of `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        # print("MOVE", self.element.contents)
        # print("FROM", self.element)
        # print("TO", new_parent.element)

        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element is not None:
                # TODO: This code has no test coverage and I'm not sure
                # how to get html5lib to go through this path, but it's
                # just the other side of the previous line.
                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

        # print("DONE WITH MOVE")
        # print("FROM", self.element)
        # print("TO", new_parent_element)

    def cloneNode(self):
        """Return a new Element with the same name, namespace and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's contents list (truthy if non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)


class TextNode(Element):
    """Wrapper for string-like nodes; it has no name and cannot be cloned."""

    def __init__(self, element, soup):
        treebuilder_base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError