Path: blob/master/venv/Lib/site-packages/bs4/builder/__init__.py
811 views
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    Stylesheet,
    Script,
    TemplateString,
    nonwhitespace_re
)

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):
    """A way of looking up TreeBuilder subclasses by their name or by desired
    features.
    """

    def __init__(self):
        # Maps a feature string to the builder classes advertising it,
        # most recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features.

        :param treebuilder_class: A subclass of TreeBuilder. Its .features
           attribute should list its features.
        """
        # Insert at the front so that later registrations take priority
        # over earlier ones.
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Look up a TreeBuilder subclass with the desired features.

        Features that no registered builder advertises are ignored
        rather than treated as a failed match.

        :param features: A list of features to look for. If none are
           provided, the most recently registered TreeBuilder subclass
           will be used.
        :return: A TreeBuilder subclass, or None if nothing is
           registered, if none of the requested features is advertised
           by any builder, or if no single builder has all of the
           requested features that are advertised.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        candidates = None
        candidate_set = None
        for feature in features:
            # .get() avoids creating an empty entry in the defaultdict.
            we_have_the_feature = self.builders_for_feature.get(feature)
            if not we_have_the_feature:
                # Nobody advertises this feature; skip it.
                continue
            if candidates is None:
                # The first recognized feature fixes the priority order
                # in which the surviving candidates are considered.
                candidates = we_have_the_feature
                candidate_set = set(candidates)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set = candidate_set.intersection(
                    we_have_the_feature)

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None


# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()


class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    empty_element_tags = None  # A tag will be considered an empty-element
                               # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "argument not supplied"; needed because None is
    # itself a meaningful value for some constructor arguments.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None
        # The sentinel is always compared by identity, never by
        # equality, so a user-supplied object with a custom __eq__
        # cannot be mistaken for "not supplied".
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes
        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers
        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if the builder has no list of empty-element tags
         or if `tag_name` is in that list; False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :return: None.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            # Attributes listed under '*' are multi-valued on every tag.
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        values = nonwhitespace_re.findall(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs


class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, but it demonstrates
    how a simple TreeBuilder would work.
    """

    def feed(self, markup):
        """Not implemented; this class only handles SAX callbacks."""
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Forward a start-tag event to the soup object.

        :param name: The name of the tag.
        :param attrs: A mapping whose keys are indexable; only element
           [1] of each key is kept (presumably the local name out of a
           (namespace, name) pair -- confirm against the caller).
        """
        attrs = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        """Forward an end-tag event to the soup object."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle a namespaced start tag by ignoring the namespace."""
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle a namespaced end tag by ignoring the namespace."""
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Do nothing; prefix mappings are ignored."""
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        """Do nothing; prefix mappings are ignored."""
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup object."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing."""
        pass

    def endDocument(self):
        """Do nothing."""
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = set([
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track',
        'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex',
        'nextid', 'spacer'
    ])

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = set([
        "address", "article", "aside", "blockquote", "canvas", "dd", "div",
        "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form",
        "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main",
        "nav", "noscript", "ol", "output", "p", "pre", "section", "table",
        "tfoot", "ul", "video"
    ])

    # The HTML standard defines an unusual content model for these tags.
    # We represent this by using a string class other than NavigableString
    # inside these tags.
    #
    # I made this list by going through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        substituted = False
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            substituted = True

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # This branch performs a substitution too, so it must be
            # reported to the caller just like the HTML 5 branch.
            substituted = True

        return substituted


def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of this module, appended to this
    module's `__all__`, and registered with `builder_registry`.

    :param module: A module object with an `__all__` attribute.
    """
    # I'm fairly sure this is not the best way to do this.
    #
    # Look this module up under its real import name so this keeps
    # working if the package is imported under another name.
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError for non-class objects, so make
        # sure we have a class before asking.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)


class ParserRejectedMarkup(Exception):
    """An Exception to be raised when the underlying parser simply
    refuses to parse the given markup.
    """
    def __init__(self, message_or_exception):
        """Explain why the parser rejected the given markup, either
        with a textual explanation or another exception.

        :param message_or_exception: A message, or an Exception
           instance, which is rendered as "ClassName: message".
        """
        if isinstance(message_or_exception, Exception):
            e = message_or_exception
            message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
        super(ParserRejectedMarkup, self).__init__(message_or_exception)


# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass