# Path: blob/master/venv/Lib/site-packages/bs4/__init__.py
# 811 views
"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".

http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.

Beautiful Soup works with Python 2.7 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.9.1"
__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import os
import re
import sys
import traceback
import warnings

from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    PYTHON_SPECIFIC_ENCODINGS,
    ResultSet,
    Script,
    Stylesheet,
    SoupStrainer,
    Tag,
    TemplateString,
)

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
#
# NOTE(review): in this (already converted) copy the statement is just a
# comparison of two string literals whose result is discarded, so it has
# no effect at runtime. Presumably the unconverted source used an
# operator that fails to parse on Python 3 -- confirm against upstream.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'


# Define some custom warnings.
class GuessedAtParserWarning(UserWarning):
    """The warning issued when BeautifulSoup has to guess what parser to
    use -- probably because no parser was specified in the constructor.
    """


class MarkupResemblesLocatorWarning(UserWarning):
    """The warning issued when BeautifulSoup is given 'markup' that
    actually looks like a resource locator -- a URL or a path to a file
    on disk.
    """


class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    # Beautiful Soup 3 constructor arguments that are accepted but
    # ignored, paired with the warning issued for each. The constructor
    # checks them in this order.
    _IGNORED_BS3_ARGUMENTS = (
        ('convertEntities',
         "BS4 does not respect the convertEntities argument to the "
         "BeautifulSoup constructor. Entities are always converted "
         "to Unicode characters."),
        ('markupMassage',
         "BS4 does not respect the markupMassage argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for any necessary markup massage."),
        ('smartQuotesTo',
         "BS4 does not respect the smartQuotesTo argument to the "
         "BeautifulSoup constructor. Smart quotes are always converted "
         "to Unicode characters."),
        ('selfClosingTags',
         "BS4 does not respect the selfClosingTags argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for understanding self-closing tags."),
        ('isHTML',
         "BS4 does not respect the isHTML argument to the "
         "BeautifulSoup constructor. Suggest you use "
         "features='lxml' for HTML and features='lxml-xml' for "
         "XML."),
    )

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raise FeatureNotFound: If no registered tree builder offers
         the requested features.
        :raise ParserRejectedMarkup: If the tree builder rejects every
         version of the markup offered to it.
        """
        # Strip out the Beautiful Soup 3 arguments so they are not
        # forwarded to the TreeBuilder constructor below.
        for legacy_name, message in self._IGNORED_BS3_ARGUMENTS:
            if legacy_name in kwargs:
                del kwargs[legacy_name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            """Pop a renamed keyword argument, warning about the rename."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        # The old spelling is only consulted (and removed from kwargs)
        # when the new spelling was not supplied.
        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                #
                # The frame lookup must stay inline in this method:
                # depth 1 is only "the caller of the constructor" when
                # measured from here.
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        # Report the source file, not the compiled one.
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(
                        self.NO_PARSER_SPECIFIED_WARNING % values,
                        GuessedAtParserWarning, stacklevel=2
                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        self.builder.initialize_soup(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, str)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
                    ' Beautiful Soup.' % self._decode_markup(markup),
                    MarkupResemblesLocatorWarning
                )
            self._check_markup_is_url(markup)

        # The builder may offer several versions of the markup; use the
        # first one the parser accepts. The loop assigns straight onto
        # attributes of this object.
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                success = True
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)

        if not success:
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def __copy__(self):
        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )

        # Although we encoded the tree to UTF-8, that may not have
        # been the encoding of the original markup. Set the copy's
        # .original_encoding to reflect the original object's
        # .original_encoding.
        copy.original_encoding = self.original_encoding
        return copy

    def __getstate__(self):
        """Return a copy of the instance dictionary for pickling.

        The 'builder' entry is replaced with None when the builder
        reports that it is not picklable.
        """
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            d['builder'] = None
        return d

    @classmethod
    def _decode_markup(cls, markup):
        """Ensure `markup` is text so it's safe to send into warnings.warn.

        Bytes are decoded as UTF-8, with undecodable sequences
        replaced; anything else is returned unchanged.

        TODO: warnings.warn had this problem back in 2010 but it might not
        anymore.
        """
        if isinstance(markup, bytes):
            decoded = markup.decode('utf-8', 'replace')
        else:
            decoded = markup
        return decoded

    @classmethod
    def _check_markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        :param markup: A string or bytestring. Any other type is
         silently ignored.
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
            return

        # Something that starts like a URL but contains a space is
        # given the benefit of the doubt.
        if markup.startswith(cant_start_with) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an'
                ' HTTP client. You should probably use an HTTP client like'
                ' requests to get the document behind the URL, and feed'
                ' that document to Beautiful Soup.' % cls._decode_markup(
                    markup
                ),
                MarkupResemblesLocatorWarning
            )

    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
        """
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a state as though it had never parsed any
        markup.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        self.string_container_stack = []
        # object_was_parsed() and handle_starttag() read this attribute,
        # and nothing else in this module initializes it. Clearing it
        # here also keeps an element from a rejected parse attempt from
        # being linked into the next attempt made by the constructor.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        :param name: The name of the new Tag.
        :param namespace: The URI of the new Tag's XML namespace, if any.
        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values; can
         be used instead of `kwattrs` for attributes like 'class'
         that are reserved words in Python.
        :param sourceline: The line number where this tag was
         (purportedly) found in its source document.
        :param sourcepos: The character position within `sourceline` where this
         tag was (purportedly) found.
        :param kwattrs: Keyword arguments for the new Tag's attribute values.

        """
        # `attrs` is only read, never modified, so its shared default
        # is harmless. Its entries win over same-named keyword arguments.
        kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

    def string_container(self, base_class=None):
        """Choose the class to instantiate for a new string.

        :param base_class: The requested string class; NavigableString
         is used when this is not given.
        :return: The class to use, after applying `element_classes`
         and the builder's per-tag string containers.
        """
        container = base_class or NavigableString

        # There may be a general override of NavigableString.
        container = self.element_classes.get(
            container, container
        )

        # On top of that, we may be inside a tag that needs a special
        # container class.
        if self.string_container_stack:
            container = self.builder.string_containers.get(
                self.string_container_stack[-1].name, container
            )
        return container

    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this BeautifulSoup
        object.
        """
        container = self.string_container(subclass)
        return container(s)

    def insert_before(self, successor):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Internal method called by _popToTag when a tag is closed."""
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.string_container_stack and tag == self.string_container_stack[-1]:
            self.string_container_stack.pop()
        #print("Pop", tag.name)
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Internal method called by handle_starttag when a tag is opened."""
        #print("Push", tag.name)
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
        if tag.name in self.builder.string_containers:
            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
        """Method called by the TreeBuilder when the end of a data segment
        occurs.

        :param containerClass: The string class to use for the
         collected data; resolved through string_container().
        """
        containerClass = self.string_container(containerClass)

        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if (not self.preserve_whitespace_tag_stack
                    and not current_data.strip(self.ASCII_SPACES)):
                if '\n' in current_data:
                    current_data = '\n'
                else:
                    current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The object to add to the tree.
        :param parent: The tag to append `o` to; defaults to the
         current tag.
        :param most_recent_element: The element to link `o` after;
         defaults to the most recently parsed element.
        """
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag carries its own linkage; keep it.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # This must be read before `o` is linked in below.
        fix = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if fix:
            self._linkage_fixer(parent)

    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound."""

        first = el.contents[0]
        child = el.contents[-1]
        descendant = child

        if child is first and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = child
            # We are no longer linked to whatever this element is
            prev_el = child.previous_element
            if prev_el is not None and prev_el is not el:
                prev_el.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            child.previous_element = el
            child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(child, Tag) and child.contents:
            descendant = child._last_descendant(False)

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        descendant.next_element = None
        descendant.next_sibling = None
        target = el
        while target is not None:
            if target.next_sibling is not None:
                descendant.next_element = target.next_sibling
                target.next_sibling.previous_element = child
                break
            target = target.parent

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag.

        :param name: Pop up to the most recent tag with this name.
        :param nsprefix: The namespace prefix that goes with `name`.
        :param inclusivePop: If this is false, pops the tag stack up
          to but *not* including the most recent instance of the
          given tag.
        :return: The value returned by the last popTag() call, or None
          if nothing was popped.
        """
        #print("Popping to %s" % name)
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the top of the stack down to, but not including,
        # index 0.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag; passed through to the
          new Tag.
        :param nsprefix: Namespace prefix for the tag.
        :param attrs: A dictionary of attribute values.
        :param sourceline: The line number where this tag was found in its
          source document.
        :param sourcepos: The character position within `sourceline` where this
          tag was found.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        # print("Start tag %s: %s" % (name, attrs))
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
                and (self.parse_only.text
                     or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos
        )
        # The factory comes from element_classes, so it is
        # caller-controlled and is not assumed to return an object.
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Called by the tree builder when an ending tag is encountered.

        :param name: Name of the tag.
        :param nsprefix: Namespace prefix for the tag.
        """
        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of the parse tree
            as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
            make the document more readable.
        :param eventual_encoding: The encoding of the final document.
            If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to Tag.decode().
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
                # This is a special Python encoding; it can't actually
                # go into an XML document because it means nothing
                # outside of Python.
                eventual_encoding = None
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)


# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup


class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build an XML BeautifulSoup.

        Any `features` value supplied by the caller is overwritten
        with 'xml'.
        """
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
    pass


class FeatureNotFound(ValueError):
    """Exception raised by the BeautifulSoup constructor if no parser with the
    requested features is found.
    """
    pass


#If this file is run as a script, act as an HTML pretty-printer.
if __name__ == '__main__':
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())