# Path: blob/master/venv/Lib/site-packages/bs4/element.py
# 811 views
# Use of this source code is governed by the MIT license.1__license__ = "MIT"23try:4from collections.abc import Callable # Python 3.65except ImportError as e:6from collections import Callable7import re8import sys9import warnings10try:11import soupsieve12except ImportError as e:13soupsieve = None14warnings.warn(15'The soupsieve package is not installed. CSS selectors cannot be used.'16)1718from bs4.formatter import (19Formatter,20HTMLFormatter,21XMLFormatter,22)2324DEFAULT_OUTPUT_ENCODING = "utf-8"25PY3K = (sys.version_info[0] > 2)2627nonwhitespace_re = re.compile(r"\S+")2829# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on30# the off chance someone imported it for their own use.31whitespace_re = re.compile(r"\s+")3233def _alias(attr):34"""Alias one attribute name to another for backward compatibility"""35@property36def alias(self):37return getattr(self, attr)3839@alias.setter40def alias(self):41return setattr(self, attr)42return alias434445# These encodings are recognized by Python (so PageElement.encode46# could theoretically support them) but XML and HTML don't recognize47# them (so they should not show up in an XML or HTML document as that48# document's encoding).49#50# If an XML document is encoded in one of these encodings, no encoding51# will be mentioned in the XML declaration. If an HTML document is52# encoded in one of these encodings, and the HTML document has a53# <meta> tag that mentions an encoding, the encoding will be given as54# the empty string.55#56# Source:57# https://docs.python.org/3/library/codecs.html#python-specific-encodings58PYTHON_SPECIFIC_ENCODINGS = set([59"idna",60"mbcs",61"oem",62"palmos",63"punycode",64"raw_unicode_escape",65"undefined",66"unicode_escape",67"raw-unicode-escape",68"unicode-escape",69"string-escape",70"string_escape",71])727374class NamespacedAttribute(str):75"""A namespaced string (e.g. 
'xml:lang') that remembers the namespace76('xml') and the name ('lang') that were used to create it.77"""7879def __new__(cls, prefix, name=None, namespace=None):80if not name:81# This is the default namespace. Its name "has no value"82# per https://www.w3.org/TR/xml-names/#defaulting83name = None8485if name is None:86obj = str.__new__(cls, prefix)87elif prefix is None:88# Not really namespaced.89obj = str.__new__(cls, name)90else:91obj = str.__new__(cls, prefix + ":" + name)92obj.prefix = prefix93obj.name = name94obj.namespace = namespace95return obj9697class AttributeValueWithCharsetSubstitution(str):98"""A stand-in object for a character encoding specified in HTML."""99100class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):101"""A generic stand-in for the value of a meta tag's 'charset' attribute.102103When Beautiful Soup parses the markup '<meta charset="utf8">', the104value of the 'charset' attribute will be one of these objects.105"""106107def __new__(cls, original_value):108obj = str.__new__(cls, original_value)109obj.original_value = original_value110return obj111112def encode(self, encoding):113"""When an HTML document is being encoded to a given encoding, the114value of a meta tag's 'charset' is the name of the encoding.115"""116if encoding in PYTHON_SPECIFIC_ENCODINGS:117return ''118return encoding119120121class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):122"""A generic stand-in for the value of a meta tag's 'content' attribute.123124When Beautiful Soup parses the markup:125<meta http-equiv="content-type" content="text/html; charset=utf8">126127The value of the 'content' attribute will be one of these objects.128"""129130CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)131132def __new__(cls, original_value):133match = cls.CHARSET_RE.search(original_value)134if match is None:135# No substitution necessary.136return str.__new__(str, original_value)137138obj = str.__new__(cls, 
original_value)139obj.original_value = original_value140return obj141142def encode(self, encoding):143if encoding in PYTHON_SPECIFIC_ENCODINGS:144return ''145def rewrite(match):146return match.group(1) + encoding147return self.CHARSET_RE.sub(rewrite, self.original_value)148149150class PageElement(object):151"""Contains the navigational information for some part of the page:152that is, its current location in the parse tree.153154NavigableString, Tag, etc. are all subclasses of PageElement.155"""156157def setup(self, parent=None, previous_element=None, next_element=None,158previous_sibling=None, next_sibling=None):159"""Sets up the initial relations between this element and160other elements.161162:param parent: The parent of this element.163164:param previous_element: The element parsed immediately before165this one.166167:param next_element: The element parsed immediately before168this one.169170:param previous_sibling: The most recently encountered element171on the same level of the parse tree as this one.172173:param previous_sibling: The next element to be encountered174on the same level of the parse tree as this one.175"""176self.parent = parent177178self.previous_element = previous_element179if previous_element is not None:180self.previous_element.next_element = self181182self.next_element = next_element183if self.next_element is not None:184self.next_element.previous_element = self185186self.next_sibling = next_sibling187if self.next_sibling is not None:188self.next_sibling.previous_sibling = self189190if (previous_sibling is None191and self.parent is not None and self.parent.contents):192previous_sibling = self.parent.contents[-1]193194self.previous_sibling = previous_sibling195if previous_sibling is not None:196self.previous_sibling.next_sibling = self197198def format_string(self, s, formatter):199"""Format the given string using the given formatter.200201:param s: A string.202:param formatter: A Formatter object, or a string naming one of the standard 
formatters.203"""204if formatter is None:205return s206if not isinstance(formatter, Formatter):207formatter = self.formatter_for_name(formatter)208output = formatter.substitute(s)209return output210211def formatter_for_name(self, formatter):212"""Look up or create a Formatter for the given identifier,213if necessary.214215:param formatter: Can be a Formatter object (used as-is), a216function (used as the entity substitution hook for an217XMLFormatter or HTMLFormatter), or a string (used to look218up an XMLFormatter or HTMLFormatter in the appropriate219registry.220"""221if isinstance(formatter, Formatter):222return formatter223if self._is_xml:224c = XMLFormatter225else:226c = HTMLFormatter227if isinstance(formatter, Callable):228return c(entity_substitution=formatter)229return c.REGISTRY[formatter]230231@property232def _is_xml(self):233"""Is this element part of an XML tree or an HTML tree?234235This is used in formatter_for_name, when deciding whether an236XMLFormatter or HTMLFormatter is more appropriate. It can be237inefficient, but it should be called very rarely.238"""239if self.known_xml is not None:240# Most of the time we will have determined this when the241# document is parsed.242return self.known_xml243244# Otherwise, it's likely that this element was created by245# direct invocation of the constructor from within the user's246# Python code.247if self.parent is None:248# This is the top-level object. It should have .known_xml set249# from tree creation. 
If not, take a guess--BS is usually250# used on HTML markup.251return getattr(self, 'is_xml', False)252return self.parent._is_xml253254nextSibling = _alias("next_sibling") # BS3255previousSibling = _alias("previous_sibling") # BS3256257def replace_with(self, replace_with):258"""Replace this PageElement with another one, keeping the rest of the259tree the same.260261:param replace_with: A PageElement.262:return: `self`, no longer part of the tree.263"""264if self.parent is None:265raise ValueError(266"Cannot replace one element with another when the "267"element to be replaced is not part of a tree.")268if replace_with is self:269return270if replace_with is self.parent:271raise ValueError("Cannot replace a Tag with its parent.")272old_parent = self.parent273my_index = self.parent.index(self)274self.extract(_self_index=my_index)275old_parent.insert(my_index, replace_with)276return self277replaceWith = replace_with # BS3278279def unwrap(self):280"""Replace this PageElement with its contents.281282:return: `self`, no longer part of the tree.283"""284my_parent = self.parent285if self.parent is None:286raise ValueError(287"Cannot replace an element with its contents when that"288"element is not part of a tree.")289my_index = self.parent.index(self)290self.extract(_self_index=my_index)291for child in reversed(self.contents[:]):292my_parent.insert(my_index, child)293return self294replace_with_children = unwrap295replaceWithChildren = unwrap # BS3296297def wrap(self, wrap_inside):298"""Wrap this PageElement inside another one.299300:param wrap_inside: A PageElement.301:return: `wrap_inside`, occupying the position in the tree that used302to be occupied by `self`, and with `self` inside it.303"""304me = self.replace_with(wrap_inside)305wrap_inside.append(me)306return wrap_inside307308def extract(self, _self_index=None):309"""Destructively rips this element out of the tree.310311:param _self_index: The location of this element in its parent's312.contents, if known. 
Passing this in allows for a performance313optimization.314315:return: `self`, no longer part of the tree.316"""317if self.parent is not None:318if _self_index is None:319_self_index = self.parent.index(self)320del self.parent.contents[_self_index]321322#Find the two elements that would be next to each other if323#this element (and any children) hadn't been parsed. Connect324#the two.325last_child = self._last_descendant()326next_element = last_child.next_element327328if (self.previous_element is not None and329self.previous_element is not next_element):330self.previous_element.next_element = next_element331if next_element is not None and next_element is not self.previous_element:332next_element.previous_element = self.previous_element333self.previous_element = None334last_child.next_element = None335336self.parent = None337if (self.previous_sibling is not None338and self.previous_sibling is not self.next_sibling):339self.previous_sibling.next_sibling = self.next_sibling340if (self.next_sibling is not None341and self.next_sibling is not self.previous_sibling):342self.next_sibling.previous_sibling = self.previous_sibling343self.previous_sibling = self.next_sibling = None344return self345346def _last_descendant(self, is_initialized=True, accept_self=True):347"""Finds the last element beneath this object to be parsed.348349:param is_initialized: Has `setup` been called on this PageElement350yet?351:param accept_self: Is `self` an acceptable answer to the question?352"""353if is_initialized and self.next_sibling is not None:354last_child = self.next_sibling.previous_element355else:356last_child = self357while isinstance(last_child, Tag) and last_child.contents:358last_child = last_child.contents[-1]359if not accept_self and last_child is self:360last_child = None361return last_child362# BS3: Not part of the API!363_lastRecursiveChild = _last_descendant364365def insert(self, position, new_child):366"""Insert a new PageElement in the list of this PageElement's 
children.367368This works the same way as `list.insert`.369370:param position: The numeric position that should be occupied371in `self.children` by the new PageElement.372:param new_child: A PageElement.373"""374if new_child is None:375raise ValueError("Cannot insert None into a tag.")376if new_child is self:377raise ValueError("Cannot insert a tag into itself.")378if (isinstance(new_child, str)379and not isinstance(new_child, NavigableString)):380new_child = NavigableString(new_child)381382from bs4 import BeautifulSoup383if isinstance(new_child, BeautifulSoup):384# We don't want to end up with a situation where one BeautifulSoup385# object contains another. Insert the children one at a time.386for subchild in list(new_child.contents):387self.insert(position, subchild)388position += 1389return390position = min(position, len(self.contents))391if hasattr(new_child, 'parent') and new_child.parent is not None:392# We're 'inserting' an element that's already one393# of this object's children.394if new_child.parent is self:395current_index = self.index(new_child)396if current_index < position:397# We're moving this element further down the list398# of this object's children. 
That means that when399# we extract this element, our target index will400# jump down one.401position -= 1402new_child.extract()403404new_child.parent = self405previous_child = None406if position == 0:407new_child.previous_sibling = None408new_child.previous_element = self409else:410previous_child = self.contents[position - 1]411new_child.previous_sibling = previous_child412new_child.previous_sibling.next_sibling = new_child413new_child.previous_element = previous_child._last_descendant(False)414if new_child.previous_element is not None:415new_child.previous_element.next_element = new_child416417new_childs_last_element = new_child._last_descendant(False)418419if position >= len(self.contents):420new_child.next_sibling = None421422parent = self423parents_next_sibling = None424while parents_next_sibling is None and parent is not None:425parents_next_sibling = parent.next_sibling426parent = parent.parent427if parents_next_sibling is not None:428# We found the element that comes next in the document.429break430if parents_next_sibling is not None:431new_childs_last_element.next_element = parents_next_sibling432else:433# The last element of this tag is the last element in434# the document.435new_childs_last_element.next_element = None436else:437next_child = self.contents[position]438new_child.next_sibling = next_child439if new_child.next_sibling is not None:440new_child.next_sibling.previous_sibling = new_child441new_childs_last_element.next_element = next_child442443if new_childs_last_element.next_element is not None:444new_childs_last_element.next_element.previous_element = new_childs_last_element445self.contents.insert(position, new_child)446447def append(self, tag):448"""Appends the given PageElement to the contents of this one.449450:param tag: A PageElement.451"""452self.insert(len(self.contents), tag)453454def extend(self, tags):455"""Appends the given PageElements to this one's contents.456457:param tags: A list of PageElements.458"""459for tag in 
tags:460self.append(tag)461462def insert_before(self, *args):463"""Makes the given element(s) the immediate predecessor of this one.464465All the elements will have the same parent, and the given elements466will be immediately before this one.467468:param args: One or more PageElements.469"""470parent = self.parent471if parent is None:472raise ValueError(473"Element has no parent, so 'before' has no meaning.")474if any(x is self for x in args):475raise ValueError("Can't insert an element before itself.")476for predecessor in args:477# Extract first so that the index won't be screwed up if they478# are siblings.479if isinstance(predecessor, PageElement):480predecessor.extract()481index = parent.index(self)482parent.insert(index, predecessor)483484def insert_after(self, *args):485"""Makes the given element(s) the immediate successor of this one.486487The elements will have the same parent, and the given elements488will be immediately after this one.489490:param args: One or more PageElements.491"""492# Do all error checking before modifying the tree.493parent = self.parent494if parent is None:495raise ValueError(496"Element has no parent, so 'after' has no meaning.")497if any(x is self for x in args):498raise ValueError("Can't insert an element after itself.")499500offset = 0501for successor in args:502# Extract first so that the index won't be screwed up if they503# are siblings.504if isinstance(successor, PageElement):505successor.extract()506index = parent.index(self)507parent.insert(index+1+offset, successor)508offset += 1509510def find_next(self, name=None, attrs={}, text=None, **kwargs):511"""Find the first PageElement that matches the given criteria and512appears later in the document than this PageElement.513514All find_* methods take a common set of arguments. 
See the online515documentation for detailed explanations.516517:param name: A filter on tag name.518:param attrs: A dictionary of filters on attribute values.519:param text: A filter for a NavigableString with specific text.520:kwargs: A dictionary of filters on attribute values.521:return: A PageElement.522:rtype: bs4.element.Tag | bs4.element.NavigableString523"""524return self._find_one(self.find_all_next, name, attrs, text, **kwargs)525findNext = find_next # BS3526527def find_all_next(self, name=None, attrs={}, text=None, limit=None,528**kwargs):529"""Find all PageElements that match the given criteria and appear530later in the document than this PageElement.531532All find_* methods take a common set of arguments. See the online533documentation for detailed explanations.534535:param name: A filter on tag name.536:param attrs: A dictionary of filters on attribute values.537:param text: A filter for a NavigableString with specific text.538:param limit: Stop looking after finding this many results.539:kwargs: A dictionary of filters on attribute values.540:return: A ResultSet containing PageElements.541"""542return self._find_all(name, attrs, text, limit, self.next_elements,543**kwargs)544findAllNext = find_all_next # BS3545546def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):547"""Find the closest sibling to this PageElement that matches the548given criteria and appears later in the document.549550All find_* methods take a common set of arguments. 
See the551online documentation for detailed explanations.552553:param name: A filter on tag name.554:param attrs: A dictionary of filters on attribute values.555:param text: A filter for a NavigableString with specific text.556:kwargs: A dictionary of filters on attribute values.557:return: A PageElement.558:rtype: bs4.element.Tag | bs4.element.NavigableString559"""560return self._find_one(self.find_next_siblings, name, attrs, text,561**kwargs)562findNextSibling = find_next_sibling # BS3563564def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,565**kwargs):566"""Find all siblings of this PageElement that match the given criteria567and appear later in the document.568569All find_* methods take a common set of arguments. See the online570documentation for detailed explanations.571572:param name: A filter on tag name.573:param attrs: A dictionary of filters on attribute values.574:param text: A filter for a NavigableString with specific text.575:param limit: Stop looking after finding this many results.576:kwargs: A dictionary of filters on attribute values.577:return: A ResultSet of PageElements.578:rtype: bs4.element.ResultSet579"""580return self._find_all(name, attrs, text, limit,581self.next_siblings, **kwargs)582findNextSiblings = find_next_siblings # BS3583fetchNextSiblings = find_next_siblings # BS2584585def find_previous(self, name=None, attrs={}, text=None, **kwargs):586"""Look backwards in the document from this PageElement and find the587first PageElement that matches the given criteria.588589All find_* methods take a common set of arguments. 
See the online590documentation for detailed explanations.591592:param name: A filter on tag name.593:param attrs: A dictionary of filters on attribute values.594:param text: A filter for a NavigableString with specific text.595:kwargs: A dictionary of filters on attribute values.596:return: A PageElement.597:rtype: bs4.element.Tag | bs4.element.NavigableString598"""599return self._find_one(600self.find_all_previous, name, attrs, text, **kwargs)601findPrevious = find_previous # BS3602603def find_all_previous(self, name=None, attrs={}, text=None, limit=None,604**kwargs):605"""Look backwards in the document from this PageElement and find all606PageElements that match the given criteria.607608All find_* methods take a common set of arguments. See the online609documentation for detailed explanations.610611:param name: A filter on tag name.612:param attrs: A dictionary of filters on attribute values.613:param text: A filter for a NavigableString with specific text.614:param limit: Stop looking after finding this many results.615:kwargs: A dictionary of filters on attribute values.616:return: A ResultSet of PageElements.617:rtype: bs4.element.ResultSet618"""619return self._find_all(name, attrs, text, limit, self.previous_elements,620**kwargs)621findAllPrevious = find_all_previous # BS3622fetchPrevious = find_all_previous # BS2623624def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):625"""Returns the closest sibling to this PageElement that matches the626given criteria and appears earlier in the document.627628All find_* methods take a common set of arguments. 
See the online629documentation for detailed explanations.630631:param name: A filter on tag name.632:param attrs: A dictionary of filters on attribute values.633:param text: A filter for a NavigableString with specific text.634:kwargs: A dictionary of filters on attribute values.635:return: A PageElement.636:rtype: bs4.element.Tag | bs4.element.NavigableString637"""638return self._find_one(self.find_previous_siblings, name, attrs, text,639**kwargs)640findPreviousSibling = find_previous_sibling # BS3641642def find_previous_siblings(self, name=None, attrs={}, text=None,643limit=None, **kwargs):644"""Returns all siblings to this PageElement that match the645given criteria and appear earlier in the document.646647All find_* methods take a common set of arguments. See the online648documentation for detailed explanations.649650:param name: A filter on tag name.651:param attrs: A dictionary of filters on attribute values.652:param text: A filter for a NavigableString with specific text.653:param limit: Stop looking after finding this many results.654:kwargs: A dictionary of filters on attribute values.655:return: A ResultSet of PageElements.656:rtype: bs4.element.ResultSet657"""658return self._find_all(name, attrs, text, limit,659self.previous_siblings, **kwargs)660findPreviousSiblings = find_previous_siblings # BS3661fetchPreviousSiblings = find_previous_siblings # BS2662663def find_parent(self, name=None, attrs={}, **kwargs):664"""Find the closest parent of this PageElement that matches the given665criteria.666667All find_* methods take a common set of arguments. 
See the online668documentation for detailed explanations.669670:param name: A filter on tag name.671:param attrs: A dictionary of filters on attribute values.672:kwargs: A dictionary of filters on attribute values.673674:return: A PageElement.675:rtype: bs4.element.Tag | bs4.element.NavigableString676"""677# NOTE: We can't use _find_one because findParents takes a different678# set of arguments.679r = None680l = self.find_parents(name, attrs, 1, **kwargs)681if l:682r = l[0]683return r684findParent = find_parent # BS3685686def find_parents(self, name=None, attrs={}, limit=None, **kwargs):687"""Find all parents of this PageElement that match the given criteria.688689All find_* methods take a common set of arguments. See the online690documentation for detailed explanations.691692:param name: A filter on tag name.693:param attrs: A dictionary of filters on attribute values.694:param limit: Stop looking after finding this many results.695:kwargs: A dictionary of filters on attribute values.696697:return: A PageElement.698:rtype: bs4.element.Tag | bs4.element.NavigableString699"""700return self._find_all(name, attrs, None, limit, self.parents,701**kwargs)702findParents = find_parents # BS3703fetchParents = find_parents # BS2704705@property706def next(self):707"""The PageElement, if any, that was parsed just after this one.708709:return: A PageElement.710:rtype: bs4.element.Tag | bs4.element.NavigableString711"""712return self.next_element713714@property715def previous(self):716"""The PageElement, if any, that was parsed just before this one.717718:return: A PageElement.719:rtype: bs4.element.Tag | bs4.element.NavigableString720"""721return self.previous_element722723#These methods do the real heavy lifting.724725def _find_one(self, method, name, attrs, text, **kwargs):726r = None727l = method(name, attrs, text, 1, **kwargs)728if l:729r = l[0]730return r731732def _find_all(self, name, attrs, text, limit, generator, **kwargs):733"Iterates over a generator looking for 
things that match."734735if text is None and 'string' in kwargs:736text = kwargs['string']737del kwargs['string']738739if isinstance(name, SoupStrainer):740strainer = name741else:742strainer = SoupStrainer(name, attrs, text, **kwargs)743744if text is None and not limit and not attrs and not kwargs:745if name is True or name is None:746# Optimization to find all tags.747result = (element for element in generator748if isinstance(element, Tag))749return ResultSet(strainer, result)750elif isinstance(name, str):751# Optimization to find all tags with a given name.752if name.count(':') == 1:753# This is a name with a prefix. If this is a namespace-aware document,754# we need to match the local name against tag.name. If not,755# we need to match the fully-qualified name against tag.name.756prefix, local_name = name.split(':', 1)757else:758prefix = None759local_name = name760result = (element for element in generator761if isinstance(element, Tag)762and (763element.name == name764) or (765element.name == local_name766and (prefix is None or element.prefix == prefix)767)768)769return ResultSet(strainer, result)770results = ResultSet(strainer)771while True:772try:773i = next(generator)774except StopIteration:775break776if i:777found = strainer.search(i)778if found:779results.append(found)780if limit and len(results) >= limit:781break782return results783784#These generators can be used to navigate starting from both785#NavigableStrings and Tags.786@property787def next_elements(self):788"""All PageElements that were parsed after this one.789790:yield: A sequence of PageElements.791"""792i = self.next_element793while i is not None:794yield i795i = i.next_element796797@property798def next_siblings(self):799"""All PageElements that are siblings of this one but were parsed800later.801802:yield: A sequence of PageElements.803"""804i = self.next_sibling805while i is not None:806yield i807i = i.next_sibling808809@property810def previous_elements(self):811"""All PageElements that were 
parsed before this one.812813:yield: A sequence of PageElements.814"""815i = self.previous_element816while i is not None:817yield i818i = i.previous_element819820@property821def previous_siblings(self):822"""All PageElements that are siblings of this one but were parsed823earlier.824825:yield: A sequence of PageElements.826"""827i = self.previous_sibling828while i is not None:829yield i830i = i.previous_sibling831832@property833def parents(self):834"""All PageElements that are parents of this PageElement.835836:yield: A sequence of PageElements.837"""838i = self.parent839while i is not None:840yield i841i = i.parent842843@property844def decomposed(self):845"""Check whether a PageElement has been decomposed.846847:rtype: bool848"""849return getattr(self, '_decomposed', False) or False850851# Old non-property versions of the generators, for backwards852# compatibility with BS3.853def nextGenerator(self):854return self.next_elements855856def nextSiblingGenerator(self):857return self.next_siblings858859def previousGenerator(self):860return self.previous_elements861862def previousSiblingGenerator(self):863return self.previous_siblings864865def parentGenerator(self):866return self.parents867868869class NavigableString(str, PageElement):870"""A Python Unicode string that is part of a parse tree.871872When Beautiful Soup parses the markup <b>penguin</b>, it will873create a NavigableString for the string "penguin".874"""875876PREFIX = ''877SUFFIX = ''878879# We can't tell just by looking at a string whether it's contained880# in an XML document or an HTML document.881882known_xml = None883884def __new__(cls, value):885"""Create a new NavigableString.886887When unpickling a NavigableString, this method is called with888the string in DEFAULT_OUTPUT_ENCODING. 
That encoding needs to be889passed in to the superclass's __new__ or the superclass won't know890how to handle non-ASCII characters.891"""892if isinstance(value, str):893u = str.__new__(cls, value)894else:895u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)896u.setup()897return u898899def __copy__(self):900"""A copy of a NavigableString has the same contents and class901as the original, but it is not connected to the parse tree.902"""903return type(self)(self)904905def __getnewargs__(self):906return (str(self),)907908def __getattr__(self, attr):909"""text.string gives you text. This is for backwards910compatibility for Navigable*String, but for CData* it lets you911get the string without the CData wrapper."""912if attr == 'string':913return self914else:915raise AttributeError(916"'%s' object has no attribute '%s'" % (917self.__class__.__name__, attr))918919def output_ready(self, formatter="minimal"):920"""Run the string through the provided formatter.921922:param formatter: A Formatter object, or a string naming one of the standard formatters.923"""924output = self.format_string(self, formatter)925return self.PREFIX + output + self.SUFFIX926927@property928def name(self):929"""Since a NavigableString is not a Tag, it has no .name.930931This property is implemented so that code like this doesn't crash932when run on a mixture of Tag and NavigableString objects:933[x.name for x in tag.children]934"""935return None936937@name.setter938def name(self, name):939"""Prevent NavigableString.name from ever being set."""940raise AttributeError("A NavigableString cannot be given a name.")941942943class PreformattedString(NavigableString):944"""A NavigableString not subject to the normal formatting rules.945946This is an abstract class used for special kinds of strings such947as comments (the Comment class) and CDATA blocks (the CData948class).949"""950951PREFIX = ''952SUFFIX = ''953954def output_ready(self, formatter=None):955"""Make this string ready for output by adding any 
subclass-specific956prefix or suffix.957958:param formatter: A Formatter object, or a string naming one959of the standard formatters. The string will be passed into the960Formatter, but only to trigger any side effects: the return961value is ignored.962963:return: The string, with any subclass-specific prefix and964suffix added on.965"""966if formatter is not None:967ignore = self.format_string(self, formatter)968return self.PREFIX + self + self.SUFFIX969970class CData(PreformattedString):971"""A CDATA block."""972PREFIX = '<![CDATA['973SUFFIX = ']]>'974975class ProcessingInstruction(PreformattedString):976"""A SGML processing instruction."""977978PREFIX = '<?'979SUFFIX = '>'980981class XMLProcessingInstruction(ProcessingInstruction):982"""An XML processing instruction."""983PREFIX = '<?'984SUFFIX = '?>'985986class Comment(PreformattedString):987"""An HTML or XML comment."""988PREFIX = '<!--'989SUFFIX = '-->'990991992class Declaration(PreformattedString):993"""An XML declaration."""994PREFIX = '<?'995SUFFIX = '?>'996997998class Doctype(PreformattedString):999"""A document type declaration."""1000@classmethod1001def for_name_and_ids(cls, name, pub_id, system_id):1002"""Generate an appropriate document type declaration for a given1003public ID and system ID.10041005:param name: The name of the document's root element, e.g. 'html'.1006:param pub_id: The Formal Public Identifier for this document type,1007e.g. '-//W3C//DTD XHTML 1.1//EN'1008:param system_id: The system identifier for this document type,1009e.g. 
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'10101011:return: A Doctype.1012"""1013value = name or ''1014if pub_id is not None:1015value += ' PUBLIC "%s"' % pub_id1016if system_id is not None:1017value += ' "%s"' % system_id1018elif system_id is not None:1019value += ' SYSTEM "%s"' % system_id10201021return Doctype(value)10221023PREFIX = '<!DOCTYPE '1024SUFFIX = '>\n'102510261027class Stylesheet(NavigableString):1028"""A NavigableString representing an stylesheet (probably1029CSS).10301031Used to distinguish embedded stylesheets from textual content.1032"""1033pass103410351036class Script(NavigableString):1037"""A NavigableString representing an executable script (probably1038Javascript).10391040Used to distinguish executable code from textual content.1041"""1042pass104310441045class TemplateString(NavigableString):1046"""A NavigableString representing a string found inside an HTML1047template embedded in a larger document.10481049Used to distinguish such strings from the main body of the document.1050"""1051pass105210531054class Tag(PageElement):1055"""Represents an HTML or XML tag that is part of a parse tree, along1056with its attributes and contents.10571058When Beautiful Soup parses the markup <b>penguin</b>, it will1059create a Tag object representing the <b> tag.1060"""10611062def __init__(self, parser=None, builder=None, name=None, namespace=None,1063prefix=None, attrs=None, parent=None, previous=None,1064is_xml=None, sourceline=None, sourcepos=None,1065can_be_empty_element=None, cdata_list_attributes=None,1066preserve_whitespace_tags=None1067):1068"""Basic constructor.10691070:param parser: A BeautifulSoup object.1071:param builder: A TreeBuilder.1072:param name: The name of the tag.1073:param namespace: The URI of this Tag's XML namespace, if any.1074:param prefix: The prefix for this Tag's XML namespace, if any.1075:param attrs: A dictionary of this Tag's attribute values.1076:param parent: The PageElement to use as this Tag's 
parent.1077:param previous: The PageElement that was parsed immediately before1078this tag.1079:param is_xml: If True, this is an XML tag. Otherwise, this is an1080HTML tag.1081:param sourceline: The line number where this tag was found in its1082source document.1083:param sourcepos: The character position within `sourceline` where this1084tag was found.1085:param can_be_empty_element: If True, this tag should be1086represented as <tag/>. If False, this tag should be represented1087as <tag></tag>.1088:param cdata_list_attributes: A list of attributes whose values should1089be treated as CDATA if they ever show up on this tag.1090:param preserve_whitespace_tags: A list of tag names whose contents1091should have their whitespace preserved.1092"""1093if parser is None:1094self.parser_class = None1095else:1096# We don't actually store the parser object: that lets extracted1097# chunks be garbage-collected.1098self.parser_class = parser.__class__1099if name is None:1100raise ValueError("No value provided for new tag's name.")1101self.name = name1102self.namespace = namespace1103self.prefix = prefix1104if ((not builder or builder.store_line_numbers)1105and (sourceline is not None or sourcepos is not None)):1106self.sourceline = sourceline1107self.sourcepos = sourcepos1108if attrs is None:1109attrs = {}1110elif attrs:1111if builder is not None and builder.cdata_list_attributes:1112attrs = builder._replace_cdata_list_attribute_values(1113self.name, attrs)1114else:1115attrs = dict(attrs)1116else:1117attrs = dict(attrs)11181119# If possible, determine ahead of time whether this tag is an1120# XML tag.1121if builder:1122self.known_xml = builder.is_xml1123else:1124self.known_xml = is_xml1125self.attrs = attrs1126self.contents = []1127self.setup(parent, previous)1128self.hidden = False11291130if builder is None:1131# In the absence of a TreeBuilder, use whatever values were1132# passed in here. 
They're probably None, unless this is a copy of some1133# other tag.1134self.can_be_empty_element = can_be_empty_element1135self.cdata_list_attributes = cdata_list_attributes1136self.preserve_whitespace_tags = preserve_whitespace_tags1137else:1138# Set up any substitutions for this tag, such as the charset in a META tag.1139builder.set_up_substitutions(self)11401141# Ask the TreeBuilder whether this tag might be an empty-element tag.1142self.can_be_empty_element = builder.can_be_empty_element(name)11431144# Keep track of the list of attributes of this tag that1145# might need to be treated as a list.1146#1147# For performance reasons, we store the whole data structure1148# rather than asking the question of every tag. Asking would1149# require building a new data structure every time, and1150# (unlike can_be_empty_element), we almost never need1151# to check this.1152self.cdata_list_attributes = builder.cdata_list_attributes11531154# Keep track of the names that might cause this tag to be treated as a1155# whitespace-preserved tag.1156self.preserve_whitespace_tags = builder.preserve_whitespace_tags11571158parserClass = _alias("parser_class") # BS311591160def __copy__(self):1161"""A copy of a Tag is a new Tag, unconnected to the parse tree.1162Its contents are a copy of the old Tag's contents.1163"""1164clone = type(self)(1165None, self.builder, self.name, self.namespace,1166self.prefix, self.attrs, is_xml=self._is_xml,1167sourceline=self.sourceline, sourcepos=self.sourcepos,1168can_be_empty_element=self.can_be_empty_element,1169cdata_list_attributes=self.cdata_list_attributes,1170preserve_whitespace_tags=self.preserve_whitespace_tags1171)1172for attr in ('can_be_empty_element', 'hidden'):1173setattr(clone, attr, getattr(self, attr))1174for child in self.contents:1175clone.append(child.__copy__())1176return clone11771178@property1179def is_empty_element(self):1180"""Is this tag an empty-element tag? 
@property
def string(self):
    """Convenience property giving the one string inside this
    PageElement, if there is exactly one.

    TODO It might make sense to have NavigableString.string return
    itself.

    :return: When this element has exactly one child: that child if
        it is a NavigableString, otherwise the child's own `string`
        value (so the lookup recurses downward). When this element
        has no children or more than one child, None.
    """
    children = self.contents
    if len(children) == 1:
        only_child = children[0]
        if not isinstance(only_child, NavigableString):
            # A lone non-string child is asked for its own .string.
            return only_child.string
        return only_child
    return None

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`."""
    self.clear()
    # Re-create the value through its own class so the object that
    # gets appended is a fresh one.
    replacement = string.__class__(string)
    self.append(replacement)
@property
def stripped_strings(self):
    """Yield the strings found by _all_strings(), stripped first.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
def decompose(self):
    """Recursively destroy this PageElement and its children.

    The element is extracted from the tree, then it and every element
    reachable from it through `next_element` are wiped out.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    doomed = self
    while doomed is not None:
        # Remember the successor first: wiping the instance dictionary
        # below destroys the next_element link.
        upcoming = doomed.next_element
        doomed.__dict__.clear()
        doomed.contents = []
        doomed._decomposed = True
        doomed = upcoming
def index(self, element):
    """Find the position of a child, comparing by identity rather
    than by value.

    Unlike tag.contents.index(element), this never returns the
    position of a different child that merely compares equal.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` within `self.contents`.
    :raise ValueError: If `element` is not one of the children.
    """
    position = 0
    for candidate in self.contents:
        if candidate is element:
            return position
        position += 1
    raise ValueError("Tag.index: element not in tag")
over a Tag iterates over its contents."1404return iter(self.contents)14051406def __len__(self):1407"The length of a Tag is the length of its list of contents."1408return len(self.contents)14091410def __contains__(self, x):1411return x in self.contents14121413def __bool__(self):1414"A tag is non-None even if it has no contents."1415return True14161417def __setitem__(self, key, value):1418"""Setting tag[key] sets the value of the 'key' attribute for the1419tag."""1420self.attrs[key] = value14211422def __delitem__(self, key):1423"Deleting tag[key] deletes all 'key' attributes for the tag."1424self.attrs.pop(key, None)14251426def __call__(self, *args, **kwargs):1427"""Calling a Tag like a function is the same as calling its1428find_all() method. Eg. tag('a') returns a list of all the A tags1429found within this tag."""1430return self.find_all(*args, **kwargs)14311432def __getattr__(self, tag):1433"""Calling tag.subtag is the same as calling tag.find(name="subtag")"""1434#print("Getattr %s.%s" % (self.__class__, tag))1435if len(tag) > 3 and tag.endswith('Tag'):1436# BS3: soup.aTag -> "soup.find("a")1437tag_name = tag[:-3]1438warnings.warn(1439'.%(name)sTag is deprecated, use .find("%(name)s") instead. 
If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(1440name=tag_name1441)1442)1443return self.find(tag_name)1444# We special case contents to avoid recursion.1445elif not tag.startswith("__") and not tag == "contents":1446return self.find(tag)1447raise AttributeError(1448"'%s' object has no attribute '%s'" % (self.__class__, tag))14491450def __eq__(self, other):1451"""Returns true iff this Tag has the same name, the same attributes,1452and the same contents (recursively) as `other`."""1453if self is other:1454return True1455if (not hasattr(other, 'name') or1456not hasattr(other, 'attrs') or1457not hasattr(other, 'contents') or1458self.name != other.name or1459self.attrs != other.attrs or1460len(self) != len(other)):1461return False1462for i, my_child in enumerate(self.contents):1463if my_child != other.contents[i]:1464return False1465return True14661467def __ne__(self, other):1468"""Returns true iff this Tag is not identical to `other`,1469as defined in __eq__."""1470return not self == other14711472def __repr__(self, encoding="unicode-escape"):1473"""Renders this PageElement as a string.14741475:param encoding: The encoding to use (Python 2 only).1476:return: Under Python 2, a bytestring; under Python 3,1477a Unicode string.1478"""1479if PY3K:1480# "The return value must be a string object", i.e. Unicode1481return self.decode()1482else:1483# "The return value must be a string object", i.e. 
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render this PageElement and its contents as a bytestring.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        the string's encode() and its value should be one of the
        constants defined by Python.
    :return: A bytestring.
    """
    # Two steps: render to Unicode (telling decode() which encoding
    # is coming), then encode that Unicode. The arguments to decode()
    # stay positional.
    return self.decode(indent_level, encoding, formatter).encode(
        encoding, errors)
This information1542is passed in so that it can be substituted in if the1543document contains a <META> tag that mentions the document's1544encoding.1545:param formatter: A Formatter object, or a string naming one of1546the standard formatters.1547"""15481549# First off, turn a non-Formatter `formatter` into a Formatter1550# object. This will stop the lookup from happening over and1551# over again.1552if not isinstance(formatter, Formatter):1553formatter = self.formatter_for_name(formatter)1554attributes = formatter.attributes(self)1555attrs = []1556for key, val in attributes:1557if val is None:1558decoded = key1559else:1560if isinstance(val, list) or isinstance(val, tuple):1561val = ' '.join(val)1562elif not isinstance(val, str):1563val = str(val)1564elif (1565isinstance(val, AttributeValueWithCharsetSubstitution)1566and eventual_encoding is not None1567):1568val = val.encode(eventual_encoding)15691570text = formatter.attribute_value(val)1571decoded = (1572str(key) + '='1573+ formatter.quoted_attribute_value(text))1574attrs.append(decoded)1575close = ''1576closeTag = ''15771578prefix = ''1579if self.prefix:1580prefix = self.prefix + ":"15811582if self.is_empty_element:1583close = formatter.void_element_close_prefix or ''1584else:1585closeTag = '</%s%s>' % (prefix, self.name)15861587pretty_print = self._should_pretty_print(indent_level)1588space = ''1589indent_space = ''1590if indent_level is not None:1591indent_space = (' ' * (indent_level - 1))1592if pretty_print:1593space = indent_space1594indent_contents = indent_level + 11595else:1596indent_contents = None1597contents = self.decode_contents(1598indent_contents, eventual_encoding, formatter1599)16001601if self.hidden:1602# This is the 'document root' object.1603s = contents1604else:1605s = []1606attribute_string = ''1607if attrs:1608attribute_string = ' ' + ' '.join(attrs)1609if indent_level is not None:1610# Even if this particular tag is not pretty-printed,1611# we should indent up to the start of the 
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is
        None, a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding is None) or a bytestring
        (otherwise).
    """
    # True is passed where decode()/encode() take their indent level;
    # that is what switches pretty-printing on.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
This information1672is passed in so that it can be substituted in if the1673document contains a <META> tag that mentions the document's1674encoding.16751676:param formatter: A Formatter object, or a string naming one of1677the standard Formatters.1678"""1679# First off, turn a string formatter into a Formatter object. This1680# will stop the lookup from happening over and over again.1681if not isinstance(formatter, Formatter):1682formatter = self.formatter_for_name(formatter)16831684pretty_print = (indent_level is not None)1685s = []1686for c in self:1687text = None1688if isinstance(c, NavigableString):1689text = c.output_ready(formatter)1690elif isinstance(c, Tag):1691s.append(c.decode(indent_level, eventual_encoding,1692formatter))1693preserve_whitespace = (1694self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags1695)1696if text and indent_level and not preserve_whitespace:1697text = text.strip()1698if text:1699if pretty_print and not preserve_whitespace:1700s.append(" " * (indent_level - 1))1701s.append(text)1702if pretty_print and not preserve_whitespace:1703s.append("\n")1704return ''.join(s)17051706def encode_contents(1707self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,1708formatter="minimal"):1709"""Renders the contents of this PageElement as a bytestring.17101711:param indent_level: Each line of the rendering will be1712indented this many spaces. 
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and return the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: The first match, or None if nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Delegate to find_all() with a limit of one result.
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Elements are produced by following `next_element` links, starting
    at the first child and stopping once the last descendant has been
    yielded. That is document order (each element is followed by its
    own descendants before its next sibling), not a breadth-first
    traversal.

    :yield: A sequence of PageElements.
    """
    if not self.contents:
        return
    # The element just past our last descendant marks the end of the
    # walk; it is None when nothing follows.
    stop_node = self._last_descendant().next_element
    current = self.contents[0]
    while current is not stop_node:
        yield current
        current = current.next_element
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. When omitted,
        this element's `_namespaces` value is used.

    :param limit: After finding this number of results, stop looking.
        None is passed to SoupSieve as 0.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package is not
        installed.
    """
    namespaces = self._namespaces if namespaces is None else namespaces
    limit = 0 if limit is None else limit

    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    # Wrap the matches in a ResultSet because it's more consistent and
    # because ResultSet.__getattr__ has a helpful error message.
    return ResultSet(
        None,
        soupsieve.select(selector, self, namespaces, limit, **kwargs),
    )
This was kind of misleading because has_key()1880(attributes) was different from __in__ (contents).18811882has_key() is gone in Python 3, anyway.1883"""1884warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (1885key))1886return self.has_attr(key)18871888# Next, a couple classes to represent queries and their results.1889class SoupStrainer(object):1890"""Encapsulates a number of ways of matching a markup element (tag or1891string).18921893This is primarily used to underpin the find_* methods, but you can1894create one yourself and pass it in as `parse_only` to the1895`BeautifulSoup` constructor, to parse a subset of a large1896document.1897"""18981899def __init__(self, name=None, attrs={}, text=None, **kwargs):1900"""Constructor.19011902The SoupStrainer constructor takes the same arguments passed1903into the find_* methods. See the online documentation for1904detailed explanations.19051906:param name: A filter on tag name.1907:param attrs: A dictionary of filters on attribute values.1908:param text: A filter for a NavigableString with specific text.1909:kwargs: A dictionary of filters on attribute values.1910"""1911self.name = self._normalize_search_value(name)1912if not isinstance(attrs, dict):1913# Treat a non-dict value for attrs as a search for the 'class'1914# attribute.1915kwargs['class'] = attrs1916attrs = None19171918if 'class_' in kwargs:1919# Treat class_="foo" as a search for the 'class'1920# attribute, overriding any non-dict value for attrs.1921kwargs['class'] = kwargs['class_']1922del kwargs['class_']19231924if kwargs:1925if attrs:1926attrs = attrs.copy()1927attrs.update(kwargs)1928else:1929attrs = kwargs1930normalized_attrs = {}1931for key, value in list(attrs.items()):1932normalized_attrs[key] = self._normalize_search_value(value)19331934self.attrs = normalized_attrs1935self.text = self._normalize_search_value(text)19361937def _normalize_search_value(self, value):1938# Leave it alone if it's a Unicode string, a callable, a1939# 
regular expression, a boolean, or None.1940if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')1941or isinstance(value, bool) or value is None):1942return value19431944# If it's a bytestring, convert it to Unicode, treating it as UTF-8.1945if isinstance(value, bytes):1946return value.decode("utf8")19471948# If it's listlike, convert it into a list of strings.1949if hasattr(value, '__iter__'):1950new_value = []1951for v in value:1952if (hasattr(v, '__iter__') and not isinstance(v, bytes)1953and not isinstance(v, str)):1954# This is almost certainly the user's mistake. In the1955# interests of avoiding infinite loops, we'll let1956# it through as-is rather than doing a recursive call.1957new_value.append(v)1958else:1959new_value.append(self._normalize_search_value(v))1960return new_value19611962# Otherwise, convert it into a Unicode string.1963# The unicode(str()) thing is so this will do the same thing on Python 21964# and Python 3.1965return str(str(value))19661967def __str__(self):1968"""A human-readable representation of this SoupStrainer."""1969if self.text:1970return self.text1971else:1972return "%s|%s" % (self.name, self.attrs)19731974def search_tag(self, markup_name=None, markup_attrs={}):1975"""Check whether a Tag with the given name and attributes would1976match this SoupStrainer.19771978Used prospectively to decide whether to even bother creating a Tag1979object.19801981:param markup_name: A tag name as found in some markup.1982:param markup_attrs: A dictionary of attributes as found in some markup.19831984:return: True if the prospective tag would match this SoupStrainer;1985False otherwise.1986"""1987found = None1988markup = None1989if isinstance(markup_name, Tag):1990markup = markup_name1991markup_attrs = markup1992call_function_with_tag_data = (1993isinstance(self.name, Callable)1994and not isinstance(markup_name, Tag))19951996if ((not self.name)1997or call_function_with_tag_data1998or (markup and self._matches(markup, 
def search(self, markup):
    """Find all items in `markup` that match this SoupStrainer.

    Used by the core _find_all() method, which is ultimately
    called by all find_* methods.

    :param markup: A PageElement or a list of them.
    :return: The matching element, or None when nothing matches.
    :raise Exception: If `markup` is of a kind that cannot be matched.
    """
    # Given a collection of items, scan it for the first text element
    # that matches.
    if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
        for candidate in markup:
            if isinstance(candidate, NavigableString) and self.search(candidate):
                return candidate
        return None

    # A Tag must match on name or attributes. Don't bother with Tags
    # at all when the search is purely for text.
    if isinstance(markup, Tag):
        if not self.text or self.name or self.attrs:
            return self.search_tag(markup)
        return None

    # Text only matches when no name or attribute filter is set and
    # the text filter accepts it.
    if isinstance(markup, (NavigableString, str)):
        if not (self.name or self.attrs) and self._matches(markup, self.text):
            return markup
        return None

    raise Exception(
        "I don't know how to match against a %s" % markup.__class__)
def _matches(self, markup, match_against, already_tried=None):
    """Decide whether `markup` satisfies a single match rule.

    :param markup: The thing being tested: a Tag, a string, None, or a
        list/tuple of strings (the value of a multi-valued attribute
        like 'class').
    :param match_against: The rule: True, a callable, a string, an
        object with a search() method (such as a compiled regular
        expression), or an iterable of such rules.
    :param already_tried: The rules already examined while walking an
        iterable `match_against`. Used internally to avoid infinite
        recursion; callers normally leave it as None.
    :return: A truthy value if `markup` matches and a falsy one if it
        does not. Usually a bool, but a callable rule's own return
        value is passed through, and a regular expression yields its
        match object (or None).
    """
    if isinstance(markup, (list, tuple)):
        # This should only happen when searching a multi-valued
        # attribute like 'class'.
        for item in markup:
            if self._matches(item, match_against):
                return True
        # We didn't match any particular value of the multivalue
        # attribute, but maybe we match the attribute value when
        # considered as a string.
        if self._matches(' '.join(markup), match_against):
            return True
        return False

    if match_against is True:
        # True matches any non-None value.
        return markup is not None

    if isinstance(match_against, Callable):
        return match_against(markup)

    # Custom callables take the tag as an argument, but all
    # other ways of matching match the tag name as a string.
    original_markup = markup
    if isinstance(markup, Tag):
        markup = markup.name

    # Ensure that `markup` is either a Unicode string, or None.
    markup = self._normalize_search_value(markup)

    if markup is None:
        # None matches None, False, an empty string, an empty list,
        # and so on.
        return not match_against

    if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
        # We're asked to match against an iterable of items. The
        # markup must match at least one item in the iterable. We'll
        # try each one in turn.
        #
        # To avoid infinite recursion we need to keep track of
        # items we've already seen.
        if not already_tried:
            already_tried = set()
        for item in match_against:
            # Unhashable items can't be stored in a set, so they are
            # tracked by identity instead.
            key = item if item.__hash__ else id(item)
            if key in already_tried:
                continue
            already_tried.add(key)
            if self._matches(original_markup, item, already_tried):
                return True
        return False

    # Beyond this point we might need to run the test twice: once
    # against the tag's name and once against its prefixed name.
    match = False

    if isinstance(match_against, str):
        # Exact string match
        match = markup == match_against

    if not match and hasattr(match_against, 'search'):
        # Regexp match. The result is kept rather than returned
        # immediately so that a miss on the bare tag name still gets
        # the prefixed-name retry below, like every other rule type.
        match = match_against.search(markup)

    if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
        # Try the whole thing again with the prefixed tag name. The
        # retry passes a plain string, so it cannot recurse again.
        return self._matches(
            original_markup.prefix + ':' + original_markup.name,
            match_against
        )

    return match
class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    that produced it."""

    def __init__(self, source, result=()):
        """Build the result list and record where it came from.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super(ResultSet, self).__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Only reached for attributes that normal lookup did not find.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)