# Path: blob/master/thirdparty/beautifulsoup/beautifulsoup.py
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup parses a (possibly invalid) XML or HTML document into a
tree representation. It provides methods and Pythonic idioms that make
it easy to navigate, search, and modify the tree.

A well-formed XML/HTML document yields a well-formed data
structure. An ill-formed XML/HTML document yields a correspondingly
ill-formed data structure. If your document is only locally
well-formed, you can use this library to find and process the
well-formed part of it.

Beautiful Soup works with Python 2.2 and up. It has no external
dependencies, but you'll have more success at converting data to UTF-8
if you also install these three packages:

* chardet, for auto-detecting character encodings
  http://chardet.feedparser.org/
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
  by stock Python.
  http://cjkpython.i18n.org/

Beautiful Soup defines classes for two main parsing strategies:

* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  language that kind of looks like XML.

* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  or invalid. This class has web browser-like heuristics for
  obtaining a sensible parse tree in the face of common HTML errors.

Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
the encoding of an HTML or XML document, and converting it to
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html

Here, have some legalese:

Copyright (c) 2004-2010, Leonard Richardson

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above
  copyright notice, this list of conditions and the following
  disclaimer in the documentation and/or other materials provided
  with the distribution.

* Neither the name of the Beautiful Soup Consortium and All
  Night Kosher Bakery nor the names of its contributors may be
  used to endorse or promote products derived from this software
  without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.

"""
from __future__ import generators
from __future__ import print_function

__author__ = "Leonard Richardson ([email protected])"
__version__ = "3.2.1b"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "New-style BSD"

import codecs
import re
import sys

# Python 2/3 compatibility aliases used throughout the module.
if sys.version_info >= (3, 0):
    xrange = range
    text_type = str
    binary_type = bytes
    basestring = str
    unichr = chr
else:
    text_type = unicode
    binary_type = str

try:
    from html.entities import name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint

# Python 2.2 compatibility: 'set' only became a builtin in 2.4.
try:
    set
except NameError:
    from sets import Set as set

# sgmllib was removed from the stdlib in Python 3; fall back to a
# vendored copy.
try:
    import sgmllib
except ImportError:
    from lib.utils import sgmllib

# markupbase was renamed _markupbase in Python 3.
try:
    import markupbase
except ImportError:
    import _markupbase as markupbase

# These hacks make Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

DEFAULT_OUTPUT_ENCODING = "utf-8"

def _match_css_class(str):
    """Build a RE to match the given CSS class."""
    return re.compile(r"(^|.*\s)%s($|\s)" % str)
# First, the classes that represent markup elements.

class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k, v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'",
                                     "quot": '"',
                                     "amp": "&",
                                     "lt": "<",
                                     "gt": ">"}

    # Reverse mapping, built once when the class body executes.
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element in the tree with the given element."""
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
           and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Replaces this element with its own children, preserving
        their order."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        # Inserting at a fixed index in reverse order restores the
        # original child order.
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild at the given position among this element's
        contents and rewires the next/previous/sibling links."""
        if isinstance(newChild, basestring) \
           and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Walk up the tree to find the element that follows this
            # subtree in document order, if any.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent:  # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings  # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious  # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings  # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents  # Compatibility with pre-3.x

    # These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Runs a findAll-style method with limit=1 and returns the
        single result, or None."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = next(g)
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    # These Generators can be used to navigate starting from both
    # NavigableStrings and Tags.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in the given string
        with the given encoding (defaulting to utf-8)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
        if isinstance(s, text_type):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, binary_type):
            # NOTE(review): under Python 3 binary_type is bytes, which has
            # no .encode() -- presumably this branch is only reachable on
            # Python 2, where binary_type is str; confirm before relying
            # on it with bytes input.
            s = s.encode(encoding or "utf8")
        else:
            s = self.toEncoding(str(s), encoding or "utf8")
        return s

    # Matches angle brackets, and ampersands that are not already the
    # start of a numeric, hex, or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
This is for backwards479compatibility for Navigable*String, but for CData* it lets you480get the string without the CData wrapper."""481if attr == 'string':482return self483else:484raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))485486def __unicode__(self):487return str(self).decode(DEFAULT_OUTPUT_ENCODING)488489def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):490# Substitute outgoing XML entities.491data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)492if encoding:493return data.encode(encoding)494else:495return data496497class CData(NavigableString):498499def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):500return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)501502class ProcessingInstruction(NavigableString):503def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):504output = self505if "%SOUP-ENCODING%" in output:506output = self.substituteEncoding(output, encoding)507return "<?%s?>" % self.toEncoding(output, encoding)508509class Comment(NavigableString):510def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):511return "<!--%s-->" % NavigableString.__str__(self, encoding)512513class Declaration(NavigableString):514def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):515return "<!%s>" % NavigableString.__str__(self, encoding)516517class Tag(PageElement):518519"""Represents a found HTML tag with its attributes and contents."""520521def _convertEntities(self, match):522"""Used in a call to re.sub to replace HTML, XML, and numeric523entities with the appropriate Unicode characters. 
If HTML524entities are being converted, any unrecognized entities are525escaped."""526try:527x = match.group(1)528if self.convertHTMLEntities and x in name2codepoint:529return unichr(name2codepoint[x])530elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:531if self.convertXMLEntities:532return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]533else:534return u'&%s;' % x535elif len(x) > 0 and x[0] == '#':536# Handle numeric entities537if len(x) > 1 and x[1] == 'x':538return unichr(int(x[2:], 16))539else:540return unichr(int(x[1:]))541542elif self.escapeUnrecognizedEntities:543return u'&%s;' % x544545except ValueError: # e.g. ValueError: unichr() arg not in range(0x10000)546pass547548return u'&%s;' % x549550def __init__(self, parser, name, attrs=None, parent=None,551previous=None):552"Basic constructor."553554# We don't actually store the parser object: that lets extracted555# chunks be garbage-collected556self.parserClass = parser.__class__557self.isSelfClosing = parser.isSelfClosingTag(name)558self.name = name559if attrs is None:560attrs = []561elif isinstance(attrs, dict):562attrs = attrs.items()563self.attrs = attrs564self.contents = []565self.setup(parent, previous)566self.hidden = False567self.containsSubstitutions = False568self.convertHTMLEntities = parser.convertHTMLEntities569self.convertXMLEntities = parser.convertXMLEntities570self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities571572# Convert any HTML, XML, or numeric entities in the attribute values.573# Reference: https://github.com/pkrumins/xgoogle/pull/16/commits/3dba1165c436b0d6e5bdbd09e53ca0dbf8a043f8574convert = lambda k_val: (k_val[0],575re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",576self._convertEntities,577k_val[1]))578self.attrs = map(convert, self.attrs)579580def getString(self):581if (len(self.contents) == 1582and isinstance(self.contents[0], NavigableString)):583return self.contents[0]584585def setString(self, string):586"""Replace the contents of the tag with a 
string"""587self.clear()588self.append(string)589590string = property(getString, setString)591592def getText(self, separator=u""):593if not len(self.contents):594return u""595stopNode = self._lastRecursiveChild().next596strings = []597current = self.contents[0]598while current and current is not stopNode:599if isinstance(current, NavigableString):600strings.append(current.strip())601current = current.next602return separator.join(strings)603604text = property(getText)605606def get(self, key, default=None):607"""Returns the value of the 'key' attribute for the tag, or608the value given for 'default' if it doesn't have that609attribute."""610return self._getAttrMap().get(key, default)611612def clear(self):613"""Extract all children."""614for child in self.contents[:]:615child.extract()616617def index(self, element):618for i, child in enumerate(self.contents):619if child is element:620return i621raise ValueError("Tag.index: element not in tag")622623def has_key(self, key):624return self._getAttrMap().has_key(key)625626def __getitem__(self, key):627"""tag[key] returns the value of the 'key' attribute for the tag,628and throws an exception if it's not there."""629return self._getAttrMap()[key]630631def __iter__(self):632"Iterating over a tag iterates over its contents."633return iter(self.contents)634635def __len__(self):636"The length of a tag is the length of its list of contents."637return len(self.contents)638639def __contains__(self, x):640return x in self.contents641642def __nonzero__(self):643"A tag is non-None even if it has no contents."644return True645646def __setitem__(self, key, value):647"""Setting tag[key] sets the value of the 'key' attribute for the648tag."""649self._getAttrMap()650self.attrMap[key] = value651found = False652for i in xrange(0, len(self.attrs)):653if self.attrs[i][0] == key:654self.attrs[i] = (key, value)655found = True656if not found:657self.attrs.append((key, value))658self._getAttrMap()[key] = value659660def __delitem__(self, 
key):661"Deleting tag[key] deletes all 'key' attributes for the tag."662for item in self.attrs:663if item[0] == key:664self.attrs.remove(item)665#We don't break because bad HTML can define the same666#attribute multiple times.667self._getAttrMap()668if self.attrMap.has_key(key):669del self.attrMap[key]670671def __call__(self, *args, **kwargs):672"""Calling a tag like a function is the same as calling its673findAll() method. Eg. tag('a') returns a list of all the A tags674found within this tag."""675return self.findAll(*args, **kwargs)676677def __getattr__(self, tag):678#print "Getattr %s.%s" % (self.__class__, tag)679if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:680return self.find(tag[:-3])681elif tag.find('__') != 0:682return self.find(tag)683raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))684685def __eq__(self, other):686"""Returns true iff this tag has the same name, the same attributes,687and the same contents (recursively) as the given tag.688689NOTE: right now this will return false if two tags have the690same attributes in a different order. Should this be fixed?"""691if other is self:692return True693if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):694return False695for i in xrange(0, len(self.contents)):696if self.contents[i] != other.contents[i]:697return False698return True699700def __ne__(self, other):701"""Returns true iff this tag is not identical to the other tag,702as defined in __eq__."""703return not self == other704705def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):706"""Renders this tag as a string."""707return self.__str__(encoding)708709def __unicode__(self):710return self.__str__(None)711712def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,713prettyPrint=False, indentLevel=0):714"""Returns a string or Unicode representation of this tag and715its contents. 
To get Unicode, pass None for encoding.716717NOTE: since Python's HTML parser consumes whitespace, this718method is not certain to reproduce the whitespace present in719the original string."""720721encodedName = self.toEncoding(self.name, encoding)722723attrs = []724if self.attrs:725for key, val in self.attrs:726fmt = '%s="%s"'727if isinstance(val, basestring):728if self.containsSubstitutions and '%SOUP-ENCODING%' in val:729val = self.substituteEncoding(val, encoding)730731# The attribute value either:732#733# * Contains no embedded double quotes or single quotes.734# No problem: we enclose it in double quotes.735# * Contains embedded single quotes. No problem:736# double quotes work here too.737# * Contains embedded double quotes. No problem:738# we enclose it in single quotes.739# * Embeds both single _and_ double quotes. This740# can't happen naturally, but it can happen if741# you modify an attribute value after parsing742# the document. Now we have a bit of a743# problem. We solve it by enclosing the744# attribute in single quotes, and escaping any745# embedded single quotes to XML entities.746if '"' in val:747fmt = "%s='%s'"748if "'" in val:749# TODO: replace with apos when750# appropriate.751val = val.replace("'", "&squot;")752753# Now we're okay w/r/t quotes. But the attribute754# value might also contain angle brackets, or755# ampersands that aren't part of entities. 
We need756# to escape those to XML entities too.757val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)758759attrs.append(fmt % (self.toEncoding(key, encoding),760self.toEncoding(val, encoding)))761close = ''762closeTag = ''763if self.isSelfClosing:764close = ' /'765else:766closeTag = '</%s>' % encodedName767768indentTag, indentContents = 0, 0769if prettyPrint:770indentTag = indentLevel771space = (' ' * (indentTag-1))772indentContents = indentTag + 1773contents = self.renderContents(encoding, prettyPrint, indentContents)774if self.hidden:775s = contents776else:777s = []778attributeString = ''779if attrs:780attributeString = ' ' + ' '.join(attrs)781if prettyPrint:782s.append(space)783s.append('<%s%s%s>' % (encodedName, attributeString, close))784if prettyPrint:785s.append("\n")786s.append(contents)787if prettyPrint and contents and contents[-1] != "\n":788s.append("\n")789if prettyPrint and closeTag:790s.append(space)791s.append(closeTag)792if prettyPrint and closeTag and self.nextSibling:793s.append("\n")794s = ''.join(s)795return s796797def decompose(self):798"""Recursively destroys the contents of this tree."""799self.extract()800if len(self.contents) == 0:801return802current = self.contents[0]803while current is not None:804next = current.next805if isinstance(current, Tag):806del current.contents[:]807current.parent = None808current.previous = None809current.previousSibling = None810current.next = None811current.nextSibling = None812current = next813814def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):815return self.__str__(encoding, True)816817def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,818prettyPrint=False, indentLevel=0):819"""Renders the contents of this tag as a string in the given820encoding. 
If encoding is None, returns a Unicode string.."""821s=[]822for c in self:823text = None824if isinstance(c, NavigableString):825text = c.__str__(encoding)826elif isinstance(c, Tag):827s.append(c.__str__(encoding, prettyPrint, indentLevel))828if text and prettyPrint:829text = text.strip()830if text:831if prettyPrint:832s.append(" " * (indentLevel-1))833s.append(text)834if prettyPrint:835s.append("\n")836837return ''.join(s)838839#Soup methods840841def find(self, name=None, attrs={}, recursive=True, text=None,842**kwargs):843"""Return only the first child of this Tag matching the given844criteria."""845r = None846l = self.findAll(name, attrs, recursive, text, 1, **kwargs)847if l:848r = l[0]849return r850findChild = find851852def findAll(self, name=None, attrs={}, recursive=True, text=None,853limit=None, **kwargs):854"""Extracts a list of Tag objects that match the given855criteria. You can specify the name of the Tag and any856attributes you want the Tag to have.857858The value of a key-value pair in the 'attrs' map can be a859string, a list of strings, a regular expression object, or a860callable that takes a string and returns whether or not the861string matches for some custom definition of 'matches'. 
The862same is true of the tag name."""863generator = self.recursiveChildGenerator864if not recursive:865generator = self.childGenerator866return self._findAll(name, attrs, text, limit, generator, **kwargs)867findChildren = findAll868869# Pre-3.x compatibility methods870first = find871fetch = findAll872873def fetchText(self, text=None, recursive=True, limit=None):874return self.findAll(text=text, recursive=recursive, limit=limit)875876def firstText(self, text=None, recursive=True):877return self.find(text=text, recursive=recursive)878879#Private methods880881def _getAttrMap(self):882"""Initializes a map representation of this tag's attributes,883if not already initialized."""884if not getattr(self, 'attrMap'):885self.attrMap = {}886for (key, value) in self.attrs:887self.attrMap[key] = value888return self.attrMap889890#Generator methods891def childGenerator(self):892# Just use the iterator from the contents893return iter(self.contents)894895def recursiveChildGenerator(self):896if not len(self.contents):897return # Note: https://stackoverflow.com/a/30217723 (PEP 479)898stopNode = self._lastRecursiveChild().next899current = self.contents[0]900while current and current is not stopNode:901yield current902current = current.next903904905# Next, a couple classes to represent queries and their results.906class SoupStrainer:907"""Encapsulates a number of ways of matching a markup element (tag or908text)."""909910def __init__(self, name=None, attrs={}, text=None, **kwargs):911self.name = name912if isinstance(attrs, basestring):913kwargs['class'] = _match_css_class(attrs)914attrs = None915if kwargs:916if attrs:917attrs = attrs.copy()918attrs.update(kwargs)919else:920attrs = kwargs921self.attrs = attrs922self.text = text923924def __str__(self):925if self.text:926return self.text927else:928return "%s|%s" % (self.name, self.attrs)929930def searchTag(self, markupName=None, markupAttrs={}):931found = None932markup = None933if isinstance(markupName, Tag):934markup = 
markupName935markupAttrs = markup936callFunctionWithTagData = callable(self.name) \937and not isinstance(markupName, Tag)938939if (not self.name) \940or callFunctionWithTagData \941or (markup and self._matches(markup, self.name)) \942or (not markup and self._matches(markupName, self.name)):943if callFunctionWithTagData:944match = self.name(markupName, markupAttrs)945else:946match = True947markupAttrMap = None948for attr, matchAgainst in self.attrs.items():949if not markupAttrMap:950if hasattr(markupAttrs, 'get'):951markupAttrMap = markupAttrs952else:953markupAttrMap = {}954for k,v in markupAttrs:955markupAttrMap[k] = v956attrValue = markupAttrMap.get(attr)957if not self._matches(attrValue, matchAgainst):958match = False959break960if match:961if markup:962found = markup963else:964found = markupName965return found966967def search(self, markup):968#print 'looking for %s in %s' % (self, markup)969found = None970# If given a list of items, scan it for a text element that971# matches.972if hasattr(markup, "__iter__") \973and not isinstance(markup, Tag):974for element in markup:975if isinstance(element, NavigableString) \976and self.search(element):977found = element978break979# If it's a Tag, make sure its name or attributes match.980# Don't bother with Tags if we're searching for text.981elif isinstance(markup, Tag):982if not self.text:983found = self.searchTag(markup)984# If it's text, make sure the text matches.985elif isinstance(markup, NavigableString) or \986isinstance(markup, basestring):987if self._matches(markup, self.text):988found = markup989else:990raise Exception("I don't know how to match against a %s" \991% markup.__class__)992return found993994def _matches(self, markup, matchAgainst):995#print "Matching %s against %s" % (markup, matchAgainst)996result = False997if matchAgainst is True:998result = markup is not None999elif callable(matchAgainst):1000result = matchAgainst(markup)1001else:1002#Custom match methods take the tag as an argument, but 
all1003#other ways of matching match the tag name as a string.1004if isinstance(markup, Tag):1005markup = markup.name1006if markup and not isinstance(markup, basestring):1007markup = text_type(markup)1008#Now we know that chunk is either a string, or None.1009if hasattr(matchAgainst, 'match'):1010# It's a regexp object.1011result = markup and matchAgainst.search(markup)1012elif hasattr(matchAgainst, '__iter__'): # list-like1013result = markup in matchAgainst1014elif hasattr(matchAgainst, 'items'):1015result = markup.has_key(matchAgainst)1016elif matchAgainst and isinstance(markup, basestring):1017if isinstance(markup, text_type):1018matchAgainst = text_type(matchAgainst)1019else:1020matchAgainst = str(matchAgainst)10211022if not result:1023result = matchAgainst == markup1024return result10251026class ResultSet(list):1027"""A ResultSet is just a list that keeps track of the SoupStrainer1028that created it."""1029def __init__(self, source):1030list.__init__([])1031self.source = source10321033# Now, some helper functions.10341035def buildTagMap(default, *args):1036"""Turns a list of maps, lists, or scalars into a single map.1037Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and1038NESTING_RESET_TAGS maps out of lists and partial maps."""1039built = {}1040for portion in args:1041if hasattr(portion, 'items'):1042#It's a map. Merge it.1043for k,v in portion.items():1044built[k] = v1045elif hasattr(portion, '__iter__'): # is a list1046#It's a list. Map each item to the default.1047for k in portion:1048built[k] = default1049else:1050#It's a scalar. Map it to the default.1051built[portion] = default1052return built10531054# Now, the parser classes.10551056class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):10571058"""This class contains the basic parser and search code. 
It defines1059a parser that knows nothing about tag behavior except for the1060following:10611062You can't close a tag without closing all the tags it encloses.1063That is, "<foo><bar></foo>" actually means1064"<foo><bar></bar></foo>".10651066[Another possible explanation is "<foo><bar /></foo>", but since1067this class defines no SELF_CLOSING_TAGS, it will never use that1068explanation.]10691070This class is useful for parsing XML or made-up markup languages,1071or when BeautifulSoup makes an assumption counter to what you were1072expecting."""10731074SELF_CLOSING_TAGS = {}1075NESTABLE_TAGS = {}1076RESET_NESTING_TAGS = {}1077QUOTE_TAGS = {}1078PRESERVE_WHITESPACE_TAGS = []10791080MARKUP_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),1081lambda x: x.group(1) + ' />'),1082(re.compile(r'<!\s+([^<>]*)>'),1083lambda x: '<!' + x.group(1) + '>')1084]10851086ROOT_TAG_NAME = u'[document]'10871088HTML_ENTITIES = "html"1089XML_ENTITIES = "xml"1090XHTML_ENTITIES = "xhtml"1091# TODO: This only exists for backwards-compatibility1092ALL_ENTITIES = XHTML_ENTITIES10931094# Used when determining whether a text node is all whitespace and1095# can be replaced with a single space. 
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
             markupMassage=True, smartQuotesTo=XML_ENTITIES,
             convertEntities=None, selfClosingTags=None, isHTML=False):
    """The Soup object is initialized as the 'root tag', and the
    provided markup (which can be a string or a file-like object)
    is fed into the underlying parser.

    sgmllib will process most bad HTML, and the BeautifulSoup
    class has some tricks for dealing with some HTML that kills
    sgmllib, but Beautiful Soup can nonetheless choke or lose data
    if your data uses self-closing tags or declarations
    incorrectly.

    By default, Beautiful Soup uses regexes to sanitize input,
    avoiding the vast majority of these problems. If the problems
    don't apply to you, pass in False for markupMassage, and
    you'll get better performance.

    The default parser massage techniques fix the two most common
    instances of invalid HTML that choke sgmllib:

    <br/> (No space between name of closing tag and tag close)
    <! --Comment--> (Extraneous whitespace in declaration)

    You can pass in a custom list of (RE object, replace method)
    tuples to get Beautiful Soup to scrub your input the way you
    want.

    :param parseOnlyThese: a SoupStrainer restricting which parts of the
        document are turned into tree nodes.
    :param convertEntities: one of HTML_ENTITIES / XML_ENTITIES /
        XHTML_ENTITIES; controls the three convert*/escape* flags below.
    :param selfClosingTags: extra tag names treated as self-closing for
        this instance only.
    """
    self.parseOnlyThese = parseOnlyThese
    self.fromEncoding = fromEncoding
    self.smartQuotesTo = smartQuotesTo
    self.convertEntities = convertEntities
    # Set the rules for how we'll deal with the entities we
    # encounter
    if self.convertEntities:
        # It doesn't make sense to convert encoded characters to
        # entities even while you're converting entities to Unicode.
        # Just convert it all to Unicode.
        self.smartQuotesTo = None
        if convertEntities == self.HTML_ENTITIES:
            self.convertXMLEntities = False
            self.convertHTMLEntities = True
            self.escapeUnrecognizedEntities = True
        elif convertEntities == self.XHTML_ENTITIES:
            self.convertXMLEntities = True
            self.convertHTMLEntities = True
            self.escapeUnrecognizedEntities = False
        elif convertEntities == self.XML_ENTITIES:
            self.convertXMLEntities = True
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False
    else:
        # No entity conversion at all.
        self.convertXMLEntities = False
        self.convertHTMLEntities = False
        self.escapeUnrecognizedEntities = False

    self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
    sgmllib.SGMLParser.__init__(self)

    if hasattr(markup, 'read'):  # It's a file-type object.
        markup = markup.read()
    self.markup = markup
    self.markupMassage = markupMassage
    try:
        # Parsing happens eagerly, here in the constructor.
        self._feed(isHTML=isHTML)
    except StopParsing:
        # Raised by start_meta when the document is re-fed with a
        # newly discovered encoding; the re-feed already finished.
        pass
    self.markup = None  # The markup can now be GCed
def convert_charref(self, name):
    """Convert a decimal character reference, working around a bug in
    Python's SGMLParser (which accepts codepoints up to 255).

    Returns None for non-numeric references and for codepoints outside
    the ASCII range 0-127; otherwise delegates to convert_codepoint().
    """
    try:
        codepoint = int(name)
    except ValueError:
        # Not a decimal number -- leave the reference unhandled.
        return None
    if 0 <= codepoint <= 127:  # ASCII ends at 127, not 255
        return self.convert_codepoint(codepoint)
    return None
def isSelfClosingTag(self, name):
    """True iff `name` names a tag this parser treats as self-closing.

    A tag is self-closing if it appears either in the class-level
    SELF_CLOSING_TAGS map or in the per-instance map built from the
    selfClosingTags constructor argument.
    """
    for table in (self.SELF_CLOSING_TAGS, self.instanceSelfClosingTags):
        if name in table:
            return True
    return False
def endData(self, containerClass=NavigableString):
    """Flush the text accumulated in self.currentData into the tree as
    a single string node of type `containerClass`.

    All-ASCII-whitespace text is collapsed to a single '\n' (if it
    contained a newline) or ' ' -- unless some open tag is listed in
    PRESERVE_WHITESPACE_TAGS (e.g. <pre>), in which case the text is
    kept verbatim.
    """
    if self.currentData:
        currentData = u''.join(self.currentData)
        # Collapse pure-whitespace text, but never inside a
        # whitespace-preserving tag.
        if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
            not set([tag.name for tag in self.tagStack]).intersection(
                self.PRESERVE_WHITESPACE_TAGS)):
            if '\n' in currentData:
                currentData = '\n'
            else:
                currentData = ' '
        self.currentData = []
        # When parsing with a SoupStrainer, drop top-level text the
        # strainer does not ask for.
        if self.parseOnlyThese and len(self.tagStack) <= 1 and \
               (not self.parseOnlyThese.text or \
                not self.parseOnlyThese.search(currentData)):
            return
        # Link the new string node into the tree: parent/previous
        # pointers first, then the parent's contents list.
        o = containerClass(currentData)
        o.setup(self.currentTag, self.previous)
        if self.previous:
            self.previous.next = o
        self.previous = o
        self.currentTag.contents.append(o)
def _popToTag(self, name, inclusivePop=True):
    """Pops the tag stack up to and including the most recent
    instance of the given tag.  If inclusivePop is false, pops the tag
    stack up to but *not* including the most recent instance of
    the given tag.

    Returns the last tag actually popped, or None if nothing was
    popped.
    """
    if name == self.ROOT_TAG_NAME:
        # The root pseudo-tag is never popped.
        return

    numPops = 0
    mostRecentTag = None
    # Scan from the top of the stack downward; index 0 (the root) is
    # deliberately excluded from the scan.
    for i in xrange(len(self.tagStack)-1, 0, -1):
        if name == self.tagStack[i].name:
            numPops = len(self.tagStack)-i
            break
    if not inclusivePop:
        # Stop one short so the matching tag itself stays open.
        numPops = numPops - 1

    for i in xrange(0, numPops):
        mostRecentTag = self.popTag()
    return mostRecentTag
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Handle an opening tag.

    Inside a QUOTE_TAGS section (e.g. <script>) the "tag" is not real
    markup and is re-emitted as literal character data.  Otherwise the
    pending text is flushed, nesting rules may implicitly close open
    tags, and a new Tag object is linked into the tree.  Returns the
    new Tag, or None when a SoupStrainer rejects it.
    """
    if self.quoteStack:
        # This is not a real tag; reconstruct its original text and
        # treat it as character data.
        attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
        self.handle_data('<%s%s>' % (name, attrs))
        return
    self.endData()

    # Apply implicit-close nesting rules, unless the tag closes itself.
    if not self.isSelfClosingTag(name) and not selfClosing:
        self._smartPop(name)

    # With a SoupStrainer, skip top-level tags it does not match.
    if self.parseOnlyThese and len(self.tagStack) <= 1 \
           and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
        return

    tag = Tag(self, name, attrs, self.currentTag, self.previous)
    if self.previous:
        self.previous.next = tag
    self.previous = tag
    self.pushTag(tag)
    if selfClosing or self.isSelfClosingTag(name):
        # Self-closing tags never stay on the stack.
        self.popTag()
    if name in self.QUOTE_TAGS:
        # Everything up to the matching end tag is literal text.
        self.quoteStack.append(name)
        self.literal = 1
    return tag
def handle_charref(self, ref):
    """Handle a numeric character reference as text data.

    When entity conversion is enabled the reference is resolved to
    the corresponding Unicode character; otherwise it is passed
    through verbatim as '&#ref;'.
    """
    if not self.convertEntities:
        text = '&#%s;' % ref
    else:
        text = unichr(int(ref))
    self.handle_data(text)
Nonetheless,1439# the logical thing to do is to pass it through as an1440# unrecognized entity reference.1441#1442# Except: when the input is "&carol;" this function1443# will be called with input "carol". When the input is1444# "AT&T", this function will be called with input1445# "T". We have no way of knowing whether a semicolon1446# was present originally, so we don't know whether1447# this is an unknown entity or just a misplaced1448# ampersand.1449#1450# The more common case is a misplaced ampersand, so I1451# escape the ampersand and omit the trailing semicolon.1452data = "&%s" % ref1453if not data:1454# This case is different from the one above, because we1455# haven't already gone through a supposedly comprehensive1456# mapping of entities to Unicode characters. We might not1457# have gone through any mapping at all. So the chances are1458# very high that this is a real entity, and not a1459# misplaced ampersand.1460data = "&%s;" % ref1461self.handle_data(data)14621463def handle_decl(self, data):1464"Handle DOCTYPEs and the like as Declaration objects."1465self._toStringSubclass(data, Declaration)14661467def parse_declaration(self, i):1468"""Treat a bogus SGML declaration as raw data. Treat a CDATA1469declaration as a CData object."""1470j = None1471if self.rawdata[i:i+9] == '<![CDATA[':1472k = self.rawdata.find(']]>', i)1473if k == -1:1474k = len(self.rawdata)1475data = self.rawdata[i+9:k]1476j = k+31477self._toStringSubclass(data, CData)1478else:1479try:1480j = sgmllib.SGMLParser.parse_declaration(self, i)1481except sgmllib.SGMLParseError:1482toHandle = self.rawdata[i:]1483self.handle_data(toHandle)1484j = i + len(toHandle)1485return j14861487class BeautifulSoup(BeautifulStoneSoup):14881489"""This parser knows the following facts about HTML:14901491* Some tags have no closing tag and should be interpreted as being1492closed as soon as they are encountered.14931494* The text inside some tags (ie. 
'script') may contain tags which1495are not really part of the document and which should be parsed1496as text, not tags. If you want to parse the text as tags, you can1497always fetch it and parse it explicitly.14981499* Tag nesting rules:15001501Most tags can't be nested at all. For instance, the occurance of1502a <p> tag should implicitly close the previous <p> tag.15031504<p>Para1<p>Para21505should be transformed into:1506<p>Para1</p><p>Para215071508Some tags can be nested arbitrarily. For instance, the occurance1509of a <blockquote> tag should _not_ implicitly close the previous1510<blockquote> tag.15111512Alice said: <blockquote>Bob said: <blockquote>Blah1513should NOT be transformed into:1514Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah15151516Some tags can be nested, but the nesting is reset by the1517interposition of other tags. For instance, a <tr> tag should1518implicitly close the previous <tr> tag within the same <table>,1519but not close a <tr> tag in another table.15201521<table><tr>Blah<tr>Blah1522should be transformed into:1523<table><tr>Blah</tr><tr>Blah1524but,1525<tr>Blah<table><tr>Blah1526should NOT be transformed into1527<tr>Blah<table></tr><tr>Blah15281529Differing assumptions about tag nesting rules are a major source1530of problems with the BeautifulSoup class. 
def __init__(self, *args, **kwargs):
    """Initialize exactly like BeautifulStoneSoup, but default smart
    quotes to HTML entities and flag the markup as HTML."""
    kwargs.setdefault('smartQuotesTo', self.HTML_ENTITIES)
    kwargs['isHTML'] = True
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
def start_meta(self, attrs):
    """Beautiful Soup can detect a charset included in a META tag,
    try to convert the document to that charset, and re-parse the
    document from the beginning.

    On a later pass (or when the encoding is already known) the
    charset value is replaced by the %SOUP-ENCODING% placeholder so
    the output can be rewritten in the final encoding.
    """
    httpEquiv = None
    contentType = None
    contentTypeIndex = None
    tagNeedsEncodingSubstitution = False

    # Pull out the http-equiv and content attributes, remembering
    # where the content attribute lives so it can be rewritten.
    for i in xrange(0, len(attrs)):
        key, value = attrs[i]
        key = key.lower()
        if key == 'http-equiv':
            httpEquiv = value
        elif key == 'content':
            contentType = value
            contentTypeIndex = i

    if httpEquiv and contentType: # It's an interesting meta tag.
        match = self.CHARSET_RE.search(contentType)
        if match:
            if (self.declaredHTMLEncoding is not None or
                self.originalEncoding == self.fromEncoding):
                # An HTML encoding was sniffed while converting
                # the document to Unicode, or an HTML encoding was
                # sniffed during a previous pass through the
                # document, or an encoding was specified
                # explicitly and it worked. Rewrite the meta tag.
                def rewrite(match):
                    return match.group(1) + "%SOUP-ENCODING%"
                newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                           newAttr)
                tagNeedsEncodingSubstitution = True
            else:
                # This is our first pass through the document.
                # Go through it again with the encoding information.
                newCharset = match.group(3)
                if newCharset and newCharset != self.originalEncoding:
                    self.declaredHTMLEncoding = newCharset
                    # Re-feed aborts this parse via StopParsing,
                    # which __init__ catches and swallows.
                    self._feed(self.declaredHTMLEncoding)
                    raise StopParsing
                pass
    # Build the actual meta Tag node (may be None under a strainer).
    tag = self.unknown_starttag("meta", attrs)
    if tag and tagNeedsEncodingSubstitution:
        tag.containsSubstitutions = True
This class handles the not-co-common1664case: where you can't believe someone wrote what they did, but1665it's valid HTML and BeautifulSoup screwed up by assuming it1666wouldn't be."""16671668I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \1669('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',1670'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',1671'big')16721673I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)16741675NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,1676I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,1677I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)16781679class MinimalSoup(BeautifulSoup):1680"""The MinimalSoup class is for parsing HTML that contains1681pathologically bad markup. It makes no assumptions about tag1682nesting, but it does know which tags are self-closing, that1683<script> tags contain Javascript and should not be parsed, that1684META tags may contain encoding information, and so on.16851686This also makes it better for subclassing than BeautifulStoneSoup1687or BeautifulSoup."""16881689RESET_NESTING_TAGS = buildTagMap('noscript')1690NESTABLE_TAGS = {}16911692class BeautifulSOAP(BeautifulStoneSoup):1693"""This class will push a tag with only a single string child into1694the tag's parent as an attribute. The attribute's name is the tag1695name, and the value is the string child. An example should give1696the flavor of the change:16971698<foo><bar>baz</bar></foo>1699=>1700<foo bar="baz"><bar>baz</bar></foo>17011702You can then access fooTag['bar'] instead of fooTag.barTag.string.17031704This is, of course, useful for scraping structures that tend to1705use subelements instead of attributes, such as SOAP messages. Note1706that it modifies its input, so don't print the modified version1707out.17081709I'm not sure how many people really want to use this class; let me1710know if you do. 
def popTag(self):
    """Pop the current tag, first promoting it to a parent attribute
    when it holds exactly one string child.

    If the closing tag has a single NavigableString child and the
    parent has no attribute of that name yet, the string becomes an
    attribute on the parent:
        <foo><bar>baz</bar></foo>  ->  <foo bar="baz"><bar>baz</bar></foo>
    """
    if len(self.tagStack) > 1:
        tag = self.tagStack[-1]
        parent = self.tagStack[-2]
        parent._getAttrMap()  # ensure parent.attrMap is populated
        # BUGFIX: the original tested parent.attrMap.has_key(tag.name);
        # dict.has_key does not exist on Python 3 -- use `in` instead.
        # Never overwrite an existing attribute of the same name.
        if (isinstance(tag, Tag) and len(tag.contents) == 1 and
            isinstance(tag.contents[0], NavigableString) and
            tag.name not in parent.attrMap):
            parent[tag.name] = tag.contents[0]
    BeautifulStoneSoup.popTag(self)
It does not rewrite the XML or HTML to1749# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi1750# (XML) and BeautifulSoup.start_meta (HTML).17511752# Autodetects character encodings.1753# Download from http://chardet.feedparser.org/1754try:1755import chardet1756# import chardet.constants1757# chardet.constants._debug = 11758except ImportError:1759chardet = None17601761# cjkcodecs and iconv_codec make Python know about more character encodings.1762# Both are available from http://cjkpython.i18n.org/1763# They're built in if you use Python 2.4.1764try:1765import cjkcodecs.aliases1766except ImportError:1767pass1768try:1769import iconv_codec1770except ImportError:1771pass17721773class UnicodeDammit:1774"""A class for detecting the encoding of a *ML document and1775converting it to a Unicode string. If the source encoding is1776windows-1252, can replace MS smart quotes with their HTML or XML1777equivalents."""17781779# This dictionary maps commonly seen values for "charset" in HTML1780# meta tags to the corresponding Python codec names. 
def __init__(self, markup, overrideEncodings=[],
             smartQuotesTo='xml', isHTML=False):
    """Detect `markup`'s encoding and store a Unicode version of it.

    Encodings are tried in this order: the caller-supplied
    overrideEncodings, the encoding declared in the document, the
    encoding sniffed from a BOM/byte pattern, chardet (if installed),
    and finally utf-8 then windows-1252.  On success self.unicode
    holds the text and self.originalEncoding the encoding that
    worked; on failure both end up None.

    NOTE(review): the mutable default overrideEncodings=[] is only
    iterated, never mutated, so the shared default is harmless here.
    """
    self.declaredHTMLEncoding = None
    self.markup, documentEncoding, sniffedEncoding = \
                 self._detectEncoding(markup, isHTML)
    self.smartQuotesTo = smartQuotesTo
    self.triedEncodings = []
    if markup == '' or isinstance(markup, text_type):
        # Already Unicode (or empty) -- nothing to detect.
        self.originalEncoding = None
        self.unicode = text_type(markup)
        return

    u = None
    for proposedEncoding in overrideEncodings:
        u = self._convertFrom(proposedEncoding)
        if u: break
    if not u:
        for proposedEncoding in (documentEncoding, sniffedEncoding):
            u = self._convertFrom(proposedEncoding)
            if u: break

    # If no luck and we have auto-detection library, try that:
    if not u and chardet and not isinstance(self.markup, text_type):
        u = self._convertFrom(chardet.detect(self.markup)['encoding'])

    # As a last resort, try utf-8 and windows-1252:
    if not u:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convertFrom(proposed_encoding)
            if u: break

    self.unicode = u
    if not u: self.originalEncoding = None
def _toUnicode(self, data, encoding):
    '''Given a string and its encoding, decodes the string into Unicode.
    %encoding is a string recognized by encodings.aliases.

    A leading byte-order mark, when present and consistent, overrides
    the supplied encoding.
    '''

    # strip Byte Order Mark (if present)
    # NOTE(review): these prefix comparisons use str literals; if `data`
    # arrives as bytes under Python 3 they can never match, so BOMs
    # would not be stripped -- confirm the intended input type.
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
           and (data[2:4] != '\x00\x00'):
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
             and (data[2:4] != '\x00\x00'):
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        encoding = 'utf-32le'
        data = data[4:]
    newdata = text_type(data, encoding)
    return newdata
def _detectEncoding(self, xml_data, isHTML=False):
    """Given a document, tries to detect its XML encoding.

    Returns (xml_data, xml_encoding, sniffed_xml_encoding), where the
    data may have been transcoded to UTF-8 from a sniffed UTF-16/32
    variant, xml_encoding is the encoding declared in an XML prolog or
    (for HTML) a META charset, and sniffed_xml_encoding is what the
    leading bytes implied.
    """
    xml_encoding = sniffed_xml_encoding = None
    try:
        # Sniff the first bytes for an EBCDIC signature, a UTF-16/32
        # pattern of NUL bytes around '<', or an explicit BOM.
        # NOTE(review): like _toUnicode, these comparisons use str
        # literals and will not match bytes input on Python 3.
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = self._ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = text_type(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                 and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = text_type(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                 (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = text_type(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = text_type(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = text_type(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = text_type(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = text_type(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            sniffed_xml_encoding = 'ascii'
            pass
    except:
        # NOTE(review): bare except; this assignment is immediately
        # overwritten by the unconditional match below, so a sniffing
        # failure is silently ignored.
        xml_encoding_match = None
    # Look for an encoding declared in an XML prolog...
    xml_encoding_match = re.compile(
        r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    if not xml_encoding_match and isHTML:
        # ...or, for HTML, in a META charset attribute.
        regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
        xml_encoding_match = regexp.search(xml_data)
    if xml_encoding_match is not None:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        if isHTML:
            self.declaredHTMLEncoding = xml_encoding
    # A sniffed UTF-16/32 byte pattern beats a generic declared name.
    if sniffed_xml_encoding and \
       (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                         'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                         'utf-16', 'utf-32', 'utf_16', 'utf_32',
                         'utf16', 'u16')):
        xml_encoding = sniffed_xml_encoding
    return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
    """Map a declared charset name to a usable Python codec name.

    Tries, in order: the CHARSET_ALIASES translation of the name, the
    name with dashes removed, and the name with dashes replaced by
    underscores.  Returns the charset unchanged when no codec is
    found.
    """
    candidate = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if candidate:
        return candidate
    if charset:
        for variant in (charset.replace("-", ""),
                        charset.replace("-", "_")):
            candidate = self._codec(variant)
            if candidate:
                return candidate
    return charset
('Scaron', '160'),2005'\x8B' : ('lsaquo', '2039'),2006'\x8C' : ('OElig', '152'),2007'\x8D' : '?',2008'\x8E' : ('#x17D', '17D'),2009'\x8F' : '?',2010'\x90' : '?',2011'\x91' : ('lsquo', '2018'),2012'\x92' : ('rsquo', '2019'),2013'\x93' : ('ldquo', '201C'),2014'\x94' : ('rdquo', '201D'),2015'\x95' : ('bull', '2022'),2016'\x96' : ('ndash', '2013'),2017'\x97' : ('mdash', '2014'),2018'\x98' : ('tilde', '2DC'),2019'\x99' : ('trade', '2122'),2020'\x9a' : ('scaron', '161'),2021'\x9b' : ('rsaquo', '203A'),2022'\x9c' : ('oelig', '153'),2023'\x9d' : '?',2024'\x9e' : ('#x17E', '17E'),2025'\x9f' : ('Yuml', ''),}20262027#######################################################################202820292030#By default, act as an HTML pretty-printer.2031if __name__ == '__main__':2032soup = BeautifulSoup(sys.stdin)2033print(soup.prettify())203420352036