CoCalc -- beautifulsoup.py

GitHub Repository: sqlmapproject/sqlmap
Path: blob/master/thirdparty/beautifulsoup/beautifulsoup.py
²⁹⁹² views
1
"""Beautiful Soup
2
Elixir and Tonic
3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
5

6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
9

10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it.
15

16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
19

20
* chardet, for auto-detecting character encodings
21
  http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23
  by stock Python.
24
  http://cjkpython.i18n.org/
25

26
Beautiful Soup defines classes for two main parsing strategies:
27

28
 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
   language that kind of looks like XML.
30

31
 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
   or invalid. This class has web browser-like heuristics for
33
   obtaining a sensible parse tree in the face of common HTML errors.
34

35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38

39
For more than you ever wanted to know about Beautiful Soup, see the
40
documentation:
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
42

43
Here, have some legalese:
44

45
Copyright (c) 2004-2010, Leonard Richardson
46

47
All rights reserved.
48

49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
51
met:
52

53
  * Redistributions of source code must retain the above copyright
54
    notice, this list of conditions and the following disclaimer.
55

56
  * Redistributions in binary form must reproduce the above
57
    copyright notice, this list of conditions and the following
58
    disclaimer in the documentation and/or other materials provided
59
    with the distribution.
60

61
  * Neither the name of the Beautiful Soup Consortium and All
62
    Night Kosher Bakery nor the names of its contributors may be
63
    used to endorse or promote products derived from this software
64
    without specific prior written permission.
65

66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77

78
"""
79
from __future__ import generators
80
from __future__ import print_function
81

82
__author__ = "Leonard Richardson ([email protected])"
83
__version__ = "3.2.1b"
84
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
85
__license__ = "New-style BSD"
86

87
import codecs
88
import re
89
import sys
90

91
if sys.version_info >= (3, 0):
92
    xrange = range
93
    text_type = str
94
    binary_type = bytes
95
    basestring = str
96
    unichr = chr
97
else:
98
    text_type = unicode
99
    binary_type = str
100

101
try:
102
    from html.entities import name2codepoint
103
except ImportError:
104
    from htmlentitydefs import name2codepoint
105

106
try:
107
    set
108
except NameError:
109
    from sets import Set as set
110

111
try:
112
    import sgmllib
113
except ImportError:
114
    from lib.utils import sgmllib
115

116
try:
117
    import markupbase
118
except ImportError:
119
    import _markupbase as markupbase
120

121
#These hacks make Beautiful Soup able to parse XML with namespaces
122
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
123
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
124

125
DEFAULT_OUTPUT_ENCODING = "utf-8"
126

127
def _match_css_class(str):
128
    """Build a RE to match the given CSS class."""
129
    return re.compile(r"(^|.*\s)%s($|\s)" % str)
130

131
# First, the classes that represent markup elements.
132

133
class PageElement(object):
134
    """Contains the navigational information for some part of the page
135
    (either a tag or a piece of text)"""
136

137
    def _invert(h):
138
        "Cheap function to invert a hash."
139
        i = {}
140
        for k,v in h.items():
141
            i[v] = k
142
        return i
143

144
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
145
                                      "quot" : '"',
146
                                      "amp" : "&",
147
                                      "lt" : "<",
148
                                      "gt" : ">" }
149

150
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
151

152
    def setup(self, parent=None, previous=None):
153
        """Sets up the initial relations between this element and
154
        other elements."""
155
        self.parent = parent
156
        self.previous = previous
157
        self.next = None
158
        self.previousSibling = None
159
        self.nextSibling = None
160
        if self.parent and self.parent.contents:
161
            self.previousSibling = self.parent.contents[-1]
162
            self.previousSibling.nextSibling = self
163

164
    def replaceWith(self, replaceWith):
165
        oldParent = self.parent
166
        myIndex = self.parent.index(self)
167
        if hasattr(replaceWith, "parent")\
168
                  and replaceWith.parent is self.parent:
169
            # We're replacing this element with one of its siblings.
170
            index = replaceWith.parent.index(replaceWith)
171
            if index and index < myIndex:
172
                # Furthermore, it comes before this element. That
173
                # means that when we extract it, the index of this
174
                # element will change.
175
                myIndex = myIndex - 1
176
        self.extract()
177
        oldParent.insert(myIndex, replaceWith)
178

179
    def replaceWithChildren(self):
180
        myParent = self.parent
181
        myIndex = self.parent.index(self)
182
        self.extract()
183
        reversedChildren = list(self.contents)
184
        reversedChildren.reverse()
185
        for child in reversedChildren:
186
            myParent.insert(myIndex, child)
187

188
    def extract(self):
189
        """Destructively rips this element out of the tree."""
190
        if self.parent:
191
            try:
192
                del self.parent.contents[self.parent.index(self)]
193
            except ValueError:
194
                pass
195

196
        #Find the two elements that would be next to each other if
197
        #this element (and any children) hadn't been parsed. Connect
198
        #the two.
199
        lastChild = self._lastRecursiveChild()
200
        nextElement = lastChild.next
201

202
        if self.previous:
203
            self.previous.next = nextElement
204
        if nextElement:
205
            nextElement.previous = self.previous
206
        self.previous = None
207
        lastChild.next = None
208

209
        self.parent = None
210
        if self.previousSibling:
211
            self.previousSibling.nextSibling = self.nextSibling
212
        if self.nextSibling:
213
            self.nextSibling.previousSibling = self.previousSibling
214
        self.previousSibling = self.nextSibling = None
215
        return self
216

217
    def _lastRecursiveChild(self):
218
        "Finds the last element beneath this object to be parsed."
219
        lastChild = self
220
        while hasattr(lastChild, 'contents') and lastChild.contents:
221
            lastChild = lastChild.contents[-1]
222
        return lastChild
223

224
    def insert(self, position, newChild):
225
        if isinstance(newChild, basestring) \
226
            and not isinstance(newChild, NavigableString):
227
            newChild = NavigableString(newChild)
228

229
        position =  min(position, len(self.contents))
230
        if hasattr(newChild, 'parent') and newChild.parent is not None:
231
            # We're 'inserting' an element that's already one
232
            # of this object's children.
233
            if newChild.parent is self:
234
                index = self.index(newChild)
235
                if index > position:
236
                    # Furthermore we're moving it further down the
237
                    # list of this object's children. That means that
238
                    # when we extract this element, our target index
239
                    # will jump down one.
240
                    position = position - 1
241
            newChild.extract()
242

243
        newChild.parent = self
244
        previousChild = None
245
        if position == 0:
246
            newChild.previousSibling = None
247
            newChild.previous = self
248
        else:
249
            previousChild = self.contents[position-1]
250
            newChild.previousSibling = previousChild
251
            newChild.previousSibling.nextSibling = newChild
252
            newChild.previous = previousChild._lastRecursiveChild()
253
        if newChild.previous:
254
            newChild.previous.next = newChild
255

256
        newChildsLastElement = newChild._lastRecursiveChild()
257

258
        if position >= len(self.contents):
259
            newChild.nextSibling = None
260

261
            parent = self
262
            parentsNextSibling = None
263
            while not parentsNextSibling:
264
                parentsNextSibling = parent.nextSibling
265
                parent = parent.parent
266
                if not parent: # This is the last element in the document.
267
                    break
268
            if parentsNextSibling:
269
                newChildsLastElement.next = parentsNextSibling
270
            else:
271
                newChildsLastElement.next = None
272
        else:
273
            nextChild = self.contents[position]
274
            newChild.nextSibling = nextChild
275
            if newChild.nextSibling:
276
                newChild.nextSibling.previousSibling = newChild
277
            newChildsLastElement.next = nextChild
278

279
        if newChildsLastElement.next:
280
            newChildsLastElement.next.previous = newChildsLastElement
281
        self.contents.insert(position, newChild)
282

283
    def append(self, tag):
284
        """Appends the given tag to the contents of this tag."""
285
        self.insert(len(self.contents), tag)
286

287
    def findNext(self, name=None, attrs={}, text=None, **kwargs):
288
        """Returns the first item that matches the given criteria and
289
        appears after this Tag in the document."""
290
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
291

292
    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
293
                    **kwargs):
294
        """Returns all items that match the given criteria and appear
295
        after this Tag in the document."""
296
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
297
                             **kwargs)
298

299
    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
300
        """Returns the closest sibling to this Tag that matches the
301
        given criteria and appears after this Tag in the document."""
302
        return self._findOne(self.findNextSiblings, name, attrs, text,
303
                             **kwargs)
304

305
    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
306
                         **kwargs):
307
        """Returns the siblings of this Tag that match the given
308
        criteria and appear after this Tag in the document."""
309
        return self._findAll(name, attrs, text, limit,
310
                             self.nextSiblingGenerator, **kwargs)
311
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
312

313
    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
314
        """Returns the first item that matches the given criteria and
315
        appears before this Tag in the document."""
316
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
317

318
    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
319
                        **kwargs):
320
        """Returns all items that match the given criteria and appear
321
        before this Tag in the document."""
322
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
323
                           **kwargs)
324
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
325

326
    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
327
        """Returns the closest sibling to this Tag that matches the
328
        given criteria and appears before this Tag in the document."""
329
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
330
                             **kwargs)
331

332
    def findPreviousSiblings(self, name=None, attrs={}, text=None,
333
                             limit=None, **kwargs):
334
        """Returns the siblings of this Tag that match the given
335
        criteria and appear before this Tag in the document."""
336
        return self._findAll(name, attrs, text, limit,
337
                             self.previousSiblingGenerator, **kwargs)
338
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
339

340
    def findParent(self, name=None, attrs={}, **kwargs):
341
        """Returns the closest parent of this Tag that matches the given
342
        criteria."""
343
        # NOTE: We can't use _findOne because findParents takes a different
344
        # set of arguments.
345
        r = None
346
        l = self.findParents(name, attrs, 1)
347
        if l:
348
            r = l[0]
349
        return r
350

351
    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
352
        """Returns the parents of this Tag that match the given
353
        criteria."""
354

355
        return self._findAll(name, attrs, None, limit, self.parentGenerator,
356
                             **kwargs)
357
    fetchParents = findParents # Compatibility with pre-3.x
358

359
    #These methods do the real heavy lifting.
360

361
    def _findOne(self, method, name, attrs, text, **kwargs):
362
        r = None
363
        l = method(name, attrs, text, 1, **kwargs)
364
        if l:
365
            r = l[0]
366
        return r
367

368
    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
369
        "Iterates over a generator looking for things that match."
370

371
        if isinstance(name, SoupStrainer):
372
            strainer = name
373
        # (Possibly) special case some findAll*(...) searches
374
        elif text is None and not limit and not attrs and not kwargs:
375
            # findAll*(True)
376
            if name is True:
377
                return [element for element in generator()
378
                        if isinstance(element, Tag)]
379
            # findAll*('tag-name')
380
            elif isinstance(name, basestring):
381
                return [element for element in generator()
382
                        if isinstance(element, Tag) and
383
                        element.name == name]
384
            else:
385
                strainer = SoupStrainer(name, attrs, text, **kwargs)
386
        # Build a SoupStrainer
387
        else:
388
            strainer = SoupStrainer(name, attrs, text, **kwargs)
389
        results = ResultSet(strainer)
390
        g = generator()
391
        while True:
392
            try:
393
                i = next(g)
394
            except StopIteration:
395
                break
396
            if i:
397
                found = strainer.search(i)
398
                if found:
399
                    results.append(found)
400
                    if limit and len(results) >= limit:
401
                        break
402
        return results
403

404
    #These Generators can be used to navigate starting from both
405
    #NavigableStrings and Tags.
406
    def nextGenerator(self):
407
        i = self
408
        while i is not None:
409
            i = i.next
410
            yield i
411

412
    def nextSiblingGenerator(self):
413
        i = self
414
        while i is not None:
415
            i = i.nextSibling
416
            yield i
417

418
    def previousGenerator(self):
419
        i = self
420
        while i is not None:
421
            i = i.previous
422
            yield i
423

424
    def previousSiblingGenerator(self):
425
        i = self
426
        while i is not None:
427
            i = i.previousSibling
428
            yield i
429

430
    def parentGenerator(self):
431
        i = self
432
        while i is not None:
433
            i = i.parent
434
            yield i
435

436
    # Utility methods
437
    def substituteEncoding(self, str, encoding=None):
438
        encoding = encoding or "utf-8"
439
        return str.replace("%SOUP-ENCODING%", encoding)
440

441
    def toEncoding(self, s, encoding=None):
442
        """Encodes an object to a string in some encoding, or to Unicode.
443
        ."""
444
        if isinstance(s, text_type):
445
            if encoding:
446
                s = s.encode(encoding)
447
        elif isinstance(s, binary_type):
448
            s = s.encode(encoding or "utf8")
449
        else:
450
            s  = self.toEncoding(str(s), encoding or "utf8")
451
        return s
452

453
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")
454

455
    def _sub_entity(self, x):
456
        """Used with a regular expression to substitute the
457
        appropriate XML entity for an XML special character."""
458
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
459

460

461
class NavigableString(text_type, PageElement):
462

463
    def __new__(cls, value):
464
        """Create a new NavigableString.
465

466
        When unpickling a NavigableString, this method is called with
467
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
468
        passed in to the superclass's __new__ or the superclass won't know
469
        how to handle non-ASCII characters.
470
        """
471
        if isinstance(value, text_type):
472
            return text_type.__new__(cls, value)
473
        return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
474

475
    def __getnewargs__(self):
476
        return (NavigableString.__str__(self),)
477

478
    def __getattr__(self, attr):
479
        """text.string gives you text. This is for backwards
480
        compatibility for Navigable*String, but for CData* it lets you
481
        get the string without the CData wrapper."""
482
        if attr == 'string':
483
            return self
484
        else:
485
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
486

487
    def __unicode__(self):
488
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
489

490
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
491
        # Substitute outgoing XML entities.
492
        data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
493
        if encoding:
494
            return data.encode(encoding)
495
        else:
496
            return data
497

498
class CData(NavigableString):
499

500
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
501
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
502

503
class ProcessingInstruction(NavigableString):
504
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
505
        output = self
506
        if "%SOUP-ENCODING%" in output:
507
            output = self.substituteEncoding(output, encoding)
508
        return "<?%s?>" % self.toEncoding(output, encoding)
509

510
class Comment(NavigableString):
511
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
512
        return "<!--%s-->" % NavigableString.__str__(self, encoding)
513

514
class Declaration(NavigableString):
515
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
516
        return "<!%s>" % NavigableString.__str__(self, encoding)
517

518
class Tag(PageElement):
519

520
    """Represents a found HTML tag with its attributes and contents."""
521

522
    def _convertEntities(self, match):
523
        """Used in a call to re.sub to replace HTML, XML, and numeric
524
        entities with the appropriate Unicode characters. If HTML
525
        entities are being converted, any unrecognized entities are
526
        escaped."""
527
        try:
528
            x = match.group(1)
529
            if self.convertHTMLEntities and x in name2codepoint:
530
                return unichr(name2codepoint[x])
531
            elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
532
                if self.convertXMLEntities:
533
                    return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
534
                else:
535
                    return u'&%s;' % x
536
            elif len(x) > 0 and x[0] == '#':
537
                # Handle numeric entities
538
                if len(x) > 1 and x[1] == 'x':
539
                    return unichr(int(x[2:], 16))
540
                else:
541
                    return unichr(int(x[1:]))
542

543
            elif self.escapeUnrecognizedEntities:
544
                return u'&amp;%s;' % x
545

546
        except ValueError:  # e.g. ValueError: unichr() arg not in range(0x10000)
547
            pass
548

549
        return u'&%s;' % x
550

551
    def __init__(self, parser, name, attrs=None, parent=None,
552
                 previous=None):
553
        "Basic constructor."
554

555
        # We don't actually store the parser object: that lets extracted
556
        # chunks be garbage-collected
557
        self.parserClass = parser.__class__
558
        self.isSelfClosing = parser.isSelfClosingTag(name)
559
        self.name = name
560
        if attrs is None:
561
            attrs = []
562
        elif isinstance(attrs, dict):
563
            attrs = attrs.items()
564
        self.attrs = attrs
565
        self.contents = []
566
        self.setup(parent, previous)
567
        self.hidden = False
568
        self.containsSubstitutions = False
569
        self.convertHTMLEntities = parser.convertHTMLEntities
570
        self.convertXMLEntities = parser.convertXMLEntities
571
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
572

573
        # Convert any HTML, XML, or numeric entities in the attribute values.
574
        # Reference: https://github.com/pkrumins/xgoogle/pull/16/commits/3dba1165c436b0d6e5bdbd09e53ca0dbf8a043f8
575
        convert = lambda k_val: (k_val[0],
576
                                 re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",
577
                                     self._convertEntities,
578
                                     k_val[1]))
579
        self.attrs = map(convert, self.attrs)
580

581
    def getString(self):
582
        if (len(self.contents) == 1
583
            and isinstance(self.contents[0], NavigableString)):
584
            return self.contents[0]
585

586
    def setString(self, string):
587
        """Replace the contents of the tag with a string"""
588
        self.clear()
589
        self.append(string)
590

591
    string = property(getString, setString)
592

593
    def getText(self, separator=u""):
594
        if not len(self.contents):
595
            return u""
596
        stopNode = self._lastRecursiveChild().next
597
        strings = []
598
        current = self.contents[0]
599
        while current and current is not stopNode:
600
            if isinstance(current, NavigableString):
601
                strings.append(current.strip())
602
            current = current.next
603
        return separator.join(strings)
604

605
    text = property(getText)
606

607
    def get(self, key, default=None):
608
        """Returns the value of the 'key' attribute for the tag, or
609
        the value given for 'default' if it doesn't have that
610
        attribute."""
611
        return self._getAttrMap().get(key, default)
612

613
    def clear(self):
614
        """Extract all children."""
615
        for child in self.contents[:]:
616
            child.extract()
617

618
    def index(self, element):
619
        for i, child in enumerate(self.contents):
620
            if child is element:
621
                return i
622
        raise ValueError("Tag.index: element not in tag")
623

624
    def has_key(self, key):
625
        return self._getAttrMap().has_key(key)
626

627
    def __getitem__(self, key):
628
        """tag[key] returns the value of the 'key' attribute for the tag,
629
        and throws an exception if it's not there."""
630
        return self._getAttrMap()[key]
631

632
    def __iter__(self):
633
        "Iterating over a tag iterates over its contents."
634
        return iter(self.contents)
635

636
    def __len__(self):
637
        "The length of a tag is the length of its list of contents."
638
        return len(self.contents)
639

640
    def __contains__(self, x):
641
        return x in self.contents
642

643
    def __nonzero__(self):
644
        "A tag is non-None even if it has no contents."
645
        return True
646

647
    def __setitem__(self, key, value):
648
        """Setting tag[key] sets the value of the 'key' attribute for the
649
        tag."""
650
        self._getAttrMap()
651
        self.attrMap[key] = value
652
        found = False
653
        for i in xrange(0, len(self.attrs)):
654
            if self.attrs[i][0] == key:
655
                self.attrs[i] = (key, value)
656
                found = True
657
        if not found:
658
            self.attrs.append((key, value))
659
        self._getAttrMap()[key] = value
660

661
    def __delitem__(self, key):
662
        "Deleting tag[key] deletes all 'key' attributes for the tag."
663
        for item in self.attrs:
664
            if item[0] == key:
665
                self.attrs.remove(item)
666
                #We don't break because bad HTML can define the same
667
                #attribute multiple times.
668
            self._getAttrMap()
669
            if self.attrMap.has_key(key):
670
                del self.attrMap[key]
671

672
    def __call__(self, *args, **kwargs):
673
        """Calling a tag like a function is the same as calling its
674
        findAll() method. Eg. tag('a') returns a list of all the A tags
675
        found within this tag."""
676
        return self.findAll(*args, **kwargs)
677

678
    def __getattr__(self, tag):
679
        #print "Getattr %s.%s" % (self.__class__, tag)
680
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
681
            return self.find(tag[:-3])
682
        elif tag.find('__') != 0:
683
            return self.find(tag)
684
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
685

686
    def __eq__(self, other):
687
        """Returns true iff this tag has the same name, the same attributes,
688
        and the same contents (recursively) as the given tag.
689

690
        NOTE: right now this will return false if two tags have the
691
        same attributes in a different order. Should this be fixed?"""
692
        if other is self:
693
            return True
694
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
695
            return False
696
        for i in xrange(0, len(self.contents)):
697
            if self.contents[i] != other.contents[i]:
698
                return False
699
        return True
700

701
    def __ne__(self, other):
702
        """Returns true iff this tag is not identical to the other tag,
703
        as defined in __eq__."""
704
        return not self == other
705

706
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
707
        """Renders this tag as a string."""
708
        return self.__str__(encoding)
709

710
    def __unicode__(self):
711
        return self.__str__(None)
712

713
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
714
                prettyPrint=False, indentLevel=0):
715
        """Returns a string or Unicode representation of this tag and
716
        its contents. To get Unicode, pass None for encoding.
717

718
        NOTE: since Python's HTML parser consumes whitespace, this
719
        method is not certain to reproduce the whitespace present in
720
        the original string."""
721

722
        encodedName = self.toEncoding(self.name, encoding)
723

724
        attrs = []
725
        if self.attrs:
726
            for key, val in self.attrs:
727
                fmt = '%s="%s"'
728
                if isinstance(val, basestring):
729
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
730
                        val = self.substituteEncoding(val, encoding)
731

732
                    # The attribute value either:
733
                    #
734
                    # * Contains no embedded double quotes or single quotes.
735
                    #   No problem: we enclose it in double quotes.
736
                    # * Contains embedded single quotes. No problem:
737
                    #   double quotes work here too.
738
                    # * Contains embedded double quotes. No problem:
739
                    #   we enclose it in single quotes.
740
                    # * Embeds both single _and_ double quotes. This
741
                    #   can't happen naturally, but it can happen if
742
                    #   you modify an attribute value after parsing
743
                    #   the document. Now we have a bit of a
744
                    #   problem. We solve it by enclosing the
745
                    #   attribute in single quotes, and escaping any
746
                    #   embedded single quotes to XML entities.
747
                    if '"' in val:
748
                        fmt = "%s='%s'"
749
                        if "'" in val:
750
                            # TODO: replace with apos when
751
                            # appropriate.
752
                            val = val.replace("'", "&squot;")
753

754
                    # Now we're okay w/r/t quotes. But the attribute
755
                    # value might also contain angle brackets, or
756
                    # ampersands that aren't part of entities. We need
757
                    # to escape those to XML entities too.
758
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
759

760
                attrs.append(fmt % (self.toEncoding(key, encoding),
761
                                    self.toEncoding(val, encoding)))
762
        close = ''
763
        closeTag = ''
764
        if self.isSelfClosing:
765
            close = ' /'
766
        else:
767
            closeTag = '</%s>' % encodedName
768

769
        indentTag, indentContents = 0, 0
770
        if prettyPrint:
771
            indentTag = indentLevel
772
            space = (' ' * (indentTag-1))
773
            indentContents = indentTag + 1
774
        contents = self.renderContents(encoding, prettyPrint, indentContents)
775
        if self.hidden:
776
            s = contents
777
        else:
778
            s = []
779
            attributeString = ''
780
            if attrs:
781
                attributeString = ' ' + ' '.join(attrs)
782
            if prettyPrint:
783
                s.append(space)
784
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
785
            if prettyPrint:
786
                s.append("\n")
787
            s.append(contents)
788
            if prettyPrint and contents and contents[-1] != "\n":
789
                s.append("\n")
790
            if prettyPrint and closeTag:
791
                s.append(space)
792
            s.append(closeTag)
793
            if prettyPrint and closeTag and self.nextSibling:
794
                s.append("\n")
795
            s = ''.join(s)
796
        return s
797

798
    def decompose(self):
799
        """Recursively destroys the contents of this tree."""
800
        self.extract()
801
        if len(self.contents) == 0:
802
            return
803
        current = self.contents[0]
804
        while current is not None:
805
            next = current.next
806
            if isinstance(current, Tag):
807
                del current.contents[:]
808
            current.parent = None
809
            current.previous = None
810
            current.previousSibling = None
811
            current.next = None
812
            current.nextSibling = None
813
            current = next
814

815
    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
816
        return self.__str__(encoding, True)
817

818
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
819
                       prettyPrint=False, indentLevel=0):
820
        """Renders the contents of this tag as a string in the given
821
        encoding. If encoding is None, returns a Unicode string.."""
822
        s=[]
823
        for c in self:
824
            text = None
825
            if isinstance(c, NavigableString):
826
                text = c.__str__(encoding)
827
            elif isinstance(c, Tag):
828
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
829
            if text and prettyPrint:
830
                text = text.strip()
831
            if text:
832
                if prettyPrint:
833
                    s.append(" " * (indentLevel-1))
834
                s.append(text)
835
                if prettyPrint:
836
                    s.append("\n")
837

838
        return ''.join(s)
839

840
    #Soup methods
841

842
    def find(self, name=None, attrs={}, recursive=True, text=None,
843
             **kwargs):
844
        """Return only the first child of this Tag matching the given
845
        criteria."""
846
        r = None
847
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
848
        if l:
849
            r = l[0]
850
        return r
851
    findChild = find
852

853
    def findAll(self, name=None, attrs={}, recursive=True, text=None,
854
                limit=None, **kwargs):
855
        """Extracts a list of Tag objects that match the given
856
        criteria.  You can specify the name of the Tag and any
857
        attributes you want the Tag to have.
858

859
        The value of a key-value pair in the 'attrs' map can be a
860
        string, a list of strings, a regular expression object, or a
861
        callable that takes a string and returns whether or not the
862
        string matches for some custom definition of 'matches'. The
863
        same is true of the tag name."""
864
        generator = self.recursiveChildGenerator
865
        if not recursive:
866
            generator = self.childGenerator
867
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
868
    findChildren = findAll
869

870
    # Pre-3.x compatibility methods
871
    first = find
872
    fetch = findAll
873

874
    def fetchText(self, text=None, recursive=True, limit=None):
875
        return self.findAll(text=text, recursive=recursive, limit=limit)
876

877
    def firstText(self, text=None, recursive=True):
878
        return self.find(text=text, recursive=recursive)
879

880
    #Private methods
881

882
    def _getAttrMap(self):
883
        """Initializes a map representation of this tag's attributes,
884
        if not already initialized."""
885
        if not getattr(self, 'attrMap'):
886
            self.attrMap = {}
887
            for (key, value) in self.attrs:
888
                self.attrMap[key] = value
889
        return self.attrMap
890

891
    #Generator methods
892
    def childGenerator(self):
893
        # Just use the iterator from the contents
894
        return iter(self.contents)
895

896
    def recursiveChildGenerator(self):
897
        if not len(self.contents):
898
            return  # Note: https://stackoverflow.com/a/30217723 (PEP 479)
899
        stopNode = self._lastRecursiveChild().next
900
        current = self.contents[0]
901
        while current and current is not stopNode:
902
            yield current
903
            current = current.next
904

905

906
# Next, a couple classes to represent queries and their results.
907
class SoupStrainer:
908
    """Encapsulates a number of ways of matching a markup element (tag or
909
    text)."""
910

911
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
912
        self.name = name
913
        if isinstance(attrs, basestring):
914
            kwargs['class'] = _match_css_class(attrs)
915
            attrs = None
916
        if kwargs:
917
            if attrs:
918
                attrs = attrs.copy()
919
                attrs.update(kwargs)
920
            else:
921
                attrs = kwargs
922
        self.attrs = attrs
923
        self.text = text
924

925
    def __str__(self):
926
        if self.text:
927
            return self.text
928
        else:
929
            return "%s|%s" % (self.name, self.attrs)
930

931
    def searchTag(self, markupName=None, markupAttrs={}):
932
        found = None
933
        markup = None
934
        if isinstance(markupName, Tag):
935
            markup = markupName
936
            markupAttrs = markup
937
        callFunctionWithTagData = callable(self.name) \
938
                                and not isinstance(markupName, Tag)
939

940
        if (not self.name) \
941
               or callFunctionWithTagData \
942
               or (markup and self._matches(markup, self.name)) \
943
               or (not markup and self._matches(markupName, self.name)):
944
            if callFunctionWithTagData:
945
                match = self.name(markupName, markupAttrs)
946
            else:
947
                match = True
948
                markupAttrMap = None
949
                for attr, matchAgainst in self.attrs.items():
950
                    if not markupAttrMap:
951
                         if hasattr(markupAttrs, 'get'):
952
                            markupAttrMap = markupAttrs
953
                         else:
954
                            markupAttrMap = {}
955
                            for k,v in markupAttrs:
956
                                markupAttrMap[k] = v
957
                    attrValue = markupAttrMap.get(attr)
958
                    if not self._matches(attrValue, matchAgainst):
959
                        match = False
960
                        break
961
            if match:
962
                if markup:
963
                    found = markup
964
                else:
965
                    found = markupName
966
        return found
967

968
    def search(self, markup):
969
        #print 'looking for %s in %s' % (self, markup)
970
        found = None
971
        # If given a list of items, scan it for a text element that
972
        # matches.
973
        if hasattr(markup, "__iter__") \
974
                and not isinstance(markup, Tag):
975
            for element in markup:
976
                if isinstance(element, NavigableString) \
977
                       and self.search(element):
978
                    found = element
979
                    break
980
        # If it's a Tag, make sure its name or attributes match.
981
        # Don't bother with Tags if we're searching for text.
982
        elif isinstance(markup, Tag):
983
            if not self.text:
984
                found = self.searchTag(markup)
985
        # If it's text, make sure the text matches.
986
        elif isinstance(markup, NavigableString) or \
987
                 isinstance(markup, basestring):
988
            if self._matches(markup, self.text):
989
                found = markup
990
        else:
991
            raise Exception("I don't know how to match against a %s" \
992
                  % markup.__class__)
993
        return found
994

995
    def _matches(self, markup, matchAgainst):
996
        #print "Matching %s against %s" % (markup, matchAgainst)
997
        result = False
998
        if matchAgainst is True:
999
            result = markup is not None
1000
        elif callable(matchAgainst):
1001
            result = matchAgainst(markup)
1002
        else:
1003
            #Custom match methods take the tag as an argument, but all
1004
            #other ways of matching match the tag name as a string.
1005
            if isinstance(markup, Tag):
1006
                markup = markup.name
1007
            if markup and not isinstance(markup, basestring):
1008
                markup = text_type(markup)
1009
            #Now we know that chunk is either a string, or None.
1010
            if hasattr(matchAgainst, 'match'):
1011
                # It's a regexp object.
1012
                result = markup and matchAgainst.search(markup)
1013
            elif hasattr(matchAgainst, '__iter__'): # list-like
1014
                result = markup in matchAgainst
1015
            elif hasattr(matchAgainst, 'items'):
1016
                result = markup.has_key(matchAgainst)
1017
            elif matchAgainst and isinstance(markup, basestring):
1018
                if isinstance(markup, text_type):
1019
                    matchAgainst = text_type(matchAgainst)
1020
                else:
1021
                    matchAgainst = str(matchAgainst)
1022

1023
            if not result:
1024
                result = matchAgainst == markup
1025
        return result
1026

1027
class ResultSet(list):
1028
    """A ResultSet is just a list that keeps track of the SoupStrainer
1029
    that created it."""
1030
    def __init__(self, source):
1031
        list.__init__([])
1032
        self.source = source
1033

1034
# Now, some helper functions.
1035

1036
def buildTagMap(default, *args):
1037
    """Turns a list of maps, lists, or scalars into a single map.
1038
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1039
    NESTING_RESET_TAGS maps out of lists and partial maps."""
1040
    built = {}
1041
    for portion in args:
1042
        if hasattr(portion, 'items'):
1043
            #It's a map. Merge it.
1044
            for k,v in portion.items():
1045
                built[k] = v
1046
        elif hasattr(portion, '__iter__'): # is a list
1047
            #It's a list. Map each item to the default.
1048
            for k in portion:
1049
                built[k] = default
1050
        else:
1051
            #It's a scalar. Map it to the default.
1052
            built[portion] = default
1053
    return built
1054

1055
# Now, the parser classes.
1056

1057
class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):
1058

1059
    """This class contains the basic parser and search code. It defines
1060
    a parser that knows nothing about tag behavior except for the
1061
    following:
1062

1063
      You can't close a tag without closing all the tags it encloses.
1064
      That is, "<foo><bar></foo>" actually means
1065
      "<foo><bar></bar></foo>".
1066

1067
    [Another possible explanation is "<foo><bar /></foo>", but since
1068
    this class defines no SELF_CLOSING_TAGS, it will never use that
1069
    explanation.]
1070

1071
    This class is useful for parsing XML or made-up markup languages,
1072
    or when BeautifulSoup makes an assumption counter to what you were
1073
    expecting."""
1074

1075
    SELF_CLOSING_TAGS = {}
1076
    NESTABLE_TAGS = {}
1077
    RESET_NESTING_TAGS = {}
1078
    QUOTE_TAGS = {}
1079
    PRESERVE_WHITESPACE_TAGS = []
1080

1081
    MARKUP_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
1082
                       lambda x: x.group(1) + ' />'),
1083
                      (re.compile(r'<!\s+([^<>]*)>'),
1084
                       lambda x: '<!' + x.group(1) + '>')
1085
                      ]
1086

1087
    ROOT_TAG_NAME = u'[document]'
1088

1089
    HTML_ENTITIES = "html"
1090
    XML_ENTITIES = "xml"
1091
    XHTML_ENTITIES = "xhtml"
1092
    # TODO: This only exists for backwards-compatibility
1093
    ALL_ENTITIES = XHTML_ENTITIES
1094

1095
    # Used when determining whether a text node is all whitespace and
1096
    # can be replaced with a single space. A text node that contains
1097
    # fancy Unicode spaces (usually non-breaking) should be left
1098
    # alone.
1099
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1100

1101
    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1102
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1103
                 convertEntities=None, selfClosingTags=None, isHTML=False):
1104
        """The Soup object is initialized as the 'root tag', and the
1105
        provided markup (which can be a string or a file-like object)
1106
        is fed into the underlying parser.
1107

1108
        sgmllib will process most bad HTML, and the BeautifulSoup
1109
        class has some tricks for dealing with some HTML that kills
1110
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
1111
        if your data uses self-closing tags or declarations
1112
        incorrectly.
1113

1114
        By default, Beautiful Soup uses regexes to sanitize input,
1115
        avoiding the vast majority of these problems. If the problems
1116
        don't apply to you, pass in False for markupMassage, and
1117
        you'll get better performance.
1118

1119
        The default parser massage techniques fix the two most common
1120
        instances of invalid HTML that choke sgmllib:
1121

1122
         <br/> (No space between name of closing tag and tag close)
1123
         <! --Comment--> (Extraneous whitespace in declaration)
1124

1125
        You can pass in a custom list of (RE object, replace method)
1126
        tuples to get Beautiful Soup to scrub your input the way you
1127
        want."""
1128

1129
        self.parseOnlyThese = parseOnlyThese
1130
        self.fromEncoding = fromEncoding
1131
        self.smartQuotesTo = smartQuotesTo
1132
        self.convertEntities = convertEntities
1133
        # Set the rules for how we'll deal with the entities we
1134
        # encounter
1135
        if self.convertEntities:
1136
            # It doesn't make sense to convert encoded characters to
1137
            # entities even while you're converting entities to Unicode.
1138
            # Just convert it all to Unicode.
1139
            self.smartQuotesTo = None
1140
            if convertEntities == self.HTML_ENTITIES:
1141
                self.convertXMLEntities = False
1142
                self.convertHTMLEntities = True
1143
                self.escapeUnrecognizedEntities = True
1144
            elif convertEntities == self.XHTML_ENTITIES:
1145
                self.convertXMLEntities = True
1146
                self.convertHTMLEntities = True
1147
                self.escapeUnrecognizedEntities = False
1148
            elif convertEntities == self.XML_ENTITIES:
1149
                self.convertXMLEntities = True
1150
                self.convertHTMLEntities = False
1151
                self.escapeUnrecognizedEntities = False
1152
        else:
1153
            self.convertXMLEntities = False
1154
            self.convertHTMLEntities = False
1155
            self.escapeUnrecognizedEntities = False
1156

1157
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1158
        sgmllib.SGMLParser.__init__(self)
1159

1160
        if hasattr(markup, 'read'):        # It's a file-type object.
1161
            markup = markup.read()
1162
        self.markup = markup
1163
        self.markupMassage = markupMassage
1164
        try:
1165
            self._feed(isHTML=isHTML)
1166
        except StopParsing:
1167
            pass
1168
        self.markup = None                 # The markup can now be GCed
1169

1170
    def convert_charref(self, name):
1171
        """This method fixes a bug in Python's SGMLParser."""
1172
        try:
1173
            n = int(name)
1174
        except ValueError:
1175
            return
1176
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1177
            return
1178
        return self.convert_codepoint(n)
1179

1180
    def _feed(self, inDocumentEncoding=None, isHTML=False):
1181
        # Convert the document to Unicode.
1182
        markup = self.markup
1183
        if isinstance(markup, text_type):
1184
            if not hasattr(self, 'originalEncoding'):
1185
                self.originalEncoding = None
1186
        else:
1187
            dammit = UnicodeDammit\
1188
                     (markup, [self.fromEncoding, inDocumentEncoding],
1189
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
1190
            markup = dammit.unicode
1191
            self.originalEncoding = dammit.originalEncoding
1192
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
1193
        if markup:
1194
            if self.markupMassage:
1195
                if not hasattr(self.markupMassage, "__iter__"):
1196
                    self.markupMassage = self.MARKUP_MASSAGE
1197
                for fix, m in self.markupMassage:
1198
                    markup = fix.sub(m, markup)
1199
                # TODO: We get rid of markupMassage so that the
1200
                # soup object can be deepcopied later on. Some
1201
                # Python installations can't copy regexes. If anyone
1202
                # was relying on the existence of markupMassage, this
1203
                # might cause problems.
1204
                del(self.markupMassage)
1205
        self.reset()
1206

1207
        sgmllib.SGMLParser.feed(self, markup)
1208
        # Close out any unfinished strings and close all the open tags.
1209
        self.endData()
1210
        while self.currentTag.name != self.ROOT_TAG_NAME:
1211
            self.popTag()
1212

1213
    def __getattr__(self, methodName):
1214
        """This method routes method call requests to either the SGMLParser
1215
        superclass or the Tag superclass, depending on the method name."""
1216
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1217

1218
        if methodName.startswith('start_') or methodName.startswith('end_') \
1219
               or methodName.startswith('do_'):
1220
            return sgmllib.SGMLParser.__getattr__(self, methodName)
1221
        elif not methodName.startswith('__'):
1222
            return Tag.__getattr__(self, methodName)
1223
        else:
1224
            raise AttributeError
1225

1226
    def isSelfClosingTag(self, name):
1227
        """Returns true iff the given string is the name of a
1228
        self-closing tag according to this parser."""
1229
        return name in self.SELF_CLOSING_TAGS \
1230
               or name in self.instanceSelfClosingTags
1231

1232
    def reset(self):
1233
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
1234
        self.hidden = 1
1235
        sgmllib.SGMLParser.reset(self)
1236
        self.currentData = []
1237
        self.currentTag = None
1238
        self.tagStack = []
1239
        self.quoteStack = []
1240
        self.pushTag(self)
1241

1242
    def popTag(self):
1243
        tag = self.tagStack.pop()
1244

1245
        #print "Pop", tag.name
1246
        if self.tagStack:
1247
            self.currentTag = self.tagStack[-1]
1248
        return self.currentTag
1249

1250
    def pushTag(self, tag):
1251
        #print "Push", tag.name
1252
        if self.currentTag:
1253
            self.currentTag.contents.append(tag)
1254
        self.tagStack.append(tag)
1255
        self.currentTag = self.tagStack[-1]
1256

1257
    def endData(self, containerClass=NavigableString):
1258
        if self.currentData:
1259
            currentData = u''.join(self.currentData)
1260
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
1261
                not set([tag.name for tag in self.tagStack]).intersection(
1262
                    self.PRESERVE_WHITESPACE_TAGS)):
1263
                if '\n' in currentData:
1264
                    currentData = '\n'
1265
                else:
1266
                    currentData = ' '
1267
            self.currentData = []
1268
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1269
                   (not self.parseOnlyThese.text or \
1270
                    not self.parseOnlyThese.search(currentData)):
1271
                return
1272
            o = containerClass(currentData)
1273
            o.setup(self.currentTag, self.previous)
1274
            if self.previous:
1275
                self.previous.next = o
1276
            self.previous = o
1277
            self.currentTag.contents.append(o)
1278

1279

1280
    def _popToTag(self, name, inclusivePop=True):
1281
        """Pops the tag stack up to and including the most recent
1282
        instance of the given tag. If inclusivePop is false, pops the tag
1283
        stack up to but *not* including the most recent instqance of
1284
        the given tag."""
1285
        #print "Popping to %s" % name
1286
        if name == self.ROOT_TAG_NAME:
1287
            return
1288

1289
        numPops = 0
1290
        mostRecentTag = None
1291
        for i in xrange(len(self.tagStack)-1, 0, -1):
1292
            if name == self.tagStack[i].name:
1293
                numPops = len(self.tagStack)-i
1294
                break
1295
        if not inclusivePop:
1296
            numPops = numPops - 1
1297

1298
        for i in xrange(0, numPops):
1299
            mostRecentTag = self.popTag()
1300
        return mostRecentTag
1301

1302
    def _smartPop(self, name):
1303

1304
        """We need to pop up to the previous tag of this type, unless
1305
        one of this tag's nesting reset triggers comes between this
1306
        tag and the previous tag of this type, OR unless this tag is a
1307
        generic nesting trigger and another generic nesting trigger
1308
        comes between this tag and the previous tag of this type.
1309

1310
        Examples:
1311
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1312
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1313
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1314

1315
         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1316
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1317
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1318
        """
1319

1320
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1321
        isNestable = nestingResetTriggers != None
1322
        isResetNesting = name in self.RESET_NESTING_TAGS
1323
        popTo = None
1324
        inclusive = True
1325
        for i in xrange(len(self.tagStack)-1, 0, -1):
1326
            p = self.tagStack[i]
1327
            if (not p or p.name == name) and not isNestable:
1328
                #Non-nestable tags get popped to the top or to their
1329
                #last occurance.
1330
                popTo = name
1331
                break
1332
            if (nestingResetTriggers is not None
1333
                and p.name in nestingResetTriggers) \
1334
                or (nestingResetTriggers is None and isResetNesting
1335
                    and p.name in self.RESET_NESTING_TAGS):
1336

1337
                #If we encounter one of the nesting reset triggers
1338
                #peculiar to this tag, or we encounter another tag
1339
                #that causes nesting to reset, pop up to but not
1340
                #including that tag.
1341
                popTo = p.name
1342
                inclusive = False
1343
                break
1344
            p = p.parent
1345
        if popTo:
1346
            self._popToTag(popTo, inclusive)
1347

1348
    def unknown_starttag(self, name, attrs, selfClosing=0):
1349
        #print "Start tag %s: %s" % (name, attrs)
1350
        if self.quoteStack:
1351
            #This is not a real tag.
1352
            #print "<%s> is not real!" % name
1353
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
1354
            self.handle_data('<%s%s>' % (name, attrs))
1355
            return
1356
        self.endData()
1357

1358
        if not self.isSelfClosingTag(name) and not selfClosing:
1359
            self._smartPop(name)
1360

1361
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1362
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1363
            return
1364

1365
        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1366
        if self.previous:
1367
            self.previous.next = tag
1368
        self.previous = tag
1369
        self.pushTag(tag)
1370
        if selfClosing or self.isSelfClosingTag(name):
1371
            self.popTag()
1372
        if name in self.QUOTE_TAGS:
1373
            #print "Beginning quote (%s)" % name
1374
            self.quoteStack.append(name)
1375
            self.literal = 1
1376
        return tag
1377

1378
    def unknown_endtag(self, name):
1379
        #print "End tag %s" % name
1380
        if self.quoteStack and self.quoteStack[-1] != name:
1381
            #This is not a real end tag.
1382
            #print "</%s> is not real!" % name
1383
            self.handle_data('</%s>' % name)
1384
            return
1385
        self.endData()
1386
        self._popToTag(name)
1387
        if self.quoteStack and self.quoteStack[-1] == name:
1388
            self.quoteStack.pop()
1389
            self.literal = (len(self.quoteStack) > 0)
1390

1391
    def handle_data(self, data):
1392
        self.currentData.append(data)
1393

1394
    def _toStringSubclass(self, text, subclass):
1395
        """Adds a certain piece of text to the tree as a NavigableString
1396
        subclass."""
1397
        self.endData()
1398
        self.handle_data(text)
1399
        self.endData(subclass)
1400

1401
    def handle_pi(self, text):
1402
        """Handle a processing instruction as a ProcessingInstruction
1403
        object, possibly one with a %SOUP-ENCODING% slot into which an
1404
        encoding will be plugged later."""
1405
        if text[:3] == "xml":
1406
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1407
        self._toStringSubclass(text, ProcessingInstruction)
1408

1409
    def handle_comment(self, text):
1410
        "Handle comments as Comment objects."
1411
        self._toStringSubclass(text, Comment)
1412

1413
    def handle_charref(self, ref):
1414
        "Handle character references as data."
1415
        if self.convertEntities:
1416
            data = unichr(int(ref))
1417
        else:
1418
            data = '&#%s;' % ref
1419
        self.handle_data(data)
1420

1421
    def handle_entityref(self, ref):
1422
        """Handle entity references as data, possibly converting known
1423
        HTML and/or XML entity references to the corresponding Unicode
1424
        characters."""
1425
        data = None
1426
        if self.convertHTMLEntities:
1427
            try:
1428
                data = unichr(name2codepoint[ref])
1429
            except KeyError:
1430
                pass
1431

1432
        if not data and self.convertXMLEntities:
1433
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1434

1435
        if not data and self.convertHTMLEntities and \
1436
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1437
                # TODO: We've got a problem here. We're told this is
1438
                # an entity reference, but it's not an XML entity
1439
                # reference or an HTML entity reference. Nonetheless,
1440
                # the logical thing to do is to pass it through as an
1441
                # unrecognized entity reference.
1442
                #
1443
                # Except: when the input is "&carol;" this function
1444
                # will be called with input "carol". When the input is
1445
                # "AT&T", this function will be called with input
1446
                # "T". We have no way of knowing whether a semicolon
1447
                # was present originally, so we don't know whether
1448
                # this is an unknown entity or just a misplaced
1449
                # ampersand.
1450
                #
1451
                # The more common case is a misplaced ampersand, so I
1452
                # escape the ampersand and omit the trailing semicolon.
1453
                data = "&amp;%s" % ref
1454
        if not data:
1455
            # This case is different from the one above, because we
1456
            # haven't already gone through a supposedly comprehensive
1457
            # mapping of entities to Unicode characters. We might not
1458
            # have gone through any mapping at all. So the chances are
1459
            # very high that this is a real entity, and not a
1460
            # misplaced ampersand.
1461
            data = "&%s;" % ref
1462
        self.handle_data(data)
1463

1464
    def handle_decl(self, data):
1465
        "Handle DOCTYPEs and the like as Declaration objects."
1466
        self._toStringSubclass(data, Declaration)
1467

1468
    def parse_declaration(self, i):
1469
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
1470
        declaration as a CData object."""
1471
        j = None
1472
        if self.rawdata[i:i+9] == '<![CDATA[':
1473
             k = self.rawdata.find(']]>', i)
1474
             if k == -1:
1475
                 k = len(self.rawdata)
1476
             data = self.rawdata[i+9:k]
1477
             j = k+3
1478
             self._toStringSubclass(data, CData)
1479
        else:
1480
            try:
1481
                j = sgmllib.SGMLParser.parse_declaration(self, i)
1482
            except sgmllib.SGMLParseError:
1483
                toHandle = self.rawdata[i:]
1484
                self.handle_data(toHandle)
1485
                j = i + len(toHandle)
1486
        return j
1487

1488
class BeautifulSoup(BeautifulStoneSoup):
1489

1490
    """This parser knows the following facts about HTML:
1491

1492
    * Some tags have no closing tag and should be interpreted as being
1493
      closed as soon as they are encountered.
1494

1495
    * The text inside some tags (ie. 'script') may contain tags which
1496
      are not really part of the document and which should be parsed
1497
      as text, not tags. If you want to parse the text as tags, you can
1498
      always fetch it and parse it explicitly.
1499

1500
    * Tag nesting rules:
1501

1502
      Most tags can't be nested at all. For instance, the occurance of
1503
      a <p> tag should implicitly close the previous <p> tag.
1504

1505
       <p>Para1<p>Para2
1506
        should be transformed into:
1507
       <p>Para1</p><p>Para2
1508

1509
      Some tags can be nested arbitrarily. For instance, the occurance
1510
      of a <blockquote> tag should _not_ implicitly close the previous
1511
      <blockquote> tag.
1512

1513
       Alice said: <blockquote>Bob said: <blockquote>Blah
1514
        should NOT be transformed into:
1515
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1516

1517
      Some tags can be nested, but the nesting is reset by the
1518
      interposition of other tags. For instance, a <tr> tag should
1519
      implicitly close the previous <tr> tag within the same <table>,
1520
      but not close a <tr> tag in another table.
1521

1522
       <table><tr>Blah<tr>Blah
1523
        should be transformed into:
1524
       <table><tr>Blah</tr><tr>Blah
1525
        but,
1526
       <tr>Blah<table><tr>Blah
1527
        should NOT be transformed into
1528
       <tr>Blah<table></tr><tr>Blah
1529

1530
    Differing assumptions about tag nesting rules are a major source
1531
    of problems with the BeautifulSoup class. If BeautifulSoup is not
1532
    treating as nestable a tag your page author treats as nestable,
1533
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1534
    BeautifulStoneSoup before writing your own subclass."""
1535

1536
    def __init__(self, *args, **kwargs):
1537
        if 'smartQuotesTo' not in kwargs:
1538
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1539
        kwargs['isHTML'] = True
1540
        BeautifulStoneSoup.__init__(self, *args, **kwargs)
1541

1542
    SELF_CLOSING_TAGS = buildTagMap(None,
1543
                                    ('br' , 'hr', 'input', 'img', 'meta',
1544
                                    'spacer', 'link', 'frame', 'base', 'col'))
1545

1546
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
1547

1548
    QUOTE_TAGS = {'script' : None, 'textarea' : None}
1549

1550
    #According to the HTML standard, each of these inline tags can
1551
    #contain another tag of the same type. Furthermore, it's common
1552
    #to actually use these tags this way.
1553
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1554
                            'center')
1555

1556
    #According to the HTML standard, these block tags can contain
1557
    #another tag of the same type. Furthermore, it's common
1558
    #to actually use these tags this way.
1559
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
1560

1561
    #Lists can contain other lists, but there are restrictions.
1562
    NESTABLE_LIST_TAGS = { 'ol' : [],
1563
                           'ul' : [],
1564
                           'li' : ['ul', 'ol'],
1565
                           'dl' : [],
1566
                           'dd' : ['dl'],
1567
                           'dt' : ['dl'] }
1568

1569
    #Tables can contain other tables, but there are restrictions.
1570
    NESTABLE_TABLE_TAGS = {'table' : [],
1571
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1572
                           'td' : ['tr'],
1573
                           'th' : ['tr'],
1574
                           'thead' : ['table'],
1575
                           'tbody' : ['table'],
1576
                           'tfoot' : ['table'],
1577
                           }
1578

1579
    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
1580

1581
    #If one of these tags is encountered, all tags up to the next tag of
1582
    #this type are popped.
1583
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1584
                                     NON_NESTABLE_BLOCK_TAGS,
1585
                                     NESTABLE_LIST_TAGS,
1586
                                     NESTABLE_TABLE_TAGS)
1587

1588
    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1589
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1590

1591
    # Used to detect the charset in a META tag; see start_meta
1592
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
1593

1594
    def start_meta(self, attrs):
1595
        """Beautiful Soup can detect a charset included in a META tag,
1596
        try to convert the document to that charset, and re-parse the
1597
        document from the beginning."""
1598
        httpEquiv = None
1599
        contentType = None
1600
        contentTypeIndex = None
1601
        tagNeedsEncodingSubstitution = False
1602

1603
        for i in xrange(0, len(attrs)):
1604
            key, value = attrs[i]
1605
            key = key.lower()
1606
            if key == 'http-equiv':
1607
                httpEquiv = value
1608
            elif key == 'content':
1609
                contentType = value
1610
                contentTypeIndex = i
1611

1612
        if httpEquiv and contentType: # It's an interesting meta tag.
1613
            match = self.CHARSET_RE.search(contentType)
1614
            if match:
1615
                if (self.declaredHTMLEncoding is not None or
1616
                    self.originalEncoding == self.fromEncoding):
1617
                    # An HTML encoding was sniffed while converting
1618
                    # the document to Unicode, or an HTML encoding was
1619
                    # sniffed during a previous pass through the
1620
                    # document, or an encoding was specified
1621
                    # explicitly and it worked. Rewrite the meta tag.
1622
                    def rewrite(match):
1623
                        return match.group(1) + "%SOUP-ENCODING%"
1624
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
1625
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1626
                                               newAttr)
1627
                    tagNeedsEncodingSubstitution = True
1628
                else:
1629
                    # This is our first pass through the document.
1630
                    # Go through it again with the encoding information.
1631
                    newCharset = match.group(3)
1632
                    if newCharset and newCharset != self.originalEncoding:
1633
                        self.declaredHTMLEncoding = newCharset
1634
                        self._feed(self.declaredHTMLEncoding)
1635
                        raise StopParsing
1636
                    pass
1637
        tag = self.unknown_starttag("meta", attrs)
1638
        if tag and tagNeedsEncodingSubstitution:
1639
            tag.containsSubstitutions = True
1640

1641
class StopParsing(Exception):
1642
    pass
1643

1644
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1645

1646
    """The BeautifulSoup class is oriented towards skipping over
1647
    common HTML errors like unclosed tags. However, sometimes it makes
1648
    errors of its own. For instance, consider this fragment:
1649

1650
     <b>Foo<b>Bar</b></b>
1651

1652
    This is perfectly valid (if bizarre) HTML. However, the
1653
    BeautifulSoup class will implicitly close the first b tag when it
1654
    encounters the second 'b'. It will think the author wrote
1655
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1656
    there's no real-world reason to bold something that's already
1657
    bold. When it encounters '</b></b>' it will close two more 'b'
1658
    tags, for a grand total of three tags closed instead of two. This
1659
    can throw off the rest of your document structure. The same is
1660
    true of a number of other tags, listed below.
1661

1662
    It's much more common for someone to forget to close a 'b' tag
1663
    than to actually use nested 'b' tags, and the BeautifulSoup class
1664
    handles the common case. This class handles the not-co-common
1665
    case: where you can't believe someone wrote what they did, but
1666
    it's valid HTML and BeautifulSoup screwed up by assuming it
1667
    wouldn't be."""
1668

1669
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1670
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1671
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1672
      'big')
1673

1674
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
1675

1676
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1677
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1678
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1679

1680
class MinimalSoup(BeautifulSoup):
1681
    """The MinimalSoup class is for parsing HTML that contains
1682
    pathologically bad markup. It makes no assumptions about tag
1683
    nesting, but it does know which tags are self-closing, that
1684
    <script> tags contain Javascript and should not be parsed, that
1685
    META tags may contain encoding information, and so on.
1686

1687
    This also makes it better for subclassing than BeautifulStoneSoup
1688
    or BeautifulSoup."""
1689

1690
    RESET_NESTING_TAGS = buildTagMap('noscript')
1691
    NESTABLE_TAGS = {}
1692

1693
class BeautifulSOAP(BeautifulStoneSoup):
1694
    """This class will push a tag with only a single string child into
1695
    the tag's parent as an attribute. The attribute's name is the tag
1696
    name, and the value is the string child. An example should give
1697
    the flavor of the change:
1698

1699
    <foo><bar>baz</bar></foo>
1700
     =>
1701
    <foo bar="baz"><bar>baz</bar></foo>
1702

1703
    You can then access fooTag['bar'] instead of fooTag.barTag.string.
1704

1705
    This is, of course, useful for scraping structures that tend to
1706
    use subelements instead of attributes, such as SOAP messages. Note
1707
    that it modifies its input, so don't print the modified version
1708
    out.
1709

1710
    I'm not sure how many people really want to use this class; let me
1711
    know if you do. Mainly I like the name."""
1712

1713
    def popTag(self):
1714
        if len(self.tagStack) > 1:
1715
            tag = self.tagStack[-1]
1716
            parent = self.tagStack[-2]
1717
            parent._getAttrMap()
1718
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1719
                isinstance(tag.contents[0], NavigableString) and
1720
                not parent.attrMap.has_key(tag.name)):
1721
                parent[tag.name] = tag.contents[0]
1722
        BeautifulStoneSoup.popTag(self)
1723

1724
#Enterprise class names! It has come to our attention that some people
1725
#think the names of the Beautiful Soup parser classes are too silly
1726
#and "unprofessional" for use in enterprise screen-scraping. We feel
1727
#your pain! For such-minded folk, the Beautiful Soup Consortium And
1728
#All-Night Kosher Bakery recommends renaming this file to
1729
#"RobustParser.py" (or, in cases of extreme enterprisiness,
1730
#"RobustParserBeanInterface.class") and using the following
1731
#enterprise-friendly class aliases:
1732
class RobustXMLParser(BeautifulStoneSoup):
1733
    pass
1734
class RobustHTMLParser(BeautifulSoup):
1735
    pass
1736
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1737
    pass
1738
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1739
    pass
1740
class SimplifyingSOAPParser(BeautifulSOAP):
1741
    pass
1742

1743
######################################################
1744
#
1745
# Bonus library: Unicode, Dammit
1746
#
1747
# This class forces XML data into a standard format (usually to UTF-8
1748
# or Unicode).  It is heavily based on code from Mark Pilgrim's
1749
# Universal Feed Parser. It does not rewrite the XML or HTML to
1750
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1751
# (XML) and BeautifulSoup.start_meta (HTML).
1752

1753
# Autodetects character encodings.
1754
# Download from http://chardet.feedparser.org/
1755
try:
1756
    import chardet
1757
#    import chardet.constants
1758
#    chardet.constants._debug = 1
1759
except ImportError:
1760
    chardet = None
1761

1762
# cjkcodecs and iconv_codec make Python know about more character encodings.
1763
# Both are available from http://cjkpython.i18n.org/
1764
# They're built in if you use Python 2.4.
1765
try:
1766
    import cjkcodecs.aliases
1767
except ImportError:
1768
    pass
1769
try:
1770
    import iconv_codec
1771
except ImportError:
1772
    pass
1773

1774
class UnicodeDammit:
1775
    """A class for detecting the encoding of a *ML document and
1776
    converting it to a Unicode string. If the source encoding is
1777
    windows-1252, can replace MS smart quotes with their HTML or XML
1778
    equivalents."""
1779

1780
    # This dictionary maps commonly seen values for "charset" in HTML
1781
    # meta tags to the corresponding Python codec names. It only covers
1782
    # values that aren't in Python's aliases and can't be determined
1783
    # by the heuristics in find_codec.
1784
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
1785
                        "x-sjis" : "shift-jis" }
1786

1787
    def __init__(self, markup, overrideEncodings=[],
1788
                 smartQuotesTo='xml', isHTML=False):
1789
        self.declaredHTMLEncoding = None
1790
        self.markup, documentEncoding, sniffedEncoding = \
1791
                     self._detectEncoding(markup, isHTML)
1792
        self.smartQuotesTo = smartQuotesTo
1793
        self.triedEncodings = []
1794
        if markup == '' or isinstance(markup, text_type):
1795
            self.originalEncoding = None
1796
            self.unicode = text_type(markup)
1797
            return
1798

1799
        u = None
1800
        for proposedEncoding in overrideEncodings:
1801
            u = self._convertFrom(proposedEncoding)
1802
            if u: break
1803
        if not u:
1804
            for proposedEncoding in (documentEncoding, sniffedEncoding):
1805
                u = self._convertFrom(proposedEncoding)
1806
                if u: break
1807

1808
        # If no luck and we have auto-detection library, try that:
1809
        if not u and chardet and not isinstance(self.markup, text_type):
1810
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1811

1812
        # As a last resort, try utf-8 and windows-1252:
1813
        if not u:
1814
            for proposed_encoding in ("utf-8", "windows-1252"):
1815
                u = self._convertFrom(proposed_encoding)
1816
                if u: break
1817

1818
        self.unicode = u
1819
        if not u: self.originalEncoding = None
1820

1821
    def _subMSChar(self, orig):
1822
        """Changes a MS smart quote character to an XML or HTML
1823
        entity."""
1824
        sub = self.MS_CHARS.get(orig)
1825
        if isinstance(sub, tuple):
1826
            if self.smartQuotesTo == 'xml':
1827
                sub = '&#x%s;' % sub[1]
1828
            else:
1829
                sub = '&%s;' % sub[0]
1830
        return sub
1831

1832
    def _convertFrom(self, proposed):
1833
        proposed = self.find_codec(proposed)
1834
        if not proposed or proposed in self.triedEncodings:
1835
            return None
1836
        self.triedEncodings.append(proposed)
1837
        markup = self.markup
1838

1839
        # Convert smart quotes to HTML if coming from an encoding
1840
        # that might have them.
1841
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
1842
                                                      "iso-8859-1",
1843
                                                      "iso-8859-2"):
1844
            markup = re.compile("([\x80-\x9f])").sub \
1845
                     (lambda x: self._subMSChar(x.group(1)),
1846
                      markup)
1847

1848
        try:
1849
            # print "Trying to convert document to %s" % proposed
1850
            u = self._toUnicode(markup, proposed)
1851
            self.markup = u
1852
            self.originalEncoding = proposed
1853
        except Exception as e:
1854
            # print "That didn't work!"
1855
            # print e
1856
            return None
1857
        #print "Correct encoding: %s" % proposed
1858
        return self.markup
1859

1860
    def _toUnicode(self, data, encoding):
1861
        '''Given a string and its encoding, decodes the string into Unicode.
1862
        %encoding is a string recognized by encodings.aliases'''
1863

1864
        # strip Byte Order Mark (if present)
1865
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1866
               and (data[2:4] != '\x00\x00'):
1867
            encoding = 'utf-16be'
1868
            data = data[2:]
1869
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1870
                 and (data[2:4] != '\x00\x00'):
1871
            encoding = 'utf-16le'
1872
            data = data[2:]
1873
        elif data[:3] == '\xef\xbb\xbf':
1874
            encoding = 'utf-8'
1875
            data = data[3:]
1876
        elif data[:4] == '\x00\x00\xfe\xff':
1877
            encoding = 'utf-32be'
1878
            data = data[4:]
1879
        elif data[:4] == '\xff\xfe\x00\x00':
1880
            encoding = 'utf-32le'
1881
            data = data[4:]
1882
        newdata = text_type(data, encoding)
1883
        return newdata
1884

1885
    def _detectEncoding(self, xml_data, isHTML=False):
1886
        """Given a document, tries to detect its XML encoding."""
1887
        xml_encoding = sniffed_xml_encoding = None
1888
        try:
1889
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
1890
                # EBCDIC
1891
                xml_data = self._ebcdic_to_ascii(xml_data)
1892
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
1893
                # UTF-16BE
1894
                sniffed_xml_encoding = 'utf-16be'
1895
                xml_data = text_type(xml_data, 'utf-16be').encode('utf-8')
1896
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1897
                     and (xml_data[2:4] != '\x00\x00'):
1898
                # UTF-16BE with BOM
1899
                sniffed_xml_encoding = 'utf-16be'
1900
                xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8')
1901
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
1902
                # UTF-16LE
1903
                sniffed_xml_encoding = 'utf-16le'
1904
                xml_data = text_type(xml_data, 'utf-16le').encode('utf-8')
1905
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1906
                     (xml_data[2:4] != '\x00\x00'):
1907
                # UTF-16LE with BOM
1908
                sniffed_xml_encoding = 'utf-16le'
1909
                xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8')
1910
            elif xml_data[:4] == '\x00\x00\x00\x3c':
1911
                # UTF-32BE
1912
                sniffed_xml_encoding = 'utf-32be'
1913
                xml_data = text_type(xml_data, 'utf-32be').encode('utf-8')
1914
            elif xml_data[:4] == '\x3c\x00\x00\x00':
1915
                # UTF-32LE
1916
                sniffed_xml_encoding = 'utf-32le'
1917
                xml_data = text_type(xml_data, 'utf-32le').encode('utf-8')
1918
            elif xml_data[:4] == '\x00\x00\xfe\xff':
1919
                # UTF-32BE with BOM
1920
                sniffed_xml_encoding = 'utf-32be'
1921
                xml_data = text_type(xml_data[4:], 'utf-32be').encode('utf-8')
1922
            elif xml_data[:4] == '\xff\xfe\x00\x00':
1923
                # UTF-32LE with BOM
1924
                sniffed_xml_encoding = 'utf-32le'
1925
                xml_data = text_type(xml_data[4:], 'utf-32le').encode('utf-8')
1926
            elif xml_data[:3] == '\xef\xbb\xbf':
1927
                # UTF-8 with BOM
1928
                sniffed_xml_encoding = 'utf-8'
1929
                xml_data = text_type(xml_data[3:], 'utf-8').encode('utf-8')
1930
            else:
1931
                sniffed_xml_encoding = 'ascii'
1932
                pass
1933
        except:
1934
            xml_encoding_match = None
1935
        xml_encoding_match = re.compile(
1936
            r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
1937
        if not xml_encoding_match and isHTML:
1938
            regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
1939
            xml_encoding_match = regexp.search(xml_data)
1940
        if xml_encoding_match is not None:
1941
            xml_encoding = xml_encoding_match.groups()[0].lower()
1942
            if isHTML:
1943
                self.declaredHTMLEncoding = xml_encoding
1944
            if sniffed_xml_encoding and \
1945
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1946
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1947
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1948
                                 'utf16', 'u16')):
1949
                xml_encoding = sniffed_xml_encoding
1950
        return xml_data, xml_encoding, sniffed_xml_encoding
1951

1952

1953
    def find_codec(self, charset):
1954
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1955
               or (charset and self._codec(charset.replace("-", ""))) \
1956
               or (charset and self._codec(charset.replace("-", "_"))) \
1957
               or charset
1958

1959
    def _codec(self, charset):
1960
        if not charset: return charset
1961
        codec = None
1962
        try:
1963
            codecs.lookup(charset)
1964
            codec = charset
1965
        except (LookupError, ValueError):
1966
            pass
1967
        return codec
1968

1969
    EBCDIC_TO_ASCII_MAP = None
1970
    def _ebcdic_to_ascii(self, s):
1971
        c = self.__class__
1972
        if not c.EBCDIC_TO_ASCII_MAP:
1973
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1974
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1975
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1976
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1977
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1978
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1979
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1980
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1981
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1982
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1983
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1984
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1985
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1986
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1987
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1988
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1989
                    250,251,252,253,254,255)
1990
            import string
1991
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1992
            ''.join(map(chr, xrange(256))), ''.join(map(chr, emap)))
1993
        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1994

1995
    MS_CHARS = { '\x80' : ('euro', '20AC'),
1996
                 '\x81' : ' ',
1997
                 '\x82' : ('sbquo', '201A'),
1998
                 '\x83' : ('fnof', '192'),
1999
                 '\x84' : ('bdquo', '201E'),
2000
                 '\x85' : ('hellip', '2026'),
2001
                 '\x86' : ('dagger', '2020'),
2002
                 '\x87' : ('Dagger', '2021'),
2003
                 '\x88' : ('circ', '2C6'),
2004
                 '\x89' : ('permil', '2030'),
2005
                 '\x8A' : ('Scaron', '160'),
2006
                 '\x8B' : ('lsaquo', '2039'),
2007
                 '\x8C' : ('OElig', '152'),
2008
                 '\x8D' : '?',
2009
                 '\x8E' : ('#x17D', '17D'),
2010
                 '\x8F' : '?',
2011
                 '\x90' : '?',
2012
                 '\x91' : ('lsquo', '2018'),
2013
                 '\x92' : ('rsquo', '2019'),
2014
                 '\x93' : ('ldquo', '201C'),
2015
                 '\x94' : ('rdquo', '201D'),
2016
                 '\x95' : ('bull', '2022'),
2017
                 '\x96' : ('ndash', '2013'),
2018
                 '\x97' : ('mdash', '2014'),
2019
                 '\x98' : ('tilde', '2DC'),
2020
                 '\x99' : ('trade', '2122'),
2021
                 '\x9a' : ('scaron', '161'),
2022
                 '\x9b' : ('rsaquo', '203A'),
2023
                 '\x9c' : ('oelig', '153'),
2024
                 '\x9d' : '?',
2025
                 '\x9e' : ('#x17E', '17E'),
2026
                 '\x9f' : ('Yuml', ''),}
2027

2028
#######################################################################
2029

2030

2031
#By default, act as an HTML pretty-printer.
2032
if __name__ == '__main__':
2033
    soup = BeautifulSoup(sys.stdin)
2034
    print(soup.prettify())
2035

2036
Product

Resources

Company