Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sqlmapproject
GitHub Repository: sqlmapproject/sqlmap
Path: blob/master/thirdparty/beautifulsoup/beautifulsoup.py
2992 views
1
"""Beautiful Soup
2
Elixir and Tonic
3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
5
6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
9
10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it.
15
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
19
20
* chardet, for auto-detecting character encodings
21
http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23
by stock Python.
24
http://cjkpython.i18n.org/
25
26
Beautiful Soup defines classes for two main parsing strategies:
27
28
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
language that kind of looks like XML.
30
31
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
or invalid. This class has web browser-like heuristics for
33
obtaining a sensible parse tree in the face of common HTML errors.
34
35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39
For more than you ever wanted to know about Beautiful Soup, see the
40
documentation:
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43
Here, have some legalese:
44
45
Copyright (c) 2004-2010, Leonard Richardson
46
47
All rights reserved.
48
49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
51
met:
52
53
* Redistributions of source code must retain the above copyright
54
notice, this list of conditions and the following disclaimer.
55
56
* Redistributions in binary form must reproduce the above
57
copyright notice, this list of conditions and the following
58
disclaimer in the documentation and/or other materials provided
59
with the distribution.
60
61
* Neither the name of the Beautiful Soup Consortium and All
62
Night Kosher Bakery nor the names of its contributors may be
63
used to endorse or promote products derived from this software
64
without specific prior written permission.
65
66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78
"""
79
from __future__ import generators
80
from __future__ import print_function
81
82
__author__ = "Leonard Richardson ([email protected])"
83
__version__ = "3.2.1b"
84
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
85
__license__ = "New-style BSD"
86
87
import codecs
88
import re
89
import sys
90
91
if sys.version_info >= (3, 0):
92
xrange = range
93
text_type = str
94
binary_type = bytes
95
basestring = str
96
unichr = chr
97
else:
98
text_type = unicode
99
binary_type = str
100
101
try:
102
from html.entities import name2codepoint
103
except ImportError:
104
from htmlentitydefs import name2codepoint
105
106
try:
107
set
108
except NameError:
109
from sets import Set as set
110
111
try:
112
import sgmllib
113
except ImportError:
114
from lib.utils import sgmllib
115
116
try:
117
import markupbase
118
except ImportError:
119
import _markupbase as markupbase
120
121
#These hacks make Beautiful Soup able to parse XML with namespaces
122
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
123
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
124
125
DEFAULT_OUTPUT_ENCODING = "utf-8"
126
127
def _match_css_class(str):
128
"""Build a RE to match the given CSS class."""
129
return re.compile(r"(^|.*\s)%s($|\s)" % str)
130
131
# First, the classes that represent markup elements.
132
133
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def _invert(h):
        "Cheap function to invert a hash."
        # Runs once at class-definition time (it is not a method) to
        # build XML_SPECIAL_CHARS_TO_ENTITIES below.
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    # XML entity name -> the literal character it stands for.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    # The reverse mapping: literal character -> entity name.
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        # If we already have siblings under the parent, link ourselves
        # in as the newest (last) sibling.
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replace this element in the tree with the given element."""
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
           and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Replace this element with its own children, keeping order."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        # Inserting each child at the same fixed index, in reverse,
        # restores the children's original document order.
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        # Sever the sibling links too, then return self so callers can
        # re-insert the extracted subtree elsewhere.
        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Insert newChild at the given index among this element's
        contents, rewiring all next/previous and sibling pointers."""
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Appending at the end: the element that follows newChild
            # in document order is the next sibling of the nearest
            # ancestor that has one.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        # Delegate to the matching findAll* method with limit=1 and
        # unwrap the single result (or None).
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = next(g)
            except StopIteration:
                break
            # The generators yield a trailing None; skip falsy items.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        # Yields each following element in document order. Note the last
        # value yielded is None (callers filter falsy items).
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        # Yields following siblings; last yielded value is None.
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        # Yields preceding elements in reverse document order; last
        # yielded value is None.
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        # Yields preceding siblings; last yielded value is None.
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        # Yields ancestors up to the document root; last yielded value
        # is None.
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        # Replace the %SOUP-ENCODING% placeholder with the actual
        # output encoding (defaults to utf-8).
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
        if isinstance(s, text_type):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, binary_type):
            # NOTE(review): on Python 3 binary_type is bytes, which has
            # no .encode() -- presumably this branch is only exercised
            # under Python 2 (where it is str.encode); confirm.
            s = s.encode(encoding or "utf8")
        else:
            s = self.toEncoding(str(s), encoding or "utf8")
        return s

    # Matches a bare '<', '>', or an '&' that does not begin a numeric,
    # hex, or named character reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
459
460
461
class NavigableString(text_type, PageElement):
    """A text node: a (unicode) string that also carries the
    PageElement tree-navigation attributes."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, text_type):
            return text_type.__new__(cls, value)
        return text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle/copy support: recreate from the rendered string.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))

    def __unicode__(self):
        # Python 2 protocol method: decode the byte rendering back to
        # unicode. NOTE(review): str has no .decode on Python 3 --
        # presumably this is never invoked there; confirm.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Substitute outgoing XML entities.
        data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
        if encoding:
            # NOTE(review): .encode() yields bytes under Python 3;
            # callers appear to expect a byte string here -- confirm.
            return data.encode(encoding)
        else:
            return data
497
498
class CData(NavigableString):
    """A CDATA section; rendered wrapped in <![CDATA[ ... ]]> markers."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the underlying string first, then add the wrapper.
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
502
503
class ProcessingInstruction(NavigableString):
    """A processing instruction; rendered wrapped in <? ... ?>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = self
        # Substitute the output encoding for any %SOUP-ENCODING%
        # placeholder before rendering.
        if "%SOUP-ENCODING%" in text:
            text = self.substituteEncoding(text, encoding)
        return "<?%s?>" % self.toEncoding(text, encoding)
509
510
class Comment(NavigableString):
    """An HTML/XML comment; rendered wrapped in <!-- ... -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
513
514
class Declaration(NavigableString):
    """A declaration (e.g. a doctype); rendered wrapped in <! ... >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
517
518
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        try:
            x = match.group(1)
            if self.convertHTMLEntities and x in name2codepoint:
                return unichr(name2codepoint[x])
            elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
                if self.convertXMLEntities:
                    return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
                else:
                    return u'&%s;' % x
            elif len(x) > 0 and x[0] == '#':
                # Handle numeric entities
                if len(x) > 1 and x[1] == 'x':
                    return unichr(int(x[2:], 16))
                else:
                    return unichr(int(x[1:]))

            elif self.escapeUnrecognizedEntities:
                return u'&amp;%s;' % x

        except ValueError: # e.g. ValueError: unichr() arg not in range(0x10000)
            pass

        return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        # Reference: https://github.com/pkrumins/xgoogle/pull/16/commits/3dba1165c436b0d6e5bdbd09e53ca0dbf8a043f8
        convert = lambda k_val: (k_val[0],
                                 re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                        self._convertEntities,
                                        k_val[1]))
        # list() is required here: on Python 3 a bare map() returns a
        # one-shot iterator, but self.attrs is indexed, len()-ed,
        # appended to and removed from elsewhere in this class.
        self.attrs = list(map(convert, self.attrs))

    def getString(self):
        """Return the single NavigableString child, if this tag has
        exactly one; otherwise returns None (implicitly)."""
        if (len(self.contents) == 1
            and isinstance(self.contents[0], NavigableString)):
            return self.contents[0]

    def setString(self, string):
        """Replace the contents of the tag with a string"""
        self.clear()
        self.append(string)

    string = property(getString, setString)

    def getText(self, separator=u""):
        """Concatenate every text node beneath this tag (each stripped
        of surrounding whitespace), joined by `separator`."""
        if not len(self.contents):
            return u""
        stopNode = self._lastRecursiveChild().next
        strings = []
        current = self.contents[0]
        while current and current is not stopNode:
            if isinstance(current, NavigableString):
                strings.append(current.strip())
            current = current.next
        return separator.join(strings)

    text = property(getText)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def clear(self):
        """Extract all children."""
        # Iterate over a copy: extract() mutates self.contents.
        for child in self.contents[:]:
            child.extract()

    def index(self, element):
        """Return the position of `element` among this tag's contents,
        comparing by identity (not equality). Raises ValueError if the
        element is not a direct child."""
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("Tag.index: element not in tag")

    def has_key(self, key):
        # dict.has_key no longer exists on Python 3; 'in' is equivalent
        # on both Python 2 and 3.
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        # Python 2 truthiness hook (Python 3 uses __bool__; without it
        # truthiness falls back to __len__ there).
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        # Update every occurrence: bad HTML can define the same
        # attribute multiple times.
        for i in xrange(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Iterate over a copy: removing from a list while iterating it
        # skips the element after each removal, so adjacent duplicate
        # attributes would survive.
        for item in self.attrs[:]:
            if item[0] == key:
                self.attrs.remove(item)
                #We don't break because bad HTML can define the same
                #attribute multiple times.
        self._getAttrMap()
        if key in self.attrMap:  # dict.has_key does not exist on Python 3
            del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.fooTag (or tag.foo) is shorthand for tag.find('foo')."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if other is self:
            return True
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in xrange(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    # Defining __eq__ on Python 3 would otherwise set __hash__ to None,
    # making Tags unhashable; keep the Python 2 behavior (identity hash).
    __hash__ = object.__hash__

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        # Python 2 protocol method; passing None for encoding makes
        # __str__ return unicode rather than bytes.
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # Hidden tags (e.g. the soup object itself) render only
            # their contents.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        if len(self.contents) == 0:
            return
        current = self.contents[0]
        while current is not None:
            next = current.next
            if isinstance(current, Tag):
                del current.contents[:]
            # Break every navigation link so the nodes can be
            # garbage-collected.
            current.parent = None
            current.previous = None
            current.previousSibling = None
            current.next = None
            current.nextSibling = None
            current = next

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Render this tag with pretty-printed (indented) output."""
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")

        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        """Pre-3.x compatibility: find all matching text nodes."""
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        """Pre-3.x compatibility: find the first matching text node."""
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # The default of None matters: a bare getattr() would fall
        # through to __getattr__, which performs a find('attrMap')
        # search and could even return a real <attrmap> child tag.
        if not getattr(self, 'attrMap', None):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        # Just use the iterator from the contents
        return iter(self.contents)

    def recursiveChildGenerator(self):
        """Yield every descendant of this tag in document order."""
        if not len(self.contents):
            return # Note: https://stackoverflow.com/a/30217723 (PEP 479)
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current and current is not stopNode:
            yield current
            current = current.next
905
906
# Next, a couple classes to represent queries and their results.
907
class SoupStrainer:
908
"""Encapsulates a number of ways of matching a markup element (tag or
909
text)."""
910
911
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Stores the matching criteria.

        name, attribute values, and text may each be a string, a
        regular expression object, a callable, or (per _matches) a
        list. Keyword arguments are folded into the attrs map.
        """
        self.name = name
        if isinstance(attrs, basestring):
            # A lone string is shorthand for matching a CSS class.
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            if attrs:
                # Copy before updating so the caller's dict (and the
                # shared default) is never mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text
924
925
def __str__(self):
926
if self.text:
927
return self.text
928
else:
929
return "%s|%s" % (self.name, self.attrs)
930
931
    def searchTag(self, markupName=None, markupAttrs={}):
        """Checks a tag against this strainer's criteria.

        The tag can be given either as a Tag object (markupName) or as
        a name plus an attribute collection. Returns the matching
        object (Tag or name) or None.
        """
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name is given the raw (name, attrs) pair rather
        # than being matched against the tag name.
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Lazily build a dict view of the attributes the
                        # first time one is needed.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found
967
968
def search(self, markup):
969
#print 'looking for %s in %s' % (self, markup)
970
found = None
971
# If given a list of items, scan it for a text element that
972
# matches.
973
if hasattr(markup, "__iter__") \
974
and not isinstance(markup, Tag):
975
for element in markup:
976
if isinstance(element, NavigableString) \
977
and self.search(element):
978
found = element
979
break
980
# If it's a Tag, make sure its name or attributes match.
981
# Don't bother with Tags if we're searching for text.
982
elif isinstance(markup, Tag):
983
if not self.text:
984
found = self.searchTag(markup)
985
# If it's text, make sure the text matches.
986
elif isinstance(markup, NavigableString) or \
987
isinstance(markup, basestring):
988
if self._matches(markup, self.text):
989
found = markup
990
else:
991
raise Exception("I don't know how to match against a %s" \
992
% markup.__class__)
993
return found
994
995
def _matches(self, markup, matchAgainst):
996
#print "Matching %s against %s" % (markup, matchAgainst)
997
result = False
998
if matchAgainst is True:
999
result = markup is not None
1000
elif callable(matchAgainst):
1001
result = matchAgainst(markup)
1002
else:
1003
#Custom match methods take the tag as an argument, but all
1004
#other ways of matching match the tag name as a string.
1005
if isinstance(markup, Tag):
1006
markup = markup.name
1007
if markup and not isinstance(markup, basestring):
1008
markup = text_type(markup)
1009
#Now we know that chunk is either a string, or None.
1010
if hasattr(matchAgainst, 'match'):
1011
# It's a regexp object.
1012
result = markup and matchAgainst.search(markup)
1013
elif hasattr(matchAgainst, '__iter__'): # list-like
1014
result = markup in matchAgainst
1015
elif hasattr(matchAgainst, 'items'):
1016
result = markup.has_key(matchAgainst)
1017
elif matchAgainst and isinstance(markup, basestring):
1018
if isinstance(markup, text_type):
1019
matchAgainst = text_type(matchAgainst)
1020
else:
1021
matchAgainst = str(matchAgainst)
1022
1023
if not result:
1024
result = matchAgainst == markup
1025
return result
1026
1027
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it.

    :param source: the SoupStrainer (or None) that produced these results.
    """
    def __init__(self, source):
        # Fix: the original called list.__init__([]), which initialized a
        # throwaway temporary list instead of self (harmless only because
        # a freshly created list is already empty).
        list.__init__(self)
        self.source = source
1034
# Now, some helper functions.
1035
1036
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    :param default: value assigned to every key coming from a list or
        scalar portion.
    :param args: any mix of dicts (merged as-is), iterables of tag
        names (each mapped to `default`), or single tag-name strings.
    :return: one merged dict; later portions override earlier ones.
    """
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k, v in portion.items():
                built[k] = v
        elif hasattr(portion, '__iter__') and not isinstance(portion, str):
            #It's a list. Map each item to the default.
            # Fix: on Python 3, str has __iter__, so without the
            # isinstance guard a scalar tag name like 'noscript' was
            # split into single-character keys. On Python 2 str has no
            # __iter__, so the guard is a no-op there.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built
1055
# Now, the parser classes.
1056
1057
class BeautifulStoneSoup(Tag, sgmllib.SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Subclasses override these to describe tag behavior; this base XML
    # parser assumes nothing is self-closing, nestable, or quoted.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Regex fixups applied to the markup before parsing (see __init__'s
    # docstring): insert the space in "<br/>" -> "<br />" and strip
    # whitespace from "<! ...>" declarations.
    MARKUP_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name of the synthetic root Tag that the soup object itself is.
    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        sgmllib.SGMLParser.__init__(self)

        if hasattr(markup, 'read'):  # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            # StopParsing is raised by start_meta (in the BeautifulSoup
            # subclass) when the document must be re-fed with a newly
            # discovered encoding; by then _feed has already re-run.
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        self.markup = None  # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 :  # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Decode the stored markup, massage it, and run it through the
        SGML parser, building the tree in place."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, text_type):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage=True selects the default fixups; an
                # iterable supplies custom (regex, replacement) pairs.
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        sgmllib.SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return sgmllib.SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags

    def reset(self):
        """Reinitialize the soup as an empty root tag and clear all
        parser state (tag stack, quote stack, pending text)."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        sgmllib.SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pop the innermost open tag; currentTag becomes its parent."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Open a new tag as a child of the current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush the accumulated text into a containerClass node and
        link it into the tree. Whitespace-only runs collapse to a
        single '\\n' or ' ' unless inside a PRESERVE_WHITESPACE_TAGS
        element."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only part of the document, drop top-level
            # text that the strainer doesn't ask for.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instqance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root is never popped.
            return

        numPops = 0
        mostRecentTag = None
        for i in xrange(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in xrange(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        # Walk the open-tag stack from innermost outward.
        for i in xrange(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """sgmllib callback: open a new Tag node (and immediately close
        it if self-closing). Returns the new Tag, or None if a strainer
        filtered it out."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing only part of the document, skip top-level tags
        # the strainer doesn't ask for.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """sgmllib callback: close the most recent open tag of this
        name (treating mismatched end tags inside QUOTE_TAGS as text)."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """sgmllib callback: buffer character data until endData()."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: consume to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = sgmllib.SGMLParser.parse_declaration(self, i)
            except sgmllib.SGMLParseError:
                # Malformed declaration: keep it as plain text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1488
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Default to converting MS smart quotes to HTML entities and
        # flag the input as HTML for encoding detection.
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # Contents of these tags are treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in xrange(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        # _feed re-parses from scratch; StopParsing
                        # aborts the current (now superseded) pass.
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1641
class StopParsing(Exception):
    """Raised internally to abort a parse pass -- e.g. by
    BeautifulSoup.start_meta after re-feeding the document with a newly
    discovered encoding."""
1644
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags additionally treated as self-nestable by this parser.
    # NOTE(review): 'strong' and 'big' appear twice in this tuple;
    # harmless, since buildTagMap deduplicates keys.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    # Extend the parent's nesting rules with the tags above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1680
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): 'noscript' is passed as buildTagMap's *default*
    # argument with no tag portions, so this evaluates to an empty map
    # -- i.e. no tag resets nesting. That matches the "no assumptions
    # about nesting" contract above, but confirm it is intentional.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
1693
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before closing a tag, copy a lone string child up into the
        parent as an attribute (unless the parent already has one of
        that name), then defer to the normal pop."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                # Fix: was parent.attrMap.has_key(tag.name) -- dict.has_key()
                # was removed in Python 3; the `in` test is equivalent on
                # both Python 2 and 3.
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1724
#Enterprise class names! It has come to our attention that some people
1725
#think the names of the Beautiful Soup parser classes are too silly
1726
#and "unprofessional" for use in enterprise screen-scraping. We feel
1727
#your pain! For such-minded folk, the Beautiful Soup Consortium And
1728
#All-Night Kosher Bakery recommends renaming this file to
1729
#"RobustParser.py" (or, in cases of extreme enterprisiness,
1730
#"RobustParserBeanInterface.class") and using the following
1731
#enterprise-friendly class aliases:
1732
class RobustXMLParser(BeautifulStoneSoup):
    # Enterprise-friendly alias for BeautifulStoneSoup.
    pass
class RobustHTMLParser(BeautifulSoup):
    # Enterprise-friendly alias for BeautifulSoup.
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    # Enterprise-friendly alias for ICantBelieveItsBeautifulSoup.
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    # Enterprise-friendly alias for MinimalSoup.
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    # Enterprise-friendly alias for BeautifulSOAP.
    pass
1743
######################################################
1744
#
1745
# Bonus library: Unicode, Dammit
1746
#
1747
# This class forces XML data into a standard format (usually to UTF-8
1748
# or Unicode). It is heavily based on code from Mark Pilgrim's
1749
# Universal Feed Parser. It does not rewrite the XML or HTML to
1750
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1751
# (XML) and BeautifulSoup.start_meta (HTML).
1752
1753
# Autodetects character encodings.
1754
# Download from http://chardet.feedparser.org/
1755
try:
1756
import chardet
1757
# import chardet.constants
1758
# chardet.constants._debug = 1
1759
except ImportError:
1760
chardet = None
1761
1762
# cjkcodecs and iconv_codec make Python know about more character encodings.
1763
# Both are available from http://cjkpython.i18n.org/
1764
# They're built in if you use Python 2.4.
1765
try:
1766
import cjkcodecs.aliases
1767
except ImportError:
1768
pass
1769
try:
1770
import iconv_codec
1771
except ImportError:
1772
pass
1773
1774
class UnicodeDammit:
1775
"""A class for detecting the encoding of a *ML document and
1776
converting it to a Unicode string. If the source encoding is
1777
windows-1252, can replace MS smart quotes with their HTML or XML
1778
equivalents."""
1779
1780
# This dictionary maps commonly seen values for "charset" in HTML
1781
# meta tags to the corresponding Python codec names. It only covers
1782
# values that aren't in Python's aliases and can't be determined
1783
# by the heuristics in find_codec.
1784
CHARSET_ALIASES = { "macintosh" : "mac-roman",
1785
"x-sjis" : "shift-jis" }
1786
1787
def __init__(self, markup, overrideEncodings=[],
1788
smartQuotesTo='xml', isHTML=False):
1789
self.declaredHTMLEncoding = None
1790
self.markup, documentEncoding, sniffedEncoding = \
1791
self._detectEncoding(markup, isHTML)
1792
self.smartQuotesTo = smartQuotesTo
1793
self.triedEncodings = []
1794
if markup == '' or isinstance(markup, text_type):
1795
self.originalEncoding = None
1796
self.unicode = text_type(markup)
1797
return
1798
1799
u = None
1800
for proposedEncoding in overrideEncodings:
1801
u = self._convertFrom(proposedEncoding)
1802
if u: break
1803
if not u:
1804
for proposedEncoding in (documentEncoding, sniffedEncoding):
1805
u = self._convertFrom(proposedEncoding)
1806
if u: break
1807
1808
# If no luck and we have auto-detection library, try that:
1809
if not u and chardet and not isinstance(self.markup, text_type):
1810
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1811
1812
# As a last resort, try utf-8 and windows-1252:
1813
if not u:
1814
for proposed_encoding in ("utf-8", "windows-1252"):
1815
u = self._convertFrom(proposed_encoding)
1816
if u: break
1817
1818
self.unicode = u
1819
if not u: self.originalEncoding = None
1820
1821
def _subMSChar(self, orig):
1822
"""Changes a MS smart quote character to an XML or HTML
1823
entity."""
1824
sub = self.MS_CHARS.get(orig)
1825
if isinstance(sub, tuple):
1826
if self.smartQuotesTo == 'xml':
1827
sub = '&#x%s;' % sub[1]
1828
else:
1829
sub = '&%s;' % sub[0]
1830
return sub
1831
1832
def _convertFrom(self, proposed):
1833
proposed = self.find_codec(proposed)
1834
if not proposed or proposed in self.triedEncodings:
1835
return None
1836
self.triedEncodings.append(proposed)
1837
markup = self.markup
1838
1839
# Convert smart quotes to HTML if coming from an encoding
1840
# that might have them.
1841
if self.smartQuotesTo and proposed.lower() in("windows-1252",
1842
"iso-8859-1",
1843
"iso-8859-2"):
1844
markup = re.compile("([\x80-\x9f])").sub \
1845
(lambda x: self._subMSChar(x.group(1)),
1846
markup)
1847
1848
try:
1849
# print "Trying to convert document to %s" % proposed
1850
u = self._toUnicode(markup, proposed)
1851
self.markup = u
1852
self.originalEncoding = proposed
1853
except Exception as e:
1854
# print "That didn't work!"
1855
# print e
1856
return None
1857
#print "Correct encoding: %s" % proposed
1858
return self.markup
1859
1860
def _toUnicode(self, data, encoding):
    """Decode *data* to Unicode using *encoding* (a name recognized by
    encodings.aliases), honoring any leading byte-order mark.

    A recognized BOM overrides the caller-supplied encoding and is
    stripped before decoding.
    """
    if len(data) >= 4 and data[:2] == '\xfe\xff' \
            and data[2:4] != '\x00\x00':
        encoding, data = 'utf-16be', data[2:]
    elif len(data) >= 4 and data[:2] == '\xff\xfe' \
            and data[2:4] != '\x00\x00':
        encoding, data = 'utf-16le', data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        encoding, data = 'utf-8', data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        encoding, data = 'utf-32be', data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        encoding, data = 'utf-32le', data[4:]
    return text_type(data, encoding)
def _detectEncoding(self, xml_data, isHTML=False):
1886
"""Given a document, tries to detect its XML encoding."""
1887
xml_encoding = sniffed_xml_encoding = None
1888
try:
1889
if xml_data[:4] == '\x4c\x6f\xa7\x94':
1890
# EBCDIC
1891
xml_data = self._ebcdic_to_ascii(xml_data)
1892
elif xml_data[:4] == '\x00\x3c\x00\x3f':
1893
# UTF-16BE
1894
sniffed_xml_encoding = 'utf-16be'
1895
xml_data = text_type(xml_data, 'utf-16be').encode('utf-8')
1896
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1897
and (xml_data[2:4] != '\x00\x00'):
1898
# UTF-16BE with BOM
1899
sniffed_xml_encoding = 'utf-16be'
1900
xml_data = text_type(xml_data[2:], 'utf-16be').encode('utf-8')
1901
elif xml_data[:4] == '\x3c\x00\x3f\x00':
1902
# UTF-16LE
1903
sniffed_xml_encoding = 'utf-16le'
1904
xml_data = text_type(xml_data, 'utf-16le').encode('utf-8')
1905
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1906
(xml_data[2:4] != '\x00\x00'):
1907
# UTF-16LE with BOM
1908
sniffed_xml_encoding = 'utf-16le'
1909
xml_data = text_type(xml_data[2:], 'utf-16le').encode('utf-8')
1910
elif xml_data[:4] == '\x00\x00\x00\x3c':
1911
# UTF-32BE
1912
sniffed_xml_encoding = 'utf-32be'
1913
xml_data = text_type(xml_data, 'utf-32be').encode('utf-8')
1914
elif xml_data[:4] == '\x3c\x00\x00\x00':
1915
# UTF-32LE
1916
sniffed_xml_encoding = 'utf-32le'
1917
xml_data = text_type(xml_data, 'utf-32le').encode('utf-8')
1918
elif xml_data[:4] == '\x00\x00\xfe\xff':
1919
# UTF-32BE with BOM
1920
sniffed_xml_encoding = 'utf-32be'
1921
xml_data = text_type(xml_data[4:], 'utf-32be').encode('utf-8')
1922
elif xml_data[:4] == '\xff\xfe\x00\x00':
1923
# UTF-32LE with BOM
1924
sniffed_xml_encoding = 'utf-32le'
1925
xml_data = text_type(xml_data[4:], 'utf-32le').encode('utf-8')
1926
elif xml_data[:3] == '\xef\xbb\xbf':
1927
# UTF-8 with BOM
1928
sniffed_xml_encoding = 'utf-8'
1929
xml_data = text_type(xml_data[3:], 'utf-8').encode('utf-8')
1930
else:
1931
sniffed_xml_encoding = 'ascii'
1932
pass
1933
except:
1934
xml_encoding_match = None
1935
xml_encoding_match = re.compile(
1936
r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
1937
if not xml_encoding_match and isHTML:
1938
regexp = re.compile(r'<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
1939
xml_encoding_match = regexp.search(xml_data)
1940
if xml_encoding_match is not None:
1941
xml_encoding = xml_encoding_match.groups()[0].lower()
1942
if isHTML:
1943
self.declaredHTMLEncoding = xml_encoding
1944
if sniffed_xml_encoding and \
1945
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1946
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1947
'utf-16', 'utf-32', 'utf_16', 'utf_32',
1948
'utf16', 'u16')):
1949
xml_encoding = sniffed_xml_encoding
1950
return xml_data, xml_encoding, sniffed_xml_encoding
1951
1952
1953
def find_codec(self, charset):
    """Resolve *charset* to a Python codec name.

    Tries, in order: the CHARSET_ALIASES translation, the name with
    dashes removed, and the name with dashes turned into underscores.
    Falls back to returning *charset* itself when nothing matches.
    """
    candidates = [self.CHARSET_ALIASES.get(charset, charset)]
    if charset:
        candidates.append(charset.replace("-", ""))
        candidates.append(charset.replace("-", "_"))
    for candidate in candidates:
        codec = self._codec(candidate)
        if codec:
            return codec
    return charset
def _codec(self, charset):
1960
if not charset: return charset
1961
codec = None
1962
try:
1963
codecs.lookup(charset)
1964
codec = charset
1965
except (LookupError, ValueError):
1966
pass
1967
return codec
1968
1969
EBCDIC_TO_ASCII_MAP = None
1970
def _ebcdic_to_ascii(self, s):
1971
c = self.__class__
1972
if not c.EBCDIC_TO_ASCII_MAP:
1973
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1974
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1975
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1976
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1977
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1978
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1979
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1980
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1981
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1982
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1983
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1984
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1985
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1986
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1987
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1988
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1989
250,251,252,253,254,255)
1990
import string
1991
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1992
''.join(map(chr, xrange(256))), ''.join(map(chr, emap)))
1993
return s.translate(c.EBCDIC_TO_ASCII_MAP)
1994
1995
# Map of windows-1252 "smart" characters (0x80-0x9f) to replacements.
# Tuple values are (HTML entity name, hex code point) consumed by
# _subMSChar; plain-string values are literal substitutions for bytes
# with no good equivalent.  Fix: '\x9f' (Yuml) previously had an empty
# code point, which produced the invalid entity '&#x;' in the default
# smartQuotesTo='xml' mode; Yuml is U+0178.
MS_CHARS = { '\x80' : ('euro', '20AC'),
             '\x81' : ' ',
             '\x82' : ('sbquo', '201A'),
             '\x83' : ('fnof', '192'),
             '\x84' : ('bdquo', '201E'),
             '\x85' : ('hellip', '2026'),
             '\x86' : ('dagger', '2020'),
             '\x87' : ('Dagger', '2021'),
             '\x88' : ('circ', '2C6'),
             '\x89' : ('permil', '2030'),
             '\x8A' : ('Scaron', '160'),
             '\x8B' : ('lsaquo', '2039'),
             '\x8C' : ('OElig', '152'),
             '\x8D' : '?',
             '\x8E' : ('#x17D', '17D'),
             '\x8F' : '?',
             '\x90' : '?',
             '\x91' : ('lsquo', '2018'),
             '\x92' : ('rsquo', '2019'),
             '\x93' : ('ldquo', '201C'),
             '\x94' : ('rdquo', '201D'),
             '\x95' : ('bull', '2022'),
             '\x96' : ('ndash', '2013'),
             '\x97' : ('mdash', '2014'),
             '\x98' : ('tilde', '2DC'),
             '\x99' : ('trade', '2122'),
             '\x9a' : ('scaron', '161'),
             '\x9b' : ('rsaquo', '203A'),
             '\x9c' : ('oelig', '153'),
             '\x9d' : '?',
             '\x9e' : ('#x17E', '17E'),
             '\x9f' : ('Yuml', '178'),}
#######################################################################


# When run as a script, read HTML on stdin and pretty-print it.
if __name__ == '__main__':
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())