Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/element.py
811 views
1
# Use of this source code is governed by the MIT license.
2
__license__ = "MIT"
3
4
try:
5
from collections.abc import Callable # Python 3.6
6
except ImportError as e:
7
from collections import Callable
8
import re
9
import sys
10
import warnings
11
try:
12
import soupsieve
13
except ImportError as e:
14
soupsieve = None
15
warnings.warn(
16
'The soupsieve package is not installed. CSS selectors cannot be used.'
17
)
18
19
from bs4.formatter import (
20
Formatter,
21
HTMLFormatter,
22
XMLFormatter,
23
)
24
25
# Default encoding name used by this module.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the running interpreter's major version is greater than 2.
PY3K = sys.version_info[0] > 2

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: Not used as of 4.7.0. Kept for a while in case someone
# imported it for their own use.
whitespace_re = re.compile(r"\s+")
33
34
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    :param attr: Name of the real attribute that the alias forwards to.
    :return: A property whose getter reads `attr` and whose setter
        writes `attr` on the owning object.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter receives the value being assigned; pass it
        # through to the real attribute.
        setattr(self, attr, value)
    return alias
44
45
46
# These encodings are recognized by Python (so PageElement.encode
47
# could theoretically support them) but XML and HTML don't recognize
48
# them (so they should not show up in an XML or HTML document as that
49
# document's encoding).
50
#
51
# If an XML document is encoded in one of these encodings, no encoding
52
# will be mentioned in the XML declaration. If an HTML document is
53
# encoded in one of these encodings, and the HTML document has a
54
# <meta> tag that mentions an encoding, the encoding will be given as
55
# the empty string.
56
#
57
# Source:
58
# https://docs.python.org/3/library/codecs.html#python-specific-encodings
59
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "raw-unicode-escape",
    "string_escape",
    "string-escape",
    "undefined",
    "unicode_escape",
    "unicode-escape",
}
73
74
75
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the prefix ('xml'),
    the name ('lang') and the namespace it was built from.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string value and attach its components.

        :param prefix: The namespace prefix; may be None.
        :param name: The local name. Any falsy value is stored as None.
        :param namespace: Stored as-is on the new object.
        """
        # A falsy name denotes the default namespace, whose name "has no
        # value" per https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        obj = str.__new__(cls, text)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
97
98
class AttributeValueWithCharsetSubstitution(str):
    """Base class for str stand-ins that represent a character encoding
    named inside HTML markup.
    """
100
101
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        """Create the value and remember the text it was built from."""
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset to report when the document is encoded.

        :param encoding: Name of the encoding the document is being
            encoded to.
        :return: `encoding` itself, or the empty string if it is listed
            in PYTHON_SPECIFIC_ENCODINGS.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
120
121
122
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    yields one of these objects as the value of the 'content' attribute.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the charset name that follows it.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Return a substituting value only if a charset is mentioned.

        :param original_value: The attribute text.
        :return: A plain `str` when CHARSET_RE finds nothing to
            substitute; otherwise an instance of this class that
            remembers `original_value`.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original text with its charset name replaced.

        :param encoding: Name of the encoding the document is being
            encoded to.
        :return: The empty string if `encoding` is listed in
            PYTHON_SPECIFIC_ENCODINGS; otherwise the original value with
            each charset name swapped for `encoding`.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
149
150
151
class PageElement(object):
    """Holds the navigational information for one part of the page,
    i.e. where that part currently sits in the parse tree.

    NavigableString, Tag, etc. are all subclasses of PageElement.
    """
157
158
def setup(self, parent=None, previous_element=None, next_element=None,
159
previous_sibling=None, next_sibling=None):
160
"""Sets up the initial relations between this element and
161
other elements.
162
163
:param parent: The parent of this element.
164
165
:param previous_element: The element parsed immediately before
166
this one.
167
168
:param next_element: The element parsed immediately before
169
this one.
170
171
:param previous_sibling: The most recently encountered element
172
on the same level of the parse tree as this one.
173
174
:param previous_sibling: The next element to be encountered
175
on the same level of the parse tree as this one.
176
"""
177
self.parent = parent
178
179
self.previous_element = previous_element
180
if previous_element is not None:
181
self.previous_element.next_element = self
182
183
self.next_element = next_element
184
if self.next_element is not None:
185
self.next_element.previous_element = self
186
187
self.next_sibling = next_sibling
188
if self.next_sibling is not None:
189
self.next_sibling.previous_sibling = self
190
191
if (previous_sibling is None
192
and self.parent is not None and self.parent.contents):
193
previous_sibling = self.parent.contents[-1]
194
195
self.previous_sibling = previous_sibling
196
if previous_sibling is not None:
197
self.previous_sibling.next_sibling = self
198
199
def format_string(self, s, formatter):
    """Run a string through a formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of the
        standard formatters. None means `s` is returned untouched.
    :return: The result of the formatter's `substitute` call on `s`.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        chosen = formatter
    else:
        chosen = self.formatter_for_name(formatter)
    return chosen.substitute(s)
211
212
def formatter_for_name(self, formatter):
    """Resolve an identifier to a Formatter, creating one if needed.

    :param formatter: Can be a Formatter object (returned as-is), a
        callable (used as the entity substitution hook of a new
        XMLFormatter or HTMLFormatter), or any other value (used as a
        key into the REGISTRY of XMLFormatter or HTMLFormatter).
    :return: A Formatter. XML variants are used when `self._is_xml` is
        true, HTML variants otherwise.
    """
    if isinstance(formatter, Formatter):
        return formatter
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
231
232
@property
def _is_xml(self):
    """Is this element part of an XML tree or an HTML tree?

    formatter_for_name consults this to pick between XMLFormatter and
    HTMLFormatter. The answer may require walking up through the
    parents, so it can be inefficient, but it should be needed rarely.
    """
    known = self.known_xml
    if known is not None:
        # Usually this was already determined when the document was
        # parsed.
        return known

    # Otherwise the element was probably created by calling the
    # constructor directly from user code.
    if self.parent is not None:
        return self.parent._is_xml

    # Top-level object without .known_xml: fall back to .is_xml, and
    # guess HTML if that is missing too.
    return getattr(self, 'is_xml', False)
254
255
nextSibling = _alias("next_sibling")  # BS3 camelCase name for next_sibling
256
previousSibling = _alias("previous_sibling")  # BS3 camelCase name for previous_sibling
257
258
def replace_with(self, replace_with):
    """Swap this PageElement for another one, leaving the rest of the
    tree as it was.

    :param replace_with: A PageElement.
    :return: `self`, no longer part of the tree. Nothing is returned
        (and nothing changes) when `replace_with` is `self`.
    :raise ValueError: If this element has no parent, or if
        `replace_with` is this element's parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if replace_with is self:
        return
    if replace_with is parent:
        raise ValueError("Cannot replace a Tag with its parent.")
    slot = parent.index(self)
    self.extract(_self_index=slot)
    parent.insert(slot, replace_with)
    return self
replaceWith = replace_with # BS3
279
280
def unwrap(self):
    """Replace this PageElement with its contents.

    The element is extracted from its parent and each of its children
    is inserted into the parent at the position the element occupied.

    :return: `self`, no longer part of the tree.
    :raise ValueError: If this element has no parent.
    """
    my_parent = self.parent
    if my_parent is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Walk a reversed copy so that repeated inserts at the same index
    # leave the children in their original order.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self
replace_with_children = unwrap
replaceWithChildren = unwrap # BS3
297
298
def wrap(self, wrap_inside):
    """Put this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, now sitting where `self` used to be in the
        tree, with `self` appended to it.
    """
    detached = self.replace_with(wrap_inside)
    wrap_inside.append(detached)
    return wrap_inside
308
309
def extract(self, _self_index=None):
    """Destructively rips this element out of the tree.

    :param _self_index: The location of this element in its parent's
        .contents, if known. Supplying it avoids a lookup.

    :return: `self`, no longer part of the tree.
    """
    parent = self.parent
    if parent is not None:
        if _self_index is None:
            _self_index = parent.index(self)
        del parent.contents[_self_index]

    # Find the two elements that would be adjacent if this element
    # (and any children) had never been parsed, and connect them.
    last_child = self._last_descendant()
    following = last_child.next_element
    preceding = self.previous_element

    if preceding is not None and preceding is not following:
        preceding.next_element = following
    if following is not None and following is not preceding:
        following.previous_element = preceding
    self.previous_element = None
    last_child.next_element = None

    self.parent = None

    # Close the gap in the sibling chain the same way.
    earlier = self.previous_sibling
    later = self.next_sibling
    if earlier is not None and earlier is not later:
        earlier.next_sibling = later
    if later is not None and later is not earlier:
        later.previous_sibling = earlier
    self.previous_sibling = self.next_sibling = None
    return self
346
347
def _last_descendant(self, is_initialized=True, accept_self=True):
    """Finds the last element beneath this object to be parsed.

    :param is_initialized: Has `setup` been called on this PageElement
        yet? If so, and there is a next sibling, the answer is read
        from that sibling's `previous_element`.
    :param accept_self: Is `self` an acceptable answer to the question?
        If not, None is returned where `self` would have been.
    """
    if is_initialized and self.next_sibling is not None:
        candidate = self.next_sibling.previous_element
    else:
        # Descend through the last child of each Tag until reaching
        # something with no contents.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]
    if candidate is self and not accept_self:
        return None
    return candidate
# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant
365
366
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
       in `self.children` by the new PageElement.
    :param new_child: A PageElement. A plain string is converted to a
       NavigableString first; a BeautifulSoup object has its children
       inserted one by one instead of being inserted itself.
    :raise ValueError: If `new_child` is None or is `self`.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # One BeautifulSoup object must never contain another, so its
        # children are inserted one at a time instead.
        for shift, subchild in enumerate(list(new_child.contents)):
            self.insert(position + shift, subchild)
        return

    position = min(position, len(self.contents))
    if getattr(new_child, 'parent', None) is not None:
        # The element already lives in a tree. If it is one of our own
        # children and sits before the target slot, extracting it will
        # shift the target index down by one.
        if new_child.parent is self and self.index(new_child) < position:
            position -= 1
        new_child.extract()

    new_child.parent = self
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        preceding = self.contents[position - 1]
        new_child.previous_sibling = preceding
        preceding.next_sibling = new_child
        new_child.previous_element = preceding._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    tail = new_child._last_descendant(False)

    if position >= len(self.contents):
        new_child.next_sibling = None

        # Climb the ancestors until one of them has a next sibling;
        # that sibling is what comes next in the document. If none
        # does, the new child's last element ends the document.
        ancestor = self
        following = None
        while following is None and ancestor is not None:
            following = ancestor.next_sibling
            ancestor = ancestor.parent
        tail.next_element = following
    else:
        following_child = self.contents[position]
        new_child.next_sibling = following_child
        if following_child is not None:
            following_child.previous_sibling = new_child
        tail.next_element = following_child

    if tail.next_element is not None:
        tail.next_element.previous_element = tail
    self.contents.insert(position, new_child)
447
448
def append(self, tag):
    """Add the given PageElement to the end of this one's contents.

    :param tag: A PageElement.
    """
    end = len(self.contents)
    self.insert(end, tag)
454
455
def extend(self, tags):
    """Appends the given PageElements to this one's contents.

    :param tags: An iterable of PageElements.
    """
    # Appending an element may remove it from wherever it currently
    # lives. If `tags` is such a live collection (for example another
    # element's .contents), iterating it directly would skip items, so
    # take a snapshot first.
    for tag in list(tags):
        self.append(tag)
462
463
def insert_before(self, *args):
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements end up with the same parent as this one, placed
    directly before it.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is
        itself among `args`.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")
    for predecessor in args:
        # Extract first so that the index is still right if the
        # predecessor is currently one of our siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        parent.insert(parent.index(self), predecessor)
484
485
def insert_after(self, *args):
    """Makes the given element(s) the immediate successor of this one.

    The elements end up with the same parent as this one, placed
    directly after it in the order given.

    :param args: One or more PageElements.
    :raise ValueError: If this element has no parent, or if it is
        itself among `args`.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    for offset, successor in enumerate(args):
        # Extract first so that the index is still right if the
        # successor is currently one of our siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        anchor = parent.index(self)
        parent.insert(anchor + 1 + offset, successor)
510
511
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first PageElement after this one in the document
    that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_next
    return self._find_one(finder, name, attrs, text, **kwargs)
findNext = find_next  # BS3
527
528
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Return every PageElement after this one in the document that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    candidates = self.next_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllNext = find_all_next  # BS3
546
547
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest later sibling of this PageElement that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_next_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findNextSibling = find_next_sibling  # BS3
564
565
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Return every later sibling of this PageElement that matches
    the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.next_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findNextSiblings = find_next_siblings   # BS3
fetchNextSiblings = find_next_siblings  # BS2
585
586
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Search backwards through the document from this PageElement
    and return the first PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_all_previous
    return self._find_one(finder, name, attrs, text, **kwargs)
findPrevious = find_previous  # BS3
603
604
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
    """Search backwards through the document from this PageElement
    and return every PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_elements
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAllPrevious = find_all_previous  # BS3
fetchPrevious = find_all_previous    # BS2
624
625
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Return the nearest earlier sibling of this PageElement that
    matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    finder = self.find_previous_siblings
    return self._find_one(finder, name, attrs, text, **kwargs)
findPreviousSibling = find_previous_sibling  # BS3
642
643
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Return every earlier sibling of this PageElement that matches
    the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    candidates = self.previous_siblings
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findPreviousSiblings = find_previous_siblings   # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
663
664
def find_parent(self, name=None, attrs={}, **kwargs):
    """Return the closest parent of this PageElement that matches the
    given criteria, or None if there is no match.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # NOTE: _find_one can't be used here because find_parents takes a
    # different set of arguments.
    matches = self.find_parents(name, attrs, 1, **kwargs)
    return matches[0] if matches else None
findParent = find_parent  # BS3
686
687
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Find all parents of this PageElement that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.

    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Parents are matched without a text filter, hence the None.
    return self._find_all(name, attrs, None, limit, self.parents,
                          **kwargs)
findParents = find_parents # BS3
fetchParents = find_parents # BS2
705
706
@property
def next(self):
    """The PageElement parsed immediately after this one, if there
    is one; a read-only view of `next_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.next_element
714
715
@property
def previous(self):
    """The PageElement parsed immediately before this one, if there
    is one; a read-only view of `previous_element`.

    :return: A PageElement.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.previous_element
723
724
#These methods do the real heavy lifting.
725
726
def _find_one(self, method, name, attrs, text, **kwargs):
    """Call a find-all style method with a limit of 1 and return its
    single result.

    :param method: The callable to invoke as
        `method(name, attrs, text, 1, **kwargs)`.
    :return: The first item of what `method` returned, or None if
        that result is empty.
    """
    matches = method(name, attrs, text, 1, **kwargs)
    return matches[0] if matches else None
732
733
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
    """Iterates over a generator looking for things that match.

    :param name: A filter on tag name, or a SoupStrainer to use as-is.
    :param attrs: A dictionary of filters on attribute values.
    :param text: A filter on text. When it is None, a 'string' keyword
        argument is used in its place.
    :param limit: Stop looking after finding this many results.
    :param generator: The elements to examine, in order.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of the matches.
    """
    if text is None and 'string' in kwargs:
        text = kwargs.pop('string')

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, text, **kwargs)

    if text is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(':') == 1:
                # This is a name with a prefix. If this is a
                # namespace-aware document, we need to match the local
                # name against tag.name. If not, we need to match the
                # fully-qualified name against tag.name.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The name tests are grouped under the isinstance check so
            # that only Tags can ever be selected by this fast path.
            result = (element for element in generator
                      if isinstance(element, Tag)
                      and (
                          element.name == name
                          or (
                              element.name == local_name
                              and (prefix is None
                                   or element.prefix == prefix)
                          )
                      ))
            return ResultSet(strainer, result)

    # General case: let the strainer judge each (truthy) element.
    results = ResultSet(strainer)
    for element in generator:
        if element:
            found = strainer.search(element)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
784
785
#These generators can be used to navigate starting from both
786
#NavigableStrings and Tags.
787
@property
def next_elements(self):
    """Every PageElement that was parsed after this one, following
    the `next_element` links in order.

    :yield: A sequence of PageElements.
    """
    node = self.next_element
    while node is not None:
        yield node
        node = node.next_element
797
798
@property
def next_siblings(self):
    """Every sibling of this PageElement that was parsed later,
    following the `next_sibling` links in order.

    :yield: A sequence of PageElements.
    """
    node = self.next_sibling
    while node is not None:
        yield node
        node = node.next_sibling
809
810
@property
def previous_elements(self):
    """Every PageElement that was parsed before this one, following
    the `previous_element` links (nearest first).

    :yield: A sequence of PageElements.
    """
    node = self.previous_element
    while node is not None:
        yield node
        node = node.previous_element
820
821
@property
def previous_siblings(self):
    """Every sibling of this PageElement that was parsed earlier,
    following the `previous_sibling` links (nearest first).

    :yield: A sequence of PageElements.
    """
    node = self.previous_sibling
    while node is not None:
        yield node
        node = node.previous_sibling
832
833
@property
def parents(self):
    """Every PageElement that is a parent of this one, following the
    `parent` links (nearest first).

    :yield: A sequence of PageElements.
    """
    node = self.parent
    while node is not None:
        yield node
        node = node.parent
843
844
@property
def decomposed(self):
    """Check whether a PageElement has been decomposed.

    Reads the `_decomposed` attribute; a missing or falsy value is
    reported as False.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    return flag if flag else False
851
852
# Old non-property versions of the generators, for backwards
853
# compatibility with BS3.
854
def nextGenerator(self):
    """BS3 method form of the `next_elements` property."""
    return self.next_elements

def nextSiblingGenerator(self):
    """BS3 method form of the `next_siblings` property."""
    return self.next_siblings

def previousGenerator(self):
    """BS3 method form of the `previous_elements` property."""
    return self.previous_elements

def previousSiblingGenerator(self):
    """BS3 method form of the `previous_siblings` property."""
    return self.previous_siblings

def parentGenerator(self):
    """BS3 method form of the `parents` property."""
    return self.parents
868
869
870
class NavigableString(str, PageElement):
    """A Python Unicode string that is part of a parse tree.

    For the markup <b>penguin</b>, Beautiful Soup creates a
    NavigableString for the string "penguin".
    """

    PREFIX = ''
    SUFFIX = ''

    # Looking at a string alone cannot reveal whether it belongs to an
    # XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding has to be
        handed to the superclass's __new__, or the superclass won't
        know how to handle non-ASCII characters.
        """
        # Only non-str input needs the encoding argument.
        decode_args = () if isinstance(value, str) else (
            DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *decode_args)
        instance.setup()
        return instance

    def __copy__(self):
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.
        """
        return type(self)(self)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: The formatted string between PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Since a NavigableString is not a Tag, it has no .name.

        The property exists so that code like this doesn't crash when
        run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Prevent NavigableString.name from ever being set."""
        raise AttributeError("A NavigableString cannot be given a name.")
942
943
944
class PreformattedString(NavigableString):
    """A NavigableString that bypasses the normal formatting rules.

    This is an abstract base for special kinds of strings, such as
    comments (the Comment class) and CDATA blocks (the CData class).
    Subclasses set PREFIX and SUFFIX to the markup that surrounds the
    string on output.
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Return this string wrapped in the subclass-specific prefix
        and suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. When given, the string is run
            through it, but only to trigger any side effects: the
            formatted result is discarded.

        :return: The unformatted string with PREFIX and SUFFIX added.
        """
        if formatter is not None:
            # Called purely for side effects; the result is not used.
            self.format_string(self, formatter)
        with_prefix = self.PREFIX + self
        return with_prefix + self.SUFFIX
970
971
class CData(PreformattedString):
    """A CDATA block, output between '<![CDATA[' and ']]>'."""
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
975
976
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, output between '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
981
982
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, output between '<?' and '?>'."""
    PREFIX = '<?'
    SUFFIX = '?>'
986
987
class Comment(PreformattedString):
    """An HTML or XML comment, output between '<!--' and '-->'."""
    PREFIX = '<!--'
    SUFFIX = '-->'
991
992
993
class Declaration(PreformattedString):
    """An XML declaration, output between '<?' and '?>'."""
    PREFIX = '<?'
    SUFFIX = '?>'
997
998
999
class Doctype(PreformattedString):
    """A document type declaration, output between '<!DOCTYPE ' and
    '>' followed by a newline."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A false value is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: An instance of the class the method was called on
            (Doctype itself, or a subclass).
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # With a public ID present, the system ID is appended
            # without a keyword of its own.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        # Build through `cls` rather than naming Doctype, so that a
        # subclass calling this alternate constructor gets an instance
        # of itself.
        return cls(value)
1026
1027
1028
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    """
    pass
1035
1036
1037
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    JavaScript).

    Used to distinguish executable code from textual content.
    """
    pass
1044
1045
1046
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    """
    pass
1053
1054
1055
class Tag(PageElement):
1056
"""Represents an HTML or XML tag that is part of a parse tree, along
1057
with its attributes and contents.
1058
1059
When Beautiful Soup parses the markup <b>penguin</b>, it will
1060
create a Tag object representing the <b> tag.
1061
"""
1062
1063
def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None,
             is_xml=None, sourceline=None, sourcepos=None,
             can_be_empty_element=None, cdata_list_attributes=None,
             preserve_whitespace_tags=None
             ):
    """Basic constructor.

    :param parser: A BeautifulSoup object. Only its class is kept.
    :param builder: A TreeBuilder. When given, it decides the values
        of known_xml, can_be_empty_element, cdata_list_attributes and
        preserve_whitespace_tags, overriding the matching arguments.
    :param name: The name of the tag. Required.
    :param namespace: The URI of this Tag's XML namespace, if any.
    :param prefix: The prefix for this Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values.
    :param parent: The PageElement to use as this Tag's parent.
    :param previous: The PageElement that was parsed immediately before
        this tag.
    :param is_xml: If True, this is an XML tag. Otherwise, this is an
        HTML tag. Only consulted when no builder is given.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param can_be_empty_element: If True, this tag should be
        represented as <tag/>. If False, this tag should be represented
        as <tag></tag>.
    :param cdata_list_attributes: A list of attributes whose values should
        be treated as CDATA if they ever show up on this tag.
    :param preserve_whitespace_tags: A list of tag names whose contents
        should have their whitespace preserved.
    :raise ValueError: If `name` is None.
    """
    # Only the parser's class is remembered, never the parser object
    # itself: that lets extracted chunks be garbage-collected.
    self.parser_class = None if parser is None else parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix

    # Position attributes are only created when a position was given
    # and the builder (if there is one) stores line numbers.
    positions_allowed = not builder or builder.store_line_numbers
    if positions_allowed and (
            sourceline is not None or sourcepos is not None):
        self.sourceline = sourceline
        self.sourcepos = sourcepos

    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        # Let the builder rewrite the attribute values it treats as lists.
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        # Work on our own copy of the caller's mapping.
        attrs = dict(attrs)

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    self.known_xml = builder.is_xml if builder else is_xml
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is not None:
        # Set up any substitutions for this tag, such as the charset
        # in a META tag.
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an
        # empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep the builder's whole collection of list-valued
        # attributes. For performance reasons the data structure is
        # stored rather than queried per tag: asking would mean
        # building a new structure every time, and (unlike
        # can_be_empty_element) it is almost never needed.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags
    else:
        # Without a TreeBuilder, use whatever values were passed in.
        # They're probably None, unless this is a copy of some other
        # tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
1158
1159
parserClass = _alias("parser_class") # BS3
1160
1161
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.

    The clone is built by calling this object's own class with the
    same name, namespace, prefix, attributes and formatting settings.
    Its contents are copies of the old Tag's contents, each produced
    by the child's own __copy__().

    :return: The new, detached object.
    """
    def plain_attribute(attr_name):
        # Tag.__getattr__ answers any unknown attribute name with
        # self.find(attr_name). `builder`, `sourceline` and `sourcepos`
        # are not always set on the instance, so a plain `self.builder`
        # could return a child tag that happens to be called <builder>.
        # Use the normal lookup without that fallback and treat a
        # missing attribute as None.
        try:
            return object.__getattribute__(self, attr_name)
        except AttributeError:
            return None

    clone = type(self)(
        None, plain_attribute('builder'), self.name, self.namespace,
        self.prefix, self.attrs, is_xml=self._is_xml,
        sourceline=plain_attribute('sourceline'),
        sourcepos=plain_attribute('sourcepos'),
        can_be_empty_element=self.can_be_empty_element,
        cdata_list_attributes=self.cdata_list_attributes,
        preserve_whitespace_tags=self.preserve_whitespace_tags
    )
    # Carry these over explicitly: the constructor may have derived
    # different values.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
1178
1179
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    For a tag with no contents, the answer is whatever
    `can_be_empty_element` holds. That value depends on the builder
    used to create the tag: if the builder has a designated list of
    empty-element tags, only a tag whose name is in that list counts;
    if it has no such list, any tag with no contents is an
    empty-element tag.
    """
    if len(self.contents) != 0:
        return False
    return self.can_be_empty_element
1195
isSelfClosing = is_empty_element # BS3
1196
1197
@property
def string(self):
    """Convenience property to get the single string within this
    PageElement.

    :return: If this element has a single string child, that string.
        If its only child is a tag, that child's own `string`
        attribute (so the lookup recurses). If this element has no
        children or more than one child, None.
    """
    if len(self.contents) != 1:
        return None
    only_child = self.contents[0]
    if not isinstance(only_child, NavigableString):
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this PageElement's contents with `string`.

    All current children are removed with clear(); a new object of
    the same class as `string`, built from `string`, is then appended.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
1223
1224
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    :param strip: If True, all strings will be stripped before being
        yielded, and strings that are empty after stripping are
        skipped.

    :param types: A tuple of NavigableString subclasses. A descendant
        is kept only when its exact class is in the tuple. By default
        this means only NavigableString and CData objects are
        considered. So no comments, processing instructions, etc.
        If None, every NavigableString instance is kept.

    :yield: A sequence of strings.
    """
    for node in self.descendants:
        if types is None:
            if not isinstance(node, NavigableString):
                continue
        elif type(node) not in types:
            # Exact class match: subclasses of a listed class are
            # not accepted.
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node
1249
1250
strings = property(_all_strings)
1251
1252
@property
def stripped_strings(self):
    """Yield every string from _all_strings() with stripping enabled.

    :yield: A sequence of stripped strings.
    """
    for stripped in self._all_strings(True):
        yield stripped
1260
1261
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Get all child strings, concatenated using the given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any strings of
        a subclass not found in this list will be ignored. By
        default, this means only NavigableString and CData objects
        will be considered. So no comments, processing instructions,
        stylesheets, etc.

    :return: A string.
    """
    selected = self._all_strings(strip, types=types)
    return separator.join(selected)
1280
getText = get_text
1281
text = property(get_text)
1282
1283
def decompose(self):
    """Recursively destroys this PageElement and its children.

    This element will be removed from the tree and wiped out; so
    will everything beneath it.

    The behavior of a decomposed PageElement is undefined and you
    should never use one for anything, but if you need to _check_
    whether an element has been decomposed, you can use the
    `decomposed` property.
    """
    self.extract()
    node = self
    while node is not None:
        # Read the link before the wipe below erases it.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node._decomposed = True
        node = following
1302
1303
def clear(self, decompose=False):
    """Wipe out all children of this PageElement by calling extract()
    on them.

    :param decompose: If this is True, decompose() (a more
        destructive method) will be called on child Tags instead of
        extract(). Children that are not Tags are always extract()ed.
    """
    destroy = True if decompose else False
    # Iterate over a snapshot: the loop body is expected to shrink
    # self.contents.
    for child in self.contents[:]:
        if destroy and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
1319
1320
def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree. Child Tags are smoothed
    recursively. PreformattedString objects are never merged.
    """
    def is_plain_string(node):
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the first position of every adjacent pair that needs
    # merging, rather than copying self.contents: in most cases very
    # few strings are affected.
    merge_points = []
    for position, current in enumerate(self.contents):
        if isinstance(current, Tag):
            # Recursively smooth children.
            current.smooth()
        if position == len(self.contents) - 1:
            # Last item in .contents: it has no right-hand neighbour
            # to merge with.
            continue
        following = self.contents[position + 1]
        if is_plain_string(current) and is_plain_string(following):
            merge_points.append(position)

    # Walk the recorded positions backwards, so that removing items
    # from .contents doesn't shift the positions still to be handled.
    for position in reversed(merge_points):
        left = self.contents[position]
        right = self.contents[position + 1]
        right.extract()
        merged = NavigableString(left + right)
        left.replace_with(merged)
1357
1358
def index(self, element):
    """Find the index of a child by identity, not value.

    Avoids issues with tag.contents.index(element) getting the
    index of equal elements.

    :param element: Look for this PageElement in `self.contents`.
    :return: The position of `element` within `self.contents`.
    :raise ValueError: If no child is the very same object as `element`.
    """
    for position, candidate in enumerate(self.contents):
        if candidate is element:
            return position
    raise ValueError("Tag.index: element not in tag")
1370
1371
def get(self, key, default=None):
    """Return the value of the tag's `key` attribute.

    :param key: The attribute to look for.
    :param default: Returned instead when the tag has no such
        attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)
1376
1377
def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this PageElement.
    :return: A list of values, probably containing only a single
        value. A value that is already a list is returned as is.
    """
    found = self.get(key, default)
    if isinstance(found, list):
        return found
    return [found]
1390
1391
def has_attr(self, key):
    """Does this PageElement have an attribute with the given name?

    :param key: The attribute name to look up in `self.attrs`.
    :return: True or False.
    """
    present = key in self.attrs
    return present
1394
1395
def __hash__(self):
    """Hash a Tag by the hash of its str() rendering."""
    return str(self).__hash__()
1397
1398
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the Tag,
    and throws an exception if it's not there.

    The lookup is a plain `self.attrs[key]`.
    """
    return self.attrs[key]
1402
1403
def __iter__(self):
    "Iterating over a Tag iterates over its contents."
    # Direct children only: this is an iterator over self.contents.
    return iter(self.contents)
1406
1407
def __len__(self):
    "The length of a Tag is the length of its list of contents."
    # Counts direct children only.
    return len(self.contents)
1410
1411
def __contains__(self, x):
    """`x in tag` tests membership in the Tag's list of contents
    (its direct children), not in its attributes."""
    return x in self.contents
1413
1414
def __bool__(self):
    "A tag is non-None even if it has no contents."
    # Without this, truth testing would fall back to __len__ and an
    # empty tag would be falsy.
    return True
1417
1418
def __setitem__(self, key, value):
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag, adding the attribute or overwriting its current value."""
    self.attrs[key] = value
1422
1423
def __delitem__(self, key):
    "Deleting tag[key] deletes all 'key' attributes for the tag."
    # pop() with a default: deleting a missing attribute is a no-op
    # rather than a KeyError.
    self.attrs.pop(key, None)
1426
1427
def __call__(self, *args, **kwargs):
    """Calling a Tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag.

    All positional and keyword arguments are forwarded unchanged.
    """
    return self.find_all(*args, **kwargs)
1432
1433
def __getattr__(self, tag):
    """Calling tag.subtag is the same as calling tag.find(name="subtag")

    This is only reached for names that normal attribute lookup did
    not resolve.

    :param tag: The attribute name being looked up.
    :return: The result of find() for that name.
    :raise AttributeError: For names starting with a double
        underscore and for the name 'contents'.
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
                name=tag_name
            )
        )
        return self.find(tag_name)
    # "contents" is special-cased to avoid recursion: find() needs
    # self.contents, so a missing `contents` attribute must not be
    # answered with another find().
    if not tag.startswith("__") and tag != "contents":
        return self.find(tag)
    # Report the bare class name, matching the message
    # NavigableString.__getattr__ produces.
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (
            self.__class__.__name__, tag))
1450
1451
def __eq__(self, other):
    """Returns true iff this Tag has the same name, the same attributes,
    and the same contents (recursively) as `other`."""
    if self is other:
        return True
    # Anything lacking a Tag's basic attributes cannot be equal.
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False
    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False
    # Children are compared pairwise, by position.
    return not any(
        mine != other.contents[position]
        for position, mine in enumerate(self.contents))
1467
1468
def __ne__(self, other):
    """Returns true iff this Tag is not identical to `other`,
    as defined in __eq__."""
    same = self == other
    return not same
1472
1473
def __repr__(self, encoding="unicode-escape"):
    """Renders this PageElement as a string.

    :param encoding: The encoding to use (Python 2 only).
    :return: Under Python 2, a bytestring; under Python 3,
        a Unicode string.
    """
    if not PY3K:
        # "The return value must be a string object", i.e. a bytestring.
        # By convention, the return value of __repr__ should also be
        # an ASCII string.
        return self.encode(encoding)
    # "The return value must be a string object", i.e. Unicode
    return self.decode()
1488
1489
def __unicode__(self):
    """Renders this PageElement as a Unicode string.

    :return: The result of decode() with its default arguments.
    """
    return self.decode()
1492
1493
def __str__(self):
    """Renders this PageElement as a generic string.

    :return: Under Python 2, the bytestring produced by encode() with
        its default arguments; under Python 3, the Unicode string
        produced by decode().
    """
    if not PY3K:
        return self.encode()
    return self.decode()
1503
1504
if PY3K:
1505
__str__ = __repr__ = __unicode__
1506
1507
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render a bytestring representation of this PageElement and its
    contents.

    :param encoding: The destination encoding.
    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :param errors: An error handling strategy such as
        'xmlcharrefreplace'. This value is passed along into
        encode() and its value should be one of the constants
        defined by Python.
    :return: A bytestring.

    """
    # Render to Unicode first, telling decode() which encoding the
    # text is headed for, then encode the result.
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
1530
1531
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render a Unicode representation of this PageElement and its
    contents.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing. None turns
        pretty-printing off.
    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string. For a hidden element, only the
        rendered contents, without this element's own start and end
        tags.
    """

    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)
    attributes = formatter.attributes(self)
    # Build one 'key="value"' (or bare 'key') fragment per attribute.
    attrs = []
    for key, val in attributes:
        if val is None:
            # An attribute with no value is rendered as just its name.
            decoded = key
        else:
            if isinstance(val, list) or isinstance(val, tuple):
                # Multi-valued attributes are joined with spaces.
                val = ' '.join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None
            ):
                # The value gets to substitute in the eventual encoding.
                val = val.encode(eventual_encoding)

            text = formatter.attribute_value(val)
            decoded = (
                str(key) + '='
                + formatter.quoted_attribute_value(text))
        attrs.append(decoded)
    # `close` goes inside the start tag of an empty-element tag;
    # `closeTag` is the separate end tag of every other tag. At most
    # one of the two ends up non-empty.
    close = ''
    closeTag = ''

    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    space = ''
    indent_space = ''
    if indent_level is not None:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        # Passing None switches pretty-printing off for the contents.
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        s = contents
    else:
        s = []
        attribute_string = ''
        if attrs:
            attribute_string = ' ' + ' '.join(attrs)
        if indent_level is not None:
            # Even if this particular tag is not pretty-printed,
            # we should indent up to the start of the tag.
            s.append(indent_space)
        s.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, close))
        if pretty_print:
            s.append("\n")
        s.append(contents)
        if pretty_print and contents and contents[-1] != "\n":
            s.append("\n")
        if pretty_print and closeTag:
            s.append(space)
        s.append(closeTag)
        if indent_level is not None and closeTag and self.next_sibling:
            # Even if this particular tag is not pretty-printed,
            # we're now done with the tag, and we should add a
            # newline if appropriate.
            s.append("\n")
        s = ''.join(s)
    return s
1631
1632
def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?

    Most of them should, but some (such as <pre> in HTML
    documents) should not.

    :param indent_level: The indentation requested for this tag, or
        None when pretty-printing is off.
    :return: False when `indent_level` is None or when this tag's
        name is listed in `preserve_whitespace_tags`; True otherwise.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    return not preserved or self.name not in preserved
1645
1646
def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this PageElement as a string.

    :param encoding: The eventual encoding of the string. If this is None,
        a Unicode string will be returned.
    :param formatter: A Formatter object, or a string naming one of
        the standard formatters.
    :return: A Unicode string (if encoding==None) or a bytestring
        (otherwise).
    """
    # True is passed as the indent level in both calls, which turns
    # pretty-printing on.
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1660
1661
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing. None turns
        pretty-printing off.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a <META> tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: The concatenated rendering of every child.
    """
    # First off, turn a string formatter into a Formatter object. This
    # will stop the lookup from happening over and over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    s = []
    for c in self:
        # `text` stays None for child tags: their full rendering is
        # appended directly by the elif branch below.
        text = None
        if isinstance(c, NavigableString):
            text = c.output_ready(formatter)
        elif isinstance(c, Tag):
            s.append(c.decode(indent_level, eventual_encoding,
                              formatter))
        # True when this tag (the parent of `c`) is one whose
        # whitespace has to be left alone.
        preserve_whitespace = (
            self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
        )
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if text:
            # When pretty-printing, each string goes on its own
            # indented line.
            if pretty_print and not preserve_whitespace:
                s.append(" " * (indent_level - 1))
            s.append(text)
            if pretty_print and not preserve_whitespace:
                s.append("\n")
    return ''.join(s)
1706
1707
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Renders the contents of this PageElement as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many spaces. Used internally in
        recursive calls while pretty-printing.

    :param encoding: The bytestring will be in this encoding. It is
        also handed to decode_contents() as the eventual encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.

    :return: A bytestring.
    """
    rendered = self.decode_contents(indent_level, encoding, formatter)
    return rendered.encode(encoding)
1725
1726
# Old method for BS3 compatibility
1727
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Deprecated method for BS3 compatibility.

    Delegates to encode_contents(). `indentLevel` is only honoured
    when `prettyPrint` is true; otherwise None is passed as the
    indent level.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(
        indent_level=level, encoding=encoding)
1734
1735
#Soup methods
1736
1737
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Look in the children of this PageElement and find the first
    PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter on string content, passed through to
        find_all().
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None if nothing matched.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Ask find_all() for at most one result.
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    if matches:
        return matches[0]
    return None
1760
findChild = find #BS2
1761
1762
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Look in the children of this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param text: A filter on string content.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Search either everything beneath this element or only its
    # direct children.
    candidates = self.descendants if recursive else self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
1784
findAll = find_all # BS3
1785
findChildren = find_all # BS2
1786
1787
#Generator methods
1788
@property
def children(self):
    """Iterate over all direct children of this PageElement.

    :yield: A sequence of PageElements.
    """
    # An explicit iterator (rather than the list itself) makes the
    # purpose of the property clear.
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.
1796
1797
@property
def descendants(self):
    """Iterate over everything beneath this PageElement.

    Starting at the first child, elements are yielded by following
    `next_element` links, up to and including this element's last
    descendant.

    :yield: A sequence of PageElements.
    """
    if len(self.contents) == 0:
        return
    # The element just after the last descendant marks where to stop.
    stop_node = self._last_descendant().next_element
    node = self.contents[0]
    while node is not stop_node:
        yield node
        node = node.next_element
1811
1812
# CSS selector code
1813
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element and
    return only the first match.

    :param selector: A CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A Tag, or None if nothing matched.
    :rtype: bs4.element.Tag
    """
    # Delegate to select() with a limit of one result.
    matches = self.select(selector, namespaces, 1, **kwargs)
    return matches[0] if matches else None
1833
1834
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.
        None is passed to SoupSieve as 0.

    :param kwargs: Keyword arguments to be passed into SoupSieve's
        soupsieve.select() method.

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    :raise NotImplementedError: If the soupsieve package is not
        installed.
    """
    namespaces = self._namespaces if namespaces is None else namespaces
    limit = 0 if limit is None else limit

    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    matches = soupsieve.select(selector, self, namespaces, limit, **kwargs)

    # Wrap the matches in a ResultSet because it's more consistent
    # and because ResultSet.__getattr__ has a helpful error message.
    return ResultSet(None, matches)
1869
1870
# Old names for backwards compatibility
1871
def childGenerator(self):
    """Deprecated generator. Returns the value of `self.children`."""
    direct_children = self.children
    return direct_children
1874
1875
def recursiveChildGenerator(self):
    """Deprecated generator. Returns the value of `self.descendants`."""
    everything_below = self.descendants
    return everything_below
1878
1879
def has_key(self, key):
    """Deprecated method; emits a warning and defers to has_attr().

    The old name was misleading: has_key() looked at attributes,
    while the ``in`` operator (__contains__) looks at contents.
    has_key() is gone from Python 3 dictionaries anyway.

    :param key: The attribute name to look for.
    :return: Whatever ``self.has_attr(key)`` returns.
    """
    notice = 'has_key is deprecated. Use has_attr("%s") instead.' % key
    warnings.warn(notice)
    return self.has_attr(key)
1888
1889
# Next, a couple classes to represent queries and their results.
1890
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    # NOTE: the `attrs={}` default is never mutated (it is copied before
    # being updated), so the shared default object is harmless here.
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. Any
            non-dict value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            special name ``class_`` is translated to 'class'.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy first so the caller's dictionary is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search value into a form _matches() can work with.

        :param value: A filter value supplied by the user, or a piece
            of markup about to be compared against one.
        :return: `value` itself if it is a string, a callable, an object
            with a ``match`` attribute (e.g. a compiled regular
            expression), a boolean, or None; a decoded string if it is
            a bytestring; a list if it is any other iterable; otherwise
            ``str(value)``.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, (str, Callable))
            or hasattr(value, 'match')
            or isinstance(value, bool)
            or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__')
                    and not isinstance(v, (bytes, str))):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The text filter rendered as a string if one is set;
            otherwise ``"<name>|<attrs>"``.
        """
        if self.text:
            # self.text may be a compiled regular expression, True, a
            # list or a callable. __str__ must return an actual str, so
            # convert explicitly instead of returning the raw filter.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or an
            already-built Tag (in which case the Tag also supplies the
            attributes).
        :param markup_attrs: A dictionary of attributes as found in some
            markup. An iterable of (key, value) pairs is also accepted.

        :return: The Tag (or the tag name that was passed in) if the
            prospective tag would match this SoupStrainer; None
            otherwise.
        """
        found = None
        markup = None
        is_tag = isinstance(markup_name, Tag)
        if is_tag:
            markup = markup_name
            markup_attrs = markup

        # A callable name filter receives the raw (name, attrs) data when
        # no Tag object exists yet; with a real Tag it is applied through
        # _matches() instead.
        call_function_with_tag_data = (
            isinstance(self.name, Callable) and not is_tag)

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                if self.attrs:
                    # Build the attribute lookup once, and only when there
                    # is at least one attribute filter to check.
                    if hasattr(markup_attrs, 'get'):
                        markup_attr_map = markup_attrs
                    else:
                        markup_attr_map = {k: v for k, v in markup_attrs}
                    for attr, match_against in self.attrs.items():
                        attr_value = markup_attr_map.get(attr)
                        if not self._matches(attr_value, match_against):
                            match = False
                            break
            if match:
                found = markup if markup else markup_name

        # NOTE(review): if this is called with a plain string name while a
        # text filter is set, `found` is a str and `.string` would raise
        # AttributeError. Presumably callers never do that -- confirm.
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched.
        :raises Exception: If `markup` is not iterable, a Tag, or a
            string.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, str)):
            if (not self.name and not self.attrs
                and self._matches(markup, self.text)):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Check one piece of markup against one filter value.

        :param markup: A Tag, a string, None, or a list/tuple of strings
            (the value of a multi-valued attribute such as 'class').
        :param match_against: The filter: True, a callable, a string, an
            object with a ``search`` method (e.g. a compiled regular
            expression), an iterable of any of these, or a falsy value.
        :param already_tried: Items of an iterable filter that have
            already been tested; used internally to stop recursion.
        :return: A truthy value if `markup` satisfies `match_against`.
            Callable and regular-expression filters return whatever the
            callable or ``search()`` returns, not necessarily a bool.
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            return bool(self._matches(' '.join(markup), match_against))

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items (__hash__ is None) are tracked by identity.
                key = item if item.__hash__ else id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.

        # Exact string match.
        match = isinstance(match_against, str) and markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match.
            # NOTE(review): this returns immediately, so a regular
            # expression is never retried against the prefixed tag name
            # below. Kept as-is; confirm whether that is intended.
            return match_against.search(markup)

        if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name,
                match_against
            )

        return match
2144
2145
2146
class ResultSet(list):
    """A plain list of results that also remembers the SoupStrainer
    that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super(ResultSet, self).__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Only reached for attributes that normal lookup did not find.

        :param key: The name of the missing attribute.
        :raises AttributeError: Always.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)
2163
2164