CoCalc -- element.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/element.py
⁸¹¹ views
1
# Use of this source code is governed by the MIT license.
2
__license__ = "MIT"
3

4
try:
5
    from collections.abc import Callable # Python 3.6
6
except ImportError as e:
7
    from collections import Callable
8
import re
9
import sys
10
import warnings
11
try:
12
    import soupsieve
13
except ImportError as e:
14
    soupsieve = None
15
    warnings.warn(
16
        'The soupsieve package is not installed. CSS selectors cannot be used.'
17
    )
18

19
from bs4.formatter import (
20
    Formatter,
21
    HTMLFormatter,
22
    XMLFormatter,
23
)
24

25
DEFAULT_OUTPUT_ENCODING = "utf-8"
26
PY3K = (sys.version_info[0] > 2)
27

28
nonwhitespace_re = re.compile(r"\S+")
29

30
# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
31
# the off chance someone imported it for their own use.
32
whitespace_re = re.compile(r"\s+")
33

34
def _alias(attr):
35
    """Alias one attribute name to another for backward compatibility"""
36
    @property
37
    def alias(self):
38
        return getattr(self, attr)
39

40
    @alias.setter
41
    def alias(self):
42
        return setattr(self, attr)
43
    return alias
44

45

46
# These encodings are recognized by Python (so PageElement.encode
47
# could theoretically support them) but XML and HTML don't recognize
48
# them (so they should not show up in an XML or HTML document as that
49
# document's encoding).
50
#
51
# If an XML document is encoded in one of these encodings, no encoding
52
# will be mentioned in the XML declaration. If an HTML document is
53
# encoded in one of these encodings, and the HTML document has a
54
# <meta> tag that mentions an encoding, the encoding will be given as
55
# the empty string.
56
#
57
# Source:
58
# https://docs.python.org/3/library/codecs.html#python-specific-encodings
59
PYTHON_SPECIFIC_ENCODINGS = {
    # Underscore and hyphen spellings are both listed where Python
    # accepts both.
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "undefined",
    "raw_unicode_escape",
    "raw-unicode-escape",
    "unicode_escape",
    "unicode-escape",
    "string_escape",
    "string-escape",
}
73
    
74

75
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the pieces it was
    built from: the prefix ('xml'), the name ('lang') and the namespace.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string value and attach the parts to it.

        :param prefix: The namespace prefix, or None.
        :param name: The local name. Any false value is stored as None,
            which marks the default namespace; its name "has no value"
            per https://www.w3.org/TR/xml-names/#defaulting
        :param namespace: The namespace itself; stored untouched.
        """
        local_name = name or None

        if local_name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = local_name
        else:
            text = prefix + ":" + local_name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = local_name
        instance.namespace = namespace
        return instance
97

98
class AttributeValueWithCharsetSubstitution(str):
    """Base class for attribute values that stand in for a character
    encoding specified in HTML.

    It adds nothing to `str` itself; the subclasses in this module
    override `encode` to substitute the name of the output encoding.
    """
100

101
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' produces one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        """Create the string and remember the value found in the markup.

        :param original_value: The attribute value as it was parsed.
        """
        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the value this attribute takes when the document is
        encoded as `encoding`: the encoding's own name, or the empty
        string for an encoding listed in PYTHON_SPECIFIC_ENCODINGS.

        :param encoding: The name of the output encoding.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
120

121

122
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing markup such as:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    produces one of these objects as the value of 'content'.
    """

    # Group 1 captures everything up to and including 'charset=';
    # group 3 captures the encoding name that follows it.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the stand-in, or a plain `str` when the value carries
        no 'charset=' declaration and so needs no substitution.

        :param original_value: The attribute value as it was parsed.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with the declared charset replaced
        by `encoding`, or the empty string for an encoding listed in
        PYTHON_SPECIFIC_ENCODINGS.

        :param encoding: The name of the output encoding.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
149

150
    
151
class PageElement(object):
152
    """Contains the navigational information for some part of the page:
153
    that is, its current location in the parse tree.
154

155
    NavigableString, Tag, etc. are all subclasses of PageElement.
156
    """
157
   
158
    def setup(self, parent=None, previous_element=None, next_element=None,
159
              previous_sibling=None, next_sibling=None):
160
        """Sets up the initial relations between this element and
161
        other elements.
162

163
        :param parent: The parent of this element.
164

165
        :param previous_element: The element parsed immediately before
166
            this one.
167
        
168
        :param next_element: The element parsed immediately before
169
            this one.
170

171
        :param previous_sibling: The most recently encountered element
172
            on the same level of the parse tree as this one.
173

174
        :param previous_sibling: The next element to be encountered
175
            on the same level of the parse tree as this one.
176
        """
177
        self.parent = parent
178

179
        self.previous_element = previous_element
180
        if previous_element is not None:
181
            self.previous_element.next_element = self
182

183
        self.next_element = next_element
184
        if self.next_element is not None:
185
            self.next_element.previous_element = self
186

187
        self.next_sibling = next_sibling
188
        if self.next_sibling is not None:
189
            self.next_sibling.previous_sibling = self
190

191
        if (previous_sibling is None
192
            and self.parent is not None and self.parent.contents):
193
            previous_sibling = self.parent.contents[-1]
194

195
        self.previous_sibling = previous_sibling
196
        if previous_sibling is not None:
197
            self.previous_sibling.next_sibling = self
198

199
    def format_string(self, s, formatter):
        """Run a string through a formatter's substitution.

        :param s: A string.
        :param formatter: A Formatter object, anything accepted by
            `formatter_for_name`, or None to get `s` back unchanged.
        :return: The result of the formatter's `substitute` call.
        """
        if formatter is None:
            return s
        if isinstance(formatter, Formatter):
            resolved = formatter
        else:
            resolved = self.formatter_for_name(formatter)
        return resolved.substitute(s)
211

212
    def formatter_for_name(self, formatter):
        """Resolve an identifier to a Formatter, creating one if needed.

        :param formatter: Can be a Formatter object (returned as-is), a
            function (used as the entity substitution hook for a new
            XMLFormatter or HTMLFormatter), or a string (used to look
            up an XMLFormatter or HTMLFormatter in the appropriate
            registry).
        :return: A Formatter.
        """
        if isinstance(formatter, Formatter):
            return formatter
        # XML trees get an XMLFormatter, everything else an HTMLFormatter.
        formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
        if isinstance(formatter, Callable):
            return formatter_class(entity_substitution=formatter)
        return formatter_class.REGISTRY[formatter]
231

232
    @property
233
    def _is_xml(self):
234
        """Is this element part of an XML tree or an HTML tree?
235

236
        This is used in formatter_for_name, when deciding whether an
237
        XMLFormatter or HTMLFormatter is more appropriate. It can be
238
        inefficient, but it should be called very rarely.
239
        """
240
        if self.known_xml is not None:
241
            # Most of the time we will have determined this when the
242
            # document is parsed.
243
            return self.known_xml
244

245
        # Otherwise, it's likely that this element was created by
246
        # direct invocation of the constructor from within the user's
247
        # Python code.
248
        if self.parent is None:
249
            # This is the top-level object. It should have .known_xml set
250
            # from tree creation. If not, take a guess--BS is usually
251
            # used on HTML markup.
252
            return getattr(self, 'is_xml', False)
253
        return self.parent._is_xml
254

255
    nextSibling = _alias("next_sibling")  # BS3
256
    previousSibling = _alias("previous_sibling")  # BS3
257

258
    def replace_with(self, replace_with):
259
        """Replace this PageElement with another one, keeping the rest of the
260
        tree the same.
261
        
262
        :param replace_with: A PageElement.
263
        :return: `self`, no longer part of the tree.
264
        """
265
        if self.parent is None:
266
            raise ValueError(
267
                "Cannot replace one element with another when the "
268
                "element to be replaced is not part of a tree.")
269
        if replace_with is self:
270
            return
271
        if replace_with is self.parent:
272
            raise ValueError("Cannot replace a Tag with its parent.")
273
        old_parent = self.parent
274
        my_index = self.parent.index(self)
275
        self.extract(_self_index=my_index)
276
        old_parent.insert(my_index, replace_with)
277
        return self
278
    replaceWith = replace_with  # BS3
279

280
    def unwrap(self):
281
        """Replace this PageElement with its contents.
282

283
        :return: `self`, no longer part of the tree.
284
        """
285
        my_parent = self.parent
286
        if self.parent is None:
287
            raise ValueError(
288
                "Cannot replace an element with its contents when that"
289
                "element is not part of a tree.")
290
        my_index = self.parent.index(self)
291
        self.extract(_self_index=my_index)
292
        for child in reversed(self.contents[:]):
293
            my_parent.insert(my_index, child)
294
        return self
295
    replace_with_children = unwrap
296
    replaceWithChildren = unwrap  # BS3
297

298
    def wrap(self, wrap_inside):
299
        """Wrap this PageElement inside another one.
300

301
        :param wrap_inside: A PageElement.
302
        :return: `wrap_inside`, occupying the position in the tree that used
303
           to be occupied by `self`, and with `self` inside it.
304
        """
305
        me = self.replace_with(wrap_inside)
306
        wrap_inside.append(me)
307
        return wrap_inside
308

309
    def extract(self, _self_index=None):
        """Destructively rips this element out of the tree.

        The element is removed from its parent's .contents, the
        parse-order chain is closed around it (and around everything
        beneath it), and its former siblings are linked to each other.
        The element's own children are left attached to it.

        :param _self_index: The location of this element in its parent's
           .contents, if known. Passing this in allows for a performance
           optimization.

        :return: `self`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                # The caller didn't know the position; look it up.
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        # The identity checks skip the relinking when the element
        # before and the element after are the same object.
        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Cut both ends of the extracted subtree loose from the chain.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Join the former siblings to each other, with the same
        # identity guard as above.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
346

347
    def _last_descendant(self, is_initialized=True, accept_self=True):
348
        """Finds the last element beneath this object to be parsed.
349

350
        :param is_initialized: Has `setup` been called on this PageElement
351
            yet?
352
        :param accept_self: Is `self` an acceptable answer to the question?
353
        """
354
        if is_initialized and self.next_sibling is not None:
355
            last_child = self.next_sibling.previous_element
356
        else:
357
            last_child = self
358
            while isinstance(last_child, Tag) and last_child.contents:
359
                last_child = last_child.contents[-1]
360
        if not accept_self and last_child is self:
361
            last_child = None
362
        return last_child
363
    # BS3: Not part of the API!
364
    _lastRecursiveChild = _last_descendant
365

366
    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
           in `self.children` by the new PageElement.
        :param new_child: A PageElement. A plain string is wrapped in a
           NavigableString first; a BeautifulSoup object is not inserted
           itself, its children are inserted one at a time instead.
        :raise ValueError: If `new_child` is None or is `self`.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        # Imported here rather than at the top of the module.
        # NOTE(review): presumably to avoid a circular import with the
        # bs4 package -- confirm.
        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        # Positions past the end mean "append".
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            # Detach the element from wherever it currently lives.
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: nothing precedes it on this level, and the
            # element parsed just before it is this object.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            # The element parsed just before the new child is the end
            # of the previous child's subtree.
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        # The end of the new child's own subtree is what gets linked
        # forward to the rest of the document.
        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            # The new child becomes the last child.
            new_child.next_sibling = None

            # Climb the chain of parents until one of them has a next
            # sibling.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            # The child currently at this position follows the new child.
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        # Fix the back-link of whatever now follows the new subtree.
        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)
447

448
    def append(self, tag):
449
        """Appends the given PageElement to the contents of this one.
450

451
        :param tag: A PageElement.
452
        """
453
        self.insert(len(self.contents), tag)
454

455
    def extend(self, tags):
456
        """Appends the given PageElements to this one's contents.
457

458
        :param tags: A list of PageElements.
459
        """
460
        for tag in tags:
461
            self.append(tag)
462

463
    def insert_before(self, *args):
464
        """Makes the given element(s) the immediate predecessor of this one.
465

466
        All the elements will have the same parent, and the given elements
467
        will be immediately before this one.
468

469
        :param args: One or more PageElements.
470
        """
471
        parent = self.parent
472
        if parent is None:
473
            raise ValueError(
474
                "Element has no parent, so 'before' has no meaning.")
475
        if any(x is self for x in args):
476
                raise ValueError("Can't insert an element before itself.")
477
        for predecessor in args:
478
            # Extract first so that the index won't be screwed up if they
479
            # are siblings.
480
            if isinstance(predecessor, PageElement):
481
                predecessor.extract()
482
            index = parent.index(self)
483
            parent.insert(index, predecessor)
484

485
    def insert_after(self, *args):
486
        """Makes the given element(s) the immediate successor of this one.
487

488
        The elements will have the same parent, and the given elements
489
        will be immediately after this one.
490

491
        :param args: One or more PageElements.
492
        """
493
        # Do all error checking before modifying the tree.
494
        parent = self.parent
495
        if parent is None:
496
            raise ValueError(
497
                "Element has no parent, so 'after' has no meaning.")
498
        if any(x is self for x in args):
499
            raise ValueError("Can't insert an element after itself.")
500
        
501
        offset = 0
502
        for successor in args:
503
            # Extract first so that the index won't be screwed up if they
504
            # are siblings.
505
            if isinstance(successor, PageElement):
506
                successor.extract()
507
            index = parent.index(self)
508
            parent.insert(index+1+offset, successor)
509
            offset += 1
510

511
    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Return the first PageElement after this one in the document
        that matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None when nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        search = self.find_all_next
        return self._find_one(search, name, attrs, text, **kwargs)
526
    findNext = find_next  # BS3
527

528
    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Return every PageElement after this one in the document that
        matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet containing PageElements.
        """
        candidates = self.next_elements
        return self._find_all(name, attrs, text, limit, candidates, **kwargs)
545
    findAllNext = find_all_next  # BS3
546

547
    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Return the nearest later sibling of this PageElement that
        matches the given criteria.

        All find_* methods take a common set of arguments. See the
        online documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None when nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        search = self.find_next_siblings
        return self._find_one(search, name, attrs, text, **kwargs)
563
    findNextSibling = find_next_sibling  # BS3
564

565
    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Return all later siblings of this PageElement that match the
        given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        candidates = self.next_siblings
        return self._find_all(name, attrs, text, limit, candidates, **kwargs)
583
    findNextSiblings = find_next_siblings   # BS3
584
    fetchNextSiblings = find_next_siblings  # BS2
585

586
    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Search backwards through the document from this PageElement
        and return the first PageElement that matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None when nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        search = self.find_all_previous
        return self._find_one(search, name, attrs, text, **kwargs)
602
    findPrevious = find_previous  # BS3
603

604
    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Search backwards through the document from this PageElement
        and return every PageElement that matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        candidates = self.previous_elements
        return self._find_all(name, attrs, text, limit, candidates, **kwargs)
622
    findAllPrevious = find_all_previous  # BS3
623
    fetchPrevious = find_all_previous    # BS2
624

625
    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Return the nearest earlier sibling of this PageElement that
        matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: A PageElement, or None when nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        search = self.find_previous_siblings
        return self._find_one(search, name, attrs, text, **kwargs)
641
    findPreviousSibling = find_previous_sibling  # BS3
642

643
    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Return all earlier siblings of this PageElement that match
        the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        candidates = self.previous_siblings
        return self._find_all(name, attrs, text, limit, candidates, **kwargs)
661
    findPreviousSiblings = find_previous_siblings   # BS3
662
    fetchPreviousSiblings = find_previous_siblings  # BS2
663

664
    def find_parent(self, name=None, attrs={}, **kwargs):
        """Return the closest parent of this PageElement that matches
        the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :kwargs: A dictionary of filters on attribute values.

        :return: A PageElement, or None when nothing matches.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        matches = self.find_parents(name, attrs, 1, **kwargs)
        return matches[0] if matches else None
685
    findParent = find_parent  # BS3
686

687
    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Return all parents of this PageElement that match the given
        criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.

        :return: Whatever `_find_all` returns for the matching parents
            (a ResultSet); no text filter is applied.
        """
        ancestors = self.parents
        return self._find_all(name, attrs, None, limit, ancestors, **kwargs)
703
    findParents = find_parents   # BS3
704
    fetchParents = find_parents  # BS2
705

706
    @property
707
    def next(self):
708
        """The PageElement, if any, that was parsed just after this one.
709

710
        :return: A PageElement.
711
        :rtype: bs4.element.Tag | bs4.element.NavigableString
712
        """
713
        return self.next_element
714

715
    @property
716
    def previous(self):
717
        """The PageElement, if any, that was parsed just before this one.
718

719
        :return: A PageElement.
720
        :rtype: bs4.element.Tag | bs4.element.NavigableString
721
        """
722
        return self.previous_element
723

724
    #These methods do the real heavy lifting.
725

726
    def _find_one(self, method, name, attrs, text, **kwargs):
727
        r = None
728
        l = method(name, attrs, text, 1, **kwargs)
729
        if l:
730
            r = l[0]
731
        return r
732

733
    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        """Iterate over a generator looking for things that match.

        :param name: A filter on tag name, or a SoupStrainer, which is
            used as-is instead of building one from the other filters.
        :param attrs: A dictionary of filters on attribute values.
        :param text: A filter for a NavigableString with specific text.
            When this is None, a `string` keyword argument is used in
            its place.
        :param limit: Stop looking after finding this many results.
        :param generator: Yields the PageElements to examine.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of the matches. On the two fast paths
            below it wraps a lazy generator expression.
        """
        if text is None and 'string' in kwargs:
            text = kwargs.pop('string')

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)

        if text is None and not limit and not attrs and not kwargs:
            if name is True or name is None:
                # Optimization to find all tags.
                result = (element for element in generator
                          if isinstance(element, Tag))
                return ResultSet(strainer, result)
            elif isinstance(name, str):
                # Optimization to find all tags with a given name.
                if name.count(':') == 1:
                    # This is a name with a prefix. If this is a namespace-aware document,
                    # we need to match the local name against tag.name. If not,
                    # we need to match the fully-qualified name against tag.name.
                    prefix, local_name = name.split(':', 1)
                else:
                    prefix = None
                    local_name = name
                # The Tag check must guard both name tests. `and` binds
                # tighter than `or`, so without the outer parentheses
                # the second test is also applied to non-Tag elements.
                result = (element for element in generator
                          if isinstance(element, Tag)
                          and (
                              element.name == name
                              or (
                                  element.name == local_name
                                  and (prefix is None
                                       or element.prefix == prefix)
                              )
                          ))
                return ResultSet(strainer, result)

        # General case: run every truthy element through the strainer.
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results
784

785
    # These generators can be used to navigate starting from both
    # NavigableStrings and Tags.
787
    @property
    def next_elements(self):
        """All PageElements that were parsed after this one.

        Follows the `next_element` links, starting from this element's
        own `next_element`, until a link is None.

        :yield: A sequence of PageElements.
        """
        current = self.next_element
        while True:
            if current is None:
                return
            yield current
            # Read the link only after the consumer resumes us.
            current = current.next_element
797

798
    @property
    def next_siblings(self):
        """All PageElements that are siblings of this one but were
        parsed later.

        Follows the `next_sibling` links, starting from this element's
        own `next_sibling`, until a link is None.

        :yield: A sequence of PageElements.
        """
        current = self.next_sibling
        while True:
            if current is None:
                return
            yield current
            # Read the link only after the consumer resumes us.
            current = current.next_sibling
809

810
    @property
    def previous_elements(self):
        """All PageElements that were parsed before this one.

        Follows the `previous_element` links, starting from this
        element's own `previous_element`, until a link is None.

        :yield: A sequence of PageElements.
        """
        current = self.previous_element
        while True:
            if current is None:
                return
            yield current
            # Read the link only after the consumer resumes us.
            current = current.previous_element
820

821
    @property
    def previous_siblings(self):
        """All PageElements that are siblings of this one but were
        parsed earlier.

        Follows the `previous_sibling` links, starting from this
        element's own `previous_sibling`, until a link is None.

        :yield: A sequence of PageElements.
        """
        current = self.previous_sibling
        while True:
            if current is None:
                return
            yield current
            # Read the link only after the consumer resumes us.
            current = current.previous_sibling
832

833
    @property
    def parents(self):
        """All PageElements that are parents of this PageElement.

        Follows the `parent` links, starting from this element's own
        `parent`, until a link is None.

        :yield: A sequence of PageElements.
        """
        ancestor = self.parent
        while True:
            if ancestor is None:
                return
            yield ancestor
            # Read the link only after the consumer resumes us.
            ancestor = ancestor.parent
843

844
    @property
    def decomposed(self):
        """Check whether a PageElement has been decomposed.

        Reads the `_decomposed` attribute, treating a missing or falsy
        value as False.

        :rtype: bool
        """
        flag = getattr(self, '_decomposed', False)
        if flag:
            return flag
        return False
851
            
852
    # Old non-property versions of the generators, for backwards
853
    # compatibility with BS3.
854
    def nextGenerator(self):
        """Method form of the `next_elements` property (BS3 name)."""
        return self.next_elements
856

857
    def nextSiblingGenerator(self):
        """Method form of the `next_siblings` property (BS3 name)."""
        return self.next_siblings
859

860
    def previousGenerator(self):
        """Method form of the `previous_elements` property (BS3 name)."""
        return self.previous_elements
862

863
    def previousSiblingGenerator(self):
        """Method form of the `previous_siblings` property (BS3 name)."""
        return self.previous_siblings
865

866
    def parentGenerator(self):
        """Method form of the `parents` property (BS3 name)."""
        return self.parents
868

869

870
class NavigableString(str, PageElement):
    """A Python Unicode string that lives inside a parse tree.

    When Beautiful Soup parses the markup <b>penguin</b>, it creates a
    NavigableString to hold the string "penguin".
    """

    # Text that output_ready() puts before and after the string.
    PREFIX = ''
    SUFFIX = ''

    # We can't tell just by looking at a string whether it's contained
    # in an XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text. Anything that is not already a `str` is
            handed to `str.__new__` together with DEFAULT_OUTPUT_ENCODING.
        """
        if not isinstance(value, str):
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        else:
            instance = str.__new__(cls, value)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a new object of the same class holding the same text.

        The copy is built by calling the class on this string; nothing
        else is carried over from the original.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        """Return the single-element tuple of arguments for __new__."""
        plain = str(self)
        return (plain,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        :raise AttributeError: for every attribute other than 'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter.

        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: The formatted string wrapped in PREFIX and SUFFIX.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :return: Always None.
        """
        return None

    @name.setter
    def name(self, name):
        """Prevent NavigableString.name from ever being set.

        :raise AttributeError: always.
        """
        raise AttributeError("A NavigableString cannot be given a name.")
942

943
    
944
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    This is an abstract class used for special kinds of strings such
    as comments (the Comment class) and CDATA blocks (the CData
    class).
    """

    # Subclasses override these with their own delimiters.
    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Make this string ready for output by adding any
        subclass-specific prefix or suffix.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. The string will be passed into the
            Formatter, but only to trigger any side effects: the return
            value is ignored.

        :return: The string, with any subclass-specific prefix and
           suffix added on.
        """
        if formatter is not None:
            # Called for its side effects only; the result is discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
970

971
class CData(PreformattedString):
    """A CDATA block, delimited by '<![CDATA[' and ']]>'."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
975

976
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, delimited by '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
981

982
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, delimited by '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
986

987
class Comment(PreformattedString):
    """An HTML or XML comment, delimited by '<!--' and '-->'."""

    PREFIX = '<!--'
    SUFFIX = '-->'
991

992

993
class Declaration(PreformattedString):
    """An XML declaration, delimited by '<?' and '?>'."""

    PREFIX = '<?'
    SUFFIX = '?>'
997

998

999
class Doctype(PreformattedString):
    """A document type declaration."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
            A false value is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: A Doctype.
        """
        pieces = [name or '']
        if pub_id is not None:
            pieces.append(' PUBLIC "%s"' % pub_id)
            # With a public ID, the system ID follows without a keyword.
            if system_id is not None:
                pieces.append(' "%s"' % system_id)
        elif system_id is not None:
            pieces.append(' SYSTEM "%s"' % system_id)

        return Doctype(''.join(pieces))
1026

1027

1028
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    Used to distinguish embedded stylesheets from textual content.
    """
1035

1036
    
1037
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    JavaScript).

    Used to distinguish executable code from textual content.
    """
1044

1045

1046
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    Used to distinguish such strings from the main body of the document.
    """
1053

1054

1055
class Tag(PageElement):
1056
    """Represents an HTML or XML tag that is part of a parse tree, along
1057
    with its attributes and contents.
1058

1059
    When Beautiful Soup parses the markup <b>penguin</b>, it will
1060
    create a Tag object representing the <b> tag.
1061
    """
1062

1063
    def __init__(self, parser=None, builder=None, name=None, namespace=None,
1064
                 prefix=None, attrs=None, parent=None, previous=None,
1065
                 is_xml=None, sourceline=None, sourcepos=None,
1066
                 can_be_empty_element=None, cdata_list_attributes=None,
1067
                 preserve_whitespace_tags=None
1068
    ):
1069
        """Basic constructor.
1070

1071
        :param parser: A BeautifulSoup object.
1072
        :param builder: A TreeBuilder.
1073
        :param name: The name of the tag.
1074
        :param namespace: The URI of this Tag's XML namespace, if any.
1075
        :param prefix: The prefix for this Tag's XML namespace, if any.
1076
        :param attrs: A dictionary of this Tag's attribute values.
1077
        :param parent: The PageElement to use as this Tag's parent.
1078
        :param previous: The PageElement that was parsed immediately before
1079
            this tag.
1080
        :param is_xml: If True, this is an XML tag. Otherwise, this is an
1081
            HTML tag.
1082
        :param sourceline: The line number where this tag was found in its
1083
            source document.
1084
        :param sourcepos: The character position within `sourceline` where this
1085
            tag was found.
1086
        :param can_be_empty_element: If True, this tag should be
1087
            represented as <tag/>. If False, this tag should be represented
1088
            as <tag></tag>.
1089
        :param cdata_list_attributes: A list of attributes whose values should
1090
            be treated as CDATA if they ever show up on this tag.
1091
        :param preserve_whitespace_tags: A list of tag names whose contents
1092
            should have their whitespace preserved.
1093
        """
1094
        if parser is None:
1095
            self.parser_class = None
1096
        else:
1097
            # We don't actually store the parser object: that lets extracted
1098
            # chunks be garbage-collected.
1099
            self.parser_class = parser.__class__
1100
        if name is None:
1101
            raise ValueError("No value provided for new tag's name.")
1102
        self.name = name
1103
        self.namespace = namespace
1104
        self.prefix = prefix
1105
        if ((not builder or builder.store_line_numbers)
1106
            and (sourceline is not None or sourcepos is not None)):
1107
            self.sourceline = sourceline
1108
            self.sourcepos = sourcepos        
1109
        if attrs is None:
1110
            attrs = {}
1111
        elif attrs:
1112
            if builder is not None and builder.cdata_list_attributes:
1113
                attrs = builder._replace_cdata_list_attribute_values(
1114
                    self.name, attrs)
1115
            else:
1116
                attrs = dict(attrs)
1117
        else:
1118
            attrs = dict(attrs)
1119

1120
        # If possible, determine ahead of time whether this tag is an
1121
        # XML tag.
1122
        if builder:
1123
            self.known_xml = builder.is_xml
1124
        else:
1125
            self.known_xml = is_xml
1126
        self.attrs = attrs
1127
        self.contents = []
1128
        self.setup(parent, previous)
1129
        self.hidden = False
1130

1131
        if builder is None:
1132
            # In the absence of a TreeBuilder, use whatever values were
1133
            # passed in here. They're probably None, unless this is a copy of some
1134
            # other tag.
1135
            self.can_be_empty_element = can_be_empty_element
1136
            self.cdata_list_attributes = cdata_list_attributes
1137
            self.preserve_whitespace_tags = preserve_whitespace_tags
1138
        else:
1139
            # Set up any substitutions for this tag, such as the charset in a META tag.
1140
            builder.set_up_substitutions(self)
1141

1142
            # Ask the TreeBuilder whether this tag might be an empty-element tag.
1143
            self.can_be_empty_element = builder.can_be_empty_element(name)
1144

1145
            # Keep track of the list of attributes of this tag that
1146
            # might need to be treated as a list.
1147
            #
1148
            # For performance reasons, we store the whole data structure
1149
            # rather than asking the question of every tag. Asking would
1150
            # require building a new data structure every time, and
1151
            # (unlike can_be_empty_element), we almost never need
1152
            # to check this.
1153
            self.cdata_list_attributes = builder.cdata_list_attributes
1154

1155
            # Keep track of the names that might cause this tag to be treated as a
1156
            # whitespace-preserved tag.
1157
            self.preserve_whitespace_tags = builder.preserve_whitespace_tags
1158
            
1159
    parserClass = _alias("parser_class")  # BS3
1160

1161
    def __copy__(self):
1162
        """A copy of a Tag is a new Tag, unconnected to the parse tree.
1163
        Its contents are a copy of the old Tag's contents.
1164
        """
1165
        clone = type(self)(
1166
            None, self.builder, self.name, self.namespace,
1167
            self.prefix, self.attrs, is_xml=self._is_xml,
1168
            sourceline=self.sourceline, sourcepos=self.sourcepos,
1169
            can_be_empty_element=self.can_be_empty_element,
1170
            cdata_list_attributes=self.cdata_list_attributes,
1171
            preserve_whitespace_tags=self.preserve_whitespace_tags
1172
        )
1173
        for attr in ('can_be_empty_element', 'hidden'):
1174
            setattr(clone, attr, getattr(self, attr))
1175
        for child in self.contents:
1176
            clone.append(child.__copy__())
1177
        return clone
1178

1179
    @property
    def is_empty_element(self):
        """Is this tag an empty-element tag? (aka a self-closing tag)

        A tag that has contents is never an empty-element tag.

        A tag that has no contents may or may not be an empty-element
        tag. It depends on the builder used to create the tag. If the
        builder has a designated list of empty-element tags, then only
        a tag whose name shows up in that list is considered an
        empty-element tag.

        If the builder has no designated list of empty-element tags,
        then any tag with no contents is an empty-element tag.

        :return: False if the tag has contents; otherwise the value of
            `can_be_empty_element`.
        """
        if len(self.contents) != 0:
            return False
        return self.can_be_empty_element
1195
    isSelfClosing = is_empty_element  # BS3
1196

1197
    @property
    def string(self):
        """Convenience property to get the single string within this
        PageElement.

        TODO It might make sense to have NavigableString.string return
        itself.

        :return: If this element has a single string child, return
         value is that string. If this element has one child tag,
         return value is the 'string' attribute of the child tag,
         recursively. If this element is itself a string, has no
         children, or has more than one child, return value is None.
        """
        children = self.contents
        if len(children) != 1:
            return None
        only_child = children[0]
        if not isinstance(only_child, NavigableString):
            # Not a string: defer to the child's own .string.
            return only_child.string
        return only_child

    @string.setter
    def string(self, string):
        """Replace this PageElement's contents with `string`.

        The existing children are removed with clear(), then a new
        object created by calling `string`'s own class on `string` is
        appended.
        """
        self.clear()
        replacement = string.__class__(string)
        self.append(replacement)
1223

1224
    def _all_strings(self, strip=False, types=(NavigableString, CData)):
        """Yield all strings of certain classes, possibly stripping them.

        :param strip: If True, all strings will be stripped before being
            yielded, and strings that end up empty are dropped.

        :types: A tuple of NavigableString subclasses. Any strings of
            a subclass not found in this list will be ignored. By
            default, this means only NavigableString and CData objects
            will be considered. So no comments, processing instructions,
            etc. If this is None, every NavigableString is considered.

        :yield: A sequence of strings.
        """
        for candidate in self.descendants:
            if types is None:
                if not isinstance(candidate, NavigableString):
                    continue
            elif type(candidate) not in types:
                # Exact class match: subclasses not listed are skipped.
                continue
            if strip:
                candidate = candidate.strip()
                if len(candidate) == 0:
                    continue
            yield candidate
1249

1250
    strings = property(_all_strings)
1251

1252
    @property
    def stripped_strings(self):
        """Yield all strings in the document, stripping them first.

        Delegates to `_all_strings` with stripping turned on.

        :yield: A sequence of stripped strings.
        """
        for stripped in self._all_strings(True):
            yield stripped
1260

1261
    def get_text(self, separator="", strip=False,
                 types=(NavigableString, CData)):
        """Get all child strings, concatenated using the given separator.

        :param separator: Strings will be concatenated using this separator.

        :param strip: If True, strings will be stripped before being
            concatenated.

        :types: A tuple of NavigableString subclasses. Any strings of
            a subclass not found in this list will be ignored. By
            default, this means only NavigableString and CData objects
            will be considered. So no comments, processing instructions,
            stylesheets, etc.

        :return: A string.
        """
        pieces = list(self._all_strings(strip, types=types))
        return separator.join(pieces)
1280
    getText = get_text
1281
    text = property(get_text)
1282

1283
    def decompose(self):
        """Recursively destroys this PageElement and its children.

        This element will be removed from the tree and wiped out; so
        will everything beneath it.

        The behavior of a decomposed PageElement is undefined and you
        should never use one for anything, but if you need to _check_
        whether an element has been decomposed, you can use the
        `decomposed` property.
        """
        self.extract()
        node = self
        while node is not None:
            # Grab the link first: clearing __dict__ destroys it.
            successor = node.next_element
            node.__dict__.clear()
            node.contents = []
            node._decomposed = True
            node = successor
1302
           
1303
    def clear(self, decompose=False):
        """Wipe out all children of this PageElement by calling extract()
           on them.

        :param decompose: If this is True, decompose() (a more
            destructive method) will be called instead of extract()
            on every child that is a Tag.
        """
        # Walk a copy of the list, since removing children changes it.
        for child in self.contents[:]:
            if decompose and isinstance(child, Tag):
                child.decompose()
            else:
                child.extract()
1319

1320
    def smooth(self):
        """Smooth out this element's children by consolidating consecutive
        strings.

        This makes pretty-printed output look more natural following a
        lot of operations that modified the tree.
        """
        # Record the first position of every pair of children that
        # should be merged. This is done rather than copying
        # self.contents, since in most cases very few strings will be
        # affected.
        merge_points = []
        for position, current in enumerate(self.contents):
            if isinstance(current, Tag):
                # Recursively smooth children.
                current.smooth()
            if position + 1 >= len(self.contents):
                # Last item in .contents: nothing follows it, so there
                # is nothing to merge it with.
                continue
            following = self.contents[position + 1]
            if not isinstance(current, NavigableString):
                continue
            if not isinstance(following, NavigableString):
                continue
            if isinstance(current, PreformattedString):
                continue
            if isinstance(following, PreformattedString):
                continue
            merge_points.append(position)

        # Work backwards through the recorded positions, so that
        # removing items from .contents won't affect the remaining
        # positions.
        for position in reversed(merge_points):
            left = self.contents[position]
            right = self.contents[position + 1]
            right.extract()
            merged = NavigableString(left + right)
            left.replace_with(merged)
1357

1358
    def index(self, element):
        """Find the index of a child by identity, not value.

        Avoids issues with tag.contents.index(element) getting the
        index of equal elements.

        :param element: Look for this PageElement in `self.contents`.
        :return: The position of the first child that `is` `element`.
        :raise ValueError: if no child is that exact object.
        """
        position = 0
        for child in self.contents:
            if child is element:
                return position
            position += 1
        raise ValueError("Tag.index: element not in tag")
1370

1371
    def get(self, key, default=None):
        """Look up the 'key' attribute in this tag's `attrs`.

        :param key: The attribute to look for.
        :param default: Returned when the tag doesn't have that attribute.
        :return: The attribute's value, or `default`.
        """
        attributes = self.attrs
        return attributes.get(key, default)
1376

1377
    def get_attribute_list(self, key, default=None):
        """The same as get(), but always returns a list.

        :param key: The attribute to look for.
        :param default: Use this value if the attribute is not present
            on this PageElement.
        :return: A list of values, probably containing only a single
            value. A value that is already a list is returned as-is.
        """
        found = self.get(key, default)
        return found if isinstance(found, list) else [found]
1390
    
1391
    def has_attr(self, key):
        """Does this PageElement have an attribute with the given name?

        :param key: The attribute name to look for in `attrs`.
        :rtype: bool
        """
        attributes = self.attrs
        return key in attributes
1394

1395
    def __hash__(self):
1396
        return str(self).__hash__()
1397

1398
    def __getitem__(self, key):
1399
        """tag[key] returns the value of the 'key' attribute for the Tag,
1400
        and throws an exception if it's not there."""
1401
        return self.attrs[key]
1402

1403
    def __iter__(self):
1404
        "Iterating over a Tag iterates over its contents."
1405
        return iter(self.contents)
1406

1407
    def __len__(self):
1408
        "The length of a Tag is the length of its list of contents."
1409
        return len(self.contents)
1410

1411
    def __contains__(self, x):
1412
        return x in self.contents
1413

1414
    def __bool__(self):
1415
        "A tag is non-None even if it has no contents."
1416
        return True
1417

1418
    def __setitem__(self, key, value):
1419
        """Setting tag[key] sets the value of the 'key' attribute for the
1420
        tag."""
1421
        self.attrs[key] = value
1422

1423
    def __delitem__(self, key):
1424
        "Deleting tag[key] deletes all 'key' attributes for the tag."
1425
        self.attrs.pop(key, None)
1426

1427
    def __call__(self, *args, **kwargs):
1428
        """Calling a Tag like a function is the same as calling its
1429
        find_all() method. Eg. tag('a') returns a list of all the A tags
1430
        found within this tag."""
1431
        return self.find_all(*args, **kwargs)
1432

1433
    def __getattr__(self, tag):
1434
        """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
1435
        #print("Getattr %s.%s" % (self.__class__, tag))
1436
        if len(tag) > 3 and tag.endswith('Tag'):
1437
            # BS3: soup.aTag -> "soup.find("a")
1438
            tag_name = tag[:-3]
1439
            warnings.warn(
1440
                '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
1441
                    name=tag_name
1442
                )
1443
            )
1444
            return self.find(tag_name)
1445
        # We special case contents to avoid recursion.
1446
        elif not tag.startswith("__") and not tag == "contents":
1447
            return self.find(tag)
1448
        raise AttributeError(
1449
            "'%s' object has no attribute '%s'" % (self.__class__, tag))
1450

1451
    def __eq__(self, other):
1452
        """Returns true iff this Tag has the same name, the same attributes,
1453
        and the same contents (recursively) as `other`."""
1454
        if self is other:
1455
            return True
1456
        if (not hasattr(other, 'name') or
1457
            not hasattr(other, 'attrs') or
1458
            not hasattr(other, 'contents') or
1459
            self.name != other.name or
1460
            self.attrs != other.attrs or
1461
            len(self) != len(other)):
1462
            return False
1463
        for i, my_child in enumerate(self.contents):
1464
            if my_child != other.contents[i]:
1465
                return False
1466
        return True
1467

1468
    def __ne__(self, other):
1469
        """Returns true iff this Tag is not identical to `other`,
1470
        as defined in __eq__."""
1471
        return not self == other
1472

1473
    def __repr__(self, encoding="unicode-escape"):
        """Renders this PageElement as a string.

        :param encoding: The encoding to use (Python 2 only).
        :return: Under Python 2, a bytestring; under Python 3,
            a Unicode string.
        """
        if not PY3K:
            # "The return value must be a string object", i.e. a bytestring.
            # By convention, the return value of __repr__ should also be
            # an ASCII string.
            return self.encode(encoding)
        # "The return value must be a string object", i.e. Unicode
        return self.decode()
1488

1489
    def __unicode__(self):
1490
        """Renders this PageElement as a Unicode string."""
1491
        return self.decode()
1492

1493
    def __str__(self):
        """Renders this PageElement as a generic string.

        :return: Under Python 2, a UTF-8 bytestring; under Python 3,
            a Unicode string.
        """
        if not PY3K:
            return self.encode()
        return self.decode()
1503

1504
    if PY3K:
1505
        __str__ = __repr__ = __unicode__
1506

1507
    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               indent_level=None, formatter="minimal",
               errors="xmlcharrefreplace"):
        """Render a bytestring representation of this PageElement and its
        contents.

        :param encoding: The destination encoding.
        :param indent_level: Each line of the rendering will be
            indented this many spaces. Used internally in
            recursive calls while pretty-printing.
        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :param errors: An error handling strategy such as
            'xmlcharrefreplace'. This value is passed along into
            encode() and its value should be one of the constants
            defined by Python.
        :return: A bytestring.

        """
        # Two steps: render to Unicode with decode(), telling it which
        # encoding the result is headed for, then encode that text.
        rendered = self.decode(indent_level, encoding, formatter)
        return rendered.encode(encoding, errors)
1530

1531
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Render this PageElement and its contents as a Unicode string.

        :param indent_level: Each line of the rendering will be
            indented this many spaces. Used internally in
            recursive calls while pretty-printing.
        :param eventual_encoding: The encoding the result is destined
            for. This method does _not_ perform that encoding; the
            value is only used so that attribute values which mention
            the document's encoding (such as the one in a <META> tag)
            can have it substituted in.
        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: A Unicode string.
        """
        # Resolve a formatter name into an object once, so the
        # recursive calls below don't repeat the lookup.
        if not isinstance(formatter, Formatter):
            formatter = self.formatter_for_name(formatter)

        # Render every attribute the formatter reports for this tag.
        rendered_attrs = []
        for key, val in formatter.attributes(self):
            if val is None:
                # A valueless attribute is rendered as its bare name.
                rendered_attrs.append(key)
                continue
            if isinstance(val, (list, tuple)):
                # Multi-valued attribute: join the values with spaces.
                val = ' '.join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                    eventual_encoding is not None
                    and isinstance(val, AttributeValueWithCharsetSubstitution)
            ):
                val = val.encode(eventual_encoding)
            escaped = formatter.attribute_value(val)
            rendered_attrs.append(
                str(key) + '=' + formatter.quoted_attribute_value(escaped))

        prefix = (self.prefix + ":") if self.prefix else ''

        # An empty element gets a close marker inside its start tag
        # and no end tag; anything else gets a real end tag.
        if self.is_empty_element:
            void_marker = formatter.void_element_close_prefix or ''
            end_tag = ''
        else:
            void_marker = ''
            end_tag = '</%s%s>' % (prefix, self.name)

        pretty_print = self._should_pretty_print(indent_level)
        if indent_level is None:
            leading = ''
        else:
            leading = ' ' * (indent_level - 1)
        # Children are indented one level deeper, but only while this
        # tag itself is being pretty-printed.
        child_indent = (indent_level + 1) if pretty_print else None
        contents = self.decode_contents(
            child_indent, eventual_encoding, formatter
        )

        if self.hidden:
            # This is the 'document root' object: only its contents
            # are rendered.
            return contents

        pieces = []
        if indent_level is not None:
            # Even if this particular tag is not pretty-printed,
            # we should indent up to the start of the tag.
            pieces.append(leading)
        attribute_string = ''
        if rendered_attrs:
            attribute_string = ' ' + ' '.join(rendered_attrs)
        pieces.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, void_marker))
        if pretty_print:
            pieces.append("\n")
        pieces.append(contents)
        if pretty_print:
            if contents and contents[-1] != "\n":
                pieces.append("\n")
            if end_tag:
                # Line the end tag up with the start tag.
                pieces.append(leading)
        pieces.append(end_tag)
        if indent_level is not None and end_tag and self.next_sibling:
            # Even if this particular tag is not pretty-printed,
            # we're now done with the tag, and we should add a
            # newline if appropriate.
            pieces.append("\n")
        return ''.join(pieces)
1631

1632
    def _should_pretty_print(self, indent_level):
1633
        """Should this tag be pretty-printed?
1634

1635
        Most of them should, but some (such as <pre> in HTML
1636
        documents) should not.
1637
        """
1638
        return (
1639
            indent_level is not None
1640
            and (
1641
                not self.preserve_whitespace_tags
1642
                or self.name not in self.preserve_whitespace_tags
1643
            )
1644
        )
1645

1646
    def prettify(self, encoding=None, formatter="minimal"):
        """Pretty-print this PageElement.

        :param encoding: The eventual encoding of the output. If this
            is None, a Unicode string is returned.
        :param formatter: A Formatter object, or a string naming one of
            the standard formatters.
        :return: A Unicode string (if encoding is None) or a
            bytestring (otherwise).
        """
        # True is handed over as the indent level in both branches to
        # switch pretty-printing on.
        if encoding is not None:
            return self.encode(encoding, True, formatter=formatter)
        return self.decode(True, formatter=formatter)
1660

1661
    def decode_contents(self, indent_level=None,
                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                       formatter="minimal"):
        """Render the contents of this tag as a Unicode string.

        :param indent_level: Each line of the rendering will be
           indented this many spaces. Used internally in
           recursive calls while pretty-printing.

        :param eventual_encoding: The encoding the result is destined
           for. decode_contents() does _not_ perform that encoding;
           the value is simply forwarded to the decode() call made
           for each child tag.

        :param formatter: A Formatter object, or a string naming one of
            the standard Formatters.

        :return: A Unicode string.
        """
        # Resolve a formatter name into an object once, so the
        # per-child calls below don't repeat the lookup.
        if not isinstance(formatter, Formatter):
            formatter = self.formatter_for_name(formatter)

        pretty_print = indent_level is not None
        chunks = []
        for child in self:
            text = None
            if isinstance(child, NavigableString):
                text = child.output_ready(formatter)
            elif isinstance(child, Tag):
                # Child tags render themselves, indentation included.
                chunks.append(
                    child.decode(indent_level, eventual_encoding, formatter))

            keep_whitespace = (
                self.preserve_whitespace_tags
                and self.name in self.preserve_whitespace_tags
            )
            if text and indent_level and not keep_whitespace:
                text = text.strip()
            if not text:
                # Either not a string, or nothing left after stripping.
                continue

            if pretty_print and not keep_whitespace:
                # Put the string on its own indented line.
                chunks.append(" " * (indent_level - 1))
                chunks.append(text)
                chunks.append("\n")
            else:
                chunks.append(text)
        return ''.join(chunks)
1706
       
1707
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
        """Render the contents of this PageElement as a bytestring.

        :param indent_level: Each line of the rendering will be
           indented this many spaces. Used internally in
           recursive calls while pretty-printing.

        :param encoding: The bytestring will be in this encoding.

        :param formatter: A Formatter object, or a string naming one of
            the standard Formatters.

        :return: A bytestring.
        """
        # Render to Unicode via decode_contents(), then encode the
        # finished string.
        return self.decode_contents(
            indent_level, encoding, formatter).encode(encoding)
1725

1726
    # Old method for BS3 compatibility
1727
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Deprecated method for BS3 compatibility.

        Delegates to encode_contents(). The indent level is only
        forwarded when `prettyPrint` is true; otherwise None is passed.
        """
        level = indentLevel if prettyPrint else None
        return self.encode_contents(indent_level=level, encoding=encoding)
1734

1735
    #Soup methods
1736

1737
    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Look in the children of this PageElement and return the
        first PageElement that matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param recursive: If this is True, find() will perform a
            recursive search of this PageElement's children. Otherwise,
            only the direct children will be considered.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values.
        :return: The first match, or None if nothing matched.
        :rtype: bs4.element.Tag | bs4.element.NavigableString
        """
        # Same search as find_all(), capped at a single result.
        matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
        return matches[0] if matches else None
    findChild = find #BS2
1761

1762
    def find_all(self, name=None, attrs={}, recursive=True, text=None,
                 limit=None, **kwargs):
        """Look in the children of this PageElement and return every
        PageElement that matches the given criteria.

        All find_* methods take a common set of arguments. See the online
        documentation for detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values.
        :param recursive: If this is True, find_all() will perform a
            recursive search of this PageElement's children. Otherwise,
            only the direct children will be considered.
        :param text: A filter for a NavigableString with specific text.
        :param limit: Stop looking after finding this many results.
        :kwargs: A dictionary of filters on attribute values.
        :return: A ResultSet of PageElements.
        :rtype: bs4.element.ResultSet
        """
        # Pick the pool of candidates: every descendant for a recursive
        # search, only the direct children otherwise.
        candidates = self.descendants if recursive else self.children
        return self._find_all(name, attrs, text, limit, candidates, **kwargs)
    findAll = find_all       # BS3
    findChildren = find_all  # BS2
1786

1787
    #Generator methods
1788
    @property
    def children(self):
        """Iterate over the direct children of this PageElement.

        :yield: A sequence of PageElements.
        """
        # An iterator over `contents` is handed back, rather than the
        # list itself, to make the purpose of the property clear.
        # XXX This seems to be untested.
        direct_children = self.contents
        return iter(direct_children)
1796

1797
    @property
    def descendants(self):
        """Iterate over every element beneath this PageElement.

        Starting from the first child, the walk follows each element's
        `next_element` link and stops once it has passed this
        element's last descendant.

        :yield: A sequence of PageElements.
        """
        if len(self.contents) == 0:
            # No children means no descendants.
            return
        # The element after our last descendant lies outside this
        # subtree, so reaching it ends the walk.
        boundary = self._last_descendant().next_element
        node = self.contents[0]
        while node is not boundary:
            yield node
            node = node.next_element
1811

1812
    # CSS selector code
1813
    def select_one(self, selector, namespaces=None, **kwargs):
        """Perform a CSS selection operation on the current element
        and return only the first match.

        :param selector: A CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
           used in the CSS selector to namespace URIs. It is forwarded
           unchanged to select().

        :param kwargs: Extra keyword arguments, forwarded unchanged to
           select().

        :return: The first matching Tag, or None if nothing matched.
        :rtype: bs4.element.Tag
        """
        # Ask select() for at most one result.
        matches = self.select(selector, namespaces, 1, **kwargs)
        return matches[0] if matches else None
1833

1834
    def select(self, selector, namespaces=None, limit=None, **kwargs):
        """Perform a CSS selection operation on the current element.

        This uses the SoupSieve library.

        :param selector: A string containing a CSS selector.

        :param namespaces: A dictionary mapping namespace prefixes
           used in the CSS selector to namespace URIs. When this is
           None, this element's own `_namespaces` mapping is used.

        :param limit: After finding this number of results, stop
           looking. None is passed to SoupSieve as 0.

        :param kwargs: Keyword arguments to be passed into SoupSieve's
           soupsieve.select() method.

        :return: A ResultSet of Tags.
        :rtype: bs4.element.ResultSet
        :raise NotImplementedError: If the soupsieve package is not
           installed.
        """
        namespaces = self._namespaces if namespaces is None else namespaces
        limit = 0 if limit is None else limit

        if soupsieve is None:
            raise NotImplementedError(
                "Cannot execute CSS selectors because the soupsieve package is not installed."
            )

        matches = soupsieve.select(selector, self, namespaces, limit, **kwargs)

        # We wrap the matches because it's more consistent and because
        # ResultSet.__getattr__ has a helpful error message.
        return ResultSet(None, matches)
1869

1870
    # Old names for backwards compatibility
1871
    def childGenerator(self):
        """Deprecated generator; returns the `children` property."""
        direct_children = self.children
        return direct_children
1874

1875
    def recursiveChildGenerator(self):
        """Deprecated generator; returns the `descendants` property."""
        all_descendants = self.descendants
        return all_descendants
1878

1879
    def has_key(self, key):
        """Deprecated method. This was kind of misleading because
        has_key() (attributes) was different from the `in` operator
        (contents).

        has_key() is gone in Python 3, anyway.

        :param key: An attribute name.
        :return: Whatever has_attr() returns for `key`.
        """
        notice = 'has_key is deprecated. Use has_attr("%s") instead.' % (
            key)
        warnings.warn(notice)
        return self.has_attr(key)
1888

1889
# Next, a couple classes to represent queries and their results.
1890
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. A
            non-dictionary value is treated as a filter on the 'class'
            attribute.
        :param text: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            keyword `class_` is treated as a filter on 'class'.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Work on a copy so that neither the caller's dictionary
                # nor the shared default argument is modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search value into a form _matches() can work with.

        :param value: A search value of any type.
        :return: `value` unchanged if it is a string, a callable, an
            object with a `match` attribute (such as a compiled regular
            expression), a boolean, or None; a decoded string for a
            bytestring; a list for any other iterable; otherwise
            `str(value)`.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The string form of the text filter when one is set,
            otherwise "<name>|<attrs>".
        """
        if self.text:
            # The text filter may be a compiled regular expression, a
            # list, a callable or True rather than a string, and
            # __str__ is required to return a real string.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a
            Tag object, in which case the Tag also supplies the
            attributes.
        :param markup_attrs: A dictionary of attributes as found in some
            markup, or an iterable of (key, value) pairs.

        :return: The Tag (or tag name) that was passed in if it matches
            this SoupStrainer; a false value otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name filter is handed the raw name and attributes
        # when no Tag object has been built yet.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # Build a dictionary out of the
                            # (key, value) pairs.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # A text filter, if present, must also match the string of
        # whatever was found.
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found

    # For BS3 compatibility.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None if nothing matched.
        :raise Exception: If `markup` is of a type this method does
            not know how to match against.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Check one piece of markup against one search criterion.

        :param markup: A Tag, a string, a list or tuple of strings (the
            value of a multi-valued attribute), or None.
        :param match_against: The criterion: True, a callable, a
            string, an object with a `search` method (such as a
            compiled regular expression), or an iterable of these.
        :param already_tried: The set of criteria already examined
            while working through an iterable; used to avoid infinite
            recursion.
        :return: A true value if `markup` satisfies `match_against`,
            a false value otherwise.
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # We didn't match any particular value of the multivalue
            # attribute, but maybe we match the attribute value when
            # considered as a string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, str)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            # Nothing in the iterable matched.
            return False

        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.

        # Exact string match
        match = isinstance(match_against, str) and markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

        if (not match
            and isinstance(original_markup, Tag)
            and original_markup.prefix):
            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
2144

2145

2146
class ResultSet(list):
    """A list of search results that also remembers the SoupStrainer
    that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        This only runs for attributes that normal lookup fails to
        find.

        :param key: The name of the missing attribute.
        :raise AttributeError: Always.
        """
        template = (
            "ResultSet object has no attribute '%s'. You're probably treating "
            "a list of elements like a single element. Did you call find_all() "
            "when you meant to call find()?"
        )
        raise AttributeError(template % key)
2163

2164
Product

Resources

Company