CoCalc -- __init_

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/__init__.py
⁸¹¹ views
"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".

http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.

Beautiful Soup works with Python 2.7 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
16

17
# Package metadata.
# NOTE: the author address had been replaced by a web e-mail-obfuscation
# placeholder ("[email protected]"); it is restored here.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.9.1"
__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# The only name exported by `from bs4 import *`.
__all__ = ['BeautifulSoup']
24

25
import os
26
import re
27
import sys
28
import traceback
29
import warnings
30

31
from .builder import builder_registry, ParserRejectedMarkup
32
from .dammit import UnicodeDammit
33
from .element import (
34
    CData,
35
    Comment,
36
    DEFAULT_OUTPUT_ENCODING,
37
    Declaration,
38
    Doctype,
39
    NavigableString,
40
    PageElement,
41
    ProcessingInstruction,
42
    PYTHON_SPECIFIC_ENCODINGS,
43
    ResultSet,
44
    Script,
45
    Stylesheet,
46
    SoupStrainer,
47
    Tag,
48
    TemplateString,
49
    )
50

51
# The very first thing we do is give a useful error if someone is
52
# running this code under Python 3 without converting it.
53
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
54

55
# Define some custom warnings.
56
class GuessedAtParserWarning(UserWarning):
    """Warning category used when BeautifulSoup picks a parser itself.

    This happens when BeautifulSoup has to guess which parser to use,
    most likely because the constructor call did not name one.
    """
60

61
class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when 'markup' looks like a locator instead.

    It is issued when the string handed to BeautifulSoup resembles a
    resource locator -- a URL or the path of a file on disk -- rather
    than actual markup.
    """
66

67

68
class BeautifulSoup(Tag):
    """The root object of a parsed HTML or XML document.

    Most of what you call on a BeautifulSoup object is inherited from
    PageElement or Tag.

    The class also defines the basic interface that tree builders use
    while turning an HTML/XML document into a data structure. That
    interface hides the differences between parsers; anyone writing a
    new tree builder needs to understand these methods as a whole.

    Called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    May be called by a tree builder from its feed() implementation:
      * handle_starttag(name, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    However complicated the underlying parser is, a tree can be built
    from 'start tag' events, 'end tag' events, 'data' events, and
    "done with data" events.

    For an empty-element tag (a self-closing tag such as HTML's <br>),
    call handle_starttag and then handle_endtag.
    """

    # BeautifulSoup subclasses Tag, so it can be treated as a Tag with
    # a .name. This particular name makes it obvious that the object is
    # not a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # Features to look for in a tree builder when the end-user gives no
    # indication of which one they want.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Every ASCII whitespace character; endData() uses this to detect
    # data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    # Template for the GuessedAtParserWarning issued by the constructor.
    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
113
    
114
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raises FeatureNotFound: If no tree builder is registered for
         the requested features.
        :raises ParserRejectedMarkup: If the builder rejects every
         version of the markup offered by prepare_markup().
        """
        # Beautiful Soup 3 arguments that no longer do anything. Each one
        # is removed from kwargs (so it is not handed to the TreeBuilder)
        # and reported with its own warning, in this order.
        ignored_bs3_arguments = (
            ('convertEntities',
             "BS4 does not respect the convertEntities argument to the "
             "BeautifulSoup constructor. Entities are always converted "
             "to Unicode characters."),
            ('markupMassage',
             "BS4 does not respect the markupMassage argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for any necessary markup massage."),
            ('smartQuotesTo',
             "BS4 does not respect the smartQuotesTo argument to the "
             "BeautifulSoup constructor. Smart quotes are always converted "
             "to Unicode characters."),
            ('selfClosingTags',
             "BS4 does not respect the selfClosingTags argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for understanding self-closing tags."),
            ('isHTML',
             "BS4 does not respect the isHTML argument to the "
             "BeautifulSoup constructor. Suggest you use "
             "features='lxml' for HTML and features='lxml-xml' for "
             "XML."),
        )
        for legacy_name, notice in ignored_bs3_arguments:
            if legacy_name in kwargs:
                del kwargs[legacy_name]
                warnings.warn(notice)

        def deprecated_argument(old_name, new_name):
            """Pop a renamed argument out of kwargs, warning about it."""
            if old_name not in kwargs:
                return None
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'has been renamed to "%s."' % (old_name, new_name))
            return kwargs.pop(old_name)

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                # It has to stay inline: sys._getframe(1) and stacklevel=2
                # both count frames relative to this constructor.
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(
                        self.NO_PARSER_SPECIFIED_WARNING % values,
                        GuessedAtParserWarning, stacklevel=2
                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        self.builder.initialize_soup(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go: this check is deliberately
                # best-effort and only feeds a warning.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
                    ' Beautiful Soup.' % self._decode_markup(markup),
                    MarkupResemblesLocatorWarning
                )
            self._check_markup_is_url(markup)

        # Try each candidate version of the markup offered by the
        # builder until one of them parses.
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                success = True
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)

        if not success:
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None
362

363
    def __copy__(self):
364
        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
365
        copy = type(self)(
366
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
367
        )
368

369
        # Although we encoded the tree to UTF-8, that may not have
370
        # been the encoding of the original markup. Set the copy's
371
        # .original_encoding to reflect the original object's
372
        # .original_encoding.
373
        copy.original_encoding = self.original_encoding
374
        return copy
375

376
    def __getstate__(self):
377
        # Frequently a tree builder can't be pickled.
378
        d = dict(self.__dict__)
379
        if 'builder' in d and not self.builder.picklable:
380
            d['builder'] = None
381
        return d
382

383
    @classmethod
384
    def _decode_markup(cls, markup):
385
        """Ensure `markup` is bytes so it's safe to send into warnings.warn.
386

387
        TODO: warnings.warn had this problem back in 2010 but it might not
388
        anymore.
389
        """
390
        if isinstance(markup, bytes):
391
            decoded = markup.decode('utf-8', 'replace')
392
        else:
393
            decoded = markup
394
        return decoded
395

396
    @classmethod
397
    def _check_markup_is_url(cls, markup):
398
        """Error-handling method to raise a warning if incoming markup looks
399
        like a URL.
400

401
        :param markup: A string.
402
        """
403
        if isinstance(markup, bytes):
404
            space = b' '
405
            cant_start_with = (b"http:", b"https:")
406
        elif isinstance(markup, str):
407
            space = ' '
408
            cant_start_with = ("http:", "https:")
409
        else:
410
            return
411

412
        if any(markup.startswith(prefix) for prefix in cant_start_with):
413
            if not space in markup:
414
                warnings.warn(
415
                    '"%s" looks like a URL. Beautiful Soup is not an'
416
                    ' HTTP client. You should probably use an HTTP client like'
417
                    ' requests to get the document behind the URL, and feed'
418
                    ' that document to Beautiful Soup.' % cls._decode_markup(
419
                        markup
420
                    ),
421
                    MarkupResemblesLocatorWarning
422
                )
423

424
    def _feed(self):
425
        """Internal method that parses previously set markup, creating a large
426
        number of Tag and NavigableString objects.
427
        """
428
        # Convert the document to Unicode.
429
        self.builder.reset()
430

431
        self.builder.feed(self.markup)
432
        # Close out any unfinished strings and close all the open tags.
433
        self.endData()
434
        while self.currentTag.name != self.ROOT_TAG_NAME:
435
            self.popTag()
436

437
    def reset(self):
        """Return this object to the state it had before parsing anything.

        The object is re-initialized as a hidden root Tag, the builder
        is reset, all parsing bookkeeping is cleared, and the object
        itself is pushed as the first entry of the tag stack.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()

        # Fresh bookkeeping for the next parse.
        self.currentTag = None
        self.current_data = []
        self.tagStack = []
        self.string_container_stack = []
        self.preserve_whitespace_tag_stack = []

        self.pushTag(self)
450

451
    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        :param name: The name of the new Tag.
        :param namespace: The URI of the new Tag's XML namespace, if any.
        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values; can
            be used instead of `kwattrs` for attributes like 'class'
            that are reserved words in Python. Entries here win over
            entries of the same name in `kwattrs`. None is treated like
            an empty dictionary.
        :param sourceline: The line number where this tag was
            (purportedly) found in its source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was (purportedly) found.
        :param kwattrs: Keyword arguments for the new Tag's attribute values.

        :return: An instance of Tag, or of the class registered for Tag
            in self.element_classes.
        """
        # `attrs` is only read, never mutated, so the shared default
        # dictionary in the signature is harmless. Skipping the update
        # for empty/None values keeps attrs=None from raising TypeError.
        if attrs:
            kwattrs.update(attrs)
        tag_class = self.element_classes.get(Tag, Tag)
        return tag_class(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )
473

474
    def string_container(self, base_class=None):
475
        container = base_class or NavigableString
476
        
477
        # There may be a general override of NavigableString.
478
        container = self.element_classes.get(
479
            container, container
480
        )
481

482
        # On top of that, we may be inside a tag that needs a special
483
        # container class.
484
        if self.string_container_stack:
485
            container = self.builder.string_containers.get(
486
                self.string_container_stack[-1].name, container
487
            )
488
        return container
489
        
490
    def new_string(self, s, subclass=None):
491
        """Create a new NavigableString associated with this BeautifulSoup
492
        object.
493
        """
494
        container = self.string_container(subclass)
495
        return container(s)
496

497
    def insert_before(self, successor):
498
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
499
        it because there is nothing before or after it in the parse tree.
500
        """
501
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
502

503
    def insert_after(self, successor):
504
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
505
        it because there is nothing before or after it in the parse tree.
506
        """
507
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
508

509
    def popTag(self):
510
        """Internal method called by _popToTag when a tag is closed."""
511
        tag = self.tagStack.pop()
512
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
513
            self.preserve_whitespace_tag_stack.pop()
514
        if self.string_container_stack and tag == self.string_container_stack[-1]:
515
            self.string_container_stack.pop()
516
        #print("Pop", tag.name)
517
        if self.tagStack:
518
            self.currentTag = self.tagStack[-1]
519
        return self.currentTag
520

521
    def pushTag(self, tag):
522
        """Internal method called by handle_starttag when a tag is opened."""
523
        #print("Push", tag.name)
524
        if self.currentTag is not None:
525
            self.currentTag.contents.append(tag)
526
        self.tagStack.append(tag)
527
        self.currentTag = self.tagStack[-1]
528
        if tag.name in self.builder.preserve_whitespace_tags:
529
            self.preserve_whitespace_tag_stack.append(tag)
530
        if tag.name in self.builder.string_containers:
531
            self.string_container_stack.append(tag)
532

533
    def endData(self, containerClass=None):
534
        """Method called by the TreeBuilder when the end of a data segment
535
        occurs.
536
        """
537
        containerClass = self.string_container(containerClass)
538
        
539
        if self.current_data:
540
            current_data = ''.join(self.current_data)
541
            # If whitespace is not preserved, and this string contains
542
            # nothing but ASCII spaces, replace it with a single space
543
            # or newline.
544
            if not self.preserve_whitespace_tag_stack:
545
                strippable = True
546
                for i in current_data:
547
                    if i not in self.ASCII_SPACES:
548
                        strippable = False
549
                        break
550
                if strippable:
551
                    if '\n' in current_data:
552
                        current_data = '\n'
553
                    else:
554
                        current_data = ' '
555

556
            # Reset the data collector.
557
            self.current_data = []
558

559
            # Should we add this string to the tree at all?
560
            if self.parse_only and len(self.tagStack) <= 1 and \
561
                   (not self.parse_only.text or \
562
                    not self.parse_only.search(current_data)):
563
                return
564

565
            o = containerClass(current_data)
566
            self.object_was_parsed(o)
567

568
    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The newly parsed object.
        :param parent: The element to append `o` to; defaults to the
            current tag.
        :param most_recent_element: The element to use as the previous
            element of `o`; defaults to the most recently parsed one.
        """
        if parent is None:
            parent = self.currentTag
        previous_element = (
            most_recent_element if most_recent_element is not None
            else self._most_recent_element
        )

        # A Tag may already carry links of its own; keep them.
        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # This has to be decided before `o` is linked in.
        inserting_into_parsed_node = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if inserting_into_parsed_node:
            self._linkage_fixer(parent)
595

596
    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound.

        :param el: The element whose last child was just appended.
        """
        contents = el.contents
        last_child = contents[-1]

        if last_child is contents[0] and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = last_child
            # We are no longer linked to whatever this element is
            former_previous = last_child.previous_element
            if former_previous is not None and former_previous is not el:
                former_previous.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            last_child.previous_element = el
            last_child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        last_child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(last_child, Tag) and last_child.contents:
            tail = last_child._last_descendant(False)
        else:
            tail = last_child

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        tail.next_element = None
        tail.next_sibling = None
        ancestor = el
        while ancestor is not None:
            following = ancestor.next_sibling
            if following is not None:
                tail.next_element = following
                following.previous_element = last_child
                break
            ancestor = ancestor.parent
635

636
    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
637
        """Pops the tag stack up to and including the most recent
638
        instance of the given tag. 
639

640
        :param name: Pop up to the most recent tag with this name.
641
        :param nsprefix: The namespace prefix that goes with `name`.
642
        :param inclusivePop: It this is false, pops the tag stack up
643
          to but *not* including the most recent instqance of the
644
          given tag.
645
        """
646
        #print("Popping to %s" % name)
647
        if name == self.ROOT_TAG_NAME:
648
            # The BeautifulSoup object itself can never be popped.
649
            return
650

651
        most_recently_popped = None
652

653
        stack_size = len(self.tagStack)
654
        for i in range(stack_size - 1, 0, -1):
655
            t = self.tagStack[i]
656
            if (name == t.name and nsprefix == t.prefix):
657
                if inclusivePop:
658
                    most_recently_popped = self.popTag()
659
                break
660
            most_recently_popped = self.popTag()
661

662
        return most_recently_popped
663

664
    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
665
                        sourcepos=None):
666
        """Called by the tree builder when a new tag is encountered.
667

668
        :param name: Name of the tag.
669
        :param nsprefix: Namespace prefix for the tag.
670
        :param attrs: A dictionary of attribute values.
671
        :param sourceline: The line number where this tag was found in its
672
            source document.
673
        :param sourcepos: The character position within `sourceline` where this
674
            tag was found.
675

676
        If this method returns None, the tag was rejected by an active
677
        SoupStrainer. You should proceed as if the tag had not occurred
678
        in the document. For instance, if this was a self-closing tag,
679
        don't call handle_endtag.
680
        """
681
        # print("Start tag %s: %s" % (name, attrs))
682
        self.endData()
683

684
        if (self.parse_only and len(self.tagStack) <= 1
685
            and (self.parse_only.text
686
                 or not self.parse_only.search_tag(name, attrs))):
687
            return None
688

689
        tag = self.element_classes.get(Tag, Tag)(
690
            self, self.builder, name, namespace, nsprefix, attrs,
691
            self.currentTag, self._most_recent_element,
692
            sourceline=sourceline, sourcepos=sourcepos
693
        )
694
        if tag is None:
695
            return tag
696
        if self._most_recent_element is not None:
697
            self._most_recent_element.next_element = tag
698
        self._most_recent_element = tag
699
        self.pushTag(tag)
700
        return tag
701

702
    def handle_endtag(self, name, nsprefix=None):
703
        """Called by the tree builder when an ending tag is encountered.
704

705
        :param name: Name of the tag.
706
        :param nsprefix: Namespace prefix for the tag.
707
        """
708
        #print("End tag: " + name)
709
        self.endData()
710
        self._popToTag(name, nsprefix)
711

712
    def handle_data(self, data):
713
        """Called by the tree builder when a chunk of textual data is encountered."""
714
        self.current_data.append(data)
715
       
716
    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of the parse tree
            as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
            make the document more readable.
        :param eventual_encoding: The encoding of the final document.
            If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to the parent class's
            decode().
        :return: The document as rendered by the parent class's decode(),
            preceded by an XML declaration when this is an XML document.
        """
        prefix = ''
        if self.is_xml:
            # Print the XML declaration
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
                # This is a special Python encoding; it can't actually
                # go into an XML document because it means nothing
                # outside of Python.
                eventual_encoding = None
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part

        # An indent level of None turns pretty-printing off; 0 starts
        # it at the left margin.
        indent_level = 0 if pretty_print else None
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
746

747
# Short aliases for BeautifulSoup that make it easier to get started
# quickly, e.g. 'from bs4 import _soup'.
_s = _soup = BeautifulSoup
750

751
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build an XML BeautifulSoup.

        Any `features` value supplied by the caller is replaced with
        'xml'; everything else is passed on unchanged.
        """
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        kwargs['features'] = 'xml'
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
760

761

762
class StopParsing(Exception):
    """Raised by a TreeBuilder that is unable to continue parsing."""
765

766
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no parser offering
    the requested features can be found.
    """
771

772

773
# When this file is run as a script it acts as an HTML pretty-printer:
# markup is read from standard input and the prettified document is
# written to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())
778

779
Product

Resources

Company