# Book a Demo!
# CoCalc Logo Icon
# Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
# hhhrrrttt222111
# GitHub Repository: hhhrrrttt222111/Dorkify
# Path: blob/master/venv/Lib/site-packages/bs4/__init__.py
# 811 views
"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".

http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.

Beautiful Soup works with Python 2.7 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

# NOTE(review): the scraped copy carried an e-mail-obfuscation placeholder
# ("[email protected]") here; restored to the upstream value -- confirm.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.9.1"
__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

# Only the main entry point is exported by `from bs4 import *`.
__all__ = ['BeautifulSoup']
24
25
import os
26
import re
27
import sys
28
import traceback
29
import warnings
30
31
from .builder import builder_registry, ParserRejectedMarkup
32
from .dammit import UnicodeDammit
33
from .element import (
34
CData,
35
Comment,
36
DEFAULT_OUTPUT_ENCODING,
37
Declaration,
38
Doctype,
39
NavigableString,
40
PageElement,
41
ProcessingInstruction,
42
PYTHON_SPECIFIC_ENCODINGS,
43
ResultSet,
44
Script,
45
Stylesheet,
46
SoupStrainer,
47
Tag,
48
TemplateString,
49
)
50
51
# Historical guard against running unconverted Python 2 source under
# Python 3. In this form the statement merely compares two string
# literals and throws the result away, so it has no runtime effect; the
# literals document the problem it used to flag. (Presumably the Python 2
# source used an operator that is a syntax error under Python 3, which is
# what surfaced the message -- confirm against upstream.)
(
    'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'
    != 'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
)
54
55
# Define some custom warnings.
class GuessedAtParserWarning(UserWarning):
    """The warning issued when BeautifulSoup has to guess what parser to
    use -- probably because no parser was specified in the constructor.
    """
60
61
class MarkupResemblesLocatorWarning(UserWarning):
    """The warning issued when BeautifulSoup is given 'markup' that
    actually looks like a resource locator -- a URL or a path to a file
    on disk.
    """
66
67
68
class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * _feed()

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raise FeatureNotFound: If no tree builder matches `features`.
        :raise ParserRejectedMarkup: If every markup/encoding candidate
         offered by the builder is rejected by the parser.
        """
        # Beautiful Soup 3 arguments that are accepted, warned about and
        # then discarded so they never reach the TreeBuilder constructor.
        ignored_bs3_arguments = (
            ('convertEntities',
             "BS4 does not respect the convertEntities argument to the "
             "BeautifulSoup constructor. Entities are always converted "
             "to Unicode characters."),
            ('markupMassage',
             "BS4 does not respect the markupMassage argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for any necessary markup massage."),
            ('smartQuotesTo',
             "BS4 does not respect the smartQuotesTo argument to the "
             "BeautifulSoup constructor. Smart quotes are always converted "
             "to Unicode characters."),
            ('selfClosingTags',
             "BS4 does not respect the selfClosingTags argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for understanding self-closing tags."),
            ('isHTML',
             "BS4 does not respect the isHTML argument to the "
             "BeautifulSoup constructor. Suggest you use "
             "features='lxml' for HTML and features='lxml-xml' for "
             "XML."),
        )
        for ignored_name, message in ignored_bs3_arguments:
            if ignored_name in kwargs:
                del kwargs[ignored_name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            """Pop a renamed BS3 keyword from kwargs, warning about it."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ):
                markup_type = "XML" if builder.is_xml else "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                #
                # Keep this inline: both sys._getframe(1) and stacklevel=2
                # below are relative to this constructor's own frame.
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller is not None:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        # Report the source file rather than the bytecode file.
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(
                        self.NO_PARSER_SPECIFIED_WARNING % values,
                        GuessedAtParserWarning, stacklevel=2
                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        self.builder.initialize_soup(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, str)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
                    ' Beautiful Soup.' % self._decode_markup(markup),
                    MarkupResemblesLocatorWarning
                )
            self._check_markup_is_url(markup)

        # Try each (markup, encoding) candidate the builder offers until
        # one of them parses; remember why the others were rejected.
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                success = True
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)

        if not success:
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def __copy__(self):
        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
        copy = type(self)(
            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
        )

        # Although we encoded the tree to UTF-8, that may not have
        # been the encoding of the original markup. Set the copy's
        # .original_encoding to reflect the original object's
        # .original_encoding.
        copy.original_encoding = self.original_encoding
        return copy

    def __getstate__(self):
        """Return a copy of the instance dict suitable for pickling.

        A tree builder frequently can't be pickled, so a builder that
        reports itself as not picklable is replaced with None.
        """
        d = dict(self.__dict__)
        builder = d.get('builder')
        # The builder may already be None (this method sets it to None,
        # so an unpickled object has none); only consult `picklable`
        # when there is a builder to ask.
        if builder is not None and not builder.picklable:
            d['builder'] = None
        return d

    @classmethod
    def _decode_markup(cls, markup):
        """Ensure `markup` is a Unicode string so it's safe to send into
        warnings.warn.

        Bytes are decoded as UTF-8 with undecodable bytes replaced;
        anything else is returned unchanged.

        TODO: warnings.warn had this problem back in 2010 but it might not
        anymore.
        """
        if isinstance(markup, bytes):
            return markup.decode('utf-8', 'replace')
        return markup

    @classmethod
    def _check_markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        :param markup: A string or bytestring; other types are ignored.
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
            return

        # A URL has a scheme prefix and contains no spaces.
        if markup.startswith(cant_start_with) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an'
                ' HTTP client. You should probably use an HTTP client like'
                ' requests to get the document behind the URL, and feed'
                ' that document to Beautiful Soup.' % cls._decode_markup(
                    markup
                ),
                MarkupResemblesLocatorWarning
            )

    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
        """
        # Start the builder from a clean state, then let it drive the
        # handle_* callbacks on this object.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a state as though it had never parsed any
        markup.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        self.string_container_stack = []
        # The constructor calls reset() again before every retry; without
        # this, a retry would link the new tree to the last element of
        # the rejected parse.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        :param name: The name of the new Tag.
        :param namespace: The URI of the new Tag's XML namespace, if any.
        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values; can
         be used instead of `kwattrs` for attributes like 'class'
         that are reserved words in Python.
        :param sourceline: The line number where this tag was
         (purportedly) found in its source document.
        :param sourcepos: The character position within `sourceline` where this
         tag was (purportedly) found.
        :param kwattrs: Keyword arguments for the new Tag's attribute values.

        """
        # `attrs` is only read here (its entries win over `kwattrs`), so
        # the shared default dict is never mutated.
        kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

    def string_container(self, base_class=None):
        """Choose the class used to hold a string in the current context.

        :param base_class: The preferred string class; NavigableString
         is used when this is not given.
        :return: The class to instantiate, after applying
         `element_classes` overrides and any container registered by
         the builder for the innermost open string-container tag.
        """
        container = base_class or NavigableString

        # There may be a general override of NavigableString.
        container = self.element_classes.get(
            container, container
        )

        # On top of that, we may be inside a tag that needs a special
        # container class.
        if self.string_container_stack:
            container = self.builder.string_containers.get(
                self.string_container_stack[-1].name, container
            )
        return container

    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this BeautifulSoup
        object.

        :param s: The string content.
        :param subclass: Optional string class to prefer over
         NavigableString; see string_container().
        """
        container = self.string_container(subclass)
        return container(s)

    def insert_before(self, successor):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Internal method called by _popToTag when a tag is closed.

        :return: The tag that is current after the pop.
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.string_container_stack and tag == self.string_container_stack[-1]:
            self.string_container_stack.pop()
        #print("Pop", tag.name)
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Internal method called by handle_starttag when a tag is opened."""
        #print("Push", tag.name)
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
        if tag.name in self.builder.string_containers:
            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
        """Method called by the TreeBuilder when the end of a data segment
        occurs.

        :param containerClass: Optional string class for the finished
         data node; resolved through string_container().
        """
        containerClass = self.string_container(containerClass)

        if not self.current_data:
            return

        current_data = ''.join(self.current_data)
        # If whitespace is not preserved, and this string contains
        # nothing but ASCII spaces, replace it with a single space
        # or newline.
        if (not self.preserve_whitespace_tag_stack
                and not current_data.strip(self.ASCII_SPACES)):
            current_data = '\n' if '\n' in current_data else ' '

        # Reset the data collector.
        self.current_data = []

        # Should we add this string to the tree at all?
        if (self.parse_only and len(self.tagStack) <= 1
                and (not self.parse_only.text
                     or not self.parse_only.search(current_data))):
            return

        o = containerClass(current_data)
        self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The element to add.
        :param parent: The tag to append `o` to; defaults to the
         current tag.
        :param most_recent_element: The element to use as `o`'s
         previous element; defaults to the most recently parsed one.
        """
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry linkage; keep it.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # Must be evaluated before setup() and append() change the links.
        fix = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if fix:
            self._linkage_fixer(parent)

    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound.

        :param el: The tag whose last child was just appended.
        """

        first = el.contents[0]
        child = el.contents[-1]
        descendant = child

        if child is first and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = child
            # We are no longer linked to whatever this element is
            prev_el = child.previous_element
            if prev_el is not None and prev_el is not el:
                prev_el.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            child.previous_element = el
            child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(child, Tag) and child.contents:
            descendant = child._last_descendant(False)

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        descendant.next_element = None
        descendant.next_sibling = None
        target = el
        while True:
            if target is None:
                break
            elif target.next_sibling is not None:
                descendant.next_element = target.next_sibling
                # NOTE(review): this back-link points at `child`, not at
                # `descendant`; kept as-is -- confirm this is intended.
                target.next_sibling.previous_element = child
                break
            target = target.parent

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag.

        :param name: Pop up to the most recent tag with this name.
        :param nsprefix: The namespace prefix that goes with `name`.
        :param inclusivePop: If this is false, pops the tag stack up
          to but *not* including the most recent instance of the
          given tag.
        :return: The value returned by the last popTag() call, or None
          if nothing was popped.
        """
        #print("Popping to %s" % name)
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Index 0 is the BeautifulSoup object itself and is never visited.
        # Each iteration pops once, so index i is always the stack top.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag, passed through to the
         Tag constructor.
        :param nsprefix: Namespace prefix for the tag.
        :param attrs: A dictionary of attribute values.
        :param sourceline: The line number where this tag was found in its
         source document.
        :param sourcepos: The character position within `sourceline` where this
         tag was found.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        # print("Start tag %s: %s" % (name, attrs))
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
                and (self.parse_only.text
                     or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos
        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Called by the tree builder when an ending tag is encountered.

        :param name: Name of the tag.
        :param nsprefix: Namespace prefix for the tag.
        """
        #print("End tag: " + name)
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of the parse tree
            as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
            make the document more readable.
        :param eventual_encoding: The encoding of the final document.
            If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to Tag.decode().
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
                # This is a special Python encoding; it can't actually
                # go into an XML document because it means nothing
                # outside of Python.
                eventual_encoding = None
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        indent_level = 0 if pretty_print else None
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
746
747
# Short aliases for BeautifulSoup, e.g. 'from bs4 import _soup'.
_s = _soup = BeautifulSoup
750
751
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build an XML soup.

        Any `features` value supplied by the caller is overwritten
        with 'xml'; all other arguments go to BeautifulSoup unchanged.
        """
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
760
761
762
class StopParsing(Exception):
    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
765
766
class FeatureNotFound(ValueError):
    """Exception raised by the BeautifulSoup constructor if no parser with the
    requested features is found.
    """
771
772
773
# If this file is run as a script, act as an HTML pretty-printer:
# parse standard input and print the prettified document.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())
778
779