Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/builder/__init__.py
811 views
1
# Use of this source code is governed by the MIT license.
2
__license__ = "MIT"
3
4
from collections import defaultdict
5
import itertools
6
import sys
7
from bs4.element import (
8
CharsetMetaAttributeValue,
9
ContentMetaAttributeValue,
10
Stylesheet,
11
Script,
12
TemplateString,
13
nonwhitespace_re
14
)
15
16
# Public names of this module. This has to remain a mutable list:
# register_treebuilders_from() appends the names of parser-specific
# builders to it.
__all__ = [
    "HTMLTreeBuilder",
    "SAXTreeBuilder",
    "TreeBuilder",
    "TreeBuilderRegistry",
]
22
23
# Feature names that a TreeBuilder may list in its `features` attribute
# and that can be passed to TreeBuilderRegistry.lookup().
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"
XML = "xml"
HTML = "html"
HTML_5 = "html5"
30
31
32
class TreeBuilderRegistry(object):
    """Index of TreeBuilder subclasses, searchable by advertised feature.

    Builders are kept newest-first, so a builder registered later takes
    priority over one registered earlier.
    """

    def __init__(self):
        # feature name -> builders advertising it, newest first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Add a builder class to the registry.

        :param treebuilder_class: A subclass of TreeBuilder. Its
         `features` attribute lists the feature names it is filed under.
        """
        for feature_name in treebuilder_class.features:
            # Index 0 is the highest-priority slot.
            self.builders_for_feature[feature_name].insert(
                0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the highest-priority builder that has the given features.

        Features that no registered builder advertises are skipped
        rather than treated as a mismatch; the lookup only fails on
        their account when none of the requested features is known.

        :param features: Feature names to look for. If none are
         provided, the most recently registered builder is used.
        :return: A TreeBuilder subclass, or None if the registry is
         empty, no requested feature is known, or no single builder
         has every known requested feature.
        """
        if not self.builders:
            # Nothing has been registered at all.
            return None

        if not features:
            # No preference stated: hand back the newest builder.
            return self.builders[0]

        # `ranked` is the priority-ordered builder list for the first
        # recognized feature; `allowed` narrows with every further
        # recognized feature.
        ranked = None
        allowed = None
        for feature_name in features:
            matching = self.builders_for_feature.get(feature_name, [])
            if not matching:
                continue
            if ranked is None:
                ranked = matching
                allowed = set(matching)
            else:
                allowed = allowed.intersection(matching)

        if allowed is None:
            return None

        # Walk the ordered list so the winner is the highest-priority
        # builder that survived every intersection.
        return next(
            (builder for builder in ranked if builder in allowed), None)


# The registry that the BeautifulSoup class consults when a developer
# asks for a parser by feature list.
builder_registry = TreeBuilderRegistry()
100
101
class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False

    # None means "no opinion": a tag is considered an empty-element tag
    # when and only when it has no contents. Subclasses may set this to
    # a collection of tag names.
    empty_element_tags = None

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "argument not supplied". It is distinct from None
    # because None is a meaningful value for some constructor arguments.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
    ):
        """Constructor.

        Every argument left at USE_DEFAULT is replaced by the
        corresponding class-level default.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None

        # The sentinel is always tested by identity. An equality test
        # would hand control to the argument's own __eq__, which could
        # wrongly report a match and discard the caller's value.
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes

        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags

        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers

        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None or contains
         `tag_name`; False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :return: None.
        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document. This base
         implementation returns the fragment unchanged.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            # Attributes listed under '*' apply to every tag; the other
            # keys are lower-cased tag names.
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        values = nonwhitespace_re.findall(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
321
322
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, but it demonstrates
    how a simple TreeBuilder would work.
    """

    def feed(self, markup):
        """Not implemented; always raises NotImplementedError."""
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Forward a start-tag event to the soup.

        :param name: The tag name.
        :param attrs: A mapping of attributes. Only element [1] of each
         key is kept as the attribute name.
        """
        flattened = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, flattened)

    def endElement(self, name):
        """Forward an end-tag event to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle a namespaced start tag; `nsTuple` is discarded for now."""
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle a namespaced end tag; `nsTuple` is discarded for now."""
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Ignore the prefix mapping for now."""
        pass

    def endPrefixMapping(self, prefix):
        """Ignore the end of the prefix mapping for now."""
        pass

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing."""
        pass

    def endDocument(self):
        """Do nothing."""
        pass
370
371
372
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = {
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track',
        'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex',
        'nextid', 'spacer',
    }

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = {
        "address", "article", "aside", "blockquote", "canvas", "dd",
        "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer",
        "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li",
        "main", "nav", "noscript", "ol", "output", "p", "pre", "section",
        "table", "tfoot", "ul", "video",
    }

    # The HTML standard defines an unusual content model for these tags.
    # We represent this by using a string class other than NavigableString
    # inside these tags.
    #
    # I made this list by going through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
    }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = {'pre', 'textarea'}

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed. This is
         True for both the HTML 5 `charset` form and the HTML 4
         `http-equiv`/`content` form.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        substituted = False
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            substituted = True

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            substituted = True

        return substituted
476
477
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of `bs4.builder`, added to its
    `__all__`, and registered with its `builder_registry`. Exported
    names that are not TreeBuilder subclasses are skipped.

    :param module: A module object that defines `__all__`.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not
        # a class, so rule out non-class exports first.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        # Don't list the same name twice if a module is registered again.
        if name not in this_module.__all__:
            this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
489
490
class ParserRejectedMarkup(Exception):
    """Raised when the underlying parser simply refuses to parse the
    given markup.
    """

    def __init__(self, message_or_exception):
        """Record why the parser rejected the markup.

        :param message_or_exception: Either a textual explanation, or
         another exception. An exception is turned into the message
         "<ExceptionClassName>: <str(exception)>".
        """
        if isinstance(message_or_exception, Exception):
            cause = message_or_exception
            explanation = "%s: %s" % (cause.__class__.__name__, str(cause))
        else:
            explanation = message_or_exception
        super().__init__(explanation)
502
503
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# The order of the statements below is therefore significant:
# TreeBuilderRegistry.register() inserts each builder at the front of
# its lists, so whichever module is registered last wins a lookup.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
521
522