Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/builder/_html5lib.py
811 views
1
# Use of this source code is governed by the MIT license.
2
__license__ = "MIT"
3
4
__all__ = [
5
'HTML5TreeBuilder',
6
]
7
8
import warnings
9
import re
10
from bs4.builder import (
11
PERMISSIVE,
12
HTML,
13
HTML_5,
14
HTMLTreeBuilder,
15
)
16
from bs4.element import (
17
NamespacedAttribute,
18
nonwhitespace_re,
19
)
20
import html5lib
21
from html5lib.constants import (
22
namespaces,
23
prefixes,
24
)
25
from bs4.element import (
26
Comment,
27
Doctype,
28
NavigableString,
29
Tag,
30
)
31
32
try:
33
# Pre-0.99999999
34
from html5lib.treebuilders import _base as treebuilder_base
35
new_html5lib = False
36
except ImportError as e:
37
# 0.99999999 and up
38
from html5lib.treebuilders import base as treebuilder_base
39
new_html5lib = True
40
41
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Build a Beautiful Soup tree by delegating parsing to html5lib.

    Some features common to HTML tree builders are not available here.
    They could in theory be added, but it is at least quite difficult
    because html5lib rearranges the parse tree while building it.

    * NavigableString subclasses are not chosen according to the name of
      the tag that contains the string.

    * A SoupStrainer cannot be used to parse only part of a document.
    """

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    # html5lib can report the line number and position in the original
    # file that an element came from.
    TRACKS_LINE_NUMBERS = True

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Remember the requested encoding and yield the markup unchanged.

        This is a generator that yields exactly one 4-tuple:
        ``(markup, None, None, False)``.

        :param markup: The document to parse; passed through as-is.
        :param user_specified_encoding: Stored on the builder so that
            `feed` can hand it to html5lib.
        :param document_declared_encoding: Not used by this builder.
        :param exclude_encodings: Not supported; a truthy value only
            triggers a warning.
        """
        # Kept for feed(), which passes it on to html5lib.
        self.user_specified_encoding = user_specified_encoding

        # Neither document_declared_encoding nor exclude_encodings is
        # used at the moment, because this tree builder does not go
        # through UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib and record the detected encoding.

        :param markup: A str, or a non-str object (presumably bytes --
            TODO confirm) for which the user-specified encoding is
            forwarded to html5lib.
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")

        # Constructing the parser is expected to invoke
        # create_treebuilder, which is what sets self.underlying_builder.
        html_parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = html_parser

        markup_is_text = isinstance(markup, str)
        parse_options = {}
        if not markup_is_text:
            # The keyword carrying the encoding depends on which
            # html5lib generation was importable at module load time.
            option_name = 'override_encoding' if new_html5lib else 'encoding'
            parse_options[option_name] = self.user_specified_encoding
        document = html_parser.parse(markup, **parse_options)

        # Record the character encoding detected by the tokenizer.
        if markup_is_text:
            # Special case: html5lib sets charEncoding to UTF-8 when it
            # is given Unicode input, so report no encoding instead.
            detected_encoding = None
        else:
            detected_encoding = html_parser.tokenizer.stream.charEncoding[0]
            if not isinstance(detected_encoding, str):
                # In 0.99999999 and up the encoding is an html5lib
                # Encoding object; use its name so the value is a
                # string, as with other tree builders.
                detected_encoding = detected_encoding.name
        document.original_encoding = detected_encoding

        # Parsing is finished; drop the reference to the parser.
        self.underlying_builder.parser = None

    def create_treebuilder(self, namespaceHTMLElements):
        """Create, remember and return the html5lib-facing tree builder.

        :param namespaceHTMLElements: Forwarded unchanged to
            `TreeBuilderForHtml5lib`.
        """
        builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        self.underlying_builder = builder
        return builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><head></head><body>%s</body></html>' % fragment
114
115
116
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """html5lib tree builder whose nodes wrap Beautiful Soup objects."""

    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Attach to `soup`, or create an empty BeautifulSoup to build into.

        :param namespaceHTMLElements: Forwarded to the html5lib base class.
        :param soup: The BeautifulSoup object to populate. When falsy, a
            new empty one is created.
        :param store_line_numbers: Whether `elementClass` should record
            source positions on new tags.
        :param kwargs: Only used when a new BeautifulSoup is created.
        """
        if not soup:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        # self.soup is assigned before the base-class initializer runs,
        # as in the original ordering.
        self.soup = soup

        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super().__init__(namespaceHTMLElements)

        # Set later to an html5lib.html5parser.HTMLParser object, which
        # is used to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Create a Doctype from `token` and hand it to the soup.

        :param token: Mapping with "name", "publicId" and "systemId" keys.
        """
        doctype = Doctype.for_name_and_ids(
            token["name"], token["publicId"], token["systemId"])
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag and return it wrapped in an `Element`.

        :param name: Tag name.
        :param namespace: Tag namespace, or None.
        """
        position_kwargs = {}
        if self.parser and self.store_line_numbers:
            # The stream position is the point immediately after the end
            # of the tag. Where the tag started is unknown, but it ended
            # on the character just before this one, hence the -1.
            line, column = self.parser.tokenizer.stream.position()
            position_kwargs = {'sourceline': line, 'sourcepos': column - 1}
        new_tag = self.soup.new_tag(name, namespace, **position_kwargs)
        return Element(new_tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a `TextNode` wrapping a Comment built from `data`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace the soup with a fresh fragment container and wrap it."""
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        fragment_soup = BeautifulSoup("", "html.parser")
        fragment_soup.name = "[document_fragment]"
        self.soup = fragment_soup
        return Element(fragment_soup, fragment_soup, None)

    def appendChild(self, node):
        """Append the object wrapped by `node` directly to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the BeautifulSoup object being built."""
        return self.soup

    def getFragment(self):
        """Return the object wrapped by the base class's fragment node."""
        fragment_node = treebuilder_base.TreeBuilder.getFragment(self)
        return fragment_node.element

    def testSerializer(self, element):
        """Render `element` and its descendants as an indented text dump.

        Each node becomes one line beginning with ``|``; children and
        attributes are indented two further spaces. Lines are joined
        with newlines.

        :param element: The root object to serialize.
        :return: The serialized tree as a single string.
        """
        lines = []
        doctype_pattern = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def describe_doctype(node, pad):
            # Build the single output line for a Doctype node.
            match = doctype_pattern.match(node)
            if not match:
                return "|%s<!DOCTYPE >" % (pad,)
            doctype_name = match.group(1)
            if match.lastindex > 1:
                public_id = match.group(2) or ""
                system_id = match.group(3) or match.group(4) or ""
                return """|%s<!DOCTYPE %s "%s" "%s">""" % (
                    pad, doctype_name, public_id, system_id)
            return "|%s<!DOCTYPE %s>" % (pad, doctype_name)

        def walk(node, indent=0):
            pad = ' ' * indent

            # Doctype and Comment are tested before NavigableString, in
            # the same order as the original if/elif chain.
            if isinstance(node, Doctype):
                lines.append(describe_doctype(node, pad))
                return
            if isinstance(node, Comment):
                lines.append("|%s<!-- %s -->" % (pad, node))
                return
            if isinstance(node, NavigableString):
                lines.append("|%s\"%s\"" % (pad, node))
                return

            # Anything else (including the document root) is handled as
            # a tag.
            if node.namespace:
                label = "%s %s" % (prefixes[node.namespace], node.name)
            else:
                label = node.name
            lines.append("|%s<%s>" % (pad, label))

            child_indent = indent + 2
            if node.attrs:
                attr_pad = ' ' * child_indent
                rendered = []
                for attr_name, attr_value in list(node.attrs.items()):
                    if isinstance(attr_name, NamespacedAttribute):
                        attr_name = "%s %s" % (
                            prefixes[attr_name.namespace], attr_name.name)
                    if isinstance(attr_value, list):
                        attr_value = " ".join(attr_value)
                    rendered.append((attr_name, attr_value))

                for attr_name, attr_value in sorted(rendered):
                    lines.append(
                        '|%s%s="%s"' % (attr_pad, attr_name, attr_value))

            for child in node.children:
                walk(child, child_indent)

        walk(element, 0)

        return "\n".join(lines)
235
236
class AttrList(object):
    """Mapping-like view of a tag's attributes, as used by html5lib.

    Reads are served from a copy of the element's attributes taken at
    construction time. Writes go to the element itself and are mirrored
    into that copy, so a value that was just set can be read back.
    """

    def __init__(self, element):
        """Wrap `element` and snapshot its current attributes.

        :param element: Object exposing `.attrs` (a mapping), `.name`,
            `.cdata_list_attributes` and item assignment.
        """
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs of the snapshot."""
        # Iterate over a copy so callers may assign while iterating.
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set attribute `name` on the wrapped element.

        If the attribute is listed as multi-valued for every tag (the
        ``'*'`` entry) or for this tag's name, a non-list value is split
        into a list of its non-whitespace runs first.
        """
        # cdata_list_attributes may be None or may lack a '*' entry;
        # treat both as "no multi-valued attributes" instead of raising.
        list_attr = self.element.cdata_list_attributes or {}
        if (name in list_attr.get('*', ())
                or name in list_attr.get(self.element.name, ())):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = nonwhitespace_re.findall(value)
        self.element[name] = value
        # Keep the snapshot consistent with what was just written, so
        # __getitem__/__contains__/__len__ reflect the assignment.
        self.attrs[name] = value

    def items(self):
        """Return the snapshot's ``(name, value)`` pairs as a list."""
        return list(self.attrs.items())

    def keys(self):
        """Return the snapshot's attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self):
        """Return the number of attributes in the snapshot."""
        return len(self.attrs)

    def __getitem__(self, name):
        """Return the value of `name`; raises KeyError if absent."""
        return self.attrs[name]

    def __contains__(self, name):
        """Return True if `name` is an attribute in the snapshot."""
        # Direct dict membership; no need to build a list of keys.
        return name in self.attrs
264
265
266
class Element(treebuilder_base.Node):
    """html5lib node that wraps a Beautiful Soup object.

    html5lib drives tree construction through this interface; each
    method translates the request into operations on the wrapped
    Beautiful Soup object held in `self.element`.
    """

    def __init__(self, element, soup, namespace):
        """Wrap `element`.

        :param element: The Beautiful Soup object to wrap; its `.name`
            is passed to the html5lib base node.
        :param soup: The BeautifulSoup object that owns the tree.
        :param namespace: Namespace of the element, or None.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of this element.

        :param node: A wrapper with an `.element` attribute, or a bare
            str or Tag passed in directly.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            # Exact class comparison rather than isinstance, so
            # presumably subclasses of NavigableString are not treated
            # as mergeable text -- TODO confirm.
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            # Detach the child from wherever it currently lives. Use
            # `child` rather than `node.element`: in the bare-Tag branch
            # above, `node` is the Tag itself, not a wrapper around it.
            child.extract()

        if (string_child is not None and self.element.contents
                and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return an attribute view for the wrapped object.

        A wrapped Comment yields an empty dict; anything else yields an
        `AttrList` over the wrapped object.
        """
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped object.

        Tuple keys are replaced in `attributes` by NamespacedAttribute
        keys, so the mapping passed in is modified. None or an empty
        mapping is a no-op.

        :param attributes: Mapping of attribute name to value, or None.
        """
        if attributes is None or len(attributes) == 0:
            return

        # Iterate over a copy because keys are replaced during the loop.
        for name, value in list(attributes.items()):
            if isinstance(name, tuple):
                new_name = NamespacedAttribute(*name)
                del attributes[name]
                attributes[new_name] = value

        self.soup.builder._replace_cdata_list_attribute_values(
            self.name, attributes)
        for name, value in list(attributes.items()):
            self.element[name] = value

        # The attributes may contain variables that need substitution.
        # Call set_up_substitutions manually.
        #
        # The Tag constructor called this method when the Tag was created,
        # but we just set/changed the attributes, so call it again.
        self.soup.builder.set_up_substitutions(self.element)

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` or at the end.

        :param data: The text to insert.
        :param insertBefore: Optional wrapper of an existing child; when
            truthy the text is placed before it, otherwise appended.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the existing child `refNode`.

        If `node` wraps a plain NavigableString and the child just
        before `refNode` is also one, the two strings are merged into a
        single new string instead of inserting a second text node.
        """
        index = self.element.index(refNode.element)
        # Only look at the preceding sibling when there is one. With
        # index == 0, contents[index-1] would wrap around to the last
        # child and could merge the text into the wrong node.
        if (index > 0
                and node.element.__class__ == NavigableString
                and self.element.contents
                and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag.

        The children are appended after any existing children of
        `new_parent`, and the next/previous element and sibling links
        are rewired by hand.

        :param new_parent: Wrapper of the tag that receives the children.
        """
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element is not None:
                # TODO: This code has no test coverage and I'm not sure
                # how to get html5lib to go through this path, but it's
                # just the other side of the previous line.
                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new `Element` with the same name, namespace and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped object's contents; truthy when non-empty."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, using the HTML namespace when unset."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
459
460
class TextNode(Element):
    """Node wrapping a string-like Beautiful Soup object rather than a tag."""

    def __init__(self, element, soup):
        """Wrap `element`; the html5lib base node is given no name.

        :param element: The Beautiful Soup object to wrap.
        :param soup: The BeautifulSoup object that owns the tree.
        """
        # Initialise the html5lib base node directly with a name of
        # None; Element.__init__ is deliberately not called.
        treebuilder_base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError
468
469