Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/builder/_lxml.py
811 views
1
# Use of this source code is governed by the MIT license.
2
__license__ = "MIT"

# Public API of this module: the XML-flavoured and HTML-flavoured
# lxml tree builders.
__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
]
8
9
try:
10
from collections.abc import Callable # Python 3.6
11
except ImportError as e:
12
from collections import Callable
13
14
from io import BytesIO
15
from io import StringIO
16
from lxml import etree
17
from bs4.element import (
18
Comment,
19
Doctype,
20
NamespacedAttribute,
21
ProcessingInstruction,
22
XMLProcessingInstruction,
23
)
24
from bs4.builder import (
25
FAST,
26
HTML,
27
HTMLTreeBuilder,
28
PERMISSIVE,
29
ParserRejectedMarkup,
30
TreeBuilder,
31
XML)
32
from bs4.dammit import EncodingDetector
33
34
# Feature/name string identifying lxml-backed builders; it appears in the
# XML builder's `features` list and is the HTML builder's `NAME`.
LXML = 'lxml'
35
36
def _invert(d):
37
"Invert a dictionary."
38
return dict((v,k) for k, v in list(d.items()))
39
40
class LXMLTreeBuilderForXML(TreeBuilder):
    """A `TreeBuilder` that parses XML with lxml.

    The builder passes itself to lxml as the parser *target*, so lxml
    calls `start`, `end`, `data`, `pi`, `doctype`, `comment` and `close`
    on it, and each of those forwards the event to `self.soup`.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Number of bytes/characters handed to the parser per feed() call.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in mapping.items():
            # Entries with a falsy prefix (the default namespace) are
            # skipped.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if callable(parser):
            # We were handed a class or factory rather than a parser
            # object; instantiate it with default arguments.
            parser = parser(
                target=self, strip_cdata=False, recover=True,
                encoding=encoding
            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        """Set up the builder.

        :param parser: Optional parser object or parser class that
            `default_parser` returns in place of a fresh
            `etree.XMLParser`.
        :param empty_element_tags: Optional iterable of tag names;
            when given it is stored as a set on the instance.
        :param kwargs: Passed on to the parent class constructor.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (URI -> prefix) namespace maps; the bottom
        # entry is always the default mapping.
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag):
        """Split the namespace URL out of a fully-qualified lxml tag name.

        Copied from lxml's src/lxml/sax.py.

        :param tag: A name, optionally in `{namespace}localname` form.
        :return: A `(namespace, localname)` tuple; `namespace` is None
            when `tag` does not start with `{`.
        """
        # Slice rather than index so an empty name doesn't raise
        # IndexError.
        if tag[:1] == '{':
            return tuple(tag[1:].split('}', 1))
        return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Feed markup to a freshly created parser, chunk by chunk.

        :param markup: A bytestring, a string, or an object with a
            `read(size)` method.
        :raise ParserRejectedMarkup: If creating or running the parser
            raises UnicodeDecodeError, LookupError or etree.ParserError.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while data:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if data:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Chain the original error so its traceback is preserved.
            raise ParserRejectedMarkup(e) from e

    def close(self):
        """Reset the namespace stack to just the default mapping."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap=None):
        """Handle an opening tag.

        :param name: The tag name, possibly in `{namespace}localname`
            form.
        :param attrs: A mapping of attribute names to values.
        :param nsmap: A mapping of namespace prefixes to URIs newly
            declared on this tag; None or empty when there are none.
        """
        if nsmap is None:
            nsmap = {}
        # Make sure attrs is a mutable dict--lxml may send an immutable
        # dictproxy. This is also a private copy, so it is safe to add
        # to below.
        attrs = dict(attrs)
        # Invert each namespace map as it comes in.
        if not nsmap and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif nsmap:
            # A new namespace mapping has come into play.

            # First, let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in attrs.items():
            namespace, attr = self._getNsTag(attr)
            if namespace is not None:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
            new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace.

        :param namespace: A namespace URI, or None.
        :return: The prefix from the innermost mapping that contains
            `namespace`, or None if `namespace` is None or unmapped.
        """
        if namespace is None:
            return None
        # Search from the most recently pushed mapping outwards; None
        # entries are placeholders for tags that declared nothing.
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Handle a closing tag.

        :param name: The tag name, possibly in `{namespace}localname`
            form.
        """
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """Handle a processing instruction.

        The text `target + ' ' + data` is stored as an object of
        `self.processing_instruction_class`.
        """
        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        """Pass character data through to the soup."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Handle a doctype declaration as a `Doctype` object."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
306
307
308
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The HTML flavour of the lxml tree builder.

    Inherits the parser-target callbacks from `LXMLTreeBuilderForXML`
    but uses `etree.HTMLParser` and hands the markup to the parser in a
    single call.
    """

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        """Return the parser class to use.

        :param encoding: Ignored here.
        :return: The `etree.HTMLParser` class, not an instance.
        """
        return etree.HTMLParser

    def feed(self, markup):
        """Run the whole of `markup` through a freshly created parser.

        :param markup: The markup to parse.
        :raise ParserRejectedMarkup: If creating or running the parser
            raises UnicodeDecodeError, LookupError or etree.ParserError.
        """
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            # Chain the original error so its traceback is preserved.
            raise ParserRejectedMarkup(e) from e

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment
333
334