Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/builder/_htmlparser.py
811 views
1
# encoding: utf-8
2
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
3
4
# Use of this source code is governed by the MIT license.
5
__license__ = "MIT"
6
7
__all__ = [
8
'HTMLParserTreeBuilder',
9
]
10
11
from html.parser import HTMLParser
12
13
try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder
    # so that `except HTMLParseError` clauses elsewhere in this module
    # remain valid.
    class HTMLParseError(Exception):
        """Placeholder for the exception class removed from html.parser."""
20
21
import sys
22
import warnings
23
24
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
25
# argument, which we'd like to set to False. Unfortunately,
26
# http://bugs.python.org/issue13273 makes strict=True a better bet
27
# before Python 3.2.3.
28
#
29
# At the end of this file, we monkeypatch HTMLParser so that
30
# strict=True works well on Python 3.2.2.
31
# Split the running interpreter's version into its three leading
# components; `major` and `minor` are consulted again at the end of
# this file.
major, minor, release = sys.version_info[:3]

# Only 3.2.3 and later 3.2.x releases accept a usable strict=False.
CONSTRUCTOR_TAKES_STRICT = (major, minor) == (3, 2) and release >= 3

# On 3.3 the 'strict' argument still exists but is deprecated.
CONSTRUCTOR_STRICT_IS_DEPRECATED = (major, minor) == (3, 3)

# From 3.4 onwards the constructor takes 'convert_charrefs'.
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
35
36
37
from bs4.element import (
38
CData,
39
Comment,
40
Declaration,
41
Doctype,
42
ProcessingInstruction,
43
)
44
from bs4.dammit import EntitySubstitution, UnicodeDammit
45
46
from bs4.builder import (
47
HTML,
48
HTMLTreeBuilder,
49
STRICT,
50
)
51
52
53
HTMLPARSER = 'html.parser'
54
55
class BeautifulSoupHTMLParser(HTMLParser):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.

    The object that receives those calls must be assigned to the
    `soup` attribute before any markup is fed to the parser.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.

        All other positional and keyword arguments are passed through
        to the HTMLParser constructor.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.

        :param msg: The message to emit as a warning.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
            (name, value) pairs.
        """
        # handle_empty_element=False tells handle_starttag not to close
        # the tag just because its name matches a known empty-element
        # tag. We know that this is an empty-element tag and we want to
        # call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
            (name, value) pairs. A value of None is converted to the
            empty string.
        :param handle_empty_element: If True (the default) and the
            resulting tag turns out to be an empty-element tag, the
            tag is closed immediately and its name is remembered so
            that a later explicit closing tag can be ignored.
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    # Keep the earliest value.
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    # Anything else is treated as a callable strategy.
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: If True (the default), a closing
            tag whose name is on the list of already-closed
            empty-element tags is crossed off that list instead of
            being passed on to the tree builder.
        """
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        If no character can be produced for the number, U+FFFD
        REPLACEMENT CHARACTER is used instead.

        :param name: Character number, possibly in hexadecimal.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    # Deliberate best effort: this byte is not valid in
                    # this encoding, so move on to the next candidate.
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                # Not a valid code point; the replacement character
                # is substituted below.
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        The leading "DOCTYPE" keyword (in any case) and the single
        whitespace character after it, if present, are removed before
        the remainder is stored.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        keyword = "DOCTYPE"
        if data[:len(keyword)].upper() == keyword:
            data = data[len(keyword):]
            # Only drop a separator that is really there, so that
            # markup like <!DOCTYPEhtml> keeps the whole name instead
            # of losing its first character.
            if data[:1].isspace():
                data = data[1:]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
292
293
294
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked. The mapping is copied; the caller's object is
            never modified.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        # Work on a private copy so that the entries added below do not
        # leak into a dictionary owned (and possibly reused) by the caller.
        parser_kwargs = dict(parser_kwargs or {})
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):

        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.
        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        :param markup: The markup to parse.
        :raise HTMLParseError: Re-raised, after a RuntimeWarning has
            been issued, if the underlying parser raises it.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise propagates the original exception unchanged.
            raise
        parser.already_closed_empty_element = []
384
385
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
386
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
387
# string.
388
#
389
# XXX This code can be removed once most Python 3 users are on 3.2.3.
390
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Replacement for HTMLParser.parse_starttag on Python 3.2.0-3.2.2.

        :param i: Index into self.rawdata at which the start tag begins.
        :return: The index just past the start tag, or the negative
            value returned by check_for_whole_start_tag() when the tag
            is not complete yet.
        """
        # This function lives outside the class body, so a double
        # underscore attribute is NOT name-mangled automatically. Spell
        # out the mangled name so that the value lands in the same
        # attribute HTMLParser's own methods read.
        self._HTMLParser__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self._HTMLParser__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without any '=value' part.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over.
            # In strict mode report it; in either mode fall back to
            # treating the whole tag as text.
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Switch the parser into CDATA mode for the given element.

        :param elem: Name of the element whose closing tag ends
            CDATA mode.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patched methods installed, the constructor's 'strict'
    # argument can be used on these versions as well.
    CONSTRUCTOR_TAKES_STRICT = True
478
479