CoCalc -- _htmlparser.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/builder/_htmlparser.py
⁸¹¹ views
1
# encoding: utf-8
2
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
3

4
# Use of this source code is governed by the MIT license.
5
__license__ = "MIT"
6

7
__all__ = [
8
    'HTMLParserTreeBuilder',
9
    ]
10

11
from html.parser import HTMLParser
12

13
try:
    from html.parser import HTMLParseError
except ImportError:
    # html.parser stopped exporting HTMLParseError as of Python 3.5.
    # Nothing can raise it there, so a stand-in class is enough to
    # keep the `except HTMLParseError` clause further down valid.
    class HTMLParseError(Exception):
        pass
20

21
import sys
22
import warnings
23

24
# Which keyword arguments the HTMLParser constructor understands
# depends on the running interpreter.
#
# From Python 3.2 on, the constructor accepts a 'strict' argument,
# which we would like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (major, minor) == (3, 2) and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = (major, minor) == (3, 3)
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
35

36

37
from bs4.element import (
38
    CData,
39
    Comment,
40
    Declaration,
41
    Doctype,
42
    ProcessingInstruction,
43
    )
44
from bs4.dammit import EntitySubstitution, UnicodeDammit
45

46
from bs4.builder import (
47
    HTML,
48
    HTMLTreeBuilder,
49
    STRICT,
50
    )
51

52

53
# Feature string identifying this tree builder. HTMLParserTreeBuilder
# (defined later in this file) uses it as its NAME and lists it among
# its features.
HTMLPARSER = 'html.parser'
54

55
class BeautifulSoupHTMLParser(HTMLParser):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.

    Every handler forwards to ``self.soup``. The constructor does not
    set that attribute; it must be assigned before markup is fed in.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.

        All other positional and keyword arguments are passed through
        to the HTMLParser constructor.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.

        :param msg: The error message; it is issued as a warning.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
            (name, value) pairs.
        """
        # handle_empty_element=False tells handle_starttag not to close
        # the tag just because it turns out to be an empty-element
        # tag. We know that this is an empty-element tag and we want to
        # call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
            (name, value) pairs. A value of None is stored as ''.
        :param handle_empty_element: If True (the default) and the
            tag created for this event is an empty-element tag, the
            matching end-tag event is issued immediately and the tag
            name is remembered so that a later explicit closing tag
            is ignored.
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: If True (the default), first check
            whether this name is recorded in
            already_closed_empty_element; if it is, the record is
            removed and no end-tag event is sent to the soup. If
            False, the end-tag event is always sent.
        """
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags.

        :param data: The text, passed on to the soup unchanged.
        """
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        If no character can be produced, U+FFFD REPLACEMENT CHARACTER
        is used instead.

        :param name: Character number, possibly in hexadecimal
            (prefixed with 'x' or 'X').
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            #
            # The loop has no early exit: when both encodings can
            # decode the byte, the windows-1252 result is the one kept.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                # Not a valid code point; fall through to the
                # replacement character below.
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        # Flush any pending text, then emit the comment text as its
        # own Comment node.
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration. The first
            len("DOCTYPE ") characters are dropped unconditionally
            before the rest is stored.
        """
        self.soup.endData()
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration. If it starts with
            'CDATA[' (compared case-insensitively), that prefix is
            removed and the rest becomes a CData node; otherwise the
            whole text becomes a Declaration node.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
292

293

294
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
            An 'on_duplicate_attribute' entry, if present, is removed
            and forwarded to the parser instead.

        Neither `parser_args` nor `parser_kwargs` is modified; the
        builder works on its own copies.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        # Copy the caller's containers: the entries added below must not
        # leak back into objects the caller may reuse elsewhere.
        parser_args = list(parser_args or [])
        parser_kwargs = dict(parser_kwargs or {})
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):

        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn. This implementation yields exactly one tuple.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.
        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        A new BeautifulSoupHTMLParser is built for every call, from the
        arguments stored by the constructor.

        :param markup: The markup to parse.
        :raise HTMLParseError: Re-raised, after a RuntimeWarning has
            been issued, if the underlying parser raises it.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise re-raises the active exception as it was caught.
            raise
        # Clear the parser's record of empty-element tags it closed.
        parser.already_closed_empty_element = []
384

385
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
#
# The whole block is skipped unless the interpreter reports version
# 3.2 and CONSTRUCTOR_TAKES_STRICT is False.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Lenient attribute matcher; parse_starttag() below uses it whenever
    # the parser is not in strict mode.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this is attached to the tree builder class, whereas
    # the other patches below go on BeautifulSoupHTMLParser; the
    # function below reads the module-level name, not this attribute.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Pattern describing a start tag up to (not including) its closing
    # '>' or '/>'; installed on the parser class as a replacement.
    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at index `i` of self.rawdata.

        Dispatches to handle_startendtag() or handle_starttag() when the
        tag ends in '/>' or '>' respectively; otherwise the raw text of
        the tag is passed to handle_data().

        :param i: Index into self.rawdata where the tag starts.
        :return: The end position reported by
            check_for_whole_start_tag(); a negative value from that
            call is returned unchanged without further processing.
        """
        # NOTE(review): this function is defined at module level, so
        # `self.__starttag_text` is not subject to class-private name
        # mangling here -- presumably it does not update the attribute
        # HTMLParser itself reads; confirm before relying on it.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        # Consume attributes one at a time until the end of the tag or
        # until the attribute pattern stops matching.
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without any "=value" part.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip a matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        # Whatever is left after the attributes should be just the
        # closing '>' or '/>'.
        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # lineno and offset are computed here but not used again
            # within this function.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Treat the malformed tag as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Record `elem` (lower-cased) as the current CDATA element and
        set self.interesting to a case-insensitive pattern matching
        that element's closing tag.

        :param elem: Name of the element whose content is CDATA.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    # Install the replacement implementations on the parser class.
    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch in place, flag the constructor as accepting
    # 'strict' so HTMLParserTreeBuilder passes it.
    CONSTRUCTOR_TAKES_STRICT = True
478

479
Product

Resources

Company