CoCalc -- __init__.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/builder/__init__.py
811 views
1
# Use of this source code is governed by the MIT license.
2
__license__ = "MIT"
3

4
from collections import defaultdict
5
import itertools
6
import sys
7
from bs4.element import (
8
    CharsetMetaAttributeValue,
9
    ContentMetaAttributeValue,
10
    Stylesheet,
11
    Script,
12
    TemplateString,
13
    nonwhitespace_re
14
)
15

16
__all__ = [
17
    'HTMLTreeBuilder',
18
    'SAXTreeBuilder',
19
    'TreeBuilder',
20
    'TreeBuilderRegistry',
21
    ]
22

23
# Some useful features for a TreeBuilder to have.
24
FAST = 'fast'
25
PERMISSIVE = 'permissive'
26
STRICT = 'strict'
27
XML = 'xml'
28
HTML = 'html'
29
HTML_5 = 'html5'
30

31

32
class TreeBuilderRegistry(object):
    """A way of looking up TreeBuilder subclasses by their name or by desired
    features.
    """

    def __init__(self):
        # Maps a feature string to the builders that advertise it,
        # most recently registered first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, most recently registered first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features.

        The class is placed at the front of every list it joins, so
        later registrations take precedence over earlier ones.

        :param treebuilder_class: A subclass of Treebuilder. its .features
           attribute should list its features.
        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Look up a TreeBuilder subclass with the desired features.

        :param features: A list of features to look for. If none are
            provided, the most recently registered TreeBuilder subclass
            will be used. A feature that no registered builder
            advertises is ignored rather than treated as a mismatch.
        :return: A TreeBuilder subclass, or None if nothing is
            registered, if none of the requested features is
            advertised by any builder, or if no single builder has
            all of the advertised features that were requested.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        candidates = None
        candidate_set = None
        for feature in features:
            # Use .get() so that a lookup never adds an empty entry to
            # the defaultdict.
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if not we_have_the_feature:
                # Nobody advertises this feature; it doesn't narrow
                # the search.
                continue
            if candidates is None:
                # The first recognized feature fixes the priority
                # order in which surviving candidates are considered.
                candidates = we_have_the_feature
                candidate_set = set(candidates)
            else:
                # Eliminate any candidates that don't have this feature.
                candidate_set = candidate_set.intersection(
                    we_have_the_feature)

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None
96

97
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry. register_treebuilders_from() adds
# builders to this instance.
builder_registry = TreeBuilderRegistry()
100

101
class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree."""

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "the caller did not pass this argument". It lets
    # None be a meaningful value for the constructor arguments.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.
        """
        self.soup = None

        # The sentinel is always compared by identity. An equality test
        # would let an argument with a permissive __eq__ be mistaken
        # for "not provided" and silently replaced by the default.
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes

        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags

        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers

        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None (no opinion) or
         contains `tag_name`; False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :param markup: The markup to parse.
        :return: None.
        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document. This base
         implementation returns the fragment unchanged.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs:
            return attrs
        if self.cdata_list_attributes:
            # Attributes listed under '*' are multi-valued on every tag;
            # the rest are looked up by lowercased tag name.
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), None)
            for attr in list(attrs.keys()):
                if attr in universal or (tag_specific and attr in tag_specific):
                    # We have a "class"-type attribute whose string
                    # value is a whitespace-separated list of
                    # values. Split it into a list.
                    value = attrs[attr]
                    if isinstance(value, str):
                        values = nonwhitespace_re.findall(value)
                    else:
                        # html5lib sometimes calls setAttributes twice
                        # for the same tag when rearranging the parse
                        # tree. On the second call the attribute value
                        # here is already a list.  If this happens,
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
                    attrs[attr] = values
        return attrs
321

322
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    This is not currently used for anything, but it demonstrates
    how a simple TreeBuilder would work.

    The method names follow the SAX content-handler callbacks; each
    one forwards the event to the BeautifulSoup object in self.soup
    or ignores it.
    """

    def feed(self, markup):
        """Not implemented; this class only reacts to SAX events.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError()

    def close(self):
        """Do nothing; there is no parser state to release."""
        pass

    def startElement(self, name, attrs):
        """Handle the start of an element.

        :param name: The tag name.
        :param attrs: A mapping of attribute names to values. Keys may
         be plain strings or, when called through startElementNS,
         tuples whose second item is the attribute's local name.
        """
        # Only tuple keys carry a namespace to strip. Indexing a plain
        # string key would reduce the attribute name to its second
        # character, or raise IndexError for a one-character name.
        attrs = dict(
            (key[1] if isinstance(key, tuple) else key, value)
            for key, value in list(attrs.items()))
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        """Handle the end of an element.

        :param name: The tag name.
        """
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle the start of a namespaced element.

        :param nsTuple: Ignored.
        :param nodeName: The tag name, passed on to startElement.
        :param attrs: The element's attributes, passed on to startElement.
        """
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle the end of a namespaced element.

        :param nsTuple: Ignored.
        :param nodeName: The tag name, passed on to endElement.
        """
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Ignore the start of a namespace prefix mapping."""
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        """Ignore the end of a namespace prefix mapping."""
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Pass character data on to the BeautifulSoup object.

        :param content: The text that was encountered.
        """
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing at the start of a document."""
        pass

    def endDocument(self):
        """Do nothing at the end of a document."""
        pass
370

371

372
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = set([
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track',
        'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex',
        'nextid', 'spacer'
    ])

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = set([
        "address", "article", "aside", "blockquote", "canvas", "dd", "div",
        "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form",
        "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main",
        "nav", "noscript", "ol", "output", "p", "pre", "section", "table",
        "tfoot", "ul", "video"
    ])

    # The HTML standard defines an unusual content model for these tags.
    # We represent this by using a string class other than NavigableString
    # inside these tags.
    #
    # I made this list by going through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    #
    # The "*" entry applies to every tag; the other keys are tag names.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        substituted = False
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            substituted = True

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # This branch replaces an attribute too, so it must be
            # reported as a substitution.
            substituted = True

        return substituted
476

477
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in the module's `__all__` that refers to a TreeBuilder
    subclass is set as an attribute of `bs4.builder`, appended to its
    `__all__`, and registered with its `builder_registry`. Any other
    exported object is skipped.

    :param module: A module that defines `__all__`.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not
        # a class, so check that before asking about the hierarchy.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)
489

490
class ParserRejectedMarkup(Exception):
    """An Exception to be raised when the underlying parser simply
    refuses to parse the given markup.
    """
    def __init__(self, message_or_exception):
        """Explain why the parser rejected the given markup, either
        with a textual explanation or another exception.

        :param message_or_exception: Either a message, which is used
         as-is, or an Exception, which is rendered as
         "<exception class name>: <str(exception)>".
        """
        if isinstance(message_or_exception, Exception):
            e = message_or_exception
            message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
        super(ParserRejectedMarkup, self).__init__(message_or_exception)
502
            
503
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)

# The optional parsers are registered only if they can be imported.
# Registration sits in the `else` clause so that the ImportError
# handler covers the import alone.
try:
    from . import _html5lib
except ImportError:
    # They don't have html5lib installed.
    pass
else:
    register_treebuilders_from(_html5lib)

try:
    from . import _lxml
except ImportError:
    # They don't have lxml installed.
    pass
else:
    register_treebuilders_from(_lxml)
521

522
Product

Resources

Company