CoCalc -- _html5lib.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/builder/_html5lib.py
811 views
1
# Use of this source code is governed by the MIT license.
2
# Licence identifier for this module.
__license__ = "MIT"

# Names exported by ``from ... import *``.
__all__ = ['HTML5TreeBuilder']
7

8
import warnings
9
import re
10
from bs4.builder import (
11
    PERMISSIVE,
12
    HTML,
13
    HTML_5,
14
    HTMLTreeBuilder,
15
    )
16
from bs4.element import (
17
    NamespacedAttribute,
18
    nonwhitespace_re,
19
)
20
import html5lib
21
from html5lib.constants import (
22
    namespaces,
23
    prefixes,
24
    )
25
from bs4.element import (
26
    Comment,
27
    Doctype,
28
    NavigableString,
29
    Tag,
30
    )
31

32
# html5lib exposes its tree-builder base module under two different names
# depending on the release. Import whichever one exists and remember which
# generation of the API we got: `new_html5lib` is consulted later in this
# module to pick the right keyword for passing an encoding to the parser.
try:
    # Pre-0.99999999
    from html5lib.treebuilders import _base as treebuilder_base
except ImportError:
    # 0.99999999 and up
    from html5lib.treebuilders import base as treebuilder_base
    new_html5lib = True
else:
    new_html5lib = False
40

41
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree.

    Note that this TreeBuilder does not support some features common
    to HTML TreeBuilders. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    * This TreeBuilder doesn't use different subclasses of NavigableString
      based on the name of the tag in which the string was found.

    * You can't use a SoupStrainer to parse only part of a document.
    """

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    # html5lib can tell us which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Remember the caller's encoding and hand the markup back untouched.

        :param markup: The document to parse (text or bytes).
        :param user_specified_encoding: Encoding requested by the caller;
            stored on the builder so `feed` can pass it to html5lib.
        :param document_declared_encoding: Ignored by this builder.
        :param exclude_encodings: Not supported by this builder; a
            warning is issued if a truthy value is given.
        :yield: Exactly one 4-tuple ``(markup, None, None, False)``.
            NOTE(review): the meaning of the last three slots is defined
            by the base TreeBuilder contract -- confirm there.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Run html5lib over `markup`, populating ``self.soup``.

        After parsing, ``original_encoding`` is set on the object returned
        by html5lib: None for text input, otherwise the name of the
        encoding html5lib's tokenizer settled on.

        :param markup: The document to parse (text or bytes).
        """
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")

        # This code relies on `create_treebuilder` having been invoked by
        # the time the HTMLParser constructor returns, which is what sets
        # `self.underlying_builder`.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = parser

        extra_kwargs = {}
        if not isinstance(markup, str):
            # Only byte input needs an encoding hint; the keyword name
            # depends on which html5lib API generation was imported.
            if new_html5lib:
                extra_kwargs['override_encoding'] = self.user_specified_encoding
            else:
                extra_kwargs['encoding'] = self.user_specified_encoding

        try:
            doc = parser.parse(markup, **extra_kwargs)
        finally:
            # Drop the builder's back-reference to the parser even if
            # html5lib raised, so the builder never keeps pointing at a
            # parser whose run is over.
            self.underlying_builder.parser = None

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            original_encoding = parser.tokenizer.stream.charEncoding[0]
            if not isinstance(original_encoding, str):
                # In 0.99999999 and up, the encoding is an html5lib
                # Encoding object. We want to use a string for compatibility
                # with other tree builders.
                original_encoding = original_encoding.name
            doc.original_encoding = original_encoding

    def create_treebuilder(self, namespaceHTMLElements):
        """Factory passed to html5lib as its ``tree`` argument.

        Creates a `TreeBuilderForHtml5lib` bound to ``self.soup``, keeps a
        reference to it in ``self.underlying_builder`` and returns it.

        :param namespaceHTMLElements: Forwarded unchanged to the
            underlying builder's constructor.
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`.

        Wraps `fragment` in a minimal html/head/body skeleton.
        """
        return '<html><head></head><body>%s</body></html>' % fragment
114

115

116
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """html5lib tree builder that constructs Beautiful Soup objects.

    html5lib drives this object while parsing; each factory method below
    returns a wrapper (`Element` / `TextNode`) around the corresponding
    Beautiful Soup object.
    """

    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """
        :param namespaceHTMLElements: Passed through to the html5lib
            base-class constructor.
        :param soup: The BeautifulSoup object to populate. If falsy, a
            fresh empty one is created.
        :param store_line_numbers: When true (and a parser has been
            attached), new tags are given ``sourceline``/``sourcepos``.
        :param kwargs: Only used when a BeautifulSoup object has to be
            created here; forwarded to its constructor.
        """
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from an html5lib doctype token and add it to the soup."""
        doctype = Doctype.for_name_and_ids(
            token["name"], token["publicId"], token["systemId"]
        )
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag and return it wrapped in an `Element`."""
        kwargs = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            kwargs['sourceline'] = sourceline
            kwargs['sourcepos'] = sourcepos-1
        tag = self.soup.new_tag(name, namespace, **kwargs)

        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a `TextNode` wrapping a new Comment containing `data`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace ``self.soup`` with a fresh fragment root and return it wrapped."""
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the Beautiful Soup object wrapped by `node` to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the BeautifulSoup object being built."""
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind html5lib's fragment node."""
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        """Render `element` and its descendants as an indented text dump.

        Each output line starts with ``|`` followed by indentation;
        doctypes, comments, strings, tags and (sorted) attributes each
        get their own line.

        :param element: Root Beautiful Soup object to serialize.
        :return: The dump as a single newline-joined string.
        """
        rv = []
        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            pad = ' ' * indent
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        # A PUBLIC and/or SYSTEM identifier was captured.
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (pad, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (pad, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (pad,))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (pad, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (pad, element))
            else:
                # Anything else is treated as a tag (this includes the
                # root BeautifulSoup object itself).
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (pad, name))
                if element.attrs:
                    attributes = []
                    for attr_name, attr_value in list(element.attrs.items()):
                        if isinstance(attr_name, NamespacedAttribute):
                            attr_name = "%s %s" % (
                                prefixes[attr_name.namespace], attr_name.name)
                        if isinstance(attr_value, list):
                            # Multi-valued attribute: show it space-joined.
                            attr_value = " ".join(attr_value)
                        attributes.append((attr_name, attr_value))

                    for attr_name, attr_value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (
                            ' ' * (indent + 2), attr_name, attr_value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)
235

236
class AttrList(object):
    """Mapping-like view of a tag's attributes, as handed to html5lib.

    Reads (`__getitem__`, `__contains__`, `__len__`, `items`, `keys`,
    iteration) are served from a copy of ``element.attrs`` taken when the
    object is created. Writes go straight to the wrapped element and do
    not update that copy.
    """

    def __init__(self, element):
        """:param element: The object whose ``attrs`` are exposed."""
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs of the snapshot."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set attribute `name` on the wrapped element.

        If `name` is listed as a multi-valued attribute (for every tag via
        the ``'*'`` entry, or for this element's tag name), a non-list
        value is split on whitespace into a list first.
        """
        # The mapping may be None or lack a '*' / per-tag entry; treat
        # all of those as "no multi-valued attributes" instead of raising.
        list_attr = self.element.cdata_list_attributes or {}
        if (name in list_attr.get('*', ())
            or name in list_attr.get(self.element.name, ())):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = nonwhitespace_re.findall(value)
        self.element[name] = value

    def items(self):
        """Return the snapshot's ``(name, value)`` pairs as a list."""
        return list(self.attrs.items())

    def keys(self):
        """Return the snapshot's attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self):
        return len(self.attrs)

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
264

265

266
class Element(treebuilder_base.Node):
    """html5lib node that wraps a Beautiful Soup object.

    html5lib manipulates the tree through this interface; every operation
    is translated into the equivalent change on ``self.element``.
    """

    def __init__(self, element, soup, namespace):
        """
        :param element: The Beautiful Soup object being wrapped.
        :param soup: The BeautifulSoup object the tree belongs to.
        :param namespace: Namespace of the element, or None.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of this element.

        `node` may be a wrapper (`Element`/`TextNode`), a bare `Tag`, or a
        plain string. A string appended directly after an existing plain
        NavigableString is merged into it rather than added separately.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            # Exact class check: subclasses of NavigableString (such as
            # Comment) take the generic branch below and are never merged.
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            # Detach from the old parent first. Use `child` rather than
            # `node.element`: when a bare Tag was passed in, `node` is the
            # Tag itself, not a wrapper with an `.element`.
            child.extract()

        if (string_child is not None and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return the element's attributes ({} for a Comment)."""
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple keys are replaced in `attributes` (in place) by
        `NamespacedAttribute` objects built from the tuple. Nothing
        happens when `attributes` is None or empty.
        """
        if attributes is not None and len(attributes) > 0:
            # Iterate over a copy because the mapping is modified below.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` if given, else at the end."""
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before `refNode` among this element's children.

        A plain string inserted right after an existing plain
        NavigableString is merged into that string.
        """
        index = self.element.index(refNode.element)
        # `index > 0` guards the look-behind: at position 0 there is no
        # previous sibling, and contents[-1] would silently wrap around to
        # the *last* child instead.
        if (index > 0
            and node.element.__class__ == NavigableString
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element is not None:
                # TODO: This code has no test coverage and I'm not sure
                # how to get html5lib to go through this path, but it's
                # just the other side of the previous line.
                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new `Element` with the same name, namespace and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's contents list (truthy when non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
459

460
class TextNode(Element):
    """html5lib node wrapping a string-like Beautiful Soup object.

    Elsewhere in this module it is used for text strings and comments.
    """

    def __init__(self, element, soup):
        """
        :param element: The Beautiful Soup string object being wrapped.
        :param soup: The BeautifulSoup object the tree belongs to.
        """
        # Goes straight to the html5lib base initializer (with no node
        # name) instead of Element.__init__; no namespace is recorded.
        treebuilder_base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError
468

469
Product

Resources

Company