CoCalc -- _lxml.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/builder/_lxml.py
⁸¹¹ views
1
# Use of this source code is governed by the MIT license.
2
__license__ = "MIT"
3

4
__all__ = [
5
    'LXMLTreeBuilderForXML',
6
    'LXMLTreeBuilder',
7
    ]
8

9
try:
10
    from collections.abc import Callable # Python 3.6
11
except ImportError as e:
12
    from collections import Callable
13

14
from io import BytesIO
15
from io import StringIO
16
from lxml import etree
17
from bs4.element import (
18
    Comment,
19
    Doctype,
20
    NamespacedAttribute,
21
    ProcessingInstruction,
22
    XMLProcessingInstruction,
23
)
24
from bs4.builder import (
25
    FAST,
26
    HTML,
27
    HTMLTreeBuilder,
28
    PERMISSIVE,
29
    ParserRejectedMarkup,
30
    TreeBuilder,
31
    XML)
32
from bs4.dammit import EncodingDetector
33

34
LXML = 'lxml'
35

36
def _invert(d):
37
    "Invert a dictionary."
38
    return dict((v,k) for k, v in list(d.items()))
39

40
class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that uses lxml's XML parser.

    The builder hands itself to lxml as the parser ``target`` (see
    `default_parser` and `parser_for`), so the `start`, `end`, `data`,
    `pi`, `doctype`, `comment` and `close` methods below act as parser
    callbacks. Each one forwards what it receives to ``self.soup``.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Size argument passed to markup.read() for each chunk fed to the
    # parser in feed().
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    # Same mapping, keyed by URI instead of prefix.
    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        known = self.soup._namespaces
        for prefix, uri in mapping.items():
            # Entries with a falsy prefix (e.g. a default namespace
            # keyed by None) are skipped. If there are multiple
            # namespaces defined with the same prefix, the first one in
            # the document takes precedence.
            if prefix and prefix not in known:
                known[prefix] = uri

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # We were handed a class (or factory) rather than an
            # instance: instantiate it with the default arguments.
            parser = parser(
                target=self, strip_cdata=False, recover=True, encoding=encoding
            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        """Set up the builder.

        :param parser: Optional parser object or callable; when given it
            is what `default_parser` returns.
        :param empty_element_tags: Optional iterable of tag names; when
            given it is stored as a set on ``self.empty_element_tags``.
        :param kwargs: Passed through to the superclass constructor.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (URI -> prefix) namespace maps; start() pushes
        # and end() pops.
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag):
        """Split a fully-qualified lxml name into (namespace, local name).

        Copied from lxml's src/lxml/sax.py.

        :param tag: A name, either ``{namespace-uri}local`` or plain
            ``local``.
        :return: A 2-tuple ``(namespace, local)``; ``namespace`` is None
            when `tag` does not start with ``{``.
        """
        # Slice rather than index so an empty name does not raise
        # IndexError.
        if tag[:1] == '{':
            return tuple(tag[1:].split('}', 1))
        return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        self.processing_instruction_class = (
            ProcessingInstruction if is_html else XMLProcessingInstruction)

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Parse `markup`, feeding it to the parser in CHUNK_SIZE pieces.

        :param markup: A bytestring, a string, or an object with a
            ``read(size)`` method.
        :raises ParserRejectedMarkup: If parsing raises
            UnicodeDecodeError, LookupError or etree.ParserError.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            # Now call feed() on the rest of the data, chunk by chunk,
            # stopping at the first empty read.
            while data:
                data = markup.read(self.CHUNK_SIZE)
                if data:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e) from e

    def close(self):
        """Reset the namespace stack to just the default mapping."""
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap=None):
        """Handle an opening tag.

        :param name: The tag name, possibly in ``{namespace-uri}local``
            form.
        :param attrs: A mapping of attribute names (same form) to values.
        :param nsmap: A mapping of namespace prefixes to URIs newly
            declared on this tag; None or empty if there are none.
        """
        if nsmap is None:
            nsmap = {}
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)

        if nsmap:
            # A new namespace mapping has come into play.

            # First, let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace
        elif len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we push a
            # placeholder for end() to pop, keeping the stack in step
            # with tag nesting.
            self.nsmaps.append(None)

        # Find any attributes that came in from lxml with namespaces
        # attached to their names, and turn them into
        # NamespacedAttribute objects.
        new_attrs = {}
        for qualified, value in attrs.items():
            namespace, local = self._getNsTag(qualified)
            if namespace is None:
                new_attrs[local] = value
            else:
                attr_prefix = self._prefix_for_namespace(namespace)
                new_attrs[
                    NamespacedAttribute(attr_prefix, local, namespace)] = value

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(name, namespace, nsprefix, new_attrs)

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace.

        :param namespace: A namespace URI, or None.
        :return: The prefix from the innermost map on the namespace
            stack that contains `namespace`, or None if `namespace` is
            None or no map contains it.
        """
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Handle a closing tag.

        :param name: The tag name, possibly in ``{namespace-uri}local``
            form.
        """
        self.soup.endData()
        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()

    def pi(self, target, data):
        """Handle a processing instruction.

        The text ``target + ' ' + data`` is stored as an object of
        ``self.processing_instruction_class``.
        """
        self.soup.endData()
        self.soup.handle_data(target + ' ' + data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        """Forward character data to the BeautifulSoup object."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Handle a doctype declaration by building a `Doctype` object."""
        self.soup.endData()
        node = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(node)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
306

307

308
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """HTML counterpart of `LXMLTreeBuilderForXML`.

    Reuses the parser callbacks inherited from `LXMLTreeBuilderForXML`
    but selects lxml's HTML parser and feeds the markup in one piece.
    """

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    # The alternate names come first, followed by the primary name and
    # the generic feature tags.
    features = [*ALTERNATE_NAMES, NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        """Return the lxml HTML parser class.

        :param encoding: A string; not consulted here.
        :return: The `etree.HTMLParser` class itself, not an instance.
        """
        return etree.HTMLParser

    def feed(self, markup):
        """Parse `markup` with a parser obtained from `parser_for`.

        The markup is passed to the parser in a single feed() call.

        :param markup: The markup to parse.
        :raises ParserRejectedMarkup: If parsing raises
            UnicodeDecodeError, LookupError or etree.ParserError.
        """
        try:
            parser = self.parser_for(self.soup.original_encoding)
            self.parser = parser
            parser.feed(markup)
            parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment
333

334
Product

Resources

Company