CoCalc -- html5parser.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/lxml/html/html5parser.py
⁸¹¹ views
1
"""
An interface to html5lib that mimics the lxml.html interface.
"""
4
import sys
5
import string
6

7
from html5lib import HTMLParser as _HTMLParser
8
from html5lib.treebuilders.etree_lxml import TreeBuilder
9
from lxml import etree
10
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
11

12
# python3 compatibility
# Each name is first looked up under its Python 2 spelling; the Python 3
# equivalent is used when that lookup fails.
try:
    _strings = basestring
except NameError:
    # No ``basestring`` builtin: accept both byte and text strings.
    _strings = (bytes, str)
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse
25

26

27
class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree."""

    def __init__(self, strict=False, **kwargs):
        """Create the parser with ``TreeBuilder`` as its tree builder.

        ``strict`` and any additional keyword arguments are passed
        through unchanged to the html5lib parser constructor.
        """
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
32

33

34
# XHTMLParser support is optional: when html5lib does not provide it, the
# import fails and neither ``XHTMLParser`` nor ``xhtml_parser`` is defined
# by this module.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            """Create the parser with ``TreeBuilder`` as its tree builder."""
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)

    # Module-level XHTML parser instance, created once at import time.
    xhtml_parser = XHTMLParser()
46

47

48
def _find_tag(tree, tag):
49
    elem = tree.find(tag)
50
    if elem is not None:
51
        return elem
52
    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
53

54

55
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a whole document from a string and return its root element.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.

    If `parser` is None, the module-level `html_parser` is used.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        guess_charset = True
    if guess_charset is not None:
        # Only forward the option when it was given explicitly or was
        # defaulted above for byte input.
        options['useChardet'] = guess_charset
    return parser.parse(html, **options).getroot()
77

78

79
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.  When it is left as None and `html` is a byte
    string, charset guessing is explicitly switched off.

    If `parser` is None, the module-level `html_parser` is used.

    Raises TypeError if `html` is not a string, and etree.ParserError if
    `no_leading_text` is true and the leading text is not just whitespace.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        guess_charset = False
    if guess_charset is not None:
        options['useChardet'] = guess_charset
    children = parser.parseFragment(html, 'div', **options)

    if no_leading_text and children and isinstance(children[0], _strings):
        if children[0].strip():
            raise etree.ParserError('There is leading text: %r' %
                                    children[0])
        # Whitespace-only leading text is silently dropped.
        del children[0]
    return children
111

112

113
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises TypeError if `html` is not a string, and etree.ParserError when
    no parent is created and the input does not hold exactly one element
    surrounded only by whitespace.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    # Leading text is only acceptable when it can be stored on a parent.
    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        # Any true value that is not a string selects the default 'div'.
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                # Leading text becomes the text of the new parent.
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    # Discard whitespace-only trailing text.
    result.tail = None
    return result
155

156

157
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The whole parsed document is returned when the input starts with
    ``<html`` or a doctype, or when the parsed ``<head>`` is not empty.
    Otherwise, if the body holds exactly one element with no surrounding
    text, that element is returned.  In all other cases the parsed
    ``<body>`` element itself is renamed to ``div`` (if it contains a
    block level tag) or ``span`` and returned as a container.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        start = start.decode('ascii', 'replace')

    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
209

210

211
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    A stream opened by this function (for a file path or a URL) is closed
    again before returning, also when parsing raises.  A file-like object
    passed in by the caller is never closed here.
    """
    if parser is None:
        parser = html_parser

    # Tracks whether the stream was opened here and must be closed here.
    owns_stream = False
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        owns_stream = True
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        owns_stream = True
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset
    try:
        return parser.parse(fp, **options)
    finally:
        if owns_stream:
            fp.close()
245

246

247
def _looks_like_url(str):
248
    scheme = urlparse(str)[0]
249
    if not scheme:
250
        return False
251
    elif (sys.platform == 'win32' and
252
            scheme in string.ascii_letters
253
            and len(scheme) == 1):
254
        # looks like a 'normal' absolute path
255
        return False
256
    else:
257
        return True
258

259

260
# Module-level default parser instance; the parsing functions in this
# module fall back to it when they are called with ``parser=None``.
html_parser = HTMLParser()
261

262
Product

Resources

Company