Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/lxml/html/html5parser.py
811 views
1
"""
2
An interface to html5lib that mimics the lxml.html interface.
3
"""
4
import sys
5
import string
6
7
from html5lib import HTMLParser as _HTMLParser
8
from html5lib.treebuilders.etree_lxml import TreeBuilder
9
from lxml import etree
10
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
11
12
# python3 compatibility
# Each try/except below binds the Python 2 name first and falls back to the
# Python 3 equivalent when the former is unavailable.
try:
    # Type(s) accepted by the isinstance() "string required" checks in this
    # module.
    _strings = basestring
except NameError:
    _strings = (bytes, str)
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse
25
26
27
class HTMLParser(_HTMLParser):
    """html5lib's HTML parser, preconfigured to build its tree with lxml."""

    def __init__(self, strict=False, **kwargs):
        # The tree builder is fixed to the lxml-backed ``TreeBuilder``; all
        # other keyword arguments are forwarded to html5lib unchanged.
        _HTMLParser.__init__(
            self,
            strict=strict,
            tree=TreeBuilder,
            **kwargs
        )
32
33
34
# XHTMLParser is optional: it is only defined (together with the module-level
# ``xhtml_parser`` instance) when html5lib exposes an ``XHTMLParser`` class.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """html5lib's XHTML parser, preconfigured to build its tree with lxml."""

        def __init__(self, strict=False, **kwargs):
            # Same wiring as HTMLParser: force the lxml-backed TreeBuilder and
            # forward everything else.
            _XHTMLParser.__init__(
                self,
                strict=strict,
                tree=TreeBuilder,
                **kwargs
            )

    xhtml_parser = XHTMLParser()
46
47
48
def _find_tag(tree, tag):
49
elem = tree.find(tag)
50
if elem is not None:
51
return elem
52
return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
53
54
55
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a string holding a whole HTML document and return its root element.

    `html` must be a text or byte string, otherwise ``TypeError`` is raised.
    `parser` defaults to the module-level ``html_parser``.

    If `guess_charset` is given it is passed to the parser as ``useChardet``.
    If it is left as ``None`` and the input is a byte string, charset
    guessing is switched on; for Unicode input the option is not passed at
    all.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = True

    if guess_charset is None:
        parse_kwargs = {}
    else:
        parse_kwargs = {'useChardet': guess_charset}

    document = active_parser.parse(html, **parse_kwargs)
    return document.getroot()
77
78
79
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse an HTML fragment and return the list of its top-level items.

    The first item of the returned list may be a string (leading text).
    When `no_leading_text` is true, whitespace-only leading text is dropped
    and any other leading text raises ``etree.ParserError``, so the result
    then contains elements only.

    `html` must be a text or byte string, otherwise ``TypeError`` is raised.
    `parser` defaults to the module-level ``html_parser``.

    If `guess_charset` is given it is passed to the parser as ``useChardet``.
    If it is left as ``None`` and the input is a byte string, ``useChardet``
    is passed as ``False``.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False

    if guess_charset is None:
        parse_kwargs = {}
    else:
        parse_kwargs = {'useChardet': guess_charset}

    children = active_parser.parseFragment(html, 'div', **parse_kwargs)

    starts_with_text = bool(children) and isinstance(children[0], _strings)
    if starts_with_text and no_leading_text:
        if children[0].strip():
            raise etree.ParserError('There is leading text: %r' %
                                    children[0])
        # Only whitespace in front of the first element: discard it.
        del children[0]
    return children
111
112
113
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a string that should contain exactly one HTML element.

    Without `create_parent`, ``etree.ParserError`` is raised when no element
    is found, when more than one element is found, or when non-whitespace
    text precedes or follows the element.  The element is returned with its
    tail cleared.

    If `create_parent` is true, a new parent element is created to hold the
    parsed content and is returned instead; leading and trailing text is
    allowed in that case.  A string value is used as the parent's tag name,
    any other true value yields a ``div``.

    `guess_charset` and `parser` are forwarded to ``fragments_fromstring``.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    wrap_in_parent = bool(create_parent)

    nodes = fragments_fromstring(
        html,
        no_leading_text=not wrap_in_parent,
        guess_charset=guess_charset,
        parser=parser)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, _strings) else 'div'
        container = Element(parent_tag)
        if nodes:
            # Leading text, if any, becomes the text of the new parent.
            if isinstance(nodes[0], _strings):
                container.text = nodes[0]
                del nodes[0]
            container.extend(nodes)
        return container

    if not nodes:
        raise etree.ParserError('No elements found')
    if len(nodes) > 1:
        raise etree.ParserError('Multiple elements found')

    only = nodes[0]
    if only.tail and only.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % only.tail)
    # Whatever tail is left is whitespace only; drop it.
    only.tail = None
    return only
155
156
157
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    The input is always parsed as a full document first; the result is then
    trimmed down depending on what the input looks like:

    * input starting with ``<html`` or ``<!doctype`` (case-insensitive,
      after leading whitespace), or producing a non-empty ``<head>``:
      the document root is returned;
    * a body holding a single element and no significant text: that
      element is returned;
    * anything else: the body is returned, renamed to ``div`` if it
      contains a block-level tag and to ``span`` otherwise.

    `guess_charset` and `parser` are forwarded to ``document_fromstring``.
    Raises ``TypeError`` if `html` is not a text or byte string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith(('<html', '<!doctype')):
        return doc

    # if the head is not empty we have a full document
    head = _find_tag(doc, 'head')
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    def _is_blank(text):
        # True for None, '' and whitespace-only text.
        return not text or not text.strip()

    # The body has just one element, so it was probably a single
    # element passed in
    if len(body) == 1 and _is_blank(body.text) and _is_blank(body[-1].tail):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
209
210
211
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    ``parser`` defaults to the module-level ``html_parser``.

    A stream that this function opens itself (for a file path or a URL) is
    closed again before returning, whether or not parsing succeeds.  A
    file-like object supplied by the caller is left open; closing it remains
    the caller's responsibility.
    """
    if parser is None:
        parser = html_parser

    # Tracks whether *we* created ``fp`` and therefore have to close it.
    opened_here = False
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        opened_here = True
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        opened_here = True
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset

    try:
        return parser.parse(fp, **options)
    finally:
        # Previously the handle from open()/urlopen() was leaked; release it
        # once the parser has finished (or failed).
        if opened_here:
            fp.close()
245
246
247
def _looks_like_url(str):
248
scheme = urlparse(str)[0]
249
if not scheme:
250
return False
251
elif (sys.platform == 'win32' and
252
scheme in string.ascii_letters
253
and len(scheme) == 1):
254
# looks like a 'normal' absolute path
255
return False
256
else:
257
return True
258
259
260
# Shared default parser instance: the module-level parsing functions fall
# back to it when they are called with ``parser=None``.
html_parser = HTMLParser()
261
262