Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/diagnose.py
811 views
1
"""Diagnostic functions, mainly for use when doing tech support."""
2
3
# Use of this source code is governed by the MIT license.
4
__license__ = "MIT"
5
6
import cProfile
7
from io import StringIO
8
from html.parser import HTMLParser
9
import bs4
10
from bs4 import BeautifulSoup, __version__
11
from bs4.builder import builder_registry
12
13
import os
14
import pstats
15
import random
16
import tempfile
17
import time
18
import traceback
19
import sys
20
import cProfile
21
22
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints the Beautiful Soup and Python versions, reports which of the
    basic parsers are available, and then shows what each available
    parser makes of the markup.

    :param data: The markup that needs to be explained. This may be a
        str or bytes object containing markup, an object with a
        ``read()`` method, or the name of an existing file. A string
        that looks like an HTTP(S) URL is reported and not fetched.
    :return: None; diagnostics are printed to standard output.
    """
    print(("Diagnostic running on Beautiful Soup %s" % __version__))
    print(("Python version %s" % sys.version))

    # Build the list of usable parsers instead of removing entries from a
    # list while iterating over it: removing during iteration shifts the
    # remaining items and silently skips the check for the next parser.
    basic_parsers = []
    for name in ("html.parser", "html5lib", "lxml"):
        if any(name in builder.features
               for builder in builder_registry.builders):
            basic_parsers.append(name)
        else:
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
            print(("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION))))
        except ImportError:
            print(
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print(("Found html5lib version %s" % html5lib.__version__))
        except ImportError:
            print(
                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    else:
        # bytes.startswith() rejects str prefixes (and vice versa), so the
        # prefixes must match the type of the markup.
        if isinstance(data, bytes):
            url_prefixes = (b"http:", b"https:")
        else:
            url_prefixes = ("http:", "https:")

        if data.startswith(url_prefixes):
            print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
            print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
            return

        try:
            if os.path.exists(data):
                print(('"%s" looks like a filename. Reading data from the file.' % data))
                with open(data) as fp:
                    data = fp.read()
        except ValueError:
            # This can happen on some platforms when the 'filename' is
            # too long. Assume it's data and not a filename.
            pass
        print("")

    for parser in basic_parsers:
        print(("Trying to parse your markup with %s" % parser))
        try:
            soup = BeautifulSoup(data, features=parser)
        except Exception:
            # This is a diagnostic tool: report any parser failure and
            # carry on with the next parser rather than aborting.
            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        else:
            print(("Here's what %s did with the markup:" % parser))
            print((soup.prettify()))

        print(("-" * 80))
92
93
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running. You can use this to determine whether
    an lxml-specific problem is in Beautiful Soup's lxml tree builders
    or in lxml itself.

    :param data: Some markup, as a str or as bytes.
    :param html: If True, markup will be parsed with lxml's HTML parser.
       if False, lxml's XML parser will be used.
    :param kwargs: Extra keyword arguments, passed through unchanged to
       ``lxml.etree.iterparse``.
    """
    from io import BytesIO

    from lxml import etree

    # iterparse() requires its source to yield bytes, so text markup is
    # encoded before being wrapped in a file-like object.
    if isinstance(data, str):
        data = data.encode("utf8")
    for event, element in etree.iterparse(BytesIO(data), html=html, **kwargs):
        print(("%s, %4s, %s" % (event, element.tag, element.text)))
108
109
class AnnouncingParser(HTMLParser):
    """An HTMLParser subclass whose only job is to announce parse events.

    Every handler prints one line describing the event it received and
    does nothing else, which gives a picture of how html.parser sees a
    given document. The easiest way to use it is to call
    `htmlparser_trace`.
    """

    def _p(self, s):
        """Write one announcement line to standard output."""
        print(s)

    def handle_starttag(self, name, attrs):
        """Announce an opening tag; its attributes are not reported."""
        announcement = "%s START" % name
        self._p(announcement)

    def handle_endtag(self, name):
        """Announce a closing tag."""
        announcement = "%s END" % name
        self._p(announcement)

    def handle_data(self, data):
        """Announce a run of text."""
        announcement = "%s DATA" % data
        self._p(announcement)

    def handle_charref(self, name):
        """Announce a numeric character reference."""
        announcement = "%s CHARREF" % name
        self._p(announcement)

    def handle_entityref(self, name):
        """Announce a named entity reference."""
        announcement = "%s ENTITYREF" % name
        self._p(announcement)

    def handle_comment(self, data):
        """Announce a comment."""
        announcement = "%s COMMENT" % data
        self._p(announcement)

    def handle_decl(self, data):
        """Announce a declaration."""
        announcement = "%s DECL" % data
        self._p(announcement)

    def unknown_decl(self, data):
        """Announce a declaration the parser did not recognize."""
        announcement = "%s UNKNOWN-DECL" % data
        self._p(announcement)

    def handle_pi(self, data):
        """Announce a processing instruction."""
        announcement = "%s PI" % data
        self._p(announcement)
146
147
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Some markup.
    :return: None; one line per parse event is printed to standard output.
    """
    parser = AnnouncingParser()
    parser.feed(data)
    # feed() may hold back trailing input it considers incomplete (for
    # example text ending in an unterminated character reference).
    # close() forces that buffered data to be processed so the trace
    # covers the whole document.
    parser.close()
157
158
# Alphabets that rword() alternates between to build pronounceable strings.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"


def rword(length=5):
    """Generate a random word-like string.

    Characters at even positions are drawn from the consonants and
    characters at odd positions from the vowels.

    :param length: Number of characters to generate.
    :return: The generated string.
    """
    # Indexing by parity selects consonants for even positions (index 0)
    # and vowels for odd positions (index 1).
    alphabets = (_consonants, _vowels)
    return "".join(
        random.choice(alphabets[position % 2]) for position in range(length)
    )
171
172
def rsentence(length=4):
    """Generate a random sentence-like string.

    :param length: Number of words in the sentence.
    :return: The words, each 4 to 9 characters long, joined by single
        spaces.
    """
    words = [rword(random.randint(4, 9)) for _ in range(length)]
    return " ".join(words)
175
176
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    :param num_elements: Number of random draws to make; each draw adds
        at most one opening tag, run of text, or closing tag.
    :return: The generated pieces joined by newlines and wrapped in
        ``<html>``...``</html>``.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0, 3)
        if roll == 0:
            # Open a randomly chosen tag.
            pieces.append("<%s>" % random.choice(tag_names))
        elif roll == 1:
            # Add a short run of text.
            pieces.append(rsentence(random.randint(1, 4)))
        elif roll == 2:
            # Close a randomly chosen tag, whether or not it is open.
            pieces.append("</%s>" % random.choice(tag_names))
        # A roll of 3 adds nothing to the document.
    body = "\n".join(pieces)
    return "<html>" + body + "</html>"
193
194
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates one random invalid document and times how long Beautiful
    Soup takes to parse it with each parser, then times lxml and
    html5lib on their own for comparison.

    :param num_elements: Size of the generated document, passed to `rdoc`.
    :return: None; timings are printed to standard output.
    """
    print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
    data = rdoc(num_elements)
    print(("Generated a large invalid HTML document (%d bytes)." % len(data)))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            # perf_counter() is a monotonic, high-resolution clock meant
            # for measuring intervals; time.time() can jump.
            started = time.perf_counter()
            BeautifulSoup(data, parser)
            elapsed = time.perf_counter() - started
        except Exception:
            # A parser that is missing or fails is reported, not fatal.
            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        else:
            print(("BS4+%s parsed the markup in %.2fs." % (parser, elapsed)))

    # The raw-parser comparisons are skipped, rather than crashing the
    # benchmark, when the corresponding library is unavailable.
    try:
        from lxml import etree
    except ImportError:
        print("lxml is not installed or couldn't be imported.")
    else:
        started = time.perf_counter()
        etree.HTML(data)
        elapsed = time.perf_counter() - started
        print(("Raw lxml parsed the markup in %.2fs." % elapsed))

    try:
        import html5lib
    except ImportError:
        print("html5lib is not installed or couldn't be imported.")
    else:
        raw_parser = html5lib.HTMLParser()
        started = time.perf_counter()
        raw_parser.parse(data)
        elapsed = time.perf_counter() - started
        print(("Raw html5lib parsed the markup in %.2fs." % elapsed))
225
226
def profile(num_elements=100000, parser="lxml"):
    """Use Python's profiler on a randomly generated document.

    :param num_elements: Size of the generated document, passed to `rdoc`.
    :param parser: Parser name handed to the BeautifulSoup constructor.
    :return: None; the profile report is printed to standard output.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # Profile in memory instead of dumping to a NamedTemporaryFile: that
    # file was never closed, and reopening it by name while it is still
    # open fails on Windows.
    profiler = cProfile.Profile()
    profiler.runctx('bs4.BeautifulSoup(data, parser)', namespace, namespace)

    stats = pstats.Stats(profiler)
    # Directory names are deliberately not stripped; the restriction below
    # is a regular expression matched against each printed entry.
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)
239
240
# When executed as a script, read markup from standard input and run the
# diagnostic suite on it.
if __name__ == '__main__':
    markup = sys.stdin.read()
    diagnose(markup)
243
244