CoCalc -- diagnose.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/diagnose.py
⁸¹¹ views
1
"""Diagnostic functions, mainly for use when doing tech support."""
2

3
# Use of this source code is governed by the MIT license.
4
__license__ = "MIT"
5

6
import cProfile
7
from io import StringIO
8
from html.parser import HTMLParser
9
import bs4
10
from bs4 import BeautifulSoup, __version__
11
from bs4.builder import builder_registry
12

13
import os
14
import pstats
15
import random
16
import tempfile
17
import time
18
import traceback
19
import sys
20
import cProfile
21

22
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints the Beautiful Soup and Python versions, reports which of the
    basic parsers have a registered tree builder, and then shows what
    each available parser makes of the markup.

    :param data: The markup that needs to be explained. This may be a
        string or bytestring of markup, an object with a ``read()``
        method, or the name of an existing file to read the markup from.
    :return: None; diagnostics are printed to standard output.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Build the list of usable parsers as a new list. Removing entries
    # from a list while iterating over it skips the entry that follows
    # each removed one, so a missing parser could go unnoticed.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        if any(name in builder.features
               for builder in builder_registry.builders):
            basic_parsers.append(name)
        else:
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))
        except ImportError:
            print(
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError:
            print(
                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    else:
        # bytes.startswith() rejects str prefixes, so pick prefixes that
        # match the type of the markup we were given.
        if isinstance(data, bytes):
            url_prefixes = (b"http:", b"https:")
        else:
            url_prefixes = ("http:", "https:")

        if data.startswith(url_prefixes):
            print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
            print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
            return

        try:
            if os.path.exists(data):
                print('"%s" looks like a filename. Reading data from the file.' % data)
                with open(data) as fp:
                    data = fp.read()
        except ValueError:
            # This can happen on some platforms when the 'filename' is
            # too long. Assume it's data and not a filename.
            pass
        print("")

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        try:
            soup = BeautifulSoup(data, features=parser)
        except Exception:
            # Keep going: the point of the suite is to compare parsers,
            # so one parser failing must not hide the others' results.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
92

93
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    No Beautiful Soup code runs here, so the output shows how lxml on
    its own handles the document. That helps decide whether an
    lxml-specific problem lies in Beautiful Soup's lxml tree builders
    or in lxml itself.

    :param data: Some markup.
    :param html: If True, markup will be parsed with lxml's HTML parser.
       if False, lxml's XML parser will be used.
    :param kwargs: Extra keyword arguments, forwarded to
       ``etree.iterparse``.
    """
    from lxml import etree

    stream = StringIO(data)
    parse_events = etree.iterparse(stream, html=html, **kwargs)
    for kind, node in parse_events:
        print("%s, %4s, %s" % (kind, node.tag, node.text))
108

109
class AnnouncingParser(HTMLParser):
    """HTMLParser subclass whose only job is to announce parse events.

    Each handler prints one line made of the value it received followed
    by a label naming the event. Use it to see how html.parser views a
    given document; the easiest way to do this is to call
    `htmlparser_trace`.
    """

    def _p(self, s):
        # Single output point for every announcement.
        print(s)

    def _announce(self, template, value):
        # Fill the event's line template with the value and emit it.
        self._p(template % value)

    def handle_starttag(self, name, attrs):
        # Only the tag name is announced; the attributes are ignored.
        self._announce("%s START", name)

    def handle_endtag(self, name):
        self._announce("%s END", name)

    def handle_data(self, data):
        self._announce("%s DATA", data)

    def handle_charref(self, name):
        self._announce("%s CHARREF", name)

    def handle_entityref(self, name):
        self._announce("%s ENTITYREF", name)

    def handle_comment(self, data):
        self._announce("%s COMMENT", data)

    def handle_decl(self, data):
        self._announce("%s DECL", data)

    def unknown_decl(self, data):
        self._announce("%s UNKNOWN-DECL", data)

    def handle_pi(self, data):
        self._announce("%s PI", data)
146

147
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    The markup is fed to a fresh `AnnouncingParser`, so the output shows
    how HTMLParser handles the document when no Beautiful Soup code is
    running.

    :param data: Some markup.
    """
    AnnouncingParser().feed(data)
157

158
# Letter pools used by rword() to build pronounceable-looking words.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"


def rword(length=5):
    "Generate a random word-like string."
    # Even positions draw from the consonants and odd positions from the
    # vowels, so the letters alternate starting with a consonant.
    return "".join(
        random.choice(_vowels if position % 2 else _consonants)
        for position in range(length)
    )
171

172
def rsentence(length=4):
    "Generate a random sentence-like string."
    # `length` counts words; each word gets its own random size of 4-9
    # letters, and the words are separated by single spaces.
    words = []
    for _ in range(length):
        words.append(rword(random.randint(4, 9)))
    return " ".join(words)
175
        
176
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Performs `num_elements` rounds. Each round rolls a number from 0 to
    3 and appends an opening tag, a random sentence or a closing tag to
    the document; a roll of 3 appends nothing. Opening and closing tags
    are picked independently, so they need not match up.

    :param num_elements: Number of rounds to perform.
    :return: The pieces joined by newlines, wrapped in <html>...</html>.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0, 3)
        if roll == 0:
            # Open a tag.
            pieces.append("<%s>" % random.choice(tag_names))
        elif roll == 1:
            # Some text.
            pieces.append(rsentence(random.randint(1, 4)))
        elif roll == 2:
            # Close a tag.
            pieces.append("</%s>" % random.choice(tag_names))
        # A roll of 3 falls through and contributes nothing.
    body = "\n".join(pieces)
    return "<html>" + body + "</html>"
193

194
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates one random invalid document with `rdoc`, then times how
    long Beautiful Soup takes to parse it with each parser, followed by
    raw lxml and raw html5lib on the same document. Results are printed
    to standard output.

    :param num_elements: Passed to `rdoc` to control the document size.
    :return: None.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    # Durations are measured with time.perf_counter(): unlike the wall
    # clock, it is monotonic and meant for timing short intervals.
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.perf_counter()
            BeautifulSoup(data, parser)
            elapsed = time.perf_counter() - started
        except Exception:
            # A parser that fails is reported and the benchmark moves on
            # to the next one.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, elapsed))

    from lxml import etree
    started = time.perf_counter()
    etree.HTML(data)
    elapsed = time.perf_counter() - started
    print("Raw lxml parsed the markup in %.2fs." % elapsed)

    import html5lib
    parser = html5lib.HTMLParser()
    started = time.perf_counter()
    parser.parse(data)
    elapsed = time.perf_counter() - started
    print("Raw html5lib parsed the markup in %.2fs." % elapsed)
225

226
def profile(num_elements=100000, parser="lxml"):
    """Use Python's profiler on a randomly generated document.

    Generates a document with `rdoc`, profiles a single
    ``bs4.BeautifulSoup(data, parser)`` call with cProfile, and prints
    the statistics sorted by cumulative time, restricted to entries
    matching ``_html5lib|bs4`` and to at most 50 lines.

    :param num_elements: Passed to `rdoc` to control the document size.
    :param parser: Parser name handed to the BeautifulSoup constructor.
    :return: None; the report is printed to standard output.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # cProfile writes the stats file by name and pstats then reopens it
    # by name. A NamedTemporaryFile that is still open cannot be opened
    # a second time on Windows, so use a path inside a temporary
    # directory instead; the context manager also cleans it up.
    with tempfile.TemporaryDirectory() as workdir:
        filename = os.path.join(workdir, "bs4-profile.stats")
        cProfile.runctx(
            'bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)

        stats = pstats.Stats(filename)
        # stats.strip_dirs()
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
239

240
# When this file is executed directly as a script, everything available
# on standard input is read and handed to diagnose().
if __name__ == "__main__":
    markup = sys.stdin.read()
    diagnose(markup)
243

244
Product

Resources

Company