# CoCalc -- testing.py
#
# GitHub Repository: hhhrrrttt222111/Dorkify
# Path: blob/master/venv/Lib/site-packages/bs4/testing.py
# 811 views
1
# encoding: utf-8
2
"""Helper classes for tests."""
3

4
# Use of this source code is governed by the MIT license.
5
__license__ = "MIT"
6

7
import pickle
8
import copy
9
import functools
10
import unittest
11
from unittest import TestCase
12
from bs4 import BeautifulSoup
13
from bs4.element import (
14
    CharsetMetaAttributeValue,
15
    Comment,
16
    ContentMetaAttributeValue,
17
    Doctype,
18
    PYTHON_SPECIFIC_ENCODINGS,
19
    SoupStrainer,
20
    Script,
21
    Stylesheet,
22
    Tag
23
)
24

25
from bs4.builder import HTMLParserTreeBuilder
26
default_builder = HTMLParserTreeBuilder
27

28
# A deliberately malformed document: one line per kind of markup problem.
# The escapes (\n, \u2603) are interpreted by Python, so they become real
# newlines / snowman characters in the resulting string.
BAD_DOCUMENT = """A bare string
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
<div>A <meta> tag</div>
<div>A <br> tag that supposedly has contents.</br></div>
<div>AT&T</div>
<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
<div><a href="http://example.com/</a> that attribute value never got closed</div>
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
<! This document starts with a bogus declaration ><div>a</div>
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
<div>This document ends with <!an incomplete declaration
<div><a style={height:21px;}>That attribute value was bogus</a></div>
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
<div>This document ends before the entity finishes: &gt
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
<div><table><tr><td>Here's a table</td></tr></table></div>
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
<div>This tag contains nothing but whitespace: <b>    </b></div>
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
<div><table><div>This table contains bare markup</div></table></div>
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n   <a href="link2">This link is closed.</a>\n  </div>\n</div></div>
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
"""
63

64

65
class SoupTest(unittest.TestCase):
    """Base test case with helpers for building soups and checking tree links."""

    @property
    def default_builder(self):
        """Return the module-level ``default_builder``."""
        return default_builder

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup.

        A ``builder`` keyword argument overrides ``self.default_builder``;
        every other keyword argument is passed through to ``BeautifulSoup``.
        """
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup, **kwargs):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder(**kwargs).test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that parsing ``to_parse`` decodes to the expected document.

        When ``compare_parsed_to`` is None, the parsed markup is compared
        against the document built from ``to_parse`` itself.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
        """
        earlier = None
        for e in element.descendants:
            # Compare against None explicitly: a falsy node (for example an
            # empty string) must not cause the linkage check to be skipped.
            if earlier is not None:
                self.assertEqual(e, earlier.next_element)
                self.assertEqual(earlier, e.previous_element)
            earlier = e

    def linkage_validator(self, el, _recursive_call=False):
        """Ensure proper linkage throughout the document.

        Walks ``el.contents`` recursively and asserts that the sibling and
        element links of every node agree with its position. The top-level
        call returns None; a recursive call returns the last node reached
        beneath ``el`` so the caller can check what follows it.
        """
        descendant = None
        # Document element should have no previous element or previous sibling.
        # It also shouldn't have a next sibling.
        if el.parent is None:
            assert el.previous_element is None,\
                "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_element, None
                )
            assert el.previous_sibling is None,\
                "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_sibling, None
                )
            assert el.next_sibling is None,\
                "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                    el, el.next_sibling, None
                )

        child = None
        last_child = None
        last_idx = len(el.contents) - 1
        for idx, child in enumerate(el.contents):
            descendant = None

            # Parent should link next element to their first child
            # That child should have no previous sibling
            if idx == 0:
                if el.parent is not None:
                    assert el.next_element is child,\
                       "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                            el, el.next_element, child
                        )
                    assert child.previous_element is el,\
                       "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                            child, child.previous_element, el
                        )
                    assert child.previous_sibling is None,\
                       "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
                            child, child.previous_sibling, None
                        )

            # If not the first child, previous index should link as sibling to this index
            # Previous element should match the last index or the last bubbled up descendant
            else:
                assert child.previous_sibling is el.contents[idx - 1],\
                    "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
                        child, child.previous_sibling, el.contents[idx - 1]
                    )
                assert el.contents[idx - 1].next_sibling is child,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
                    )

                if last_child is not None:
                    assert child.previous_element is last_child,\
                        "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
                            child, child.previous_element, last_child, child.parent.contents
                        )
                    assert last_child.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            last_child, last_child.next_element, child
                        )

            if isinstance(child, Tag) and child.contents:
                descendant = self.linkage_validator(child, True)
                # A bubbled up descendant should have no next siblings
                assert descendant.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        descendant, descendant.next_sibling, None
                    )

            # Mark last child as either the bubbled up descendant or the current child
            if descendant is not None:
                last_child = descendant
            else:
                last_child = child

            # If last child, there are no next siblings
            if idx == last_idx:
                assert child.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_sibling, None
                    )

        # The deepest trailing node is the last bubbled-up descendant, else the
        # last direct child, else (no contents at all) the element itself.
        child = descendant if descendant is not None else child
        if child is None:
            child = el

        if not _recursive_call and child is not None:
            # Climb towards the root until an ancestor has a next sibling;
            # that sibling (or None at the top) must follow the trailing node.
            target = el
            while True:
                if target is None:
                    assert child.next_element is None, \
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            child, child.next_element, None
                        )
                    break
                elif target.next_sibling is not None:
                    assert child.next_element is target.next_sibling, \
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            child, child.next_element, target.next_sibling
                        )
                    break
                target = target.parent

            # We are done, so nothing to return
            return None
        else:
            # Return the child to the recursive caller
            return child
215

216

217
class HTMLTreeBuilderSmokeTest(object):
218

219
    """A basic test of a treebuilder's competence.
220

221
    Any HTML treebuilder, present or future, should be able to pass
222
    these tests. With invalid markup, there's room for interpretation,
223
    and different parsers can handle it differently. But with the
224
    markup in these tests, there's not much room for interpretation.
225
    """
226

227
    def test_empty_element_tags(self):
        """Every HTML4 and HTML5 void element name yields an empty-element tag.

        NOTE(review): a later method in this class reuses the name
        ``test_empty_element_tags``; the later definition replaces this one
        in the class namespace.
        """
        void_names = (
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
            'keygen', 'link', 'menuitem', 'meta', 'param', 'source',
            'track', 'wbr',
            'spacer', 'frame',
        )
        for tag_name in void_names:
            document = self.soup("")
            created = document.new_tag(tag_name)
            self.assertEqual(True, created.is_empty_element)
238

239
    def test_special_string_containers(self):
        """<style> and <script> contents use their dedicated string classes.

        ``assertIsInstance`` is used rather than a bare ``assert`` so the
        checks still run when Python is started with ``-O``.
        """
        soup = self.soup(
            "<style>Some CSS</style><script>Some Javascript</script>"
        )
        self.assertIsInstance(soup.style.string, Stylesheet)
        self.assertIsInstance(soup.script.string, Script)

        soup = self.soup(
            "<style><!--Some CSS--></style>"
        )
        self.assertIsInstance(soup.style.string, Stylesheet)
        # The contents of the style tag resemble an HTML comment, but
        # it's not treated as a comment.
        self.assertEqual("<!--Some CSS-->", soup.style.string)
        self.assertIsInstance(soup.style.string, Stylesheet)
254
        
255
    def test_pickle_and_unpickle_identity(self):
        """A tree that is pickled and unpickled matches the original."""
        original = self.soup("<a><b>foo</a>")
        # Protocol 2 is requested explicitly.
        restored = pickle.loads(pickle.dumps(original, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), original.decode())
263

264
    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        encoded_doctype, parsed = self._document_with_doctype(doctype_fragment)

        # The first node of the document must be a Doctype holding the fragment.
        first_node = parsed.contents[0]
        self.assertEqual(first_node.__class__, Doctype)
        self.assertEqual(first_node, doctype_fragment)

        # The serialized document must start with the original declaration.
        serialized_prefix = parsed.encode("utf8")[:len(encoded_doctype)]
        self.assertEqual(serialized_prefix, encoded_doctype)

        # The doctype is attached to the tree and the rest of the
        # document was parsed as well.
        self.assertEqual(parsed.p.contents[0], 'foo')
280

281
    def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
        """Generate and parse a document with the given doctype.

        Returns a ``(declaration_bytes, soup)`` pair, where the bytes are
        the UTF-8 encoding of the generated ``<!...>`` declaration.
        """
        declaration = '<!%s %s>' % (doctype_string, doctype_fragment)
        parsed = self.soup(declaration + '\n<p>foo</p>')
        return declaration.encode("utf8"), parsed
287

288
    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        everyday_doctypes = (
            "html",
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"',
        )
        for fragment in everyday_doctypes:
            self.assertDoctypeHandled(fragment)
293

294
    def test_empty_doctype(self):
        """An empty doctype declaration yields a node that strips to ""."""
        parsed = self.soup("<!DOCTYPE>")
        first_node = parsed.contents[0]
        self.assertEqual("", first_node.strip())
298

299
    def test_mixed_case_doctype(self):
        """A lowercase or mixed-case doctype becomes a Doctype."""
        for keyword in ("doctype", "DocType"):
            encoded_doctype, parsed = self._document_with_doctype(
                "html", keyword
            )

            # A Doctype object was created...
            first_node = parsed.contents[0]
            self.assertEqual(first_node.__class__, Doctype)
            self.assertEqual(first_node, "html")

            # ...and it is written back out with an uppercase DOCTYPE.
            serialized_prefix = parsed.encode("utf8")[:len(encoded_doctype)]
            self.assertEqual(serialized_prefix, b"<!DOCTYPE html>")

            # The doctype is attached to the tree and the rest of the
            # document was parsed as well.
            self.assertEqual(parsed.p.contents[0], 'foo')
319
        
320
    def test_public_doctype_with_url(self):
        """A PUBLIC doctype that also carries a DTD URL is handled."""
        fragment = (
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
            '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        )
        self.assertDoctypeHandled(fragment)
323

324
    def test_system_doctype(self):
        """A doctype with a SYSTEM identifier is handled."""
        fragment = 'foo SYSTEM "http://www.example.com/"'
        self.assertDoctypeHandled(fragment)
326

327
    def test_namespaced_system_doctype(self):
        """We can handle a namespaced doctype with a system ID."""
        fragment = 'xsl:stylesheet SYSTEM "htmlent.dtd"'
        self.assertDoctypeHandled(fragment)
330

331
    def test_namespaced_public_doctype(self):
        """We can handle a namespaced doctype with a public ID."""
        fragment = 'xsl:stylesheet PUBLIC "htmlent.dtd"'
        self.assertDoctypeHandled(fragment)
334

335
    def test_real_xhtml_document(self):
336
        """A real XHTML document should come out more or less the same as it went in."""
337
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
338
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
339
<html xmlns="http://www.w3.org/1999/xhtml">
340
<head><title>Hello.</title></head>
341
<body>Goodbye.</body>
342
</html>"""
343
        soup = self.soup(markup)
344
        self.assertEqual(
345
            soup.encode("utf-8").replace(b"\n", b""),
346
            markup.replace(b"\n", b""))
347

348
    def test_namespaced_html(self):
        """When a namespaced XML document is parsed as HTML it should
        be treated as HTML with weird tag names.
        """
        parsed = self.soup(b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""")
        matches = parsed.find_all("ns1:foo")
        self.assertEqual(2, len(matches))
355
        
356
    def test_processing_instruction(self):
        """A processing instruction round-trips from str and from bytes.

        Both input types are tested to verify that process_markup sets
        processing_instruction_class even when the markup is already
        Unicode and there is nothing to process.
        """
        text_markup = """<?PITarget PIContent?>"""
        from_text = self.soup(text_markup)
        self.assertEqual(text_markup, from_text.decode())

        bytes_markup = b"""<?PITarget PIContent?>"""
        from_bytes = self.soup(bytes_markup)
        self.assertEqual(bytes_markup, from_bytes.encode("utf8"))
368

369
    def test_deepcopy(self):
370
        """Make sure you can copy the tree builder.
371

372
        This is important because the builder is part of a
373
        BeautifulSoup object, and we want to be able to copy that.
374
        """
375
        copy.deepcopy(self.default_builder)
376

377
    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Markup that writes it as ``<p/>`` must still be presented as
        ``<p></p>``.
        """
        paragraph = self.soup("<p/>").p
        self.assertFalse(paragraph.is_empty_element)
        self.assertEqual(str(paragraph), "<p></p>")
386

387
    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        cases = (
            ("<p>", "<p></p>"),
            ("<b>", "<b></b>"),
            # An empty-element tag is written in self-closing form instead.
            ("<br>", "<br/>"),
        )
        for given, wanted in cases:
            self.assertSoupEquals(given, wanted)
396

397
    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        parsed = self.soup("<br></br>")
        self.assertTrue(parsed.br.is_empty_element)
        rendered = str(parsed.br)
        self.assertEqual(rendered, "<br/>")
406

407
    def test_nested_formatting_elements(self):
        """An <em> nested directly inside another <em> round-trips unchanged."""
        markup = "<em><em></em></em>"
        self.assertSoupEquals(markup)
409

410
    def test_double_head(self):
411
        html = '''<!DOCTYPE html>
412
<html>
413
<head>
414
<title>Ordinary HEAD element test</title>
415
</head>
416
<script type="text/javascript">
417
alert("Help!");
418
</script>
419
<body>
420
Hello, world!
421
</body>
422
</html>
423
'''
424
        soup = self.soup(html)
425
        self.assertEqual("text/javascript", soup.find('script')['type'])
426

427
    def test_comment(self):
        """Comments are represented as Comment objects linked into the tree."""
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        parsed = self.soup(markup)
        comment_node = parsed.find(text="foobar")
        self.assertEqual(comment_node.__class__, Comment)

        # The comment sits between the two text nodes in document order.
        before = parsed.find(text="foo")
        self.assertEqual(comment_node, before.next_element)
        after = parsed.find(text="baz")
        self.assertEqual(comment_node, after.previous_element)
441

442
    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags,
        even if that would mean not prettifying the markup.
        """
        pre_markup = "<pre>   </pre>"
        textarea_markup = "<textarea> woo\nwoo  </textarea>"
        for markup in (pre_markup, textarea_markup):
            self.assertSoupEquals(markup)

        # prettify() must hand back exactly what went in.
        pre_tag = self.soup(pre_markup).pre
        self.assertEqual(pre_tag.prettify(), pre_markup)

        textarea_tag = self.soup(textarea_markup).textarea
        self.assertEqual(textarea_tag.prettify(), textarea_markup)

        empty_textarea = self.soup("<textarea></textarea>").textarea
        self.assertEqual(empty_textarea.prettify(), "<textarea></textarea>")
459

460
    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        # Check the doubly nested markup itself rather than nested_b_tag
        # a second time.
        self.assertSoupEquals(double_nested_b_tag)
470

471
    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        parsed = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        outer = parsed.blockquote
        # The <b> is reachable both through <p> and directly from <blockquote>.
        self.assertEqual(outer.p.b.string, 'Foo')
        self.assertEqual(outer.b.string, 'Foo')
477

478
    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        # The outer <tr> and <table> are left unclosed in the input.
        unclosed = ('<table id="1">'
                    '<tr>'
                    "<td>Here's another table:"
                    '<table id="2">'
                    '<tr><td>foo</td></tr>'
                    '</table></td>')
        closed = ('<table id="1"><tr><td>Here\'s another table:'
                  '<table id="2"><tr><td>foo</td></tr></table>'
                  '</td></tr></table>')
        self.assertSoupEquals(unclosed, closed)

        sectioned = ("<table><thead><tr><td>Foo</td></tr></thead>"
                     "<tbody><tr><td>Bar</td></tr></tbody>"
                     "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
        self.assertSoupEquals(sectioned)
497

498
    def test_multivalued_attribute_with_whitespace(self):
        """Whitespace separating the values of a multi-valued attribute
        should be ignored.
        """
        parsed = self.soup('<div class=" foo bar	 "></a>')
        self.assertEqual(['foo', 'bar'], parsed.div['class'])

        # If you search by the literal name of the class it's like the
        # whitespace wasn't there.
        found = parsed.find('div', class_="foo bar")
        self.assertEqual(parsed.div, found)
509
        
510
    def test_deeply_nested_multivalued_attribute(self):
        """A nested tag's multi-valued attribute stays a simple list.

        html5lib can set the attributes of the same tag many times as it
        rearranges the tree. This has caused problems with multivalued
        attributes.
        """
        parsed = self.soup('<table><div><div class="css"></div></div></table>')
        inner_div = parsed.div.div
        self.assertEqual(["css"], inner_div['class'])
517

518
    def test_multivalued_attribute_on_html(self):
        """A multi-valued attribute on <html> is split into a list.

        html5lib uses a different API to set the attributes of the <html>
        tag. This has caused problems with multivalued attributes.
        """
        parsed = self.soup('<html class="a b"></html>')
        html_classes = parsed.html['class']
        self.assertEqual(["a", "b"], html_classes)
525

526
    def test_angle_brackets_in_attribute_values_are_escaped(self):
        """Angle brackets inside an attribute value come out as entities."""
        given = '<a b="<a>"></a>'
        wanted = '<a b="&lt;a&gt;"></a>'
        self.assertSoupEquals(given, wanted)
528

529
    def test_strings_resembling_character_entity_references(self):
        """"&T" and "&p" look like incomplete character entities, but they
        are not; only the real ``&bull;`` entity is converted.
        """
        given = "<p>&bull; AT&T is in the s&p 500</p>"
        wanted = "<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
        self.assertSoupEquals(given, wanted)
536

537
    def test_apos_entity(self):
        """The ``&apos;`` entity is converted to an apostrophe."""
        given = "<p>Bob&apos;s Bar</p>"
        wanted = "<p>Bob's Bar</p>"
        self.assertSoupEquals(given, wanted)
542
        
543
    def test_entities_in_foreign_document_encoding(self):
        """Numeric entities are converted to Unicode characters.

        &#147; and &#148; are invalid numeric entities referencing
        Windows-1252 characters. &#45; references a character common to
        Windows-1252 and Unicode, and &#9731; references a character only
        found in Unicode. All of them should be converted.
        """
        parsed = self.soup("<p>&#147;Hello&#148; &#45;&#9731;</p>")
        paragraph_text = parsed.p.string
        self.assertEqual("“Hello” -☃", paragraph_text)
554
        
555
    def test_entities_in_attributes_converted_to_unicode(self):
        """Decimal, hex and named entities in an attribute become one character."""
        wanted = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        spellings = (
            '<p id="pi&#241;ata"></p>',
            '<p id="pi&#xf1;ata"></p>',
            '<p id="pi&#Xf1;ata"></p>',
            '<p id="pi&ntilde;ata"></p>',
        )
        for given in spellings:
            self.assertSoupEquals(given, wanted)
561

562
    def test_entities_in_text_converted_to_unicode(self):
        """Decimal, hex and named entities in text become one character."""
        wanted = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        spellings = (
            "<p>pi&#241;ata</p>",
            "<p>pi&#xf1;ata</p>",
            "<p>pi&#Xf1;ata</p>",
            "<p>pi&ntilde;ata</p>",
        )
        for given in spellings:
            self.assertSoupEquals(given, wanted)
568

569
    def test_quot_entity_converted_to_quotation_mark(self):
        """The ``&quot;`` entity in text becomes a literal quotation mark."""
        given = "<p>I said &quot;good day!&quot;</p>"
        wanted = '<p>I said "good day!"</p>'
        self.assertSoupEquals(given, wanted)
572

573
    def test_out_of_range_entity(self):
        """Out-of-range numeric entities become the replacement character."""
        wanted = "\N{REPLACEMENT CHARACTER}"
        out_of_range = (
            "&#10000000000000;",
            "&#x10000000000000;",
            "&#1000000000;",
        )
        for given in out_of_range:
            self.assertSoupEquals(given, wanted)
578
        
579
    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        parsed = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        # The element following the <h2> text must be the <p> tag.
        following = parsed.h2.string.next_element
        self.assertEqual("p", following.name)
        self.assertEqual("p", parsed.p.name)
        self.assertConnectedness(parsed)
585

586
    def test_empty_element_tags(self):
        """Verify consistent handling of empty-element tags,
        no matter how they come in through the markup.

        This class defines ``test_empty_element_tags`` twice and Python
        keeps only the last definition, so the void-element name check
        from the earlier definition is repeated here to make sure it is
        actually executed.
        """
        # All HTML4 and HTML5 empty element (aka void element) names must
        # produce a tag flagged as an empty-element tag.
        for name in (
                'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
                'keygen', 'link', 'menuitem', 'meta', 'param', 'source',
                'track', 'wbr',
                'spacer', 'frame',
        ):
            soup = self.soup("")
            new_tag = soup.new_tag(name)
            self.assertEqual(True, new_tag.is_empty_element)

        # With or without a space before the slash, output is the same.
        self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
        self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
592
        
593
    def test_head_tag_between_head_and_body(self):
594
        "Prevent recurrence of a bug in the html5lib treebuilder."
595
        content = """<html><head></head>
596
  <link></link>
597
  <body>foo</body>
598
</html>
599
"""
600
        soup = self.soup(content)
601
        self.assertNotEqual(None, soup.html.body)
602
        self.assertConnectedness(soup)
603

604
    def test_multiple_copies_of_a_tag(self):
605
        "Prevent recurrence of a bug in the html5lib treebuilder."
606
        content = """<!DOCTYPE html>
607
<html>
608
 <body>
609
   <article id="a" >
610
   <div><a href="1"></div>
611
   <footer>
612
     <a href="2"></a>
613
   </footer>
614
  </article>
615
  </body>
616
</html>
617
"""
618
        soup = self.soup(content)
619
        self.assertConnectedness(soup.article)
620

621
    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""
        markup = (
            b'<html xmlns="http://www.w3.org/1999/xhtml"'
            b' xmlns:mathml="http://www.w3.org/1998/Math/MathML"'
            b' xmlns:svg="http://www.w3.org/2000/svg">'
            b'<head></head><body><mathml:msqrt>4</mathml:msqrt>'
            b'<b svg:fill="red"></b></body></html>'
        )
        parsed = self.soup(markup)
        self.assertEqual(markup, parsed.encode())

        # Each namespace declaration survives as a plain attribute.
        html = parsed.html
        self.assertEqual('http://www.w3.org/1999/xhtml', html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', html['xmlns:svg'])
635

636
    def test_multivalued_attribute_value_becomes_list(self):
        """A space-separated ``class`` value is exposed as a list."""
        parsed = self.soup(b'<a class="foo bar">')
        class_values = parsed.a['class']
        self.assertEqual(['foo', 'bar'], class_values)
640

641
    #
642
    # Generally speaking, tests below this point are more tests of
643
    # Beautiful Soup than tests of the tree builders. But parsers are
644
    # weird, so we run these tests separately for every tree builder
645
    # to detect any differences between them.
646
    #
647

648
    def test_can_parse_unicode_document(self):
        """A Unicode document parses despite a conflicting declared encoding.

        The markup is already Unicode and contains a character that can't
        be represented in the encoding named in its <meta> tag.
        """
        parsed = self.soup(
            '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        )
        body_text = parsed.body.string
        self.assertEqual('Sacr\xe9 bleu!', body_text)
655

656
    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        only_bold = SoupStrainer("b")
        parsed = self.soup(
            "A <b>bold</b> <meta/> <i>statement</i>",
            parse_only=only_bold,
        )
        # Only the <b> element is kept in the output.
        self.assertEqual(parsed.decode(), "<b>bold</b>")
662

663
    def test_single_quote_attribute_values_become_double_quotes(self):
        """A single-quoted attribute value is written out double-quoted."""
        given = "<foo attr='bar'></foo>"
        wanted = '<foo attr="bar"></foo>'
        self.assertSoupEquals(given, wanted)
666

667
    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        """A single-quoted value containing double quotes round-trips as is."""
        markup = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(markup)
670

671
    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        """A value holding both quote kinds has its double quotes escaped."""
        parsed = self.soup("""<foo attr='bar "brawls" happen'>a</foo>""")
        # Replace the value with one that contains both kinds of quote.
        parsed.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        rendered = parsed.foo.decode()
        self.assertSoupEquals(
            rendered,
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
678

679
    def test_ampersand_in_attribute_value_gets_escaped(self):
        """A bare ampersand in an attribute value comes out as ``&amp;``."""
        cases = (
            ('<this is="really messed up & stuff"></this>',
             '<this is="really messed up &amp; stuff"></this>'),
            ('<a href="http://example.org?a=1&b=2;3">foo</a>',
             '<a href="http://example.org?a=1&amp;b=2;3">foo</a>'),
        )
        for given, wanted in cases:
            self.assertSoupEquals(given, wanted)
686

687
    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        """An already-escaped ampersand in an attribute round-trips unchanged."""
        markup = '<a href="http://example.org?a=1&amp;b=2;3"></a>'
        self.assertSoupEquals(markup)
689

690
    def test_entities_in_strings_converted_during_parsing(self):
        """Both XML and HTML entities are converted to Unicode characters
        during parsing.
        """
        given = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        wanted = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(given, wanted)
696

697
    def test_smart_quotes_converted_on_the_way_in(self):
        """Microsoft smart quotes are converted to Unicode characters during
        parsing.
        """
        parsed = self.soup(b"<p>\x91Foo\x92</p>")
        self.assertEqual(
            parsed.p.string,
            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
705

706
    def test_non_breaking_spaces_converted_on_the_way_in(self):
        """Each &nbsp; entity is parsed into a NO-BREAK SPACE character."""
        anchor = self.soup("<a>&nbsp;&nbsp;</a>").a
        two_spaces = "\N{NO-BREAK SPACE}" * 2
        self.assertEqual(anchor.string, two_spaces)
709

710
    def test_entities_converted_on_the_way_out(self):
        """Encoding a parsed tag to UTF-8 emits the real e-acute character
        while the angle brackets stay written as entities."""
        source = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        wanted_text = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        wanted_bytes = wanted_text.encode("utf-8")

        paragraph = self.soup(source).p
        self.assertEqual(paragraph.encode("utf-8"), wanted_bytes)
715

716
    def test_real_iso_latin_document(self):
        """Smoke test of interrelated functionality on a small document:
        parse ISO-8859-1 bytes, re-encode as UTF-8, and check that the
        <meta> charset declaration follows the new encoding."""

        # The document as text. Its <meta> tag claims ISO-Latin-1 because the
        # bytes handed to the parser below really are ISO-8859-1.
        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
        latin1_bytes = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 bytes, then serialize the tree as UTF-8.
        actual = self.soup(latin1_bytes).encode("utf-8")

        # The output should match the original text except that the <meta>
        # tag now says utf-8, and the whole thing is UTF-8 bytes.
        with_new_charset = unicode_html.replace("ISO-Latin-1", "utf-8")
        wanted = with_new_charset.encode("utf-8")

        self.assertEqual(actual, wanted)
742

743
    def test_real_shift_jis_document(self):
        """Smoke test: a document that started life as Shift-JIS bytes can be
        parsed and re-encoded without the parser choking."""
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')

        # The parser is fed the decoded text, not the raw bytes.
        unicode_html = shift_jis_html.decode("shift-jis")
        tree = self.soup(unicode_html)

        # The tree must serialize to the same bytes as the text itself for
        # each target encoding.
        for codec in ("utf-8", "euc_jp"):
            self.assertEqual(tree.encode(codec), unicode_html.encode(codec))
759

760
    def test_real_hebrew_document(self):
        """A real-world check that ISO-8859-8 (a Hebrew encoding) can be
        converted to UTF-8.

        The document is parsed with an explicit ``from_encoding`` and the
        UTF-8 serialization is compared with a direct decode/encode of the
        same bytes.
        """
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")

        # Some tree builders call it iso8859-8, others call it iso-8859-8.
        # That's not a difference we really care about. assertIn is used
        # rather than a bare assert so the check survives `python -O`.
        self.assertIn(soup.original_encoding, ('iso8859-8', 'iso-8859-8'))

        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))
772

773
    def test_meta_tag_reflects_current_encoding(self):
        """The charset inside an http-equiv <meta> tag's content attribute
        tracks whatever encoding the value is encoded to."""
        # A <meta> tag declaring that the document is Shift-JIS ...
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # ... embedded in a document.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        tree = self.soup(shift_jis_html)

        # Right after parsing, the attribute compares equal to its original
        # text ...
        content = tree.find('meta', {'http-equiv': 'Content-type'})['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # ... but it is really a ContentMetaAttributeValue object ...
        self.assertIsInstance(content, ContentMetaAttributeValue)

        # ... whose encoded form names the encoding it is encoded to.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.
800

801
    def test_html5_style_meta_tag_reflects_current_encoding(self):
        """The charset attribute of an HTML5-style <meta> tag tracks whatever
        encoding the value is encoded to."""
        # A <meta> tag declaring that the document is Shift-JIS ...
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # ... embedded in a document.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        tree = self.soup(shift_jis_html)

        # Right after parsing, the attribute compares equal to its original
        # text ...
        charset = tree.find('meta', id="encoding")['charset']
        self.assertEqual('x-sjis', charset)

        # ... but it is really a CharsetMetaAttributeValue object ...
        self.assertIsInstance(charset, CharsetMetaAttributeValue)

        # ... whose encoded form names the encoding it is encoded to.
        self.assertEqual('utf8', charset.encode("utf8"))
824

825
    def test_python_specific_encodings_not_used_in_charset(self):
        """A document encoded with a Python-specific codec must not name
        that codec in its <meta> charset.

        You can encode an HTML document using a Python-specific encoding,
        but that encoding won't be mentioned _inside_ the resulting
        document. Instead, the document will appear to have no encoding.
        """
        # For one reason or another, these will raise an exception if we
        # actually try to use them, so don't bother.
        unusable = (
            'idna', 'mbcs', 'oem', 'undefined',
            'string_escape', 'string-escape'
        )

        # Two separate documents. The comma between the literals matters:
        # without it Python concatenates them into a single document and the
        # loop body runs only once.
        documents = [
            b'<meta charset="utf8"></head>',
            b'<meta id="encoding" charset="utf-8" />',
        ]
        for markup in documents:
            soup = self.soup(markup)
            for encoding in PYTHON_SPECIFIC_ENCODINGS:
                if encoding in unusable:
                    continue
                encoded = soup.encode(encoding)
                # Only the attribute itself is checked, because the second
                # document has another attribute ahead of charset.
                self.assertIn(b'charset=""', encoded)
                self.assertNotIn(encoding.encode("ascii"), encoded)
847
        
848
    def test_tag_with_no_attributes_can_have_attributes_added(self):
        """An attribute assigned to a tag parsed without attributes appears
        in the tag's serialized form."""
        anchor = self.soup("<a>text</a>").a
        anchor['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', anchor.decode())
852

853
    def test_worst_case(self):
        """Test the worst case (currently) for linking issues."""
        # Parse the deliberately broken module-level document and hand the
        # resulting tree to the linkage validator.
        broken_tree = self.soup(BAD_DOCUMENT)
        self.linkage_validator(broken_tree)
858

859

860
class XMLTreeBuilderSmokeTest(object):
    """Smoke tests for a tree builder that parses XML.

    The tests call ``self.soup``, ``self.assertSoupEquals``,
    ``self.linkage_validator`` and the unittest assertion methods, none of
    which are defined here, so this class is presumably meant to be mixed
    into a test case that supplies them.
    """

    def test_pickle_and_unpickle_identity(self):
        """Pickling a tree, then unpickling it, yields a tree identical to
        the original."""
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def test_docstring_generated(self):
        """Encoding a bare document adds an XML declaration naming utf-8."""
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        """A document that carries its own XML declaration round-trips."""
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_python_specific_encodings_not_used_in_xml_declaration(self):
        """A Python-specific codec must not be named in the XML declaration.

        You can encode an XML document using a Python-specific encoding,
        but that encoding won't be mentioned _inside_ the resulting
        document.
        """
        markup = b"""<?xml version="1.0"?>\n<foo/>"""
        soup = self.soup(markup)

        # For one reason or another, these will raise an exception if we
        # actually try to use them, so don't bother.
        unusable = (
            'idna', 'mbcs', 'oem', 'undefined',
            'string_escape', 'string-escape'
        )
        for encoding in PYTHON_SPECIFIC_ENCODINGS:
            if encoding in unusable:
                continue
            encoded = soup.encode(encoding)
            # unittest assertions instead of bare asserts, so the checks are
            # not stripped when Python runs with -O.
            self.assertIn(b'<?xml version="1.0"?>', encoded)
            self.assertNotIn(encoding.encode("ascii"), encoded)

    def test_processing_instruction(self):
        """A processing instruction after the XML declaration round-trips."""
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_nested_namespaces(self):
        """Default namespaces redeclared at several nesting levels, plus a
        prefixed attribute, survive a parse/encode round trip."""
        doc = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<parent xmlns="http://ns1/">
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
</child>
</parent>"""
        soup = self.soup(doc)
        self.assertEqual(doc, soup.encode())

    def test_formatter_processes_script_tag_for_xml_documents(self):
        """In an XML document the text of a <script> tag is entity-escaped
        on output."""
        doc = """
  <script type="text/javascript">
  </script>
"""
        # NOTE(review): this builds the soup with the "lxml-xml" builder
        # directly instead of going through self.soup like the other tests.
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertIn(b"&lt; &lt; hey &gt; &gt;", encoded)

    def test_can_parse_unicode_document(self):
        """A document passed in as text is parsed even though its
        declaration names a different encoding (euc-jp)."""
        # NOTE(review): the declaration ends in '>' rather than '?>'; it is
        # left as found because it may be deliberate -- confirm.
        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        """Several prefixed siblings followed by an unprefixed one serialize
        back to the original markup."""
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            str(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        """The generated XML declaration names the encoding passed to
        encode()."""
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        """A tag with no contents is written as an empty-element tag; a tag
        with contents is not."""
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        """xmlns:* declarations are readable as attributes of the tag that
        declared them."""
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        """A prefixed child tag is closed with its prefix on output."""
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.p), markup)

    def test_namespaced_attributes(self):
        """A prefixed attribute keeps its prefix on output."""
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        """An attribute using the xml: prefix keeps it on output."""
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)

    def test_find_by_prefixed_name(self):
        """find_all() matches tags both by bare name and by prefix:name."""
        # The <Document> start tag is closed with '>' after its last
        # namespace declaration so that the fixture is well-formed XML.
        doc = """<?xml version="1.0" encoding="utf-8"?>
<Document xmlns="http://example.com/ns0"
    xmlns:ns1="http://example.com/ns1"
    xmlns:ns2="http://example.com/ns2">
    <ns1:tag>foo</ns1:tag>
    <ns1:tag>bar</ns1:tag>
    <ns2:tag key="value">baz</ns2:tag>
</Document>
"""
        soup = self.soup(doc)

        # There are three <tag> tags.
        self.assertEqual(3, len(soup.find_all('tag')))

        # But two of them are ns1:tag and one of them is ns2:tag.
        self.assertEqual(2, len(soup.find_all('ns1:tag')))
        self.assertEqual(1, len(soup.find_all('ns2:tag')))

        # Prefixed names also combine with attribute filters and with lists
        # of names.
        self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
        self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))

    def test_copy_tag_preserves_namespace(self):
        """copy.copy() of a prefixed tag keeps the namespace prefix."""
        xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://example.com/ns0"/>"""

        soup = self.soup(xml)
        tag = soup.document
        duplicate = copy.copy(tag)

        # The two tags have the same namespace prefix.
        self.assertEqual(tag.prefix, duplicate.prefix)

    def test_worst_case(self):
        """Test the worst case (currently) for linking issues."""

        soup = self.soup(BAD_DOCUMENT)
        self.linkage_validator(soup)
1030

1031

1032
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        """Deliberately empty override.

        Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        XHTML documents in any particular way.
        """
        pass

    def test_html_tags_have_namespace(self):
        """A plain HTML tag is placed in the XHTML namespace."""
        tree = self.soup("<a>")
        self.assertEqual("http://www.w3.org/1999/xhtml", tree.a.namespace)

    def test_svg_tags_have_namespace(self):
        """<svg> and the tags nested in it are placed in the SVG namespace."""
        svg_namespace = "http://www.w3.org/2000/svg"
        tree = self.soup('<svg><circle/></svg>')
        self.assertEqual(svg_namespace, tree.svg.namespace)
        self.assertEqual(svg_namespace, tree.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        """<math> and the tags nested in it are placed in the MathML
        namespace."""
        mathml_namespace = 'http://www.w3.org/1998/Math/MathML'
        tree = self.soup('<math><msqrt>5</msqrt></math>')
        self.assertEqual(mathml_namespace, tree.math.namespace)
        self.assertEqual(mathml_namespace, tree.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        """An XML declaration in front of an HTML document is kept as a
        Comment node that is followed by the <html> tag."""
        tree = self.soup('<?xml version="1.0" encoding="utf-8"?><html></html>')
        first_node = tree.contents[0]
        self.assertIsInstance(first_node, Comment)
        self.assertEqual(first_node, '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", first_node.next_element.name)
1066

1067
def skipIf(condition, reason):
    """Return a decorator that neutralizes a test when ``condition`` is true.

    :param condition: When truthy, the decorated object is replaced by a
        stub that accepts any arguments and returns None. When falsy, the
        decorated object is returned unchanged.
    :param reason: Why the test is being disabled. It is accepted for the
        benefit of the reader at the call site and is not otherwise used.
    :return: A decorator taking the test item (function or class).

    Unlike ``unittest.skipIf``, nothing is reported as skipped: the
    replacement simply does nothing when called.
    """
    def decorator(test_item):
        if not condition:
            return test_item

        # Keep the decorated object's name and docstring so the stub is
        # still identifiable. updated=() avoids copying __dict__, which
        # would drag a decorated class's methods onto the stub.
        @functools.wraps(test_item, updated=())
        def nothing(*args, **kwargs):
            return None

        return nothing

    return decorator
1078

1079
Product

Resources

Company