CoCalc -- test

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/tests/test_tree.py
⁸¹¹ views
1
# -*- coding: utf-8 -*-
2
"""Tests for Beautiful Soup's tree traversal methods.
3

4
The tree traversal methods are the main advantage of using Beautiful
5
Soup over just using a parser.
6

7
Different parsers will build different Beautiful Soup trees given the
8
same markup, but all Beautiful Soup trees can be traversed with the
9
methods tested here.
10
"""
11

12
from pdb import set_trace
13
import copy
14
import pickle
15
import re
16
import warnings
17
from bs4 import BeautifulSoup
18
from bs4.builder import (
19
    builder_registry,
20
    HTMLParserTreeBuilder,
21
)
22
from bs4.element import (
23
    PY3K,
24
    CData,
25
    Comment,
26
    Declaration,
27
    Doctype,
28
    Formatter,
29
    NavigableString,
30
    Script,
31
    SoupStrainer,
32
    Stylesheet,
33
    Tag,
34
    TemplateString,
35
)
36
from bs4.testing import (
37
    SoupTest,
38
    skipIf,
39
)
40
from soupsieve import SelectorSyntaxError
41

42
# Record whether the builder registry can supply a builder for the
# "xml" and "lxml" features, so tests can adapt to what is installed.
XML_BUILDER_PRESENT = builder_registry.lookup("xml") is not None
LXML_PRESENT = builder_registry.lookup("lxml") is not None
44

45
class TreeTest(SoupTest):
    """Shared assertion helpers for the tree-traversal test cases."""

    def assertSelects(self, tags, should_match):
        """Check the ``.string`` of each tag against an expected list.

        Meant for tests whose markup defines many tags holding a single
        string apiece, where some search is expected to pick out
        particular ones. Order matters.
        """
        actual_strings = [element.string for element in tags]
        self.assertEqual(actual_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Check the ``id`` attribute of each tag against an expected list.

        Meant for tests whose markup defines many tags carrying an
        ``id`` attribute, where some search is expected to pick out
        particular ones. Order matters.
        """
        actual_ids = [element['id'] for element in tags]
        self.assertEqual(actual_ids, should_match)
64

65

66
class TestFind(TreeTest):
    """Smoke tests for the find() method.

    find() simply delegates to find_all() with limit=1, so it is not
    tested all that thoroughly here.
    """

    def test_find_tag(self):
        # With several <b> tags present, find() hands back the first one.
        doc = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        first_b = doc.find("b")
        self.assertEqual(first_b.string, "2")

    def test_unicode_text_find(self):
        # Non-ASCII text can be used as the string to search for.
        doc = self.soup('<h1>Räksmörgås</h1>')
        found = doc.find(string='Räksmörgås')
        self.assertEqual(found, 'Räksmörgås')

    def test_unicode_attribute_find(self):
        # Non-ASCII text can be used as an attribute value to search for.
        doc = self.soup('<h1 id="Räksmörgås">here it is</h1>')
        str(doc)
        heading = doc.find(id='Räksmörgås')
        self.assertEqual("here it is", heading.text)

    def test_find_everything(self):
        """Exercise the optimization used when finding all tags."""
        doc = self.soup("<a>foo</a><b>bar</b>")
        all_tags = doc.find_all()
        self.assertEqual(2, len(all_tags))

    def test_find_everything_with_name(self):
        """Exercise the optimization used when finding all tags by name."""
        doc = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        all_a_tags = doc.find_all('a')
        self.assertEqual(2, len(all_a_tags))
96

97
class TestFindAll(TreeTest):
    """Core behaviour of the find_all() method."""

    def test_find_all_text_nodes(self):
        """Text nodes can be searched for, not just tags."""
        doc = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        every_string = ["Foo", "bar", '\xbb']

        # A plain string is an exact match; 'string' and 'text' both work.
        self.assertEqual(doc.find_all(string="bar"), ["bar"])
        self.assertEqual(doc.find_all(text="bar"), ["bar"])

        # A list matches any one of its members.
        self.assertEqual(doc.find_all(text=["Foo", "bar"]), ["Foo", "bar"])

        # A compiled regular expression is tried against each string.
        self.assertEqual(doc.find_all(text=re.compile('.*')), every_string)

        # True matches every string.
        self.assertEqual(doc.find_all(text=True), every_string)

    def test_find_all_limit(self):
        """The 'limit' argument caps how many results come back."""
        doc = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        everything = ["1", "2", "3", "4", "5"]

        self.assertSelects(doc.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(doc.find_all('a', limit=1), ["1"])
        # A limit larger than the number of matches returns them all.
        self.assertSelects(doc.find_all('a', limit=10), everything)

        # Zero is treated as "no limit".
        self.assertSelects(doc.find_all('a', limit=0), everything)

    def test_calling_a_tag_is_calling_findall(self):
        # Calling the soup, or a tag, like a function runs find_all() on it.
        doc = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        via_soup = doc('a', limit=1)
        self.assertSelects(via_soup, ["1"])
        via_tag = doc.b(id="foo")
        self.assertSelects(via_tag, ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        doc = self.soup("<a></a>")
        # Build a list whose only member is the list itself.
        cyclic = []
        cyclic.append(cyclic)

        # _normalize_search_value needs special code for this case;
        # without it the search would recurse forever.
        self.assertEqual([], doc.find_all(cyclic))

    def test_find_all_resultset(self):
        """Every find_all() call hands back a ResultSet."""
        doc = self.soup("<a></a>")

        # The 'source' attribute is what marks the result as a ResultSet.
        by_name = doc.find_all("a")
        self.assertTrue(hasattr(by_name, "source"))

        by_true = doc.find_all(True)
        self.assertTrue(hasattr(by_true, "source"))

        by_text = doc.find_all(text="foo")
        self.assertTrue(hasattr(by_text, "source"))
154

155

156
class TestFindAllBasicNamespaces(TreeTest):
    """Searching by tag and attribute names that contain a colon prefix."""

    def test_find_by_namespaced_name(self):
        doc = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')

        # The prefixed tag name is usable as-is.
        by_tag_name = doc.find("mathml:msqrt")
        self.assertEqual("4", by_tag_name.string)

        # A prefixed attribute name has to go through the 'attrs' dict.
        by_attribute = doc.find(attrs={"svg:fill": "red"})
        self.assertEqual("a", by_attribute.name)
162

163

164
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        # Name this class (not TreeTest) in super() so that no class in
        # the MRO is skipped when the setUp() chain runs.
        super(TestFindAllByName, self).setUp()
        # Fixture: two top-level tags plus a <c> tag with a nested <a>.
        self.tree = self.soup("""<a>First tag.</a>
                                  <b>Second tag.</b>
                                  <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        # A tag name can be combined with a text criterion: an exact
        # string, True, or a regular expression.
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        # Calling the tree like a function is the same as find_all().
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        # A SoupStrainer can stand in for the tag name.
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        # A list of names matches tags with any of those names.
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        # A dict of names behaves like the list form.
        self.assertSelects(
            self.tree.find_all({'a' : True, 'b' : True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        # A regular expression is matched against each tag's name.
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])

    def test_find_with_multi_valued_attribute(self):
        # A class criterion containing a space is compared against the
        # complete class attribute value.
        soup = self.soup(
            "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>"
        )
        r1 = soup.find('div', 'a d')
        r2 = soup.find('div', re.compile(r'a d'))
        r3, r4 = soup.find_all('div', ['a b', 'a d'])
        self.assertEqual('3', r1.string)
        self.assertEqual('3', r2.string)
        self.assertEqual('1', r3.string)
        self.assertEqual('3', r4.string)
241

242
        
243
class TestFindAllByAttribute(TreeTest):
    """Test ways of finding tags by their attribute values."""

    def test_find_all_by_attribute_name(self):
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        # The attribute value may be given as UTF-8 bytes, as the
        # decoded string, or inside a list of candidates.
        peace = "םולש".encode("utf8")
        data = '<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        # A regular expression matches if it matches either class.
        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # If the search fails to match the individual strings "foo" and "bar",
        # it will be tried against the combined string "foo bar".
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, ["Found it"])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        # A regular expression or a function passed as 'attrs' is
        # applied to the CSS class.
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        # A SoupStrainer built from an 'attrs' dict can drive the search.
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_attribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value,
        # including the empty string.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        # A tag-name criterion and a text criterion must both hold.
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        # "bar" exists in the document, but not inside an <a> tag.
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        # The text criterion also matches a tag whose only string sits
        # several levels down.
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        # An attribute criterion and a text criterion must both hold.
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))
422

423

424
class TestSmooth(TreeTest):
    """Test Tag.smooth."""

    def test_smooth(self):
        doc = self.soup("<div>a</div>")
        container = doc.div

        # Pile extra strings and comments onto the <div> one at a time.
        extra_children = (
            "b", "c", Comment("Comment 1"), Comment("Comment 2"), "d")
        for child in extra_children:
            container.append(child)

        # Add a <span> that itself holds two separate strings.
        span = Tag(doc, self.default_builder(), 'span')
        for digit in ('1', '2'):
            span.append(digit)
        container.append(span)

        # The tree now contains runs of adjacent NavigableStrings.
        # That is normal, but it means nothing in terms of HTML, so
        # it can be worth smoothing out before output.

        # With two children, the <span> tag has no single .string.
        self.assertEqual(None, container.span.string)

        self.assertEqual(7, len(container.contents))
        container.smooth()
        children = container.contents
        self.assertEqual(5, len(children))

        # The three strings at the start of the contents have been
        # merged into one string.
        self.assertEqual('abc', children[0])

        # Smoothing recurses, so the <span> tag was smoothed too.
        self.assertEqual('12', container.span.string)

        # The two comments were _not_ merged, even though comments
        # are strings; merging them would change the meaning of the
        # HTML.
        self.assertEqual('Comment 1', children[1])
        self.assertEqual('Comment 2', children[2])
466

467

468
class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        # The fixture deliberately repeats equal-looking children, so
        # index() has to tell them apart.
        doc = self.soup("""<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>""")
        container = doc.div
        for position, child in enumerate(container.contents):
            self.assertEqual(position, container.index(child))

        # Something that is not a child at all raises ValueError.
        with self.assertRaises(ValueError):
            doc.index(1)
484

485

486
class TestParentOperations(TreeTest):
    """Navigating and searching upward through an element's parents."""

    def setUp(self):
        super(TestParentOperations, self).setUp()
        # Fixture: three nested <ul> tags, with a <b> tag at the bottom
        # that every test starts from.
        markup = '''<ul id="empty"></ul>
                                 <ul id="top">
                                  <ul id="middle">
                                   <ul id="bottom">
                                    <b>Start here</b>
                                   </ul>
                                  </ul>'''
        self.tree = self.soup(markup)
        self.start = self.tree.b

    def test_parent(self):
        # Each step through .parent moves out by one <ul>.
        node = self.start
        for expected_id in ('bottom', 'middle', 'top'):
            node = node.parent
            self.assertEqual(node['id'], expected_id)

    def test_parent_of_top_tag_is_soup_object(self):
        first_child = self.tree.contents[0]
        self.assertEqual(first_child.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        # Results come back innermost first.
        every_ul = self.start.find_parents('ul')
        self.assertSelectsIDs(every_ul, ['bottom', 'middle', 'top'])

        only_middle = self.start.find_parents('ul', id="middle")
        self.assertSelectsIDs(only_middle, ['middle'])

    def test_find_parent(self):
        nearest = self.start.find_parent('ul')
        self.assertEqual(nearest['id'], 'bottom')

        by_id = self.start.find_parent('ul', id='top')
        self.assertEqual(by_id['id'], 'top')

    def test_parent_of_text_element(self):
        text_node = self.tree.find(text="Start here")
        self.assertEqual(text_node.parent.name, 'b')

    def test_text_element_find_parent(self):
        text_node = self.tree.find(text="Start here")
        self.assertEqual(text_node.find_parent('ul')['id'], 'bottom')

    def test_parent_generator(self):
        # Collect the id of every ancestor that has one.
        ancestor_ids = []
        for ancestor in self.start.parents:
            if ancestor is None or 'id' not in ancestor.attrs:
                continue
            ancestor_ids.append(ancestor['id'])
        self.assertEqual(ancestor_ids, ['bottom', 'middle', 'top'])
535

536

537
class ProximityTest(TreeTest):
    """Base class providing the fixture for the next/previous tests."""

    def setUp(self):
        # Name this class (not TreeTest) in super() so that no class in
        # the MRO is skipped when the setUp() chain runs.
        super(ProximityTest, self).setUp()
        # Fixture: three <b> tags in a row inside <body>, each with an
        # id and a single string.
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
543

544

545
class TestNextOperations(ProximityTest):
    """Test .next_element and the find_next()/find_all_next() searches."""

    def setUp(self):
        super(TestNextOperations, self).setUp()
        # Every test starts from the first <b> tag in the fixture.
        self.start = self.tree.b

    def test_next(self):
        # The element after a tag is its own string; the one after
        # that is the following tag.
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        # The starting tag itself is not part of the results.
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        # The same searches work when starting from a text node.
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")
584

585
class TestPreviousOperations(ProximityTest):
    """Walking backwards through the document from its last string."""

    def setUp(self):
        super(TestPreviousOperations, self).setUp()
        # Every test starts from the final text node in the fixture.
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        # Just before the string is its own <b> tag; before that comes
        # the string of the preceding tag.
        enclosing_b = self.end.previous_element
        self.assertEqual(enclosing_b['id'], "3")
        self.assertEqual(enclosing_b.previous_element, "Two")

    def test_previous_of_first_item_is_none(self):
        html_tag = self.tree.find('html')
        self.assertEqual(html_tag.previous_element, None)

    def test_previous_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        # "Three" appears in the results because the <b> tag holding
        # the "Three" node comes before the node itself.
        preceding_b_tags = self.end.find_all_previous('b')
        self.assertSelects(preceding_b_tags, ["Three", "Two", "One"])

        by_id = self.end.find_all_previous(id=1)
        self.assertSelects(by_id, ["One"])

    def test_find_previous(self):
        nearest_b = self.end.find_previous('b')
        self.assertEqual(nearest_b['id'], '3')

        earlier_text = self.end.find_previous(text="One")
        self.assertEqual(earlier_text, "One")

    def test_find_previous_for_text_element(self):
        text_node = self.tree.find(text="Three")
        self.assertEqual(text_node.find_previous("b").string, "Three")

        preceding_b_tags = text_node.find_all_previous("b")
        self.assertSelects(preceding_b_tags, ["Three", "Two", "One"])

    def test_previous_generator(self):
        first_text = self.tree.find(text="One")
        before = list(first_text.previous_elements)

        # Four elements come before "One": its own <b> tag, then the
        # <body>, <head> and <html> tags.
        b_tag, body_tag, head_tag, html_tag = before
        self.assertEqual(b_tag['id'], '1')
        self.assertEqual(body_tag.name, "body")
        self.assertEqual(head_tag.name, "head")
        self.assertEqual(html_tag.name, "html")
634

635

636
class SiblingTest(TreeTest):
    """Base class providing the fixture for the sibling tests."""

    def setUp(self):
        super(SiblingTest, self).setUp()
        # Four sibling <span> tags; the first three each wrap one
        # nested <span>.
        pretty_markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # The layout above is easy to read, but the whitespace would
        # complicate the tests, so drop each newline together with the
        # indentation that follows it.
        compact_markup = re.sub(r"\n\s*", "", pretty_markup)
        self.tree = self.soup(compact_markup)
656

657

658
class TestNextSibling(SiblingTest):
    """Moving forward between siblings from the first <span>."""

    def setUp(self):
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        second = self.start.next_sibling
        self.assertEqual(second['id'], '2')
        self.assertEqual(second.next_sibling['id'], '3')

        # Unlike next_sibling, next_element goes down into the tag.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.next_sibling, None)

        # Neither an only child nor the last span has a next sibling.
        for span_id in ("1.1", "4"):
            span = self.tree.find(id=span_id)
            self.assertEqual(span.next_sibling, None)

    def test_find_next_sibling(self):
        nearest = self.start.find_next_sibling('span')
        self.assertEqual(nearest['id'], '2')

    def test_next_siblings(self):
        later_spans = self.start.find_next_siblings("span")
        self.assertSelectsIDs(later_spans, ['2', '3', '4'])

        by_id = self.start.find_next_siblings(id='3')
        self.assertSelectsIDs(by_id, ['3'])

    def test_next_sibling_for_text_element(self):
        # Sibling navigation also works when starting from a string.
        doc = self.soup("Foo<b>bar</b>baz")
        first_text = doc.find(text="Foo")
        b_tag = first_text.next_sibling
        self.assertEqual(b_tag.name, 'b')
        self.assertEqual(b_tag.next_sibling, 'baz')

        self.assertSelects(first_text.find_next_siblings('b'), ['bar'])
        self.assertEqual(first_text.find_next_sibling(text="baz"), "baz")
        self.assertEqual(first_text.find_next_sibling(text="nonesuch"), None)
701

702

703
class TestPreviousSibling(SiblingTest):
    """Moving backward between siblings from the last <span>."""

    def setUp(self):
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        third = self.end.previous_sibling
        self.assertEqual(third['id'], '3')
        self.assertEqual(third.previous_sibling['id'], '2')

        # Unlike previous_sibling, previous_element reaches into the
        # preceding tag.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.previous_sibling, None)

        # Neither an only child nor the first span has a previous sibling.
        for span_id in ("1.1", "1"):
            span = self.tree.find(id=span_id)
            self.assertEqual(span.previous_sibling, None)

    def test_find_previous_sibling(self):
        nearest = self.end.find_previous_sibling('span')
        self.assertEqual(nearest['id'], '3')

    def test_previous_siblings(self):
        # Results come back nearest first.
        earlier_spans = self.end.find_previous_siblings("span")
        self.assertSelectsIDs(earlier_spans, ['3', '2', '1'])

        by_id = self.end.find_previous_siblings(id='1')
        self.assertSelectsIDs(by_id, ['1'])

    def test_previous_sibling_for_text_element(self):
        # Sibling navigation also works when starting from a string.
        doc = self.soup("Foo<b>bar</b>baz")
        last_text = doc.find(text="baz")
        b_tag = last_text.previous_sibling
        self.assertEqual(b_tag.name, 'b')
        self.assertEqual(b_tag.previous_sibling, 'Foo')

        self.assertSelects(last_text.find_previous_siblings('b'), ['bar'])
        self.assertEqual(last_text.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(
            last_text.find_previous_sibling(text="nonesuch"), None)
746

747

748
class TestTag(SoupTest):
    """Test various methods of Tag."""

    def test__should_pretty_print(self):
        # Test the rules about when a tag should be pretty-printed.
        tag = self.soup("").new_tag("a_tag")

        # No list of whitespace-preserving tags -> pretty-print
        # (Set the same attribute the cases below use; the previous
        # underscore-prefixed name only created an unrelated attribute.)
        tag.preserve_whitespace_tags = None
        self.assertEqual(True, tag._should_pretty_print(0))

        # List exists but tag is not on the list -> pretty-print
        tag.preserve_whitespace_tags = ["some_other_tag"]
        self.assertEqual(True, tag._should_pretty_print(1))

        # Indent level is None -> don't pretty-print
        self.assertEqual(False, tag._should_pretty_print(None))

        # Tag is on the whitespace-preserving list -> don't pretty-print
        tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"]
        self.assertEqual(False, tag._should_pretty_print(1))
770

771
        
772
class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""

    def test_new_tag(self):
        # Keyword arguments and the explicit attrs dict are both
        # expected to end up in the new tag's attributes.
        soup = self.soup("")
        new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
        self.assertIsInstance(new_tag, Tag)
        self.assertEqual("foo", new_tag.name)
        self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
        self.assertEqual(None, new_tag.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        # The XML half only runs when an XML builder is registered.
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "lxml-xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tag are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())

    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        s = soup.new_string("foo")
        self.assertEqual("foo", s)
        self.assertIsInstance(s, NavigableString)

    def test_new_string_can_create_navigablestring_subclass(self):
        # The second argument selects the string class to instantiate.
        soup = self.soup("")
        s = soup.new_string("foo", Comment)
        self.assertEqual("foo", s)
        self.assertIsInstance(s, Comment)
813

814
class TestTreeModification(SoupTest):
815

816
    def test_attribute_modification(self):
        # Overwriting, deleting and adding an attribute must each be
        # visible when the document is serialized again.
        soup = self.soup('<a id="1"></a>')
        anchor = soup.a

        anchor['id'] = 2
        rendered = soup.decode()
        self.assertEqual(rendered, self.document_for('<a id="2"></a>'))

        del anchor['id']
        rendered = soup.decode()
        self.assertEqual(rendered, self.document_for('<a></a>'))

        anchor['id2'] = 'foo'
        rendered = soup.decode()
        self.assertEqual(rendered, self.document_for('<a id2="foo"></a>'))
824

825
    def test_new_tag_creation(self):
        # Tags built directly through the Tag constructor can be
        # inserted into a parsed tree and are serialized with it.
        html_builder = builder_registry.lookup('html')()
        soup = self.soup("<body></body>", builder=html_builder)
        body = soup.body

        link = Tag(soup, html_builder, 'a')
        ordered_list = Tag(soup, html_builder, 'ol')
        link['href'] = 'http://foo.com/'

        for position, new_tag in enumerate((link, ordered_list)):
            body.insert(position, new_tag)

        expected = b'<body><a href="http://foo.com/"></a><ol></ol></body>'
        self.assertEqual(body.encode(), expected)
836

837
    def test_append_to_contents_moves_tag(self):
        # Appending a tag that already lives in the tree relocates it
        # rather than copying it.
        markup = """<p id="1">Don't leave me <b>here</b>.</p>
                <p id="2">Don\'t leave!</p>"""
        soup = self.soup(markup)
        destination = soup.find(id='2')
        bold_tag = soup.b

        # Send the <b> tag to the tail of the second paragraph.
        destination.append(bold_tag)

        # Its parent is now the second paragraph.
        self.assertEqual(bold_tag.parent, destination)

        expected_markup = (
            '<p id="1">Don\'t leave me .</p>\n'
            '<p id="2">Don\'t leave!<b>here</b></p>')
        self.assertEqual(
            soup.decode(), self.document_for(expected_markup))
854

855
    def test_replace_with_returns_thing_that_was_replaced(self):
        # replace_with() hands back the element that was swapped out.
        soup = self.soup("<a></a><b><c></c></b>")
        original = soup.a
        returned = original.replace_with(soup.c)
        self.assertEqual(original, returned)
861

862
    def test_unwrap_returns_thing_that_was_replaced(self):
        # unwrap() hands back the tag that was removed from the tree.
        soup = self.soup("<a><b></b><c></c></a>")
        wrapper = soup.a
        returned = wrapper.unwrap()
        self.assertEqual(wrapper, returned)
868

869
    def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
        # Once a tag has been pulled out of the tree it has no parent,
        # so unwrapping or replacing it must fail with ValueError.
        soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
        orphan = soup.a
        orphan.extract()
        self.assertEqual(None, orphan.parent)

        with self.assertRaises(ValueError):
            orphan.unwrap()

        replacement = soup.c
        with self.assertRaises(ValueError):
            orphan.replace_with(replacement)
876

877
    def test_replace_tag_with_itself(self):
        # Replacing a tag with itself must leave the document as it was.
        markup = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(markup)
        c_tag = soup.c
        c_tag.replace_with(c_tag)
        self.assertEqual(soup.decode(), self.document_for(markup))
883

884
    def test_replace_tag_with_its_parent_raises_exception(self):
        # A tag cannot be replaced by the tag that contains it.
        soup = self.soup("<a><b></b></a>")
        replace_child = soup.b.replace_with
        parent = soup.a
        with self.assertRaises(ValueError):
            replace_child(parent)
888

889
    def test_insert_tag_into_itself_raises_exception(self):
        # A tag cannot become one of its own children.
        soup = self.soup("<a><b></b></a>")
        insert_into_a = soup.a.insert
        a_tag = soup.a
        with self.assertRaises(ValueError):
            insert_into_a(0, a_tag)
893

894
    def test_insert_beautifulsoup_object_inserts_children(self):
        """Inserting one BeautifulSoup object into another actually inserts all
        of its children -- you'll never combine BeautifulSoup objects.
        """
        soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")

        text = "<p>p2</p><p>p3</p>"
        to_insert = self.soup(text)
        soup.insert(1, to_insert)

        # A unittest assertion is used instead of a bare ``assert`` so
        # the check is not compiled away under ``python -O``.
        for i in soup.descendants:
            self.assertNotIsInstance(i, BeautifulSoup)

        # The two inserted paragraphs sit between the original two.
        p1, p2, p3, p4 = list(soup.children)
        self.assertEqual("And now, a word:", p1.string)
        self.assertEqual("p2", p2.string)
        self.assertEqual("p3", p3.string)
        self.assertEqual("And we're back.", p4.string)
912
        
913
        
914
    def test_replace_with_maintains_next_element_throughout(self):
        # Replacing every text child of a tag must not disconnect the
        # elements that follow that tag.
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a
        # Make it so the <a> tag has two text children.
        a.insert(1, "two")

        # Now replace each one with the empty string. Uses the
        # snake_case replace_with() like the rest of this class.
        left, right = a.contents
        left.replace_with('')
        right.replace_with('')

        # The <b> tag is still connected to the tree.
        self.assertEqual("three", soup.b.string)
928

929
    def test_replace_final_node(self):
        # Replacing the last node in the document must leave the new
        # node correctly linked, with nothing after it.
        soup = self.soup("<b>Argh!</b>")
        old_text = soup.find(text="Argh!")
        old_text.replace_with("Hooray!")

        replacement = soup.find(text="Hooray!")
        bold = soup.b
        self.assertEqual(replacement.previous_element, bold)
        self.assertEqual(replacement.parent, bold)
        self.assertEqual(
            replacement.previous_element.next_element, replacement)
        self.assertEqual(replacement.next_element, None)
938

939
    def test_consecutive_text_nodes(self):
        # Builders are not supposed to emit two adjacent text nodes,
        # but inserting one text node right after another by hand
        # must still produce a correctly linked tree.
        tree = self.soup("<a><b>Argh!</b><c></c></a>")
        tree.b.insert(1, "Hooray!")

        rendered = tree.decode()
        self.assertEqual(
            rendered,
            self.document_for("<a><b>Argh!Hooray!</b><c></c></a>"))

        inserted = tree.find(text="Hooray!")

        # Links to the element before it.
        self.assertEqual(inserted.previous_element, "Argh!")
        self.assertEqual(inserted.previous_element.next_element, inserted)

        # Links to the sibling before it.
        self.assertEqual(inserted.previous_sibling, "Argh!")
        self.assertEqual(inserted.previous_sibling.next_sibling, inserted)

        # It is the last child of <b>; the next element is <c>.
        self.assertEqual(inserted.next_sibling, None)
        self.assertEqual(inserted.next_element, tree.c)
959

960
    def test_insert_string(self):
        soup = self.soup("<a></a>")
        anchor = soup.a
        # Insert at the front twice, so the second string ends up first.
        for plain_text in ("bar", "foo"):
            anchor.insert(0, plain_text)

        # Both strings are now children of the tag.
        self.assertEqual(["foo", "bar"], anchor.contents)
        # The children expose next_element, an attribute plain Python
        # strings do not have.
        first_child = anchor.contents[0]
        self.assertEqual(first_child.next_element, "bar")
968

969
    def test_insert_tag(self):
        # Insert a brand-new tag between two existing siblings and
        # check every link around it.
        tree_builder = self.default_builder()
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=tree_builder)
        inserted = Tag(soup, tree_builder, 'magictag')
        inserted.insert(0, "the")
        soup.a.insert(1, inserted)

        expected = self.document_for(
            "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>")
        self.assertEqual(soup.decode(), expected)

        # Sibling links between <b> and the new tag.
        before = soup.b
        self.assertEqual(before.next_sibling, inserted)
        self.assertEqual(inserted.previous_sibling, before)

        # Element links between the text inside <b> and the new tag.
        find_text = before.find(text="Find")
        self.assertEqual(find_text.next_element, inserted)
        self.assertEqual(inserted.previous_element, find_text)

        # Sibling links between the new tag and <c>.
        after = soup.c
        self.assertEqual(inserted.next_sibling, after)
        self.assertEqual(after.previous_sibling, inserted)

        # The text inside the new tag leads on to <c>.
        inner_text = inserted.find(text="the")
        self.assertEqual(inner_text.parent, inserted)
        self.assertEqual(inner_text.next_element, after)
        self.assertEqual(after.previous_element, inner_text)
998

999
    def test_append_child_thats_already_at_the_end(self):
        # Appending a tag that is already the last child changes nothing.
        markup = "<a><b></b></a>"
        soup = self.soup(markup)
        last_child = soup.b
        soup.a.append(last_child)
        self.assertEqual(markup, soup.decode())
1004

1005
    def test_extend(self):
        # extend() appends each listed tag in turn; tags already in the
        # tree are moved, which flattens the nesting here.
        soup = self.soup("<a><b><c><d><e><f><g></g></f></e></d></c></b></a>")
        innermost_first = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
        soup.a.extend(innermost_first)
        self.assertEqual(
            "<a><g></g><f></f><e></e><d></d><c></c><b></b></a>",
            soup.decode())
1011

1012
    def test_move_tag_to_beginning_of_parent(self):
        # Inserting an existing last child at index 0 moves it to the front.
        soup = self.soup("<a><b></b><c></c><d></d></a>")
        parent = soup.a
        parent.insert(0, soup.d)
        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())
1017

1018
    def test_insert_works_on_empty_element_tag(self):
        # Slightly odd: most HTML parsers would never let markup like
        # this through. We cannot know in general what a parser would
        # have accepted, though, so for now the insert is allowed.
        soup = self.soup("<br/>")
        line_break = soup.br
        line_break.insert(1, "Contents")
        self.assertEqual(str(soup.br), "<br>Contents</br>")
1026

1027
    def test_insert_before(self):
        # insert_before() puts new content directly ahead of an element.
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ")
        soup.a.insert_before("QUUX")
        expected = self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>")
        self.assertEqual(soup.decode(), expected)

        # A tag that is already in the tree gets moved.
        soup.a.insert_before(soup.b)
        expected = self.document_for("QUUX<b>bar</b><a>foo</a>BAZ")
        self.assertEqual(soup.decode(), expected)

        # An element cannot be inserted before itself.
        bold = soup.b
        with self.assertRaises(ValueError):
            bold.insert_before(bold)

        # Nor can anything go before an element without a parent.
        bold.extract()
        with self.assertRaises(ValueError):
            bold.insert_before("nope")

        # A distinct but identical-looking element is fine.
        fresh_soup = self.soup("<a>")
        fresh_soup.a.insert_before(fresh_soup.new_tag("a"))
1049
        
1050
    def test_insert_multiple_before(self):
        # insert_before() takes several arguments and keeps their order.
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ", " ", "QUUX")
        soup.a.insert_before("QUUX", " ", "BAZ")
        expected = self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>")
        self.assertEqual(soup.decode(), expected)

        # An existing tag and a new string can be mixed in one call.
        soup.a.insert_before(soup.b, "FOO")
        expected = self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX")
        self.assertEqual(soup.decode(), expected)
1060

1061
    def test_insert_after(self):
        # insert_after() puts new content directly behind an element.
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")
        soup.a.insert_after("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
        soup.b.insert_after(soup.a)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

        # Can't insert an element after itself.
        b = soup.b
        self.assertRaises(ValueError, b.insert_after, b)

        # Can't insert after if an element has no parent.
        b.extract()
        self.assertRaises(ValueError, b.insert_after, "nope")

        # Can insert an identical element. This must exercise
        # insert_after(); the original called insert_before() here,
        # copied over from test_insert_before.
        soup = self.soup("<a>")
        soup.a.insert_after(soup.new_tag("a"))
1082
        
1083
    def test_insert_multiple_after(self):
        # insert_after() takes several arguments and keeps their order.
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ", " ", "QUUX")
        soup.a.insert_after("QUUX", " ", "BAZ")
        expected = self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX")
        self.assertEqual(soup.decode(), expected)

        # An existing tag and a new string can be mixed in one call.
        soup.b.insert_after(soup.a, "FOO ")
        expected = self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX")
        self.assertEqual(soup.decode(), expected)
1092

1093
    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        new_tag = soup.new_tag("a")
        new_string = soup.new_string("")

        # Neither the string nor the tag is attached to anything, and
        # the soup object itself does not support the operation.
        with self.assertRaises(ValueError):
            new_string.insert_after(new_tag)
        with self.assertRaises(NotImplementedError):
            soup.insert_after(new_tag)
        with self.assertRaises(ValueError):
            new_tag.insert_after(new_tag)
1100

1101
    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        new_tag = soup.new_tag("a")
        new_string = soup.new_string("")

        # Neither the string nor the tag is attached to anything, and
        # the soup object itself does not support the operation.
        with self.assertRaises(ValueError):
            new_string.insert_before(new_tag)
        with self.assertRaises(NotImplementedError):
            soup.insert_before(new_tag)
        with self.assertRaises(ValueError):
            new_tag.insert_before(new_tag)
1108

1109
    def test_replace_with(self):
        # Replace the second <b> with the first; the first is moved
        # and the second drops out of the tree.
        soup = self.soup(
            "<p>There's <b>no</b> business like <b>show</b> business</p>")
        first_bold, second_bold = soup.find_all('b')
        second_bold.replace_with(first_bold)

        expected = self.document_for(
            "<p>There's  business like <b>no</b> business</p>")
        self.assertEqual(soup.decode(), expected)

        # The replaced tag is detached; the moved tag is linked in.
        self.assertEqual(second_bold.parent, None)
        self.assertEqual(first_bold.parent, soup.p)
        self.assertEqual(first_bold.next_element, "no")
        self.assertEqual(first_bold.next_sibling, " business")
1123

1124
    def test_replace_first_child(self):
        # Replacing the first child with its sibling leaves only the sibling.
        soup = self.soup("<a><b></b><c></c></a>")
        first_child = soup.b
        first_child.replace_with(soup.c)
        self.assertEqual("<a><c></c></a>", soup.decode())
1129

1130
    def test_replace_last_child(self):
        # Replacing the last child with its sibling leaves only the sibling.
        soup = self.soup("<a><b></b><c></c></a>")
        last_child = soup.c
        last_child.replace_with(soup.b)
        self.assertEqual("<a><b></b></a>", soup.decode())
1135

1136
    def test_nested_tag_replace_with(self):
        soup = self.soup(
            "<a>We<b>reserve<c>the</c><d>right</d></b></a>"
            "<e>to<f>refuse</f><g>service</g></e>")

        # Swap the whole <b> tag ("reserve the right") for the
        # <f> tag ("refuse"), which lives in another branch.
        outgoing = soup.b
        incoming = soup.f
        outgoing.replace_with(incoming)

        expected = self.document_for(
            "<a>We<f>refuse</f></a><e>to<g>service</g></e>")
        self.assertEqual(soup.decode(), expected)

        # <b> has been cut loose from the tree.
        self.assertEqual(outgoing.parent, None)
        last_inner_text = outgoing.find(text="right")
        self.assertEqual(last_inner_text.next_element, None)
        self.assertEqual(outgoing.previous_element, None)
        self.assertEqual(outgoing.next_sibling, None)
        self.assertEqual(outgoing.previous_sibling, None)

        # <f> now hangs off <a>.
        self.assertEqual(incoming.parent, soup.a)
        self.assertEqual(incoming.previous_element, "We")
        self.assertEqual(incoming.next_element.next_element, soup.e)
        self.assertEqual(incoming.next_sibling, None)

        # The hole <f> left behind is closed: "to" links straight
        # to <g> and back again.
        to_text = soup.find(text="to")
        g_tag = soup.g
        self.assertEqual(to_text.next_element, g_tag)
        self.assertEqual(to_text.next_sibling, g_tag)
        self.assertEqual(g_tag.previous_element, to_text)
        self.assertEqual(g_tag.previous_sibling, to_text)
1171

1172
    def test_unwrap(self):
        # unwrap() removes a tag but keeps what was inside it.
        markup = """
            <p>Unneeded <em>formatting</em> is unneeded</p>
            """
        tree = self.soup(markup)
        emphasis = tree.em
        emphasis.unwrap()
        self.assertEqual(tree.em, None)
        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
1179

1180
    def test_wrap(self):
        # wrap() encloses the element in the given tag and returns
        # that wrapper.
        soup = self.soup("I wish I was bold.")
        wrapper = soup.new_tag("b")
        returned = soup.string.wrap(wrapper)
        self.assertEqual(returned.decode(), "<b>I wish I was bold.</b>")
        expected = self.document_for("<b>I wish I was bold.</b>")
        self.assertEqual(soup.decode(), expected)
1186

1187
    def test_wrap_extracts_tag_from_elsewhere(self):
        # A wrapper that already sits in the tree is moved, not copied.
        soup = self.soup("<b></b>I wish I was bold.")
        trailing_text = soup.b.next_sibling
        trailing_text.wrap(soup.b)
        expected = self.document_for("<b>I wish I was bold.</b>")
        self.assertEqual(soup.decode(), expected)
1192

1193
    def test_wrap_puts_new_contents_at_the_end(self):
        # When the wrapper already has contents, the wrapped element
        # is added after them.
        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
        trailing_text = soup.b.next_sibling
        trailing_text.wrap(soup.b)
        self.assertEqual(2, len(soup.b.contents))
        expected = self.document_for(
            "<b>I like being bold.I wish I was bold.</b>")
        self.assertEqual(soup.decode(), expected)
1200

1201
    def test_extract(self):
        markup = ('<html><body>Some content. <div id="nav">Nav crap</div>'
                  ' More content.</body></html>')
        soup = self.soup(markup)

        self.assertEqual(len(soup.body.contents), 3)
        nav = soup.find(id="nav")
        removed = nav.extract()

        # The document no longer contains the <div>; the return value
        # is the removed tag.
        self.assertEqual(
            soup.decode(),
            "<html><body>Some content.  More content.</body></html>")
        self.assertEqual(removed.decode(), '<div id="nav">Nav crap</div>')

        # The removed tag has no remaining ties to the tree.
        self.assertEqual(len(soup.body.contents), 2)
        self.assertEqual(removed.parent, None)
        self.assertEqual(removed.previous_element, None)
        self.assertEqual(removed.next_element.next_element, None)

        # The two text nodes either side of the hole are now linked
        # to each other directly.
        before = soup.find(text="Some content. ")
        after = soup.find(text=" More content.")
        self.assertEqual(before.next_element, after)
        self.assertEqual(before.next_sibling, after)
        self.assertEqual(after.previous_element, before)
        self.assertEqual(after.previous_sibling, before)
1225

1226
    def test_extract_distinguishes_between_identical_strings(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        foo_1 = soup.a.string
        bar_1 = soup.b.string
        foo_2 = soup.new_string("foo")
        bar_2 = soup.new_string("bar")
        soup.a.append(foo_2)
        soup.b.append(bar_2)

        # Now there are two identical strings in the <a> tag, and two
        # in the <b> tag. Let's remove the first "foo" and the second
        # "bar".
        foo_1.extract()
        bar_2.extract()

        # Equal strings compare equal by value, so an equality check
        # cannot tell which of the two was removed. Check identity
        # against the string that should have survived in each tag:
        # the second "foo" and the first "bar".
        self.assertIs(foo_2, soup.a.string)
        self.assertIs(bar_1, soup.b.string)
1242

1243
    def test_extract_multiples_of_same_tag(self):
1244
        soup = self.soup("""
1245
<html>
1246
<head>
1247
<script>foo</script>
1248
</head>
1249
<body>
1250
 <script>bar</script>
1251
 <a></a>
1252
</body>
1253
<script>baz</script>
1254
</html>""")
1255
        [soup.script.extract() for i in soup.find_all("script")]
1256
        self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
1257

1258

1259
    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
        # <body> has the same "\n" text node on both sides of it.
        markup = ('<html>\n'
                  '<body>hi</body>\n'
                  '</html>')
        soup = self.soup(markup)
        body = soup.find('body')
        body.extract()
        self.assertEqual(None, soup.find('body'))
1266

1267

1268
    def test_clear(self):
        """Tag.clear()"""
        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")

        # Default mode: the children are removed from the paragraph,
        # and a removed child still has its own contents attribute.
        anchor = soup.a
        paragraph = soup.p
        paragraph.clear()
        self.assertEqual(len(soup.p.contents), 0)
        self.assertTrue(hasattr(anchor, "contents"))

        # decompose=True: the removed child ends up empty as well.
        emphasis = anchor.em
        anchor.clear(decompose=True)
        self.assertEqual(0, len(emphasis.contents))
1281

1282
       
1283
    def test_decompose(self):
        # Covers PageElement.decompose() and PageElement.decomposed.
        soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
        first_para, second_para = soup.find_all('p')
        anchor = first_para.a
        inner_text = first_para.em.string

        # Nothing is decomposed to begin with.
        for element in (first_para, second_para, anchor, inner_text):
            self.assertEqual(False, element.decomposed)

        # Decomposing the first paragraph flags it and everything
        # underneath it.
        first_para.decompose()
        for element in (first_para, anchor, inner_text):
            self.assertEqual(True, element.decomposed)
        # The second paragraph is left alone.
        self.assertEqual(False, second_para.decomposed)
1298
            
1299
    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")

        # An empty tag gains the string as its only child.
        empty_tag = soup.a
        empty_tag.string = "foo"
        self.assertEqual(soup.a.contents, ["foo"])

        # A tag with existing children has them replaced.
        nested_tag = soup.b
        nested_tag.string = "bar"
        self.assertEqual(soup.b.contents, ["bar"])
1306

1307
    def test_string_set_does_not_affect_original_string(self):
        # Assigning another tag's string must not take it away from
        # the tag it came from.
        soup = self.soup("<a><b>foo</b><c>bar</c>")
        source_string = soup.c.string
        soup.b.string = source_string
        self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")
1311

1312
    def test_set_string_preserves_class_of_string(self):
        # Assigning a CData object to .string must leave a CData
        # object in the tag.
        soup = self.soup("<a></a>")
        cdata = CData("foo")
        soup.a.string = cdata
        # assertIsInstance reports the actual type when it fails,
        # unlike assertTrue(isinstance(...)).
        self.assertIsInstance(soup.a.string, CData)
1317

1318
class TestElementObjects(SoupTest):
1319
    """Test various features of element objects."""
1320

1321
    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The soup object holds exactly one element, the <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # <top> holds three: text "1", the <b> tag, and text "3".
        top = soup.top
        self.assertEqual(len(top), 3)
        self.assertEqual(len(top.contents), 3)
1334

1335
    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')

        found_b = soup.find('b')
        self.assertEqual(soup.b, found_b)

        found_i = soup.find('b').find('i')
        self.assertEqual(soup.b.i, found_i)

        # A tag name that is absent from the document gives None.
        self.assertEqual(soup.a, None)
1341

1342
    def test_deprecated_member_access(self):
        # The old ".fooTag" spelling still finds the tag but issues a
        # warning that points at .find().
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            # Record the warning even if an identical one was already
            # issued earlier in this process; without this, the filters
            # in effect may suppress the repeat and leave `w` empty.
            warnings.simplefilter("always")
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
            str(w[0].message))
1350

1351
    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from the ``in``
        operator (__contains__). has_attr() checks the tag's
        attributes and ``in`` checks the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))
1361

1362

1363
    def test_attributes_come_out_in_alphabetical_order(self):
        # Attributes written in arbitrary order are expected back
        # sorted by name.
        unsorted_markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        sorted_markup = '<b a="1" f="2" m="3" y="4" z="5"></b>'
        self.assertSoupEquals(unsorted_markup, sorted_markup)
1366

1367
    def test_string(self):
        # When a tag's only child is a text node, that node is
        # reachable through .string.
        bold = self.soup("<b>foo</b>").b
        self.assertEqual(bold.string, 'foo')
1372

1373
    def test_empty_tag_has_no_string(self):
        # A tag with no children has no .string.
        bold = self.soup("<b></b>").b
        self.assertEqual(bold.string, None)
1377

1378
    def test_tag_with_multiple_children_has_no_string(self):
        # A tag with more than one child has no .string.
        #
        # The assertions look at <a>, the tag that actually has
        # several children. The original checked the empty <b>, which
        # only repeats test_empty_tag_has_no_string.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.a.string, None)

        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.a.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)
1391

1392
    def test_tag_with_recursive_string_has_string(self):
        # If a tag's single child itself has a .string, the tag
        # exposes that same .string, and so does the soup object.
        soup = self.soup("<a><b>foo</b></a>")
        outer = soup.a
        self.assertEqual(outer.string, "foo")
        self.assertEqual(soup.string, "foo")
1398

1399
    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        # Text mixed with a child tag.
        mixed = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(mixed.b.string)

        # No children at all.
        empty = self.soup("<b></b>")
        self.assertFalse(empty.b.string)
1406

1407
    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b>   <r> t </r></a>")
        anchor = soup.a

        # No separator: with and without stripping.
        self.assertEqual(anchor.text, "ar  t ")
        self.assertEqual(anchor.get_text(strip=True), "art")

        # Comma separator: with and without stripping.
        self.assertEqual(anchor.get_text(","), "a,r, , t ")
        self.assertEqual(anchor.get_text(",", strip=True), "a,r,t")
1414

1415
    def test_get_text_ignores_special_string_containers(self):
        # Comment text is left out by default...
        with_comment = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(with_comment.get_text(), "foobar")

        # ...but comes back when asked for through `types`.
        both_types = with_comment.get_text(types=(NavigableString, Comment))
        self.assertEqual(both_types, "fooIGNOREbar")
        any_type = with_comment.get_text(types=None)
        self.assertEqual(any_type, "fooIGNOREbar")

        # Stylesheet and script bodies are left out as well.
        with_code = self.soup(
            "foo<style>CSS</style><script>Javascript</script>bar")
        self.assertEqual(with_code.get_text(), "foobar")
1426
        
1427
    def test_all_strings_ignores_special_string_containers(self):
        # .strings skips the comment...
        with_comment = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(with_comment.strings))

        # ...and the contents of <style> and <script>.
        with_code = self.soup(
            "foo<style>CSS</style><script>Javascript</script>bar")
        self.assertEqual(['foo', 'bar'], list(with_code.strings))
1433

1434

1435
class TestCDAtaListAttributes(SoupTest):
    """Checks for cdata-list attributes such as 'class', whose value
    is a whitespace-separated list.
    """

    def test_single_value_becomes_list(self):
        anchor = self.soup("<a class='foo'>").a
        self.assertEqual(["foo"], anchor['class'])

    def test_multiple_values_becomes_list(self):
        anchor = self.soup("<a class='foo bar'>").a
        self.assertEqual(["foo", "bar"], anchor['class'])

    def test_multiple_values_separated_by_weird_whitespace(self):
        # Tabs and newlines separate values just like spaces do.
        anchor = self.soup("<a class='foo\tbar\nbaz'>").a
        self.assertEqual(["foo", "bar", "baz"], anchor['class'])

    def test_attributes_joined_into_string_on_output(self):
        anchor = self.soup("<a class='foo\tbar'>").a
        self.assertEqual(b'<a class="foo bar"></a>', anchor.encode())

    def test_get_attribute_list(self):
        # 'id' is not split, yet get_attribute_list() still returns a list.
        anchor = self.soup("<a id='abc def'>").a
        self.assertEqual(['abc def'], anchor.get_attribute_list('id'))

    def test_accept_charset(self):
        form = self.soup('<form accept-charset="ISO-8859-1 UTF-8">').form
        self.assertEqual(['ISO-8859-1', 'UTF-8'], form['accept-charset'])

    def test_cdata_attribute_applying_only_to_one_tag(self):
        markup = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        anchor = self.soup(markup).a
        # test_accept_charset shows accept-charset being split into a
        # list on a <form> tag. On any other tag it stays one string.
        self.assertEqual('ISO-8859-1 UTF-8', anchor['accept-charset'])

    def test_string_has_immutable_name_property(self):
        string = self.soup("s").string
        self.assertEqual(None, string.name)
        # Assigning to .name on a string must be rejected.
        with self.assertRaises(AttributeError):
            string.name = 'foo'
1477

1478
class TestPersistence(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        # Build one moderately complex document that the pickle and
        # deepcopy round-trip tests share.
        super(TestPersistence, self).setUp()
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:[email protected]">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        dumped = pickle.dumps(self.tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), self.tree.decode())

    def test_deepcopy_identity(self):
        # Making a deepcopy of a tree yields an identical tree.
        copied = copy.deepcopy(self.tree)
        self.assertEqual(copied.decode(), self.tree.decode())

    def test_copy_preserves_encoding(self):
        soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
        encoding = soup.original_encoding
        # Named soup_copy so the local does not shadow the imported
        # `copy` module.
        soup_copy = soup.__copy__()
        # &nbsp; is rendered as a literal no-break space; spell it with
        # an escape so the expected value is visible in the source.
        self.assertEqual("<p>\N{NO-BREAK SPACE}</p>", str(soup_copy))
        self.assertEqual(encoding, soup_copy.original_encoding)

    def test_copy_preserves_builder_information(self):

        tag = self.soup('<p></p>').p

        # Simulate a tag obtained from a source file.
        tag.sourceline = 10
        tag.sourcepos = 33

        copied = tag.__copy__()

        # The TreeBuilder object is no longer available, but information
        # obtained from it gets copied over to the new Tag object.
        self.assertEqual(tag.sourceline, copied.sourceline)
        self.assertEqual(tag.sourcepos, copied.sourcepos)
        self.assertEqual(
            tag.can_be_empty_element, copied.can_be_empty_element
        )
        self.assertEqual(
            tag.cdata_list_attributes, copied.cdata_list_attributes
        )
        self.assertEqual(
            tag.preserve_whitespace_tags, copied.preserve_whitespace_tags
        )

    def test_unicode_pickle(self):
        # A tree containing Unicode characters can be pickled.
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.decode(), soup.decode())

    def test_copy_navigablestring_is_not_attached_to_tree(self):
        html = "<b>Foo<a></a></b><b>Bar</b>"
        soup = self.soup(html)
        s1 = soup.find(string="Foo")
        s2 = copy.copy(s1)
        self.assertEqual(s1, s2)
        # The copy has no links into the tree, while the original
        # still has its next sibling.
        self.assertIsNone(s2.parent)
        self.assertIsNone(s2.next_element)
        self.assertIsNotNone(s1.next_sibling)
        self.assertIsNone(s2.next_sibling)
        self.assertIsNone(s2.previous_element)

    def test_copy_navigablestring_subclass_has_same_type(self):
        html = "<b><!--Foo--></b>"
        soup = self.soup(html)
        s1 = soup.string
        s2 = copy.copy(s1)
        self.assertEqual(s1, s2)
        self.assertIsInstance(s2, Comment)

    def test_copy_entire_soup(self):
        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        soup = self.soup(html)
        soup_copy = copy.copy(soup)
        self.assertEqual(soup, soup_copy)

    def test_copy_tag_copies_contents(self):
        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        soup = self.soup(html)
        div = soup.div
        div_copy = copy.copy(div)

        # The two tags look the same, and evaluate to equal.
        self.assertEqual(str(div), str(div_copy))
        self.assertEqual(div, div_copy)

        # But they're not the same object.
        self.assertIsNot(div, div_copy)

        # And they don't have the same relation to the parse tree. The
        # copy is not associated with a parse tree at all.
        self.assertIsNone(div_copy.parent)
        self.assertIsNone(div_copy.previous_element)
        self.assertIsNone(div_copy.find(string='Bar').next_element)
        self.assertIsNotNone(div.find(string='Bar').next_element)
1599

1600
class TestSubstitutions(SoupTest):
    """Tests for entity substitution and related output formatting."""

    def test_default_formatter_is_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        # No formatter is passed, so this exercises the default rather
        # than repeating test_formatter_minimal.
        decoded = soup.decode()
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_html(self):
        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html")
        # The e-with-acute becomes a named entity and <br> is written
        # as <br/>.
        self.assertEqual(
            decoded,
            self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_html5(self):
        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html5")
        # Same as the "html" formatter, except <br> is not self-closed.
        self.assertEqual(
            decoded,
            self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_null(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=None)
        # Neither the angle brackets nor the e-with-acute are converted.
        # This is not valid HTML, but it's what the user wanted.
        self.assertEqual(
            decoded,
            self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))

    def test_formatter_custom(self):
        markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=lambda x: x.upper())
        # Instead of normal entity conversion code, the custom
        # callable is called on every string.
        self.assertEqual(
            decoded,
            self.document_for("<b><FOO></b><b>BAR</b><br/>"))

    def test_formatter_is_run_on_attribute_values(self):
        markup = '<a href="http://a.com?a=b&c=é">e</a>'
        soup = self.soup(markup)
        a = soup.a

        # The default and the explicit "minimal" formatter agree: the
        # bare ampersand is escaped, the e-with-acute is left alone.
        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'

        self.assertEqual(expect_minimal, a.decode())
        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))

        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
        self.assertEqual(expect_html, a.decode(formatter="html"))

        # With no formatter the attribute value comes back untouched; a
        # custom callable is applied to attribute values as well as text.
        self.assertEqual(markup, a.decode(formatter=None))
        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

    def test_formatter_skips_script_tag_for_html_documents(self):
        doc = """
  <script type="text/javascript">
   console.log("< < hey > > ");
  </script>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        # The angle brackets inside <script> must not be escaped.
        self.assertIn(b"< < hey > >", encoded)

    def test_formatter_skips_style_tag_for_html_documents(self):
        doc = """
  <style type="text/css">
   console.log("< < hey > > ");
  </style>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        # The angle brackets inside <style> must not be escaped.
        self.assertIn(b"< < hey > >", encoded)

    def test_prettify_leaves_preformatted_text_alone(self):
        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  <textarea> eee\nfff\t</textarea></div>")
        # Everything outside the <pre> and <textarea> tags is
        # reformatted, but everything inside them is left alone.
        self.assertEqual(
            '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
            soup.div.prettify())

    def test_prettify_accepts_formatter_function(self):
        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
        pretty = soup.prettify(formatter=lambda x: x.upper())
        self.assertIn("FOO", pretty)

    def test_prettify_outputs_unicode_by_default(self):
        soup = self.soup("<a></a>")
        self.assertEqual(str, type(soup.prettify()))

    def test_prettify_can_encode_data(self):
        soup = self.soup("<a></a>")
        self.assertEqual(bytes, type(soup.prettify("utf-8")))

    def test_html_entity_substitution_off_by_default(self):
        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        self.assertEqual(encoded, markup.encode('utf-8'))

    def test_encoding_substitution(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        self.assertIn(b"charset=utf-8", utf_8)

        euc_jp = soup.encode("euc_jp")
        self.assertIn(b"charset=euc_jp", euc_jp)

        shift_jis = soup.encode("shift-jis")
        self.assertIn(b"charset=shift-jis", shift_jis)

        utf_16_u = soup.encode("utf-16").decode("utf-16")
        self.assertIn("charset=utf-16", utf_16_u)

    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/></head><pre>foo</pre>')

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')
1753

1754
class TestEncoding(SoupTest):
    """Test the ability to encode objects into strings."""

    def test_unicode_string_can_be_encoded(self):
        markup = "<b>\N{SNOWMAN}</b>"
        text_node = self.soup(markup).b.string
        expected = "\N{SNOWMAN}".encode("utf-8")
        self.assertEqual(text_node.encode("utf-8"), expected)

    def test_tag_containing_unicode_string_can_be_encoded(self):
        markup = "<b>\N{SNOWMAN}</b>"
        bold = self.soup(markup).b
        self.assertEqual(bold.encode("utf-8"), markup.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        # The snowman cannot be represented in ASCII, so it comes out
        # as a numeric character reference.
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        self.assertEqual(bold.encode("ascii"), b"<b>&#9731;</b>")

    def test_encoding_can_be_made_strict(self):
        # With errors="strict", an unencodable character raises.
        tree = self.soup("<b>\N{SNOWMAN}</b>")
        with self.assertRaises(UnicodeEncodeError):
            tree.encode("ascii", errors="strict")

    def test_decode_contents(self):
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        self.assertEqual("\N{SNOWMAN}", bold.decode_contents())

    def test_encode_contents(self):
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        expected = "\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, bold.encode_contents(encoding="utf8"))

    def test_deprecated_renderContents(self):
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        expected = "\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, bold.renderContents())

    def test_repr(self):
        markup = "<b>\N{SNOWMAN}</b>"
        tree = self.soup(markup)
        # The expected repr differs depending on the PY3K flag.
        if not PY3K:
            self.assertEqual(b'<b>\\u2603</b>', repr(tree))
        else:
            self.assertEqual(markup, repr(tree))
1805

1806
class TestFormatter(SoupTest):
    """Tests for Formatter.attributes() and for overriding it."""

    def test_default_attributes(self):
        # Test the default behavior of Formatter.attributes().
        tag = Tag(name="tag")
        # Deliberately add the attributes out of alphabetical order.
        tag['b'] = 1
        tag['a'] = 2
        default_formatter = Formatter()

        # Attributes come out sorted by name. In Python 3, attributes
        # normally come out of a dictionary in the order they were
        # added.
        self.assertEqual(
            [('a', 2), ('b', 1)], default_formatter.attributes(tag)
        )

        # This works even if Tag.attrs is None, though this shouldn't
        # normally happen.
        tag.attrs = None
        self.assertEqual([], default_formatter.attributes(tag))

    def test_sort_attributes(self):
        # Test the ability to override Formatter.attributes(). This
        # override records the tag it was given, drops the 'ignore'
        # attribute, and yields the remaining attributes sorted by name.
        class FilteringFormatter(Formatter):
            def attributes(self, tag):
                self.called_with = tag
                for key, value in sorted(tag.attrs.items()):
                    if key != 'ignore':
                        yield key, value

        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
        custom = FilteringFormatter()
        rendered = soup.decode(formatter=custom)

        # attributes() was called on the <p> tag. It filtered out one
        # attribute and sorted the other two.
        self.assertEqual(custom.called_with, soup.p)
        self.assertEqual('<p aval="2" cval="1"></p>', rendered)
1844

1845

1846
class TestNavigableStringSubclasses(SoupTest):
    """Tests for CData, Doctype, Declaration and the other string classes."""

    def test_cdata(self):
        # None of the current builders turn CDATA sections into CData
        # objects, but you can create them manually.
        soup = self.soup("")
        cdata = CData("foo")
        soup.insert(1, cdata)
        self.assertEqual(str(soup), "<![CDATA[foo]]>")
        # Use the `string` keyword, as the other tests in this file do,
        # rather than the older `text` spelling.
        self.assertEqual(soup.find(string="foo"), "foo")
        self.assertEqual(soup.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """

        self.count = 0
        def increment(*args):
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        cdata = CData("<><><>")
        soup.insert(1, cdata)
        self.assertEqual(
            b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
        # The formatter ran exactly once even though its output was
        # discarded.
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        # Unlike other NavigableString subclasses, a DOCTYPE always ends
        # in a newline.
        doctype = Doctype("foo")
        soup = self.soup("")
        soup.insert(1, doctype)
        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")

    def test_declaration(self):
        d = Declaration("foo")
        self.assertEqual("<?foo?>", d.output_ready())

    def test_default_string_containers(self):
        # In some cases, we use different NavigableString subclasses for
        # the same text in different tags.
        soup = self.soup(
            "<div>text</div><script>text</script><style>text</style>"
        )
        self.assertEqual(
            [NavigableString, Script, Stylesheet],
            [x.__class__ for x in soup.find_all(string=True)]
        )

        # The TemplateString is a little unusual because it's generally found
        # _inside_ children of a <template> element, not a direct child of the
        # <template> element.
        soup = self.soup(
            "<template>Some text<p>In a tag</p></template>Some text outside"
        )
        # unittest assertions are used instead of bare `assert` so the
        # checks still run under `python -O`.
        self.assertTrue(
            all(isinstance(x, TemplateString) for x in soup.template.strings)
        )

        # Once the <template> tag closed, we went back to using
        # NavigableString.
        outside = soup.template.next_sibling
        self.assertIsInstance(outside, NavigableString)
        self.assertNotIsInstance(outside, TemplateString)
1912

1913
class TestSoupSelector(TreeTest):
1914

1915
    HTML = """
1916
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
1917
"http://www.w3.org/TR/html4/strict.dtd">
1918
<html>
1919
<head>
1920
<title>The title</title>
1921
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
1922
</head>
1923
<body>
1924
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
1925
<div id="main" class="fancy">
1926
<div id="inner">
1927
<h1 id="header1">An H1</h1>
1928
<p>Some text</p>
1929
<p class="onep" id="p1">Some more text</p>
1930
<h2 id="header2">An H2</h2>
1931
<p class="class1 class2 class3" id="pmulti">Another</p>
1932
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
1933
<h2 id="header3">Another H2</h2>
1934
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
1935
<span class="s1">
1936
<a href="#" id="s1a1">span1a1</a>
1937
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
1938
<span class="span2">
1939
<a href="#" id="s2a1">span2a1</a>
1940
</span>
1941
<span class="span3"></span>
1942
<custom-dashed-tag class="dashed" id="dash2"/>
1943
<div data-tag="dashedvalue" id="data1"/>
1944
</span>
1945
</div>
1946
<x id="xid">
1947
<z id="zida"/>
1948
<z id="zidab"/>
1949
<z id="zidac"/>
1950
</x>
1951
<y id="yid">
1952
<z id="zidb"/>
1953
</y>
1954
<p lang="en" id="lang-en">English</p>
1955
<p lang="en-gb" id="lang-en-gb">English UK</p>
1956
<p lang="en-us" id="lang-en-us">English US</p>
1957
<p lang="fr" id="lang-fr">French</p>
1958
</div>
1959

1960
<div id="footer">
1961
</div>
1962
"""
1963

1964
    def setUp(self):
        # Parse the shared HTML fixture with html.parser and keep the
        # resulting tree on self.soup for the tests to query.
        parsed = BeautifulSoup(self.HTML, 'html.parser')
        self.soup = parsed
1966

1967
    def assertSelects(self, selector, expected_ids, **kwargs):
        """Assert that a CSS selector matches tags with the given IDs.

        The comparison ignores order: both the selected IDs and
        `expected_ids` are sorted before being compared. Sorted copies
        are used, so the caller's `expected_ids` is never modified.

        :param selector: CSS selector passed to self.soup.select().
        :param expected_ids: 'id' values of the tags expected to match.
        :param kwargs: Extra keyword arguments passed on to select().
        """
        el_ids = sorted(
            el['id'] for el in self.soup.select(selector, **kwargs)
        )
        expected = sorted(expected_ids)
        self.assertEqual(
            expected, el_ids,
            "Selector %s, expected [%s], got [%s]" % (
                selector, ', '.join(expected), ', '.join(el_ids)
            )
        )
1976

1977
    assertSelect = assertSelects
1978

1979
    def assertSelectMultiple(self, *tests):
        """Run assertSelect on each (selector, expected_ids) pair given."""
        for case in tests:
            selector, expected_ids = case
            self.assertSelect(selector, expected_ids)
1982

1983
    def test_one_tag_one(self):
        # The fixture has exactly one <title> tag.
        matches = self.soup.select('title')
        self.assertEqual(len(matches), 1)
        title = matches[0]
        self.assertEqual(title.name, 'title')
        self.assertEqual(title.contents, ['The title'])
1988

1989
    def test_one_tag_many(self):
        # select() returns every <div> in the fixture...
        divs = self.soup.select('div')
        self.assertEqual(len(divs), 4)
        for match in divs:
            self.assertEqual(match.name, 'div')

        # ...while select_one() returns only the first of them.
        first_div = self.soup.select_one('div')
        self.assertEqual('main', first_div['id'])
1997

1998
    def test_select_one_returns_none_if_no_match(self):
        # No tag in the fixture has this name.
        self.assertIsNone(self.soup.select_one('nonexistenttag'))
2001

2002

2003
    def test_tag_in_tag_one(self):
        # Only these two <div> tags are nested inside another <div>.
        self.assertSelects('div div', ['inner', 'data1'])
2006

2007
    def test_tag_in_tag_many(self):
        # Every <div> in the fixture is matched by all three ancestor
        # chains.
        all_div_ids = ('data1', 'main', 'inner', 'footer')
        for ancestors in ('html', 'html body', 'body'):
            self.assertSelects(ancestors + ' div', list(all_div_ids))
2010

2011

2012
    def test_limit(self):
        # Each case is (selector, IDs expected under the limit, limit).
        # A limit larger than the number of matches returns them all.
        cases = (
            ('html div', ['main'], 1),
            ('html body div', ['inner', 'main'], 2),
            ('body div', ['data1', 'main', 'inner', 'footer'], 10),
        )
        for selector, expected_ids, limit in cases:
            self.assertSelects(selector, expected_ids, limit=limit)
2017

2018
    def test_tag_no_match(self):
        # The fixture contains no <del> tags.
        deleted = self.soup.select('del')
        self.assertEqual(len(deleted), 0)
2020

2021
    def test_invalid_tag(self):
        # 'tag%t' is not a valid selector.
        with self.assertRaises(SelectorSyntaxError):
            self.soup.select('tag%t')
2023

2024
    def test_select_dashed_tag_ids(self):
        # A tag name containing dashes works as a type selector.
        dashed_tag = 'custom-dashed-tag'
        self.assertSelects(dashed_tag, ['dash1', 'dash2'])
2026

2027
    def test_select_dashed_by_id(self):
        # A dashed tag name can be combined with an attribute selector.
        matches = self.soup.select('custom-dashed-tag[id="dash2"]')
        first = matches[0]
        self.assertEqual(first.name, 'custom-dashed-tag')
        self.assertEqual(first['id'], 'dash2')
2031

2032
    def test_dashed_tag_text(self):
        # The dashed tag directly under <body> is the one with text.
        direct_children = self.soup.select('body > custom-dashed-tag')
        self.assertEqual(direct_children[0].text, 'Hello there.')
2034

2035
    def test_select_dashed_matches_find_all(self):
        # select() and find_all() agree on the dashed tag name.
        tag_name = 'custom-dashed-tag'
        via_select = self.soup.select(tag_name)
        via_find_all = self.soup.find_all(tag_name)
        self.assertEqual(via_select, via_find_all)
2037

2038
    def test_header_tags(self):
        # One <h1> and two <h2> tags exist in the fixture.
        self.assertSelect('h1', ['header1'])
        self.assertSelect('h2', ['header2', 'header3'])
2043

2044
    def test_class_one(self):
        # Each spelling of the selector finds the same single <p>.
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            matches = self.soup.select(selector)
            self.assertEqual(len(matches), 1)
            paragraph = matches[0]
            self.assertEqual(paragraph.name, 'p')
            self.assertEqual(paragraph['class'], ['onep'])
2050

2051
    def test_class_mismatched_tag(self):
        # The 'onep' class is on a <p>, not a <div>.
        mismatched = self.soup.select('div.onep')
        self.assertEqual(len(mismatched), 0)
2054

2055
    def test_one_id(self):
        # Three ways of addressing the same element by ID.
        selectors = ('div#inner', '#inner', 'div div#inner')
        for by_id in selectors:
            self.assertSelects(by_id, ['inner'])
2058

2059
    def test_bad_id(self):
        # No element in the fixture carries this ID.
        missing = self.soup.select('#doesnotexist')
        self.assertEqual(len(missing), 0)
2062

2063
    def test_items_in_id(self):
        # div#inner contains three <p> tags.
        paragraphs = self.soup.select('div#inner p')
        self.assertEqual(len(paragraphs), 3)
        for paragraph in paragraphs:
            self.assertEqual(paragraph.name, 'p')
        # The second has class 'onep'; the first has no class at all.
        first, second = paragraphs[0], paragraphs[1]
        self.assertEqual(second['class'], ['onep'])
        self.assertFalse(first.has_attr('class'))
2070

2071
    def test_a_bunch_of_emptys(self):
        # None of these selectors matches anything in the fixture.
        selectors = ('div#main del', 'div#main div.oops', 'div div#main')
        for selector in selectors:
            matches = self.soup.select(selector)
            self.assertEqual(len(matches), 0)
2074

2075
    def test_multi_class_support(self):
        # A tag with several classes is matched through any one of
        # them, with or without further qualification.
        selectors = (
            '.class1', 'p.class1',
            '.class2', 'p.class2',
            '.class3', 'p.class3',
            'html p.class2', 'div#inner .class2',
        )
        for selector in selectors:
            self.assertSelects(selector, ['pmulti'])
2079

2080
    def test_multi_class_selection(self):
        # Chained class selectors must all be present on the tag; the
        # order in which they are written does not matter.
        chained = ('.class1.class3', '.class3.class2', '.class1.class2.class3')
        for selector in chained:
            self.assertSelects(selector, ['pmulti'])
2084

2085
    def test_child_selector(self):
        # '>' restricts the match to direct children of .s1.
        self.assertSelectMultiple(
            ('.s1 > a', ['s1a1', 's1a2']),
            ('.s1 > a span', ['s1a2s1']),
        )
2088

2089
    def test_child_selector_id(self):
        # A child combinator followed by an ID-qualified tag.
        selector = '.s1 > a#s1a2 span'
        self.assertSelects(selector, ['s1a2s1'])
2091

2092
    def test_attribute_equals(self):
        # [attr="value"] matches on the exact attribute value, with or
        # without a tag name in front. Each selector is listed once.
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
        )
2109

2110
    def test_attribute_tilde(self):
        # [attr~="value"] matches one word of a whitespace-separated
        # attribute value.
        class_cases = [
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
        ]
        rel_cases = [
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        ]
        self.assertSelectMultiple(*(class_cases + rel_cases))
2123

2124
    def test_attribute_startswith(self):
        # [attr^="value"] matches attribute values with that prefix.
        cases = [
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
            ('div[data-tag^="dashed"]', ['data1']),
        ]
        self.assertSelectMultiple(*cases)
2140

2141
    def test_attribute_endswith(self):
        # [attr$="value"] matches attribute values with that suffix.
        ids_ending_in_1 = [
            'data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1',
        ]
        cases = [
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ids_ending_in_1),
            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        ]
        self.assertSelectMultiple(*cases)
2150

2151
    def test_attribute_contains(self):
        # [attr*="value"] matches attribute values containing that
        # substring anywhere.

        # From test_attribute_startswith
        prefix_cases = [
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
        ]
        # From test_attribute_endswith
        suffix_cases = [
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
        ]
        # New for this test
        infix_cases = [
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
            ('div[data-tag*="edval"]', ['data1']),
        ]
        self.assertSelectMultiple(
            *(prefix_cases + suffix_cases + infix_cases)
        )
2179

2180
    def test_attribute_exact_or_hypen(self):
        # [attr|="value"] matches "value" itself or "value-" followed
        # by anything; "gb" appears only after a hyphen, so it matches
        # nothing.
        english_ids = ['lang-en', 'lang-en-gb', 'lang-en-us']
        cases = [
            ('p[lang|="en"]', list(english_ids)),
            ('[lang|="en"]', list(english_ids)),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        ]
        self.assertSelectMultiple(*cases)
2187

2188
    def test_attribute_exists(self):
        # [attr] matches any tag that has the attribute, whatever its
        # value.
        cases = [
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
            ('div[data-tag]', ['data1']),
        ]
        self.assertSelectMultiple(*cases)
2199

2200
    def test_quoted_space_in_selector_name(self):
        # A space inside a quoted attribute value is part of the value,
        # not a descendant combinator.
        markup = """<div style="display: wrong">nope</div>
        <div style="display: right">yes</div>
        """
        tree = BeautifulSoup(markup, 'html.parser')
        # Unpacking requires exactly one match.
        (chosen,) = tree.select('div[style="display: right"]')
        self.assertEqual("yes", chosen.string)
2207

2208
    def test_unsupported_pseudoclass(self):
        # An unknown pseudo-class is reported as not implemented...
        with self.assertRaises(NotImplementedError):
            self.soup.select("a:no-such-pseudoclass")

        # ...while a bad argument to a known one is a syntax error.
        with self.assertRaises(SelectorSyntaxError):
            self.soup.select("a:nth-of-type(a)")
2214

2215
    def test_nth_of_type(self):
        # The first and third paragraphs inside div#inner are each
        # matched exactly once.
        for position, text in ((1, 'Some text'), (3, 'Another')):
            found = self.soup.select(
                'div#inner p:nth-of-type(%d)' % position
            )
            self.assertEqual(len(found), 1)
            self.assertEqual(found[0].string, text)

        # There is no fourth paragraph, and zero will select no tags.
        for selector in ('div#inner p:nth-of-type(4)',
                         'div p:nth-of-type(0)'):
            self.assertEqual(len(self.soup.select(selector)), 0)
2233

2234
    def test_nth_of_type_direct_descendant(self):
        """``:nth-of-type(1)`` after a ``>`` combinator yields one tag."""
        found = self.soup.select('div#inner > p:nth-of-type(1)')
        self.assertEqual(len(found), 1)
        first = found[0]
        self.assertEqual(first.string, 'Some text')
2238

2239
    def test_id_child_selector_nth_of_type(self):
        """'#inner > p:nth-of-type(2)' is expected to select 'p1'."""
        selector = '#inner > p:nth-of-type(2)'
        expected = ['p1']
        self.assertSelects(selector, expected)
2241

2242
    def test_select_on_element(self):
        """Call select() on a tag inside the tree rather than on the soup."""
        # Other tests run against the whole tree; this one starts from the
        # <div id="main"> element.
        main_div = self.soup.find("div", id="main")
        nested_divs = main_div.select("div")
        # <div id="inner"> should be among the results; <div id="footer">
        # should not.
        expected_ids = ['inner', 'data1']
        self.assertSelectsIDs(nested_divs, expected_ids)
2250

2251
    def test_overspecified_child_id(self):
        """An id selector prefixed by a class: one case hits, one misses."""
        cases = (
            (".fancy #inner", ['inner']),
            (".normal #inner", []),
        )
        for selector, expected in cases:
            self.assertSelects(selector, expected)
2254

2255
    def test_adjacent_sibling_selector(self):
        """Exercise the ``+`` combinator, including chained uses."""
        cases = (
            ('#p1 + h2', ['header2']),
            ('#p1 + h2 + p', ['pmulti']),
            ('#p1 + #header2 + .class1', ['pmulti']),
        )
        for selector, expected in cases:
            self.assertSelects(selector, expected)

        # This selector is expected to match nothing at all.
        no_match = self.soup.select('#p1 + p')
        self.assertEqual([], no_match)
2260

2261
    def test_general_sibling_selector(self):
        """Exercise the ``~`` combinator, alone and combined with ``+``."""
        cases = (
            ('#p1 ~ h2', ['header2', 'header3']),
            ('#p1 ~ #header2', ['header2']),
            ('#p1 ~ h2 + a', ['me']),
            ('#p1 ~ h2 + [rel="me"]', ['me']),
        )
        for selector, expected in cases:
            self.assertSelects(selector, expected)

        # This selector is expected to match nothing at all.
        no_match = self.soup.select('#inner ~ h2')
        self.assertEqual([], no_match)
2267

2268
    def test_dangling_combinator(self):
        """A selector that ends in a combinator raises SelectorSyntaxError."""
        with self.assertRaises(SelectorSyntaxError):
            self.soup.select('h1 >')
2270

2271
    def test_sibling_combinator_wont_select_same_tag_twice(self):
        """'p[lang] ~ p' should report each expected result exactly once."""
        selector = 'p[lang] ~ p'
        expected = ['lang-en-gb', 'lang-en-us', 'lang-fr']
        self.assertSelects(selector, expected)
2273

2274
    # Test the selector grouping operator (the comma)
2275
    def test_multiple_select(self):
        """The group 'x, y' should select 'xid' and 'yid'."""
        selector = 'x, y'
        expected = ['xid', 'yid']
        self.assertSelects(selector, expected)
2277

2278
    def test_multiple_select_with_no_space(self):
        """The group 'x,y' (no space after the comma) selects the same pair."""
        selector = 'x,y'
        expected = ['xid', 'yid']
        self.assertSelects(selector, expected)
2280

2281
    def test_multiple_select_with_more_space(self):
        """Extra spaces after the comma still select 'xid' and 'yid'."""
        selector = 'x,    y'
        expected = ['xid', 'yid']
        self.assertSelects(selector, expected)
2283

2284
    def test_multiple_select_duplicated(self):
        """Repeating a selector in a group ('x, x') yields 'xid' only once."""
        selector = 'x, x'
        expected = ['xid']
        self.assertSelects(selector, expected)
2286

2287
    def test_multiple_select_sibling(self):
        """A group may mix a plain tag selector with a ``~`` sibling one."""
        selector = 'x, y ~ p[lang=fr]'
        expected = ['xid', 'lang-fr']
        self.assertSelects(selector, expected)
2289

2290
    def test_multiple_select_tag_and_direct_descendant(self):
        """A group may mix a plain tag selector with a ``>`` child one."""
        selector = 'x, y > z'
        expected = ['xid', 'zidb']
        self.assertSelects(selector, expected)
2292

2293
    def test_multiple_select_direct_descendant_and_tags(self):
        """A ``>`` child selector grouped with two plain tag selectors."""
        selector = 'div > x, y, z'
        expected = ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']
        self.assertSelects(selector, expected)
2295

2296
    def test_multiple_select_indirect_descendant(self):
        """A descendant selector grouped with tags, with uneven spacing."""
        selector = 'div x,y,  z'
        expected = ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']
        self.assertSelects(selector, expected)
2298

2299
    def test_invalid_multiple_select(self):
        """Groups with a leading or doubled comma raise SelectorSyntaxError."""
        malformed_groups = (',x, y', 'x,,y')
        for selector in malformed_groups:
            with self.assertRaises(SelectorSyntaxError):
                self.soup.select(selector)
2302

2303
    def test_multiple_select_attrs(self):
        """Two attribute-value selectors combined in one group."""
        selector = 'p[lang=en], p[lang=en-gb]'
        expected = ['lang-en', 'lang-en-gb']
        self.assertSelects(selector, expected)
2305

2306
    def test_multiple_select_ids(self):
        """A four-part group that filters <z> tags by their id attribute."""
        selector = 'x, y > z[id=zida], z[id=zidab], z[id=zidb]'
        expected = ['xid', 'zidb', 'zidab']
        self.assertSelects(selector, expected)
2308

2309
    def test_multiple_select_nested(self):
        """A group whose members are themselves chains of ``>`` combinators."""
        selector = 'body > div > x, y > z'
        expected = ['xid', 'zidb']
        self.assertSelects(selector, expected)
2311

2312
    def test_select_duplicate_elements(self):
        """A grouped select returns every matching tag, even duplicates."""
        # When markup contains duplicate elements, a multiple select
        # will find all of them.
        markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
        soup = BeautifulSoup(markup, 'html.parser')
        selected = soup.select(".c1, .c2")
        self.assertEqual(3, len(selected))

        # Verify that find_all finds the same elements, though because
        # of an implementation detail it finds them in a different
        # order.
        for element in soup.find_all(class_=['c1', 'c2']):
            # assertIn rather than a bare assert: the check still runs
            # under ``python -O`` and a failure names the missing element.
            self.assertIn(element, selected)
2325

2326
Product

Resources

Company