CoCalc -- test

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/tests/test_soup.py
⁸¹¹ views
1
# -*- coding: utf-8 -*-
2
"""Tests of Beautiful Soup as a whole."""
3

4
from pdb import set_trace
5
import logging
6
import unittest
7
import sys
8
import tempfile
9

10
from bs4 import (
11
    BeautifulSoup,
12
    BeautifulStoneSoup,
13
    GuessedAtParserWarning,
14
    MarkupResemblesLocatorWarning,
15
)
16
from bs4.builder import (
17
    TreeBuilder,
18
    ParserRejectedMarkup,
19
)
20
from bs4.element import (
21
    CharsetMetaAttributeValue,
22
    Comment,
23
    ContentMetaAttributeValue,
24
    SoupStrainer,
25
    NamespacedAttribute,
26
    Tag,
27
    NavigableString,
28
    )
29

30
import bs4.dammit
31
from bs4.dammit import (
32
    EntitySubstitution,
33
    UnicodeDammit,
34
    EncodingDetector,
35
)
36
from bs4.testing import (
37
    default_builder,
38
    SoupTest,
39
    skipIf,
40
)
41
import warnings
42

43
# lxml is optional: note whether its tree builders can be imported.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
except ImportError:
    LXML_PRESENT = False
else:
    LXML_PRESENT = True

# True only when running on Python 3.0 or 3.1; used below as a skip
# condition for a test.
PYTHON_3_PRE_3_2 = (3,) <= sys.version_info < (3, 2)
50

51
class TestConstructor(SoupTest):
    """Tests of the arguments accepted by the BeautifulSoup constructor."""

    def test_short_unicode_input(self):
        # A short non-ASCII string comes through the parser unchanged.
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        # A null character embedded in the markup is preserved.
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        # With UTF-8 excluded from consideration, UTF-8 data is
        # reported as windows-1252.
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            # A stand-in builder that records how it was constructed
            # and what markup it was fed.
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}

            def initialize_soup(self, soup):
                pass

            def feed(self, markup):
                self.fed = markup

            def reset(self):
                pass

            def ignore(self, ignore):
                pass

            set_up_substitutions = can_be_empty_element = ignore

            def prepare_markup(self, *args, **kwargs):
                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        # Record (and thereby silence) any warnings issued here.
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        self.assertEqual(dict(var="value"), soup.builder.called_with)
        self.assertEqual("prepared markup", soup.builder.fed)

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        self.assertEqual(builder, soup.builder)
        self.assertEqual(kwargs, builder.called_with)

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this
                # markup, but feed() will reject both of them.
                #
                # This must be a method of Mock; defined anywhere else it
                # is never called and the test only covers a single
                # rejected attempt.
                yield markup, None, None, False
                yield markup, None, None, False

        import re
        # The expected message contains periods, so escape it rather
        # than letting them act as regex wildcards.
        self.assertRaisesRegex(
            ParserRejectedMarkup,
            re.escape(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help."
            ),
            BeautifulSoup, '', builder=Mock,
        )

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        self.assertEqual(" an id ", a['id'])
        self.assertEqual(["a", "class"], a['class'])

        # TreeBuilder takes an argument called 'multi_valued_attributes' which
        # lets you customize or disable this. As always, you can customize the
        # TreeBuilder by passing in a keyword argument to the BeautifulSoup
        # constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        self.assertEqual(" a class ", soup.a['class'])

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True):
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            self.assertEqual(["an", "id"], a['id'])
            self.assertEqual(" a class ", a['class'])

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes={
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            }
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers={
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        self.assertEqual([], soup.string_container_stack)
225

226

227
class TestWarnings(SoupTest):
    """Tests of the warnings issued by the BeautifulSoup constructor."""

    def _assert_warning(self, warnings, cls):
        # Return the first recorded warning whose message is an instance
        # of `cls`. Raise an Exception naming the class and listing the
        # recorded warnings if there is no such warning.
        #
        # Note that the `warnings` parameter (a list of recorded
        # warnings) shadows the `warnings` module inside this method.
        for w in warnings:
            if isinstance(w.message, cls):
                return w
        # Both values must be passed to % as one tuple; otherwise the
        # formatting itself fails with a TypeError.
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w):
        # Assert that the recorded warnings `w` include the "no parser
        # was explicitly specified" warning. Only the first 60
        # characters of the message template are compared.
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        self.assertTrue(
            message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
        )

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            BeautifulSoup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # The old argument name still works, but the warning mentions
        # both the old name and the new one.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        # The old argument name still works, but the warning mentions
        # both the old name and the new one.
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        # Markup that is the name of an existing file triggers a warning.
        with tempfile.NamedTemporaryFile() as filehandle:
            filename = filehandle.name
            with warnings.catch_warnings(record=True) as w:
                self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            self.assertTrue("looks like a filename" in str(warning.message))

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            self.soup(filename)
        self.assertEqual([], w)

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup(b"http://www.crummybytes.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            self.soup("http://www.crummyunicode.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
            for w in warning_list))

    def test_url_warning_with_unicode_and_space(self):
        # As above, but with a Unicode string instead of a bytestring.
        with warnings.catch_warnings(record=True) as warning_list:
            self.soup("http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
            for w in warning_list))
325

326

327
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document with a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags, along with whatever they contain, should
        # make it into the parsed tree.
        only_b_tags = SoupStrainer("b")
        document = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        parsed = self.soup(document, parse_only=only_b_tags)
        rendered = parsed.encode()
        self.assertEqual(rendered, b"<b>Yes</b><b>Yes <c>Yes</c></b>")
334

335

336
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        # The methods under test are called on the class itself.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Characters that have a named HTML entity are replaced by it;
        # everything else (the snowman here) is passed through.
        original = "foo\u2200\N{SNOWMAN}\u00f5bar"
        substituted = self.sub.substitute_html(original)
        self.assertEqual(substituted, "foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # Microsoft smart quotes cause enough trouble to deserve a
        # test of their own.
        smart_quoted = b"\x91\x92foo\x93\x94"
        decoded = UnicodeDammit(smart_quoted).markup
        substituted = self.sub.substitute_html(decoded)
        self.assertEqual(substituted, "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        text = 'Welcome to "my bar"'
        unquoted = self.sub.substitute_xml(text, False)
        self.assertEqual(unquoted, text)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        plain = self.sub.substitute_xml("Welcome", True)
        self.assertEqual(plain, '"Welcome"')
        with_apostrophe = self.sub.substitute_xml("Bob's Bar", True)
        self.assertEqual(with_apostrophe, '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        text = 'Welcome to "my bar"'
        quoted = self.sub.substitute_xml(text, True)
        self.assertEqual(quoted, "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        text = 'Welcome to "Bob\'s Bar"'
        quoted = self.sub.substitute_xml(text, True)
        self.assertEqual(quoted, '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        text = 'Welcome to "Bob\'s Bar"'
        result = self.sub.substitute_xml(text)
        self.assertEqual(result, text)

    def test_xml_quoting_handles_angle_brackets(self):
        result = self.sub.substitute_xml("foo<bar>")
        self.assertEqual(result, "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        result = self.sub.substitute_xml("AT&T")
        self.assertEqual(result, "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes every ampersand, even one that begins
        # an entity reference.
        result = self.sub.substitute_xml("&Aacute;T&T")
        self.assertEqual(result, "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml_containing_entities leaves an existing entity
        # reference alone and escapes only the bare ampersand.
        result = self.sub.substitute_xml_containing_entities("&Aacute;T&T")
        self.assertEqual(result, "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """Quotes only need substituting inside attribute values."""
        quoted_text = 'Bob\'s "bar"'
        result = self.sub.substitute_html(quoted_text)
        self.assertEqual(result, quoted_text)
403

404

405
class TestEncodingConversion(SoupTest):
    # Checks of Beautiful Soup's ability to decode markup from, and
    # encode it to, various encodings.

    def setUp(self):
        super().setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Spelled out for reference: this is the UTF-8 form of the data.
        expected_bytes = b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
        self.assertEqual(self.utf8_data, expected_bytes)

    def test_ascii_in_unicode_out(self):
        # ASCII markup comes out as Unicode, and original_encoding is
        # reported as 'utf-8', which is a superset of ASCII.
        real_chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            # Swap out chardet for a stub that never guesses anything;
            # the real one would recognize the ASCII as ASCII.
            bs4.dammit.chardet_dammit = lambda markup: None
            ascii_markup = b"<foo>a</foo>"
            parsed = self.soup(ascii_markup)
            decoded = parsed.decode()
            self.assertIsInstance(decoded, str)
            self.assertEqual(decoded, self.document_for(ascii_markup.decode()))
            self.assertEqual(parsed.original_encoding.lower(), "utf-8")
        finally:
            # Undo both global changes whatever happened above.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = real_chardet

    def test_unicode_in_unicode_out(self):
        # Unicode markup passes through untouched, and no
        # original_encoding is recorded.
        parsed = self.soup(self.unicode_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(parsed.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 markup is decoded to the equivalent Unicode.
        parsed = self.soup(self.utf8_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # A tree built from Unicode can be written back out as UTF-8.
        parsed = self.soup(self.unicode_data)
        encoded = parsed.encode('utf-8')
        self.assertEqual(encoded, self.utf8_data)

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        # A non-ASCII attribute name survives a parse/encode round trip.
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        round_tripped = self.soup(markup).div.encode("utf8")
        self.assertEqual(round_tripped, markup.encode("utf8"))
464

465
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        # Markup that is already Unicode is left alone.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, smart quotes become the corresponding Unicode
        # characters.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        # The suggested encoding is used to decode the data.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested encoding that doesn't fit the data is passed over.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Suggested encoding names that aren't real codecs are passed over.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        # The charset is found whether the attribute value is
        # double-quoted, single-quoted, or unquoted.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(markup):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Restore logging and the real chardet hook no matter what.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "â˜ƒâ˜ƒâ˜ƒ“Hi, I like Windows!”â˜ƒâ˜ƒâ˜ƒ"

        # But if we run it through UnicodeDammit.detwingle, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            # 4-byte char '\xf0\x90\x90\x93'. This has to be written as
            # a code point: the str literal "\xf0\x90\x90\x93" would be
            # four separate characters rather than one 4-byte character.
            "\U00010413",
            ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(encoded)
            self.assertEqual(output, encoded)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
684
            
685
class TestNamedspacedAttribute(SoupTest):
    """Tests of how NamespacedAttribute objects compare to strings and to each other."""

    def test_name_may_be_none_or_missing(self):
        # An explicit None name and an omitted name both produce a
        # value equal to the bare prefix.
        explicit_none = NamespacedAttribute("xmlns", None)
        self.assertEqual(explicit_none, "xmlns")

        name_omitted = NamespacedAttribute("xmlns")
        self.assertEqual(name_omitted, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attribute = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attribute)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        reference = NamespacedAttribute("a", "b", "c")
        identical = NamespacedAttribute("a", "b", "c")
        self.assertEqual(reference, identical)

        # A different namespace has no effect on equality.
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(reference, no_namespace)

        # A different name makes the attributes unequal...
        other_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(reference, other_name)

        # ...and so does a different prefix.
        other_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(reference, other_prefix)
713

714

715
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of attribute values that change when encoded to a given encoding."""

    def test_charset_meta_attribute_value(self):
        # The value compares equal to the original charset name and
        # remembers it, but encoding it yields the name of the target
        # encoding instead.
        #
        # This method needs its own name: two methods with the same
        # name in one class means only the later one is ever run.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        # The value compares equal to the original string and
        # remembers it, but encoding it replaces the charset with the
        # name of the target encoding.
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
729

730
Product

Resources

Company