Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/tests/test_soup.py
811 views
1
# -*- coding: utf-8 -*-
2
"""Tests of Beautiful Soup as a whole."""
3
4
from pdb import set_trace
5
import logging
6
import unittest
7
import sys
8
import tempfile
9
10
from bs4 import (
11
BeautifulSoup,
12
BeautifulStoneSoup,
13
GuessedAtParserWarning,
14
MarkupResemblesLocatorWarning,
15
)
16
from bs4.builder import (
17
TreeBuilder,
18
ParserRejectedMarkup,
19
)
20
from bs4.element import (
21
CharsetMetaAttributeValue,
22
Comment,
23
ContentMetaAttributeValue,
24
SoupStrainer,
25
NamespacedAttribute,
26
Tag,
27
NavigableString,
28
)
29
30
import bs4.dammit
31
from bs4.dammit import (
32
EntitySubstitution,
33
UnicodeDammit,
34
EncodingDetector,
35
)
36
from bs4.testing import (
37
default_builder,
38
SoupTest,
39
skipIf,
40
)
41
import warnings
42
43
# Record whether the lxml-backed tree builders can be imported.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
except ImportError:
    LXML_PRESENT = False
else:
    LXML_PRESENT = True

# True only on Python 3.0 and 3.1; used below as a skipIf condition.
PYTHON_3_PRE_3_2 = (3, 0) <= sys.version_info[:2] < (3, 2)
50
51
class TestConstructor(SoupTest):
    """Tests of the arguments accepted by the BeautifulSoup constructor."""

    def test_short_unicode_input(self):
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)

    def test_custom_builder_class(self):
        """A custom builder class is instantiated with the constructor's kwargs."""

        # Minimal stand-in for a tree builder: it records the keyword
        # arguments it was built with and the markup it was fed.
        class Mock(object):
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}

            def initialize_soup(self, soup):
                pass

            def feed(self, markup):
                self.fed = markup

            def reset(self):
                pass

            def ignore(self, ignore):
                pass

            set_up_substitutions = can_be_empty_element = ignore

            def prepare_markup(self, *args, **kwargs):
                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        assert isinstance(soup.builder, Mock)
        self.assertEqual(dict(var="value"), soup.builder.called_with)
        self.assertEqual("prepared markup", soup.builder.fed)

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        self.assertEqual(builder, soup.builder)
        self.assertEqual(kwargs, builder.called_with)

    def test_parser_markup_rejection(self):
        """Markup rejected on every attempt raises ParserRejectedMarkup."""

        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            # `markup` must be a named parameter: the generator yields it,
            # and no enclosing scope in this file binds that name.
            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this markup,
                # but feed() will reject both of them.
                yield markup, None, None, False
                yield markup, None, None, False

        self.assertRaisesRegex(
            ParserRejectedMarkup,
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
            BeautifulSoup, '', builder=Mock,
        )

    def test_cdata_list_attributes(self):
        """Multi-valued attribute handling can be customized or disabled."""
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        self.assertEqual(" an id ", a['id'])
        self.assertEqual(["a", "class"], a['class'])

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        self.assertEqual(" a class ", soup.a['class'])

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True) as w:
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            self.assertEqual(["an", "id"], a['id'])
            self.assertEqual(" a class ", a['class'])

    def test_replacement_classes(self):
        """Replacement element classes are used when building the tree."""

        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes={
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            }
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        assert all(
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )

    def test_alternate_string_containers(self):
        """String container classes can be customized per tag name."""

        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers={
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        self.assertEqual([], soup.string_container_stack)
225
226
227
class TestWarnings(SoupTest):
    """Tests of the warnings issued by the BeautifulSoup constructor."""

    def _assert_warning(self, warnings, cls):
        """Return the first recorded warning whose message is a `cls`.

        :param warnings: Iterable of recorded warnings, each with a
            `.message` attribute.
        :param cls: The warning class to look for.
        :raises Exception: If no recorded warning matches.
        """
        for w in warnings:
            if isinstance(w.message, cls):
                return w
        # Both values must go in one tuple; `% cls, warnings` would
        # apply the format to `cls` alone and raise a TypeError.
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w):
        """Assert that `w` contains the 'no parser specified' warning."""
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        self.assertTrue(
            message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
        )

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertIn("parseOnlyThese", msg)
        self.assertIn("parse_only", msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertIn("fromEncoding", msg)
        self.assertIn("from_encoding", msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            self.assertIn("looks like a filename", str(warning.message))
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual([], w)

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertIn("looks like a URL", str(warning.message))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertIn("looks like a URL", str(warning.message))

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup("http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))
325
326
327
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags, together with their contents, should end up
        # in the encoded output.
        only_b_tags = SoupStrainer("b")
        document = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        parsed = self.soup(document, parse_only=only_b_tags)
        self.assertEqual(parsed.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
334
335
336
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        original = "foo\u2200\N{SNOWMAN}\u00f5bar"
        expected = "foo&forall;\N{SNOWMAN}&otilde;bar"
        self.assertEqual(self.sub.substitute_html(original), expected)

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        converted = UnicodeDammit(b"\x91\x92foo\x93\x94")
        substituted = self.sub.substitute_html(converted.markup)
        self.assertEqual(substituted, "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        text = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(text, False), text)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        cases = (
            ("Welcome", '"Welcome"'),
            ("Bob's Bar", '"Bob\'s Bar"'),
        )
        for raw, quoted in cases:
            self.assertEqual(self.sub.substitute_xml(raw, True), quoted)

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        text = 'Welcome to "my bar"'
        result = self.sub.substitute_xml(text, True)
        self.assertEqual(result, "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        text = 'Welcome to "Bob\'s Bar"'
        result = self.sub.substitute_xml(text, True)
        self.assertEqual(result, '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        text = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(text), text)

    def test_xml_quoting_handles_angle_brackets(self):
        result = self.sub.substitute_xml("foo<bar>")
        self.assertEqual(result, "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        result = self.sub.substitute_xml("AT&T")
        self.assertEqual(result, "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        result = self.sub.substitute_xml("&Aacute;T&T")
        self.assertEqual(result, "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        result = self.sub.substitute_xml_containing_entities("&Aacute;T&T")
        self.assertEqual(result, "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        quoted_text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(quoted_text), quoted_text)
403
404
405
class TestEncodingConversion(SoupTest):
    """Test Beautiful Soup's ability to decode and encode from various encodings."""

    def setUp(self):
        super().setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Spell out the encoded bytes so the fixture is easy to eyeball.
        expected_bytes = b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
        self.assertEqual(self.utf8_data, expected_bytes)

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        saved_chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def never_detects(markup):
                return None

            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = never_detects
            ascii_markup = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii_markup)
            decoded = soup_from_ascii.decode()
            self.assertIsInstance(decoded, str)
            self.assertEqual(decoded, self.document_for(ascii_markup.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            # Re-enable logging and put the real detector back.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = saved_chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup = self.soup(self.unicode_data)
        self.assertEqual(soup.decode(), self.unicode_data)
        self.assertEqual(soup.foo.string, 'Sacr\xe9 bleu!')
        self.assertIsNone(soup.original_encoding)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode.
        soup = self.soup(self.utf8_data)
        self.assertEqual(soup.decode(), self.unicode_data)
        self.assertEqual(soup.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup = self.soup(self.unicode_data)
        self.assertEqual(soup.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        encoded_div = self.soup(markup).div.encode("utf8")
        self.assertEqual(encoded_div, markup.encode("utf8"))
464
465
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        # The charset may be double-quoted, single-quoted or unquoted.
        for data in (
                b'<html><meta charset="euc-jp" /></html>',
                b"<html><meta charset='euc-jp' /></html>",
                b"<html><meta charset=euc-jp /></html>",
                b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(markup):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Re-enable logging and put the real detector back.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
                "\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
                "\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
                # 4-byte char U+10413. Decode the bytes explicitly: the
                # str literal "\xf0\x90\x90\x93" is four separate code
                # points, which would encode to 2-byte sequences only.
                b"\xf0\x90\x90\x93".decode("utf8"),
        ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(encoded)
            self.assertEqual(output, encoded)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
684
685
class TestNamedspacedAttribute(SoupTest):
    """Tests of how NamespacedAttribute objects compare."""

    def test_name_may_be_none_or_missing(self):
        # An explicit None name and an omitted name both compare equal
        # to the bare prefix.
        explicit_none = NamespacedAttribute("xmlns", None)
        self.assertEqual(explicit_none, "xmlns")

        name_omitted = NamespacedAttribute("xmlns")
        self.assertEqual(name_omitted, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attribute = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attribute)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        reference = NamespacedAttribute("a", "b", "c")
        identical = NamespacedAttribute("a", "b", "c")
        self.assertEqual(reference, identical)

        # The actual namespace is not considered.
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(reference, no_namespace)

        # But name and prefix are important.
        other_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(reference, other_name)

        other_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(reference, other_prefix)
713
714
715
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
716
717
def test_content_meta_attribute_value(self):
718
value = CharsetMetaAttributeValue("euc-jp")
719
self.assertEqual("euc-jp", value)
720
self.assertEqual("euc-jp", value.original_value)
721
self.assertEqual("utf8", value.encode("utf8"))
722
723
724
def test_content_meta_attribute_value(self):
725
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
726
self.assertEqual("text/html; charset=euc-jp", value)
727
self.assertEqual("text/html; charset=euc-jp", value.original_value)
728
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
729
730