# Path: blob/master/venv/Lib/site-packages/bs4/tests/test_soup.py
# 811 views
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""

from pdb import set_trace
import logging
import unittest
import sys
import tempfile

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    GuessedAtParserWarning,
    MarkupResemblesLocatorWarning,
)
from bs4.builder import (
    TreeBuilder,
    ParserRejectedMarkup,
)
from bs4.element import (
    CharsetMetaAttributeValue,
    Comment,
    ContentMetaAttributeValue,
    SoupStrainer,
    NamespacedAttribute,
    Tag,
    NavigableString,
)

import bs4.dammit
from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
    EncodingDetector,
)
from bs4.testing import (
    default_builder,
    SoupTest,
    skipIf,
)
import warnings

try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError:
    LXML_PRESENT = False

PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3, 2))


class TestConstructor(SoupTest):
    """Tests of the arguments accepted by the BeautifulSoup constructor."""

    def test_short_unicode_input(self):
        data = "<h1>éé</h1>"
        soup = self.soup(data)
        self.assertEqual("éé", soup.h1.string)

    def test_embedded_null(self):
        data = "<h1>foo\0bar</h1>"
        soup = self.soup(data)
        self.assertEqual("foo\0bar", soup.h1.string)

    def test_exclude_encodings(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", soup.original_encoding)

    def test_custom_builder_class(self):
        # Verify that you can pass in a custom Builder class and
        # it'll be instantiated with the appropriate keyword arguments.
        class Mock(object):
            def __init__(self, **kwargs):
                self.called_with = kwargs
                self.is_xml = True
                self.store_line_numbers = False
                self.cdata_list_attributes = []
                self.preserve_whitespace_tags = []
                self.string_containers = {}

            def initialize_soup(self, soup):
                pass

            def feed(self, markup):
                self.fed = markup

            def reset(self):
                pass

            def ignore(self, ignore):
                pass

            # Both hooks are stubbed out with the same no-op.
            set_up_substitutions = can_be_empty_element = ignore

            def prepare_markup(self, *args, **kwargs):
                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"

        kwargs = dict(
            var="value",
            # This is a deprecated BS3-era keyword argument, which
            # will be stripped out.
            convertEntities=True,
        )
        with warnings.catch_warnings(record=True):
            soup = BeautifulSoup('', builder=Mock, **kwargs)
        self.assertIsInstance(soup.builder, Mock)
        self.assertEqual(dict(var="value"), soup.builder.called_with)
        self.assertEqual("prepared markup", soup.builder.fed)

        # You can also instantiate the TreeBuilder yourself. In this
        # case, that specific object is used and any keyword arguments
        # to the BeautifulSoup constructor are ignored.
        builder = Mock(**kwargs)
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup(
                '', builder=builder, ignored_value=True,
            )
        msg = str(w[0].message)
        self.assertTrue(
            msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
        )
        self.assertEqual(builder, soup.builder)
        self.assertEqual(kwargs, builder.called_with)

    def test_parser_markup_rejection(self):
        # If markup is completely rejected by the parser, an
        # explanatory ParserRejectedMarkup exception is raised.
        class Mock(TreeBuilder):
            def feed(self, *args, **kwargs):
                raise ParserRejectedMarkup("Nope.")

            def prepare_markup(self, markup, *args, **kwargs):
                # We're going to try two different ways of preparing this
                # markup, but feed() will reject both of them. The markup
                # is taken as an explicit parameter so the name is bound.
                yield markup, None, None, False
                yield markup, None, None, False

        self.assertRaisesRegex(
            ParserRejectedMarkup,
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
            BeautifulSoup, '', builder=Mock,
        )

    def test_cdata_list_attributes(self):
        # Most attribute values are represented as scalars, but the
        # HTML standard says that some attributes, like 'class' have
        # space-separated lists as values.
        markup = '<a id=" an id " class=" a class "></a>'
        soup = self.soup(markup)

        # Note that the spaces are stripped for 'class' but not for 'id'.
        a = soup.a
        self.assertEqual(" an id ", a['id'])
        self.assertEqual(["a", "class"], a['class'])

        # TreeBuilder takes an argument called 'multi_valued_attributes' which lets
        # you customize or disable this. As always, you can customize the TreeBuilder
        # by passing in a keyword argument to the BeautifulSoup constructor.
        soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
        self.assertEqual(" a class ", soup.a['class'])

        # Here are two ways of saying that `id` is a multi-valued
        # attribute in this context, but 'class' is not.
        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
            with warnings.catch_warnings(record=True):
                # This will create a warning about not explicitly
                # specifying a parser, but we'll ignore it.
                soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
            a = soup.a
            self.assertEqual(["an", "id"], a['id'])
            self.assertEqual(" a class ", a['class'])

    def test_replacement_classes(self):
        # Test the ability to pass in replacements for element classes
        # which will be used when building the tree.
        class TagPlus(Tag):
            pass

        class StringPlus(NavigableString):
            pass

        class CommentPlus(Comment):
            pass

        soup = self.soup(
            "<a><b>foo</b>bar</a><!--whee-->",
            element_classes={
                Tag: TagPlus,
                NavigableString: StringPlus,
                Comment: CommentPlus,
            }
        )

        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
        # rather than Tag, String, and Comment objects.
        self.assertTrue(
            all(
                isinstance(x, (TagPlus, StringPlus, CommentPlus))
                for x in soup.recursiveChildGenerator()
            )
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers={
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        self.assertIsInstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        self.assertIsInstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            self.assertIsInstance(s, BString)

        # Now that parsing was complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        self.assertEqual([], soup.string_container_stack)


class TestWarnings(SoupTest):
    """Tests of the warnings issued by the BeautifulSoup constructor."""

    def _assert_warning(self, warnings, cls):
        """Return the first recorded warning whose message is a `cls`.

        :param warnings: An iterable of recorded warnings, as produced by
            `warnings.catch_warnings(record=True)`.
        :param cls: The warning class to look for.
        :raise Exception: If no recorded warning has a message of type `cls`.
        """
        for w in warnings:
            if isinstance(w.message, cls):
                return w
        # Both values must be passed as a tuple; otherwise the format
        # operation itself fails before the message can be built.
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w):
        """Assert that `w` contains the 'no parser specified' warning."""
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        self.assertTrue(
            message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
        )

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>")
        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html")
        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertIn("parseOnlyThese", msg)
        self.assertIn("parse_only", msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertIn("fromEncoding", msg)
        self.assertIn("from_encoding", msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

    def test_disk_file_warning(self):
        # The temporary file exists only inside the `with` block.
        with tempfile.NamedTemporaryFile() as filehandle:
            filename = filehandle.name
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            self.assertIn("looks like a filename", str(warning.message))

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual([], w)

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertIn("looks like a URL", str(warning.message))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
        warning = self._assert_warning(
            warning_list, MarkupResemblesLocatorWarning
        )
        self.assertIn("looks like a URL", str(warning.message))

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))

    def test_url_warning_with_unicode_and_space(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup("http://www.crummyuncode.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
                             for w in warning_list))


class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document with a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        strainer = SoupStrainer("b")
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")


class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class."""

    def setUp(self):
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo&forall;\N{SNOWMAN}&otilde;bar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to &quot;Bob\'s Bar&quot;"')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo&lt;bar&gt;")

    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&amp;Aacute;T&amp;T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)


class TestEncodingConversion(SoupTest):
    # Test Beautiful Soup's ability to decode and encode from various
    # encodings.

    def setUp(self):
        super().setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
        # attribute is set to 'utf-8', a superset of ASCII.
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(markup):
                return None
            # Disable chardet, which will realize that the ASCII is ASCII.
            bs4.dammit.chardet_dammit = noop
            ascii_markup = b"<foo>a</foo>"
            soup_from_ascii = self.soup(ascii_markup)
            unicode_output = soup_from_ascii.decode()
            self.assertIsInstance(unicode_output, str)
            self.assertEqual(unicode_output, self.document_for(ascii_markup.decode()))
            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
        finally:
            # Always restore logging and the real chardet hook.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
        # is not set.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
        self.assertIsNone(soup_from_unicode.original_encoding)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is converted to Unicode. The original_encoding
        # attribute is set.
        soup_from_utf8 = self.soup(self.utf8_data)
        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The internal data structures can be encoded as UTF-8.
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))


class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertIsNone(dammit.original_encoding)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        self.assertIn('utf-\N{REPLACEMENT CHARACTER}', encodings)

    def test_detect_html5_style_meta_tag(self):

        for data in (
                b'<html><meta charset="euc-jp" /></html>',
                b"<html><meta charset='euc-jp' /></html>",
                b"<html><meta charset=euc-jp /></html>",
                b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(markup):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertIn("\ufffd", dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Always restore logging and the real chardet hook.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
                "\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
                "\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
                "\xf0\x90\x90\x93",  # This is a CJK character, not sure which one.
        ):
            encoded = tricky_unicode_char.encode("utf8")
            self.assertTrue(encoded.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(encoded)
            self.assertEqual(output, encoded)

    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertIsNone(m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertIsNone(m(spacer + html_bytes))
        self.assertIsNone(m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertIsNone(m(b'a' + xml_bytes, search_entire_document=True))


class TestNamedspacedAttribute(SoupTest):
    """Tests of NamespacedAttribute construction and equality."""

    def test_name_may_be_none_or_missing(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns")
        self.assertEqual(a, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        a = NamespacedAttribute("a", "b", "c")
        b = NamespacedAttribute("a", "b", "c")
        self.assertEqual(a, b)

        # The actual namespace is not considered.
        c = NamespacedAttribute("a", "b", None)
        self.assertEqual(a, c)

        # But name and prefix are important.
        d = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(a, d)

        e = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(a, e)


class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of attribute values whose charset is rewritten on encode."""

    # This test needs its own name; sharing a name with the test below
    # would leave it shadowed and never run.
    def test_charset_meta_attribute_value(self):
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))