Path: blob/master/venv/Lib/site-packages/bs4/tests/test_htmlparser.py
811 views
"""Tests to ensure that the html.parser tree builder generates good
trees."""

from pdb import set_trace
import pickle

from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser


class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Run the shared HTML smoke tests against HTMLParserTreeBuilder.

    Also holds tests for behavior specific to the html.parser builder:
    pickling, redundant closing tags, source line tracking and
    duplicate-attribute handling.
    """

    default_builder = HTMLParserTreeBuilder

    def test_namespaced_system_doctype(self):
        """Override the inherited test with a no-op."""
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_namespaced_public_doctype(self):
        """Override the inherited test with a no-op."""
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_builder_is_pickled(self):
        """Unlike most tree builders, HTMLParserTreeBuilder can be
        pickled and will be restored after unpickling.
        """
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        # The round-tripped soup must carry a builder of the same type
        # as the one that produced the original tree.
        self.assertIsInstance(loaded.builder, type(tree.builder))

    def test_redundant_empty_element_closing_tags(self):
        """Closing tags for <br> are folded into empty-element tags or
        dropped when they have no matching opening tag.
        """
        self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
        self.assertSoupEquals('</br></br></br>', "")

    def test_empty_element(self):
        """An incomplete character reference survives parsing unchanged."""
        # This verifies that any buffered data present when the parser
        # finishes working is handled.
        self.assertSoupEquals("foo &# bar", "foo &# bar")

    def test_tracking_line_numbers(self):
        """Tags record the line and column where they were found,
        unless store_line_numbers=False is passed.
        """
        # The html.parser TreeBuilder keeps track of line number and
        # position of each element.
        #
        # The <p> tag sits on line 2 after exactly three spaces of
        # indentation; that whitespace is significant because it is
        # what makes sourcepos equal 3 below.
        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
        soup = self.soup(markup)
        self.assertEqual(2, soup.p.sourceline)
        self.assertEqual(3, soup.p.sourcepos)
        # While line tracking is on, .sourceline is the line number, so
        # the <sourceline> child tag has to be looked up with find().
        self.assertEqual("sourceline", soup.p.find('sourceline').name)

        # You can deactivate this behavior.
        soup = self.soup(markup, store_line_numbers=False)
        # With tracking off, attribute access falls through to the
        # child tags of the same names.
        self.assertEqual("sourceline", soup.p.sourceline.name)
        self.assertEqual("sourcepos", soup.p.sourcepos.name)

    def test_on_duplicate_attribute(self):
        """Exercise every supported on_duplicate_attribute setting."""
        # The html.parser tree builder has a variety of ways of
        # handling a tag that contains the same attribute multiple times.

        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'

        # If you don't provide any particular value for
        # on_duplicate_attribute, later values replace earlier values.
        soup = self.soup(markup)
        self.assertEqual("url3", soup.a['href'])
        self.assertEqual(["cls"], soup.a['class'])
        self.assertEqual("id", soup.a['id'])

        # You can also get this behavior explicitly.
        def assert_attribute(on_duplicate_attribute, expected):
            soup = self.soup(
                markup, on_duplicate_attribute=on_duplicate_attribute
            )
            self.assertEqual(expected, soup.a['href'])

            # Verify that non-duplicate attributes are treated normally.
            self.assertEqual(["cls"], soup.a['class'])
            self.assertEqual("id", soup.a['id'])
        assert_attribute(None, "url3")
        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

        # You can ignore subsequent values in favor of the first.
        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")

        # And you can pass in a callable that does whatever you want.
        def accumulate(attrs, key, value):
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)
        assert_attribute(accumulate, ["url1", "url2", "url3"])


class TestHTMLParserSubclass(SoupTest):
    """Tests aimed directly at the BeautifulSoupHTMLParser class."""

    def test_error(self):
        """Verify that our HTMLParser subclass implements error() in a way
        that doesn't cause a crash.
        """
        parser = BeautifulSoupHTMLParser()
        parser.error("don't crash")