# Path: blob/master/venv/Lib/site-packages/bs4/tests/test_lxml.py
# 811 views
"""Tests to ensure that the lxml tree builder generates good trees."""

import re
import warnings

try:
    import lxml.etree
    LXML_PRESENT = True
    LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError:
    # lxml is optional; every test class below is skipped when it is missing.
    LXML_PRESENT = False
    LXML_VERSION = (0,)

if LXML_PRESENT:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.tests import test_htmlparser
from bs4.testing import (
    HTMLTreeBuilderSmokeTest,
    XMLTreeBuilderSmokeTest,
    SoupTest,
    skipIf,
)


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        """Return the lxml HTML tree builder class exercised by this suite."""
        return LXMLTreeBuilder

    def test_out_of_range_entity(self):
        """Numeric character references that are out of range are dropped."""
        # NOTE(review): the three references below were restored from
        # HTML-decoded text; confirm the exact values against upstream bs4.
        # Decimal reference far beyond the Unicode range.
        self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        # Hexadecimal reference far beyond the Unicode range.
        self.assertSoupEquals(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        # Smaller decimal reference that still exceeds U+10FFFF.
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    def test_entities_in_foreign_document_encoding(self):
        # We can't implement this case correctly because by the time we
        # hear about markup like "&#147;", it's been (incorrectly) converted into
        # a string like u'\x93'
        pass

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.

    @skipIf(
        not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
        "Skipping doctype test for old version of lxml to avoid segfault.")
    def test_empty_doctype(self):
        """An empty ``<!DOCTYPE>`` parses to a node whose text is blank."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            # Record every warning regardless of the filters active in the
            # surrounding test run, so the check below cannot be starved.
            warnings.simplefilter("always")
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual("<b/>", str(soup.b))
        # Fail with a readable message instead of an IndexError when no
        # warning was captured.
        self.assertTrue(len(w) > 0, "no warning was recorded")
        self.assertIn(
            "BeautifulStoneSoup class is deprecated", str(w[0].message))

    def test_tracking_line_numbers(self):
        # The lxml TreeBuilder cannot keep track of line numbers from
        # the original markup. Even if you ask for line numbers, we
        # don't have 'em.
        #
        # This means that if you have a tag like <sourceline> or
        # <sourcepos>, attribute access will find it rather than
        # giving you a numeric answer.
        soup = self.soup(
            "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
            store_line_numbers=True
        )
        self.assertEqual("sourceline", soup.p.sourceline.name)
        self.assertEqual("sourcepos", soup.p.sourcepos.name)


@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``XMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        """Return the lxml XML tree builder class exercised by this suite."""
        return LXMLTreeBuilderForXML

    def test_namespace_indexing(self):
        # We should not track un-prefixed namespaces as we can only hold one
        # and it will be recognized as the default namespace by soupsieve,
        # which may be confusing in some situations. When no namespace is provided
        # for a selector, the default namespace (if defined) is assumed.

        soup = self.soup(
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
            '</root>'
        )
        # Only the built-in "xml" prefix and the explicitly prefixed
        # namespace are expected; the un-prefixed one must be absent.
        self.assertEqual(
            soup._namespaces,
            {
                'xml': 'http://www.w3.org/XML/1998/namespace',
                'prefix': 'http://prefixed-namespace.com',
            }
        )