Path: blob/master/venv/Lib/site-packages/bs4/formatter.py
811 views
from bs4.dammit import EntitySubstitution


class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.

    Formatters are passed in as the `formatter` argument to methods
    like `PageElement.encode`. Most people won't need to think about
    formatters, and most people who need to think about them can pass
    in one of these predefined strings as `formatter` rather than
    making a new Formatter object:

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.
    """
    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    HTML = 'html'
    XML = 'xml'

    # Per-keyword fallbacks used by _default() for any language other
    # than XML. The set object stored here is handed out as-is (not
    # copied), so every formatter that falls back to it shares it.
    HTML_DEFAULTS = {
        'cdata_containing_tags': {"script", "style"},
    }

    def _default(self, language, value, kwarg):
        """Pick the effective value for a constructor keyword argument.

        :param language: The language the formatter is being built for.
        :param value: The value the caller supplied, or None.
        :param kwarg: The name of the keyword argument; used as the key
            into HTML_DEFAULTS.
        :return: `value` if it is not None; otherwise a new empty set
            when `language` is Formatter.XML, or the HTML_DEFAULTS entry
            for `kwarg` for any other language (including None).
        """
        if value is not None:
            return value
        if language == self.XML:
            return set()
        return self.HTML_DEFAULTS[kwarg]

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
    ):
        """Constructor.

        :param language: This should be Formatter.XML if you are formatting
           XML markup and Formatter.HTML if you are formatting HTML markup.

        :param entity_substitution: A function to call to replace special
           characters with XML/HTML entities. For examples, see
           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
        :param void_element_close_prefix: By default, void elements
           are represented as <tag/> (XML rules) rather than <tag>
           (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The list of tags that are defined
           as containing CDATA in this dialect. For example, in HTML,
           <script> and <style> tags are defined as containing CDATA,
           and their contents should not be formatted.
        """
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution.
        This may be a string encountered in an attribute value or as
        text.

        :param ns: A string.
        :return: A string with certain characters replaced by named
           or numeric entities. The input is returned unchanged when no
           entity_substitution function is configured, or when `ns` is a
           NavigableString whose parent tag is one of
           cdata_containing_tags.
        """
        if not self.entity_substitution:
            return ns
        # Imported at call time rather than at module level --
        # presumably to avoid a circular import with bs4.element;
        # confirm before moving it.
        from .element import NavigableString
        if (isinstance(ns, NavigableString)
                and ns.parent is not None
                and ns.parent.name in self.cdata_containing_tags):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute.

        :param value: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want.

        By default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and preserves
        backwards compatibility with older versions of Beautiful Soup.

        :param tag: An object with an `attrs` attribute that is either
            None or a mapping.
        :return: A sorted list of (key, value) pairs; an empty list
            when `tag.attrs` is None.
        """
        if tag.attrs is None:
            return []
        return sorted(tag.attrs.items())


class HTMLFormatter(Formatter):
    """A generic Formatter for HTML."""
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a Formatter whose language is fixed to Formatter.HTML.

        All positional and keyword arguments are forwarded to
        Formatter.__init__ after the language argument.
        """
        super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)


class XMLFormatter(Formatter):
    """A generic Formatter for XML."""
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a Formatter whose language is fixed to Formatter.XML.

        All positional and keyword arguments are forwarded to
        Formatter.__init__ after the language argument.
        """
        super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)


# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# Built with XMLFormatter, like the other XML entries, so that the
# registered object's language is Formatter.XML. Passing a Formatter
# instance as the `language` argument of another Formatter would leave
# `language` holding an object instead of 'xml'.
XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
)