# Path: blob/master/venv/Lib/site-packages/bs4/builder/_htmlparser.py
# 811 views
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from html.parser import HTMLParser

try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    class HTMLParseError(Exception):
        pass

import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4


from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'


class BeautifulSoupHTMLParser(HTMLParser):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.

    The ``soup`` attribute is not set by the constructor; it is
    assigned by `HTMLParserTreeBuilder.feed` before any markup is fed in.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.

        All other positional and keyword arguments are passed through
        to the HTMLParser constructor.
        """
        # Pop our own option first; HTMLParser.__init__ would reject it.
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.

        :param msg: The error message; it is reported as a warning.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
            (name, value) pairs.
        """
        # handle_empty_element=False tells handle_starttag not to close
        # the tag just because its name matches a known empty-element
        # tag. We know that this is an empty-element tag and we want to
        # call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
            (name, value) pairs. A value of None is converted to the
            empty string.
        :param handle_empty_element: If True (the default) and the tag
            that gets created is an empty-element tag, the matching
            end-tag event is generated immediately. handle_startendtag
            passes False because it calls handle_endtag itself.
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    # Keep the value that was seen first.
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    # A user-supplied callable decides what to store.
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: If True (the default) and `name`
            is recorded in `already_closed_empty_element`, this end tag
            is treated as the redundant closing portion of an
            empty-element tag (e.g. the '</tag>' in '<tag></tag>'):
            one record is removed and the event is otherwise ignored.
        """
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags.

        :param data: The text, passed straight through to the soup.
        """
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            #
            # Note that the loop does not stop at the first success: if
            # both encodings can decode the byte, the windows-1252
            # result is the one that is kept.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                # Not a valid code point; fall through to the
                # replacement character below.
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was an character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        # Flush pending text, then collect the comment text on its own
        # and close it off as a Comment.
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        # Drop the first len("DOCTYPE ") characters; the prefix is not
        # checked before it is sliced off.
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked. The dictionary is copied, not modified.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        # Work on a copy so the entries added below never leak into a
        # dictionary owned by the caller.
        parser_kwargs = dict(parser_kwargs or {})
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            # Character references must reach handle_charref() and
            # handle_entityref() instead of being converted up front.
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):

        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
            (markup, encoding, declared encoding,
             has undergone character replacement)

            Each 4-tuple represents a strategy for converting the
            document to Unicode and parsing it. Each strategy will be tried
            in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.
        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        :param markup: The markup to parse.
        :raise HTMLParseError: Re-raised, after a RuntimeWarning has
            been issued, if the underlying parser raises it.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise keeps the original traceback intact.
            raise
        parser.already_closed_empty_element = []

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code.
# This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    # NOTE(review): this attaches the pattern to the tree builder class,
    # while parse_starttag() below reads the local name directly --
    # confirm whether anything looks the attribute up on the class.
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag that begins at index `i` of self.rawdata.

        Replacement for HTMLParser.parse_starttag, installed on
        BeautifulSoupHTMLParser below.

        :param i: Index into self.rawdata at which the tag starts.
        :return: The index just past the end of the tag, or the
            negative value returned by check_for_whole_start_tag()
            when the tag is not yet complete.
        """
        # NOTE(review): this function is defined outside any class, so
        # `__starttag_text` is not name-mangled here; presumably
        # HTMLParser.get_starttag_text() reads a differently named
        # attribute -- confirm before relying on it.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without a value, e.g. <input disabled>.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk after the last attribute: report it in strict mode,
            # then pass the whole tag through as text.
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Switch the parser into CDATA mode for element `elem`.

        Stores the lower-cased element name in self.cdata_elem and sets
        self.interesting to a case-insensitive pattern matching that
        element's closing tag.

        :param elem: Name of the element whose content is CDATA.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patches in place, this interpreter is treated like one
    # whose constructor takes the 'strict' argument.
    CONSTRUCTOR_TAKES_STRICT = True