# Path: blob/master/venv/Lib/site-packages/lxml/html/clean.py
# 811 views
# cython: language_level=3str12"""A cleanup tool for HTML.34Removes unwanted tags and content. See the `Cleaner` class for5details.6"""78from __future__ import absolute_import910import re11import copy12try:13from urlparse import urlsplit14from urllib import unquote_plus15except ImportError:16# Python 317from urllib.parse import urlsplit, unquote_plus18from lxml import etree19from lxml.html import defs20from lxml.html import fromstring, XHTML_NAMESPACE21from lxml.html import xhtml_to_html, _transform_result2223try:24unichr25except NameError:26# Python 327unichr = chr28try:29unicode30except NameError:31# Python 332unicode = str33try:34basestring35except NameError:36basestring = (str, bytes)373839__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',40'word_break', 'word_break_html']4142# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl43# Particularly the CSS cleaning; most of the tag cleaning is integrated now44# I have multiple kinds of schemes searched; but should schemes be45# whitelisted instead?46# max height?47# remove images? Also in CSS? background attribute?48# Some way to whitelist object, iframe, etc (e.g., if you want to49# allow *just* embedded YouTube movies)50# Log what was deleted and why?51# style="behavior: ..." might be bad in IE?52# Should we have something for just <meta http-equiv>? That's the worst of the53# metas.54# UTF-7 detections? 
Example:55# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-56# you don't always have to have the charset set, if the page has no charset57# and there's UTF7-like code in it.58# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php596061# This is an IE-specific construct you can have in a stylesheet to62# run some Javascript:63_css_javascript_re = re.compile(64r'expression\s*\(.*?\)', re.S|re.I)6566# Do I have to worry about @\nimport?67_css_import_re = re.compile(68r'@\s*import', re.I)6970# All kinds of schemes besides just javascript: that can cause71# execution:72_is_image_dataurl = re.compile(73r'^data:image/.+;base64', re.I).search74_is_possibly_malicious_scheme = re.compile(75r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',76re.I).search77def _is_javascript_scheme(s):78if _is_image_dataurl(s):79return None80return _is_possibly_malicious_scheme(s)8182_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub83# FIXME: should data: be blocked?8485# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx86_conditional_comment_re = re.compile(87r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)8889_find_styled_elements = etree.XPath(90"descendant-or-self::*[@style]")9192_find_external_links = etree.XPath(93("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"94"descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),95namespaces={'x':XHTML_NAMESPACE})969798class Cleaner(object):99"""100Instances cleans the document of each of the possible offending101elements. The cleaning is controlled by attributes; you can102override attributes in a subclass, or set them in the constructor.103104``scripts``:105Removes any ``<script>`` tags.106107``javascript``:108Removes any Javascript, like an ``onclick`` attribute. 
Also removes stylesheets109as they could contain Javascript.110111``comments``:112Removes any comments.113114``style``:115Removes any style tags.116117``inline_style``118Removes any style attributes. Defaults to the value of the ``style`` option.119120``links``:121Removes any ``<link>`` tags122123``meta``:124Removes any ``<meta>`` tags125126``page_structure``:127Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.128129``processing_instructions``:130Removes any processing instructions.131132``embedded``:133Removes any embedded objects (flash, iframes)134135``frames``:136Removes any frame-related tags137138``forms``:139Removes any form tags140141``annoying_tags``:142Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``143144``remove_tags``:145A list of tags to remove. Only the tags will be removed,146their content will get pulled up into the parent tag.147148``kill_tags``:149A list of tags to kill. Killing also removes the tag's content,150i.e. the whole subtree, not just the tag itself.151152``allow_tags``:153A list of tags to include (default include all).154155``remove_unknown_tags``:156Remove any tags that aren't standard parts of HTML.157158``safe_attrs_only``:159If true, only include 'safe' attributes (specifically the list160from the feedparser HTML sanitisation web site).161162``safe_attrs``:163A set of attribute names to override the default list of attributes164considered 'safe' (when safe_attrs_only=True).165166``add_nofollow``:167If true, then any <a> tags will have ``rel="nofollow"`` added to them.168169``host_whitelist``:170A list or set of hosts that you can use for embedded content171(for content like ``<object>``, ``<link rel="stylesheet">``, etc).172You can also implement/override the method173``allow_embedded_url(el, url)`` or ``allow_element(el)`` to174implement more complex rules for what can be embedded.175Anything that passes this test will be shown, regardless of176the value of (for instance) 
``embedded``.177178Note that this parameter might not work as intended if you do not179make the links absolute before doing the cleaning.180181Note that you may also need to set ``whitelist_tags``.182183``whitelist_tags``:184A set of tags that can be included with ``host_whitelist``.185The default is ``iframe`` and ``embed``; you may wish to186include other tags like ``script``, or you may want to187implement ``allow_embedded_url`` for more control. Set to None to188include all tags.189190This modifies the document *in place*.191"""192193scripts = True194javascript = True195comments = True196style = False197inline_style = None198links = True199meta = True200page_structure = True201processing_instructions = True202embedded = True203frames = True204forms = True205annoying_tags = True206remove_tags = None207allow_tags = None208kill_tags = None209remove_unknown_tags = True210safe_attrs_only = True211safe_attrs = defs.safe_attrs212add_nofollow = False213host_whitelist = ()214whitelist_tags = {'iframe', 'embed'}215216def __init__(self, **kw):217not_an_attribute = object()218for name, value in kw.items():219default = getattr(self, name, not_an_attribute)220if (default is not None and default is not True and default is not False221and not isinstance(default, (frozenset, set, tuple, list))):222raise TypeError(223"Unknown parameter: %s=%r" % (name, value))224setattr(self, name, value)225if self.inline_style is None and 'inline_style' not in kw:226self.inline_style = self.style227228if kw.get("allow_tags"):229if kw.get("remove_unknown_tags"):230raise ValueError("It does not make sense to pass in both "231"allow_tags and remove_unknown_tags")232self.remove_unknown_tags = False233234# Used to lookup the primary URL for a given tag that is up for235# removal:236_tag_link_attrs = dict(237script='src',238link='href',239# From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html240# From what I can tell, both attributes can contain a link:241applet=['code', 
'object'],242iframe='src',243embed='src',244layer='src',245# FIXME: there doesn't really seem like a general way to figure out what246# links an <object> tag uses; links often go in <param> tags with values247# that we don't really know. You'd have to have knowledge about specific248# kinds of plugins (probably keyed off classid), and match against those.249##object=?,250# FIXME: not looking at the action currently, because it is more complex251# than than -- if you keep the form, you should keep the form controls.252##form='action',253a='href',254)255256def __call__(self, doc):257"""258Cleans the document.259"""260try:261getroot = doc.getroot262except AttributeError:263pass # Element instance264else:265doc = getroot() # ElementTree instance, instead of an element266# convert XHTML to HTML267xhtml_to_html(doc)268# Normalize a case that IE treats <image> like <img>, and that269# can confuse either this step or later steps.270for el in doc.iter('image'):271el.tag = 'img'272if not self.comments:273# Of course, if we were going to kill comments anyway, we don't274# need to worry about this275self.kill_conditional_comments(doc)276277kill_tags = set(self.kill_tags or ())278remove_tags = set(self.remove_tags or ())279allow_tags = set(self.allow_tags or ())280281if self.scripts:282kill_tags.add('script')283if self.safe_attrs_only:284safe_attrs = set(self.safe_attrs)285for el in doc.iter(etree.Element):286attrib = el.attrib287for aname in attrib.keys():288if aname not in safe_attrs:289del attrib[aname]290if self.javascript:291if not (self.safe_attrs_only and292self.safe_attrs == defs.safe_attrs):293# safe_attrs handles events attributes itself294for el in doc.iter(etree.Element):295attrib = el.attrib296for aname in attrib.keys():297if aname.startswith('on'):298del attrib[aname]299doc.rewrite_links(self._remove_javascript_link,300resolve_base_href=False)301# If we're deleting style then we don't have to remove JS links302# from styles, otherwise...303if not 
self.inline_style:304for el in _find_styled_elements(doc):305old = el.get('style')306new = _css_javascript_re.sub('', old)307new = _css_import_re.sub('', new)308if self._has_sneaky_javascript(new):309# Something tricky is going on...310del el.attrib['style']311elif new != old:312el.set('style', new)313if not self.style:314for el in list(doc.iter('style')):315if el.get('type', '').lower().strip() == 'text/javascript':316el.drop_tree()317continue318old = el.text or ''319new = _css_javascript_re.sub('', old)320# The imported CSS can do anything; we just can't allow:321new = _css_import_re.sub('', old)322if self._has_sneaky_javascript(new):323# Something tricky is going on...324el.text = '/* deleted */'325elif new != old:326el.text = new327if self.comments:328kill_tags.add(etree.Comment)329if self.processing_instructions:330kill_tags.add(etree.ProcessingInstruction)331if self.style:332kill_tags.add('style')333if self.inline_style:334etree.strip_attributes(doc, 'style')335if self.links:336kill_tags.add('link')337elif self.style or self.javascript:338# We must get rid of included stylesheets if Javascript is not339# allowed, as you can put Javascript in them340for el in list(doc.iter('link')):341if 'stylesheet' in el.get('rel', '').lower():342# Note this kills alternate stylesheets as well343if not self.allow_element(el):344el.drop_tree()345if self.meta:346kill_tags.add('meta')347if self.page_structure:348remove_tags.update(('head', 'html', 'title'))349if self.embedded:350# FIXME: is <layer> really embedded?351# We should get rid of any <param> tags not inside <applet>;352# These are not really valid anyway.353for el in list(doc.iter('param')):354found_parent = False355parent = el.getparent()356while parent is not None and parent.tag not in ('applet', 'object'):357parent = parent.getparent()358if parent is None:359el.drop_tree()360kill_tags.update(('applet',))361# The alternate contents that are in an iframe are a good fallback:362remove_tags.update(('iframe', 'embed', 
'layer', 'object', 'param'))363if self.frames:364# FIXME: ideally we should look at the frame links, but365# generally frames don't mix properly with an HTML366# fragment anyway.367kill_tags.update(defs.frame_tags)368if self.forms:369remove_tags.add('form')370kill_tags.update(('button', 'input', 'select', 'textarea'))371if self.annoying_tags:372remove_tags.update(('blink', 'marquee'))373374_remove = []375_kill = []376for el in doc.iter():377if el.tag in kill_tags:378if self.allow_element(el):379continue380_kill.append(el)381elif el.tag in remove_tags:382if self.allow_element(el):383continue384_remove.append(el)385386if _remove and _remove[0] == doc:387# We have to drop the parent-most tag, which we can't388# do. Instead we'll rewrite it:389el = _remove.pop(0)390el.tag = 'div'391el.attrib.clear()392elif _kill and _kill[0] == doc:393# We have to drop the parent-most element, which we can't394# do. Instead we'll clear it:395el = _kill.pop(0)396if el.tag != 'html':397el.tag = 'div'398el.clear()399400_kill.reverse() # start with innermost tags401for el in _kill:402el.drop_tree()403for el in _remove:404el.drop_tag()405406if self.remove_unknown_tags:407if allow_tags:408raise ValueError(409"It does not make sense to pass in both allow_tags and remove_unknown_tags")410allow_tags = set(defs.tags)411if allow_tags:412# make sure we do not remove comments/PIs if users want them (which is rare enough)413if not self.comments:414allow_tags.add(etree.Comment)415if not self.processing_instructions:416allow_tags.add(etree.ProcessingInstruction)417418bad = []419for el in doc.iter():420if el.tag not in allow_tags:421bad.append(el)422if bad:423if bad[0] is doc:424el = bad.pop(0)425el.tag = 'div'426el.attrib.clear()427for el in bad:428el.drop_tag()429if self.add_nofollow:430for el in _find_external_links(doc):431if not self.allow_follow(el):432rel = el.get('rel')433if rel:434if ('nofollow' in rel435and ' nofollow ' in (' %s ' % rel)):436continue437rel = '%s nofollow' % rel438else:439rel 
= 'nofollow'440el.set('rel', rel)441442def allow_follow(self, anchor):443"""444Override to suppress rel="nofollow" on some anchors.445"""446return False447448def allow_element(self, el):449"""450Decide whether an element is configured to be accepted or rejected.451452:param el: an element.453:return: true to accept the element or false to reject/discard it.454"""455if el.tag not in self._tag_link_attrs:456return False457attr = self._tag_link_attrs[el.tag]458if isinstance(attr, (list, tuple)):459for one_attr in attr:460url = el.get(one_attr)461if not url:462return False463if not self.allow_embedded_url(el, url):464return False465return True466else:467url = el.get(attr)468if not url:469return False470return self.allow_embedded_url(el, url)471472def allow_embedded_url(self, el, url):473"""474Decide whether a URL that was found in an element's attributes or text475if configured to be accepted or rejected.476477:param el: an element.478:param url: a URL found on the element.479:return: true to accept the URL and false to reject it.480"""481if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:482return False483scheme, netloc, path, query, fragment = urlsplit(url)484netloc = netloc.lower().split(':', 1)[0]485if scheme not in ('http', 'https'):486return False487if netloc in self.host_whitelist:488return True489return False490491def kill_conditional_comments(self, doc):492"""493IE conditional comments basically embed HTML that the parser494doesn't normally see. 
We can't allow anything like that, so495we'll kill any comments that could be conditional.496"""497has_conditional_comment = _conditional_comment_re.search498self._kill_elements(499doc, lambda el: has_conditional_comment(el.text),500etree.Comment)501502def _kill_elements(self, doc, condition, iterate=None):503bad = []504for el in doc.iter(iterate):505if condition(el):506bad.append(el)507for el in bad:508el.drop_tree()509510def _remove_javascript_link(self, link):511# links like "j a v a s c r i p t:" might be interpreted in IE512new = _substitute_whitespace('', unquote_plus(link))513if _is_javascript_scheme(new):514# FIXME: should this be None to delete?515return ''516return link517518_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub519520def _has_sneaky_javascript(self, style):521"""522Depending on the browser, stuff like ``e x p r e s s i o n(...)``523can get interpreted, or ``expre/* stuff */ssion(...)``. This524checks for attempt to do stuff like this.525526Typically the response will be to kill the entire style; if you527have just a bit of Javascript in the style another rule will catch528that and remove only the Javascript from the style; this catches529more sneaky attempts.530"""531style = self._substitute_comments('', style)532style = style.replace('\\', '')533style = _substitute_whitespace('', style)534style = style.lower()535if 'javascript:' in style:536return True537if 'expression(' in style:538return True539return False540541def clean_html(self, html):542result_type = type(html)543if isinstance(html, basestring):544doc = fromstring(html)545else:546doc = copy.deepcopy(html)547self(doc)548return _transform_result(result_type, doc)549550clean = Cleaner()551clean_html = clean.clean_html552553############################################################554## Autolinking555############################################################556557_link_regexes = 
[558re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),559# This is conservative, but autolinking can be a bit conservative:560re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),561]562563_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']564565_avoid_hosts = [566re.compile(r'^localhost', re.I),567re.compile(r'\bexample\.(?:com|org|net)$', re.I),568re.compile(r'^127\.0\.0\.1$'),569]570571_avoid_classes = ['nolink']572573def autolink(el, link_regexes=_link_regexes,574avoid_elements=_avoid_elements,575avoid_hosts=_avoid_hosts,576avoid_classes=_avoid_classes):577"""578Turn any URLs into links.579580It will search for links identified by the given regular581expressions (by default mailto and http(s) links).582583It won't link text in an element in avoid_elements, or an element584with a class in avoid_classes. It won't link to anything with a585host that matches one of the regular expressions in avoid_hosts586(default localhost and 127.0.0.1).587588If you pass in an element, the element's tail will not be589substituted, only the contents of the element.590"""591if el.tag in avoid_elements:592return593class_name = el.get('class')594if class_name:595class_name = class_name.split()596for match_class in avoid_classes:597if match_class in class_name:598return599for child in list(el):600autolink(child, link_regexes=link_regexes,601avoid_elements=avoid_elements,602avoid_hosts=avoid_hosts,603avoid_classes=avoid_classes)604if child.tail:605text, tail_children = _link_text(606child.tail, link_regexes, avoid_hosts, factory=el.makeelement)607if tail_children:608child.tail = text609index = el.index(child)610el[index+1:index+1] = tail_children611if el.text:612text, pre_children = _link_text(613el.text, link_regexes, avoid_hosts, factory=el.makeelement)614if pre_children:615el.text = text616el[:0] = pre_children617618def _link_text(text, link_regexes, avoid_hosts, 
factory):619leading_text = ''620links = []621last_pos = 0622while 1:623best_match, best_pos = None, None624for regex in link_regexes:625regex_pos = last_pos626while 1:627match = regex.search(text, pos=regex_pos)628if match is None:629break630host = match.group('host')631for host_regex in avoid_hosts:632if host_regex.search(host):633regex_pos = match.end()634break635else:636break637if match is None:638continue639if best_pos is None or match.start() < best_pos:640best_match = match641best_pos = match.start()642if best_match is None:643# No more matches644if links:645assert not links[-1].tail646links[-1].tail = text647else:648assert not leading_text649leading_text = text650break651link = best_match.group(0)652end = best_match.end()653if link.endswith('.') or link.endswith(','):654# These punctuation marks shouldn't end a link655end -= 1656link = link[:-1]657prev_text = text[:best_match.start()]658if links:659assert not links[-1].tail660links[-1].tail = prev_text661else:662assert not leading_text663leading_text = prev_text664anchor = factory('a')665anchor.set('href', link)666body = best_match.group('body')667if not body:668body = link669if body.endswith('.') or body.endswith(','):670body = body[:-1]671anchor.text = body672links.append(anchor)673text = text[end:]674return leading_text, links675676def autolink_html(html, *args, **kw):677result_type = type(html)678if isinstance(html, basestring):679doc = fromstring(html)680else:681doc = copy.deepcopy(html)682autolink(doc, *args, **kw)683return _transform_result(result_type, doc)684685autolink_html.__doc__ = autolink.__doc__686687############################################################688## Word wrapping689############################################################690691_avoid_word_break_elements = ['pre', 'textarea', 'code']692_avoid_word_break_classes = ['nobreak']693694def word_break(el, 
max_width=40,695avoid_elements=_avoid_word_break_elements,696avoid_classes=_avoid_word_break_classes,697break_character=unichr(0x200b)):698"""699Breaks any long words found in the body of the text (not attributes).700701Doesn't effect any of the tags in avoid_elements, by default702``<textarea>`` and ``<pre>``703704Breaks words by inserting ​, which is a unicode character705for Zero Width Space character. This generally takes up no space706in rendering, but does copy as a space, and in monospace contexts707usually takes up space.708709See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion710"""711# Character suggestion of ​ comes from:712# http://www.cs.tut.fi/~jkorpela/html/nobr.html713if el.tag in _avoid_word_break_elements:714return715class_name = el.get('class')716if class_name:717dont_break = False718class_name = class_name.split()719for avoid in avoid_classes:720if avoid in class_name:721dont_break = True722break723if dont_break:724return725if el.text:726el.text = _break_text(el.text, max_width, break_character)727for child in el:728word_break(child, max_width=max_width,729avoid_elements=avoid_elements,730avoid_classes=avoid_classes,731break_character=break_character)732if child.tail:733child.tail = _break_text(child.tail, max_width, break_character)734735def word_break_html(html, *args, **kw):736result_type = type(html)737doc = fromstring(html)738word_break(doc, *args, **kw)739return _transform_result(result_type, doc)740741def _break_text(text, max_width, break_character):742words = text.split()743for word in words:744if len(word) > max_width:745replacement = _insert_break(word, max_width, break_character)746text = text.replace(word, replacement)747return text748749_break_prefer_re = re.compile(r'[^a-z]', re.I)750751def _insert_break(word, width, break_character):752orig_word = word753result = ''754while len(word) > width:755start = word[:width]756breaks = list(_break_prefer_re.finditer(start))757if breaks:758last_break = breaks[-1]759# Only 
walk back up to 10 characters to find a nice break:760if last_break.end() > width-10:761# FIXME: should the break character be at the end of the762# chunk, or the beginning of the next chunk?763start = word[:last_break.end()]764result += start + break_character765word = word[len(start):]766result += word767return result768769770771