Path: blob/master/venv/Lib/site-packages/lxml/html/diff.py
811 views
# cython: language_level=312from __future__ import absolute_import34import difflib5from lxml import etree6from lxml.html import fragment_fromstring7import re89__all__ = ['html_annotate', 'htmldiff']1011try:12from html import escape as html_escape13except ImportError:14from cgi import escape as html_escape15try:16_unicode = unicode17except NameError:18# Python 319_unicode = str20try:21basestring22except NameError:23# Python 324basestring = str2526############################################################27## Annotation28############################################################2930def default_markup(text, version):31return '<span title="%s">%s</span>' % (32html_escape(_unicode(version), 1), text)3334def html_annotate(doclist, markup=default_markup):35"""36doclist should be ordered from oldest to newest, like::3738>>> version1 = 'Hello World'39>>> version2 = 'Goodbye World'40>>> print(html_annotate([(version1, 'version 1'),41... (version2, 'version 2')]))42<span title="version 2">Goodbye</span> <span title="version 1">World</span>4344The documents must be *fragments* (str/UTF8 or unicode), not45complete documents4647The markup argument is a function to markup the spans of words.48This function is called like markup('Hello', 'version 2'), and49returns HTML. The first argument is text and never includes any50markup. The default uses a span with a title:5152>>> print(default_markup('Some Text', 'by Joe'))53<span title="by Joe">Some Text</span>54"""55# The basic strategy we have is to split the documents up into56# logical tokens (which are words with attached markup). We then57# do diffs of each of the versions to track when a token first58# appeared in the document; the annotation attached to the token59# is the version where it first appeared.60tokenlist = [tokenize_annotated(doc, version)61for doc, version in doclist]62cur_tokens = tokenlist[0]63for tokens in tokenlist[1:]:64html_annotate_merge_annotations(cur_tokens, tokens)65cur_tokens = tokens6667# After we've tracked all the tokens, we can combine spans of text68# that are adjacent and have the same annotation69cur_tokens = compress_tokens(cur_tokens)70# And finally add markup71result = markup_serialize_tokens(cur_tokens, markup)72return ''.join(result).strip()7374def tokenize_annotated(doc, annotation):75"""Tokenize a document and add an annotation attribute to each token76"""77tokens = tokenize(doc, include_hrefs=False)78for tok in tokens:79tok.annotation = annotation80return tokens8182def html_annotate_merge_annotations(tokens_old, tokens_new):83"""Merge the annotations from tokens_old into tokens_new, when the84tokens in the new document already existed in the old document.85"""86s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)87commands = s.get_opcodes()8889for command, i1, i2, j1, j2 in commands:90if command == 'equal':91eq_old = tokens_old[i1:i2]92eq_new = tokens_new[j1:j2]93copy_annotations(eq_old, eq_new)9495def copy_annotations(src, dest):96"""97Copy annotations from the tokens listed in src to the tokens in dest98"""99assert len(src) == len(dest)100for src_tok, dest_tok in zip(src, dest):101dest_tok.annotation = src_tok.annotation102103def compress_tokens(tokens):104"""105Combine adjacent tokens when there is no HTML between the tokens,106and they share an annotation107"""108result = [tokens[0]]109for tok in tokens[1:]:110if (not result[-1].post_tags and111not tok.pre_tags and112result[-1].annotation == tok.annotation):113compress_merge_back(result, tok)114else:115result.append(tok)116return result117118def compress_merge_back(tokens, tok):119""" Merge tok into the last element of tokens (modifying the list of120tokens in-place). """121last = tokens[-1]122if type(last) is not token or type(tok) is not token:123tokens.append(tok)124else:125text = _unicode(last)126if last.trailing_whitespace:127text += last.trailing_whitespace128text += tok129merged = token(text,130pre_tags=last.pre_tags,131post_tags=tok.post_tags,132trailing_whitespace=tok.trailing_whitespace)133merged.annotation = last.annotation134tokens[-1] = merged135136def markup_serialize_tokens(tokens, markup_func):137"""138Serialize the list of tokens into a list of text chunks, calling139markup_func around text to add annotations.140"""141for token in tokens:142for pre in token.pre_tags:143yield pre144html = token.html()145html = markup_func(html, token.annotation)146if token.trailing_whitespace:147html += token.trailing_whitespace148yield html149for post in token.post_tags:150yield post151152153############################################################154## HTML Diffs155############################################################156157def htmldiff(old_html, new_html):158## FIXME: this should take parsed documents too, and use their body159## or other content.160""" Do a diff of the old and new document. The documents are HTML161*fragments* (str/UTF8 or unicode), they are not complete documents162(i.e., no <html> tag).163164Returns HTML with <ins> and <del> tags added around the165appropriate text.166167Markup is generally ignored, with the markup from new_html168preserved, and possibly some markup from old_html (though it is169considered acceptable to lose some of the old markup). Only the170words in the HTML are diffed. The exception is <img> tags, which171are treated like words, and the href attribute of <a> tags, which172are noted inside the tag itself when there are changes.173"""174old_html_tokens = tokenize(old_html)175new_html_tokens = tokenize(new_html)176result = htmldiff_tokens(old_html_tokens, new_html_tokens)177result = ''.join(result).strip()178return fixup_ins_del_tags(result)179180def htmldiff_tokens(html1_tokens, html2_tokens):181""" Does a diff on the tokens themselves, returning a list of text182chunks (not tokens).183"""184# There are several passes as we do the differences. The tokens185# isolate the portion of the content we care to diff; difflib does186# all the actual hard work at that point.187#188# Then we must create a valid document from pieces of both the old189# document and the new document. We generally prefer to take190# markup from the new document, and only do a best effort attempt191# to keep markup from the old document; anything that we can't192# resolve we throw away. Also we try to put the deletes as close193# to the location where we think they would have been -- because194# we are only keeping the markup from the new document, it can be195# fuzzy where in the new document the old text would have gone.196# Again we just do a best effort attempt.197s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)198commands = s.get_opcodes()199result = []200for command, i1, i2, j1, j2 in commands:201if command == 'equal':202result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))203continue204if command == 'insert' or command == 'replace':205ins_tokens = expand_tokens(html2_tokens[j1:j2])206merge_insert(ins_tokens, result)207if command == 'delete' or command == 'replace':208del_tokens = expand_tokens(html1_tokens[i1:i2])209merge_delete(del_tokens, result)210# If deletes were inserted directly as <del> then we'd have an211# invalid document at this point. Instead we put in special212# markers, and when the complete diffed document has been created213# we try to move the deletes around and resolve any problems.214result = cleanup_delete(result)215216return result217218def expand_tokens(tokens, equal=False):219"""Given a list of tokens, return a generator of the chunks of220text for the data in the tokens.221"""222for token in tokens:223for pre in token.pre_tags:224yield pre225if not equal or not token.hide_when_equal:226if token.trailing_whitespace:227yield token.html() + token.trailing_whitespace228else:229yield token.html()230for post in token.post_tags:231yield post232233def merge_insert(ins_chunks, doc):234""" doc is the already-handled document (as a list of text chunks);235here we add <ins>ins_chunks</ins> to the end of that. """236# Though we don't throw away unbalanced_start or unbalanced_end237# (we assume there is accompanying markup later or earlier in the238# document), we only put <ins> around the balanced portion.239unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)240doc.extend(unbalanced_start)241if doc and not doc[-1].endswith(' '):242# Fix up the case where the word before the insert didn't end with243# a space244doc[-1] += ' '245doc.append('<ins>')246if balanced and balanced[-1].endswith(' '):247# We move space outside of </ins>248balanced[-1] = balanced[-1][:-1]249doc.extend(balanced)250doc.append('</ins> ')251doc.extend(unbalanced_end)252253# These are sentinals to represent the start and end of a <del>254# segment, until we do the cleanup phase to turn them into proper255# markup:256class DEL_START:257pass258class DEL_END:259pass260261class NoDeletes(Exception):262""" Raised when the document no longer contains any pending deletes263(DEL_START/DEL_END) """264265def merge_delete(del_chunks, doc):266""" Adds the text chunks in del_chunks to the document doc (another267list of text chunks) with marker to show it is a delete.268cleanup_delete later resolves these markers into <del> tags."""269doc.append(DEL_START)270doc.extend(del_chunks)271doc.append(DEL_END)272273def cleanup_delete(chunks):274""" Cleans up any DEL_START/DEL_END markers in the document, replacing275them with <del></del>. To do this while keeping the document276valid, it may need to drop some tags (either start or end tags).277278It may also move the del into adjacent tags to try to move it to a279similar location where it was originally located (e.g., moving a280delete into preceding <div> tag, if the del looks like (DEL_START,281'Text</div>', DEL_END)"""282while 1:283# Find a pending DEL_START/DEL_END, splitting the document284# into stuff-preceding-DEL_START, stuff-inside, and285# stuff-following-DEL_END286try:287pre_delete, delete, post_delete = split_delete(chunks)288except NoDeletes:289# Nothing found, we've cleaned up the entire doc290break291# The stuff-inside-DEL_START/END may not be well balanced292# markup. First we figure out what unbalanced portions there are:293unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)294# Then we move the span forward and/or backward based on these295# unbalanced portions:296locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)297locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)298doc = pre_delete299if doc and not doc[-1].endswith(' '):300# Fix up case where the word before us didn't have a trailing space301doc[-1] += ' '302doc.append('<del>')303if balanced and balanced[-1].endswith(' '):304# We move space outside of </del>305balanced[-1] = balanced[-1][:-1]306doc.extend(balanced)307doc.append('</del> ')308doc.extend(post_delete)309chunks = doc310return chunks311312def split_unbalanced(chunks):313"""Return (unbalanced_start, balanced, unbalanced_end), where each is314a list of text and tag chunks.315316unbalanced_start is a list of all the tags that are opened, but317not closed in this span. Similarly, unbalanced_end is a list of318tags that are closed but were not opened. Extracting these might319mean some reordering of the chunks."""320start = []321end = []322tag_stack = []323balanced = []324for chunk in chunks:325if not chunk.startswith('<'):326balanced.append(chunk)327continue328endtag = chunk[1] == '/'329name = chunk.split()[0].strip('<>/')330if name in empty_tags:331balanced.append(chunk)332continue333if endtag:334if tag_stack and tag_stack[-1][0] == name:335balanced.append(chunk)336name, pos, tag = tag_stack.pop()337balanced[pos] = tag338elif tag_stack:339start.extend([tag for name, pos, tag in tag_stack])340tag_stack = []341end.append(chunk)342else:343end.append(chunk)344else:345tag_stack.append((name, len(balanced), chunk))346balanced.append(None)347start.extend(348[chunk for name, pos, chunk in tag_stack])349balanced = [chunk for chunk in balanced if chunk is not None]350return start, balanced, end351352def split_delete(chunks):353""" Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,354stuff_after_DEL_END). Returns the first case found (there may be355more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if356there's no DEL_START found. """357try:358pos = chunks.index(DEL_START)359except ValueError:360raise NoDeletes361pos2 = chunks.index(DEL_END)362return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]363364def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):365""" pre_delete and post_delete implicitly point to a place in the366document (where the two were split). This moves that point (by367popping items from one and pushing them onto the other). It moves368the point to try to find a place where unbalanced_start applies.369370As an example::371372>>> unbalanced_start = ['<div>']373>>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']374>>> pre, post = doc[:3], doc[3:]375>>> pre, post376(['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])377>>> locate_unbalanced_start(unbalanced_start, pre, post)378>>> pre, post379(['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])380381As you can see, we moved the point so that the dangling <div> that382we found will be effectively replaced by the div in the original383document. If this doesn't work out, we just throw away384unbalanced_start without doing anything.385"""386while 1:387if not unbalanced_start:388# We have totally succeeded in finding the position389break390finding = unbalanced_start[0]391finding_name = finding.split()[0].strip('<>')392if not post_delete:393break394next = post_delete[0]395if next is DEL_START or not next.startswith('<'):396# Reached a word, we can't move the delete text forward397break398if next[1] == '/':399# Reached a closing tag, can we go further? Maybe not...400break401name = next.split()[0].strip('<>')402if name == 'ins':403# Can't move into an insert404break405assert name != 'del', (406"Unexpected delete tag: %r" % next)407if name == finding_name:408unbalanced_start.pop(0)409pre_delete.append(post_delete.pop(0))410else:411# Found a tag that doesn't match412break413414def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):415""" like locate_unbalanced_start, except handling end tags and416possibly moving the point earlier in the document. """417while 1:418if not unbalanced_end:419# Success420break421finding = unbalanced_end[-1]422finding_name = finding.split()[0].strip('<>/')423if not pre_delete:424break425next = pre_delete[-1]426if next is DEL_END or not next.startswith('</'):427# A word or a start tag428break429name = next.split()[0].strip('<>/')430if name == 'ins' or name == 'del':431# Can't move into an insert or delete432break433if name == finding_name:434unbalanced_end.pop()435post_delete.insert(0, pre_delete.pop())436else:437# Found a tag that doesn't match438break439440class token(_unicode):441""" Represents a diffable token, generally a word that is displayed to442the user. Opening tags are attached to this token when they are443adjacent (pre_tags) and closing tags that follow the word444(post_tags). Some exceptions occur when there are empty tags445adjacent to a word, so there may be close tags in pre_tags, or446open tags in post_tags.447448We also keep track of whether the word was originally followed by449whitespace, even though we do not want to treat the word as450equivalent to a similar word that does not have a trailing451space."""452453# When this is true, the token will be eliminated from the454# displayed diff if no change has occurred:455hide_when_equal = False456457def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):458obj = _unicode.__new__(cls, text)459460if pre_tags is not None:461obj.pre_tags = pre_tags462else:463obj.pre_tags = []464465if post_tags is not None:466obj.post_tags = post_tags467else:468obj.post_tags = []469470obj.trailing_whitespace = trailing_whitespace471472return obj473474def __repr__(self):475return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,476self.post_tags, self.trailing_whitespace)477478def html(self):479return _unicode(self)480481class tag_token(token):482483""" Represents a token that is actually a tag. Currently this is just484the <img> tag, which takes up visible space just like a word but485is only represented in a document by a tag. """486487def __new__(cls, tag, data, html_repr, pre_tags=None,488post_tags=None, trailing_whitespace=""):489obj = token.__new__(cls, "%s: %s" % (type, data),490pre_tags=pre_tags,491post_tags=post_tags,492trailing_whitespace=trailing_whitespace)493obj.tag = tag494obj.data = data495obj.html_repr = html_repr496return obj497498def __repr__(self):499return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (500self.tag,501self.data,502self.html_repr,503self.pre_tags,504self.post_tags,505self.trailing_whitespace)506def html(self):507return self.html_repr508509class href_token(token):510511""" Represents the href in an anchor tag. Unlike other words, we only512show the href when it changes. """513514hide_when_equal = True515516def html(self):517return ' Link: %s' % self518519def tokenize(html, include_hrefs=True):520"""521Parse the given HTML and returns token objects (words with attached tags).522523This parses only the content of a page; anything in the head is524ignored, and the <head> and <body> elements are themselves525optional. The content is then parsed by lxml, which ensures the526validity of the resulting parsed document (though lxml may make527incorrect guesses when the markup is particular bad).528529<ins> and <del> tags are also eliminated from the document, as530that gets confusing.531532If include_hrefs is true, then the href attribute of <a> tags is533included as a special kind of diffable token."""534if etree.iselement(html):535body_el = html536else:537body_el = parse_html(html, cleanup=True)538# Then we split the document into text chunks for each tag, word, and end tag:539chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)540# Finally re-joining them into token objects:541return fixup_chunks(chunks)542543def parse_html(html, cleanup=True):544"""545Parses an HTML fragment, returning an lxml element. Note that the HTML will be546wrapped in a <div> tag that was not in the original document.547548If cleanup is true, make sure there's no <head> or <body>, and get549rid of any <ins> and <del> tags.550"""551if cleanup:552# This removes any extra markup or structure like <head>:553html = cleanup_html(html)554return fragment_fromstring(html, create_parent=True)555556_body_re = re.compile(r'<body.*?>', re.I|re.S)557_end_body_re = re.compile(r'</body.*?>', re.I|re.S)558_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)559560def cleanup_html(html):561""" This 'cleans' the HTML, meaning that any page structure is removed562(only the contents of <body> are used, if there is any <body).563Also <ins> and <del> tags are removed. """564match = _body_re.search(html)565if match:566html = html[match.end():]567match = _end_body_re.search(html)568if match:569html = html[:match.start()]570html = _ins_del_re.sub('', html)571return html572573574end_whitespace_re = re.compile(r'[ \t\n\r]$')575576def split_trailing_whitespace(word):577"""578This function takes a word, such as 'test\n\n' and returns ('test','\n\n')579"""580stripped_length = len(word.rstrip())581return word[0:stripped_length], word[stripped_length:]582583584def fixup_chunks(chunks):585"""586This function takes a list of chunks and produces a list of tokens.587"""588tag_accum = []589cur_word = None590result = []591for chunk in chunks:592if isinstance(chunk, tuple):593if chunk[0] == 'img':594src = chunk[1]595tag, trailing_whitespace = split_trailing_whitespace(chunk[2])596cur_word = tag_token('img', src, html_repr=tag,597pre_tags=tag_accum,598trailing_whitespace=trailing_whitespace)599tag_accum = []600result.append(cur_word)601602elif chunk[0] == 'href':603href = chunk[1]604cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")605tag_accum = []606result.append(cur_word)607continue608609if is_word(chunk):610chunk, trailing_whitespace = split_trailing_whitespace(chunk)611cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)612tag_accum = []613result.append(cur_word)614615elif is_start_tag(chunk):616tag_accum.append(chunk)617618elif is_end_tag(chunk):619if tag_accum:620tag_accum.append(chunk)621else:622assert cur_word, (623"Weird state, cur_word=%r, result=%r, chunks=%r of %r"624% (cur_word, result, chunk, chunks))625cur_word.post_tags.append(chunk)626else:627assert False628629if not result:630return [token('', pre_tags=tag_accum)]631else:632result[-1].post_tags.extend(tag_accum)633634return result635636637# All the tags in HTML that don't require end tags:638empty_tags = (639'param', 'img', 'area', 'br', 'basefont', 'input',640'base', 'meta', 'link', 'col')641642block_level_tags = (643'address',644'blockquote',645'center',646'dir',647'div',648'dl',649'fieldset',650'form',651'h1',652'h2',653'h3',654'h4',655'h5',656'h6',657'hr',658'isindex',659'menu',660'noframes',661'noscript',662'ol',663'p',664'pre',665'table',666'ul',667)668669block_level_container_tags = (670'dd',671'dt',672'frameset',673'li',674'tbody',675'td',676'tfoot',677'th',678'thead',679'tr',680)681682683def flatten_el(el, include_hrefs, skip_tag=False):684""" Takes an lxml element el, and generates all the text chunks for685that tag. Each start tag is a chunk, each word is a chunk, and each686end tag is a chunk.687688If skip_tag is true, then the outermost container tag is689not returned (just its contents)."""690if not skip_tag:691if el.tag == 'img':692yield ('img', el.get('src'), start_tag(el))693else:694yield start_tag(el)695if el.tag in empty_tags and not el.text and not len(el) and not el.tail:696return697start_words = split_words(el.text)698for word in start_words:699yield html_escape(word)700for child in el:701for item in flatten_el(child, include_hrefs=include_hrefs):702yield item703if el.tag == 'a' and el.get('href') and include_hrefs:704yield ('href', el.get('href'))705if not skip_tag:706yield end_tag(el)707end_words = split_words(el.tail)708for word in end_words:709yield html_escape(word)710711split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)712713def split_words(text):714""" Splits some text into words. Includes trailing whitespace715on each word when appropriate. """716if not text or not text.strip():717return []718719words = split_words_re.findall(text)720return words721722start_whitespace_re = re.compile(r'^[ \t\n\r]')723724def start_tag(el):725"""726The text representation of the start tag for a tag.727"""728return '<%s%s>' % (729el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))730for name, value in el.attrib.items()]))731732def end_tag(el):733""" The text representation of an end tag for a tag. Includes734trailing whitespace when appropriate. """735if el.tail and start_whitespace_re.search(el.tail):736extra = ' '737else:738extra = ''739return '</%s>%s' % (el.tag, extra)740741def is_word(tok):742return not tok.startswith('<')743744def is_end_tag(tok):745return tok.startswith('</')746747def is_start_tag(tok):748return tok.startswith('<') and not tok.startswith('</')749750def fixup_ins_del_tags(html):751""" Given an html string, move any <ins> or <del> tags inside of any752block-level elements, e.g. transform <ins><p>word</p></ins> to753<p><ins>word</ins></p> """754doc = parse_html(html, cleanup=False)755_fixup_ins_del_tags(doc)756html = serialize_html_fragment(doc, skip_outer=True)757return html758759def serialize_html_fragment(el, skip_outer=False):760""" Serialize a single lxml element as HTML. The serialized form761includes the elements tail.762763If skip_outer is true, then don't serialize the outermost tag764"""765assert not isinstance(el, basestring), (766"You should pass in an element, not a string like %r" % el)767html = etree.tostring(el, method="html", encoding=_unicode)768if skip_outer:769# Get rid of the extra starting tag:770html = html[html.find('>')+1:]771# Get rid of the extra end tag:772html = html[:html.rfind('<')]773return html.strip()774else:775return html776777def _fixup_ins_del_tags(doc):778"""fixup_ins_del_tags that works on an lxml document in-place779"""780for tag in ['ins', 'del']:781for el in doc.xpath('descendant-or-self::%s' % tag):782if not _contains_block_level_tag(el):783continue784_move_el_inside_block(el, tag=tag)785el.drop_tag()786#_merge_element_contents(el)787788def _contains_block_level_tag(el):789"""True if the element contains any block-level elements, like <p>, <td>, etc.790"""791if el.tag in block_level_tags or el.tag in block_level_container_tags:792return True793for child in el:794if _contains_block_level_tag(child):795return True796return False797798def _move_el_inside_block(el, tag):799""" helper for _fixup_ins_del_tags; actually takes the <ins> etc tags800and moves them inside any block-level tags. """801for child in el:802if _contains_block_level_tag(child):803break804else:805# No block-level tags in any child806children_tag = etree.Element(tag)807children_tag.text = el.text808el.text = None809children_tag.extend(list(el))810el[:] = [children_tag]811return812for child in list(el):813if _contains_block_level_tag(child):814_move_el_inside_block(child, tag)815if child.tail:816tail_tag = etree.Element(tag)817tail_tag.text = child.tail818child.tail = None819el.insert(el.index(child)+1, tail_tag)820else:821child_tag = etree.Element(tag)822el.replace(child, child_tag)823child_tag.append(child)824if el.text:825text_tag = etree.Element(tag)826text_tag.text = el.text827el.text = None828el.insert(0, text_tag)829830def _merge_element_contents(el):831"""832Removes an element, but merges its contents into its place, e.g.,833given <p>Hi <i>there!</i></p>, if you remove the <i> element you get834<p>Hi there!</p>835"""836parent = el.getparent()837text = el.text or ''838if el.tail:839if not len(el):840text += el.tail841else:842if el[-1].tail:843el[-1].tail += el.tail844else:845el[-1].tail = el.tail846index = parent.index(el)847if text:848if index == 0:849previous = None850else:851previous = parent[index-1]852if previous is None:853if parent.text:854parent.text += text855else:856parent.text = text857else:858if previous.tail:859previous.tail += text860else:861previous.tail = text862parent[index:index+1] = el.getchildren()863864class InsensitiveSequenceMatcher(difflib.SequenceMatcher):865"""866Acts like SequenceMatcher, but tries not to find very small equal867blocks amidst large spans of changes868"""869870threshold = 2871872def get_matching_blocks(self):873size = min(len(self.b), len(self.b))874threshold = min(self.threshold, size / 4)875actual = difflib.SequenceMatcher.get_matching_blocks(self)876return [item for item in actual877if item[2] > threshold878or not item[2]]879880if __name__ == '__main__':881from lxml.html import _diffcommand882_diffcommand.main()883884885886