Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/lxml/html/diff.py
811 views
1
# cython: language_level=3
2
3
from __future__ import absolute_import
4
5
import difflib
6
from lxml import etree
7
from lxml.html import fragment_fromstring
8
import re
9
10
__all__ = ['html_annotate', 'htmldiff']
11
12
# --- Python 2/3 compatibility shims ---------------------------------------

# ``html.escape`` only exists on Python 3; Python 2 offers ``cgi.escape``.
try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape

# ``unicode`` is the Python 2 text type; on Python 3 text is plain ``str``.
try:
    _unicode = unicode
except NameError:
    _unicode = str

# ``basestring`` is a Python 2 builtin; fall back to ``str`` when it is absent.
try:
    basestring
except NameError:
    basestring = str
26
27
############################################################
28
## Annotation
29
############################################################
30
31
def default_markup(text, version):
    """Wrap *text* in a ``<span>`` whose ``title`` attribute names *version*.

    ``version`` is converted to text and HTML-escaped (quotes included)
    before it is placed in the attribute; ``text`` is inserted unchanged.
    """
    title = html_escape(_unicode(version), 1)
    return '<span title="%s">%s</span>' % (title, text)
34
35
def html_annotate(doclist, markup=default_markup):
    """
    Annotate the newest document with the version each word first appeared in.

    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # Strategy: turn every version into tokens (words with attached
    # markup), each initially annotated with its own version.  Walking the
    # versions pairwise, tokens that the diff reports as unchanged inherit
    # the annotation of the older version, so a token ends up labelled
    # with the version in which it first appeared.
    versions = [tokenize_annotated(doc, version)
                for doc, version in doclist]
    previous = versions[0]
    for current in versions[1:]:
        html_annotate_merge_annotations(previous, current)
        previous = current

    # Adjacent tokens that share an annotation are merged into one span,
    # and only then is the markup applied.
    merged = compress_tokens(previous)
    pieces = markup_serialize_tokens(merged, markup)
    return ''.join(pieces).strip()
74
75
def tokenize_annotated(doc, annotation):
    """Tokenize *doc* (without href tokens) and set ``annotation`` on every token.

    Returns the list of tokens produced by ``tokenize``.
    """
    annotated = tokenize(doc, include_hrefs=False)
    for item in annotated:
        item.annotation = annotation
    return annotated
82
83
def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Carry annotations over from tokens_old to the matching tokens_new.

    Only the runs that the sequence matcher reports as ``'equal'`` are
    touched; tokens that are new in tokens_new keep their own annotation.
    """
    matcher = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    for opcode, old_lo, old_hi, new_lo, new_hi in matcher.get_opcodes():
        if opcode != 'equal':
            continue
        copy_annotations(tokens_old[old_lo:old_hi],
                         tokens_new[new_lo:new_hi])
95
96
def copy_annotations(src, dest):
    """
    Assign each token in dest the annotation of the token at the same
    position in src.  Both sequences must have the same length.
    """
    assert len(src) == len(dest)
    for source_token, target_token in zip(src, dest):
        target_token.annotation = source_token.annotation
103
104
def compress_tokens(tokens):
    """
    Merge neighbouring tokens that have no HTML between them and carry
    the same annotation.  Returns a new list; tokens must be non-empty.
    """
    compressed = [tokens[0]]
    for current in tokens[1:]:
        previous = compressed[-1]
        if previous.post_tags or current.pre_tags:
            # Markup sits between the two words, so they stay separate.
            compressed.append(current)
        elif previous.annotation == current.annotation:
            compress_merge_back(compressed, current)
        else:
            compressed.append(current)
    return compressed
118
119
def compress_merge_back(tokens, tok):
    """ Fold tok into the final element of tokens, changing the list
    in-place.  Only plain ``token`` instances are merged; if either side
    is a subclass, tok is simply appended. """
    last = tokens[-1]
    if type(last) is token and type(tok) is token:
        pieces = [_unicode(last)]
        if last.trailing_whitespace:
            pieces.append(last.trailing_whitespace)
        pieces.append(tok)
        # The merged token spans from the first word's opening tags to the
        # second word's closing tags and trailing whitespace.
        merged = token(''.join(pieces),
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged
    else:
        tokens.append(tok)
136
137
def markup_serialize_tokens(tokens, markup_func):
    """
    Generate the text chunks for tokens: each token's pre_tags, then its
    HTML passed through markup_func(html, annotation) with any trailing
    whitespace appended, then its post_tags.
    """
    for tok in tokens:
        for pre_tag in tok.pre_tags:
            yield pre_tag
        marked = markup_func(tok.html(), tok.annotation)
        whitespace = tok.trailing_whitespace
        # Whitespace stays outside the markup added by markup_func.
        yield marked + whitespace if whitespace else marked
        for post_tag in tok.post_tags:
            yield post_tag
152
153
154
############################################################
155
## HTML Diffs
156
############################################################
157
158
def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    chunks = htmldiff_tokens(tokenize(old_html), tokenize(new_html))
    joined = ''.join(chunks).strip()
    return fixup_ins_del_tags(joined)
180
181
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Diff two token lists and return a list of text chunks (not
    tokens) with <ins>/<del> markup added.
    """
    # The tokens isolate the part of the content worth diffing; difflib
    # does the actual comparison.  The output document is then assembled
    # from pieces of both inputs: markup from the new document is
    # preferred, markup from the old one is kept on a best-effort basis,
    # and deletions are placed as close as possible to where they
    # probably were.
    matcher = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    result = []
    for command, i1, i2, j1, j2 in matcher.get_opcodes():
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        # A 'replace' is handled as an insert followed by a delete.
        if command in ('insert', 'replace'):
            merge_insert(expand_tokens(html2_tokens[j1:j2]), result)
        if command in ('delete', 'replace'):
            merge_delete(expand_tokens(html1_tokens[i1:i2]), result)

    # Writing <del> directly could produce an invalid document, so
    # deletions were recorded with marker objects; resolve them now that
    # the whole document is available.
    return cleanup_delete(result)
218
219
def expand_tokens(tokens, equal=False):
    """Generate the text chunks (tags and words) for a list of tokens.

    When equal is true, tokens flagged ``hide_when_equal`` contribute
    only their surrounding tags, not their own text.
    """
    for tok in tokens:
        for pre_tag in tok.pre_tags:
            yield pre_tag
        hidden = equal and tok.hide_when_equal
        if not hidden:
            text = tok.html()
            if tok.trailing_whitespace:
                text = text + tok.trailing_whitespace
            yield text
        for post_tag in tok.post_tags:
            yield post_tag
233
234
def merge_insert(ins_chunks, doc):
    """ Append ins_chunks to doc (the list of text chunks handled so
    far), wrapping the balanced part in <ins>...</ins>. """
    # Unbalanced opening/closing tags are kept (their counterparts are
    # assumed to exist elsewhere in the document) but stay outside <ins>.
    opening, body, closing = split_unbalanced(ins_chunks)
    doc.extend(opening)
    if doc:
        preceding = doc[-1]
        if not preceding.endswith(' '):
            # The word before the insert had no trailing space; add one.
            doc[-1] = preceding + ' '
    if body:
        final = body[-1]
        if final.endswith(' '):
            # The trailing space is emitted after </ins> instead.
            body[-1] = final[:-1]
    doc.append('<ins>')
    doc.extend(body)
    doc.append('</ins> ')
    doc.extend(closing)
253
254
# Sentinels marking the start and end of a pending <del> segment.  They
# are placed in the chunk list as-is (the classes themselves, never
# instances) and turned into real markup by the cleanup phase.
class DEL_START:
    pass


class DEL_END:
    pass


class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """
265
266
def merge_delete(del_chunks, doc):
    """ Append del_chunks to doc (a list of text chunks), bracketed by
    the DEL_START/DEL_END markers.  cleanup_delete later turns the
    markers into <del> tags."""
    doc.append(DEL_START)
    for chunk in del_chunks:
        doc.append(chunk)
    doc.append(DEL_END)
273
274
def cleanup_delete(chunks):
    """ Replace every DEL_START/DEL_END marker pair in chunks with
    <del></del>.  To keep the document valid this may drop some tags
    (either start or end tags).

    It may also shift the delete into adjacent tags so that it lands
    near its original location (e.g., moving a delete into a preceding
    <div> tag, if the del looks like (DEL_START, 'Text</div>',
    DEL_END))."""
    while True:
        # Split at the first pending delete: what precedes DEL_START,
        # what is inside, and what follows DEL_END.
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # No markers left: the whole document has been cleaned up.
            return chunks
        # The deleted span may contain unbalanced markup; separate it out.
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Move the insertion point forward and/or backward to match the
        # unbalanced tags against the surrounding document.
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        rebuilt = pre_delete
        if rebuilt and not rebuilt[-1].endswith(' '):
            # The preceding word had no trailing space; add one.
            rebuilt[-1] += ' '
        rebuilt.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # The trailing space is emitted after </del> instead.
            balanced[-1] = balanced[-1][:-1]
        rebuilt.extend(balanced)
        rebuilt.append('</del> ')
        rebuilt.extend(post_delete)
        chunks = rebuilt
312
313
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start holds the tags that are opened but not closed
    within this span; unbalanced_end holds the tags that are closed but
    were not opened.  Pulling these out may reorder the chunks."""
    start = []
    end = []
    # Open tags still awaiting their end tag: (name, slot in balanced, chunk).
    open_tags = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        is_closing = chunk[1] == '/'
        tag_name = chunk.split()[0].strip('<>/')
        if tag_name in empty_tags:
            # Tags without an end tag can never be unbalanced.
            balanced.append(chunk)
        elif not is_closing:
            # Reserve a slot; it is filled only if the end tag shows up.
            open_tags.append((tag_name, len(balanced), chunk))
            balanced.append(None)
        elif open_tags and open_tags[-1][0] == tag_name:
            balanced.append(chunk)
            _, slot, opener = open_tags.pop()
            balanced[slot] = opener
        else:
            # An end tag that does not close the innermost open tag: any
            # tags still open become unbalanced starts, and this tag is
            # an unbalanced end.
            start.extend([opener for _, _, opener in open_tags])
            open_tags = []
            end.append(chunk)
    start.extend([opener for _, _, opener in open_tags])
    # Drop the slots reserved for tags that were never closed.
    kept = [chunk for chunk in balanced if chunk is not None]
    return start, kept, end
352
353
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END) for the first marker pair found (later
    DEL_STARTs may remain in stuff_after_DEL_END).  Raises NoDeletes
    when chunks contains no DEL_START. """
    if DEL_START not in chunks:
        raise NoDeletes
    begin = chunks.index(DEL_START)
    finish = chunks.index(DEL_END)
    before = chunks[:begin]
    inside = chunks[begin + 1:finish]
    after = chunks[finish + 1:]
    return before, inside, after
364
365
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    # The loop ends normally once every unbalanced tag has been matched.
    while unbalanced_start:
        wanted_name = unbalanced_start[0].split()[0].strip('<>')
        if not post_delete:
            return
        candidate = post_delete[0]
        if candidate is DEL_START or not candidate.startswith('<'):
            # Reached a word (or another pending delete): cannot move
            # the deleted text any further forward.
            return
        if candidate[1] == '/':
            # Reached a closing tag; do not try to go beyond it.
            return
        candidate_name = candidate.split()[0].strip('<>')
        if candidate_name == 'ins':
            # Can't move into an insert.
            return
        assert candidate_name != 'del', (
            "Unexpected delete tag: %r" % candidate)
        if candidate_name != wanted_name:
            # Found a tag that doesn't match.
            return
        unbalanced_start.pop(0)
        pre_delete.append(post_delete.pop(0))
414
415
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ Counterpart of locate_unbalanced_start for end tags: tries to
    move the split point earlier in the document, shifting matching end
    tags from the tail of pre_delete to the head of post_delete. """
    # The loop ends normally once every unbalanced end tag has been matched.
    while unbalanced_end:
        wanted_name = unbalanced_end[-1].split()[0].strip('<>/')
        if not pre_delete:
            return
        candidate = pre_delete[-1]
        if candidate is DEL_END or not candidate.startswith('</'):
            # A word or a start tag: cannot move further back.
            return
        candidate_name = candidate.split()[0].strip('<>/')
        if candidate_name == 'ins' or candidate_name == 'del':
            # Can't move into an insert or delete.
            return
        if candidate_name != wanted_name:
            # Found a tag that doesn't match.
            return
        unbalanced_end.pop()
        post_delete.insert(0, pre_delete.pop())
440
441
class token(_unicode):
    """ A diffable token, generally a word that is displayed to the
    user.  Adjacent opening tags are attached to it as pre_tags and the
    closing tags that follow the word as post_tags.  Because of empty
    tags next to a word, pre_tags may contain close tags and post_tags
    may contain open tags.

    The whitespace that originally followed the word is stored
    separately in trailing_whitespace, so that it does not take part in
    comparing the word with the same word lacking a trailing space."""

    # When true, the token is left out of the displayed diff if it has
    # not changed.
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)
        # A fresh list per token when none is supplied; a supplied list
        # is stored as-is (not copied).
        obj.pre_tags = [] if pre_tags is None else pre_tags
        obj.post_tags = [] if post_tags is None else post_tags
        obj.trailing_whitespace = trailing_whitespace
        return obj

    def __repr__(self):
        text_repr = _unicode.__repr__(self)
        return 'token(%s, %r, %r, %r)' % (
            text_repr, self.pre_tags, self.post_tags,
            self.trailing_whitespace)

    def html(self):
        """Return the text to emit for this token (the word itself)."""
        return _unicode(self)
481
482
class tag_token(token):
    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.

    The string value of the token (used when diffing) is
    ``"<tag>: <data>"``; ``html()`` returns ``html_repr``, the serialized
    tag that is emitted in the output. """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        # Build the comparison text from the ``tag`` argument.  The
        # previous code interpolated the builtin ``type`` here, so every
        # token's text started with the repr of ``type`` rather than the
        # tag name.
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        # Labels are listed in the same order as the values (previously
        # the pre_tags/post_tags labels were swapped).
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        """Return the serialized tag to emit for this token."""
        return self.html_repr
509
510
class href_token(token):
    """ The href of an anchor tag, treated as a diffable token.  Unlike
    ordinary words it is only displayed when it has changed. """

    # Unchanged hrefs are dropped from the displayed diff.
    hide_when_equal = True

    def html(self):
        """Return the href rendered as visible ' Link: ...' text."""
        return ' Link: %s' % self
519
520
def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    html may be an lxml element, which is used directly, or an HTML
    string, which is cleaned up (page structure and <ins>/<del> tags
    removed) and parsed as a fragment.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Flatten the tree into chunks (start tags, words, end tags), then
    # regroup the chunks into tokens.
    flattened = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    return fixup_chunks(flattened)
543
544
def parse_html(html, cleanup=True):
    """
    Parse an HTML fragment into an lxml element.  The fragment is parsed
    with ``create_parent=True``, so the result is a wrapper element that
    was not part of the original document.

    If cleanup is true, the HTML is first passed through cleanup_html
    (drops anything outside <body> and removes <ins>/<del> tags).
    """
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
556
557
# Opening <body ...> tag, closing </body> tag, and any <ins>/<del> tag
# (opening or closing); all case-insensitive, '.' also matches newlines.
_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)


def cleanup_html(html):
    """ 'Clean' the HTML: strip the page structure (only the contents of
    <body> are kept, if there is a <body>) and remove <ins> and <del>
    tags (the tags only; their contents stay). """
    opening = _body_re.search(html)
    if opening is not None:
        html = html[opening.end():]
    closing = _end_body_re.search(html)
    if closing is not None:
        html = html[:closing.start()]
    return _ins_del_re.sub('', html)
573
574
575
# Matches a single whitespace character at the very end of a string.
end_whitespace_re = re.compile(r'[ \t\n\r]$')


def split_trailing_whitespace(word):
    """
    Split a word such as 'test\\n\\n' into its text and its trailing
    whitespace, here ('test', '\\n\\n').
    """
    stripped = word.rstrip()
    return stripped, word[len(stripped):]
583
584
585
def fixup_chunks(chunks):
    """
    Turn a sequence of chunks (tag strings, word strings, and
    ('img', src, tag) / ('href', url) tuples) into a list of tokens.
    """
    pending_tags = []   # tags seen since the last word, not yet attached
    cur_word = None     # most recently created token
    result = []
    for chunk in chunks:
        new_token = None
        if isinstance(chunk, tuple):
            kind = chunk[0]
            if kind == 'img':
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                new_token = tag_token('img', chunk[1], html_repr=tag,
                                      pre_tags=pending_tags,
                                      trailing_whitespace=trailing_whitespace)
            elif kind == 'href':
                new_token = href_token(chunk[1], pre_tags=pending_tags,
                                       trailing_whitespace=" ")
            # Tuples of any other kind are ignored.
        elif is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            new_token = token(chunk, pre_tags=pending_tags,
                              trailing_whitespace=trailing_whitespace)
        elif is_start_tag(chunk):
            pending_tags.append(chunk)
        elif is_end_tag(chunk):
            if pending_tags:
                # Tags are already queued for the next word; keep order.
                pending_tags.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

        if new_token is not None:
            # The queued tags now belong to this token; start a new queue.
            cur_word = new_token
            pending_tags = []
            result.append(new_token)

    if not result:
        # No words at all: carry the tags on a single empty token.
        return [token('', pre_tags=pending_tags)]
    # Tags left over after the last word are attached to it.
    result[-1].post_tags.extend(pending_tags)
    return result
636
637
638
# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont',
    'input', 'base', 'meta', 'link', 'col',
)

# Block-level elements.
block_level_tags = (
    'address', 'blockquote', 'center', 'dir', 'div', 'dl',
    'fieldset', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'isindex', 'menu', 'noframes', 'noscript',
    'ol', 'p', 'pre', 'table', 'ul',
)

# Elements that are treated like block-level tags as well (list items,
# table parts, ...).
block_level_container_tags = (
    'dd', 'dt', 'frameset', 'li', 'tbody',
    'td', 'tfoot', 'th', 'thead', 'tr',
)
682
683
684
def flatten_el(el, include_hrefs, skip_tag=False):
    """ Generate the text chunks for the lxml element el: one chunk per
    start tag, per word and per end tag.  <img> elements yield an
    ('img', src, start_tag) tuple instead of a plain start tag, and,
    when include_hrefs is true, <a href> elements yield an
    ('href', url) tuple just before their end tag.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    emit_own_tags = not skip_tag
    if emit_own_tags:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    is_bare_empty_tag = (el.tag in empty_tags and not el.text
                         and not len(el) and not el.tail)
    if is_bare_empty_tag:
        # An empty tag with no content and no tail: nothing more to emit.
        return
    for word in split_words(el.text):
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if emit_own_tags:
        yield end_tag(el)
        # The tail text belongs to the parent but is emitted right after
        # this element's end tag.
        for word in split_words(el.tail):
            yield html_escape(word)
711
712
# A run of non-whitespace followed by the whitespace after it (or the end
# of the text).
split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)


def split_words(text):
    """ Split text into a list of words, each keeping the whitespace that
    follows it.  Returns an empty list for empty, None or
    whitespace-only text. """
    has_content = bool(text) and bool(text.strip())
    if not has_content:
        return []
    return split_words_re.findall(text)
722
723
# Matches a single whitespace character at the very start of a string.
start_whitespace_re = re.compile(r'^[ \t\n\r]')


def start_tag(el):
    """
    Return the text of the start tag of el, including its attributes
    (attribute values are HTML-escaped, quotes included).
    """
    attributes = ''.join(
        [' %s="%s"' % (name, html_escape(value, True))
         for name, value in el.attrib.items()])
    return '<%s%s>' % (el.tag, attributes)


def end_tag(el):
    """ Return the text of the end tag of el.  A single space is
    appended when the element's tail starts with whitespace. """
    tail = el.tail
    spacer = ' ' if tail and start_whitespace_re.search(tail) else ''
    return '</%s>%s' % (el.tag, spacer)
741
742
def is_word(tok):
    """Return True when the chunk is text, i.e. does not start with '<'."""
    return tok[:1] != '<'
744
745
def is_end_tag(tok):
    """Return True when the chunk is an end tag, i.e. starts with '</'."""
    return tok[:2] == '</'
747
748
def is_start_tag(tok):
    """Return True when the chunk starts with '<' but not with '</'."""
    if tok[:1] != '<':
        return False
    return tok[:2] != '</'
750
751
def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p>.  Returns the resulting HTML string. """
    # Parsed without cleanup: the <ins>/<del> tags are what we operate on.
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    return serialize_html_fragment(doc, skip_outer=True)
759
760
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML text.  The serialized
    form includes the element's tail.

    If skip_outer is true, the outermost start and end tags are cut off
    and the remaining text is stripped of surrounding whitespace.
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    serialized = etree.tostring(el, method="html", encoding=_unicode)
    if not skip_outer:
        return serialized
    # Drop everything up to the first '>' (the outer start tag) ...
    without_open = serialized[serialized.find('>') + 1:]
    # ... and everything from the last '<' on (the outer end tag).
    without_close = without_open[:without_open.rfind('<')]
    return without_close.strip()
777
778
def _fixup_ins_del_tags(doc):
    """In-place variant of fixup_ins_del_tags for a parsed lxml document.

    Every <ins>/<del> element that contains a block-level tag is pushed
    down inside the block-level elements, and the original element's own
    tag is then dropped.
    """
    for tag in ('ins', 'del'):
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if _contains_block_level_tag(el):
                _move_el_inside_block(el, tag=tag)
                el.drop_tag()
788
789
def _contains_block_level_tag(el):
    """True if el itself, or any of its descendants, is a block-level
    element such as <p> or <td>.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    return any(_contains_block_level_tag(child) for child in el)
798
799
def _move_el_inside_block(el, tag):
    """ Helper for _fixup_ins_del_tags: wraps the inline content of el
    in new <tag> elements, recursing into children that contain
    block-level tags, so the <ins>/<del> markup ends up inside them. """
    if not any(_contains_block_level_tag(child) for child in el):
        # No block-level tags in any child: wrap all of el's content
        # (text and children) in a single new element.
        wrapper = etree.Element(tag)
        wrapper.text = el.text
        el.text = None
        wrapper.extend(list(el))
        el[:] = [wrapper]
        return
    # Iterate over a copy, since el's children are modified on the way.
    for child in list(el):
        if not _contains_block_level_tag(child):
            # Purely inline child: wrap it as a whole.
            wrapper = etree.Element(tag)
            el.replace(child, wrapper)
            wrapper.append(child)
            continue
        _move_el_inside_block(child, tag)
        if child.tail:
            # Text following the block child gets its own wrapper,
            # inserted right after the child.
            tail_wrapper = etree.Element(tag)
            tail_wrapper.text = child.tail
            child.tail = None
            el.insert(el.index(child) + 1, tail_wrapper)
    if el.text:
        # Leading text of el becomes a wrapped first child.
        text_wrapper = etree.Element(tag)
        text_wrapper.text = el.text
        el.text = None
        el.insert(0, text_wrapper)
830
831
def _merge_element_contents(el):
    """
    Remove an element but keep its contents in its place, e.g., given
    <p>Hi <i>there!</i></p>, removing the <i> element gives
    <p>Hi there!</p>
    """
    parent = el.getparent()
    leading_text = el.text or ''
    if el.tail:
        if len(el):
            # The tail goes after the last child.
            last_child = el[-1]
            last_child.tail = (last_child.tail or '') + el.tail
        else:
            # No children: the tail simply follows the element's text.
            leading_text += el.tail
    index = parent.index(el)
    if leading_text:
        # The text is attached to whatever precedes el: the previous
        # sibling's tail, or the parent's text when el is the first child.
        if index == 0:
            parent.text = (parent.text or '') + leading_text
        else:
            previous = parent[index - 1]
            previous.tail = (previous.tail or '') + leading_text
    parent[index:index + 1] = el.getchildren()
864
865
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes.

    Matching blocks whose size does not exceed a threshold are dropped.
    The threshold is ``threshold`` (class attribute, default 2), lowered
    to a quarter of the shorter sequence's length when that is smaller,
    so that short inputs can still produce matches.
    """

    threshold = 2

    def get_matching_blocks(self):
        """Return the matching blocks of the base class, minus the small ones.

        Zero-size blocks (the terminating sentinel) are always kept.
        """
        # Use the shorter of the two sequences.  The previous code took
        # min(len(self.b), len(self.b)), which ignored ``a`` entirely.
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
880
881
if __name__ == '__main__':
    # Executed as a script: hand over to the command-line diff tool.
    from lxml.html import _diffcommand
    _diffcommand.main()
884
885
886