CoCalc -- diff.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/lxml/html/diff.py
⁸¹¹ views
1
# cython: language_level=3
2

3
from __future__ import absolute_import
4

5
import difflib
6
from lxml import etree
7
from lxml.html import fragment_fromstring
8
import re
9

10
__all__ = ['html_annotate', 'htmldiff']
11

12
# Python 2/3 compatibility shims.

# HTML escaping helper: ``html.escape`` where available, ``cgi.escape``
# on interpreters that lack the ``html`` module.
try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape

# Text type: ``unicode`` when the interpreter defines it, else ``str``.
try:
    _unicode = unicode
except NameError:
    _unicode = str

# String base type: alias ``basestring`` to ``str`` when it does not exist.
try:
    basestring
except NameError:
    basestring = str
26

27
############################################################
28
## Annotation
29
############################################################
30

31
def default_markup(text, version):
    """Wrap ``text`` in a <span> whose title attribute names ``version``.

    ``version`` is converted to text and HTML-escaped (quotes included);
    ``text`` is inserted unchanged.
    """
    title = html_escape(_unicode(version), 1)
    return '<span title="%s">%s</span>' % (title, text)
34

35
def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>

    An empty doclist produces an empty string.
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    if not tokenlist:
        # Nothing to annotate; avoid indexing into an empty list below.
        return ''

    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        # Tokens that already existed keep the annotation of the older
        # version; only genuinely new tokens keep their own.
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()
74

75
def tokenize_annotated(doc, annotation):
    """Tokenize ``doc`` (without href tokens) and set ``annotation`` on
    every resulting token.  Returns the list of tokens.
    """
    annotated = tokenize(doc, include_hrefs=False)
    for item in annotated:
        item.annotation = annotation
    return annotated
82

83
def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Carry annotations over from tokens_old to tokens_new for every run
    of tokens that the sequence matcher reports as equal in both lists.
    tokens_new is modified in place.
    """
    matcher = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    for opcode, old_lo, old_hi, new_lo, new_hi in matcher.get_opcodes():
        if opcode != 'equal':
            continue
        copy_annotations(tokens_old[old_lo:old_hi],
                         tokens_new[new_lo:new_hi])
95

96
def copy_annotations(src, dest):
    """
    Pairwise copy the ``annotation`` attribute from each token in src
    onto the token at the same position in dest.  Both sequences must
    have the same length.
    """
    assert len(src) == len(dest)
    for source_token, target_token in zip(src, dest):
        target_token.annotation = source_token.annotation
103

104
def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation.

    Returns a new list; an empty input yields an empty list.
    """
    if not tokens:
        # Guard the tokens[0] access below.
        return []
    result = [tokens[0]]
    for tok in tokens[1:]:
        previous = result[-1]
        # Merging is only possible when no tag sits between the two
        # words and both carry the same annotation.
        if (not previous.post_tags and
                not tok.pre_tags and
                previous.annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
118

119
def compress_merge_back(tokens, tok):
    """ Fold tok into the final element of tokens, modifying the list
    in place.  If either one is not exactly a ``token`` (e.g. it is a
    subclass instance), tok is simply appended instead.  """
    previous = tokens[-1]
    if not (type(previous) is token and type(tok) is token):
        tokens.append(tok)
        return
    # Join the two words, keeping the whitespace that separated them.
    combined = _unicode(previous)
    if previous.trailing_whitespace:
        combined += previous.trailing_whitespace
    combined += tok
    replacement = token(combined,
                        pre_tags=previous.pre_tags,
                        post_tags=tok.post_tags,
                        trailing_whitespace=tok.trailing_whitespace)
    replacement.annotation = previous.annotation
    tokens[-1] = replacement
136
    
137
def markup_serialize_tokens(tokens, markup_func):
    """
    Generate the text chunks for a list of tokens.  For each token this
    yields its pre_tags, then its html() passed through
    markup_func(html, annotation) with any trailing whitespace appended
    after the markup, then its post_tags.
    """
    for tok in tokens:
        for opening in tok.pre_tags:
            yield opening
        annotated = markup_func(tok.html(), tok.annotation)
        if tok.trailing_whitespace:
            # Whitespace stays outside the markup wrapper.
            annotated += tok.trailing_whitespace
        yield annotated
        for closing in tok.post_tags:
            yield closing
152

153

154
############################################################
155
## HTML Diffs
156
############################################################
157

158
def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_tokens = tokenize(old_html)
    new_tokens = tokenize(new_html)
    pieces = htmldiff_tokens(old_tokens, new_tokens)
    merged = ''.join(pieces).strip()
    return fixup_ins_del_tags(merged)
180

181
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Diff two token lists and return the merged document as a list
    of text chunks (not tokens).
    """
    # The tokens isolate the portion of the content we care to diff;
    # difflib does the actual comparison.  From its opcodes we assemble
    # a document out of pieces of both versions.  Markup from the new
    # document is preferred; markup from the old document is kept only
    # on a best-effort basis, and whatever can't be resolved is thrown
    # away.  Deletes are placed as close as possible to where the old
    # text would have been, which is necessarily fuzzy because only the
    # new document's markup is kept.
    matcher = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    doc = []
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == 'equal':
            doc.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        # A 'replace' is handled as an insert followed by a delete.
        if opcode in ('insert', 'replace'):
            merge_insert(expand_tokens(html2_tokens[j1:j2]), doc)
        if opcode in ('delete', 'replace'):
            merge_delete(expand_tokens(html1_tokens[i1:i2]), doc)
    # Deletes were recorded with special markers rather than <del> tags,
    # since inserting <del> directly could produce an invalid document.
    # cleanup_delete moves them around and resolves any problems.
    return cleanup_delete(doc)
218

219
def expand_tokens(tokens, equal=False):
    """Generate the text chunks for a list of tokens: each token's
    pre_tags, its html() (with trailing whitespace attached), and its
    post_tags.

    When ``equal`` is true, tokens flagged ``hide_when_equal`` contribute
    only their tags, not their own text.
    """
    for tok in tokens:
        for opening in tok.pre_tags:
            yield opening
        if not (equal and tok.hide_when_equal):
            if tok.trailing_whitespace:
                yield tok.html() + tok.trailing_whitespace
            else:
                yield tok.html()
        for closing in tok.post_tags:
            yield closing
233

234
def merge_insert(ins_chunks, doc):
    """ Append <ins>ins_chunks</ins> to doc, the already-handled
    document (a list of text chunks).  doc is modified in place.  """
    # Unbalanced tags at either edge are kept (we assume matching markup
    # exists earlier or later in the document), but only the balanced
    # portion is wrapped in <ins>.
    leading, body, trailing = split_unbalanced(ins_chunks)
    doc.extend(leading)
    if doc and not doc[-1].endswith(' '):
        # The word before the insert had no trailing space; add one.
        doc[-1] += ' '
    doc.append('<ins>')
    if body and body[-1].endswith(' '):
        # The space is moved outside of </ins>.
        body[-1] = body[-1][:-1]
    doc.extend(body)
    doc.append('</ins> ')
    doc.extend(trailing)
253

254
# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    """Sentinel chunk marking where a pending delete begins."""


class DEL_END:
    """Sentinel chunk marking where a pending delete ends."""


class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """
265

266
def merge_delete(del_chunks, doc):
    """ Append del_chunks to doc (a list of text chunks), bracketed by
    the DEL_START and DEL_END markers.  cleanup_delete later turns
    these markers into <del> tags."""
    doc.append(DEL_START)
    for piece in del_chunks:
        doc.append(piece)
    doc.append(DEL_END)
273

274
def cleanup_delete(chunks):
    """ Replaces every DEL_START/DEL_END marker pair in the document
    with <del></del>.  To keep the document valid, some tags (start or
    end tags) inside the deleted span may be dropped.

    The del may also be moved into adjacent tags so that it ends up in
    a location similar to where it originally was (e.g., moving a
    delete into a preceding <div> tag, if the del looks like
    (DEL_START, 'Text</div>', DEL_END)).

    Returns the resulting list of chunks."""
    while True:
        # Split at the first pending delete into what precedes
        # DEL_START, what is inside, and what follows DEL_END.
        try:
            before, deleted, after = split_delete(chunks)
        except NoDeletes:
            # No markers left: the whole document is cleaned up.
            return chunks
        # The deleted span may not be well-balanced markup; separate the
        # dangling open/close tags from the balanced part.
        dangling_open, body, dangling_close = split_unbalanced(deleted)
        # Shift the split point forward and/or backward according to
        # those dangling tags.
        locate_unbalanced_start(dangling_open, before, after)
        locate_unbalanced_end(dangling_close, before, after)
        if before and not before[-1].endswith(' '):
            # The word before the delete had no trailing space; add one.
            before[-1] += ' '
        before.append('<del>')
        if body and body[-1].endswith(' '):
            # The space is moved outside of </del>.
            body[-1] = body[-1][:-1]
        before.extend(body)
        before.append('</del> ')
        before.extend(after)
        chunks = before
312

313
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start holds the tags that are opened but not closed
    within this span; unbalanced_end holds the tags that are closed
    without having been opened in it.  Pulling these out can reorder
    the chunks relative to the input."""
    unbalanced_start = []
    unbalanced_end = []
    # Entries are (tag name, reserved index in body, start tag text).
    open_tags = []
    body = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            # Plain word.
            body.append(chunk)
            continue
        is_closing = chunk[1] == '/'
        tag_name = chunk.split()[0].strip('<>/')
        if tag_name in empty_tags:
            # Tags without an end tag never need balancing.
            body.append(chunk)
            continue
        if not is_closing:
            # Reserve a slot; it is filled only if the tag gets closed.
            open_tags.append((tag_name, len(body), chunk))
            body.append(None)
            continue
        if open_tags and open_tags[-1][0] == tag_name:
            body.append(chunk)
            _, slot, opening = open_tags.pop()
            body[slot] = opening
            continue
        # A close tag that doesn't match the innermost open tag (or has
        # no open tag at all): whatever is still open is unbalanced.
        unbalanced_start.extend([entry[2] for entry in open_tags])
        open_tags = []
        unbalanced_end.append(chunk)
    unbalanced_start.extend([entry[2] for entry in open_tags])
    balanced = [piece for piece in body if piece is not None]
    return unbalanced_start, balanced, unbalanced_end
352

353
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END) for the first DEL_START found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        start = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    stop = chunks.index(DEL_END)
    before = chunks[:start]
    inside = chunks[start+1:stop]
    after = chunks[stop+1:]
    return before, inside, after
364

365
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.

    All three lists are modified in place; nothing is returned.
    """
    # The loop ends normally once every dangling start tag was matched.
    while unbalanced_start:
        wanted_name = unbalanced_start[0].split()[0].strip('<>')
        if not post_delete:
            return
        candidate = post_delete[0]
        if candidate is DEL_START or not candidate.startswith('<'):
            # Reached a word, we can't move the delete text forward
            return
        if candidate[1] == '/':
            # Reached a closing tag; don't try to go past it.
            return
        candidate_name = candidate.split()[0].strip('<>')
        if candidate_name == 'ins':
            # Can't move into an insert
            return
        assert candidate_name != 'del', (
            "Unexpected delete tag: %r" % candidate)
        if candidate_name != wanted_name:
            # Found a tag that doesn't match
            return
        unbalanced_start.pop(0)
        pre_delete.append(post_delete.pop(0))
414

415
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  All three
    lists are modified in place.  """
    # The loop ends normally once every dangling end tag was matched.
    while unbalanced_end:
        wanted_name = unbalanced_end[-1].split()[0].strip('<>/')
        if not pre_delete:
            return
        candidate = pre_delete[-1]
        if candidate is DEL_END or not candidate.startswith('</'):
            # A word or a start tag
            return
        candidate_name = candidate.split()[0].strip('<>/')
        if candidate_name in ('ins', 'del'):
            # Can't move into an insert or delete
            return
        if candidate_name != wanted_name:
            # Found a tag that doesn't match
            return
        unbalanced_end.pop()
        post_delete.insert(0, pre_delete.pop())
440

441
class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        """Create the token; tag lists that are not given default to a
        fresh empty list for each instance."""
        self = _unicode.__new__(cls, text)
        self.pre_tags = [] if pre_tags is None else pre_tags
        self.post_tags = [] if post_tags is None else post_tags
        self.trailing_whitespace = trailing_whitespace
        return self

    def __repr__(self):
        details = (_unicode.__repr__(self), self.pre_tags,
                   self.post_tags, self.trailing_whitespace)
        return 'token(%s, %r, %r, %r)' % details

    def html(self):
        """Return the token's text as a plain string."""
        return _unicode(self)
481

482
class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.

    The string value of the token (what gets compared when diffing) is
    "<tag>: <data>"; the markup that is written out is ``html_repr``.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        # Build the comparison text from the tag name.  (This previously
        # formatted the builtin ``type`` instead of the ``tag`` argument.)
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        # Labels are listed in the same order as the values supplied.
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        """Return the original markup of the tag."""
        return self.html_repr
509

510
class href_token(token):

    """ Token for the href of an anchor tag.  Unlike ordinary words it
    is hidden from the diff output unless it changed.  """

    # Suppress this token in the displayed diff when it is unchanged.
    hide_when_equal = True

    def html(self):
        """Return the href rendered as a visible ' Link: <href>' text."""
        return ' Link: %s' % self
519

520
def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token.

    ``html`` may also be an already-parsed lxml element, which is then
    used as-is."""
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Split the document into text chunks for each tag, word, and end tag,
    # then re-join those chunks into token objects.
    flattened = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    return fixup_chunks(flattened)
543

544
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    # cleanup_html strips extra markup or structure such as <head>.
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
556

557
# Opening <body ...> tag, closing </body ...> tag, and any opening or
# closing <ins>/<del> tag (case-insensitive, may span lines).
_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ 'Cleans' the HTML by removing page structure: if a <body> tag
    is present, only what follows it is kept, and anything from the
    closing </body> tag onward is dropped.  <ins> and <del> tags are
    removed as well (their contents are kept).  """
    body_open = _body_re.search(html)
    if body_open:
        html = html[body_open.end():]
    body_close = _end_body_re.search(html)
    if body_close:
        html = html[:body_close.start()]
    return _ins_del_re.sub('', html)
573
    
574

575
# Matches a single whitespace character at the end of a string.
end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    Split a word into (text, trailing whitespace); for example
    'test\n\n' gives ('test', '\n\n').
    """
    core = word.rstrip()
    return core, word[len(core):]
583

584

585
def fixup_chunks(chunks):
    """
    Turn a list of chunks into a list of tokens.

    Word chunks, ('img', src, tag) tuples and ('href', url) tuples each
    become a token.  Tags seen since the previous token are attached to
    the next token as pre_tags; an end tag seen while no tags are
    pending is attached to the previous token as a post_tag.  Tags
    still pending at the end go onto the last token's post_tags, or
    onto a single empty token when there are no tokens at all.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        created = None
        if isinstance(chunk, tuple):
            kind = chunk[0]
            if kind == 'img':
                markup, whitespace = split_trailing_whitespace(chunk[2])
                created = tag_token('img', chunk[1], html_repr=markup,
                                    pre_tags=tag_accum,
                                    trailing_whitespace=whitespace)
            elif kind == 'href':
                created = href_token(chunk[1], pre_tags=tag_accum,
                                     trailing_whitespace=" ")
        elif is_word(chunk):
            text, whitespace = split_trailing_whitespace(chunk)
            created = token(text, pre_tags=tag_accum,
                            trailing_whitespace=whitespace)
        elif is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

        if created is not None:
            # The new token took ownership of the pending tags.
            cur_word = created
            tag_accum = []
            result.append(created)

    if not result:
        return [token('', pre_tags=tag_accum)]
    result[-1].post_tags.extend(tag_accum)
    return result
636

637

638
# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont',
    'input', 'base', 'meta', 'link', 'col',
)

# Tag names this module treats as block-level elements.
block_level_tags = (
    'address', 'blockquote', 'center', 'dir', 'div', 'dl',
    'fieldset', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'isindex', 'menu', 'noframes', 'noscript',
    'ol', 'p', 'pre', 'table', 'ul',
)

# Tag names treated as block-level alongside block_level_tags.
block_level_container_tags = (
    'dd', 'dt', 'frameset', 'li',
    'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
)
682

683

684
def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.  An <img> element yields an
    ('img', src, start tag) tuple, and when include_hrefs is true an <a>
    element with an href yields an ('href', url) tuple before its end tag.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    emit_own_tag = not skip_tag
    if emit_own_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    # An empty tag with no content and no tail has nothing more to emit.
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    for word in split_words(el.text):
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if emit_own_tag:
        yield end_tag(el)
        for word in split_words(el.tail):
            yield html_escape(word)
711

712
# A run of non-whitespace followed by whitespace or the end of the text.
split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Split text into a list of words, each keeping the whitespace
    that follows it.  Empty, None or whitespace-only text gives an
    empty list.  """
    if text and text.strip():
        return split_words_re.findall(text)
    return []
722

723
# Matches a single whitespace character at the start of a string.
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    Build the text of the start tag for element el, including its
    attributes with their values HTML-escaped.
    """
    attributes = [' %s="%s"' % (name, html_escape(value, True))
                  for name, value in el.attrib.items()]
    return '<%s%s>' % (el.tag, ''.join(attributes))
732

733
def end_tag(el):
    """ Build the text of the end tag for element el.  A single space
    is appended when the element's tail begins with a space, tab,
    newline or carriage return.  """
    tail = el.tail
    if tail and tail[0] in ' \t\n\r':
        suffix = ' '
    else:
        suffix = ''
    return '</%s>%s' % (el.tag, suffix)
741

742
def is_word(tok):
    """True when the chunk is text rather than a tag (no leading '<')."""
    return tok[:1] != '<'
744

745
def is_end_tag(tok):
    """True when the chunk is a closing tag (begins with '</')."""
    return tok[:2] == '</'
747

748
def is_start_tag(tok):
    """True when the chunk begins with '<' but is not a closing tag."""
    return tok[:1] == '<' and tok[:2] != '</'
750

751
def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p>.  Returns the resulting html string. """
    tree = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(tree)
    return serialize_html_fragment(tree, skip_outer=True)
759

760
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    (the result is also stripped of surrounding whitespace).
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    markup = etree.tostring(el, method="html", encoding=_unicode)
    if not skip_outer:
        return markup
    # Cut away everything up to the end of the first tag ...
    inner = markup[markup.find('>')+1:]
    # ... and everything from the start of the last tag.
    inner = inner[:inner.rfind('<')]
    return inner.strip()
777

778
def _fixup_ins_del_tags(doc):
    """In-place variant of fixup_ins_del_tags operating on an lxml
    document: every <ins>/<del> that contains block-level markup is
    pushed inside those blocks and its own tag is then dropped.
    """
    for tag in ('ins', 'del'):
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if _contains_block_level_tag(el):
                _move_el_inside_block(el, tag=tag)
                el.drop_tag()
788

789
def _contains_block_level_tag(el):
    """True if el itself, or any of its descendants, is a block-level
    element such as <p> or <td>.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    return any(_contains_block_level_tag(child) for child in el)
798

799
def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; pushes the <ins>/<del> markup
    (named by ``tag``) down inside any block-level tags found in el.
    el is modified in place.  """
    has_block_child = any(_contains_block_level_tag(child) for child in el)
    if not has_block_child:
        # No block-level tags in any child: wrap all of el's content
        # (text and children) in one new element.
        wrapper = etree.Element(tag)
        wrapper.text = el.text
        el.text = None
        wrapper.extend(list(el))
        el[:] = [wrapper]
        return

    # Iterate over a snapshot because el is modified inside the loop.
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                # Text following a block child gets its own wrapper.
                tail_wrapper = etree.Element(tag)
                tail_wrapper.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_wrapper)
        else:
            # Children without block content are wrapped as a whole.
            child_wrapper = etree.Element(tag)
            el.replace(child, child_wrapper)
            child_wrapper.append(child)

    if el.text:
        # Leading text of el gets its own wrapper as first child.
        text_wrapper = etree.Element(tag)
        text_wrapper.text = el.text
        el.text = None
        el.insert(0, text_wrapper)
830
            
831
def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>

    The element's parent is modified in place.
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            # No children: the tail simply follows the element's text.
            text += el.tail
        else:
            # Otherwise the tail is attached after the last child.
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        # The text goes onto whatever precedes el: the previous
        # sibling's tail, or the parent's text when el is first.
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    # list(el) replaces the deprecated el.getchildren().
    parent[index:index+1] = list(el)
864

865
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    # Matching blocks must be longer than this to be kept (the limit is
    # lowered for short sequences, see get_matching_blocks).
    threshold = 2

    def get_matching_blocks(self):
        """Return the matching blocks found by SequenceMatcher, minus
        those whose size does not exceed the threshold.  Zero-length
        blocks (such as the terminating sentinel) are always kept.
        """
        # Scale the threshold by the shorter of the two sequences.  (This
        # previously used len(self.b) twice, ignoring sequence a.)
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
880

881
if __name__ == '__main__':
    # Executed as a script: hand over to the command-line entry point
    # in lxml.html._diffcommand.
    from lxml.html import _diffcommand
    _diffcommand.main()
884
    
885

886
Product

Resources

Company