Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/lxml/html/clean.py
811 views
1
# cython: language_level=3str
2
3
"""A cleanup tool for HTML.
4
5
Removes unwanted tags and content. See the `Cleaner` class for
6
details.
7
"""
8
9
from __future__ import absolute_import
10
11
import re
12
import copy
13
try:
14
from urlparse import urlsplit
15
from urllib import unquote_plus
16
except ImportError:
17
# Python 3
18
from urllib.parse import urlsplit, unquote_plus
19
from lxml import etree
20
from lxml.html import defs
21
from lxml.html import fromstring, XHTML_NAMESPACE
22
from lxml.html import xhtml_to_html, _transform_result
23
24
# Python 2/3 compatibility aliases.  Each name is looked up first; only
# when the interpreter does not provide it (Python 3) is a stand-in bound.
try:
    basestring
except NameError:
    # Python 3: accept both text and byte strings as "string" input
    basestring = (str, bytes)

try:
    unicode
except NameError:
    # Python 3
    unicode = str

try:
    unichr
except NameError:
    # Python 3
    unichr = chr
38
39
40
# Public API of this module.
__all__ = [
    'clean_html',
    'clean',
    'Cleaner',
    'autolink',
    'autolink_html',
    'word_break',
    'word_break_html',
]
42
43
# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
44
# Particularly the CSS cleaning; most of the tag cleaning is integrated now
45
# I have multiple kinds of schemes searched; but should schemes be
46
# whitelisted instead?
47
# max height?
48
# remove images? Also in CSS? background attribute?
49
# Some way to whitelist object, iframe, etc (e.g., if you want to
50
# allow *just* embedded YouTube movies)
51
# Log what was deleted and why?
52
# style="behavior: ..." might be bad in IE?
53
# Should we have something for just <meta http-equiv>? That's the worst of the
54
# metas.
55
# UTF-7 detections? Example:
56
# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
57
# you don't always have to have the charset set, if the page has no charset
58
# and there's UTF7-like code in it.
59
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
60
61
62
# This is an IE-specific construct you can have in a stylesheet to
63
# run some Javascript:
64
_css_javascript_re = re.compile(
65
r'expression\s*\(.*?\)', re.S|re.I)
66
67
# Do I have to worry about @\nimport?
68
_css_import_re = re.compile(
69
r'@\s*import', re.I)
70
71
# All kinds of schemes besides just javascript: that can cause
72
# execution:
73
_is_image_dataurl = re.compile(
74
r'^data:image/.+;base64', re.I).search
75
_is_possibly_malicious_scheme = re.compile(
76
r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
77
re.I).search
78
def _is_javascript_scheme(s):
79
if _is_image_dataurl(s):
80
return None
81
return _is_possibly_malicious_scheme(s)
82
83
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
84
# FIXME: should data: be blocked?
85
86
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
87
_conditional_comment_re = re.compile(
88
r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
89
90
# Selects every element (including the context node) that carries a
# ``style`` attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Selects plain and XHTML-namespaced <a> elements whose href is non-empty
# and does not start with '#'.
_find_external_links = etree.XPath(
    (
        "descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
        "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"
    ),
    namespaces={'x': XHTML_NAMESPACE},
)
97
98
99
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags.

    ``inline_style``
        Removes any style attributes.  Defaults to the value of the ``style`` option.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    inline_style = None
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = {'iframe', 'embed'}

    def __init__(self, **kw):
        """
        Override cleaning options by keyword.

        Only names whose current (class-level) value is ``None``, a
        boolean, or a set/frozenset/tuple/list are accepted as options.

        :raises TypeError: for any other keyword name.
        :raises ValueError: if both ``allow_tags`` and
            ``remove_unknown_tags`` are passed with true values.
        """
        not_an_attribute = object()
        for name, value in kw.items():
            default = getattr(self, name, not_an_attribute)
            if (default is not None and default is not True and default is not False
                    and not isinstance(default, (frozenset, set, tuple, list))):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)
        # inline_style follows ``style`` unless it was given explicitly
        if self.inline_style is None and 'inline_style' not in kw:
            self.inline_style = self.style

        if kw.get("allow_tags"):
            if kw.get("remove_unknown_tags"):
                raise ValueError("It does not make sense to pass in both "
                                 "allow_tags and remove_unknown_tags")
            # an explicit allow list replaces the "known HTML tags" list
            self.remove_unknown_tags = False

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document.

        ``doc`` may be an ElementTree (anything with a ``getroot()``
        method) or an element.  It is modified in place; nothing is
        returned.

        :raises ValueError: if both ``allow_tags`` and
            ``remove_unknown_tags`` are set at the time of the call.
        """
        try:
            getroot = doc.getroot
        except AttributeError:
            pass  # Element instance
        else:
            doc = getroot()  # ElementTree instance, instead of an element
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and
                    self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
            if not self.style:
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow.
                    # This must work on ``new``: starting again from ``old``
                    # would throw away the expression() removal just above.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
        if self.inline_style:
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>
            # or <object>; these are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        # Collect first, modify afterwards, so the tree is not changed
        # while it is being iterated.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            # make sure we do not remove comments/PIs if users want them (which is rare enough)
            if not self.comments:
                allow_tags.add(etree.Comment)
            if not self.processing_instructions:
                allow_tags.add(etree.ProcessingInstruction)

            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    # the root cannot be dropped, so neutralise it instead
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            # already carries a nofollow token
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        """
        Decide whether an element is configured to be accepted or rejected.

        :param el: an element.
        :return: true to accept the element or false to reject/discard it.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            # every listed attribute must be present and acceptable
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Decide whether a URL that was found in an element's attributes or text
        is configured to be accepted or rejected.

        :param el: an element.
        :param url: a URL found on the element.
        :return: true to accept the URL and false to reject it.
        """
        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
            return False
        scheme, netloc, path, query, fragment = urlsplit(url)
        # compare on the lower-cased host only, without any port
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        has_conditional_comment = _conditional_comment_re.search
        self._kill_elements(
            doc, lambda el: has_conditional_comment(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop (with their subtree) all nodes for which ``condition`` is true.

        :param doc: the element to search.
        :param condition: callable taking a node and returning a truth value.
        :param iterate: passed through to ``doc.iter()`` to restrict the
            nodes that are visited.
        """
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        """
        Link rewriting callback: return ``''`` for a link whose scheme
        looks executable, and the link unchanged otherwise.
        """
        # links like "j a v a s c r i p t:" might be interpreted in IE
        new = _substitute_whitespace('', unquote_plus(link))
        if _is_javascript_scheme(new):
            # FIXME: should this be None to delete?
            return ''
        return link

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempt to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        """
        Return a cleaned version of ``html``.

        String input is parsed; any other input is deep-copied, so the
        argument itself is not modified.  The result is converted back
        according to the type of the input.
        """
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)


# Default cleaner instance and its convenience entry point.
clean = Cleaner()
clean_html = clean.clean_html
553
554
############################################################
555
## Autolinking
556
############################################################
557
558
_link_regexes = [
559
re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
560
# This is conservative, but autolinking can be a bit conservative:
561
re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
562
]
563
564
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
565
566
_avoid_hosts = [
567
re.compile(r'^localhost', re.I),
568
re.compile(r'\bexample\.(?:com|org|net)$', re.I),
569
re.compile(r'^127\.0\.0\.1$'),
570
]
571
572
_avoid_classes = ['nolink']
573
574
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    Text is searched with the regular expressions in ``link_regexes``
    (by default mailto and http(s) links) and every hit is wrapped in a
    new ``<a>`` element.

    Nothing is linked inside an element whose tag is in
    ``avoid_elements`` or that has a class listed in ``avoid_classes``,
    and no link is created for a host matching one of the regular
    expressions in ``avoid_hosts``.

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        present = class_attr.split()
        if any(skipped in present for skipped in avoid_classes):
            return

    # Work on a snapshot of the children: anchors are inserted into
    # ``el`` while we go.
    for node in list(el):
        autolink(node, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if not node.tail:
            continue
        remainder, anchors = _link_text(
            node.tail, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            node.tail = remainder
            insert_at = el.index(node) + 1
            el[insert_at:insert_at] = anchors

    # The element's own leading text is handled last; its anchors go in
    # front of all children.
    if el.text:
        remainder, anchors = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            el.text = remainder
            el[:0] = anchors
618
619
def _link_text(text, link_regexes, avoid_hosts, factory):
620
leading_text = ''
621
links = []
622
last_pos = 0
623
while 1:
624
best_match, best_pos = None, None
625
for regex in link_regexes:
626
regex_pos = last_pos
627
while 1:
628
match = regex.search(text, pos=regex_pos)
629
if match is None:
630
break
631
host = match.group('host')
632
for host_regex in avoid_hosts:
633
if host_regex.search(host):
634
regex_pos = match.end()
635
break
636
else:
637
break
638
if match is None:
639
continue
640
if best_pos is None or match.start() < best_pos:
641
best_match = match
642
best_pos = match.start()
643
if best_match is None:
644
# No more matches
645
if links:
646
assert not links[-1].tail
647
links[-1].tail = text
648
else:
649
assert not leading_text
650
leading_text = text
651
break
652
link = best_match.group(0)
653
end = best_match.end()
654
if link.endswith('.') or link.endswith(','):
655
# These punctuation marks shouldn't end a link
656
end -= 1
657
link = link[:-1]
658
prev_text = text[:best_match.start()]
659
if links:
660
assert not links[-1].tail
661
links[-1].tail = prev_text
662
else:
663
assert not leading_text
664
leading_text = prev_text
665
anchor = factory('a')
666
anchor.set('href', link)
667
body = best_match.group('body')
668
if not body:
669
body = link
670
if body.endswith('.') or body.endswith(','):
671
body = body[:-1]
672
anchor.text = body
673
links.append(anchor)
674
text = text[end:]
675
return leading_text, links
676
677
def autolink_html(html, *args, **kw):
    # String input is parsed; anything else is deep-copied so that the
    # caller's tree stays untouched.  Extra arguments go to autolink().
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return _transform_result(result_type, doc)


# Share the documentation of the element-level function.
autolink_html.__doc__ = autolink.__doc__
687
688
############################################################
689
## Word wrapping
690
############################################################
691
692
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']


def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor elements that have a
    class listed in avoid_classes (by default ``nobreak``).

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; its own tail is left alone.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's ``avoid_elements`` rather than the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        if any(avoid in class_names for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
735
736
def word_break_html(html, *args, **kw):
    """
    Apply ``word_break()`` to ``html`` and return the result.

    String input is parsed; any other input is deep-copied, so the
    argument itself is not modified (the same handling as in
    ``autolink_html()`` and ``Cleaner.clean_html()``).  Extra arguments
    are passed on to ``word_break()``; the result is converted back
    according to the type of the input.
    """
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
741
742
def _break_text(text, max_width, break_character):
743
words = text.split()
744
for word in words:
745
if len(word) > max_width:
746
replacement = _insert_break(word, max_width, break_character)
747
text = text.replace(word, replacement)
748
return text
749
750
_break_prefer_re = re.compile(r'[^a-z]', re.I)
751
752
def _insert_break(word, width, break_character):
753
orig_word = word
754
result = ''
755
while len(word) > width:
756
start = word[:width]
757
breaks = list(_break_prefer_re.finditer(start))
758
if breaks:
759
last_break = breaks[-1]
760
# Only walk back up to 10 characters to find a nice break:
761
if last_break.end() > width-10:
762
# FIXME: should the break character be at the end of the
763
# chunk, or the beginning of the next chunk?
764
start = word[:last_break.end()]
765
result += start + break_character
766
word = word[len(start):]
767
result += word
768
return result
769
770
771