Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/lxml/html/__init__.py
811 views
1
# Copyright (c) 2004 Ian Bicking. All rights reserved.
2
#
3
# Redistribution and use in source and binary forms, with or without
4
# modification, are permitted provided that the following conditions are
5
# met:
6
#
7
# 1. Redistributions of source code must retain the above copyright
8
# notice, this list of conditions and the following disclaimer.
9
#
10
# 2. Redistributions in binary form must reproduce the above copyright
11
# notice, this list of conditions and the following disclaimer in
12
# the documentation and/or other materials provided with the
13
# distribution.
14
#
15
# 3. Neither the name of Ian Bicking nor the names of its contributors may
16
# be used to endorse or promote products derived from this software
17
# without specific prior written permission.
18
#
19
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
23
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31
"""The ``lxml.html`` tool set for HTML handling.
32
"""
33
34
from __future__ import absolute_import
35
36
# Public API of the package.  Each name is listed exactly once (the original
# list repeated 'open_in_browser').
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
41
42
43
import copy
44
import sys
45
import re
46
from functools import partial
47
48
try:
49
from collections.abc import MutableMapping, MutableSet
50
except ImportError:
51
from collections import MutableMapping, MutableSet
52
53
from .. import etree
54
from . import defs
55
from ._setmixin import SetMixin
56
57
try:
58
from urlparse import urljoin
59
except ImportError:
60
# Python 3
61
from urllib.parse import urljoin
62
63
try:
64
unicode
65
except NameError:
66
# Python 3
67
unicode = str
68
try:
69
basestring
70
except NameError:
71
# Python 3
72
basestring = (str, bytes)
73
74
75
def __fix_docstring(s):
    """Drop the string-literal prefix that the running Python does not print.

    At the start of each line (after optional whitespace) a ``u'`` prefix is
    removed on Python 3 and a ``b'`` prefix on Python 2, leaving a plain
    ``'``.  Empty or ``None`` input is returned unchanged.
    """
    if not s:
        return s
    prefix = "u" if sys.version_info[0] >= 3 else "b"
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)
83
84
85
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Those with an ``x:`` alternative match the
# element both without a namespace and in the XHTML namespace.
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces=dict(x=XHTML_NAMESPACE))
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces=dict(x=XHTML_NAMESPACE))
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces=dict(x=XHTML_NAMESPACE))
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches $class_name as one whole, space-delimited token of @class.
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains("
    "concat(' ', normalize-space(@class), ' '), "
    "concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

# CSS ``url(...)`` references; group 1 is the argument, which may still carry
# its surrounding double or single quotes.
_iter_css_urls = re.compile(
    r'''url\((["][^"]*["]|['][^']*[']|[^)]*)\)''', re.I).finditer
# CSS ``@import "..."`` statements (double-quoted form only).
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces=dict(x=XHTML_NAMESPACE))
# One space-separated entry of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')
# URL part of a <meta http-equiv="refresh"> content value such as
# "5; url=http://example.com/"; the "url=" prefix is optional.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
104
105
106
def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  When ``s`` starts and ends with the same
    quote character (``"`` or ``'``) the quotes are removed and ``pos`` is
    advanced by one to point past the opening quote; otherwise both values
    are returned unchanged.
    """
    for quote in ('"', "'"):
        if s[:1] == quote and s[-1:] == quote:
            return s[1:-1], pos + 1
    return s, pos
111
112
113
def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``typ`` is the type of the caller's original input.  For a bytes type the
    result is serialised as UTF-8, for a text type as a unicode string; for
    any other type ``result`` is handed back as is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)
122
123
124
def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    Only the exact ``{http://www.w3.org/1999/xhtml}`` prefix is removed.  The
    closing brace is part of the comparison, so a different namespace that
    merely starts with the XHTML URI is left alone.  Tags in other
    namespaces, empty strings and non-string tags (e.g. the factory functions
    used as ``tag`` by comments) are returned unchanged.
    """
    if isinstance(tag, basestring):
        prefix = '{%s}' % XHTML_NAMESPACE
        # Slicing (rather than indexing) keeps the empty string safe.
        if tag[:len(prefix)] == prefix:
            return tag[len(prefix):]
    return tag
129
130
131
class Classes(MutableSet):
    """Set-like, live view of the names stored in a 'class' attribute.

    Every operation re-reads the attribute mapping and writes the result
    straight back to it, so the view never holds state of its own.

    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        # ``attributes`` is the mapping that holds the 'class' entry.
        self._attributes = attributes
        # Zero-argument reader for the raw attribute text ('' when unset).
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _validated(value):
        """Return ``value``; raise ValueError if it is empty or has whitespace."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        return value

    def add(self, value):
        """
        Add a class.

        Nothing is written if the class is already present.
        """
        self._validated(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        A missing class is not an error.  The attribute itself is deleted
        once no class name is left.
        """
        self._validated(value)
        remaining = [
            name for name in self._get_class_value().split()
            if name != value
        ]
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        Raises KeyError if the class is not present.
        """
        self._validated(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        raw = self._get_class_value()
        # Cheap substring test first; only then compare whole tokens.
        if name not in raw:
            return False
        return name in raw.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        The attribute is only rewritten if at least one name was new.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        self._validated(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled
237
238
239
class HtmlMixin(object):
    """HTML convenience API mixed into the lxml.html node classes.

    The methods expect ``self`` to behave like an lxml element: they call
    ``self.get``, ``self.xpath``, ``self.iter``, ``self.getparent`` and
    similar element methods.
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # HtmlElement is named explicitly because it re-binds this function
        # as its own ``set``; the lookup therefore continues after
        # HtmlElement in the MRO.
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        Returns None if this element has no id or no label refers to it.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the children move up.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # Replace this element by its children in the parent.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison is case-insensitive and against the whole attribute
        value.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to
        ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            # Apply the same failure policy while resolving <base href>.
            self.resolve_base_href(handle_failures=handle_failures)

        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        # <base href> was either resolved above or is to be ignored, so
        # rewrite_links() must not resolve it (again) on its own.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # Every <base href> tag is dropped; the href of the last one is used.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.  Further links that were
        found in the same attribute or text are then skipped, since
        they no longer exist.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            # iterlinks() only reports links from existing attributes and
            # non-empty text, so a missing container means an earlier None
            # replacement removed it together with this link.
            if attrib is None:
                if not el.text:
                    continue
            elif attrib not in el.attrib:
                continue

            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
642
643
644
class _MethodFunc(object):
    """
    Exposes an element method as a plain function.

    The function accepts either an element or an HTML string as its first
    argument.  It returns whatever the method returns; if the method works
    in place (and so returns None) the processed document is returned
    instead, converted back to the type of the input.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        # Default for the per-call ``copy`` keyword (element inputs only).
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            # NOTE: the keyword arguments go to the parser here and to the
            # method call below.
            doc = fromstring(doc, **kw)
        else:
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
678
679
680
# Function versions of the HtmlMixin methods.  Read-only helpers run on the
# element that is passed in (``copy`` defaults to False).
find_rel_links = _MethodFunc('find_rel_links')
find_class = _MethodFunc('find_class')
iterlinks = _MethodFunc('iterlinks')
# Modifying helpers work on a deep copy of an element input by default.
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
686
687
688
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the ``HtmlMixin`` API."""
690
691
692
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the ``HtmlMixin`` API."""
    # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
    set = HtmlMixin.set
    cssselect = HtmlMixin.cssselect
696
697
698
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that also carries the ``HtmlMixin`` API."""
700
701
702
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the ``HtmlMixin`` API."""
704
705
706
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given as a mapping or as an iterable of
    ``(tag name, Mixin class)`` pairs.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Accept the documented mapping as well as the pair sequence that
            # the original loop required.
            pairs = mixins.items() if hasattr(mixins, 'items') else mixins
            mixers = {}
            for name, value in pairs:
                if name == '*':
                    # '*' applies to the tag names already present in
                    # ``classes``.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                # NOTE: a ``classes`` dict passed by the caller is updated in
                # place.
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are looked up by lower-cased tag name and fall back to
        ``HtmlElement``; comments, processing instructions and entities get
        their fixed Html* classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
746
747
748
################################################################################
749
# parsing
750
################################################################################
751
752
# Both matchers accept input that starts (after optional whitespace) with an
# <html> tag or a doctype declaration, in any letter case.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.IGNORECASE).match
_looks_like_full_html_bytes = re.compile(
    br'^\s*<(?:html|!doctype)', re.IGNORECASE).match
756
757
758
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  With
    ``ensure_head_body`` set, a missing ``head`` is inserted as first child
    and a missing ``body`` is appended.

    Raises ``etree.ParserError`` if parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
770
771
772
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a full document is wrapped into one.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]

    elements = []
    leading = body.text
    if leading and leading.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading)
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
809
810
811
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a parent element will hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)" % names)
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el
858
859
860
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    looks_full = (_looks_like_full_html_bytes if isinstance(html, bytes)
                  else _looks_like_full_html_unicode)
    is_full_html = looks_full(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there may be multiple bodies, which is bad, but just
    # smash them into one body
    for other_body in bodies[1:]:
        if other_body.text:
            if len(body):
                body[-1].tail = (body[-1].tail or '') + other_body.text
            else:
                body.text = (body.text or '') + other_body.text
        body.extend(other_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc

    if body is None:
        return doc

    no_leading_text = not body.text or not body.text.strip()
    if (len(body) == 1 and no_leading_text
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
926
927
928
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
940
941
942
def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it is a block-level tag.

    Tags are compared against ``defs.block_tags`` with the XHTML namespace
    removed.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element)
    )
949
950
951
def _element_name(el):
    """Return a short name for ``el`` for use in error messages.

    Comments are reported as 'comment', text items as 'string', and
    everything else by its tag name without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
958
959
960
################################################################################
961
# form handling
962
################################################################################
963
964
class FormElement(HtmlElement):
    """
    Element class used for ``<form>`` tags.
    """

    @property
    def inputs(self):
        """
        An `InputGetter` giving access to all input elements of the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object (a `FieldsDict`) exposing the fields of
        this form.  Assigning to a key changes the matching input.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Names that are not mentioned in ``value`` stay in this list and
        # are reset to None afterwards.
        leftover = current.keys()
        for field_name, field_value in value.items():
            if field_name in leftover:
                leftover.remove(field_name)
            current[field_name] = field_value
        for field_name in leftover:
            if field_name is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            current[field_name] = None

    def _name(self):
        """
        Label used to identify this form in ``repr()`` output: its
        ``name``, else ``#`` plus its ``id``, else its position among the
        forms found under ``self.body``.
        """
        name_attr = self.get('name')
        if name_attr:
            return name_attr
        id_attr = self.get('id')
        if id_attr:
            return '#' + id_attr
        find_all = self.body.iter
        candidates = list(find_all('form'))
        if not candidates:
            # Nothing under the plain tag name, retry with the XHTML one.
            candidates = list(find_all('{%s}form' % XHTML_NAMESPACE))
        return str(candidates.index(self))

    def form_values(self):
        """
        Collect the form's field values as a list of ``(name, value)``
        tuples, suitable to be passed to ``urllib.urlencode()``.

        Inputs without a name, disabled inputs, unchecked checkable
        inputs and inputs of type submit/image/reset/file are left out.
        """
        pairs = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                pairs.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # One pair per selected option.
                    pairs.extend((name, selected) for selected in value)
                elif value is not None:
                    pairs.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = el.value
            if value is not None:
                pairs.append((name, value))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        On read, the attribute is joined with ``self.base_url`` when both
        are available; otherwise the raw attribute value is returned.
        """
        base_url = self.base_url
        action = self.get('action')
        if not base_url or action is None:
            return action
        return urljoin(base_url, action)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        try:
            del self.attrib['action']
        except KeyError:
            # Attribute was not set; nothing to remove.
            pass

    @property
    def method(self):
        """
        Get/set the form's method.  The value read is always upper-cased
        and defaults to ``'GET'``.
        """
        raw_method = self.get('method', 'GET')
        return raw_method.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement
1081
1082
1083
def submit_form(form, extra_values=None, open_http=None):
    """
    Submit ``form`` and return whatever the HTTP opener returns.

    With the default opener this is a file-like object, as from
    ``urllib.urlopen()``, that also offers ``.geturl()`` to show the
    final URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = form.submit()
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is
    the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            pairs = extra_values.items()
        else:
            pairs = extra_values
        values.extend(pairs)
    opener = open_http_urllib if open_http is None else open_http
    # Fall back to the document's base URL when the form has no action.
    url = form.action or form.base_url
    return opener(form.method, url, values)
1119
1120
1121
def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, built on ``urlopen()``.

    For ``method == 'GET'`` the url-encoded ``values`` are appended to
    ``url`` as a query string; for any other method they are sent as the
    request body.  Raises ``ValueError`` when ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:  # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        # Extend an existing query string, or start a new one.
        separator = '&' if '?' in url else '?'
        url = url + separator + encoded
        data = None
    elif isinstance(encoded, bytes):
        data = encoded
    else:
        data = encoded.encode('ASCII')
    return urlopen(url, data)
1142
1143
1144
class FieldsDict(MutableMapping):
    """
    Mapping view over an inputs accessor: keys are field names and
    values are the ``.value`` of the object found under that name.
    """

    def __init__(self, inputs):
        # The accessor every lookup is delegated to.
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Deleting fields is not supported.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        names = self.inputs.keys()
        return iter(names)

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)
1168
1169
1170
class InputGetter(object):

    """
    Accessor for all the input fields (``input``, ``select`` and
    ``textarea`` elements) found inside a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing one name come back as a `CheckboxGroup`
    (a list that also allows value setting); several radio inputs
    sharing one name come back as a `RadioGroup`.

    Iterating over this object yields every input element individually,
    so it does not match what you get by looking up each name: grouped
    checkboxes and radios are returned one by one.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        if len(matches) > 1:
            # The first match decides how the group is wrapped.
            kind = matches[0].get('type')
            if kind == 'radio':
                group = RadioGroup(matches)
            elif kind == 'checkbox':
                group = CheckboxGroup(matches)
            else:
                group = None
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        """
        List of the distinct names of all inputs; unnamed inputs
        (name ``None``) are left out.
        """
        names = {el.name for el in self}
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
1235
1236
1237
class InputMixin(object):
    """
    Shared behaviour of all input elements (input, select, and textarea):
    access to the ``name`` attribute and a descriptive ``repr()``.
    """
    @property
    def name(self):
        """
        Get/set the ``name`` attribute of the element.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        try:
            del self.attrib['name']
        except KeyError:
            # No name attribute present; deleting is a no-op.
            pass

    def __repr__(self):
        # Not every class using this mix-in has a ``type``.
        input_type = getattr(self, 'type', None)
        suffix = ' type=%r' % input_type if input_type else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
1266
1267
1268
class TextareaElement(InputMixin, HtmlElement):
    """
    Element class for ``<textarea>``.  The name is available as ``.name``
    and the content can be read and written through ``.value``.
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        On read, the element's text is followed by the serialisation of
        each child element.
        """
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        pieces = [self.text or '']
        # Child elements are rare here; they are serialised as markup.
        for child in self:
            pieces.append(etree.tostring(
                child, method=serialisation_method, encoding='unicode'))
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop all children, then store the new content as text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1301
1302
1303
class SelectElement(InputMixin, HtmlElement):
    """
    Element class for ``<select>``.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is a
    set-like object.  In either case ``.value_options`` gives the
    possible values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        For a multi-select this is a set-like object representing all
        the selected options.  Otherwise the last option carrying a
        ``selected`` attribute wins; without one, the first option that
        has no ``disabled`` attribute is used, and None is returned if
        there is no such option either.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = next(
            (el for el in reversed(options) if el.get('selected') is not None),
            None)
        if chosen is None:
            chosen = next(
                (el for el in options if el.get('disabled') is None),
                None)
            if chosen is None:
                return None
        value = chosen.get('value')
        if value is None:
            # Without a value attribute the option's text is its value.
            value = (chosen.text or '').strip()
        return value

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        target = None
        if value is not None:
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = (option.text or '').strip()
                if candidate == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then select the requested option (if any).
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have: the ``value``
        attribute of each ``<option>`` element, or its stripped text
        when it has no such attribute.
        """
        possible = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = (option.text or '').strip()
            possible.append(candidate)
        return possible

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1404
1405
1406
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    from it to unselect the option.  An option's value is its ``value``
    attribute or, when that is missing, its stripped text.
    """

    def __init__(self, select):
        # The <select> element whose options are exposed.
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        """Yield the value of every option that has a ``selected`` attribute."""
        for option in self.options:
            if 'selected' in option.attrib:
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = (option.text or '').strip()
                yield opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ``ValueError`` if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ``ValueError`` if that option is not currently selected,
        or if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = (option.text or '').strip()
            if opt_value == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Message wording aligned with add() ("no option", not
            # "not option").
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1465
1466
1467
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property
    checks/unchecks the inputs and ``.value_options`` lists the
    possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reading returns the ``value`` attribute of the first input that
        has a ``checked`` attribute, or None when no input is checked.
        """
        checked = next((el for el in self if 'checked' in el.attrib), None)
        if checked is None:
            return None
        return checked.get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            target = next(
                (el for el in self if el.get('value') == value), None)
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        # Uncheck every radio, then check the requested one (if any).
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for radio in self:
            possible.append(radio.get('value'))
        return possible

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
1518
1519
1520
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection;
        assigning something without ``__iter__`` raises ``ValueError``
        and leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not wipe the current selection as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1562
1563
1564
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # The group of checkbox elements this view reads and modifies.
        self.group = group

    def __iter__(self):
        # Build the list first, so the iterator walks a snapshot of the
        # values that were checked at the time of the call.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """Check the first checkbox with this value; KeyError if none."""
        for box in self.group:
            if box.get('value') != value:
                continue
            box.set('checked', '')
            return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox with this value.  Raises KeyError if
        that checkbox is not checked or if no checkbox has the value.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)
1605
1606
1607
class InputElement(InputMixin, HtmlElement):
    """
    Element class for ``<input>``.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``) and the value can be read and written with ``.value``.

    Checkboxes and radios have ``input.checkable == True`` (it is false
    for all other types) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reports ``'on'``;
        an unchecked checkbox or radio reports None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # Only a string replaces the value attribute; any other true
        # value merely checks the input.
        if isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased and defaulting to ``'text'``.
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; otherwise an
        ``AttributeError`` is raised.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
1702
1703
1704
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Reading returns None when the label has no (or an empty) ``for``
        attribute; otherwise the lookup is delegated to
        ``self.body.get_element_by_id()``.  Setting requires an element
        with an ``id`` attribute and raises ``TypeError`` otherwise.
        Deleting removes the label's ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link to the target lives in the ``for`` attribute (see the
        # getter and setter), so that is the attribute to remove; the
        # label's own ``id`` must stay untouched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1738
1739
1740
############################################################
1741
## Serialization
1742
############################################################
1743
1744
def html_to_xhtml(html):
    """Move every tag of an HTML tree into the XHTML namespace.

    ``html`` may be an element or anything offering ``getroot()``.  The
    tree is modified in place; tags that already start with ``{`` (i.e.
    already carry a namespace) are left alone.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # No getroot(): treat the argument itself as the root.
        root = html
    namespace = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        name = node.tag
        if name[0] != '{':
            node.tag = namespace + name
1757
1758
1759
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree.

    ``xhtml`` may be an element or anything offering ``getroot()``.  The
    tree is modified in place; only tags in the XHTML namespace are
    visited.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # No getroot(): treat the argument itself as the root.
        root = xhtml
    namespace = "{%s}" % XHTML_NAMESPACE
    cut = len(namespace)
    for node in root.iter(namespace + "*"):
        node.tag = node.tag[cut:]
1771
1772
1773
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Each name below is the bound ``sub`` method of a compiled pattern for
# a ``<meta http-equiv="Content-Type" ...>`` tag.  The same pattern is
# compiled twice, once as text and once as ASCII-encoded bytes, so that
# both str and bytes serialisations can be processed.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1779
1780
1781
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the Content-Type <meta> tag, using the pattern that matches
    # the type (text or bytes) of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)
1856
1857
1858
def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This is
    mainly meant for debugging.

    The file is written with ``encoding`` if given, else with the
    document's own encoding, else with UTF-8.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as out:
        doc.write(out, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1879
1880
1881
################################################################################
1882
# configure Element class lookup
1883
################################################################################
1884
1885
class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Install the lookup that maps tags to the lxml.html classes.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1892
1893
1894
class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Install the lookup that maps tags to the lxml.html classes.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1914
1915
1916
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    forwarded to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)
1923
1924
1925
html_parser = HTMLParser()
1926
xhtml_parser = XHTMLParser()
1927
1928