CoCalc -- __init_

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/lxml/html/__init__.py
⁸¹¹ views
1
# Copyright (c) 2004 Ian Bicking. All rights reserved.
2
#
3
# Redistribution and use in source and binary forms, with or without
4
# modification, are permitted provided that the following conditions are
5
# met:
6
#
7
# 1. Redistributions of source code must retain the above copyright
8
# notice, this list of conditions and the following disclaimer.
9
#
10
# 2. Redistributions in binary form must reproduce the above copyright
11
# notice, this list of conditions and the following disclaimer in
12
# the documentation and/or other materials provided with the
13
# distribution.
14
#
15
# 3. Neither the name of Ian Bicking nor the names of its contributors may
16
# be used to endorse or promote products derived from this software
17
# without specific prior written permission.
18
#
19
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
23
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30

31
"""The ``lxml.html`` tool set for HTML handling.
32
"""
33

34
from __future__ import absolute_import
35

36
# Public names exported by ``from lxml.html import *``.
# Each name is listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
41

42

43
import copy
44
import sys
45
import re
46
from functools import partial
47

48
try:
49
    from collections.abc import MutableMapping, MutableSet
50
except ImportError:
51
    from collections import MutableMapping, MutableSet
52

53
from .. import etree
54
from . import defs
55
from ._setmixin import SetMixin
56

57
try:
58
    from urlparse import urljoin
59
except ImportError:
60
    # Python 3
61
    from urllib.parse import urljoin
62

63
try:
64
    unicode
65
except NameError:
66
    # Python 3
67
    unicode = str
68
try:
69
    basestring
70
except NameError:
71
    # Python 3
72
    basestring = (str, bytes)
73

74

75
def __fix_docstring(s):
    """Strip string-literal prefixes from quoted values at line starts in *s*.

    On Python 3 a leading ``u'`` (after optional whitespace) is rewritten
    to ``'``; on Python 2 the same is done for ``b'``.  Falsy input
    (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    # Pick the prefix letter that the running interpreter does not print.
    prefix = "u" if sys.version_info[0] >= 3 else "b"
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)
83

84

85
# Namespace URI used to recognise XHTML-namespaced elements.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Most of them match both the plain
# (un-namespaced) element name and its XHTML-namespaced variant (prefix ``x``).

# <a> elements that carry a ``rel`` attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
# Earlier, regexp-based variant of the class lookup, kept for reference:
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose whitespace-normalised ``class`` attribute contains
# ``$class_name`` as a space-delimited token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals ``$id``.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over ``url(...)`` occurrences (case-insensitive); group 1 is the
# argument, including any surrounding double or single quotes.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over ``@import "..."`` occurrences; group 1 is the double-quoted target.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label> elements anywhere in the document whose ``for`` attribute equals ``$id``.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters (one entry of a space-separated list).
_archive_re = re.compile(r'[^ ]+')
# Searches a meta-refresh ``content`` value: skips everything up to the first
# ';', an optional ``url=`` prefix, and captures the rest as group ``url``.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
104

105

106
def _unquote_match(s, pos):
    """Strip one pair of matching quotes from *s* and adjust *pos*.

    If *s* both starts and ends with a double quote, or both starts and
    ends with a single quote, return the text between them together with
    ``pos + 1``.  Otherwise return ``(s, pos)`` unchanged.
    """
    for quote in ('"', "'"):
        starts = s[:1] == quote
        ends = s[-1:] == quote
        if starts and ends:
            return s[1:-1], pos + 1
    return s, pos
111

112

113
def _transform_result(typ, result):
    """Convert *result* back into the kind of object the caller passed in.

    *typ* is the type of the original input: for a ``bytes`` subclass the
    result is serialised with ``tostring()`` as UTF-8, for a ``unicode``
    subclass it is serialised as a unicode string, and for any other type
    *result* is returned as is.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        # Not a string type: hand the object back untouched.
        return result
    return tostring(result, encoding=encoding)
122

123

124
def _nons(tag):
    """Return *tag* without a leading XHTML namespace, if it has one.

    String tags of the form ``{<XHTML namespace>...}name`` are reduced to
    the part after the last ``}``.  Anything else (other strings and
    non-string tags) is returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
129

130

131
class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """
    def __init__(self, attributes):
        # 'attributes' is the mapping that holds the 'class' entry.
        self._attributes = attributes
        # Callable returning the current raw 'class' value ('' if unset).
        self._get_class_value = partial(attributes.get, 'class', '')

    # internal helpers

    @staticmethod
    def _check_name(value):
        """Raise ValueError for an empty name or one containing whitespace."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def _names(self):
        """Return the current class names as a new list."""
        return self._get_class_value().split()

    def _store(self, names):
        """Write the non-empty list *names* back as the 'class' value."""
        self._attributes['class'] = ' '.join(names)

    # MutableSet interface

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        """
        self._check_name(value)
        names = self._names()
        if value not in names:
            names.append(value)
            self._store(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        """
        self._check_name(value)
        remaining = []
        for name in self._names():
            if name != value:
                remaining.append(name)
        if remaining:
            self._store(remaining)
        elif 'class' in self._attributes:
            # Nothing left: drop the attribute instead of leaving it empty.
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        """
        self._check_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        raw = self._get_class_value()
        # Cheap substring test first, exact token test second.
        if name in raw:
            return name in raw.split()
        return False

    def __iter__(self):
        return iter(self._names())

    def __len__(self):
        return len(self._names())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._names()
        before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        if len(names) != before:
            self._store(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        self._check_name(value)
        names = self._names()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._store(names)
        else:
            del self._attributes['class']
        return enabled
237

238

239
class HtmlMixin(object):
    """
    Mixin with HTML-specific convenience methods (class handling, element
    lookup, tree editing and link rewriting).  It is combined with the
    etree base classes by HtmlElement, HtmlComment,
    HtmlProcessingInstruction and HtmlEntity in this module.
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # NOTE(review): super() is anchored on HtmlElement rather than on
        # HtmlMixin, so this presumably only works for HtmlElement instances.
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            # Empty value: remove the attribute instead of storing ''.
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        The first match of ``//body`` is returned by index, so a document
        without a <body> results in an IndexError.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        The first match of ``//head`` is returned by index, so a document
        without a <head> results in an IndexError.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        The label is looked up by matching its ``for`` attribute against
        this element's ``id``.  Returns None if this element has no id or
        no such label exists; otherwise the first matching label.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        # Associates 'label' with this element by setting its 'for'
        # attribute; raises TypeError if this element has no id or if
        # 'label' is not a <label> element.
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        # Removes the 'for' attribute from the associated label, if any.
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            # Preserve the tail text: it goes to the previous sibling's
            # tail, or to the parent's text if there is no previous sibling.
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # There are children: the tail follows the last child.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element in its parent by its own children.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison is case-insensitive and is made against the whole
        ``rel`` attribute value.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator=translator)(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()

        # Select the replacement callback according to the failure policy.
        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All <base href> tags are dropped; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # 'archive' holds several space-separated entries; each
                    # is reported with its offset in the attribute value.
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            # Inline 'style' attributes are checked on every element,
            # independently of the tag-specific handling above.
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            # The callback receives the link with surrounding whitespace
            # stripped; the comparison below uses the unstripped link.
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # Link inside element text: splice the replacement in at 'pos'.
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
642

643

644
class _MethodFunc(object):
    """
    Callable wrapper that exposes an element method as a plain function.

    The function accepts either an element or an HTML string as its first
    argument.  It returns whatever the wrapped method returns; if that is
    None (the method worked in-place), the processed document is returned
    instead, converted back to the input type via ``_transform_result()``.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # 'name' is the method name, 'copy' the default for the per-call
        # 'copy' keyword; the docstring is borrowed from the wrapped method.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # Element input: an explicit 'copy' keyword overrides the default.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)
678

679

680
find_rel_links = _MethodFunc('find_rel_links', copy=False)
681
find_class = _MethodFunc('find_class', copy=False)
682
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
683
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
684
iterlinks = _MethodFunc('iterlinks', copy=False)
685
rewrite_links = _MethodFunc('rewrite_links', copy=True)
686

687

688
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the HtmlMixin methods."""
    pass
690

691

692
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the HtmlMixin methods."""
    # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
    # etree.ElementBase precedes HtmlMixin in the bases, so the mixin versions
    # are assigned explicitly in the class body to take precedence.
    cssselect = HtmlMixin.cssselect
    set = HtmlMixin.set
696

697

698
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that also carries the HtmlMixin methods."""
    pass
700

701

702
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the HtmlMixin methods."""
    pass
704

705

706
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, Mixin class)`` pairs; the latter form allows several
    mixins for the same tag name.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        """Set up the tag name -> Element class table.

        If ``classes`` is None, a copy of ``_default_element_classes`` is
        used.  For every tag name that receives mixins, a new class is
        created that derives from the mixins followed by the tag's current
        class (``HtmlElement`` if the tag has none yet).
        """
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, which cannot
            # be unpacked into (name, value); use its items in that case.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # Applies to all tag names known at this point.
                    for n in classes:
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are looked up by lower-cased tag name and default to
        ``HtmlElement``; comments, processing instructions and entities map
        to their Html* classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
746

747

748
################################################################################
749
# parsing
750
################################################################################
751

752
# Matchers that test whether the input starts (after optional leading
# whitespace) with an ``<html`` tag or a ``<!doctype`` declaration,
# case-insensitively.  One variant for unicode strings, one for bytes.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
756

757

758
def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse *html* with ``etree.fromstring()`` and return the result.

    ``parser`` defaults to the module-level ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, a missing ``head`` child is inserted as
    the first child and a missing ``body`` child is appended.

    Raises ``etree.ParserError`` if parsing yields None.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root
770

771

772
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parse several HTML elements and return them as a list.

    Input that does not already look like a full document is wrapped in
    ``<html><body>...</body></html>`` before parsing; the children of
    the resulting body are returned.

    The first list item may be a string (non-whitespace text preceding
    the first element).  If ``no_leading_text`` is true, such text
    raises ``etree.ParserError`` instead, so the list only ever holds
    elements.

    ``base_url`` will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # plain concatenation: %-formatting of bytes is unavailable
            # in early Py3 versions
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html

    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]

    leading_text = body.text
    has_leading_text = bool(leading_text and leading_text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % leading_text)

    fragments = [leading_text] if has_leading_text else []
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments
809

810

811
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element and return it.

    Without ``create_parent`` it is an error (``etree.ParserError``) if
    no element or more than one element is found, or if anything but
    whitespace precedes or follows the element.

    If ``create_parent`` is true, all parsed content is wrapped in a
    newly created parent element, which is returned.  A string value
    is used as the parent's tag name; any other true value means
    ``'div'``.  Leading text and multiple elements are allowed in this
    mode.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    wrap_in_parent = bool(create_parent)

    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not wrap_in_parent,
        base_url=base_url, **kw)

    if not wrap_in_parent:
        if not elements:
            raise etree.ParserError('No elements found')
        if len(elements) > 1:
            found = ', '.join([_element_name(item) for item in elements])
            raise etree.ParserError(
                "Multiple elements found (%s)" % found)
        only = elements[0]
        if only.tail and only.tail.strip():
            raise etree.ParserError(
                "Element followed by text: %r" % only.tail)
        # drop a whitespace-only tail so the element stands alone
        only.tail = None
        return only

    if isinstance(create_parent, basestring):
        parent_tag = create_parent
    else:
        parent_tag = 'div'
    new_root = Element(parent_tag)
    if elements:
        if isinstance(elements[0], basestring):
            # leading text becomes the text of the new parent
            new_root.text = elements.pop(0)
        new_root.extend(elements)
    return new_root
858

859

860
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse ``html`` and return a single element or document root.

    The input is parsed as a document.  If it looked like a full
    document (leading ``<html`` or doctype), or a ``<head>`` is found
    in the result, or no ``<body>`` is found, the document root is
    returned.  Otherwise, a body holding exactly one element with no
    surrounding non-whitespace text yields that element, and any other
    body is renamed to ``<div>`` (if it contains block-level tags) or
    ``<span>`` and returned as a container.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        matcher = _looks_like_full_html_bytes
    else:
        matcher = _looks_like_full_html_unicode
    is_full_html = matcher(html)

    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = (doc.findall('body')
              or doc.findall('{%s}body' % XHTML_NAMESPACE))
    body = bodies[0] if bodies else None
    # Somehow there may be multiple bodies, which is bad, but just
    # smash them into one body
    for extra_body in bodies[1:]:
        extra_text = extra_body.text
        if extra_text:
            if len(body):
                last_child = body[-1]
                last_child.tail = (last_child.tail or '') + extra_text
            else:
                body.text = (body.text or '') + extra_text
        body.extend(extra_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        extra_body.drop_tree()

    heads = (doc.findall('head')
             or doc.findall('{%s}head' % XHTML_NAMESPACE))
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for extra_head in heads[1:]:
            head.extend(extra_head)
            # We don't care about text or tail in a head
            extra_head.drop_tree()
        return doc

    if body is None:
        return doc

    if len(body) == 1:
        before = body.text
        after = body[-1].tail
        if not (before and before.strip()) and not (after and after.strip()):
            # The body has just one element, so it was probably a single
            # element passed in
            return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
926

927

928
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.

    Note that the result is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    ``parser`` defaults to the module-level ``html_parser``.  The
    ``base_url`` keyword overrides the base URL, which is most useful
    when parsing from a file-like object.  Remaining keyword arguments
    are passed on to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser,
                       base_url=base_url, **kw)
940

941

942
def _contains_block_level_tag(el):
    """
    Return True if any element yielded by ``el.iter(etree.Element)``
    has a namespace-stripped tag listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))
949

950

951
def _element_name(el):
    """
    Return a short label for a parsed fragment, used in error messages:
    ``'comment'`` for comment nodes, ``'string'`` for text, otherwise
    the element's tag without its namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
958

959

960
################################################################################
961
# form handling
962
################################################################################
963

964
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        An `InputGetter` giving access to every input element in this
        form; see that class for details.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like view (`FieldsDict`) of the form's fields.
        Assigning values through it changes the form.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        # Apply every given value, then reset each named field that
        # was not mentioned to None.
        current = self.fields
        untouched = current.keys()
        for field_name, field_value in value.items():
            if field_name in untouched:
                untouched.remove(field_name)
            current[field_name] = field_value
        for field_name in untouched:
            # Unnamed inputs (key None) aren't really expressed in
            # form_values() anyway, so leave them alone.
            if field_name is not None:
                current[field_name] = None

    def _name(self):
        # Label used in reprs: the name attribute, else '#' + id, else
        # the index of this form among the forms below ``self.body``.
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        body = self.body
        forms = (list(body.iter('form'))
                 or list(body.iter('{%s}form' % XHTML_NAMESPACE)))
        return str(forms.index(self))

    def form_values(self):
        """
        Return the form's field values as a list of ``(name, value)``
        tuples, suitable for ``urllib.urlencode()``.

        Inputs without a name or with a ``disabled`` attribute are
        skipped, as are unchecked checkable inputs, inputs of type
        submit/image/reset/file, and fields whose value is None.
        """
        pairs = []
        for field in self.inputs:
            name = field.name
            if not name or 'disabled' in field.attrib:
                continue
            tag = _nons(field.tag)
            if tag == 'textarea':
                pairs.append((name, field.value))
                continue
            if tag == 'select':
                value = field.value
                if field.multiple:
                    # one pair per selected option
                    pairs.extend([(name, item) for item in value])
                elif value is not None:
                    pairs.append((name, field.value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % field)
            if field.checkable and not field.checked:
                continue
            if field.type in ('submit', 'image', 'reset', 'file'):
                continue
            value = field.value
            if value is not None:
                pairs.append((name, field.value))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.  On read, the value is
        joined with the element's ``base_url`` when both are available.
        """
        action = self.get('action')
        base_url = self.base_url
        if action is None or not base_url:
            return action
        return urljoin(base_url, action)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        # deleting an absent attribute is silently accepted
        if 'action' in self.attrib:
            del self.attrib['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-cased
        string, and defaults to ``'GET'``
        """
        raw_method = self.get('method', 'GET')
        return raw_method.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement
1081

1082

1083
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``, which also has a ``.geturl()`` function
    showing the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    pairs; they are appended to the values collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (``'GET'`` unless the form says
    otherwise), the URL is the target URL as a string (the form's action,
    or its base URL if it has no action), and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        pairs = extra_values
        if hasattr(pairs, 'items'):
            pairs = pairs.items()
        values.extend(pairs)
    opener = open_http_urllib if open_http is None else open_http
    target_url = form.action or form.base_url
    return opener(form.method, target_url, values)
1119

1120

1121
def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, built on urllib's ``urlopen()``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request
    body.  Raises ``ValueError`` if ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.parse import urlencode
        from urllib.request import urlopen
    encoded = urlencode(values)
    if method == 'GET':
        # extend an existing query string, or start a new one
        separator = '&' if '?' in url else '?'
        return urlopen(url + separator + encoded, None)
    if not isinstance(encoded, bytes):
        # the request body has to be bytes
        encoded = encoded.encode('ASCII')
    return urlopen(url, encoded)
1142

1143

1144
class FieldsDict(MutableMapping):
    """
    Mapping view over a form's fields, keyed by field name.

    Wraps an input accessor (``inputs``): reading a key returns the
    ``.value`` of ``inputs[key]`` and assigning a key sets it.  Keys
    cannot be removed.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # Fields belong to the document; removal is always refused.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the same keys that __iter__ yields.  The wrapped
        # accessor is not required to support len() itself, so asking
        # it directly could raise TypeError.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
1168

1169

1170
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the input(s) named ``name``.

        Several radios or several checkboxes sharing the name come back
        as a `RadioGroup` / `CheckboxGroup` (decided by the type of the
        first match).  Otherwise the first matching element is
        returned.  Raises ``KeyError`` if nothing matches.
        """
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        input_type = results[0].get('type')
        if len(results) > 1 and input_type in ('radio', 'checkbox'):
            if input_type == 'radio':
                group = RadioGroup(results)
            else:
                group = CheckboxGroup(results)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """
        Return the unique field names as a list, in document order.
        Inputs without a name are left out.
        """
        names = []
        seen = set()
        for el in self:
            name = el.name
            if name is None or name in seen:
                continue
            seen.add(name)
            names.append(name)
        return names

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))
1235

1236

1237
class InputMixin(object):
    """
    Shared behaviour for the form field elements (input, select, and
    textarea): a ``name`` property and a descriptive ``repr()``.
    """
    @property
    def name(self):
        """
        Get/set the ``name`` attribute of the element; deleting it
        removes the attribute if present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        # deleting an already missing name is not an error
        if 'name' not in self.attrib:
            return
        del self.attrib['name']

    def __repr__(self):
        # Only some subclasses define ``type``; show it when it is set.
        kind = getattr(self, 'type', None)
        type_part = (' type=%r' % kind) if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
1266

1267

1268
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read and written through ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)

        Reading returns the element text followed by the serialisation
        of any child elements (as XML for an XHTML-namespaced textarea,
        as HTML otherwise).  Setting removes all children and stores
        the given text; deleting empties the element.
        """
        in_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        serialisation_method = 'xml' if in_xhtml else 'html'
        pieces = [self.text or '']
        # children are rare inside a textarea, so this is usually empty
        for child in self:
            pieces.append(etree.tostring(
                child, method=serialisation_method, encoding='unicode'))
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        del self[:]
        self.text = ''


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1301

1302

1303
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is
    a set-like object.  Either way ``.value_options`` lists the
    possible values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.

        For a single select, the last option carrying ``selected``
        wins; without one, the first option that is not ``disabled`` is
        used; without any, the value is None.  An option without a
        ``value`` attribute counts with its stripped text.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        chosen = None
        for candidate in reversed(options):
            if candidate.get('selected') is not None:
                chosen = candidate
                break
        if chosen is None:
            for candidate in options:
                if candidate.get('disabled') is None:
                    chosen = candidate
                    break
        if chosen is None:
            return None

        result = chosen.get('value')
        if result is None:
            result = (chosen.text or '').strip()
        return result

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return

        # Find the option to select before touching any attribute, so
        # an unknown value leaves the current selection intact.
        target = None
        if value is not None:
            for option in _options_xpath(self):
                option_value = option.get('value')
                if option_value is None:
                    option_value = (option.text or '').strip()
                if option_value == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)

        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of each ``<option>`` element, or its stripped text
        where that attribute is missing).
        """
        possible = []
        for option in _options_xpath(self):
            option_value = option.get('value')
            if option_value is None:
                option_value = (option.text or '').strip()
            possible.append(option_value)
        return possible

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1404

1405

1406
class MultipleSelectOptions(SetMixin):
    """
    Set-like view of the selected options of a ``<select multiple>``
    element.

    Adding a value selects the matching option, removing a value
    unselects it.
    """

    def __init__(self, select):
        self.select = select

    @staticmethod
    def _option_value(option):
        # The ``value`` attribute, or the stripped text as fallback.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = (option.text or '').strip()
        return opt_value

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        # yields the value of every option carrying ``selected``
        for option in self.options:
            if 'selected' not in option.attrib:
                continue
            yield self._option_value(option)

    def add(self, item):
        # Selects the first option whose value equals ``item``.
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                return
        raise ValueError(
            "There is no option with the value %r" % item)

    def remove(self, item):
        # Unselects the first option whose value equals ``item``; it is
        # an error if that option is not selected or does not exist.
        for option in self.options:
            if self._option_value(option) != item:
                continue
            if 'selected' not in option.attrib:
                raise ValueError(
                    "The option %r is not currently selected" % item)
            del option.attrib['selected']
            return
        raise ValueError(
            "There is not option with the value %r" % item)

    def __repr__(self):
        selected = ', '.join(map(repr, self))
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            selected,
            self.select.name)
1465

1466

1467
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements sharing one name.

    Besides the list behaviour, the ``.value`` property checks and
    unchecks the inputs, and ``.value_options`` lists the possible
    values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).

        Reads as the ``value`` of the first checked radio, or None if
        none is checked.  Setting None (or deleting) unchecks all;
        setting an unknown value raises ``ValueError``.
        """
        checked = (el for el in self if 'checked' in el.attrib)
        first_checked = next(checked, None)
        if first_checked is None:
            return None
        return first_checked.get('value')

    @value.setter
    def value(self, value):
        # Locate the radio first so an unknown value changes nothing.
        target = None
        if value is not None:
            target = next(
                (el for el in self if el.get('value') == value), None)
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
1518

1519

1520
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable replaces the current selection; assigning
        something without ``__iter__`` raises ``ValueError``.  Deleting
        unchecks everything.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected argument does
        # not wipe the currently checked boxes as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1562

1563

1564
class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes within a
    group of checkboxes sharing one name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values up front, then iterate the copy.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Checks the first checkbox whose value equals ``value``.
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Unchecks the first checkbox whose value equals ``value``; it
        # is an error if that box is unchecked or does not exist.
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join(map(repr, self))
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
1605

1606

1607
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``), and the value can be read and written via ``.value``.

    Checkboxes and radios have ``input.checkable == True`` (it is
    false for all other types) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reads as ``'on'``;
        an unchecked one reads as None.  Setting a checkable input
        checks or unchecks it by the truth of the new value and stores
        the value only if it is a string.
        """
        raw_value = self.get('value')
        if not self.checkable:
            return raw_value
        if not self.checked:
            return None
        return raw_value or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        self.checked = bool(value)
        if value and isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased; ``'text'`` when the attribute is absent.
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        Only usable on checkable input types; any other type raises
        ``AttributeError``.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement
1702

1703

1704
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Getting returns None when the label has no (or an empty) ``for``
        attribute; otherwise the id is looked up below ``self.body``.
        Setting requires the target element to carry an ``id`` attribute
        and raises TypeError if it does not.  Deleting removes the
        ``for`` attribute from the label.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): what happens for an id that matches no element is
        # decided by get_element_by_id(), defined outside this class --
        # confirm whether it returns None or raises.
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the label's ``for`` attribute (the one the
        # getter reads and the setter writes), so that is what must be
        # removed here -- not the label's own ``id``.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']
1735

1736

1737
# Register LabelElement as the class used for ``label`` tags in the
# default element class table of HtmlElementClassLookup.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1738

1739

1740
############################################################
1741
## Serialization
1742
############################################################
1743

1744
def html_to_xhtml(html):
    """Move the tags of an HTML tree into the XHTML namespace, in place.

    ``html`` may be an element or an object offering ``getroot()``.
    Tags whose name already starts with ``{`` (i.e. already carry a
    namespace) are left untouched.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # No getroot(): the argument is used as the root itself.
        root = html
    xhtml_ns = "{%s}" % XHTML_NAMESPACE
    for element in root.iter(etree.Element):
        name = element.tag
        if name[0] != '{':
            element.tag = xhtml_ns + name
1757

1758

1759
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from the tags of an XHTML tree, in place.

    ``xhtml`` may be an element or an object offering ``getroot()``.
    Only elements in the XHTML namespace are visited and renamed.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # No getroot(): the argument is used as the root itself.
        root = xhtml
    xhtml_ns = "{%s}" % XHTML_NAMESPACE
    cut = len(xhtml_ns)
    for element in root.iter(xhtml_ns + "*"):
        element.tag = element.tag[cut:]
1771

1772

1773
# Substitution helpers (bound ``sub`` methods of compiled patterns) that
# match a ``<meta http-equiv="Content-Type" ...>`` tag.  The pattern is
# not a general match for such tags; it only targets the exact form
# that libxml2 serialises.  One helper works on text, the other on the
# ASCII-encoded bytes form of the same pattern.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'
).sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')
).sub
1779

1780

1781
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: when ``include_meta_content_type`` is false (the default) and
    ``method`` is 'html', every ``<meta http-equiv="Content-Type" ...>``
    tag is stripped from the serialised result.  When it is true, the
    output is returned exactly as serialised.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)
    # The meta tag is only stripped from 'html' output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Use the substitution helper matching the result type (text/bytes).
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(b'', serialised)
1853

1854

1855
# Run the docstring through __fix_docstring (defined elsewhere in this
# file; presumably it adapts the doctest output above to the running
# Python version -- confirm against its definition).
tostring.__doc__ = __fix_docstring(tostring.__doc__)
1856

1857

1858
def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.

    The file is written with ``encoding`` if given, otherwise with the
    document's own encoding, falling back to UTF-8.  The temporary file
    is not deleted afterwards; this is mainly meant for debugging.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    fd, path = tempfile.mkstemp(suffix='.html')
    # The file itself is deliberately left on disk; the ``with`` block
    # only guarantees that the handle gets closed, even on errors.
    with os.fdopen(fd, 'wb') as out:
        out_encoding = encoding or doc.docinfo.encoding or "UTF-8"
        doc.write(out, method="html", encoding=out_encoding)
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1879

1880

1881
################################################################################
1882
# configure Element class lookup
1883
################################################################################
1884

1885
class HTMLParser(etree.HTMLParser):
    """An ``etree.HTMLParser`` set up to produce lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Install the lxml.html class lookup on this parser.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1892

1893

1894
class XHTMLParser(etree.XMLParser):
    """An ``etree.XMLParser`` set up to produce lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.

    All keyword arguments are passed on to ``etree.XMLParser``.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Install the lxml.html class lookup on this parser.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1914

1915

1916
def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are forwarded to ``html_parser.makeelement()``.
    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)
1923

1924

1925
# Module-level HTMLParser instance, created once at import time;
# Element() builds its elements through it.
html_parser = HTMLParser()
1926
# Module-level XHTMLParser instance, created once at import time.
xhtml_parser = XHTMLParser()
1927

1928
Product

Resources

Company