Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sqlmapproject
GitHub Repository: sqlmapproject/sqlmap
Path: blob/master/thirdparty/clientform/clientform.py
2992 views
1
"""HTML form handling for web clients.
2
3
ClientForm is a Python module for handling HTML forms on the client
4
side, useful for parsing HTML forms, filling them in and returning the
5
completed forms to the server. It has developed from a port of Gisle
6
Aas' Perl module HTML::Form, from the libwww-perl library, but the
7
interface is not the same.
8
9
The most useful docstring is the one for HTMLForm.
10
11
RFC 1866: HTML 2.0
12
RFC 1867: Form-based File Upload in HTML
13
RFC 2388: Returning Values from Forms: multipart/form-data
14
HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
15
HTML 4.01 Specification, W3C Recommendation 24 December 1999
16
17
18
Copyright 2002-2007 John J. Lee <[email protected]>
19
Copyright 2005 Gary Poster
20
Copyright 2005 Zope Corporation
21
Copyright 1998-2000 Gisle Aas.
22
23
This code is free software; you can redistribute it and/or modify it
24
under the terms of the BSD or ZPL 2.1 licenses (see the file
25
COPYING.txt included with the distribution).
26
27
"""
28
29
# XXX
30
# Remove parser testing hack
31
# safeUrl()-ize action
32
# Switch to unicode throughout (would be 0.3.x)
33
# See Wichert Akkerman's 2004-01-22 message to c.l.py.
34
# Add charset parameter to Content-type headers? How to find value??
35
# Add some more functional tests
36
# Especially single and multiple file upload on the internet.
37
# Does file upload work when name is missing? Sourceforge tracker form
38
# doesn't like it. Check standards, and test with Apache. Test
39
# binary upload with Apache.
40
# mailto submission & enctype text/plain
41
# I'm not going to fix this unless somebody tells me what real servers
42
# that want this encoding actually expect: If enctype is
43
# application/x-www-form-urlencoded and there's a FILE control present.
44
# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
45
# 17.13.2), but I send "name=" ATM. What about multiple file upload??
46
47
# Would be nice, but I'm not going to do it myself:
48
# -------------------------------------------------
49
# Maybe a 0.4.x?
50
# Replace by_label etc. with moniker / selector concept. Allows, eg.,
51
# a choice between selection by value / id / label / element
52
# contents. Or choice between matching labels exactly or by
53
# substring. Etc.
54
# Remove deprecated methods.
55
# ...what else?
56
# Work on DOMForm.
57
# XForms? Don't know if there's a need here.
58
59
__all__ = ['AmbiguityError', 'CheckboxControl', 'Control',
60
'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm',
61
'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl',
62
'Item', 'ItemCountError', 'ItemNotFoundError', 'Label',
63
'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile',
64
'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl',
65
'RadioControl', 'ScalarControl', 'SelectControl',
66
'SubmitButtonControl', 'SubmitControl', 'TextControl',
67
'TextareaControl', 'XHTMLCompatibleFormParser']
68
69
try:
    import logging
    import inspect
except ImportError:
    def debug(msg, *args, **kwds):
        """No-op fallback used when logging/inspect cannot be imported."""
        pass
else:
    _logger = logging.getLogger("ClientForm")
    # While true, debug() returns immediately without touching the logger
    # or inspecting the stack.
    OPTIMIZATION_HACK = True

    def debug(msg, *args, **kwds):
        """Log msg at DEBUG level, prefixed with the calling function's name.

        msg is a %-style format string; args and kwds are passed through to
        the logger.  Does nothing while OPTIMIZATION_HACK is true.
        """
        if OPTIMIZATION_HACK:
            return

        # frame 1 is the caller of debug(); field 3 is its function name
        caller_name = inspect.stack()[1][3]
        # '%%s' survives this formatting step as '%s', leaving a slot that
        # the logger fills with caller_name
        extended_msg = '%%s %s' % msg
        extended_args = (caller_name,) + args
        _logger.debug(extended_msg, *extended_args, **kwds)

    def _show_debug_messages():
        """Enable debug() output and send it to sys.stdout."""
        global OPTIMIZATION_HACK
        OPTIMIZATION_HACK = False
        _logger.setLevel(logging.DEBUG)
        # sys is imported further down the module; it is only needed when
        # this function is actually called
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        _logger.addHandler(handler)
95
96
try:
97
from thirdparty import six
98
from thirdparty.six import unichr as _unichr
99
from thirdparty.six.moves import cStringIO as _cStringIO
100
from thirdparty.six.moves import html_entities as _html_entities
101
from thirdparty.six.moves import urllib as _urllib
102
except ImportError:
103
import six
104
from six import unichr as _unichr
105
from six.moves import cStringIO as _cStringIO
106
from six.moves import html_entities as _html_entities
107
from six.moves import urllib as _urllib
108
109
try:
110
import sgmllib
111
except ImportError:
112
from lib.utils import sgmllib
113
114
import sys, re, random
115
116
if sys.version_info >= (3, 0):
117
xrange = range
118
119
# monkeypatch to fix http://www.python.org/sf/803422 :-(
120
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
121
122
# HTMLParser.HTMLParser is recent, so live without it if it's not available
123
# (also, sgmllib.SGMLParser is much more tolerant of bad HTML)
124
try:
125
import HTMLParser
126
except ImportError:
127
HAVE_MODULE_HTMLPARSER = False
128
else:
129
HAVE_MODULE_HTMLPARSER = True
130
131
try:
    import warnings
except ImportError:
    def deprecation(message, stack_offset=0):
        """No-op fallback used when the warnings module is unavailable."""
        pass
else:
    def deprecation(message, stack_offset=0):
        """Issue a DeprecationWarning carrying message.

        The warning is issued with stacklevel 3 + stack_offset.
        """
        level = 3 + stack_offset
        warnings.warn(message, DeprecationWarning, stacklevel=level)
139
140
VERSION = "0.2.10"
141
142
CHUNK = 1024 # size of chunks fed to parser, in bytes
143
144
DEFAULT_ENCODING = "latin-1"
145
146
class Missing:
    """Empty marker class; it defines no attributes or behaviour."""
147
148
_compress_re = re.compile(r"\s+")


def compress_text(text):
    """Strip text and collapse each run of whitespace into a single space."""
    trimmed = text.strip()
    return _compress_re.sub(" ", trimmed)
150
151
def normalize_line_endings(text):
    """Return text with every lone LF or lone CR replaced by CRLF.

    Existing CRLF pairs are left untouched.
    """
    # first alternative: LF not preceded by CR; second: CR not followed by LF
    lone_terminator = r"(?:(?<!\r)\n)|(?:\r(?!\n))"
    return re.sub(lone_terminator, "\r\n", text)
153
154
def _quote_plus(value):
    """Return value URL-quoted with quote_plus.

    Non-string values are first converted with six.text_type; text values
    are encoded as UTF-8 before being quoted.
    """
    if isinstance(value, six.string_types):
        text = value
    else:
        text = six.text_type(value)

    if isinstance(text, six.text_type):
        text = text.encode("utf8")

    return _urllib.parse.quote_plus(text)
162
163
# This version of urlencode is from my Python 1.5.2 back-port of the
164
# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
165
# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
166
def urlencode(query, doseq=False,):
    """Encode a mapping or a sequence of two-element tuples as a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError if query is neither a mapping nor a non-string
    sequence of tuples.
    """
    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items fail in len(); non-empty strings fail the
            # tuple check.  Zero-length sequences of all types get through,
            # which matches the treatment of empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError()
        except TypeError:
            # The message is the only exception argument; the traceback
            # object is deliberately not passed to the constructor.
            raise TypeError("not a valid non-string sequence or mapping "
                            "object")

    pairs = []
    for k, v in query:
        k = _quote_plus(k)
        if not doseq or isinstance(v, six.string_types):
            # scalar handling (always used when doseq is false)
            pairs.append(k + '=' + _quote_plus(v))
            continue
        try:
            # is this a sufficient test for sequence-ness?
            len(v)
        except TypeError:
            # not a sequence
            pairs.append(k + '=' + _quote_plus(v))
        else:
            # one "key=value" pair per sequence element
            for elt in v:
                pairs.append(k + '=' + _quote_plus(elt))
    return '&'.join(pairs)
225
226
def unescape(data, entities, encoding=DEFAULT_ENCODING):
    """Replace entity and character references in data.

    data: text to process; returned unchanged if it is None or has no "&"
    entities: mapping from "&name;" to replacement value
    encoding: used to decode replacement values that have a decode()
      method; ignored (reset to None) when data is a string type
    """
    if data is None or "&" not in data:
        return data

    if isinstance(data, six.string_types):
        encoding = None

    def replace_entities(match, entities=entities, encoding=encoding):
        token = match.group()
        if token[1] == "#":
            # numeric character reference: strip "&#" and ";"
            return unescape_charref(token[2:-1], encoding)

        replacement = entities.get(token)
        if replacement is None:
            # unknown entity: leave the text as it was
            return token

        if hasattr(replacement, "decode") and encoding is not None:
            try:
                replacement = replacement.decode(encoding)
            except UnicodeError:
                replacement = token

        return replacement

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
251
252
def unescape_charref(data, encoding):
    """Convert the body of a numeric character reference to a character.

    data: reference text without the leading "&#" and trailing ";",
      e.g. "65" or "x41"
    encoding: accepted for interface compatibility; not used here

    A leading "x" selects base 16.  Text that is not all digits is also
    tried as base 16.  If the text cannot be converted, data is returned
    unchanged.
    """
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    elif not name.isdigit():
        base = 16

    try:
        return _unichr(int(name, base))
    except (ValueError, OverflowError):
        # not a number in the chosen base, or not a valid code point
        return data
263
264
def get_entitydefs():
    """Return a dict mapping "&name;" to the corresponding character.

    Uses name2codepoint from the HTML entities module when it exists,
    otherwise falls back to that module's entitydefs table.
    """
    from codecs import latin_1_decode

    result = {}
    try:
        _html_entities.name2codepoint
    except AttributeError:
        # fallback: values are latin-1 data or "&#NNN;" references
        for name, char in _html_entities.entitydefs.items():
            decoded = latin_1_decode(char)[0]
            if decoded.startswith("&#") and decoded.endswith(";"):
                decoded = unescape_charref(decoded[2:-1], None)
            result["&%s;" % name] = decoded
    else:
        for name, codepoint in _html_entities.name2codepoint.items():
            result["&%s;" % name] = _unichr(codepoint)
    return result
280
281
def issequence(x):
    """Return True if x supports indexing with 0.

    An IndexError (for example from an empty list) still counts as a
    sequence; TypeError or KeyError means it is not one.
    """
    try:
        x[0]
    except IndexError:
        return True
    except (TypeError, KeyError):
        return False
    return True
289
290
def isstringlike(x):
    """Return True if x can be concatenated with a str using +."""
    try:
        x + ""
    except Exception:
        # Any ordinary failure means "not string-like".  KeyboardInterrupt
        # and SystemExit are deliberately allowed to propagate.
        return False
    else:
        return True
294
295
296
def choose_boundary():
    """Return a string usable as a multipart boundary."""
    # follow IE and firefox: 27 dashes followed by random decimal digits
    upper = sys.maxsize - 1
    pieces = []
    for _ in (0, 1, 2):
        pieces.append(str(random.randint(0, upper)))
    return "-" * 27 + "".join(pieces)
301
302
# This cut-n-pasted MimeWriter from standard library is here so can add
303
# to HTTP headers rather than message body when appropriate. It also uses
304
# \r\n in place of \n. This is a bit nasty.
305
class MimeWriter:

    """Generic MIME writer.

    Methods:

    __init__()
    addheader()
    flushheaders()
    startbody()
    startmultipartbody()
    nextpart()
    lastpart()

    A MIME writer is much more primitive than a MIME parser.  It
    doesn't seek around on the output file, and it doesn't use large
    amounts of buffer space, so you have to write the parts in the
    order they should occur on the output file.  It does buffer the
    headers you add, allowing you to rearrange their order.

    General usage is:

    f = <open the output file>
    w = MimeWriter(f)
    ...call w.addheader(key, value) 0 or more times...

    followed by either:

    f = w.startbody(content_type)
    ...call f.write(data) for body data...

    or:

    w.startmultipartbody(subtype)
    for each part:
        subwriter = w.nextpart()
        ...use the subwriter's methods to create the subpart...
    w.lastpart()

    The subwriter is another MimeWriter instance, and should be
    treated in the same way as the toplevel MimeWriter.  This way,
    writing recursive body parts is easy.

    Warning: don't forget to call lastpart()!

    XXX There should be more state so calls made in the wrong order
    are detected.

    Some special cases:

    - startbody() just returns the file passed to the constructor;
      but don't use this knowledge, as it may be changed.

    - startmultipartbody() actually returns a file as well;
      this can be used to write the initial 'if you can read this your
      mailer is not MIME-aware' message.

    - If you call flushheaders(), the headers accumulated so far are
      written out (and forgotten); this is useful if you don't need a
      body part at all, e.g. for a subpart of type message/rfc822
      that's (mis)used to store some header-like information.

    - Passing a keyword argument 'prefix=<flag>' to addheader(),
      start*body() affects where the header is inserted; 0 means
      append at the end, 1 means insert at the start; default is
      append for addheader(), but insert for start*body(), which use
      it to determine where the Content-type header goes.

    """

    def __init__(self, fp, http_hdrs=None):
        """
        fp: object with write() and writelines() that receives the output
        http_hdrs: list that collects (name, value) pairs for headers added
          with add_to_http_hdrs true
        """
        self._http_hdrs = http_hdrs
        self._fp = fp
        self._headers = []
        self._boundary = []
        self._first_part = True

    def addheader(self, key, value, prefix=0,
                  add_to_http_hdrs=0):
        """Buffer a header, or append it to the HTTP header list.

        prefix is ignored if add_to_http_hdrs is true.
        """
        lines = value.split("\r\n")
        # drop empty leading and trailing lines
        while lines and not lines[-1]: del lines[-1]
        while lines and not lines[0]: del lines[0]
        if add_to_http_hdrs:
            value = "".join(lines)
            # 2.2 urllib2 doesn't normalize header case
            self._http_hdrs.append((key.capitalize(), value))
        else:
            # continuation lines are stripped and re-indented
            # NOTE(review): the stdlib MimeWriter this was copied from may
            # use a wider indent; the visible source shows a single space
            lines[1:] = [" " + line.strip() for line in lines[1:]]
            value = "\r\n".join(lines) + "\r\n"
            line = key.title() + ": " + value
            if prefix:
                self._headers.insert(0, line)
            else:
                self._headers.append(line)

    def flushheaders(self):
        """Write out the buffered headers and forget them."""
        self._fp.writelines(self._headers)
        self._headers = []

    def startbody(self, ctype=None, plist=None, prefix=1,
                  add_to_http_hdrs=0, content_type=1):
        """Emit the headers (including Content-Type) and return the file.

        plist is an optional sequence of (name, value) content-type
        parameters.  prefix is ignored if add_to_http_hdrs is true.
        """
        if content_type and ctype:
            for name, value in plist or ():
                ctype = ctype + ';\r\n %s=%s' % (name, value)
            self.addheader("Content-Type", ctype, prefix=prefix,
                           add_to_http_hdrs=add_to_http_hdrs)
        self.flushheaders()
        # blank line separating the headers from the body
        if not add_to_http_hdrs: self._fp.write("\r\n")
        self._first_part = True
        return self._fp

    def startmultipartbody(self, subtype, boundary=None, plist=None, prefix=1,
                           add_to_http_hdrs=0, content_type=1):
        """Start a multipart/<subtype> body and return the file.

        A boundary is generated with choose_boundary() if none is given.
        """
        boundary = boundary or choose_boundary()
        self._boundary.append(boundary)
        return self.startbody("multipart/" + subtype,
                              [("boundary", boundary)] + list(plist or ()),
                              prefix=prefix,
                              add_to_http_hdrs=add_to_http_hdrs,
                              content_type=content_type)

    def nextpart(self):
        """Write the boundary line and return a writer for the next part."""
        boundary = self._boundary[-1]
        if self._first_part:
            self._first_part = False
        else:
            self._fp.write("\r\n")
        self._fp.write("--" + boundary + "\r\n")
        return self.__class__(self._fp)

    def lastpart(self):
        """Write the closing boundary of the current multipart body."""
        if self._first_part:
            # no part was written yet: emit an opening boundary first
            self.nextpart()
        boundary = self._boundary.pop()
        self._fp.write("\r\n--" + boundary + "--\r\n")
447
448
449
class LocateError(ValueError):
    """Common base of the lookup errors defined directly below."""


class AmbiguityError(LocateError):
    """LocateError subclass."""


class ControlNotFoundError(LocateError):
    """LocateError subclass."""


class ItemNotFoundError(LocateError):
    """LocateError subclass."""


class ItemCountError(ValueError):
    """ValueError subclass."""
455
456
# for backwards compatibility, ParseError derives from exceptions that were
457
# raised by versions of ClientForm <= 0.2.5
458
# ParseError inherits from whichever parser exception classes are available;
# SGMLLIB_PARSEERROR is the exception the sgmllib-based parsers catch.
if HAVE_MODULE_HTMLPARSER:
    SGMLLIB_PARSEERROR = sgmllib.SGMLParseError

    class ParseError(sgmllib.SGMLParseError,
                     HTMLParser.HTMLParseError,
                     ):
        pass
elif hasattr(sgmllib, "SGMLParseError"):
    SGMLLIB_PARSEERROR = sgmllib.SGMLParseError

    class ParseError(sgmllib.SGMLParseError):
        pass
else:
    # sgmllib without SGMLParseError: fall back to RuntimeError
    SGMLLIB_PARSEERROR = RuntimeError

    class ParseError(RuntimeError):
        pass
473
474
475
class _AbstractFormParser:
    """forms attribute contains HTMLForm instances on completion."""
    # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
    #
    # While parsing, each entry of self.forms is a tuple
    # ((name, action, method, enctype), attrs_dict, controls) and each
    # control is a (type, name, attrs_dict) tuple.
    #
    # Subclasses must provide unescape_attr_if_required().

    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        """
        entitydefs: mapping like {"&amp;": "&", ...}; get_entitydefs() is
          used when it is None
        encoding: passed to unescape() / unescape_charref()
        """
        if entitydefs is None:
            entitydefs = get_entitydefs()
        self._entitydefs = entitydefs
        self._encoding = encoding

        self.base = None
        self.forms = []
        self.labels = []
        self._current_label = None
        self._current_form = None
        self._select = None
        self._optgroup = None
        self._option = None
        self._textarea = None

        # forms[0] will contain all controls that are outside of any form
        # self._global_form is an alias for self.forms[0]
        self._global_form = None
        self.start_form([])
        self.end_form()
        self._current_form = self._global_form = self.forms[0]

    def do_base(self, attrs):
        """Record the href of a BASE element in self.base."""
        debug("%s", attrs)
        for key, value in attrs:
            if key == "href":
                self.base = self.unescape_attr_if_required(value)

    def end_body(self):
        """Close any label and form still open at the end of the body."""
        debug("")
        if self._current_label is not None:
            self.end_label()
        if self._current_form is not self._global_form:
            self.end_form()

    def start_form(self, attrs):
        """Begin a FORM; raises ParseError if one is already open."""
        debug("%s", attrs)
        if self._current_form is not self._global_form:
            raise ParseError("nested FORMs")
        name = None
        action = None
        enctype = "application/x-www-form-urlencoded"
        method = "GET"
        d = {}
        for key, value in attrs:
            if key == "name":
                name = self.unescape_attr_if_required(value)
            elif key == "action":
                action = self.unescape_attr_if_required(value)
            elif key == "method":
                method = self.unescape_attr_if_required(value.upper())
            elif key == "enctype":
                enctype = self.unescape_attr_if_required(value.lower())
            d[key] = self.unescape_attr_if_required(value)
        controls = []
        self._current_form = (name, action, method, enctype), d, controls

    def end_form(self):
        """Finish the open FORM and append it to self.forms."""
        debug("")
        if self._current_label is not None:
            self.end_label()
        if self._current_form is self._global_form:
            raise ParseError("end of FORM before start")
        self.forms.append(self._current_form)
        self._current_form = self._global_form

    def start_select(self, attrs):
        """Begin a SELECT and record its attributes as a control entry."""
        debug("%s", attrs)
        if self._select is not None:
            raise ParseError("nested SELECTs")
        if self._textarea is not None:
            raise ParseError("SELECT inside TEXTAREA")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._select = d
        self._add_label(d)

        self._append_select_control({"__select": d})

    def end_select(self):
        """Finish the open SELECT, closing a pending OPTION first."""
        debug("")
        if self._select is None:
            raise ParseError("end of SELECT before start")

        if self._option is not None:
            self._end_option()

        self._select = None

    def start_optgroup(self, attrs):
        """Remember the attributes of the OPTGROUP being entered."""
        debug("%s", attrs)
        if self._select is None:
            raise ParseError("OPTGROUP outside of SELECT")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._optgroup = d

    def end_optgroup(self):
        debug("")
        if self._optgroup is None:
            raise ParseError("end of OPTGROUP before start")
        self._optgroup = None

    def _start_option(self, attrs):
        """Begin an OPTION, implicitly ending any OPTION still open."""
        debug("%s", attrs)
        if self._select is None:
            raise ParseError("OPTION outside of SELECT")
        if self._option is not None:
            self._end_option()

        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._option = {}
        self._option.update(d)
        # an OPTION inside a disabled OPTGROUP is itself disabled
        if (self._optgroup and "disabled" in self._optgroup and
                "disabled" not in self._option):
            self._option["disabled"] = None

    def _end_option(self):
        """Finish the open OPTION and add it to the current form."""
        debug("")
        if self._option is None:
            raise ParseError("end of OPTION before start")

        contents = self._option.get("contents", "").strip()
        self._option["contents"] = contents
        # value and label default to the element's text
        if "value" not in self._option:
            self._option["value"] = contents
        if "label" not in self._option:
            self._option["label"] = contents
        # stuff dict of SELECT HTML attrs into a special private key
        #  (gets deleted again later)
        self._option["__select"] = self._select
        self._append_select_control(self._option)
        self._option = None

    def _append_select_control(self, attrs):
        debug("%s", attrs)
        controls = self._current_form[2]
        name = self._select.get("name")
        controls.append(("select", name, attrs))

    def start_textarea(self, attrs):
        """Begin a TEXTAREA; its text is collected by handle_data()."""
        debug("%s", attrs)
        if self._textarea is not None:
            raise ParseError("nested TEXTAREAs")
        if self._select is not None:
            raise ParseError("TEXTAREA inside SELECT")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        self._add_label(d)

        self._textarea = d

    def end_textarea(self):
        """Finish the open TEXTAREA and add it to the current form."""
        debug("")
        if self._textarea is None:
            raise ParseError("end of TEXTAREA before start")
        controls = self._current_form[2]
        name = self._textarea.get("name")
        controls.append(("textarea", name, self._textarea))
        self._textarea = None

    def start_label(self, attrs):
        """Begin a LABEL, implicitly ending any LABEL still open."""
        debug("%s", attrs)
        if self._current_label:
            self.end_label()
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        taken = bool(d.get("for"))  # empty id is invalid
        d["__text"] = ""
        d["__taken"] = taken
        if taken:
            self.labels.append(d)
        self._current_label = d

    def end_label(self):
        debug("")
        label = self._current_label
        if label is None:
            # something is ugly in the HTML, but we're ignoring it
            return
        self._current_label = None
        # if it is staying around, it is True in all cases
        del label["__taken"]

    def _add_label(self, d):
        """Attach the open label to control attrs d unless already taken."""
        #debug("%s", d)
        if self._current_label is not None:
            if not self._current_label["__taken"]:
                self._current_label["__taken"] = True
                d["__label"] = self._current_label

    def handle_data(self, data):
        """Append text to the open OPTION, TEXTAREA or LABEL, if any."""
        debug("%s", data)

        if not data:
            # Nothing to add.  Returning here also avoids a KeyError when
            # no text has been stored yet for the current element.
            return

        if self._option is not None:
            # self._option is a dictionary of the OPTION element's HTML
            # attributes, but it has two special keys, one of which is the
            # special "contents" key contains text between OPTION tags (the
            # other is the "__select" key: see the end_option method)
            target = self._option
            key = "contents"
        elif self._textarea is not None:
            target = self._textarea
            key = "value"
            data = normalize_line_endings(data)
        # not if within option or textarea
        elif self._current_label is not None:
            target = self._current_label
            key = "__text"
        else:
            return

        if key not in target:
            # according to
            # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break
            # immediately after start tags or immediately before end tags must
            # be ignored, but real browsers only ignore a line break after a
            # start tag, so we'll do that.
            if data[0:2] == "\r\n":
                data = data[2:]
            elif data[0:1] in ["\n", "\r"]:
                data = data[1:]
            target[key] = data
        else:
            existing = target[key]
            if isinstance(existing, six.binary_type):
                existing = existing.decode("utf8", "replace")
            target[key] = existing + data

    def do_button(self, attrs):
        """Add a BUTTON element to the current form's controls."""
        debug("%s", attrs)
        d = {}
        d["type"] = "submit"  # default
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        control_type = d["type"]
        name = d.get("name")
        # we don't want to lose information, so use a type string that
        # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
        # e.g. type for BUTTON/RESET is "resetbutton"
        #     (type for INPUT/RESET is "reset")
        control_type = control_type + "button"
        self._add_label(d)
        controls.append((control_type, name, d))

    def do_input(self, attrs):
        """Add an INPUT element to the current form's controls."""
        debug("%s", attrs)
        d = {}
        d["type"] = "text"  # default
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        control_type = d["type"]
        name = d.get("name")
        self._add_label(d)
        controls.append((control_type, name, d))

    def do_isindex(self, attrs):
        """Add an ISINDEX element to the current form's controls."""
        debug("%s", attrs)
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        self._add_label(d)
        # isindex doesn't have type or name HTML attributes
        controls.append(("isindex", None, d))

    def handle_entityref(self, name):
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        """Return a copy of attrs with every value unescaped (recursively)."""
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            try:
                val.items
            except AttributeError:
                escaped_attrs[key] = self.unescape_attr(val)
            else:
                # e.g. "__select" -- yuck!
                escaped_attrs[key] = self.unescape_attrs(val)
        return escaped_attrs

    def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
    def unknown_charref(self, ref): self.handle_data("&#%s;" % ref)
784
785
786
if HAVE_MODULE_HTMLPARSER:
    class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
        """Good for XHTML, bad for tolerance of incorrect HTML."""
        # thanks to Michael Howitz for this!
        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            HTMLParser.HTMLParser.__init__(self)
            _AbstractFormParser.__init__(self, entitydefs, encoding)

        def feed(self, data):
            """Feed data to HTMLParser, re-raising its errors as ParseError."""
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError as exc:
                raise ParseError(exc)

        def start_option(self, attrs):
            _AbstractFormParser._start_option(self, attrs)

        def end_option(self):
            _AbstractFormParser._end_option(self)

        def handle_starttag(self, tag, attrs):
            """Dispatch to start_<tag>, else do_<tag>; ignore unknown tags."""
            for prefix in ("start_", "do_"):
                try:
                    handler = getattr(self, prefix + tag)
                except AttributeError:
                    continue
                handler(attrs)
                return
            # unknown tag: nothing to do

        def handle_endtag(self, tag):
            """Dispatch to end_<tag>; ignore unknown tags."""
            try:
                handler = getattr(self, "end_" + tag)
            except AttributeError:
                return  # unknown tag
            handler()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it

        def unescape_attrs_if_required(self, attrs):
            return attrs  # ditto

        def close(self):
            HTMLParser.HTMLParser.close(self)
            self.end_body()
else:
    class XHTMLCompatibleFormParser:
        """Placeholder whose constructor always raises ValueError."""
        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            raise ValueError("HTMLParser could not be imported")
844
845
846
class _AbstractSgmllibParser(_AbstractFormParser):
    """Form-parser pieces shared by the sgmllib-style parser classes."""

    def do_option(self, attrs):
        _AbstractFormParser._start_option(self, attrs)

    if sys.version_info[:2] >= (2, 5):
        # we override this attr to decode hex charrefs
        entity_or_charref = re.compile(
            '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')

        def convert_entityref(self, name):
            reference = "&%s;" % name
            return unescape(reference, self._entitydefs, self._encoding)

        def convert_charref(self, name):
            return unescape_charref("%s" % name, self._encoding)

        def unescape_attr_if_required(self, name):
            # sgmllib already did it
            return name

        def unescape_attrs_if_required(self, attrs):
            # ditto
            return attrs
    else:
        # on this branch the attribute values are unescaped here
        def unescape_attr_if_required(self, name):
            return self.unescape_attr(name)

        def unescape_attrs_if_required(self, attrs):
            return self.unescape_attrs(attrs)
868
869
870
class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
    """Good for tolerance of incorrect HTML, bad for XHTML."""

    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        sgmllib.SGMLParser.__init__(self)
        _AbstractFormParser.__init__(self, entitydefs, encoding)

    def feed(self, data):
        """Feed data to SGMLParser, re-raising its errors as ParseError."""
        try:
            sgmllib.SGMLParser.feed(self, data)
        except SGMLLIB_PARSEERROR as error:
            raise ParseError(error)

    def close(self):
        """Close the SGML parser, then any label/form still open."""
        sgmllib.SGMLParser.close(self)
        self.end_body()
885
886
887
# sigh, must support mechanize by allowing dynamic creation of classes based on
888
# its bundled copy of BeautifulSoup (which was necessary because of dependency
889
# problems)
890
891
def _create_bs_classes(bs,
                       icbinbs,
                       ):
    """Build form-parser classes on top of two BeautifulSoup parser classes.

    bs and icbinbs are the base parser classes to combine with the form
    parser.  Returns (RobustFormParser, NestingRobustFormParser).
    """
    class _AbstractBSFormParser(_AbstractSgmllibParser):
        # set by each concrete subclass to its BeautifulSoup base class
        bs_base_class = None

        def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
            _AbstractFormParser.__init__(self, entitydefs, encoding)
            self.bs_base_class.__init__(self)

        def handle_data(self, data):
            # both the form parser and the soup parser see the text
            _AbstractFormParser.handle_data(self, data)
            self.bs_base_class.handle_data(self, data)

        def feed(self, data):
            try:
                self.bs_base_class.feed(self, data)
            except SGMLLIB_PARSEERROR as error:
                raise ParseError(error)

        def close(self):
            self.bs_base_class.close(self)
            self.end_body()

    class RobustFormParser(_AbstractBSFormParser, bs):
        """Tries to be highly tolerant of incorrect HTML."""
        bs_base_class = bs

    class NestingRobustFormParser(_AbstractBSFormParser, icbinbs):
        """Tries to be highly tolerant of incorrect HTML.

        Different from RobustFormParser in that it more often guesses nesting
        above missing end tags (see BeautifulSoup docs).

        """
        bs_base_class = icbinbs

    return RobustFormParser, NestingRobustFormParser
926
927
try:
928
if sys.version_info[:2] < (2, 2):
929
raise ImportError # BeautifulSoup uses generators
930
import BeautifulSoup
931
except ImportError:
932
pass
933
else:
934
RobustFormParser, NestingRobustFormParser = _create_bs_classes(
935
BeautifulSoup.BeautifulSoup, BeautifulSoup.ICantBelieveItsBeautifulSoup
936
)
937
__all__ += ['RobustFormParser', 'NestingRobustFormParser']
938
939
940
#FormParser = XHTMLCompatibleFormParser # testing hack
941
#FormParser = RobustFormParser # testing hack
942
943
944
def ParseResponseEx(response,
                    select_default=False,
                    form_parser_class=FormParser,
                    request_class=_urllib.request.Request,
                    entitydefs=None,
                    encoding=DEFAULT_ENCODING,

                    # private
                    _urljoin=_urllib.parse.urljoin,
                    _urlparse=_urllib.parse.urlparse,
                    _urlunparse=_urllib.parse.urlunparse,
                    ):
    """Identical to ParseResponse, except that:

    1. The returned list contains an extra item.  The first form in the list
    contains all controls not contained in any FORM element.

    2. The arguments ignore_errors and backwards_compat have been removed.

    3. Backwards-compatibility mode (backwards_compat=True) is not available.
    """
    # the base URI is taken from the response itself
    base_uri = response.geturl()
    return _ParseFileEx(
        response, base_uri,
        select_default,
        False,
        form_parser_class,
        request_class,
        entitydefs,
        False,
        encoding,
        _urljoin=_urljoin,
        _urlparse=_urlparse,
        _urlunparse=_urlunparse,
    )
977
978
def ParseFileEx(file, base_uri,
                select_default=False,
                form_parser_class=FormParser,
                request_class=_urllib.request.Request,
                entitydefs=None,
                encoding=DEFAULT_ENCODING,

                # private
                _urljoin=_urllib.parse.urljoin,
                _urlparse=_urllib.parse.urlparse,
                _urlunparse=_urllib.parse.urlunparse,
                ):
    """Identical to ParseFile, except that:

    1. The returned list contains an extra item.  The first form in the list
    contains all controls not contained in any FORM element.

    2. The arguments ignore_errors and backwards_compat have been removed.

    3. Backwards-compatibility mode (backwards_compat=True) is not available.
    """
    # the two literal False arguments are passed positionally, as before
    return _ParseFileEx(
        file, base_uri,
        select_default,
        False,
        form_parser_class,
        request_class,
        entitydefs,
        False,
        encoding,
        _urljoin=_urljoin,
        _urlparse=_urlparse,
        _urlunparse=_urlunparse,
    )
1011
1012
def ParseResponse(response, *args, **kwds):
    """Parse an HTTP response and return a list of HTMLForm instances.

    The object returned by urllib2.urlopen can be handed straight to this
    function as the response argument.

    ClientForm.ParseError is raised on parse errors.

    response: file-like object (it must support read()) that also has a
     geturl() method returning the URI of the HTTP response
    select_default: for multiple-selection SELECT controls and RADIO controls,
     pick the first item as the default if none are selected in the HTML
    form_parser_class: class that is instantiated and fed the HTML in order to
     parse it
    request_class: class to return from the .click() method (default is
     _urllib.request.Request)
    entitydefs: mapping like {"&amp;": "&", ...} containing HTML entity
     definitions (a sensible default is used)
    encoding: character encoding used for encoding numeric character
     references when matching link text.  ClientForm makes no attempt to
     discover the encoding from a META HTTP-EQUIV attribute in the document
     itself (mechanize, for example, does do that and passes the correct
     value to ClientForm through this parameter).

    backwards_compat: boolean that determines whether the returned HTMLForm
     objects are backwards-compatible with old code.  If backwards_compat is
     true:

     - ClientForm 0.1 code will continue to work as before.

     - Label searches that do not specify a nr (number or count) always get
       the first match, even if other controls match.  If backwards_compat
       is False, label searches with ambiguous results raise an
       AmbiguityError.

     - Item label matching is done by strict string comparison rather than
       substring matching.

     - De-selecting individual list items is allowed even if the Item is
       disabled.

    The backwards_compat argument will be deprecated in a future release.

    Pass a true value for select_default if you want the behaviour specified
    by RFC 1866 (the HTML 2.0 standard), which is to select the first item in
    a RADIO or multiple-selection SELECT control if none were selected in the
    HTML.  Most browsers (including Microsoft Internet Explorer (IE) and
    Netscape Navigator) instead leave all items unselected in these cases.
    The W3C HTML 4.0 standard leaves this behaviour undefined for
    multiple-selection SELECT controls, but insists that at least one RADIO
    button should be checked at all times, in contradiction to browser
    behaviour.

    There is a choice of parsers.  ClientForm.XHTMLCompatibleFormParser (uses
    HTMLParser.HTMLParser) works best for XHTML; ClientForm.FormParser (uses
    sgmllib.SGMLParser), the default, works better for ordinary grubby HTML.
    Note that HTMLParser is only available in Python 2.2 and later.  You can
    pass your own class in here as a hack to work around bad HTML, but at
    your own risk: there is no well-defined interface.

    """
    every_form = _ParseFileEx(response, response.geturl(), *args, **kwds)
    # The leading entry is dropped; the *Ex variants are the ones that return
    # the extra first form.
    return every_form[1:]
1073
1074
def ParseFile(file, base_uri, *args, **kwds):
    """Parse HTML read from a file-like object into HTMLForm instances.

    ClientForm.ParseError is raised on parse errors.

    file: file-like object (it must support read()) containing HTML with zero
     or more forms to be parsed
    base_uri: the URI of the document (note that the base URI used to submit
     the form will be the one given in the BASE element if present, not that
     of the document)

    For the other arguments and further details, see ParseResponse.__doc__.

    """
    every_form = _ParseFileEx(file, base_uri, *args, **kwds)
    # The leading entry is dropped; the *Ex variants are the ones that return
    # the extra first form.
    return every_form[1:]
1089
1090
def _ParseFileEx(file, base_uri,
                 select_default=False,
                 ignore_errors=False,
                 form_parser_class=FormParser,
                 request_class=_urllib.request.Request,
                 entitydefs=None,
                 backwards_compat=True,
                 encoding=DEFAULT_ENCODING,
                 _urljoin=_urllib.parse.urljoin,
                 _urlparse=_urllib.parse.urlparse,
                 _urlunparse=_urllib.parse.urlunparse,
                 ):
    """Shared implementation behind the public Parse* entry points.

    Reads `file` in CHUNK-sized pieces, feeds them to an instance of
    form_parser_class, and builds one HTMLForm per entry in the parser's
    `forms` list.  Returns the list of HTMLForm objects.

    A ParseError raised while feeding is re-raised after its base_uri
    attribute has been set to the document URI.  The ignore_errors argument
    is accepted but not consulted by this function.

    """
    if backwards_compat:
        deprecation("operating in backwards-compatibility mode", 1)

    parser = form_parser_class(entitydefs, encoding)
    keep_reading = True
    while keep_reading:
        chunk = file.read(CHUNK)
        try:
            parser.feed(chunk)
        except ParseError as e:
            e.base_uri = base_uri
            raise
        # a read shorter (or longer) than CHUNK ends the loop
        keep_reading = len(chunk) == CHUNK
    parser.close()

    if parser.base is not None:
        # HTML BASE element takes precedence over document URI
        base_uri = parser.base

    # Wrap every parsed label and group the wrappers by their "for" id.
    labels = []
    id_to_labels = {}
    for label_attrs in parser.labels:
        label = Label(label_attrs)
        labels.append(label)
        id_to_labels.setdefault(label_attrs["for"], []).append(label)

    forms = []
    for (name, action, method, enctype), attrs, controls in parser.forms:
        if action is None:
            action = base_uri
        else:
            if action and isinstance(action, six.binary_type):
                action = six.text_type(action, "utf8")
            action = _urljoin(base_uri, action)

        # would be nice to make HTMLForm class (form builder) pluggable
        form = HTMLForm(action, method, enctype, name, attrs, request_class,
                        forms, labels, id_to_labels, backwards_compat)
        form._urlparse = _urlparse
        form._urlunparse = _urlunparse

        for position, (ctl_type, ctl_name, ctl_attrs) in enumerate(controls):
            # index=position*10 allows ImageControl to return multiple
            # ordered pairs
            form.new_control(ctl_type, ctl_name, ctl_attrs,
                             select_default=select_default,
                             index=position * 10)
        forms.append(form)

    for form in forms:
        try:
            form.fixup()
        except AttributeError as ex:
            # Complaints about disabled/readonly controls are tolerated;
            # any other AttributeError propagates.
            text = str(ex)
            if "is disabled" not in text and "is readonly" not in text:
                raise
    return forms
1154
1155
1156
class Label:
    """Text and attributes of a parsed LABEL.

    The read-only `text` attribute yields the stripped label text when
    _backwards_compat is true, and the compress_text()-processed form of it
    otherwise.
    """

    def __init__(self, attrs):
        # attrs: mapping carrying the label's HTML attributes plus "__text"
        self.id = attrs.get("for")
        self._text = attrs.get("__text").strip()
        self._ctext = compress_text(self._text)
        self.attrs = attrs
        self._backwards_compat = False  # maintained by HTMLForm

    def __getattr__(self, name):
        # Only reached when normal attribute lookup has failed.
        if name != "text":
            return getattr(Label, name)
        return self._text if self._backwards_compat else self._ctext

    def __setattr__(self, name, value):
        if name == "text":
            # don't see any need for this, so make it read-only
            raise AttributeError("text attribute is read-only")
        self.__dict__[name] = value

    def __str__(self):
        shown = (self.id, self.text)
        return "<Label(id=%r, text=%r)>" % shown
1180
1181
1182
def _get_label(attrs):
1183
text = attrs.get("__label")
1184
if text is not None:
1185
return Label(text)
1186
else:
1187
return None
1188
1189
class Control:
    """An HTML form control.

    An HTMLForm holds a sequence of Controls.  They are reached through the
    HTMLForm.find_control method or the HTMLForm.controls attribute.

    Control instances are usually constructed by the ParseFile /
    ParseResponse functions.  If you use those functions, you can ignore the
    rest of this paragraph.  A Control is only properly initialised after the
    fixup method has been called.  In fact, this is only strictly necessary
    for ListControl instances.  This is necessary because ListControls are
    built up from ListControls each containing only a single item, and their
    initial value(s) can only be known after the sequence is complete.

    The types and values that are acceptable for assignment to the value
    attribute are defined by subclasses.

    If the disabled attribute is true, this represents the state typically
    represented by browsers by 'greying out' a control.  If the disabled
    attribute is true, the Control will raise AttributeError if an attempt is
    made to change its value.  In addition, the control will not be
    considered 'successful' as defined by the W3C HTML 4 standard -- ie. it
    will contribute no data to the return value of the HTMLForm.click*
    methods.  To enable a control, set the disabled attribute to a false
    value.

    If the readonly attribute is true, the Control will raise AttributeError
    if an attempt is made to change its value.  To make a control writable,
    set the readonly attribute to a false value.

    All controls have the disabled and readonly attributes, not only those
    that may have the HTML attributes of the same names.

    On assignment to the value attribute, the following exceptions are
    raised: TypeError, AttributeError (if the value attribute should not be
    assigned to, because the control is disabled, for example) and
    ValueError.

    If the name or value attributes are None, or the value is an empty list,
    or if the control is disabled, the control is not successful.

    Public attributes:

    type: string describing type of control (see the keys of the
     HTMLForm.type2class dictionary for the allowable values) (readonly)
    name: name of control (readonly)
    value: current value of control (subclasses may allow a single value, a
     sequence of values, or either)
    disabled: disabled state
    readonly: readonly state
    id: value of id HTML attribute

    """

    def __init__(self, type, name, attrs, index=None):
        """
        type: string describing type of control (see the keys of the
         HTMLForm.type2class dictionary for the allowable values)
        name: control name
        attrs: HTML attributes of control's HTML element

        """
        raise NotImplementedError()

    def add_to_form(self, form):
        """Remember the owning form and append this control to its list."""
        self._form = form
        form.controls.append(self)

    def fixup(self):
        """Hook called once parsing is complete; does nothing by default."""
        pass

    def is_of_kind(self, kind):
        raise NotImplementedError()

    def clear(self):
        raise NotImplementedError()

    def __getattr__(self, name):
        raise NotImplementedError()

    def __setattr__(self, name, value):
        raise NotImplementedError()

    def pairs(self):
        """Return list of (key, value) pairs suitable for passing to urlencode.
        """
        return [(key, val)
                for (_position, key, val) in self._totally_ordered_pairs()]

    def _totally_ordered_pairs(self):
        """Return list of (index, key, value) tuples.

        Like pairs, but the leading index allows the correct ordering to be
        preserved even where several controls are involved.

        """
        raise NotImplementedError()

    def _write_mime_data(self, mw, name, value):
        """Write data for a subitem of this control to a MimeWriter."""
        # called by HTMLForm
        part = mw.nextpart()
        disposition = 'form-data; name="%s"' % name
        part.addheader("Content-Disposition", disposition, 1)
        body = part.startbody(prefix=0)
        body.write(value)

    def __str__(self):
        raise NotImplementedError()

    def get_labels(self):
        """Return all labels (Label instances) for this control.

        If the control was surrounded by a <label> tag, that will be the
        first label; all other labels, connected by 'for' and 'id', are in
        the order that they appear in the HTML.

        """
        found = [self._label] if self._label else []
        if self.id:
            found.extend(self._form._id_to_labels.get(self.id, ()))
        return found
1307
1308
1309
#---------------------------------------------------
1310
class ScalarControl(Control):
    """Control whose value is not restricted to one of a prescribed set.

    Some ScalarControls don't accept any value attribute.  Otherwise, takes a
    single value, which must be string-like.

    Additional read-only public attribute:

    attrs: dictionary mapping the names of original HTML attributes of the
     control to their values

    """

    def __init__(self, type, name, attrs, index=None):
        self._index = index
        self._label = _get_label(attrs)
        # name and type are rejected by __setattr__, so store them directly
        self.__dict__["type"] = type.lower()
        self.__dict__["name"] = name
        self._value = attrs.get("value")
        # only the presence of these HTML attributes matters
        self.disabled = "disabled" in attrs
        self.readonly = "readonly" in attrs
        self.id = attrs.get("id")

        self.attrs = attrs.copy()

        self._clicked = False

        self._urlparse = _urllib.parse.urlparse
        self._urlunparse = _urllib.parse.urlunparse

    def __getattr__(self, name):
        # Only reached when normal attribute lookup has failed.
        if name != "value":
            raise AttributeError("%s instance has no attribute '%s'" %
                                 (self.__class__.__name__, name))
        return self.__dict__["_value"]

    def __setattr__(self, name, value):
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        if name != "value":
            self.__dict__[name] = value
            return
        # Assignment to .value: type check first, then the state checks.
        if not isstringlike(value):
            raise TypeError("must assign a string")
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        if self.disabled:
            raise AttributeError("control '%s' is disabled" % self.name)
        self.__dict__["_value"] = value

    def _totally_ordered_pairs(self):
        key = self.name
        val = self.value
        successful = (key is not None and val is not None
                      and not self.disabled)
        if not successful:
            return []
        return [(self._index, key, val)]

    def clear(self):
        """Reset the value to None; refused for readonly controls."""
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self.__dict__["_value"] = None

    def __str__(self):
        shown_name = self.name
        shown_value = self.value
        if shown_name is None:
            shown_name = "<None>"
        if shown_value is None:
            shown_value = "<None>"

        flags = [flag for flag in ("disabled", "readonly")
                 if getattr(self, flag)]
        suffix = ", ".join(flags)
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s=%s)%s>" % (
            self.__class__.__name__, shown_name, shown_value, suffix)
1385
1386
1387
#---------------------------------------------------
1388
class TextControl(ScalarControl):
    """Textual input control.

    Covers:

    INPUT/TEXT
    INPUT/PASSWORD
    INPUT/HIDDEN
    TEXTAREA

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # hidden fields are made readonly
        if self.type == "hidden":
            self.readonly = True
        # a missing value attribute becomes the empty string
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind):
        return kind == "text"
1406
1407
#---------------------------------------------------
1408
class FileControl(ScalarControl):
    """File upload with INPUT TYPE=FILE.

    The value attribute of a FileControl is always None.  Use add_file
    instead.

    Additional public method: add_file

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        self._value = None
        # (file_object, content_type, filename) triples, in the order added
        self._upload_data = []

    def is_of_kind(self, kind):
        return kind == "file"

    def clear(self):
        """Forget all files added so far; refused for readonly controls."""
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self._upload_data = []

    def __setattr__(self, name, value):
        if name in ("value", "name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        self.__dict__[name] = value

    def add_file(self, file_object, content_type=None, filename=None):
        """Queue a file-like object for upload with this control.

        content_type falls back to "application/octet-stream" when None.
        Raises TypeError if file_object has no read attribute, or if
        content_type / filename are neither None nor string-like.
        """
        if not hasattr(file_object, "read"):
            raise TypeError("file-like object must have read method")
        if not (content_type is None or isstringlike(content_type)):
            raise TypeError("content type must be None or string-like")
        if not (filename is None or isstringlike(filename)):
            raise TypeError("filename must be None or string-like")
        if content_type is None:
            content_type = "application/octet-stream"
        self._upload_data.append((file_object, content_type, filename))

    def _totally_ordered_pairs(self):
        # XXX should it be successful even if unnamed?
        if self.name is None or self.disabled:
            return []
        return [(self._index, self.name, "")]

    def _write_mime_data(self, mw, _name, _value):
        # called by HTMLForm
        # assert _name == self.name and _value == ''
        if len(self._upload_data) >= 2:
            # multiple files: one multipart/mixed part with a sub-part each
            outer = mw.nextpart()
            disp = 'form-data; name="%s"' % self.name
            outer.addheader("Content-Disposition", disp, prefix=1)
            outer.startmultipartbody("mixed", prefix=0)
            for file_object, content_type, filename in self._upload_data:
                inner = outer.nextpart()
                if filename is None:
                    filename = ""
                fn_part = '; filename="%s"' % filename
                disp = "file%s" % fn_part
                inner.addheader("Content-Disposition", disp, prefix=1)
                inner_body = inner.startbody(content_type, prefix=0)
                inner_body.write(file_object.read())
            outer.lastpart()
            return

        # zero files or exactly one
        if len(self._upload_data) == 0:
            file_object = _cStringIO()
            content_type = "application/octet-stream"
            filename = ""
        else:
            file_object, content_type, filename = self._upload_data[0]
            if filename is None:
                filename = ""
        part = mw.nextpart()
        fn_part = '; filename="%s"' % filename
        disp = 'form-data; name="%s"%s' % (self.name, fn_part)
        part.addheader("Content-Disposition", disp, prefix=1)
        body = part.startbody(content_type, prefix=0)
        body.write(file_object.read())

    def __str__(self):
        shown_name = self.name
        if shown_name is None:
            shown_name = "<None>"

        if self._upload_data:
            shown_value = ", ".join(
                ["<Unnamed file>" if filename is None else filename
                 for _file, _ctype, filename in self._upload_data])
        else:
            shown_value = "<No files added>"

        flags = [flag for flag in ("disabled", "readonly")
                 if getattr(self, flag)]
        suffix = ", ".join(flags)
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s=%s)%s>" % (
            self.__class__.__name__, shown_name, shown_value, suffix)
1509
1510
1511
#---------------------------------------------------
1512
class IsindexControl(ScalarControl):
    """ISINDEX control.

    ISINDEX is the odd-one-out of HTML form controls.  In fact, it isn't
    really part of regular HTML forms at all, and predates it.  You're only
    allowed one ISINDEX per HTML document.  ISINDEX and regular form
    submission are mutually exclusive -- either submit a form, or the
    ISINDEX.

    Having said this, since ISINDEX controls may appear in forms (which is
    probably bad HTML), ParseFile / ParseResponse will include them in the
    HTMLForm instances it returns.  You can set the ISINDEX's value, as with
    any other control (but note that ISINDEX controls have no name, so you'll
    need to use the type argument of set_value!).  When you submit the form,
    the ISINDEX will not be successful (ie., no data will get returned to the
    server as a result of its presence), unless you click on the ISINDEX
    control, in which case the ISINDEX gets submitted instead of the form:

    form.set_value("my isindex value", type="isindex")
    urllib2.urlopen(form.click(type="isindex"))

    ISINDEX elements outside of FORMs are ignored.  If you want to submit one
    by hand, do it like so:

    url = _urllib.parse.urljoin(page_uri, "?"+_urllib.parse.quote_plus("my isindex value"))
    result = urllib2.urlopen(url)

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # a missing value attribute becomes the empty string
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind):
        return kind in ["text", "clickable"]

    def _totally_ordered_pairs(self):
        # never contributes name/value pairs to a form submission
        return []

    def _click(self, form, coord, return_type, request_class=_urllib.request.Request):
        # Relative URL for ISINDEX submission: instead of "foo=bar+baz",
        # want "bar+baz".
        # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is
        # deprecated in 4.01, but it should still say how to submit it).
        # Submission of ISINDEX is explained in the HTML 3.2 spec, though.
        parsed = self._urlparse(form.action)
        # Keep everything before the last two components (query, fragment);
        # the query is replaced by the quoted value, the fragment by None.
        leading, (_old_query, _old_frag) = parsed[:-2], parsed[-2:]
        replacement = (_urllib.parse.quote_plus(self.value), None)
        url = self._urlunparse(leading + replacement)

        if return_type == "pairs":
            return []
        if return_type == "request_data":
            return url, None, []
        return request_class(url)

    def __str__(self):
        shown_value = self.value
        if shown_value is None:
            shown_value = "<None>"

        flags = [flag for flag in ("disabled", "readonly")
                 if getattr(self, flag)]
        suffix = ", ".join(flags)
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s)%s>" % (self.__class__.__name__, shown_value, suffix)
1579
1580
1581
#---------------------------------------------------
1582
class IgnoreControl(ScalarControl):
    """Control that we're not interested in.

    Covers:

    INPUT/RESET
    BUTTON/RESET
    INPUT/BUTTON
    BUTTON/BUTTON

    These controls are always unsuccessful, in the terminology of HTML 4 (ie.
    they never require any information to be returned to the server).

    BUTTON/BUTTON is used to generate events for script embedded in HTML.

    The value attribute of IgnoreControl is always None.

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # discard whatever value attribute the HTML carried
        self._value = None

    def is_of_kind(self, kind):
        return False

    def __setattr__(self, name, value):
        if name == "value":
            raise AttributeError(
                "control '%s' is ignored, hence read-only" % self.name)
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        self.__dict__[name] = value
1614
1615
1616
#---------------------------------------------------
1617
# ListControls
1618
1619
# helpers and subsidiary classes
1620
1621
class Item:
    """One selectable entry of a list control.

    Public attributes: name (taken from the "value" HTML attribute), id,
    attrs, disabled (assignable; coerced to bool) and selected (assignment is
    delegated to the owning control's _set_selected_state).  Assigning any
    other attribute raises AttributeError.
    """

    def __init__(self, control, attrs, index=None):
        """Create the item and append it to control.items.

        control: the owning control; must expose an `items` list
        attrs: HTML attributes of the item's element; "value" is required
         (KeyError otherwise)
        index: ordering index stored as _index
        """
        label = _get_label(attrs)
        # __setattr__ rejects most names, so populate the instance dict
        # directly.
        self.__dict__.update({
            "name": attrs["value"],
            "_labels": [label] if label else [],
            "attrs": attrs,
            "_control": control,
            "disabled": "disabled" in attrs,
            "_selected": False,
            "id": attrs.get("id"),
            "_index": index,
        })
        control.items.append(self)

    def get_labels(self):
        """Return all labels (Label instances) for this item.

        For items that represent radio buttons or checkboxes, if the item was
        surrounded by a <label> tag, that will be the first label; all other
        labels, connected by 'for' and 'id', are in the order that appear in
        the HTML.

        For items that represent select options, if the option had a label
        attribute, that will be the first label.  If the option has contents
        (text within the option tags) and it is not the same as the label
        attribute (if any), that will be a label.  There is nothing in the
        spec to my knowledge that makes an option with an id unable to be the
        target of a label's for attribute, so those are included, if any, for
        the sake of consistency and completeness.

        """
        res = list(self._labels)
        if self.id:
            res.extend(self._control._form._id_to_labels.get(self.id, ()))
        return res

    def __getattr__(self, name):
        # Only reached when normal lookup fails; "selected" is virtual and
        # backed by the _selected entry of the instance dict.
        if name == "selected":
            return self._selected
        raise AttributeError(name)

    def __setattr__(self, name, value):
        if name == "selected":
            self._control._set_selected_state(self, value)
        elif name == "disabled":
            self.__dict__["disabled"] = bool(value)
        else:
            raise AttributeError(name)

    def __str__(self):
        # "*" marks a selected item, parentheses a disabled one.
        res = self.name
        if self.selected:
            res = "*" + res
        if self.disabled:
            res = "(%s)" % res
        return res

    def __repr__(self):
        # XXX appending the attrs without distinguishing them from name and id
        # is silly
        # list() is required: on Python 3 dict.items() is a view and cannot
        # be concatenated to a list.
        attrs = [("name", self.name), ("id", self.id)] + list(self.attrs.items())
        return "<%s %s>" % (
            self.__class__.__name__,
            " ".join(["%s=%r" % (k, v) for k, v in attrs])
        )
1688
1689
def disambiguate(items, nr, **kwds):
    """Pick a single entry from the candidate list `items`.

    nr is the index of the wanted match, or None to demand that the match be
    unique.  The keyword arguments are only used to build the exception
    message ("key=value" pairs separated by spaces).

    Raises ItemNotFoundError if items is empty or has no entry at index nr,
    and AmbiguityError if nr is None while more than one item matched.
    """
    msg = " ".join(["%s=%r" % (key, value) for key, value in kwds.items()])
    if not items:
        raise ItemNotFoundError(msg)
    if nr is None:
        if len(items) > 1:
            raise AmbiguityError(msg)
        nr = 0
    if nr >= len(items):
        raise ItemNotFoundError(msg)
    return items[nr]
1703
1704
class ListControl(Control):
1705
"""Control representing a sequence of items.
1706
1707
The value attribute of a ListControl represents the successful list items
1708
in the control. The successful list items are those that are selected and
1709
not disabled.
1710
1711
ListControl implements both list controls that take a length-1 value
1712
(single-selection) and those that take length >1 values
1713
(multiple-selection).
1714
1715
ListControls accept sequence values only. Some controls only accept
1716
sequences of length 0 or 1 (RADIO, and single-selection SELECT).
1717
In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes
1718
and multiple-selection SELECTs (those having the "multiple" HTML attribute)
1719
accept sequences of any length.
1720
1721
Note the following mistake:
1722
1723
control.value = some_value
1724
assert control.value == some_value # not necessarily true
1725
1726
The reason for this is that the value attribute always gives the list items
1727
in the order they were listed in the HTML.
1728
1729
ListControl items can also be referred to by their labels instead of names.
1730
Use the label argument to .get(), and the .set_value_by_label(),
1731
.get_value_by_label() methods.
1732
1733
Note that, rather confusingly, though SELECT controls are represented in
1734
HTML by SELECT elements (which contain OPTION elements, representing
1735
individual list items), CHECKBOXes and RADIOs are not represented by *any*
1736
element. Instead, those controls are represented by a collection of INPUT
1737
elements. For example, this is a SELECT control, named "control1":
1738
1739
<select name="control1">
1740
<option>foo</option>
1741
<option value="1">bar</option>
1742
</select>
1743
1744
and this is a CHECKBOX control, named "control2":
1745
1746
<input type="checkbox" name="control2" value="foo" id="cbe1">
1747
<input type="checkbox" name="control2" value="bar" id="cbe2">
1748
1749
The id attribute of a CHECKBOX or RADIO ListControl is always that of its
1750
first element (for example, "cbe1" above).
1751
1752
1753
Additional read-only public attribute: multiple.
1754
1755
"""
1756
1757
# ListControls are built up by the parser from their component items by
1758
# creating one ListControl per item, consolidating them into a single
1759
# master ListControl held by the HTMLForm:
1760
1761
# -User calls form.new_control(...)
1762
# -Form creates Control, and calls control.add_to_form(self).
1763
# -Control looks for a Control with the same name and type in the form,
1764
# and if it finds one, merges itself with that control by calling
1765
# control.merge_control(self). The first Control added to the form, of
1766
# a particular name and type, is the only one that survives in the
1767
# form.
1768
# -Form calls control.fixup for all its controls. ListControls in the
1769
# form know they can now safely pick their default values.
1770
1771
# To create a ListControl without an HTMLForm, use:
1772
1773
# control.merge_control(new_control)
1774
1775
# (actually, it's much easier just to use ParseFile)
1776
1777
_label = None
1778
1779
def __init__(self, type, name, attrs={}, select_default=False,
1780
called_as_base_class=False, index=None):
1781
"""
1782
select_default: for RADIO and multiple-selection SELECT controls, pick
1783
the first item as the default if no 'selected' HTML attribute is
1784
present
1785
1786
"""
1787
if not called_as_base_class:
1788
raise NotImplementedError()
1789
1790
self.__dict__["type"] = type.lower()
1791
self.__dict__["name"] = name
1792
self._value = attrs.get("value")
1793
self.disabled = False
1794
self.readonly = False
1795
self.id = attrs.get("id")
1796
self._closed = False
1797
1798
# As Controls are merged in with .merge_control(), self.attrs will
1799
# refer to each Control in turn -- always the most recently merged
1800
# control. Each merged-in Control instance corresponds to a single
1801
# list item: see ListControl.__doc__.
1802
self.items = []
1803
self._form = None
1804
1805
self._select_default = select_default
1806
self._clicked = False
1807
1808
def clear(self):
1809
self.value = []
1810
1811
def is_of_kind(self, kind):
1812
if kind == "list":
1813
return True
1814
elif kind == "multilist":
1815
return bool(self.multiple)
1816
elif kind == "singlelist":
1817
return not self.multiple
1818
else:
1819
return False
1820
1821
def get_items(self, name=None, label=None, id=None,
1822
exclude_disabled=False):
1823
"""Return matching items by name or label.
1824
1825
For argument docs, see the docstring for .get()
1826
1827
"""
1828
if name is not None and not isstringlike(name):
1829
raise TypeError("item name must be string-like")
1830
if label is not None and not isstringlike(label):
1831
raise TypeError("item label must be string-like")
1832
if id is not None and not isstringlike(id):
1833
raise TypeError("item id must be string-like")
1834
items = [] # order is important
1835
compat = self._form.backwards_compat
1836
for o in self.items:
1837
if exclude_disabled and o.disabled:
1838
continue
1839
if name is not None and o.name != name:
1840
continue
1841
if label is not None:
1842
for l in o.get_labels():
1843
if ((compat and l.text == label) or
1844
(not compat and l.text.find(label) > -1)):
1845
break
1846
else:
1847
continue
1848
if id is not None and o.id != id:
1849
continue
1850
items.append(o)
1851
return items
1852
1853
def get(self, name=None, label=None, id=None, nr=None,
1854
exclude_disabled=False):
1855
"""Return item by name or label, disambiguating if necessary with nr.
1856
1857
All arguments must be passed by name, with the exception of 'name',
1858
which may be used as a positional argument.
1859
1860
If name is specified, then the item must have the indicated name.
1861
1862
If label is specified, then the item must have a label whose
1863
whitespace-compressed, stripped, text substring-matches the indicated
1864
label string (eg. label="please choose" will match
1865
" Do please choose an item ").
1866
1867
If id is specified, then the item must have the indicated id.
1868
1869
nr is an optional 0-based index of the items matching the query.
1870
1871
If nr is the default None value and more than item is found, raises
1872
AmbiguityError (unless the HTMLForm instance's backwards_compat
1873
attribute is true).
1874
1875
If no item is found, or if items are found but nr is specified and not
1876
found, raises ItemNotFoundError.
1877
1878
Optionally excludes disabled items.
1879
1880
"""
1881
if nr is None and self._form.backwards_compat:
1882
nr = 0 # :-/
1883
items = self.get_items(name, label, id, exclude_disabled)
1884
return disambiguate(items, nr, name=name, label=label, id=id)
1885
1886
def _get(self, name, by_label=False, nr=None, exclude_disabled=False):
1887
# strictly for use by deprecated methods
1888
if by_label:
1889
name, label = None, name
1890
else:
1891
name, label = name, None
1892
return self.get(name, label, nr, exclude_disabled)
1893
1894
def toggle(self, name, by_label=False, nr=None):
1895
"""Deprecated: given a name or label and optional disambiguating index
1896
nr, toggle the matching item's selection.
1897
1898
Selecting items follows the behavior described in the docstring of the
1899
'get' method.
1900
1901
if the item is disabled, or this control is disabled or readonly,
1902
raise AttributeError.
1903
1904
"""
1905
deprecation(
1906
"item = control.get(...); item.selected = not item.selected")
1907
o = self._get(name, by_label, nr)
1908
self._set_selected_state(o, not o.selected)
1909
1910
def set(self, selected, name, by_label=False, nr=None):
    """Deprecated: set the selection of the item matching name or label.

    selected is interpreted as a bool.  name is an item name, or label text
    when by_label is true; nr is an optional index used to disambiguate
    between several matches.  The item is located as described in the
    docstring of the 'get' method.

    AttributeError is raised (by _set_selected_state) if this control is
    disabled or readonly, or if the item is disabled and may not be changed.

    """
    deprecation(
        "control.get(...).selected = <boolean>")
    target = self._get(name, by_label, nr)
    self._set_selected_state(target, selected)
1924
1925
def _set_selected_state(self, item, action):
    """Switch item on or off, honouring disabled / readonly / single rules.

    item: the list item whose "_selected" entry is to be changed
    action: truthy to select the item, falsy to deselect it

    Raises AttributeError if the control is disabled or readonly, or if the
    item is disabled (in backwards-compat mode a disabled item may still be
    switched off, but not on).

    """
    if self.disabled:
        raise AttributeError("control '%s' is disabled" % self.name)
    if self.readonly:
        raise AttributeError("control '%s' is readonly" % self.name)
    turn_on = bool(action)
    compat = self._form.backwards_compat
    if item.disabled and (not compat or turn_on):
        raise AttributeError("item is disabled")
    if not self.multiple and turn_on:
        # single-selection control: selecting one item clears all the others
        for other in self.items:
            other.__dict__["_selected"] = False
    # written via __dict__, as the rest of this class does for "_selected"
    item.__dict__["_selected"] = turn_on
1949
1950
def toggle_single(self, by_label=None):
    """Deprecated: flip the selection of this control's only item.

    Raises ItemCountError unless the control holds exactly one item.

    The by_label argument is ignored; it is accepted only for backwards
    compatibility.

    """
    deprecation(
        "control.items[0].selected = not control.items[0].selected")
    if len(self.items) != 1:
        raise ItemCountError(
            "'%s' is not a single-item control" % self.name)
    sole_item = self.items[0]
    flipped = not sole_item.selected
    self._set_selected_state(sole_item, flipped)
1966
1967
def set_single(self, selected, by_label=None):
    """Deprecated: set the selection of this control's only item.

    Raises ItemCountError unless the control holds exactly one item.

    The by_label argument is ignored; it is accepted only for backwards
    compatibility.

    """
    deprecation(
        "control.items[0].selected = <boolean>")
    if len(self.items) != 1:
        raise ItemCountError(
            "'%s' is not a single-item control" % self.name)
    sole_item = self.items[0]
    self._set_selected_state(sole_item, selected)
1982
1983
def get_item_disabled(self, name, by_label=False, nr=None):
    """Deprecated: return the disabled state of a named list item."""
    deprecation(
        "control.get(...).disabled")
    item = self._get(name, by_label, nr)
    return item.disabled
1988
1989
def set_item_disabled(self, disabled, name, by_label=False, nr=None):
    """Deprecated: set the disabled state of a named list item.

    disabled: boolean disabled state to assign to the matching item

    """
    deprecation(
        "control.get(...).disabled = <boolean>")
    item = self._get(name, by_label, nr)
    item.disabled = disabled
1998
1999
def set_all_items_disabled(self, disabled):
    """Assign the same disabled state to every item of this control.

    disabled: boolean disabled state

    """
    for item in self.items:
        setattr(item, "disabled", disabled)
2007
2008
def get_item_attrs(self, name, by_label=False, nr=None):
    """Deprecated: return the HTML attributes dictionary of one list item.

    List items are described by OPTION elements for SELECT controls and by
    INPUT elements for the other list controls.  Those elements carry HTML
    attributes that are occasionally of interest -- for example "alt",
    which gives a text string describing the item (graphical browsers
    usually display this as a tooltip).

    The returned dictionary maps HTML attribute names to values, both taken
    from the original HTML.

    """
    deprecation(
        "control.get(...).attrs")
    item = self._get(name, by_label, nr)
    return item.attrs
2024
2025
def close_control(self):
    """Mark this control as closed.

    add_to_form() consults the _closed flag: a later control with the same
    name and type is merged into this one only while the flag is false.

    """
    self._closed = True
2027
2028
def add_to_form(self, form):
    """Attach this control to form, merging into a matching open control.

    A nameless control is always added as a separate control.  Otherwise
    the form's controls are searched from the most recently added one
    backwards for a control with the same name and type: if that control
    is still open, this control's items are merged into it; if it has been
    closed, or no such control exists, this control is added separately.

    """
    assert self._form is None or form == self._form, (
        "can't add control to more than one form")
    self._form = form
    if self.name is not None:
        for existing in reversed(form.controls):
            same_control = (existing.name == self.name and
                            existing.type == self.type)
            if not same_control:
                continue
            if not existing._closed:
                existing.merge_control(self)
                return
            # nearest match is closed: fall through and add separately
            break
    # nameless, no match, or closed match: count as a separate control
    Control.add_to_form(self, form)
2046
2047
def merge_control(self, control):
    """Append the items of control to this control's own items.

    Both controls must agree on whether they are multiple-selection.

    """
    ours, theirs = bool(self.multiple), bool(control.multiple)
    assert theirs == ours
    # usually, isinstance(control, self.__class__)
    self.items.extend(control.items)
2051
2052
def fixup(self):
    """
    ListControls are built up from component list items (which are also
    ListControls) during parsing.  This method should be called after all
    items have been added.  See ListControl.__doc__ for the reason this is
    required.

    """
    # Subclasses use this hook to pick a default selection where the HTML
    # did not mark any item as selected.  Background:
    #
    # CHECKBOX:
    #  Nothing should be selected.
    # SELECT/single, SELECT/multiple and RADIO:
    #  RFC 1866 (HTML 2.0): says first item should be selected.
    #  W3C HTML 4.01 Specification: says that client behaviour is
    #   undefined in this case.  For RADIO, exactly one must be selected,
    #   though which one is undefined.
    #  Both Netscape and Microsoft Internet Explorer (IE) choose first
    #   item for SELECT/single.  However, both IE5 and Mozilla (both 1.0
    #   and Firebird 0.6) leave all items unselected for RADIO and
    #   SELECT/multiple.
    #
    # Since both Netscape and IE choose the first item for SELECT/single,
    # we do the same.  OTOH, both Netscape and IE leave SELECT/multiple
    # with nothing selected, in violation of RFC 1866 (but not in violation
    # of the W3C HTML 4 standard); the same is true of RADIO (which *is* in
    # violation of the HTML 4 standard).  We follow RFC 1866 if the
    # _select_default attribute is set, and Netscape and IE otherwise.
    # RFC 1866 and HTML 4 are always violated insofar as you can deselect
    # all items in a RadioControl.

    # point every item at this control, now that merging has happened
    for item in self.items:
        vars(item)["_control"] = self
2087
2088
def __getattr__(self, name):
    """Compute the 'value' attribute; every other missing name is an error.

    The value is the list of names of the selected items.  Disabled items
    are left out unless the form is in backwards-compat mode.  A nameless
    control always has the value [].

    """
    if name != "value":
        raise AttributeError("%s instance has no attribute '%s'" %
                             (self.__class__.__name__, name))
    compat = self._form.backwards_compat
    if self.name is None:
        return []
    selected_names = []
    for item in self.items:
        if item.selected and (not item.disabled or compat):
            selected_names.append(item.name)
    return selected_names
2098
2099
def __setattr__(self, name, value):
    """Route 'value' through _set_value and protect read-only attributes.

    Assigning 'value' raises AttributeError when the control is disabled or
    readonly.  'name', 'type' and 'multiple' can never be assigned.  Any
    other attribute is stored directly in the instance dictionary.

    """
    if name in ("name", "type", "multiple"):
        raise AttributeError("%s attribute is readonly" % name)
    if name != "value":
        self.__dict__[name] = value
        return
    if self.disabled:
        raise AttributeError("control '%s' is disabled" % self.name)
    if self.readonly:
        raise AttributeError("control '%s' is readonly" % self.name)
    self._set_value(value)
2110
2111
def _set_value(self, value):
    """Select exactly the items named in the sequence value.

    An empty sequence deselects every item (disabled items are only touched
    in backwards-compat mode).  Otherwise the work is delegated to
    _multiple_set_value or _single_set_value.

    Raises TypeError if value is None or string-like, and ItemCountError if
    more than one name is given for a single-selection control.

    """
    if value is None or isstringlike(value):
        raise TypeError("ListControl, must set a sequence")
    if not value:
        compat = self._form.backwards_compat
        for item in self.items:
            if compat or not item.disabled:
                item.selected = False
        return
    if self.multiple:
        self._multiple_set_value(value)
        return
    if len(value) > 1:
        raise ItemCountError(
            "single selection list, must set sequence of "
            "length 0 or 1")
    self._single_set_value(value)
2127
2128
def _get_items(self, name, target=1):
    """Split the enabled items called name into selected and unselected.

    name: item name passed to get_items()
    target: minimum number of enabled items that must exist

    Returns a pair of lists (on, off) of the non-disabled matching items.

    Raises ItemNotFoundError if fewer than target items match at all, and
    AttributeError if enough items match but too few are non-disabled.

    """
    candidates = self.get_items(name)
    enabled = [item for item in candidates if not item.disabled]
    if len(enabled) < target:
        if len(candidates) >= target:
            raise AttributeError(
                "insufficient non-disabled items with name %s" % name)
        raise ItemNotFoundError(
            "insufficient items with name %r" % name)
    selected, unselected = [], []
    for item in enabled:
        bucket = selected if item.selected else unselected
        bucket.append(item)
    return selected, unselected
2146
2147
def _single_set_value(self, value):
    """Select the item named by the one-element sequence value.

    Nothing is changed if a matching enabled item is already selected;
    otherwise the first unselected matching item is switched on.

    """
    assert len(value) == 1
    selected, unselected = self._get_items(value[0])
    assert len(selected) <= 1
    if selected:
        return
    unselected[0].selected = True
2153
2154
def _multiple_set_value(self, value):
    """Make the selection match the item names listed in value.

    A name listed k times requires k enabled items of that name.  All the
    changes are worked out first and only applied once every name has been
    resolved, so a failing lookup leaves the selection untouched.

    """
    compat = self._form.backwards_compat
    turn_on = []  # transactional-ish
    turn_off = [item for item in self.items if
                item.selected and (not item.disabled or compat)]
    # how many items of each name were asked for
    requested = {}
    for item_name in value:
        requested[item_name] = requested.get(item_name, 0) + 1
    for item_name, wanted in requested.items():
        already_on, still_off = self._get_items(item_name, wanted)
        for _ in range(wanted):
            if already_on:
                # keep an item that is on already: take it off the kill list
                keeper = already_on.pop(0)
                turn_off.pop(turn_off.index(keeper))
            else:
                turn_on.append(still_off.pop(0))
    for item in turn_off:
        item.selected = False
    for item in turn_on:
        item.selected = True
2180
2181
def set_value_by_label(self, value):
    """Set the value of control by item labels.

    value is expected to be an iterable of strings that are substrings of
    the item labels that should be selected.  Before substring matching is
    performed, the original label text is whitespace-compressed
    (consecutive whitespace characters are converted to a single space
    character) and leading and trailing whitespace is stripped.  Ambiguous
    labels are accepted without complaint if the form's backwards_compat is
    True; otherwise, it will not complain as long as all ambiguous labels
    share the same item name (e.g. OPTION value).

    """
    if isstringlike(value):
        raise TypeError(value)
    if not self.multiple and len(value) > 1:
        raise ItemCountError(
            "single selection list, must set sequence of "
            "length 0 or 1")
    chosen = []
    for label_text in value:
        matches = self.get_items(label=label_text)
        if len(matches) > 1:
            if self._form.backwards_compat:
                # OK, we'll guess :-(  Assume first available item.
                matches = matches[:1]
            else:
                # ambiguous labels are fine as long as item names (e.g.
                # OPTION values) are same
                first_name = matches[0].name
                if any([m.name != first_name for m in matches[1:]]):
                    raise AmbiguityError(label_text)
        for candidate in matches:
            # For the multiple-item case, we could try to be smarter,
            # saving them up and trying to resolve, but that's too much.
            if self._form.backwards_compat or candidate not in chosen:
                chosen.append(candidate)
                break
        else:  # all of them are used
            raise ItemNotFoundError(label_text)
    # every item that should be on is known now: switch everything off,
    # then switch the chosen items back on
    self.value = []
    for item in chosen:
        item.selected = True
2226
2227
def get_value_by_label(self):
    """Return the value of the control as given by normalized labels.

    For each selected item (disabled items only count in backwards-compat
    mode) the result holds the text of its first label with non-empty
    text, or None if it has no such label.

    """
    compat = self._form.backwards_compat
    texts = []
    for item in self.items:
        if item.disabled and not compat:
            continue
        if not item.selected:
            continue
        first_text = next(
            (lbl.text for lbl in item.get_labels() if lbl.text), None)
        texts.append(first_text)
    return texts
2240
2241
def possible_items(self, by_label=False):
    """Deprecated: return the names or labels of all possible items.

    With by_label true, each item contributes the text of its first label
    with non-empty text, or None if it has none; otherwise item names are
    returned.

    Includes disabled items, which may be misleading for some use cases.

    """
    deprecation(
        "[item.name for item in self.items]")
    if not by_label:
        return [item.name for item in self.items]
    label_texts = []
    for item in self.items:
        first_text = next(
            (lbl.text for lbl in item.get_labels() if lbl.text), None)
        label_texts.append(first_text)
    return label_texts
2260
2261
def _totally_ordered_pairs(self):
    """Return (index, control name, item name) for each successful item.

    A disabled or nameless control contributes nothing; otherwise one
    triple is produced per item that is selected and not disabled.

    """
    if self.disabled or self.name is None:
        return []
    triples = []
    for item in self.items:
        if item.selected and not item.disabled:
            triples.append((item._index, self.name, item.name))
    return triples
2267
2268
def __str__(self):
    """Render as <ClassName(name=[item, ...]) (flags)> for display."""
    shown_name = self.name
    if shown_name is None:
        shown_name = "<None>"

    rendered_items = ", ".join([str(item) for item in self.items])

    flags = []
    if self.disabled:
        flags.append("disabled")
    if self.readonly:
        flags.append("readonly")
    suffix = ", ".join(flags)
    if suffix:
        suffix = " (%s)" % suffix

    return "<%s(%s=[%s])%s>" % (self.__class__.__name__,
                                shown_name, rendered_items, suffix)
2282
2283
2284
class RadioControl(ListControl):
    """
    Covers:

    INPUT/RADIO

    """
    def __init__(self, type, name, attrs, select_default=False, index=None):
        # a radio button without a value attribute defaults to "on"
        attrs.setdefault("value", "on")
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True, index=index)
        # radio groups are single-selection
        self.__dict__["multiple"] = False
        item = Item(self, attrs, index)
        item.__dict__["_selected"] = "checked" in attrs

    def fixup(self):
        """Settle the default selection once all items have been merged."""
        ListControl.fixup(self)
        chosen = [item for item in self.items
                  if item.selected and not item.disabled]
        if chosen:
            # Ensure only one item selected.  Choose the last one,
            # following IE and Firefox.
            for extra in chosen[:-1]:
                extra.selected = False
            return
        if not self._select_default:
            return
        # nothing selected by the HTML: pick the first enabled item
        for item in self.items:
            if not item.disabled:
                item.selected = True
                break

    def get_labels(self):
        """Return an empty list: no labels are reported for the control."""
        return []
2316
2317
class CheckboxControl(ListControl):
    """
    Covers:

    INPUT/CHECKBOX

    """
    def __init__(self, type, name, attrs, select_default=False, index=None):
        # a checkbox without a value attribute defaults to "on"
        attrs.setdefault("value", "on")
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True, index=index)
        # any number of checkboxes sharing a name may be ticked
        self.__dict__["multiple"] = True
        item = Item(self, attrs, index)
        item.__dict__["_selected"] = "checked" in attrs

    def get_labels(self):
        """Return an empty list: no labels are reported for the control."""
        return []
2334
2335
2336
class SelectControl(ListControl):
    """
    Covers:

    SELECT (and OPTION)


    OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.

    SELECT control values and labels are subject to some messy defaulting
    rules. For example, if the HTML representation of the control is:

    <SELECT name=year>
      <OPTION value=0 label="2002">current year</OPTION>
      <OPTION value=1>2001</OPTION>
      <OPTION>2000</OPTION>
    </SELECT>

    The items, in order, have labels "2002", "2001" and "2000", whereas their
    names (the OPTION values) are "0", "1" and "2000" respectively. Note that
    the value of the last OPTION in this example defaults to its contents, as
    specified by RFC 1866, as do the labels of the second and third OPTIONs.

    The OPTION labels are sometimes more meaningful than the OPTION values,
    which can make for more maintainable code.

    Additional read-only public attribute: attrs

    The attrs attribute is a dictionary of the original HTML attributes of the
    SELECT element. Other ListControls do not have this attribute, because in
    other cases the control as a whole does not correspond to any single HTML
    element. control.get(...).attrs may be used as usual to get at the HTML
    attributes of the HTML elements corresponding to individual list items (for
    SELECT controls, these are OPTION elements).

    Another special case is that the Item.attrs dictionaries have a special key
    "contents" which does not correspond to any real HTML attribute, but rather
    contains the contents of the OPTION element:

    <OPTION>this bit</OPTION>

    """
    # HTML attributes here are treated slightly differently from other list
    # controls:
    # -The SELECT HTML attributes dictionary is stuffed into the OPTION
    #  HTML attributes dictionary under the "__select" key.
    # -The content of each OPTION element is stored under the special
    #  "contents" key of the dictionary.
    # After all this, the dictionary is passed to the SelectControl constructor
    # as the attrs argument, as usual. However:
    # -The first SelectControl constructed when building up a SELECT control
    #  has a constructor attrs argument containing only the __select key -- so
    #  this SelectControl represents an empty SELECT control.
    # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and
    #  the __select dictionary containing the SELECT HTML-attributes.

    def __init__(self, type, name, attrs, select_default=False, index=None):
        # fish out the SELECT HTML attributes from the OPTION HTML attributes
        # dictionary
        self.attrs = attrs["__select"].copy()
        self.__dict__["_label"] = _get_label(self.attrs)
        self.__dict__["id"] = self.attrs.get("id")
        self.__dict__["multiple"] = "multiple" in self.attrs
        # the majority of the contents, label, and value dance already happened
        contents = attrs.get("contents")
        # work on a copy so the caller's dictionary keeps its "__select" key
        attrs = attrs.copy()
        del attrs["__select"]

        ListControl.__init__(self, type, name, self.attrs, select_default,
                             called_as_base_class=True, index=index)
        self.disabled = "disabled" in self.attrs
        self.readonly = "readonly" in self.attrs
        if "value" in attrs:
            # otherwise it is a marker 'select started' token
            o = Item(self, attrs, index)
            o.__dict__["_selected"] = "selected" in attrs
            # add 'label' label and contents label, if different. If both are
            # provided, the 'label' label is used for display in HTML
            # 4.0-compliant browsers (and any lower spec? not sure) while the
            # contents are used for display in older or less-compliant
            # browsers. We make label objects for both, if the values are
            # different.
            label = attrs.get("label")
            if label:
                o._labels.append(Label({"__text": label}))
                if contents and contents != label:
                    o._labels.append(Label({"__text": contents}))
            elif contents:
                o._labels.append(Label({"__text": contents}))

    def fixup(self):
        """Settle the default selection once all items have been merged.

        If the HTML selected nothing, the first enabled item is selected for
        single-selection controls (and for multiple-selection ones when
        _select_default is set).  If a single-selection control has several
        selected items, only the last one stays selected.

        """
        ListControl.fixup(self)
        # Firefox doesn't exclude disabled items from those considered here
        # (i.e. from 'found', for both branches of the if below). Note that
        # IE6 doesn't support the disabled attribute on OPTIONs at all.
        found = [o for o in self.items if o.selected]
        if not found:
            if not self.multiple or self._select_default:
                for o in self.items:
                    if not o.disabled:
                        # Temporarily enable the control while the default
                        # item is selected (presumably because selection is
                        # refused on a disabled control), then put the
                        # control's own flag back.  The saved state belongs
                        # to the control, so it must be restored on self,
                        # never written onto the item.
                        was_disabled = self.disabled
                        self.disabled = False
                        try:
                            o.selected = True
                        finally:
                            self.disabled = was_disabled
                        break
        elif not self.multiple:
            # Ensure only one item selected. Choose the last one,
            # following IE and Firefox.
            for o in found[:-1]:
                o.selected = False
2448
2449
2450
#---------------------------------------------------
2451
class SubmitControl(ScalarControl):
    """
    Covers:

    INPUT/SUBMIT
    BUTTON/SUBMIT

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it
        # blank, Konqueror 3.1 defaults to "Submit". HTML spec. doesn't seem
        # to define this.
        if self.value is None and not self.disabled and not self.readonly:
            self.value = ""
        self.readonly = True

    def get_labels(self):
        """Return the button's value (if any) as a label, then the labels
        reported by ScalarControl.get_labels."""
        res = []
        if self.value:
            res.append(Label({"__text": self.value}))
        res.extend(ScalarControl.get_labels(self))
        return res

    def is_of_kind(self, kind):
        """Return true only for the "clickable" kind."""
        return kind == "clickable"

    def _click(self, form, coord, return_type,
               request_class=_urllib.request.Request):
        """Build the form's submission as if this control had been clicked.

        form: the form to submit; its _switch_click method does the work
        coord: click coordinates, remembered in _clicked for the duration
        return_type, request_class: passed through to form._switch_click

        Returns whatever form._switch_click returns.

        """
        self._clicked = coord
        try:
            return form._switch_click(return_type, request_class)
        finally:
            # always un-click, even if building the submission failed;
            # otherwise this control would still count as clicked (see
            # _totally_ordered_pairs) the next time the form is submitted
            self._clicked = False

    def _totally_ordered_pairs(self):
        """Contribute pairs only while this control is being clicked."""
        if not self._clicked:
            return []
        return ScalarControl._totally_ordered_pairs(self)
2486
2487
2488
#---------------------------------------------------
2489
class ImageControl(SubmitControl):
    """
    Covers:

    INPUT/IMAGE

    Coordinates are specified using one of the HTMLForm.click* methods.

    """
    def __init__(self, type, name, attrs, index=None):
        SubmitControl.__init__(self, type, name, attrs, index)
        # SubmitControl.__init__ leaves the control readonly; undo that here
        self.readonly = False

    def _totally_ordered_pairs(self):
        """Return the name.x / name.y (and value) triples for a click.

        Nothing is returned if the control is disabled, has not been
        clicked, or has no name.

        """
        coord = self._clicked
        if self.disabled or not coord:
            return []
        name = self.name
        if name is None:
            return []
        x_pair = (self._index, "%s.x" % name, str(coord[0]))
        y_pair = (self._index+1, "%s.y" % name, str(coord[1]))
        pairs = [x_pair, y_pair]
        value = self._value
        if value:
            pairs.append((self._index+2, name, value))
        return pairs

    # use ScalarControl's labels directly, bypassing SubmitControl.get_labels
    get_labels = ScalarControl.get_labels
2518
2519
# aliases, just to make str(control) and str(form) clearer
class PasswordControl(TextControl):
    """TextControl alias used for the "password" control type."""


class HiddenControl(TextControl):
    """TextControl alias used for the "hidden" control type."""


class TextareaControl(TextControl):
    """TextControl alias used for the "textarea" control type."""


class SubmitButtonControl(SubmitControl):
    """SubmitControl alias used for the "submitbutton" control type."""
2524
2525
2526
def is_listcontrol(control):
    """Return the result of asking control whether it is of kind "list"."""
    return control.is_of_kind("list")
2527
2528
2529
class HTMLForm:
2530
"""Represents a single HTML <form> ... </form> element.
2531
2532
A form consists of a sequence of controls that usually have names, and
2533
which can take on various values. The values of the various types of
2534
controls represent variously: text, zero-or-one-of-many or many-of-many
2535
choices, and files to be uploaded. Some controls can be clicked on to
2536
submit the form, and clickable controls' values sometimes include the
2537
coordinates of the click.
2538
2539
Forms can be filled in with data to be returned to the server, and then
2540
submitted, using the click method to generate a request object suitable for
2541
passing to urllib2.urlopen (or the click_request_data or click_pairs
2542
methods if you're not using urllib2).
2543
2544
import ClientForm
2545
forms = ClientForm.ParseFile(html, base_uri)
2546
form = forms[0]
2547
2548
form["query"] = "Python"
2549
form.find_control("nr_results").get("lots").selected = True
2550
2551
response = urllib2.urlopen(form.click())
2552
2553
Usually, HTMLForm instances are not created directly. Instead, the
2554
ParseFile or ParseResponse factory functions are used. If you do construct
2555
HTMLForm objects yourself, however, note that an HTMLForm instance is only
2556
properly initialised after the fixup method has been called (ParseFile and
2557
ParseResponse do this for you). See ListControl.__doc__ for the reason
2558
this is required.
2559
2560
Indexing a form (form["control_name"]) returns the named Control's value
2561
attribute. Assignment to a form index (form["control_name"] = something)
2562
is equivalent to assignment to the named Control's value attribute. If you
2563
need to be more specific than just supplying the control's name, use the
2564
set_value and get_value methods.
2565
2566
ListControl values are lists of item names (specifically, the names of the
2567
items that are selected and not disabled, and hence are "successful" -- ie.
2568
cause data to be returned to the server). The list item's name is the
2569
value of the corresponding HTML element's "value" attribute.
2570
2571
Example:
2572
2573
<INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>
2574
<INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>
2575
2576
defines a CHECKBOX control with name "cheeses" which has two items, named
2577
"leicester" and "cheddar".
2578
2579
Another example:
2580
2581
<SELECT name="more_cheeses">
2582
<OPTION>1</OPTION>
2583
<OPTION value="2" label="CHEDDAR">cheddar</OPTION>
2584
</SELECT>
2585
2586
defines a SELECT control with name "more_cheeses" which has two items,
2587
named "1" and "2" (because the OPTION element's value HTML attribute
2588
defaults to the element contents -- see SelectControl.__doc__ for more on
2589
these defaulting rules).
2590
2591
To select, deselect or otherwise manipulate individual list items, use the
2592
HTMLForm.find_control() and ListControl.get() methods. To set the whole
2593
value, do as for any other control: use indexing or the set_/get_value
2594
methods.
2595
2596
Example:
2597
2598
# select *only* the item named "cheddar"
2599
form["cheeses"] = ["cheddar"]
2600
# select "cheddar", leave other items unaffected
2601
form.find_control("cheeses").get("cheddar").selected = True
2602
2603
Some controls (RADIO and SELECT without the multiple attribute) can only
2604
have zero or one items selected at a time. Some controls (CHECKBOX and
2605
SELECT with the multiple attribute) can have multiple items selected at a
2606
time. To set the whole value of a ListControl, assign a sequence to a form
2607
index:
2608
2609
form["cheeses"] = ["cheddar", "leicester"]
2610
2611
If the ListControl is not multiple-selection, the assigned list must be of
2612
length zero or one.
2613
2614
To check if a control has an item, if an item is selected, or if an item is
2615
successful (selected and not disabled), respectively:
2616
2617
"cheddar" in [item.name for item in form.find_control("cheeses").items]
2618
"cheddar" in [item.name for item in form.find_control("cheeses").items if
2619
item.selected]
2620
"cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses"))
2621
2622
Note that some list items may be disabled (see below).
2623
2624
Note the following mistake:
2625
2626
form[control_name] = control_value
2627
assert form[control_name] == control_value # not necessarily true
2628
2629
The reason for this is that form[control_name] always gives the list items
2630
in the order they were listed in the HTML.
2631
2632
List items (hence list values, too) can be referred to in terms of list
2633
item labels rather than list item names using the appropriate label
2634
arguments. Note that each item may have several labels.
2635
2636
The question of default values of OPTION contents, labels and values is
2637
somewhat complicated: see SelectControl.__doc__ and
2638
ListControl.get_item_attrs.__doc__ if you think you need to know.
2639
2640
Controls can be disabled or readonly. In either case, the control's value
2641
cannot be changed until you clear those flags (see example below).
2642
Disabled is the state typically represented by browsers by 'greying out' a
2643
control. Disabled controls are not 'successful' -- they don't cause data
2644
to get returned to the server. Readonly controls usually appear in
2645
browsers as read-only text boxes. Readonly controls are successful. List
2646
items can also be disabled. Attempts to select or deselect disabled items
2647
fail with AttributeError.
2648
2649
If a lot of controls are readonly, it can be useful to do this:
2650
2651
form.set_all_readonly(False)
2652
2653
To clear a control's value attribute, so that it is not successful (until a
2654
value is subsequently set):
2655
2656
form.clear("cheeses")
2657
2658
More examples:
2659
2660
control = form.find_control("cheeses")
2661
control.disabled = False
2662
control.readonly = False
2663
control.get("gruyere").disabled = True
2664
control.items[0].selected = True
2665
2666
See the various Control classes for further documentation. Many methods
2667
take name, type, kind, id, label and nr arguments to specify the control to
2668
be operated on: see HTMLForm.find_control.__doc__.
2669
2670
ControlNotFoundError (subclass of ValueError) is raised if the specified
2671
control can't be found. This includes occasions where a non-ListControl
2672
is found, but the method (set, for example) requires a ListControl.
2673
ItemNotFoundError (subclass of ValueError) is raised if a list item can't
2674
be found. ItemCountError (subclass of ValueError) is raised if an attempt
2675
is made to select more than one item and the control doesn't allow that, or
2676
set/get_single are called and the control contains more than one item.
2677
AttributeError is raised if a control or item is readonly or disabled and
2678
an attempt is made to alter its value.
2679
2680
Security note: Remember that any passwords you store in HTMLForm instances
2681
will be saved to disk in the clear if you pickle them (directly or
2682
indirectly). The simplest solution to this is to avoid pickling HTMLForm
2683
objects. You could also pickle before filling in any password, or just set
2684
the password to "" before pickling.
2685
2686
2687
Public attributes:
2688
2689
action: full (absolute URI) form action
2690
method: "GET" or "POST"
2691
enctype: form transfer encoding MIME type
2692
name: name of form (None if no name was specified)
2693
attrs: dictionary mapping original HTML form attributes to their values
2694
2695
controls: list of Control instances; do not alter this list
2696
(instead, call form.new_control to make a Control and add it to the
2697
form, or control.add_to_form if you already have a Control instance)
2698
2699
2700
2701
Methods for form filling:
2702
-------------------------
2703
2704
Most of these methods have very similar arguments. See
2705
HTMLForm.find_control.__doc__ for details of the name, type, kind, label
2706
and nr arguments.
2707
2708
def find_control(self,
2709
name=None, type=None, kind=None, id=None, predicate=None,
2710
nr=None, label=None)
2711
2712
get_value(name=None, type=None, kind=None, id=None, nr=None,
2713
by_label=False, # by_label is deprecated
2714
label=None)
2715
set_value(value,
2716
name=None, type=None, kind=None, id=None, nr=None,
2717
by_label=False, # by_label is deprecated
2718
label=None)
2719
2720
clear_all()
2721
clear(name=None, type=None, kind=None, id=None, nr=None, label=None)
2722
2723
set_all_readonly(readonly)
2724
2725
2726
Method applying only to FileControls:
2727
2728
add_file(file_object,
2729
content_type="application/octet-stream", filename=None,
2730
name=None, id=None, nr=None, label=None)
2731
2732
2733
Methods applying only to clickable controls:
2734
2735
click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
2736
click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1),
2737
label=None)
2738
click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
2739
2740
"""
2741
2742
# Maps a lower-cased control type name to the Control class that
# new_control() instantiates for it.
type2class = {
    # text-like controls
    "text": TextControl,
    "password": PasswordControl,
    "hidden": HiddenControl,
    "textarea": TextareaControl,

    "isindex": IsindexControl,

    "file": FileControl,

    # buttons that do not submit the form
    "button": IgnoreControl,
    "buttonbutton": IgnoreControl,
    "reset": IgnoreControl,
    "resetbutton": IgnoreControl,

    # clickable controls
    "submit": SubmitControl,
    "submitbutton": SubmitButtonControl,
    "image": ImageControl,

    # list controls
    "radio": RadioControl,
    "checkbox": CheckboxControl,
    "select": SelectControl,
    }
2765
2766
#---------------------------------------------------
2767
# Initialisation. Use ParseResponse / ParseFile instead.
2768
2769
def __init__(self, action, method="GET",
             enctype=None,
             name=None, attrs=None,
             request_class=_urllib.request.Request,
             forms=None, labels=None, id_to_labels=None,
             backwards_compat=True):
    """
    In the usual case, use ParseResponse (or ParseFile) to create new
    HTMLForm objects.

    action: full (absolute URI) form action
    method: "GET" or "POST"
    enctype: form transfer encoding MIME type; a false value selects
     "application/x-www-form-urlencoded"
    name: name of form
    attrs: dictionary mapping original HTML form attributes to their values
     (a copy is stored)

    """
    self.action = action
    self.method = method
    if not enctype:
        enctype = "application/x-www-form-urlencoded"
    self.enctype = enctype
    self.name = name
    self.attrs = {} if attrs is None else attrs.copy()
    # must exist before backwards_compat is assigned: __setattr__ walks it
    self.controls = []
    self._request_class = request_class

    # these attributes are used by zope.testbrowser
    self._forms = forms  # this is a semi-public API!
    self._labels = labels  # this is a semi-public API!
    self._id_to_labels = id_to_labels  # this is a semi-public API!

    self.backwards_compat = backwards_compat  # note __setattr__

    self._urlunparse = _urllib.parse.urlunparse
    self._urlparse = _urllib.parse.urlparse
2806
2807
def __getattr__(self, name):
    """Fallback attribute lookup.

    "backwards_compat" is served from the private _backwards_compat
    attribute; any other name is looked up on the HTMLForm class.

    """
    if name != "backwards_compat":
        return getattr(HTMLForm, name)
    return self._backwards_compat
2811
2812
def __setattr__(self, name, value):
2813
# yuck
2814
if name == "backwards_compat":
2815
name = "_backwards_compat"
2816
value = bool(value)
2817
for cc in self.controls:
2818
try:
2819
items = cc.items
2820
except AttributeError:
2821
continue
2822
else:
2823
for ii in items:
2824
for ll in ii.get_labels():
2825
ll._backwards_compat = value
2826
self.__dict__[name] = value
2827
2828
def new_control(self, type, name, attrs,
                ignore_unknown=False, select_default=False, index=None):
    """Adds a new control to the form.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    Note that controls representing lists of items are built up from
    controls holding only a single list item.  See ListControl.__doc__ for
    further information.

    type: type of control (see Control.__doc__ for a list); compared
     case-insensitively
    name: name of control
    attrs: HTML attributes of control (a copy is given to the control)
    ignore_unknown: if true, use a dummy Control instance for controls of
     unknown type; otherwise, use a TextControl
    select_default: for RADIO and multiple-selection SELECT controls, pick
     the first item as the default if no 'selected' HTML attribute is
     present (this defaulting happens when the HTMLForm.fixup method is
     called)
    index: index of corresponding element in HTML (see
     MoreFormTests.test_interspersed_controls for motivation)

    """
    type = type.lower()
    klass = self.type2class.get(type)
    if klass is None:
        klass = IgnoreControl if ignore_unknown else TextControl

    attrs_copy = attrs.copy()
    # only list controls take the select_default argument
    if issubclass(klass, ListControl):
        control = klass(type, name, attrs_copy, select_default, index)
    else:
        control = klass(type, name, attrs_copy, index)

    if type == "select" and len(attrs) == 1:
        # close the most recently added select control, if there is one
        for earlier in reversed(self.controls):
            if earlier.type == "select":
                earlier.close_control()
                break

    control.add_to_form(self)
    control._urlparse = self._urlparse
    control._urlunparse = self._urlunparse
2875
2876
def fixup(self):
    """Normalise form after all controls have been added.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    This method should only be called once, after all controls have been
    added to the form.

    """
    for ctl in self.controls:
        ctl.fixup()
    # re-assign so that __setattr__ pushes the flag down to the labels
    current_flag = self._backwards_compat
    self.backwards_compat = current_flag
2889
2890
#---------------------------------------------------
2891
def __str__(self):
2892
header = "%s%s %s %s" % (
2893
(self.name and self.name+" " or ""),
2894
self.method, self.action, self.enctype)
2895
rep = [header]
2896
for control in self.controls:
2897
rep.append(" %s" % str(control))
2898
return "<%s>" % "\n".join(rep)
2899
2900
#---------------------------------------------------
2901
# Form-filling methods.
2902
2903
def __getitem__(self, name):
2904
return self.find_control(name).value
2905
def __contains__(self, name):
2906
return bool(self.find_control(name))
2907
def __setitem__(self, name, value):
2908
control = self.find_control(name)
2909
try:
2910
control.value = value
2911
except AttributeError as e:
2912
raise ValueError(str(e))
2913
2914
def get_value(self,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False,  # by_label is deprecated
              label=None):
    """Return value of control.

    The control is located with find_control.  If only the name argument
    is supplied, equivalent to

    form[name]

    With the deprecated by_label flag set, the control's
    get_value_by_label method is used instead; NotImplementedError is
    raised if the control has no such method.

    """
    if by_label:
        deprecation("form.get_value_by_label(...)")
    control = self.find_control(name, type, kind, id, label=label, nr=nr)
    if not by_label:
        return control.value
    try:
        getter = control.get_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    return getter()
2938
def set_value(self, value,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False,  # by_label is deprecated
              label=None):
    """Set value of control.

    The control is located with find_control.  If only name and value
    arguments are supplied, equivalent to

    form[name] = value

    With the deprecated by_label flag set, the control's
    set_value_by_label method is used instead; NotImplementedError is
    raised if the control has no such method.

    """
    if by_label:
        # point callers at the setter replacement (the message used to
        # name get_value_by_label, copied from get_value)
        deprecation("form.set_value_by_label(...)")
    control = self.find_control(name, type, kind, id, label=label, nr=nr)
    if not by_label:
        control.value = value
        return
    try:
        setter = control.set_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    setter(value)
2962
def get_value_by_label(
        self, name=None, type=None, kind=None, id=None, label=None, nr=None):
    """Return the result of get_value_by_label() on the located control.

    The control is located with find_control.
    All arguments should be passed by name.

    """
    return self.find_control(
        name, type, kind, id, label=label, nr=nr).get_value_by_label()
2971
2972
def set_value_by_label(
        self, value,
        name=None, type=None, kind=None, id=None, label=None, nr=None):
    """Call set_value_by_label(value) on the located control.

    The control is located with find_control.
    All arguments should be passed by name.

    """
    target = self.find_control(name, type, kind, id, label=label, nr=nr)
    target.set_value_by_label(value)
2982
2983
def set_all_readonly(self, readonly):
    """Set the readonly attribute of every control to bool(readonly)."""
    for ctl in self.controls:
        setattr(ctl, "readonly", bool(readonly))
2986
2987
def clear_all(self):
    """Clear the value attributes of all controls in the form.

    Calls clear() on each control in turn.  See HTMLForm.clear.__doc__.

    """
    for ctl in self.controls:
        ctl.clear()
2995
2996
def clear(self,
          name=None, type=None, kind=None, id=None, nr=None, label=None):
    """Clear the value attribute of a control.

    The control is located with find_control and its clear() method is
    called.

    As a result, the affected control will not be successful until a value
    is subsequently set.  AttributeError is raised on readonly controls.

    """
    target = self.find_control(name, type, kind, id, label=label, nr=nr)
    target.clear()
3006
3007
3008
#---------------------------------------------------
3009
# Form-filling methods applying only to ListControls.
3010
3011
def possible_items(self,  # deprecated
                   name=None, type=None, kind=None, id=None,
                   nr=None, by_label=False, label=None):
    """Return a list of all values that the specified control can take.

    The control is located with _find_list_control; by_label is passed on
    to the control's possible_items method.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    return list_control.possible_items(by_label)
3017
3018
def set(self, selected, item_name,  # deprecated
        name=None, type=None, kind=None, id=None, nr=None,
        by_label=False, label=None):
    """Select / deselect named list item.

    selected: boolean selected state
    item_name: name of the list item, passed on to the control

    The control is located with _find_list_control.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.set(selected, item_name, by_label)
3028
def toggle(self, item_name,  # deprecated
           name=None, type=None, kind=None, id=None, nr=None,
           by_label=False, label=None):
    """Toggle selected state of named list item.

    The control is located with _find_list_control.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.toggle(item_name, by_label)
3034
3035
def set_single(self, selected,  # deprecated
               name=None, type=None, kind=None, id=None,
               nr=None, by_label=None, label=None):
    """Select / deselect list item in a control having only one item.

    If the control has multiple list items, ItemCountError is raised.

    This is just a convenience method, so you don't need to know the item's
    name -- the item name in these single-item controls is usually
    something meaningless like "1" or "on".

    For example, if a checkbox has a single item named "on", the following
    two calls are equivalent:

    control.toggle("on")
    control.toggle_single()

    """  # by_label ignored and deprecated
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.set_single(selected)
3055
def toggle_single(self, name=None, type=None, kind=None, id=None,
                  nr=None, by_label=None, label=None):  # deprecated
    """Toggle selected state of list item in control having only one item.

    The rest is as for HTMLForm.set_single.__doc__.

    """  # by_label ignored and deprecated
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.toggle_single()
3063
3064
#---------------------------------------------------
3065
# Form-filling method applying only to FileControls.
3066
3067
def add_file(self, file_object, content_type=None, filename=None,
             name=None, id=None, nr=None, label=None):
    """Add a file to be uploaded.

    The control is located with find_control (type "file"), and the first
    three arguments are passed unchanged to its add_file method.

    file_object: file-like object (with read method) from which to read
     data to upload
    content_type: MIME content type of data to upload
    filename: filename to pass to server

    If filename is None, no filename is sent to the server.

    If content_type is None, the content type is guessed based on the
    filename and the data read from the file object.

    XXX
    At the moment, guessed content type is always application/octet-stream.
    Use sndhdr, imghdr modules.  Should also try to guess HTML, XML, and
    plain text.

    Note the following useful HTML attributes of file upload controls (see
    HTML 4.01 spec, section 17):

    accept: comma-separated list of content types that the server will
     handle correctly; you can use this to filter out non-conforming files
    size: XXX IIRC, this is indicative of whether form wants multiple or
     single files
    maxlength: XXX hint of max content length in bytes?

    """
    file_control = self.find_control(name, "file", id=id, label=label, nr=nr)
    file_control.add_file(file_object, content_type, filename)
3098
3099
#---------------------------------------------------
3100
# Form submission methods, applying only to clickable controls.
3101
3102
def click(self, name=None, type=None, id=None, nr=0, coord=(1,1),
          request_class=_urllib.request.Request,
          label=None):
    """Return request that would result from clicking on a control.

    The request object is an instance of the request class the form was
    constructed with (by default _urllib.request.Request), which you can
    pass to urlopen.  Note that the request_class argument of this method
    is accepted but not used: the form's own request class is always
    passed on.

    Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
    IMAGEs) can be clicked.

    Will click on the first clickable control, subject to the name, type
    and nr arguments (as for find_control).  If no name, type, id or number
    is specified and there are no clickable controls, a request will be
    returned for the form in its current, un-clicked, state.

    ControlNotFoundError is raised if any of name, type, id or nr is
    specified but no matching control is found.  ValueError is raised if
    the HTMLForm has an enctype attribute that is not recognised.

    You can optionally specify a coordinate to click at, which only makes a
    difference if you clicked on an image.

    """
    form_request_class = self._request_class
    return self._click(name, type, id, label, nr, coord, "request",
                       form_request_class)
3128
3129
def click_request_data(self,
                       name=None, type=None, id=None,
                       nr=0, coord=(1,1),
                       request_class=_urllib.request.Request,
                       label=None):
    """As for click method, but return a tuple (url, data, headers).

    You can use this data to send a request to the server yourself, for
    example when driving an HTTP connection object directly rather than
    going through urlopen.  Otherwise, use the click method.

    Sketch (untested):

    url, data, hdrs = form.click_request_data()
    r = _urllib.request.urlopen(url, data)

    As with click, the request_class argument is accepted but not used:
    the form's own request class is always passed on.

    """
    form_request_class = self._request_class
    return self._click(name, type, id, label, nr, coord, "request_data",
                       form_request_class)
3162
3163
def click_pairs(self, name=None, type=None, id=None,
                nr=0, coord=(1,1),
                label=None):
    """As for click_request_data, but returns a list of (key, value) pairs.

    You can use this list as an argument to ClientForm.urlencode.  This is
    usually only useful if you want to send the request yourself, or want
    to manually tweak the keys and/or values, which should not normally be
    necessary.  Otherwise, use the click method.

    Note that this method is only useful for forms of MIME type
    x-www-form-urlencoded.  In particular, it does not return the
    information required for file upload.  If you need file upload and are
    building the request yourself, use click_request_data.

    Use ClientForm.urlencode rather than a urlencode that only accepts a
    mapping, since a mapping loses the ordering of the pairs.

    """
    form_request_class = self._request_class
    return self._click(name, type, id, label, nr, coord, "pairs",
                       form_request_class)
3187
3188
#---------------------------------------------------
3189
3190
def find_control(self,
                 name=None, type=None, kind=None, id=None,
                 predicate=None, nr=None,
                 label=None):
    """Locate and return some specific control within the form.

    At least one of the name, type, kind, id, label, predicate and nr
    arguments must be supplied (otherwise ValueError is raised).  If no
    matching control is found, ControlNotFoundError is raised.

    If name is specified, then the control must have the indicated name.

    If type is specified then the control must have the specified type (in
    addition to the types possible for <input> HTML tags: "text",
    "password", "hidden", "submit", "image", "button", "radio", "checkbox",
    "file" we also have "reset", "buttonbutton", "submitbutton",
    "resetbutton", "textarea", "select" and "isindex").

    If kind is specified, then the control must fall into the specified
    group, each of which satisfies a particular interface.  The kinds are
    "text", "list", "multilist", "singlelist", "clickable" and "file".

    If id is specified, then the control must have the indicated id.

    If predicate is specified, then the control must match that function.
    The predicate function is passed the control as its single argument,
    and should return a boolean value indicating whether the control
    matched.

    nr, if supplied, is the sequence number of the control (where 0 is the
    first).  Note that control 0 is the first control matching all the
    other arguments (if supplied); it is not necessarily the first control
    in the form.  If no nr is supplied, AmbiguityError is raised if
    multiple controls match the other arguments (unless the
    .backwards_compat attribute is true).

    If label is specified, then the control must have this label.  Note
    that radio controls and checkboxes never have labels: their items do.

    """
    criteria = (name, type, kind, id, label, predicate, nr)
    if all(criterion is None for criterion in criteria):
        raise ValueError(
            "at least one argument must be supplied to specify control")
    return self._find_control(name, type, kind, id, label, predicate, nr)
3236
3237
#---------------------------------------------------
3238
# Private methods.
3239
3240
def _find_list_control(self,
3241
name=None, type=None, kind=None, id=None,
3242
label=None, nr=None):
3243
if ((name is None) and (type is None) and (kind is None) and
3244
(id is None) and (label is None) and (nr is None)):
3245
raise ValueError(
3246
"at least one argument must be supplied to specify control")
3247
3248
return self._find_control(name, type, kind, id, label,
3249
is_listcontrol, nr)
3250
3251
def _find_control(self, name, type, kind, id, label, predicate, nr):
    """Return the control matching all of the supplied criteria.

    A criterion that is None is not applied.  name may also be the
    Missing marker, which matches controls whose name is None.  label
    matches when it is a substring of the text of any of the control's
    labels.

    nr selects the nr-th (0-based) matching control.  When nr is None and
    self.backwards_compat is true, the first match is returned; when nr is
    None and backwards_compat is false, more than one match is an error.

    Raises TypeError for criteria of the wrong type, ValueError for a
    negative nr, AmbiguityError when several controls match and nr was
    not given, and ControlNotFoundError when nothing matches.

    """
    if ((name is not None) and (name is not Missing) and
            not isstringlike(name)):
        raise TypeError("control name must be string-like")
    if (type is not None) and not isstringlike(type):
        raise TypeError("control type must be string-like")
    if (kind is not None) and not isstringlike(kind):
        raise TypeError("control kind must be string-like")
    if (id is not None) and not isstringlike(id):
        raise TypeError("control id must be string-like")
    if (label is not None) and not isstringlike(label):
        raise TypeError("control label must be string-like")
    if (predicate is not None) and not callable(predicate):
        raise TypeError("control predicate must be callable")
    if (nr is not None) and nr < 0:
        raise ValueError("control number must be a positive integer")

    # nr is counted down in the loop below; keep what the caller asked for
    # so that the error message can report it
    orig_nr = nr
    found = None
    ambiguous = False
    if nr is None and self.backwards_compat:
        nr = 0

    for control in self.controls:
        if ((name is not None and name != control.name) and
                (name is not Missing or control.name is not None)):
            continue
        if type is not None and type != control.type:
            continue
        if kind is not None and not control.is_of_kind(kind):
            continue
        if id is not None and id != control.id:
            continue
        if predicate and not predicate(control):
            continue
        if label:
            for l in control.get_labels():
                if l.text.find(label) > -1:
                    break
            else:
                # no label of this control contains the requested text
                continue
        if nr is not None:
            if nr == 0:
                return control  # early exit: unambiguous due to nr
            nr -= 1
            continue
        if found:
            ambiguous = True
            break
        found = control

    if found and not ambiguous:
        return found

    description = []
    if name is not None: description.append("name %s" % repr(name))
    if type is not None: description.append("type '%s'" % type)
    if kind is not None: description.append("kind '%s'" % kind)
    if id is not None: description.append("id '%s'" % id)
    if label is not None: description.append("label '%s'" % label)
    if predicate is not None:
        description.append("predicate %s" % predicate)
    # "is not None" rather than truthiness, so that an explicitly requested
    # nr=0 is reported too
    if orig_nr is not None: description.append("nr %d" % orig_nr)
    description = ", ".join(description)

    if ambiguous:
        raise AmbiguityError("more than one control matching "+description)
    elif not found:
        raise ControlNotFoundError("no control matching "+description)
    assert False
3321
3322
def _click(self, name, type, id, label, nr, coord, return_type,
           request_class=_urllib.request.Request):
    """Click the matching clickable control and return the result.

    The control is located with _find_control, restricted to kind
    "clickable", and its _click method produces the result in the form
    selected by return_type (see _switch_click).

    If no clickable control matches and the caller did not ask for a
    specific one (name, type, id and label all None, nr == 0), the form is
    submitted in its current, un-clicked state.  If a specific control was
    asked for, ControlNotFoundError propagates.

    """
    try:
        control = self._find_control(
            name, type, "clickable", id, label, None, nr)
    except ControlNotFoundError:
        # label counts as an explicit request too: a click on a label that
        # matches nothing must not silently submit the un-clicked form
        explicitly_requested = (
            (name is not None) or (type is not None) or (id is not None) or
            (label is not None) or (nr != 0))
        if explicitly_requested:
            raise
        # no clickable controls, but no control was explicitly requested,
        # so return state without clicking any control
        return self._switch_click(return_type, request_class)
    else:
        return control._click(self, coord, return_type, request_class)
3336
3337
def _pairs(self):
3338
"""Return sequence of (key, value) pairs suitable for urlencoding."""
3339
return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()]
3340
3341
3342
def _pairs_and_controls(self):
3343
"""Return sequence of (index, key, value, control_index)
3344
of totally ordered pairs suitable for urlencoding.
3345
3346
control_index is the index of the control in self.controls
3347
"""
3348
pairs = []
3349
for control_index in xrange(len(self.controls)):
3350
control = self.controls[control_index]
3351
for ii, key, val in control._totally_ordered_pairs():
3352
pairs.append((ii, key, val, control_index))
3353
3354
# stable sort by ONLY first item in tuple
3355
pairs.sort()
3356
3357
return pairs
3358
3359
def _request_data(self):
3360
"""Return a tuple (url, data, headers)."""
3361
method = self.method.upper()
3362
#scheme, netloc, path, parameters, query, frag = _urllib.parse.urlparse(self.action)
3363
parts = self._urlparse(self.action)
3364
rest, (query, frag) = parts[:-2], parts[-2:]
3365
3366
if method == "GET":
3367
self.enctype = "application/x-www-form-urlencoded" # force it
3368
parts = rest + (urlencode(self._pairs()), None)
3369
uri = self._urlunparse(parts)
3370
return uri, None, []
3371
elif method == "POST":
3372
parts = rest + (query, None)
3373
uri = self._urlunparse(parts)
3374
if self.enctype == "application/x-www-form-urlencoded":
3375
return (uri, urlencode(self._pairs()),
3376
[("Content-Type", self.enctype)])
3377
elif self.enctype == "text/plain":
3378
return (uri, self._pairs(),
3379
[("Content-Type", self.enctype)])
3380
elif self.enctype == "multipart/form-data":
3381
data = _cStringIO()
3382
http_hdrs = []
3383
mw = MimeWriter(data, http_hdrs)
3384
f = mw.startmultipartbody("form-data", add_to_http_hdrs=True,
3385
prefix=0)
3386
for ii, k, v, control_index in self._pairs_and_controls():
3387
self.controls[control_index]._write_mime_data(mw, k, v)
3388
mw.lastpart()
3389
return uri, data.getvalue(), http_hdrs
3390
else:
3391
raise ValueError(
3392
"unknown POST form encoding type '%s'" % self.enctype)
3393
else:
3394
raise ValueError("Unknown method '%s'" % method)
3395
3396
def _switch_click(self, return_type, request_class=_urllib.request.Request):
    """Return the form's submission data in the requested shape.

    This is called by HTMLForm and clickable Controls to hide switching
    on return_type:

    "pairs": the result of _pairs()
    "request_data": the result of _request_data()
    anything else: a request_class instance built from the request data,
     with its headers added

    """
    if return_type == "pairs":
        return self._pairs()

    req_data = self._request_data()
    if return_type == "request_data":
        return req_data

    req = request_class(req_data[0], req_data[1])
    for key, val in req_data[2]:
        add_hdr = req.add_header
        if key.lower() == "content-type":
            # prefer add_unredirected_header when the request class has it
            # (pre-2.4 and not using ClientCookie: it doesn't)
            add_hdr = getattr(req, "add_unredirected_header", add_hdr)
        add_hdr(key, val)
    return req
3417
3418