Path: blob/master/thirdparty/clientform/clientform.py
2992 views
"""HTML form handling for web clients.12ClientForm is a Python module for handling HTML forms on the client3side, useful for parsing HTML forms, filling them in and returning the4completed forms to the server. It has developed from a port of Gisle5Aas' Perl module HTML::Form, from the libwww-perl library, but the6interface is not the same.78The most useful docstring is the one for HTMLForm.910RFC 1866: HTML 2.011RFC 1867: Form-based File Upload in HTML12RFC 2388: Returning Values from Forms: multipart/form-data13HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)14HTML 4.01 Specification, W3C Recommendation 24 December 1999151617Copyright 2002-2007 John J. Lee <[email protected]>18Copyright 2005 Gary Poster19Copyright 2005 Zope Corporation20Copyright 1998-2000 Gisle Aas.2122This code is free software; you can redistribute it and/or modify it23under the terms of the BSD or ZPL 2.1 licenses (see the file24COPYING.txt included with the distribution).2526"""2728# XXX29# Remove parser testing hack30# safeUrl()-ize action31# Switch to unicode throughout (would be 0.3.x)32# See Wichert Akkerman's 2004-01-22 message to c.l.py.33# Add charset parameter to Content-type headers? How to find value??34# Add some more functional tests35# Especially single and multiple file upload on the internet.36# Does file upload work when name is missing? Sourceforge tracker form37# doesn't like it. Check standards, and test with Apache. Test38# binary upload with Apache.39# mailto submission & enctype text/plain40# I'm not going to fix this unless somebody tells me what real servers41# that want this encoding actually expect: If enctype is42# application/x-www-form-urlencoded and there's a FILE control present.43# Strictly, it should be 'name=data' (see HTML 4.01 spec., section44# 17.13.2), but I send "name=" ATM. What about multiple file upload??4546# Would be nice, but I'm not going to do it myself:47# -------------------------------------------------48# Maybe a 0.4.x?49# Replace by_label etc. with moniker / selector concept. Allows, eg.,50# a choice between selection by value / id / label / element51# contents. Or choice between matching labels exactly or by52# substring. Etc.53# Remove deprecated methods.54# ...what else?55# Work on DOMForm.56# XForms? Don't know if there's a need here.5758__all__ = ['AmbiguityError', 'CheckboxControl', 'Control',59'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm',60'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl',61'Item', 'ItemCountError', 'ItemNotFoundError', 'Label',62'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile',63'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl',64'RadioControl', 'ScalarControl', 'SelectControl',65'SubmitButtonControl', 'SubmitControl', 'TextControl',66'TextareaControl', 'XHTMLCompatibleFormParser']6768try:69import logging70import inspect71except ImportError:72def debug(msg, *args, **kwds):73pass74else:75_logger = logging.getLogger("ClientForm")76OPTIMIZATION_HACK = True7778def debug(msg, *args, **kwds):79if OPTIMIZATION_HACK:80return8182caller_name = inspect.stack()[1][3]83extended_msg = '%%s %s' % msg84extended_args = (caller_name,)+args85debug = _logger.debug(extended_msg, *extended_args, **kwds)8687def _show_debug_messages():88global OPTIMIZATION_HACK89OPTIMIZATION_HACK = False90_logger.setLevel(logging.DEBUG)91handler = logging.StreamHandler(sys.stdout)92handler.setLevel(logging.DEBUG)93_logger.addHandler(handler)9495try:96from thirdparty import six97from thirdparty.six import unichr as _unichr98from thirdparty.six.moves import cStringIO as _cStringIO99from thirdparty.six.moves import html_entities as _html_entities100from thirdparty.six.moves import urllib as _urllib101except ImportError:102import six103from six import unichr as _unichr104from six.moves import cStringIO as _cStringIO105from six.moves import html_entities as _html_entities106from six.moves import urllib as _urllib107108try:109import sgmllib110except ImportError:111from lib.utils import sgmllib112113import sys, re, random114115if sys.version_info >= (3, 0):116xrange = range117118# monkeypatch to fix http://www.python.org/sf/803422 :-(119sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")120121# HTMLParser.HTMLParser is recent, so live without it if it's not available122# (also, sgmllib.SGMLParser is much more tolerant of bad HTML)123try:124import HTMLParser125except ImportError:126HAVE_MODULE_HTMLPARSER = False127else:128HAVE_MODULE_HTMLPARSER = True129130try:131import warnings132except ImportError:133def deprecation(message, stack_offset=0):134pass135else:136def deprecation(message, stack_offset=0):137warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset)138139VERSION = "0.2.10"140141CHUNK = 1024 # size of chunks fed to parser, in bytes142143DEFAULT_ENCODING = "latin-1"144145class Missing: pass146147_compress_re = re.compile(r"\s+")148def compress_text(text): return _compress_re.sub(" ", text.strip())149150def normalize_line_endings(text):151return re.sub(r"(?:(?<!\r)\n)|(?:\r(?!\n))", "\r\n", text)152153def _quote_plus(value):154if not isinstance(value, six.string_types):155value = six.text_type(value)156157if isinstance(value, six.text_type):158value = value.encode("utf8")159160return _urllib.parse.quote_plus(value)161162# This version of urlencode is from my Python 1.5.2 back-port of the163# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence164# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.165def urlencode(query,doseq=False,):166"""Encode a sequence of two-element tuples or dictionary into a URL query \167string.168169If any values in the query arg are sequences and doseq is true, each170sequence element is converted to a separate parameter.171172If the query arg is a sequence of two-element tuples, the order of the173parameters in the output will match the order of parameters in the174input.175"""176177if hasattr(query,"items"):178# mapping objects179query = query.items()180else:181# it's a bother at times that strings and string-like objects are182# sequences...183try:184# non-sequence items should not work with len()185x = len(query)186# non-empty strings will fail this187if len(query) and type(query[0]) != tuple:188raise TypeError()189# zero-length sequences of all types will get here and succeed,190# but that's a minor nit - since the original implementation191# allowed empty dicts that type of behavior probably should be192# preserved for consistency193except TypeError:194ty,va,tb = sys.exc_info()195raise TypeError("not a valid non-string sequence or mapping "196"object", tb)197198l = []199if not doseq:200# preserve old behavior201for k, v in query:202k = _quote_plus(k)203v = _quote_plus(v)204l.append(k + '=' + v)205else:206for k, v in query:207k = _quote_plus(k)208if isinstance(v, six.string_types):209v = _quote_plus(v)210l.append(k + '=' + v)211else:212try:213# is this a sufficient test for sequence-ness?214x = len(v)215except TypeError:216# not a sequence217v = _quote_plus(v)218l.append(k + '=' + v)219else:220# loop over the sequence221for elt in v:222l.append(k + '=' + _quote_plus(elt))223return '&'.join(l)224225def unescape(data, entities, encoding=DEFAULT_ENCODING):226if data is None or "&" not in data:227return data228229if isinstance(data, six.string_types):230encoding = None231232def replace_entities(match, entities=entities, encoding=encoding):233ent = match.group()234if ent[1] == "#":235return unescape_charref(ent[2:-1], encoding)236237repl = entities.get(ent)238if repl is not None:239if hasattr(repl, "decode") and encoding is not None:240try:241repl = repl.decode(encoding)242except UnicodeError:243repl = ent244else:245repl = ent246247return repl248249return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)250251def unescape_charref(data, encoding):252name, base = data, 10253if name.startswith("x"):254name, base= name[1:], 16255elif not name.isdigit():256base = 16257258try:259return _unichr(int(name, base))260except:261return data262263def get_entitydefs():264from codecs import latin_1_decode265entitydefs = {}266try:267_html_entities.name2codepoint268except AttributeError:269entitydefs = {}270for name, char in _html_entities.entitydefs.items():271uc = latin_1_decode(char)[0]272if uc.startswith("&#") and uc.endswith(";"):273uc = unescape_charref(uc[2:-1], None)274entitydefs["&%s;" % name] = uc275else:276for name, codepoint in _html_entities.name2codepoint.items():277entitydefs["&%s;" % name] = _unichr(codepoint)278return entitydefs279280def issequence(x):281try:282x[0]283except (TypeError, KeyError):284return False285except IndexError:286pass287return True288289def isstringlike(x):290try: x+""291except: return False292else: return True293294295def choose_boundary():296"""Return a string usable as a multipart boundary."""297# follow IE and firefox298nonce = "".join([str(random.randint(0, sys.maxsize-1)) for i in (0,1,2)])299return "-"*27 + nonce300301# This cut-n-pasted MimeWriter from standard library is here so can add302# to HTTP headers rather than message body when appropriate. It also uses303# \r\n in place of \n. This is a bit nasty.304class MimeWriter:305306"""Generic MIME writer.307308Methods:309310__init__()311addheader()312flushheaders()313startbody()314startmultipartbody()315nextpart()316lastpart()317318A MIME writer is much more primitive than a MIME parser. It319doesn't seek around on the output file, and it doesn't use large320amounts of buffer space, so you have to write the parts in the321order they should occur on the output file. It does buffer the322headers you add, allowing you to rearrange their order.323324General usage is:325326f = <open the output file>327w = MimeWriter(f)328...call w.addheader(key, value) 0 or more times...329330followed by either:331332f = w.startbody(content_type)333...call f.write(data) for body data...334335or:336337w.startmultipartbody(subtype)338for each part:339subwriter = w.nextpart()340...use the subwriter's methods to create the subpart...341w.lastpart()342343The subwriter is another MimeWriter instance, and should be344treated in the same way as the toplevel MimeWriter. This way,345writing recursive body parts is easy.346347Warning: don't forget to call lastpart()!348349XXX There should be more state so calls made in the wrong order350are detected.351352Some special cases:353354- startbody() just returns the file passed to the constructor;355but don't use this knowledge, as it may be changed.356357- startmultipartbody() actually returns a file as well;358this can be used to write the initial 'if you can read this your359mailer is not MIME-aware' message.360361- If you call flushheaders(), the headers accumulated so far are362written out (and forgotten); this is useful if you don't need a363body part at all, e.g. for a subpart of type message/rfc822364that's (mis)used to store some header-like information.365366- Passing a keyword argument 'prefix=<flag>' to addheader(),367start*body() affects where the header is inserted; 0 means368append at the end, 1 means insert at the start; default is369append for addheader(), but insert for start*body(), which use370it to determine where the Content-type header goes.371372"""373374def __init__(self, fp, http_hdrs=None):375self._http_hdrs = http_hdrs376self._fp = fp377self._headers = []378self._boundary = []379self._first_part = True380381def addheader(self, key, value, prefix=0,382add_to_http_hdrs=0):383"""384prefix is ignored if add_to_http_hdrs is true.385"""386lines = value.split("\r\n")387while lines and not lines[-1]: del lines[-1]388while lines and not lines[0]: del lines[0]389if add_to_http_hdrs:390value = "".join(lines)391# 2.2 urllib2 doesn't normalize header case392self._http_hdrs.append((key.capitalize(), value))393else:394for i in xrange(1, len(lines)):395lines[i] = " " + lines[i].strip()396value = "\r\n".join(lines) + "\r\n"397line = key.title() + ": " + value398if prefix:399self._headers.insert(0, line)400else:401self._headers.append(line)402403def flushheaders(self):404self._fp.writelines(self._headers)405self._headers = []406407def startbody(self, ctype=None, plist=[], prefix=1,408add_to_http_hdrs=0, content_type=1):409"""410prefix is ignored if add_to_http_hdrs is true.411"""412if content_type and ctype:413for name, value in plist:414ctype = ctype + ';\r\n %s=%s' % (name, value)415self.addheader("Content-Type", ctype, prefix=prefix,416add_to_http_hdrs=add_to_http_hdrs)417self.flushheaders()418if not add_to_http_hdrs: self._fp.write("\r\n")419self._first_part = True420return self._fp421422def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1,423add_to_http_hdrs=0, content_type=1):424boundary = boundary or choose_boundary()425self._boundary.append(boundary)426return self.startbody("multipart/" + subtype,427[("boundary", boundary)] + plist,428prefix=prefix,429add_to_http_hdrs=add_to_http_hdrs,430content_type=content_type)431432def nextpart(self):433boundary = self._boundary[-1]434if self._first_part:435self._first_part = False436else:437self._fp.write("\r\n")438self._fp.write("--" + boundary + "\r\n")439return self.__class__(self._fp)440441def lastpart(self):442if self._first_part:443self.nextpart()444boundary = self._boundary.pop()445self._fp.write("\r\n--" + boundary + "--\r\n")446447448class LocateError(ValueError): pass449class AmbiguityError(LocateError): pass450class ControlNotFoundError(LocateError): pass451class ItemNotFoundError(LocateError): pass452453class ItemCountError(ValueError): pass454455# for backwards compatibility, ParseError derives from exceptions that were456# raised by versions of ClientForm <= 0.2.5457if HAVE_MODULE_HTMLPARSER:458SGMLLIB_PARSEERROR = sgmllib.SGMLParseError459class ParseError(sgmllib.SGMLParseError,460HTMLParser.HTMLParseError,461):462pass463else:464if hasattr(sgmllib, "SGMLParseError"):465SGMLLIB_PARSEERROR = sgmllib.SGMLParseError466class ParseError(sgmllib.SGMLParseError):467pass468else:469SGMLLIB_PARSEERROR = RuntimeError470class ParseError(RuntimeError):471pass472473474class _AbstractFormParser:475"""forms attribute contains HTMLForm instances on completion."""476# thanks to Moshe Zadka for an example of sgmllib/htmllib usage477def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):478if entitydefs is None:479entitydefs = get_entitydefs()480self._entitydefs = entitydefs481self._encoding = encoding482483self.base = None484self.forms = []485self.labels = []486self._current_label = None487self._current_form = None488self._select = None489self._optgroup = None490self._option = None491self._textarea = None492493# forms[0] will contain all controls that are outside of any form494# self._global_form is an alias for self.forms[0]495self._global_form = None496self.start_form([])497self.end_form()498self._current_form = self._global_form = self.forms[0]499500def do_base(self, attrs):501debug("%s", attrs)502for key, value in attrs:503if key == "href":504self.base = self.unescape_attr_if_required(value)505506def end_body(self):507debug("")508if self._current_label is not None:509self.end_label()510if self._current_form is not self._global_form:511self.end_form()512513def start_form(self, attrs):514debug("%s", attrs)515if self._current_form is not self._global_form:516raise ParseError("nested FORMs")517name = None518action = None519enctype = "application/x-www-form-urlencoded"520method = "GET"521d = {}522for key, value in attrs:523if key == "name":524name = self.unescape_attr_if_required(value)525elif key == "action":526action = self.unescape_attr_if_required(value)527elif key == "method":528method = self.unescape_attr_if_required(value.upper())529elif key == "enctype":530enctype = self.unescape_attr_if_required(value.lower())531d[key] = self.unescape_attr_if_required(value)532controls = []533self._current_form = (name, action, method, enctype), d, controls534535def end_form(self):536debug("")537if self._current_label is not None:538self.end_label()539if self._current_form is self._global_form:540raise ParseError("end of FORM before start")541self.forms.append(self._current_form)542self._current_form = self._global_form543544def start_select(self, attrs):545debug("%s", attrs)546if self._select is not None:547raise ParseError("nested SELECTs")548if self._textarea is not None:549raise ParseError("SELECT inside TEXTAREA")550d = {}551for key, val in attrs:552d[key] = self.unescape_attr_if_required(val)553554self._select = d555self._add_label(d)556557self._append_select_control({"__select": d})558559def end_select(self):560debug("")561if self._select is None:562raise ParseError("end of SELECT before start")563564if self._option is not None:565self._end_option()566567self._select = None568569def start_optgroup(self, attrs):570debug("%s", attrs)571if self._select is None:572raise ParseError("OPTGROUP outside of SELECT")573d = {}574for key, val in attrs:575d[key] = self.unescape_attr_if_required(val)576577self._optgroup = d578579def end_optgroup(self):580debug("")581if self._optgroup is None:582raise ParseError("end of OPTGROUP before start")583self._optgroup = None584585def _start_option(self, attrs):586debug("%s", attrs)587if self._select is None:588raise ParseError("OPTION outside of SELECT")589if self._option is not None:590self._end_option()591592d = {}593for key, val in attrs:594d[key] = self.unescape_attr_if_required(val)595596self._option = {}597self._option.update(d)598if (self._optgroup and "disabled" in self._optgroup and599"disabled" not in self._option):600self._option["disabled"] = None601602def _end_option(self):603debug("")604if self._option is None:605raise ParseError("end of OPTION before start")606607contents = self._option.get("contents", "").strip()608self._option["contents"] = contents609if "value" not in self._option:610self._option["value"] = contents611if "label" not in self._option:612self._option["label"] = contents613# stuff dict of SELECT HTML attrs into a special private key614# (gets deleted again later)615self._option["__select"] = self._select616self._append_select_control(self._option)617self._option = None618619def _append_select_control(self, attrs):620debug("%s", attrs)621controls = self._current_form[2]622name = self._select.get("name")623controls.append(("select", name, attrs))624625def start_textarea(self, attrs):626debug("%s", attrs)627if self._textarea is not None:628raise ParseError("nested TEXTAREAs")629if self._select is not None:630raise ParseError("TEXTAREA inside SELECT")631d = {}632for key, val in attrs:633d[key] = self.unescape_attr_if_required(val)634self._add_label(d)635636self._textarea = d637638def end_textarea(self):639debug("")640if self._textarea is None:641raise ParseError("end of TEXTAREA before start")642controls = self._current_form[2]643name = self._textarea.get("name")644controls.append(("textarea", name, self._textarea))645self._textarea = None646647def start_label(self, attrs):648debug("%s", attrs)649if self._current_label:650self.end_label()651d = {}652for key, val in attrs:653d[key] = self.unescape_attr_if_required(val)654taken = bool(d.get("for")) # empty id is invalid655d["__text"] = ""656d["__taken"] = taken657if taken:658self.labels.append(d)659self._current_label = d660661def end_label(self):662debug("")663label = self._current_label664if label is None:665# something is ugly in the HTML, but we're ignoring it666return667self._current_label = None668# if it is staying around, it is True in all cases669del label["__taken"]670671def _add_label(self, d):672#debug("%s", d)673if self._current_label is not None:674if not self._current_label["__taken"]:675self._current_label["__taken"] = True676d["__label"] = self._current_label677678def handle_data(self, data):679debug("%s", data)680681if self._option is not None:682# self._option is a dictionary of the OPTION element's HTML683# attributes, but it has two special keys, one of which is the684# special "contents" key contains text between OPTION tags (the685# other is the "__select" key: see the end_option method)686map = self._option687key = "contents"688elif self._textarea is not None:689map = self._textarea690key = "value"691data = normalize_line_endings(data)692# not if within option or textarea693elif self._current_label is not None:694map = self._current_label695key = "__text"696else:697return698699if data and key not in map:700# according to701# http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break702# immediately after start tags or immediately before end tags must703# be ignored, but real browsers only ignore a line break after a704# start tag, so we'll do that.705if data[0:2] == "\r\n":706data = data[2:]707elif data[0:1] in ["\n", "\r"]:708data = data[1:]709map[key] = data710else:711map[key] = (map[key].decode("utf8", "replace") if isinstance(map[key], six.binary_type) else map[key]) + data712713def do_button(self, attrs):714debug("%s", attrs)715d = {}716d["type"] = "submit" # default717for key, val in attrs:718d[key] = self.unescape_attr_if_required(val)719controls = self._current_form[2]720721type = d["type"]722name = d.get("name")723# we don't want to lose information, so use a type string that724# doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}725# e.g. type for BUTTON/RESET is "resetbutton"726# (type for INPUT/RESET is "reset")727type = type+"button"728self._add_label(d)729controls.append((type, name, d))730731def do_input(self, attrs):732debug("%s", attrs)733d = {}734d["type"] = "text" # default735for key, val in attrs:736d[key] = self.unescape_attr_if_required(val)737controls = self._current_form[2]738739type = d["type"]740name = d.get("name")741self._add_label(d)742controls.append((type, name, d))743744def do_isindex(self, attrs):745debug("%s", attrs)746d = {}747for key, val in attrs:748d[key] = self.unescape_attr_if_required(val)749controls = self._current_form[2]750751self._add_label(d)752# isindex doesn't have type or name HTML attributes753controls.append(("isindex", None, d))754755def handle_entityref(self, name):756#debug("%s", name)757self.handle_data(unescape(758'&%s;' % name, self._entitydefs, self._encoding))759760def handle_charref(self, name):761#debug("%s", name)762self.handle_data(unescape_charref(name, self._encoding))763764def unescape_attr(self, name):765#debug("%s", name)766return unescape(name, self._entitydefs, self._encoding)767768def unescape_attrs(self, attrs):769#debug("%s", attrs)770escaped_attrs = {}771for key, val in attrs.items():772try:773val.items774except AttributeError:775escaped_attrs[key] = self.unescape_attr(val)776else:777# e.g. "__select" -- yuck!778escaped_attrs[key] = self.unescape_attrs(val)779return escaped_attrs780781def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)782def unknown_charref(self, ref): self.handle_data("&#%s;" % ref)783784785if not HAVE_MODULE_HTMLPARSER:786class XHTMLCompatibleFormParser:787def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):788raise ValueError("HTMLParser could not be imported")789else:790class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):791"""Good for XHTML, bad for tolerance of incorrect HTML."""792# thanks to Michael Howitz for this!793def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):794HTMLParser.HTMLParser.__init__(self)795_AbstractFormParser.__init__(self, entitydefs, encoding)796797def feed(self, data):798try:799HTMLParser.HTMLParser.feed(self, data)800except HTMLParser.HTMLParseError as exc:801raise ParseError(exc)802803def start_option(self, attrs):804_AbstractFormParser._start_option(self, attrs)805806def end_option(self):807_AbstractFormParser._end_option(self)808809def handle_starttag(self, tag, attrs):810try:811method = getattr(self, "start_" + tag)812except AttributeError:813try:814method = getattr(self, "do_" + tag)815except AttributeError:816pass # unknown tag817else:818method(attrs)819else:820method(attrs)821822def handle_endtag(self, tag):823try:824method = getattr(self, "end_" + tag)825except AttributeError:826pass # unknown tag827else:828method()829830def unescape(self, name):831# Use the entitydefs passed into constructor, not832# HTMLParser.HTMLParser's entitydefs.833return self.unescape_attr(name)834835def unescape_attr_if_required(self, name):836return name # HTMLParser.HTMLParser already did it837def unescape_attrs_if_required(self, attrs):838return attrs # ditto839840def close(self):841HTMLParser.HTMLParser.close(self)842self.end_body()843844845class _AbstractSgmllibParser(_AbstractFormParser):846847def do_option(self, attrs):848_AbstractFormParser._start_option(self, attrs)849850if sys.version_info[:2] >= (2,5):851# we override this attr to decode hex charrefs852entity_or_charref = re.compile(853'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')854def convert_entityref(self, name):855return unescape("&%s;" % name, self._entitydefs, self._encoding)856def convert_charref(self, name):857return unescape_charref("%s" % name, self._encoding)858def unescape_attr_if_required(self, name):859return name # sgmllib already did it860def unescape_attrs_if_required(self, attrs):861return attrs # ditto862else:863def unescape_attr_if_required(self, name):864return self.unescape_attr(name)865def unescape_attrs_if_required(self, attrs):866return self.unescape_attrs(attrs)867868869class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):870"""Good for tolerance of incorrect HTML, bad for XHTML."""871def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):872sgmllib.SGMLParser.__init__(self)873_AbstractFormParser.__init__(self, entitydefs, encoding)874875def feed(self, data):876try:877sgmllib.SGMLParser.feed(self, data)878except SGMLLIB_PARSEERROR as exc:879raise ParseError(exc)880881def close(self):882sgmllib.SGMLParser.close(self)883self.end_body()884885886# sigh, must support mechanize by allowing dynamic creation of classes based on887# its bundled copy of BeautifulSoup (which was necessary because of dependency888# problems)889890def _create_bs_classes(bs,891icbinbs,892):893class _AbstractBSFormParser(_AbstractSgmllibParser):894bs_base_class = None895def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):896_AbstractFormParser.__init__(self, entitydefs, encoding)897self.bs_base_class.__init__(self)898def handle_data(self, data):899_AbstractFormParser.handle_data(self, data)900self.bs_base_class.handle_data(self, data)901def feed(self, data):902try:903self.bs_base_class.feed(self, data)904except SGMLLIB_PARSEERROR as exc:905raise ParseError(exc)906def close(self):907self.bs_base_class.close(self)908self.end_body()909910class RobustFormParser(_AbstractBSFormParser, bs):911"""Tries to be highly tolerant of incorrect HTML."""912pass913RobustFormParser.bs_base_class = bs914class NestingRobustFormParser(_AbstractBSFormParser, icbinbs):915"""Tries to be highly tolerant of incorrect HTML.916917Different from RobustFormParser in that it more often guesses nesting918above missing end tags (see BeautifulSoup docs).919920"""921pass922NestingRobustFormParser.bs_base_class = icbinbs923924return RobustFormParser, NestingRobustFormParser925926try:927if sys.version_info[:2] < (2, 2):928raise ImportError # BeautifulSoup uses generators929import BeautifulSoup930except ImportError:931pass932else:933RobustFormParser, NestingRobustFormParser = _create_bs_classes(934BeautifulSoup.BeautifulSoup, BeautifulSoup.ICantBelieveItsBeautifulSoup935)936__all__ += ['RobustFormParser', 'NestingRobustFormParser']937938939#FormParser = XHTMLCompatibleFormParser # testing hack940#FormParser = RobustFormParser # testing hack941942943def ParseResponseEx(response,944select_default=False,945form_parser_class=FormParser,946request_class=_urllib.request.Request,947entitydefs=None,948encoding=DEFAULT_ENCODING,949950# private951_urljoin=_urllib.parse.urljoin,952_urlparse=_urllib.parse.urlparse,953_urlunparse=_urllib.parse.urlunparse,954):955"""Identical to ParseResponse, except that:9569571. The returned list contains an extra item. The first form in the list958contains all controls not contained in any FORM element.9599602. The arguments ignore_errors and backwards_compat have been removed.9619623. Backwards-compatibility mode (backwards_compat=True) is not available.963"""964return _ParseFileEx(response, response.geturl(),965select_default,966False,967form_parser_class,968request_class,969entitydefs,970False,971encoding,972_urljoin=_urljoin,973_urlparse=_urlparse,974_urlunparse=_urlunparse,975)976977def ParseFileEx(file, base_uri,978select_default=False,979form_parser_class=FormParser,980request_class=_urllib.request.Request,981entitydefs=None,982encoding=DEFAULT_ENCODING,983984# private985_urljoin=_urllib.parse.urljoin,986_urlparse=_urllib.parse.urlparse,987_urlunparse=_urllib.parse.urlunparse,988):989"""Identical to ParseFile, except that:9909911. The returned list contains an extra item. The first form in the list992contains all controls not contained in any FORM element.9939942. The arguments ignore_errors and backwards_compat have been removed.9959963. Backwards-compatibility mode (backwards_compat=True) is not available.997"""998return _ParseFileEx(file, base_uri,999select_default,1000False,1001form_parser_class,1002request_class,1003entitydefs,1004False,1005encoding,1006_urljoin=_urljoin,1007_urlparse=_urlparse,1008_urlunparse=_urlunparse,1009)10101011def ParseResponse(response, *args, **kwds):1012"""Parse HTTP response and return a list of HTMLForm instances.10131014The return value of urllib2.urlopen can be conveniently passed to this1015function as the response parameter.10161017ClientForm.ParseError is raised on parse errors.10181019response: file-like object (supporting read() method) with a method1020geturl(), returning the URI of the HTTP response1021select_default: for multiple-selection SELECT controls and RADIO controls,1022pick the first item as the default if none are selected in the HTML1023form_parser_class: class to instantiate and use to pass1024request_class: class to return from .click() method (default is1025_urllib.request.Request)1026entitydefs: mapping like {"&": "&", ...} containing HTML entity1027definitions (a sensible default is used)1028encoding: character encoding used for encoding numeric character references1029when matching link text. ClientForm does not attempt to find the encoding1030in a META HTTP-EQUIV attribute in the document itself (mechanize, for1031example, does do that and will pass the correct value to ClientForm using1032this parameter).10331034backwards_compat: boolean that determines whether the returned HTMLForm1035objects are backwards-compatible with old code. If backwards_compat is1036true:10371038- ClientForm 0.1 code will continue to work as before.10391040- Label searches that do not specify a nr (number or count) will always1041get the first match, even if other controls match. If1042backwards_compat is False, label searches that have ambiguous results1043will raise an AmbiguityError.10441045- Item label matching is done by strict string comparison rather than1046substring matching.10471048- De-selecting individual list items is allowed even if the Item is1049disabled.10501051The backwards_compat argument will be deprecated in a future release.10521053Pass a true value for select_default if you want the behaviour specified by1054RFC 1866 (the HTML 2.0 standard), which is to select the first item in a1055RADIO or multiple-selection SELECT control if none were selected in the1056HTML. Most browsers (including Microsoft Internet Explorer (IE) and1057Netscape Navigator) instead leave all items unselected in these cases. The1058W3C HTML 4.0 standard leaves this behaviour undefined in the case of1059multiple-selection SELECT controls, but insists that at least one RADIO1060button should be checked at all times, in contradiction to browser1061behaviour.10621063There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses1064HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses1065sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.1066Note that HTMLParser is only available in Python 2.2 and later. You can1067pass your own class in here as a hack to work around bad HTML, but at your1068own risk: there is no well-defined interface.10691070"""1071return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:]10721073def ParseFile(file, base_uri, *args, **kwds):1074"""Parse HTML and return a list of HTMLForm instances.10751076ClientForm.ParseError is raised on parse errors.10771078file: file-like object (supporting read() method) containing HTML with zero1079or more forms to be parsed1080base_uri: the URI of the document (note that the base URI used to submit1081the form will be that given in the BASE element if present, not that of1082the document)10831084For the other arguments and further details, see ParseResponse.__doc__.10851086"""1087return _ParseFileEx(file, base_uri, *args, **kwds)[1:]10881089def _ParseFileEx(file, base_uri,1090select_default=False,1091ignore_errors=False,1092form_parser_class=FormParser,1093request_class=_urllib.request.Request,1094entitydefs=None,1095backwards_compat=True,1096encoding=DEFAULT_ENCODING,1097_urljoin=_urllib.parse.urljoin,1098_urlparse=_urllib.parse.urlparse,1099_urlunparse=_urllib.parse.urlunparse,1100):1101if backwards_compat:1102deprecation("operating in backwards-compatibility mode", 1)1103fp = form_parser_class(entitydefs, encoding)1104while 1:1105data = file.read(CHUNK)1106try:1107fp.feed(data)1108except ParseError as e:1109e.base_uri = base_uri1110raise1111if len(data) != CHUNK: break1112fp.close()1113if fp.base is not None:1114# HTML BASE element takes precedence over document URI1115base_uri = fp.base1116labels = [] # Label(label) for label in fp.labels]1117id_to_labels = {}1118for l in fp.labels:1119label = Label(l)1120labels.append(label)1121for_id = l["for"]1122coll = id_to_labels.get(for_id)1123if coll is None:1124id_to_labels[for_id] = [label]1125else:1126coll.append(label)1127forms = []1128for (name, action, method, enctype), attrs, controls in fp.forms:1129if action is None:1130action = base_uri1131else:1132action = six.text_type(action, "utf8") if action and isinstance(action, six.binary_type) else action1133action = _urljoin(base_uri, action)1134# would be nice to make HTMLForm class (form builder) pluggable1135form = HTMLForm(1136action, method, enctype, name, attrs, request_class,1137forms, labels, id_to_labels, backwards_compat)1138form._urlparse = _urlparse1139form._urlunparse = _urlunparse1140for ii in xrange(len(controls)):1141type, name, attrs = controls[ii]1142# index=ii*10 allows ImageControl to return multiple ordered pairs1143form.new_control(1144type, name, attrs, select_default=select_default, index=ii*10)1145forms.append(form)1146for form in forms:1147try:1148form.fixup()1149except AttributeError as ex:1150if not any(_ in str(ex) for _ in ("is disabled", "is readonly")):1151raise1152return forms115311541155class Label:1156def __init__(self, attrs):1157self.id = attrs.get("for")1158self._text = attrs.get("__text").strip()1159self._ctext = compress_text(self._text)1160self.attrs = attrs1161self._backwards_compat = False # maintained by HTMLForm11621163def __getattr__(self, name):1164if name == "text":1165if self._backwards_compat:1166return self._text1167else:1168return self._ctext1169return getattr(Label, name)11701171def __setattr__(self, name, value):1172if name == "text":1173# don't see any need for this, so make it read-only1174raise AttributeError("text attribute is read-only")1175self.__dict__[name] = value11761177def __str__(self):1178return "<Label(id=%r, text=%r)>" % (self.id, self.text)117911801181def _get_label(attrs):1182text = attrs.get("__label")1183if text is not None:1184return Label(text)1185else:1186return None11871188class Control:1189"""An HTML form control.11901191An HTMLForm contains a sequence of Controls. The Controls in an HTMLForm1192are accessed using the HTMLForm.find_control method or the1193HTMLForm.controls attribute.11941195Control instances are usually constructed using the ParseFile /1196ParseResponse functions. If you use those functions, you can ignore the1197rest of this paragraph. A Control is only properly initialised after the1198fixup method has been called. In fact, this is only strictly necessary for1199ListControl instances. This is necessary because ListControls are built up1200from ListControls each containing only a single item, and their initial1201value(s) can only be known after the sequence is complete.12021203The types and values that are acceptable for assignment to the value1204attribute are defined by subclasses.12051206If the disabled attribute is true, this represents the state typically1207represented by browsers by 'greying out' a control. If the disabled1208attribute is true, the Control will raise AttributeError if an attempt is1209made to change its value. In addition, the control will not be considered1210'successful' as defined by the W3C HTML 4 standard -- ie. it will1211contribute no data to the return value of the HTMLForm.click* methods. To1212enable a control, set the disabled attribute to a false value.12131214If the readonly attribute is true, the Control will raise AttributeError if1215an attempt is made to change its value. To make a control writable, set1216the readonly attribute to a false value.12171218All controls have the disabled and readonly attributes, not only those that1219may have the HTML attributes of the same names.12201221On assignment to the value attribute, the following exceptions are raised:1222TypeError, AttributeError (if the value attribute should not be assigned1223to, because the control is disabled, for example) and ValueError.12241225If the name or value attributes are None, or the value is an empty list, or1226if the control is disabled, the control is not successful.12271228Public attributes:12291230type: string describing type of control (see the keys of the1231HTMLForm.type2class dictionary for the allowable values) (readonly)1232name: name of control (readonly)1233value: current value of control (subclasses may allow a single value, a1234sequence of values, or either)1235disabled: disabled state1236readonly: readonly state1237id: value of id HTML attribute12381239"""1240def __init__(self, type, name, attrs, index=None):1241"""1242type: string describing type of control (see the keys of the1243HTMLForm.type2class dictionary for the allowable values)1244name: control name1245attrs: HTML attributes of control's HTML element12461247"""1248raise NotImplementedError()12491250def add_to_form(self, form):1251self._form = form1252form.controls.append(self)12531254def fixup(self):1255pass12561257def is_of_kind(self, kind):1258raise NotImplementedError()12591260def clear(self):1261raise NotImplementedError()12621263def __getattr__(self, name): raise NotImplementedError()1264def __setattr__(self, name, value): raise NotImplementedError()12651266def pairs(self):1267"""Return list of (key, value) pairs suitable for passing to urlencode.1268"""1269return [(k, v) for (i, k, v) in self._totally_ordered_pairs()]12701271def _totally_ordered_pairs(self):1272"""Return list of (key, value, index) tuples.12731274Like pairs, but allows preserving correct ordering even where several1275controls are involved.12761277"""1278raise NotImplementedError()12791280def _write_mime_data(self, mw, name, value):1281"""Write data for a subitem of this control to a MimeWriter."""1282# called by HTMLForm1283mw2 = mw.nextpart()1284mw2.addheader("Content-Disposition",1285'form-data; name="%s"' % name, 1)1286f = mw2.startbody(prefix=0)1287f.write(value)12881289def __str__(self):1290raise NotImplementedError()12911292def get_labels(self):1293"""Return all labels (Label instances) for this control.12941295If the control was surrounded by a <label> tag, that will be the first1296label; all other labels, connected by 'for' and 'id', are in the order1297that appear in the HTML.12981299"""1300res = []1301if self._label:1302res.append(self._label)1303if self.id:1304res.extend(self._form._id_to_labels.get(self.id, ()))1305return res130613071308#---------------------------------------------------1309class ScalarControl(Control):1310"""Control whose value is not restricted to one of a prescribed set.13111312Some ScalarControls don't accept any value attribute. Otherwise, takes a1313single value, which must be string-like.13141315Additional read-only public attribute:13161317attrs: dictionary mapping the names of original HTML attributes of the1318control to their values13191320"""1321def __init__(self, type, name, attrs, index=None):1322self._index = index1323self._label = _get_label(attrs)1324self.__dict__["type"] = type.lower()1325self.__dict__["name"] = name1326self._value = attrs.get("value")1327self.disabled = "disabled" in attrs1328self.readonly = "readonly" in attrs1329self.id = attrs.get("id")13301331self.attrs = attrs.copy()13321333self._clicked = False13341335self._urlparse = _urllib.parse.urlparse1336self._urlunparse = _urllib.parse.urlunparse13371338def __getattr__(self, name):1339if name == "value":1340return self.__dict__["_value"]1341else:1342raise AttributeError("%s instance has no attribute '%s'" %1343(self.__class__.__name__, name))13441345def __setattr__(self, name, value):1346if name == "value":1347if not isstringlike(value):1348raise TypeError("must assign a string")1349elif self.readonly:1350raise AttributeError("control '%s' is readonly" % self.name)1351elif self.disabled:1352raise AttributeError("control '%s' is disabled" % self.name)1353self.__dict__["_value"] = value1354elif name in ("name", "type"):1355raise AttributeError("%s attribute is readonly" % name)1356else:1357self.__dict__[name] = value13581359def _totally_ordered_pairs(self):1360name = self.name1361value = self.value1362if name is None or value is None or self.disabled:1363return []1364return [(self._index, name, value)]13651366def clear(self):1367if self.readonly:1368raise AttributeError("control '%s' is readonly" % self.name)1369self.__dict__["_value"] = None13701371def __str__(self):1372name = self.name1373value = self.value1374if name is None: name = "<None>"1375if value is None: value = "<None>"13761377infos = []1378if self.disabled: infos.append("disabled")1379if self.readonly: infos.append("readonly")1380info = ", ".join(infos)1381if info: info = " (%s)" % info13821383return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)138413851386#---------------------------------------------------1387class TextControl(ScalarControl):1388"""Textual input control.13891390Covers:13911392INPUT/TEXT1393INPUT/PASSWORD1394INPUT/HIDDEN1395TEXTAREA13961397"""1398def __init__(self, type, name, attrs, index=None):1399ScalarControl.__init__(self, type, name, attrs, index)1400if self.type == "hidden": self.readonly = True1401if self._value is None:1402self._value = ""14031404def is_of_kind(self, kind): return kind == "text"14051406#---------------------------------------------------1407class FileControl(ScalarControl):1408"""File upload with INPUT TYPE=FILE.14091410The value attribute of a FileControl is always None. Use add_file instead.14111412Additional public method: add_file14131414"""14151416def __init__(self, type, name, attrs, index=None):1417ScalarControl.__init__(self, type, name, attrs, index)1418self._value = None1419self._upload_data = []14201421def is_of_kind(self, kind): return kind == "file"14221423def clear(self):1424if self.readonly:1425raise AttributeError("control '%s' is readonly" % self.name)1426self._upload_data = []14271428def __setattr__(self, name, value):1429if name in ("value", "name", "type"):1430raise AttributeError("%s attribute is readonly" % name)1431else:1432self.__dict__[name] = value14331434def add_file(self, file_object, content_type=None, filename=None):1435if not hasattr(file_object, "read"):1436raise TypeError("file-like object must have read method")1437if content_type is not None and not isstringlike(content_type):1438raise TypeError("content type must be None or string-like")1439if filename is not None and not isstringlike(filename):1440raise TypeError("filename must be None or string-like")1441if content_type is None:1442content_type = "application/octet-stream"1443self._upload_data.append((file_object, content_type, filename))14441445def _totally_ordered_pairs(self):1446# XXX should it be successful even if unnamed?1447if self.name is None or self.disabled:1448return []1449return [(self._index, self.name, "")]14501451def _write_mime_data(self, mw, _name, _value):1452# called by HTMLForm1453# assert _name == self.name and _value == ''1454if len(self._upload_data) < 2:1455if len(self._upload_data) == 0:1456file_object = _cStringIO()1457content_type = "application/octet-stream"1458filename = ""1459else:1460file_object, content_type, filename = self._upload_data[0]1461if filename is None:1462filename = ""1463mw2 = mw.nextpart()1464fn_part = '; filename="%s"' % filename1465disp = 'form-data; name="%s"%s' % (self.name, fn_part)1466mw2.addheader("Content-Disposition", disp, prefix=1)1467fh = mw2.startbody(content_type, prefix=0)1468fh.write(file_object.read())1469else:1470# multiple files1471mw2 = mw.nextpart()1472disp = 'form-data; name="%s"' % self.name1473mw2.addheader("Content-Disposition", disp, prefix=1)1474fh = mw2.startmultipartbody("mixed", prefix=0)1475for file_object, content_type, filename in self._upload_data:1476mw3 = mw2.nextpart()1477if filename is None:1478filename = ""1479fn_part = '; filename="%s"' % filename1480disp = "file%s" % fn_part1481mw3.addheader("Content-Disposition", disp, prefix=1)1482fh2 = mw3.startbody(content_type, prefix=0)1483fh2.write(file_object.read())1484mw2.lastpart()14851486def __str__(self):1487name = self.name1488if name is None: name = "<None>"14891490if not self._upload_data:1491value = "<No files added>"1492else:1493value = []1494for file, ctype, filename in self._upload_data:1495if filename is None:1496value.append("<Unnamed file>")1497else:1498value.append(filename)1499value = ", ".join(value)15001501info = []1502if self.disabled: info.append("disabled")1503if self.readonly: info.append("readonly")1504info = ", ".join(info)1505if info: info = " (%s)" % info15061507return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)150815091510#---------------------------------------------------1511class IsindexControl(ScalarControl):1512"""ISINDEX control.15131514ISINDEX is the odd-one-out of HTML form controls. In fact, it isn't really1515part of regular HTML forms at all, and predates it. You're only allowed1516one ISINDEX per HTML document. ISINDEX and regular form submission are1517mutually exclusive -- either submit a form, or the ISINDEX.15181519Having said this, since ISINDEX controls may appear in forms (which is1520probably bad HTML), ParseFile / ParseResponse will include them in the1521HTMLForm instances it returns. You can set the ISINDEX's value, as with1522any other control (but note that ISINDEX controls have no name, so you'll1523need to use the type argument of set_value!). When you submit the form,1524the ISINDEX will not be successful (ie., no data will get returned to the1525server as a result of its presence), unless you click on the ISINDEX1526control, in which case the ISINDEX gets submitted instead of the form:15271528form.set_value("my isindex value", type="isindex")1529urllib2.urlopen(form.click(type="isindex"))15301531ISINDEX elements outside of FORMs are ignored. If you want to submit one1532by hand, do it like so:15331534url = _urllib.parse.urljoin(page_uri, "?"+_urllib.parse.quote_plus("my isindex value"))1535result = urllib2.urlopen(url)15361537"""1538def __init__(self, type, name, attrs, index=None):1539ScalarControl.__init__(self, type, name, attrs, index)1540if self._value is None:1541self._value = ""15421543def is_of_kind(self, kind): return kind in ["text", "clickable"]15441545def _totally_ordered_pairs(self):1546return []15471548def _click(self, form, coord, return_type, request_class=_urllib.request.Request):1549# Relative URL for ISINDEX submission: instead of "foo=bar+baz",1550# want "bar+baz".1551# This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is1552# deprecated in 4.01, but it should still say how to submit it).1553# Submission of ISINDEX is explained in the HTML 3.2 spec, though.1554parts = self._urlparse(form.action)1555rest, (query, frag) = parts[:-2], parts[-2:]1556parts = rest + (_urllib.parse.quote_plus(self.value), None)1557url = self._urlunparse(parts)1558req_data = url, None, []15591560if return_type == "pairs":1561return []1562elif return_type == "request_data":1563return req_data1564else:1565return request_class(url)15661567def __str__(self):1568value = self.value1569if value is None: value = "<None>"15701571infos = []1572if self.disabled: infos.append("disabled")1573if self.readonly: infos.append("readonly")1574info = ", ".join(infos)1575if info: info = " (%s)" % info15761577return "<%s(%s)%s>" % (self.__class__.__name__, value, info)157815791580#---------------------------------------------------1581class IgnoreControl(ScalarControl):1582"""Control that we're not interested in.15831584Covers:15851586INPUT/RESET1587BUTTON/RESET1588INPUT/BUTTON1589BUTTON/BUTTON15901591These controls are always unsuccessful, in the terminology of HTML 4 (ie.1592they never require any information to be returned to the server).15931594BUTTON/BUTTON is used to generate events for script embedded in HTML.15951596The value attribute of IgnoreControl is always None.15971598"""1599def __init__(self, type, name, attrs, index=None):1600ScalarControl.__init__(self, type, name, attrs, index)1601self._value = None16021603def is_of_kind(self, kind): return False16041605def __setattr__(self, name, value):1606if name == "value":1607raise AttributeError(1608"control '%s' is ignored, hence read-only" % self.name)1609elif name in ("name", "type"):1610raise AttributeError("%s attribute is readonly" % name)1611else:1612self.__dict__[name] = value161316141615#---------------------------------------------------1616# ListControls16171618# helpers and subsidiary classes16191620class Item:1621def __init__(self, control, attrs, index=None):1622label = _get_label(attrs)1623self.__dict__.update({1624"name": attrs["value"],1625"_labels": label and [label] or [],1626"attrs": attrs,1627"_control": control,1628"disabled": "disabled" in attrs,1629"_selected": False,1630"id": attrs.get("id"),1631"_index": index,1632})1633control.items.append(self)16341635def get_labels(self):1636"""Return all labels (Label instances) for this item.16371638For items that represent radio buttons or checkboxes, if the item was1639surrounded by a <label> tag, that will be the first label; all other1640labels, connected by 'for' and 'id', are in the order that appear in1641the HTML.16421643For items that represent select options, if the option had a label1644attribute, that will be the first label. If the option has contents1645(text within the option tags) and it is not the same as the label1646attribute (if any), that will be a label. There is nothing in the1647spec to my knowledge that makes an option with an id unable to be the1648target of a label's for attribute, so those are included, if any, for1649the sake of consistency and completeness.16501651"""1652res = []1653res.extend(self._labels)1654if self.id:1655res.extend(self._control._form._id_to_labels.get(self.id, ()))1656return res16571658def __getattr__(self, name):1659if name=="selected":1660return self._selected1661raise AttributeError(name)16621663def __setattr__(self, name, value):1664if name == "selected":1665self._control._set_selected_state(self, value)1666elif name == "disabled":1667self.__dict__["disabled"] = bool(value)1668else:1669raise AttributeError(name)16701671def __str__(self):1672res = self.name1673if self.selected:1674res = "*" + res1675if self.disabled:1676res = "(%s)" % res1677return res16781679def __repr__(self):1680# XXX appending the attrs without distinguishing them from name and id1681# is silly1682attrs = [("name", self.name), ("id", self.id)]+self.attrs.items()1683return "<%s %s>" % (1684self.__class__.__name__,1685" ".join(["%s=%r" % (k, v) for k, v in attrs])1686)16871688def disambiguate(items, nr, **kwds):1689msgs = []1690for key, value in kwds.items():1691msgs.append("%s=%r" % (key, value))1692msg = " ".join(msgs)1693if not items:1694raise ItemNotFoundError(msg)1695if nr is None:1696if len(items) > 1:1697raise AmbiguityError(msg)1698nr = 01699if len(items) <= nr:1700raise ItemNotFoundError(msg)1701return items[nr]17021703class ListControl(Control):1704"""Control representing a sequence of items.17051706The value attribute of a ListControl represents the successful list items1707in the control. The successful list items are those that are selected and1708not disabled.17091710ListControl implements both list controls that take a length-1 value1711(single-selection) and those that take length >1 values1712(multiple-selection).17131714ListControls accept sequence values only. Some controls only accept1715sequences of length 0 or 1 (RADIO, and single-selection SELECT).1716In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes1717and multiple-selection SELECTs (those having the "multiple" HTML attribute)1718accept sequences of any length.17191720Note the following mistake:17211722control.value = some_value1723assert control.value == some_value # not necessarily true17241725The reason for this is that the value attribute always gives the list items1726in the order they were listed in the HTML.17271728ListControl items can also be referred to by their labels instead of names.1729Use the label argument to .get(), and the .set_value_by_label(),1730.get_value_by_label() methods.17311732Note that, rather confusingly, though SELECT controls are represented in1733HTML by SELECT elements (which contain OPTION elements, representing1734individual list items), CHECKBOXes and RADIOs are not represented by *any*1735element. Instead, those controls are represented by a collection of INPUT1736elements. For example, this is a SELECT control, named "control1":17371738<select name="control1">1739<option>foo</option>1740<option value="1">bar</option>1741</select>17421743and this is a CHECKBOX control, named "control2":17441745<input type="checkbox" name="control2" value="foo" id="cbe1">1746<input type="checkbox" name="control2" value="bar" id="cbe2">17471748The id attribute of a CHECKBOX or RADIO ListControl is always that of its1749first element (for example, "cbe1" above).175017511752Additional read-only public attribute: multiple.17531754"""17551756# ListControls are built up by the parser from their component items by1757# creating one ListControl per item, consolidating them into a single1758# master ListControl held by the HTMLForm:17591760# -User calls form.new_control(...)1761# -Form creates Control, and calls control.add_to_form(self).1762# -Control looks for a Control with the same name and type in the form,1763# and if it finds one, merges itself with that control by calling1764# control.merge_control(self). The first Control added to the form, of1765# a particular name and type, is the only one that survives in the1766# form.1767# -Form calls control.fixup for all its controls. ListControls in the1768# form know they can now safely pick their default values.17691770# To create a ListControl without an HTMLForm, use:17711772# control.merge_control(new_control)17731774# (actually, it's much easier just to use ParseFile)17751776_label = None17771778def __init__(self, type, name, attrs={}, select_default=False,1779called_as_base_class=False, index=None):1780"""1781select_default: for RADIO and multiple-selection SELECT controls, pick1782the first item as the default if no 'selected' HTML attribute is1783present17841785"""1786if not called_as_base_class:1787raise NotImplementedError()17881789self.__dict__["type"] = type.lower()1790self.__dict__["name"] = name1791self._value = attrs.get("value")1792self.disabled = False1793self.readonly = False1794self.id = attrs.get("id")1795self._closed = False17961797# As Controls are merged in with .merge_control(), self.attrs will1798# refer to each Control in turn -- always the most recently merged1799# control. Each merged-in Control instance corresponds to a single1800# list item: see ListControl.__doc__.1801self.items = []1802self._form = None18031804self._select_default = select_default1805self._clicked = False18061807def clear(self):1808self.value = []18091810def is_of_kind(self, kind):1811if kind == "list":1812return True1813elif kind == "multilist":1814return bool(self.multiple)1815elif kind == "singlelist":1816return not self.multiple1817else:1818return False18191820def get_items(self, name=None, label=None, id=None,1821exclude_disabled=False):1822"""Return matching items by name or label.18231824For argument docs, see the docstring for .get()18251826"""1827if name is not None and not isstringlike(name):1828raise TypeError("item name must be string-like")1829if label is not None and not isstringlike(label):1830raise TypeError("item label must be string-like")1831if id is not None and not isstringlike(id):1832raise TypeError("item id must be string-like")1833items = [] # order is important1834compat = self._form.backwards_compat1835for o in self.items:1836if exclude_disabled and o.disabled:1837continue1838if name is not None and o.name != name:1839continue1840if label is not None:1841for l in o.get_labels():1842if ((compat and l.text == label) or1843(not compat and l.text.find(label) > -1)):1844break1845else:1846continue1847if id is not None and o.id != id:1848continue1849items.append(o)1850return items18511852def get(self, name=None, label=None, id=None, nr=None,1853exclude_disabled=False):1854"""Return item by name or label, disambiguating if necessary with nr.18551856All arguments must be passed by name, with the exception of 'name',1857which may be used as a positional argument.18581859If name is specified, then the item must have the indicated name.18601861If label is specified, then the item must have a label whose1862whitespace-compressed, stripped, text substring-matches the indicated1863label string (eg. label="please choose" will match1864" Do please choose an item ").18651866If id is specified, then the item must have the indicated id.18671868nr is an optional 0-based index of the items matching the query.18691870If nr is the default None value and more than item is found, raises1871AmbiguityError (unless the HTMLForm instance's backwards_compat1872attribute is true).18731874If no item is found, or if items are found but nr is specified and not1875found, raises ItemNotFoundError.18761877Optionally excludes disabled items.18781879"""1880if nr is None and self._form.backwards_compat:1881nr = 0 # :-/1882items = self.get_items(name, label, id, exclude_disabled)1883return disambiguate(items, nr, name=name, label=label, id=id)18841885def _get(self, name, by_label=False, nr=None, exclude_disabled=False):1886# strictly for use by deprecated methods1887if by_label:1888name, label = None, name1889else:1890name, label = name, None1891return self.get(name, label, nr, exclude_disabled)18921893def toggle(self, name, by_label=False, nr=None):1894"""Deprecated: given a name or label and optional disambiguating index1895nr, toggle the matching item's selection.18961897Selecting items follows the behavior described in the docstring of the1898'get' method.18991900if the item is disabled, or this control is disabled or readonly,1901raise AttributeError.19021903"""1904deprecation(1905"item = control.get(...); item.selected = not item.selected")1906o = self._get(name, by_label, nr)1907self._set_selected_state(o, not o.selected)19081909def set(self, selected, name, by_label=False, nr=None):1910"""Deprecated: given a name or label and optional disambiguating index1911nr, set the matching item's selection to the bool value of selected.19121913Selecting items follows the behavior described in the docstring of the1914'get' method.19151916if the item is disabled, or this control is disabled or readonly,1917raise AttributeError.19181919"""1920deprecation(1921"control.get(...).selected = <boolean>")1922self._set_selected_state(self._get(name, by_label, nr), selected)19231924def _set_selected_state(self, item, action):1925# action:1926# bool False: off1927# bool True: on1928if self.disabled:1929raise AttributeError("control '%s' is disabled" % self.name)1930if self.readonly:1931raise AttributeError("control '%s' is readonly" % self.name)1932action = bool(action)1933compat = self._form.backwards_compat1934if not compat and item.disabled:1935raise AttributeError("item is disabled")1936else:1937if compat and item.disabled and action:1938raise AttributeError("item is disabled")1939if self.multiple:1940item.__dict__["_selected"] = action1941else:1942if not action:1943item.__dict__["_selected"] = False1944else:1945for o in self.items:1946o.__dict__["_selected"] = False1947item.__dict__["_selected"] = True19481949def toggle_single(self, by_label=None):1950"""Deprecated: toggle the selection of the single item in this control.19511952Raises ItemCountError if the control does not contain only one item.19531954by_label argument is ignored, and included only for backwards1955compatibility.19561957"""1958deprecation(1959"control.items[0].selected = not control.items[0].selected")1960if len(self.items) != 1:1961raise ItemCountError(1962"'%s' is not a single-item control" % self.name)1963item = self.items[0]1964self._set_selected_state(item, not item.selected)19651966def set_single(self, selected, by_label=None):1967"""Deprecated: set the selection of the single item in this control.19681969Raises ItemCountError if the control does not contain only one item.19701971by_label argument is ignored, and included only for backwards1972compatibility.19731974"""1975deprecation(1976"control.items[0].selected = <boolean>")1977if len(self.items) != 1:1978raise ItemCountError(1979"'%s' is not a single-item control" % self.name)1980self._set_selected_state(self.items[0], selected)19811982def get_item_disabled(self, name, by_label=False, nr=None):1983"""Get disabled state of named list item in a ListControl."""1984deprecation(1985"control.get(...).disabled")1986return self._get(name, by_label, nr).disabled19871988def set_item_disabled(self, disabled, name, by_label=False, nr=None):1989"""Set disabled state of named list item in a ListControl.19901991disabled: boolean disabled state19921993"""1994deprecation(1995"control.get(...).disabled = <boolean>")1996self._get(name, by_label, nr).disabled = disabled19971998def set_all_items_disabled(self, disabled):1999"""Set disabled state of all list items in a ListControl.20002001disabled: boolean disabled state20022003"""2004for o in self.items:2005o.disabled = disabled20062007def get_item_attrs(self, name, by_label=False, nr=None):2008"""Return dictionary of HTML attributes for a single ListControl item.20092010The HTML element types that describe list items are: OPTION for SELECT2011controls, INPUT for the rest. These elements have HTML attributes that2012you may occasionally want to know about -- for example, the "alt" HTML2013attribute gives a text string describing the item (graphical browsers2014usually display this as a tooltip).20152016The returned dictionary maps HTML attribute names to values. The names2017and values are taken from the original HTML.20182019"""2020deprecation(2021"control.get(...).attrs")2022return self._get(name, by_label, nr).attrs20232024def close_control(self):2025self._closed = True20262027def add_to_form(self, form):2028assert self._form is None or form == self._form, (2029"can't add control to more than one form")2030self._form = form2031if self.name is None:2032# always count nameless elements as separate controls2033Control.add_to_form(self, form)2034else:2035for ii in xrange(len(form.controls)-1, -1, -1):2036control = form.controls[ii]2037if control.name == self.name and control.type == self.type:2038if control._closed:2039Control.add_to_form(self, form)2040else:2041control.merge_control(self)2042break2043else:2044Control.add_to_form(self, form)20452046def merge_control(self, control):2047assert bool(control.multiple) == bool(self.multiple)2048# usually, isinstance(control, self.__class__)2049self.items.extend(control.items)20502051def fixup(self):2052"""2053ListControls are built up from component list items (which are also2054ListControls) during parsing. This method should be called after all2055items have been added. See ListControl.__doc__ for the reason this is2056required.20572058"""2059# Need to set default selection where no item was indicated as being2060# selected by the HTML:20612062# CHECKBOX:2063# Nothing should be selected.2064# SELECT/single, SELECT/multiple and RADIO:2065# RFC 1866 (HTML 2.0): says first item should be selected.2066# W3C HTML 4.01 Specification: says that client behaviour is2067# undefined in this case. For RADIO, exactly one must be selected,2068# though which one is undefined.2069# Both Netscape and Microsoft Internet Explorer (IE) choose first2070# item for SELECT/single. However, both IE5 and Mozilla (both 1.02071# and Firebird 0.6) leave all items unselected for RADIO and2072# SELECT/multiple.20732074# Since both Netscape and IE all choose the first item for2075# SELECT/single, we do the same. OTOH, both Netscape and IE2076# leave SELECT/multiple with nothing selected, in violation of RFC 18662077# (but not in violation of the W3C HTML 4 standard); the same is true2078# of RADIO (which *is* in violation of the HTML 4 standard). We follow2079# RFC 1866 if the _select_default attribute is set, and Netscape and IE2080# otherwise. RFC 1866 and HTML 4 are always violated insofar as you2081# can deselect all items in a RadioControl.20822083for o in self.items:2084# set items' controls to self, now that we've merged2085o.__dict__["_control"] = self20862087def __getattr__(self, name):2088if name == "value":2089compat = self._form.backwards_compat2090if self.name is None:2091return []2092return [o.name for o in self.items if o.selected and2093(not o.disabled or compat)]2094else:2095raise AttributeError("%s instance has no attribute '%s'" %2096(self.__class__.__name__, name))20972098def __setattr__(self, name, value):2099if name == "value":2100if self.disabled:2101raise AttributeError("control '%s' is disabled" % self.name)2102if self.readonly:2103raise AttributeError("control '%s' is readonly" % self.name)2104self._set_value(value)2105elif name in ("name", "type", "multiple"):2106raise AttributeError("%s attribute is readonly" % name)2107else:2108self.__dict__[name] = value21092110def _set_value(self, value):2111if value is None or isstringlike(value):2112raise TypeError("ListControl, must set a sequence")2113if not value:2114compat = self._form.backwards_compat2115for o in self.items:2116if not o.disabled or compat:2117o.selected = False2118elif self.multiple:2119self._multiple_set_value(value)2120elif len(value) > 1:2121raise ItemCountError(2122"single selection list, must set sequence of "2123"length 0 or 1")2124else:2125self._single_set_value(value)21262127def _get_items(self, name, target=1):2128all_items = self.get_items(name)2129items = [o for o in all_items if not o.disabled]2130if len(items) < target:2131if len(all_items) < target:2132raise ItemNotFoundError(2133"insufficient items with name %r" % name)2134else:2135raise AttributeError(2136"insufficient non-disabled items with name %s" % name)2137on = []2138off = []2139for o in items:2140if o.selected:2141on.append(o)2142else:2143off.append(o)2144return on, off21452146def _single_set_value(self, value):2147assert len(value) == 12148on, off = self._get_items(value[0])2149assert len(on) <= 12150if not on:2151off[0].selected = True21522153def _multiple_set_value(self, value):2154compat = self._form.backwards_compat2155turn_on = [] # transactional-ish2156turn_off = [item for item in self.items if2157item.selected and (not item.disabled or compat)]2158names = {}2159for nn in value:2160if nn in names.keys():2161names[nn] += 12162else:2163names[nn] = 12164for name, count in names.items():2165on, off = self._get_items(name, count)2166for i in xrange(count):2167if on:2168item = on[0]2169del on[0]2170del turn_off[turn_off.index(item)]2171else:2172item = off[0]2173del off[0]2174turn_on.append(item)2175for item in turn_off:2176item.selected = False2177for item in turn_on:2178item.selected = True21792180def set_value_by_label(self, value):2181"""Set the value of control by item labels.21822183value is expected to be an iterable of strings that are substrings of2184the item labels that should be selected. Before substring matching is2185performed, the original label text is whitespace-compressed2186(consecutive whitespace characters are converted to a single space2187character) and leading and trailing whitespace is stripped. Ambiguous2188labels are accepted without complaint if the form's backwards_compat is2189True; otherwise, it will not complain as long as all ambiguous labels2190share the same item name (e.g. OPTION value).21912192"""2193if isstringlike(value):2194raise TypeError(value)2195if not self.multiple and len(value) > 1:2196raise ItemCountError(2197"single selection list, must set sequence of "2198"length 0 or 1")2199items = []2200for nn in value:2201found = self.get_items(label=nn)2202if len(found) > 1:2203if not self._form.backwards_compat:2204# ambiguous labels are fine as long as item names (e.g.2205# OPTION values) are same2206opt_name = found[0].name2207if [o for o in found[1:] if o.name != opt_name]:2208raise AmbiguityError(nn)2209else:2210# OK, we'll guess :-( Assume first available item.2211found = found[:1]2212for o in found:2213# For the multiple-item case, we could try to be smarter,2214# saving them up and trying to resolve, but that's too much.2215if self._form.backwards_compat or o not in items:2216items.append(o)2217break2218else: # all of them are used2219raise ItemNotFoundError(nn)2220# now we have all the items that should be on2221# let's just turn everything off and then back on.2222self.value = []2223for o in items:2224o.selected = True22252226def get_value_by_label(self):2227"""Return the value of the control as given by normalized labels."""2228res = []2229compat = self._form.backwards_compat2230for o in self.items:2231if (not o.disabled or compat) and o.selected:2232for l in o.get_labels():2233if l.text:2234res.append(l.text)2235break2236else:2237res.append(None)2238return res22392240def possible_items(self, by_label=False):2241"""Deprecated: return the names or labels of all possible items.22422243Includes disabled items, which may be misleading for some use cases.22442245"""2246deprecation(2247"[item.name for item in self.items]")2248if by_label:2249res = []2250for o in self.items:2251for l in o.get_labels():2252if l.text:2253res.append(l.text)2254break2255else:2256res.append(None)2257return res2258return [o.name for o in self.items]22592260def _totally_ordered_pairs(self):2261if self.disabled or self.name is None:2262return []2263else:2264return [(o._index, self.name, o.name) for o in self.items2265if o.selected and not o.disabled]22662267def __str__(self):2268name = self.name2269if name is None: name = "<None>"22702271display = [str(o) for o in self.items]22722273infos = []2274if self.disabled: infos.append("disabled")2275if self.readonly: infos.append("readonly")2276info = ", ".join(infos)2277if info: info = " (%s)" % info22782279return "<%s(%s=[%s])%s>" % (self.__class__.__name__,2280name, ", ".join(display), info)228122822283class RadioControl(ListControl):2284"""2285Covers:22862287INPUT/RADIO22882289"""2290def __init__(self, type, name, attrs, select_default=False, index=None):2291attrs.setdefault("value", "on")2292ListControl.__init__(self, type, name, attrs, select_default,2293called_as_base_class=True, index=index)2294self.__dict__["multiple"] = False2295o = Item(self, attrs, index)2296o.__dict__["_selected"] = "checked" in attrs22972298def fixup(self):2299ListControl.fixup(self)2300found = [o for o in self.items if o.selected and not o.disabled]2301if not found:2302if self._select_default:2303for o in self.items:2304if not o.disabled:2305o.selected = True2306break2307else:2308# Ensure only one item selected. Choose the last one,2309# following IE and Firefox.2310for o in found[:-1]:2311o.selected = False23122313def get_labels(self):2314return []23152316class CheckboxControl(ListControl):2317"""2318Covers:23192320INPUT/CHECKBOX23212322"""2323def __init__(self, type, name, attrs, select_default=False, index=None):2324attrs.setdefault("value", "on")2325ListControl.__init__(self, type, name, attrs, select_default,2326called_as_base_class=True, index=index)2327self.__dict__["multiple"] = True2328o = Item(self, attrs, index)2329o.__dict__["_selected"] = "checked" in attrs23302331def get_labels(self):2332return []233323342335class SelectControl(ListControl):2336"""2337Covers:23382339SELECT (and OPTION)234023412342OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.23432344SELECT control values and labels are subject to some messy defaulting2345rules. For example, if the HTML representation of the control is:23462347<SELECT name=year>2348<OPTION value=0 label="2002">current year</OPTION>2349<OPTION value=1>2001</OPTION>2350<OPTION>2000</OPTION>2351</SELECT>23522353The items, in order, have labels "2002", "2001" and "2000", whereas their2354names (the OPTION values) are "0", "1" and "2000" respectively. Note that2355the value of the last OPTION in this example defaults to its contents, as2356specified by RFC 1866, as do the labels of the second and third OPTIONs.23572358The OPTION labels are sometimes more meaningful than the OPTION values,2359which can make for more maintainable code.23602361Additional read-only public attribute: attrs23622363The attrs attribute is a dictionary of the original HTML attributes of the2364SELECT element. Other ListControls do not have this attribute, because in2365other cases the control as a whole does not correspond to any single HTML2366element. control.get(...).attrs may be used as usual to get at the HTML2367attributes of the HTML elements corresponding to individual list items (for2368SELECT controls, these are OPTION elements).23692370Another special case is that the Item.attrs dictionaries have a special key2371"contents" which does not correspond to any real HTML attribute, but rather2372contains the contents of the OPTION element:23732374<OPTION>this bit</OPTION>23752376"""2377# HTML attributes here are treated slightly differently from other list2378# controls:2379# -The SELECT HTML attributes dictionary is stuffed into the OPTION2380# HTML attributes dictionary under the "__select" key.2381# -The content of each OPTION element is stored under the special2382# "contents" key of the dictionary.2383# After all this, the dictionary is passed to the SelectControl constructor2384# as the attrs argument, as usual. However:2385# -The first SelectControl constructed when building up a SELECT control2386# has a constructor attrs argument containing only the __select key -- so2387# this SelectControl represents an empty SELECT control.2388# -Subsequent SelectControls have both OPTION HTML-attribute in attrs and2389# the __select dictionary containing the SELECT HTML-attributes.23902391def __init__(self, type, name, attrs, select_default=False, index=None):2392# fish out the SELECT HTML attributes from the OPTION HTML attributes2393# dictionary2394self.attrs = attrs["__select"].copy()2395self.__dict__["_label"] = _get_label(self.attrs)2396self.__dict__["id"] = self.attrs.get("id")2397self.__dict__["multiple"] = "multiple" in self.attrs2398# the majority of the contents, label, and value dance already happened2399contents = attrs.get("contents")2400attrs = attrs.copy()2401del attrs["__select"]24022403ListControl.__init__(self, type, name, self.attrs, select_default,2404called_as_base_class=True, index=index)2405self.disabled = "disabled" in self.attrs2406self.readonly = "readonly" in self.attrs2407if "value" in attrs:2408# otherwise it is a marker 'select started' token2409o = Item(self, attrs, index)2410o.__dict__["_selected"] = "selected" in attrs2411# add 'label' label and contents label, if different. If both are2412# provided, the 'label' label is used for display in HTML2413# 4.0-compliant browsers (and any lower spec? not sure) while the2414# contents are used for display in older or less-compliant2415# browsers. We make label objects for both, if the values are2416# different.2417label = attrs.get("label")2418if label:2419o._labels.append(Label({"__text": label}))2420if contents and contents != label:2421o._labels.append(Label({"__text": contents}))2422elif contents:2423o._labels.append(Label({"__text": contents}))24242425def fixup(self):2426ListControl.fixup(self)2427# Firefox doesn't exclude disabled items from those considered here2428# (i.e. from 'found', for both branches of the if below). Note that2429# IE6 doesn't support the disabled attribute on OPTIONs at all.2430found = [o for o in self.items if o.selected]2431if not found:2432if not self.multiple or self._select_default:2433for o in self.items:2434if not o.disabled:2435was_disabled = self.disabled2436self.disabled = False2437try:2438o.selected = True2439finally:2440o.disabled = was_disabled2441break2442elif not self.multiple:2443# Ensure only one item selected. Choose the last one,2444# following IE and Firefox.2445for o in found[:-1]:2446o.selected = False244724482449#---------------------------------------------------2450class SubmitControl(ScalarControl):2451"""2452Covers:24532454INPUT/SUBMIT2455BUTTON/SUBMIT24562457"""2458def __init__(self, type, name, attrs, index=None):2459ScalarControl.__init__(self, type, name, attrs, index)2460# IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it2461# blank, Konqueror 3.1 defaults to "Submit". HTML spec. doesn't seem2462# to define this.2463if self.value is None and not self.disabled and not self.readonly: self.value = ""2464self.readonly = True24652466def get_labels(self):2467res = []2468if self.value:2469res.append(Label({"__text": self.value}))2470res.extend(ScalarControl.get_labels(self))2471return res24722473def is_of_kind(self, kind): return kind == "clickable"24742475def _click(self, form, coord, return_type, request_class=_urllib.request.Request):2476self._clicked = coord2477r = form._switch_click(return_type, request_class)2478self._clicked = False2479return r24802481def _totally_ordered_pairs(self):2482if not self._clicked:2483return []2484return ScalarControl._totally_ordered_pairs(self)248524862487#---------------------------------------------------2488class ImageControl(SubmitControl):2489"""2490Covers:24912492INPUT/IMAGE24932494Coordinates are specified using one of the HTMLForm.click* methods.24952496"""2497def __init__(self, type, name, attrs, index=None):2498SubmitControl.__init__(self, type, name, attrs, index)2499self.readonly = False25002501def _totally_ordered_pairs(self):2502clicked = self._clicked2503if self.disabled or not clicked:2504return []2505name = self.name2506if name is None: return []2507pairs = [2508(self._index, "%s.x" % name, str(clicked[0])),2509(self._index+1, "%s.y" % name, str(clicked[1])),2510]2511value = self._value2512if value:2513pairs.append((self._index+2, name, value))2514return pairs25152516get_labels = ScalarControl.get_labels25172518# aliases, just to make str(control) and str(form) clearer2519class PasswordControl(TextControl): pass2520class HiddenControl(TextControl): pass2521class TextareaControl(TextControl): pass2522class SubmitButtonControl(SubmitControl): pass252325242525def is_listcontrol(control): return control.is_of_kind("list")252625272528class HTMLForm:2529"""Represents a single HTML <form> ... </form> element.25302531A form consists of a sequence of controls that usually have names, and2532which can take on various values. The values of the various types of2533controls represent variously: text, zero-or-one-of-many or many-of-many2534choices, and files to be uploaded. Some controls can be clicked on to2535submit the form, and clickable controls' values sometimes include the2536coordinates of the click.25372538Forms can be filled in with data to be returned to the server, and then2539submitted, using the click method to generate a request object suitable for2540passing to urllib2.urlopen (or the click_request_data or click_pairs2541methods if you're not using urllib2).25422543import ClientForm2544forms = ClientForm.ParseFile(html, base_uri)2545form = forms[0]25462547form["query"] = "Python"2548form.find_control("nr_results").get("lots").selected = True25492550response = urllib2.urlopen(form.click())25512552Usually, HTMLForm instances are not created directly. Instead, the2553ParseFile or ParseResponse factory functions are used. If you do construct2554HTMLForm objects yourself, however, note that an HTMLForm instance is only2555properly initialised after the fixup method has been called (ParseFile and2556ParseResponse do this for you). See ListControl.__doc__ for the reason2557this is required.25582559Indexing a form (form["control_name"]) returns the named Control's value2560attribute. Assignment to a form index (form["control_name"] = something)2561is equivalent to assignment to the named Control's value attribute. If you2562need to be more specific than just supplying the control's name, use the2563set_value and get_value methods.25642565ListControl values are lists of item names (specifically, the names of the2566items that are selected and not disabled, and hence are "successful" -- ie.2567cause data to be returned to the server). The list item's name is the2568value of the corresponding HTML element's"value" attribute.25692570Example:25712572<INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>2573<INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>25742575defines a CHECKBOX control with name "cheeses" which has two items, named2576"leicester" and "cheddar".25772578Another example:25792580<SELECT name="more_cheeses">2581<OPTION>1</OPTION>2582<OPTION value="2" label="CHEDDAR">cheddar</OPTION>2583</SELECT>25842585defines a SELECT control with name "more_cheeses" which has two items,2586named "1" and "2" (because the OPTION element's value HTML attribute2587defaults to the element contents -- see SelectControl.__doc__ for more on2588these defaulting rules).25892590To select, deselect or otherwise manipulate individual list items, use the2591HTMLForm.find_control() and ListControl.get() methods. To set the whole2592value, do as for any other control: use indexing or the set_/get_value2593methods.25942595Example:25962597# select *only* the item named "cheddar"2598form["cheeses"] = ["cheddar"]2599# select "cheddar", leave other items unaffected2600form.find_control("cheeses").get("cheddar").selected = True26012602Some controls (RADIO and SELECT without the multiple attribute) can only2603have zero or one items selected at a time. Some controls (CHECKBOX and2604SELECT with the multiple attribute) can have multiple items selected at a2605time. To set the whole value of a ListControl, assign a sequence to a form2606index:26072608form["cheeses"] = ["cheddar", "leicester"]26092610If the ListControl is not multiple-selection, the assigned list must be of2611length one.26122613To check if a control has an item, if an item is selected, or if an item is2614successful (selected and not disabled), respectively:26152616"cheddar" in [item.name for item in form.find_control("cheeses").items]2617"cheddar" in [item.name for item in form.find_control("cheeses").items and2618item.selected]2619"cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses"))26202621Note that some list items may be disabled (see below).26222623Note the following mistake:26242625form[control_name] = control_value2626assert form[control_name] == control_value # not necessarily true26272628The reason for this is that form[control_name] always gives the list items2629in the order they were listed in the HTML.26302631List items (hence list values, too) can be referred to in terms of list2632item labels rather than list item names using the appropriate label2633arguments. Note that each item may have several labels.26342635The question of default values of OPTION contents, labels and values is2636somewhat complicated: see SelectControl.__doc__ and2637ListControl.get_item_attrs.__doc__ if you think you need to know.26382639Controls can be disabled or readonly. In either case, the control's value2640cannot be changed until you clear those flags (see example below).2641Disabled is the state typically represented by browsers by 'greying out' a2642control. Disabled controls are not 'successful' -- they don't cause data2643to get returned to the server. Readonly controls usually appear in2644browsers as read-only text boxes. Readonly controls are successful. List2645items can also be disabled. Attempts to select or deselect disabled items2646fail with AttributeError.26472648If a lot of controls are readonly, it can be useful to do this:26492650form.set_all_readonly(False)26512652To clear a control's value attribute, so that it is not successful (until a2653value is subsequently set):26542655form.clear("cheeses")26562657More examples:26582659control = form.find_control("cheeses")2660control.disabled = False2661control.readonly = False2662control.get("gruyere").disabled = True2663control.items[0].selected = True26642665See the various Control classes for further documentation. Many methods2666take name, type, kind, id, label and nr arguments to specify the control to2667be operated on: see HTMLForm.find_control.__doc__.26682669ControlNotFoundError (subclass of ValueError) is raised if the specified2670control can't be found. This includes occasions where a non-ListControl2671is found, but the method (set, for example) requires a ListControl.2672ItemNotFoundError (subclass of ValueError) is raised if a list item can't2673be found. ItemCountError (subclass of ValueError) is raised if an attempt2674is made to select more than one item and the control doesn't allow that, or2675set/get_single are called and the control contains more than one item.2676AttributeError is raised if a control or item is readonly or disabled and2677an attempt is made to alter its value.26782679Security note: Remember that any passwords you store in HTMLForm instances2680will be saved to disk in the clear if you pickle them (directly or2681indirectly). The simplest solution to this is to avoid pickling HTMLForm2682objects. You could also pickle before filling in any password, or just set2683the password to "" before pickling.268426852686Public attributes:26872688action: full (absolute URI) form action2689method: "GET" or "POST"2690enctype: form transfer encoding MIME type2691name: name of form (None if no name was specified)2692attrs: dictionary mapping original HTML form attributes to their values26932694controls: list of Control instances; do not alter this list2695(instead, call form.new_control to make a Control and add it to the2696form, or control.add_to_form if you already have a Control instance)2697269826992700Methods for form filling:2701-------------------------27022703Most of the these methods have very similar arguments. See2704HTMLForm.find_control.__doc__ for details of the name, type, kind, label2705and nr arguments.27062707def find_control(self,2708name=None, type=None, kind=None, id=None, predicate=None,2709nr=None, label=None)27102711get_value(name=None, type=None, kind=None, id=None, nr=None,2712by_label=False, # by_label is deprecated2713label=None)2714set_value(value,2715name=None, type=None, kind=None, id=None, nr=None,2716by_label=False, # by_label is deprecated2717label=None)27182719clear_all()2720clear(name=None, type=None, kind=None, id=None, nr=None, label=None)27212722set_all_readonly(readonly)272327242725Method applying only to FileControls:27262727add_file(file_object,2728content_type="application/octet-stream", filename=None,2729name=None, id=None, nr=None, label=None)273027312732Methods applying only to clickable controls:27332734click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)2735click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1),2736label=None)2737click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)27382739"""27402741type2class = {2742"text": TextControl,2743"password": PasswordControl,2744"hidden": HiddenControl,2745"textarea": TextareaControl,27462747"isindex": IsindexControl,27482749"file": FileControl,27502751"button": IgnoreControl,2752"buttonbutton": IgnoreControl,2753"reset": IgnoreControl,2754"resetbutton": IgnoreControl,27552756"submit": SubmitControl,2757"submitbutton": SubmitButtonControl,2758"image": ImageControl,27592760"radio": RadioControl,2761"checkbox": CheckboxControl,2762"select": SelectControl,2763}27642765#---------------------------------------------------2766# Initialisation. Use ParseResponse / ParseFile instead.27672768def __init__(self, action, method="GET",2769enctype=None,2770name=None, attrs=None,2771request_class=_urllib.request.Request,2772forms=None, labels=None, id_to_labels=None,2773backwards_compat=True):2774"""2775In the usual case, use ParseResponse (or ParseFile) to create new2776HTMLForm objects.27772778action: full (absolute URI) form action2779method: "GET" or "POST"2780enctype: form transfer encoding MIME type2781name: name of form2782attrs: dictionary mapping original HTML form attributes to their values27832784"""2785self.action = action2786self.method = method2787self.enctype = enctype or "application/x-www-form-urlencoded"2788self.name = name2789if attrs is not None:2790self.attrs = attrs.copy()2791else:2792self.attrs = {}2793self.controls = []2794self._request_class = request_class27952796# these attributes are used by zope.testbrowser2797self._forms = forms # this is a semi-public API!2798self._labels = labels # this is a semi-public API!2799self._id_to_labels = id_to_labels # this is a semi-public API!28002801self.backwards_compat = backwards_compat # note __setattr__28022803self._urlunparse = _urllib.parse.urlunparse2804self._urlparse = _urllib.parse.urlparse28052806def __getattr__(self, name):2807if name == "backwards_compat":2808return self._backwards_compat2809return getattr(HTMLForm, name)28102811def __setattr__(self, name, value):2812# yuck2813if name == "backwards_compat":2814name = "_backwards_compat"2815value = bool(value)2816for cc in self.controls:2817try:2818items = cc.items2819except AttributeError:2820continue2821else:2822for ii in items:2823for ll in ii.get_labels():2824ll._backwards_compat = value2825self.__dict__[name] = value28262827def new_control(self, type, name, attrs,2828ignore_unknown=False, select_default=False, index=None):2829"""Adds a new control to the form.28302831This is usually called by ParseFile and ParseResponse. Don't call it2832youself unless you're building your own Control instances.28332834Note that controls representing lists of items are built up from2835controls holding only a single list item. See ListControl.__doc__ for2836further information.28372838type: type of control (see Control.__doc__ for a list)2839attrs: HTML attributes of control2840ignore_unknown: if true, use a dummy Control instance for controls of2841unknown type; otherwise, use a TextControl2842select_default: for RADIO and multiple-selection SELECT controls, pick2843the first item as the default if no 'selected' HTML attribute is2844present (this defaulting happens when the HTMLForm.fixup method is2845called)2846index: index of corresponding element in HTML (see2847MoreFormTests.test_interspersed_controls for motivation)28482849"""2850type = type.lower()2851klass = self.type2class.get(type)2852if klass is None:2853if ignore_unknown:2854klass = IgnoreControl2855else:2856klass = TextControl28572858a = attrs.copy()2859if issubclass(klass, ListControl):2860control = klass(type, name, a, select_default, index)2861else:2862control = klass(type, name, a, index)28632864if type == "select" and len(attrs) == 1:2865for ii in xrange(len(self.controls)-1, -1, -1):2866ctl = self.controls[ii]2867if ctl.type == "select":2868ctl.close_control()2869break28702871control.add_to_form(self)2872control._urlparse = self._urlparse2873control._urlunparse = self._urlunparse28742875def fixup(self):2876"""Normalise form after all controls have been added.28772878This is usually called by ParseFile and ParseResponse. Don't call it2879youself unless you're building your own Control instances.28802881This method should only be called once, after all controls have been2882added to the form.28832884"""2885for control in self.controls:2886control.fixup()2887self.backwards_compat = self._backwards_compat28882889#---------------------------------------------------2890def __str__(self):2891header = "%s%s %s %s" % (2892(self.name and self.name+" " or ""),2893self.method, self.action, self.enctype)2894rep = [header]2895for control in self.controls:2896rep.append(" %s" % str(control))2897return "<%s>" % "\n".join(rep)28982899#---------------------------------------------------2900# Form-filling methods.29012902def __getitem__(self, name):2903return self.find_control(name).value2904def __contains__(self, name):2905return bool(self.find_control(name))2906def __setitem__(self, name, value):2907control = self.find_control(name)2908try:2909control.value = value2910except AttributeError as e:2911raise ValueError(str(e))29122913def get_value(self,2914name=None, type=None, kind=None, id=None, nr=None,2915by_label=False, # by_label is deprecated2916label=None):2917"""Return value of control.29182919If only name and value arguments are supplied, equivalent to29202921form[name]29222923"""2924if by_label:2925deprecation("form.get_value_by_label(...)")2926c = self.find_control(name, type, kind, id, label=label, nr=nr)2927if by_label:2928try:2929meth = c.get_value_by_label2930except AttributeError:2931raise NotImplementedError(2932"control '%s' does not yet support by_label" % c.name)2933else:2934return meth()2935else:2936return c.value2937def set_value(self, value,2938name=None, type=None, kind=None, id=None, nr=None,2939by_label=False, # by_label is deprecated2940label=None):2941"""Set value of control.29422943If only name and value arguments are supplied, equivalent to29442945form[name] = value29462947"""2948if by_label:2949deprecation("form.get_value_by_label(...)")2950c = self.find_control(name, type, kind, id, label=label, nr=nr)2951if by_label:2952try:2953meth = c.set_value_by_label2954except AttributeError:2955raise NotImplementedError(2956"control '%s' does not yet support by_label" % c.name)2957else:2958meth(value)2959else:2960c.value = value2961def get_value_by_label(2962self, name=None, type=None, kind=None, id=None, label=None, nr=None):2963"""29642965All arguments should be passed by name.29662967"""2968c = self.find_control(name, type, kind, id, label=label, nr=nr)2969return c.get_value_by_label()29702971def set_value_by_label(2972self, value,2973name=None, type=None, kind=None, id=None, label=None, nr=None):2974"""29752976All arguments should be passed by name.29772978"""2979c = self.find_control(name, type, kind, id, label=label, nr=nr)2980c.set_value_by_label(value)29812982def set_all_readonly(self, readonly):2983for control in self.controls:2984control.readonly = bool(readonly)29852986def clear_all(self):2987"""Clear the value attributes of all controls in the form.29882989See HTMLForm.clear.__doc__.29902991"""2992for control in self.controls:2993control.clear()29942995def clear(self,2996name=None, type=None, kind=None, id=None, nr=None, label=None):2997"""Clear the value attribute of a control.29982999As a result, the affected control will not be successful until a value3000is subsequently set. AttributeError is raised on readonly controls.30013002"""3003c = self.find_control(name, type, kind, id, label=label, nr=nr)3004c.clear()300530063007#---------------------------------------------------3008# Form-filling methods applying only to ListControls.30093010def possible_items(self, # deprecated3011name=None, type=None, kind=None, id=None,3012nr=None, by_label=False, label=None):3013"""Return a list of all values that the specified control can take."""3014c = self._find_list_control(name, type, kind, id, label, nr)3015return c.possible_items(by_label)30163017def set(self, selected, item_name, # deprecated3018name=None, type=None, kind=None, id=None, nr=None,3019by_label=False, label=None):3020"""Select / deselect named list item.30213022selected: boolean selected state30233024"""3025self._find_list_control(name, type, kind, id, label, nr).set(3026selected, item_name, by_label)3027def toggle(self, item_name, # deprecated3028name=None, type=None, kind=None, id=None, nr=None,3029by_label=False, label=None):3030"""Toggle selected state of named list item."""3031self._find_list_control(name, type, kind, id, label, nr).toggle(3032item_name, by_label)30333034def set_single(self, selected, # deprecated3035name=None, type=None, kind=None, id=None,3036nr=None, by_label=None, label=None):3037"""Select / deselect list item in a control having only one item.30383039If the control has multiple list items, ItemCountError is raised.30403041This is just a convenience method, so you don't need to know the item's3042name -- the item name in these single-item controls is usually3043something meaningless like "1" or "on".30443045For example, if a checkbox has a single item named "on", the following3046two calls are equivalent:30473048control.toggle("on")3049control.toggle_single()30503051""" # by_label ignored and deprecated3052self._find_list_control(3053name, type, kind, id, label, nr).set_single(selected)3054def toggle_single(self, name=None, type=None, kind=None, id=None,3055nr=None, by_label=None, label=None): # deprecated3056"""Toggle selected state of list item in control having only one item.30573058The rest is as for HTMLForm.set_single.__doc__.30593060""" # by_label ignored and deprecated3061self._find_list_control(name, type, kind, id, label, nr).toggle_single()30623063#---------------------------------------------------3064# Form-filling method applying only to FileControls.30653066def add_file(self, file_object, content_type=None, filename=None,3067name=None, id=None, nr=None, label=None):3068"""Add a file to be uploaded.30693070file_object: file-like object (with read method) from which to read3071data to upload3072content_type: MIME content type of data to upload3073filename: filename to pass to server30743075If filename is None, no filename is sent to the server.30763077If content_type is None, the content type is guessed based on the3078filename and the data from read from the file object.30793080XXX3081At the moment, guessed content type is always application/octet-stream.3082Use sndhdr, imghdr modules. Should also try to guess HTML, XML, and3083plain text.30843085Note the following useful HTML attributes of file upload controls (see3086HTML 4.01 spec, section 17):30873088accept: comma-separated list of content types that the server will3089handle correctly; you can use this to filter out non-conforming files3090size: XXX IIRC, this is indicative of whether form wants multiple or3091single files3092maxlength: XXX hint of max content length in bytes?30933094"""3095self.find_control(name, "file", id=id, label=label, nr=nr).add_file(3096file_object, content_type, filename)30973098#---------------------------------------------------3099# Form submission methods, applying only to clickable controls.31003101def click(self, name=None, type=None, id=None, nr=0, coord=(1,1),3102request_class=_urllib.request.Request,3103label=None):3104"""Return request that would result from clicking on a control.31053106The request object is a _urllib.request.Request instance, which you can pass to3107urllib2.urlopen (or ClientCookie.urlopen).31083109Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and3110IMAGEs) can be clicked.31113112Will click on the first clickable control, subject to the name, type3113and nr arguments (as for find_control). If no name, type, id or number3114is specified and there are no clickable controls, a request will be3115returned for the form in its current, un-clicked, state.31163117IndexError is raised if any of name, type, id or nr is specified but no3118matching control is found. ValueError is raised if the HTMLForm has an3119enctype attribute that is not recognised.31203121You can optionally specify a coordinate to click at, which only makes a3122difference if you clicked on an image.31233124"""3125return self._click(name, type, id, label, nr, coord, "request",3126self._request_class)31273128def click_request_data(self,3129name=None, type=None, id=None,3130nr=0, coord=(1,1),3131request_class=_urllib.request.Request,3132label=None):3133"""As for click method, but return a tuple (url, data, headers).31343135You can use this data to send a request to the server. This is useful3136if you're using httplib or urllib rather than urllib2. Otherwise, use3137the click method.31383139# Untested. Have to subclass to add headers, I think -- so use urllib23140# instead!3141import urllib3142url, data, hdrs = form.click_request_data()3143r = _urllib.request.urlopen(url, data)31443145# Untested. I don't know of any reason to use httplib -- you can get3146# just as much control with urllib2.3147import httplib, urlparse3148url, data, hdrs = form.click_request_data()3149tup = urlparse(url)3150host, path = tup[1], _urllib.parse.urlunparse((None, None)+tup[2:])3151conn = httplib.HTTPConnection(host)3152if data:3153httplib.request("POST", path, data, hdrs)3154else:3155httplib.request("GET", path, headers=hdrs)3156r = conn.getresponse()31573158"""3159return self._click(name, type, id, label, nr, coord, "request_data",3160self._request_class)31613162def click_pairs(self, name=None, type=None, id=None,3163nr=0, coord=(1,1),3164label=None):3165"""As for click_request_data, but returns a list of (key, value) pairs.31663167You can use this list as an argument to ClientForm.urlencode. This is3168usually only useful if you're using httplib or urllib rather than3169urllib2 or ClientCookie. It may also be useful if you want to manually3170tweak the keys and/or values, but this should not be necessary.3171Otherwise, use the click method.31723173Note that this method is only useful for forms of MIME type3174x-www-form-urlencoded. In particular, it does not return the3175information required for file upload. If you need file upload and are3176not using urllib2, use click_request_data.31773178Also note that Python 2.0's urllib.urlencode is slightly broken: it3179only accepts a mapping, not a sequence of pairs, as an argument. This3180messes up any ordering in the argument. Use ClientForm.urlencode3181instead.31823183"""3184return self._click(name, type, id, label, nr, coord, "pairs",3185self._request_class)31863187#---------------------------------------------------31883189def find_control(self,3190name=None, type=None, kind=None, id=None,3191predicate=None, nr=None,3192label=None):3193"""Locate and return some specific control within the form.31943195At least one of the name, type, kind, predicate and nr arguments must3196be supplied. If no matching control is found, ControlNotFoundError is3197raised.31983199If name is specified, then the control must have the indicated name.32003201If type is specified then the control must have the specified type (in3202addition to the types possible for <input> HTML tags: "text",3203"password", "hidden", "submit", "image", "button", "radio", "checkbox",3204"file" we also have "reset", "buttonbutton", "submitbutton",3205"resetbutton", "textarea", "select" and "isindex").32063207If kind is specified, then the control must fall into the specified3208group, each of which satisfies a particular interface. The types are3209"text", "list", "multilist", "singlelist", "clickable" and "file".32103211If id is specified, then the control must have the indicated id.32123213If predicate is specified, then the control must match that function.3214The predicate function is passed the control as its single argument,3215and should return a boolean value indicating whether the control3216matched.32173218nr, if supplied, is the sequence number of the control (where 0 is the3219first). Note that control 0 is the first control matching all the3220other arguments (if supplied); it is not necessarily the first control3221in the form. If no nr is supplied, AmbiguityError is raised if3222multiple controls match the other arguments (unless the3223.backwards-compat attribute is true).32243225If label is specified, then the control must have this label. Note3226that radio controls and checkboxes never have labels: their items do.32273228"""3229if ((name is None) and (type is None) and (kind is None) and3230(id is None) and (label is None) and (predicate is None) and3231(nr is None)):3232raise ValueError(3233"at least one argument must be supplied to specify control")3234return self._find_control(name, type, kind, id, label, predicate, nr)32353236#---------------------------------------------------3237# Private methods.32383239def _find_list_control(self,3240name=None, type=None, kind=None, id=None,3241label=None, nr=None):3242if ((name is None) and (type is None) and (kind is None) and3243(id is None) and (label is None) and (nr is None)):3244raise ValueError(3245"at least one argument must be supplied to specify control")32463247return self._find_control(name, type, kind, id, label,3248is_listcontrol, nr)32493250def _find_control(self, name, type, kind, id, label, predicate, nr):3251if ((name is not None) and (name is not Missing) and3252not isstringlike(name)):3253raise TypeError("control name must be string-like")3254if (type is not None) and not isstringlike(type):3255raise TypeError("control type must be string-like")3256if (kind is not None) and not isstringlike(kind):3257raise TypeError("control kind must be string-like")3258if (id is not None) and not isstringlike(id):3259raise TypeError("control id must be string-like")3260if (label is not None) and not isstringlike(label):3261raise TypeError("control label must be string-like")3262if (predicate is not None) and not callable(predicate):3263raise TypeError("control predicate must be callable")3264if (nr is not None) and nr < 0:3265raise ValueError("control number must be a positive integer")32663267orig_nr = nr3268found = None3269ambiguous = False3270if nr is None and self.backwards_compat:3271nr = 032723273for control in self.controls:3274if ((name is not None and name != control.name) and3275(name is not Missing or control.name is not None)):3276continue3277if type is not None and type != control.type:3278continue3279if kind is not None and not control.is_of_kind(kind):3280continue3281if id is not None and id != control.id:3282continue3283if predicate and not predicate(control):3284continue3285if label:3286for l in control.get_labels():3287if l.text.find(label) > -1:3288break3289else:3290continue3291if nr is not None:3292if nr == 0:3293return control # early exit: unambiguous due to nr3294nr -= 13295continue3296if found:3297ambiguous = True3298break3299found = control33003301if found and not ambiguous:3302return found33033304description = []3305if name is not None: description.append("name %s" % repr(name))3306if type is not None: description.append("type '%s'" % type)3307if kind is not None: description.append("kind '%s'" % kind)3308if id is not None: description.append("id '%s'" % id)3309if label is not None: description.append("label '%s'" % label)3310if predicate is not None:3311description.append("predicate %s" % predicate)3312if orig_nr: description.append("nr %d" % orig_nr)3313description = ", ".join(description)33143315if ambiguous:3316raise AmbiguityError("more than one control matching "+description)3317elif not found:3318raise ControlNotFoundError("no control matching "+description)3319assert False33203321def _click(self, name, type, id, label, nr, coord, return_type,3322request_class=_urllib.request.Request):3323try:3324control = self._find_control(3325name, type, "clickable", id, label, None, nr)3326except ControlNotFoundError:3327if ((name is not None) or (type is not None) or (id is not None) or3328(nr != 0)):3329raise3330# no clickable controls, but no control was explicitly requested,3331# so return state without clicking any control3332return self._switch_click(return_type, request_class)3333else:3334return control._click(self, coord, return_type, request_class)33353336def _pairs(self):3337"""Return sequence of (key, value) pairs suitable for urlencoding."""3338return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()]333933403341def _pairs_and_controls(self):3342"""Return sequence of (index, key, value, control_index)3343of totally ordered pairs suitable for urlencoding.33443345control_index is the index of the control in self.controls3346"""3347pairs = []3348for control_index in xrange(len(self.controls)):3349control = self.controls[control_index]3350for ii, key, val in control._totally_ordered_pairs():3351pairs.append((ii, key, val, control_index))33523353# stable sort by ONLY first item in tuple3354pairs.sort()33553356return pairs33573358def _request_data(self):3359"""Return a tuple (url, data, headers)."""3360method = self.method.upper()3361#scheme, netloc, path, parameters, query, frag = _urllib.parse.urlparse(self.action)3362parts = self._urlparse(self.action)3363rest, (query, frag) = parts[:-2], parts[-2:]33643365if method == "GET":3366self.enctype = "application/x-www-form-urlencoded" # force it3367parts = rest + (urlencode(self._pairs()), None)3368uri = self._urlunparse(parts)3369return uri, None, []3370elif method == "POST":3371parts = rest + (query, None)3372uri = self._urlunparse(parts)3373if self.enctype == "application/x-www-form-urlencoded":3374return (uri, urlencode(self._pairs()),3375[("Content-Type", self.enctype)])3376elif self.enctype == "text/plain":3377return (uri, self._pairs(),3378[("Content-Type", self.enctype)])3379elif self.enctype == "multipart/form-data":3380data = _cStringIO()3381http_hdrs = []3382mw = MimeWriter(data, http_hdrs)3383f = mw.startmultipartbody("form-data", add_to_http_hdrs=True,3384prefix=0)3385for ii, k, v, control_index in self._pairs_and_controls():3386self.controls[control_index]._write_mime_data(mw, k, v)3387mw.lastpart()3388return uri, data.getvalue(), http_hdrs3389else:3390raise ValueError(3391"unknown POST form encoding type '%s'" % self.enctype)3392else:3393raise ValueError("Unknown method '%s'" % method)33943395def _switch_click(self, return_type, request_class=_urllib.request.Request):3396# This is called by HTMLForm and clickable Controls to hide switching3397# on return_type.3398if return_type == "pairs":3399return self._pairs()3400elif return_type == "request_data":3401return self._request_data()3402else:3403req_data = self._request_data()34043405req = request_class(req_data[0], req_data[1])3406for key, val in req_data[2]:3407add_hdr = req.add_header3408if key.lower() == "content-type":3409try:3410add_hdr = req.add_unredirected_header3411except AttributeError:3412# pre-2.4 and not using ClientCookie3413pass3414add_hdr(key, val)3415return req341634173418