# Path: blob/master/venv/Lib/site-packages/urllib3/util/url.py
# 811 views
from __future__ import absolute_import1import re2from collections import namedtuple34from ..exceptions import LocationParseError5from ..packages import six678url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"]910# We only want to normalize urls with an HTTP(S) scheme.11# urllib3 infers URLs without a scheme (None) to be http.12NORMALIZABLE_SCHEMES = ("http", "https", None)1314# Almost all of these patterns were derived from the15# 'rfc3986' module: https://github.com/python-hyper/rfc398616PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")17SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")18URI_RE = re.compile(19r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"20r"(?://([^\\/?#]*))?"21r"([^?#]*)"22r"(?:\?([^#]*))?"23r"(?:#(.*))?$",24re.UNICODE | re.DOTALL,25)2627IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"28HEX_PAT = "[0-9A-Fa-f]{1,4}"29LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT)30_subs = {"hex": HEX_PAT, "ls32": LS32_PAT}31_variations = [32# 6( h16 ":" ) ls3233"(?:%(hex)s:){6}%(ls32)s",34# "::" 5( h16 ":" ) ls3235"::(?:%(hex)s:){5}%(ls32)s",36# [ h16 ] "::" 4( h16 ":" ) ls3237"(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",38# [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls3239"(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",40# [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls3241"(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",42# [ *3( h16 ":" ) h16 ] "::" h16 ":" ls3243"(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",44# [ *4( h16 ":" ) h16 ] "::" ls3245"(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",46# [ *5( h16 ":" ) h16 ] "::" h1647"(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",48# [ *6( h16 ":" ) h16 ] "::"49"(?:(?:%(hex)s:){0,6}%(hex)s)?::",50]5152UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._!\-~"53IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"54ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"55IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + 
r")?\]"56REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"57TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")5859IPV4_RE = re.compile("^" + IPV4_PAT + "$")60IPV6_RE = re.compile("^" + IPV6_PAT + "$")61IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")62BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")63ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")6465SUBAUTHORITY_PAT = (u"^(?:(.*)@)?(%s|%s|%s)(?::([0-9]{0,5}))?$") % (66REG_NAME_PAT,67IPV4_PAT,68IPV6_ADDRZ_PAT,69)70SUBAUTHORITY_RE = re.compile(SUBAUTHORITY_PAT, re.UNICODE | re.DOTALL)7172UNRESERVED_CHARS = set(73"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"74)75SUB_DELIM_CHARS = set("!$&'()*+,;=")76USERINFO_CHARS = UNRESERVED_CHARS | SUB_DELIM_CHARS | {":"}77PATH_CHARS = USERINFO_CHARS | {"@", "/"}78QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {"?"}798081class Url(namedtuple("Url", url_attrs)):82"""83Data structure for representing an HTTP URL. Used as a return value for84:func:`parse_url`. Both the scheme and host are normalized as they are85both case-insensitive according to RFC 3986.86"""8788__slots__ = ()8990def __new__(91cls,92scheme=None,93auth=None,94host=None,95port=None,96path=None,97query=None,98fragment=None,99):100if path and not path.startswith("/"):101path = "/" + path102if scheme is not None:103scheme = scheme.lower()104return super(Url, cls).__new__(105cls, scheme, auth, host, port, path, query, fragment106)107108@property109def hostname(self):110"""For backwards-compatibility with urlparse. We're nice like that."""111return self.host112113@property114def request_uri(self):115"""Absolute path including the query string."""116uri = self.path or "/"117118if self.query is not None:119uri += "?" 
+ self.query120121return uri122123@property124def netloc(self):125"""Network location including host and port"""126if self.port:127return "%s:%d" % (self.host, self.port)128return self.host129130@property131def url(self):132"""133Convert self into a url134135This function should more or less round-trip with :func:`.parse_url`. The136returned url may not be exactly the same as the url inputted to137:func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls138with a blank port will have : removed).139140Example: ::141142>>> U = parse_url('http://google.com/mail/')143>>> U.url144'http://google.com/mail/'145>>> Url('http', 'username:password', 'host.com', 80,146... '/path', 'query', 'fragment').url147'http://username:[email protected]:80/path?query#fragment'148"""149scheme, auth, host, port, path, query, fragment = self150url = u""151152# We use "is not None" we want things to happen with empty strings (or 0 port)153if scheme is not None:154url += scheme + u"://"155if auth is not None:156url += auth + u"@"157if host is not None:158url += host159if port is not None:160url += u":" + str(port)161if path is not None:162url += path163if query is not None:164url += u"?" + query165if fragment is not None:166url += u"#" + fragment167168return url169170def __str__(self):171return self.url172173174def split_first(s, delims):175"""176.. deprecated:: 1.25177178Given a string and an iterable of delimiters, split on the first found179delimiter. Return two split parts and the matched delimiter.180181If not found, then the first part is the full input string.182183Example::184185>>> split_first('foo/bar?baz', '?/=')186('foo', 'bar?baz', '/')187>>> split_first('foo/bar?baz', '123')188('foo/bar?baz', '', None)189190Scales linearly with number of delims. 
Not ideal for large number of delims.191"""192min_idx = None193min_delim = None194for d in delims:195idx = s.find(d)196if idx < 0:197continue198199if min_idx is None or idx < min_idx:200min_idx = idx201min_delim = d202203if min_idx is None or min_idx < 0:204return s, "", None205206return s[:min_idx], s[min_idx + 1 :], min_delim207208209def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"):210"""Percent-encodes a URI component without reapplying211onto an already percent-encoded component.212"""213if component is None:214return component215216component = six.ensure_text(component)217218# Normalize existing percent-encoded bytes.219# Try to see if the component we're encoding is already percent-encoded220# so we can skip all '%' characters but still encode all others.221component, percent_encodings = PERCENT_RE.subn(222lambda match: match.group(0).upper(), component223)224225uri_bytes = component.encode("utf-8", "surrogatepass")226is_percent_encoded = percent_encodings == uri_bytes.count(b"%")227encoded_component = bytearray()228229for i in range(0, len(uri_bytes)):230# Will return a single character bytestring on both Python 2 & 3231byte = uri_bytes[i : i + 1]232byte_ord = ord(byte)233if (is_percent_encoded and byte == b"%") or (234byte_ord < 128 and byte.decode() in allowed_chars235):236encoded_component += byte237continue238encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper()))239240return encoded_component.decode(encoding)241242243def _remove_path_dot_segments(path):244# See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code245segments = path.split("/") # Turn the path into a list of segments246output = [] # Initialize the variable to use to store output247248for segment in segments:249# '.' 
is the current directory, so ignore it, it is superfluous250if segment == ".":251continue252# Anything other than '..', should be appended to the output253elif segment != "..":254output.append(segment)255# In this case segment == '..', if we can, we should pop the last256# element257elif output:258output.pop()259260# If the path starts with '/' and the output is empty or the first string261# is non-empty262if path.startswith("/") and (not output or output[0]):263output.insert(0, "")264265# If the path starts with '/.' or '/..' ensure we add one more empty266# string to add a trailing '/'267if path.endswith(("/.", "/..")):268output.append("")269270return "/".join(output)271272273def _normalize_host(host, scheme):274if host:275if isinstance(host, six.binary_type):276host = six.ensure_str(host)277278if scheme in NORMALIZABLE_SCHEMES:279is_ipv6 = IPV6_ADDRZ_RE.match(host)280if is_ipv6:281match = ZONE_ID_RE.search(host)282if match:283start, end = match.span(1)284zone_id = host[start:end]285286if zone_id.startswith("%25") and zone_id != "%25":287zone_id = zone_id[3:]288else:289zone_id = zone_id[1:]290zone_id = "%" + _encode_invalid_chars(zone_id, UNRESERVED_CHARS)291return host[:start].lower() + zone_id + host[end:]292else:293return host.lower()294elif not IPV4_RE.match(host):295return six.ensure_str(296b".".join([_idna_encode(label) for label in host.split(".")])297)298return host299300301def _idna_encode(name):302if name and any([ord(x) > 128 for x in name]):303try:304import idna305except ImportError:306six.raise_from(307LocationParseError("Unable to parse URL without the 'idna' module"),308None,309)310try:311return idna.encode(name.lower(), strict=True, std3_rules=True)312except idna.IDNAError:313six.raise_from(314LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None315)316return name.lower().encode("ascii")317318319def _encode_target(target):320"""Percent-encodes a request target so that there are no invalid characters"""321path, query = 
TARGET_RE.match(target).groups()322target = _encode_invalid_chars(path, PATH_CHARS)323query = _encode_invalid_chars(query, QUERY_CHARS)324if query is not None:325target += "?" + query326return target327328329def parse_url(url):330"""331Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is332performed to parse incomplete urls. Fields not provided will be None.333This parser is RFC 3986 compliant.334335The parser logic and helper functions are based heavily on336work done in the ``rfc3986`` module.337338:param str url: URL to parse into a :class:`.Url` namedtuple.339340Partly backwards-compatible with :mod:`urlparse`.341342Example::343344>>> parse_url('http://google.com/mail/')345Url(scheme='http', host='google.com', port=None, path='/mail/', ...)346>>> parse_url('google.com:80')347Url(scheme=None, host='google.com', port=80, path=None, ...)348>>> parse_url('/foo?bar')349Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)350"""351if not url:352# Empty353return Url()354355source_url = url356if not SCHEME_RE.search(url):357url = "//" + url358359try:360scheme, authority, path, query, fragment = URI_RE.match(url).groups()361normalize_uri = scheme is None or scheme.lower() in NORMALIZABLE_SCHEMES362363if scheme:364scheme = scheme.lower()365366if authority:367auth, host, port = SUBAUTHORITY_RE.match(authority).groups()368if auth and normalize_uri:369auth = _encode_invalid_chars(auth, USERINFO_CHARS)370if port == "":371port = None372else:373auth, host, port = None, None, None374375if port is not None:376port = int(port)377if not (0 <= port <= 65535):378raise LocationParseError(url)379380host = _normalize_host(host, scheme)381382if normalize_uri and path:383path = _remove_path_dot_segments(path)384path = _encode_invalid_chars(path, PATH_CHARS)385if normalize_uri and query:386query = _encode_invalid_chars(query, QUERY_CHARS)387if normalize_uri and fragment:388fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)389390except (ValueError, 
AttributeError):391return six.raise_from(LocationParseError(source_url), None)392393# For the sake of backwards compatibility we put empty394# string values for path if there are any defined values395# beyond the path in the URL.396# TODO: Remove this when we break backwards compatibility.397if not path:398if query is not None or fragment is not None:399path = ""400else:401path = None402403# Ensure that each part of the URL is a `str` for404# backwards compatibility.405if isinstance(url, six.text_type):406ensure_func = six.ensure_text407else:408ensure_func = six.ensure_str409410def ensure_type(x):411return x if x is None else ensure_func(x)412413return Url(414scheme=ensure_type(scheme),415auth=ensure_type(auth),416host=ensure_type(host),417port=port,418path=ensure_type(path),419query=ensure_type(query),420fragment=ensure_type(fragment),421)422423424def get_host(url):425"""426Deprecated. Use :func:`parse_url` instead.427"""428p = parse_url(url)429return p.scheme or "http", p.hostname, p.port430431432