Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/urllib3/util/url.py
811 views
1
from __future__ import absolute_import
2
import re
3
from collections import namedtuple
4
5
from ..exceptions import LocationParseError
6
from ..packages import six
7
8
9
url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"]
10
11
# We only want to normalize urls with an HTTP(S) scheme.
12
# urllib3 infers URLs without a scheme (None) to be http.
13
NORMALIZABLE_SCHEMES = ("http", "https", None)
14
15
# Almost all of these patterns were derived from the
16
# 'rfc3986' module: https://github.com/python-hyper/rfc3986
17
PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
18
SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
19
URI_RE = re.compile(
20
r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
21
r"(?://([^\\/?#]*))?"
22
r"([^?#]*)"
23
r"(?:\?([^#]*))?"
24
r"(?:#(.*))?$",
25
re.UNICODE | re.DOTALL,
26
)
27
28
IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
29
HEX_PAT = "[0-9A-Fa-f]{1,4}"
30
LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT)
31
_subs = {"hex": HEX_PAT, "ls32": LS32_PAT}
32
_variations = [
33
# 6( h16 ":" ) ls32
34
"(?:%(hex)s:){6}%(ls32)s",
35
# "::" 5( h16 ":" ) ls32
36
"::(?:%(hex)s:){5}%(ls32)s",
37
# [ h16 ] "::" 4( h16 ":" ) ls32
38
"(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
39
# [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
40
"(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
41
# [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
42
"(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
43
# [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
44
"(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
45
# [ *4( h16 ":" ) h16 ] "::" ls32
46
"(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
47
# [ *5( h16 ":" ) h16 ] "::" h16
48
"(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
49
# [ *6( h16 ":" ) h16 ] "::"
50
"(?:(?:%(hex)s:){0,6}%(hex)s)?::",
51
]
52
53
UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._!\-~"
54
IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
55
ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
56
IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + r")?\]"
57
REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
58
TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")
59
60
IPV4_RE = re.compile("^" + IPV4_PAT + "$")
61
IPV6_RE = re.compile("^" + IPV6_PAT + "$")
62
IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
63
BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
64
ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")
65
66
SUBAUTHORITY_PAT = (u"^(?:(.*)@)?(%s|%s|%s)(?::([0-9]{0,5}))?$") % (
67
REG_NAME_PAT,
68
IPV4_PAT,
69
IPV6_ADDRZ_PAT,
70
)
71
SUBAUTHORITY_RE = re.compile(SUBAUTHORITY_PAT, re.UNICODE | re.DOTALL)
72
73
UNRESERVED_CHARS = set(
74
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
75
)
76
SUB_DELIM_CHARS = set("!$&'()*+,;=")
77
USERINFO_CHARS = UNRESERVED_CHARS | SUB_DELIM_CHARS | {":"}
78
PATH_CHARS = USERINFO_CHARS | {"@", "/"}
79
QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {"?"}
80
81
82
class Url(namedtuple("Url", url_attrs)):
    """
    Immutable record describing a URL; :func:`parse_url` returns instances
    of it.

    The fields are those listed in ``url_attrs`` (``scheme``, ``auth``,
    ``host``, ``port``, ``path``, ``query``, ``fragment``) and every one
    defaults to ``None``. On construction the scheme is lower-cased and a
    non-empty path lacking a leading ``/`` gets one prepended.
    """

    __slots__ = ()

    def __new__(
        cls,
        scheme=None,
        auth=None,
        host=None,
        port=None,
        path=None,
        query=None,
        fragment=None,
    ):
        # A non-empty path is always stored in absolute form.
        if path and not path.startswith("/"):
            path = "/" + path
        # Only a scheme that was actually supplied is lower-cased.
        if scheme is not None:
            scheme = scheme.lower()
        return super(Url, cls).__new__(
            cls, scheme, auth, host, port, path, query, fragment
        )

    @property
    def hostname(self):
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self):
        """Absolute path including the query string."""
        # A missing or empty path is treated as the root path.
        uri = self.path or "/"

        # An empty (but not None) query still produces a trailing "?".
        if self.query is not None:
            uri += "?" + self.query

        return uri

    @property
    def netloc(self):
        """
        Network location: ``host`` alone, or ``host:port`` when a port is set.

        Returns ``None`` when there is no host, rather than formatting the
        missing host into a bogus ``"None:<port>"`` string. A falsy port
        (``None`` or ``0``) is left out.
        """
        if self.host is None:
            return None
        if self.port:
            return "%s:%d" % (self.host, self.port)
        return self.host

    @property
    def url(self):
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example: ::

            >>> U = parse_url('http://google.com/mail/')
            >>> U.url
            'http://google.com/mail/'
            >>> Url('http', 'username:password', 'host.com', 80,
            ... '/path', 'query', 'fragment').url
            'http://username:password@host.com:80/path?query#fragment'
        """
        scheme, auth, host, port, path, query, fragment = self
        url = u""

        # "is not None" is used on purpose: empty strings and a port of 0
        # must still be written out.
        if scheme is not None:
            url += scheme + u"://"
        if auth is not None:
            url += auth + u"@"
        if host is not None:
            url += host
        if port is not None:
            url += u":" + str(port)
        if path is not None:
            url += path
        if query is not None:
            url += u"?" + query
        if fragment is not None:
            url += u"#" + fragment

        return url

    def __str__(self):
        return self.url
173
174
175
def split_first(s, delims):
    """
    .. deprecated:: 1.25

    Given a string and an iterable of delimiters, split on the first found
    delimiter. Return two split parts and the matched delimiter.

    If not found, then the first part is the full input string.

    The second part starts right after the whole matched delimiter, so
    delimiters longer than one character are handled too. When several
    delimiters match at the same position, the one listed first wins.

    Example::

        >>> split_first('foo/bar?baz', '?/=')
        ('foo', 'bar?baz', '/')
        >>> split_first('foo/bar?baz', '123')
        ('foo/bar?baz', '', None)
        >>> split_first('foo::bar', ['::'])
        ('foo', 'bar', '::')

    Scales linearly with number of delims. Not ideal for large number of delims.

    :param s: String to split.
    :param delims: Iterable of candidate delimiters.
    :return: ``(before, after, delimiter)``; ``(s, "", None)`` if none matched.
    """
    best_idx = None
    best_delim = None
    for delim in delims:
        idx = s.find(delim)
        if idx < 0:
            # This delimiter does not occur in the string.
            continue

        # Strict "<" keeps the earliest-listed delimiter on ties.
        if best_idx is None or idx < best_idx:
            best_idx, best_delim = idx, delim

    if best_idx is None:
        return s, "", None

    # Skip the full delimiter, not just a single character.
    return s[:best_idx], s[best_idx + len(best_delim):], best_delim
208
209
210
def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"):
    """Percent-encode every byte of ``component`` that is not allowed,
    without double-encoding a component that is already percent-encoded.

    ``None`` is returned unchanged. Existing ``%XX`` triplets have their hex
    digits upper-cased. If every ``%`` in the component belongs to such a
    triplet, ``%`` bytes are passed through untouched; otherwise a ``%`` is
    kept only if it is in ``allowed_chars``. The component is always encoded
    as UTF-8; ``encoding`` is used only to decode the final byte string.
    """
    if component is None:
        return component

    text = six.ensure_text(component)

    # Upper-case the existing triplets and count how many there are.
    text, triplet_count = PERCENT_RE.subn(lambda m: m.group(0).upper(), text)

    raw = text.encode("utf-8", "surrogatepass")
    # True when each '%' byte is the start of a valid %XX triplet.
    keep_percent = triplet_count == raw.count(b"%")

    out = bytearray()
    # Iterating a bytearray yields integers on both Python 2 and 3.
    for code in bytearray(raw):
        passes_through = (keep_percent and code == 0x25) or (
            code < 128 and chr(code) in allowed_chars
        )
        if passes_through:
            out.append(code)
        else:
            # Two upper-case hex digits, zero padded.
            out.extend(("%%%02X" % code).encode("ascii"))

    return out.decode(encoding)
242
243
244
def _remove_path_dot_segments(path):
245
# See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
246
segments = path.split("/") # Turn the path into a list of segments
247
output = [] # Initialize the variable to use to store output
248
249
for segment in segments:
250
# '.' is the current directory, so ignore it, it is superfluous
251
if segment == ".":
252
continue
253
# Anything other than '..', should be appended to the output
254
elif segment != "..":
255
output.append(segment)
256
# In this case segment == '..', if we can, we should pop the last
257
# element
258
elif output:
259
output.pop()
260
261
# If the path starts with '/' and the output is empty or the first string
262
# is non-empty
263
if path.startswith("/") and (not output or output[0]):
264
output.insert(0, "")
265
266
# If the path starts with '/.' or '/..' ensure we add one more empty
267
# string to add a trailing '/'
268
if path.endswith(("/.", "/..")):
269
output.append("")
270
271
return "/".join(output)
272
273
274
def _normalize_host(host, scheme):
    """Return ``host`` normalized for ``scheme``.

    A falsy host is returned as-is. A bytes host is first converted with
    ``six.ensure_str``. Nothing further happens unless ``scheme`` is one of
    ``NORMALIZABLE_SCHEMES``; in that case:

    * a bracketed IPv6 literal is lower-cased, and its zone id (if any) is
      rewritten to a single ``%`` followed by the percent-encoded zone name;
    * an IPv4 address is returned untouched;
    * anything else is split on ``.`` and each label is passed through
      ``_idna_encode``.
    """
    if not host:
        return host

    if isinstance(host, six.binary_type):
        host = six.ensure_str(host)

    if scheme not in NORMALIZABLE_SCHEMES:
        return host

    if IPV6_ADDRZ_RE.match(host):
        zone_match = ZONE_ID_RE.search(host)
        if not zone_match:
            return host.lower()

        start, end = zone_match.span(1)
        raw_zone = host[start:end]

        # Strip the "%25" (or bare "%") separator; a zone id that is exactly
        # "%25" is treated as "%" followed by the name "25".
        if raw_zone.startswith("%25") and raw_zone != "%25":
            zone_name = raw_zone[3:]
        else:
            zone_name = raw_zone[1:]

        zone_id = "%" + _encode_invalid_chars(zone_name, UNRESERVED_CHARS)
        return host[:start].lower() + zone_id + host[end:]

    if IPV4_RE.match(host):
        return host

    encoded_labels = [_idna_encode(label) for label in host.split(".")]
    return six.ensure_str(b".".join(encoded_labels))
300
301
302
def _idna_encode(name):
303
if name and any([ord(x) > 128 for x in name]):
304
try:
305
import idna
306
except ImportError:
307
six.raise_from(
308
LocationParseError("Unable to parse URL without the 'idna' module"),
309
None,
310
)
311
try:
312
return idna.encode(name.lower(), strict=True, std3_rules=True)
313
except idna.IDNAError:
314
six.raise_from(
315
LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None
316
)
317
return name.lower().encode("ascii")
318
319
320
def _encode_target(target):
    """Percent-encodes a request target so that there are no invalid characters

    The target is split with ``TARGET_RE`` into a path and an optional
    query, each of which is encoded separately. A fragment is not captured
    by the pattern and therefore does not appear in the result.
    """
    raw_path, raw_query = TARGET_RE.match(target).groups()
    encoded_path = _encode_invalid_chars(raw_path, PATH_CHARS)
    encoded_query = _encode_invalid_chars(raw_query, QUERY_CHARS)

    # No "?" was present in the target.
    if encoded_query is None:
        return encoded_path
    return encoded_path + "?" + encoded_query
328
329
330
def parse_url(url):
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.
    This parser is RFC 3986 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.

    Partly backwards-compatible with :mod:`urlparse`.

    Example::

        >>> parse_url('http://google.com/mail/')
        Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
        >>> parse_url('google.com:80')
        Url(scheme=None, host='google.com', port=80, path=None, ...)
        >>> parse_url('/foo?bar')
        Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Nothing to parse: every field keeps its None default.
        return Url()

    source_url = url
    # Input that starts with neither a scheme nor a "/" is parsed as if it
    # began at the authority.
    if SCHEME_RE.search(url) is None:
        url = "//" + url

    try:
        scheme, authority, path, query, fragment = URI_RE.match(url).groups()
        normalize_uri = scheme is None or scheme.lower() in NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        auth = host = port = None
        if authority:
            auth, host, port = SUBAUTHORITY_RE.match(authority).groups()
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, USERINFO_CHARS)
            # "host:" with nothing after the colon means no port.
            if port == "":
                port = None

        if port is not None:
            port = int(port)
            if port < 0 or port > 65535:
                raise LocationParseError(url)

        host = _normalize_host(host, scheme)

        if normalize_uri:
            if path:
                path = _encode_invalid_chars(
                    _remove_path_dot_segments(path), PATH_CHARS
                )
            if query:
                query = _encode_invalid_chars(query, QUERY_CHARS)
            if fragment:
                fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)

    except (ValueError, AttributeError):
        # Report the URL exactly as the caller supplied it.
        return six.raise_from(LocationParseError(source_url), None)

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        path = "" if (query is not None or fragment is not None) else None

    # Ensure that each part of the URL is a `str` for
    # backwards compatibility.
    to_native = six.ensure_text if isinstance(url, six.text_type) else six.ensure_str

    def coerce(value):
        return None if value is None else to_native(value)

    return Url(
        scheme=coerce(scheme),
        auth=coerce(auth),
        host=coerce(host),
        port=port,
        path=coerce(path),
        query=coerce(query),
        fragment=coerce(fragment),
    )
423
424
425
def get_host(url):
    """
    Deprecated. Use :func:`parse_url` instead.

    Returns a ``(scheme, hostname, port)`` tuple for ``url``; the scheme
    falls back to ``"http"`` when the parsed URL has none.
    """
    parsed = parse_url(url)
    scheme = parsed.scheme or "http"
    return scheme, parsed.hostname, parsed.port
431
432