# CoCalc -- url.py
#
# GitHub Repository: hhhrrrttt222111/Dorkify
# Path: blob/master/venv/Lib/site-packages/urllib3/util/url.py
# 811 views
1
from __future__ import absolute_import
2
import re
3
from collections import namedtuple
4

5
from ..exceptions import LocationParseError
6
from ..packages import six
7

8

9
# Field names of the ``Url`` namedtuple, in positional order.
url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"]

# We only want to normalize urls with an HTTP(S) scheme.
# urllib3 infers URLs without a scheme (None) to be http.
NORMALIZABLE_SCHEMES = ("http", "https", None)

# Almost all of these patterns were derived from the
# 'rfc3986' module: https://github.com/python-hyper/rfc3986

# A single percent-encoded octet, e.g. "%2F".
PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
# Matches input that begins with "<scheme>:" or with "/".  parse_url()
# prepends "//" to anything that does not match.
SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
# Splits a URI into (scheme, authority, path, query, fragment).
URI_RE = re.compile(
    r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
    r"(?://([^\\/?#]*))?"
    r"([^?#]*)"
    r"(?:\?([^#]*))?"
    r"(?:#(.*))?$",
    re.UNICODE | re.DOTALL,
)

IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
HEX_PAT = "[0-9A-Fa-f]{1,4}"
LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT)
_subs = {"hex": HEX_PAT, "ls32": LS32_PAT}
# One alternative per allowed placement of "::" in an IPv6 literal; the
# comment above each entry is the grammar production it implements.
_variations = [
    #                            6( h16 ":" ) ls32
    "(?:%(hex)s:){6}%(ls32)s",
    #                       "::" 5( h16 ":" ) ls32
    "::(?:%(hex)s:){5}%(ls32)s",
    # [               h16 ] "::" 4( h16 ":" ) ls32
    "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
    # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
    "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
    # [ *4( h16 ":" ) h16 ] "::"              ls32
    "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
    # [ *5( h16 ":" ) h16 ] "::"              h16
    "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
    # [ *6( h16 ":" ) h16 ] "::"
    "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
]

# Character-class body (no surrounding brackets) used for zone identifiers.
UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._!\-~"
IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
# Zone identifier introduced by either "%25" or a bare "%".
ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
# Bracketed IPv6 literal with an optional zone identifier.
IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + r")?\]"
REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
# Request target: captures (path, query); any fragment is matched but dropped.
TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")

IPV4_RE = re.compile("^" + IPV4_PAT + "$")
IPV6_RE = re.compile("^" + IPV6_PAT + "$")
IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
# The [2:-2] slice strips the leading r"\[" and trailing r"\]" from the pattern.
BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
# Captures the zone identifier that sits immediately before the closing "]".
ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")

# Splits an authority into (userinfo, host, port).
SUBAUTHORITY_PAT = (u"^(?:(.*)@)?(%s|%s|%s)(?::([0-9]{0,5}))?$") % (
    REG_NAME_PAT,
    IPV4_PAT,
    IPV6_ADDRZ_PAT,
)
SUBAUTHORITY_RE = re.compile(SUBAUTHORITY_PAT, re.UNICODE | re.DOTALL)

# Sets of characters that _encode_invalid_chars() leaves unescaped in each
# URL component.
UNRESERVED_CHARS = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
)
SUB_DELIM_CHARS = set("!$&'()*+,;=")
USERINFO_CHARS = UNRESERVED_CHARS | SUB_DELIM_CHARS | {":"}
PATH_CHARS = USERINFO_CHARS | {"@", "/"}
QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {"?"}
80

81

82
class Url(namedtuple("Url", url_attrs)):
    """
    Data structure for representing an HTTP URL. Used as a return value for
    :func:`parse_url`. Both the scheme and host are normalized as they are
    both case-insensitive according to RFC 3986.
    """

    __slots__ = ()

    def __new__(
        cls,
        scheme=None,
        auth=None,
        host=None,
        port=None,
        path=None,
        query=None,
        fragment=None,
    ):
        """
        Build a ``Url``; every field defaults to ``None``.

        A non-empty ``path`` that lacks a leading ``/`` gets one prepended,
        and a ``scheme`` that is not ``None`` is lower-cased.
        """
        if path and not path.startswith("/"):
            path = "/" + path
        if scheme is not None:
            scheme = scheme.lower()
        return super(Url, cls).__new__(
            cls, scheme, auth, host, port, path, query, fragment
        )

    @property
    def hostname(self):
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self):
        """Absolute path including the query string."""
        # An empty or missing path is reported as "/".
        uri = self.path or "/"

        if self.query is not None:
            uri += "?" + self.query

        return uri

    @property
    def netloc(self):
        """Network location including host and port"""
        # Truthiness test: a port of None (or 0) yields the bare host.
        if self.port:
            return "%s:%d" % (self.host, self.port)
        return self.host

    @property
    def url(self):
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example: ::

            >>> U = parse_url('http://google.com/mail/')
            >>> U.url
            'http://google.com/mail/'
            >>> Url('http', 'username:password', 'host.com', 80,
            ... '/path', 'query', 'fragment').url
            'http://username:password@host.com:80/path?query#fragment'
        """
        scheme, auth, host, port, path, query, fragment = self
        url = u""

        # We use "is not None" because we want things to happen with empty
        # strings (or 0 port)
        if scheme is not None:
            url += scheme + u"://"
        if auth is not None:
            url += auth + u"@"
        if host is not None:
            url += host
        if port is not None:
            url += u":" + str(port)
        if path is not None:
            url += path
        if query is not None:
            url += u"?" + query
        if fragment is not None:
            url += u"#" + fragment

        return url

    def __str__(self):
        """Return the same string as the :attr:`url` property."""
        return self.url
173

174

175
def split_first(s, delims):
    """
    .. deprecated:: 1.25

    Given a string and an iterable of delimiters, split on the first found
    delimiter. Return two split parts and the matched delimiter.

    If not found, then the first part is the full input string.

    Example::

        >>> split_first('foo/bar?baz', '?/=')
        ('foo', 'bar?baz', '/')
        >>> split_first('foo/bar?baz', '123')
        ('foo/bar?baz', '', None)

    Scales linearly with number of delims. Not ideal for large number of delims.
    """
    min_idx = None
    min_delim = None
    for d in delims:
        idx = s.find(d)
        if idx < 0:
            # This delimiter does not occur in ``s``.
            continue

        # Keep whichever delimiter occurs earliest; on a tie the one seen
        # first in ``delims`` wins because the comparison is strict.
        if min_idx is None or idx < min_idx:
            min_idx = idx
            min_delim = d

    if min_idx is None:
        return s, "", None

    # Exactly one character is skipped after the match position.
    return s[:min_idx], s[min_idx + 1 :], min_delim
208

209

210
def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"):
211
    """Percent-encodes a URI component without reapplying
212
    onto an already percent-encoded component.
213
    """
214
    if component is None:
215
        return component
216

217
    component = six.ensure_text(component)
218

219
    # Normalize existing percent-encoded bytes.
220
    # Try to see if the component we're encoding is already percent-encoded
221
    # so we can skip all '%' characters but still encode all others.
222
    component, percent_encodings = PERCENT_RE.subn(
223
        lambda match: match.group(0).upper(), component
224
    )
225

226
    uri_bytes = component.encode("utf-8", "surrogatepass")
227
    is_percent_encoded = percent_encodings == uri_bytes.count(b"%")
228
    encoded_component = bytearray()
229

230
    for i in range(0, len(uri_bytes)):
231
        # Will return a single character bytestring on both Python 2 & 3
232
        byte = uri_bytes[i : i + 1]
233
        byte_ord = ord(byte)
234
        if (is_percent_encoded and byte == b"%") or (
235
            byte_ord < 128 and byte.decode() in allowed_chars
236
        ):
237
            encoded_component += byte
238
            continue
239
        encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper()))
240

241
    return encoded_component.decode(encoding)
242

243

244
def _remove_path_dot_segments(path):
245
    # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
246
    segments = path.split("/")  # Turn the path into a list of segments
247
    output = []  # Initialize the variable to use to store output
248

249
    for segment in segments:
250
        # '.' is the current directory, so ignore it, it is superfluous
251
        if segment == ".":
252
            continue
253
        # Anything other than '..', should be appended to the output
254
        elif segment != "..":
255
            output.append(segment)
256
        # In this case segment == '..', if we can, we should pop the last
257
        # element
258
        elif output:
259
            output.pop()
260

261
    # If the path starts with '/' and the output is empty or the first string
262
    # is non-empty
263
    if path.startswith("/") and (not output or output[0]):
264
        output.insert(0, "")
265

266
    # If the path starts with '/.' or '/..' ensure we add one more empty
267
    # string to add a trailing '/'
268
    if path.endswith(("/.", "/..")):
269
        output.append("")
270

271
    return "/".join(output)
272

273

274
def _normalize_host(host, scheme):
275
    if host:
276
        if isinstance(host, six.binary_type):
277
            host = six.ensure_str(host)
278

279
        if scheme in NORMALIZABLE_SCHEMES:
280
            is_ipv6 = IPV6_ADDRZ_RE.match(host)
281
            if is_ipv6:
282
                match = ZONE_ID_RE.search(host)
283
                if match:
284
                    start, end = match.span(1)
285
                    zone_id = host[start:end]
286

287
                    if zone_id.startswith("%25") and zone_id != "%25":
288
                        zone_id = zone_id[3:]
289
                    else:
290
                        zone_id = zone_id[1:]
291
                    zone_id = "%" + _encode_invalid_chars(zone_id, UNRESERVED_CHARS)
292
                    return host[:start].lower() + zone_id + host[end:]
293
                else:
294
                    return host.lower()
295
            elif not IPV4_RE.match(host):
296
                return six.ensure_str(
297
                    b".".join([_idna_encode(label) for label in host.split(".")])
298
                )
299
    return host
300

301

302
def _idna_encode(name):
303
    if name and any([ord(x) > 128 for x in name]):
304
        try:
305
            import idna
306
        except ImportError:
307
            six.raise_from(
308
                LocationParseError("Unable to parse URL without the 'idna' module"),
309
                None,
310
            )
311
        try:
312
            return idna.encode(name.lower(), strict=True, std3_rules=True)
313
        except idna.IDNAError:
314
            six.raise_from(
315
                LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None
316
            )
317
    return name.lower().encode("ascii")
318

319

320
def _encode_target(target):
    """Percent-encodes a request target so that there are no invalid characters

    ``target`` is split by ``TARGET_RE`` into a path and an optional query;
    each part is escaped with its own allowed-character set and the two are
    re-joined with ``?``.  Any ``#fragment`` suffix is not part of the
    result.
    """
    path, query = TARGET_RE.match(target).groups()
    target = _encode_invalid_chars(path, PATH_CHARS)
    query = _encode_invalid_chars(query, QUERY_CHARS)
    if query is not None:
        target += "?" + query
    return target
328

329

330
def parse_url(url):
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.
    This parser is RFC 3986 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.
    :raises LocationParseError: if the url cannot be parsed or its port is
        outside the range 0-65535.

    Partly backwards-compatible with :mod:`urlparse`.

    Example::

        >>> parse_url('http://google.com/mail/')
        Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
        >>> parse_url('google.com:80')
        Url(scheme=None, host='google.com', port=80, path=None, ...)
        >>> parse_url('/foo?bar')
        Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Empty
        return Url()

    source_url = url
    if not SCHEME_RE.search(url):
        # No scheme and no leading "/": treat the input as an authority.
        url = "//" + url

    try:
        scheme, authority, path, query, fragment = URI_RE.match(url).groups()
        normalize_uri = scheme is None or scheme.lower() in NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        if authority:
            auth, host, port = SUBAUTHORITY_RE.match(authority).groups()
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, USERINFO_CHARS)
            if port == "":
                port = None
        else:
            auth, host, port = None, None, None

        if port is not None:
            port = int(port)
            if not (0 <= port <= 65535):
                # Report the caller's original input, not the internally
                # rewritten ``url`` (which may have "//" prepended).
                raise LocationParseError(source_url)

        host = _normalize_host(host, scheme)

        if normalize_uri and path:
            path = _remove_path_dot_segments(path)
            path = _encode_invalid_chars(path, PATH_CHARS)
        if normalize_uri and query:
            query = _encode_invalid_chars(query, QUERY_CHARS)
        if normalize_uri and fragment:
            fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)

    except (ValueError, AttributeError):
        # AttributeError covers a failed regex match (``None.groups()``).
        return six.raise_from(LocationParseError(source_url), None)

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        if query is not None or fragment is not None:
            path = ""
        else:
            path = None

    # Ensure that each part of the URL is a `str` for
    # backwards compatibility.
    if isinstance(url, six.text_type):
        ensure_func = six.ensure_text
    else:
        ensure_func = six.ensure_str

    def ensure_type(x):
        return x if x is None else ensure_func(x)

    return Url(
        scheme=ensure_type(scheme),
        auth=ensure_type(auth),
        host=ensure_type(host),
        port=port,
        path=ensure_type(path),
        query=ensure_type(query),
        fragment=ensure_type(fragment),
    )
423

424

425
def get_host(url):
    """
    Deprecated. Use :func:`parse_url` instead.

    Returns a ``(scheme, hostname, port)`` tuple for ``url``; the scheme
    falls back to ``"http"`` when the parsed url has none.
    """
    p = parse_url(url)
    return p.scheme or "http", p.hostname, p.port
431

432
# Site navigation captured with the page: Product / Resources / Company