CoCalc -- core.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/idna/core.py
⁸¹¹ views
1
from . import idnadata
2
import bisect
3
import unicodedata
4
import re
5
import sys
6
from .intranges import intranges_contain
7

8
# Canonical combining class value that identifies a virama; compared against
# unicodedata.combining() results by the CONTEXTJ checks.
_virama_combining_class = 9
# Prefix that marks a Punycode-encoded label (an A-label).
_alabel_prefix = b'xn--'
# Label separators accepted in non-strict mode: U+002E FULL STOP, U+3002
# IDEOGRAPHIC FULL STOP, U+FF0E FULLWIDTH FULL STOP and U+FF61 HALFWIDTH
# IDEOGRAPHIC FULL STOP.
_unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]')

# Python 3 has neither a separate ``unicode`` type nor ``unichr``; alias them
# so the rest of the module can use the Python 2 names on both versions.
if sys.version_info[0] >= 3:
    unicode = str
    unichr = chr
15

16
class IDNAError(UnicodeError):
    """Base exception for all IDNA-encoding related problems.

    Derives from UnicodeError (and therefore ValueError), so callers that
    catch either of those also catch every IDNA error.
    """
19

20

21
class IDNABidiError(IDNAError):
    """Exception when bidirectional requirements are not satisfied."""
24

25

26
class InvalidCodepoint(IDNAError):
    """Exception when a disallowed or unallocated codepoint is used."""
29

30

31
class InvalidCodepointContext(IDNAError):
    """Exception when the codepoint is not valid in the context it is used."""
34

35

36
def _combining_class(cp):
    """Return the canonical combining class of the integer codepoint ``cp``.

    Raises ValueError when the combining class is 0 and unicodedata has no
    name for the character, since a class of 0 is then not trustworthy.
    """
    char = unichr(cp)
    v = unicodedata.combining(char)
    # Pass a default to name(): without one it raises its own ValueError
    # before the explicit, more descriptive error below can be reached.
    if v == 0 and not unicodedata.name(char, None):
        raise ValueError("Unknown character in unicodedata")
    return v
42

43
def _is_script(cp, script):
    """Return whether character ``cp`` lies in the ranges of ``idnadata.scripts[script]``.

    ``cp`` is a single character (not an integer); ``script`` is a key of
    ``idnadata.scripts`` such as 'Greek' or 'Hebrew'.
    """
    return intranges_contain(ord(cp), idnadata.scripts[script])
45

46
def _punycode(s):
    """Return the Punycode encoding of the text ``s`` as bytes (no ``xn--`` prefix)."""
    return s.encode('punycode')
48

49
def _unot(s):
    """Format the integer codepoint ``s`` in ``U+XXXX`` notation (at least four hex digits)."""
    return 'U+{0:04X}'.format(s)
51

52

53
def valid_label_length(label):
    """Return True when ``label`` is at most 63 items long, else False."""
    return len(label) <= 63
58

59

60
def valid_string_length(label, trailing_dot):
    """Return True when the whole domain ``label`` is within the length limit.

    The limit is 253, or 254 when ``trailing_dot`` is true (the extra item
    accounts for the trailing separator).
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit
65

66

67
def check_bidi(label, check_ltr=False):
    """Validate ``label`` against the six Bidi rules.

    The rules are only enforced when the label contains a right-to-left
    character (bidirectional class R, AL or AN) or when ``check_ltr`` is
    true; otherwise the label is accepted immediately.

    Returns True when the label passes.
    Raises IDNABidiError when a codepoint has no known directionality or
    any of the rules is violated.
    """
    # Look up every codepoint's bidirectional class once; both passes use it.
    directions = []
    bidi_label = False
    for (idx, cp) in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)
        if direction == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {0} at position {1}'.format(repr(label), idx))
        if direction in ('R', 'AL', 'AN'):
            bidi_label = True
        directions.append(direction)

    # Bidi rules should only be applied if string contains RTL characters
    if not bidi_label and not check_ltr:
        return True

    # Bidi rule 1: the first codepoint decides whether the label is RTL or LTR
    first = directions[0]
    if first in ('R', 'AL'):
        rtl = True
    elif first == 'L':
        rtl = False
    else:
        raise IDNABidiError('First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))

    valid_ending = False
    number_type = None
    for (idx, direction) in enumerate(directions, 1):
        if rtl:
            # Bidi rule 2
            if direction not in ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                raise IDNABidiError('Invalid direction for codepoint at position {0} in a right-to-left label'.format(idx))
            # Bidi rule 3: trailing NSM codepoints do not change the verdict
            if direction in ('R', 'AL', 'EN', 'AN'):
                valid_ending = True
            elif direction != 'NSM':
                valid_ending = False
            # Bidi rule 4: the first numeral class seen fixes the only one allowed
            if direction in ('AN', 'EN'):
                if not number_type:
                    number_type = direction
                elif number_type != direction:
                    raise IDNABidiError('Can not mix numeral types in a right-to-left label')
        else:
            # Bidi rule 5
            if direction not in ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                raise IDNABidiError('Invalid direction for codepoint at position {0} in a left-to-right label'.format(idx))
            # Bidi rule 6: trailing NSM codepoints do not change the verdict
            if direction in ('L', 'EN'):
                valid_ending = True
            elif direction != 'NSM':
                valid_ending = False

    if not valid_ending:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
125

126

127
def check_initial_combiner(label):
    """Reject labels whose first character is a combining mark.

    Returns True when the first character's general category is not one of
    the mark categories (those starting with 'M').
    Raises IDNAError otherwise.
    """
    if unicodedata.category(label[0]).startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
132

133

134
def check_hyphen_ok(label):
    """Validate the placement of hyphens in the text ``label``.

    Returns True when the label has no hyphens in both the 3rd and 4th
    positions and neither starts nor ends with a hyphen.
    Raises IDNAError otherwise.
    """
    if label[2:4] == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    # startswith/endswith (rather than indexing) also tolerate an empty label.
    if label.startswith('-') or label.endswith('-'):
        raise IDNAError('Label must not start or end with a hyphen')
    return True
141

142

143
def check_nfc(label):
    """Raise IDNAError unless ``label`` is already in Normalization Form C.

    Returns None when the label is unchanged by NFC normalization.
    """
    if unicodedata.normalize('NFC', label) != label:
        raise IDNAError('Label must be in Normalization Form C')
147

148

149
def valid_contextj(label, pos):
    """Check the contextual rule for the joiner character at ``label[pos]``.

    Handles ZERO WIDTH NON-JOINER (U+200C) and ZERO WIDTH JOINER (U+200D):

    * Either joiner is allowed directly after a codepoint whose canonical
      combining class is virama.
    * U+200C is additionally allowed when, skipping only Joining_Type T
      codepoints, it is preceded by a Joining_Type L or D codepoint and
      followed by a Joining_Type R or D codepoint.

    Returns True when the joiner is allowed, False otherwise (including when
    ``label[pos]`` is not a joiner at all).
    Raises ValueError (from ``_combining_class``) when the preceding
    codepoint is unknown to unicodedata.
    """
    cp_value = ord(label[pos])

    if cp_value not in (0x200c, 0x200d):
        return False

    # Both joiners are permitted immediately after a virama.
    if pos > 0:
        if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
            return True

    if cp_value == 0x200d:
        return False

    transparent = ord('T')

    # Backwards: (Joining_Type:{L,D})(Joining_Type:T)* must precede the joiner.
    # Only transparent codepoints may be skipped, so the scan stops at the
    # first non-transparent one instead of searching further back.
    ok = False
    for i in range(pos - 1, -1, -1):
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == transparent:
            continue
        ok = joining_type in (ord('L'), ord('D'))
        break

    if not ok:
        return False

    # Forwards: (Joining_Type:T)*(Joining_Type:{R,D}) must follow the joiner,
    # again stopping at the first non-transparent codepoint.
    for i in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == transparent:
            continue
        return joining_type in (ord('R'), ord('D'))
    return False
191

192

193
def valid_contexto(label, pos, exception=False):
    """Check the contextual rule for the CONTEXTO codepoint at ``label[pos]``.

    Rules implemented, by codepoint:

    * U+00B7: must sit between two U+006C ('l') characters.
    * U+0375: must be followed by a Greek-script character.
    * U+05F3 / U+05F4: must be preceded by a Hebrew-script character.
    * U+30FB: the label must contain at least one Hiragana, Katakana or Han
      character.
    * U+0660..U+0669: the label must not contain any of U+06F0..U+06F9.
    * U+06F0..U+06F9: the label must not contain any of U+0660..U+0669.

    ``exception`` is accepted for interface compatibility and is not used.

    Returns True when the codepoint is allowed in its context, False
    otherwise (including for codepoints that have no rule here).
    """
    cp_value = ord(label[pos])

    if cp_value == 0x00b7:
        return (0 < pos < len(label) - 1 and
                ord(label[pos - 1]) == 0x006c and
                ord(label[pos + 1]) == 0x006c)

    if cp_value == 0x0375:
        if pos < len(label) - 1 and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    if cp_value == 0x05f3 or cp_value == 0x05f4:
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    if cp_value == 0x30fb:
        for cp in label:
            if cp == u'\u30fb':
                continue
            if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
                return True
        return False

    if 0x660 <= cp_value <= 0x669:
        return not any(0x6f0 <= ord(cp) <= 0x06f9 for cp in label)

    if 0x6f0 <= cp_value <= 0x6f9:
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    # No rule for this codepoint: report it as not allowed explicitly rather
    # than falling off the end and returning None.
    return False
232

233

234
def check_label(label):
    """Validate a single label, raising on the first violation.

    ``label`` may be text, or bytes/bytearray (decoded as UTF-8 first).
    Checks, in order: non-empty, NFC, hyphen placement, no initial combining
    mark, every codepoint is PVALID or satisfies its CONTEXTJ/CONTEXTO rule,
    and finally the Bidi rules.

    Returns None on success.
    Raises:
        IDNAError: empty label, failed NFC/hyphen/combiner check, or an
            unknown codepoint next to a joiner.
        InvalidCodepointContext: a CONTEXTJ/CONTEXTO codepoint is used in a
            context that its rule does not allow.
        InvalidCodepoint: a codepoint is in none of the permitted classes.
        IDNABidiError: raised by ``check_bidi``.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Only the rule evaluation is guarded. InvalidCodepointContext is
            # itself a ValueError (via UnicodeError), so raising it inside the
            # try block would be caught below and reported as the wrong error.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError('Unknown codepoint adjacent to joiner {0} at position {1} in {2}'.format(
                    _unot(cp_value), pos+1, repr(label)))
            if not joiner_ok:
                raise InvalidCodepointContext('Joiner {0} not allowed at position {1} in {2}'.format(
                    _unot(cp_value), pos+1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext('Codepoint {0} not allowed at position {1} in {2}'.format(_unot(cp_value), pos+1, repr(label)))
        else:
            raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

    check_bidi(label)
264

265

266
def alabel(label):
    """Convert a single text label to its A-label (ASCII bytes) form.

    A label that is already pure ASCII is validated through ``ulabel`` and
    returned ASCII-encoded. Any other label is validated with
    ``check_label``, Punycode-encoded and given the ``xn--`` prefix.

    Returns the label as bytes.
    Raises IDNAError (or a subclass) when validation fails or the resulting
    label is longer than ``valid_label_length`` allows.
    """
    # Guard only the encode call so that a UnicodeEncodeError can only mean
    # "this label is not ASCII", never something raised during validation.
    try:
        ascii_label = label.encode('ascii')
    except UnicodeEncodeError:
        pass
    else:
        ulabel(ascii_label)
        if not valid_label_length(ascii_label):
            raise IDNAError('Label too long')
        return ascii_label

    if not label:
        raise IDNAError('No Input')

    label = unicode(label)
    check_label(label)
    label = _alabel_prefix + _punycode(label)

    if not valid_label_length(label):
        raise IDNAError('Label too long')

    return label
289

290

291
def ulabel(label):
    """Convert a single label to its U-label (text) form.

    ``label`` may be text or bytes/bytearray. Non-ASCII text is validated
    with ``check_label`` and returned unchanged. ASCII input is lower-cased;
    if it carries the ``xn--`` prefix the remainder is Punycode-decoded and
    validated, otherwise the label itself is validated and returned as text.

    Returns the label as text.
    Raises IDNAError (or a subclass) when the label is malformed, cannot be
    Punycode-decoded, or fails ``check_label``.
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label

    label = label.lower()
    if not label.startswith(_alabel_prefix):
        check_label(label)
        return label.decode('ascii')

    label = label[len(_alabel_prefix):]
    if not label:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    if label.endswith(b'-'):
        raise IDNAError('A-label must not end with a hyphen')

    # Report undecodable Punycode as an IDNAError like every other failure,
    # instead of leaking the codec's bare UnicodeError to the caller.
    try:
        label = label.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(label)
    return label
314

315

316
def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing.

    Each character of ``domain`` is looked up in ``uts46data`` and, depending
    on the row's status and the ``std3_rules`` / ``transitional`` flags, is
    kept, replaced by the row's replacement text, or dropped (status "I").

    Returns the re-mapped text normalized to NFC.
    Raises InvalidCodepoint for a character that is neither kept, replaced
    nor dropped, or whose table lookup fails.
    """
    from .uts46data import uts46data

    def _disallowed(code_point, pos):
        # Build the error for the offending character (1-based position).
        return InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
            _unot(code_point), pos + 1, repr(domain)))

    output = []
    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # Code points below 256 index the table directly; larger ones are
            # located by bisection (presumably the rows are sorted by their
            # starting code point -- verify against uts46data).
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
        except IndexError:
            raise _disallowed(code_point, pos)
        replacement = uts46row[2] if len(uts46row) == 3 else None
        if (status == "V" or
                (status == "D" and not transitional) or
                (status == "3" and not std3_rules and replacement is None)):
            output.append(char)
        elif replacement is not None and (status == "M" or
                (status == "3" and not std3_rules) or
                (status == "D" and transitional)):
            output.append(replacement)
        elif status != "I":
            raise _disallowed(code_point, pos)
    return unicodedata.normalize("NFC", u"".join(output))
342

343

344
def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
    """Encode the domain name ``s`` into its A-label form.

    Args:
        s: the domain as text, or as ASCII bytes/bytearray.
        strict: when true, only '.' separates labels; otherwise every
            separator matched by ``_unicode_dots_re`` is accepted.
        uts46: when true, run ``uts46_remap`` on the input first.
        std3_rules, transitional: passed through to ``uts46_remap``.

    Returns the encoded domain as bytes, labels joined with b'.'; a trailing
    separator in the input is preserved.
    Raises IDNAError (or a subclass) for an empty domain, an empty or
    invalid label, or a result that exceeds ``valid_string_length``.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the input ended with a separator.
    trailing_dot = labels[-1] == ''
    if trailing_dot:
        del labels[-1]

    result = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError('Empty label')
        result.append(encoded)
    if trailing_dot:
        result.append(b'')

    domain = b'.'.join(result)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError('Domain too long')
    return domain
373

374

375
def decode(s, strict=False, uts46=False, std3_rules=False):
    """Decode the domain name ``s`` into its U-label (text) form.

    Args:
        s: the domain as text, or as ASCII bytes/bytearray.
        strict: when true, only '.' separates labels; otherwise every
            separator matched by ``_unicode_dots_re`` is accepted.
        uts46: when true, run ``uts46_remap`` (non-transitional) first.
        std3_rules: passed through to ``uts46_remap``.

    Returns the decoded domain as text, labels joined with '.'; a trailing
    separator in the input is preserved.
    Raises IDNAError (or a subclass) for an empty domain or an empty or
    invalid label.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(u'.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    result = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError('Empty label')
        result.append(decoded)
    if trailing_dot:
        result.append(u'')
    return u'.'.join(result)
401

402
Product

Resources

Company