# Path: blob/master/venv/Lib/site-packages/idna/core.py
# 811 views
from . import idnadata
import bisect
import unicodedata
import re
import sys
from .intranges import intranges_contain

# Value of unicodedata.combining() that valid_contextj() treats as a virama.
_virama_combining_class = 9
# Byte prefix that marks a Punycode-encoded label (A-label).
_alabel_prefix = b'xn--'
# Label separators accepted in non-strict mode.
_unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]')

if sys.version_info[0] >= 3:
    # Python 3 has no unicode()/unichr(); alias them so the code below can
    # use one spelling on both major versions.
    unicode = str
    unichr = chr


class IDNAError(UnicodeError):
    """ Base exception for all IDNA-encoding related problems """
    pass


class IDNABidiError(IDNAError):
    """ Exception when bidirectional requirements are not satisfied """
    pass


class InvalidCodepoint(IDNAError):
    """ Exception when a disallowed or unallocated codepoint is used """
    pass


class InvalidCodepointContext(IDNAError):
    """ Exception when the codepoint is not valid in the context it is used """
    pass


def _combining_class(cp):
    """Return the canonical combining class of integer codepoint *cp*.

    Raises ValueError when the class is 0 and the character has no name in
    ``unicodedata`` (``unicodedata.name`` itself raises ValueError for
    unnamed characters; the explicit raise covers a falsy name).
    """
    v = unicodedata.combining(unichr(cp))
    if v == 0:
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return v


def _is_script(cp, script):
    """Return whether character *cp* lies in ``idnadata.scripts[script]``."""
    return intranges_contain(ord(cp), idnadata.scripts[script])


def _punycode(s):
    """Encode the text *s* with the ``punycode`` codec."""
    return s.encode('punycode')


def _unot(s):
    """Format integer codepoint *s* in ``U+XXXX`` notation."""
    return 'U+{0:04X}'.format(s)


def valid_label_length(label):
    """Return True when *label* is at most 63 items long."""
    return len(label) <= 63


def valid_string_length(label, trailing_dot):
    """Return True when *label* fits the overall length limit.

    The limit is 254 when *trailing_dot* is true and 253 otherwise.
    """
    return len(label) <= (254 if trailing_dot else 253)


def check_bidi(label, check_ltr=False):
    """Validate *label* against the six Bidi rules checked below.

    The rules are only applied when the label contains a codepoint of
    directionality R, AL or AN, or when *check_ltr* is true.

    Returns True on success; raises IDNABidiError on any violation or when a
    codepoint has no known directionality.
    """
    # Bidi rules should only be applied if string contains RTL characters.
    # The scan is not cut short on the first hit: every codepoint must still
    # be checked for an unknown directionality.
    bidi_label = False
    for (idx, cp) in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)
        if direction == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(
                'Unknown directionality in label {0} at position {1}'.format(
                    repr(label), idx))
        if direction in ('R', 'AL', 'AN'):
            bidi_label = True
    if not bidi_label and not check_ltr:
        return True

    # Bidi rule 1
    direction = unicodedata.bidirectional(label[0])
    if direction in ('R', 'AL'):
        rtl = True
    elif direction == 'L':
        rtl = False
    else:
        raise IDNABidiError(
            'First codepoint in label {0} must be directionality L, R or AL'.format(
                repr(label)))

    valid_ending = False
    number_type = False
    for (idx, cp) in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)

        if rtl:
            # Bidi rule 2
            if direction not in ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET',
                                 'ON', 'BN', 'NSM'):
                raise IDNABidiError(
                    'Invalid direction for codepoint at position {0} in a right-to-left label'.format(idx))
            # Bidi rule 3 (NSM codepoints leave the current verdict untouched)
            if direction in ('R', 'AL', 'EN', 'AN'):
                valid_ending = True
            elif direction != 'NSM':
                valid_ending = False
            # Bidi rule 4
            if direction in ('AN', 'EN'):
                if not number_type:
                    number_type = direction
                elif number_type != direction:
                    raise IDNABidiError(
                        'Can not mix numeral types in a right-to-left label')
        else:
            # Bidi rule 5
            if direction not in ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN',
                                 'NSM'):
                raise IDNABidiError(
                    'Invalid direction for codepoint at position {0} in a left-to-right label'.format(idx))
            # Bidi rule 6 (NSM codepoints leave the current verdict untouched)
            if direction in ('L', 'EN'):
                valid_ending = True
            elif direction != 'NSM':
                valid_ending = False

    if not valid_ending:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True


def check_initial_combiner(label):
    """Raise IDNAError if *label* starts with a mark (category ``M*``).

    Returns True otherwise. *label* must be non-empty.
    """
    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True


def check_hyphen_ok(label):
    """Raise IDNAError for disallowed hyphen placement in *label*.

    Rejects ``--`` in the 3rd and 4th positions and a hyphen at either end.
    Returns True otherwise. *label* must be non-empty.
    """
    if label[2:4] == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    if label[0] == '-' or label[-1] == '-':
        raise IDNAError('Label must not start or end with a hyphen')
    return True


def check_nfc(label):
    """Raise IDNAError unless *label* is already in Normalization Form C.

    Unlike the other ``check_*`` helpers this one returns nothing on success.
    """
    if unicodedata.normalize('NFC', label) != label:
        raise IDNAError('Label must be in Normalization Form C')


def _first_non_transparent_joins(label, positions, wanted):
    """Test the first non-transparent codepoint met along *positions*.

    Codepoints whose joining type is T are skipped. The first other
    codepoint decides the result: True when its joining type is in *wanted*,
    False otherwise. Returns False when *positions* is exhausted.
    """
    transparent = ord('T')
    for i in positions:
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == transparent:
            continue
        # Stop at the first non-transparent codepoint rather than scanning
        # further: only a run of T codepoints may separate the joiner from
        # the L/D (before) or R/D (after) codepoint.
        return joining_type in wanted
    return False


def valid_contextj(label, pos):
    """Return whether the joiner at ``label[pos]`` is allowed in its context.

    U+200C (ZERO WIDTH NON-JOINER) is accepted directly after a virama, or
    when it sits between a codepoint of joining type L/D and one of joining
    type R/D, separated from each only by joining type T codepoints.
    U+200D (ZERO WIDTH JOINER) is accepted only directly after a virama.
    Any other codepoint yields False.

    May raise ValueError (from ``_combining_class``) when the preceding
    codepoint is unknown to ``unicodedata``.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x200c:

        if pos > 0:
            if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                return True

        if not _first_non_transparent_joins(
                label, range(pos - 1, -1, -1), (ord('L'), ord('D'))):
            return False

        return _first_non_transparent_joins(
            label, range(pos + 1, len(label)), (ord('R'), ord('D')))

    if cp_value == 0x200d:

        if pos > 0:
            if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                return True
        return False

    return False


def valid_contexto(label, pos, exception=False):
    """Return whether the codepoint at ``label[pos]`` is allowed in context.

    Handles U+00B7, U+0375, U+05F3/U+05F4, U+30FB, U+0660..U+0669 and
    U+06F0..U+06F9. Any other codepoint yields False. The *exception*
    argument is accepted for interface compatibility and is not used.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x00b7:
        # Must sit between two 'l' (U+006C) characters.
        if 0 < pos < len(label) - 1:
            if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
                return True
        return False

    elif cp_value == 0x0375:
        # Must be followed by a Greek-script character.
        if pos < len(label) - 1 and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    elif cp_value == 0x05f3 or cp_value == 0x05f4:
        # Must be preceded by a Hebrew-script character.
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    elif cp_value == 0x30fb:
        # Needs at least one Hiragana, Katakana or Han character in the label.
        for cp in label:
            if cp == u'\u30fb':
                continue
            if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
                return True
        return False

    elif 0x660 <= cp_value <= 0x669:
        # Must not share a label with any U+06F0..U+06F9 digit.
        return not any(0x6f0 <= ord(cp) <= 0x06f9 for cp in label)

    elif 0x6f0 <= cp_value <= 0x6f9:
        # Must not share a label with any U+0660..U+0669 digit.
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    # No contextual rule for this codepoint: answer with an explicit False
    # instead of falling off the end and returning None.
    return False


def check_label(label):
    """Validate a single label, raising an IDNAError subclass on failure.

    Bytes input is decoded as UTF-8 first. The label must be non-empty, in
    NFC, pass the hyphen and initial-combiner checks, consist only of
    PVALID codepoints or CONTEXTJ/CONTEXTO codepoints valid in context, and
    pass ``check_bidi``. Returns nothing.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Guard only the lookup. IDNAError derives from UnicodeError and
            # therefore from ValueError, so raising InvalidCodepointContext
            # inside this try would be caught below and misreported as an
            # unknown adjacent codepoint.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError(
                    'Unknown codepoint adjacent to joiner {0} at position {1} in {2}'.format(
                        _unot(cp_value), pos + 1, repr(label)))
            if not joiner_ok:
                raise InvalidCodepointContext(
                    'Joiner {0} not allowed at position {1} in {2}'.format(
                        _unot(cp_value), pos + 1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    'Codepoint {0} not allowed at position {1} in {2}'.format(
                        _unot(cp_value), pos + 1, repr(label)))
        else:
            raise InvalidCodepoint(
                'Codepoint {0} at position {1} of {2} not allowed'.format(
                    _unot(cp_value), pos + 1, repr(label)))

    check_bidi(label)


def alabel(label):
    """Convert a text label to its ASCII (A-label) byte form.

    A label that is already pure ASCII is validated through ``ulabel`` and
    returned as bytes. Any other label is checked with ``check_label``,
    Punycode-encoded and prefixed with ``xn--``.

    Raises IDNAError when the label is invalid or longer than 63 bytes.
    """
    try:
        ascii_label = label.encode('ascii')
    except UnicodeEncodeError:
        # Not plain ASCII: fall through to the Punycode path below.
        pass
    else:
        ulabel(ascii_label)
        if not valid_label_length(ascii_label):
            raise IDNAError('Label too long')
        return ascii_label

    if not label:
        raise IDNAError('No Input')

    label = unicode(label)
    check_label(label)
    label = _punycode(label)
    label = _alabel_prefix + label

    if not valid_label_length(label):
        raise IDNAError('Label too long')

    return label


def ulabel(label):
    """Convert a label to its Unicode (U-label) text form.

    Non-ASCII text is validated with ``check_label`` and returned unchanged.
    ASCII input (text or bytes) is lower-cased; if it carries the ``xn--``
    prefix the remainder is Punycode-decoded and validated, otherwise the
    label itself is validated and returned as text.

    Raises IDNAError when the label is malformed or fails validation.
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label

    label = label.lower()
    if not label.startswith(_alabel_prefix):
        check_label(label)
        return label.decode('ascii')

    label = label[len(_alabel_prefix):]
    if not label:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    if label.decode('ascii')[-1] == '-':
        raise IDNAError('A-label must not end with a hyphen')

    try:
        label = label.decode('punycode')
    except UnicodeError:
        # Report a malformed Punycode payload through the package's own
        # exception type; IDNAError is itself a UnicodeError, so callers
        # catching UnicodeError keep working.
        raise IDNAError('Invalid A-label')
    check_label(label)
    return label


def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in ``uts46data`` and, depending on the row's
    status code, kept ("V"; "D" unless *transitional*; "3" without a
    replacement when *std3_rules* is false), replaced by the row's mapping
    ("M"; "3" when *std3_rules* is false; "D" when *transitional*), or
    dropped ("I"). Anything else raises InvalidCodepoint. The result is
    returned NFC-normalised.
    """
    from .uts46data import uts46data
    pieces = []
    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # The first 256 rows are indexed directly by codepoint; later
            # rows are range starts located by bisection.
            uts46row = uts46data[code_point if code_point < 256 else
                                 bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None
        except IndexError:
            # A failed table lookup is reported like a disallowed codepoint.
            status = replacement = None
        if (status == "V" or
                (status == "D" and not transitional) or
                (status == "3" and not std3_rules and replacement is None)):
            pieces.append(char)
        elif replacement is not None and (status == "M" or
                                          (status == "3" and not std3_rules) or
                                          (status == "D" and transitional)):
            pieces.append(replacement)
        elif status != "I":
            raise InvalidCodepoint(
                "Codepoint {0} not allowed at position {1} in {2}".format(
                    _unot(code_point), pos + 1, repr(domain)))
    return unicodedata.normalize("NFC", u"".join(pieces))


def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
    """Encode the domain name *s* to its ASCII byte form.

    Bytes input is decoded as ASCII first. With *uts46* the name is passed
    through ``uts46_remap`` (using *std3_rules* and *transitional*). Labels
    are split on ``.`` only when *strict*, otherwise on any separator in
    ``_unicode_dots_re``. A single trailing dot is preserved.

    Raises IDNAError for an empty domain, an empty or invalid label, or a
    result that exceeds the overall length limit.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)
    trailing_dot = False
    result = []
    if strict:
        labels = s.split('.')
    else:
        labels = _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')
    if labels[-1] == '':
        del labels[-1]
        trailing_dot = True
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError('Empty label')
        result.append(encoded)
    if trailing_dot:
        # An empty final element makes the join emit the trailing dot.
        result.append(b'')
    joined = b'.'.join(result)
    if not valid_string_length(joined, trailing_dot):
        raise IDNAError('Domain too long')
    return joined


def decode(s, strict=False, uts46=False, std3_rules=False):
    """Decode the domain name *s* to its Unicode text form.

    Bytes input is decoded as ASCII first. With *uts46* the name is passed
    through ``uts46_remap`` (non-transitional, using *std3_rules*). Labels
    are split on ``.`` only when *strict*, otherwise on any separator in
    ``_unicode_dots_re``. A single trailing dot is preserved.

    Raises IDNAError for an empty domain or an empty or invalid label.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, False)
    trailing_dot = False
    result = []
    if strict:
        labels = s.split(u'.')
    else:
        labels = _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')
    if not labels[-1]:
        del labels[-1]
        trailing_dot = True
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError('Empty label')
        result.append(decoded)
    if trailing_dot:
        # An empty final element makes the join emit the trailing dot.
        result.append(u'')
    return u'.'.join(result)