Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/build/generate_re_casefix.py
12 views
1
#! /usr/bin/env python3
2
# This script generates Lib/re/_casefix.py.
3
4
import collections
5
import sys
6
import unicodedata
7
8
# Path of this script; it is interpolated into the "Auto-generated by" header
# line of the generated file.
SCRIPT_NAME = 'Tools/build/generate_re_casefix.py'
9
10
def update_file(file, content):
    """Write *content* to *file* unless the file already holds exactly that text.

    The file is read and written as UTF-8.  A file that cannot be opened or
    decoded (OSError / ValueError) is treated as out of date and is written.

    Returns True if the file was written, False if it was left untouched.
    """
    try:
        with open(file, 'r', encoding='utf-8') as existing:
            unchanged = existing.read() == content
    except (OSError, ValueError):
        # Nothing usable on disk: fall through and (re)create the file.
        unchanged = False

    if unchanged:
        return False

    with open(file, 'w', encoding='utf-8') as target:
        target.write(content)
    return True
20
21
# Template of the generated module.  It is an f-string only so that
# SCRIPT_NAME is filled in immediately; the literal dict braces must
# therefore be doubled ('{{' / '}}'), otherwise the f-string parser would
# treat '%s' as an expression and fail with a SyntaxError.  The remaining
# '%s' placeholder is filled in later, via the '%' operator, with the
# rendered table rows.
re_casefix_template = f"""\
# Auto-generated by {SCRIPT_NAME}.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {{
%s
}}
"""
30
31
def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name yield 'U+' followed by the code in uppercase
    hexadecimal, zero-padded to at least four digits.
    """
    fallback = 'U+%04X' % i
    return unicodedata.name(chr(i), fallback)
33
34
class hexint(int):
    """An int whose repr() is '0x'-prefixed hexadecimal, zero-padded to a
    total width of six characters (for example 0x00df)."""

    def __repr__(self):
        return format(self, '#06x')
37
38
def alpha(i):
    """Return a printable form of code point *i*.

    Alphabetic characters are returned unchanged; any other character is
    returned as its ascii() representation with the enclosing quotes removed.
    """
    char = chr(i)
    if char.isalpha():
        return char
    quoted = ascii(char)
    return quoted[1:-1]
41
42
43
def main(outfile='Lib/re/_casefix.py'):
    """Regenerate the extra-case table and write it to *outfile*.

    Every code point up to sys.maxunicode is grouped by its uppercase form.
    For each group whose members have more than one distinct lowercase form,
    each lowercase code is mapped to the other lowercase codes of that group.
    The rendered table is written through update_file(), so an up-to-date
    file is left untouched.

    If any such group contains a lowercase code above 0xffff, the offending
    codes are listed on stderr and the process exits with status 1.
    NOTE(review): presumably the consumer of the table cannot handle such
    entries -- confirm.
    """
    # Group all characters by their uppercase form and keep only the groups
    # with more than one member.
    by_upper = collections.defaultdict(str)
    for code in range(sys.maxunicode + 1):
        char = chr(code)
        by_upper[char.upper()] += char
    shared_upper = [group for group in by_upper.values() if len(group) > 1]

    # For each group, the sorted codes of its distinct lowercase forms;
    # groups that collapse to a single lowercase form need no entry.
    equivalent_lower_codes = []
    for group in shared_upper:
        lower_codes = {ord(char.lower()) for char in group}
        if len(lower_codes) > 1:
            equivalent_lower_codes.append(sorted(lower_codes))

    # Collect every group that contains a code above 0xffff, together with
    # the uppercase code of the first such member when it is a single
    # character.
    bad_codes = []
    for codes in equivalent_lower_codes:
        first_wide = next((code for code in codes if code > 0xffff), None)
        if first_wide is None:
            continue
        bad_codes.extend(codes)
        try:
            bad_codes.append(ord(chr(first_wide).upper()))
        except (ValueError, TypeError):
            # ord() rejects an uppercase form that is not one character.
            pass
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for code in sorted(bad_codes):
            print(" '%s' (U+%04x, %s)" % (alpha(code), code, uname(code)),
                  file=sys.stderr)
        sys.exit(1)

    # Map each lowercase code to the other codes of its group.
    mapping = {}
    for codes in equivalent_lower_codes:
        for code in codes:
            mapping[code] = tuple(other for other in codes if other != code)

    # Render two lines per entry: a comment with the character names, then
    # the dict item itself followed by the characters in printable form.
    lines = []
    for code, others in sorted(mapping.items()):
        other_names = ', '.join(uname(other) for other in others)
        lines.append(' # %s: %s' % (uname(code), other_names))

        hex_others = tuple(hexint(other) for other in others)
        other_chars = ''.join(alpha(other) for other in others)
        lines.append(" %r: %r, # '%s': '%s'" % (
            hexint(code), hex_others, alpha(code), other_chars))

    update_file(outfile, re_casefix_template % '\n'.join(lines))
92
93
94
if __name__ == '__main__':
    # Command-line arguments are forwarded positionally to main(); the only
    # parameter it accepts is the output path.  `sys` is already imported at
    # the top of the module, so it is not imported again here.
    main(*sys.argv[1:])
97
98