Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/unicode/gencodec.py
12 views
1
""" Unicode Mapping Parser and Codec Generator.
2
3
This script parses Unicode mapping files as available from the Unicode
4
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5
modules from them. The codecs use the standard character mapping codec
6
to actually apply the mapping.
7
8
Synopsis: gencodec.py dir codec_prefix
9
10
All files in dir are scanned and those producing non-empty mappings
11
will be written to <codec_prefix><mapname>.py with <mapname> being the
12
first part of the map's filename ('a' in a.b.c.txt) converted to
13
lowercase with hyphens replaced by underscores.
14
15
The tool also writes marshalled versions of the mapping tables to the
16
same location (with .mapping extension).
17
18
Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21
(c) Copyright Guido van Rossum, 2000.
22
23
Table generation:
24
(c) Copyright Marc-Andre Lemburg, 2005.
25
Licensed to PSF under a Contributor Agreement.
26
27
"""#"
28
29
import re, os, marshal, codecs
30
31
# Largest charmap table size that is considered acceptable
MAX_TABLE_SIZE = 8192

# Unicode code point used to mark undefined entries
UNI_UNDEFINED = chr(0xFFFE)

# Sentinel standing in for a code point that is missing
MISSING_CODE = -1

# A mapping line consists of three fields:
#   1. one or more 0x-prefixed hex codes joined with '+',
#   2. zero or more 0x-prefixed codes or meta-codes in angular brackets,
#      again joined with '+',
#   3. an optional comment introduced by '#'.
# NOTE(review): the character class in the second field is [0-9a-fA-Z],
# which is wider than hexadecimal -- confirm whether A-F was intended.
_ENCODED_FIELD = r'((?:0x[0-9a-fA-F]+\+?)+)'
_UNICODE_FIELD = r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
_COMMENT_FIELD = r'(#.+)?'

mapRE = re.compile(_ENCODED_FIELD +
                   r'\s+' +
                   _UNICODE_FIELD +
                   r'\s*' +
                   _COMMENT_FIELD)
45
46
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal numbers (e.g. '0x41') joined
        with '+'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
        other part that is not a valid hexadecimal number are ignored.

        Empty codes, and codes without any valid part, are returned as
        MISSING_CODE.

        The len and range parameters are only kept for backward
        compatibility of the signature; range is not used.

    """
    if not codes:
        return MISSING_CODE
    values = []
    for part in codes.split('+'):
        try:
            value = int(part, 16)
        except ValueError:
            # Meta-code, empty part after a trailing '+', or garbage:
            # treated the same way whether or not it stands alone.
            continue
        if value != MISSING_CODE:
            values.append(value)
    if not values:
        # Nothing usable was found (e.g. the input was a lone meta-code)
        return MISSING_CODE
    if len(values) == 1:
        return values[0]
    return tuple(values)
72
73
def readmap(filename):

    """ Reads the mapping file filename and returns a decoding map.

        The result is a dictionary mapping each encoded code (an
        integer, or a tuple of integers) to a tuple (unicode code,
        comment).  Lines that are empty, start with '#' or do not match
        mapRE are skipped.

        If at least as many single codes below 256 map to themselves as
        are left unmapped, the unmapped ones are entered explicitly
        with MISSING_CODE and the extra entry 'IDENTITY': 256 is added.

    """
    with open(filename) as f:
        lines = f.readlines()

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    control_codes = list(range(32)) + [127]
    enc2uni = {}
    for code in control_codes:
        enc2uni[code] = (code, 'CONTROL CHARACTER')
    identity = list(control_codes)
    unmapped = [code for code in range(256) if code not in control_codes]

    for rawline in lines:
        line = rawline.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        comment = '' if comment is None else comment[1:].strip()
        enc2uni[enc] = (uni, comment)
        if isinstance(enc, tuple) or enc >= 256:
            # Only single codes below 256 take part in the bookkeeping
            continue
        if enc in unmapped:
            unmapped.remove(enc)
        if enc == uni:
            identity.append(enc)

    # If there are at least as many identity-mapped entries as unmapped
    # entries, it pays to generate an identity dictionary first, and add
    # explicit MISSING_CODE mappings for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
122
123
def hexrepr(t, precision=4):

    """ Returns t formatted as hexadecimal source text.

        None gives 'None'.  An object without a length is formatted as
        a single 0x-prefixed number with at least precision digits.
        Anything else is treated as a sequence of numbers and formatted
        as a parenthesized, comma separated list.

        If an item of the sequence cannot be formatted, a message is
        printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: a single code
        return '0x%0*X' % (precision, t)
    try:
        items = ['0x%0*X' % (precision, item) for item in t]
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
    return '(' + ', '.join(items) + ')'
137
138
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source lines defining the dictionary varname.

        Keys and values of map that are tuples are unpacked as
        (code, comment); the comment is appended to the generated line
        if comments is true.  precisions gives the minimum number of
        hex digits used for keys and for values.

        If map has an "IDENTITY" entry, the generated code starts with
        codecs.make_identity_dict(range(map["IDENTITY"])) and codes
        below 256 that map to themselves are left out.

        Entries whose key is None or MISSING_CODE are skipped and
        values equal to MISSING_CODE are written as None, so that the
        generated source never contains a negative hex literal.

        Note that the "IDENTITY" entry is deleted from map as a side
        effect.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        # The marker has a string key and cannot be sorted together
        # with the numeric keys below, so it is removed from map here.
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # There is no code to map from
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        if mapvalue == MISSING_CODE:
            # hexrepr() would turn -1 into the invalid literal 0x-001
            value = 'None'
        else:
            value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append(' %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
194
195
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source lines defining the tuple varname.

        The tuple holds one character per code from 0 up to the
        largest key found in map (at least 255); codes without a
        mapping get UNI_UNDEFINED.  Values of map that are tuples are
        unpacked as (code, comment); the comment is appended to the
        generated line if comments is true.  key_precision is the
        minimum number of hex digits used for the code in that comment.

        None is returned if a key exceeds MAX_TABLE_SIZE or if a code
        maps to more than one character.

        Note that an "IDENTITY" entry is deleted from map as a side
        effect.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 255
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        del map['IDENTITY']
    # Sort only after the marker is gone: its string key cannot be
    # compared with the numeric keys and sorted() would raise TypeError.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append(' %a \t# %s -> %s' % (mapchar,
                                         hexrepr(key, key_precision),
                                         mapcomment))
        else:
            append(' %a' % mapchar)

    if maxchar < 256:
        append(' %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l
252
253
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module as
        the origin of the mapping; encodingname is used in that
        docstring and, with underscores replaced by hyphens, as the
        codec name passed to codecs.CodecInfo.

        Comments are included in the source, if comments is true (default).

        The generated module uses a decoding table if one can be built
        for map, and decoding/encoding dictionaries otherwise.

    """
    # Generate code
    #
    # The order matters: python_mapdef_code() removes the 'IDENTITY'
    # marker from map, and the two calls after it receive the same
    # dictionary.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    # The templates below are emitted verbatim into the generated
    # module, so their indentation has to be valid Python.
    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
357
358
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the source generated by codegen() for map to pyfile.

        name, map, encodingname and comments are passed on to
        codegen() unchanged.

    """
    source = codegen(name, map, encodingname, comments)
    with open(pyfile, 'w') as out:
        out.write(source)
363
364
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair; the pairs are stored
        under their original keys.  name is not used.

    """
    d = {e: (u, c) for e, (u, c) in map.items()}
    with open(marshalfile, 'wb') as out:
        marshal.dump(d, out)
371
372
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For every regular file a codec module (.py) and a marshalled
        mapping (.mapping) are written.  The output name is nameprefix
        plus the part of the file name before the first '.', lowercased
        and with hyphens replaced by underscores; dirprefix is
        prepended to the output file names.

        A ValueError raised during a conversion is reported and then
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        stem = os.path.split(mapname)[1].replace('-', '_').split('.')[0]
        name = nameprefix + stem.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if map:
                pymap(mappathname, map, dirprefix + codefile, name, comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
399
400
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates codec modules from the .mapping files in dir.

        Every file whose name ends in '.mapping' is loaded with
        marshal and written out again as a codec module named after
        the file, with dirprefix prepended to the output file name.

        A ValueError raised during a conversion is reported and the
        next file is processed.

    """
    extension = '.mapping'
    for mapname in os.listdir(dir):
        if not mapname.endswith(extension):
            continue
        name = mapname[:-len(extension)]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            with open(os.path.join(dir, mapname), 'rb') as f:
                map = marshal.load(f)
            if map:
                pymap(mapname, map, dirprefix + codefile, name, comments)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
419
420
if __name__ == '__main__':

    import sys

    # The command line arguments are passed on positionally.  Converting
    # a directory of mapping files is the active mode; regenerating the
    # modules from .mapping files is kept as a disabled alternative.
    if True:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
427
428