CoCalc -- gencodec.py

GitHub Repository: allendowney/cpython
Path: blob/main/Tools/unicode/gencodec.py
¹² views
1
""" Unicode Mapping Parser and Codec Generator.
2

3
This script parses Unicode mapping files as available from the Unicode
4
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5
modules from them. The codecs use the standard character mapping codec
6
to actually apply the mapping.
7

8
Synopsis: gencodec.py dir codec_prefix
9

10
All files in dir are scanned and those producing non-empty mappings
11
will be written to <codec_prefix><mapname>.py with <mapname> being the
12
first part of the map's filename ('a' in a.b.c.txt) converted to
13
lowercase with hyphens replaced by underscores.
14

15
The tool also writes marshalled versions of the mapping tables to the
16
same location (with .mapping extension).
17

18
Written by Marc-Andre Lemburg ([email protected]).
19

20
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21
(c) Copyright Guido van Rossum, 2000.
22

23
Table generation:
24
(c) Copyright Marc-Andre Lemburg, 2005.
25
    Licensed to PSF under a Contributor Agreement.
26

27
"""#"
28

29
import re, os, marshal, codecs
30

31
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = chr(0xFFFE)

# Placeholder for a missing code point
MISSING_CODE = -1

# Matches one mapping line.  The three groups are:
#   1. the encoded code(s): one or more 0x... hex codes joined by '+'
#   2. the Unicode code(s): zero or more 0x... codes or <meta> codes
#      joined by '+' (may be empty)
#   3. an optional trailing '#' comment
# NOTE(review): the character class in group 2 ends in 'A-Z', not 'A-F';
# this looks like a typo but is kept unchanged because tightening it
# would alter which lines are accepted -- confirm before changing.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
45

46
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x41' or '0x0041+0x0301'.

        Within a combination, parts that are not valid hexadecimal
        numbers (such as meta-codes in angular brackets, e.g. <LR>
        and <RL>) are ignored.  If exactly one code remains it is
        returned as a plain integer, otherwise a (possibly empty)
        tuple is returned.

        An empty codes string is returned as MISSING_CODE.

        A string without any '+' is converted directly, so an
        invalid single code raises ValueError.

        The len and range parameters are part of the historical
        signature; range is no longer used by the implementation.

    """
    if not codes:
        return MISSING_CODE
    parts = codes.split('+')
    if len(parts) == 1:
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            value = int(part, 16)
        except ValueError:
            # Meta-code or empty part (e.g. from a trailing '+')
            continue
        if value != MISSING_CODE:
            values.append(value)
    if len(values) == 1:
        return values[0]
    return tuple(values)
72

73
def readmap(filename):

    """ Reads the mapping file filename and returns the mapping as
        a dictionary.

        Keys are the encoded codes as returned by parsecodes() (an
        integer or a tuple of integers); values are (unicode, comment)
        tuples where unicode is again a parsecodes() result and
        comment is the text following '#' on the line (or '').

        Blank lines, comment lines and lines not matched by mapRE are
        skipped.

        If at least as many single-byte codes are identity-mapped as
        are left unmapped, the unmapped codes below 256 are added with
        value (MISSING_CODE, "") and the extra key 'IDENTITY' is set
        to 256 as a marker for the code generators.

    """
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            m = mapRE.match(line)
            if not m:
                continue
            enc, uni, comment = m.groups()
            enc = parsecodes(enc)
            uni = parsecodes(uni)
            if comment is None:
                comment = ''
            else:
                # Drop the leading '#'
                comment = comment[1:].strip()
            # Only single-byte codes take part in the identity /
            # unmapped bookkeeping
            if not isinstance(enc, tuple) and enc < 256:
                if enc in unmapped:
                    unmapped.remove(enc)
                if enc == uni:
                    identity.append(enc)
            enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to MISSING_CODE for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
122

123
def hexrepr(t, precision=4):

    """ Returns a source code representation of t using hexadecimal
        integer literals.

        None is returned as 'None'.  An object without a length (an
        integer) is formatted as '0x' followed by at least precision
        upper-case hex digits.  A sequence is formatted as its items
        in the same way, joined by ', ' and wrapped in parentheses.

        If an item of a sequence cannot be formatted, a message is
        printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # Not a sequence: format as a single code
        return '0x%0*X' % (precision, t)
    try:
        items = ['0x%0*X' % (precision, item) for item in t]
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
    return '(' + ', '.join(items) + ')'
137

138
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source code lines defining the dictionary
        varname from map.

        map maps codes to codes; both keys and values may also be
        (code, comment) tuples, in which case the comment is used as
        trailing comment of the generated line (if comments is true).

        If map contains the key 'IDENTITY', the generated code starts
        from codecs.make_identity_dict() and only adds the entries
        which differ from the identity mapping.  Note that the
        'IDENTITY' key is removed from map as a side effect.

        precisions is a (key_precision, value_precision) tuple giving
        the minimum number of hex digits used for keys and values.

        Entries whose key is None or MISSING_CODE are left out;
        MISSING_CODE values are written as None.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # Undefined code points cannot be used as keys
            continue
        if mapvalue == MISSING_CODE:
            # hexrepr() would turn the placeholder into an invalid
            # literal; None is rendered as 'None' instead
            mapvalue = None
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t#  %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
194

195
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source code lines defining the decoding
        table varname from map, or None if no table can be built.

        map maps codes to codes; both keys and values may also be
        (code, comment) tuples.  If map contains the key 'IDENTITY',
        the first 256 entries are pre-filled with the identity mapping
        and the key is removed from map as a side effect.

        The table has one string literal per code from 0 up to the
        largest key (at least 255); codes without a mapping use
        UNI_UNDEFINED.  If comments is true, each entry with a comment
        is annotated with its key (formatted with key_precision hex
        digits) and that comment.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if a value is a tuple (1-n mapping).

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 255
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        del map['IDENTITY']
    # Sort only after the 'IDENTITY' marker has been removed: its
    # string key cannot be ordered against the integer codes.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append('    %a \t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    if maxchar < 256:
        append('    %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l
252

253
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is the name of the mapping source and only used in the
        docstring of the generated module; encodingname is used in
        that docstring and, with underscores replaced by hyphens, as
        the codec name passed to codecs.CodecInfo.

        A decoding table is generated if python_tabledef_code() can
        build one; otherwise decoding and encoding maps are written.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code.  The decoding map has to be generated first:
    # python_mapdef_code() strips the 'IDENTITY' marker from map.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
357

358
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated by codegen() for
        map to the file pyfile.

        name, encodingname and comments are passed on to codegen().

    """
    code = codegen(name,map,encodingname,comments)
    with open(pyfile,'w') as f:
        f.write(code)
363

364
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a 2-tuple (code, comment); the
        'IDENTITY' marker entry must already have been removed.

        name is not used.

    """
    d = {e: (u, c) for e, (u, c) in map.items()}
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
371

372
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For each regular file a codec name is derived from the file
        name: hyphens are replaced by underscores, everything from the
        first '.' on is dropped, the rest is lower-cased and prefixed
        with nameprefix.  The codec module is written to
        dirprefix + <name> + '.py' and the marshalled mapping to
        dirprefix + <name> + '.mapping'.

        comments is passed on to the code generator.

        A ValueError raised during a conversion is reported and then
        re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        name = nameprefix + name
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                # pymap() has to run first: generating the code removes
                # the 'IDENTITY' marker which marshalmap() can't handle
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
399

400
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the marshalled mappings
        found in the directory dir.

        Every file in dir ending in '.mapping' is loaded with marshal
        and the codec module is written to dirprefix + <name> + '.py',
        with <name> being the file name without the extension.

        comments is passed on to the code generator.

        A ValueError raised during a conversion is reported and the
        file is skipped.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            with open(os.path.join(dir, mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
419

420
if __name__ == '__main__':

    import sys
    # Hard-wired mode switch: change the condition to regenerate the
    # modules from existing .mapping files instead of converting
    # mapping files.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
427

428
Product

Resources

Company