CoCalc -- makeunicodedata.py

GitHub Repository: allendowney/cpython
Path: blob/main/Tools/unicode/makeunicodedata.py
¹² views
1
#
2
# (re)generate unicode property and type databases
3
#
4
# This script converts Unicode database files to Modules/unicodedata_db.h,
5
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
6
#
7
# history:
8
# 2000-09-24 fl   created (based on bits and pieces from unidb)
9
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
10
# 2000-09-25 fl   added character type table
11
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
12
# 2000-11-03 fl   expand first/last ranges
13
# 2001-01-19 fl   added character name tables (2.1)
14
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
15
# 2002-09-11 wd   use string methods
16
# 2002-10-18 mvl  update to Unicode 3.2
17
# 2002-10-22 mvl  generate NFC tables
18
# 2002-11-24 mvl  expand all ranges, sort names version-independently
19
# 2002-11-25 mvl  add UNIDATA_VERSION
20
# 2004-05-29 perky add east asian width information
21
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
22
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
23
# 2011-10-21 ezio add support for name aliases and named sequences
24
# 2012-01    benjamin add full case mappings
25
#
26
# written by Fredrik Lundh ([email protected])
27
#
28

29
import dataclasses
30
import os
31
import sys
32
import zipfile
33

34
from functools import partial
35
from textwrap import dedent
36
from typing import Iterator, List, Optional, Set, Tuple
37

38
SCRIPT = sys.argv[0]
39
VERSION = "3.3"
40

41
# The Unicode Database
42
# --------------------
43
# When changing UCD version please update
44
#   * Doc/library/stdtypes.rst, and
45
#   * Doc/library/unicodedata.rst
46
#   * Doc/reference/lexical_analysis.rst (two occurrences)
47
UNIDATA_VERSION = "15.0.0"
48
UNICODE_DATA = "UnicodeData%s.txt"
49
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
50
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
51
UNIHAN = "Unihan%s.zip"
52
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
53
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
54
LINE_BREAK = "LineBreak%s.txt"
55
NAME_ALIASES = "NameAliases%s.txt"
56
NAMED_SEQUENCES = "NamedSequences%s.txt"
57
SPECIAL_CASING = "SpecialCasing%s.txt"
58
CASE_FOLDING = "CaseFolding%s.txt"
59

60
# Private Use Areas -- in planes 1, 15, 16
61
PUA_1 = range(0xE000, 0xF900)
62
PUA_15 = range(0xF0000, 0xFFFFE)
63
PUA_16 = range(0x100000, 0x10FFFE)
64

65
# we use this ranges of PUA_15 to store name aliases and named sequences
66
NAME_ALIASES_START = 0xF0000
67
NAMED_SEQUENCES_START = 0xF0200
68

69
old_versions = ["3.2.0"]
70

71
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
72
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
73
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
74
    "So" ]
75

76
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
77
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
78
    "ON", "LRI", "RLI", "FSI", "PDI" ]
79

80
# "N" needs to be the first entry, see the comment in makeunicodedata
81
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]
82

83
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
84

85
# note: should match definitions in Objects/unicodectype.c
86
ALPHA_MASK = 0x01
87
DECIMAL_MASK = 0x02
88
DIGIT_MASK = 0x04
89
LOWER_MASK = 0x08
90
LINEBREAK_MASK = 0x10
91
SPACE_MASK = 0x20
92
TITLE_MASK = 0x40
93
UPPER_MASK = 0x80
94
XID_START_MASK = 0x100
95
XID_CONTINUE_MASK = 0x200
96
PRINTABLE_MASK = 0x400
97
NUMERIC_MASK = 0x800
98
CASE_IGNORABLE_MASK = 0x1000
99
CASED_MASK = 0x2000
100
EXTENDED_CASE_MASK = 0x4000
101

102
# these ranges need to match unicodedata.c:is_unified_ideograph
103
cjk_ranges = [
104
    ('3400', '4DBF'),
105
    ('4E00', '9FFF'),
106
    ('20000', '2A6DF'),
107
    ('2A700', '2B739'),
108
    ('2B740', '2B81D'),
109
    ('2B820', '2CEA1'),
110
    ('2CEB0', '2EBE0'),
111
    ('30000', '3134A'),
112
    ('31350', '323AF'),
113
]
114

115

116
def maketables(trace=0):
117

118
    print("--- Reading", UNICODE_DATA % "", "...")
119

120
    unicode = UnicodeData(UNIDATA_VERSION)
121

122
    print(len(list(filter(None, unicode.table))), "characters")
123

124
    for version in old_versions:
125
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
126
        old_unicode = UnicodeData(version, cjk_check=False)
127
        print(len(list(filter(None, old_unicode.table))), "characters")
128
        merge_old_version(version, unicode, old_unicode)
129

130
    makeunicodename(unicode, trace)
131
    makeunicodedata(unicode, trace)
132
    makeunicodetype(unicode, trace)
133

134

135
# --------------------------------------------------------------------
136
# unicode character properties
137

138
def makeunicodedata(unicode, trace):
139

140
    # the default value of east_asian_width is "N", for unassigned code points
141
    # not mentioned in EastAsianWidth.txt
142
    # in addition there are some reserved but unassigned code points in CJK
143
    # ranges that are classified as "W". code points in private use areas
144
    # have a width of "A". both of these have entries in
145
    # EastAsianWidth.txt
146
    # see https://unicode.org/reports/tr11/#Unassigned
147
    assert EASTASIANWIDTH_NAMES[0] == "N"
148
    dummy = (0, 0, 0, 0, 0, 0)
149
    table = [dummy]
150
    cache = {0: dummy}
151
    index = [0] * len(unicode.chars)
152

153
    FILE = "Modules/unicodedata_db.h"
154

155
    print("--- Preparing", FILE, "...")
156

157
    # 1) database properties
158

159
    for char in unicode.chars:
160
        record = unicode.table[char]
161
        if record:
162
            # extract database properties
163
            category = CATEGORY_NAMES.index(record.general_category)
164
            combining = int(record.canonical_combining_class)
165
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
166
            mirrored = record.bidi_mirrored == "Y"
167
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
168
            normalizationquickcheck = record.quick_check
169
            item = (
170
                category, combining, bidirectional, mirrored, eastasianwidth,
171
                normalizationquickcheck
172
                )
173
        elif unicode.widths[char] is not None:
174
            # an unassigned but reserved character, with a known
175
            # east_asian_width
176
            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
177
            item = (0, 0, 0, 0, eastasianwidth, 0)
178
        else:
179
            continue
180

181
        # add entry to index and item tables
182
        i = cache.get(item)
183
        if i is None:
184
            cache[item] = i = len(table)
185
            table.append(item)
186
        index[char] = i
187

188
    # 2) decomposition data
189

190
    decomp_data_cache = {}
191
    decomp_data = [0]
192
    decomp_prefix = [""]
193
    decomp_index = [0] * len(unicode.chars)
194
    decomp_size = 0
195

196
    comp_pairs = []
197
    comp_first = [None] * len(unicode.chars)
198
    comp_last = [None] * len(unicode.chars)
199

200
    for char in unicode.chars:
201
        record = unicode.table[char]
202
        if record:
203
            if record.decomposition_type:
204
                decomp = record.decomposition_type.split()
205
                if len(decomp) > 19:
206
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
207
                # prefix
208
                if decomp[0][0] == "<":
209
                    prefix = decomp.pop(0)
210
                else:
211
                    prefix = ""
212
                try:
213
                    i = decomp_prefix.index(prefix)
214
                except ValueError:
215
                    i = len(decomp_prefix)
216
                    decomp_prefix.append(prefix)
217
                prefix = i
218
                assert prefix < 256
219
                # content
220
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
221
                # Collect NFC pairs
222
                if not prefix and len(decomp) == 3 and \
223
                   char not in unicode.exclusions and \
224
                   unicode.table[decomp[1]].canonical_combining_class == "0":
225
                    p, l, r = decomp
226
                    comp_first[l] = 1
227
                    comp_last[r] = 1
228
                    comp_pairs.append((l,r,char))
229
                key = tuple(decomp)
230
                i = decomp_data_cache.get(key, -1)
231
                if i == -1:
232
                    i = len(decomp_data)
233
                    decomp_data.extend(decomp)
234
                    decomp_size = decomp_size + len(decomp) * 2
235
                    decomp_data_cache[key] = i
236
                else:
237
                    assert decomp_data[i:i+len(decomp)] == decomp
238
            else:
239
                i = 0
240
            decomp_index[char] = i
241

242
    f = l = 0
243
    comp_first_ranges = []
244
    comp_last_ranges = []
245
    prev_f = prev_l = None
246
    for i in unicode.chars:
247
        if comp_first[i] is not None:
248
            comp_first[i] = f
249
            f += 1
250
            if prev_f is None:
251
                prev_f = (i,i)
252
            elif prev_f[1]+1 == i:
253
                prev_f = prev_f[0],i
254
            else:
255
                comp_first_ranges.append(prev_f)
256
                prev_f = (i,i)
257
        if comp_last[i] is not None:
258
            comp_last[i] = l
259
            l += 1
260
            if prev_l is None:
261
                prev_l = (i,i)
262
            elif prev_l[1]+1 == i:
263
                prev_l = prev_l[0],i
264
            else:
265
                comp_last_ranges.append(prev_l)
266
                prev_l = (i,i)
267
    comp_first_ranges.append(prev_f)
268
    comp_last_ranges.append(prev_l)
269
    total_first = f
270
    total_last = l
271

272
    comp_data = [0]*(total_first*total_last)
273
    for f,l,char in comp_pairs:
274
        f = comp_first[f]
275
        l = comp_last[l]
276
        comp_data[f*total_last+l] = char
277

278
    print(len(table), "unique properties")
279
    print(len(decomp_prefix), "unique decomposition prefixes")
280
    print(len(decomp_data), "unique decomposition entries:", end=' ')
281
    print(decomp_size, "bytes")
282
    print(total_first, "first characters in NFC")
283
    print(total_last, "last characters in NFC")
284
    print(len(comp_pairs), "NFC pairs")
285

286
    print("--- Writing", FILE, "...")
287

288
    with open(FILE, "w") as fp:
289
        fprint = partial(print, file=fp)
290

291
        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
292
        fprint()
293
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
294
        fprint("/* a list of unique database records */")
295
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
296
        for item in table:
297
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
298
        fprint("};")
299
        fprint()
300

301
        fprint("/* Reindexing of NFC first characters. */")
302
        fprint("#define TOTAL_FIRST",total_first)
303
        fprint("#define TOTAL_LAST",total_last)
304
        fprint("struct reindex{int start;short count,index;};")
305
        fprint("static struct reindex nfc_first[] = {")
306
        for start,end in comp_first_ranges:
307
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
308
        fprint("    {0,0,0}")
309
        fprint("};\n")
310
        fprint("static struct reindex nfc_last[] = {")
311
        for start,end in comp_last_ranges:
312
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
313
        fprint("  {0,0,0}")
314
        fprint("};\n")
315

316
        # FIXME: <fl> the following tables could be made static, and
317
        # the support code moved into unicodedatabase.c
318

319
        fprint("/* string literals */")
320
        fprint("const char *_PyUnicode_CategoryNames[] = {")
321
        for name in CATEGORY_NAMES:
322
            fprint("    \"%s\"," % name)
323
        fprint("    NULL")
324
        fprint("};")
325

326
        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
327
        for name in BIDIRECTIONAL_NAMES:
328
            fprint("    \"%s\"," % name)
329
        fprint("    NULL")
330
        fprint("};")
331

332
        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
333
        for name in EASTASIANWIDTH_NAMES:
334
            fprint("    \"%s\"," % name)
335
        fprint("    NULL")
336
        fprint("};")
337

338
        fprint("static const char *decomp_prefix[] = {")
339
        for name in decomp_prefix:
340
            fprint("    \"%s\"," % name)
341
        fprint("    NULL")
342
        fprint("};")
343

344
        # split record index table
345
        index1, index2, shift = splitbins(index, trace)
346

347
        fprint("/* index tables for the database records */")
348
        fprint("#define SHIFT", shift)
349
        Array("index1", index1).dump(fp, trace)
350
        Array("index2", index2).dump(fp, trace)
351

352
        # split decomposition index table
353
        index1, index2, shift = splitbins(decomp_index, trace)
354

355
        fprint("/* decomposition data */")
356
        Array("decomp_data", decomp_data).dump(fp, trace)
357

358
        fprint("/* index tables for the decomposition data */")
359
        fprint("#define DECOMP_SHIFT", shift)
360
        Array("decomp_index1", index1).dump(fp, trace)
361
        Array("decomp_index2", index2).dump(fp, trace)
362

363
        index, index2, shift = splitbins(comp_data, trace)
364
        fprint("/* NFC pairs */")
365
        fprint("#define COMP_SHIFT", shift)
366
        Array("comp_index", index).dump(fp, trace)
367
        Array("comp_data", index2).dump(fp, trace)
368

369
        # Generate delta tables for old versions
370
        for version, table, normalization in unicode.changed:
371
            cversion = version.replace(".","_")
372
            records = [table[0]]
373
            cache = {table[0]:0}
374
            index = [0] * len(table)
375
            for i, record in enumerate(table):
376
                try:
377
                    index[i] = cache[record]
378
                except KeyError:
379
                    index[i] = cache[record] = len(records)
380
                    records.append(record)
381
            index1, index2, shift = splitbins(index, trace)
382
            fprint("static const change_record change_records_%s[] = {" % cversion)
383
            for record in records:
384
                fprint("    { %s }," % ", ".join(map(str,record)))
385
            fprint("};")
386
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
387
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
388
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
389
            fprint("{")
390
            fprint("    int index;")
391
            fprint("    if (n >= 0x110000) index = 0;")
392
            fprint("    else {")
393
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
394
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
395
                   (cversion, shift, ((1<<shift)-1)))
396
            fprint("    }")
397
            fprint("    return change_records_%s+index;" % cversion)
398
            fprint("}\n")
399
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
400
            fprint("{")
401
            fprint("    switch(n) {")
402
            for k, v in normalization:
403
                fprint("    case %s: return 0x%s;" % (hex(k), v))
404
            fprint("    default: return 0;")
405
            fprint("    }\n}\n")
406

407

408
# --------------------------------------------------------------------
409
# unicode character type tables
410

411
def makeunicodetype(unicode, trace):
412

413
    FILE = "Objects/unicodetype_db.h"
414

415
    print("--- Preparing", FILE, "...")
416

417
    # extract unicode types
418
    dummy = (0, 0, 0, 0, 0, 0)
419
    table = [dummy]
420
    cache = {dummy: 0}
421
    index = [0] * len(unicode.chars)
422
    numeric = {}
423
    spaces = []
424
    linebreaks = []
425
    extra_casing = []
426

427
    for char in unicode.chars:
428
        record = unicode.table[char]
429
        if record:
430
            # extract database properties
431
            category = record.general_category
432
            bidirectional = record.bidi_class
433
            properties = record.binary_properties
434
            flags = 0
435
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
436
                flags |= ALPHA_MASK
437
            if "Lowercase" in properties:
438
                flags |= LOWER_MASK
439
            if 'Line_Break' in properties or bidirectional == "B":
440
                flags |= LINEBREAK_MASK
441
                linebreaks.append(char)
442
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
443
                flags |= SPACE_MASK
444
                spaces.append(char)
445
            if category == "Lt":
446
                flags |= TITLE_MASK
447
            if "Uppercase" in properties:
448
                flags |= UPPER_MASK
449
            if char == ord(" ") or category[0] not in ("C", "Z"):
450
                flags |= PRINTABLE_MASK
451
            if "XID_Start" in properties:
452
                flags |= XID_START_MASK
453
            if "XID_Continue" in properties:
454
                flags |= XID_CONTINUE_MASK
455
            if "Cased" in properties:
456
                flags |= CASED_MASK
457
            if "Case_Ignorable" in properties:
458
                flags |= CASE_IGNORABLE_MASK
459
            sc = unicode.special_casing.get(char)
460
            cf = unicode.case_folding.get(char, [char])
461
            if record.simple_uppercase_mapping:
462
                upper = int(record.simple_uppercase_mapping, 16)
463
            else:
464
                upper = char
465
            if record.simple_lowercase_mapping:
466
                lower = int(record.simple_lowercase_mapping, 16)
467
            else:
468
                lower = char
469
            if record.simple_titlecase_mapping:
470
                title = int(record.simple_titlecase_mapping, 16)
471
            else:
472
                title = upper
473
            if sc is None and cf != [lower]:
474
                sc = ([lower], [title], [upper])
475
            if sc is None:
476
                if upper == lower == title:
477
                    upper = lower = title = 0
478
                else:
479
                    upper = upper - char
480
                    lower = lower - char
481
                    title = title - char
482
                    assert (abs(upper) <= 2147483647 and
483
                            abs(lower) <= 2147483647 and
484
                            abs(title) <= 2147483647)
485
            else:
486
                # This happens either when some character maps to more than one
487
                # character in uppercase, lowercase, or titlecase or the
488
                # casefolded version of the character is different from the
489
                # lowercase. The extra characters are stored in a different
490
                # array.
491
                flags |= EXTENDED_CASE_MASK
492
                lower = len(extra_casing) | (len(sc[0]) << 24)
493
                extra_casing.extend(sc[0])
494
                if cf != sc[0]:
495
                    lower |= len(cf) << 20
496
                    extra_casing.extend(cf)
497
                upper = len(extra_casing) | (len(sc[2]) << 24)
498
                extra_casing.extend(sc[2])
499
                # Title is probably equal to upper.
500
                if sc[1] == sc[2]:
501
                    title = upper
502
                else:
503
                    title = len(extra_casing) | (len(sc[1]) << 24)
504
                    extra_casing.extend(sc[1])
505
            # decimal digit, integer digit
506
            decimal = 0
507
            if record.decomposition_mapping:
508
                flags |= DECIMAL_MASK
509
                decimal = int(record.decomposition_mapping)
510
            digit = 0
511
            if record.numeric_type:
512
                flags |= DIGIT_MASK
513
                digit = int(record.numeric_type)
514
            if record.numeric_value:
515
                flags |= NUMERIC_MASK
516
                numeric.setdefault(record.numeric_value, []).append(char)
517
            item = (
518
                upper, lower, title, decimal, digit, flags
519
                )
520
            # add entry to index and item tables
521
            i = cache.get(item)
522
            if i is None:
523
                cache[item] = i = len(table)
524
                table.append(item)
525
            index[char] = i
526

527
    print(len(table), "unique character type entries")
528
    print(sum(map(len, numeric.values())), "numeric code points")
529
    print(len(spaces), "whitespace code points")
530
    print(len(linebreaks), "linebreak code points")
531
    print(len(extra_casing), "extended case array")
532

533
    print("--- Writing", FILE, "...")
534

535
    with open(FILE, "w") as fp:
536
        fprint = partial(print, file=fp)
537

538
        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
539
        fprint()
540
        fprint("/* a list of unique character type descriptors */")
541
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
542
        for item in table:
543
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
544
        fprint("};")
545
        fprint()
546

547
        fprint("/* extended case mappings */")
548
        fprint()
549
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
550
        for c in extra_casing:
551
            fprint("    %d," % c)
552
        fprint("};")
553
        fprint()
554

555
        # split decomposition index table
556
        index1, index2, shift = splitbins(index, trace)
557

558
        fprint("/* type indexes */")
559
        fprint("#define SHIFT", shift)
560
        Array("index1", index1).dump(fp, trace)
561
        Array("index2", index2).dump(fp, trace)
562

563
        # Generate code for _PyUnicode_ToNumeric()
564
        numeric_items = sorted(numeric.items())
565
        fprint('/* Returns the numeric value as double for Unicode characters')
566
        fprint(' * having this property, -1.0 otherwise.')
567
        fprint(' */')
568
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
569
        fprint('{')
570
        fprint('    switch (ch) {')
571
        for value, codepoints in numeric_items:
572
            # Turn text into float literals
573
            parts = value.split('/')
574
            parts = [repr(float(part)) for part in parts]
575
            value = '/'.join(parts)
576

577
            codepoints.sort()
578
            for codepoint in codepoints:
579
                fprint('    case 0x%04X:' % (codepoint,))
580
            fprint('        return (double) %s;' % (value,))
581
        fprint('    }')
582
        fprint('    return -1.0;')
583
        fprint('}')
584
        fprint()
585

586
        # Generate code for _PyUnicode_IsWhitespace()
587
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
588
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
589
        fprint(" */")
590
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
591
        fprint('{')
592
        fprint('    switch (ch) {')
593

594
        for codepoint in sorted(spaces):
595
            fprint('    case 0x%04X:' % (codepoint,))
596
        fprint('        return 1;')
597

598
        fprint('    }')
599
        fprint('    return 0;')
600
        fprint('}')
601
        fprint()
602

603
        # Generate code for _PyUnicode_IsLinebreak()
604
        fprint("/* Returns 1 for Unicode characters having the line break")
605
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
606
        fprint(" * type 'B', 0 otherwise.")
607
        fprint(" */")
608
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
609
        fprint('{')
610
        fprint('    switch (ch) {')
611
        for codepoint in sorted(linebreaks):
612
            fprint('    case 0x%04X:' % (codepoint,))
613
        fprint('        return 1;')
614

615
        fprint('    }')
616
        fprint('    return 0;')
617
        fprint('}')
618
        fprint()
619

620

621
# --------------------------------------------------------------------
622
# unicode name database
623

624
def makeunicodename(unicode, trace):
625

626
    FILE = "Modules/unicodename_db.h"
627

628
    print("--- Preparing", FILE, "...")
629

630
    # collect names
631
    names = [None] * len(unicode.chars)
632

633
    for char in unicode.chars:
634
        record = unicode.table[char]
635
        if record:
636
            name = record.name.strip()
637
            if name and name[0] != "<":
638
                names[char] = name + chr(0)
639

640
    print(len([n for n in names if n is not None]), "distinct names")
641

642
    # collect unique words from names (note that we differ between
643
    # words inside a sentence, and words ending a sentence.  the
644
    # latter includes the trailing null byte.
645

646
    words = {}
647
    n = b = 0
648
    for char in unicode.chars:
649
        name = names[char]
650
        if name:
651
            w = name.split()
652
            b = b + len(name)
653
            n = n + len(w)
654
            for w in w:
655
                l = words.get(w)
656
                if l:
657
                    l.append(None)
658
                else:
659
                    words[w] = [len(words)]
660

661
    print(n, "words in text;", b, "bytes")
662

663
    wordlist = list(words.items())
664

665
    # sort on falling frequency, then by name
666
    def word_key(a):
667
        aword, alist = a
668
        return -len(alist), aword
669
    wordlist.sort(key=word_key)
670

671
    # figure out how many phrasebook escapes we need
672
    escapes = 0
673
    while escapes * 256 < len(wordlist):
674
        escapes = escapes + 1
675
    print(escapes, "escapes")
676

677
    short = 256 - escapes
678

679
    assert short > 0
680

681
    print(short, "short indexes in lexicon")
682

683
    # statistics
684
    n = 0
685
    for i in range(short):
686
        n = n + len(wordlist[i][1])
687
    print(n, "short indexes in phrasebook")
688

689
    # pick the most commonly used words, and sort the rest on falling
690
    # length (to maximize overlap)
691

692
    wordlist, wordtail = wordlist[:short], wordlist[short:]
693
    wordtail.sort(key=lambda a: a[0], reverse=True)
694
    wordlist.extend(wordtail)
695

696
    # generate lexicon from words
697

698
    lexicon_offset = [0]
699
    lexicon = ""
700
    words = {}
701

702
    # build a lexicon string
703
    offset = 0
704
    for w, x in wordlist:
705
        # encoding: bit 7 indicates last character in word (chr(128)
706
        # indicates the last character in an entire string)
707
        ww = w[:-1] + chr(ord(w[-1])+128)
708
        # reuse string tails, when possible
709
        o = lexicon.find(ww)
710
        if o < 0:
711
            o = offset
712
            lexicon = lexicon + ww
713
            offset = offset + len(w)
714
        words[w] = len(lexicon_offset)
715
        lexicon_offset.append(o)
716

717
    lexicon = list(map(ord, lexicon))
718

719
    # generate phrasebook from names and lexicon
720
    phrasebook = [0]
721
    phrasebook_offset = [0] * len(unicode.chars)
722
    for char in unicode.chars:
723
        name = names[char]
724
        if name:
725
            w = name.split()
726
            phrasebook_offset[char] = len(phrasebook)
727
            for w in w:
728
                i = words[w]
729
                if i < short:
730
                    phrasebook.append(i)
731
                else:
732
                    # store as two bytes
733
                    phrasebook.append((i>>8) + short)
734
                    phrasebook.append(i&255)
735

736
    assert getsize(phrasebook) == 1
737

738
    #
739
    # unicode name hash table
740

741
    # extract names
742
    data = []
743
    for char in unicode.chars:
744
        record = unicode.table[char]
745
        if record:
746
            name = record.name.strip()
747
            if name and name[0] != "<":
748
                data.append((name, char))
749

750
    # the magic number 47 was chosen to minimize the number of
751
    # collisions on the current data set.  if you like, change it
752
    # and see what happens...
753

754
    codehash = Hash("code", data, 47)
755

756
    print("--- Writing", FILE, "...")
757

758
    with open(FILE, "w") as fp:
759
        fprint = partial(print, file=fp)
760

761
        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
762
        fprint()
763
        fprint("#define NAME_MAXLEN", 256)
764
        fprint()
765
        fprint("/* lexicon */")
766
        Array("lexicon", lexicon).dump(fp, trace)
767
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)
768

769
        # split decomposition index table
770
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)
771

772
        fprint("/* code->name phrasebook */")
773
        fprint("#define phrasebook_shift", shift)
774
        fprint("#define phrasebook_short", short)
775

776
        Array("phrasebook", phrasebook).dump(fp, trace)
777
        Array("phrasebook_offset1", offset1).dump(fp, trace)
778
        Array("phrasebook_offset2", offset2).dump(fp, trace)
779

780
        fprint("/* name->code dictionary */")
781
        codehash.dump(fp, trace)
782

783
        fprint()
784
        fprint('static const unsigned int aliases_start = %#x;' %
785
               NAME_ALIASES_START)
786
        fprint('static const unsigned int aliases_end = %#x;' %
787
               (NAME_ALIASES_START + len(unicode.aliases)))
788

789
        fprint('static const unsigned int name_aliases[] = {')
790
        for name, codepoint in unicode.aliases:
791
            fprint('    0x%04X,' % codepoint)
792
        fprint('};')
793

794
        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
795
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
796
        # sequences or sequences with non-BMP chars are added.
797
        # unicodedata_lookup should be adapted too.
798
        fprint(dedent("""
799
            typedef struct NamedSequence {
800
                int seqlen;
801
                Py_UCS2 seq[4];
802
            } named_sequence;
803
            """))
804

805
        fprint('static const unsigned int named_sequences_start = %#x;' %
806
               NAMED_SEQUENCES_START)
807
        fprint('static const unsigned int named_sequences_end = %#x;' %
808
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
809

810
        fprint('static const named_sequence named_sequences[] = {')
811
        for name, sequence in unicode.named_sequences:
812
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
813
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
814
        fprint('};')
815

816

817
def merge_old_version(version, new, old):
818
    # Changes to exclusion file not implemented yet
819
    if old.exclusions != new.exclusions:
820
        raise NotImplementedError("exclusions differ")
821

822
    # In these change records, 0xFF means "no change"
823
    bidir_changes = [0xFF]*0x110000
824
    category_changes = [0xFF]*0x110000
825
    decimal_changes = [0xFF]*0x110000
826
    mirrored_changes = [0xFF]*0x110000
827
    east_asian_width_changes = [0xFF]*0x110000
828
    # In numeric data, 0 means "no change",
829
    # -1 means "did not have a numeric value
830
    numeric_changes = [0] * 0x110000
831
    # normalization_changes is a list of key-value pairs
832
    normalization_changes = []
833
    for i in range(0x110000):
834
        if new.table[i] is None:
835
            # Characters unassigned in the new version ought to
836
            # be unassigned in the old one
837
            assert old.table[i] is None
838
            continue
839
        # check characters unassigned in the old version
840
        if old.table[i] is None:
841
            # category 0 is "unassigned"
842
            category_changes[i] = 0
843
            continue
844
        # check characters that differ
845
        if old.table[i] != new.table[i]:
846
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
847
                value = getattr(old.table[i], field.name)
848
                new_value = getattr(new.table[i], field.name)
849
                if value != new_value:
850
                    if k == 1 and i in PUA_15:
851
                        # the name is not set in the old.table, but in the
852
                        # new.table we are using it for aliases and named seq
853
                        assert value == ''
854
                    elif k == 2:
855
                        category_changes[i] = CATEGORY_NAMES.index(value)
856
                    elif k == 4:
857
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
858
                    elif k == 5:
859
                        # We assume that all normalization changes are in 1:1 mappings
860
                        assert " " not in value
861
                        normalization_changes.append((i, value))
862
                    elif k == 6:
863
                        # we only support changes where the old value is a single digit
864
                        assert value in "0123456789"
865
                        decimal_changes[i] = int(value)
866
                    elif k == 8:
867
                        # Since 0 encodes "no change", the old value is better not 0
868
                        if not value:
869
                            numeric_changes[i] = -1
870
                        else:
871
                            numeric_changes[i] = float(value)
872
                            assert numeric_changes[i] not in (0, -1)
873
                    elif k == 9:
874
                        if value == 'Y':
875
                            mirrored_changes[i] = '1'
876
                        else:
877
                            mirrored_changes[i] = '0'
878
                    elif k == 11:
879
                        # change to ISO comment, ignore
880
                        pass
881
                    elif k == 12:
882
                        # change to simple uppercase mapping; ignore
883
                        pass
884
                    elif k == 13:
885
                        # change to simple lowercase mapping; ignore
886
                        pass
887
                    elif k == 14:
888
                        # change to simple titlecase mapping; ignore
889
                        pass
890
                    elif k == 15:
891
                        # change to east asian width
892
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
893
                    elif k == 16:
894
                        # derived property changes; not yet
895
                        pass
896
                    elif k == 17:
897
                        # normalization quickchecks are not performed
898
                        # for older versions
899
                        pass
900
                    else:
901
                        class Difference(Exception):pass
902
                        raise Difference(hex(i), k, old.table[i], new.table[i])
903
    new.changed.append((version, list(zip(bidir_changes, category_changes,
904
                                          decimal_changes, mirrored_changes,
905
                                          east_asian_width_changes,
906
                                          numeric_changes)),
907
                        normalization_changes))
908

909

910
DATA_DIR = os.path.join('Tools', 'unicode', 'data')
911

912
def open_data(template, version):
913
    local = os.path.join(DATA_DIR, template % ('-'+version,))
914
    if not os.path.exists(local):
915
        import urllib.request
916
        if version == '3.2.0':
917
            # irregular url structure
918
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
919
        else:
920
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
921
        os.makedirs(DATA_DIR, exist_ok=True)
922
        urllib.request.urlretrieve(url, filename=local)
923
    if local.endswith('.txt'):
924
        return open(local, encoding='utf-8')
925
    else:
926
        # Unihan.zip
927
        return open(local, 'rb')
928

929

930
def expand_range(char_range: str) -> Iterator[int]:
931
    '''
932
    Parses ranges of code points, as described in UAX #44:
933
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges
934
    '''
935
    if '..' in char_range:
936
        first, last = [int(c, 16) for c in char_range.split('..')]
937
    else:
938
        first = last = int(char_range, 16)
939
    for char in range(first, last+1):
940
        yield char
941

942

943
class UcdFile:
944
    '''
945
    A file in the standard format of the UCD.
946

947
    See: https://www.unicode.org/reports/tr44/#Format_Conventions
948

949
    Note that, as described there, the Unihan data files have their
950
    own separate format.
951
    '''
952

953
    def __init__(self, template: str, version: str) -> None:
954
        self.template = template
955
        self.version = version
956

957
    def records(self) -> Iterator[List[str]]:
958
        with open_data(self.template, self.version) as file:
959
            for line in file:
960
                line = line.split('#', 1)[0].strip()
961
                if not line:
962
                    continue
963
                yield [field.strip() for field in line.split(';')]
964

965
    def __iter__(self) -> Iterator[List[str]]:
966
        return self.records()
967

968
    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
969
        for record in self.records():
970
            char_range, rest = record[0], record[1:]
971
            for char in expand_range(char_range):
972
                yield char, rest
973

974

975
@dataclasses.dataclass
976
class UcdRecord:
977
    # 15 fields from UnicodeData.txt .  See:
978
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
979
    codepoint: str
980
    name: str
981
    general_category: str
982
    canonical_combining_class: str
983
    bidi_class: str
984
    decomposition_type: str
985
    decomposition_mapping: str
986
    numeric_type: str
987
    numeric_value: str
988
    bidi_mirrored: str
989
    unicode_1_name: str  # obsolete
990
    iso_comment: str  # obsolete
991
    simple_uppercase_mapping: str
992
    simple_lowercase_mapping: str
993
    simple_titlecase_mapping: str
994

995
    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
996
    east_asian_width: Optional[str]
997

998
    # Binary properties, as a set of those that are true.
999
    # Taken from multiple files:
1000
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
1001
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
1002
    binary_properties: Set[str]
1003

1004
    # The Quick_Check properties related to normalization:
1005
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
1006
    # We store them as a bitmask.
1007
    quick_check: int
1008

1009

1010
def from_row(row: List[str]) -> UcdRecord:
1011
    return UcdRecord(*row, None, set(), 0)
1012

1013

1014
# --------------------------------------------------------------------
1015
# the following support code is taken from the unidb utilities
1016
# Copyright (c) 1999-2000 by Secret Labs AB
1017

1018
# load a unicode-data file from disk
1019

1020
class UnicodeData:
1021
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned
1022

1023
    def __init__(self, version, cjk_check=True):
1024
        self.changed = []
1025
        table = [None] * 0x110000
1026
        for s in UcdFile(UNICODE_DATA, version):
1027
            char = int(s[0], 16)
1028
            table[char] = from_row(s)
1029

1030
        cjk_ranges_found = []
1031

1032
        # expand first-last ranges
1033
        field = None
1034
        for i in range(0, 0x110000):
1035
            # The file UnicodeData.txt has its own distinct way of
1036
            # expressing ranges.  See:
1037
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
1038
            s = table[i]
1039
            if s:
1040
                if s.name[-6:] == "First>":
1041
                    s.name = ""
1042
                    field = dataclasses.astuple(s)[:15]
1043
                elif s.name[-5:] == "Last>":
1044
                    if s.name.startswith("<CJK Ideograph"):
1045
                        cjk_ranges_found.append((field[0],
1046
                                                 s.codepoint))
1047
                    s.name = ""
1048
                    field = None
1049
            elif field:
1050
                table[i] = from_row(('%X' % i,) + field[1:])
1051
        if cjk_check and cjk_ranges != cjk_ranges_found:
1052
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
1053

1054
        # public attributes
1055
        self.filename = UNICODE_DATA % ''
1056
        self.table = table
1057
        self.chars = list(range(0x110000)) # unicode 3.2
1058

1059
        # check for name aliases and named sequences, see #12753
1060
        # aliases and named sequences are not in 3.2.0
1061
        if version != '3.2.0':
1062
            self.aliases = []
1063
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
1064
            # in order to take advantage of the compression and lookup
1065
            # algorithms used for the other characters
1066
            pua_index = NAME_ALIASES_START
1067
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
1068
                char = int(char, 16)
1069
                self.aliases.append((name, char))
1070
                # also store the name in the PUA 1
1071
                self.table[pua_index].name = name
1072
                pua_index += 1
1073
            assert pua_index - NAME_ALIASES_START == len(self.aliases)
1074

1075
            self.named_sequences = []
1076
            # store named sequences in the PUA 1, in range U+F0100..,
1077
            # in order to take advantage of the compression and lookup
1078
            # algorithms used for the other characters.
1079

1080
            assert pua_index < NAMED_SEQUENCES_START
1081
            pua_index = NAMED_SEQUENCES_START
1082
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
1083
                chars = tuple(int(char, 16) for char in chars.split())
1084
                # check that the structure defined in makeunicodename is OK
1085
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1086
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1087
                    "the NamedSequence struct and in unicodedata_lookup")
1088
                self.named_sequences.append((name, chars))
1089
                # also store these in the PUA 1
1090
                self.table[pua_index].name = name
1091
                pua_index += 1
1092
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1093

1094
        self.exclusions = {}
1095
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
1096
            char = int(char, 16)
1097
            self.exclusions[char] = 1
1098

1099
        widths = [None] * 0x110000
1100
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
1101
            widths[char] = width
1102

1103
        for i in range(0, 0x110000):
1104
            if table[i] is not None:
1105
                table[i].east_asian_width = widths[i]
1106
        self.widths = widths
1107

1108
        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
1109
            if table[char]:
1110
                # Some properties (e.g. Default_Ignorable_Code_Point)
1111
                # apply to unassigned code points; ignore them
1112
                table[char].binary_properties.add(p)
1113

1114
        for char_range, value in UcdFile(LINE_BREAK, version):
1115
            if value not in MANDATORY_LINE_BREAKS:
1116
                continue
1117
            for char in expand_range(char_range):
1118
                table[char].binary_properties.add('Line_Break')
1119

1120
        # We only want the quickcheck properties
1121
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1122
        # Yes is the default, hence only N and M occur
1123
        # In 3.2.0, the format was different (NF?_NO)
1124
        # The parsing will incorrectly determine these as
1125
        # "yes", however, unicodedata.c will not perform quickchecks
1126
        # for older versions, and no delta records will be created.
1127
        quickchecks = [0] * 0x110000
1128
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
1129
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
1130
            if len(s) < 2 or s[1] not in qc_order:
1131
                continue
1132
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1133
            quickcheck_shift = qc_order.index(s[1])*2
1134
            quickcheck <<= quickcheck_shift
1135
            for char in expand_range(s[0]):
1136
                assert not (quickchecks[char]>>quickcheck_shift)&3
1137
                quickchecks[char] |= quickcheck
1138
        for i in range(0, 0x110000):
1139
            if table[i] is not None:
1140
                table[i].quick_check = quickchecks[i]
1141

1142
        with open_data(UNIHAN, version) as file:
1143
            zip = zipfile.ZipFile(file)
1144
            if version == '3.2.0':
1145
                data = zip.open('Unihan-3.2.0.txt').read()
1146
            else:
1147
                data = zip.open('Unihan_NumericValues.txt').read()
1148
        for line in data.decode("utf-8").splitlines():
1149
            if not line.startswith('U+'):
1150
                continue
1151
            code, tag, value = line.split(None, 3)[:3]
1152
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1153
                           'kOtherNumeric'):
1154
                continue
1155
            value = value.strip().replace(',', '')
1156
            i = int(code[2:], 16)
1157
            # Patch the numeric field
1158
            if table[i] is not None:
1159
                table[i].numeric_value = value
1160

1161
        sc = self.special_casing = {}
1162
        for data in UcdFile(SPECIAL_CASING, version):
1163
            if data[4]:
1164
                # We ignore all conditionals (since they depend on
1165
                # languages) except for one, which is hardcoded. See
1166
                # handle_capital_sigma in unicodeobject.c.
1167
                continue
1168
            c = int(data[0], 16)
1169
            lower = [int(char, 16) for char in data[1].split()]
1170
            title = [int(char, 16) for char in data[2].split()]
1171
            upper = [int(char, 16) for char in data[3].split()]
1172
            sc[c] = (lower, title, upper)
1173

1174
        cf = self.case_folding = {}
1175
        if version != '3.2.0':
1176
            for data in UcdFile(CASE_FOLDING, version):
1177
                if data[1] in "CF":
1178
                    c = int(data[0], 16)
1179
                    cf[c] = [int(char, 16) for char in data[2].split()]
1180

1181
    def uselatin1(self):
1182
        # restrict character range to ISO Latin 1
1183
        self.chars = list(range(256))
1184

1185

1186
# hash table tools
1187

1188
# this is a straight-forward reimplementation of Python's built-in
1189
# dictionary type, using a static data structure, and a custom string
1190
# hash algorithm.
1191

1192
def myhash(s, magic):
1193
    h = 0
1194
    for c in map(ord, s.upper()):
1195
        h = (h * magic) + c
1196
        ix = h & 0xff000000
1197
        if ix:
1198
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
1199
    return h
1200

1201

1202
SIZES = [
1203
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
1204
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
1205
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
1206
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
1207
]
1208

1209

1210
class Hash:
1211
    def __init__(self, name, data, magic):
1212
        # turn a (key, value) list into a static hash table structure
1213

1214
        # determine table size
1215
        for size, poly in SIZES:
1216
            if size > len(data):
1217
                poly = size + poly
1218
                break
1219
        else:
1220
            raise AssertionError("ran out of polynomials")
1221

1222
        print(size, "slots in hash table")
1223

1224
        table = [None] * size
1225

1226
        mask = size-1
1227

1228
        n = 0
1229

1230
        hash = myhash
1231

1232
        # initialize hash table
1233
        for key, value in data:
1234
            h = hash(key, magic)
1235
            i = (~h) & mask
1236
            v = table[i]
1237
            if v is None:
1238
                table[i] = value
1239
                continue
1240
            incr = (h ^ (h >> 3)) & mask
1241
            if not incr:
1242
                incr = mask
1243
            while 1:
1244
                n = n + 1
1245
                i = (i + incr) & mask
1246
                v = table[i]
1247
                if v is None:
1248
                    table[i] = value
1249
                    break
1250
                incr = incr << 1
1251
                if incr > mask:
1252
                    incr = incr ^ poly
1253

1254
        print(n, "collisions")
1255
        self.collisions = n
1256

1257
        for i in range(len(table)):
1258
            if table[i] is None:
1259
                table[i] = 0
1260

1261
        self.data = Array(name + "_hash", table)
1262
        self.magic = magic
1263
        self.name = name
1264
        self.size = size
1265
        self.poly = poly
1266

1267
    def dump(self, file, trace):
1268
        # write data to file, as a C array
1269
        self.data.dump(file, trace)
1270
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
1271
        file.write("#define %s_size %d\n" % (self.name, self.size))
1272
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
1273

1274

1275
# stuff to deal with arrays of unsigned integers
1276

1277
class Array:
1278

1279
    def __init__(self, name, data):
1280
        self.name = name
1281
        self.data = data
1282

1283
    def dump(self, file, trace=0):
1284
        # write data to file, as a C array
1285
        size = getsize(self.data)
1286
        if trace:
1287
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
1288
        file.write("static const ")
1289
        if size == 1:
1290
            file.write("unsigned char")
1291
        elif size == 2:
1292
            file.write("unsigned short")
1293
        else:
1294
            file.write("unsigned int")
1295
        file.write(" " + self.name + "[] = {\n")
1296
        if self.data:
1297
            s = "    "
1298
            for item in self.data:
1299
                i = str(item) + ", "
1300
                if len(s) + len(i) > 78:
1301
                    file.write(s.rstrip() + "\n")
1302
                    s = "    " + i
1303
                else:
1304
                    s = s + i
1305
            if s.strip():
1306
                file.write(s.rstrip() + "\n")
1307
        file.write("};\n\n")
1308

1309

1310
def getsize(data):
1311
    # return smallest possible integer size for the given array
1312
    maxdata = max(data)
1313
    if maxdata < 256:
1314
        return 1
1315
    elif maxdata < 65536:
1316
        return 2
1317
    else:
1318
        return 4
1319

1320

1321
def splitbins(t, trace=0):
1322
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.
1323

1324
    t is a sequence of ints.  This function can be useful to save space if
1325
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
1326
    is an int, chosen to minimize the combined size of t1 and t2 (in C
1327
    code), and where for each i in range(len(t)),
1328
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1329
    where mask is a bitmask isolating the last "shift" bits.
1330

1331
    If optional arg trace is non-zero (default zero), progress info
1332
    is printed to sys.stderr.  The higher the value, the more info
1333
    you'll get.
1334
    """
1335

1336
    if trace:
1337
        def dump(t1, t2, shift, bytes):
1338
            print("%d+%d bins at shift %d; %d bytes" % (
1339
                len(t1), len(t2), shift, bytes), file=sys.stderr)
1340
        print("Size of original table:", len(t)*getsize(t), "bytes",
1341
              file=sys.stderr)
1342
    n = len(t)-1    # last valid index
1343
    maxshift = 0    # the most we can shift n and still have something left
1344
    if n > 0:
1345
        while n >> 1:
1346
            n >>= 1
1347
            maxshift += 1
1348
    del n
1349
    bytes = sys.maxsize  # smallest total size so far
1350
    t = tuple(t)    # so slices can be dict keys
1351
    for shift in range(maxshift + 1):
1352
        t1 = []
1353
        t2 = []
1354
        size = 2**shift
1355
        bincache = {}
1356
        for i in range(0, len(t), size):
1357
            bin = t[i:i+size]
1358
            index = bincache.get(bin)
1359
            if index is None:
1360
                index = len(t2)
1361
                bincache[bin] = index
1362
                t2.extend(bin)
1363
            t1.append(index >> shift)
1364
        # determine memory size
1365
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
1366
        if trace > 1:
1367
            dump(t1, t2, shift, b)
1368
        if b < bytes:
1369
            best = t1, t2, shift
1370
            bytes = b
1371
    t1, t2, shift = best
1372
    if trace:
1373
        print("Best:", end=' ', file=sys.stderr)
1374
        dump(t1, t2, shift, bytes)
1375
    if __debug__:
1376
        # exhaustively verify that the decomposition is correct
1377
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
1378
        for i in range(len(t)):
1379
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1380
    return best
1381

1382

1383
if __name__ == "__main__":
1384
    maketables(1)
1385

1386
Product

Resources

Company