GitHub Repository: allendowney/cpython
Path: blob/main/Tools/unicode/makeunicodedata.py
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01 benjamin add full case mappings
#
# written by Fredrik Lundh ([email protected])
#

import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "15.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in the BMP (plane 0) and planes 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# "N" needs to be the first entry, see the comment in makeunicodedata
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000
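
# Illustrative sketch (not part of the generated tables): these masks are
# OR-ed into the per-character "flags" word built in makeunicodetype()
# below.  For a plain lowercase letter such as U+0061 'a' the word would
# typically be
#     ALPHA_MASK | LOWER_MASK | PRINTABLE_MASK | XID_START_MASK \
#         | XID_CONTINUE_MASK | CASED_MASK == 0x2709
# The exact bits depend on the property files of the UCD version processed.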

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DBF'),
    ('4E00', '9FFF'),
    ('20000', '2A6DF'),
    ('2A700', '2B739'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
    ('30000', '3134A'),
    ('31350', '323AF'),
]


def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    # the default value of east_asian_width is "N", for unassigned code points
    # not mentioned in EastAsianWidth.txt
    # in addition there are some reserved but unassigned code points in CJK
    # ranges that are classified as "W". code points in private use areas
    # have a width of "A". both of these have entries in
    # EastAsianWidth.txt
    # see https://unicode.org/reports/tr11/#Unassigned
    assert EASTASIANWIDTH_NAMES[0] == "N"
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
        elif unicode.widths[char] is not None:
            # an unassigned but reserved character, with a known
            # east_asian_width
            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
            item = (0, 0, 0, 0, eastasianwidth, 0)
        else:
            continue

        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data_cache = {}
    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
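                # Illustrative sketch of the packing above (assuming UCD
                # 15.0.0 data): U+00C0 decomposes canonically to "0041 0300",
                # so with prefix index 0 and two code points the stored run is
                #     [0 + (2<<8), 0x0041, 0x0300] == [0x200, 0x41, 0x300]
                # A compatibility mapping such as U+00BC's
                # "<fraction> 0031 2044 0034" keeps the index of "<fraction>"
                # in the low byte and the length 3 in the high byte of the
                # first entry.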
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                key = tuple(decomp)
                i = decomp_data_cache.get(key, -1)
                if i == -1:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
                    decomp_data_cache[key] = i
                else:
                    assert decomp_data[i:i+len(decomp)] == decomp
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
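
    # Illustrative sketch: composition is a 2D lookup flattened into one
    # array.  To compose a pair L+R, one computes
    #     comp_data[comp_first[L]*total_last + comp_last[R]]
    # e.g. L=U+0041, R=U+0300 yields U+00C0 (assuming UCD 15.0.0 data, and
    # modulo the splitbins() reindexing applied below).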

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)
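
        # Illustrative sketch: consumers of these index1/index2 pairs do a
        # two-level lookup, as in the get_change_* helpers generated below:
        #     value = index2[(index1[ch >> SHIFT] << SHIFT)
        #                    + (ch & ((1 << SHIFT) - 1))]
        # See the splitbins() docstring for why this round-trips correctly.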

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
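            # Illustrative sketch of the packed fields above (hypothetical
            # offsets): for U+00DF LATIN SMALL LETTER SHARP S, whose full
            # uppercase mapping is "SS", `upper` becomes
            #     offset_into_extra_casing | (2 << 24)
            # i.e. the low bits hold the offset, bits 24-31 the mapping
            # length, and for `lower` bits 20-23 can additionally hold the
            # case-folding length.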
            # decimal digit, integer digit
            # (note: positionally, `decomposition_mapping` and `numeric_type`
            # hold fields 6 and 7 of UnicodeData.txt, i.e. the decimal-digit
            # and digit values; see the UcdRecord definition below)
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split the type index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()


# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a name and words ending a name; the latter include
    # the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))
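
    # Illustrative sketch of the encoding above: the word "CAPITAL" is
    # stored as "CAPITA" + chr(ord("L") + 128), so a decoder scans bytes
    # until it sees one with bit 7 set instead of storing an explicit
    # length.  A word that is a suffix of an already-stored word simply
    # points into it rather than being appended again.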

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)
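                    # Illustrative sketch (hypothetical numbers): with
                    # escapes == 3, short == 253; a word index of 300 is
                    # stored as the bytes (300>>8)+253 == 254 and
                    # 300&255 == 44.  A decoder seeing a byte >= short
                    # recovers i = ((byte - short) << 8) + next_byte.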

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        fprint()
        fprint("/* lexicon */")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split the phrasebook index table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fprint("/* code->name phrasebook */")
        fprint("#define phrasebook_shift", shift)
        fprint("#define phrasebook_short", short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fprint("/* name->code dictionary */")
        codehash.dump(fp, trace)

        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old.table[i], field.name)
                new_value = getattr(new.table[i], field.name)
                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception): pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))


DATA_DIR = os.path.join('Tools', 'unicode', 'data')

def open_data(template, version):
    local = os.path.join(DATA_DIR, template % ('-'+version,))
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')
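
# Illustrative sketch: open_data(UNICODE_DATA, "15.0.0") looks for
# Tools/unicode/data/UnicodeData-15.0.0.txt and, if it is missing, downloads
# https://www.unicode.org/Public/15.0.0/ucd/UnicodeData.txt to that path.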


def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    for char in range(first, last+1):
        yield char
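
# Illustrative sketch:
#     list(expand_range('0041..0043')) == [0x41, 0x42, 0x43]
#     list(expand_range('00E9')) == [0xE9]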


class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        with open_data(self.template, self.version) as file:
            for line in file:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                yield [field.strip() for field in line.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        for record in self.records():
            char_range, rest = record[0], record[1:]
            for char in expand_range(char_range):
                yield char, rest


@dataclasses.dataclass
class UcdRecord:
    # 15 fields from UnicodeData.txt .  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int


def from_row(row: List[str]) -> UcdRecord:
    return UcdRecord(*row, None, set(), 0)
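
# Illustrative sketch: for the UnicodeData.txt line
#     0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
# from_row() fills the 15 positional fields (name "LATIN CAPITAL LETTER A",
# general_category "Lu", simple_lowercase_mapping "0061", ...) and leaves
# east_asian_width, binary_properties, and quick_check to be patched in
# later by UnicodeData.__init__.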


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)
1029
1030
cjk_ranges_found = []
1031
1032
# expand first-last ranges
1033
field = None
1034
for i in range(0, 0x110000):
1035
# The file UnicodeData.txt has its own distinct way of
1036
# expressing ranges. See:
1037
# https://www.unicode.org/reports/tr44/#Code_Point_Ranges
1038
s = table[i]
1039
if s:
1040
if s.name[-6:] == "First>":
1041
s.name = ""
1042
field = dataclasses.astuple(s)[:15]
1043
elif s.name[-5:] == "Last>":
1044
if s.name.startswith("<CJK Ideograph"):
1045
cjk_ranges_found.append((field[0],
1046
s.codepoint))
1047
s.name = ""
1048
field = None
1049
elif field:
1050
table[i] = from_row(('%X' % i,) + field[1:])
1051
if cjk_check and cjk_ranges != cjk_ranges_found:
1052
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
1053
1054
# public attributes
1055
self.filename = UNICODE_DATA % ''
1056
self.table = table
1057
self.chars = list(range(0x110000)) # unicode 3.2
1058
1059
# check for name aliases and named sequences, see #12753
1060
# aliases and named sequences are not in 3.2.0
1061
if version != '3.2.0':
1062
self.aliases = []
1063
# store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
1064
# in order to take advantage of the compression and lookup
1065
# algorithms used for the other characters
1066
pua_index = NAME_ALIASES_START
1067
for char, name, abbrev in UcdFile(NAME_ALIASES, version):
1068
char = int(char, 16)
1069
self.aliases.append((name, char))
1070
# also store the name in the PUA 1
1071
self.table[pua_index].name = name
1072
pua_index += 1
1073
assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in PUA 15, in range U+F0200..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]
        self.widths = widths

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(p)

        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1  # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]
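
        # Illustrative sketch of the bitmask layout above: two bits per
        # property, in qc_order; 0 means Yes, 1 Maybe, 2 No.  A character
        # with NFC_QC == M (and Yes for the rest) gets
        #     quick_check == 1 << (qc_order.index('NFC_QC') * 2) == 1 << 4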

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))


# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
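
# Illustrative sketch: the hash is case-insensitive and folded to 24 bits.
# For example, myhash("A", 47) == ord("A") == 65, and
# myhash("ab", 47) == myhash("AB", 47) == 65*47 + 66 == 3121.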


SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]


class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))


# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")


def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


if __name__ == "__main__":
    maketables(1)