Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/unicode/genmap_japanese.py
12 views
1
#
2
# genmap_japanese.py: Japanese Codecs Map Generator
3
#
4
# Original Author: Hye-Shik Chang <[email protected]>
5
# Modified Author: Dong-hee Na <[email protected]>
6
#
7
import os
8
9
from genmap_support import *
10
11
# Two-element tuples handed to DecodeMapWriter.update_decode_map(): the *_C1
# tuple is passed as the first argument and the *_C2 tuple as the second.
# NOTE(review): presumably inclusive (low, high) bounds for the first and
# second byte of a code -- confirm against genmap_support.
JISX0208_C1, JISX0208_C2 = (0x21, 0x74), (0x21, 0x7e)
JISX0212_C1, JISX0212_C2 = (0x22, 0x6d), (0x21, 0x7e)
JISX0213_C1, JISX0213_C2 = (0x21, 0x7e), (0x21, 0x7e)

# Patches between shift-jis and cp932.
CP932P0_C1, CP932P0_C2 = (0x81, 0x81), (0x5f, 0xca)
# CP932 P1
CP932P1_C1, CP932P1_C2 = (0x87, 0x87), (0x40, 0x9c)
# CP932 P2
CP932P2_C1, CP932P2_C2 = (0xed, 0xfc), (0x40, 0xfc)
23
24
# Source URLs of the reference mapping tables.  Each one is passed to
# open_mapping_file() together with the matching local path in main().
MAPPINGS_JIS0208 = (
    'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
)
MAPPINGS_JIS0212 = (
    'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
)
MAPPINGS_CP932 = (
    'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
)
MAPPINGS_JISX0213_2004 = (
    'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
)
28
29
30
def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into per-level decode maps.

    Everything after a ``#`` on a line is ignored, and lines with fewer than
    two whitespace-separated fields are skipped.  The first field is read as
    ``<level digit><one separator char><hex code>``; the second is either
    ``U+<hex>`` (a single code point) or ``U+<4 hex>+<4 hex>`` (a body
    character followed by a modifier).

    Args:
        fo: Iterable of text lines, e.g. an open mapping file.

    Returns:
        A 5-tuple ``(decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair)``.
        The first four are ``{code >> 8: {code & 0xff: value}}`` dicts:
        ``decmap3``/``decmap4`` hold code points below U+10000 for levels 3
        and 4, while ``decmap3_2``/``decmap4_2`` hold U+2xxxx code points
        with 0x20000 subtracted.  ``decmap3_pair`` maps each body code point
        to a dict of the same two-level shape whose values are the modifier
        code points.

    Raises:
        ValueError: If a field is not valid hex/decimal, if a single code
            point has a level other than 3 or 4 or lies outside the two
            supported ranges, or if a pair entry is not level 3.
    """
    decmap3, decmap4 = {}, {}      # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}              # maps to BMP-pair for level 3
    bmp_by_level = {3: decmap3, 4: decmap4}
    emp_by_level = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # int() instead of eval(): the fields come from a data file and must
        # only ever be interpreted as numbers.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])

        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_by_level.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = emp_by_level.get(level)
            else:
                m = None
            # Validate before touching the map so that bad input is reported
            # as "invalid map" rather than an AttributeError on None.
            if m is None:
                raise ValueError("invalid map")
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)       # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair
73
74
75
def main():
    """Generate the C mapping headers for the Japanese codecs.

    Opens the JIS X 0208, JIS X 0212, CP932 and JIS X 0213 tables through
    open_mapping_file(), loads their decode maps, derives the encode maps
    from them, and writes ``mappings_jp.h`` and ``mappings_jisx0213_pair.h``
    to the current working directory.

    Raises:
        SystemExit: If the JIS X 0213 table does not map level-3 code 0x2124
            to U+FF0C (the table has to be adjusted first).
        ValueError: If JIS X 0213 plane 1 disagrees with JIS X 0208 on a
            cell that both define and that is not a pair body.
    """
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # JIS0208.TXT is read twice, taking the native code from a different
    # column each time (column 0 for Shift-JIS, column 1 for JIS X 0208).
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    # Sanity check on one known cell.  .get() is used so that a table that
    # lacks the cell entirely also produces the helpful message instead of
    # a bare KeyError.
    if jis3decmap.get(0x21, {}).get(0x24) != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # Keep the first native code seen for each Unicode code point.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Drop every CP932 encode entry that the Shift-JIS encode map already
    # provides with the same value.  Iterate over copies because the maps
    # are modified inside the loop.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                # Body goes in the high 16 bits, modifier in the low 16 bits.
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            # Report (but still overwrite) code points that JIS X 0208
            # already claimed.
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Iterate over copies: entries duplicated by JIS X 0208 are removed from
    # jis3decmap while walking it.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # setdefault guards against a KeyError when no entry for
                    # this high byte has been created yet.
                    jisx0213bmpencmap.setdefault(code >> 8, {})
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,) # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
                else:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,) # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    # Level-4 entries share the BMP encode map, flagged with the 0x8000 bit.
    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Emit the (body, modifier, jis) triples in sorted order.
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")
249
250
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
252
253