Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Kitware
GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmlibarchive/build/utils/gen_archive_string_composition_h.sh
5043 views
1
#!/bin/sh
#
# Generate archive_string_composition.h from a copy of UnicodeData.txt.
#
# Usage: gen_archive_string_composition_h.sh path/to/UnicodeData.txt
#
set -eu

if [ "$#" -ne 1 ]
then
	# Diagnostics belong on stderr so they never end up in redirected output.
	echo "Usage: $0 path/to/UnicodeData.txt" >&2
	exit 1
fi

#
# This needs http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
#
inputfile="$1" # Expect UnicodeData.txt
outfile=archive_string_composition.h
# Scratch files that receive the two generated awk programs.  mktemp creates
# each file itself under an unpredictable name, so the later "cat >" cannot be
# redirected through a symlink planted in /tmp under a guessable name.
pickout=$(mktemp /tmp/mk_unicode_composition_tbl.XXXXXX)
pickout2=$(mktemp /tmp/mk_unicode_composition_tbl2.XXXXXX)
# Intermediate "nfc cp1 cp2" list: appended to by the first awk program and
# read back by the second one.  It lives in the current directory.
nfdtmp="nfdtmpx"
19
#################################################################################
#
# Write the file header of "archive_string_composition.h"
#
# Creates (or truncates) ${outfile} and fills it with the licence text, the
# include guard and the definition of struct unicode_composition_table.
#
# The here-document delimiter is quoted on purpose: the text contains
# backquotes (``AS IS''), which an unquoted here-document would execute as an
# empty command substitution and silently drop from the generated header.
#
#################################################################################
append_copyright()
{
cat > "${outfile}" <<'CR_END'
/*-
 * Copyright (c) 2011-2012 libarchive Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * ATTENTION!
 *  This file is generated by build/utils/gen_archive_string_composition_h.sh
 *  from http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
 *
 * See also http://unicode.org/report/tr15/
 */

#ifndef __LIBARCHIVE_BUILD
#error This header is only to be used internally to libarchive.
#endif

#ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
#define ARCHIVE_STRING_COMPOSITION_H_INCLUDED

struct unicode_composition_table {
	uint32_t cp1;
	uint32_t cp2;
	uint32_t nfc;
};

CR_END
}
75
#################################################################################
76
#
77
# awk script
78
#
79
#################################################################################
80
cat > ${pickout} <<AWK_END
#
# First awk program: reads UnicodeData.txt (fields separated by ";") and
# writes the composition table plus the combining-class tables to stdout.
# This text passes through an unquoted here-document, so every dollar sign
# and backslash meant for awk is escaped for the shell first.
#
BEGIN {
	FS = ";"
	# First and last code point seen with a non-zero combining class.
	min = "";
	max = "";
	# Pipeline that receives "cp1 cp2 nfc" triples, sorts them and formats
	# each one as a C initializer.  LC_ALL=C pins the sort to byte order so
	# the generated table does not depend on the locale of the builder.
	cmd="LC_ALL=C sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'"
	# Name of the intermediate NFC ==> NFD file; filled in by the shell
	# when this program text is written out.
	nfdtbl="${nfdtmp}"
	print "static const struct unicode_composition_table u_composition_table[] = {"
}
90
END {
	# Closing the pipeline waits for the sorted composition entries to be
	# written before the table is terminated below.
	close(cmd)
	print "};"
	print ""
	#
	# Output Canonical Combining Class tables used for translating NFD to NFC.
	# min, max and highnum were recorded by the combining-class rule.
	#
	printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min
	printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max
	print ""
	printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\\n"
	printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum
	printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum
	#
	# Output blockmap: one flag per 256-code-point block, 32 flags per line.
	#
	for (i = 0; i <= highnum; i++) {
		if (i != 0 && i % 32 == 0)
			printf "\\n\\t"
		# Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
		if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
			printf "1,"
		else
			printf "0,"
	}
	printf "\\n};\\n\\n"
	#
	# Output a macro to get a canonical combining class.
	#
	print "/* Get Canonical Combining Class(CCC). */"
	printf "#define CCC(uc)\\t\\\\\\n"
	printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max
	printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n"
	print ""
	#
	# Output a canonical combining class value table: one row of 16 values
	# for every 16-code-point group that holds a non-zero class.  Row 0 is
	# the shared all-zero row.
	#
	midcnt = 0
	printf "/* The table of the value of Canonical Combining Class */\\n"
	print "static const unsigned char ccc_val[][16] = {"
	print " /* idx=0: XXXX0 - XXXXF */"
	print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
	for (h = 0; h <= highnum; h++) {
		if (!blockmap[h])
			continue;
		for (m = 0; m < 16; m++) {
			if (!xx_blockmap[h, m])
				continue;
			midcnt++
			printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m
			for (l = 0; l < 15; l++) {
				printf "%d, ", xxx_blockmap[h, m, l]
			}
			printf "%d },\\n", xxx_blockmap[h, m, 15]
		}
	}
	printf "};\\n"
	#
	# Output the index table of the canonical combining class value table.
	# midcnt is counted again in the same order as above, so each entry is
	# the row number that was assigned in ccc_val.
	#
	cnt = 0
	midcnt = 0
	printf "\\n/* The index table to ccc_val[*][16] */\\n"
	print "static const unsigned char ccc_val_index[][16] = {"
	print " /* idx=0: XXX00 - XXXFF */"
	print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
	for (h = 0; h <= highnum; h++) {
		if (!blockmap[h])
			continue;
		cnt++
		printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h
		for (m = 0; m < 16; m++) {
			if (m != 0)
				printf ","
			if (xx_blockmap[h, m]) {
				midcnt++
				printf "%2d", midcnt
			} else
				printf " 0"
		}
		printf " },\\n"
	}
	printf "};\\n"
	#
	# Output the index table to the index table of the canonical combining
	# class value table, 24 entries per line.
	#
	printf "\\n/* The index table to ccc_val_index[*][16] */\\n"
	printf "static const unsigned char ccc_index[] = {\\n "
	cnt = 0
	for (h = 0; h <= highnum; h++) {
		if (h != 0 && h % 24 == 0)
			printf "\\n "
		if (blockmap[h]) {
			cnt++;
			printf "%2d,", cnt
		} else
			printf " 0,"
	}
	print "};"
	print ""
}
191
#
# hextoi(hex): convert a string of hexadecimal digits to a number.
#
# Only the digits 0-9 and the upper-case letters A-F contribute; any other
# character is skipped.  An empty string yields 0.
# dec, i and pos are awk locals (extra parameters), so calling this function
# does not disturb globals of the same name; callers pass hex only.
#
function hextoi(hex,    dec, i, pos)
{
	dec = 0
	for (i = 1; i <= length(hex); i++) {
		# index() is 1-based, so a digit's value is its position minus one.
		pos = index("0123456789ABCDEF", substr(hex, i, 1))
		if (pos > 0)
			dec = dec * 16 + (pos - 1)
	}
	return dec
}
215
#
# Collect Canonical Combining Class values.
#
# Field 4 is the combining class.  For every record whose class is not zero,
# remember the first and last such code point (min, max) and record the value
# in three maps keyed by the code point split into its high part (everything
# but the last two hex digits), its middle digit and its low digit.
# Every dollar sign is escaped so the shell copies it through unchanged.
#
\$4 ~/^[0-9A-F]+\$/ {
	if (\$4 !~/^0\$/) {
		if (min == "") {
			min = \$1
		}
		max = \$1
		high = substr(\$1, 1, length(\$1) -2)
		highnum = hextoi(high)
		mid = substr(\$1, length(\$1) -1, 1)
		midnum = hextoi(mid)
		low = substr(\$1, length(\$1), 1)
		lownum = hextoi(low)
		blockmap[highnum] = 1
		xx_blockmap[highnum, midnum] = 1
		xxx_blockmap[highnum, midnum, lownum] = \$4
	}
}
235
#
236
# Following code points are not decomposed in MAC OS.
237
# U+2000 - U+2FFF
238
# U+F900 - U+FAFF
239
# U+2F800 - U+2FAFF
240
#
241
#\$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]\$/ {
242
# next
243
#}
244
#\$1 ~/^F[9A][0-9A-F][0-9A-F]\$/ {
245
# next
246
#}
247
#\$1 ~/^2F[89A][0-9A-F][0-9A-F]\$/ {
248
# next
249
#}
250
#
# Exclusion code points specified by
# http://unicode.org/Public/6.0.0/ucd/CompositionExclusions.txt
#
# Each rule below skips the current record, so an excluded code point never
# reaches the output rule that follows this list.
#
##
# 1. Script Specifics
##
\$1 ~/^095[89ABCDEF]\$/ { next }
\$1 ~/^09D[CDF]\$/ { next }
\$1 ~/^0A3[36]\$/ { next }
\$1 ~/^0A5[9ABE]\$/ { next }
\$1 ~/^0B5[CD]\$/ { next }
\$1 ~/^0F4[3D]\$/ { next }
\$1 ~/^0F5[27C]\$/ { next }
\$1 ~/^0F69\$/ { next }
\$1 ~/^0F7[68]\$/ { next }
\$1 ~/^0F9[3D]\$/ { next }
\$1 ~/^0FA[27C]\$/ { next }
\$1 ~/^0FB9\$/ { next }
\$1 ~/^FB1[DF]\$/ { next }
\$1 ~/^FB2[ABCDEF]\$/ { next }
\$1 ~/^FB3[012345689ABCE]\$/ { next }
\$1 ~/^FB4[01346789ABCDE]\$/ { next }
##
# 2. Post Composition Version precomposed characters
##
\$1 ~/^2ADC\$/ { next }
\$1 ~/^1D15[EF]\$/ { next }
\$1 ~/^1D16[01234]\$/ { next }
\$1 ~/^1D1B[BCDEF]\$/ { next }
\$1 ~/^1D1C0\$/ { next }
##
# 3. Singleton Decompositions
##
\$1 ~/^034[01]\$/ { next }
\$1 ~/^037[4E]\$/ { next }
\$1 ~/^0387\$/ { next }
\$1 ~/^1F7[13579BD]\$/ { next }
\$1 ~/^1FB[BE]\$/ { next }
\$1 ~/^1FC[9B]\$/ { next }
\$1 ~/^1FD[3B]\$/ { next }
\$1 ~/^1FE[3BEF]\$/ { next }
\$1 ~/^1FF[9BD]\$/ { next }
\$1 ~/^200[01]\$/ { next }
\$1 ~/^212[6AB]\$/ { next }
\$1 ~/^232[9A]\$/ { next }
\$1 ~/^F9[0-9A-F][0-9A-F]\$/ { next }
\$1 ~/^FA0[0-9A-D]\$/ { next }
\$1 ~/^FA1[025-9A-E]\$/ { next }
\$1 ~/^FA2[0256A-D]\$/ { next }
\$1 ~/^FA[3-5][0-9A-F]\$/ { next }
\$1 ~/^FA6[0-9A-D]\$/ { next }
\$1 ~/^FA[7-9A-C][0-9A-F]\$/ { next }
\$1 ~/^FAD[0-9]\$/ { next }
\$1 ~/^2F[89][0-9A-F][0-9A-F]\$/ { next }
\$1 ~/^2FA0[0-9A-F]\$/ { next }
\$1 ~/^2FA1[0-9A-D]\$/ { next }
##
# 4. Non-Starter Decompositions
##
\$1 ~/^0344\$/ { next }
\$1 ~/^0F7[35]\$/ { next }
\$1 ~/^0F81\$/ { next }
406
#
407
# Output combinations for NFD ==> NFC.
408
#
409
\$6 ~/^[0-9A-F]+ [0-9A-F]+\$/ {
410
split(\$6, cp, " ")
411
if (length(\$1) == 4)
412
print "0"cp[1], "0"cp[2], "0"\$1 | cmd
413
else
414
print cp[1], cp[2], \$1 | cmd
415
# NFC ==> NFD table.
416
if (length(\$1) == 4)
417
print "0"\$1, "0"cp[1], "0"cp[2] >>nfdtbl
418
else
419
print \$1, cp[1], cp[2] >>nfdtbl
420
}
421
AWK_END
422
#################################################################################
#
# Second awk program: turns the intermediate "nfc cp1 cp2" lines into the
# definition of struct unicode_decomposition_table and its initializer.
# The here-document is quoted, so the awk text is stored exactly as written.
#
#################################################################################
cat > ${pickout2} <<'AWK_END'
#
BEGIN {
	FS = " "
	printf "struct unicode_decomposition_table {\n"
	printf "\tuint32_t nfc;\n\tuint32_t cp1;\n\tuint32_t cp2;\n"
	printf "};\n\n"
	printf "static const struct unicode_decomposition_table u_decomposition_table[] = {\n"
}
# One initializer per input line.
{
	print "\t{ 0x" $1 " , 0x" $2 " , 0x" $3 " },"
}
# Close the array and leave one blank line after it.
END {
	printf "};\n\n"
}
AWK_END
446
#################################################################################
#
# Run the awk scripts and assemble the header.
#
#################################################################################

# Remove the generated awk scripts and the intermediate list.  Run from an
# EXIT trap so the files also disappear when a step fails under "set -e".
cleanup()
{
	rm -f "${pickout}" "${pickout2}" "${nfdtmp}"
}
trap cleanup EXIT
# Turn the usual termination signals into a normal exit so cleanup runs.
trap 'exit 1' HUP INT TERM

# The first awk script appends to this file; start from an empty one so that
# leftovers from an interrupted earlier run cannot leak into the output.
: > "${nfdtmp}"

append_copyright
# Paths are quoted so an input file name containing spaces still works.
awk -f "${pickout}" "${inputfile}" >> "${outfile}"
awk -f "${pickout2}" "${nfdtmp}" >> "${outfile}"
echo "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */" >> "${outfile}"
echo "" >> "${outfile}"
461
462