Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Kitware
GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmlibarchive/build/utils/gen_archive_string_composition_h.sh
3153 views
1
#!/bin/sh
2
#
3
# This needs http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
4
#
5
# Path to UnicodeData.txt.  With no argument awk reads standard input, so
# refuse to start when stdin is a terminal: the script would otherwise just
# sit there waiting for input.
inputfile="$1" # Expect UnicodeData.txt
if [ -z "${inputfile}" ]; then
    if [ -t 0 ]; then
        echo "usage: $0 UnicodeData.txt" >&2
        exit 1
    fi
elif [ ! -r "${inputfile}" ]; then
    echo "$0: cannot read ${inputfile}" >&2
    exit 1
fi
# Generated header, written to the current directory.
outfile=archive_string_composition.h
#nfdtmp=/tmp/mk_unicode_decomposition_tmp$$.txt
# Scratch table filled by the first awk program and read by the second.
nfdtmp="nfdtmpx"
# Remove the scratch files however the script ends.  The names are still
# empty here; they are filled in by mktemp just below.
pickout=""
pickout2=""
trap 'rm -f "${pickout}" "${pickout2}" "${nfdtmp}"' EXIT
trap 'exit 1' HUP INT TERM
# Let mktemp pick unpredictable names instead of /tmp/<name>$$, which another
# user could pre-create (or symlink) to have this script clobber a file.
pickout=`mktemp /tmp/mk_unicode_composition_tbl.XXXXXX` || exit 1
pickout2=`mktemp /tmp/mk_unicode_composition_tbl2.XXXXXX` || exit 1
11
#################################################################################
12
#
13
# Append the file header of "archive_string_composition.h"
14
#
15
#################################################################################
16
append_copyright()
{
    # Start ${outfile} afresh with the licence text, the include guard and the
    # definition of struct unicode_composition_table.
    #
    # The here-document delimiter is quoted so the text is copied verbatim.
    # Unquoted, the shell would treat the two backticks in front of AS IS as an
    # empty command substitution and drop them from the generated header.
    cat > "${outfile}" <<'CR_END'
/*-
* Copyright (c) 2011-2012 libarchive Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
* ATTENTION!
* This file is generated by build/utils/gen_archive_string_composition_h.sh
* from http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
*
* See also http://unicode.org/report/tr15/
*/

#ifndef __LIBARCHIVE_BUILD
#error This header is only to be used internally to libarchive.
#endif

#ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
#define ARCHIVE_STRING_COMPOSITION_H_INCLUDED

struct unicode_composition_table {
uint32_t cp1;
uint32_t cp2;
uint32_t nfc;
};

CR_END
}
67
#################################################################################
68
#
69
# awk script
70
#
71
#################################################################################
72
cat > ${pickout} <<AWK_END
#
# Set-up for the composition pass.  Input fields are separated by ';'.
# Composition rows are piped through "cmd", which sorts them and formats each
# one as a C initializer; decomposition rows are appended to the file named by
# nfdtbl.  min and max will hold the first and last code point seen with a
# non-zero combining class.
#
BEGIN {
    nfdtbl="${nfdtmp}"
    cmd="sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'"
    max = min = ""
    FS = ";"
    print "static const struct unicode_composition_table u_composition_table[] = {"
}
82
END {
    # Closing the pipe waits for the sort pipeline to finish, so the
    # composition rows are written out before the array is closed.
    close(cmd)
    print "};"
    print ""
    #
    # Output Canonical Combining Class tables used for translating NFD to NFC.
    #
    printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min
    printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max
    print ""
    # The first line of the macro ends in a backslash-newline continuation.
    printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\\n"
    printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum
    printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum
    #
    # Output blockmap: one flag per 256-code-point block, 32 flags per line.
    for (i = 0; i <= highnum; i++) {
        if (i != 0 && i % 32 == 0)
            printf "\\n\\t"
        # Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
        if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
            printf "1,"
        else
            printf "0,"
    }
    printf "\\n};\\n\\n"
    #
    # Output a macro to get a canonical combining class.
    #
    print "/* Get Canonical Combining Class(CCC). */"
    printf "#define CCC(uc)\\t\\\\\\n"
    printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max
    printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n"
    print ""
    #
    # Output a canonical combining class value table.
    # Row 0 is all zeroes; every 16-code-point group that contains a non-zero
    # class gets the next row number (midcnt).
    #
    midcnt = 0
    printf "/* The table of the value of Canonical Combining Class */\\n"
    print "static const unsigned char ccc_val[][16] = {"
    print " /* idx=0: XXXX0 - XXXXF */"
    print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
    for (h = 0; h <= highnum; h++) {
        if (!blockmap[h])
            continue;
        for (m = 0; m < 16; m++) {
            if (!xx_blockmap[h, m])
                continue;
            midcnt++
            printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m
            for (l = 0; l < 15; l++) {
                printf "%d, ", xxx_blockmap[h, m, l]
            }
            printf "%d },\\n", xxx_blockmap[h, m, 15]
        }
    }
    printf "};\\n"
    #
    # Output the index table of the canonical combining class value table.
    # midcnt is recounted in the same order as above, so each entry is the
    # ccc_val row emitted for that group.
    #
    cnt = 0
    midcnt = 0
    printf "\\n/* The index table to ccc_val[*][16] */\\n"
    print "static const unsigned char ccc_val_index[][16] = {"
    print " /* idx=0: XXX00 - XXXFF */"
    print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
    for (h = 0; h <= highnum; h++) {
        if (!blockmap[h])
            continue;
        cnt++
        printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h
        for (m = 0; m < 16; m++) {
            if (m != 0)
                printf ","
            if (xx_blockmap[h, m]) {
                midcnt++
                printf "%2d", midcnt
            } else
                printf " 0"
        }
        printf " },\\n"
    }
    printf "};\\n"
    #
    # Output the index table to the index table of the canonical combining
    # class value table.
    #
    printf "\\n/* The index table to ccc_val_index[*][16] */\\n"
    printf "static const unsigned char ccc_index[] = {\\n "
    cnt = 0
    for (h = 0; h <= highnum; h++) {
        if (h != 0 && h % 24 == 0)
            printf "\\n "
        if (blockmap[h]) {
            cnt++;
            printf "%2d,", cnt
        } else
            printf " 0,"
    }
    print "};"
    print ""
}
183
#
184
#
185
#
# Convert a string of hexadecimal digits to a number.
#   hex: the digits, most significant first; upper or lower case.
# Characters that are not hexadecimal digits are skipped.  An empty string
# gives 0.
# pos, digit and value are locals: in awk, extra parameters are the only way
# to keep a function's variables from being globals shared with the rules.
#
function hextoi(hex,    pos, digit, value)
{
    value = 0
    hex = toupper(hex)
    for (pos = 1; pos <= length(hex); pos++) {
        # index() is 1-based and returns 0 when the character is not a digit.
        digit = index("0123456789ABCDEF", substr(hex, pos, 1))
        if (digit > 0)
            value = value * 16 + digit - 1
    }
    return value
}
207
#
208
# Collect Canonical Combining Class values.
209
#
210
\$4 ~ /^[0-9A-F]+\$/ && \$4 !~ /^0\$/ {
    # Field 4 is a non-zero combining class.  Remember the first and the
    # last code point that carries one.
    if (min == "")
        min = \$1
    max = \$1
    # Split the code point into its last hex digit, the digit before it,
    # and everything in front of those two.
    high = substr(\$1, 1, length(\$1) - 2)
    mid = substr(\$1, length(\$1) - 1, 1)
    low = substr(\$1, length(\$1), 1)
    highnum = hextoi(high)
    midnum = hextoi(mid)
    lownum = hextoi(low)
    # Mark the 256-wide block and the 16-wide group as used and store the
    # class value; the END rule builds the lookup tables from these arrays.
    blockmap[highnum] = 1
    xx_blockmap[highnum, midnum] = 1
    xxx_blockmap[highnum, midnum, lownum] = \$4
}
227
#
228
# Following code points are not decomposed in MAC OS.
229
# U+2000 - U+2FFF
230
# U+F900 - U+FAFF
231
# U+2F800 - U+2FAFF
232
#
233
#\$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]\$/ {
234
# next
235
#}
236
#\$1 ~/^F[9A][0-9A-F][0-9A-F]\$/ {
237
# next
238
#}
239
#\$1 ~/^2F[89A][0-9A-F][0-9A-F]\$/ {
240
# next
241
#}
242
#
243
# Exclusion code points specified by
244
# http://unicode.org/Public/6.0.0/ucd/CompositionExclusions.txt
245
##
246
# 1. Script Specifics
247
##
248
\$1 ~ /^(095[89ABCDEF]|09D[CDF]|0A3[36]|0A5[9ABE]|0B5[CD])\$/ ||
\$1 ~ /^0F(4[3D]|5[27C]|69|7[68]|9[3D]|A[27C]|B9)\$/ ||
\$1 ~ /^FB(1[DF]|2[ABCDEF]|3[012345689ABCE]|4[01346789ABCDE])\$/ {
    next
}
##
# 2. Post Composition Version precomposed characters
##
\$1 ~ /^(2ADC|1D15[EF]|1D16[01234]|1D1B[BCDEF]|1D1C0)\$/ {
    next
}
##
# 3. Singleton Decompositions
##
\$1 ~ /^(034[01]|037[4E]|0387)\$/ ||
\$1 ~ /^1F(7[13579BD]|B[BE]|C[9B]|D[3B]|E[3BEF]|F[9BD])\$/ ||
\$1 ~ /^(200[01]|212[6AB]|232[9A])\$/ ||
\$1 ~ /^F9[0-9A-F][0-9A-F]\$/ ||
\$1 ~ /^FA(0[0-9A-D]|1[025-9A-E]|2[0256A-D]|[3-5][0-9A-F]|6[0-9A-D]|[7-9A-C][0-9A-F]|D[0-9])\$/ ||
\$1 ~ /^2F([89][0-9A-F][0-9A-F]|A0[0-9A-F]|A1[0-9A-D])\$/ {
    next
}
##
# 4. Non-Starter Decompositions
##
\$1 ~ /^(0344|0F7[35]|0F81)\$/ {
    next
}
398
#
399
# Output combinations for NFD ==> NFC.
400
#
401
\$6 ~/^[0-9A-F]+ [0-9A-F]+\$/ {
    # Field 6 holds exactly two code points: a canonical pair.
    split(\$6, cp, " ")
    if (length(\$1) != 4) {
        # NFD ==> NFC row goes to the sort pipeline, ...
        print cp[1], cp[2], \$1 | cmd
        # ... NFC ==> NFD row goes to the scratch table.
        print \$1, cp[1], cp[2] >>nfdtbl
    } else {
        # Four-digit code points get a leading "0" on every column.
        print "0"cp[1], "0"cp[2], "0"\$1 | cmd
        print "0"\$1, "0"cp[1], "0"cp[2] >>nfdtbl
    }
}
413
AWK_END
414
#################################################################################
415
# awk script
416
#
417
#################################################################################
418
cat > ${pickout2} <<AWK_END
#
# Wrap the space-separated "nfc cp1 cp2" rows of the scratch table in a C
# struct definition and a static array.
#
# One initializer per input row.
{
    printf "\t{ 0x%s , 0x%s , 0x%s },\n", \$1, \$2, \$3;
}
# Struct definition and array opener come before the first row.
BEGIN {
    FS = " "
    print "struct unicode_decomposition_table {"
    print "\tuint32_t nfc;"
    print "\tuint32_t cp1;"
    print "\tuint32_t cp2;"
    print "};"
    print ""
    print "static const struct unicode_decomposition_table u_decomposition_table[] = {"
}
# Close the array after the last row.
END {
    print "};"
    print ""
}
AWK_END
438
#################################################################################
439
#
440
# Run the awk scripts.
441
#
442
#################################################################################
443
# The first awk program appends to ${nfdtmp}; start from an empty table so
# rows left behind by an interrupted run are not emitted twice.
rm -f "${nfdtmp}"
append_copyright
# ${inputfile:+...} passes the file name as one word (it may contain blanks)
# and passes nothing at all when it is empty, so awk still reads stdin then.
# The include guard is only closed when both passes succeeded, so a failure
# cannot leave behind a header that looks complete.
if awk -f "${pickout}" ${inputfile:+"${inputfile}"} >> "${outfile}" &&
   awk -f "${pickout2}" "${nfdtmp}" >> "${outfile}"; then
    echo "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */" >> "${outfile}"
    echo "" >> "${outfile}"
    status=0
else
    echo "$0: failed to generate ${outfile}" >&2
    status=1
fi
#
# Remove the awk scripts and the scratch table.
rm -f "${pickout}" "${pickout2}" "${nfdtmp}"
exit ${status}
453
454