Path: blob/master/Utilities/cmlibarchive/build/utils/gen_archive_string_composition_h.sh
5043 views
#!/bin/sh
#
# Generate archive_string_composition.h (in the current directory) from a
# Unicode Character Database file.
#
# Usage: gen_archive_string_composition_h.sh path/to/UnicodeData.txt
#
# The script writes two awk programs into a private temporary directory,
# runs them over the input and appends their output to the header.  All
# temporary files are removed on exit, whether the run succeeds or fails.
#
set -eu

# Force byte-wise collation so that sort(1) and the awk bracket ranges
# behave the same regardless of the caller's locale.
LC_ALL=C
export LC_ALL

if [ $# != 1 ]
then
  echo "Usage: $0 path/to/UnicodeData.txt" >&2
  exit 1
fi

#
# This needs http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
#
inputfile="$1" # Expect UnicodeData.txt
outfile=archive_string_composition.h

if [ ! -r "${inputfile}" ]
then
  echo "$0: cannot read ${inputfile}" >&2
  exit 1
fi

# Keep every scratch file in a freshly created private directory instead of
# predictable names in /tmp or the current directory.  The NFC ==> NFD
# scratch table is written in append mode by awk, so it must never be
# inherited from an earlier, aborted run.
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/mk_unicode_composition.XXXXXX")

# Remove the scratch directory; installed as the EXIT handler below.
cleanup()
{
  rm -rf -- "${tmpdir}"
}
trap cleanup EXIT
# Turn the usual termination signals into a normal exit so that the EXIT
# handler runs for them as well.
trap 'exit 1' HUP INT TERM

pickout="${tmpdir}/composition_tbl.awk"
pickout2="${tmpdir}/composition_tbl2.awk"
# NOTE: this path is interpolated into an awk string literal below, so the
# temporary directory name must not contain double quotes or backslashes.
nfdtmp="${tmpdir}/decomposition_tmp.txt"

################################################################################
#
# Append the file header of "archive_string_composition.h"
#
################################################################################
# Writes (truncating) the license, the include guard and the definition of
# struct unicode_composition_table to ${outfile}.
append_copyright()
{
# The delimiter is quoted on purpose: the text needs no expansion, and with
# an unquoted delimiter the two backticks in front of "AS IS" would be run
# as an empty command substitution and disappear from the output.
cat > "${outfile}" <<'CR_END'
/*-
 * Copyright (c) 2011-2012 libarchive Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * ATTENTION!
 *  This file is generated by build/utils/gen_archive_string_composition_h.sh
 *  from http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
 *
 *  See also http://unicode.org/report/tr15/
 */

#ifndef __LIBARCHIVE_BUILD
#error This header is only to be used internally to libarchive.
#endif

#ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
#define ARCHIVE_STRING_COMPOSITION_H_INCLUDED

struct unicode_composition_table {
	uint32_t cp1;
	uint32_t cp2;
	uint32_t nfc;
};

CR_END
}

################################################################################
#
# awk script
#
# First pass over UnicodeData.txt.  It prints the composition table (sorted
# through the "cmd" pipeline), the canonical combining class tables, and
# appends the reverse (NFC ==> NFD) triples to the scratch file named by
# "nfdtbl".  The here-document is expanded by the shell, which is why the
# awk field references and backslashes are escaped.
#
################################################################################
cat > "${pickout}" <<AWK_END
#
BEGIN {
  FS = ";"
  min = "";
  max = "";
  cmd="sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'"
  nfdtbl="${nfdtmp}"
  print "static const struct unicode_composition_table u_composition_table[] = {"
}
END {
  close(cmd)
  print "};"
  print ""
  #
  # Output Canonical Combining Class tables used for translating NFD to NFC.
  #
  printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min
  printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max
  print ""
  printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\n"
  printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum
  printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum
  #
  # Output blockmap
  for (i = 0; i <= highnum; i++) {
    if (i != 0 && i % 32 == 0)
      printf "\\n\\t"
    # Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
    if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
      printf "1,"
    else
      printf "0,"
  }
  printf "\\n};\\n\\n"
  #
  # Output a macro to get a canonical combining class.
  #
  print "/* Get Canonical Combining Class(CCC). */"
  printf "#define CCC(uc)\\t\\\\\n"
  printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max
  printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n"
  print ""
  #
  # Output a canonical combining class value table.
  #
  midcnt = 0
  printf "/* The table of the value of Canonical Cimbining Class */\\n"
  print "static const unsigned char ccc_val[][16] = {"
  print " /* idx=0: XXXX0 - XXXXF */"
  print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
  for (h = 0; h <= highnum; h++) {
    if (!blockmap[h])
      continue;
    for (m = 0; m < 16; m++) {
      if (!xx_blockmap[h, m])
        continue;
      midcnt++
      printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m
      for (l = 0; l < 15; l++) {
        printf "%d, ", xxx_blockmap[h, m, l]
      }
      printf "%d },\n", xxx_blockmap[h, m, 15]
    }
  }
  printf "};\n"
  #
  # Output the index table of the canonical combining class value table.
  #
  cnt = 0
  midcnt = 0
  printf "\\n/* The index table to ccc_val[*][16] */\\n"
  print "static const unsigned char ccc_val_index[][16] = {"
  print " /* idx=0: XXX00 - XXXFF */"
  print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
  for (h = 0; h <= highnum; h++) {
    if (!blockmap[h])
      continue;
    cnt++
    printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h
    for (m = 0; m < 16; m++) {
      if (m != 0)
        printf ","
      if (xx_blockmap[h, m]) {
        midcnt++
        printf "%2d", midcnt
      } else
        printf " 0"
    }
    printf " },\\n"
  }
  printf "};\\n"
  #
  # Output the index table to the index table of the canonical combining
  # class value table.
  #
  printf "\\n/* The index table to ccc_val_index[*][16] */\\n"
  printf "static const unsigned char ccc_index[] = {\\n "
  cnt = 0
  for (h = 0; h <= highnum; h++) {
    if (h != 0 && h % 24 == 0)
      printf "\\n "
    if (blockmap[h]) {
      cnt++;
      printf "%2d,", cnt
    } else
      printf " 0,"
  }
  print "};"
  print ""
}
#
# Convert a string of upper-case hexadecimal digits to a number.
# Characters other than 0-9 and A-F are skipped.  The extra parameters
# (dec, i, x) are awk's way of declaring local variables.
#
function hextoi(hex,    dec, i, x)
{
  dec = 0
  for (i=0; i < length(hex); i++) {
    x = substr(hex, i+1, 1)
    if (x ~/[0-9]/)
      dec = dec * 16 + x;
    else if (x == "A")
      dec = dec * 16 + 10;
    else if (x == "B")
      dec = dec * 16 + 11;
    else if (x == "C")
      dec = dec * 16 + 12;
    else if (x == "D")
      dec = dec * 16 + 13;
    else if (x == "E")
      dec = dec * 16 + 14;
    else if (x == "F")
      dec = dec * 16 + 15;
  }
  return dec
}
#
# Collect Canonical Combining Class values.
# The code point is split into a high part (everything but the last two
# hex digits), a middle digit and a low digit, which index the three maps.
#
\$4 ~/^[0-9A-F]+\$/ {
  if (\$4 !~/^0\$/) {
    if (min == "") {
      min = \$1
    }
    max = \$1
    high = substr(\$1, 1, length(\$1) -2)
    highnum = hextoi(high)
    mid = substr(\$1, length(\$1) -1, 1)
    midnum = hextoi(mid)
    low = substr(\$1, length(\$1), 1)
    lownum = hextoi(low)
    blockmap[highnum] = 1
    xx_blockmap[highnum, midnum] = 1
    xxx_blockmap[highnum, midnum, lownum] = \$4
  }
}
#
# Following code points are not decomposed in MAC OS.
# U+2000 - U+2FFF
# U+F900 - U+FAFF
# U+2F800 - U+2FAFF
#
#\$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]\$/ {
# next
#}
#\$1 ~/^F[9A][0-9A-F][0-9A-F]\$/ {
# next
#}
#\$1 ~/^2F[89A][0-9A-F][0-9A-F]\$/ {
# next
#}
#
# Exclusion code points specified by
# http://unicode.org/Public/6.0.0/ucd/CompositionExclusions.txt
##
# 1. Script Specifics
##
\$1 ~/^095[89ABCDEF]\$/ {
  next
}
\$1 ~/^09D[CDF]\$/ {
  next
}
\$1 ~/^0A3[36]\$/ {
  next
}
\$1 ~/^0A5[9ABE]\$/ {
  next
}
\$1 ~/^0B5[CD]\$/ {
  next
}
\$1 ~/^0F4[3D]\$/ {
  next
}
\$1 ~/^0F5[27C]\$/ {
  next
}
\$1 ~/^0F69\$/ {
  next
}
\$1 ~/^0F7[68]\$/ {
  next
}
\$1 ~/^0F9[3D]\$/ {
  next
}
\$1 ~/^0FA[27C]\$/ {
  next
}
\$1 ~/^0FB9\$/ {
  next
}
\$1 ~/^FB1[DF]\$/ {
  next
}
\$1 ~/^FB2[ABCDEF]\$/ {
  next
}
\$1 ~/^FB3[012345689ABCE]\$/ {
  next
}
\$1 ~/^FB4[01346789ABCDE]\$/ {
  next
}
##
# 2. Post Composition Version precomposed characters
##
\$1 ~/^2ADC\$/ {
  next
}
\$1 ~/^1D15[EF]\$/ {
  next
}
\$1 ~/^1D16[01234]\$/ {
  next
}
\$1 ~/^1D1B[BCDEF]\$/ {
  next
}
\$1 ~/^1D1C0\$/ {
  next
}
##
# 3. Singleton Decompositions
##
\$1 ~/^034[01]\$/ {
  next
}
\$1 ~/^037[4E]\$/ {
  next
}
\$1 ~/^0387\$/ {
  next
}
\$1 ~/^1F7[13579BD]\$/ {
  next
}
\$1 ~/^1FB[BE]\$/ {
  next
}
\$1 ~/^1FC[9B]\$/ {
  next
}
\$1 ~/^1FD[3B]\$/ {
  next
}
\$1 ~/^1FE[3BEF]\$/ {
  next
}
\$1 ~/^1FF[9BD]\$/ {
  next
}
\$1 ~/^200[01]\$/ {
  next
}
\$1 ~/^212[6AB]\$/ {
  next
}
\$1 ~/^232[9A]\$/ {
  next
}
\$1 ~/^F9[0-9A-F][0-9A-F]\$/ {
  next
}
\$1 ~/^FA0[0-9A-D]\$/ {
  next
}
\$1 ~/^FA1[025-9A-E]\$/ {
  next
}
\$1 ~/^FA2[0256A-D]\$/ {
  next
}
\$1 ~/^FA[3-5][0-9A-F]\$/ {
  next
}
\$1 ~/^FA6[0-9A-D]\$/ {
  next
}
\$1 ~/^FA[7-9A-C][0-9A-F]\$/ {
  next
}
\$1 ~/^FAD[0-9]\$/ {
  next
}
\$1 ~/^2F[89][0-9A-F][0-9A-F]\$/ {
  next
}
\$1 ~/^2FA0[0-9A-F]\$/ {
  next
}
\$1 ~/^2FA1[0-9A-D]\$/ {
  next
}
##
# 4. Non-Starter Decompositions
##
\$1 ~/^0344\$/ {
  next
}
\$1 ~/^0F7[35]\$/ {
  next
}
\$1 ~/^0F81\$/ {
  next
}
#
# Output combinations for NFD ==> NFC.
# Four-digit code points get a leading "0" so that every key has the same
# width when the lines are sorted as text.
#
\$6 ~/^[0-9A-F]+ [0-9A-F]+\$/ {
  split(\$6, cp, " ")
  if (length(\$1) == 4)
    print "0"cp[1], "0"cp[2], "0"\$1 | cmd
  else
    print cp[1], cp[2], \$1 | cmd
  # NFC ==> NFD table.
  if (length(\$1) == 4)
    print "0"\$1, "0"cp[1], "0"cp[2] >>nfdtbl
  else
    print \$1, cp[1], cp[2] >>nfdtbl
}
AWK_END

################################################################################
#
# awk script
#
# Second pass: turn the NFC ==> NFD triples collected by the first pass
# into the u_decomposition_table initializer.
#
################################################################################
cat > "${pickout2}" <<AWK_END
#
BEGIN {
  FS = " "
  print "struct unicode_decomposition_table {"
  print "\tuint32_t nfc;"
  print "\tuint32_t cp1;"
  print "\tuint32_t cp2;"
  print "};"
  print ""
  print "static const struct unicode_decomposition_table u_decomposition_table[] = {"
}
END {
  print "};"
  print ""
}
{
  printf "\t{ 0x%s , 0x%s , 0x%s },\n", \$1, \$2, \$3;
}
AWK_END

################################################################################
#
# Run the awk scripts.
#
################################################################################
append_copyright
awk -f "${pickout}" "${inputfile}" >> "${outfile}"
awk -f "${pickout2}" "${nfdtmp}" >> "${outfile}"
echo "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */" >> "${outfile}"
echo "" >> "${outfile}"
#
# The awk scripts and the scratch table are removed by the EXIT trap.