Path: blob/master/Utilities/cmlibarchive/build/utils/gen_archive_string_composition_h.sh
3153 views
#!/bin/sh
#
# Generate archive_string_composition.h (Unicode composition, decomposition
# and Canonical Combining Class tables) in the current directory.
#
# Usage: gen_archive_string_composition_h.sh UnicodeData.txt
#
# This needs http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
#
set -eu

if [ "$#" -lt 1 ] || [ ! -r "$1" ]; then
  printf 'Usage: %s UnicodeData.txt\n' "${0##*/}" >&2
  exit 2
fi

inputfile="$1"	# Expect UnicodeData.txt
outfile=archive_string_composition.h

# All scratch files live in a private directory: predictable names directly
# under /tmp could be pre-created (or symlinked) by another user.
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/mk_unicode_composition.XXXXXX")
cleanup()
{
  rm -rf -- "${tmpdir:?}"
}
trap cleanup EXIT
trap 'exit 1' HUP INT TERM

pickout="${tmpdir}/composition_tbl.awk"
pickout2="${tmpdir}/composition_tbl2.awk"
nfdtmp="${tmpdir}/decomposition_tmp.txt"
# The first awk pass only ever appends to this file, so start from an empty
# one; this also guarantees it exists when no composition pair is found.
: > "${nfdtmp}"

#################################################################################
#
# Write the file header of "archive_string_composition.h"
# (truncates ${outfile}; everything else is appended after it).
#
#################################################################################
append_copyright()
{
cat > "${outfile}" <<'CR_END'
/*-
 * Copyright (c) 2011-2012 libarchive Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * ATTENTION!
 * This file is generated by build/utils/gen_archive_string_composition_h.sh
 * from http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
 *
 * See also http://unicode.org/report/tr15/
 */

#ifndef __LIBARCHIVE_BUILD
#error This header is only to be used internally to libarchive.
#endif

#ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
#define ARCHIVE_STRING_COMPOSITION_H_INCLUDED

struct unicode_composition_table {
	uint32_t cp1;
	uint32_t cp2;
	uint32_t nfc;
};

CR_END
}

#################################################################################
#
# awk script #1: reads UnicodeData.txt (';'-separated) and prints the
# composition table plus the Canonical Combining Class tables on stdout.
# As a side effect it appends "nfc cp1 cp2" lines to the file named by the
# awk variable nfdtbl (set with -v on the command line).
#
# The heredoc delimiter is quoted, so the text below is passed to awk
# verbatim: no shell expansion and no extra backslash escaping.
#
#################################################################################
cat > "${pickout}" <<'AWK_END'
#
BEGIN {
  FS = ";"
  min = "";
  max = "";
  # Composition pairs are piped through this command so that the table
  # comes out sorted; LC_ALL=C keeps the order independent of the locale.
  cmd = "LC_ALL=C sort | awk -F ' ' '{printf \"\\t{ 0x%s , 0x%s , 0x%s },\\n\",$1,$2,$3}'"
  print "static const struct unicode_composition_table u_composition_table[] = {"
}
END {
  # Closing the pipe waits for the sorted table rows to be written.
  close(cmd)
  print "};"
  print ""
  #
  # Output Canonical Combining Class tables used for translating NFD to NFC.
  #
  printf "#define CANONICAL_CLASS_MIN\t0x%s\n", min
  printf "#define CANONICAL_CLASS_MAX\t0x%s\n", max
  print ""
  printf "#define IS_DECOMPOSABLE_BLOCK(uc)\t\\\n"
  printf "\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\n", highnum
  printf "static const char u_decomposable_blocks[0x%X+1] = {\n\t", highnum
  #
  # Output blockmap
  for (i = 0; i <= highnum; i++) {
    if (i != 0 && i % 32 == 0)
      printf "\n\t"
    # Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
    if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
      printf "1,"
    else
      printf "0,"
  }
  printf "\n};\n\n"
  #
  # Output a macro to get a canonical combining class.
  #
  print "/* Get Canonical Combining Class(CCC). */"
  printf "#define CCC(uc)\t\\\n"
  printf "\t(((uc) > 0x%s)?0:\\\n", max
  printf "\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\n"
  print ""
  #
  # Output a canonical combining class value table.
  # (The spelling in the emitted comment is kept as-is so that the
  # generated text does not change.)
  #
  midcnt = 0
  printf "/* The table of the value of Canonical Cimbining Class */\n"
  print "static const unsigned char ccc_val[][16] = {"
  print " /* idx=0: XXXX0 - XXXXF */"
  print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
  for (h = 0; h <= highnum; h++) {
    if (!blockmap[h])
      continue;
    for (m = 0; m < 16; m++) {
      if (!xx_blockmap[h, m])
        continue;
      midcnt++
      printf " /* idx=%d: %03X%1X0 - %03X%1XF */\n {", midcnt, h, m, h, m
      for (l = 0; l < 15; l++) {
        printf "%d, ", xxx_blockmap[h, m, l]
      }
      printf "%d },\n", xxx_blockmap[h, m, 15]
    }
  }
  printf "};\n"
  #
  # Output the index table of the canonical combining class value table.
  #
  cnt = 0
  midcnt = 0
  printf "\n/* The index table to ccc_val[*][16] */\n"
  print "static const unsigned char ccc_val_index[][16] = {"
  print " /* idx=0: XXX00 - XXXFF */"
  print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
  for (h = 0; h <= highnum; h++) {
    if (!blockmap[h])
      continue;
    cnt++
    printf " /* idx=%d: %03X00 - %03XFF */\n {", cnt, h, h
    for (m = 0; m < 16; m++) {
      if (m != 0)
        printf ","
      if (xx_blockmap[h, m]) {
        midcnt++
        printf "%2d", midcnt
      } else
        printf " 0"
    }
    printf " },\n"
  }
  printf "};\n"
  #
  # Output the index table to the index table of the canonical combining
  # class value table.
  #
  printf "\n/* The index table to ccc_val_index[*][16] */\n"
  printf "static const unsigned char ccc_index[] = {\n "
  cnt = 0
  for (h = 0; h <= highnum; h++) {
    if (h != 0 && h % 24 == 0)
      printf "\n "
    if (blockmap[h]) {
      cnt++;
      printf "%2d,", cnt
    } else
      printf " 0,"
  }
  print "};"
  print ""
}
#
# Convert a string of upper-case hexadecimal digits to a number.
# Characters other than 0-9 and A-F are skipped.
# dec, i and x are locals (extra parameters), so calling this function
# cannot clobber the loop variables used in the END block.
#
function hextoi(hex,    dec, i, x)
{
  dec = 0
  for (i = 0; i < length(hex); i++) {
    x = substr(hex, i+1, 1)
    if (x ~/[0-9]/)
      dec = dec * 16 + x;
    else if (x == "A")
      dec = dec * 16 + 10;
    else if (x == "B")
      dec = dec * 16 + 11;
    else if (x == "C")
      dec = dec * 16 + 12;
    else if (x == "D")
      dec = dec * 16 + 13;
    else if (x == "E")
      dec = dec * 16 + 14;
    else if (x == "F")
      dec = dec * 16 + 15;
  }
  return dec
}
#
# Collect Canonical Combining Class values.
# The code point in $1 is split into its last hex digit (low), the digit
# before it (mid) and everything in front of those (high); min/max/highnum
# end up holding the first/last code point seen with a non-zero class.
#
$4 ~/^[0-9A-F]+$/ {
  if ($4 !~/^0$/) {
    if (min == "") {
      min = $1
    }
    max = $1
    high = substr($1, 1, length($1) -2)
    highnum = hextoi(high)
    mid = substr($1, length($1) -1, 1)
    midnum = hextoi(mid)
    low = substr($1, length($1), 1)
    lownum = hextoi(low)
    blockmap[highnum] = 1
    xx_blockmap[highnum, midnum] = 1
    xxx_blockmap[highnum, midnum, lownum] = $4
  }
}
#
# Following code points are not decomposed in MAC OS.
# U+2000 - U+2FFF
# U+F900 - U+FAFF
# U+2F800 - U+2FAFF
#
#$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]$/ {
#  next
#}
#$1 ~/^F[9A][0-9A-F][0-9A-F]$/ {
#  next
#}
#$1 ~/^2F[89A][0-9A-F][0-9A-F]$/ {
#  next
#}
#
# Exclusion code points specified by
# http://unicode.org/Public/6.0.0/ucd/CompositionExclusions.txt
##
# 1. Script Specifics
##
$1 ~/^095[89ABCDEF]$/ {
  next
}
$1 ~/^09D[CDF]$/ {
  next
}
$1 ~/^0A3[36]$/ {
  next
}
$1 ~/^0A5[9ABE]$/ {
  next
}
$1 ~/^0B5[CD]$/ {
  next
}
$1 ~/^0F4[3D]$/ {
  next
}
$1 ~/^0F5[27C]$/ {
  next
}
$1 ~/^0F69$/ {
  next
}
$1 ~/^0F7[68]$/ {
  next
}
$1 ~/^0F9[3D]$/ {
  next
}
$1 ~/^0FA[27C]$/ {
  next
}
$1 ~/^0FB9$/ {
  next
}
$1 ~/^FB1[DF]$/ {
  next
}
$1 ~/^FB2[ABCDEF]$/ {
  next
}
$1 ~/^FB3[012345689ABCE]$/ {
  next
}
$1 ~/^FB4[01346789ABCDE]$/ {
  next
}
##
# 2. Post Composition Version precomposed characters
##
$1 ~/^2ADC$/ {
  next
}
$1 ~/^1D15[EF]$/ {
  next
}
$1 ~/^1D16[01234]$/ {
  next
}
$1 ~/^1D1B[BCDEF]$/ {
  next
}
$1 ~/^1D1C0$/ {
  next
}
##
# 3. Singleton Decompositions
##
$1 ~/^034[01]$/ {
  next
}
$1 ~/^037[4E]$/ {
  next
}
$1 ~/^0387$/ {
  next
}
$1 ~/^1F7[13579BD]$/ {
  next
}
$1 ~/^1FB[BE]$/ {
  next
}
$1 ~/^1FC[9B]$/ {
  next
}
$1 ~/^1FD[3B]$/ {
  next
}
$1 ~/^1FE[3BEF]$/ {
  next
}
$1 ~/^1FF[9BD]$/ {
  next
}
$1 ~/^200[01]$/ {
  next
}
$1 ~/^212[6AB]$/ {
  next
}
$1 ~/^232[9A]$/ {
  next
}
$1 ~/^F9[0-9A-F][0-9A-F]$/ {
  next
}
$1 ~/^FA0[0-9A-D]$/ {
  next
}
$1 ~/^FA1[025-9A-E]$/ {
  next
}
$1 ~/^FA2[0256A-D]$/ {
  next
}
$1 ~/^FA[3-5][0-9A-F]$/ {
  next
}
$1 ~/^FA6[0-9A-D]$/ {
  next
}
$1 ~/^FA[7-9A-C][0-9A-F]$/ {
  next
}
$1 ~/^FAD[0-9]$/ {
  next
}
$1 ~/^2F[89][0-9A-F][0-9A-F]$/ {
  next
}
$1 ~/^2FA0[0-9A-F]$/ {
  next
}
$1 ~/^2FA1[0-9A-D]$/ {
  next
}
##
# 4. Non-Starter Decompositions
##
$1 ~/^0344$/ {
  next
}
$1 ~/^0F7[35]$/ {
  next
}
$1 ~/^0F81$/ {
  next
}
#
# Output combinations for NFD ==> NFC.
# Only records whose decomposition field ($6) is exactly two code points
# get here.  Four-digit code points are prefixed with "0" so that every
# value handed to sort has the same width.
#
$6 ~/^[0-9A-F]+ [0-9A-F]+$/ {
  split($6, cp, " ")
  if (length($1) == 4)
    print "0"cp[1], "0"cp[2], "0"$1 | cmd
  else
    print cp[1], cp[2], $1 | cmd
  # NFC ==> NFD table.
  if (length($1) == 4)
    print "0"$1, "0"cp[1], "0"cp[2] >>nfdtbl
  else
    print $1, cp[1], cp[2] >>nfdtbl
}
AWK_END

#################################################################################
#
# awk script #2: turns the "nfc cp1 cp2" lines collected by script #1 into
# the unicode_decomposition_table definition.
#
#################################################################################
cat > "${pickout2}" <<'AWK_END'
#
BEGIN {
  FS = " "
  print "struct unicode_decomposition_table {"
  print "\tuint32_t nfc;"
  print "\tuint32_t cp1;"
  print "\tuint32_t cp2;"
  print "};"
  print ""
  print "static const struct unicode_decomposition_table u_decomposition_table[] = {"
}
END {
  print "};"
  print ""
}
{
  printf "\t{ 0x%s , 0x%s , 0x%s },\n", $1, $2, $3;
}
AWK_END

#################################################################################
#
# Run the awk scripts.
#
#################################################################################
append_copyright
{
  awk -v nfdtbl="${nfdtmp}" -f "${pickout}" -- "${inputfile}"
  awk -f "${pickout2}" -- "${nfdtmp}"
  printf '%s\n' "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */"
  printf '\n'
} >> "${outfile}"
#
# The awk scripts and the intermediate table are removed by the EXIT trap.