Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/make/src/classes/build/tools/generatecharacter/GenerateCharacter.java
32287 views
/*1* Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425package build.tools.generatecharacter;2627import java.io.IOException;28import java.io.FileNotFoundException;29import java.io.BufferedReader;30import java.io.FileReader;31import java.io.PrintWriter;32import java.io.BufferedWriter;33import java.io.FileWriter;34import java.io.File;35import java.util.List;3637import build.tools.generatecharacter.CharacterName;3839/**40* This program generates the source code for the class java.lang.Character.41* It also generates native C code that can perform the same operations.42* It requires two external input data files:43* <ul>44* <li> Unicode specification file45* <li> Character class template file46* </ul>47* The Unicode specification file is available from the Unicode consortium.48* It has character specification lines that look like this:49* 
<listing>50* 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;51* </listing>52* The Character class template file is filled in with additional53* information to produce the file Character.java, which can then be54* compiled by a Java compiler. The template file contains certain55* markers consisting of an alphabetic name string preceded by "$$".56* Such markers are replaced with generated program text. As a special57* case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of58* alphabetic characters constituting a variable name. The character "_"59* is considered alphabetic for these purposes.60*61* @author Guy Steele62* @author Alan Liu63* @author John O'Conner64*/6566public class GenerateCharacter {6768final static boolean DEBUG = false;6970final static String commandMarker = "$$";71static String ROOT = "";72static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt";73static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";74static String DefaultPropListFileName = ROOT + "PropList.txt";75static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";76static String DefaultJavaOutputFileName = ROOT + "Character.java";77static String DefaultCTemplateFileName = ROOT + "Character.c.template";78static String DefaultCOutputFileName = ROOT + "Character.c";7980static int plane = 0;8182/* The overall idea is that, in the generated Character class source code,83most character property data is stored in a special multi-level table whose84structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].85The integers must sum to 16 (the number of bits in a character).86The first table is indexed by the k1 high-order bits of the character code.87The result is concatenated to the next k2 bits of the character code to index88the second table, and so on. 
Eventually the kn low-order bits of the character
code are concatenated and used to index one of two tables A and B; A contains
32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
can be thus obtained encode the properties for the character.

The default specification is [9, 4, 3, 0].  This particular table format was
designed by conducting an exhaustive search of table formats to minimize the
space consumed by the tables: the first and third tables need have only byte
values (the second table must have short values).  Another good choice is
[10, 6, 0], which produces a larger table but allows particularly fast table
lookup code.

In each case, where the word "concatenated" is used, this may imply
first a << and then a | operation, or perhaps just a | operation if
the values in the table can be preshifted (generally possible if the table
entries are short rather than byte).
*/

/* The character properties are currently encoded in two parts,
   A (32 bits) and B (16 bits).

A: the low 32 bits are defined in the following manner:

1 bit   Mirrored property.
4 bits  Bidirectional category (see below) (unused if -nobidi switch specified)
9 bits  A signed offset used for converting case.
1 bit   If 1, adding the signed offset converts the character to lowercase.
1 bit   If 1, subtracting the signed offset converts the character to uppercase.
        Note: for a titlecase character, both of the preceding bits will be 1
        and the signed offset will be 1.
1 bit   If 1, this character has a titlecase equivalent (possibly itself);
        in this case, the two bits before this bit can be used to decide
        whether this character is in fact uppercase, lowercase, or titlecase.
3 bits  This field provides a quick way to lex identifiers.
        The eight possible values for this field are as follows:
        0  May not be part of an identifier
        1  Ignorable control; may continue a Unicode identifier or Java identifier
        2  May continue a Java
identifier but not a Unicode identifier (unused)1263 May continue a Unicode identifier or Java identifier1274 Is a Java whitespace character1285 May start or continue a Java identifier;129may continue but not start a Unicode identifier130(this value is used for connector punctuation such as _)1316 May start or continue a Java identifier;132may not occur in a Unicode identifier133(this value is used for currency symbols such as $)1347 May start or continue a Unicode identifier or Java identifier135Thus:1365, 6, 7 may start a Java identifier1371, 2, 3, 5, 6, 7 may continue a Java identifier1387 may start a Unicode identifier1391, 3, 5, 7 may continue a Unicode identifier1401 is ignorable within an identifier1414 is Java whitespace1422 bits This field indicates whether the character has a numeric property.143The four possible values for this field are as follows:1440 This character has no numeric property.1451 Adding the digit offset to the character code and then146masking with 0x1F will produce the desired numeric value.1472 This character has a "strange" numeric value.1483 A Java supradecimal digit: adding the digit offset to the149character code, then masking with 0x1F, then adding 10150will produce the desired numeric value.1515 bits The digit offset (see description of previous field)1525 bits Character type (see below)153154B: the high 16 bits are defined as:1551 bit Other_Lowercase property1561 bit Other_Uppercase property1571 bit Other_Alphabetic property1581 bit Other_Math property1591 bit Ideographic property1601 bit Noncharacter codepoint property161*/162163164// bit masks identify each component of a 32-bit property field described165// above.166// shift* indicates how many shifts right must happen to get the167// indicated property value in the lowest bits of the 32-bit space.168private static final int169shiftType = 0, maskType = 0x001F,170shiftDigitOffset = 5, maskDigitOffset = 0x03E0,171shiftNumericType = 10, maskNumericType = 
0x0C00,172shiftIdentifierInfo = 12, maskIdentifierInfo = 0x7000,173maskUnicodePart = 0x1000,174shiftCaseInfo = 15, maskCaseInfo = 0x38000,175maskLowerCase = 0x20000,176maskUpperCase = 0x10000,177maskTitleCase = 0x08000,178shiftCaseOffset = 18, maskCaseOffset = 0x07FC0000,179shiftCaseOffsetSign = 5,180// used only when calculating and181// storing digit offsets from char values182maskDigit = 0x001F,183// case offset are 9 bits184maskCase = 0x01FF,185shiftBidi = 27, maskBidi = 0x78000000,186shiftMirrored = 31, //maskMirrored = 0x80000000,187shiftPlane = 16, maskPlane = 0xFF0000;188189// maskMirrored needs to be long, if up 16-bit190private static final long maskMirrored = 0x80000000L;191192// bit masks identify the 16-bit priperty field described above, in B193// table194private static final long195maskOtherLowercase = 0x100000000L,196maskOtherUppercase = 0x200000000L,197maskOtherAlphabetic = 0x400000000L,198maskOtherMath = 0x800000000L,199maskIdeographic = 0x1000000000L,200maskNoncharacterCP = 0x2000000000L;201202// Can compare masked values with these to determine203// numeric or lexical types.204public static int205valueNotNumeric = 0x0000,206valueDigit = 0x0400,207valueStrangeNumeric = 0x0800,208valueJavaSupradecimal = 0x0C00,209valueIgnorable = 0x1000,210valueJavaOnlyPart = 0x2000,211valueJavaUnicodePart = 0x3000,212valueJavaWhitespace = 0x4000,213valueJavaStartUnicodePart = 0x5000,214valueJavaOnlyStart = 0x6000,215valueJavaUnicodeStart = 0x7000,216lowJavaStart = 0x5000,217nonzeroJavaPart = 0x3000,218valueUnicodeStart = 0x7000;219220// these values are used when only identifier properties are generated221// for use in verifier code. 
Shortens the property down to a single byte.222private static final int223bitJavaStart = 0x02,224bitJavaPart = 0x01,225maskIsJavaIdentifierPart = bitJavaPart,226maskIsJavaIdentifierStart = bitJavaStart;227228static int maxOffset = maskCase/2 ;229static int minOffset = -maxOffset;230231/* The following routines provide simple, concise formatting of long integer values.232The number in the name of the method indicates the desired number of characters233to be produced. If the number of digits required to represent the integer value234is less than that number, then the output is padded on the left with zeros235(for hex) or with spaces (for decimal). If the number of digits required to236represent the integer value is greater than the desired number, then all the digits237that are required are actually produced.238*/239240static String hex(long n) { return Long.toHexString(n).toUpperCase(); }241242static String hex2(long n) {243String q = Long.toHexString(n & 0xFF).toUpperCase();244return "00".substring(Math.min(2, q.length())) + q;245}246247static String hex4(long n) {248String q = Long.toHexString(n & 0xFFFF).toUpperCase();249return "0000".substring(Math.min(4, q.length())) + q;250}251252static String hex8(long n) {253String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();254return "00000000".substring(Math.min(8, q.length())) + q;255}256257static String hex16(long n) {258String q = Long.toHexString(n).toUpperCase();259return "0000000000000000".substring(Math.min(16, q.length())) + q;260}261262static String dec3(long n) {263String q = Long.toString(n);264return " ".substring(Math.min(3, q.length())) + q;265}266267static String dec5(long n) {268String q = Long.toString(n);269return " ".substring(Math.min(5, q.length())) + q;270}271272/* This routine is called when some failure occurs. 
*/273274static void FAIL(String s) {275System.out.println("** " + s);276}277278/**279* Given the data from the Unicode specification file, this routine builds a map.280*281* The specification file is assumed to contain its data in sorted order by282* character code; as a result, the array passed as an argument to this method283* has its components in the same sorted order, with one entry for each defined284* Unicode character or character range. (A range is indicated by two consecutive285* entries, such that the name of the first entry begins with "<" and ends with286* "First>" and the second entry begins with "<" and ends with "Last>".) This is287* therefore a sparse representation of the character property data.288*289* The resulting map is dense representation of the character data. It contains290* 2^16 = 65536 entries, each of which is a long integer. (Right now only 32 bits291* of this long value are used, but type long is used rather than int to facilitate292* future extensions of this source code generator that might require more than293* 32 bits to encode relevant character properties.) Entry k holds the encoded294* properties for character k.295*296* Method buildMap manages the transformation from the sparse representation to297* the dense representation. 
It calls method buildOne to handle the encoding298* of character property data from a single UnicodeSpec object into 32 bits.299* For undefined characters, method buildOne is not called and the map entry for300* that character is set to UnicodeSpec.UNASSIGNED.301*302* @param data character property data from the Unicode specification file303* @return an array of length 65536 with one entry for every possible char value304*305* @see GenerateCharacter#buildOne306*/307308static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)309{310long[] result;311if (bLatin1 == true) {312result = new long[256];313} else {314result = new long[1<<16];315}316int k=0;317int codePoint = plane<<16;318UnicodeSpec nonCharSpec = new UnicodeSpec();319for (int j = 0; j < data.length && k < result.length; j++) {320if (data[j].codePoint == codePoint) {321result[k] = buildOne(codePoint, data[j], specialMaps);322++k;323++codePoint;324}325else if(data[j].codePoint > codePoint) {326if (data[j].name.endsWith("Last>")) {327// build map data for all chars except last in range328while (codePoint < data[j].codePoint && k < result.length) {329result[k] = buildOne(codePoint, data[j], specialMaps);330++k;331++codePoint;332}333}334else {335// we have a few unassigned chars before data[j].codePoint336while (codePoint < data[j].codePoint && k < result.length) {337result[k] = buildOne(codePoint, nonCharSpec, specialMaps);338++k;339++codePoint;340}341}342k = data[j].codePoint & 0xFFFF;343codePoint = data[j].codePoint;344result[k] = buildOne(codePoint, data[j], specialMaps);345++k;346++codePoint;347}348else {349System.out.println("An error has occured during spec mapping.");350System.exit(0);351}352}353// if there are still unprocessed chars, process them354// as unassigned/undefined.355codePoint = (plane<<16) | k;356while (k < result.length) {357result[k] = buildOne(codePoint, nonCharSpec, specialMaps);358++k;359++codePoint;360}361// now add all extra supported properties from 
PropList, to the362// upper 16-bit363addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);364addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);365addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);366addExProp(result, propList, "Ideographic", maskIdeographic);367//addExProp(result, propList, "Other_Math", maskOtherMath);368//addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);369370return result;371}372373// The maximum and minimum offsets found while scanning the database374static int maxOffsetSeen = 0;375static int minOffsetSeen = 0;376377/**378* Some Unicode separator characters are not considered Java whitespace.379* @param c character to test380* @return true if c in an invalid Java whitespace character, false otherwise.381*/382static boolean isInvalidJavaWhiteSpace(int c) {383int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};384boolean retValue = false;385for(int x=0;x<exceptions.length;x++) {386if(c == exceptions[x]) {387retValue = true;388break;389}390}391return retValue;392393}394395/**396* Given the character property data for one Unicode character, encode the data397* of interest into a single long integer value. 
(Right now only 32 bits398* of this long value are used, but type long is used rather than int to facilitate399* future extensions of this source code generator that might require more than400* 32 bits to encode relevant character properties.)401*402* @param c the character code for which to encode property data403* @param us property data record from the Unicode specification file404* (its character code might not be equal to c if it specifies data405* for a range of characters)406* @return an encoded long value that contains the properties for a single char407*408* @see GenerateCharacter#buildMap409*/410411static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {412long resultA = 0;413// record the general category414resultA |= us.generalCategory;415416// record the numeric properties417NUMERIC: {418STRANGE: {419int val = 0;420// c is A-Z421if ((c >= 0x0041) && (c <= 0x005A)) {422val = c - 0x0041;423resultA |= valueJavaSupradecimal;424// c is a-z425} else if ((c >= 0x0061) && (c <= 0x007A)) {426val = c - 0x0061;427resultA |= valueJavaSupradecimal;428// c is a full-width A-Z429} else if ((c >= 0xFF21) && (c <= 0xFF3A)) {430val = c - 0xFF21;431resultA |= valueJavaSupradecimal;432// c is a full-width a-z433} else if ((c >= 0xFF41) && (c <= 0xFF5A)) {434val = c - 0xFF41;435resultA |= valueJavaSupradecimal;436} else if (us.isDecimalValue()) {437val = us.decimalValue;438resultA |= valueDigit;439} else if (us.isDigitValue()) {440val = us.digitValue;441resultA |= valueDigit;442} else {443if (us.numericValue.length() == 0) {444break NUMERIC; // no numeric value at all445} else {446try {447val = Integer.parseInt(us.numericValue);448if (val >= 32 || val < 0) break STRANGE;449if (c == 0x215F) break STRANGE;450} catch(NumberFormatException e) {451break STRANGE;452}453resultA |= valueDigit;454}455}456if (val >= 32 || val < 0) break STRANGE;457resultA |= ((val - c & maskDigit) << shiftDigitOffset);458break NUMERIC;459} // end STRANGE460resultA |= 
valueStrangeNumeric;461} // end NUMERIC462463// record case mapping464int offset = 0;465// might have a 1:M mapping466int specialMap = SpecialCaseMap.find(c, specialCaseMaps);467boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);468if (bHasUpper) {469resultA |= maskUpperCase;470}471if (specialMap != -1) {472// has mapping, but cannot record the473// proper offset; can only flag it and provide special case474// code in Character.java475offset = -1;476}477else if (us.hasUpperMap()) {478offset = c - us.upperMap;479}480481if (us.hasLowerMap()) {482resultA |= maskLowerCase;483if (offset == 0)484offset = us.lowerMap - c;485else if (offset != (us.lowerMap - c)) {486if (DEBUG) {487FAIL("Character " + hex(c) +488" has incompatible lowercase and uppercase mappings");489}490}491}492if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||493(bHasUpper && us.hasLowerMap())) {494resultA |= maskTitleCase;495}496if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {497System.out.println("Warning: Character " + hex4(c) + " has upper but " +498"no title case; Java won't know this");499}500if (offset < minOffsetSeen) minOffsetSeen = offset;501if (offset > maxOffsetSeen) maxOffsetSeen = offset;502if (offset > maxOffset || offset < minOffset) {503if (DEBUG) {504FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");505}506offset = maskCase;507}508resultA |= ((offset & maskCase) << shiftCaseOffset);509510// record lexical info about this character511if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER512|| us.generalCategory == UnicodeSpec.UPPERCASE_LETTER513|| us.generalCategory == UnicodeSpec.TITLECASE_LETTER514|| us.generalCategory == UnicodeSpec.MODIFIER_LETTER515|| us.generalCategory == UnicodeSpec.OTHER_LETTER516|| us.generalCategory == UnicodeSpec.LETTER_NUMBER) {517resultA |= valueJavaUnicodeStart;518}519else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK520|| us.generalCategory == 
UnicodeSpec.NON_SPACING_MARK521|| us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {522resultA |= valueJavaUnicodePart;523}524else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {525resultA |= valueJavaStartUnicodePart;526}527else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {528resultA |= valueJavaOnlyStart;529}530else if (((c >= 0x0000) && (c <= 0x0008))531|| ((c >= 0x000E) && (c <= 0x001B))532|| ((c >= 0x007F) && (c <= 0x009F))533|| us.generalCategory == UnicodeSpec.FORMAT) {534resultA |= valueIgnorable;535}536else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR537|| us.generalCategory == UnicodeSpec.LINE_SEPARATOR538|| us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {539if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;540}541else if (((c >= 0x0009) && (c <= 0x000D))542|| ((c >= 0x001C) && (c <= 0x001F))) {543resultA |= valueJavaWhitespace;544}545546// record bidi category547if (!nobidi) {548int tmpBidi =549(us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||550us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);551resultA |= tmpBidi;552}553554// record mirrored property555if (!nomirror) {556resultA |= us.mirrored ? maskMirrored : 0;557}558559if (identifiers) {560long replacement = 0;561if ((resultA & maskIdentifierInfo) >= lowJavaStart) {562replacement |= bitJavaStart;563}564if ( ((resultA & nonzeroJavaPart) != 0)565&& ((resultA & maskIdentifierInfo) != valueIgnorable)) {566replacement |= bitJavaPart;567}568resultA = replacement;569}570return resultA;571}572573static void addExProp(long[] map, PropList propList, String prop, long mask) {574List<Integer> cps = propList.codepoints(prop);575if (cps != null) {576for (Integer cp : cps) {577if (cp < map.length)578map[cp] |= mask;579}580}581}582583/**584* This is the heart of the table compression strategy. The inputs are a map585* and a number of bits (size). 
The map is simply an array of long integer values;586* the number of bits indicates how index values for that map are to be split.587* The length of the given map must be a multiple of (1 << size). The result is588* a new map z and a compressed table t such that for every valid index value k589* for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].590*591* In other words, the index k can be split into two parts, namely the "size"592* low-order bits and all the remaining high-order bits; the high-order bits are then593* remapped by map z to produce an index into table t. In effect, the data of the594* original map m is broken up into blocks of size (1<<size); the compression relies595* on the expectation that many of these blocks will be identical and therefore need596* be represented only once in the compressed table t.597*598* This method is intended to be used iteratively. The first map to be handed599* to it is the one constructed by method buildMap. After that, the first of the600* two arrays returned by this method is fed back into it for further compression.601* At the end of the iteration, one has a starter map and a sequence of tables.602*603* The algorithm used to implement this computation is straightforward and not604* especially clever. It uses brute-force linear search (the loop labeled MIDDLE)605* to locate identical blocks, so overall the time complexity of the algorithm606* is quadratic in the length of the input map. 
Fortunately, speed is not crucial607* to this application.608*609* @param map a map to be compressed610* @param size the number of index bits to be split off by the compression611* @return an array of length 2 containing two arrays; the first is a new map612* and the second is a compressed data table613*614* @see GenerateCharacter#buildMap615*/616617static long[][] buildTable(long[] map, int size) {618int n = map.length;619if (((n >> size) << size) != n) {620FAIL("Length " + n + " is not a multiple of " + (1 << size));621}622int m = 1 << size;623// We know the final length of the new map up front.624long[] newmap = new long[n >> size];625// The buffer is used temporarily to hold data for the compressed table626// because we don't know its final length yet.627long[] buffer = new long[n];628int ptr = 0;629OUTER: for (int i = 0; i < n; i += m) {630// For every block of size m in the original map...631MIDDLE: for (int j = 0; j < ptr; j += m) {632// Find out whether there is already a block just like it in the buffer.633for (int k = 0; k < m; k++) {634if (buffer[j+k] != map[i+k])635continue MIDDLE;636}637// There is a block just like it at position j, so just638// put its index into the new map (thereby sharing it).639newmap[i >> size] = (j >> size);640continue OUTER;641} // end MIDDLE642// There is no block just like it already, so add it to643// the buffer and put its index into the new map.644for (int k = 0; k < m; k++) {645buffer[ptr+k] = map[i+k];646}647newmap[i >> size] = (ptr >> size);648ptr += m;649} // end OUTER650// Now we know how long the compressed table should be,651// so create a new array and copy data from the temporary buffer.652long[] newdata = new long[ptr];653for (int j = 0; j < ptr; j++) {654newdata[j] = buffer[j];655}656// Return the new map and the new data table.657long[][] result = { newmap, newdata };658return result;659}660661/**662* Once the compressed tables have been computed, this method reads in a663* template file for the source code to 
be generated and writes out the final664* source code by acting as a sort of specialized macro processor.665*666* The first output line is a comment saying that the file was automatically667* generated; it includes a timestamp. All other output is generated by668* reading a line from the template file, performing macro replacements,669* and then writing the resulting line or lines of code to the output file.670*671* This method handles the I/O, the timestamp comment, and the locating of672* macro calls within each input line. The method replaceCommand is called673* to generate replacement text for each macro call.674*675* Macro calls to be replaced are indicated in the template file by676* occurrences of the commandMarker "$$". The rest of the call may consist677* of Java letters (including the underscore "_") and also of balanced678* parentheses.679*680* @param theTemplateFileName681* the file name for the template input file682* @param theOutputFileName683* the file name for the source code output file684*685* @see GenerateCharacter#replaceCommand686*/687688static void generateCharacterClass(String theTemplateFileName,689String theOutputFileName)690throws FileNotFoundException, IOException {691BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));692PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));693out.println(commentStart +694" This file was generated AUTOMATICALLY from a template file " +695new java.util.Date() + commentEnd);696int marklen = commandMarker.length();697LOOP: while(true) {698try {699String line = in.readLine();700if (line == null) break LOOP;701int pos = 0;702int depth = 0;703while ((pos = line.indexOf(commandMarker, pos)) >= 0) {704int newpos = pos + marklen;705char ch = 'x';706SCAN: while (newpos < line.length() &&707(Character.isJavaIdentifierStart(ch = line.charAt(newpos))708|| ch == '(' || (ch == ')' && depth > 0))) {709++newpos;710if (ch == '(') {711++depth;712}713else if (ch == 
')') {714--depth;715if (depth == 0)716break SCAN;717}718}719String replacement = replaceCommand(line.substring(pos + marklen, newpos));720line = line.substring(0, pos) + replacement + line.substring(newpos);721pos += replacement.length();722}723out.println(line);724}725catch (IOException e) {726break LOOP;727}728}729in.close();730out.close();731}732733/**734* The replaceCommand method takes a command (a macro call without the735* leading marker "$$") and computes replacement text for it.736*737* Most of the commands are simply names of integer constants that are defined738* in the source code of this GenerateCharacter class. The replacement text is739* simply the value of the constant as an appropriately formatted integer literal.740*741* Two cases are more complicated, however. The command "Tables" causes the742* final map and compressed tables to be emitted, with elaborate comments743* describing their contents. (This is actually handled by method genTables.)744* The command "Lookup(xxx)", where "xxx" is the name of a variable, generates745* an expression that will return the character property data for the character746* whose code is the value of the variable "xxx". (this is handled by method747* "genAccess".)748*749* @param x a command from the template file to be replaced750* @return the replacement text, as a String751*752* @see GenerateCharacter#genTables753* @see GenerateCharacter#genAccess754* @see GenerateCharacter#generateCharacterClass755*/756757static String replaceCommand(String x) {758if (x.equals("Tables")) return genTables();759if (x.equals("Initializers")) return genInitializers();760if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&761x.substring(x.length()-1).equals(")") )762return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 
2 : 32));763if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&764x.substring(x.length()-1).equals(")") )765return genAccess("B", x.substring(9, x.length()-1), 16);766if (x.equals("shiftType")) return Long.toString(shiftType);767if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);768if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);769if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);770if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);771if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);772if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);773if (x.equals("maskCase")) return "0x" + hex8(maskCase);774if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);775if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);776if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);777if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);778if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);779if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);780if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);781if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);782if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);783if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);784if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);785if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);786if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);787if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);788if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);789if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);790if 
(x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);791if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);792if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);793if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);794if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);795if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);796if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);797if (x.equals("maskDigit")) return "0x" + hex(maskDigit);798if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);799if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);800if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);801if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);802if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);803if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);804if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);805if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);806if (x.equals("maskType")) return "0x" + hex(maskType);807if (x.equals("shiftBidi")) return Long.toString(shiftBidi);808if (x.equals("maskBidi")) return "0x" + hex(maskBidi);809if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);810if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))811return Integer.toString(UnicodeSpec.UNASSIGNED);812if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))813return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);814if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))815return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);816if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))817return 
Integer.toString(UnicodeSpec.TITLECASE_LETTER);818if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))819return Integer.toString(UnicodeSpec.MODIFIER_LETTER);820if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))821return Integer.toString(UnicodeSpec.OTHER_LETTER);822if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))823return Integer.toString(UnicodeSpec.NON_SPACING_MARK);824if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))825return Integer.toString(UnicodeSpec.ENCLOSING_MARK);826if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))827return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);828if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))829return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);830if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))831return Integer.toString(UnicodeSpec.OTHER_NUMBER);832if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))833return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);834if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))835return Integer.toString(UnicodeSpec.LINE_SEPARATOR);836if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))837return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);838if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))839return Integer.toString(UnicodeSpec.CONTROL);840if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))841return Integer.toString(UnicodeSpec.FORMAT);842if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))843return 
Integer.toString(UnicodeSpec.PRIVATE_USE);844if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))845return Integer.toString(UnicodeSpec.SURROGATE);846if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))847return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);848if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))849return Integer.toString(UnicodeSpec.START_PUNCTUATION);850if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))851return Integer.toString(UnicodeSpec.END_PUNCTUATION);852if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))853return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);854if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))855return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);856if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))857return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);858if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))859return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);860if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))861return Integer.toString(UnicodeSpec.LETTER_NUMBER);862if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))863return Integer.toString(UnicodeSpec.MATH_SYMBOL);864if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))865return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);866if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))867return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);868if 
(x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))869return Integer.toString(UnicodeSpec.OTHER_SYMBOL);870if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))871return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);872if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))873return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);874if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))875return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);876if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))877return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);878if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))879return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);880if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))881return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);882if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))883return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);884if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))885return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);886if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))887return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);888if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))889return 
Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);890if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))891return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);892if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))893return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);894if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))895return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);896if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))897return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);898if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))899return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);900if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))901return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);902if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))903return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);904if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))905return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);906if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))907return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);908FAIL("Unknown text substitution marker " + commandMarker + x);909return commandMarker + x;910}911912/**913* The genTables method generates source code for all the lookup tables914* needed to represent the various Unicode character properties.915* It simply calls the method 
genTable once for each table to be generated916* and then generates a summary comment.917*918* @return the replacement text for the "Tables" command, as a String919*920* @see GenerateCharacter#genTable921* @see GenerateCharacter#replaceCommand922*/923static String genTables() {924int n = sizes.length;925StringBuffer result = new StringBuffer();926// liu : Add a comment showing the source of this table927result.append(commentStart + " The following tables and code generated using:" +928commentEnd + "\n ");929result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n ");930931if (plane == 0 && bLatin1 == false) {932genCaseMapTableDeclaration(result);933genCaseMapTable(initializers, specialCaseMaps);934}935int totalBytes = 0;936for (int k = 0; k < n - 1; k++) {937genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],938sizes[k+1], false, false, k==0);939int s = bytes[k];940if (s == 1 && useCharForByte) {941s = 2;942}943totalBytes += tables[k].length * s;944}945genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),946sizes[n - 1], false, 0, true, !(identifiers), false);947948// If we ever need more than 32 bits to represent the character properties,949// then a table "B" may be needed as well.950genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);951952totalBytes += ((((tables[n - 1].length * (identifiers ? 
2 : 32)) + 31) >> 5) << 2);953result.append(commentStart);954result.append(" In all, the character property tables require ");955result.append(totalBytes).append(" bytes.").append(commentEnd);956if (verbose) {957System.out.println("The character property tables require "958+ totalBytes + " bytes.");959}960return result.toString();961}962963/**964* The genInitializers method generates the body of the965* ensureInitted() method, which enables lazy initialization of966* the case map table and other tables.967*/968static String genInitializers() {969return initializers.toString();970}971972/**973* Return the total number of bytes needed by all tables. This is a stripped-974* down copy of genTables().975*/976static int getTotalBytes() {977int n = sizes.length;978int totalBytes = 0;979for (int k = 0; k < n - 1; k++) {980totalBytes += tables[k].length * bytes[k];981}982totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))983+ 31) >> 5) << 2);984return totalBytes;985}986987static void appendEscapedStringFragment(StringBuffer result,988char[] line,989int length,990boolean lastFragment) {991result.append(" \"");992for (int k=0; k<length; ++k) {993result.append("\\u");994result.append(hex4(line[k]));995}996result.append("\"");997result.append(lastFragment ? 
";" : "+");998result.append("\n");999}10001001static String SMALL_INITIALIZER =1002" { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+1003// " $$name = new $$type[$$size];\n"+1004" int len = $$name_DATA.length();\n"+1005" int j=0;\n"+1006" for (int i=0; i<len; ++i) {\n"+1007" int c = $$name_DATA.charAt(i);\n"+1008" for (int k=0; k<$$entriesPerChar; ++k) {\n"+1009" $$name[j++] = ($$type)c;\n"+1010" c >>= $$bits;\n"+1011" }\n"+1012" }\n"+1013" assert (j == $$size);\n"+1014" }\n";10151016static String SAME_SIZE_INITIALIZER =1017" { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+1018" assert ($$name_DATA.length() == $$size);\n"+1019// " $$name = new $$type[$$size];\n"+1020" for (int i=0; i<$$size; ++i)\n"+1021" $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+1022" }\n";10231024static String BIG_INITIALIZER =1025" { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+1026// " $$name = new $$type[$$size];\n"+1027" int len = $$name_DATA.length();\n"+1028" int j=0;\n"+1029" int charsInEntry=0;\n"+1030" $$type entry=0;\n"+1031" for (int i=0; i<len; ++i) {\n"+1032" entry |= $$name_DATA.charAt(i);\n"+1033" if (++charsInEntry == $$charsPerEntry) {\n"+1034" $$name[j++] = entry;\n"+1035" entry = 0;\n"+1036" charsInEntry = 0;\n"+1037" }\n"+1038" else {\n"+1039" entry <<= 16;\n"+1040" }\n"+1041" }\n"+1042" assert (j == $$size);\n"+1043" }\n";10441045static String INT32_INITIALIZER =1046" { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+1047" char[] data = $$name_DATA.toCharArray();\n"+1048" assert (data.length == ($$size * 2));\n"+1049" int i = 0, j = 0;\n"+1050" while (i < ($$size * 2)) {\n"+1051" int entry = data[i++] << 16;\n"+1052" $$name[j++] = entry | data[i++];\n"+1053" }\n"+1054" }\n";10551056static void addInitializer(String name, String type, int entriesPerChar,1057int bits, int size) {10581059String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :1060((entriesPerChar > 0) ? 
SMALL_INITIALIZER : BIG_INITIALIZER);1061if (entriesPerChar == -2) {1062template = INT32_INITIALIZER;1063}1064int marklen = commandMarker.length();1065int pos = 0;1066while ((pos = template.indexOf(commandMarker, pos)) >= 0) {1067int newpos = pos + marklen;1068char ch = 'x';1069while (newpos < template.length() &&1070Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&1071ch != '_') // Don't allow this in token names1072++newpos;1073String token = template.substring(pos+marklen, newpos);1074String replacement = "ERROR";10751076if (token.equals("name")) replacement = name;1077else if (token.equals("type")) replacement = type;1078else if (token.equals("bits")) replacement = ""+bits;1079else if (token.equals("size")) replacement = ""+size;1080else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;1081else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);1082else FAIL("Unrecognized token: " + token);10831084template = template.substring(0, pos) + replacement + template.substring(newpos);1085pos += replacement.length();1086}1087initializers.append(template);1088}10891090/**1091* The genTable method generates source code for one lookup table.1092* Most of the complexity stems from handling various options as to1093* the type of the array components, the precise representation of the1094* values, the format in which to render each value, the number of values1095* to emit on each line of source code, and the kinds of useful comments1096* to be generated.1097*1098* @param result a StringBuffer, to which the generated source code1099* text is to be appended1100* @param name the name of the table1101* @param table the table data (an array of long values)1102* @param extract a distance, in bits, by which each entry of the table1103* is to be right-shifted before it is processed1104* @param bits the number of bits (not bytes) to be used to represent1105* each table entry1106* @param size the table data is divided up into blocks 
of size (1<<size);
     *                  in this method, this information is used only to affect
     *                  how many table values are to be generated per line
     * @param preshifted if this flag is true, then the table entries are to be
     *                  emitted in a preshifted form; that is, each value should
     *                  be left-shifted by the amount "shift", so that this work
     *                  is built into the table and need not be performed by an
     *                  explicit shift operator at run time
     * @param shift     this is the shift amount for preshifting of table entries
     * @param hexFormat if this flag is true, table entries should be emitted as
     *                  hexadecimal literals; otherwise decimal literals are used
     * @param properties if this flag is true, the table entries are encoded
     *                  character properties rather than indexes into yet other tables;
     *                  therefore comments describing the encoded properties should
     *                  be generated
     * @param hexComment if this flag is true, each line of output is labelled with
     *                  a hexadecimal comment indicating the character values to
     *                  which that line applies; otherwise, decimal values indicating
     *                  table indices are generated
     *
     * @see GenerateCharacter#genTables
     * @see GenerateCharacter#replaceCommand
     */

    static void genTable(StringBuffer result, String name,
                         long[] table, int extract, int bits, int size,
                         boolean preshifted, int shift, boolean hexFormat,
                         boolean properties, boolean hexComment) {

        // Element type of the generated array.  Entries narrower than a byte
        // (1, 2 or 4 bits) are packed into 32-bit words.
        String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
            bits == 2 ? (Csyntax ? "unsigned long" : "int") :
            bits == 4 ? (Csyntax ? "unsigned long" : "int") :
            bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
            bits == 16 ? (Csyntax ? "unsigned short" : "char") :
            bits == 32 ? (Csyntax ? "unsigned long" : "int") :
            (Csyntax ? "int64" : "long");
        // Largest value that is printed as a positive literal; in Java syntax
        // anything larger is printed negated (see below).
        long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
            bits == 2 ? Integer.MAX_VALUE :
            bits == 4 ? Integer.MAX_VALUE :
            bits == 8 ? Byte.MAX_VALUE :
            bits == 16 ? Short.MAX_VALUE :
            bits == 32 ? Integer.MAX_VALUE :
            Long.MAX_VALUE;
        // Positive: number of entries packed into one 16-bit char of the data
        // string.  Negative: negated number of chars needed for one entry.
        int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
        boolean shiftEntries = preshifted && shift != 0;
        // A byte table written as a string may instead be stored directly as
        // a char array, one entry per char.
        if (bits == 8 && tableAsString && useCharForByte) {
            atype = "char";
            maxPosEntry = Character.MAX_VALUE;
            entriesPerChar = 1;
        }
        // A char table can be produced straight from the string via
        // toCharArray(), so no unpacking initializer is needed for it.
        boolean noConversion = atype.equals("char");

        // Header comment stating the entry count and the size in bytes
        // (bit count rounded up to whole 32-bit words).
        result.append(commentStart);
        result.append(" The ").append(name).append(" table has ").append(table.length);
        result.append(" entries for a total of ");
        int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
        if (bits == 8 && useCharForByte) {
            sizeOfTable *= 2;
        }
        result.append(sizeOfTable);
        result.append(" bytes.").append(commentEnd).append("\n\n");
        if (Csyntax)
            result.append(" static ");
        else
            result.append(" static final ");
        result.append(atype);
        result.append(" ").append(name).append("[");
        // C needs an explicit array length: the number of words for packed
        // sub-byte tables, otherwise the number of entries.
        if (Csyntax)
            result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
        if (tableAsString) {
            // String form: the table contents are encoded in a string literal.
            if (noConversion) {
                result.append("] = (\n");
            } else {
                result.append("] = new ").append(atype).append("["+table.length+"];\n ");
                result.append("static final String ").append(name).append("_DATA =\n");
            }
            int CHARS_PER_LINE = 8;
            StringBuffer theString = new StringBuffer();
            int entriesInCharSoFar = 0;
            char ch = '\u0000';
            int charsPerEntry = -entriesPerChar;
            for (int j=0; j<table.length; ++j) {
                //long entry = table[j] >> extract;
                long entry;
                // Only the low 32 bits of each entry belong to table "A".
                if ("A".equals(name))
                    entry = (table[j] & 0xffffffffL) >> extract;
                else
                    entry = (table[j] >> extract);
                if (shiftEntries) entry <<= shift;
                if (entry >= (1L << bits)) {
                    FAIL("Entry too big");
                }
                if (entriesPerChar > 0) {
                    // Pack multiple entries into a character
                    // (each new entry enters at the top and earlier ones move down,
                    // so the first entry ends up in the lowest bits)
                    ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
                    ++entriesInCharSoFar;
                    if (entriesInCharSoFar == entriesPerChar) {
                        // Character is full
                        theString.append(ch);
                        entriesInCharSoFar = 0;
                        ch = '\u0000';
                    }
                }
                else {
                    // Use multiple characters per entry
                    // (most significant 16 bits first)
                    for (int k=0; k<charsPerEntry; ++k) {
                        ch = (char)(entry >> ((charsPerEntry-1)*16));
                        entry <<= 16;
                        theString.append(ch);
                    }
                }
            }
            // Flush a partially filled last character, shifting its entries
            // down into their final positions.
            if (entriesInCharSoFar > 0) {
                while (entriesInCharSoFar < entriesPerChar) {
                    ch = (char)((int)ch >> bits);
                    ++entriesInCharSoFar;
                }
                theString.append(ch);
                entriesInCharSoFar = 0;
            }
            result.append(Utility.formatForSource(theString.toString(), " "));
            if (noConversion) {
                result.append(").toCharArray()");
            }
            result.append(";\n\n ");

            // Tables that are not plain char arrays are unpacked from the
            // string by generated initializer code.
            if (!noConversion) {
                addInitializer(name, atype, entriesPerChar, bits, table.length);
            }
        }
        else {
            // Array-initializer form: one numeric literal per entry (or per
            // 32-bit word for packed sub-byte tables).
            result.append("] = {");
            boolean castEntries = shiftEntries && (bits < 32);
            int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
                                            bits == 2 ? 16*4 :
                                            bits == 4 ? 8*4 :
                                            bits == 8 ? 8 :
                                            bits == 16 ? 8 :
                                            bits == 32 ? 4 : 2) :
                (bits == 8 ? 8 :
                 bits == 16 ? 8 : 4);
            // A new output line starts whenever (j & printMask) == 0; with
            // property comments every entry gets a line of its own.
            int printMask = properties ? 0 :
                Math.min(1 << size,
                         printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
            int commentShift = ((1 << size) == table.length) ? 0 : size;
            int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
            // Accumulates packed sub-byte entries across iterations.
            long val = 0;
            for (int j = 0; j < table.length; j++) {
                if ((j & printMask) == 0) {
                    // Trim trailing blanks before breaking the line.
                    while (result.charAt(result.length() - 1) == ' ')
                        result.setLength(result.length() - 1);
                    result.append("\n ");
                }
            PRINT: {
                // NOTE(review): the cast prefix is appended before the early
                // "break PRINT" exits below, so it would also be emitted for
                // entries that are only accumulated; presumably preshifting is
                // never combined with sub-byte tables -- confirm.
                if (castEntries)
                    result.append("(").append(atype).append(")(");
                long entry = table[j] >> extract;
                int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
                int k = j & packMask;
                if (bits >= 8)
                    val = entry;
                else if (k == 0) {
                    // First entry of a packed word: start accumulating.
                    val = entry;
                    break PRINT;
                }
                else {
                    // Merge this entry into the word; print only once it is full.
                    val |= (entry << (k*bits));
                    if (k != packMask)
                        break PRINT;
                }
                if (val > maxPosEntry && !Csyntax) { // liu
                    // For values that are out of range, convert them to in-range negative values.
                    // Actually, output the '-' and convert them to the negative of the corresponding
                    // in-range negative values. E.g., convert 130 == -126 (in 8 bits) -> 126.
                    result.append('-');
                    val = maxPosEntry + maxPosEntry + 2 - val;
                }
                if (hexFormat) {
                    result.append("0x");
                    if (bits == 8)
                        result.append(hex2((byte)val));
                    else if (bits == 16)
                        result.append(hex4((short)val));
                    else if (bits == 32 || bits < 8)
                        result.append(hex8((int)val));
                    else {
                        result.append(hex16(val));
                        if (!Csyntax)
                            result.append("L");
                    }
                }
                else {
                    if (bits == 8)
                        result.append(dec3(val));
                    else if (bits == 64) {
                        result.append(dec5(val));
                        if (!Csyntax)
                            result.append("L");
                    }
                    else
                        result.append(dec5(val));
                }
                if (shiftEntries)
                    result.append("<<").append(shift);
                if (castEntries) result.append(")");
                if (j < (table.length - 1))
                    result.append(", ");
                else
                    result.append(" ");
                // End of an output line: add the trailing comment.
                if ((j & printMask) == printMask) {
                    result.append(" ").append(commentStart).append(" ");
                    // Hex label: start index of the block scaled up by
                    // (16 - size) bits; presumably the first character value
                    // covered when the sizes add up to 16 -- confirm.
                    if (hexComment)
                        result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
                    else
                        result.append(dec3((j & ~commentMask) >> commentShift));
                    if (properties) propertiesComments(result, val);
                    result.append(commentEnd);
                }
            } // end PRINT
            }
            result.append("\n };\n\n ");
        }
    }

    /**
     * Appends the declaration of the charMap field to the generated source.
     *
     * @param result a StringBuffer, to which the declaration is appended
     */
    static void genCaseMapTableDeclaration(StringBuffer result) {
        String myTab = " ";
        result.append(myTab + "static final char[][][] charMap;\n");
    }

    /**
     * Appends the code that assigns charMap: one element per special case
     * mapping, each consisting of a one-character array holding the source
     * character followed by an array holding its upper case mapping.
     *
     * @param result          a StringBuffer, to which the generated source
     *                        code text is to be appended
     * @param specialCaseMaps the special case mappings to be written out
     */
    static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
        String myTab = " ";
        int ch;
        char[] map;
        result.append(myTab + "charMap = new char[][][] {\n");
        for (int x = 0; x < specialCaseMaps.length; x++) {
            ch = specialCaseMaps[x].getCharSource();
            map = specialCaseMaps[x].getUpperCaseMap();
            result.append(myTab + myTab);
            result.append("{ ");
            result.append("{\'\\u"+hex4(ch)+"\'}, {");
            // Every mapped character is followed by ", ", including the last
            // one, so the generated initializer ends with a trailing comma.
            for (int y = 0; y < map.length; y++) {
                result.append("\'\\u"+hex4(map[y])+"\', ");
            }
            result.append("} },\n");
        }
        result.append(myTab + "};\n");

    }

    /**
     * The propertiesComments method generates comments describing encoded
     * character properties.
     * <p>
     * The comment text consists of the general category abbreviation, then,
     * where applicable, the bidi category, case mapping information,
     * identifier information and numeric information, each introduced by
     * ", ".
     *
     * @param result a StringBuffer, to which the generated source code
     *               text is to be appended
     * @param val    encoded character properties
     *
     * @see GenerateCharacter#genTable
     */

    static void propertiesComments(StringBuffer result, long val) {
        result.append(" ");
        // General category; a value matching no case adds nothing.
        switch ((int)(val & maskType)) {
        case UnicodeSpec.CONTROL:
            result.append("Cc");
            break;
        case UnicodeSpec.FORMAT:
            result.append("Cf");
            break;
        case UnicodeSpec.PRIVATE_USE:
            result.append("Co");
            break;
        case UnicodeSpec.SURROGATE:
            result.append("Cs");
            break;
        case UnicodeSpec.LOWERCASE_LETTER:
            result.append("Ll");
            break;
        case UnicodeSpec.MODIFIER_LETTER:
            result.append("Lm");
            break;
        case UnicodeSpec.OTHER_LETTER:
            result.append("Lo");
            break;
        case UnicodeSpec.TITLECASE_LETTER:
            result.append("Lt");
            break;
        case UnicodeSpec.UPPERCASE_LETTER:
            result.append("Lu");
            break;
        case UnicodeSpec.COMBINING_SPACING_MARK:
            result.append("Mc");
            break;
        case UnicodeSpec.ENCLOSING_MARK:
            result.append("Me");
            break;
        case UnicodeSpec.NON_SPACING_MARK:
            result.append("Mn");
            break;
        case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
            result.append("Nd");
            break;
        case UnicodeSpec.LETTER_NUMBER:
            result.append("Nl");
            break;
        case UnicodeSpec.OTHER_NUMBER:
            result.append("No");
            break;
        case UnicodeSpec.CONNECTOR_PUNCTUATION:
            result.append("Pc");
            break;
        case UnicodeSpec.DASH_PUNCTUATION:
            result.append("Pd");
            break;
        case UnicodeSpec.END_PUNCTUATION:
            result.append("Pe");
            break;
        case UnicodeSpec.OTHER_PUNCTUATION:
            result.append("Po");
            break;
        case UnicodeSpec.START_PUNCTUATION:
            result.append("Ps");
            break;
        case UnicodeSpec.CURRENCY_SYMBOL:
            result.append("Sc");
            break;
        case UnicodeSpec.MODIFIER_SYMBOL:
            result.append("Sk");
            break;
        case UnicodeSpec.MATH_SYMBOL:
            result.append("Sm");
            break;
        case UnicodeSpec.OTHER_SYMBOL:
            result.append("So");
            break;
        case UnicodeSpec.LINE_SEPARATOR:
            result.append("Zl"); break;
        case UnicodeSpec.PARAGRAPH_SEPARATOR:
            result.append("Zp");
            break;
        case UnicodeSpec.SPACE_SEPARATOR:
            result.append("Zs");
            break;
        case UnicodeSpec.UNASSIGNED:
            result.append("unassigned");
            break;
        }

        // Bidi category; only the categories listed here are annotated.
        switch ((int)((val & maskBidi) >> shiftBidi)) {
        case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
            result.append(", L");
            break;
        case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
            result.append(", R");
            break;
        case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
            result.append(", EN");
            break;
        case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
            result.append(", ES");
            break;
        case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
            result.append(", ET");
            break;
        case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
            result.append(", AN");
            break;
        case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
            result.append(", CS");
            break;
        case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
            result.append(", B");
            break;
        case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
            result.append(", S");
            break;
        case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
            result.append(", WS");
            break;
        case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
            result.append(", ON");
            break;
        }
        // Case mapping information; the same offset field is reported for
        // both the upper case and the lower case mapping.
        if ((val & maskUpperCase) != 0) {
            result.append(", hasUpper (subtract ");
            result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
        }
        if ((val & maskLowerCase) != 0) {
            result.append(", hasLower (add ");
            result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
        }
        if ((val & maskTitleCase) != 0) {
            result.append(", hasTitle");
        }
        // Identifier information: each test compares the same masked field
        // against one particular value.
        if ((val & maskIdentifierInfo) == valueIgnorable) {
            result.append(", ignorable");
        }
        if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
            result.append(", identifier part");
        }
        if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
            result.append(", underscore");
        }
        if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
            result.append(", whitespace");
        }
        if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
            result.append(", currency");
        }
        if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
            result.append(", identifier start");
        }
        // Numeric information, with the digit offset field where relevant.
        if ((val & maskNumericType) == valueDigit) {
            result.append(", decimal ");
            result.append((val & maskDigitOffset) >> shiftDigitOffset);
        }
        if ((val & maskNumericType) == valueStrangeNumeric) {
            result.append(", strange");
        }
        if ((val & maskNumericType) == valueJavaSupradecimal) {
            result.append(", supradecimal ");
            result.append((val & maskDigitOffset) >> shiftDigitOffset);
        }
    }

    /* Names of the index tables, in lookup order. */
    static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };

    /* Returns the name of the index table used at lookup step j. */
    static String tableName(int j) { return tableNames[j]; }

    /**
     * The genAccess method generates source code for one table access expression.
     *
     * Most of the complexity stems from handling various options as to
     * table representation, such as whether it contains values so large that
     * they are represented as negative values and whether the table values are
     * preshifted.
This method also avoids such "ugly" expressions as shifting1542* by distance zero, masking when no masking is necessary, and so on.1543* For clarity, it generates expressions that do not rely on operator1544* precedence, but otherwise it avoids generating redundant parentheses.1545*1546* A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]1547* or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.1548*1549* @param tbl the name of the final table to be accessed1550* @param var the variable name that appeared in parentheses in the1551* "Lookup" command1552* @param bits the number of bits (not bytes) to be used to represent1553* the final table entry1554* @return the replacement text for the "Lookup(xxx)" command, as a String1555*1556* @see GenerateCharacter#replaceCommand1557*/15581559static String genAccess(String tbl, String var, int bits) {1560String access = null;1561int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;1562for (int k = 0; k < sizes.length; k++) {1563int offset = ((k < sizes.length - 1) ? 0 : bitoffset);1564int shift = shifts[k] + offset;1565String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";1566int mask = (1 << (sizes[k] - offset)) - 1;1567String masked = (k == 0) ? shifted :1568"(" + shifted + "&0x" + hex(mask) + ")";1569String index = (k == 0) ? masked :1570(mask == 0) ? access : "(" + access + "|" + masked + ")";1571String indexNoParens = (index.charAt(0) != '(') ? index :1572index.substring(1, index.length() - 1);1573String tblname = (k == sizes.length - 1) ? tbl : tableName(k);1574String fetched = tblname + "[" + indexNoParens + "]";1575String zeroextended = (zeroextend[k] == 0) ? fetched :1576"(" + fetched + "&0x" + hex(zeroextend[k]) + ")";1577int adjustment = preshifted[k] ? 0 :1578sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);1579String adjusted = (preshifted[k] || adjustment == 0) ? 
zeroextended :1580"(" + zeroextended + "<<" + adjustment + ")";1581String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :1582(bits == 2) ? "((" + var + "&0xF)<<1)" :1583(bits == 4) ? "((" + var + "&7)<<2)" : null;1584String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :1585"((" + adjusted + ">>" + bitshift + ")&" +1586(bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";1587access = extracted;1588}1589return access;1590}15911592/* The command line arguments are decoded and used to set the following1593global variables.1594*/15951596static boolean verbose = false;1597static boolean nobidi = false;1598static boolean nomirror = false;1599static boolean identifiers = false;1600static boolean Csyntax = false;1601static String TemplateFileName = null;1602static String OutputFileName = null;1603static String UnicodeSpecFileName = null; // liu1604static String SpecialCasingFileName = null;1605static String PropListFileName = null;1606static boolean useCharForByte = false;1607static int[] sizes;1608static int bins = 0; // liu; if > 0, then perform search1609static boolean tableAsString = false;1610static boolean bLatin1 = false;16111612static String commandLineDescription;16131614/* Other global variables, equal in length to the "sizes" array. 
*/16151616static int[] shifts;1617static int[] zeroextend;1618static int[] bytes;1619static boolean[] preshifted;1620static long[][] tables;162116221623/* Other global variables */1624static String commentStart;1625static String commentEnd;16261627static StringBuffer initializers = new StringBuffer();16281629/* special casing rules for 1:M toUpperCase mappings */1630static SpecialCaseMap[] specialCaseMaps;16311632/**1633* Process the command line arguments.1634*1635* The allowed flags in command line are:1636* <dl>1637* <dt> -verbose <dd> Emit comments to standard output describing1638* what's going on during the processing.1639* <dt> -nobidi <dd> Do not include bidi categories in the1640* encoded character properties.1641* <dt> -nomirror <dd> Do no include mirror property in the encoded1642* character properties.1643* <dt> -identifiers <dd> Generate tables for scanning identifiers only.1644* <dt> -c <dd> Output code in C syntax instead of Java syntax.1645* <dt> -o filename <dd> Specify output file name.1646* <dt> -template filename <dd> Specify template input file name.1647* <dt> -spec filename <dd> Specify Unicode spec file name.1648* <dt> -specialcasing filename <dd> Specify Unicode special casing file name.1649* <dt> -search bins <dd> Try different partitions into the specified1650* number of bins. E.g., for 2 bins, try1651* 16 0, 15 1,..., 0 16.1652* <dt> -string <dd> Create table as string. Only valid with Java1653* syntax.1654* <dt> -latin1 <dd> Create a latin 1 only property table.1655* </dl>1656* In addition, decimal literals may appear as command line arguments;1657* each one represents the number of bits of the character to be broken1658* off at each lookup step. If present, they must add up to 16 (the number1659* of bits in a char value). For smaller tables, the last value should1660* be 0; values other than the last one may not be zero. 
If no such1661* numeric values are provided, default values are used.1662*1663* @param args the command line arguments, as an array of String1664*1665* @see GenerateCharacter#main1666*/16671668static void processArgs(String[] args) {1669StringBuffer desc = new StringBuffer("java GenerateCharacter");1670for (int j=0; j<args.length; ++j) {1671desc.append(" " + args[j]);1672}1673for (int j = 0; j < args.length; j++) {1674if (args[j].equals("-verbose") || args[j].equals("-v"))1675verbose = true;1676else if (args[j].equals("-nobidi"))1677nobidi = true;1678else if (args[j].equals("-nomirror"))1679nomirror = true;1680else if (args[j].equals("-identifiers"))1681identifiers = true;1682else if (args[j].equals("-c"))1683Csyntax = true;1684else if (args[j].equals("-string"))1685tableAsString = true;1686else if (args[j].equals("-o")) {1687if (j == args.length - 1) {1688FAIL("File name missing after -o");1689}1690else {1691OutputFileName = args[++j];1692}1693}1694else if (args[j].equals("-search")) {1695if (j == args.length - 1)1696FAIL("Bin count missing after -search");1697else {1698bins = Integer.parseInt(args[++j]);1699if (bins < 1 || bins > 10)1700FAIL("Bin count must be >= 1 and <= 10");1701}1702}1703else if (args[j].equals("-template")) {1704if (j == args.length - 1)1705FAIL("File name missing after -template");1706else1707TemplateFileName = args[++j];1708}1709else if (args[j].equals("-spec")) { // liu1710if (j == args.length - 1) {1711FAIL("File name missing after -spec");1712}1713else {1714UnicodeSpecFileName = args[++j];1715}1716}1717else if (args[j].equals("-specialcasing")) {1718if (j == args.length -1) {1719FAIL("File name missing after -specialcasing");1720}1721else {1722SpecialCasingFileName = args[++j];1723}1724}1725else if (args[j].equals("-proplist")) {1726if (j == args.length -1) {1727FAIL("File name missing after -proplist");1728}1729else {1730PropListFileName = args[++j];1731}1732}1733else if (args[j].equals("-plane")) {1734if (j == args.length -1) 
{1735FAIL("Plane number missing after -plane");1736}1737else {1738plane = Integer.parseInt(args[++j]);1739}1740if (plane > 0) {1741bLatin1 = false;1742}1743}1744else if ("-usecharforbyte".equals(args[j])) {1745useCharForByte = true;1746}1747else if (args[j].equals("-latin1")) {1748bLatin1 = true;1749plane = 0;1750}1751else {1752try {1753int val = Integer.parseInt(args[j]);1754if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);1755if (sizes == null)1756sizes = new int[1];1757else {1758int[] newsizes = new int[sizes.length + 1];1759System.arraycopy(sizes, 0, newsizes, 0, sizes.length);1760sizes = newsizes;1761}1762sizes[sizes.length - 1] = val;1763}1764catch(NumberFormatException e) {1765FAIL("Unknown switch: " + args[j]);1766}1767}1768}1769if (Csyntax && tableAsString) {1770FAIL("Can't specify table as string with C syntax");1771}1772if (sizes == null) {1773desc.append(" [");1774if (identifiers) {1775int[] newsizes = { 8, 4, 4 }; // Good default values1776desc.append("8 4 4]");1777sizes = newsizes;1778}1779else {1780int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }1781desc.append("10 5 1]");1782sizes = newsizes;1783}1784}1785if (UnicodeSpecFileName == null) { // liu1786UnicodeSpecFileName = DefaultUnicodeSpecFileName;1787desc.append(" [-spec " + UnicodeSpecFileName + ']');1788}1789if (SpecialCasingFileName == null) {1790SpecialCasingFileName = DefaultSpecialCasingFileName;1791desc.append(" [-specialcasing " + SpecialCasingFileName + ']');1792}1793if (PropListFileName == null) {1794PropListFileName = DefaultPropListFileName;1795desc.append(" [-proplist " + PropListFileName + ']');1796}1797if (TemplateFileName == null) {1798TemplateFileName = (Csyntax ? DefaultCTemplateFileName1799: DefaultJavaTemplateFileName);1800desc.append(" [-template " + TemplateFileName + ']');1801}1802if (OutputFileName == null) {1803OutputFileName = (Csyntax ? 
DefaultCOutputFileName1804: DefaultJavaOutputFileName);1805desc.append(" [-o " + OutputFileName + ']');1806}1807commentStart = (Csyntax ? "/*" : "//");1808commentEnd = (Csyntax ? " */" : "");1809commandLineDescription = desc.toString();1810}18111812private static void searchBins(long[] map, int binsOccupied) throws Exception {1813int bitsFree = 16;1814for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];1815if (binsOccupied == (bins-1)) {1816sizes[binsOccupied] = bitsFree;1817generateForSizes(map);1818}1819else {1820for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one1821sizes[binsOccupied] = i;1822searchBins(map, binsOccupied+1);1823}1824}1825}18261827private static void generateForSizes(long[] map) throws Exception {1828int sum = 0;1829shifts = new int[sizes.length];1830for (int k = sizes.length - 1; k >= 0; k--) {1831shifts[k] = sum;1832sum += sizes[k];1833}1834if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {1835FAIL("Bit field widths total to " + sum +1836": wrong total for map of size " + map.length);1837}1838// need a table for each set of lookup bits in char1839tables = new long[sizes.length][];1840// the last table is the map1841tables[sizes.length - 1] = map;1842for (int j = sizes.length - 1; j > 0; j--) {1843if (verbose && bins==0)1844System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);1845long[][] temp = buildTable(tables[j], sizes[j]);1846tables[j-1] = temp[0];1847tables[j] = temp[1];1848}1849preshifted = new boolean[sizes.length];1850zeroextend = new int[sizes.length];1851bytes = new int[sizes.length];1852for (int j = 0; j < sizes.length - 1; j++) {1853int len = tables[j+1].length;1854int size = sizes[j+1];1855if (len > 0x100 && (len >> size) <= 0x100) {1856len >>= size;1857preshifted[j] = false;1858}1859else if (len > 0x10000 && (len >> size) <= 0x10000) {1860len >>= size;1861preshifted[j] = false;1862}1863else preshifted[j] = true;1864if (Csyntax)1865zeroextend[j] = 0;1866else if (len 
> 0x7F && len <= 0xFF) {1867if (!useCharForByte) {1868zeroextend[j] = 0xFF;1869}1870} else if (len > 0x7FFF && len <= 0xFFFF)1871zeroextend[j] = 0xFFFF;1872else zeroextend[j] = 0;1873if (len <= 0x100) bytes[j] = 1;1874else if (len <= 0x10000) bytes[j] = 2;1875else bytes[j] = 4;1876}1877preshifted[sizes.length - 1] = true;1878zeroextend[sizes.length - 1] = 0;1879bytes[sizes.length - 1] = 0;1880if (bins > 0) {1881int totalBytes = getTotalBytes();1882String access = genAccess("A", "ch", (identifiers ? 2 : 32));1883int accessComplexity = 0;1884for (int j=0; j<access.length(); ++j) {1885char ch = access.charAt(j);1886if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;1887if (ch == '<' || ch == '>') ++j;1888}1889System.out.print("(");1890for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);1891System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);1892return;1893}1894if (verbose) {1895System.out.println(" n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");1896for (int j = 0; j < sizes.length; j++) {1897System.out.println(dec5(j) + "\t" +1898dec5(sizes[j]) + "\t" +1899dec5(tables[j].length) + "\t" +1900dec5(shifts[j]) + "\t" +1901dec5(zeroextend[j]) + "\t" +1902dec5(bytes[j]) + "\t " +1903preshifted[j]);1904}1905}1906if (verbose) {1907System.out.println("Generating source code for class Character");1908System.out.println("A table access looks like " +1909genAccess("A", "ch", (identifiers ? 2 : 32)));1910}1911generateCharacterClass(TemplateFileName, OutputFileName);1912}19131914/**1915* The main program for generating source code for the Character class.1916* The basic outline of its operation is:1917* <ol>1918* <li> Process the command line arguments. 
One result of this process1919* is a list of sizes (measured in bits and summing to 16).1920* <li> Get the Unicode character property data from the specification file.1921* <li> From that, build a map that has, for each character code, its1922* relevant properties encoded as a long integer value.1923* <li> Repeatedly compress the map, producing a compressed table and a1924* new map. This is done once for each size value in the list.1925* When this is done, we have a set of tables.1926* <li> Make some decisions about table representation; record these1927* decisions in arrays named preshifted, zeroextend, and bytes.1928* <li> Generate the source code for the class Character by performing1929* macro processing on a template file.1930* </ol>1931*1932* @param args the command line arguments, as an array of String1933*1934* @see GenerateCharacter#processArgs1935* @see UnicodeSpec@readSpecFile1936* @see GenerateCharacter#buildMap1937* @see GenerateCharacter#buildTable1938* @see GenerateCharacter#generateCharacterClass1939*/19401941public static void main(String[] args) {1942processArgs(args);1943try {19441945UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);1946specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);1947PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);19481949if (verbose) {1950System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu1951}1952long[] map = buildMap(data, specialCaseMaps, propList);1953if (verbose) {1954System.err.println("Completed building of initial map");1955}19561957if (bins == 0) {1958generateForSizes(map);1959}1960else {1961while (bins > 0) {1962sizes = new int[bins];1963searchBins(map, 0);1964--bins;1965}1966}1967if (verbose && false) {1968System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +1969hex8(maxOffsetSeen));1970System.out.println(" allowed: -" + hex8(-minOffset) + "..+" 
+1971hex8(maxOffset));1972}1973}1974catch (FileNotFoundException e) { FAIL(e.toString()); }1975catch (IOException e) { FAIL(e.toString()); }1976catch (Throwable e) {1977System.out.println("Unexpected exception:");1978e.printStackTrace();1979FAIL("Unexpected exception!");1980}1981if (verbose) { System.out.println("Done!");}1982}19831984} // end class198519861987