Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/java/text/RBTableBuilder.java
38829 views
/*1* Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/*26* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved27* (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved28*29* The original version of this source code and documentation is copyrighted30* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These31* materials are provided under terms of a License Agreement between Taligent32* and Sun. This technology is protected by multiple US and International33* patents. This notice and attribution to Taligent may not be removed.34* Taligent is a registered trademark of Taligent, Inc.35*36*/3738package java.text;3940import java.util.Vector;41import sun.text.UCompactIntArray;42import sun.text.IntHashtable;43import sun.text.ComposedCharIter;44import sun.text.CollatorUtilities;45import sun.text.normalizer.NormalizerImpl;4647/**48* This class contains all the code to parse a RuleBasedCollator pattern49* and build a RBCollationTables object from it. A particular instance50* of tis class exists only during the actual build process-- once an51* RBCollationTables object has been built, the RBTableBuilder object52* goes away. This object carries all of the state which is only needed53* during the build process, plus a "shadow" copy of all of the state54* that will go into the tables object itself. This object communicates55* with RBCollationTables through a separate class, RBCollationTables.BuildAPI,56* this is an inner class of RBCollationTables and provides a separate57* private API for communication with RBTableBuilder.58* This class isn't just an inner class of RBCollationTables itself because59* of its large size. For source-code readability, it seemed better for the60* builder to have its own source file.61*/62final class RBTableBuilder {6364public RBTableBuilder(RBCollationTables.BuildAPI tables) {65this.tables = tables;66}6768/**69* Create a table-based collation object with the given rules.70* This is the main function that actually builds the tables and71* stores them back in the RBCollationTables object. It is called72* ONLY by the RBCollationTables constructor.73* @see RuleBasedCollator#RuleBasedCollator74* @exception ParseException If the rules format is incorrect.75*/7677public void build(String pattern, int decmp) throws ParseException78{79boolean isSource = true;80int i = 0;81String expChars;82String groupChars;83if (pattern.length() == 0)84throw new ParseException("Build rules empty.", 0);8586// This array maps Unicode characters to their collation ordering87mapping = new UCompactIntArray(RBCollationTables.UNMAPPED);88// Normalize the build rules. Find occurances of all decomposed characters89// and normalize the rules before feeding into the builder. By "normalize",90// we mean that all precomposed Unicode characters must be converted into91// a base character and one or more combining characters (such as accents).92// When there are multiple combining characters attached to a base character,93// the combining characters must be in their canonical order94//95// sherman/Note:96//(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose97//hangual syllables to jamos, so we can actually just call decompose with98//normalizer's IGNORE_HANGUL option turned on99//100//(2)just call the "special version" in NormalizerImpl directly101//pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true);102//103//Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp);104//pattern = Normalizer.normalize(pattern, mode, 0, true);105106pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern);107108// Build the merged collation entries109// Since rules can be specified in any order in the string110// (e.g. "c , C < d , D < e , E .... C < CH")111// this splits all of the rules in the string out into separate112// objects and then sorts them. In the above example, it merges the113// "C < CH" rule in just before the "C < D" rule.114//115116mPattern = new MergeCollation(pattern);117118int order = 0;119120// Now walk though each entry and add it to my own tables121for (i = 0; i < mPattern.getCount(); ++i)122{123PatternEntry entry = mPattern.getItemAt(i);124if (entry != null) {125groupChars = entry.getChars();126if (groupChars.length() > 1) {127switch(groupChars.charAt(groupChars.length()-1)) {128case '@':129frenchSec = true;130groupChars = groupChars.substring(0, groupChars.length()-1);131break;132case '!':133seAsianSwapping = true;134groupChars = groupChars.substring(0, groupChars.length()-1);135break;136}137}138139order = increment(entry.getStrength(), order);140expChars = entry.getExtension();141142if (expChars.length() != 0) {143addExpandOrder(groupChars, expChars, order);144} else if (groupChars.length() > 1) {145char ch = groupChars.charAt(0);146if (Character.isHighSurrogate(ch) && groupChars.length() == 2) {147addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order);148} else {149addContractOrder(groupChars, order);150}151} else {152char ch = groupChars.charAt(0);153addOrder(ch, order);154}155}156}157addComposedChars();158159commit();160mapping.compact();161/*162System.out.println("mappingSize=" + mapping.getKSize());163for (int j = 0; j < 0xffff; j++) {164int value = mapping.elementAt(j);165if (value != RBCollationTables.UNMAPPED)166System.out.println("index=" + Integer.toString(j, 16)167+ ", value=" + Integer.toString(value, 16));168}169*/170tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable,171contractFlags, maxSecOrder, maxTerOrder);172}173174/** Add expanding entries for pre-composed unicode characters so that this175* collator can be used reasonably well with decomposition turned off.176*/177private void addComposedChars() throws ParseException {178// Iterate through all of the pre-composed characters in Unicode179ComposedCharIter iter = new ComposedCharIter();180int c;181while ((c = iter.next()) != ComposedCharIter.DONE) {182if (getCharOrder(c) == RBCollationTables.UNMAPPED) {183//184// We don't already have an ordering for this pre-composed character.185//186// First, see if the decomposed string is already in our187// tables as a single contracting-string ordering.188// If so, just map the precomposed character to that order.189//190// TODO: What we should really be doing here is trying to find the191// longest initial substring of the decomposition that is present192// in the tables as a contracting character sequence, and find its193// ordering. Then do this recursively with the remaining chars194// so that we build a list of orderings, and add that list to195// the expansion table.196// That would be more correct but also significantly slower, so197// I'm not totally sure it's worth doing.198//199String s = iter.decomposition();200201//sherman/Note: if this is 1 character decomposed string, the202//only thing need to do is to check if this decomposed character203//has an entry in our order table, this order is not necessary204//to be a contraction order, if it does have one, add an entry205//for the precomposed character by using the same order, the206//previous impl unnecessarily adds a single character expansion207//entry.208if (s.length() == 1) {209int order = getCharOrder(s.charAt(0));210if (order != RBCollationTables.UNMAPPED) {211addOrder(c, order);212}213continue;214} else if (s.length() == 2) {215char ch0 = s.charAt(0);216if (Character.isHighSurrogate(ch0)) {217int order = getCharOrder(s.codePointAt(0));218if (order != RBCollationTables.UNMAPPED) {219addOrder(c, order);220}221continue;222}223}224int contractOrder = getContractOrder(s);225if (contractOrder != RBCollationTables.UNMAPPED) {226addOrder(c, contractOrder);227} else {228//229// We don't have a contracting ordering for the entire string230// that results from the decomposition, but if we have orders231// for each individual character, we can add an expanding232// table entry for the pre-composed character233//234boolean allThere = true;235for (int i = 0; i < s.length(); i++) {236if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) {237allThere = false;238break;239}240}241if (allThere) {242addExpandOrder(c, s, RBCollationTables.UNMAPPED);243}244}245}246}247}248249/**250* Look up for unmapped values in the expanded character table.251*252* When the expanding character tables are built by addExpandOrder,253* it doesn't know what the final ordering of each character254* in the expansion will be. Instead, it just puts the raw character255* code into the table, adding CHARINDEX as a flag. Now that we've256* finished building the mapping table, we can go back and look up257* that character to see what its real collation order is and258* stick that into the expansion table. That lets us avoid doing259* a two-stage lookup later.260*/261private final void commit()262{263if (expandTable != null) {264for (int i = 0; i < expandTable.size(); i++) {265int[] valueList = expandTable.elementAt(i);266for (int j = 0; j < valueList.length; j++) {267int order = valueList[j];268if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) {269// found a expanding character that isn't filled in yet270int ch = order - CHARINDEX;271272// Get the real values for the non-filled entry273int realValue = getCharOrder(ch);274275if (realValue == RBCollationTables.UNMAPPED) {276// The real value is still unmapped, maybe it's ignorable277valueList[j] = IGNORABLEMASK & ch;278} else {279// just fill in the value280valueList[j] = realValue;281}282}283}284}285}286}287/**288* Increment of the last order based on the comparison level.289*/290private final int increment(int aStrength, int lastValue)291{292switch(aStrength)293{294case Collator.PRIMARY:295// increment priamry order and mask off secondary and tertiary difference296lastValue += PRIMARYORDERINCREMENT;297lastValue &= RBCollationTables.PRIMARYORDERMASK;298isOverIgnore = true;299break;300case Collator.SECONDARY:301// increment secondary order and mask off tertiary difference302lastValue += SECONDARYORDERINCREMENT;303lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY;304// record max # of ignorable chars with secondary difference305if (!isOverIgnore)306maxSecOrder++;307break;308case Collator.TERTIARY:309// increment tertiary order310lastValue += TERTIARYORDERINCREMENT;311// record max # of ignorable chars with tertiary difference312if (!isOverIgnore)313maxTerOrder++;314break;315}316return lastValue;317}318319/**320* Adds a character and its designated order into the collation table.321*/322private final void addOrder(int ch, int anOrder)323{324// See if the char already has an order in the mapping table325int order = mapping.elementAt(ch);326327if (order >= RBCollationTables.CONTRACTCHARINDEX) {328// There's already an entry for this character that points to a contracting329// character table. Instead of adding the character directly to the mapping330// table, we must add it to the contract table instead.331int length = 1;332if (Character.isSupplementaryCodePoint(ch)) {333length = Character.toChars(ch, keyBuf, 0);334} else {335keyBuf[0] = (char)ch;336}337addContractOrder(new String(keyBuf, 0, length), anOrder);338} else {339// add the entry to the mapping table,340// the same later entry replaces the previous one341mapping.setElementAt(ch, anOrder);342}343}344345private final void addContractOrder(String groupChars, int anOrder) {346addContractOrder(groupChars, anOrder, true);347}348349/**350* Adds the contracting string into the collation table.351*/352private final void addContractOrder(String groupChars, int anOrder,353boolean fwd)354{355if (contractTable == null) {356contractTable = new Vector<>(INITIALTABLESIZE);357}358359//initial character360int ch = groupChars.codePointAt(0);361/*362char ch0 = groupChars.charAt(0);363int ch = Character.isHighSurrogate(ch0)?364Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;365*/366// See if the initial character of the string already has a contract table.367int entry = mapping.elementAt(ch);368Vector<EntryPair> entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX);369370if (entryTable == null) {371// We need to create a new table of contract entries for this base char372int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size();373entryTable = new Vector<>(INITIALTABLESIZE);374contractTable.addElement(entryTable);375376// Add the initial character's current ordering first. then377// update its mapping to point to this contract table378entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry));379mapping.setElementAt(ch, tableIndex);380}381382// Now add (or replace) this string in the table383int index = RBCollationTables.getEntry(entryTable, groupChars, fwd);384if (index != RBCollationTables.UNMAPPED) {385EntryPair pair = entryTable.elementAt(index);386pair.value = anOrder;387} else {388EntryPair pair = entryTable.lastElement();389390// NOTE: This little bit of logic is here to speed CollationElementIterator391// .nextContractChar(). This code ensures that the longest sequence in392// this list is always the _last_ one in the list. This keeps393// nextContractChar() from having to search the entire list for the longest394// sequence.395if (groupChars.length() > pair.entryName.length()) {396entryTable.addElement(new EntryPair(groupChars, anOrder, fwd));397} else {398entryTable.insertElementAt(new EntryPair(groupChars, anOrder,399fwd), entryTable.size() - 1);400}401}402403// If this was a forward mapping for a contracting string, also add a404// reverse mapping for it, so that CollationElementIterator.previous405// can work right406if (fwd && groupChars.length() > 1) {407addContractFlags(groupChars);408addContractOrder(new StringBuffer(groupChars).reverse().toString(),409anOrder, false);410}411}412413/**414* If the given string has been specified as a contracting string415* in this collation table, return its ordering.416* Otherwise return UNMAPPED.417*/418private int getContractOrder(String groupChars)419{420int result = RBCollationTables.UNMAPPED;421if (contractTable != null) {422int ch = groupChars.codePointAt(0);423/*424char ch0 = groupChars.charAt(0);425int ch = Character.isHighSurrogate(ch0)?426Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;427*/428Vector<EntryPair> entryTable = getContractValues(ch);429if (entryTable != null) {430int index = RBCollationTables.getEntry(entryTable, groupChars, true);431if (index != RBCollationTables.UNMAPPED) {432EntryPair pair = entryTable.elementAt(index);433result = pair.value;434}435}436}437return result;438}439440private final int getCharOrder(int ch) {441int order = mapping.elementAt(ch);442443if (order >= RBCollationTables.CONTRACTCHARINDEX) {444Vector<EntryPair> groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX);445EntryPair pair = groupList.firstElement();446order = pair.value;447}448return order;449}450451/**452* Get the entry of hash table of the contracting string in the collation453* table.454* @param ch the starting character of the contracting string455*/456private Vector<EntryPair> getContractValues(int ch)457{458int index = mapping.elementAt(ch);459return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX);460}461462private Vector<EntryPair> getContractValuesImpl(int index)463{464if (index >= 0)465{466return contractTable.elementAt(index);467}468else // not found469{470return null;471}472}473474/**475* Adds the expanding string into the collation table.476*/477private final void addExpandOrder(String contractChars,478String expandChars,479int anOrder) throws ParseException480{481// Create an expansion table entry482int tableIndex = addExpansion(anOrder, expandChars);483484// And add its index into the main mapping table485if (contractChars.length() > 1) {486char ch = contractChars.charAt(0);487if (Character.isHighSurrogate(ch) && contractChars.length() == 2) {488char ch2 = contractChars.charAt(1);489if (Character.isLowSurrogate(ch2)) {490//only add into table when it is a legal surrogate491addOrder(Character.toCodePoint(ch, ch2), tableIndex);492}493} else {494addContractOrder(contractChars, tableIndex);495}496} else {497addOrder(contractChars.charAt(0), tableIndex);498}499}500501private final void addExpandOrder(int ch, String expandChars, int anOrder)502throws ParseException503{504int tableIndex = addExpansion(anOrder, expandChars);505addOrder(ch, tableIndex);506}507508/**509* Create a new entry in the expansion table that contains the orderings510* for the given characers. If anOrder is valid, it is added to the511* beginning of the expanded list of orders.512*/513private int addExpansion(int anOrder, String expandChars) {514if (expandTable == null) {515expandTable = new Vector<>(INITIALTABLESIZE);516}517518// If anOrder is valid, we want to add it at the beginning of the list519int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1;520521int[] valueList = new int[expandChars.length() + offset];522if (offset == 1) {523valueList[0] = anOrder;524}525526int j = offset;527for (int i = 0; i < expandChars.length(); i++) {528char ch0 = expandChars.charAt(i);529char ch1;530int ch;531if (Character.isHighSurrogate(ch0)) {532if (++i == expandChars.length() ||533!Character.isLowSurrogate(ch1=expandChars.charAt(i))) {534//ether we are missing the low surrogate or the next char535//is not a legal low surrogate, so stop loop536break;537}538ch = Character.toCodePoint(ch0, ch1);539540} else {541ch = ch0;542}543544int mapValue = getCharOrder(ch);545546if (mapValue != RBCollationTables.UNMAPPED) {547valueList[j++] = mapValue;548} else {549// can't find it in the table, will be filled in by commit().550valueList[j++] = CHARINDEX + ch;551}552}553if (j < valueList.length) {554//we had at least one supplementary character, the size of valueList555//is bigger than it really needs...556int[] tmpBuf = new int[j];557while (--j >= 0) {558tmpBuf[j] = valueList[j];559}560valueList = tmpBuf;561}562// Add the expanding char list into the expansion table.563int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size();564expandTable.addElement(valueList);565566return tableIndex;567}568569private void addContractFlags(String chars) {570char c0;571int c;572int len = chars.length();573for (int i = 0; i < len; i++) {574c0 = chars.charAt(i);575c = Character.isHighSurrogate(c0)576?Character.toCodePoint(c0, chars.charAt(++i))577:c0;578contractFlags.put(c, 1);579}580}581582// ==============================================================583// constants584// ==============================================================585final static int CHARINDEX = 0x70000000; // need look up in .commit()586587private final static int IGNORABLEMASK = 0x0000ffff;588private final static int PRIMARYORDERINCREMENT = 0x00010000;589private final static int SECONDARYORDERINCREMENT = 0x00000100;590private final static int TERTIARYORDERINCREMENT = 0x00000001;591private final static int INITIALTABLESIZE = 20;592private final static int MAXKEYSIZE = 5;593594// ==============================================================595// instance variables596// ==============================================================597598// variables used by the build process599private RBCollationTables.BuildAPI tables = null;600private MergeCollation mPattern = null;601private boolean isOverIgnore = false;602private char[] keyBuf = new char[MAXKEYSIZE];603private IntHashtable contractFlags = new IntHashtable(100);604605// "shadow" copies of the instance variables in RBCollationTables606// (the values in these variables are copied back into RBCollationTables607// at the end of the build process)608private boolean frenchSec = false;609private boolean seAsianSwapping = false;610611private UCompactIntArray mapping = null;612private Vector<Vector<EntryPair>> contractTable = null;613private Vector<int[]> expandTable = null;614615private short maxSecOrder = 0;616private short maxTerOrder = 0;617}618619620