Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/java/text/CollationElementIterator.java
38829 views
/*
 * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
 * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved
 *
 *   The original version of this source code and documentation is copyrighted
 * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
 * materials are provided under terms of a License Agreement between Taligent
 * and Sun. This technology is protected by multiple US and International
 * patents.
This notice and attribution to Taligent may not be removed.
 *   Taligent is a registered trademark of Taligent, Inc.
 *
 */

package java.text;

import java.lang.Character;
import java.util.Vector;
import sun.text.CollatorUtilities;
import sun.text.normalizer.NormalizerBase;

/**
 * The <code>CollationElementIterator</code> class is used as an iterator
 * to walk through each character of an international string. Use the iterator
 * to return the ordering priority of the positioned character. The ordering
 * priority of a character, which we refer to as a key, defines how a character
 * is collated in the given collation object.
 *
 * <p>
 * For example, consider the following in Spanish:
 * <blockquote>
 * <pre>
 * "ca" &rarr; the first key is key('c') and second key is key('a').
 * "cha" &rarr; the first key is key('ch') and second key is key('a').
 * </pre>
 * </blockquote>
 * And in German,
 * <blockquote>
 * <pre>
 * "&#92;u00e4b" &rarr; the first key is key('a'), the second key is key('e'), and
 * the third key is key('b').
 * </pre>
 * </blockquote>
 * The key of a character is an integer composed of primary order(short),
 * secondary order(byte), and tertiary order(byte). Java strictly defines
 * the size and signedness of its primitive data types. Therefore, the static
 * functions <code>primaryOrder</code>, <code>secondaryOrder</code>, and
 * <code>tertiaryOrder</code> return <code>int</code>, <code>short</code>,
 * and <code>short</code> respectively to ensure the correctness of the key
 * value.
 *
 * <p>
 * Example of the iterator usage,
 * <blockquote>
 * <pre>
 *
 *  String testString = "This is a test";
 *  Collator col = Collator.getInstance();
 *  if (col instanceof RuleBasedCollator) {
 *      RuleBasedCollator ruleBasedCollator = (RuleBasedCollator)col;
 *      CollationElementIterator collationElementIterator = ruleBasedCollator.getCollationElementIterator(testString);
 *      int primaryOrder = CollationElementIterator.primaryOrder(collationElementIterator.next());
 *          :
 *  }
 * </pre>
 * </blockquote>
 *
 * <p>
 * <code>CollationElementIterator.next</code> returns the collation order
 * of the next character. A collation order consists of primary order,
 * secondary order and tertiary order. The data type of the collation
 * order is <strong>int</strong>. The first 16 bits of a collation order
 * is its primary order; the next 8 bits is the secondary order and the
 * last 8 bits is the tertiary order.
 *
 * <p><b>Note:</b> <code>CollationElementIterator</code> is a part of
 * <code>RuleBasedCollator</code> implementation. It is only usable
 * with <code>RuleBasedCollator</code> instances.
 *
 * @see                Collator
 * @see                RuleBasedCollator
 * @author             Helena Shih, Laura Werner, Richard Gillam
 */
public final class CollationElementIterator
{
    /**
     * Null order which indicates the end of string is reached by the
     * cursor.
     */
    public final static int NULLORDER = 0xffffffff;

    /**
     * CollationElementIterator constructor.  This takes the source string and
     * the collation object.  The cursor will walk thru the source string based
     * on the predefined collation rules.  If the source string is empty,
     * NULLORDER will be returned on the calls to next().
     * @param sourceText the source string.
     * @param owner the collation object.
     */
    CollationElementIterator(String sourceText, RuleBasedCollator owner) {
        this.owner = owner;
        ordering = owner.getTables();
        // An empty string leaves 'text' null; every public method treats a
        // null 'text' as "nothing to iterate over".
        if (sourceText.length() != 0) {
            text = new NormalizerBase(sourceText, ownerMode());
        }
    }

    /**
     * CollationElementIterator constructor.  This takes the source text, given
     * as a character iterator, and the collation object.  The cursor will walk
     * thru the source text based on the predefined collation rules.
     * @param sourceText the character iterator over the source text.
     * @param owner the collation object.
     */
    CollationElementIterator(CharacterIterator sourceText, RuleBasedCollator owner) {
        this.owner = owner;
        ordering = owner.getTables();
        text = new NormalizerBase(sourceText, ownerMode());
    }

    /**
     * Resets the cursor to the beginning of the string.  The next call
     * to next() will return the first collation element in the string.
     */
    public void reset()
    {
        if (text != null) {
            text.reset();
            text.setMode(ownerMode());
        }
        clearPending();
    }

    /**
     * Get the next collation element in the string.  <p>This iterator iterates
     * over a sequence of collation elements that were built from the string.
     * Because there isn't necessarily a one-to-one mapping from characters to
     * collation elements, this doesn't mean the same thing as "return the
     * collation element [or ordering priority] of the next character in the
     * string".</p>
     * <p>This function returns the collation element that the iterator is currently
     * pointing to and then updates the internal pointer to point to the next element.
     * previous() updates the pointer first and then returns the element.  This
     * means that when you change direction while iterating (i.e., call next() and
     * then call previous(), or call previous() and then call next()), you'll get
     * back the same element twice.</p>
     *
     * @return the next collation element
     */
    public int next()
    {
        if (text == null) {
            return NULLORDER;
        }
        syncMode();

        // if buffer contains any decomposed char values
        // return their strength orders before continuing in
        // the Normalizer's CharacterIterator.
        if (buffer != null) {
            if (expIndex < buffer.length) {
                return strengthOrder(buffer[expIndex++]);
            }
            buffer = null;
            expIndex = 0;
        } else if (swapOrder != 0) {
            return drainSwapOrder();
        }
        int ch = text.next();

        // are we at the end of Normalizer's text?
        if (ch == NormalizerBase.DONE) {
            return NULLORDER;
        }

        int value = ordering.getUnicodeOrder(ch);
        if (value == RuleBasedCollator.UNMAPPED) {
            // An unmapped character yields UNMAPPEDCHARVALUE now and the
            // character itself (shifted into the high 16 bits) on the
            // following call(s), via swapOrder.
            swapOrder = ch;
            return UNMAPPEDCHARVALUE;
        }
        else if (value >= RuleBasedCollator.CONTRACTCHARINDEX) {
            value = nextContractChar(ch);
        }
        if (value >= RuleBasedCollator.EXPANDCHARINDEX) {
            buffer = ordering.getExpandValueList(value);
            expIndex = 0;
            value = buffer[expIndex++];
        }

        if (ordering.isSEAsianSwapping()) {
            int consonant;
            if (isThaiPreVowel(ch)) {
                consonant = text.next();
                if (isThaiBaseConsonant(consonant)) {
                    buffer = makeReorderedBuffer(consonant, value, buffer, true);
                    value = buffer[0];
                    expIndex = 1;
                } else if (consonant != NormalizerBase.DONE) {
                    // undo the look-ahead
                    text.previous();
                }
            }
            if (isLaoPreVowel(ch)) {
                consonant = text.next();
                if (isLaoBaseConsonant(consonant)) {
                    buffer = makeReorderedBuffer(consonant, value, buffer, true);
                    value = buffer[0];
                    expIndex = 1;
                } else if (consonant != NormalizerBase.DONE) {
                    // undo the look-ahead
                    text.previous();
                }
            }
        }

        return strengthOrder(value);
    }

    /**
     * Get the previous collation element in the string.  <p>This iterator iterates
     * over a sequence of collation elements that were built from the string.
     * Because there isn't necessarily a one-to-one mapping from characters to
     * collation elements, this doesn't mean the same thing as "return the
     * collation element [or ordering priority] of the previous character in the
     * string".</p>
     * <p>This function updates the iterator's internal pointer to point to the
     * collation element preceding the one it's currently pointing to and then
     * returns that element, while next() returns the current element and then
     * updates the pointer.  This means that when you change direction while
     * iterating (i.e., call next() and then call previous(), or call previous()
     * and then call next()), you'll get back the same element twice.</p>
     *
     * @return the previous collation element
     * @since 1.2
     */
    public int previous()
    {
        if (text == null) {
            return NULLORDER;
        }
        syncMode();

        if (buffer != null) {
            if (expIndex > 0) {
                return strengthOrder(buffer[--expIndex]);
            }
            buffer = null;
            expIndex = 0;
        } else if (swapOrder != 0) {
            return drainSwapOrder();
        }
        int ch = text.previous();
        if (ch == NormalizerBase.DONE) {
            return NULLORDER;
        }

        int value = ordering.getUnicodeOrder(ch);

        if (value == RuleBasedCollator.UNMAPPED) {
            // NOTE(review): next() reports an unmapped character as
            // UNMAPPEDCHARVALUE followed by (ch << 16).  Here the raw code
            // point is returned first and UNMAPPEDCHARVALUE is queued, which
            // drainSwapOrder() then shifts left by 16.  The two directions are
            // not mirror images of each other -- confirm the intended values
            // before changing this, since callers may rely on them.
            swapOrder = UNMAPPEDCHARVALUE;
            return ch;
        } else if (value >= RuleBasedCollator.CONTRACTCHARINDEX) {
            value = prevContractChar(ch);
        }
        if (value >= RuleBasedCollator.EXPANDCHARINDEX) {
            buffer = ordering.getExpandValueList(value);
            expIndex = buffer.length;
            value = buffer[--expIndex];
        }

        if (ordering.isSEAsianSwapping()) {
            int vowel;
            if (isThaiBaseConsonant(ch)) {
                vowel = text.previous();
                if (isThaiPreVowel(vowel)) {
                    buffer = makeReorderedBuffer(vowel, value, buffer, false);
                    expIndex = buffer.length - 1;
                    value = buffer[expIndex];
                } else if (vowel != NormalizerBase.DONE) {
                    // Undo the look-behind.  Same guard as in next(): when the
                    // look-behind hit the start of the text there is nothing
                    // to undo (presumably a DONE result leaves the position
                    // untouched, so stepping forward would skip back over ch).
                    text.next();
                }
            }
            if (isLaoBaseConsonant(ch)) {
                vowel = text.previous();
                if (isLaoPreVowel(vowel)) {
                    buffer = makeReorderedBuffer(vowel, value, buffer, false);
                    expIndex = buffer.length - 1;
                    value = buffer[expIndex];
                } else if (vowel != NormalizerBase.DONE) {
                    // Undo the look-behind; see the Thai branch above.
                    text.next();
                }
            }
        }

        return strengthOrder(value);
    }

    /**
     * Return the primary component of a collation element.
     * @param order the collation element
     * @return the element's primary component
     */
    public final static int primaryOrder(int order)
    {
        order &= RBCollationTables.PRIMARYORDERMASK;
        return (order >>> RBCollationTables.PRIMARYORDERSHIFT);
    }

    /**
     * Return the secondary component of a collation element.
     * @param order the collation element
     * @return the element's secondary component
     */
    public final static short secondaryOrder(int order)
    {
        order = order & RBCollationTables.SECONDARYORDERMASK;
        return ((short)(order >> RBCollationTables.SECONDARYORDERSHIFT));
    }

    /**
     * Return the tertiary component of a collation element.
     * @param order the collation element
     * @return the element's tertiary component
     */
    public final static short tertiaryOrder(int order)
    {
        return (short)(order & RBCollationTables.TERTIARYORDERMASK);
    }

    /**
     *  Get the comparison order in the desired strength.  Ignore the other
     *  differences.
     *  @param order The order value
     *  @return the order masked down to the owner's strength: primary
     *  differences only for Collator.PRIMARY, primary and secondary for
     *  Collator.SECONDARY, and the unchanged order otherwise.
     */
    final int strengthOrder(int order)
    {
        int s = owner.getStrength();
        if (s == Collator.PRIMARY)
        {
            order &= RBCollationTables.PRIMARYDIFFERENCEONLY;
        } else if (s == Collator.SECONDARY)
        {
            order &= RBCollationTables.SECONDARYDIFFERENCEONLY;
        }
        return order;
    }

    /**
     * Sets the iterator to point to the collation element corresponding to
     * the specified character (the parameter is a CHARACTER offset in the
     * original string, not an offset into its corresponding sequence of
     * collation elements).  The value returned by the next call to next()
     * will be the collation element corresponding to the specified position
     * in the text.  If that position is in the middle of a contracting
     * character sequence, the result of the next call to next() is the
     * collation element for that sequence.  This means that getOffset()
     * is not guaranteed to return the same value as was passed to a preceding
     * call to setOffset().
     *
     * @param newOffset The new character offset into the original text.
     * @since 1.2
     */
    @SuppressWarnings("deprecation") // getBeginIndex, getEndIndex and setIndex are deprecated
    public void setOffset(int newOffset)
    {
        if (text != null) {
            if (newOffset < text.getBeginIndex()
                || newOffset >= text.getEndIndex()) {
                    text.setIndexOnly(newOffset);
            } else {
                int c = text.setIndex(newOffset);

                // if the desired character isn't used in a contracting character
                // sequence, bypass all the backing-up logic-- we're sitting on
                // the right character already
                if (ordering.usedInContractSeq(c)) {
                    // walk backwards through the string until we see a character
                    // that DOESN'T participate in a contracting character sequence
                    while (ordering.usedInContractSeq(c)) {
                        c = text.previous();
                    }
                    // now walk forward using this object's next() method until
                    // we pass the starting point and set our current position
                    // to the beginning of the last "character" before or at
                    // our starting position
                    int last = text.getIndex();
                    while (text.getIndex() <= newOffset) {
                        last = text.getIndex();
                        next();
                    }
                    // No extra text.previous() is needed here: 'last' is
                    // already the index at which the contraction that
                    // encompasses newOffset starts.
                    text.setIndexOnly(last);
                }
            }
        }
        // Discard whatever the next() calls above left pending.
        clearPending();
    }

    /**
     * Returns the character offset in the original text corresponding to the next
     * collation element.  (That is, getOffset() returns the position in the text
     * corresponding to the collation element that will be returned by the next
     * call to next().)  This value will always be the index of the FIRST character
     * corresponding to the collation element (a contracting character sequence is
     * when two or more characters all correspond to the same collation element).
     * This means if you do setOffset(x) followed immediately by getOffset(), getOffset()
     * won't necessarily return x.
     *
     * @return The character offset in the original text corresponding to the collation
     * element that will be returned by the next call to next().
     * @since 1.2
     */
    public int getOffset()
    {
        return (text != null) ? text.getIndex() : 0;
    }


    /**
     * Return the maximum length of any expansion sequences that end
     * with the specified comparison order.
     * @param order a collation order returned by previous or next.
     * @return the maximum length of any expansion sequences ending
     *         with the specified order.
     * @since 1.2
     */
    public int getMaxExpansion(int order)
    {
        return ordering.getMaxExpansion(order);
    }

    /**
     * Set a new string over which to iterate.
     *
     * @param source  the new source text
     * @since 1.2
     */
    public void setText(String source)
    {
        clearPending();
        NormalizerBase.Mode mode = ownerMode();
        if (text == null) {
            text = new NormalizerBase(source, mode);
        } else {
            text.setMode(mode);
            text.setText(source);
        }
    }

    /**
     * Set a new string over which to iterate.
     *
     * @param source  the new source text.
     * @since 1.2
     */
    public void setText(CharacterIterator source)
    {
        clearPending();
        NormalizerBase.Mode mode = ownerMode();
        if (text == null) {
            text = new NormalizerBase(source, mode);
        } else {
            text.setMode(mode);
            text.setText(source);
        }
    }

    //============================================================
    // privates
    //============================================================

    /**
     * Convert the owner's decomposition mode to something the Normalizer
     * understands.
     */
    private NormalizerBase.Mode ownerMode() {
        return CollatorUtilities.toNormalizerMode(owner.getDecomposition());
    }

    /**
     * Bring the Normalizer's mode in line with the owner's decomposition
     * mode if the two differ.  Must only be called when text is non-null.
     */
    private void syncMode() {
        NormalizerBase.Mode wanted = ownerMode();
        if (text.getMode() != wanted) {
            text.setMode(wanted);
        }
    }

    /**
     * Drop any collation elements that are queued up for return: the
     * expansion/reordering buffer and the pending swapOrder value.
     */
    private void clearPending() {
        buffer = null;
        expIndex = 0;
        swapOrder = 0;
    }

    /**
     * Return the next pending element derived from swapOrder.  A supplementary
     * code point is handed out one UTF-16 code unit per call; anything else is
     * handed out in a single call, after which swapOrder is cleared.  The
     * returned value is the code unit (or swapOrder itself) shifted left by 16.
     */
    private int drainSwapOrder() {
        if (Character.isSupplementaryCodePoint(swapOrder)) {
            char[] chars = Character.toChars(swapOrder);
            swapOrder = chars[1];
            return chars[0] << 16;
        }
        int order = swapOrder << 16;
        swapOrder = 0;
        return order;
    }

    /**
     * Determine if a character is a Thai vowel (which sorts after
     * its base consonant).
     */
    private final static boolean isThaiPreVowel(int ch) {
        return (ch >= 0x0e40) && (ch <= 0x0e44);
    }

    /**
     * Determine if a character is a Thai base consonant
     */
    private final static boolean isThaiBaseConsonant(int ch) {
        return (ch >= 0x0e01) && (ch <= 0x0e2e);
    }

    /**
     * Determine if a character is a Lao vowel (which sorts after
     * its base consonant).
     */
    private final static boolean isLaoPreVowel(int ch) {
        return (ch >= 0x0ec0) && (ch <= 0x0ec4);
    }

    /**
     * Determine if a character is a Lao base consonant
     */
    private final static boolean isLaoBaseConsonant(int ch) {
        return (ch >= 0x0e81) && (ch <= 0x0eae);
    }

    /**
     * This method produces a buffer which contains the collation
     * elements for the two characters, with colFirst's values preceding
     * another character's.  Presumably, the other character precedes colFirst
     * in logical order (otherwise you wouldn't need this method would you?).
     * The assumption is that the other char's value(s) have already been
     * computed.  If this char has a single element it is passed to this
     * method as lastValue, and lastExpansion is null.  If it has an
     * expansion it is passed in lastExpansion, and lastValue is ignored.
     * When forward is false the two characters' roles are swapped, so the
     * already-computed values come first in the result.
     */
    private int[] makeReorderedBuffer(int colFirst,
                                      int lastValue,
                                      int[] lastExpansion,
                                      boolean forward) {

        int[] result;

        int firstValue = ordering.getUnicodeOrder(colFirst);
        if (firstValue >= RuleBasedCollator.CONTRACTCHARINDEX) {
            firstValue = forward ? nextContractChar(colFirst) : prevContractChar(colFirst);
        }

        int[] firstExpansion = null;
        if (firstValue >= RuleBasedCollator.EXPANDCHARINDEX) {
            firstExpansion = ordering.getExpandValueList(firstValue);
        }

        if (!forward) {
            int temp1 = firstValue;
            firstValue = lastValue;
            lastValue = temp1;
            int[] temp2 = firstExpansion;
            firstExpansion = lastExpansion;
            lastExpansion = temp2;
        }

        if (firstExpansion == null && lastExpansion == null) {
            result = new int [2];
            result[0] = firstValue;
            result[1] = lastValue;
        }
        else {
            int firstLength = firstExpansion == null ? 1 : firstExpansion.length;
            int lastLength = lastExpansion == null ? 1 : lastExpansion.length;
            result = new int[firstLength + lastLength];

            if (firstExpansion == null) {
                result[0] = firstValue;
            }
            else {
                System.arraycopy(firstExpansion, 0, result, 0, firstLength);
            }

            if (lastExpansion == null) {
                result[firstLength] = lastValue;
            }
            else {
                System.arraycopy(lastExpansion, 0, result, firstLength, lastLength);
            }
        }

        return result;
    }

    /**
     *  Check if a comparison order is ignorable.
     *  @return true if a character is ignorable, false otherwise.
     */
    final static boolean isIgnorable(int order)
    {
        return primaryOrder(order) == 0;
    }

    /**
     * Get the ordering priority of the next contracting character in the
     * string.
     * @param ch the starting character of a contracting character token
     * @return the next contracting character's ordering.  Returns NULLORDER
     * if the end of string is reached.
     */
    private int nextContractChar(int ch)
    {
        // First get the ordering of this single character,
        // which is always the first element in the list
        Vector<EntryPair> list = ordering.getContractValues(ch);
        EntryPair pair = list.firstElement();
        int order = pair.value;

        // find out the length of the longest contracting character sequence in the list.
        // There's logic in the builder code to make sure the longest sequence is always
        // the last.
        pair = list.lastElement();
        int maxLength = pair.entryName.length();

        // (the Normalizer is cloned here so that the seeking we do in the next loop
        // won't affect our real position in the text)
        NormalizerBase tempText = (NormalizerBase)text.clone();

        // extract the next maxLength characters in the string (we have to do this using the
        // Normalizer to ensure that our offsets correspond to those the rest of the
        // iterator is using) and store it in "fragment".
        tempText.previous();
        key.setLength(0);
        int c = tempText.next();
        while (maxLength > 0 && c != NormalizerBase.DONE) {
            if (Character.isSupplementaryCodePoint(c)) {
                key.append(Character.toChars(c));
                maxLength -= 2;
            } else {
                key.append((char)c);
                --maxLength;
            }
            c = tempText.next();
        }
        String fragment = key.toString();
        // now that we have that fragment, iterate through this list looking for the
        // longest sequence that matches the characters in the actual text.  (maxLength
        // is used here to keep track of the length of the longest sequence)
        // Upon exit from this loop, maxLength will contain the length of the matching
        // sequence and order will contain the collation-element value corresponding
        // to this sequence
        maxLength = 1;
        for (int i = list.size() - 1; i > 0; i--) {
            pair = list.elementAt(i);
            if (!pair.fwd) {
                continue;
            }

            if (fragment.startsWith(pair.entryName)
                    && pair.entryName.length() > maxLength) {
                maxLength = pair.entryName.length();
                order = pair.value;
            }
        }

        // seek our current iteration position to the end of the matching sequence
        // and return the appropriate collation-element value (if there was no matching
        // sequence, we're already seeked to the right position and order already contains
        // the correct collation-element value for the single character)
        while (maxLength > 1) {
            c = text.next();
            maxLength -= Character.charCount(c);
        }
        return order;
    }

    /**
     * Get the ordering priority of the previous contracting character in the
     * string.
     * @param ch the starting character of a contracting character token
     * @return the previous contracting character's ordering.  Returns NULLORDER
     * if the end of string is reached.
     */
    private int prevContractChar(int ch)
    {
        // This function is identical to nextContractChar(), except that we've
        // switched things so that the next() and previous() calls on the Normalizer
        // are switched and so that we skip entry pairs with the fwd flag turned on
        // rather than off.  Notice that we still use append() and startsWith() when
        // working on the fragment.  This is because the entry pairs that are used
        // in reverse iteration have their names reversed already.
        Vector<EntryPair> list = ordering.getContractValues(ch);
        EntryPair pair = list.firstElement();
        int order = pair.value;

        pair = list.lastElement();
        int maxLength = pair.entryName.length();

        NormalizerBase tempText = (NormalizerBase)text.clone();

        tempText.next();
        key.setLength(0);
        int c = tempText.previous();
        while (maxLength > 0 && c != NormalizerBase.DONE) {
            if (Character.isSupplementaryCodePoint(c)) {
                key.append(Character.toChars(c));
                maxLength -= 2;
            } else {
                key.append((char)c);
                --maxLength;
            }
            c = tempText.previous();
        }
        String fragment = key.toString();

        maxLength = 1;
        for (int i = list.size() - 1; i > 0; i--) {
            pair = list.elementAt(i);
            if (pair.fwd) {
                continue;
            }

            if (fragment.startsWith(pair.entryName)
                    && pair.entryName.length() > maxLength) {
                maxLength = pair.entryName.length();
                order = pair.value;
            }
        }

        while (maxLength > 1) {
            c = text.previous();
            maxLength -= Character.charCount(c);
        }
        return order;
    }

    /** Collation element returned by next() ahead of an unmapped character. */
    final static int UNMAPPEDCHARVALUE = 0x7FFF0000;

    /** Normalizing cursor over the source text; null when there is no text. */
    private NormalizerBase text = null;
    /** Pending collation elements of an expansion or a reordered pair, or null. */
    private int[] buffer = null;
    /** Cursor into buffer. */
    private int expIndex = 0;
    /** Scratch buffer used while matching contracting character sequences. */
    private final StringBuffer key = new StringBuffer(5);
    /** Pending value to hand out on the following call(s); 0 when none. */
    private int swapOrder = 0;
    /** Collation tables obtained from the owner. */
    private final RBCollationTables ordering;
    /** The collator this iterator belongs to. */
    private final RuleBasedCollator owner;
}

