Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/text/normalizer/Trie.java
38830 views
/*1* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/24/*25*******************************************************************************26* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *27* *28* The original version of this source code and documentation is copyrighted *29* and owned by IBM, These materials are provided under terms of a License *30* Agreement between IBM and Sun. This technology is protected by multiple *31* US and International patents. This notice and attribution to IBM may not *32* to removed. *33*******************************************************************************34*/3536package sun.text.normalizer;3738import java.io.DataInputStream;39import java.io.InputStream;40import java.io.IOException;4142/**43* <p>A trie is a kind of compressed, serializable table of values44* associated with Unicode code points (0..0x10ffff).</p>45* <p>This class defines the basic structure of a trie and provides methods46* to <b>retrieve the offsets to the actual data</b>.</p>47* <p>Data will be the form of an array of basic types, char or int.</p>48* <p>The actual data format will have to be specified by the user in the49* inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>50* <p>This trie implementation is optimized for getting offset while walking51* forward through a UTF-16 string.52* Therefore, the simplest and fastest access macros are the53* fromLead() and fromOffsetTrail() methods.54* The fromBMP() method are a little more complicated; they get offsets even55* for lead surrogate codepoints, while the fromLead() method get special56* "folded" offsets for lead surrogate code units if there is relevant data57* associated with them.58* From such a folded offsets, an offset needs to be extracted to supply59* to the fromOffsetTrail() methods.60* To handle such supplementary codepoints, some offset information are kept61* in the data.</p>62* <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve63* that offset from the folded value for the lead surrogate unit.</p>64* <p>For examples of use, see com.ibm.icu.impl.CharTrie or65* com.ibm.icu.impl.IntTrie.</p>66* @author synwee67* @see com.ibm.icu.impl.CharTrie68* @see com.ibm.icu.impl.IntTrie69* @since release 2.1, Jan 01 200270*/71public abstract class Trie72{73// public class declaration ----------------------------------------7475/**76* Character data in com.ibm.impl.Trie have different user-specified format77* for different purposes.78* This interface specifies methods to be implemented in order for79* com.ibm.impl.Trie, to surrogate offset information encapsulated within80* the data.81*/82public static interface DataManipulate83{84/**85* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's86* data87* the index array offset of the indexes for that lead surrogate.88* @param value data value for a surrogate from the trie, including the89* folding offset90* @return data offset or 0 if there is no data for the lead surrogate91*/92public int getFoldingOffset(int value);93}9495// default implementation96private static class DefaultGetFoldingOffset implements DataManipulate {97public int getFoldingOffset(int value) {98return value;99}100}101102// protected constructor -------------------------------------------103104/**105* Trie constructor for CharTrie use.106* @param inputStream ICU data file input stream which contains the107* trie108* @param dataManipulate object containing the information to parse the109* trie data110* @throws IOException thrown when input stream does not have the111* right header.112*/113protected Trie(InputStream inputStream,114DataManipulate dataManipulate) throws IOException115{116DataInputStream input = new DataInputStream(inputStream);117// Magic number to authenticate the data.118int signature = input.readInt();119m_options_ = input.readInt();120121if (!checkHeader(signature)) {122throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");123}124125if(dataManipulate != null) {126m_dataManipulate_ = dataManipulate;127} else {128m_dataManipulate_ = new DefaultGetFoldingOffset();129}130m_isLatin1Linear_ = (m_options_ &131HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;132m_dataOffset_ = input.readInt();133m_dataLength_ = input.readInt();134unserialize(inputStream);135}136137/**138* Trie constructor139* @param index array to be used for index140* @param options used by the trie141* @param dataManipulate object containing the information to parse the142* trie data143*/144protected Trie(char index[], int options, DataManipulate dataManipulate)145{146m_options_ = options;147if(dataManipulate != null) {148m_dataManipulate_ = dataManipulate;149} else {150m_dataManipulate_ = new DefaultGetFoldingOffset();151}152m_isLatin1Linear_ = (m_options_ &153HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;154m_index_ = index;155m_dataOffset_ = m_index_.length;156}157158// protected data members ------------------------------------------159160/**161* Lead surrogate code points' index displacement in the index array.162* 0x10000-0xd800=0x2800163* 0x2800 >> INDEX_STAGE_1_SHIFT_164*/165protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;166/**167* Shift size for shifting right the input index. 1..9168*/169protected static final int INDEX_STAGE_1_SHIFT_ = 5;170/**171* Shift size for shifting left the index array values.172* Increases possible data size with 16-bit index values at the cost173* of compactability.174* This requires blocks of stage 2 data to be aligned by175* DATA_GRANULARITY.176* 0..INDEX_STAGE_1_SHIFT177*/178protected static final int INDEX_STAGE_2_SHIFT_ = 2;179/**180* Number of data values in a stage 2 (data array) block.181*/182protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;183/**184* Mask for getting the lower bits from the input index.185* DATA_BLOCK_LENGTH - 1.186*/187protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;188/** Number of bits of a trail surrogate that are used in index table lookups. */189protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;190/**191* Number of index (stage 1) entries per lead surrogate.192* Same as number of index entries for 1024 trail surrogates,193* ==0x400>>INDEX_STAGE_1_SHIFT_194*/195protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);196/** Length of the BMP portion of the index (stage 1) array. */197protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;198/**199* Surrogate mask to use when shifting offset to retrieve supplementary200* values201*/202protected static final int SURROGATE_MASK_ = 0x3FF;203/**204* Index or UTF16 characters205*/206protected char m_index_[];207/**208* Internal TrieValue which handles the parsing of the data value.209* This class is to be implemented by the user210*/211protected DataManipulate m_dataManipulate_;212/**213* Start index of the data portion of the trie. CharTrie combines214* index and data into a char array, so this is used to indicate the215* initial offset to the data portion.216* Note this index always points to the initial value.217*/218protected int m_dataOffset_;219/**220* Length of the data array221*/222protected int m_dataLength_;223224// protected methods -----------------------------------------------225226/**227* Gets the offset to the data which the surrogate pair points to.228* @param lead lead surrogate229* @param trail trailing surrogate230* @return offset to data231*/232protected abstract int getSurrogateOffset(char lead, char trail);233234/**235* Gets the value at the argument index236* @param index value at index will be retrieved237* @return 32 bit value238*/239protected abstract int getValue(int index);240241/**242* Gets the default initial value243* @return 32 bit value244*/245protected abstract int getInitialValue();246247/**248* Gets the offset to the data which the index ch after variable offset249* points to.250* Note for locating a non-supplementary character data offset, calling251* <p>252* getRawOffset(0, ch);253* </p>254* will do. Otherwise if it is a supplementary character formed by255* surrogates lead and trail. Then we would have to call getRawOffset()256* with getFoldingIndexOffset(). See getSurrogateOffset().257* @param offset index offset which ch is to start from258* @param ch index to be used after offset259* @return offset to the data260*/261protected final int getRawOffset(int offset, char ch)262{263return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]264<< INDEX_STAGE_2_SHIFT_)265+ (ch & INDEX_STAGE_3_MASK_);266}267268/**269* Gets the offset to data which the BMP character points to270* Treats a lead surrogate as a normal code point.271* @param ch BMP character272* @return offset to data273*/274protected final int getBMPOffset(char ch)275{276return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE277&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)278? getRawOffset(LEAD_INDEX_OFFSET_, ch)279: getRawOffset(0, ch);280// using a getRawOffset(ch) makes no diff281}282283/**284* Gets the offset to the data which this lead surrogate character points285* to.286* Data at the returned offset may contain folding offset information for287* the next trailing surrogate character.288* @param ch lead surrogate character289* @return offset to data290*/291protected final int getLeadOffset(char ch)292{293return getRawOffset(0, ch);294}295296/**297* Internal trie getter from a code point.298* Could be faster(?) but longer with299* if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }300* Gets the offset to data which the codepoint points to301* @param ch codepoint302* @return offset to data303*/304protected final int getCodePointOffset(int ch)305{306// if ((ch >> 16) == 0) slower307if (ch < 0) {308return -1;309} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {310// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works311return getRawOffset(0, (char)ch);312} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {313// BMP codepoint314return getBMPOffset((char)ch);315} else if (ch <= UCharacter.MAX_VALUE) {316// look at the construction of supplementary characters317// trail forms the ends of it.318return getSurrogateOffset(UTF16.getLeadSurrogate(ch),319(char)(ch & SURROGATE_MASK_));320} else {321// return -1 // if there is an error, in this case we return322return -1;323}324}325326/**327* <p>Parses the inputstream and creates the trie index with it.</p>328* <p>This is overwritten by the child classes.329* @param inputStream input stream containing the trie information330* @exception IOException thrown when data reading fails.331*/332protected void unserialize(InputStream inputStream) throws IOException333{334//indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_335m_index_ = new char[m_dataOffset_];336DataInputStream input = new DataInputStream(inputStream);337for (int i = 0; i < m_dataOffset_; i ++) {338m_index_[i] = input.readChar();339}340}341342/**343* Determines if this is a 32 bit trie344* @return true if options specifies this is a 32 bit trie345*/346protected final boolean isIntTrie()347{348return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;349}350351/**352* Determines if this is a 16 bit trie353* @return true if this is a 16 bit trie354*/355protected final boolean isCharTrie()356{357return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;358}359360// private data members --------------------------------------------361362/**363* Latin 1 option mask364*/365protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;366/**367* Constant number to authenticate the byte block368*/369protected static final int HEADER_SIGNATURE_ = 0x54726965;370/**371* Header option formatting372*/373private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;374protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;375protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;376377/**378* Flag indicator for Latin quick access data block379*/380private boolean m_isLatin1Linear_;381382/**383* <p>Trie options field.</p>384* <p>options bit field:<br>385* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>386* 8 0 = 16-bit data, 1=32-bit data<br>387* 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>388* 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>389*/390private int m_options_;391392// private methods ---------------------------------------------------393394/**395* Authenticates raw data header.396* Checking the header information, signature and options.397* @param signature This contains the options and type of a Trie398* @return true if the header is authenticated valid399*/400private final boolean checkHeader(int signature)401{402// check the signature403// Trie in big-endian US-ASCII (0x54726965).404// Magic number to authenticate the data.405if (signature != HEADER_SIGNATURE_) {406return false;407}408409if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=410INDEX_STAGE_1_SHIFT_ ||411((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &412HEADER_OPTIONS_SHIFT_MASK_)413!= INDEX_STAGE_2_SHIFT_) {414return false;415}416return true;417}418}419420421