Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/text/normalizer/UCharacterProperty.java
38830 views
/*1* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/24/*25*******************************************************************************26* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *27* *28* The original version of this source code and documentation is copyrighted *29* and owned by IBM, These materials are provided under terms of a License *30* Agreement between IBM and Sun. This technology is protected by multiple *31* US and International patents. This notice and attribution to IBM may not *32* to removed. *33*******************************************************************************34*/3536package sun.text.normalizer;3738import java.io.BufferedInputStream;39import java.io.InputStream;40import java.io.IOException;41import java.util.MissingResourceException;4243/**44* <p>Internal class used for Unicode character property database.</p>45* <p>This classes store binary data read from uprops.icu.46* It does not have the capability to parse the data into more high-level47* information. It only returns bytes of information when required.</p>48* <p>Due to the form most commonly used for retrieval, array of char is used49* to store the binary data.</p>50* <p>UCharacterPropertyDB also contains information on accessing indexes to51* significant points in the binary data.</p>52* <p>Responsibility for molding the binary data into more meaning form lies on53* <a href=UCharacter.html>UCharacter</a>.</p>54* @author Syn Wee Quek55* @since release 2.1, february 1st 200256*/5758public final class UCharacterProperty59{60// public data members -----------------------------------------------6162/**63* Trie data64*/65public CharTrie m_trie_;66/**67* Optimization68* CharTrie index array69*/70public char[] m_trieIndex_;71/**72* Optimization73* CharTrie data array74*/75public char[] m_trieData_;76/**77* Optimization78* CharTrie data offset79*/80public int m_trieInitialValue_;81/**82* Unicode version83*/84public VersionInfo m_unicodeVersion_;8586// uprops.h enum UPropertySource --------------------------------------- ***8788/** From uchar.c/uprops.icu properties vectors trie */89public static final int SRC_PROPSVEC=2;90/** One more than the highest UPropertySource (SRC_) constant. */91public static final int SRC_COUNT=9;9293// public methods ----------------------------------------------------9495/**96* Java friends implementation97*/98public void setIndexData(CharTrie.FriendAgent friendagent)99{100m_trieIndex_ = friendagent.getPrivateIndex();101m_trieData_ = friendagent.getPrivateData();102m_trieInitialValue_ = friendagent.getPrivateInitialValue();103}104105/**106* Gets the property value at the index.107* This is optimized.108* Note this is alittle different from CharTrie the index m_trieData_109* is never negative.110* @param ch code point whose property value is to be retrieved111* @return property value of code point112*/113public final int getProperty(int ch)114{115if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE116|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE117&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {118// BMP codepoint 0000..D7FF or DC00..FFFF119// optimized120try { // using try for ch < 0 is faster than using an if statement121return m_trieData_[122(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]123<< Trie.INDEX_STAGE_2_SHIFT_)124+ (ch & Trie.INDEX_STAGE_3_MASK_)];125} catch (ArrayIndexOutOfBoundsException e) {126return m_trieInitialValue_;127}128}129if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {130// lead surrogate D800..DBFF131return m_trieData_[132(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_133+ (ch >> Trie.INDEX_STAGE_1_SHIFT_)]134<< Trie.INDEX_STAGE_2_SHIFT_)135+ (ch & Trie.INDEX_STAGE_3_MASK_)];136}137if (ch <= UTF16.CODEPOINT_MAX_VALUE) {138// supplementary code point 10000..10FFFF139// look at the construction of supplementary characters140// trail forms the ends of it.141return m_trie_.getSurrogateValue(142UTF16.getLeadSurrogate(ch),143(char)(ch & Trie.SURROGATE_MASK_));144}145// ch is out of bounds146// return m_dataOffset_ if there is an error, in this case we return147// the default value: m_initialValue_148// we cannot assume that m_initialValue_ is at offset 0149// this is for optimization.150return m_trieInitialValue_;151152// this all is an inlined form of return m_trie_.getCodePointValue(ch);153}154155/**156* Getting the unsigned numeric value of a character embedded in the property157* argument158* @param prop the character159* @return unsigned numberic value160*/161public static int getUnsignedValue(int prop)162{163return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;164}165166/**167* Gets the unicode additional properties.168* C version getUnicodeProperties.169* @param codepoint codepoint whose additional properties is to be170* retrieved171* @param column172* @return unicode properties173*/174public int getAdditional(int codepoint, int column) {175if (column == -1) {176return getProperty(codepoint);177}178if (column < 0 || column >= m_additionalColumnsCount_) {179return 0;180}181return m_additionalVectors_[182m_additionalTrie_.getCodePointValue(codepoint) + column];183}184185/**186* <p>Get the "age" of the code point.</p>187* <p>The "age" is the Unicode version when the code point was first188* designated (as a non-character or for Private Use) or assigned a189* character.</p>190* <p>This can be useful to avoid emitting code points to receiving191* processes that do not accept newer characters.</p>192* <p>The data is from the UCD file DerivedAge.txt.</p>193* <p>This API does not check the validity of the codepoint.</p>194* @param codepoint The code point.195* @return the Unicode version number196*/197public VersionInfo getAge(int codepoint)198{199int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;200return VersionInfo.getInstance(201(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,202version & LAST_NIBBLE_MASK_, 0, 0);203}204205/**206* Forms a supplementary code point from the argument character<br>207* Note this is for internal use hence no checks for the validity of the208* surrogate characters are done209* @param lead lead surrogate character210* @param trail trailing surrogate character211* @return code point of the supplementary character212*/213public static int getRawSupplementary(char lead, char trail)214{215return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;216}217218/**219* Loads the property data and initialize the UCharacterProperty instance.220* @throws MissingResourceException when data is missing or data has been corrupted221*/222public static UCharacterProperty getInstance()223{224if(INSTANCE_ == null) {225try {226INSTANCE_ = new UCharacterProperty();227}228catch (Exception e) {229throw new MissingResourceException(e.getMessage(),"","");230}231}232return INSTANCE_;233}234235/**236* Checks if the argument c is to be treated as a white space in ICU237* rules. Usually ICU rule white spaces are ignored unless quoted.238* Equivalent to test for Pattern_White_Space Unicode property.239* Stable set of characters, won't change.240* See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/241* @param c codepoint to check242* @return true if c is a ICU white space243*/244public static boolean isRuleWhiteSpace(int c)245{246/* "white space" in the sense of ICU rule parsers247This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.248See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/249U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029250Equivalent to test for Pattern_White_Space Unicode property.251*/252return (c >= 0x0009 && c <= 0x2029 &&253(c <= 0x000D || c == 0x0020 || c == 0x0085 ||254c == 0x200E || c == 0x200F || c >= 0x2028));255}256257// protected variables -----------------------------------------------258259/**260* Extra property trie261*/262CharTrie m_additionalTrie_;263/**264* Extra property vectors, 1st column for age and second for binary265* properties.266*/267int m_additionalVectors_[];268/**269* Number of additional columns270*/271int m_additionalColumnsCount_;272/**273* Maximum values for block, bits used as in vector word274* 0275*/276int m_maxBlockScriptValue_;277/**278* Maximum values for script, bits used as in vector word279* 0280*/281int m_maxJTGValue_;282283// private variables -------------------------------------------------284285/**286* UnicodeData.txt property object287*/288private static UCharacterProperty INSTANCE_ = null;289290/**291* Default name of the datafile292*/293private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";294295/**296* Default buffer size of datafile297*/298private static final int DATA_BUFFER_SIZE_ = 25000;299300/**301* Numeric value shift302*/303private static final int VALUE_SHIFT_ = 8;304305/**306* Mask to be applied after shifting to obtain an unsigned numeric value307*/308private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;309310/**311* Shift value for lead surrogate to form a supplementary character.312*/313private static final int LEAD_SURROGATE_SHIFT_ = 10;314/**315* Offset to add to combined surrogate pair to avoid msking.316*/317private static final int SURROGATE_OFFSET_ =318UTF16.SUPPLEMENTARY_MIN_VALUE -319(UTF16.SURROGATE_MIN_VALUE <<320LEAD_SURROGATE_SHIFT_) -321UTF16.TRAIL_SURROGATE_MIN_VALUE;322323// additional properties ----------------------------------------------324325/**326* First nibble shift327*/328private static final int FIRST_NIBBLE_SHIFT_ = 0x4;329/**330* Second nibble mask331*/332private static final int LAST_NIBBLE_MASK_ = 0xF;333/**334* Age value shift335*/336private static final int AGE_SHIFT_ = 24;337338// private constructors --------------------------------------------------339340/**341* Constructor342* @exception IOException thrown when data reading fails or data corrupted343*/344private UCharacterProperty() throws IOException345{346// jar access347InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);348BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);349UCharacterPropertyReader reader = new UCharacterPropertyReader(b);350reader.read(this);351b.close();352353m_trie_.putIndexData(this);354}355356public void upropsvec_addPropertyStarts(UnicodeSet set) {357/* add the start code point of each same-value range of the properties vectors trie */358if(m_additionalColumnsCount_>0) {359/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */360TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);361RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();362while(propsVectorsIter.next(propsVectorsResult)){363set.add(propsVectorsResult.start);364}365}366}367368}369370371