// Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/net/idn/StringPrep.java
// 38918 views
/*1* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.2*3* This code is free software; you can redistribute it and/or modify it4* under the terms of the GNU General Public License version 2 only, as5* published by the Free Software Foundation. Oracle designates this6* particular file as subject to the "Classpath" exception as provided7* by Oracle in the LICENSE file that accompanied this code.8*9* This code is distributed in the hope that it will be useful, but WITHOUT10* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or11* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License12* version 2 for more details (a copy is included in the LICENSE file that13* accompanied this code).14*15* You should have received a copy of the GNU General Public License version16* 2 along with this work; if not, write to the Free Software Foundation,17* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.18*19* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA20* or visit www.oracle.com if you need additional information or have any21* questions.22*/23/*24/*25*******************************************************************************26* Copyright (C) 2003-2004, International Business Machines Corporation and *27* others. All Rights Reserved. 
*28*******************************************************************************29*/30//31// CHANGELOG32// 2005-05-19 Edward Wang33// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java34// - move from package com.ibm.icu.text to package sun.net.idn35// - use ParseException instead of StringPrepParseException36// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'37// - remove all @deprecated tag to make compiler happy38// 2007-08-14 Martin Buchholz39// - remove redundant casts40//41package sun.net.idn;4243import java.io.BufferedInputStream;44import java.io.ByteArrayInputStream;45import java.io.IOException;46import java.io.InputStream;47import java.text.ParseException;4849import sun.text.Normalizer;50import sun.text.normalizer.CharTrie;51import sun.text.normalizer.Trie;52import sun.text.normalizer.NormalizerImpl;53import sun.text.normalizer.VersionInfo;54import sun.text.normalizer.UCharacter;55import sun.text.normalizer.UCharacterIterator;56import sun.text.normalizer.UTF16;57import sun.net.idn.UCharacterDirection;58import sun.net.idn.StringPrepDataReader;5960/**61* StringPrep API implements the StingPrep framework as described by62* <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.63* StringPrep prepares Unicode strings for use in network protocols.64* Profiles of StingPrep are set of rules and data according to which the65* Unicode Strings are prepared. Each profiles contains tables which describe66* how a code point should be treated. The tables are broadly classied into67* <ul>68* <li> Unassigned Table: Contains code points that are unassigned69* in the Unicode Version supported by StringPrep. Currently70* RFC 3454 supports Unicode 3.2. </li>71* <li> Prohibited Table: Contains code points that are prohibted from72* the output of the StringPrep processing function. </li>73* <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. 
</li>74* </ul>75*76* The procedure for preparing Unicode strings:77* <ol>78* <li> Map: For each character in the input, check if it has a mapping79* and, if so, replace it with its mapping. </li>80* <li> Normalize: Possibly normalize the result of step 1 using Unicode81* normalization. </li>82* <li> Prohibit: Check for any characters that are not allowed in the83* output. If any are found, return an error.</li>84* <li> Check bidi: Possibly check for right-to-left characters, and if85* any are found, make sure that the whole string satisfies the86* requirements for bidirectional strings. If the string does not87* satisfy the requirements for bidirectional strings, return an88* error. </li>89* </ol>90* @author Ram Viswanadha91* @draft ICU 2.892*/93public final class StringPrep {94/**95* Option to prohibit processing of unassigned code points in the input96*97* @see #prepare98* @draft ICU 2.899*/100public static final int DEFAULT = 0x0000;101102/**103* Option to allow processing of unassigned code points in the input104*105* @see #prepare106* @draft ICU 2.8107*/108public static final int ALLOW_UNASSIGNED = 0x0001;109110private static final int UNASSIGNED = 0x0000;111private static final int MAP = 0x0001;112private static final int PROHIBITED = 0x0002;113private static final int DELETE = 0x0003;114private static final int TYPE_LIMIT = 0x0004;115116private static final int NORMALIZATION_ON = 0x0001;117private static final int CHECK_BIDI_ON = 0x0002;118119private static final int TYPE_THRESHOLD = 0xFFF0;120private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/121private static final int MAX_INDEX_TOP_LENGTH = 0x0003;122123/* indexes[] value names */124private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */125private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */126private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in 
NormalizationCorrections.txt */127private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */128private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */129private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;130private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;131private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */132private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */133134135/**136* Default buffer size of datafile137*/138private static final int DATA_BUFFER_SIZE = 25000;139140/* Wrappers for Trie implementations */141private static final class StringPrepTrieImpl implements Trie.DataManipulate{142private CharTrie sprepTrie = null;143/**144* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's145* data the index array offset of the indexes for that lead surrogate.146* @param property data value for a surrogate from the trie, including147* the folding offset148* @return data offset or 0 if there is no data for the lead surrogate149*/150public int getFoldingOffset(int value){151return value;152}153}154155// CharTrie implementation for reading the trie data156private StringPrepTrieImpl sprepTrieImpl;157// Indexes read from the data file158private int[] indexes;159// mapping data read from the data file160private char[] mappingData;161// format version of the data file162private byte[] formatVersion;163// the version of Unicode supported by the data file164private VersionInfo sprepUniVer;165// the Unicode version of last entry in the166// NormalizationCorrections.txt file if normalization167// is turned on168private VersionInfo normCorrVer;169// Option to turn on Normalization170private boolean doNFKC;171// Option to turn on checking for BiDi rules172private boolean checkBiDi;173174175private char 
getCodePointValue(int ch){176return sprepTrieImpl.sprepTrie.getCodePointValue(ch);177}178179private static VersionInfo getVersionInfo(int comp){180int micro = comp & 0xFF;181int milli =(comp >> 8) & 0xFF;182int minor =(comp >> 16) & 0xFF;183int major =(comp >> 24) & 0xFF;184return VersionInfo.getInstance(major,minor,milli,micro);185}186private static VersionInfo getVersionInfo(byte[] version){187if(version.length != 4){188return null;189}190return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);191}192/**193* Creates an StringPrep object after reading the input stream.194* The object does not hold a reference to the input steam, so the stream can be195* closed after the method returns.196*197* @param inputStream The stream for reading the StringPrep profile binarySun198* @throws IOException199* @draft ICU 2.8200*/201public StringPrep(InputStream inputStream) throws IOException{202203BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);204205StringPrepDataReader reader = new StringPrepDataReader(b);206207// read the indexes208indexes = reader.readIndexes(INDEX_TOP);209210byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];211212213//indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes214mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];215// load the rest of the data data and initialize the data members216reader.read(sprepBytes,mappingData);217218sprepTrieImpl = new StringPrepTrieImpl();219sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl );220221// get the data format version222formatVersion = reader.getDataFormatVersion();223224// get the options225doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);226checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);227sprepUniVer = getVersionInfo(reader.getUnicodeVersion());228normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);229VersionInfo normUniVer = 
NormalizerImpl.getUnicodeVersion();230if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */231normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */232((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/233){234throw new IOException("Normalization Correction version not supported");235}236b.close();237}238239private static final class Values{240boolean isIndex;241int value;242int type;243public void reset(){244isIndex = false;245value = 0;246type = -1;247}248}249250private static final void getValues(char trieWord,Values values){251values.reset();252if(trieWord == 0){253/*254* Initial value stored in the mapping table255* just return TYPE_LIMIT .. so that256* the source codepoint is copied to the destination257*/258values.type = TYPE_LIMIT;259}else if(trieWord >= TYPE_THRESHOLD){260values.type = (trieWord - TYPE_THRESHOLD);261}else{262/* get the type */263values.type = MAP;264/* ascertain if the value is index or delta */265if((trieWord & 0x02)>0){266values.isIndex = true;267values.value = trieWord >> 2; //mask off the lower 2 bits and shift268269}else{270values.isIndex = false;271values.value = (trieWord<<16)>>16;272values.value = (values.value >> 2);273274}275276if((trieWord>>2) == MAX_INDEX_VALUE){277values.type = DELETE;278values.isIndex = false;279values.value = 0;280}281}282}283284285286private StringBuffer map( UCharacterIterator iter, int options)287throws ParseException {288289Values val = new Values();290char result = 0;291int ch = UCharacterIterator.DONE;292StringBuffer dest = new StringBuffer();293boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);294295while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){296297result = getCodePointValue(ch);298getValues(result,val);299300// check if the source codepoint is 
unassigned301if(val.type == UNASSIGNED && allowUnassigned == false){302throw new ParseException("An unassigned code point was found in the input " +303iter.getText(), iter.getIndex());304}else if((val.type == MAP)){305int index, length;306307if(val.isIndex){308index = val.value;309if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&310index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){311length = 1;312}else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&313index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){314length = 2;315}else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&316index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){317length = 3;318}else{319length = mappingData[index++];320}321/* copy mapping to destination */322dest.append(mappingData,index,length);323continue;324325}else{326ch -= val.value;327}328}else if(val.type == DELETE){329// just consume the codepoint and contine330continue;331}332//copy the source into destination333UTF16.append(dest,ch);334}335336return dest;337}338339340private StringBuffer normalize(StringBuffer src){341/*342* Option UNORM_BEFORE_PRI_29:343*344* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)345* requires strict adherence to Unicode 3.2 normalization,346* including buggy composition from before fixing Public Review Issue #29.347* Note that this results in some valid but nonsensical text to be348* either corrupted or rejected, depending on the text.349* See http://www.unicode.org/review/resolved-pri.html#pri29350* See unorm.cpp and cnormtst.c351*/352return new StringBuffer(353Normalizer.normalize(354src.toString(),355java.text.Normalizer.Form.NFKC,356Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));357}358/*359boolean isLabelSeparator(int ch){360int result = getCodePointValue(ch);361if( (result & 0x07) == LABEL_SEPARATOR){362return true;363}364return false;365}366*/367/*3681) Map -- For each character in the input, check if it has a mapping369and, if so, replace it with its 
mapping.3703712) Normalize -- Possibly normalize the result of step 1 using Unicode372normalization.3733743) Prohibit -- Check for any characters that are not allowed in the375output. If any are found, return an error.3763774) Check bidi -- Possibly check for right-to-left characters, and if378any are found, make sure that the whole string satisfies the379requirements for bidirectional strings. If the string does not380satisfy the requirements for bidirectional strings, return an381error.382[Unicode3.2] defines several bidirectional categories; each character383has one bidirectional category assigned to it. For the purposes of384the requirements below, an "RandALCat character" is a character that385has Unicode bidirectional categories "R" or "AL"; an "LCat character"386is a character that has Unicode bidirectional category "L". Note387388389that there are many characters which fall in neither of the above390definitions; Latin digits (<U+0030> through <U+0039>) are examples of391this because they have bidirectional category "EN".392393In any profile that specifies bidirectional character handling, all394three of the following requirements MUST be met:3953961) The characters in section 5.8 MUST be prohibited.3973982) If a string contains any RandALCat character, the string MUST NOT399contain any LCat character.4004013) If a string contains any RandALCat character, a RandALCat402character MUST be the first character of the string, and a403RandALCat character MUST be the last character of the string.404*/405/**406* Prepare the input buffer for use in applications with the given profile. 
This operation maps, normalizes(NFKC),407* checks for prohited and BiDi characters in the order defined by RFC 3454408* depending on the options specified in the profile.409*410* @param src A UCharacterIterator object containing the source string411* @param options A bit set of options:412*413* - StringPrep.NONE Prohibit processing of unassigned code points in the input414*415* - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input416* as normal Unicode code points.417*418* @return StringBuffer A StringBuffer containing the output419* @throws ParseException420* @draft ICU 2.8421*/422public StringBuffer prepare(UCharacterIterator src, int options)423throws ParseException{424425// map426StringBuffer mapOut = map(src,options);427StringBuffer normOut = mapOut;// initialize428429if(doNFKC){430// normalize431normOut = normalize(mapOut);432}433434int ch;435char result;436UCharacterIterator iter = UCharacterIterator.getInstance(normOut);437Values val = new Values();438int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,439firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;440int rtlPos=-1, ltrPos=-1;441boolean rightToLeft=false, leftToRight=false;442443while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){444result = getCodePointValue(ch);445getValues(result,val);446447if(val.type == PROHIBITED ){448throw new ParseException("A prohibited code point was found in the input" +449iter.getText(), val.value);450}451452direction = UCharacter.getDirection(ch);453if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){454firstCharDir = direction;455}456if(direction == UCharacterDirection.LEFT_TO_RIGHT){457leftToRight = true;458ltrPos = iter.getIndex()-1;459}460if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){461rightToLeft = true;462rtlPos = iter.getIndex()-1;463}464}465if(checkBiDi == true){466// satisfy 2467if( leftToRight == true && rightToLeft == true){468throw new 
ParseException("The input does not conform to the rules for BiDi code points." +469iter.getText(),470(rtlPos>ltrPos) ? rtlPos : ltrPos);471}472473//satisfy 3474if( rightToLeft == true &&475!((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&476(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))477){478throw new ParseException("The input does not conform to the rules for BiDi code points." +479iter.getText(),480(rtlPos>ltrPos) ? rtlPos : ltrPos);481}482}483return normOut;484485}486}487488489