// Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/net/idn/Punycode.java
// 38918 views
/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * Copyright (C) 2003-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
//
// CHANGELOG
//      2005-05-19 Edward Wang
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
//          - move from package com.ibm.icu.text to package sun.net.idn
//          - use ParseException instead of StringPrepParseException
//      2007-08-14 Martin Buchholz
//          - remove redundant casts
//
package sun.net.idn;

import java.text.ParseException;
import sun.text.normalizer.UCharacter;
import sun.text.normalizer.UTF16;

/**
 * Punycode encoder and decoder (Bootstring with the Punycode parameters).
 * Ported code from ICU punycode.c.
 *
 * <p>All members are static; the class holds no state.
 *
 * <p>NOTE(review): an earlier comment at this spot described the class as
 * package private, but it is declared {@code public}.
 *
 * @author ram
 */
public final class Punycode {

    /* Punycode parameters for Bootstring */
    private static final int BASE           = 36;
    private static final int TMIN           = 1;
    private static final int TMAX           = 26;
    private static final int SKEW           = 38;
    private static final int DAMP           = 700;
    private static final int INITIAL_BIAS   = 72;
    private static final int INITIAL_N      = 0x80;

    /* "Basic" Unicode/ASCII code points */
    private static final int HYPHEN         = 0x2d;
    private static final int DELIMITER      = HYPHEN;

    private static final int ZERO           = 0x30;
    private static final int NINE           = 0x39;

    private static final int SMALL_A        = 0x61;
    private static final int SMALL_Z        = 0x7a;

    private static final int CAPITAL_A      = 0x41;
    private static final int CAPITAL_Z      = 0x5a;

    /*
     * Maximum number of input code points accepted by encode(); it sizes the
     * code point work buffer there. The output buffers of encode() and
     * decode() are no longer bound by this limit.
     */
    // TODO: eliminate the 256 limitation
    private static final int MAX_CP_COUNT   = 256;

    private static final int UINT_MAGIC     = 0x80000000;
    private static final long ULONG_MAGIC   = 0x8000000000000000L;

    /**
     * Bootstring bias adaptation: computes the next bias from the delta that
     * was just encoded or decoded.
     *
     * @param delta     the delta just processed
     * @param length    number of code points handled so far, including the
     *                  current one; callers in this class always pass a value
     *                  of at least 1
     * @param firstTime true for the first delta of a string, which is damped
     *                  by DAMP instead of being halved
     * @return the new bias
     */
    private static int adaptBias(int delta, int length, boolean firstTime) {
        if (firstTime) {
            delta /= DAMP;
        } else {
            delta /= 2;
        }
        delta += delta / length;

        int count = 0;
        for (; delta > ((BASE - TMIN) * TMAX) / 2; count += BASE) {
            delta /= (BASE - TMIN);
        }

        return count + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
    }

    /**
     * basicToDigit[] contains the numeric value of a basic code
     * point (for use in representing integers) in the range 0 to
     * BASE-1, or -1 if b does not represent a value.
     * The table has 256 entries: letters (either case) map to 0..25,
     * the digits '0'..'9' map to 26..35, everything else maps to -1.
     */
    static final int[] basicToDigit = new int[]{
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

    /**
     * Forces the case of an ASCII letter; any other character is returned
     * unchanged.
     *
     * @param b         the character to map
     * @param uppercase true to map a..z to A..Z, false to map A..Z to a..z
     * @return the case-mapped character
     */
    private static char asciiCaseMap(char b, boolean uppercase) {
        if (uppercase) {
            if (SMALL_A <= b && b <= SMALL_Z) {
                b -= (SMALL_A - CAPITAL_A);
            }
        } else {
            if (CAPITAL_A <= b && b <= CAPITAL_Z) {
                b += (SMALL_A - CAPITAL_A);
            }
        }
        return b;
    }

    /**
     * digitToBasic() returns the basic code point whose value
     * (when used for representing integers) is d, which must be in the
     * range 0 to BASE-1. The lowercase form is used unless the uppercase
     * flag is true, in which case the uppercase form is used. The flag has
     * no effect on the digits 26..35.
     *
     * @param digit     the value to represent, 0 to BASE-1
     * @param uppercase whether a letter result should be uppercase
     * @return the basic code point for {@code digit}
     */
    private static char digitToBasic(int digit, boolean uppercase) {
        /*  0..25 map to ASCII a..z or A..Z */
        /* 26..35 map to ASCII 0..9         */
        if (digit < 26) {
            if (uppercase) {
                return (char) (CAPITAL_A + digit);
            } else {
                return (char) (SMALL_A + digit);
            }
        } else {
            return (char) ((ZERO - 26) + digit);
        }
    }

    /**
     * Converts Unicode to Punycode.
     * The input string must not contain single, unpaired surrogates.
     * The output is returned as a new StringBuffer of ASCII characters; its
     * length is not limited.
     *
     * @param src       the UTF-16 text to encode; at most MAX_CP_COUNT (256)
     *                  code points
     * @param caseFlags may be null; otherwise indexed by UTF-16 position in
     *                  {@code src}. A true flag uppercases a basic letter,
     *                  and for a non-basic code point (flag read at the
     *                  position of its first code unit) makes the final
     *                  digit of its encoding uppercase.
     * @return the Punycode form of {@code src}
     * @throws ParseException if {@code src} has more than MAX_CP_COUNT code
     *                  points or contains an unpaired surrogate
     * @throws RuntimeException if the delta computation would overflow
     */
    public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException {

        int[] cpBuffer = new int[MAX_CP_COUNT];
        int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
        char c, c2;
        int srcLength = src.length();
        /*
         * Output is appended straight to the growable result buffer. A fixed
         * 256-char array used here previously dropped output silently once it
         * was full and could make the final copy fail with an index exception.
         */
        StringBuffer result = new StringBuffer();

        /*
         * Handle the basic code points and
         * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
         */
        srcCPCount = 0;

        for (j = 0; j < srcLength; ++j) {
            if (srcCPCount == MAX_CP_COUNT) {
                /* too many input code points */
                throw new ParseException("Too many input code points", -1);
            }
            c = src.charAt(j);
            if (isBasic(c)) {
                /* basic code points are recorded as 0 so the main loop skips them */
                cpBuffer[srcCPCount++] = 0;
                result.append(caseFlags != null ? asciiCaseMap(c, caseFlags[j]) : c);
            } else {
                n = ((caseFlags != null && caseFlags[j]) ? 1 : 0) << 31;
                if (!UTF16.isSurrogate(c)) {
                    n |= c;
                } else if (UTF16.isLeadSurrogate(c) && (j + 1) < srcLength
                           && UTF16.isTrailSurrogate(c2 = src.charAt(j + 1))) {
                    ++j;

                    n |= UCharacter.getCodePoint(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    throw new ParseException("Illegal char found", -1);
                }
                cpBuffer[srcCPCount++] = n;
            }
        }

        /* Finish the basic string - if it is not empty - with a delimiter. */
        basicLength = result.length();
        if (basicLength > 0) {
            /* the cast matters: append(int) would append the decimal text "45" */
            result.append((char) DELIMITER);
        }

        /*
         * handledCPCount is the number of code points that have been handled
         * basicLength is the number of basic code points
         */

        /* Initialize the state: */
        n = INITIAL_N;
        delta = 0;
        bias = INITIAL_BIAS;

        /* Main encoding loop: */
        for (handledCPCount = basicLength; handledCPCount < srcCPCount; /* no op */) {
            /*
             * All non-basic code points < n have been handled already.
             * Find the next larger one:
             */
            for (m = 0x7fffffff, j = 0; j < srcCPCount; ++j) {
                q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
                if (n <= q && q < m) {
                    m = q;
                }
            }

            /*
             * Increase delta enough to advance the decoder's
             * <n,i> state to <m,0>, but guard against overflow:
             */
            if (m - n > (0x7fffffff - MAX_CP_COUNT - delta) / (handledCPCount + 1)) {
                throw new RuntimeException("Internal program error");
            }
            delta += (m - n) * (handledCPCount + 1);
            n = m;

            /* Encode a sequence of same code points n */
            for (j = 0; j < srcCPCount; ++j) {
                q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
                if (q < n) {
                    ++delta;
                } else if (q == n) {
                    /* Represent delta as a generalized variable-length integer: */
                    for (q = delta, k = BASE; /* no condition */; k += BASE) {

                        /*
                         * The upper clamp is written as k>=(bias+TMAX) rather
                         * than t>TMAX, for conformance with
                         * draft-ietf-idn-punycode-03.txt.
                         */
                        t = k - bias;
                        if (t < TMIN) {
                            t = TMIN;
                        } else if (k >= (bias + TMAX)) {
                            t = TMAX;
                        }

                        if (q < t) {
                            break;
                        }

                        result.append(digitToBasic(t + (q - t) % (BASE - t), false));
                        q = (q - t) / (BASE - t);
                    }

                    /* last digit; its case carries the code point's case flag */
                    result.append(digitToBasic(q, (cpBuffer[j] < 0)));
                    bias = adaptBias(delta, handledCPCount + 1, (handledCPCount == basicLength));
                    delta = 0;
                    ++handledCPCount;
                }
            }

            ++delta;
            ++n;
        }

        return result;
    }

    /** Returns true if ch is below INITIAL_N (0x80), i.e. an ASCII code point. */
    private static boolean isBasic(int ch) {
        return (ch < INITIAL_N);
    }

    /** Returns true if ch is an ASCII uppercase letter A..Z. */
    private static boolean isBasicUpperCase(int ch) {
        return (CAPITAL_A <= ch && ch <= CAPITAL_Z);
    }

    /** Returns true if ch lies in the surrogate range U+D800..U+DFFF. */
    private static boolean isSurrogate(int ch) {
        return (((ch) & 0xfffff800) == 0xd800);
    }

    /**
     * Converts Punycode to Unicode.
     * The Unicode string will have at most as many code points as the
     * Punycode string has characters; counted in UTF-16 code units it can be
     * longer, because a supplementary code point takes two units.
     *
     * @param src       the Punycode text to decode
     * @param caseFlags may be null; otherwise receives, per UTF-16 position
     *                  of the output, whether the corresponding basic
     *                  character (or the last digit of an encoded code point)
     *                  was an uppercase letter. The array is written in
     *                  place, so the caller must make it at least as long as
     *                  the decoded output.
     * @return the decoded text in a new StringBuffer
     * @throws ParseException if {@code src} is not valid Punycode: a
     *                  non-ASCII character, a character that is not a digit
     *                  where one is required, a truncated or overflowing
     *                  integer, or a result outside the valid code point
     *                  range
     */
    public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
                               throws ParseException {
        int srcLength = src.length();
        StringBuffer result = new StringBuffer();
        int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
        char b;

        /*
         * Handle the basic code points:
         * Let basicLength be the number of input code points
         * before the last delimiter, or 0 if there is none,
         * then copy the first basicLength code points to the output.
         *
         * The two following loops iterate backward.
         */
        for (j = srcLength; j > 0;) {
            if (src.charAt(--j) == DELIMITER) {
                break;
            }
        }
        destLength = basicLength = destCPCount = j;

        /*
         * Size the output from the input instead of using a fixed 256-char
         * array. Every decoded code point consumes at least one of the
         * (srcLength - basicLength) characters after the basic part and adds
         * at most two UTF-16 units, so this capacity can never be exceeded.
         * The fixed array needed capacity guards that, when they triggered,
         * returned a corrupted result or failed with an index exception.
         */
        char[] dest = new char[basicLength + 2 * (srcLength - basicLength)];

        while (j > 0) {
            b = src.charAt(--j);
            if (!isBasic(b)) {
                throw new ParseException("Illegal char found", -1);
            }

            dest[j] = b;

            if (caseFlags != null) {
                caseFlags[j] = isBasicUpperCase(b);
            }
        }

        /* Initialize the state: */
        n = INITIAL_N;
        i = 0;
        bias = INITIAL_BIAS;
        firstSupplementaryIndex = 1000000000;

        /*
         * Main decoding loop:
         * Start just after the last delimiter if any
         * basic code points were copied; start at the beginning otherwise.
         */
        for (in = basicLength > 0 ? basicLength + 1 : 0; in < srcLength; /* no op */) {
            /*
             * in is the index of the next character to be consumed, and
             * destCPCount is the number of code points in the output array.
             *
             * Decode a generalized variable-length integer into delta,
             * which gets added to i.  The overflow checking is easier
             * if we increase i as we go, then subtract off its starting
             * value at the end to obtain delta.
             */
            for (oldi = i, w = 1, k = BASE; /* no condition */; k += BASE) {
                if (in >= srcLength) {
                    throw new ParseException("Illegal char found", -1);
                }

                /*
                 * Index the table with the char itself. Narrowing to byte
                 * made 0x80..0xFF a negative index (unchecked exception) and
                 * let chars >= 0x100 alias onto ASCII digits.
                 */
                b = src.charAt(in++);
                digit = (b < basicToDigit.length) ? basicToDigit[b] : -1;
                if (digit < 0) {
                    throw new ParseException("Invalid char found", -1);
                }
                if (digit > (0x7fffffff - i) / w) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }

                i += digit * w;
                t = k - bias;
                if (t < TMIN) {
                    t = TMIN;
                } else if (k >= (bias + TMAX)) {
                    t = TMAX;
                }
                if (digit < t) {
                    break;
                }

                if (w > 0x7fffffff / (BASE - t)) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }
                w *= BASE - t;
            }

            /*
             * Modification from sample code:
             * Increments destCPCount here,
             * where needed instead of in for() loop tail.
             */
            ++destCPCount;
            bias = adaptBias(i - oldi, destCPCount, (oldi == 0));

            /*
             * i was supposed to wrap around from (incremented) destCPCount to 0,
             * incrementing n each time, so we'll fix that now:
             */
            if (i / destCPCount > (0x7fffffff - n)) {
                /* integer overflow */
                throw new ParseException("Illegal char found", -1);
            }

            n += i / destCPCount;
            i %= destCPCount;
            /* not needed for Punycode: */
            /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

            if (n > 0x10ffff || isSurrogate(n)) {
                /* Unicode code point overflow */
                throw new ParseException("Illegal char found", -1);
            }

            /* Insert n at position i of the output: */
            cpLength = UTF16.getCharCount(n);
            int codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if (i <= firstSupplementaryIndex) {
                codeUnitIndex = i;
                if (cpLength > 1) {
                    firstSupplementaryIndex = codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex = firstSupplementaryIndex;
                codeUnitIndex = UTF16.moveCodePointOffset(dest, 0, destLength,
                                                          codeUnitIndex, i - codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if (codeUnitIndex < destLength) {
                /* make room: shift the tail (and its case flags) right by cpLength */
                System.arraycopy(dest, codeUnitIndex,
                                 dest, codeUnitIndex + cpLength,
                                 (destLength - codeUnitIndex));
                if (caseFlags != null) {
                    System.arraycopy(caseFlags, codeUnitIndex,
                                     caseFlags, codeUnitIndex + cpLength,
                                     destLength - codeUnitIndex);
                }
            }
            if (cpLength == 1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex] = (char) n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex] = UTF16.getLeadSurrogate(n);
                dest[codeUnitIndex + 1] = UTF16.getTrailSurrogate(n);
            }
            if (caseFlags != null) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex] = isBasicUpperCase(src.charAt(in - 1));
                if (cpLength == 2) {
                    caseFlags[codeUnitIndex + 1] = false;
                }
            }

            destLength += cpLength;
            ++i;
        }
        result.append(dest, 0, destLength);
        return result;
    }
}