// Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/java/net/IDN.java
// 38829 views
/*1* Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/24package java.net;2526import java.io.InputStream;27import java.io.IOException;28import java.security.AccessController;29import java.security.PrivilegedAction;3031import sun.net.idn.StringPrep;32import sun.net.idn.Punycode;33import sun.text.normalizer.UCharacterIterator;3435/**36* Provides methods to convert internationalized domain names (IDNs) between37* a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.38* Internationalized domain names can use characters from the entire range of39* Unicode, while traditional domain names are restricted to ASCII characters.40* ACE is an encoding of Unicode strings that uses only ASCII characters and41* can be used with software (such as the Domain Name System) that only42* understands traditional domain names.43*44* 
<p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.45* RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ46* <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a47* profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and48* <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert49* domain name string back and forth.50*51* <p>The behavior of aforementioned conversion process can be adjusted by various flags:52* <ul>53* <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted54* can contain code points that are unassigned in Unicode 3.2, which is the55* Unicode version on which IDN conversion is based. If the flag is not used,56* the presence of such unassigned code points is treated as an error.57* <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.58* It is an error if they don't meet the requirements.59* </ul>60* These flags can be logically OR'ed together.61*62* <p>The security consideration is important with respect to internationalization63* domain name support. 
For example, English domain names may be <i>homographed</i>64* - maliciously misspelled by substitution of non-Latin letters.65* <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>66* discusses security issues of IDN support as well as possible solutions.67* Applications are responsible for taking adequate security measures when using68* international domain names.69*70* @author Edward Wang71* @since 1.672*73*/74public final class IDN {75/**76* Flag to allow processing of unassigned code points77*/78public static final int ALLOW_UNASSIGNED = 0x01;7980/**81* Flag to turn on the check against STD-3 ASCII rules82*/83public static final int USE_STD3_ASCII_RULES = 0x02;848586/**87* Translates a string from Unicode to ASCII Compatible Encoding (ACE),88* as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.89*90* <p>ToASCII operation can fail. ToASCII fails if any step of it fails.91* If ToASCII operation fails, an IllegalArgumentException will be thrown.92* In this case, the input string should not be used in an internationalized domain name.93*94* <p> A label is an individual part of a domain name. The original ToASCII operation,95* as defined in RFC 3490, only operates on a single label. This method can handle96* both label and entire domain name, by assuming that labels in a domain name are97* always separated by dots. The following characters are recognized as dots:98* \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),99* and \uFF61 (halfwidth ideographic full stop). 
if dots are100* used as label separators, this method also changes all of them to \u002E (full stop)101* in output translated string.102*103* @param input the string to be processed104* @param flag process flag; can be 0 or any logical OR of possible flags105*106* @return the translated {@code String}107*108* @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification109*/110public static String toASCII(String input, int flag)111{112int p = 0, q = 0;113StringBuffer out = new StringBuffer();114115if (isRootLabel(input)) {116return ".";117}118119while (p < input.length()) {120q = searchDots(input, p);121out.append(toASCIIInternal(input.substring(p, q), flag));122if (q != (input.length())) {123// has more labels, or keep the trailing dot as at present124out.append('.');125}126p = q + 1;127}128129return out.toString();130}131132133/**134* Translates a string from Unicode to ASCII Compatible Encoding (ACE),135* as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.136*137* <p> This convenience method works as if by invoking the138* two-argument counterpart as follows:139* <blockquote>140* {@link #toASCII(String, int) toASCII}(input, 0);141* </blockquote>142*143* @param input the string to be processed144*145* @return the translated {@code String}146*147* @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification148*/149public static String toASCII(String input) {150return toASCII(input, 0);151}152153154/**155* Translates a string from ASCII Compatible Encoding (ACE) to Unicode,156* as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.157*158* <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.159*160* <p> A label is an individual part of a domain name. The original ToUnicode operation,161* as defined in RFC 3490, only operates on a single label. 
This method can handle162* both label and entire domain name, by assuming that labels in a domain name are163* always separated by dots. The following characters are recognized as dots:164* \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),165* and \uFF61 (halfwidth ideographic full stop).166*167* @param input the string to be processed168* @param flag process flag; can be 0 or any logical OR of possible flags169*170* @return the translated {@code String}171*/172public static String toUnicode(String input, int flag) {173int p = 0, q = 0;174StringBuffer out = new StringBuffer();175176if (isRootLabel(input)) {177return ".";178}179180while (p < input.length()) {181q = searchDots(input, p);182out.append(toUnicodeInternal(input.substring(p, q), flag));183if (q != (input.length())) {184// has more labels, or keep the trailing dot as at present185out.append('.');186}187p = q + 1;188}189190return out.toString();191}192193194/**195* Translates a string from ASCII Compatible Encoding (ACE) to Unicode,196* as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.197*198* <p> This convenience method works as if by invoking the199* two-argument counterpart as follows:200* <blockquote>201* {@link #toUnicode(String, int) toUnicode}(input, 0);202* </blockquote>203*204* @param input the string to be processed205*206* @return the translated {@code String}207*/208public static String toUnicode(String input) {209return toUnicode(input, 0);210}211212213/* ---------------- Private members -------------- */214215// ACE Prefix is "xn--"216private static final String ACE_PREFIX = "xn--";217private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();218219private static final int MAX_LABEL_LENGTH = 63;220221// single instance of nameprep222private static StringPrep namePrep = null;223224static {225InputStream stream = null;226227try {228final String IDN_PROFILE = "uidna.spp";229if (System.getSecurityManager() != 
null) {230stream = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {231public InputStream run() {232return StringPrep.class.getResourceAsStream(IDN_PROFILE);233}234});235} else {236stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);237}238239namePrep = new StringPrep(stream);240stream.close();241} catch (IOException e) {242// should never reach here243assert false;244}245}246247248/* ---------------- Private operations -------------- */249250251//252// to suppress the default zero-argument constructor253//254private IDN() {}255256//257// toASCII operation; should only apply to a single label258//259private static String toASCIIInternal(String label, int flag)260{261// step 1262// Check if the string contains code points outside the ASCII range 0..0x7c.263boolean isASCII = isAllASCII(label);264StringBuffer dest;265266// step 2267// perform the nameprep operation; flag ALLOW_UNASSIGNED is used here268if (!isASCII) {269UCharacterIterator iter = UCharacterIterator.getInstance(label);270try {271dest = namePrep.prepare(iter, flag);272} catch (java.text.ParseException e) {273throw new IllegalArgumentException(e);274}275} else {276dest = new StringBuffer(label);277}278279// step 8, move forward to check the smallest number of the code points280// the length must be inside 1..63281if (dest.length() == 0) {282throw new IllegalArgumentException(283"Empty label is not a legal name");284}285286// step 3287// Verify the absence of non-LDH ASCII code points288// 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f289// Verify the absence of leading and trailing hyphen290boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);291if (useSTD3ASCIIRules) {292for (int i = 0; i < dest.length(); i++) {293int c = dest.charAt(i);294if (isNonLDHAsciiCodePoint(c)) {295throw new IllegalArgumentException(296"Contains non-LDH ASCII characters");297}298}299300if (dest.charAt(0) == '-' ||301dest.charAt(dest.length() - 1) == '-') {302303throw new 
IllegalArgumentException(304"Has leading or trailing hyphen");305}306}307308if (!isASCII) {309// step 4310// If all code points are inside 0..0x7f, skip to step 8311if (!isAllASCII(dest.toString())) {312// step 5313// verify the sequence does not begin with ACE prefix314if(!startsWithACEPrefix(dest)){315316// step 6317// encode the sequence with punycode318try {319dest = Punycode.encode(dest, null);320} catch (java.text.ParseException e) {321throw new IllegalArgumentException(e);322}323324dest = toASCIILower(dest);325326// step 7327// prepend the ACE prefix328dest.insert(0, ACE_PREFIX);329} else {330throw new IllegalArgumentException("The input starts with the ACE Prefix");331}332333}334}335336// step 8337// the length must be inside 1..63338if (dest.length() > MAX_LABEL_LENGTH) {339throw new IllegalArgumentException("The label in the input is too long");340}341342return dest.toString();343}344345//346// toUnicode operation; should only apply to a single label347//348private static String toUnicodeInternal(String label, int flag) {349boolean[] caseFlags = null;350StringBuffer dest;351352// step 1353// find out if all the codepoints in input are ASCII354boolean isASCII = isAllASCII(label);355356if(!isASCII){357// step 2358// perform the nameprep operation; flag ALLOW_UNASSIGNED is used here359try {360UCharacterIterator iter = UCharacterIterator.getInstance(label);361dest = namePrep.prepare(iter, flag);362} catch (Exception e) {363// toUnicode never fails; if any step fails, return the input string364return label;365}366} else {367dest = new StringBuffer(label);368}369370// step 3371// verify ACE Prefix372if(startsWithACEPrefix(dest)) {373374// step 4375// Remove the ACE Prefix376String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());377378try {379// step 5380// Decode using punycode381StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);382383// step 6384// Apply toASCII385String toASCIIOut = toASCII(decodeOut.toString(), flag);386387// 
step 7388// verify389if (toASCIIOut.equalsIgnoreCase(dest.toString())) {390// step 8391// return output of step 5392return decodeOut.toString();393}394} catch (Exception ignored) {395// no-op396}397}398399// just return the input400return label;401}402403404//405// LDH stands for "letter/digit/hyphen", with characters restricted to the406// 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen407// <->.408// Non LDH refers to characters in the ASCII range, but which are not409// letters, digits or the hypen.410//411// non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F412//413private static boolean isNonLDHAsciiCodePoint(int ch){414return (0x0000 <= ch && ch <= 0x002C) ||415(0x002E <= ch && ch <= 0x002F) ||416(0x003A <= ch && ch <= 0x0040) ||417(0x005B <= ch && ch <= 0x0060) ||418(0x007B <= ch && ch <= 0x007F);419}420421//422// search dots in a string and return the index of that character;423// or if there is no dots, return the length of input string424// dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),425// and \uFF61 (halfwidth ideographic full stop).426//427private static int searchDots(String s, int start) {428int i;429for (i = start; i < s.length(); i++) {430if (isLabelSeparator(s.charAt(i))) {431break;432}433}434435return i;436}437438//439// to check if a string is a root label, ".".440//441private static boolean isRootLabel(String s) {442return (s.length() == 1 && isLabelSeparator(s.charAt(0)));443}444445//446// to check if a character is a label separator, i.e. a dot character.447//448private static boolean isLabelSeparator(char c) {449return (c == '.' 
|| c == '\u3002' || c == '\uFF0E' || c == '\uFF61');450}451452//453// to check if a string only contains US-ASCII code point454//455private static boolean isAllASCII(String input) {456boolean isASCII = true;457for (int i = 0; i < input.length(); i++) {458int c = input.charAt(i);459if (c > 0x7F) {460isASCII = false;461break;462}463}464return isASCII;465}466467//468// to check if a string starts with ACE-prefix469//470private static boolean startsWithACEPrefix(StringBuffer input){471boolean startsWithPrefix = true;472473if(input.length() < ACE_PREFIX_LENGTH){474return false;475}476for(int i = 0; i < ACE_PREFIX_LENGTH; i++){477if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){478startsWithPrefix = false;479}480}481return startsWithPrefix;482}483484private static char toASCIILower(char ch){485if('A' <= ch && ch <= 'Z'){486return (char)(ch + 'a' - 'A');487}488return ch;489}490491private static StringBuffer toASCIILower(StringBuffer input){492StringBuffer dest = new StringBuffer();493for(int i = 0; i < input.length();i++){494dest.append(toASCIILower(input.charAt(i)));495}496return dest;497}498}499500501