Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/java/lang/ConditionalSpecialCasing.java
38829 views
/*1* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425package java.lang;2627import java.text.BreakIterator;28import java.util.HashSet;29import java.util.Hashtable;30import java.util.Iterator;31import java.util.Locale;32import sun.text.Normalizer;333435/**36* This is a utility class for <code>String.toLowerCase()</code> and37* <code>String.toUpperCase()</code>, that handles special casing with38* conditions. In other words, it handles the mappings with conditions39* that are defined in40* <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special41* Casing Properties</a> file.42* <p>43* Note that the unconditional case mappings (including 1:M mappings)44* are handled in <code>Character.toLower/UpperCase()</code>.45*/46final class ConditionalSpecialCasing {4748// context conditions.49final static int FINAL_CASED = 1;50final static int AFTER_SOFT_DOTTED = 2;51final static int MORE_ABOVE = 3;52final static int AFTER_I = 4;53final static int NOT_BEFORE_DOT = 5;5455// combining class definitions56final static int COMBINING_CLASS_ABOVE = 230;5758// Special case mapping entries59static Entry[] entry = {60//# ================================================================================61//# Conditional mappings62//# ================================================================================63new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA64new Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, null, 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE6566//# ================================================================================67//# Locale-sensitive mappings68//# ================================================================================69//# Lithuanian70new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE71new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I72new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J73new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK74new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE75new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE76new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE7778//# ================================================================================79//# Turkish and Azeri80new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE81new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE82new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE83new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE84new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I85new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I86new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I87new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I88};8990// A hash table that contains the above entries91static Hashtable<Integer, HashSet<Entry>> entryTable = new Hashtable<>();92static {93// create hashtable from the entry94for (int i = 0; i < entry.length; i ++) {95Entry cur = entry[i];96Integer cp = new Integer(cur.getCodePoint());97HashSet<Entry> set = entryTable.get(cp);98if (set == null) {99set = new HashSet<Entry>();100}101set.add(cur);102entryTable.put(cp, set);103}104}105106static int toLowerCaseEx(String src, int index, Locale locale) {107char[] result = lookUpTable(src, index, locale, true);108109if (result != null) {110if (result.length == 1) {111return result[0];112} else {113return Character.ERROR;114}115} else {116// default to Character class' one117return Character.toLowerCase(src.codePointAt(index));118}119}120121static int toUpperCaseEx(String src, int index, Locale locale) {122char[] result = lookUpTable(src, index, locale, false);123124if (result != null) {125if (result.length == 1) {126return result[0];127} else {128return Character.ERROR;129}130} else {131// default to Character class' one132return Character.toUpperCaseEx(src.codePointAt(index));133}134}135136static char[] toLowerCaseCharArray(String src, int index, Locale locale) {137return lookUpTable(src, index, locale, true);138}139140static char[] toUpperCaseCharArray(String src, int index, Locale locale) {141char[] result = lookUpTable(src, index, locale, false);142if (result != null) {143return result;144} else {145return Character.toUpperCaseCharArray(src.codePointAt(index));146}147}148149private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {150HashSet<Entry> set = entryTable.get(new Integer(src.codePointAt(index)));151char[] ret = null;152153if (set != null) {154Iterator<Entry> iter = set.iterator();155String currentLang = locale.getLanguage();156while (iter.hasNext()) {157Entry entry = iter.next();158String conditionLang = entry.getLanguage();159if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&160isConditionMet(src, index, locale, entry.getCondition())) {161ret = bLowerCasing ? entry.getLowerCase() : entry.getUpperCase();162if (conditionLang != null) {163break;164}165}166}167}168169return ret;170}171172private static boolean isConditionMet(String src, int index, Locale locale, int condition) {173switch (condition) {174case FINAL_CASED:175return isFinalCased(src, index, locale);176177case AFTER_SOFT_DOTTED:178return isAfterSoftDotted(src, index);179180case MORE_ABOVE:181return isMoreAbove(src, index);182183case AFTER_I:184return isAfterI(src, index);185186case NOT_BEFORE_DOT:187return !isBeforeDot(src, index);188189default:190return true;191}192}193194/**195* Implements the "Final_Cased" condition196*197* Specification: Within the closest word boundaries containing C, there is a cased198* letter before C, and there is no cased letter after C.199*200* Regular Expression:201* Before C: [{cased==true}][{wordBoundary!=true}]*202* After C: !([{wordBoundary!=true}]*[{cased}])203*/204private static boolean isFinalCased(String src, int index, Locale locale) {205BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);206wordBoundary.setText(src);207int ch;208209// Look for a preceding 'cased' letter210for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);211i -= Character.charCount(ch)) {212213ch = src.codePointBefore(i);214if (isCased(ch)) {215216int len = src.length();217// Check that there is no 'cased' letter after the index218for (i = index + Character.charCount(src.codePointAt(index));219(i < len) && !wordBoundary.isBoundary(i);220i += Character.charCount(ch)) {221222ch = src.codePointAt(i);223if (isCased(ch)) {224return false;225}226}227228return true;229}230}231232return false;233}234235/**236* Implements the "After_I" condition237*238* Specification: The last preceding base character was an uppercase I,239* and there is no intervening combining character class 230 (ABOVE).240*241* Regular Expression:242* Before C: [I]([{cc!=230}&{cc!=0}])*243*/244private static boolean isAfterI(String src, int index) {245int ch;246int cc;247248// Look for the last preceding base character249for (int i = index; i > 0; i -= Character.charCount(ch)) {250251ch = src.codePointBefore(i);252253if (ch == 'I') {254return true;255} else {256cc = Normalizer.getCombiningClass(ch);257if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {258return false;259}260}261}262263return false;264}265266/**267* Implements the "After_Soft_Dotted" condition268*269* Specification: The last preceding character with combining class270* of zero before C was Soft_Dotted, and there is no intervening271* combining character class 230 (ABOVE).272*273* Regular Expression:274* Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*275*/276private static boolean isAfterSoftDotted(String src, int index) {277int ch;278int cc;279280// Look for the last preceding character281for (int i = index; i > 0; i -= Character.charCount(ch)) {282283ch = src.codePointBefore(i);284285if (isSoftDotted(ch)) {286return true;287} else {288cc = Normalizer.getCombiningClass(ch);289if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {290return false;291}292}293}294295return false;296}297298/**299* Implements the "More_Above" condition300*301* Specification: C is followed by one or more characters of combining302* class 230 (ABOVE) in the combining character sequence.303*304* Regular Expression:305* After C: [{cc!=0}]*[{cc==230}]306*/307private static boolean isMoreAbove(String src, int index) {308int ch;309int cc;310int len = src.length();311312// Look for a following ABOVE combining class character313for (int i = index + Character.charCount(src.codePointAt(index));314i < len; i += Character.charCount(ch)) {315316ch = src.codePointAt(i);317cc = Normalizer.getCombiningClass(ch);318319if (cc == COMBINING_CLASS_ABOVE) {320return true;321} else if (cc == 0) {322return false;323}324}325326return false;327}328329/**330* Implements the "Before_Dot" condition331*332* Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.333* Any sequence of characters with a combining class that is334* neither 0 nor 230 may intervene between the current character335* and the combining dot above.336*337* Regular Expression:338* After C: ([{cc!=230}&{cc!=0}])*[\u0307]339*/340private static boolean isBeforeDot(String src, int index) {341int ch;342int cc;343int len = src.length();344345// Look for a following COMBINING DOT ABOVE346for (int i = index + Character.charCount(src.codePointAt(index));347i < len; i += Character.charCount(ch)) {348349ch = src.codePointAt(i);350351if (ch == '\u0307') {352return true;353} else {354cc = Normalizer.getCombiningClass(ch);355if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {356return false;357}358}359}360361return false;362}363364/**365* Examines whether a character is 'cased'.366*367* A character C is defined to be 'cased' if and only if at least one of368* following are true for C: uppercase==true, or lowercase==true, or369* general_category==titlecase_letter.370*371* The uppercase and lowercase property values are specified in the data372* file DerivedCoreProperties.txt in the Unicode Character Database.373*/374private static boolean isCased(int ch) {375int type = Character.getType(ch);376if (type == Character.LOWERCASE_LETTER ||377type == Character.UPPERCASE_LETTER ||378type == Character.TITLECASE_LETTER) {379return true;380} else {381// Check for Other_Lowercase and Other_Uppercase382//383if ((ch >= 0x02B0) && (ch <= 0x02B8)) {384// MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y385return true;386} else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {387// MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP388return true;389} else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {390// MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP391return true;392} else if (ch == 0x0345) {393// COMBINING GREEK YPOGEGRAMMENI394return true;395} else if (ch == 0x037A) {396// GREEK YPOGEGRAMMENI397return true;398} else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {399// MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI400return true;401} else if ((ch >= 0x2160) && (ch <= 0x217F)) {402// ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND403// SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND404return true;405} else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {406// CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z407// CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z408return true;409} else {410return false;411}412}413}414415private static boolean isSoftDotted(int ch) {416switch (ch) {417case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I418case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J419case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK420case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE421case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I422case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE423case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I424case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW425case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW426case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I427return true;428default:429return false;430}431}432433/**434* An internal class that represents an entry in the Special Casing Properties.435*/436static class Entry {437int ch;438char [] lower;439char [] upper;440String lang;441int condition;442443Entry(int ch, char[] lower, char[] upper, String lang, int condition) {444this.ch = ch;445this.lower = lower;446this.upper = upper;447this.lang = lang;448this.condition = condition;449}450451int getCodePoint() {452return ch;453}454455char[] getLowerCase() {456return lower;457}458459char[] getUpperCase() {460return upper;461}462463String getLanguage() {464return lang;465}466467int getCondition() {468return condition;469}470}471}472473474