Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/make/src/classes/build/tools/generatebreakiteratordata/CharSet.java
32287 views
/*1* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/*26* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved27* (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved28*29* The original version of this source code and documentation30* is copyrighted and owned by Taligent, Inc., a wholly-owned31* subsidiary of IBM. These materials are provided under terms32* of a License Agreement between Taligent and Sun. This technology33* is protected by multiple US and International patents.34*35* This notice and attribution to Taligent may not be removed.36* Taligent is a registered trademark of Taligent, Inc.37*/3839package build.tools.generatebreakiteratordata;4041import java.util.Arrays;42import java.util.Hashtable;4344/**45* An object representing a set of characters. (This is a "set" in the46* mathematical sense: an unduplicated list of characters on which set47* operations such as union and intersection can be performed.) The48* set information is stored in compressed, optimized form: The object49* contains an integer array with an even number of characters. Each50* pair of characters represents a range of characters contained in the set51* (a pair of the same character represents a single character). The52* characters are sorted in increasing order.53*/54class CharSet {55/**56* The structure containing the set information. The characters57* in this array are organized into pairs, each pair representing58* a range of characters contained in the set59*/60private int[] chars;6162//==========================================================================63// parseString() and associated routines64//==========================================================================65/**66* A cache which is used to speed up parseString() whenever it is67* used to parse a description that has been parsed before68*/69private static Hashtable<String, CharSet> expressionCache = null;7071/**72* Builds a CharSet based on a textual description. For the syntax of73* the description, see the documentation of RuleBasedBreakIterator.74* @see java.text.RuleBasedBreakIterator75*/76public static CharSet parseString(String s) {77CharSet result = null;7879// if "s" is in the expression cache, pull the result out80// of the expresison cache81if (expressionCache != null) {82result = expressionCache.get(s);83}8485// otherwise, use doParseString() to actually parse the string,86// and then add a corresponding entry to the expression cache87if (result == null) {88result = doParseString(s);89if (expressionCache == null) {90expressionCache = new Hashtable<>();91}92expressionCache.put(s, result);93}94result = (CharSet)(result.clone());95return result;96}9798/**99* This function is used by parseString() to actually parse the string100*/101private static CharSet doParseString(String s) {102CharSet result = new CharSet();103int p = 0;104105boolean haveDash = false;106boolean haveTilde = false;107boolean wIsReal = false;108int w = 0x0000;109110// for each character in the description...111while (p < s.length()) {112int c = s.codePointAt(p);113114// if it's an opening bracket...115if (c == '[') {116// flush the single-character cache117if (wIsReal) {118result.internalUnion(new CharSet(w));119}120121// locate the matching closing bracket122int bracketLevel = 1;123int q = p + 1;124while (bracketLevel != 0) {125// if no matching bracket by end of string then...126if (q >= s.length()) {127throw new IllegalArgumentException("Parse error at position " + p + " in " + s);128}129int ch = s.codePointAt(q);130switch (ch) {131case '\\': // need to step over next character132ch = s.codePointAt(++q);133break;134case '[':135++bracketLevel;136break;137case ']':138--bracketLevel;139break;140}141q += Character.charCount(ch);142}143--q;144145// call parseString() recursively to parse the text inside146// the brackets, then either add or subtract the result from147// our running result depending on whether or not the []148// expresison was preceded by a ^149if (!haveTilde) {150result.internalUnion(CharSet.parseString(s.substring(p + 1, q)));151}152else {153result.internalDifference(CharSet.parseString(s.substring(p + 1, q)));154}155haveTilde = false;156haveDash = false;157wIsReal = false;158p = q + 1;159}160161// if the character is a colon...162else if (c == ':') {163// flush the single-character cache164if (wIsReal) {165result.internalUnion(new CharSet(w));166}167168// locate the matching colon (and throw an error if there169// isn't one)170int q = s.indexOf(':', p + 1);171if (q == -1) {172throw new IllegalArgumentException("Parse error at position " + p + " in " + s);173}174175// use charSetForCategory() to parse the text in the colons,176// and either add or substract the result from our running177// result depending on whether the :: expression was178// preceded by a ^179if (!haveTilde) {180result.internalUnion(charSetForCategory(s.substring(p + 1, q)));181}182else {183result.internalDifference(charSetForCategory(s.substring(p + 1, q)));184}185186// reset everything and advance to the next character187haveTilde = false;188haveDash = false;189wIsReal = false;190p = q + 1;191}192193// if the character is a dash, set an appropriate flag194else if (c == '-') {195if (wIsReal) {196haveDash = true;197}198++p;199}200201// if the character is a caret, flush the single-character202// cache and set an appropriate flag. If the set is empty203// (i.e., if the expression begins with ^), invert the set204// (i.e., set it to include everything). The idea here is205// that a set that includes nothing but ^ expressions206// means "everything but these things".207else if (c == '^') {208if (wIsReal) {209result.internalUnion(new CharSet(w));210wIsReal = false;211}212haveTilde = true;213++p;214if (result.empty()) {215result.internalComplement();216}217}218219// throw an exception on an illegal character220else if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c)221&& !Character.isDigit((char)c) && c != '\\') {222throw new IllegalArgumentException("Parse error at position " + p + " in " + s);223}224225// otherwise, we end up here...226else {227// on a backslash, advance to the next character228if (c == '\\') {229++p;230}231232// if the preceding character was a dash, this character233// defines the end of a range. Add or subtract that range234// from the running result depending on whether or not it235// was preceded by a ^236if (haveDash) {237if (s.codePointAt(p) < w) {238throw new IllegalArgumentException("U+" +239Integer.toHexString(s.codePointAt(p))240+ " is less than U+" + Integer.toHexString(w) + ". Dash expressions "241+ "can't have their endpoints in reverse order.");242}243244int ch = s.codePointAt(p);245if (!haveTilde) {246result.internalUnion(new CharSet(w, ch));247}248else {249result.internalDifference(new CharSet(w, ch));250}251p += Character.charCount(ch);252haveDash = false;253haveTilde = false;254wIsReal = false;255}256257// if the preceding character was a caret, remove this character258// from the running result259else if (haveTilde) {260w = s.codePointAt(p);261result.internalDifference(new CharSet(w));262p += Character.charCount(w);263haveTilde = false;264wIsReal = false;265}266267// otherwise, flush the single-character cache and then268// put this character into the cache269else if (wIsReal) {270result.internalUnion(new CharSet(w));271w = s.codePointAt(p);272p += Character.charCount(w);273wIsReal = true;274} else {275w = s.codePointAt(p);276p += Character.charCount(w);277wIsReal = true;278}279}280}281282// finally, flush the single-character cache one last time283if (wIsReal) {284result.internalUnion(new CharSet(w));285}286287return result;288}289290/**291* Creates a CharSet containing all the characters in a particular292* Unicode category. The text is either a two-character code from293* the Unicode database or a single character that begins one or more294* two-character codes.295*/296private static CharSet charSetForCategory(String category) {297// throw an exception if we have anything other than one or two298// characters inside the colons299if (category.length() == 0 || category.length() >= 3) {300throw new IllegalArgumentException("Invalid character category: " + category);301}302303// if we have two characters, search the category map for that code304// and either construct and return a CharSet from the data in the305// category map or throw an exception306if (category.length() == 2) {307for (int i = 0; i < CharacterCategory.categoryNames.length; i++) {308if (CharacterCategory.categoryNames[i].equals(category)) {309return new CharSet(CharacterCategory.getCategoryMap(i));310}311}312throw new IllegalArgumentException("Invalid character category: " + category);313}314315// if we have one character, search the category map for codes beginning316// with that letter, and union together all of the matching sets that317// we find (or throw an exception if there are no matches)318else if (category.length() == 1) {319CharSet result = new CharSet();320for (int i = 0; i < CharacterCategory.categoryNames.length; i++) {321if (CharacterCategory.categoryNames[i].startsWith(category)) {322result = result.union(new CharSet(CharacterCategory.getCategoryMap(i)));323}324}325if (result.empty()) {326throw new IllegalArgumentException("Invalid character category: " + category);327}328else {329return result;330}331}332return new CharSet(); // should never get here, but to make the compiler happy...333}334335/**336* Returns a copy of CharSet's expression cache and sets CharSet's337* expression cache to empty.338*/339public static Hashtable<String, CharSet> releaseExpressionCache() {340Hashtable<String, CharSet> result = expressionCache;341expressionCache = null;342return result;343}344345//==========================================================================346// CharSet manipulation347//==========================================================================348/**349* Creates an empty CharSet.350*/351public CharSet() {352chars = new int[0];353}354355/**356* Creates a CharSet containing a single character.357* @param c The character to put into the CharSet358*/359public CharSet(int c) {360chars = new int[2];361chars[0] = c;362chars[1] = c;363}364365/**366* Creates a CharSet containing a range of characters.367* @param lo The lowest-numbered character to include in the range368* @param hi The highest-numbered character to include in the range369*/370public CharSet(int lo, int hi) {371chars = new int[2];372if (lo <= hi) {373chars[0] = lo;374chars[1] = hi;375}376else {377chars[0] = hi;378chars[1] = lo;379}380}381382/**383* Creates a CharSet, initializing it from the internal storage384* of another CharSet (this function performs no error checking385* on "chars", so if it's malformed, undefined behavior will result)386*/387private CharSet(int[] chars) {388this.chars = chars;389}390391/**392* Returns a CharSet representing the union of two CharSets.393*/394public CharSet union(CharSet that) {395return new CharSet(doUnion(that.chars));396}397398/**399* Adds the characters in "that" to this CharSet400*/401private void internalUnion(CharSet that) {402chars = doUnion(that.chars);403}404405/**406* The actual implementation of the union functions407*/408private int[] doUnion(int[] c2) {409int[] result = new int[chars.length+c2.length];410411int i = 0;412int j = 0;413int index = 0;414415// consider all the characters in both strings416while (i < chars.length && j < c2.length) {417int ub;418419// the first character in the result is the lower of the420// starting characters of the two strings, and "ub" gets421// set to the upper bound of that range422if (chars[i] < c2[j]) {423result[index++] = chars[i];424ub = chars[++i];425}426else {427result[index++] = c2[j];428ub = c2[++j];429}430431// for as long as one of our two pointers is pointing to a range's432// end point, or i is pointing to a character that is less than433// "ub" plus one (the "plus one" stitches touching ranges together)...434while (i % 2 == 1 ||435j % 2 == 1 ||436(i < chars.length && chars[i] <= ub + 1)) {437438// advance i to the first character that is greater than439// "ub" plus one440while (i < chars.length && chars[i] <= ub + 1) {441++i;442}443444// if i points to the endpoint of a range, update "ub"445// to that character, or if i points to the start of446// a range and the endpoint of the preceding range is447// greater than "ub", update "up" to _that_ character448if (i % 2 == 1) {449ub = chars[i];450}451else if (i > 0 && chars[i - 1] > ub) {452ub = chars[i - 1];453}454455// now advance j to the first character that is greater456// that "ub" plus one457while (j < c2.length && c2[j] <= ub + 1) {458++j;459}460461// if j points to the endpoint of a range, update "ub"462// to that character, or if j points to the start of463// a range and the endpoint of the preceding range is464// greater than "ub", update "up" to _that_ character465if (j % 2 == 1) {466ub = c2[j];467}468else if (j > 0 && c2[j - 1] > ub) {469ub = c2[j - 1];470}471}472// when we finally fall out of this loop, we will have stitched473// together a series of ranges that overlap or touch, i and j474// will both point to starting points of ranges, and "ub" will475// be the endpoint of the range we're working on. Write "ub"476// to the result477result[index++] = ub;478479// loop back around to create the next range in the result480}481482// we fall out to here when we've exhausted all the characters in483// one of the operands. We can append all of the remaining characters484// in the other operand without doing any extra work.485if (i < chars.length) {486for (int k = i; k < chars.length; k++) {487result[index++] = chars[k];488}489}490if (j < c2.length) {491for (int k = j; k < c2.length; k++) {492result[index++] = c2[k];493}494}495496if (result.length > index) {497int[] tmpbuf = new int[index];498System.arraycopy(result, 0, tmpbuf, 0, index);499return tmpbuf;500}501502return result;503}504505/**506* Returns the intersection of two CharSets.507*/508public CharSet intersection(CharSet that) {509return new CharSet(doIntersection(that.chars));510}511512/**513* Removes from this CharSet any characters that aren't also in "that"514*/515private void internalIntersection(CharSet that) {516chars = doIntersection(that.chars);517}518519/**520* The internal implementation of the two intersection functions521*/522private int[] doIntersection(int[] c2) {523int[] result = new int[chars.length+c2.length];524525int i = 0;526int j = 0;527int oldI;528int oldJ;529int index = 0;530531// iterate until we've exhausted one of the operands532while (i < chars.length && j < c2.length) {533534// advance j until it points to a character that is larger than535// the one i points to. If this is the beginning of a one-536// character range, advance j to point to the end537if (i < chars.length && i % 2 == 0) {538while (j < c2.length && c2[j] < chars[i]) {539++j;540}541if (j < c2.length && j % 2 == 0 && c2[j] == chars[i]) {542++j;543}544}545546// if j points to the endpoint of a range, save the current547// value of i, then advance i until it reaches a character548// which is larger than the character pointed at549// by j. All of the characters we've advanced over (except550// the one currently pointed to by i) are added to the result551oldI = i;552while (j % 2 == 1 && i < chars.length && chars[i] <= c2[j]) {553++i;554}555for (int k = oldI; k < i; k++) {556result[index++] = chars[k];557}558559// if i points to the endpoint of a range, save the current560// value of j, then advance j until it reaches a character561// which is larger than the character pointed at562// by i. All of the characters we've advanced over (except563// the one currently pointed to by i) are added to the result564oldJ = j;565while (i % 2 == 1 && j < c2.length && c2[j] <= chars[i]) {566++j;567}568for (int k = oldJ; k < j; k++) {569result[index++] = c2[k];570}571572// advance i until it points to a character larger than j573// If it points at the beginning of a one-character range,574// advance it to the end of that range575if (j < c2.length && j % 2 == 0) {576while (i < chars.length && chars[i] < c2[j]) {577++i;578}579if (i < chars.length && i % 2 == 0 && c2[j] == chars[i]) {580++i;581}582}583}584585if (result.length > index) {586int[] tmpbuf = new int[index];587System.arraycopy(result, 0, tmpbuf, 0, index);588return tmpbuf;589}590591return result;592}593594/**595* Returns a CharSet containing all the characters in "this" that596* aren't also in "that"597*/598public CharSet difference(CharSet that) {599return new CharSet(doIntersection(that.doComplement()));600}601602/**603* Removes from "this" all the characters that are also in "that"604*/605private void internalDifference(CharSet that) {606chars = doIntersection(that.doComplement());607}608609/**610* Returns a CharSet containing all the characters which are not611* in "this"612*/613public CharSet complement() {614return new CharSet(doComplement());615}616617/**618* Complements "this". All the characters it contains are removed,619* and all the characters it doesn't contain are added.620*/621private void internalComplement() {622chars = doComplement();623}624625/**626* The internal implementation function for the complement routines627*/628private int[] doComplement() {629// the complement of an empty CharSet is one containing everything630if (empty()) {631int[] result = new int[2];632result[0] = 0x0000;633result[1] = 0x10FFFF;634return result;635}636637int[] result = new int[chars.length+2];638639int i = 0;640int index = 0;641642// the result begins with \u0000 unless the original CharSet does643if (chars[0] != 0x0000) {644result[index++] = 0x0000;645}646647// walk through the characters in this CharSet. Append a pair of648// characters the first of which is one less than the first649// character we see and the second of which is one plus the second650// character we see (don't write the first character if it's \u0000,651// and don't write the second character if it's \uffff.652while (i < chars.length) {653if (chars[i] != 0x0000) {654result[index++] = chars[i] - 1;655}656if (chars[i + 1] != 0x10FFFF) {657result[index++] = chars[i + 1] + 1;658}659i += 2;660}661662// add 0x10ffff to the end of the result, unless it was in663// the original set664if (chars[i-1] != 0x10FFFF) {665result[index++] = 0x10FFFF;666}667668if (result.length > index) {669int[] tmpbuf = new int[index];670System.arraycopy(result, 0, tmpbuf, 0, index);671return tmpbuf;672}673674return result;675}676677/**678* Returns true if this CharSet contains the specified character679* @param c The character we're testing for set membership680*/681public boolean contains(int c) {682// search for the first range endpoint that is greater than or683// equal to c684int i = 1;685while (i < chars.length && chars[i] < c) {686i += 2;687}688689// if we've walked off the end, we don't contain c690if (i == chars.length) {691return false;692}693694// otherwise, we contain c if the beginning of the range is less695// than or equal to c696return chars[i - 1] <= c;697}698699/**700* Returns true if "that" is another instance of CharSet containing701* the exact same characters as this one702*/703public boolean equals(Object that) {704return (that instanceof CharSet) && Arrays.equals(chars, ((CharSet)that).chars);705}706707/**708* Returns the hash code for this set of characters709*/710public int hashCode() {711return Arrays.hashCode(chars);712}713714/**715* Creates a new CharSet that is equal to this one716*/717public Object clone() {718return new CharSet(chars);719}720721/**722* Returns true if this CharSet contains no characters723*/724public boolean empty() {725return chars.length == 0;726}727728/**729* Returns a textual representation of this CharSet. If the result730* of calling this function is passed to CharSet.parseString(), it731* will produce another CharSet that is equal to this one.732*/733public String toString() {734StringBuffer result = new StringBuffer();735736// the result begins with an opening bracket737result.append('[');738739// iterate through the ranges in the CharSet740for (int i = 0; i < chars.length; i += 2) {741// for a range with the same beginning and ending point,742// output that character743if (chars[i] == chars[i + 1]) {744result.append("0x");745result.append(Integer.toHexString(chars[i]));746}747748// otherwise, output the start and end points of the range749// separated by a dash750else {751result.append("0x");752result.append(Integer.toHexString(chars[i]));753result.append("-0x");754result.append(Integer.toHexString(chars[i + 1]));755}756}757758// the result ends with a closing bracket759result.append(']');760return result.toString();761}762763/**764* Returns an integer array representing the contents of this CharSet765* in the same form in which they're stored internally: as pairs766* of characters representing the start and end points of ranges767*/768public int[] getRanges() {769return chars;770}771772/**773* Returns an Enumeration that will return the ranges of characters774* contained in this CharSet one at a time775*/776public Enumeration getChars() {777return new Enumeration(this);778}779780//==========================================================================781// CharSet.Enumeration782//==========================================================================783784/**785* An Enumeration that can be used to extract the character ranges786* from a CharSet one at a time787*/788public class Enumeration implements java.util.Enumeration<int[]> {789/**790* Initializes a CharSet.Enumeration791*/792Enumeration(CharSet cs) {793this.chars = cs.chars;794p = 0;795}796797/**798* Returns true if the enumeration hasn't yet returned799* all the ranges in the CharSet800*/801public boolean hasMoreElements() {802return p < chars.length;803}804805/**806* Returns the next range in the CarSet807*/808public int[] nextElement() {809int[] result = new int[2];810result[0] = chars[p++];811result[1] = chars[p++];812return result;813}814815int p;816int[] chars;817}818}819820821