Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/text/normalizer/UCharacter.java
38830 views
1
/*
2
* Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
/*
26
*******************************************************************************
27
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28
* *
29
* The original version of this source code and documentation is copyrighted *
30
* and owned by IBM, These materials are provided under terms of a License *
31
* Agreement between IBM and Sun. This technology is protected by multiple *
32
* US and International patents. This notice and attribution to IBM may not *
33
* to removed. *
34
*******************************************************************************
35
*/
36
37
package sun.text.normalizer;
38
39
import java.io.IOException;
40
import java.util.MissingResourceException;
41
42
/**
43
* <p>
44
* The UCharacter class provides extensions to the
45
* <a href="https://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html">
46
* java.lang.Character</a> class. These extensions provide support for
47
* more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
48
* class, provide support for supplementary characters (those with code
49
* points above U+FFFF).
50
* Each ICU release supports the latest version of Unicode available at that time.
51
* </p>
52
* <p>
53
* Code points are represented in these API using ints. While it would be
54
* more convenient in Java to have a separate primitive datatype for them,
55
* ints suffice in the meantime.
56
* </p>
57
* <p>
58
* To use this class please add the jar file name icu4j.jar to the
59
* class path, since it contains data files which supply the information used
60
* by this file.<br>
61
* E.g. In Windows <br>
62
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
63
* Otherwise, another method would be to copy the files uprops.dat and
64
* unames.icu from the icu4j source subdirectory
65
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
66
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
67
* </p>
68
* <p>
69
* Aside from the additions for UTF-16 support, and the updated Unicode
70
* properties, the main differences between UCharacter and Character are:
71
* <ul>
72
* <li> UCharacter is not designed to be a char wrapper and does not have
73
* APIs that involve management of that single char.<br>
74
* These include:
75
* <ul>
76
* <li> char charValue(),
77
* <li> int compareTo(java.lang.Character, java.lang.Character), etc.
78
* </ul>
79
* <li> UCharacter does not include Character APIs that are deprecated, nor
80
* does it include the Java-specific character information, such as
81
* boolean isJavaIdentifierPart(char ch).
82
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
83
* values '10' - '35'. UCharacter also does this in digit and
84
* getNumericValue, to adhere to the java semantics of these
85
* methods. New methods unicodeDigit, and
86
* getUnicodeNumericValue do not treat the above code points
87
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
88
* </ul>
89
* <p>
90
* Further detailed differences can be determined from the program
91
* <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
92
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
93
* </p>
94
* <p>
95
* In addition to Java compatibility functions, which calculate derived properties,
96
* this API provides low-level access to the Unicode Character Database.
97
* </p>
98
* <p>
99
* Unicode assigns each code point (not just assigned character) values for
100
* many properties.
101
* Most of them are simple boolean flags, or constants from a small enumerated list.
102
* For some properties, values are strings or other relatively more complex types.
103
* </p>
104
* <p>
105
* For more information see
106
* "About the Unicode Character Database" (http://www.unicode.org/ucd/)
107
* and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
108
* </p>
109
* <p>
110
* There are also functions that provide easy migration from C/POSIX functions
111
* like isblank(). Their use is generally discouraged because the C/POSIX
112
* standards do not define their semantics beyond the ASCII range, which means
113
* that different implementations exhibit very different behavior.
114
* Instead, Unicode properties should be used directly.
115
* </p>
116
* <p>
117
* There are also only a few, broad C/POSIX character classes, and they tend
118
* to be used for conflicting purposes. For example, the "isalpha()" class
119
* is sometimes used to determine word boundaries, while a more sophisticated
120
* approach would at least distinguish initial letters from continuation
121
* characters (the latter including combining marks).
122
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
123
* Another example: There is no "istitle()" class for titlecase characters.
124
* </p>
125
* <p>
126
* ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
127
* ICU implements them according to the Standard Recommendations in
128
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
129
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
130
* </p>
131
* <p>
132
* API access for C/POSIX character classes is as follows:
133
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
134
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
135
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
136
* - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
137
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
138
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
139
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
140
* - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
141
* - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
142
* - cntrl: getType(c)==CONTROL
143
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
144
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
145
* </p>
146
* <p>
147
* The C/POSIX character classes are also available in UnicodeSet patterns,
148
* using patterns like [:graph:] or \p{graph}.
149
* </p>
150
* <p>
151
* Note: There are several ICU (and Java) whitespace functions.
152
* Comparison:
153
* - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
154
* most of general categories "Z" (separators) + most whitespace ISO controls
155
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
156
* - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
157
* - isSpaceChar: just Z (including no-break spaces)
158
* </p>
159
* <p>
160
* This class is not subclassable
161
* </p>
162
* @author Syn Wee Quek
163
* @stable ICU 2.1
164
* @see com.ibm.icu.lang.UCharacterEnums
165
*/
166
167
public final class UCharacter
168
{
169
170
/**
 * Constants naming the numeric type of a code point.
 * @see UProperty#NUMERIC_TYPE
 * @stable ICU 2.4
 */
public interface NumericType
{
    /**
     * Numeric type of decimal digits.
     * @stable ICU 2.4
     */
    int DECIMAL = 1;
}
182
183
// public data members -----------------------------------------------
184
185
/**
186
* The lowest Unicode code point value.
187
* @stable ICU 2.1
188
*/
189
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
190
191
/**
192
* The highest Unicode code point value (scalar value) according to the
193
* Unicode Standard.
194
* This is a 21-bit value (21 bits, rounded up).<br>
195
* Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
196
* @stable ICU 2.1
197
*/
198
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
199
200
/**
201
* The minimum value for Supplementary code points
202
* @stable ICU 2.1
203
*/
204
public static final int SUPPLEMENTARY_MIN_VALUE =
205
UTF16.SUPPLEMENTARY_MIN_VALUE;
206
207
// public methods ----------------------------------------------------
208
209
/**
210
* Retrieves the numeric value of a decimal digit code point.
211
* <br>This method observes the semantics of
212
* <code>java.lang.Character.digit()</code>. Note that this
213
* will return positive values for code points for which isDigit
214
* returns false, just like java.lang.Character.
215
* <br><em>Semantic Change:</em> In release 1.3.1 and
216
* prior, this did not treat the European letters as having a
217
* digit value, and also treated numeric letters and other numbers as
218
* digits.
219
* This has been changed to conform to the java semantics.
220
* <br>A code point is a valid digit if and only if:
221
* <ul>
222
* <li>ch is a decimal digit or one of the european letters, and
223
* <li>the value of ch is less than the specified radix.
224
* </ul>
225
* @param ch the code point to query
226
* @param radix the radix
227
* @return the numeric value represented by the code point in the
228
* specified radix, or -1 if the code point is not a decimal digit
229
* or if its value is too large for the radix
230
* @stable ICU 2.1
231
*/
232
public static int digit(int ch, int radix)
233
{
234
// when ch is out of bounds getProperty == 0
235
int props = getProperty(ch);
236
int value;
237
if (getNumericType(props) == NumericType.DECIMAL) {
238
value = UCharacterProperty.getUnsignedValue(props);
239
} else {
240
value = getEuropeanDigit(ch);
241
}
242
return (0 <= value && value < radix) ? value : -1;
243
}
244
245
/**
246
* Returns the Bidirection property of a code point.
247
* For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
248
* property.<br>
249
* Result returned belongs to the interface
250
* <a href=UCharacterDirection.html>UCharacterDirection</a>
251
* @param ch the code point to be determined its direction
252
* @return direction constant from UCharacterDirection.
253
* @stable ICU 2.1
254
*/
255
public static int getDirection(int ch)
256
{
257
return gBdp.getClass(ch);
258
}
259
260
/**
261
* Returns a code point corresponding to the two UTF16 characters.
262
* @param lead the lead char
263
* @param trail the trail char
264
* @return code point if surrogate characters are valid.
265
* @exception IllegalArgumentException thrown when argument characters do
266
* not form a valid codepoint
267
* @stable ICU 2.1
268
*/
269
public static int getCodePoint(char lead, char trail)
270
{
271
if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
272
return UCharacterProperty.getRawSupplementary(lead, trail);
273
}
274
throw new IllegalArgumentException("Illegal surrogate characters");
275
}
276
277
/**
278
* <p>Get the "age" of the code point.</p>
279
* <p>The "age" is the Unicode version when the code point was first
280
* designated (as a non-character or for Private Use) or assigned a
281
* character.
282
* <p>This can be useful to avoid emitting code points to receiving
283
* processes that do not accept newer characters.</p>
284
* <p>The data is from the UCD file DerivedAge.txt.</p>
285
* @param ch The code point.
286
* @return the Unicode version number
287
* @stable ICU 2.6
288
*/
289
public static VersionInfo getAge(int ch)
290
{
291
if (ch < MIN_VALUE || ch > MAX_VALUE) {
292
throw new IllegalArgumentException("Codepoint out of bounds");
293
}
294
return PROPERTY_.getAge(ch);
295
}
296
297
// private variables -------------------------------------------------
298
299
/**
 * Database storing the sets of character property
 */
private static final UCharacterProperty PROPERTY_;
/**
 * For optimization: the trie index array, trie data array and trie
 * initial value of PROPERTY_, copied once so that getProperty can read
 * them directly.
 */
private static final char[] PROPERTY_TRIE_INDEX_;
private static final char[] PROPERTY_TRIE_DATA_;
private static final int PROPERTY_INITIAL_VALUE_;

/**
 * Bidi property data consulted by getDirection.
 */
private static final UBiDiProps gBdp;

// block to initialise character property database
static
{
    try
    {
        PROPERTY_ = UCharacterProperty.getInstance();
        PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
        PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
        PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
    }
    catch (Exception e)
    {
        // MissingResourceException has no public constructor that accepts a
        // cause, so attach the original failure explicitly instead of
        // keeping only its message
        MissingResourceException missing =
            new MissingResourceException(e.getMessage(), "", "");
        missing.initCause(e);
        throw missing;
    }

    // the final field is assigned through a local because it cannot be
    // assigned in both the try and the catch branch
    UBiDiProps bdp;
    try {
        bdp = UBiDiProps.getSingleton();
    } catch (IOException ignored) {
        // deliberate best-effort: fall back to the dummy bidi properties
        // when the real data cannot be read
        bdp = UBiDiProps.getDummy();
    }
    gBdp = bdp;
}
335
336
/**
337
* Shift to get numeric type
338
*/
339
private static final int NUMERIC_TYPE_SHIFT_ = 5;
340
/**
341
* Mask to get numeric type
342
*/
343
private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
344
345
// private methods ---------------------------------------------------
346
347
/**
348
* Getting the digit values of characters like 'A' - 'Z', normal,
349
* half-width and full-width. This method assumes that the other digit
350
* characters are checked by the calling method.
351
* @param ch character to test
352
* @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
353
* its corresponding digit will be returned.
354
*/
355
private static int getEuropeanDigit(int ch) {
356
if ((ch > 0x7a && ch < 0xff21)
357
|| ch < 0x41 || (ch > 0x5a && ch < 0x61)
358
|| ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
359
return -1;
360
}
361
if (ch <= 0x7a) {
362
// ch >= 0x41 or ch < 0x61
363
return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
364
}
365
// ch >= 0xff21
366
if (ch <= 0xff3a) {
367
return ch + 10 - 0xff21;
368
}
369
// ch >= 0xff41 && ch <= 0xff5a
370
return ch + 10 - 0xff41;
371
}
372
373
/**
374
* Gets the numeric type of the property argument
375
* @param props 32 bit property
376
* @return the numeric type
377
*/
378
private static int getNumericType(int props)
379
{
380
return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
381
}
382
383
/**
384
* Gets the property value at the index.
385
* This is optimized.
386
* Note this is alittle different from CharTrie the index m_trieData_
387
* is never negative.
388
* This is a duplicate of UCharacterProperty.getProperty. For optimization
389
* purposes, this method calls the trie data directly instead of through
390
* UCharacterProperty.getProperty.
391
* @param ch code point whose property value is to be retrieved
392
* @return property value of code point
393
* @stable ICU 2.6
394
*/
395
private static final int getProperty(int ch)
396
{
397
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
398
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
399
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
400
// BMP codepoint 0000..D7FF or DC00..FFFF
401
try { // using try for ch < 0 is faster than using an if statement
402
return PROPERTY_TRIE_DATA_[
403
(PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
404
+ (ch & 0x1f)];
405
} catch (ArrayIndexOutOfBoundsException e) {
406
return PROPERTY_INITIAL_VALUE_;
407
}
408
}
409
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
410
// lead surrogate D800..DBFF
411
return PROPERTY_TRIE_DATA_[
412
(PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
413
+ (ch & 0x1f)];
414
}
415
// for optimization
416
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
417
// supplementary code point 10000..10FFFF
418
// look at the construction of supplementary characters
419
// trail forms the ends of it.
420
return PROPERTY_.m_trie_.getSurrogateValue(
421
UTF16.getLeadSurrogate(ch),
422
(char)(ch & 0x3ff));
423
}
424
// return m_dataOffset_ if there is an error, in this case we return
425
// the default value: m_initialValue_
426
// we cannot assume that m_initialValue_ is at offset 0
427
// this is for optimization.
428
return PROPERTY_INITIAL_VALUE_;
429
}
430
431
}
432
433