Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/text/normalizer/UCharacterProperty.java
38830 views
1
/*
2
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
/*
26
*******************************************************************************
27
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28
* *
29
* The original version of this source code and documentation is copyrighted *
30
* and owned by IBM, These materials are provided under terms of a License *
31
* Agreement between IBM and Sun. This technology is protected by multiple *
32
* US and International patents. This notice and attribution to IBM may not *
33
* to removed. *
34
*******************************************************************************
35
*/
36
37
package sun.text.normalizer;
38
39
import java.io.BufferedInputStream;
40
import java.io.InputStream;
41
import java.io.IOException;
42
import java.util.MissingResourceException;
43
44
/**
 * <p>Internal class used for Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, array of char is used
 * to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into a more meaningful form
 * lies on <a href="UCharacter.html">UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 */
58
59
public final class UCharacterProperty
60
{
61
// public data members -----------------------------------------------
62
63
/**
64
* Trie data
65
*/
66
public CharTrie m_trie_;
67
/**
68
* Optimization
69
* CharTrie index array
70
*/
71
public char[] m_trieIndex_;
72
/**
73
* Optimization
74
* CharTrie data array
75
*/
76
public char[] m_trieData_;
77
/**
78
* Optimization
79
* CharTrie data offset
80
*/
81
public int m_trieInitialValue_;
82
/**
83
* Unicode version
84
*/
85
public VersionInfo m_unicodeVersion_;
86
87
// uprops.h enum UPropertySource --------------------------------------- ***
88
89
/** From uchar.c/uprops.icu properties vectors trie */
90
public static final int SRC_PROPSVEC=2;
91
/** One more than the highest UPropertySource (SRC_) constant. */
92
public static final int SRC_COUNT=9;
93
94
// public methods ----------------------------------------------------
95
96
/**
97
* Java friends implementation
98
*/
99
public void setIndexData(CharTrie.FriendAgent friendagent)
100
{
101
m_trieIndex_ = friendagent.getPrivateIndex();
102
m_trieData_ = friendagent.getPrivateData();
103
m_trieInitialValue_ = friendagent.getPrivateInitialValue();
104
}
105
106
/**
107
* Gets the property value at the index.
108
* This is optimized.
109
* Note this is alittle different from CharTrie the index m_trieData_
110
* is never negative.
111
* @param ch code point whose property value is to be retrieved
112
* @return property value of code point
113
*/
114
public final int getProperty(int ch)
115
{
116
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
117
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
118
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
119
// BMP codepoint 0000..D7FF or DC00..FFFF
120
// optimized
121
try { // using try for ch < 0 is faster than using an if statement
122
return m_trieData_[
123
(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
124
<< Trie.INDEX_STAGE_2_SHIFT_)
125
+ (ch & Trie.INDEX_STAGE_3_MASK_)];
126
} catch (ArrayIndexOutOfBoundsException e) {
127
return m_trieInitialValue_;
128
}
129
}
130
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
131
// lead surrogate D800..DBFF
132
return m_trieData_[
133
(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
134
+ (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
135
<< Trie.INDEX_STAGE_2_SHIFT_)
136
+ (ch & Trie.INDEX_STAGE_3_MASK_)];
137
}
138
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
139
// supplementary code point 10000..10FFFF
140
// look at the construction of supplementary characters
141
// trail forms the ends of it.
142
return m_trie_.getSurrogateValue(
143
UTF16.getLeadSurrogate(ch),
144
(char)(ch & Trie.SURROGATE_MASK_));
145
}
146
// ch is out of bounds
147
// return m_dataOffset_ if there is an error, in this case we return
148
// the default value: m_initialValue_
149
// we cannot assume that m_initialValue_ is at offset 0
150
// this is for optimization.
151
return m_trieInitialValue_;
152
153
// this all is an inlined form of return m_trie_.getCodePointValue(ch);
154
}
155
156
/**
157
* Getting the unsigned numeric value of a character embedded in the property
158
* argument
159
* @param prop the character
160
* @return unsigned numberic value
161
*/
162
public static int getUnsignedValue(int prop)
163
{
164
return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
165
}
166
167
/**
168
* Gets the unicode additional properties.
169
* C version getUnicodeProperties.
170
* @param codepoint codepoint whose additional properties is to be
171
* retrieved
172
* @param column
173
* @return unicode properties
174
*/
175
public int getAdditional(int codepoint, int column) {
176
if (column == -1) {
177
return getProperty(codepoint);
178
}
179
if (column < 0 || column >= m_additionalColumnsCount_) {
180
return 0;
181
}
182
return m_additionalVectors_[
183
m_additionalTrie_.getCodePointValue(codepoint) + column];
184
}
185
186
/**
187
* <p>Get the "age" of the code point.</p>
188
* <p>The "age" is the Unicode version when the code point was first
189
* designated (as a non-character or for Private Use) or assigned a
190
* character.</p>
191
* <p>This can be useful to avoid emitting code points to receiving
192
* processes that do not accept newer characters.</p>
193
* <p>The data is from the UCD file DerivedAge.txt.</p>
194
* <p>This API does not check the validity of the codepoint.</p>
195
* @param codepoint The code point.
196
* @return the Unicode version number
197
*/
198
public VersionInfo getAge(int codepoint)
199
{
200
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
201
return VersionInfo.getInstance(
202
(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
203
version & LAST_NIBBLE_MASK_, 0, 0);
204
}
205
206
/**
207
* Forms a supplementary code point from the argument character<br>
208
* Note this is for internal use hence no checks for the validity of the
209
* surrogate characters are done
210
* @param lead lead surrogate character
211
* @param trail trailing surrogate character
212
* @return code point of the supplementary character
213
*/
214
public static int getRawSupplementary(char lead, char trail)
215
{
216
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
217
}
218
219
/**
220
* Loads the property data and initialize the UCharacterProperty instance.
221
* @throws MissingResourceException when data is missing or data has been corrupted
222
*/
223
public static UCharacterProperty getInstance()
224
{
225
if(INSTANCE_ == null) {
226
try {
227
INSTANCE_ = new UCharacterProperty();
228
}
229
catch (Exception e) {
230
throw new MissingResourceException(e.getMessage(),"","");
231
}
232
}
233
return INSTANCE_;
234
}
235
236
/**
237
* Checks if the argument c is to be treated as a white space in ICU
238
* rules. Usually ICU rule white spaces are ignored unless quoted.
239
* Equivalent to test for Pattern_White_Space Unicode property.
240
* Stable set of characters, won't change.
241
* See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
242
* @param c codepoint to check
243
* @return true if c is a ICU white space
244
*/
245
public static boolean isRuleWhiteSpace(int c)
246
{
247
/* "white space" in the sense of ICU rule parsers
248
This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
249
See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
250
U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
251
Equivalent to test for Pattern_White_Space Unicode property.
252
*/
253
return (c >= 0x0009 && c <= 0x2029 &&
254
(c <= 0x000D || c == 0x0020 || c == 0x0085 ||
255
c == 0x200E || c == 0x200F || c >= 0x2028));
256
}
257
258
// protected variables -----------------------------------------------
259
260
/**
261
* Extra property trie
262
*/
263
CharTrie m_additionalTrie_;
264
/**
265
* Extra property vectors, 1st column for age and second for binary
266
* properties.
267
*/
268
int m_additionalVectors_[];
269
/**
270
* Number of additional columns
271
*/
272
int m_additionalColumnsCount_;
273
/**
274
* Maximum values for block, bits used as in vector word
275
* 0
276
*/
277
int m_maxBlockScriptValue_;
278
/**
279
* Maximum values for script, bits used as in vector word
280
* 0
281
*/
282
int m_maxJTGValue_;
283
284
// private variables -------------------------------------------------
285
286
/**
287
* UnicodeData.txt property object
288
*/
289
private static UCharacterProperty INSTANCE_ = null;
290
291
/**
292
* Default name of the datafile
293
*/
294
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
295
296
/**
297
* Default buffer size of datafile
298
*/
299
private static final int DATA_BUFFER_SIZE_ = 25000;
300
301
/**
302
* Numeric value shift
303
*/
304
private static final int VALUE_SHIFT_ = 8;
305
306
/**
307
* Mask to be applied after shifting to obtain an unsigned numeric value
308
*/
309
private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
310
311
/**
312
* Shift value for lead surrogate to form a supplementary character.
313
*/
314
private static final int LEAD_SURROGATE_SHIFT_ = 10;
315
/**
316
* Offset to add to combined surrogate pair to avoid msking.
317
*/
318
private static final int SURROGATE_OFFSET_ =
319
UTF16.SUPPLEMENTARY_MIN_VALUE -
320
(UTF16.SURROGATE_MIN_VALUE <<
321
LEAD_SURROGATE_SHIFT_) -
322
UTF16.TRAIL_SURROGATE_MIN_VALUE;
323
324
// additional properties ----------------------------------------------
325
326
/**
327
* First nibble shift
328
*/
329
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
330
/**
331
* Second nibble mask
332
*/
333
private static final int LAST_NIBBLE_MASK_ = 0xF;
334
/**
335
* Age value shift
336
*/
337
private static final int AGE_SHIFT_ = 24;
338
339
// private constructors --------------------------------------------------
340
341
/**
342
* Constructor
343
* @exception IOException thrown when data reading fails or data corrupted
344
*/
345
private UCharacterProperty() throws IOException
346
{
347
// jar access
348
InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
349
BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
350
UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
351
reader.read(this);
352
b.close();
353
354
m_trie_.putIndexData(this);
355
}
356
357
public void upropsvec_addPropertyStarts(UnicodeSet set) {
358
/* add the start code point of each same-value range of the properties vectors trie */
359
if(m_additionalColumnsCount_>0) {
360
/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
361
TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
362
RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
363
while(propsVectorsIter.next(propsVectorsResult)){
364
set.add(propsVectorsResult.start);
365
}
366
}
367
}
368
369
}
370
371