Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/text/normalizer/Trie.java
38830 views
1
/*
2
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
/*
26
*******************************************************************************
27
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28
* *
29
* The original version of this source code and documentation is copyrighted *
30
* and owned by IBM, These materials are provided under terms of a License *
31
* Agreement between IBM and Sun. This technology is protected by multiple *
32
* US and International patents. This notice and attribution to IBM may not *
33
* to removed. *
34
*******************************************************************************
35
*/
36
37
package sun.text.normalizer;
38
39
import java.io.DataInputStream;
40
import java.io.InputStream;
41
import java.io.IOException;
42
43
/**
44
* <p>A trie is a kind of compressed, serializable table of values
45
* associated with Unicode code points (0..0x10ffff).</p>
46
* <p>This class defines the basic structure of a trie and provides methods
47
* to <b>retrieve the offsets to the actual data</b>.</p>
48
* <p>Data will be the form of an array of basic types, char or int.</p>
49
* <p>The actual data format will have to be specified by the user in the
50
* inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
51
* <p>This trie implementation is optimized for getting offset while walking
52
* forward through a UTF-16 string.
53
* Therefore, the simplest and fastest access macros are the
54
* fromLead() and fromOffsetTrail() methods.
55
* The fromBMP() method are a little more complicated; they get offsets even
56
* for lead surrogate codepoints, while the fromLead() method get special
57
* "folded" offsets for lead surrogate code units if there is relevant data
58
* associated with them.
59
* From such a folded offsets, an offset needs to be extracted to supply
60
* to the fromOffsetTrail() methods.
61
* To handle such supplementary codepoints, some offset information are kept
62
* in the data.</p>
63
* <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
64
* that offset from the folded value for the lead surrogate unit.</p>
65
* <p>For examples of use, see com.ibm.icu.impl.CharTrie or
66
* com.ibm.icu.impl.IntTrie.</p>
67
* @author synwee
68
* @see com.ibm.icu.impl.CharTrie
69
* @see com.ibm.icu.impl.IntTrie
70
* @since release 2.1, Jan 01 2002
71
*/
72
public abstract class Trie
73
{
74
// public class declaration ----------------------------------------
75
76
/**
77
* Character data in com.ibm.impl.Trie have different user-specified format
78
* for different purposes.
79
* This interface specifies methods to be implemented in order for
80
* com.ibm.impl.Trie, to surrogate offset information encapsulated within
81
* the data.
82
*/
83
public static interface DataManipulate
84
{
85
/**
86
* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
87
* data
88
* the index array offset of the indexes for that lead surrogate.
89
* @param value data value for a surrogate from the trie, including the
90
* folding offset
91
* @return data offset or 0 if there is no data for the lead surrogate
92
*/
93
public int getFoldingOffset(int value);
94
}
95
96
// default implementation
97
private static class DefaultGetFoldingOffset implements DataManipulate {
98
public int getFoldingOffset(int value) {
99
return value;
100
}
101
}
102
103
// protected constructor -------------------------------------------
104
105
/**
106
* Trie constructor for CharTrie use.
107
* @param inputStream ICU data file input stream which contains the
108
* trie
109
* @param dataManipulate object containing the information to parse the
110
* trie data
111
* @throws IOException thrown when input stream does not have the
112
* right header.
113
*/
114
protected Trie(InputStream inputStream,
115
DataManipulate dataManipulate) throws IOException
116
{
117
DataInputStream input = new DataInputStream(inputStream);
118
// Magic number to authenticate the data.
119
int signature = input.readInt();
120
m_options_ = input.readInt();
121
122
if (!checkHeader(signature)) {
123
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
124
}
125
126
if(dataManipulate != null) {
127
m_dataManipulate_ = dataManipulate;
128
} else {
129
m_dataManipulate_ = new DefaultGetFoldingOffset();
130
}
131
m_isLatin1Linear_ = (m_options_ &
132
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
133
m_dataOffset_ = input.readInt();
134
m_dataLength_ = input.readInt();
135
unserialize(inputStream);
136
}
137
138
/**
139
* Trie constructor
140
* @param index array to be used for index
141
* @param options used by the trie
142
* @param dataManipulate object containing the information to parse the
143
* trie data
144
*/
145
protected Trie(char index[], int options, DataManipulate dataManipulate)
146
{
147
m_options_ = options;
148
if(dataManipulate != null) {
149
m_dataManipulate_ = dataManipulate;
150
} else {
151
m_dataManipulate_ = new DefaultGetFoldingOffset();
152
}
153
m_isLatin1Linear_ = (m_options_ &
154
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
155
m_index_ = index;
156
m_dataOffset_ = m_index_.length;
157
}
158
159
// protected data members ------------------------------------------
160
161
/**
162
* Lead surrogate code points' index displacement in the index array.
163
* 0x10000-0xd800=0x2800
164
* 0x2800 >> INDEX_STAGE_1_SHIFT_
165
*/
166
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
167
/**
168
* Shift size for shifting right the input index. 1..9
169
*/
170
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
171
/**
172
* Shift size for shifting left the index array values.
173
* Increases possible data size with 16-bit index values at the cost
174
* of compactability.
175
* This requires blocks of stage 2 data to be aligned by
176
* DATA_GRANULARITY.
177
* 0..INDEX_STAGE_1_SHIFT
178
*/
179
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
180
/**
181
* Number of data values in a stage 2 (data array) block.
182
*/
183
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
184
/**
185
* Mask for getting the lower bits from the input index.
186
* DATA_BLOCK_LENGTH - 1.
187
*/
188
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
189
/** Number of bits of a trail surrogate that are used in index table lookups. */
190
protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
191
/**
192
* Number of index (stage 1) entries per lead surrogate.
193
* Same as number of index entries for 1024 trail surrogates,
194
* ==0x400>>INDEX_STAGE_1_SHIFT_
195
*/
196
protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
197
/** Length of the BMP portion of the index (stage 1) array. */
198
protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
199
/**
200
* Surrogate mask to use when shifting offset to retrieve supplementary
201
* values
202
*/
203
protected static final int SURROGATE_MASK_ = 0x3FF;
204
/**
205
* Index or UTF16 characters
206
*/
207
protected char m_index_[];
208
/**
209
* Internal TrieValue which handles the parsing of the data value.
210
* This class is to be implemented by the user
211
*/
212
protected DataManipulate m_dataManipulate_;
213
/**
214
* Start index of the data portion of the trie. CharTrie combines
215
* index and data into a char array, so this is used to indicate the
216
* initial offset to the data portion.
217
* Note this index always points to the initial value.
218
*/
219
protected int m_dataOffset_;
220
/**
221
* Length of the data array
222
*/
223
protected int m_dataLength_;
224
225
// protected methods -----------------------------------------------
226
227
/**
228
* Gets the offset to the data which the surrogate pair points to.
229
* @param lead lead surrogate
230
* @param trail trailing surrogate
231
* @return offset to data
232
*/
233
protected abstract int getSurrogateOffset(char lead, char trail);
234
235
/**
236
* Gets the value at the argument index
237
* @param index value at index will be retrieved
238
* @return 32 bit value
239
*/
240
protected abstract int getValue(int index);
241
242
/**
243
* Gets the default initial value
244
* @return 32 bit value
245
*/
246
protected abstract int getInitialValue();
247
248
/**
249
* Gets the offset to the data which the index ch after variable offset
250
* points to.
251
* Note for locating a non-supplementary character data offset, calling
252
* <p>
253
* getRawOffset(0, ch);
254
* </p>
255
* will do. Otherwise if it is a supplementary character formed by
256
* surrogates lead and trail. Then we would have to call getRawOffset()
257
* with getFoldingIndexOffset(). See getSurrogateOffset().
258
* @param offset index offset which ch is to start from
259
* @param ch index to be used after offset
260
* @return offset to the data
261
*/
262
protected final int getRawOffset(int offset, char ch)
263
{
264
return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
265
<< INDEX_STAGE_2_SHIFT_)
266
+ (ch & INDEX_STAGE_3_MASK_);
267
}
268
269
/**
270
* Gets the offset to data which the BMP character points to
271
* Treats a lead surrogate as a normal code point.
272
* @param ch BMP character
273
* @return offset to data
274
*/
275
protected final int getBMPOffset(char ch)
276
{
277
return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
278
&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
279
? getRawOffset(LEAD_INDEX_OFFSET_, ch)
280
: getRawOffset(0, ch);
281
// using a getRawOffset(ch) makes no diff
282
}
283
284
/**
285
* Gets the offset to the data which this lead surrogate character points
286
* to.
287
* Data at the returned offset may contain folding offset information for
288
* the next trailing surrogate character.
289
* @param ch lead surrogate character
290
* @return offset to data
291
*/
292
protected final int getLeadOffset(char ch)
293
{
294
return getRawOffset(0, ch);
295
}
296
297
/**
298
* Internal trie getter from a code point.
299
* Could be faster(?) but longer with
300
* if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
301
* Gets the offset to data which the codepoint points to
302
* @param ch codepoint
303
* @return offset to data
304
*/
305
protected final int getCodePointOffset(int ch)
306
{
307
// if ((ch >> 16) == 0) slower
308
if (ch < 0) {
309
return -1;
310
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
311
// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
312
return getRawOffset(0, (char)ch);
313
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
314
// BMP codepoint
315
return getBMPOffset((char)ch);
316
} else if (ch <= UCharacter.MAX_VALUE) {
317
// look at the construction of supplementary characters
318
// trail forms the ends of it.
319
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
320
(char)(ch & SURROGATE_MASK_));
321
} else {
322
// return -1 // if there is an error, in this case we return
323
return -1;
324
}
325
}
326
327
/**
328
* <p>Parses the inputstream and creates the trie index with it.</p>
329
* <p>This is overwritten by the child classes.
330
* @param inputStream input stream containing the trie information
331
* @exception IOException thrown when data reading fails.
332
*/
333
protected void unserialize(InputStream inputStream) throws IOException
334
{
335
//indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
336
m_index_ = new char[m_dataOffset_];
337
DataInputStream input = new DataInputStream(inputStream);
338
for (int i = 0; i < m_dataOffset_; i ++) {
339
m_index_[i] = input.readChar();
340
}
341
}
342
343
/**
344
* Determines if this is a 32 bit trie
345
* @return true if options specifies this is a 32 bit trie
346
*/
347
protected final boolean isIntTrie()
348
{
349
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
350
}
351
352
/**
353
* Determines if this is a 16 bit trie
354
* @return true if this is a 16 bit trie
355
*/
356
protected final boolean isCharTrie()
357
{
358
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
359
}
360
361
// private data members --------------------------------------------
362
363
/**
364
* Latin 1 option mask
365
*/
366
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
367
/**
368
* Constant number to authenticate the byte block
369
*/
370
protected static final int HEADER_SIGNATURE_ = 0x54726965;
371
/**
372
* Header option formatting
373
*/
374
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
375
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
376
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
377
378
/**
379
* Flag indicator for Latin quick access data block
380
*/
381
private boolean m_isLatin1Linear_;
382
383
/**
384
* <p>Trie options field.</p>
385
* <p>options bit field:<br>
386
* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
387
* 8 0 = 16-bit data, 1=32-bit data<br>
388
* 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
389
* 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
390
*/
391
private int m_options_;
392
393
// private methods ---------------------------------------------------
394
395
/**
396
* Authenticates raw data header.
397
* Checking the header information, signature and options.
398
* @param signature This contains the options and type of a Trie
399
* @return true if the header is authenticated valid
400
*/
401
private final boolean checkHeader(int signature)
402
{
403
// check the signature
404
// Trie in big-endian US-ASCII (0x54726965).
405
// Magic number to authenticate the data.
406
if (signature != HEADER_SIGNATURE_) {
407
return false;
408
}
409
410
if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
411
INDEX_STAGE_1_SHIFT_ ||
412
((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
413
HEADER_OPTIONS_SHIFT_MASK_)
414
!= INDEX_STAGE_2_SHIFT_) {
415
return false;
416
}
417
return true;
418
}
419
}
420
421