Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/text/normalizer/UTF16.java
38830 views
1
/*
2
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
/*
26
*******************************************************************************
27
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28
* *
29
* The original version of this source code and documentation is copyrighted *
30
* and owned by IBM, These materials are provided under terms of a License *
31
* Agreement between IBM and Sun. This technology is protected by multiple *
32
* US and International patents. This notice and attribution to IBM may not *
33
* to removed. *
34
*******************************************************************************
35
*/
36
37
package sun.text.normalizer;
38
39
/**
40
* <p>Standalone utility class providing UTF16 character conversions and
41
* indexing conversions.</p>
42
* <p>Code that uses strings alone rarely needs modification.
43
* By design, UTF-16 does not allow overlap, so searching for strings is a safe
44
* operation. Similarly, concatenation is always safe. Substringing is safe if
45
* the start and end are both on UTF-32 boundaries. In normal code, the values
46
* for start and end are on those boundaries, since they arose from operations
47
* like searching. If not, the nearest UTF-32 boundaries can be determined
48
* using <code>bounds()</code>.</p>
49
* <strong>Examples:</strong>
50
* <p>The following examples illustrate use of some of these methods.
51
* <pre>
52
* // iteration forwards: Original
53
* for (int i = 0; i &lt; s.length(); ++i) {
54
* char ch = s.charAt(i);
55
* doSomethingWith(ch);
56
* }
57
*
58
* // iteration forwards: Changes for UTF-32
59
* int ch;
60
* for (int i = 0; i &lt; s.length(); i+=UTF16.getCharCount(ch)) {
61
* ch = UTF16.charAt(s,i);
62
* doSomethingWith(ch);
63
* }
64
*
65
* // iteration backwards: Original
66
* for (int i = s.length() -1; i >= 0; --i) {
67
* char ch = s.charAt(i);
68
* doSomethingWith(ch);
69
* }
70
*
71
* // iteration backwards: Changes for UTF-32
72
* int ch;
73
* for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
74
* ch = UTF16.charAt(s,i);
75
* doSomethingWith(ch);
76
* }
77
* </pre>
78
* <strong>Notes:</strong>
79
* <ul>
80
* <li>
81
* <strong>Naming:</strong> For clarity, High and Low surrogates are called
82
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
83
* sense of their ordering in a string. <code>offset16</code> and
84
* <code>offset32</code> are used to distinguish offsets to UTF-16
85
* boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
86
* used to contain UTF-32 characters, as opposed to <code>char16</code>,
87
* which is a UTF-16 code unit.
88
* </li>
89
* <li>
90
* <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
91
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
92
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
93
* back if and only if <code>bounds(string, offset16) != TRAIL</code>.
94
* </li>
95
* <li>
96
* <strong>Exceptions:</strong> The error checking will throw an exception
97
* if indices are out of bounds. Other than that, all methods will
98
* behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
99
* values are present. <code>UCharacter.isLegal()</code> can be used to check
100
* for validity if desired.
101
* </li>
102
* <li>
103
* <strong>Unmatched Surrogates:</strong> If the string contains unmatched
104
* surrogates, then these are counted as one UTF-32 value. This matches
105
* their iteration behavior, which is vital. It also matches common display
106
* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
107
* </li>
108
* <li>
109
* <strong>Optimization:</strong> The method implementations may need
110
* optimization if the compiler doesn't fold static final methods. Since
111
* surrogate pairs will form an exceedingly small percentage of all the text
112
* in the world, the singleton case should always be optimized for.
113
* </li>
114
* </ul>
115
* @author Mark Davis, with help from Markus Scherer
116
* @stable ICU 2.1
117
*/
118
119
public final class UTF16 {

    // public variables ---------------------------------------------------

    /**
     * The lowest Unicode code point value.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;

    /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;

    /**
     * The minimum value for supplementary code points.
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Trail surrogate minimum value.
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Lead surrogate maximum value.
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate maximum value.
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Surrogate minimum value (the same as the lead surrogate minimum).
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    // public methods -----------------------------------------------------

    /**
     * Extracts a single UTF-32 value from a string.
     * Used when iterating forwards or backwards (together with
     * <code>UTF16.getCharCount()</code>), as well as for random access.
     * If a validity check is required, use
     * <code>UCharacter.isLegal()</code> on the return value.
     * <p>If the char at <code>offset16</code> is one half of a surrogate
     * pair (either half), the supplementary code point formed by the pair is
     * returned. If the char is an unmatched surrogate, the surrogate itself
     * is returned.</p>
     * @param source string of UTF-16 code units
     * @param offset16 UTF-16 offset of the code unit to examine
     * @return the UTF-32 value of the code point that contains the char at
     *         offset16
     * @exception IndexOutOfBoundsException thrown if offset16 is out of
     *            bounds.
     * @stable ICU 2.1
     */
    public static int charAt(String source, int offset16) {
        char single = source.charAt(offset16);
        // Fast path: everything below the surrogate range is a complete
        // code point on its own.
        if (single < LEAD_SURROGATE_MIN_VALUE) {
            return single;
        }
        return _charAt(source, offset16, single);
    }

    /**
     * Slow path of {@link #charAt(String, int)} for code units at or above
     * the start of the surrogate range.
     * @param source string of UTF-16 code units
     * @param offset16 valid UTF-16 offset of <code>single</code> in source
     * @param single the code unit found at offset16
     * @return the code point containing <code>single</code>
     */
    private static int _charAt(String source, int offset16, char single) {
        if (single > TRAIL_SURROGATE_MAX_VALUE) {
            return single; // BMP character above the surrogate range
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // Lead surrogate: the partner, if any, is the next code unit.
            // offset16 is known to be valid, so next <= source.length().
            int next = offset16 + 1;
            if (next < source.length()) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else {
            // Trail surrogate: the partner, if any, is the previous code unit.
            int previous = offset16 - 1;
            if (previous >= 0) {
                char lead = source.charAt(previous);
                if (isLeadSurrogate(lead)) {
                    return Character.toCodePoint(lead, single);
                }
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Extracts a single UTF-32 value from a subarray.
     * Used when iterating forwards or backwards (together with
     * <code>UTF16.getCharCount()</code>), as well as for random access.
     * If a validity check is required, use
     * <code>UCharacter.isLegal()</code> on the return value.
     * <p>If the char retrieved is one half of a surrogate pair that lies
     * completely inside <code>[start, limit)</code>, the supplementary code
     * point is returned. Otherwise the char itself is returned; a surrogate
     * whose partner lies outside the subarray is treated as unmatched.</p>
     * @param source array of UTF-16 chars
     * @param start index of the first char of the subarray
     * @param limit index one past the last char of the subarray
     * @param offset16 UTF-16 offset relative to start
     * @return the UTF-32 value of the code point that contains the char at
     *         offset16
     * @exception ArrayIndexOutOfBoundsException thrown if offset16 is not
     *            within the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char[] source, int start, int limit,
                             int offset16) {
        offset16 += start;
        // If the addition overflowed, offset16 is negative and therefore
        // below start, so this check rejects that case as well.
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            int next = offset16 + 1;
            if (next >= limit) {
                return single; // lead surrogate is the last char in range
            }
            char trail = source[next];
            if (isTrailSurrogate(trail)) {
                return Character.toCodePoint(single, trail);
            }
        } else {
            // single is a trail surrogate
            if (offset16 == start) {
                return single; // nothing before it inside the subarray
            }
            char lead = source[offset16 - 1];
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use
     * <code>UCharacter.isLegal()</code> on char32 before calling.
     * @param char32 the input codepoint.
     * @return 2 if char32 is at or above the supplementary minimum,
     *         otherwise 1.
     * @stable ICU 2.1
     */
    public static int getCharCount(int char32) {
        return (char32 < SUPPLEMENTARY_MIN_VALUE) ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead or trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate.
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Returns the lead surrogate.
     * If a validity check is required, use
     * <code>UCharacter.isLegal()</code> on char32 before calling.
     * @param char32 the input character.
     * @return lead surrogate if getCharCount(char32) is 2; <br>
     *         and 0 otherwise (note: 0 is not a valid lead surrogate).
     * @stable ICU 2.1
     */
    public static char getLeadSurrogate(int char32) {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (LEAD_SURROGATE_OFFSET_
                    + (char32 >> LEAD_SURROGATE_SHIFT_));
        }
        return 0;
    }

    /**
     * Returns the trail surrogate.
     * If a validity check is required, use
     * <code>UCharacter.isLegal()</code> on char32 before calling.
     * @param char32 the input character.
     * @return the trail surrogate if getCharCount(char32) is 2; <br>
     *         otherwise char32 itself, narrowed to a char.
     * @stable ICU 2.1
     */
    public static char getTrailSurrogate(int char32) {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (TRAIL_SURROGATE_MIN_VALUE
                    + (char32 & TRAIL_SURROGATE_MASK_));
        }
        return (char) char32;
    }

    /**
     * Convenience method corresponding to String.valueOf(char). Returns a one
     * or two char string containing the UTF-32 value in UTF16 format.
     * @param char32 the input character.
     * @return string value of char32 in UTF16 format
     * @exception IllegalArgumentException thrown if char32 lies outside
     *            the range CODEPOINT_MIN_VALUE..CODEPOINT_MAX_VALUE.
     * @stable ICU 2.1
     */
    public static String valueOf(int char32) {
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint");
        }
        return toString(char32);
    }

    /**
     * Appends a single UTF-32 value to the end of a StringBuffer, as one
     * char for values below the supplementary minimum and as a lead/trail
     * surrogate pair otherwise.
     * @param target the buffer to append to
     * @param char32 value to append.
     * @return the updated StringBuffer (the same instance as target)
     * @exception IllegalArgumentException thrown when char32 does not lie
     *            within the range of the Unicode codepoints
     * @stable ICU 2.1
     */
    public static StringBuffer append(StringBuffer target, int char32) {
        // Check for irregular values
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
        }

        // Write the UTF-16 values
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            target.append(getLeadSurrogate(char32));
            target.append(getTrailSurrogate(char32));
        } else {
            target.append((char) char32);
        }
        return target;
    }

    //// for StringPrep
    /**
     * Shifts offset16 by the argument number of codepoints within a subarray.
     * A lead surrogate immediately followed by a trail surrogate, both inside
     * the subarray, counts as one codepoint; every other char counts as one
     * codepoint by itself.
     * @param source char array
     * @param start index of the first char of the subarray
     * @param limit index one past the last char of the subarray
     * @param offset16 UTF16 position to shift, relative to start
     * @param shift32 number of codepoints to shift; positive moves forwards,
     *        zero or negative moves backwards
     * @return new shifted offset16 relative to start
     * @exception StringIndexOutOfBoundsException if the subarray bounds are
     *            out of range, if offset16 lies outside the subarray, or if
     *            the subarray does not contain enough codepoints to shift by
     *            shift32.
     * @stable ICU 2.1
     */
    public static int moveCodePointOffset(char[] source, int start, int limit,
                                          int offset16, int shift32) {
        int size = source.length;
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        // Here 0 <= start <= limit <= size, so limit - start cannot overflow.
        // Comparing against the difference (rather than testing
        // start + offset16 > limit) keeps a huge offset16 from wrapping
        // around to a negative index and slipping past the check.
        if (offset16 < 0 || offset16 > limit - start) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        int result = start + offset16; // start <= result <= limit
        int count;
        if (shift32 > 0) {
            // Overflow-safe form of "shift32 + result > size".
            if (shift32 > size - result) {
                throw new StringIndexOutOfBoundsException(result);
            }
            count = shift32;
            while (result < limit && count > 0) {
                char ch = source[result];
                if (isLeadSurrogate(ch) && (result + 1 < limit)
                        && isTrailSurrogate(source[result + 1])) {
                    result++; // step over the lead half of a pair
                }
                count--;
                result++;
            }
        } else {
            // shift32 <= 0 and result >= 0, so this sum cannot overflow.
            if (result + shift32 < start) {
                throw new StringIndexOutOfBoundsException(result);
            }
            for (count = -shift32; count > 0; count--) {
                result--;
                if (result < start) {
                    break; // ran off the front; count stays non-zero
                }
                char ch = source[result];
                if (isTrailSurrogate(ch) && result > start
                        && isLeadSurrogate(source[result - 1])) {
                    result--; // step over the lead half of a pair
                }
            }
        }
        // A non-zero count means the subarray ended before the requested
        // number of codepoints could be consumed.
        if (count != 0) {
            throw new StringIndexOutOfBoundsException(shift32);
        }
        return result - start;
    }

    // private data members -------------------------------------------------

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Value added to (char32 &gt;&gt; LEAD_SURROGATE_SHIFT_) to obtain the
     * lead surrogate of a supplementary code point.
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
            LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    // private methods ------------------------------------------------------

    /**
     * <p>Converts argument code point and returns a String object representing
     * the code point's value in UTF16 format.</p>
     * <p>This method does not check for the validity of the codepoint, the
     * results are not guaranteed if an invalid codepoint is passed as
     * argument.</p>
     * <p>The result is a string whose length is 1 for non-supplementary code
     * points, 2 otherwise.</p>
     * @param ch code point
     * @return string representation of the code point
     */
    private static String toString(int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char) ch);
        }
        // Two code units are needed; build them directly instead of going
        // through a synchronized StringBuffer.
        char[] pair = { getLeadSurrogate(ch), getTrailSurrogate(ch) };
        return new String(pair);
    }
}
539
540