Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/make/src/classes/build/tools/generatebreakiteratordata/CharSet.java
32287 views
1
/*
2
* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
26
/*
27
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
28
* (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
29
*
30
* The original version of this source code and documentation
31
* is copyrighted and owned by Taligent, Inc., a wholly-owned
32
* subsidiary of IBM. These materials are provided under terms
33
* of a License Agreement between Taligent and Sun. This technology
34
* is protected by multiple US and International patents.
35
*
36
* This notice and attribution to Taligent may not be removed.
37
* Taligent is a registered trademark of Taligent, Inc.
38
*/
39
40
package build.tools.generatebreakiteratordata;
41
42
import java.util.Arrays;
43
import java.util.Hashtable;
44
45
/**
46
* An object representing a set of characters. (This is a "set" in the
47
* mathematical sense: an unduplicated list of characters on which set
48
* operations such as union and intersection can be performed.) The
49
* set information is stored in compressed, optimized form: The object
50
* contains an integer array with an even number of characters. Each
51
* pair of characters represents a range of characters contained in the set
52
* (a pair of the same character represents a single character). The
53
* characters are sorted in increasing order.
54
*/
55
class CharSet {
56
/**
57
* The structure containing the set information. The characters
58
* in this array are organized into pairs, each pair representing
59
* a range of characters contained in the set
60
*/
61
private int[] chars;
62
63
//==========================================================================
64
// parseString() and associated routines
65
//==========================================================================
66
/**
67
* A cache which is used to speed up parseString() whenever it is
68
* used to parse a description that has been parsed before
69
*/
70
private static Hashtable<String, CharSet> expressionCache = null;
71
72
/**
73
* Builds a CharSet based on a textual description. For the syntax of
74
* the description, see the documentation of RuleBasedBreakIterator.
75
* @see java.text.RuleBasedBreakIterator
76
*/
77
public static CharSet parseString(String s) {
78
CharSet result = null;
79
80
// if "s" is in the expression cache, pull the result out
81
// of the expresison cache
82
if (expressionCache != null) {
83
result = expressionCache.get(s);
84
}
85
86
// otherwise, use doParseString() to actually parse the string,
87
// and then add a corresponding entry to the expression cache
88
if (result == null) {
89
result = doParseString(s);
90
if (expressionCache == null) {
91
expressionCache = new Hashtable<>();
92
}
93
expressionCache.put(s, result);
94
}
95
result = (CharSet)(result.clone());
96
return result;
97
}
98
99
/**
100
* This function is used by parseString() to actually parse the string
101
*/
102
private static CharSet doParseString(String s) {
103
CharSet result = new CharSet();
104
int p = 0;
105
106
boolean haveDash = false;
107
boolean haveTilde = false;
108
boolean wIsReal = false;
109
int w = 0x0000;
110
111
// for each character in the description...
112
while (p < s.length()) {
113
int c = s.codePointAt(p);
114
115
// if it's an opening bracket...
116
if (c == '[') {
117
// flush the single-character cache
118
if (wIsReal) {
119
result.internalUnion(new CharSet(w));
120
}
121
122
// locate the matching closing bracket
123
int bracketLevel = 1;
124
int q = p + 1;
125
while (bracketLevel != 0) {
126
// if no matching bracket by end of string then...
127
if (q >= s.length()) {
128
throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
129
}
130
int ch = s.codePointAt(q);
131
switch (ch) {
132
case '\\': // need to step over next character
133
ch = s.codePointAt(++q);
134
break;
135
case '[':
136
++bracketLevel;
137
break;
138
case ']':
139
--bracketLevel;
140
break;
141
}
142
q += Character.charCount(ch);
143
}
144
--q;
145
146
// call parseString() recursively to parse the text inside
147
// the brackets, then either add or subtract the result from
148
// our running result depending on whether or not the []
149
// expresison was preceded by a ^
150
if (!haveTilde) {
151
result.internalUnion(CharSet.parseString(s.substring(p + 1, q)));
152
}
153
else {
154
result.internalDifference(CharSet.parseString(s.substring(p + 1, q)));
155
}
156
haveTilde = false;
157
haveDash = false;
158
wIsReal = false;
159
p = q + 1;
160
}
161
162
// if the character is a colon...
163
else if (c == ':') {
164
// flush the single-character cache
165
if (wIsReal) {
166
result.internalUnion(new CharSet(w));
167
}
168
169
// locate the matching colon (and throw an error if there
170
// isn't one)
171
int q = s.indexOf(':', p + 1);
172
if (q == -1) {
173
throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
174
}
175
176
// use charSetForCategory() to parse the text in the colons,
177
// and either add or substract the result from our running
178
// result depending on whether the :: expression was
179
// preceded by a ^
180
if (!haveTilde) {
181
result.internalUnion(charSetForCategory(s.substring(p + 1, q)));
182
}
183
else {
184
result.internalDifference(charSetForCategory(s.substring(p + 1, q)));
185
}
186
187
// reset everything and advance to the next character
188
haveTilde = false;
189
haveDash = false;
190
wIsReal = false;
191
p = q + 1;
192
}
193
194
// if the character is a dash, set an appropriate flag
195
else if (c == '-') {
196
if (wIsReal) {
197
haveDash = true;
198
}
199
++p;
200
}
201
202
// if the character is a caret, flush the single-character
203
// cache and set an appropriate flag. If the set is empty
204
// (i.e., if the expression begins with ^), invert the set
205
// (i.e., set it to include everything). The idea here is
206
// that a set that includes nothing but ^ expressions
207
// means "everything but these things".
208
else if (c == '^') {
209
if (wIsReal) {
210
result.internalUnion(new CharSet(w));
211
wIsReal = false;
212
}
213
haveTilde = true;
214
++p;
215
if (result.empty()) {
216
result.internalComplement();
217
}
218
}
219
220
// throw an exception on an illegal character
221
else if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c)
222
&& !Character.isDigit((char)c) && c != '\\') {
223
throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
224
}
225
226
// otherwise, we end up here...
227
else {
228
// on a backslash, advance to the next character
229
if (c == '\\') {
230
++p;
231
}
232
233
// if the preceding character was a dash, this character
234
// defines the end of a range. Add or subtract that range
235
// from the running result depending on whether or not it
236
// was preceded by a ^
237
if (haveDash) {
238
if (s.codePointAt(p) < w) {
239
throw new IllegalArgumentException("U+" +
240
Integer.toHexString(s.codePointAt(p))
241
+ " is less than U+" + Integer.toHexString(w) + ". Dash expressions "
242
+ "can't have their endpoints in reverse order.");
243
}
244
245
int ch = s.codePointAt(p);
246
if (!haveTilde) {
247
result.internalUnion(new CharSet(w, ch));
248
}
249
else {
250
result.internalDifference(new CharSet(w, ch));
251
}
252
p += Character.charCount(ch);
253
haveDash = false;
254
haveTilde = false;
255
wIsReal = false;
256
}
257
258
// if the preceding character was a caret, remove this character
259
// from the running result
260
else if (haveTilde) {
261
w = s.codePointAt(p);
262
result.internalDifference(new CharSet(w));
263
p += Character.charCount(w);
264
haveTilde = false;
265
wIsReal = false;
266
}
267
268
// otherwise, flush the single-character cache and then
269
// put this character into the cache
270
else if (wIsReal) {
271
result.internalUnion(new CharSet(w));
272
w = s.codePointAt(p);
273
p += Character.charCount(w);
274
wIsReal = true;
275
} else {
276
w = s.codePointAt(p);
277
p += Character.charCount(w);
278
wIsReal = true;
279
}
280
}
281
}
282
283
// finally, flush the single-character cache one last time
284
if (wIsReal) {
285
result.internalUnion(new CharSet(w));
286
}
287
288
return result;
289
}
290
291
/**
292
* Creates a CharSet containing all the characters in a particular
293
* Unicode category. The text is either a two-character code from
294
* the Unicode database or a single character that begins one or more
295
* two-character codes.
296
*/
297
private static CharSet charSetForCategory(String category) {
298
// throw an exception if we have anything other than one or two
299
// characters inside the colons
300
if (category.length() == 0 || category.length() >= 3) {
301
throw new IllegalArgumentException("Invalid character category: " + category);
302
}
303
304
// if we have two characters, search the category map for that code
305
// and either construct and return a CharSet from the data in the
306
// category map or throw an exception
307
if (category.length() == 2) {
308
for (int i = 0; i < CharacterCategory.categoryNames.length; i++) {
309
if (CharacterCategory.categoryNames[i].equals(category)) {
310
return new CharSet(CharacterCategory.getCategoryMap(i));
311
}
312
}
313
throw new IllegalArgumentException("Invalid character category: " + category);
314
}
315
316
// if we have one character, search the category map for codes beginning
317
// with that letter, and union together all of the matching sets that
318
// we find (or throw an exception if there are no matches)
319
else if (category.length() == 1) {
320
CharSet result = new CharSet();
321
for (int i = 0; i < CharacterCategory.categoryNames.length; i++) {
322
if (CharacterCategory.categoryNames[i].startsWith(category)) {
323
result = result.union(new CharSet(CharacterCategory.getCategoryMap(i)));
324
}
325
}
326
if (result.empty()) {
327
throw new IllegalArgumentException("Invalid character category: " + category);
328
}
329
else {
330
return result;
331
}
332
}
333
return new CharSet(); // should never get here, but to make the compiler happy...
334
}
335
336
/**
337
* Returns a copy of CharSet's expression cache and sets CharSet's
338
* expression cache to empty.
339
*/
340
public static Hashtable<String, CharSet> releaseExpressionCache() {
341
Hashtable<String, CharSet> result = expressionCache;
342
expressionCache = null;
343
return result;
344
}
345
346
//==========================================================================
347
// CharSet manipulation
348
//==========================================================================
349
/**
350
* Creates an empty CharSet.
351
*/
352
public CharSet() {
353
chars = new int[0];
354
}
355
356
/**
357
* Creates a CharSet containing a single character.
358
* @param c The character to put into the CharSet
359
*/
360
public CharSet(int c) {
361
chars = new int[2];
362
chars[0] = c;
363
chars[1] = c;
364
}
365
366
/**
367
* Creates a CharSet containing a range of characters.
368
* @param lo The lowest-numbered character to include in the range
369
* @param hi The highest-numbered character to include in the range
370
*/
371
public CharSet(int lo, int hi) {
372
chars = new int[2];
373
if (lo <= hi) {
374
chars[0] = lo;
375
chars[1] = hi;
376
}
377
else {
378
chars[0] = hi;
379
chars[1] = lo;
380
}
381
}
382
383
/**
384
* Creates a CharSet, initializing it from the internal storage
385
* of another CharSet (this function performs no error checking
386
* on "chars", so if it's malformed, undefined behavior will result)
387
*/
388
private CharSet(int[] chars) {
389
this.chars = chars;
390
}
391
392
/**
393
* Returns a CharSet representing the union of two CharSets.
394
*/
395
public CharSet union(CharSet that) {
396
return new CharSet(doUnion(that.chars));
397
}
398
399
/**
400
* Adds the characters in "that" to this CharSet
401
*/
402
private void internalUnion(CharSet that) {
403
chars = doUnion(that.chars);
404
}
405
406
/**
407
* The actual implementation of the union functions
408
*/
409
private int[] doUnion(int[] c2) {
410
int[] result = new int[chars.length+c2.length];
411
412
int i = 0;
413
int j = 0;
414
int index = 0;
415
416
// consider all the characters in both strings
417
while (i < chars.length && j < c2.length) {
418
int ub;
419
420
// the first character in the result is the lower of the
421
// starting characters of the two strings, and "ub" gets
422
// set to the upper bound of that range
423
if (chars[i] < c2[j]) {
424
result[index++] = chars[i];
425
ub = chars[++i];
426
}
427
else {
428
result[index++] = c2[j];
429
ub = c2[++j];
430
}
431
432
// for as long as one of our two pointers is pointing to a range's
433
// end point, or i is pointing to a character that is less than
434
// "ub" plus one (the "plus one" stitches touching ranges together)...
435
while (i % 2 == 1 ||
436
j % 2 == 1 ||
437
(i < chars.length && chars[i] <= ub + 1)) {
438
439
// advance i to the first character that is greater than
440
// "ub" plus one
441
while (i < chars.length && chars[i] <= ub + 1) {
442
++i;
443
}
444
445
// if i points to the endpoint of a range, update "ub"
446
// to that character, or if i points to the start of
447
// a range and the endpoint of the preceding range is
448
// greater than "ub", update "up" to _that_ character
449
if (i % 2 == 1) {
450
ub = chars[i];
451
}
452
else if (i > 0 && chars[i - 1] > ub) {
453
ub = chars[i - 1];
454
}
455
456
// now advance j to the first character that is greater
457
// that "ub" plus one
458
while (j < c2.length && c2[j] <= ub + 1) {
459
++j;
460
}
461
462
// if j points to the endpoint of a range, update "ub"
463
// to that character, or if j points to the start of
464
// a range and the endpoint of the preceding range is
465
// greater than "ub", update "up" to _that_ character
466
if (j % 2 == 1) {
467
ub = c2[j];
468
}
469
else if (j > 0 && c2[j - 1] > ub) {
470
ub = c2[j - 1];
471
}
472
}
473
// when we finally fall out of this loop, we will have stitched
474
// together a series of ranges that overlap or touch, i and j
475
// will both point to starting points of ranges, and "ub" will
476
// be the endpoint of the range we're working on. Write "ub"
477
// to the result
478
result[index++] = ub;
479
480
// loop back around to create the next range in the result
481
}
482
483
// we fall out to here when we've exhausted all the characters in
484
// one of the operands. We can append all of the remaining characters
485
// in the other operand without doing any extra work.
486
if (i < chars.length) {
487
for (int k = i; k < chars.length; k++) {
488
result[index++] = chars[k];
489
}
490
}
491
if (j < c2.length) {
492
for (int k = j; k < c2.length; k++) {
493
result[index++] = c2[k];
494
}
495
}
496
497
if (result.length > index) {
498
int[] tmpbuf = new int[index];
499
System.arraycopy(result, 0, tmpbuf, 0, index);
500
return tmpbuf;
501
}
502
503
return result;
504
}
505
506
/**
507
* Returns the intersection of two CharSets.
508
*/
509
public CharSet intersection(CharSet that) {
510
return new CharSet(doIntersection(that.chars));
511
}
512
513
/**
514
* Removes from this CharSet any characters that aren't also in "that"
515
*/
516
private void internalIntersection(CharSet that) {
517
chars = doIntersection(that.chars);
518
}
519
520
/**
521
* The internal implementation of the two intersection functions
522
*/
523
private int[] doIntersection(int[] c2) {
524
int[] result = new int[chars.length+c2.length];
525
526
int i = 0;
527
int j = 0;
528
int oldI;
529
int oldJ;
530
int index = 0;
531
532
// iterate until we've exhausted one of the operands
533
while (i < chars.length && j < c2.length) {
534
535
// advance j until it points to a character that is larger than
536
// the one i points to. If this is the beginning of a one-
537
// character range, advance j to point to the end
538
if (i < chars.length && i % 2 == 0) {
539
while (j < c2.length && c2[j] < chars[i]) {
540
++j;
541
}
542
if (j < c2.length && j % 2 == 0 && c2[j] == chars[i]) {
543
++j;
544
}
545
}
546
547
// if j points to the endpoint of a range, save the current
548
// value of i, then advance i until it reaches a character
549
// which is larger than the character pointed at
550
// by j. All of the characters we've advanced over (except
551
// the one currently pointed to by i) are added to the result
552
oldI = i;
553
while (j % 2 == 1 && i < chars.length && chars[i] <= c2[j]) {
554
++i;
555
}
556
for (int k = oldI; k < i; k++) {
557
result[index++] = chars[k];
558
}
559
560
// if i points to the endpoint of a range, save the current
561
// value of j, then advance j until it reaches a character
562
// which is larger than the character pointed at
563
// by i. All of the characters we've advanced over (except
564
// the one currently pointed to by i) are added to the result
565
oldJ = j;
566
while (i % 2 == 1 && j < c2.length && c2[j] <= chars[i]) {
567
++j;
568
}
569
for (int k = oldJ; k < j; k++) {
570
result[index++] = c2[k];
571
}
572
573
// advance i until it points to a character larger than j
574
// If it points at the beginning of a one-character range,
575
// advance it to the end of that range
576
if (j < c2.length && j % 2 == 0) {
577
while (i < chars.length && chars[i] < c2[j]) {
578
++i;
579
}
580
if (i < chars.length && i % 2 == 0 && c2[j] == chars[i]) {
581
++i;
582
}
583
}
584
}
585
586
if (result.length > index) {
587
int[] tmpbuf = new int[index];
588
System.arraycopy(result, 0, tmpbuf, 0, index);
589
return tmpbuf;
590
}
591
592
return result;
593
}
594
595
/**
596
* Returns a CharSet containing all the characters in "this" that
597
* aren't also in "that"
598
*/
599
public CharSet difference(CharSet that) {
600
return new CharSet(doIntersection(that.doComplement()));
601
}
602
603
/**
604
* Removes from "this" all the characters that are also in "that"
605
*/
606
private void internalDifference(CharSet that) {
607
chars = doIntersection(that.doComplement());
608
}
609
610
/**
611
* Returns a CharSet containing all the characters which are not
612
* in "this"
613
*/
614
public CharSet complement() {
615
return new CharSet(doComplement());
616
}
617
618
/**
619
* Complements "this". All the characters it contains are removed,
620
* and all the characters it doesn't contain are added.
621
*/
622
private void internalComplement() {
623
chars = doComplement();
624
}
625
626
/**
627
* The internal implementation function for the complement routines
628
*/
629
private int[] doComplement() {
630
// the complement of an empty CharSet is one containing everything
631
if (empty()) {
632
int[] result = new int[2];
633
result[0] = 0x0000;
634
result[1] = 0x10FFFF;
635
return result;
636
}
637
638
int[] result = new int[chars.length+2];
639
640
int i = 0;
641
int index = 0;
642
643
// the result begins with \u0000 unless the original CharSet does
644
if (chars[0] != 0x0000) {
645
result[index++] = 0x0000;
646
}
647
648
// walk through the characters in this CharSet. Append a pair of
649
// characters the first of which is one less than the first
650
// character we see and the second of which is one plus the second
651
// character we see (don't write the first character if it's \u0000,
652
// and don't write the second character if it's \uffff.
653
while (i < chars.length) {
654
if (chars[i] != 0x0000) {
655
result[index++] = chars[i] - 1;
656
}
657
if (chars[i + 1] != 0x10FFFF) {
658
result[index++] = chars[i + 1] + 1;
659
}
660
i += 2;
661
}
662
663
// add 0x10ffff to the end of the result, unless it was in
664
// the original set
665
if (chars[i-1] != 0x10FFFF) {
666
result[index++] = 0x10FFFF;
667
}
668
669
if (result.length > index) {
670
int[] tmpbuf = new int[index];
671
System.arraycopy(result, 0, tmpbuf, 0, index);
672
return tmpbuf;
673
}
674
675
return result;
676
}
677
678
/**
679
* Returns true if this CharSet contains the specified character
680
* @param c The character we're testing for set membership
681
*/
682
public boolean contains(int c) {
683
// search for the first range endpoint that is greater than or
684
// equal to c
685
int i = 1;
686
while (i < chars.length && chars[i] < c) {
687
i += 2;
688
}
689
690
// if we've walked off the end, we don't contain c
691
if (i == chars.length) {
692
return false;
693
}
694
695
// otherwise, we contain c if the beginning of the range is less
696
// than or equal to c
697
return chars[i - 1] <= c;
698
}
699
700
/**
701
* Returns true if "that" is another instance of CharSet containing
702
* the exact same characters as this one
703
*/
704
public boolean equals(Object that) {
705
return (that instanceof CharSet) && Arrays.equals(chars, ((CharSet)that).chars);
706
}
707
708
/**
709
* Returns the hash code for this set of characters
710
*/
711
public int hashCode() {
712
return Arrays.hashCode(chars);
713
}
714
715
/**
716
* Creates a new CharSet that is equal to this one
717
*/
718
public Object clone() {
719
return new CharSet(chars);
720
}
721
722
/**
723
* Returns true if this CharSet contains no characters
724
*/
725
public boolean empty() {
726
return chars.length == 0;
727
}
728
729
/**
730
* Returns a textual representation of this CharSet. If the result
731
* of calling this function is passed to CharSet.parseString(), it
732
* will produce another CharSet that is equal to this one.
733
*/
734
public String toString() {
735
StringBuffer result = new StringBuffer();
736
737
// the result begins with an opening bracket
738
result.append('[');
739
740
// iterate through the ranges in the CharSet
741
for (int i = 0; i < chars.length; i += 2) {
742
// for a range with the same beginning and ending point,
743
// output that character
744
if (chars[i] == chars[i + 1]) {
745
result.append("0x");
746
result.append(Integer.toHexString(chars[i]));
747
}
748
749
// otherwise, output the start and end points of the range
750
// separated by a dash
751
else {
752
result.append("0x");
753
result.append(Integer.toHexString(chars[i]));
754
result.append("-0x");
755
result.append(Integer.toHexString(chars[i + 1]));
756
}
757
}
758
759
// the result ends with a closing bracket
760
result.append(']');
761
return result.toString();
762
}
763
764
/**
765
* Returns an integer array representing the contents of this CharSet
766
* in the same form in which they're stored internally: as pairs
767
* of characters representing the start and end points of ranges
768
*/
769
public int[] getRanges() {
770
return chars;
771
}
772
773
/**
774
* Returns an Enumeration that will return the ranges of characters
775
* contained in this CharSet one at a time
776
*/
777
public Enumeration getChars() {
778
return new Enumeration(this);
779
}
780
781
//==========================================================================
782
// CharSet.Enumeration
783
//==========================================================================
784
785
/**
786
* An Enumeration that can be used to extract the character ranges
787
* from a CharSet one at a time
788
*/
789
public class Enumeration implements java.util.Enumeration<int[]> {
790
/**
791
* Initializes a CharSet.Enumeration
792
*/
793
Enumeration(CharSet cs) {
794
this.chars = cs.chars;
795
p = 0;
796
}
797
798
/**
799
* Returns true if the enumeration hasn't yet returned
800
* all the ranges in the CharSet
801
*/
802
public boolean hasMoreElements() {
803
return p < chars.length;
804
}
805
806
/**
807
* Returns the next range in the CarSet
808
*/
809
public int[] nextElement() {
810
int[] result = new int[2];
811
result[0] = chars[p++];
812
result[1] = chars[p++];
813
return result;
814
}
815
816
int p;
817
int[] chars;
818
}
819
}
820
821