Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/sun/text/normalizer/UnicodeSet.java
38830 views
1
/*
2
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
/*
26
*******************************************************************************
27
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
28
* *
29
* The original version of this source code and documentation is copyrighted *
30
* and owned by IBM, These materials are provided under terms of a License *
31
* Agreement between IBM and Sun. This technology is protected by multiple *
32
* US and International patents. This notice and attribution to IBM may not *
33
* to removed. *
34
*******************************************************************************
35
*/
36
37
package sun.text.normalizer;
38
39
import java.text.ParsePosition;
40
import java.util.Iterator;
41
import java.util.TreeSet;
42
43
/**
44
* A mutable set of Unicode characters and multicharacter strings. Objects of this class
45
* represent <em>character classes</em> used in regular expressions.
46
* A character specifies a subset of Unicode code points. Legal
47
* code points are U+0000 to U+10FFFF, inclusive.
48
*
49
* <p>The UnicodeSet class is not designed to be subclassed.
50
*
51
* <p><code>UnicodeSet</code> supports two APIs. The first is the
52
* <em>operand</em> API that allows the caller to modify the value of
53
* a <code>UnicodeSet</code> object. It conforms to Java 2's
54
* <code>java.util.Set</code> interface, although
55
* <code>UnicodeSet</code> does not actually implement that
56
* interface. All methods of <code>Set</code> are supported, with the
57
* modification that they take a character range or single character
58
* instead of an <code>Object</code>, and they take a
59
* <code>UnicodeSet</code> instead of a <code>Collection</code>. The
60
* operand API may be thought of in terms of boolean logic: a boolean
61
* OR is implemented by <code>add</code>, a boolean AND is implemented
62
* by <code>retain</code>, a boolean XOR is implemented by
63
* <code>complement</code> taking an argument, and a boolean NOT is
64
* implemented by <code>complement</code> with no argument. In terms
65
* of traditional set theory function names, <code>add</code> is a
66
* union, <code>retain</code> is an intersection, <code>remove</code>
67
* is an asymmetric difference, and <code>complement</code> with no
68
* argument is a set complement with respect to the superset range
69
* <code>MIN_VALUE-MAX_VALUE</code>
70
*
71
* <p>The second API is the
72
* <code>applyPattern()</code>/<code>toPattern()</code> API from the
73
* <code>java.text.Format</code>-derived classes. Unlike the
74
* methods that add characters, add categories, and control the logic
75
* of the set, the method <code>applyPattern()</code> sets all
76
* attributes of a <code>UnicodeSet</code> at once, based on a
77
* string pattern.
78
*
79
* <p><b>Pattern syntax</b></p>
80
*
81
* Patterns are accepted by the constructors and the
82
* <code>applyPattern()</code> methods and returned by the
83
* <code>toPattern()</code> method. These patterns follow a syntax
84
* similar to that employed by version 8 regular expression character
85
* classes. Here are some simple examples:
86
*
87
* <blockquote>
88
* <table>
89
* <tr align="top">
90
* <td nowrap valign="top" align="left"><code>[]</code></td>
91
* <td valign="top">No characters</td>
92
* </tr><tr align="top">
93
* <td nowrap valign="top" align="left"><code>[a]</code></td>
94
* <td valign="top">The character 'a'</td>
95
* </tr><tr align="top">
96
* <td nowrap valign="top" align="left"><code>[ae]</code></td>
97
* <td valign="top">The characters 'a' and 'e'</td>
98
* </tr>
99
* <tr>
100
* <td nowrap valign="top" align="left"><code>[a-e]</code></td>
101
* <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
102
* point order</td>
103
* </tr>
104
* <tr>
105
* <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
106
* <td valign="top">The character U+4E01</td>
107
* </tr>
108
* <tr>
109
* <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
110
* <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
111
* &quot;ac&quot;</td>
112
* </tr>
113
* <tr>
114
* <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
115
* <td valign="top">All characters in the general category Uppercase Letter</td>
116
* </tr>
117
* </table>
118
* </blockquote>
119
*
120
* Any character may be preceded by a backslash in order to remove any special
121
* meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
122
* ignored, unless they are escaped.
123
*
124
* <p>Property patterns specify a set of characters having a certain
125
* property as defined by the Unicode standard. Both the POSIX-like
126
* "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
127
* complete list of supported property patterns, see the User's Guide
128
* for UnicodeSet at
129
* <a href="http://www.icu-project.org/userguide/unicodeSet.html">
130
* http://www.icu-project.org/userguide/unicodeSet.html</a>.
131
* Actual determination of property data is defined by the underlying
132
* Unicode database as implemented by UCharacter.
133
*
134
* <p>Patterns specify individual characters, ranges of characters, and
135
* Unicode property sets. When elements are concatenated, they
136
* specify their union. To complement a set, place a '^' immediately
137
* after the opening '['. Property patterns are inverted by modifying
138
* their delimiters; "[:^foo]" and "\P{foo}". In any other location,
139
* '^' has no special meaning.
140
*
141
* <p>Ranges are indicated by placing two a '-' between two
142
* characters, as in "a-z". This specifies the range of all
143
* characters from the left to the right, in Unicode order. If the
144
* left character is greater than or equal to the
145
* right character it is a syntax error. If a '-' occurs as the first
146
* character after the opening '[' or '[^', or if it occurs as the
147
* last character before the closing ']', then it is taken as a
148
* literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
149
* set of three characters, 'a', 'b', and '-'.
150
*
151
* <p>Sets may be intersected using the '&' operator or the asymmetric
152
* set difference may be taken using the '-' operator, for example,
153
* "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
154
* with values less than 4096. Operators ('&' and '|') have equal
155
* precedence and bind left-to-right. Thus
156
* "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
157
* "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
158
* difference; intersection is commutative.
159
*
160
* <table>
161
* <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
162
* <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
163
* through 'z' and all letters in between, in Unicode order
164
* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
165
* all characters but 'a' through 'z',
166
* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
167
* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
168
* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
169
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
170
* <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
171
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
172
* <td>The asymmetric difference of sets specified by <em>pat1</em> and
173
* <em>pat2</em>
174
* <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
175
* <td>The set of characters having the specified
176
* Unicode property; in
177
* this case, Unicode uppercase letters
178
* <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
179
* <td>The set of characters <em>not</em> having the given
180
* Unicode property
181
* </table>
182
*
183
* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
184
*
185
* <p><b>Formal syntax</b></p>
186
*
187
* <blockquote>
188
* <table>
189
* <tr align="top">
190
* <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
191
* <td valign="top"><code>('[' '^'? item* ']') |
192
* property</code></td>
193
* </tr>
194
* <tr align="top">
195
* <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
196
* <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
197
* </code></td>
198
* </tr>
199
* <tr align="top">
200
* <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
201
* <td valign="top"><code>pattern | pattern-expr pattern |
202
* pattern-expr op pattern<br>
203
* </code></td>
204
* </tr>
205
* <tr align="top">
206
* <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
207
* <td valign="top"><code>'&amp;' | '-'<br>
208
* </code></td>
209
* </tr>
210
* <tr align="top">
211
* <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
212
* <td valign="top"><code>'[' | ']' | '-'<br>
213
* </code></td>
214
* </tr>
215
* <tr align="top">
216
* <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
217
* <td valign="top"><em>any character that is not</em><code> special<br>
218
* | ('\\' </code><em>any character</em><code>)<br>
219
* | ('&#92;u' hex hex hex hex)<br>
220
* </code></td>
221
* </tr>
222
* <tr align="top">
223
* <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
224
* <td valign="top"><em>any character for which
225
* </em><code>Character.digit(c, 16)</code><em>
226
* returns a non-negative result</em></td>
227
* </tr>
228
* <tr>
229
* <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
230
* <td valign="top"><em>a Unicode property set pattern</td>
231
* </tr>
232
* </table>
233
* <br>
234
* <table border="1">
235
* <tr>
236
* <td>Legend: <table>
237
* <tr>
238
* <td nowrap valign="top"><code>a := b</code></td>
239
* <td width="20" valign="top">&nbsp; </td>
240
* <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
241
* </tr>
242
* <tr>
243
* <td nowrap valign="top"><code>a?</code></td>
244
* <td valign="top"></td>
245
* <td valign="top">zero or one instance of <code>a</code><br>
246
* </td>
247
* </tr>
248
* <tr>
249
* <td nowrap valign="top"><code>a*</code></td>
250
* <td valign="top"></td>
251
* <td valign="top">one or more instances of <code>a</code><br>
252
* </td>
253
* </tr>
254
* <tr>
255
* <td nowrap valign="top"><code>a | b</code></td>
256
* <td valign="top"></td>
257
* <td valign="top">either <code>a</code> or <code>b</code><br>
258
* </td>
259
* </tr>
260
* <tr>
261
* <td nowrap valign="top"><code>'a'</code></td>
262
* <td valign="top"></td>
263
* <td valign="top">the literal string between the quotes </td>
264
* </tr>
265
* </table>
266
* </td>
267
* </tr>
268
* </table>
269
* </blockquote>
270
* <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
271
*
272
* @author Alan Liu
273
* @stable ICU 2.0
274
* @see UnicodeSetIterator
275
*/
276
public class UnicodeSet implements UnicodeMatcher {
277
278
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
279
private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
280
// 110000 for codepoints
281
282
/**
283
* Minimum value that can be stored in a UnicodeSet.
284
* @stable ICU 2.0
285
*/
286
public static final int MIN_VALUE = LOW;
287
288
/**
289
* Maximum value that can be stored in a UnicodeSet.
290
* @stable ICU 2.0
291
*/
292
public static final int MAX_VALUE = HIGH - 1;
293
294
private int len; // length used; list may be longer to minimize reallocs
295
private int[] list; // MUST be terminated with HIGH
296
private int[] rangeList; // internal buffer
297
private int[] buffer; // internal buffer
298
299
// NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
300
// is not private so that UnicodeSetIterator can get access
301
TreeSet<String> strings = new TreeSet<>();
302
303
/**
304
* The pattern representation of this set. This may not be the
305
* most economical pattern. It is the pattern supplied to
306
* applyPattern(), with variables substituted and whitespace
307
* removed. For sets constructed without applyPattern(), or
308
* modified using the non-pattern API, this string will be null,
309
* indicating that toPattern() must generate a pattern
310
* representation from the inversion list.
311
*/
312
private String pat = null;
313
314
private static final int START_EXTRA = 16; // initial storage. Must be >= 0
315
private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
316
317
/**
318
* A set of all characters _except_ the second through last characters of
319
* certain ranges. These ranges are ranges of characters whose
320
* properties are all exactly alike, e.g. CJK Ideographs from
321
* U+4E00 to U+9FA5.
322
*/
323
private static UnicodeSet INCLUSIONS[] = null;
324
325
//----------------------------------------------------------------
326
// Public API
327
//----------------------------------------------------------------
328
329
/**
330
* Constructs an empty set.
331
* @stable ICU 2.0
332
*/
333
public UnicodeSet() {
334
list = new int[1 + START_EXTRA];
335
list[len++] = HIGH;
336
}
337
338
/**
339
* Constructs a set containing the given range. If <code>end >
340
* start</code> then an empty set is created.
341
*
342
* @param start first character, inclusive, of range
343
* @param end last character, inclusive, of range
344
* @stable ICU 2.0
345
*/
346
public UnicodeSet(int start, int end) {
347
this();
348
complement(start, end);
349
}
350
351
/**
352
* Constructs a set from the given pattern. See the class description
353
* for the syntax of the pattern language. Whitespace is ignored.
354
* @param pattern a string specifying what characters are in the set
355
* @exception java.lang.IllegalArgumentException if the pattern contains
356
* a syntax error.
357
* @stable ICU 2.0
358
*/
359
public UnicodeSet(String pattern) {
360
this();
361
applyPattern(pattern, null, null, IGNORE_SPACE);
362
}
363
364
/**
365
* Make this object represent the same set as <code>other</code>.
366
* @param other a <code>UnicodeSet</code> whose value will be
367
* copied to this object
368
* @stable ICU 2.0
369
*/
370
public UnicodeSet set(UnicodeSet other) {
371
list = other.list.clone();
372
len = other.len;
373
pat = other.pat;
374
strings = (TreeSet)other.strings.clone();
375
return this;
376
}
377
378
/**
379
* Modifies this set to represent the set specified by the given pattern.
380
* See the class description for the syntax of the pattern language.
381
* Whitespace is ignored.
382
* @param pattern a string specifying what characters are in the set
383
* @exception java.lang.IllegalArgumentException if the pattern
384
* contains a syntax error.
385
* @stable ICU 2.0
386
*/
387
public final UnicodeSet applyPattern(String pattern) {
388
return applyPattern(pattern, null, null, IGNORE_SPACE);
389
}
390
391
/**
392
* Append the <code>toPattern()</code> representation of a
393
* string to the given <code>StringBuffer</code>.
394
*/
395
private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
396
for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
397
_appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
398
}
399
}
400
401
/**
402
* Append the <code>toPattern()</code> representation of a
403
* character to the given <code>StringBuffer</code>.
404
*/
405
private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
406
if (escapeUnprintable && Utility.isUnprintable(c)) {
407
// Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
408
// unprintable
409
if (Utility.escapeUnprintable(buf, c)) {
410
return;
411
}
412
}
413
// Okay to let ':' pass through
414
switch (c) {
415
case '[': // SET_OPEN:
416
case ']': // SET_CLOSE:
417
case '-': // HYPHEN:
418
case '^': // COMPLEMENT:
419
case '&': // INTERSECTION:
420
case '\\': //BACKSLASH:
421
case '{':
422
case '}':
423
case '$':
424
case ':':
425
buf.append('\\');
426
break;
427
default:
428
// Escape whitespace
429
if (UCharacterProperty.isRuleWhiteSpace(c)) {
430
buf.append('\\');
431
}
432
break;
433
}
434
UTF16.append(buf, c);
435
}
436
437
/**
438
* Append a string representation of this set to result. This will be
439
* a cleaned version of the string passed to applyPattern(), if there
440
* is one. Otherwise it will be generated.
441
*/
442
private StringBuffer _toPattern(StringBuffer result,
443
boolean escapeUnprintable) {
444
if (pat != null) {
445
int i;
446
int backslashCount = 0;
447
for (i=0; i<pat.length(); ) {
448
int c = UTF16.charAt(pat, i);
449
i += UTF16.getCharCount(c);
450
if (escapeUnprintable && Utility.isUnprintable(c)) {
451
// If the unprintable character is preceded by an odd
452
// number of backslashes, then it has been escaped.
453
// Before unescaping it, we delete the final
454
// backslash.
455
if ((backslashCount % 2) == 1) {
456
result.setLength(result.length() - 1);
457
}
458
Utility.escapeUnprintable(result, c);
459
backslashCount = 0;
460
} else {
461
UTF16.append(result, c);
462
if (c == '\\') {
463
++backslashCount;
464
} else {
465
backslashCount = 0;
466
}
467
}
468
}
469
return result;
470
}
471
472
return _generatePattern(result, escapeUnprintable, true);
473
}
474
475
/**
476
* Generate and append a string representation of this set to result.
477
* This does not use this.pat, the cleaned up copy of the string
478
* passed to applyPattern().
479
* @param includeStrings if false, doesn't include the strings.
480
* @stable ICU 3.8
481
*/
482
public StringBuffer _generatePattern(StringBuffer result,
483
boolean escapeUnprintable, boolean includeStrings) {
484
result.append('[');
485
486
int count = getRangeCount();
487
488
// If the set contains at least 2 intervals and includes both
489
// MIN_VALUE and MAX_VALUE, then the inverse representation will
490
// be more economical.
491
if (count > 1 &&
492
getRangeStart(0) == MIN_VALUE &&
493
getRangeEnd(count-1) == MAX_VALUE) {
494
495
// Emit the inverse
496
result.append('^');
497
498
for (int i = 1; i < count; ++i) {
499
int start = getRangeEnd(i-1)+1;
500
int end = getRangeStart(i)-1;
501
_appendToPat(result, start, escapeUnprintable);
502
if (start != end) {
503
if ((start+1) != end) {
504
result.append('-');
505
}
506
_appendToPat(result, end, escapeUnprintable);
507
}
508
}
509
}
510
511
// Default; emit the ranges as pairs
512
else {
513
for (int i = 0; i < count; ++i) {
514
int start = getRangeStart(i);
515
int end = getRangeEnd(i);
516
_appendToPat(result, start, escapeUnprintable);
517
if (start != end) {
518
if ((start+1) != end) {
519
result.append('-');
520
}
521
_appendToPat(result, end, escapeUnprintable);
522
}
523
}
524
}
525
526
if (includeStrings && strings.size() > 0) {
527
Iterator<String> it = strings.iterator();
528
while (it.hasNext()) {
529
result.append('{');
530
_appendToPat(result, it.next(), escapeUnprintable);
531
result.append('}');
532
}
533
}
534
return result.append(']');
535
}
536
537
// for internal use, after checkFrozen has been called
538
private UnicodeSet add_unchecked(int start, int end) {
539
if (start < MIN_VALUE || start > MAX_VALUE) {
540
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
541
}
542
if (end < MIN_VALUE || end > MAX_VALUE) {
543
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
544
}
545
if (start < end) {
546
add(range(start, end), 2, 0);
547
} else if (start == end) {
548
add(start);
549
}
550
return this;
551
}
552
553
/**
554
* Adds the specified character to this set if it is not already
555
* present. If this set already contains the specified character,
556
* the call leaves this set unchanged.
557
* @stable ICU 2.0
558
*/
559
public final UnicodeSet add(int c) {
560
return add_unchecked(c);
561
}
562
563
// for internal use only, after checkFrozen has been called
564
private final UnicodeSet add_unchecked(int c) {
565
if (c < MIN_VALUE || c > MAX_VALUE) {
566
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
567
}
568
569
// find smallest i such that c < list[i]
570
// if odd, then it is IN the set
571
// if even, then it is OUT of the set
572
int i = findCodePoint(c);
573
574
// already in set?
575
if ((i & 1) != 0) return this;
576
577
// HIGH is 0x110000
578
// assert(list[len-1] == HIGH);
579
580
// empty = [HIGH]
581
// [start_0, limit_0, start_1, limit_1, HIGH]
582
583
// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
584
// ^
585
// list[i]
586
587
// i == 0 means c is before the first range
588
589
if (c == list[i]-1) {
590
// c is before start of next range
591
list[i] = c;
592
// if we touched the HIGH mark, then add a new one
593
if (c == MAX_VALUE) {
594
ensureCapacity(len+1);
595
list[len++] = HIGH;
596
}
597
if (i > 0 && c == list[i-1]) {
598
// collapse adjacent ranges
599
600
// [..., start_k-1, c, c, limit_k, ..., HIGH]
601
// ^
602
// list[i]
603
System.arraycopy(list, i+1, list, i-1, len-i-1);
604
len -= 2;
605
}
606
}
607
608
else if (i > 0 && c == list[i-1]) {
609
// c is after end of prior range
610
list[i-1]++;
611
// no need to chcek for collapse here
612
}
613
614
else {
615
// At this point we know the new char is not adjacent to
616
// any existing ranges, and it is not 10FFFF.
617
618
619
// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
620
// ^
621
// list[i]
622
623
// [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
624
// ^
625
// list[i]
626
627
// Don't use ensureCapacity() to save on copying.
628
// NOTE: This has no measurable impact on performance,
629
// but it might help in some usage patterns.
630
if (len+2 > list.length) {
631
int[] temp = new int[len + 2 + GROW_EXTRA];
632
if (i != 0) System.arraycopy(list, 0, temp, 0, i);
633
System.arraycopy(list, i, temp, i+2, len-i);
634
list = temp;
635
} else {
636
System.arraycopy(list, i, list, i+2, len-i);
637
}
638
639
list[i] = c;
640
list[i+1] = c+1;
641
len += 2;
642
}
643
644
pat = null;
645
return this;
646
}
647
648
/**
649
* Adds the specified multicharacter to this set if it is not already
650
* present. If this set already contains the multicharacter,
651
* the call leaves this set unchanged.
652
* Thus "ch" => {"ch"}
653
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
654
* @param s the source string
655
* @return this object, for chaining
656
* @stable ICU 2.0
657
*/
658
public final UnicodeSet add(String s) {
659
int cp = getSingleCP(s);
660
if (cp < 0) {
661
strings.add(s);
662
pat = null;
663
} else {
664
add_unchecked(cp, cp);
665
}
666
return this;
667
}
668
669
/**
670
* @return a code point IF the string consists of a single one.
671
* otherwise returns -1.
672
* @param string to test
673
*/
674
private static int getSingleCP(String s) {
675
if (s.length() < 1) {
676
throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
677
}
678
if (s.length() > 2) return -1;
679
if (s.length() == 1) return s.charAt(0);
680
681
// at this point, len = 2
682
int cp = UTF16.charAt(s, 0);
683
if (cp > 0xFFFF) { // is surrogate pair
684
return cp;
685
}
686
return -1;
687
}
688
689
/**
690
* Complements the specified range in this set. Any character in
691
* the range will be removed if it is in this set, or will be
692
* added if it is not in this set. If <code>end > start</code>
693
* then an empty range is complemented, leaving the set unchanged.
694
*
695
* @param start first character, inclusive, of range to be removed
696
* from this set.
697
* @param end last character, inclusive, of range to be removed
698
* from this set.
699
* @stable ICU 2.0
700
*/
701
public UnicodeSet complement(int start, int end) {
702
if (start < MIN_VALUE || start > MAX_VALUE) {
703
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
704
}
705
if (end < MIN_VALUE || end > MAX_VALUE) {
706
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
707
}
708
if (start <= end) {
709
xor(range(start, end), 2, 0);
710
}
711
pat = null;
712
return this;
713
}
714
715
/**
716
* This is equivalent to
717
* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
718
* @stable ICU 2.0
719
*/
720
public UnicodeSet complement() {
721
if (list[0] == LOW) {
722
System.arraycopy(list, 1, list, 0, len-1);
723
--len;
724
} else {
725
ensureCapacity(len+1);
726
System.arraycopy(list, 0, list, 1, len);
727
list[0] = LOW;
728
++len;
729
}
730
pat = null;
731
return this;
732
}
733
734
/**
735
* Returns true if this set contains the given character.
736
* @param c character to be checked for containment
737
* @return true if the test condition is met
738
* @stable ICU 2.0
739
*/
740
public boolean contains(int c) {
741
if (c < MIN_VALUE || c > MAX_VALUE) {
742
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
743
}
744
745
/*
746
// Set i to the index of the start item greater than ch
747
// We know we will terminate without length test!
748
int i = -1;
749
while (true) {
750
if (c < list[++i]) break;
751
}
752
*/
753
754
int i = findCodePoint(c);
755
756
return ((i & 1) != 0); // return true if odd
757
}
758
759
/**
760
* Returns the smallest value i such that c < list[i]. Caller
761
* must ensure that c is a legal value or this method will enter
762
* an infinite loop. This method performs a binary search.
763
* @param c a character in the range MIN_VALUE..MAX_VALUE
764
* inclusive
765
* @return the smallest integer i in the range 0..len-1,
766
* inclusive, such that c < list[i]
767
*/
768
private final int findCodePoint(int c) {
769
/* Examples:
770
findCodePoint(c)
771
set list[] c=0 1 3 4 7 8
772
=== ============== ===========
773
[] [110000] 0 0 0 0 0 0
774
[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
775
[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
776
[:all:] [0, 110000] 1 1 1 1 1 1
777
*/
778
779
// Return the smallest i such that c < list[i]. Assume
780
// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
781
if (c < list[0]) return 0;
782
// High runner test. c is often after the last range, so an
783
// initial check for this condition pays off.
784
if (len >= 2 && c >= list[len-2]) return len-1;
785
int lo = 0;
786
int hi = len - 1;
787
// invariant: c >= list[lo]
788
// invariant: c < list[hi]
789
for (;;) {
790
int i = (lo + hi) >>> 1;
791
if (i == lo) return hi;
792
if (c < list[i]) {
793
hi = i;
794
} else {
795
lo = i;
796
}
797
}
798
}
799
800
/**
801
* Adds all of the elements in the specified set to this set if
802
* they're not already present. This operation effectively
803
* modifies this set so that its value is the <i>union</i> of the two
804
* sets. The behavior of this operation is unspecified if the specified
805
* collection is modified while the operation is in progress.
806
*
807
* @param c set whose elements are to be added to this set.
808
* @stable ICU 2.0
809
*/
810
public UnicodeSet addAll(UnicodeSet c) {
811
add(c.list, c.len, 0);
812
strings.addAll(c.strings);
813
return this;
814
}
815
816
/**
817
* Retains only the elements in this set that are contained in the
818
* specified set. In other words, removes from this set all of
819
* its elements that are not contained in the specified set. This
820
* operation effectively modifies this set so that its value is
821
* the <i>intersection</i> of the two sets.
822
*
823
* @param c set that defines which elements this set will retain.
824
* @stable ICU 2.0
825
*/
826
public UnicodeSet retainAll(UnicodeSet c) {
827
retain(c.list, c.len, 0);
828
strings.retainAll(c.strings);
829
return this;
830
}
831
832
/**
833
* Removes from this set all of its elements that are contained in the
834
* specified set. This operation effectively modifies this
835
* set so that its value is the <i>asymmetric set difference</i> of
836
* the two sets.
837
*
838
* @param c set that defines which elements will be removed from
839
* this set.
840
* @stable ICU 2.0
841
*/
842
public UnicodeSet removeAll(UnicodeSet c) {
843
retain(c.list, c.len, 2);
844
strings.removeAll(c.strings);
845
return this;
846
}
847
848
/**
849
* Removes all of the elements from this set. This set will be
850
* empty after this call returns.
851
* @stable ICU 2.0
852
*/
853
public UnicodeSet clear() {
854
list[0] = HIGH;
855
len = 1;
856
pat = null;
857
strings.clear();
858
return this;
859
}
860
861
/**
862
* Iteration method that returns the number of ranges contained in
863
* this set.
864
* @see #getRangeStart
865
* @see #getRangeEnd
866
* @stable ICU 2.0
867
*/
868
public int getRangeCount() {
869
return len/2;
870
}
871
872
/**
873
* Iteration method that returns the first character in the
874
* specified range of this set.
875
* @exception ArrayIndexOutOfBoundsException if index is outside
876
* the range <code>0..getRangeCount()-1</code>
877
* @see #getRangeCount
878
* @see #getRangeEnd
879
* @stable ICU 2.0
880
*/
881
public int getRangeStart(int index) {
882
return list[index*2];
883
}
884
885
/**
886
* Iteration method that returns the last character in the
887
* specified range of this set.
888
* @exception ArrayIndexOutOfBoundsException if index is outside
889
* the range <code>0..getRangeCount()-1</code>
890
* @see #getRangeStart
891
* @see #getRangeEnd
892
* @stable ICU 2.0
893
*/
894
public int getRangeEnd(int index) {
895
return (list[index*2 + 1] - 1);
896
}
897
898
//----------------------------------------------------------------
899
// Implementation: Pattern parsing
900
//----------------------------------------------------------------
901
902
/**
903
* Parses the given pattern, starting at the given position. The character
904
* at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
905
* Parsing continues until the corresponding closing ']'. If a syntax error
906
* is encountered between the opening and closing brace, the parse fails.
907
* Upon return from a successful parse, the ParsePosition is updated to
908
* point to the character following the closing ']', and an inversion
909
* list for the parsed pattern is returned. This method
910
* calls itself recursively to parse embedded subpatterns.
911
*
912
* @param pattern the string containing the pattern to be parsed. The
913
* portion of the string from pos.getIndex(), which must be a '[', to the
914
* corresponding closing ']', is parsed.
915
* @param pos upon entry, the position at which to being parsing. The
916
* character at pattern.charAt(pos.getIndex()) must be a '['. Upon return
917
* from a successful parse, pos.getIndex() is either the character after the
918
* closing ']' of the parsed pattern, or pattern.length() if the closing ']'
919
* is the last character of the pattern string.
920
* @return an inversion list for the parsed substring
921
* of <code>pattern</code>
922
* @exception java.lang.IllegalArgumentException if the parse fails.
923
*/
924
UnicodeSet applyPattern(String pattern,
925
ParsePosition pos,
926
SymbolTable symbols,
927
int options) {
928
929
// Need to build the pattern in a temporary string because
930
// _applyPattern calls add() etc., which set pat to empty.
931
boolean parsePositionWasNull = pos == null;
932
if (parsePositionWasNull) {
933
pos = new ParsePosition(0);
934
}
935
936
StringBuffer rebuiltPat = new StringBuffer();
937
RuleCharacterIterator chars =
938
new RuleCharacterIterator(pattern, symbols, pos);
939
applyPattern(chars, symbols, rebuiltPat, options);
940
if (chars.inVariable()) {
941
syntaxError(chars, "Extra chars in variable value");
942
}
943
pat = rebuiltPat.toString();
944
if (parsePositionWasNull) {
945
int i = pos.getIndex();
946
947
// Skip over trailing whitespace
948
if ((options & IGNORE_SPACE) != 0) {
949
i = Utility.skipWhitespace(pattern, i);
950
}
951
952
if (i != pattern.length()) {
953
throw new IllegalArgumentException("Parse of \"" + pattern +
954
"\" failed at " + i);
955
}
956
}
957
return this;
958
}
959
960
/**
961
* Parse the pattern from the given RuleCharacterIterator. The
962
* iterator is advanced over the parsed pattern.
963
* @param chars iterator over the pattern characters. Upon return
964
* it will be advanced to the first character after the parsed
965
* pattern, or the end of the iteration if all characters are
966
* parsed.
967
* @param symbols symbol table to use to parse and dereference
968
* variables, or null if none.
969
* @param rebuiltPat the pattern that was parsed, rebuilt or
970
* copied from the input pattern, as appropriate.
971
* @param options a bit mask of zero or more of the following:
972
* IGNORE_SPACE, CASE.
973
*/
974
void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
975
StringBuffer rebuiltPat, int options) {
976
// Syntax characters: [ ] ^ - & { }
977
978
// Recognized special forms for chars, sets: c-c s-s s&s
979
980
int opts = RuleCharacterIterator.PARSE_VARIABLES |
981
RuleCharacterIterator.PARSE_ESCAPES;
982
if ((options & IGNORE_SPACE) != 0) {
983
opts |= RuleCharacterIterator.SKIP_WHITESPACE;
984
}
985
986
StringBuffer patBuf = new StringBuffer(), buf = null;
987
boolean usePat = false;
988
UnicodeSet scratch = null;
989
Object backup = null;
990
991
// mode: 0=before [, 1=between [...], 2=after ]
992
// lastItem: 0=none, 1=char, 2=set
993
int lastItem = 0, lastChar = 0, mode = 0;
994
char op = 0;
995
996
boolean invert = false;
997
998
clear();
999
1000
while (mode != 2 && !chars.atEnd()) {
1001
if (false) {
1002
// Debugging assertion
1003
if (!((lastItem == 0 && op == 0) ||
1004
(lastItem == 1 && (op == 0 || op == '-')) ||
1005
(lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
1006
throw new IllegalArgumentException();
1007
}
1008
}
1009
1010
int c = 0;
1011
boolean literal = false;
1012
UnicodeSet nested = null;
1013
1014
// -------- Check for property pattern
1015
1016
// setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
1017
int setMode = 0;
1018
if (resemblesPropertyPattern(chars, opts)) {
1019
setMode = 2;
1020
}
1021
1022
// -------- Parse '[' of opening delimiter OR nested set.
1023
// If there is a nested set, use `setMode' to define how
1024
// the set should be parsed. If the '[' is part of the
1025
// opening delimiter for this pattern, parse special
1026
// strings "[", "[^", "[-", and "[^-". Check for stand-in
1027
// characters representing a nested set in the symbol
1028
// table.
1029
1030
else {
1031
// Prepare to backup if necessary
1032
backup = chars.getPos(backup);
1033
c = chars.next(opts);
1034
literal = chars.isEscaped();
1035
1036
if (c == '[' && !literal) {
1037
if (mode == 1) {
1038
chars.setPos(backup); // backup
1039
setMode = 1;
1040
} else {
1041
// Handle opening '[' delimiter
1042
mode = 1;
1043
patBuf.append('[');
1044
backup = chars.getPos(backup); // prepare to backup
1045
c = chars.next(opts);
1046
literal = chars.isEscaped();
1047
if (c == '^' && !literal) {
1048
invert = true;
1049
patBuf.append('^');
1050
backup = chars.getPos(backup); // prepare to backup
1051
c = chars.next(opts);
1052
literal = chars.isEscaped();
1053
}
1054
// Fall through to handle special leading '-';
1055
// otherwise restart loop for nested [], \p{}, etc.
1056
if (c == '-') {
1057
literal = true;
1058
// Fall through to handle literal '-' below
1059
} else {
1060
chars.setPos(backup); // backup
1061
continue;
1062
}
1063
}
1064
} else if (symbols != null) {
1065
UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
1066
if (m != null) {
1067
try {
1068
nested = (UnicodeSet) m;
1069
setMode = 3;
1070
} catch (ClassCastException e) {
1071
syntaxError(chars, "Syntax error");
1072
}
1073
}
1074
}
1075
}
1076
1077
// -------- Handle a nested set. This either is inline in
1078
// the pattern or represented by a stand-in that has
1079
// previously been parsed and was looked up in the symbol
1080
// table.
1081
1082
if (setMode != 0) {
1083
if (lastItem == 1) {
1084
if (op != 0) {
1085
syntaxError(chars, "Char expected after operator");
1086
}
1087
add_unchecked(lastChar, lastChar);
1088
_appendToPat(patBuf, lastChar, false);
1089
lastItem = op = 0;
1090
}
1091
1092
if (op == '-' || op == '&') {
1093
patBuf.append(op);
1094
}
1095
1096
if (nested == null) {
1097
if (scratch == null) scratch = new UnicodeSet();
1098
nested = scratch;
1099
}
1100
switch (setMode) {
1101
case 1:
1102
nested.applyPattern(chars, symbols, patBuf, options);
1103
break;
1104
case 2:
1105
chars.skipIgnored(opts);
1106
nested.applyPropertyPattern(chars, patBuf, symbols);
1107
break;
1108
case 3: // `nested' already parsed
1109
nested._toPattern(patBuf, false);
1110
break;
1111
}
1112
1113
usePat = true;
1114
1115
if (mode == 0) {
1116
// Entire pattern is a category; leave parse loop
1117
set(nested);
1118
mode = 2;
1119
break;
1120
}
1121
1122
switch (op) {
1123
case '-':
1124
removeAll(nested);
1125
break;
1126
case '&':
1127
retainAll(nested);
1128
break;
1129
case 0:
1130
addAll(nested);
1131
break;
1132
}
1133
1134
op = 0;
1135
lastItem = 2;
1136
1137
continue;
1138
}
1139
1140
if (mode == 0) {
1141
syntaxError(chars, "Missing '['");
1142
}
1143
1144
// -------- Parse special (syntax) characters. If the
1145
// current character is not special, or if it is escaped,
1146
// then fall through and handle it below.
1147
1148
if (!literal) {
1149
switch (c) {
1150
case ']':
1151
if (lastItem == 1) {
1152
add_unchecked(lastChar, lastChar);
1153
_appendToPat(patBuf, lastChar, false);
1154
}
1155
// Treat final trailing '-' as a literal
1156
if (op == '-') {
1157
add_unchecked(op, op);
1158
patBuf.append(op);
1159
} else if (op == '&') {
1160
syntaxError(chars, "Trailing '&'");
1161
}
1162
patBuf.append(']');
1163
mode = 2;
1164
continue;
1165
case '-':
1166
if (op == 0) {
1167
if (lastItem != 0) {
1168
op = (char) c;
1169
continue;
1170
} else {
1171
// Treat final trailing '-' as a literal
1172
add_unchecked(c, c);
1173
c = chars.next(opts);
1174
literal = chars.isEscaped();
1175
if (c == ']' && !literal) {
1176
patBuf.append("-]");
1177
mode = 2;
1178
continue;
1179
}
1180
}
1181
}
1182
syntaxError(chars, "'-' not after char or set");
1183
break;
1184
case '&':
1185
if (lastItem == 2 && op == 0) {
1186
op = (char) c;
1187
continue;
1188
}
1189
syntaxError(chars, "'&' not after set");
1190
break;
1191
case '^':
1192
syntaxError(chars, "'^' not after '['");
1193
break;
1194
case '{':
1195
if (op != 0) {
1196
syntaxError(chars, "Missing operand after operator");
1197
}
1198
if (lastItem == 1) {
1199
add_unchecked(lastChar, lastChar);
1200
_appendToPat(patBuf, lastChar, false);
1201
}
1202
lastItem = 0;
1203
if (buf == null) {
1204
buf = new StringBuffer();
1205
} else {
1206
buf.setLength(0);
1207
}
1208
boolean ok = false;
1209
while (!chars.atEnd()) {
1210
c = chars.next(opts);
1211
literal = chars.isEscaped();
1212
if (c == '}' && !literal) {
1213
ok = true;
1214
break;
1215
}
1216
UTF16.append(buf, c);
1217
}
1218
if (buf.length() < 1 || !ok) {
1219
syntaxError(chars, "Invalid multicharacter string");
1220
}
1221
// We have new string. Add it to set and continue;
1222
// we don't need to drop through to the further
1223
// processing
1224
add(buf.toString());
1225
patBuf.append('{');
1226
_appendToPat(patBuf, buf.toString(), false);
1227
patBuf.append('}');
1228
continue;
1229
case SymbolTable.SYMBOL_REF:
1230
// symbols nosymbols
1231
// [a-$] error error (ambiguous)
1232
// [a$] anchor anchor
1233
// [a-$x] var "x"* literal '$'
1234
// [a-$.] error literal '$'
1235
// *We won't get here in the case of var "x"
1236
backup = chars.getPos(backup);
1237
c = chars.next(opts);
1238
literal = chars.isEscaped();
1239
boolean anchor = (c == ']' && !literal);
1240
if (symbols == null && !anchor) {
1241
c = SymbolTable.SYMBOL_REF;
1242
chars.setPos(backup);
1243
break; // literal '$'
1244
}
1245
if (anchor && op == 0) {
1246
if (lastItem == 1) {
1247
add_unchecked(lastChar, lastChar);
1248
_appendToPat(patBuf, lastChar, false);
1249
}
1250
add_unchecked(UnicodeMatcher.ETHER);
1251
usePat = true;
1252
patBuf.append(SymbolTable.SYMBOL_REF).append(']');
1253
mode = 2;
1254
continue;
1255
}
1256
syntaxError(chars, "Unquoted '$'");
1257
break;
1258
default:
1259
break;
1260
}
1261
}
1262
1263
// -------- Parse literal characters. This includes both
1264
// escaped chars ("\u4E01") and non-syntax characters
1265
// ("a").
1266
1267
switch (lastItem) {
1268
case 0:
1269
lastItem = 1;
1270
lastChar = c;
1271
break;
1272
case 1:
1273
if (op == '-') {
1274
if (lastChar >= c) {
1275
// Don't allow redundant (a-a) or empty (b-a) ranges;
1276
// these are most likely typos.
1277
syntaxError(chars, "Invalid range");
1278
}
1279
add_unchecked(lastChar, c);
1280
_appendToPat(patBuf, lastChar, false);
1281
patBuf.append(op);
1282
_appendToPat(patBuf, c, false);
1283
lastItem = op = 0;
1284
} else {
1285
add_unchecked(lastChar, lastChar);
1286
_appendToPat(patBuf, lastChar, false);
1287
lastChar = c;
1288
}
1289
break;
1290
case 2:
1291
if (op != 0) {
1292
syntaxError(chars, "Set expected after operator");
1293
}
1294
lastChar = c;
1295
lastItem = 1;
1296
break;
1297
}
1298
}
1299
1300
if (mode != 2) {
1301
syntaxError(chars, "Missing ']'");
1302
}
1303
1304
chars.skipIgnored(opts);
1305
1306
if (invert) {
1307
complement();
1308
}
1309
1310
// Use the rebuilt pattern (pat) only if necessary. Prefer the
1311
// generated pattern.
1312
if (usePat) {
1313
rebuiltPat.append(patBuf.toString());
1314
} else {
1315
_generatePattern(rebuiltPat, false, true);
1316
}
1317
}
1318
1319
private static void syntaxError(RuleCharacterIterator chars, String msg) {
1320
throw new IllegalArgumentException("Error: " + msg + " at \"" +
1321
Utility.escape(chars.toString()) +
1322
'"');
1323
}
1324
1325
//----------------------------------------------------------------
1326
// Implementation: Utility methods
1327
//----------------------------------------------------------------
1328
1329
private void ensureCapacity(int newLen) {
1330
if (newLen <= list.length) return;
1331
int[] temp = new int[newLen + GROW_EXTRA];
1332
System.arraycopy(list, 0, temp, 0, len);
1333
list = temp;
1334
}
1335
1336
private void ensureBufferCapacity(int newLen) {
1337
if (buffer != null && newLen <= buffer.length) return;
1338
buffer = new int[newLen + GROW_EXTRA];
1339
}
1340
1341
/**
1342
* Assumes start <= end.
1343
*/
1344
private int[] range(int start, int end) {
1345
if (rangeList == null) {
1346
rangeList = new int[] { start, end+1, HIGH };
1347
} else {
1348
rangeList[0] = start;
1349
rangeList[1] = end+1;
1350
}
1351
return rangeList;
1352
}
1353
1354
//----------------------------------------------------------------
1355
// Implementation: Fundamental operations
1356
//----------------------------------------------------------------
1357
1358
// polarity = 0, 3 is normal: x xor y
1359
// polarity = 1, 2: x xor ~y == x === y
1360
1361
private UnicodeSet xor(int[] other, int otherLen, int polarity) {
1362
ensureBufferCapacity(len + otherLen);
1363
int i = 0, j = 0, k = 0;
1364
int a = list[i++];
1365
int b;
1366
if (polarity == 1 || polarity == 2) {
1367
b = LOW;
1368
if (other[j] == LOW) { // skip base if already LOW
1369
++j;
1370
b = other[j];
1371
}
1372
} else {
1373
b = other[j++];
1374
}
1375
// simplest of all the routines
1376
// sort the values, discarding identicals!
1377
while (true) {
1378
if (a < b) {
1379
buffer[k++] = a;
1380
a = list[i++];
1381
} else if (b < a) {
1382
buffer[k++] = b;
1383
b = other[j++];
1384
} else if (a != HIGH) { // at this point, a == b
1385
// discard both values!
1386
a = list[i++];
1387
b = other[j++];
1388
} else { // DONE!
1389
buffer[k++] = HIGH;
1390
len = k;
1391
break;
1392
}
1393
}
1394
// swap list and buffer
1395
int[] temp = list;
1396
list = buffer;
1397
buffer = temp;
1398
pat = null;
1399
return this;
1400
}
1401
1402
// polarity = 0 is normal: x union y
1403
// polarity = 2: x union ~y
1404
// polarity = 1: ~x union y
1405
// polarity = 3: ~x union ~y
1406
1407
private UnicodeSet add(int[] other, int otherLen, int polarity) {
1408
ensureBufferCapacity(len + otherLen);
1409
int i = 0, j = 0, k = 0;
1410
int a = list[i++];
1411
int b = other[j++];
1412
// change from xor is that we have to check overlapping pairs
1413
// polarity bit 1 means a is second, bit 2 means b is.
1414
main:
1415
while (true) {
1416
switch (polarity) {
1417
case 0: // both first; take lower if unequal
1418
if (a < b) { // take a
1419
// Back up over overlapping ranges in buffer[]
1420
if (k > 0 && a <= buffer[k-1]) {
1421
// Pick latter end value in buffer[] vs. list[]
1422
a = max(list[i], buffer[--k]);
1423
} else {
1424
// No overlap
1425
buffer[k++] = a;
1426
a = list[i];
1427
}
1428
i++; // Common if/else code factored out
1429
polarity ^= 1;
1430
} else if (b < a) { // take b
1431
if (k > 0 && b <= buffer[k-1]) {
1432
b = max(other[j], buffer[--k]);
1433
} else {
1434
buffer[k++] = b;
1435
b = other[j];
1436
}
1437
j++;
1438
polarity ^= 2;
1439
} else { // a == b, take a, drop b
1440
if (a == HIGH) break main;
1441
// This is symmetrical; it doesn't matter if
1442
// we backtrack with a or b. - liu
1443
if (k > 0 && a <= buffer[k-1]) {
1444
a = max(list[i], buffer[--k]);
1445
} else {
1446
// No overlap
1447
buffer[k++] = a;
1448
a = list[i];
1449
}
1450
i++;
1451
polarity ^= 1;
1452
b = other[j++]; polarity ^= 2;
1453
}
1454
break;
1455
case 3: // both second; take higher if unequal, and drop other
1456
if (b <= a) { // take a
1457
if (a == HIGH) break main;
1458
buffer[k++] = a;
1459
} else { // take b
1460
if (b == HIGH) break main;
1461
buffer[k++] = b;
1462
}
1463
a = list[i++]; polarity ^= 1; // factored common code
1464
b = other[j++]; polarity ^= 2;
1465
break;
1466
case 1: // a second, b first; if b < a, overlap
1467
if (a < b) { // no overlap, take a
1468
buffer[k++] = a; a = list[i++]; polarity ^= 1;
1469
} else if (b < a) { // OVERLAP, drop b
1470
b = other[j++]; polarity ^= 2;
1471
} else { // a == b, drop both!
1472
if (a == HIGH) break main;
1473
a = list[i++]; polarity ^= 1;
1474
b = other[j++]; polarity ^= 2;
1475
}
1476
break;
1477
case 2: // a first, b second; if a < b, overlap
1478
if (b < a) { // no overlap, take b
1479
buffer[k++] = b; b = other[j++]; polarity ^= 2;
1480
} else if (a < b) { // OVERLAP, drop a
1481
a = list[i++]; polarity ^= 1;
1482
} else { // a == b, drop both!
1483
if (a == HIGH) break main;
1484
a = list[i++]; polarity ^= 1;
1485
b = other[j++]; polarity ^= 2;
1486
}
1487
break;
1488
}
1489
}
1490
buffer[k++] = HIGH; // terminate
1491
len = k;
1492
// swap list and buffer
1493
int[] temp = list;
1494
list = buffer;
1495
buffer = temp;
1496
pat = null;
1497
return this;
1498
}
1499
1500
// polarity = 0 is normal: x intersect y
1501
// polarity = 2: x intersect ~y == set-minus
1502
// polarity = 1: ~x intersect y
1503
// polarity = 3: ~x intersect ~y
1504
1505
private UnicodeSet retain(int[] other, int otherLen, int polarity) {
1506
ensureBufferCapacity(len + otherLen);
1507
int i = 0, j = 0, k = 0;
1508
int a = list[i++];
1509
int b = other[j++];
1510
// change from xor is that we have to check overlapping pairs
1511
// polarity bit 1 means a is second, bit 2 means b is.
1512
main:
1513
while (true) {
1514
switch (polarity) {
1515
case 0: // both first; drop the smaller
1516
if (a < b) { // drop a
1517
a = list[i++]; polarity ^= 1;
1518
} else if (b < a) { // drop b
1519
b = other[j++]; polarity ^= 2;
1520
} else { // a == b, take one, drop other
1521
if (a == HIGH) break main;
1522
buffer[k++] = a; a = list[i++]; polarity ^= 1;
1523
b = other[j++]; polarity ^= 2;
1524
}
1525
break;
1526
case 3: // both second; take lower if unequal
1527
if (a < b) { // take a
1528
buffer[k++] = a; a = list[i++]; polarity ^= 1;
1529
} else if (b < a) { // take b
1530
buffer[k++] = b; b = other[j++]; polarity ^= 2;
1531
} else { // a == b, take one, drop other
1532
if (a == HIGH) break main;
1533
buffer[k++] = a; a = list[i++]; polarity ^= 1;
1534
b = other[j++]; polarity ^= 2;
1535
}
1536
break;
1537
case 1: // a second, b first;
1538
if (a < b) { // NO OVERLAP, drop a
1539
a = list[i++]; polarity ^= 1;
1540
} else if (b < a) { // OVERLAP, take b
1541
buffer[k++] = b; b = other[j++]; polarity ^= 2;
1542
} else { // a == b, drop both!
1543
if (a == HIGH) break main;
1544
a = list[i++]; polarity ^= 1;
1545
b = other[j++]; polarity ^= 2;
1546
}
1547
break;
1548
case 2: // a first, b second; if a < b, overlap
1549
if (b < a) { // no overlap, drop b
1550
b = other[j++]; polarity ^= 2;
1551
} else if (a < b) { // OVERLAP, take a
1552
buffer[k++] = a; a = list[i++]; polarity ^= 1;
1553
} else { // a == b, drop both!
1554
if (a == HIGH) break main;
1555
a = list[i++]; polarity ^= 1;
1556
b = other[j++]; polarity ^= 2;
1557
}
1558
break;
1559
}
1560
}
1561
buffer[k++] = HIGH; // terminate
1562
len = k;
1563
// swap list and buffer
1564
int[] temp = list;
1565
list = buffer;
1566
buffer = temp;
1567
pat = null;
1568
return this;
1569
}
1570
1571
private static final int max(int a, int b) {
1572
return (a > b) ? a : b;
1573
}
1574
1575
//----------------------------------------------------------------
1576
// Generic filter-based scanning code
1577
//----------------------------------------------------------------
1578
1579
private static interface Filter {
1580
boolean contains(int codePoint);
1581
}
1582
1583
// VersionInfo for unassigned characters
1584
static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
1585
1586
private static class VersionFilter implements Filter {
1587
VersionInfo version;
1588
1589
VersionFilter(VersionInfo version) { this.version = version; }
1590
1591
public boolean contains(int ch) {
1592
VersionInfo v = UCharacter.getAge(ch);
1593
// Reference comparison ok; VersionInfo caches and reuses
1594
// unique objects.
1595
return v != NO_VERSION &&
1596
v.compareTo(version) <= 0;
1597
}
1598
}
1599
1600
private static synchronized UnicodeSet getInclusions(int src) {
1601
if (INCLUSIONS == null) {
1602
INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
1603
}
1604
if(INCLUSIONS[src] == null) {
1605
UnicodeSet incl = new UnicodeSet();
1606
switch(src) {
1607
case UCharacterProperty.SRC_PROPSVEC:
1608
UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
1609
break;
1610
default:
1611
throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
1612
}
1613
INCLUSIONS[src] = incl;
1614
}
1615
return INCLUSIONS[src];
1616
}
1617
1618
/**
1619
* Generic filter-based scanning code for UCD property UnicodeSets.
1620
*/
1621
private UnicodeSet applyFilter(Filter filter, int src) {
1622
// Walk through all Unicode characters, noting the start
1623
// and end of each range for which filter.contain(c) is
1624
// true. Add each range to a set.
1625
//
1626
// To improve performance, use the INCLUSIONS set, which
1627
// encodes information about character ranges that are known
1628
// to have identical properties, such as the CJK Ideographs
1629
// from U+4E00 to U+9FA5. INCLUSIONS contains all characters
1630
// except the first characters of such ranges.
1631
//
1632
// TODO Where possible, instead of scanning over code points,
1633
// use internal property data to initialize UnicodeSets for
1634
// those properties. Scanning code points is slow.
1635
1636
clear();
1637
1638
int startHasProperty = -1;
1639
UnicodeSet inclusions = getInclusions(src);
1640
int limitRange = inclusions.getRangeCount();
1641
1642
for (int j=0; j<limitRange; ++j) {
1643
// get current range
1644
int start = inclusions.getRangeStart(j);
1645
int end = inclusions.getRangeEnd(j);
1646
1647
// for all the code points in the range, process
1648
for (int ch = start; ch <= end; ++ch) {
1649
// only add to the unicodeset on inflection points --
1650
// where the hasProperty value changes to false
1651
if (filter.contains(ch)) {
1652
if (startHasProperty < 0) {
1653
startHasProperty = ch;
1654
}
1655
} else if (startHasProperty >= 0) {
1656
add_unchecked(startHasProperty, ch-1);
1657
startHasProperty = -1;
1658
}
1659
}
1660
}
1661
if (startHasProperty >= 0) {
1662
add_unchecked(startHasProperty, 0x10FFFF);
1663
}
1664
1665
return this;
1666
}
1667
1668
/**
1669
* Remove leading and trailing rule white space and compress
1670
* internal rule white space to a single space character.
1671
*
1672
* @see UCharacterProperty#isRuleWhiteSpace
1673
*/
1674
private static String mungeCharName(String source) {
1675
StringBuffer buf = new StringBuffer();
1676
for (int i=0; i<source.length(); ) {
1677
int ch = UTF16.charAt(source, i);
1678
i += UTF16.getCharCount(ch);
1679
if (UCharacterProperty.isRuleWhiteSpace(ch)) {
1680
if (buf.length() == 0 ||
1681
buf.charAt(buf.length() - 1) == ' ') {
1682
continue;
1683
}
1684
ch = ' '; // convert to ' '
1685
}
1686
UTF16.append(buf, ch);
1687
}
1688
if (buf.length() != 0 &&
1689
buf.charAt(buf.length() - 1) == ' ') {
1690
buf.setLength(buf.length() - 1);
1691
}
1692
return buf.toString();
1693
}
1694
1695
/**
1696
* Modifies this set to contain those code points which have the
1697
* given value for the given property. Prior contents of this
1698
* set are lost.
1699
* @param propertyAlias
1700
* @param valueAlias
1701
* @param symbols if not null, then symbols are first called to see if a property
1702
* is available. If true, then everything else is skipped.
1703
* @return this set
1704
* @stable ICU 3.2
1705
*/
1706
public UnicodeSet applyPropertyAlias(String propertyAlias,
1707
String valueAlias, SymbolTable symbols) {
1708
if (valueAlias.length() > 0) {
1709
if (propertyAlias.equals("Age")) {
1710
// Must munge name, since
1711
// VersionInfo.getInstance() does not do
1712
// 'loose' matching.
1713
VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
1714
applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
1715
return this;
1716
}
1717
}
1718
throw new IllegalArgumentException("Unsupported property: " + propertyAlias);
1719
}
1720
1721
/**
1722
* Return true if the given iterator appears to point at a
1723
* property pattern. Regardless of the result, return with the
1724
* iterator unchanged.
1725
* @param chars iterator over the pattern characters. Upon return
1726
* it will be unchanged.
1727
* @param iterOpts RuleCharacterIterator options
1728
*/
1729
private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
1730
int iterOpts) {
1731
boolean result = false;
1732
iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
1733
Object pos = chars.getPos(null);
1734
int c = chars.next(iterOpts);
1735
if (c == '[' || c == '\\') {
1736
int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
1737
result = (c == '[') ? (d == ':') :
1738
(d == 'N' || d == 'p' || d == 'P');
1739
}
1740
chars.setPos(pos);
1741
return result;
1742
}
1743
1744
/**
1745
* Parse the given property pattern at the given parse position.
1746
* @param symbols TODO
1747
*/
1748
private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
1749
int pos = ppos.getIndex();
1750
1751
// On entry, ppos should point to one of the following locations:
1752
1753
// Minimum length is 5 characters, e.g. \p{L}
1754
if ((pos+5) > pattern.length()) {
1755
return null;
1756
}
1757
1758
boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1759
boolean isName = false; // true for \N{pat}, o/w false
1760
boolean invert = false;
1761
1762
// Look for an opening [:, [:^, \p, or \P
1763
if (pattern.regionMatches(pos, "[:", 0, 2)) {
1764
posix = true;
1765
pos = Utility.skipWhitespace(pattern, pos+2);
1766
if (pos < pattern.length() && pattern.charAt(pos) == '^') {
1767
++pos;
1768
invert = true;
1769
}
1770
} else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
1771
pattern.regionMatches(pos, "\\N", 0, 2)) {
1772
char c = pattern.charAt(pos+1);
1773
invert = (c == 'P');
1774
isName = (c == 'N');
1775
pos = Utility.skipWhitespace(pattern, pos+2);
1776
if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
1777
// Syntax error; "\p" or "\P" not followed by "{"
1778
return null;
1779
}
1780
} else {
1781
// Open delimiter not seen
1782
return null;
1783
}
1784
1785
// Look for the matching close delimiter, either :] or }
1786
int close = pattern.indexOf(posix ? ":]" : "}", pos);
1787
if (close < 0) {
1788
// Syntax error; close delimiter missing
1789
return null;
1790
}
1791
1792
// Look for an '=' sign. If this is present, we will parse a
1793
// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1794
// pattern.
1795
int equals = pattern.indexOf('=', pos);
1796
String propName, valueName;
1797
if (equals >= 0 && equals < close && !isName) {
1798
// Equals seen; parse medium/long pattern
1799
propName = pattern.substring(pos, equals);
1800
valueName = pattern.substring(equals+1, close);
1801
}
1802
1803
else {
1804
// Handle case where no '=' is seen, and \N{}
1805
propName = pattern.substring(pos, close);
1806
valueName = "";
1807
1808
// Handle \N{name}
1809
if (isName) {
1810
// This is a little inefficient since it means we have to
1811
// parse "na" back to UProperty.NAME even though we already
1812
// know it's UProperty.NAME. If we refactor the API to
1813
// support args of (int, String) then we can remove
1814
// "na" and make this a little more efficient.
1815
valueName = propName;
1816
propName = "na";
1817
}
1818
}
1819
1820
applyPropertyAlias(propName, valueName, symbols);
1821
1822
if (invert) {
1823
complement();
1824
}
1825
1826
// Move to the limit position after the close delimiter
1827
ppos.setIndex(close + (posix ? 2 : 1));
1828
1829
return this;
1830
}
1831
1832
/**
1833
* Parse a property pattern.
1834
* @param chars iterator over the pattern characters. Upon return
1835
* it will be advanced to the first character after the parsed
1836
* pattern, or the end of the iteration if all characters are
1837
* parsed.
1838
* @param rebuiltPat the pattern that was parsed, rebuilt or
1839
* copied from the input pattern, as appropriate.
1840
* @param symbols TODO
1841
*/
1842
private void applyPropertyPattern(RuleCharacterIterator chars,
1843
StringBuffer rebuiltPat, SymbolTable symbols) {
1844
String patStr = chars.lookahead();
1845
ParsePosition pos = new ParsePosition(0);
1846
applyPropertyPattern(patStr, pos, symbols);
1847
if (pos.getIndex() == 0) {
1848
syntaxError(chars, "Invalid property pattern");
1849
}
1850
chars.jumpahead(pos.getIndex());
1851
rebuiltPat.append(patStr.substring(0, pos.getIndex()));
1852
}
1853
1854
//----------------------------------------------------------------
1855
// Case folding API
1856
//----------------------------------------------------------------
1857
1858
/**
1859
* Bitmask for constructor and applyPattern() indicating that
1860
* white space should be ignored. If set, ignore characters for
1861
* which UCharacterProperty.isRuleWhiteSpace() returns true,
1862
* unless they are quoted or escaped. This may be ORed together
1863
* with other selectors.
1864
* @stable ICU 3.8
1865
*/
1866
public static final int IGNORE_SPACE = 1;
1867
1868
}
1869
1870
1871