Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/java/text/BreakIterator.java
38829 views
1
/*
2
* Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation. Oracle designates this
8
* particular file as subject to the "Classpath" exception as provided
9
* by Oracle in the LICENSE file that accompanied this code.
10
*
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
16
*
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
*
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
23
* questions.
24
*/
25
26
/*
27
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
28
* (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
29
*
30
* The original version of this source code and documentation
31
* is copyrighted and owned by Taligent, Inc., a wholly-owned
32
* subsidiary of IBM. These materials are provided under terms
33
* of a License Agreement between Taligent and Sun. This technology
34
* is protected by multiple US and International patents.
35
*
36
* This notice and attribution to Taligent may not be removed.
37
* Taligent is a registered trademark of Taligent, Inc.
38
*
39
*/
40
41
package java.text;
42
43
import java.lang.ref.SoftReference;
44
import java.text.spi.BreakIteratorProvider;
45
import java.util.Locale;
46
import sun.util.locale.provider.LocaleProviderAdapter;
47
import sun.util.locale.provider.LocaleServiceProviderPool;
48
49
50
/**
51
* The <code>BreakIterator</code> class implements methods for finding
52
* the location of boundaries in text. Instances of <code>BreakIterator</code>
53
* maintain a current position and scan over text
54
* returning the index of characters where boundaries occur.
55
* Internally, <code>BreakIterator</code> scans text using a
56
* <code>CharacterIterator</code>, and is thus able to scan text held
57
* by any object implementing that protocol. A <code>StringCharacterIterator</code>
58
* is used to scan <code>String</code> objects passed to <code>setText</code>.
59
*
60
* <p>
61
* You use the factory methods provided by this class to create
62
* instances of various types of break iterators. In particular,
63
* use <code>getWordInstance</code>, <code>getLineInstance</code>,
64
* <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
65
* to create <code>BreakIterator</code>s that perform
66
* word, line, sentence, and character boundary analysis respectively.
67
* A single <code>BreakIterator</code> can work only on one unit
68
* (word, line, sentence, and so on). You must use a different iterator
69
* for each unit boundary analysis you wish to perform.
70
*
71
* <p><a name="line"></a>
72
* Line boundary analysis determines where a text string can be
73
* broken when line-wrapping. The mechanism correctly handles
74
* punctuation and hyphenated words. Actual line breaking needs
75
* to also consider the available line width and is handled by
76
* higher-level software.
77
*
78
* <p><a name="sentence"></a>
79
* Sentence boundary analysis allows selection with correct interpretation
80
* of periods within numbers and abbreviations, and trailing punctuation
81
* marks such as quotation marks and parentheses.
82
*
83
* <p><a name="word"></a>
84
* Word boundary analysis is used by search and replace functions, as
85
* well as within text editing applications that allow the user to
86
* select words with a double click. Word selection provides correct
87
* interpretation of punctuation marks within and following
88
* words. Characters that are not part of a word, such as symbols
89
* or punctuation marks, have word-breaks on both sides.
90
*
91
* <p><a name="character"></a>
92
* Character boundary analysis allows users to interact with characters
93
* as they expect to, for example, when moving the cursor through a text
94
* string. Character boundary analysis provides correct navigation
95
* through character strings, regardless of how the character is stored.
96
* The boundaries returned may be those of supplementary characters,
97
* combining character sequences, or ligature clusters.
98
* For example, an accented character might be stored as a base character
99
* and a diacritical mark. What users consider to be a character can
100
* differ between languages.
101
*
102
* <p>
103
* The <code>BreakIterator</code> instances returned by the factory methods
104
* of this class are intended for use with natural languages only, not for
105
* programming language text. It is however possible to define subclasses
106
* that tokenize a programming language.
107
*
108
* <P>
109
* <strong>Examples</strong>:<P>
110
* Creating and using text boundaries:
111
* <blockquote>
112
* <pre>
113
* public static void main(String args[]) {
114
* if (args.length == 1) {
115
* String stringToExamine = args[0];
116
* //print each word in order
117
* BreakIterator boundary = BreakIterator.getWordInstance();
118
* boundary.setText(stringToExamine);
119
* printEachForward(boundary, stringToExamine);
120
* //print each sentence in reverse order
121
* boundary = BreakIterator.getSentenceInstance(Locale.US);
122
* boundary.setText(stringToExamine);
123
* printEachBackward(boundary, stringToExamine);
124
* printFirst(boundary, stringToExamine);
125
* printLast(boundary, stringToExamine);
126
* }
127
* }
128
* </pre>
129
* </blockquote>
130
*
131
* Print each element in order:
132
* <blockquote>
133
* <pre>
134
* public static void printEachForward(BreakIterator boundary, String source) {
135
* int start = boundary.first();
136
* for (int end = boundary.next();
137
* end != BreakIterator.DONE;
138
* start = end, end = boundary.next()) {
139
* System.out.println(source.substring(start,end));
140
* }
141
* }
142
* </pre>
143
* </blockquote>
144
*
145
* Print each element in reverse order:
146
* <blockquote>
147
* <pre>
148
* public static void printEachBackward(BreakIterator boundary, String source) {
149
* int end = boundary.last();
150
* for (int start = boundary.previous();
151
* start != BreakIterator.DONE;
152
* end = start, start = boundary.previous()) {
153
* System.out.println(source.substring(start,end));
154
* }
155
* }
156
* </pre>
157
* </blockquote>
158
*
159
* Print first element:
160
* <blockquote>
161
* <pre>
162
* public static void printFirst(BreakIterator boundary, String source) {
163
* int start = boundary.first();
164
* int end = boundary.next();
165
* System.out.println(source.substring(start,end));
166
* }
167
* </pre>
168
* </blockquote>
169
*
170
* Print last element:
171
* <blockquote>
172
* <pre>
173
* public static void printLast(BreakIterator boundary, String source) {
174
* int end = boundary.last();
175
* int start = boundary.previous();
176
* System.out.println(source.substring(start,end));
177
* }
178
* </pre>
179
* </blockquote>
180
*
181
* Print the element at a specified position:
182
* <blockquote>
183
* <pre>
184
* public static void printAt(BreakIterator boundary, int pos, String source) {
185
* int end = boundary.following(pos);
186
* int start = boundary.previous();
187
* System.out.println(source.substring(start,end));
188
* }
189
* </pre>
190
* </blockquote>
191
*
192
* Find the next word:
193
* <blockquote>
194
* <pre>{@code
195
* public static int nextWordStartAfter(int pos, String text) {
196
* BreakIterator wb = BreakIterator.getWordInstance();
197
* wb.setText(text);
198
* int last = wb.following(pos);
199
* int current = wb.next();
200
* while (current != BreakIterator.DONE) {
201
* for (int p = last; p < current; p++) {
202
* if (Character.isLetter(text.codePointAt(p)))
203
* return last;
204
* }
205
* last = current;
206
* current = wb.next();
207
* }
208
* return BreakIterator.DONE;
209
* }
210
* }</pre>
211
* (The iterator returned by BreakIterator.getWordInstance() is unique in that
212
* the break positions it returns don't represent both the start and end of the
213
* thing being iterated over. That is, a sentence-break iterator returns breaks
214
* that each represent the end of one sentence and the beginning of the next.
215
* With the word-break iterator, the characters between two boundaries might be a
216
* word, or they might be the punctuation or whitespace between two words. The
217
* above code uses a simple heuristic to determine which boundary is the beginning
218
* of a word: If the characters between this boundary and the next boundary
219
* include at least one letter (this can be an alphabetical letter, a CJK ideograph,
220
* a Hangul syllable, a Kana character, etc.), then the text between this boundary
221
* and the next is a word; otherwise, it's the material between words.)
222
* </blockquote>
223
*
224
* @see CharacterIterator
225
*
226
*/
227
228
public abstract class BreakIterator implements Cloneable
229
{
230
/**
231
* Constructor. BreakIterator is stateless and has no default behavior.
232
*/
233
protected BreakIterator()
234
{
235
}
236
237
/**
238
* Create a copy of this iterator
239
* @return A copy of this
240
*/
241
@Override
242
public Object clone()
243
{
244
try {
245
return super.clone();
246
}
247
catch (CloneNotSupportedException e) {
248
throw new InternalError(e);
249
}
250
}
251
252
/**
253
* DONE is returned by previous(), next(), next(int), preceding(int)
254
* and following(int) when either the first or last text boundary has been
255
* reached.
256
*/
257
public static final int DONE = -1;
258
259
/**
260
* Returns the first boundary. The iterator's current position is set
261
* to the first text boundary.
262
* @return The character index of the first text boundary.
263
*/
264
public abstract int first();
265
266
/**
267
* Returns the last boundary. The iterator's current position is set
268
* to the last text boundary.
269
* @return The character index of the last text boundary.
270
*/
271
public abstract int last();
272
273
/**
274
* Returns the nth boundary from the current boundary. If either
275
* the first or last text boundary has been reached, it returns
276
* <code>BreakIterator.DONE</code> and the current position is set to either
277
* the first or last text boundary depending on which one is reached. Otherwise,
278
* the iterator's current position is set to the new boundary.
279
* For example, if the iterator's current position is the mth text boundary
280
* and three more boundaries exist from the current boundary to the last text
281
* boundary, the next(2) call will return m + 2. The new text position is set
282
* to the (m + 2)th text boundary. A next(4) call would return
283
* <code>BreakIterator.DONE</code> and the last text boundary would become the
284
* new text position.
285
* @param n which boundary to return. A value of 0
286
* does nothing. Negative values move to previous boundaries
287
* and positive values move to later boundaries.
288
* @return The character index of the nth boundary from the current position
289
* or <code>BreakIterator.DONE</code> if either first or last text boundary
290
* has been reached.
291
*/
292
public abstract int next(int n);
293
294
/**
295
* Returns the boundary following the current boundary. If the current boundary
296
* is the last text boundary, it returns <code>BreakIterator.DONE</code> and
297
* the iterator's current position is unchanged. Otherwise, the iterator's
298
* current position is set to the boundary following the current boundary.
299
* @return The character index of the next text boundary or
300
* <code>BreakIterator.DONE</code> if the current boundary is the last text
301
* boundary.
302
* Equivalent to next(1).
303
* @see #next(int)
304
*/
305
public abstract int next();
306
307
/**
308
* Returns the boundary preceding the current boundary. If the current boundary
309
* is the first text boundary, it returns <code>BreakIterator.DONE</code> and
310
* the iterator's current position is unchanged. Otherwise, the iterator's
311
* current position is set to the boundary preceding the current boundary.
312
* @return The character index of the previous text boundary or
313
* <code>BreakIterator.DONE</code> if the current boundary is the first text
314
* boundary.
315
*/
316
public abstract int previous();
317
318
/**
319
* Returns the first boundary following the specified character offset. If the
320
* specified offset equals to the last text boundary, it returns
321
* <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
322
* Otherwise, the iterator's current position is set to the returned boundary.
323
* The value returned is always greater than the offset or the value
324
* <code>BreakIterator.DONE</code>.
325
* @param offset the character offset to begin scanning.
326
* @return The first boundary after the specified offset or
327
* <code>BreakIterator.DONE</code> if the last text boundary is passed in
328
* as the offset.
329
* @exception IllegalArgumentException if the specified offset is less than
330
* the first text boundary or greater than the last text boundary.
331
*/
332
public abstract int following(int offset);
333
334
/**
335
* Returns the last boundary preceding the specified character offset. If the
336
* specified offset equals to the first text boundary, it returns
337
* <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
338
* Otherwise, the iterator's current position is set to the returned boundary.
339
* The value returned is always less than the offset or the value
340
* <code>BreakIterator.DONE</code>.
341
* @param offset the character offset to begin scanning.
342
* @return The last boundary before the specified offset or
343
* <code>BreakIterator.DONE</code> if the first text boundary is passed in
344
* as the offset.
345
* @exception IllegalArgumentException if the specified offset is less than
346
* the first text boundary or greater than the last text boundary.
347
* @since 1.2
348
*/
349
public int preceding(int offset) {
350
// NOTE: This implementation is here solely because we can't add new
351
// abstract methods to an existing class. There is almost ALWAYS a
352
// better, faster way to do this.
353
int pos = following(offset);
354
while (pos >= offset && pos != DONE) {
355
pos = previous();
356
}
357
return pos;
358
}
359
360
/**
361
* Returns true if the specified character offset is a text boundary.
362
* @param offset the character offset to check.
363
* @return <code>true</code> if "offset" is a boundary position,
364
* <code>false</code> otherwise.
365
* @exception IllegalArgumentException if the specified offset is less than
366
* the first text boundary or greater than the last text boundary.
367
* @since 1.2
368
*/
369
public boolean isBoundary(int offset) {
370
// NOTE: This implementation probably is wrong for most situations
371
// because it fails to take into account the possibility that a
372
// CharacterIterator passed to setText() may not have a begin offset
373
// of 0. But since the abstract BreakIterator doesn't have that
374
// knowledge, it assumes the begin offset is 0. If you subclass
375
// BreakIterator, copy the SimpleTextBoundary implementation of this
376
// function into your subclass. [This should have been abstract at
377
// this level, but it's too late to fix that now.]
378
if (offset == 0) {
379
return true;
380
}
381
int boundary = following(offset - 1);
382
if (boundary == DONE) {
383
throw new IllegalArgumentException();
384
}
385
return boundary == offset;
386
}
387
388
/**
389
* Returns character index of the text boundary that was most
390
* recently returned by next(), next(int), previous(), first(), last(),
391
* following(int) or preceding(int). If any of these methods returns
392
* <code>BreakIterator.DONE</code> because either first or last text boundary
393
* has been reached, it returns the first or last text boundary depending on
394
* which one is reached.
395
* @return The text boundary returned from the above methods, first or last
396
* text boundary.
397
* @see #next()
398
* @see #next(int)
399
* @see #previous()
400
* @see #first()
401
* @see #last()
402
* @see #following(int)
403
* @see #preceding(int)
404
*/
405
public abstract int current();
406
407
/**
408
* Get the text being scanned
409
* @return the text being scanned
410
*/
411
public abstract CharacterIterator getText();
412
413
/**
414
* Set a new text string to be scanned. The current scan
415
* position is reset to first().
416
* @param newText new text to scan.
417
*/
418
public void setText(String newText)
419
{
420
setText(new StringCharacterIterator(newText));
421
}
422
423
/**
424
* Set a new text for scanning. The current scan
425
* position is reset to first().
426
* @param newText new text to scan.
427
*/
428
public abstract void setText(CharacterIterator newText);
429
430
private static final int CHARACTER_INDEX = 0;
431
private static final int WORD_INDEX = 1;
432
private static final int LINE_INDEX = 2;
433
private static final int SENTENCE_INDEX = 3;
434
435
@SuppressWarnings("unchecked")
436
private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4];
437
438
/**
439
* Returns a new <code>BreakIterator</code> instance
440
* for <a href="BreakIterator.html#word">word breaks</a>
441
* for the {@linkplain Locale#getDefault() default locale}.
442
* @return A break iterator for word breaks
443
*/
444
public static BreakIterator getWordInstance()
445
{
446
return getWordInstance(Locale.getDefault());
447
}
448
449
/**
450
* Returns a new <code>BreakIterator</code> instance
451
* for <a href="BreakIterator.html#word">word breaks</a>
452
* for the given locale.
453
* @param locale the desired locale
454
* @return A break iterator for word breaks
455
* @exception NullPointerException if <code>locale</code> is null
456
*/
457
public static BreakIterator getWordInstance(Locale locale)
458
{
459
return getBreakInstance(locale, WORD_INDEX);
460
}
461
462
/**
463
* Returns a new <code>BreakIterator</code> instance
464
* for <a href="BreakIterator.html#line">line breaks</a>
465
* for the {@linkplain Locale#getDefault() default locale}.
466
* @return A break iterator for line breaks
467
*/
468
public static BreakIterator getLineInstance()
469
{
470
return getLineInstance(Locale.getDefault());
471
}
472
473
/**
474
* Returns a new <code>BreakIterator</code> instance
475
* for <a href="BreakIterator.html#line">line breaks</a>
476
* for the given locale.
477
* @param locale the desired locale
478
* @return A break iterator for line breaks
479
* @exception NullPointerException if <code>locale</code> is null
480
*/
481
public static BreakIterator getLineInstance(Locale locale)
482
{
483
return getBreakInstance(locale, LINE_INDEX);
484
}
485
486
/**
487
* Returns a new <code>BreakIterator</code> instance
488
* for <a href="BreakIterator.html#character">character breaks</a>
489
* for the {@linkplain Locale#getDefault() default locale}.
490
* @return A break iterator for character breaks
491
*/
492
public static BreakIterator getCharacterInstance()
493
{
494
return getCharacterInstance(Locale.getDefault());
495
}
496
497
/**
498
* Returns a new <code>BreakIterator</code> instance
499
* for <a href="BreakIterator.html#character">character breaks</a>
500
* for the given locale.
501
* @param locale the desired locale
502
* @return A break iterator for character breaks
503
* @exception NullPointerException if <code>locale</code> is null
504
*/
505
public static BreakIterator getCharacterInstance(Locale locale)
506
{
507
return getBreakInstance(locale, CHARACTER_INDEX);
508
}
509
510
/**
511
* Returns a new <code>BreakIterator</code> instance
512
* for <a href="BreakIterator.html#sentence">sentence breaks</a>
513
* for the {@linkplain Locale#getDefault() default locale}.
514
* @return A break iterator for sentence breaks
515
*/
516
public static BreakIterator getSentenceInstance()
517
{
518
return getSentenceInstance(Locale.getDefault());
519
}
520
521
/**
522
* Returns a new <code>BreakIterator</code> instance
523
* for <a href="BreakIterator.html#sentence">sentence breaks</a>
524
* for the given locale.
525
* @param locale the desired locale
526
* @return A break iterator for sentence breaks
527
* @exception NullPointerException if <code>locale</code> is null
528
*/
529
public static BreakIterator getSentenceInstance(Locale locale)
530
{
531
return getBreakInstance(locale, SENTENCE_INDEX);
532
}
533
534
private static BreakIterator getBreakInstance(Locale locale, int type) {
535
if (iterCache[type] != null) {
536
BreakIteratorCache cache = iterCache[type].get();
537
if (cache != null) {
538
if (cache.getLocale().equals(locale)) {
539
return cache.createBreakInstance();
540
}
541
}
542
}
543
544
BreakIterator result = createBreakInstance(locale, type);
545
BreakIteratorCache cache = new BreakIteratorCache(locale, result);
546
iterCache[type] = new SoftReference<>(cache);
547
return result;
548
}
549
550
private static BreakIterator createBreakInstance(Locale locale,
551
int type) {
552
LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);
553
BreakIterator iterator = createBreakInstance(adapter, locale, type);
554
if (iterator == null) {
555
iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type);
556
}
557
return iterator;
558
}
559
560
private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) {
561
BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();
562
BreakIterator iterator = null;
563
switch (type) {
564
case CHARACTER_INDEX:
565
iterator = breakIteratorProvider.getCharacterInstance(locale);
566
break;
567
case WORD_INDEX:
568
iterator = breakIteratorProvider.getWordInstance(locale);
569
break;
570
case LINE_INDEX:
571
iterator = breakIteratorProvider.getLineInstance(locale);
572
break;
573
case SENTENCE_INDEX:
574
iterator = breakIteratorProvider.getSentenceInstance(locale);
575
break;
576
}
577
return iterator;
578
}
579
580
/**
581
* Returns an array of all locales for which the
582
* <code>get*Instance</code> methods of this class can return
583
* localized instances.
584
* The returned array represents the union of locales supported by the Java
585
* runtime and by installed
586
* {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
587
* It must contain at least a <code>Locale</code>
588
* instance equal to {@link java.util.Locale#US Locale.US}.
589
*
590
* @return An array of locales for which localized
591
* <code>BreakIterator</code> instances are available.
592
*/
593
public static synchronized Locale[] getAvailableLocales()
594
{
595
LocaleServiceProviderPool pool =
596
LocaleServiceProviderPool.getPool(BreakIteratorProvider.class);
597
return pool.getAvailableLocales();
598
}
599
600
private static final class BreakIteratorCache {
601
602
private BreakIterator iter;
603
private Locale locale;
604
605
BreakIteratorCache(Locale locale, BreakIterator iter) {
606
this.locale = locale;
607
this.iter = (BreakIterator) iter.clone();
608
}
609
610
Locale getLocale() {
611
return locale;
612
}
613
614
BreakIterator createBreakInstance() {
615
return (BreakIterator) iter.clone();
616
}
617
}
618
}
619
620