CoCalc -- BreakIterator.java

GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/java/text/BreakIterator.java
⁴⁵⁹⁸² views
1
/*
2
 * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.  Oracle designates this
8
 * particular file as subject to the "Classpath" exception as provided
9
 * by Oracle in the LICENSE file that accompanied this code.
10
 *
11
 * This code is distributed in the hope that it will be useful, but WITHOUT
12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
 * version 2 for more details (a copy is included in the LICENSE file that
15
 * accompanied this code).
16
 *
17
 * You should have received a copy of the GNU General Public License version
18
 * 2 along with this work; if not, write to the Free Software Foundation,
19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
 *
21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
 * or visit www.oracle.com if you need additional information or have any
23
 * questions.
24
 */
25

26
/*
27
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
28
 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
29
 *
30
 * The original version of this source code and documentation
31
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
32
 * subsidiary of IBM. These materials are provided under terms
33
 * of a License Agreement between Taligent and Sun. This technology
34
 * is protected by multiple US and International patents.
35
 *
36
 * This notice and attribution to Taligent may not be removed.
37
 * Taligent is a registered trademark of Taligent, Inc.
38
 *
39
 */
40

41
package java.text;
42

43
import java.lang.ref.SoftReference;
44
import java.text.spi.BreakIteratorProvider;
45
import java.util.Locale;
46
import sun.util.locale.provider.LocaleProviderAdapter;
47
import sun.util.locale.provider.LocaleServiceProviderPool;
48

49

50
/**
51
 * The <code>BreakIterator</code> class implements methods for finding
52
 * the location of boundaries in text. Instances of <code>BreakIterator</code>
53
 * maintain a current position and scan over text
54
 * returning the index of characters where boundaries occur.
55
 * Internally, <code>BreakIterator</code> scans text using a
56
 * <code>CharacterIterator</code>, and is thus able to scan text held
57
 * by any object implementing that protocol. A <code>StringCharacterIterator</code>
58
 * is used to scan <code>String</code> objects passed to <code>setText</code>.
59
 *
60
 * <p>
61
 * You use the factory methods provided by this class to create
62
 * instances of various types of break iterators. In particular,
63
 * use <code>getWordInstance</code>, <code>getLineInstance</code>,
64
 * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
65
 * to create <code>BreakIterator</code>s that perform
66
 * word, line, sentence, and character boundary analysis respectively.
67
 * A single <code>BreakIterator</code> can work only on one unit
68
 * (word, line, sentence, and so on). You must use a different iterator
69
 * for each unit boundary analysis you wish to perform.
70
 *
71
 * <p><a name="line"></a>
72
 * Line boundary analysis determines where a text string can be
73
 * broken when line-wrapping. The mechanism correctly handles
74
 * punctuation and hyphenated words. Actual line breaking needs
75
 * to also consider the available line width and is handled by
76
 * higher-level software.
77
 *
78
 * <p><a name="sentence"></a>
79
 * Sentence boundary analysis allows selection with correct interpretation
80
 * of periods within numbers and abbreviations, and trailing punctuation
81
 * marks such as quotation marks and parentheses.
82
 *
83
 * <p><a name="word"></a>
84
 * Word boundary analysis is used by search and replace functions, as
85
 * well as within text editing applications that allow the user to
86
 * select words with a double click. Word selection provides correct
87
 * interpretation of punctuation marks within and following
88
 * words. Characters that are not part of a word, such as symbols
89
 * or punctuation marks, have word-breaks on both sides.
90
 *
91
 * <p><a name="character"></a>
92
 * Character boundary analysis allows users to interact with characters
93
 * as they expect to, for example, when moving the cursor through a text
94
 * string. Character boundary analysis provides correct navigation
95
 * through character strings, regardless of how the character is stored.
96
 * The boundaries returned may be those of supplementary characters,
97
 * combining character sequences, or ligature clusters.
98
 * For example, an accented character might be stored as a base character
99
 * and a diacritical mark. What users consider to be a character can
100
 * differ between languages.
101
 *
102
 * <p>
103
 * The <code>BreakIterator</code> instances returned by the factory methods
104
 * of this class are intended for use with natural languages only, not for
105
 * programming language text. It is however possible to define subclasses
106
 * that tokenize a programming language.
107
 *
108
 * <P>
109
 * <strong>Examples</strong>:<P>
110
 * Creating and using text boundaries:
111
 * <blockquote>
112
 * <pre>
113
 * public static void main(String args[]) {
114
 *      if (args.length == 1) {
115
 *          String stringToExamine = args[0];
116
 *          //print each word in order
117
 *          BreakIterator boundary = BreakIterator.getWordInstance();
118
 *          boundary.setText(stringToExamine);
119
 *          printEachForward(boundary, stringToExamine);
120
 *          //print each sentence in reverse order
121
 *          boundary = BreakIterator.getSentenceInstance(Locale.US);
122
 *          boundary.setText(stringToExamine);
123
 *          printEachBackward(boundary, stringToExamine);
124
 *          printFirst(boundary, stringToExamine);
125
 *          printLast(boundary, stringToExamine);
126
 *      }
127
 * }
128
 * </pre>
129
 * </blockquote>
130
 *
131
 * Print each element in order:
132
 * <blockquote>
133
 * <pre>
134
 * public static void printEachForward(BreakIterator boundary, String source) {
135
 *     int start = boundary.first();
136
 *     for (int end = boundary.next();
137
 *          end != BreakIterator.DONE;
138
 *          start = end, end = boundary.next()) {
139
 *          System.out.println(source.substring(start,end));
140
 *     }
141
 * }
142
 * </pre>
143
 * </blockquote>
144
 *
145
 * Print each element in reverse order:
146
 * <blockquote>
147
 * <pre>
148
 * public static void printEachBackward(BreakIterator boundary, String source) {
149
 *     int end = boundary.last();
150
 *     for (int start = boundary.previous();
151
 *          start != BreakIterator.DONE;
152
 *          end = start, start = boundary.previous()) {
153
 *         System.out.println(source.substring(start,end));
154
 *     }
155
 * }
156
 * </pre>
157
 * </blockquote>
158
 *
159
 * Print first element:
160
 * <blockquote>
161
 * <pre>
162
 * public static void printFirst(BreakIterator boundary, String source) {
163
 *     int start = boundary.first();
164
 *     int end = boundary.next();
165
 *     System.out.println(source.substring(start,end));
166
 * }
167
 * </pre>
168
 * </blockquote>
169
 *
170
 * Print last element:
171
 * <blockquote>
172
 * <pre>
173
 * public static void printLast(BreakIterator boundary, String source) {
174
 *     int end = boundary.last();
175
 *     int start = boundary.previous();
176
 *     System.out.println(source.substring(start,end));
177
 * }
178
 * </pre>
179
 * </blockquote>
180
 *
181
 * Print the element at a specified position:
182
 * <blockquote>
183
 * <pre>
184
 * public static void printAt(BreakIterator boundary, int pos, String source) {
185
 *     int end = boundary.following(pos);
186
 *     int start = boundary.previous();
187
 *     System.out.println(source.substring(start,end));
188
 * }
189
 * </pre>
190
 * </blockquote>
191
 *
192
 * Find the next word:
193
 * <blockquote>
194
 * <pre>{@code
195
 * public static int nextWordStartAfter(int pos, String text) {
196
 *     BreakIterator wb = BreakIterator.getWordInstance();
197
 *     wb.setText(text);
198
 *     int last = wb.following(pos);
199
 *     int current = wb.next();
200
 *     while (current != BreakIterator.DONE) {
201
 *         for (int p = last; p < current; p++) {
202
 *             if (Character.isLetter(text.codePointAt(p)))
203
 *                 return last;
204
 *         }
205
 *         last = current;
206
 *         current = wb.next();
207
 *     }
208
 *     return BreakIterator.DONE;
209
 * }
210
 * }</pre>
211
 * (The iterator returned by BreakIterator.getWordInstance() is unique in that
212
 * the break positions it returns don't represent both the start and end of the
213
 * thing being iterated over.  That is, a sentence-break iterator returns breaks
214
 * that each represent the end of one sentence and the beginning of the next.
215
 * With the word-break iterator, the characters between two boundaries might be a
216
 * word, or they might be the punctuation or whitespace between two words.  The
217
 * above code uses a simple heuristic to determine which boundary is the beginning
218
 * of a word: If the characters between this boundary and the next boundary
219
 * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
220
 * a Hangul syllable, a Kana character, etc.), then the text between this boundary
221
 * and the next is a word; otherwise, it's the material between words.)
222
 * </blockquote>
223
 *
224
 * @see CharacterIterator
225
 *
226
 */
227

228
public abstract class BreakIterator implements Cloneable
229
{
230
    /**
     * Sole constructor, intended to be invoked by subclass constructors.
     * This base class declares no instance fields, so there is nothing to
     * initialize here; the boundary-scanning primitives are abstract and
     * must be supplied by the concrete subclass.
     */
    protected BreakIterator() {
    }
236

237
    /**
238
     * Create a copy of this iterator
239
     * @return A copy of this
240
     */
241
    @Override
242
    public Object clone()
243
    {
244
        try {
245
            return super.clone();
246
        }
247
        catch (CloneNotSupportedException e) {
248
            throw new InternalError(e);
249
        }
250
    }
251

252
    /**
253
     * DONE is returned by previous(), next(), next(int), preceding(int)
254
     * and following(int) when either the first or last text boundary has been
255
     * reached.
256
     */
257
    public static final int DONE = -1;
258

259
    /**
260
     * Returns the first boundary. The iterator's current position is set
261
     * to the first text boundary.
262
     * @return The character index of the first text boundary.
263
     */
264
    public abstract int first();
265

266
    /**
267
     * Returns the last boundary. The iterator's current position is set
268
     * to the last text boundary.
269
     * @return The character index of the last text boundary.
270
     */
271
    public abstract int last();
272

273
    /**
274
     * Returns the nth boundary from the current boundary. If either
275
     * the first or last text boundary has been reached, it returns
276
     * <code>BreakIterator.DONE</code> and the current position is set to either
277
     * the first or last text boundary depending on which one is reached. Otherwise,
278
     * the iterator's current position is set to the new boundary.
279
     * For example, if the iterator's current position is the mth text boundary
280
     * and three more boundaries exist from the current boundary to the last text
281
     * boundary, the next(2) call will return m + 2. The new text position is set
282
     * to the (m + 2)th text boundary. A next(4) call would return
283
     * <code>BreakIterator.DONE</code> and the last text boundary would become the
284
     * new text position.
285
     * @param n which boundary to return.  A value of 0
286
     * does nothing.  Negative values move to previous boundaries
287
     * and positive values move to later boundaries.
288
     * @return The character index of the nth boundary from the current position
289
     * or <code>BreakIterator.DONE</code> if either first or last text boundary
290
     * has been reached.
291
     */
292
    public abstract int next(int n);
293

294
    /**
295
     * Returns the boundary following the current boundary. If the current boundary
296
     * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
297
     * the iterator's current position is unchanged. Otherwise, the iterator's
298
     * current position is set to the boundary following the current boundary.
299
     * @return The character index of the next text boundary or
300
     * <code>BreakIterator.DONE</code> if the current boundary is the last text
301
     * boundary.
302
     * Equivalent to next(1).
303
     * @see #next(int)
304
     */
305
    public abstract int next();
306

307
    /**
308
     * Returns the boundary preceding the current boundary. If the current boundary
309
     * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
310
     * the iterator's current position is unchanged. Otherwise, the iterator's
311
     * current position is set to the boundary preceding the current boundary.
312
     * @return The character index of the previous text boundary or
313
     * <code>BreakIterator.DONE</code> if the current boundary is the first text
314
     * boundary.
315
     */
316
    public abstract int previous();
317

318
    /**
319
     * Returns the first boundary following the specified character offset. If the
320
     * specified offset equals to the last text boundary, it returns
321
     * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
322
     * Otherwise, the iterator's current position is set to the returned boundary.
323
     * The value returned is always greater than the offset or the value
324
     * <code>BreakIterator.DONE</code>.
325
     * @param offset the character offset to begin scanning.
326
     * @return The first boundary after the specified offset or
327
     * <code>BreakIterator.DONE</code> if the last text boundary is passed in
328
     * as the offset.
329
     * @exception  IllegalArgumentException if the specified offset is less than
330
     * the first text boundary or greater than the last text boundary.
331
     */
332
    public abstract int following(int offset);
333

334
    /**
335
     * Returns the last boundary preceding the specified character offset. If the
336
     * specified offset equals to the first text boundary, it returns
337
     * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
338
     * Otherwise, the iterator's current position is set to the returned boundary.
339
     * The value returned is always less than the offset or the value
340
     * <code>BreakIterator.DONE</code>.
341
     * @param offset the character offset to begin scanning.
342
     * @return The last boundary before the specified offset or
343
     * <code>BreakIterator.DONE</code> if the first text boundary is passed in
344
     * as the offset.
345
     * @exception   IllegalArgumentException if the specified offset is less than
346
     * the first text boundary or greater than the last text boundary.
347
     * @since 1.2
348
     */
349
    public int preceding(int offset) {
350
        // NOTE:  This implementation is here solely because we can't add new
351
        // abstract methods to an existing class.  There is almost ALWAYS a
352
        // better, faster way to do this.
353
        int pos = following(offset);
354
        while (pos >= offset && pos != DONE) {
355
            pos = previous();
356
        }
357
        return pos;
358
    }
359

360
    /**
361
     * Returns true if the specified character offset is a text boundary.
362
     * @param offset the character offset to check.
363
     * @return <code>true</code> if "offset" is a boundary position,
364
     * <code>false</code> otherwise.
365
     * @exception   IllegalArgumentException if the specified offset is less than
366
     * the first text boundary or greater than the last text boundary.
367
     * @since 1.2
368
     */
369
    public boolean isBoundary(int offset) {
370
        // NOTE: This implementation probably is wrong for most situations
371
        // because it fails to take into account the possibility that a
372
        // CharacterIterator passed to setText() may not have a begin offset
373
        // of 0.  But since the abstract BreakIterator doesn't have that
374
        // knowledge, it assumes the begin offset is 0.  If you subclass
375
        // BreakIterator, copy the SimpleTextBoundary implementation of this
376
        // function into your subclass.  [This should have been abstract at
377
        // this level, but it's too late to fix that now.]
378
        if (offset == 0) {
379
            return true;
380
        }
381
        int boundary = following(offset - 1);
382
        if (boundary == DONE) {
383
            throw new IllegalArgumentException();
384
        }
385
        return boundary == offset;
386
    }
387

388
    /**
389
     * Returns character index of the text boundary that was most
390
     * recently returned by next(), next(int), previous(), first(), last(),
391
     * following(int) or preceding(int). If any of these methods returns
392
     * <code>BreakIterator.DONE</code> because either first or last text boundary
393
     * has been reached, it returns the first or last text boundary depending on
394
     * which one is reached.
395
     * @return The text boundary returned from the above methods, first or last
396
     * text boundary.
397
     * @see #next()
398
     * @see #next(int)
399
     * @see #previous()
400
     * @see #first()
401
     * @see #last()
402
     * @see #following(int)
403
     * @see #preceding(int)
404
     */
405
    public abstract int current();
406

407
    /**
408
     * Get the text being scanned
409
     * @return the text being scanned
410
     */
411
    public abstract CharacterIterator getText();
412

413
    /**
414
     * Set a new text string to be scanned.  The current scan
415
     * position is reset to first().
416
     * @param newText new text to scan.
417
     */
418
    public void setText(String newText)
419
    {
420
        setText(new StringCharacterIterator(newText));
421
    }
422

423
    /**
424
     * Set a new text for scanning.  The current scan
425
     * position is reset to first().
426
     * @param newText new text to scan.
427
     */
428
    public abstract void setText(CharacterIterator newText);
429

430
    private static final int CHARACTER_INDEX = 0;
431
    private static final int WORD_INDEX = 1;
432
    private static final int LINE_INDEX = 2;
433
    private static final int SENTENCE_INDEX = 3;
434

435
    @SuppressWarnings("unchecked")
436
    private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4];
437

438
    /**
439
     * Returns a new <code>BreakIterator</code> instance
440
     * for <a href="BreakIterator.html#word">word breaks</a>
441
     * for the {@linkplain Locale#getDefault() default locale}.
442
     * @return A break iterator for word breaks
443
     */
444
    public static BreakIterator getWordInstance()
445
    {
446
        return getWordInstance(Locale.getDefault());
447
    }
448

449
    /**
450
     * Returns a new <code>BreakIterator</code> instance
451
     * for <a href="BreakIterator.html#word">word breaks</a>
452
     * for the given locale.
453
     * @param locale the desired locale
454
     * @return A break iterator for word breaks
455
     * @exception NullPointerException if <code>locale</code> is null
456
     */
457
    public static BreakIterator getWordInstance(Locale locale)
458
    {
459
        return getBreakInstance(locale, WORD_INDEX);
460
    }
461

462
    /**
463
     * Returns a new <code>BreakIterator</code> instance
464
     * for <a href="BreakIterator.html#line">line breaks</a>
465
     * for the {@linkplain Locale#getDefault() default locale}.
466
     * @return A break iterator for line breaks
467
     */
468
    public static BreakIterator getLineInstance()
469
    {
470
        return getLineInstance(Locale.getDefault());
471
    }
472

473
    /**
474
     * Returns a new <code>BreakIterator</code> instance
475
     * for <a href="BreakIterator.html#line">line breaks</a>
476
     * for the given locale.
477
     * @param locale the desired locale
478
     * @return A break iterator for line breaks
479
     * @exception NullPointerException if <code>locale</code> is null
480
     */
481
    public static BreakIterator getLineInstance(Locale locale)
482
    {
483
        return getBreakInstance(locale, LINE_INDEX);
484
    }
485

486
    /**
487
     * Returns a new <code>BreakIterator</code> instance
488
     * for <a href="BreakIterator.html#character">character breaks</a>
489
     * for the {@linkplain Locale#getDefault() default locale}.
490
     * @return A break iterator for character breaks
491
     */
492
    public static BreakIterator getCharacterInstance()
493
    {
494
        return getCharacterInstance(Locale.getDefault());
495
    }
496

497
    /**
498
     * Returns a new <code>BreakIterator</code> instance
499
     * for <a href="BreakIterator.html#character">character breaks</a>
500
     * for the given locale.
501
     * @param locale the desired locale
502
     * @return A break iterator for character breaks
503
     * @exception NullPointerException if <code>locale</code> is null
504
     */
505
    public static BreakIterator getCharacterInstance(Locale locale)
506
    {
507
        return getBreakInstance(locale, CHARACTER_INDEX);
508
    }
509

510
    /**
511
     * Returns a new <code>BreakIterator</code> instance
512
     * for <a href="BreakIterator.html#sentence">sentence breaks</a>
513
     * for the {@linkplain Locale#getDefault() default locale}.
514
     * @return A break iterator for sentence breaks
515
     */
516
    public static BreakIterator getSentenceInstance()
517
    {
518
        return getSentenceInstance(Locale.getDefault());
519
    }
520

521
    /**
522
     * Returns a new <code>BreakIterator</code> instance
523
     * for <a href="BreakIterator.html#sentence">sentence breaks</a>
524
     * for the given locale.
525
     * @param locale the desired locale
526
     * @return A break iterator for sentence breaks
527
     * @exception NullPointerException if <code>locale</code> is null
528
     */
529
    public static BreakIterator getSentenceInstance(Locale locale)
530
    {
531
        return getBreakInstance(locale, SENTENCE_INDEX);
532
    }
533

534
    private static BreakIterator getBreakInstance(Locale locale, int type) {
535
        if (iterCache[type] != null) {
536
            BreakIteratorCache cache = iterCache[type].get();
537
            if (cache != null) {
538
                if (cache.getLocale().equals(locale)) {
539
                    return cache.createBreakInstance();
540
                }
541
            }
542
        }
543

544
        BreakIterator result = createBreakInstance(locale, type);
545
        BreakIteratorCache cache = new BreakIteratorCache(locale, result);
546
        iterCache[type] = new SoftReference<>(cache);
547
        return result;
548
    }
549

550
    private static BreakIterator createBreakInstance(Locale locale,
551
                                                     int type) {
552
        LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);
553
        BreakIterator iterator = createBreakInstance(adapter, locale, type);
554
        if (iterator == null) {
555
            iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type);
556
        }
557
        return iterator;
558
    }
559

560
    private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) {
561
        BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();
562
        BreakIterator iterator = null;
563
        switch (type) {
564
        case CHARACTER_INDEX:
565
            iterator = breakIteratorProvider.getCharacterInstance(locale);
566
            break;
567
        case WORD_INDEX:
568
            iterator = breakIteratorProvider.getWordInstance(locale);
569
            break;
570
        case LINE_INDEX:
571
            iterator = breakIteratorProvider.getLineInstance(locale);
572
            break;
573
        case SENTENCE_INDEX:
574
            iterator = breakIteratorProvider.getSentenceInstance(locale);
575
            break;
576
        }
577
        return iterator;
578
    }
579

580
    /**
581
     * Returns an array of all locales for which the
582
     * <code>get*Instance</code> methods of this class can return
583
     * localized instances.
584
     * The returned array represents the union of locales supported by the Java
585
     * runtime and by installed
586
     * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
587
     * It must contain at least a <code>Locale</code>
588
     * instance equal to {@link java.util.Locale#US Locale.US}.
589
     *
590
     * @return An array of locales for which localized
591
     *         <code>BreakIterator</code> instances are available.
592
     */
593
    public static synchronized Locale[] getAvailableLocales()
594
    {
595
        LocaleServiceProviderPool pool =
596
            LocaleServiceProviderPool.getPool(BreakIteratorProvider.class);
597
        return pool.getAvailableLocales();
598
    }
599

600
    private static final class BreakIteratorCache {
601

602
        private BreakIterator iter;
603
        private Locale locale;
604

605
        BreakIteratorCache(Locale locale, BreakIterator iter) {
606
            this.locale = locale;
607
            this.iter = (BreakIterator) iter.clone();
608
        }
609

610
        Locale getLocale() {
611
            return locale;
612
        }
613

614
        BreakIterator createBreakInstance() {
615
            return (BreakIterator) iter.clone();
616
        }
617
    }
618
}
619

620
Product

Resources

Company