// Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/classes/java/text/BreakIterator.java
// 38829 views
/*1* Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/2425/*26* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved27* (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved28*29* The original version of this source code and documentation30* is copyrighted and owned by Taligent, Inc., a wholly-owned31* subsidiary of IBM. These materials are provided under terms32* of a License Agreement between Taligent and Sun. 
This technology33* is protected by multiple US and International patents.34*35* This notice and attribution to Taligent may not be removed.36* Taligent is a registered trademark of Taligent, Inc.37*38*/3940package java.text;4142import java.lang.ref.SoftReference;43import java.text.spi.BreakIteratorProvider;44import java.util.Locale;45import sun.util.locale.provider.LocaleProviderAdapter;46import sun.util.locale.provider.LocaleServiceProviderPool;474849/**50* The <code>BreakIterator</code> class implements methods for finding51* the location of boundaries in text. Instances of <code>BreakIterator</code>52* maintain a current position and scan over text53* returning the index of characters where boundaries occur.54* Internally, <code>BreakIterator</code> scans text using a55* <code>CharacterIterator</code>, and is thus able to scan text held56* by any object implementing that protocol. A <code>StringCharacterIterator</code>57* is used to scan <code>String</code> objects passed to <code>setText</code>.58*59* <p>60* You use the factory methods provided by this class to create61* instances of various types of break iterators. In particular,62* use <code>getWordInstance</code>, <code>getLineInstance</code>,63* <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>64* to create <code>BreakIterator</code>s that perform65* word, line, sentence, and character boundary analysis respectively.66* A single <code>BreakIterator</code> can work only on one unit67* (word, line, sentence, and so on). You must use a different iterator68* for each unit boundary analysis you wish to perform.69*70* <p><a name="line"></a>71* Line boundary analysis determines where a text string can be72* broken when line-wrapping. The mechanism correctly handles73* punctuation and hyphenated words. 
Actual line breaking needs74* to also consider the available line width and is handled by75* higher-level software.76*77* <p><a name="sentence"></a>78* Sentence boundary analysis allows selection with correct interpretation79* of periods within numbers and abbreviations, and trailing punctuation80* marks such as quotation marks and parentheses.81*82* <p><a name="word"></a>83* Word boundary analysis is used by search and replace functions, as84* well as within text editing applications that allow the user to85* select words with a double click. Word selection provides correct86* interpretation of punctuation marks within and following87* words. Characters that are not part of a word, such as symbols88* or punctuation marks, have word-breaks on both sides.89*90* <p><a name="character"></a>91* Character boundary analysis allows users to interact with characters92* as they expect to, for example, when moving the cursor through a text93* string. Character boundary analysis provides correct navigation94* through character strings, regardless of how the character is stored.95* The boundaries returned may be those of supplementary characters,96* combining character sequences, or ligature clusters.97* For example, an accented character might be stored as a base character98* and a diacritical mark. What users consider to be a character can99* differ between languages.100*101* <p>102* The <code>BreakIterator</code> instances returned by the factory methods103* of this class are intended for use with natural languages only, not for104* programming language text. 
It is however possible to define subclasses105* that tokenize a programming language.106*107* <P>108* <strong>Examples</strong>:<P>109* Creating and using text boundaries:110* <blockquote>111* <pre>112* public static void main(String args[]) {113* if (args.length == 1) {114* String stringToExamine = args[0];115* //print each word in order116* BreakIterator boundary = BreakIterator.getWordInstance();117* boundary.setText(stringToExamine);118* printEachForward(boundary, stringToExamine);119* //print each sentence in reverse order120* boundary = BreakIterator.getSentenceInstance(Locale.US);121* boundary.setText(stringToExamine);122* printEachBackward(boundary, stringToExamine);123* printFirst(boundary, stringToExamine);124* printLast(boundary, stringToExamine);125* }126* }127* </pre>128* </blockquote>129*130* Print each element in order:131* <blockquote>132* <pre>133* public static void printEachForward(BreakIterator boundary, String source) {134* int start = boundary.first();135* for (int end = boundary.next();136* end != BreakIterator.DONE;137* start = end, end = boundary.next()) {138* System.out.println(source.substring(start,end));139* }140* }141* </pre>142* </blockquote>143*144* Print each element in reverse order:145* <blockquote>146* <pre>147* public static void printEachBackward(BreakIterator boundary, String source) {148* int end = boundary.last();149* for (int start = boundary.previous();150* start != BreakIterator.DONE;151* end = start, start = boundary.previous()) {152* System.out.println(source.substring(start,end));153* }154* }155* </pre>156* </blockquote>157*158* Print first element:159* <blockquote>160* <pre>161* public static void printFirst(BreakIterator boundary, String source) {162* int start = boundary.first();163* int end = boundary.next();164* System.out.println(source.substring(start,end));165* }166* </pre>167* </blockquote>168*169* Print last element:170* <blockquote>171* <pre>172* public static void printLast(BreakIterator boundary, String 
source) {173* int end = boundary.last();174* int start = boundary.previous();175* System.out.println(source.substring(start,end));176* }177* </pre>178* </blockquote>179*180* Print the element at a specified position:181* <blockquote>182* <pre>183* public static void printAt(BreakIterator boundary, int pos, String source) {184* int end = boundary.following(pos);185* int start = boundary.previous();186* System.out.println(source.substring(start,end));187* }188* </pre>189* </blockquote>190*191* Find the next word:192* <blockquote>193* <pre>{@code194* public static int nextWordStartAfter(int pos, String text) {195* BreakIterator wb = BreakIterator.getWordInstance();196* wb.setText(text);197* int last = wb.following(pos);198* int current = wb.next();199* while (current != BreakIterator.DONE) {200* for (int p = last; p < current; p++) {201* if (Character.isLetter(text.codePointAt(p)))202* return last;203* }204* last = current;205* current = wb.next();206* }207* return BreakIterator.DONE;208* }209* }</pre>210* (The iterator returned by BreakIterator.getWordInstance() is unique in that211* the break positions it returns don't represent both the start and end of the212* thing being iterated over. That is, a sentence-break iterator returns breaks213* that each represent the end of one sentence and the beginning of the next.214* With the word-break iterator, the characters between two boundaries might be a215* word, or they might be the punctuation or whitespace between two words. 
The216* above code uses a simple heuristic to determine which boundary is the beginning217* of a word: If the characters between this boundary and the next boundary218* include at least one letter (this can be an alphabetical letter, a CJK ideograph,219* a Hangul syllable, a Kana character, etc.), then the text between this boundary220* and the next is a word; otherwise, it's the material between words.)221* </blockquote>222*223* @see CharacterIterator224*225*/226227public abstract class BreakIterator implements Cloneable228{229/**230* Constructor. BreakIterator is stateless and has no default behavior.231*/232protected BreakIterator()233{234}235236/**237* Create a copy of this iterator238* @return A copy of this239*/240@Override241public Object clone()242{243try {244return super.clone();245}246catch (CloneNotSupportedException e) {247throw new InternalError(e);248}249}250251/**252* DONE is returned by previous(), next(), next(int), preceding(int)253* and following(int) when either the first or last text boundary has been254* reached.255*/256public static final int DONE = -1;257258/**259* Returns the first boundary. The iterator's current position is set260* to the first text boundary.261* @return The character index of the first text boundary.262*/263public abstract int first();264265/**266* Returns the last boundary. The iterator's current position is set267* to the last text boundary.268* @return The character index of the last text boundary.269*/270public abstract int last();271272/**273* Returns the nth boundary from the current boundary. If either274* the first or last text boundary has been reached, it returns275* <code>BreakIterator.DONE</code> and the current position is set to either276* the first or last text boundary depending on which one is reached. 
Otherwise,277* the iterator's current position is set to the new boundary.278* For example, if the iterator's current position is the mth text boundary279* and three more boundaries exist from the current boundary to the last text280* boundary, the next(2) call will return m + 2. The new text position is set281* to the (m + 2)th text boundary. A next(4) call would return282* <code>BreakIterator.DONE</code> and the last text boundary would become the283* new text position.284* @param n which boundary to return. A value of 0285* does nothing. Negative values move to previous boundaries286* and positive values move to later boundaries.287* @return The character index of the nth boundary from the current position288* or <code>BreakIterator.DONE</code> if either first or last text boundary289* has been reached.290*/291public abstract int next(int n);292293/**294* Returns the boundary following the current boundary. If the current boundary295* is the last text boundary, it returns <code>BreakIterator.DONE</code> and296* the iterator's current position is unchanged. Otherwise, the iterator's297* current position is set to the boundary following the current boundary.298* @return The character index of the next text boundary or299* <code>BreakIterator.DONE</code> if the current boundary is the last text300* boundary.301* Equivalent to next(1).302* @see #next(int)303*/304public abstract int next();305306/**307* Returns the boundary preceding the current boundary. If the current boundary308* is the first text boundary, it returns <code>BreakIterator.DONE</code> and309* the iterator's current position is unchanged. 
Otherwise, the iterator's310* current position is set to the boundary preceding the current boundary.311* @return The character index of the previous text boundary or312* <code>BreakIterator.DONE</code> if the current boundary is the first text313* boundary.314*/315public abstract int previous();316317/**318* Returns the first boundary following the specified character offset. If the319* specified offset equals to the last text boundary, it returns320* <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.321* Otherwise, the iterator's current position is set to the returned boundary.322* The value returned is always greater than the offset or the value323* <code>BreakIterator.DONE</code>.324* @param offset the character offset to begin scanning.325* @return The first boundary after the specified offset or326* <code>BreakIterator.DONE</code> if the last text boundary is passed in327* as the offset.328* @exception IllegalArgumentException if the specified offset is less than329* the first text boundary or greater than the last text boundary.330*/331public abstract int following(int offset);332333/**334* Returns the last boundary preceding the specified character offset. 
If the335* specified offset equals to the first text boundary, it returns336* <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.337* Otherwise, the iterator's current position is set to the returned boundary.338* The value returned is always less than the offset or the value339* <code>BreakIterator.DONE</code>.340* @param offset the character offset to begin scanning.341* @return The last boundary before the specified offset or342* <code>BreakIterator.DONE</code> if the first text boundary is passed in343* as the offset.344* @exception IllegalArgumentException if the specified offset is less than345* the first text boundary or greater than the last text boundary.346* @since 1.2347*/348public int preceding(int offset) {349// NOTE: This implementation is here solely because we can't add new350// abstract methods to an existing class. There is almost ALWAYS a351// better, faster way to do this.352int pos = following(offset);353while (pos >= offset && pos != DONE) {354pos = previous();355}356return pos;357}358359/**360* Returns true if the specified character offset is a text boundary.361* @param offset the character offset to check.362* @return <code>true</code> if "offset" is a boundary position,363* <code>false</code> otherwise.364* @exception IllegalArgumentException if the specified offset is less than365* the first text boundary or greater than the last text boundary.366* @since 1.2367*/368public boolean isBoundary(int offset) {369// NOTE: This implementation probably is wrong for most situations370// because it fails to take into account the possibility that a371// CharacterIterator passed to setText() may not have a begin offset372// of 0. But since the abstract BreakIterator doesn't have that373// knowledge, it assumes the begin offset is 0. If you subclass374// BreakIterator, copy the SimpleTextBoundary implementation of this375// function into your subclass. 
[This should have been abstract at376// this level, but it's too late to fix that now.]377if (offset == 0) {378return true;379}380int boundary = following(offset - 1);381if (boundary == DONE) {382throw new IllegalArgumentException();383}384return boundary == offset;385}386387/**388* Returns character index of the text boundary that was most389* recently returned by next(), next(int), previous(), first(), last(),390* following(int) or preceding(int). If any of these methods returns391* <code>BreakIterator.DONE</code> because either first or last text boundary392* has been reached, it returns the first or last text boundary depending on393* which one is reached.394* @return The text boundary returned from the above methods, first or last395* text boundary.396* @see #next()397* @see #next(int)398* @see #previous()399* @see #first()400* @see #last()401* @see #following(int)402* @see #preceding(int)403*/404public abstract int current();405406/**407* Get the text being scanned408* @return the text being scanned409*/410public abstract CharacterIterator getText();411412/**413* Set a new text string to be scanned. The current scan414* position is reset to first().415* @param newText new text to scan.416*/417public void setText(String newText)418{419setText(new StringCharacterIterator(newText));420}421422/**423* Set a new text for scanning. 
The current scan424* position is reset to first().425* @param newText new text to scan.426*/427public abstract void setText(CharacterIterator newText);428429private static final int CHARACTER_INDEX = 0;430private static final int WORD_INDEX = 1;431private static final int LINE_INDEX = 2;432private static final int SENTENCE_INDEX = 3;433434@SuppressWarnings("unchecked")435private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4];436437/**438* Returns a new <code>BreakIterator</code> instance439* for <a href="BreakIterator.html#word">word breaks</a>440* for the {@linkplain Locale#getDefault() default locale}.441* @return A break iterator for word breaks442*/443public static BreakIterator getWordInstance()444{445return getWordInstance(Locale.getDefault());446}447448/**449* Returns a new <code>BreakIterator</code> instance450* for <a href="BreakIterator.html#word">word breaks</a>451* for the given locale.452* @param locale the desired locale453* @return A break iterator for word breaks454* @exception NullPointerException if <code>locale</code> is null455*/456public static BreakIterator getWordInstance(Locale locale)457{458return getBreakInstance(locale, WORD_INDEX);459}460461/**462* Returns a new <code>BreakIterator</code> instance463* for <a href="BreakIterator.html#line">line breaks</a>464* for the {@linkplain Locale#getDefault() default locale}.465* @return A break iterator for line breaks466*/467public static BreakIterator getLineInstance()468{469return getLineInstance(Locale.getDefault());470}471472/**473* Returns a new <code>BreakIterator</code> instance474* for <a href="BreakIterator.html#line">line breaks</a>475* for the given locale.476* @param locale the desired locale477* @return A break iterator for line breaks478* @exception NullPointerException if <code>locale</code> is null479*/480public static BreakIterator getLineInstance(Locale locale)481{482return getBreakInstance(locale, 
LINE_INDEX);483}484485/**486* Returns a new <code>BreakIterator</code> instance487* for <a href="BreakIterator.html#character">character breaks</a>488* for the {@linkplain Locale#getDefault() default locale}.489* @return A break iterator for character breaks490*/491public static BreakIterator getCharacterInstance()492{493return getCharacterInstance(Locale.getDefault());494}495496/**497* Returns a new <code>BreakIterator</code> instance498* for <a href="BreakIterator.html#character">character breaks</a>499* for the given locale.500* @param locale the desired locale501* @return A break iterator for character breaks502* @exception NullPointerException if <code>locale</code> is null503*/504public static BreakIterator getCharacterInstance(Locale locale)505{506return getBreakInstance(locale, CHARACTER_INDEX);507}508509/**510* Returns a new <code>BreakIterator</code> instance511* for <a href="BreakIterator.html#sentence">sentence breaks</a>512* for the {@linkplain Locale#getDefault() default locale}.513* @return A break iterator for sentence breaks514*/515public static BreakIterator getSentenceInstance()516{517return getSentenceInstance(Locale.getDefault());518}519520/**521* Returns a new <code>BreakIterator</code> instance522* for <a href="BreakIterator.html#sentence">sentence breaks</a>523* for the given locale.524* @param locale the desired locale525* @return A break iterator for sentence breaks526* @exception NullPointerException if <code>locale</code> is null527*/528public static BreakIterator getSentenceInstance(Locale locale)529{530return getBreakInstance(locale, SENTENCE_INDEX);531}532533private static BreakIterator getBreakInstance(Locale locale, int type) {534if (iterCache[type] != null) {535BreakIteratorCache cache = iterCache[type].get();536if (cache != null) {537if (cache.getLocale().equals(locale)) {538return cache.createBreakInstance();539}540}541}542543BreakIterator result = createBreakInstance(locale, type);544BreakIteratorCache cache = new 
BreakIteratorCache(locale, result);545iterCache[type] = new SoftReference<>(cache);546return result;547}548549private static BreakIterator createBreakInstance(Locale locale,550int type) {551LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);552BreakIterator iterator = createBreakInstance(adapter, locale, type);553if (iterator == null) {554iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type);555}556return iterator;557}558559private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) {560BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();561BreakIterator iterator = null;562switch (type) {563case CHARACTER_INDEX:564iterator = breakIteratorProvider.getCharacterInstance(locale);565break;566case WORD_INDEX:567iterator = breakIteratorProvider.getWordInstance(locale);568break;569case LINE_INDEX:570iterator = breakIteratorProvider.getLineInstance(locale);571break;572case SENTENCE_INDEX:573iterator = breakIteratorProvider.getSentenceInstance(locale);574break;575}576return iterator;577}578579/**580* Returns an array of all locales for which the581* <code>get*Instance</code> methods of this class can return582* localized instances.583* The returned array represents the union of locales supported by the Java584* runtime and by installed585* {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.586* It must contain at least a <code>Locale</code>587* instance equal to {@link java.util.Locale#US Locale.US}.588*589* @return An array of locales for which localized590* <code>BreakIterator</code> instances are available.591*/592public static synchronized Locale[] getAvailableLocales()593{594LocaleServiceProviderPool pool =595LocaleServiceProviderPool.getPool(BreakIteratorProvider.class);596return pool.getAvailableLocales();597}598599private static final class BreakIteratorCache {600601private BreakIterator 
iter;602private Locale locale;603604BreakIteratorCache(Locale locale, BreakIterator iter) {605this.locale = locale;606this.iter = (BreakIterator) iter.clone();607}608609Locale getLocale() {610return locale;611}612613BreakIterator createBreakInstance() {614return (BreakIterator) iter.clone();615}616}617}618619620