Path: blob/jdk8u272-b10-aarch32-20201026/jdk/src/share/native/common/unicode/ubrk.h
48773 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3******************************************************************************4* Copyright (C) 1996-2015, International Business Machines Corporation and others.5* All Rights Reserved.6******************************************************************************7*/89#ifndef UBRK_H10#define UBRK_H1112#include "unicode/utypes.h"13#include "unicode/uloc.h"14#include "unicode/utext.h"15#include "unicode/localpointer.h"1617/**18* A text-break iterator.19* For usage in C programs.20*/21#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR22# define UBRK_TYPEDEF_UBREAK_ITERATOR23/**24* Opaque type representing an ICU Break iterator object.25* @stable ICU 2.026*/27typedef struct UBreakIterator UBreakIterator;28#endif2930#if !UCONFIG_NO_BREAK_ITERATION3132#include "unicode/parseerr.h"3334/**35* \file36* \brief C API: BreakIterator37*38* <h2> BreakIterator C API </h2>39*40* The BreakIterator C API defines methods for finding the location41* of boundaries in text. Pointer to a UBreakIterator maintain a42* current position and scan over text returning the index of characters43* where boundaries occur.44* <p>45* Line boundary analysis determines where a text string can be broken46* when line-wrapping. The mechanism correctly handles punctuation and47* hyphenated words.48* <p>49* Note: The locale keyword "lb" can be used to modify line break50* behavior according to the CSS level 3 line-break options, see51* <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:52* "ja@lb=strict", "zh@lb=loose".53* <p>54* Sentence boundary analysis allows selection with correct55* interpretation of periods within numbers and abbreviations, and56* trailing punctuation marks such as quotation marks and parentheses.57* <p>58* Note: The locale keyword "ss" can be used to enable use of59* segmentation suppression data (preventing breaks in English after60* abbreviations such as "Mr." or "Est.", for example), as follows:61* "en@ss=standard".62* <p>63* Word boundary analysis is used by search and replace functions, as64* well as within text editing applications that allow the user to65* select words with a double click. Word selection provides correct66* interpretation of punctuation marks within and following67* words. Characters that are not part of a word, such as symbols or68* punctuation marks, have word-breaks on both sides.69* <p>70* Character boundary analysis identifies the boundaries of71* "Extended Grapheme Clusters", which are groupings of codepoints72* that should be treated as character-like units for many text operations.73* Please see Unicode Standard Annex #29, Unicode Text Segmentation,74* http://www.unicode.org/reports/tr29/ for additional information75* on grapheme clusters and guidelines on their use.76* <p>77* Title boundary analysis locates all positions,78* typically starts of words, that should be set to Title Case79* when title casing the text.80* <p>81* The text boundary positions are found according to the rules82* described in Unicode Standard Annex #29, Text Boundaries, and83* Unicode Standard Annex #14, Line Breaking Properties. These84* are available at http://www.unicode.org/reports/tr14/ and85* http://www.unicode.org/reports/tr29/.86* <p>87* In addition to the plain C API defined in this header file, an88* object oriented C++ API with equivalent functionality is defined in the89* file brkiter.h.90* <p>91* Code snippets illustrating the use of the Break Iterator APIs92* are available in the ICU User Guide,93* http://icu-project.org/userguide/boundaryAnalysis.html94* and in the sample program icu/source/samples/break/break.cpp95*/9697/** The possible types of text boundaries. @stable ICU 2.0 */98typedef enum UBreakIteratorType {99/** Character breaks @stable ICU 2.0 */100UBRK_CHARACTER = 0,101/** Word breaks @stable ICU 2.0 */102UBRK_WORD = 1,103/** Line breaks @stable ICU 2.0 */104UBRK_LINE = 2,105/** Sentence breaks @stable ICU 2.0 */106UBRK_SENTENCE = 3,107108#ifndef U_HIDE_DEPRECATED_API109/**110* Title Case breaks111* The iterator created using this type locates title boundaries as described for112* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,113* please use Word Boundary iterator.114*115* @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.116*/117UBRK_TITLE = 4,118/**119* One more than the highest normal UBreakIteratorType value.120* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.121*/122UBRK_COUNT = 5123#endif // U_HIDE_DEPRECATED_API124} UBreakIteratorType;125126/** Value indicating all text boundaries have been returned.127* @stable ICU 2.0128*/129#define UBRK_DONE ((int32_t) -1)130131132/**133* Enum constants for the word break tags returned by134* getRuleStatus(). A range of values is defined for each category of135* word, to allow for further subdivisions of a category in future releases.136* Applications should check for tag values falling within the range, rather137* than for single individual values.138*139* The numeric values of all of these constants are stable (will not change).140*141* @stable ICU 2.2142*/143typedef enum UWordBreak {144/** Tag value for "words" that do not fit into any of other categories.145* Includes spaces and most punctuation. */146UBRK_WORD_NONE = 0,147/** Upper bound for tags for uncategorized words. */148UBRK_WORD_NONE_LIMIT = 100,149/** Tag value for words that appear to be numbers, lower limit. */150UBRK_WORD_NUMBER = 100,151/** Tag value for words that appear to be numbers, upper limit. */152UBRK_WORD_NUMBER_LIMIT = 200,153/** Tag value for words that contain letters, excluding154* hiragana, katakana or ideographic characters, lower limit. */155UBRK_WORD_LETTER = 200,156/** Tag value for words containing letters, upper limit */157UBRK_WORD_LETTER_LIMIT = 300,158/** Tag value for words containing kana characters, lower limit */159UBRK_WORD_KANA = 300,160/** Tag value for words containing kana characters, upper limit */161UBRK_WORD_KANA_LIMIT = 400,162/** Tag value for words containing ideographic characters, lower limit */163UBRK_WORD_IDEO = 400,164/** Tag value for words containing ideographic characters, upper limit */165UBRK_WORD_IDEO_LIMIT = 500166} UWordBreak;167168/**169* Enum constants for the line break tags returned by getRuleStatus().170* A range of values is defined for each category of171* word, to allow for further subdivisions of a category in future releases.172* Applications should check for tag values falling within the range, rather173* than for single individual values.174*175* The numeric values of all of these constants are stable (will not change).176*177* @stable ICU 2.8178*/179typedef enum ULineBreakTag {180/** Tag value for soft line breaks, positions at which a line break181* is acceptable but not required */182UBRK_LINE_SOFT = 0,183/** Upper bound for soft line breaks. */184UBRK_LINE_SOFT_LIMIT = 100,185/** Tag value for a hard, or mandatory line break */186UBRK_LINE_HARD = 100,187/** Upper bound for hard line breaks. */188UBRK_LINE_HARD_LIMIT = 200189} ULineBreakTag;190191192193/**194* Enum constants for the sentence break tags returned by getRuleStatus().195* A range of values is defined for each category of196* sentence, to allow for further subdivisions of a category in future releases.197* Applications should check for tag values falling within the range, rather198* than for single individual values.199*200* The numeric values of all of these constants are stable (will not change).201*202* @stable ICU 2.8203*/204typedef enum USentenceBreakTag {205/** Tag value for for sentences ending with a sentence terminator206* ('.', '?', '!', etc.) character, possibly followed by a207* hard separator (CR, LF, PS, etc.)208*/209UBRK_SENTENCE_TERM = 0,210/** Upper bound for tags for sentences ended by sentence terminators. */211UBRK_SENTENCE_TERM_LIMIT = 100,212/** Tag value for for sentences that do not contain an ending213* sentence terminator ('.', '?', '!', etc.) character, but214* are ended only by a hard separator (CR, LF, PS, etc.) or end of input.215*/216UBRK_SENTENCE_SEP = 100,217/** Upper bound for tags for sentences ended by a separator. */218UBRK_SENTENCE_SEP_LIMIT = 200219/** Tag value for a hard, or mandatory line break */220} USentenceBreakTag;221222223/**224* Open a new UBreakIterator for locating text boundaries for a specified locale.225* A UBreakIterator may be used for detecting character, line, word,226* and sentence breaks in text.227* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,228* UBRK_LINE, UBRK_SENTENCE229* @param locale The locale specifying the text-breaking conventions. Note that230* locale keys such as "lb" and "ss" may be used to modify text break behavior,231* see general discussion of BreakIterator C API.232* @param text The text to be iterated over. May be null, in which case ubrk_setText() is233* used to specify the text to be iterated.234* @param textLength The number of characters in text, or -1 if null-terminated.235* @param status A UErrorCode to receive any errors.236* @return A UBreakIterator for the specified locale.237* @see ubrk_openRules238* @stable ICU 2.0239*/240U_STABLE UBreakIterator* U_EXPORT2241ubrk_open(UBreakIteratorType type,242const char *locale,243const UChar *text,244int32_t textLength,245UErrorCode *status);246247/**248* Open a new UBreakIterator for locating text boundaries using specified breaking rules.249* The rule syntax is ... (TBD)250* @param rules A set of rules specifying the text breaking conventions.251* @param rulesLength The number of characters in rules, or -1 if null-terminated.252* @param text The text to be iterated over. May be null, in which case ubrk_setText() is253* used to specify the text to be iterated.254* @param textLength The number of characters in text, or -1 if null-terminated.255* @param parseErr Receives position and context information for any syntax errors256* detected while parsing the rules.257* @param status A UErrorCode to receive any errors.258* @return A UBreakIterator for the specified rules.259* @see ubrk_open260* @stable ICU 2.2261*/262U_STABLE UBreakIterator* U_EXPORT2263ubrk_openRules(const UChar *rules,264int32_t rulesLength,265const UChar *text,266int32_t textLength,267UParseError *parseErr,268UErrorCode *status);269270/**271* Open a new UBreakIterator for locating text boundaries using precompiled binary rules.272* Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.273* Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not274* compatible across different major versions of ICU, nor across platforms of different275* endianness or different base character set family (ASCII vs EBCDIC).276* @param binaryRules A set of compiled binary rules specifying the text breaking277* conventions. Ownership of the storage containing the compiled278* rules remains with the caller of this function. The compiled279* rules must not be modified or deleted during the life of the280* break iterator.281* @param rulesLength The length of binaryRules in bytes; must be >= 0.282* @param text The text to be iterated over. May be null, in which case283* ubrk_setText() is used to specify the text to be iterated.284* @param textLength The number of characters in text, or -1 if null-terminated.285* @param status Pointer to UErrorCode to receive any errors.286* @return UBreakIterator for the specified rules.287* @see ubrk_getBinaryRules288* @stable ICU 59289*/290U_STABLE UBreakIterator* U_EXPORT2291ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,292const UChar * text, int32_t textLength,293UErrorCode * status);294295/**296* Thread safe cloning operation297* @param bi iterator to be cloned298* @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>299* user allocated space for the new clone. If NULL new memory will be allocated.300* If buffer is not large enough, new memory will be allocated.301* Clients can use the U_BRK_SAFECLONE_BUFFERSIZE.302* @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>303* pointer to size of allocated space.304* If *pBufferSize == 0, a sufficient size for use in cloning will305* be returned ('pre-flighting')306* If *pBufferSize is not enough for a stack-based safe clone,307* new memory will be allocated.308* @param status to indicate whether the operation went on smoothly or there were errors309* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.310* @return pointer to the new clone311* @stable ICU 2.0312*/313U_STABLE UBreakIterator * U_EXPORT2314ubrk_safeClone(315const UBreakIterator *bi,316void *stackBuffer,317int32_t *pBufferSize,318UErrorCode *status);319320#ifndef U_HIDE_DEPRECATED_API321322/**323* A recommended size (in bytes) for the memory buffer to be passed to ubrk_saveClone().324* @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.325*/326#define U_BRK_SAFECLONE_BUFFERSIZE 1327328#endif /* U_HIDE_DEPRECATED_API */329330/**331* Close a UBreakIterator.332* Once closed, a UBreakIterator may no longer be used.333* @param bi The break iterator to close.334* @stable ICU 2.0335*/336U_STABLE void U_EXPORT2337ubrk_close(UBreakIterator *bi);338339#if U_SHOW_CPLUSPLUS_API340341U_NAMESPACE_BEGIN342343/**344* \class LocalUBreakIteratorPointer345* "Smart pointer" class, closes a UBreakIterator via ubrk_close().346* For most methods see the LocalPointerBase base class.347*348* @see LocalPointerBase349* @see LocalPointer350* @stable ICU 4.4351*/352U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);353354U_NAMESPACE_END355356#endif357358/**359* Sets an existing iterator to point to a new piece of text.360* The break iterator retains a pointer to the supplied text.361* The caller must not modify or delete the text while the BreakIterator362* retains the reference.363*364* @param bi The iterator to use365* @param text The text to be set366* @param textLength The length of the text367* @param status The error code368* @stable ICU 2.0369*/370U_STABLE void U_EXPORT2371ubrk_setText(UBreakIterator* bi,372const UChar* text,373int32_t textLength,374UErrorCode* status);375376377/**378* Sets an existing iterator to point to a new piece of text.379*380* All index positions returned by break iterator functions are381* native indices from the UText. For example, when breaking UTF-8382* encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.383* will be UTF-8 string indices, not UTF-16 positions.384*385* @param bi The iterator to use386* @param text The text to be set.387* This function makes a shallow clone of the supplied UText. This means388* that the caller is free to immediately close or otherwise reuse the389* UText that was passed as a parameter, but that the underlying text itself390* must not be altered while being referenced by the break iterator.391* @param status The error code392* @stable ICU 3.4393*/394U_STABLE void U_EXPORT2395ubrk_setUText(UBreakIterator* bi,396UText* text,397UErrorCode* status);398399400401/**402* Determine the most recently-returned text boundary.403*404* @param bi The break iterator to use.405* @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,406* \ref ubrk_first, or \ref ubrk_last.407* @stable ICU 2.0408*/409U_STABLE int32_t U_EXPORT2410ubrk_current(const UBreakIterator *bi);411412/**413* Advance the iterator to the boundary following the current boundary.414*415* @param bi The break iterator to use.416* @return The character index of the next text boundary, or UBRK_DONE417* if all text boundaries have been returned.418* @see ubrk_previous419* @stable ICU 2.0420*/421U_STABLE int32_t U_EXPORT2422ubrk_next(UBreakIterator *bi);423424/**425* Set the iterator position to the boundary preceding the current boundary.426*427* @param bi The break iterator to use.428* @return The character index of the preceding text boundary, or UBRK_DONE429* if all text boundaries have been returned.430* @see ubrk_next431* @stable ICU 2.0432*/433U_STABLE int32_t U_EXPORT2434ubrk_previous(UBreakIterator *bi);435436/**437* Set the iterator position to zero, the start of the text being scanned.438* @param bi The break iterator to use.439* @return The new iterator position (zero).440* @see ubrk_last441* @stable ICU 2.0442*/443U_STABLE int32_t U_EXPORT2444ubrk_first(UBreakIterator *bi);445446/**447* Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.448* This is not the same as the last character.449* @param bi The break iterator to use.450* @return The character offset immediately <EM>beyond</EM> the last character in the451* text being scanned.452* @see ubrk_first453* @stable ICU 2.0454*/455U_STABLE int32_t U_EXPORT2456ubrk_last(UBreakIterator *bi);457458/**459* Set the iterator position to the first boundary preceding the specified offset.460* The new position is always smaller than offset, or UBRK_DONE.461* @param bi The break iterator to use.462* @param offset The offset to begin scanning.463* @return The text boundary preceding offset, or UBRK_DONE.464* @see ubrk_following465* @stable ICU 2.0466*/467U_STABLE int32_t U_EXPORT2468ubrk_preceding(UBreakIterator *bi,469int32_t offset);470471/**472* Advance the iterator to the first boundary following the specified offset.473* The value returned is always greater than offset, or UBRK_DONE.474* @param bi The break iterator to use.475* @param offset The offset to begin scanning.476* @return The text boundary following offset, or UBRK_DONE.477* @see ubrk_preceding478* @stable ICU 2.0479*/480U_STABLE int32_t U_EXPORT2481ubrk_following(UBreakIterator *bi,482int32_t offset);483484/**485* Get a locale for which text breaking information is available.486* A UBreakIterator in a locale returned by this function will perform the correct487* text breaking for the locale.488* @param index The index of the desired locale.489* @return A locale for which number text breaking information is available, or 0 if none.490* @see ubrk_countAvailable491* @stable ICU 2.0492*/493U_STABLE const char* U_EXPORT2494ubrk_getAvailable(int32_t index);495496/**497* Determine how many locales have text breaking information available.498* This function is most useful as determining the loop ending condition for499* calls to \ref ubrk_getAvailable.500* @return The number of locales for which text breaking information is available.501* @see ubrk_getAvailable502* @stable ICU 2.0503*/504U_STABLE int32_t U_EXPORT2505ubrk_countAvailable(void);506507508/**509* Returns true if the specified position is a boundary position. As a side510* effect, leaves the iterator pointing to the first boundary position at511* or after "offset".512* @param bi The break iterator to use.513* @param offset the offset to check.514* @return True if "offset" is a boundary position.515* @stable ICU 2.0516*/517U_STABLE UBool U_EXPORT2518ubrk_isBoundary(UBreakIterator *bi, int32_t offset);519520/**521* Return the status from the break rule that determined the most recently522* returned break position. The values appear in the rule source523* within brackets, {123}, for example. For rules that do not specify a524* status, a default value of 0 is returned.525* <p>526* For word break iterators, the possible values are defined in enum UWordBreak.527* @stable ICU 2.2528*/529U_STABLE int32_t U_EXPORT2530ubrk_getRuleStatus(UBreakIterator *bi);531532/**533* Get the statuses from the break rules that determined the most recently534* returned break position. The values appear in the rule source535* within brackets, {123}, for example. The default status value for rules536* that do not explicitly provide one is zero.537* <p>538* For word break iterators, the possible values are defined in enum UWordBreak.539* @param bi The break iterator to use540* @param fillInVec an array to be filled in with the status values.541* @param capacity the length of the supplied vector. A length of zero causes542* the function to return the number of status values, in the543* normal way, without attempting to store any values.544* @param status receives error codes.545* @return The number of rule status values from rules that determined546* the most recent boundary returned by the break iterator.547* @stable ICU 3.0548*/549U_STABLE int32_t U_EXPORT2550ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);551552/**553* Return the locale of the break iterator. You can choose between the valid and554* the actual locale.555* @param bi break iterator556* @param type locale type (valid or actual)557* @param status error code558* @return locale string559* @stable ICU 2.8560*/561U_STABLE const char* U_EXPORT2562ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);563564/**565* Set the subject text string upon which the break iterator is operating566* without changing any other aspect of the state.567* The new and previous text strings must have the same content.568*569* This function is intended for use in environments where ICU is operating on570* strings that may move around in memory. It provides a mechanism for notifying571* ICU that the string has been relocated, and providing a new UText to access the572* string in its new position.573*574* Note that the break iterator never copies the underlying text575* of a string being processed, but always operates directly on the original text576* provided by the user. Refreshing simply drops the references to the old text577* and replaces them with references to the new.578*579* Caution: this function is normally used only by very specialized580* system-level code. One example use case is with garbage collection581* that moves the text in memory.582*583* @param bi The break iterator.584* @param text The new (moved) text string.585* @param status Receives errors detected by this function.586*587* @stable ICU 49588*/589U_STABLE void U_EXPORT2590ubrk_refreshUText(UBreakIterator *bi,591UText *text,592UErrorCode *status);593594595/**596* Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.597* The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator598* more quickly than using ubrk_openRules. The compiled rules are not compatible across599* different major versions of ICU, nor across platforms of different endianness or600* different base character set family (ASCII vs EBCDIC). Supports preflighting (with601* binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to602* the binaryRules buffer. However, whether preflighting or not, if the actual length603* is greater than INT32_MAX, then the function returns 0 and sets *status to604* U_INDEX_OUTOFBOUNDS_ERROR.605606* @param bi The break iterator to use.607* @param binaryRules Buffer to receive the compiled binary rules; set to NULL for608* preflighting.609* @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for610* preflighting. Must be >= 0.611* @param status Pointer to UErrorCode to receive any errors, such as612* U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or613* U_ILLEGAL_ARGUMENT_ERROR.614* @return The actual byte length of the binary rules, if <= INT32_MAX;615* otherwise 0. If not preflighting and this is larger than616* rulesCapacity, *status will be set to an error.617* @see ubrk_openBinaryRules618* @stable ICU 59619*/620U_STABLE int32_t U_EXPORT2621ubrk_getBinaryRules(UBreakIterator *bi,622uint8_t * binaryRules, int32_t rulesCapacity,623UErrorCode * status);624625#endif /* #if !UCONFIG_NO_BREAK_ITERATION */626627#endif628629630