Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/native/common/unicode/ubrk.h
38827 views
/*1******************************************************************************2* Copyright (C) 1996-2015, International Business Machines Corporation and others.3* All Rights Reserved.4******************************************************************************5*/67#ifndef UBRK_H8#define UBRK_H910#include "unicode/utypes.h"11#include "unicode/uloc.h"12#include "unicode/utext.h"13#include "unicode/localpointer.h"1415/**16* A text-break iterator.17* For usage in C programs.18*/19#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR20# define UBRK_TYPEDEF_UBREAK_ITERATOR21/**22* Opaque type representing an ICU Break iterator object.23* @stable ICU 2.024*/25typedef struct UBreakIterator UBreakIterator;26#endif2728#if !UCONFIG_NO_BREAK_ITERATION2930#include "unicode/parseerr.h"3132/**33* \file34* \brief C API: BreakIterator35*36* <h2> BreakIterator C API </h2>37*38* The BreakIterator C API defines methods for finding the location39* of boundaries in text. Pointer to a UBreakIterator maintain a40* current position and scan over text returning the index of characters41* where boundaries occur.42* <p>43* Line boundary analysis determines where a text string can be broken44* when line-wrapping. The mechanism correctly handles punctuation and45* hyphenated words.46* <p>47* Note: The locale keyword "lb" can be used to modify line break48* behavior according to the CSS level 3 line-break options, see49* <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:50* "ja@lb=strict", "zh@lb=loose".51* <p>52* Sentence boundary analysis allows selection with correct53* interpretation of periods within numbers and abbreviations, and54* trailing punctuation marks such as quotation marks and parentheses.55* <p>56* Note: The locale keyword "ss" can be used to enable use of57* segmentation suppression data (preventing breaks in English after58* abbreviations such as "Mr." or "Est.", for example), as follows:59* "en@ss=standard".60* <p>61* Word boundary analysis is used by search and replace functions, as62* well as within text editing applications that allow the user to63* select words with a double click. Word selection provides correct64* interpretation of punctuation marks within and following65* words. Characters that are not part of a word, such as symbols or66* punctuation marks, have word-breaks on both sides.67* <p>68* Character boundary analysis identifies the boundaries of69* "Extended Grapheme Clusters", which are groupings of codepoints70* that should be treated as character-like units for many text operations.71* Please see Unicode Standard Annex #29, Unicode Text Segmentation,72* http://www.unicode.org/reports/tr29/ for additional information73* on grapheme clusters and guidelines on their use.74* <p>75* Title boundary analysis locates all positions,76* typically starts of words, that should be set to Title Case77* when title casing the text.78* <p>79* The text boundary positions are found according to the rules80* described in Unicode Standard Annex #29, Text Boundaries, and81* Unicode Standard Annex #14, Line Breaking Properties. These82* are available at http://www.unicode.org/reports/tr14/ and83* http://www.unicode.org/reports/tr29/.84* <p>85* In addition to the plain C API defined in this header file, an86* object oriented C++ API with equivalent functionality is defined in the87* file brkiter.h.88* <p>89* Code snippets illustrating the use of the Break Iterator APIs90* are available in the ICU User Guide,91* http://icu-project.org/userguide/boundaryAnalysis.html92* and in the sample program icu/source/samples/break/break.cpp93*/9495/** The possible types of text boundaries. @stable ICU 2.0 */96typedef enum UBreakIteratorType {97/** Character breaks @stable ICU 2.0 */98UBRK_CHARACTER = 0,99/** Word breaks @stable ICU 2.0 */100UBRK_WORD = 1,101/** Line breaks @stable ICU 2.0 */102UBRK_LINE = 2,103/** Sentence breaks @stable ICU 2.0 */104UBRK_SENTENCE = 3,105106#ifndef U_HIDE_DEPRECATED_API107/**108* Title Case breaks109* The iterator created using this type locates title boundaries as described for110* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,111* please use Word Boundary iterator.112*113* @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.114*/115UBRK_TITLE = 4,116#endif /* U_HIDE_DEPRECATED_API */117UBRK_COUNT = 5118} UBreakIteratorType;119120/** Value indicating all text boundaries have been returned.121* @stable ICU 2.0122*/123#define UBRK_DONE ((int32_t) -1)124125126/**127* Enum constants for the word break tags returned by128* getRuleStatus(). A range of values is defined for each category of129* word, to allow for further subdivisions of a category in future releases.130* Applications should check for tag values falling within the range, rather131* than for single individual values.132* @stable ICU 2.2133*/134typedef enum UWordBreak {135/** Tag value for "words" that do not fit into any of other categories.136* Includes spaces and most punctuation. */137UBRK_WORD_NONE = 0,138/** Upper bound for tags for uncategorized words. */139UBRK_WORD_NONE_LIMIT = 100,140/** Tag value for words that appear to be numbers, lower limit. */141UBRK_WORD_NUMBER = 100,142/** Tag value for words that appear to be numbers, upper limit. */143UBRK_WORD_NUMBER_LIMIT = 200,144/** Tag value for words that contain letters, excluding145* hiragana, katakana or ideographic characters, lower limit. */146UBRK_WORD_LETTER = 200,147/** Tag value for words containing letters, upper limit */148UBRK_WORD_LETTER_LIMIT = 300,149/** Tag value for words containing kana characters, lower limit */150UBRK_WORD_KANA = 300,151/** Tag value for words containing kana characters, upper limit */152UBRK_WORD_KANA_LIMIT = 400,153/** Tag value for words containing ideographic characters, lower limit */154UBRK_WORD_IDEO = 400,155/** Tag value for words containing ideographic characters, upper limit */156UBRK_WORD_IDEO_LIMIT = 500157} UWordBreak;158159/**160* Enum constants for the line break tags returned by getRuleStatus().161* A range of values is defined for each category of162* word, to allow for further subdivisions of a category in future releases.163* Applications should check for tag values falling within the range, rather164* than for single individual values.165* @stable ICU 2.8166*/167typedef enum ULineBreakTag {168/** Tag value for soft line breaks, positions at which a line break169* is acceptable but not required */170UBRK_LINE_SOFT = 0,171/** Upper bound for soft line breaks. */172UBRK_LINE_SOFT_LIMIT = 100,173/** Tag value for a hard, or mandatory line break */174UBRK_LINE_HARD = 100,175/** Upper bound for hard line breaks. */176UBRK_LINE_HARD_LIMIT = 200177} ULineBreakTag;178179180181/**182* Enum constants for the sentence break tags returned by getRuleStatus().183* A range of values is defined for each category of184* sentence, to allow for further subdivisions of a category in future releases.185* Applications should check for tag values falling within the range, rather186* than for single individual values.187* @stable ICU 2.8188*/189typedef enum USentenceBreakTag {190/** Tag value for for sentences ending with a sentence terminator191* ('.', '?', '!', etc.) character, possibly followed by a192* hard separator (CR, LF, PS, etc.)193*/194UBRK_SENTENCE_TERM = 0,195/** Upper bound for tags for sentences ended by sentence terminators. */196UBRK_SENTENCE_TERM_LIMIT = 100,197/** Tag value for for sentences that do not contain an ending198* sentence terminator ('.', '?', '!', etc.) character, but199* are ended only by a hard separator (CR, LF, PS, etc.) or end of input.200*/201UBRK_SENTENCE_SEP = 100,202/** Upper bound for tags for sentences ended by a separator. */203UBRK_SENTENCE_SEP_LIMIT = 200204/** Tag value for a hard, or mandatory line break */205} USentenceBreakTag;206207208/**209* Open a new UBreakIterator for locating text boundaries for a specified locale.210* A UBreakIterator may be used for detecting character, line, word,211* and sentence breaks in text.212* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,213* UBRK_LINE, UBRK_SENTENCE214* @param locale The locale specifying the text-breaking conventions. Note that215* locale keys such as "lb" and "ss" may be used to modify text break behavior,216* see general discussion of BreakIterator C API.217* @param text The text to be iterated over.218* @param textLength The number of characters in text, or -1 if null-terminated.219* @param status A UErrorCode to receive any errors.220* @return A UBreakIterator for the specified locale.221* @see ubrk_openRules222* @stable ICU 2.0223*/224U_STABLE UBreakIterator* U_EXPORT2225ubrk_open(UBreakIteratorType type,226const char *locale,227const UChar *text,228int32_t textLength,229UErrorCode *status);230231/**232* Open a new UBreakIterator for locating text boundaries using specified breaking rules.233* The rule syntax is ... (TBD)234* @param rules A set of rules specifying the text breaking conventions.235* @param rulesLength The number of characters in rules, or -1 if null-terminated.236* @param text The text to be iterated over. May be null, in which case ubrk_setText() is237* used to specify the text to be iterated.238* @param textLength The number of characters in text, or -1 if null-terminated.239* @param parseErr Receives position and context information for any syntax errors240* detected while parsing the rules.241* @param status A UErrorCode to receive any errors.242* @return A UBreakIterator for the specified rules.243* @see ubrk_open244* @stable ICU 2.2245*/246U_STABLE UBreakIterator* U_EXPORT2247ubrk_openRules(const UChar *rules,248int32_t rulesLength,249const UChar *text,250int32_t textLength,251UParseError *parseErr,252UErrorCode *status);253254/**255* Thread safe cloning operation256* @param bi iterator to be cloned257* @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>258* user allocated space for the new clone. If NULL new memory will be allocated.259* If buffer is not large enough, new memory will be allocated.260* Clients can use the U_BRK_SAFECLONE_BUFFERSIZE.261* @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>262* pointer to size of allocated space.263* If *pBufferSize == 0, a sufficient size for use in cloning will264* be returned ('pre-flighting')265* If *pBufferSize is not enough for a stack-based safe clone,266* new memory will be allocated.267* @param status to indicate whether the operation went on smoothly or there were errors268* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.269* @return pointer to the new clone270* @stable ICU 2.0271*/272U_STABLE UBreakIterator * U_EXPORT2273ubrk_safeClone(274const UBreakIterator *bi,275void *stackBuffer,276int32_t *pBufferSize,277UErrorCode *status);278279#ifndef U_HIDE_DEPRECATED_API280281/**282* A recommended size (in bytes) for the memory buffer to be passed to ubrk_saveClone().283* @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.284*/285#define U_BRK_SAFECLONE_BUFFERSIZE 1286287#endif /* U_HIDE_DEPRECATED_API */288289/**290* Close a UBreakIterator.291* Once closed, a UBreakIterator may no longer be used.292* @param bi The break iterator to close.293* @stable ICU 2.0294*/295U_STABLE void U_EXPORT2296ubrk_close(UBreakIterator *bi);297298#if U_SHOW_CPLUSPLUS_API299300U_NAMESPACE_BEGIN301302/**303* \class LocalUBreakIteratorPointer304* "Smart pointer" class, closes a UBreakIterator via ubrk_close().305* For most methods see the LocalPointerBase base class.306*307* @see LocalPointerBase308* @see LocalPointer309* @stable ICU 4.4310*/311U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);312313U_NAMESPACE_END314315#endif316317/**318* Sets an existing iterator to point to a new piece of text319* @param bi The iterator to use320* @param text The text to be set321* @param textLength The length of the text322* @param status The error code323* @stable ICU 2.0324*/325U_STABLE void U_EXPORT2326ubrk_setText(UBreakIterator* bi,327const UChar* text,328int32_t textLength,329UErrorCode* status);330331332/**333* Sets an existing iterator to point to a new piece of text.334*335* All index positions returned by break iterator functions are336* native indices from the UText. For example, when breaking UTF-8337* encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.338* will be UTF-8 string indices, not UTF-16 positions.339*340* @param bi The iterator to use341* @param text The text to be set.342* This function makes a shallow clone of the supplied UText. This means343* that the caller is free to immediately close or otherwise reuse the344* UText that was passed as a parameter, but that the underlying text itself345* must not be altered while being referenced by the break iterator.346* @param status The error code347* @stable ICU 3.4348*/349U_STABLE void U_EXPORT2350ubrk_setUText(UBreakIterator* bi,351UText* text,352UErrorCode* status);353354355356/**357* Determine the most recently-returned text boundary.358*359* @param bi The break iterator to use.360* @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,361* \ref ubrk_first, or \ref ubrk_last.362* @stable ICU 2.0363*/364U_STABLE int32_t U_EXPORT2365ubrk_current(const UBreakIterator *bi);366367/**368* Advance the iterator to the boundary following the current boundary.369*370* @param bi The break iterator to use.371* @return The character index of the next text boundary, or UBRK_DONE372* if all text boundaries have been returned.373* @see ubrk_previous374* @stable ICU 2.0375*/376U_STABLE int32_t U_EXPORT2377ubrk_next(UBreakIterator *bi);378379/**380* Set the iterator position to the boundary preceding the current boundary.381*382* @param bi The break iterator to use.383* @return The character index of the preceding text boundary, or UBRK_DONE384* if all text boundaries have been returned.385* @see ubrk_next386* @stable ICU 2.0387*/388U_STABLE int32_t U_EXPORT2389ubrk_previous(UBreakIterator *bi);390391/**392* Set the iterator position to zero, the start of the text being scanned.393* @param bi The break iterator to use.394* @return The new iterator position (zero).395* @see ubrk_last396* @stable ICU 2.0397*/398U_STABLE int32_t U_EXPORT2399ubrk_first(UBreakIterator *bi);400401/**402* Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.403* This is not the same as the last character.404* @param bi The break iterator to use.405* @return The character offset immediately <EM>beyond</EM> the last character in the406* text being scanned.407* @see ubrk_first408* @stable ICU 2.0409*/410U_STABLE int32_t U_EXPORT2411ubrk_last(UBreakIterator *bi);412413/**414* Set the iterator position to the first boundary preceding the specified offset.415* The new position is always smaller than offset, or UBRK_DONE.416* @param bi The break iterator to use.417* @param offset The offset to begin scanning.418* @return The text boundary preceding offset, or UBRK_DONE.419* @see ubrk_following420* @stable ICU 2.0421*/422U_STABLE int32_t U_EXPORT2423ubrk_preceding(UBreakIterator *bi,424int32_t offset);425426/**427* Advance the iterator to the first boundary following the specified offset.428* The value returned is always greater than offset, or UBRK_DONE.429* @param bi The break iterator to use.430* @param offset The offset to begin scanning.431* @return The text boundary following offset, or UBRK_DONE.432* @see ubrk_preceding433* @stable ICU 2.0434*/435U_STABLE int32_t U_EXPORT2436ubrk_following(UBreakIterator *bi,437int32_t offset);438439/**440* Get a locale for which text breaking information is available.441* A UBreakIterator in a locale returned by this function will perform the correct442* text breaking for the locale.443* @param index The index of the desired locale.444* @return A locale for which number text breaking information is available, or 0 if none.445* @see ubrk_countAvailable446* @stable ICU 2.0447*/448U_STABLE const char* U_EXPORT2449ubrk_getAvailable(int32_t index);450451/**452* Determine how many locales have text breaking information available.453* This function is most useful as determining the loop ending condition for454* calls to \ref ubrk_getAvailable.455* @return The number of locales for which text breaking information is available.456* @see ubrk_getAvailable457* @stable ICU 2.0458*/459U_STABLE int32_t U_EXPORT2460ubrk_countAvailable(void);461462463/**464* Returns true if the specfied position is a boundary position. As a side465* effect, leaves the iterator pointing to the first boundary position at466* or after "offset".467* @param bi The break iterator to use.468* @param offset the offset to check.469* @return True if "offset" is a boundary position.470* @stable ICU 2.0471*/472U_STABLE UBool U_EXPORT2473ubrk_isBoundary(UBreakIterator *bi, int32_t offset);474475/**476* Return the status from the break rule that determined the most recently477* returned break position. The values appear in the rule source478* within brackets, {123}, for example. For rules that do not specify a479* status, a default value of 0 is returned.480* <p>481* For word break iterators, the possible values are defined in enum UWordBreak.482* @stable ICU 2.2483*/484U_STABLE int32_t U_EXPORT2485ubrk_getRuleStatus(UBreakIterator *bi);486487/**488* Get the statuses from the break rules that determined the most recently489* returned break position. The values appear in the rule source490* within brackets, {123}, for example. The default status value for rules491* that do not explicitly provide one is zero.492* <p>493* For word break iterators, the possible values are defined in enum UWordBreak.494* @param bi The break iterator to use495* @param fillInVec an array to be filled in with the status values.496* @param capacity the length of the supplied vector. A length of zero causes497* the function to return the number of status values, in the498* normal way, without attemtping to store any values.499* @param status receives error codes.500* @return The number of rule status values from rules that determined501* the most recent boundary returned by the break iterator.502* @stable ICU 3.0503*/504U_STABLE int32_t U_EXPORT2505ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);506507/**508* Return the locale of the break iterator. You can choose between the valid and509* the actual locale.510* @param bi break iterator511* @param type locale type (valid or actual)512* @param status error code513* @return locale string514* @stable ICU 2.8515*/516U_STABLE const char* U_EXPORT2517ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);518519/**520* Set the subject text string upon which the break iterator is operating521* without changing any other aspect of the state.522* The new and previous text strings must have the same content.523*524* This function is intended for use in environments where ICU is operating on525* strings that may move around in memory. It provides a mechanism for notifying526* ICU that the string has been relocated, and providing a new UText to access the527* string in its new position.528*529* Note that the break iterator never copies the underlying text530* of a string being processed, but always operates directly on the original text531* provided by the user. Refreshing simply drops the references to the old text532* and replaces them with references to the new.533*534* Caution: this function is normally used only by very specialized535* system-level code. One example use case is with garbage collection536* that moves the text in memory.537*538* @param bi The break iterator.539* @param text The new (moved) text string.540* @param status Receives errors detected by this function.541*542* @stable ICU 49543*/544U_STABLE void U_EXPORT2545ubrk_refreshUText(UBreakIterator *bi,546UText *text,547UErrorCode *status);548549#endif /* #if !UCONFIG_NO_BREAK_ITERATION */550551#endif552553554