Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/native/common/unicode/rbbi.h
38827 views
/*1***************************************************************************2* Copyright (C) 1999-2014 International Business Machines Corporation *3* and others. All rights reserved. *4***************************************************************************56**********************************************************************7* Date Name Description8* 10/22/99 alan Creation.9* 11/11/99 rgillam Complete port from Java.10**********************************************************************11*/1213#ifndef RBBI_H14#define RBBI_H1516#include "unicode/utypes.h"1718/**19* \file20* \brief C++ API: Rule Based Break Iterator21*/2223#if !UCONFIG_NO_BREAK_ITERATION2425#include "unicode/brkiter.h"26#include "unicode/udata.h"27#include "unicode/parseerr.h"28#include "unicode/schriter.h"29#include "unicode/uchriter.h"303132struct UTrie;3334U_NAMESPACE_BEGIN3536/** @internal */37struct RBBIDataHeader;38class RuleBasedBreakIteratorTables;39class BreakIterator;40class RBBIDataWrapper;41class UStack;42class LanguageBreakEngine;43class UnhandledEngine;44struct RBBIStateTable;4546474849/**50*51* A subclass of BreakIterator whose behavior is specified using a list of rules.52* <p>Instances of this class are most commonly created by the factory methods of53* BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,54* and then used via the abstract API in class BreakIterator</p>55*56* <p>See the ICU User Guide for information on Break Iterator Rules.</p>57*58* <p>This class is not intended to be subclassed. (Class DictionaryBasedBreakIterator59* is a subclass, but that relationship is effectively internal to the ICU60* implementation. The subclassing interface to RulesBasedBreakIterator is61* not part of the ICU API, and may not remain stable.</p>62*63*/64class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator {6566protected:67/**68* The UText through which this BreakIterator accesses the text69* @internal70*/71UText *fText;7273/**74* A character iterator that refers to the same text as the UText, above.75* Only included for compatibility with old API, which was based on CharacterIterators.76* Value may be adopted from outside, or one of fSCharIter or fDCharIter, below.77*/78CharacterIterator *fCharIter;7980/**81* When the input text is provided by a UnicodeString, this will point to82* a characterIterator that wraps that data. Needed only for the83* implementation of getText(), a backwards compatibility issue.84*/85StringCharacterIterator *fSCharIter;8687/**88* When the input text is provided by a UText, this89* dummy CharacterIterator over an empty string will90* be returned from getText()91*/92UCharCharacterIterator *fDCharIter;9394/**95* The rule data for this BreakIterator instance96* @internal97*/98RBBIDataWrapper *fData;99100/** Index of the Rule {tag} values for the most recent match.101* @internal102*/103int32_t fLastRuleStatusIndex;104105/**106* Rule tag value valid flag.107* Some iterator operations don't intrinsically set the correct tag value.108* This flag lets us lazily compute the value if we are ever asked for it.109* @internal110*/111UBool fLastStatusIndexValid;112113/**114* Counter for the number of characters encountered with the "dictionary"115* flag set.116* @internal117*/118uint32_t fDictionaryCharCount;119120/**121* When a range of characters is divided up using the dictionary, the break122* positions that are discovered are stored here, preventing us from having123* to use either the dictionary or the state table again until the iterator124* leaves this range of text. Has the most impact for line breaking.125* @internal126*/127int32_t* fCachedBreakPositions;128129/**130* The number of elements in fCachedBreakPositions131* @internal132*/133int32_t fNumCachedBreakPositions;134135/**136* if fCachedBreakPositions is not null, this indicates which item in the137* cache the current iteration position refers to138* @internal139*/140int32_t fPositionInCache;141142/**143*144* If present, UStack of LanguageBreakEngine objects that might handle145* dictionary characters. Searched from top to bottom to find an object to146* handle a given character.147* @internal148*/149UStack *fLanguageBreakEngines;150151/**152*153* If present, the special LanguageBreakEngine used for handling154* characters that are in the dictionary set, but not handled by any155* LangugageBreakEngine.156* @internal157*/158UnhandledEngine *fUnhandledBreakEngine;159160/**161*162* The type of the break iterator, or -1 if it has not been set.163* @internal164*/165int32_t fBreakType;166167protected:168//=======================================================================169// constructors170//=======================================================================171172#ifndef U_HIDE_INTERNAL_API173/**174* Constant to be used in the constructor175* RuleBasedBreakIterator(RBBIDataHeader*, EDontAdopt, UErrorCode &);176* which does not adopt the memory indicated by the RBBIDataHeader*177* parameter.178*179* @internal180*/181enum EDontAdopt {182kDontAdopt183};184185/**186* Constructor from a flattened set of RBBI data in malloced memory.187* RulesBasedBreakIterators built from a custom set of rules188* are created via this constructor; the rules are compiled189* into memory, then the break iterator is constructed here.190*191* The break iterator adopts the memory, and will192* free it when done.193* @internal194*/195RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);196197/**198* Constructor from a flattened set of RBBI data in memory which need not199* be malloced (e.g. it may be a memory-mapped file, etc.).200*201* This version does not adopt the memory, and does not202* free it when done.203* @internal204*/205RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt dontAdopt, UErrorCode &status);206#endif /* U_HIDE_INTERNAL_API */207208209friend class RBBIRuleBuilder;210/** @internal */211friend class BreakIterator;212213214215public:216217/** Default constructor. Creates an empty shell of an iterator, with no218* rules or text to iterate over. Object can subsequently be assigned to.219* @stable ICU 2.2220*/221RuleBasedBreakIterator();222223/**224* Copy constructor. Will produce a break iterator with the same behavior,225* and which iterates over the same text, as the one passed in.226* @param that The RuleBasedBreakIterator passed to be copied227* @stable ICU 2.0228*/229RuleBasedBreakIterator(const RuleBasedBreakIterator& that);230231/**232* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.233* @param rules The break rules to be used.234* @param parseError In the event of a syntax error in the rules, provides the location235* within the rules of the problem.236* @param status Information on any errors encountered.237* @stable ICU 2.2238*/239RuleBasedBreakIterator( const UnicodeString &rules,240UParseError &parseError,241UErrorCode &status);242243/**244* Contruct a RuleBasedBreakIterator from a set of precompiled binary rules.245* Binary rules are obtained from RulesBasedBreakIterator::getBinaryRules().246* Construction of a break iterator in this way is substantially faster than247* constuction from source rules.248*249* Ownership of the storage containing the compiled rules remains with the250* caller of this function. The compiled rules must not be modified or251* deleted during the life of the break iterator.252*253* The compiled rules are not compatible across different major versions of ICU.254* The compiled rules are comaptible only between machines with the same255* byte ordering (little or big endian) and the same base character set family256* (ASCII or EBCDIC).257*258* @see #getBinaryRules259* @param compiledRules A pointer to the compiled break rules to be used.260* @param ruleLength The length of the compiled break rules, in bytes. This261* corresponds to the length value produced by getBinaryRules().262* @param status Information on any errors encountered, including invalid263* binary rules.264* @stable ICU 4.8265*/266RuleBasedBreakIterator(const uint8_t *compiledRules,267uint32_t ruleLength,268UErrorCode &status);269270/**271* This constructor uses the udata interface to create a BreakIterator272* whose internal tables live in a memory-mapped file. "image" is an273* ICU UDataMemory handle for the pre-compiled break iterator tables.274* @param image handle to the memory image for the break iterator data.275* Ownership of the UDataMemory handle passes to the Break Iterator,276* which will be responsible for closing it when it is no longer needed.277* @param status Information on any errors encountered.278* @see udata_open279* @see #getBinaryRules280* @stable ICU 2.8281*/282RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);283284/**285* Destructor286* @stable ICU 2.0287*/288virtual ~RuleBasedBreakIterator();289290/**291* Assignment operator. Sets this iterator to have the same behavior,292* and iterate over the same text, as the one passed in.293* @param that The RuleBasedBreakItertor passed in294* @return the newly created RuleBasedBreakIterator295* @stable ICU 2.0296*/297RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);298299/**300* Equality operator. Returns TRUE if both BreakIterators are of the301* same class, have the same behavior, and iterate over the same text.302* @param that The BreakIterator to be compared for equality303* @return TRUE if both BreakIterators are of the304* same class, have the same behavior, and iterate over the same text.305* @stable ICU 2.0306*/307virtual UBool operator==(const BreakIterator& that) const;308309/**310* Not-equal operator. If operator== returns TRUE, this returns FALSE,311* and vice versa.312* @param that The BreakIterator to be compared for inequality313* @return TRUE if both BreakIterators are not same.314* @stable ICU 2.0315*/316UBool operator!=(const BreakIterator& that) const;317318/**319* Returns a newly-constructed RuleBasedBreakIterator with the same320* behavior, and iterating over the same text, as this one.321* Differs from the copy constructor in that it is polymorphic, and322* will correctly clone (copy) a derived class.323* clone() is thread safe. Multiple threads may simultaeneously324* clone the same source break iterator.325* @return a newly-constructed RuleBasedBreakIterator326* @stable ICU 2.0327*/328virtual BreakIterator* clone() const;329330/**331* Compute a hash code for this BreakIterator332* @return A hash code333* @stable ICU 2.0334*/335virtual int32_t hashCode(void) const;336337/**338* Returns the description used to create this iterator339* @return the description used to create this iterator340* @stable ICU 2.0341*/342virtual const UnicodeString& getRules(void) const;343344//=======================================================================345// BreakIterator overrides346//=======================================================================347348/**349* <p>350* Return a CharacterIterator over the text being analyzed.351* The returned character iterator is owned by the break iterator, and must352* not be deleted by the caller. Repeated calls to this function may353* return the same CharacterIterator.354* </p>355* <p>356* The returned character iterator must not be used concurrently with357* the break iterator. If concurrent operation is needed, clone the358* returned character iterator first and operate on the clone.359* </p>360* <p>361* When the break iterator is operating on text supplied via a UText,362* this function will fail. Lacking any way to signal failures, it363* returns an CharacterIterator containing no text.364* The function getUText() provides similar functionality,365* is reliable, and is more efficient.366* </p>367*368* TODO: deprecate this function?369*370* @return An iterator over the text being analyzed.371* @stable ICU 2.0372*/373virtual CharacterIterator& getText(void) const;374375376/**377* Get a UText for the text being analyzed.378* The returned UText is a shallow clone of the UText used internally379* by the break iterator implementation. It can safely be used to380* access the text without impacting any break iterator operations,381* but the underlying text itself must not be altered.382*383* @param fillIn A UText to be filled in. If NULL, a new UText will be384* allocated to hold the result.385* @param status receives any error codes.386* @return The current UText for this break iterator. If an input387* UText was provided, it will always be returned.388* @stable ICU 3.4389*/390virtual UText *getUText(UText *fillIn, UErrorCode &status) const;391392/**393* Set the iterator to analyze a new piece of text. This function resets394* the current iteration position to the beginning of the text.395* @param newText An iterator over the text to analyze. The BreakIterator396* takes ownership of the character iterator. The caller MUST NOT delete it!397* @stable ICU 2.0398*/399virtual void adoptText(CharacterIterator* newText);400401/**402* Set the iterator to analyze a new piece of text. This function resets403* the current iteration position to the beginning of the text.404* @param newText The text to analyze.405* @stable ICU 2.0406*/407virtual void setText(const UnicodeString& newText);408409/**410* Reset the break iterator to operate over the text represented by411* the UText. The iterator position is reset to the start.412*413* This function makes a shallow clone of the supplied UText. This means414* that the caller is free to immediately close or otherwise reuse the415* Utext that was passed as a parameter, but that the underlying text itself416* must not be altered while being referenced by the break iterator.417*418* @param text The UText used to change the text.419* @param status Receives any error codes.420* @stable ICU 3.4421*/422virtual void setText(UText *text, UErrorCode &status);423424/**425* Sets the current iteration position to the beginning of the text, position zero.426* @return The offset of the beginning of the text, zero.427* @stable ICU 2.0428*/429virtual int32_t first(void);430431/**432* Sets the current iteration position to the end of the text.433* @return The text's past-the-end offset.434* @stable ICU 2.0435*/436virtual int32_t last(void);437438/**439* Advances the iterator either forward or backward the specified number of steps.440* Negative values move backward, and positive values move forward. This is441* equivalent to repeatedly calling next() or previous().442* @param n The number of steps to move. The sign indicates the direction443* (negative is backwards, and positive is forwards).444* @return The character offset of the boundary position n boundaries away from445* the current one.446* @stable ICU 2.0447*/448virtual int32_t next(int32_t n);449450/**451* Advances the iterator to the next boundary position.452* @return The position of the first boundary after this one.453* @stable ICU 2.0454*/455virtual int32_t next(void);456457/**458* Moves the iterator backwards, to the last boundary preceding this one.459* @return The position of the last boundary position preceding this one.460* @stable ICU 2.0461*/462virtual int32_t previous(void);463464/**465* Sets the iterator to refer to the first boundary position following466* the specified position.467* @param offset The position from which to begin searching for a break position.468* @return The position of the first break after the current position.469* @stable ICU 2.0470*/471virtual int32_t following(int32_t offset);472473/**474* Sets the iterator to refer to the last boundary position before the475* specified position.476* @param offset The position to begin searching for a break from.477* @return The position of the last boundary before the starting position.478* @stable ICU 2.0479*/480virtual int32_t preceding(int32_t offset);481482/**483* Returns true if the specfied position is a boundary position. As a side484* effect, leaves the iterator pointing to the first boundary position at485* or after "offset".486* @param offset the offset to check.487* @return True if "offset" is a boundary position.488* @stable ICU 2.0489*/490virtual UBool isBoundary(int32_t offset);491492/**493* Returns the current iteration position.494* @return The current iteration position.495* @stable ICU 2.0496*/497virtual int32_t current(void) const;498499500/**501* Return the status tag from the break rule that determined the most recently502* returned break position. For break rules that do not specify a503* status, a default value of 0 is returned. If more than one break rule504* would cause a boundary to be located at some position in the text,505* the numerically largest of the applicable status values is returned.506* <p>507* Of the standard types of ICU break iterators, only word break and508* line break provide status values. The values are defined in509* the header file ubrk.h. For Word breaks, the status allows distinguishing between words510* that contain alphabetic letters, "words" that appear to be numbers,511* punctuation and spaces, words containing ideographic characters, and512* more. For Line Break, the status distinguishes between hard (mandatory) breaks513* and soft (potential) break positions.514* <p>515* <code>getRuleStatus()</code> can be called after obtaining a boundary516* position from <code>next()</code>, <code>previous()</code>, or517* any other break iterator functions that returns a boundary position.518* <p>519* When creating custom break rules, one is free to define whatever520* status values may be convenient for the application.521* <p>522* Note: this function is not thread safe. It should not have been523* declared const, and the const remains only for compatibility524* reasons. (The function is logically const, but not bit-wise const).525* <p>526* @return the status from the break rule that determined the most recently527* returned break position.528*529* @see UWordBreak530* @stable ICU 2.2531*/532virtual int32_t getRuleStatus() const;533534/**535* Get the status (tag) values from the break rule(s) that determined the most536* recently returned break position.537* <p>538* The returned status value(s) are stored into an array provided by the caller.539* The values are stored in sorted (ascending) order.540* If the capacity of the output array is insufficient to hold the data,541* the output will be truncated to the available length, and a542* U_BUFFER_OVERFLOW_ERROR will be signaled.543*544* @param fillInVec an array to be filled in with the status values.545* @param capacity the length of the supplied vector. A length of zero causes546* the function to return the number of status values, in the547* normal way, without attemtping to store any values.548* @param status receives error codes.549* @return The number of rule status values from rules that determined550* the most recent boundary returned by the break iterator.551* In the event of a U_BUFFER_OVERFLOW_ERROR, the return value552* is the total number of status values that were available,553* not the reduced number that were actually returned.554* @see getRuleStatus555* @stable ICU 3.0556*/557virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);558559/**560* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.561* This method is to implement a simple version of RTTI, since not all562* C++ compilers support genuine RTTI. Polymorphic operator==() and563* clone() methods call this method.564*565* @return The class ID for this object. All objects of a566* given class have the same class ID. Objects of567* other classes have different class IDs.568* @stable ICU 2.0569*/570virtual UClassID getDynamicClassID(void) const;571572/**573* Returns the class ID for this class. This is useful only for574* comparing to a return value from getDynamicClassID(). For example:575*576* Base* polymorphic_pointer = createPolymorphicObject();577* if (polymorphic_pointer->getDynamicClassID() ==578* Derived::getStaticClassID()) ...579*580* @return The class ID for all objects of this class.581* @stable ICU 2.0582*/583static UClassID U_EXPORT2 getStaticClassID(void);584585/**586* Deprecated functionality. Use clone() instead.587*588* Create a clone (copy) of this break iterator in memory provided589* by the caller. The idea is to increase performance by avoiding590* a storage allocation. Use of this functoin is NOT RECOMMENDED.591* Performance gains are minimal, and correct buffer management is592* tricky. Use clone() instead.593*594* @param stackBuffer The pointer to the memory into which the cloned object595* should be placed. If NULL, allocate heap memory596* for the cloned object.597* @param BufferSize The size of the buffer. If zero, return the required598* buffer size, but do not clone the object. If the599* size was too small (but not zero), allocate heap600* storage for the cloned object.601*602* @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be603* returned if the the provided buffer was too small, and604* the clone was therefore put on the heap.605*606* @return Pointer to the clone object. This may differ from the stackBuffer607* address if the byte alignment of the stack buffer was not suitable608* or if the stackBuffer was too small to hold the clone.609* @deprecated ICU 52. Use clone() instead.610*/611virtual BreakIterator * createBufferClone(void *stackBuffer,612int32_t &BufferSize,613UErrorCode &status);614615616/**617* Return the binary form of compiled break rules,618* which can then be used to create a new break iterator at some619* time in the future. Creating a break iterator from pre-compiled rules620* is much faster than building one from the source form of the621* break rules.622*623* The binary data can only be used with the same version of ICU624* and on the same platform type (processor endian-ness)625*626* @param length Returns the length of the binary data. (Out paramter.)627*628* @return A pointer to the binary (compiled) rule data. The storage629* belongs to the RulesBasedBreakIterator object, not the630* caller, and must not be modified or deleted.631* @stable ICU 4.8632*/633virtual const uint8_t *getBinaryRules(uint32_t &length);634635/**636* Set the subject text string upon which the break iterator is operating637* without changing any other aspect of the matching state.638* The new and previous text strings must have the same content.639*640* This function is intended for use in environments where ICU is operating on641* strings that may move around in memory. It provides a mechanism for notifying642* ICU that the string has been relocated, and providing a new UText to access the643* string in its new position.644*645* Note that the break iterator implementation never copies the underlying text646* of a string being processed, but always operates directly on the original text647* provided by the user. Refreshing simply drops the references to the old text648* and replaces them with references to the new.649*650* Caution: this function is normally used only by very specialized,651* system-level code. One example use case is with garbage collection that moves652* the text in memory.653*654* @param input The new (moved) text string.655* @param status Receives errors detected by this function.656* @return *this657*658* @stable ICU 49659*/660virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);661662663protected:664//=======================================================================665// implementation666//=======================================================================667/**668* Dumps caches and performs other actions associated with a complete change669* in text or iteration position.670* @internal671*/672virtual void reset(void);673674#if 0675/**676* Return true if the category lookup for this char677* indicates that it is in the set of dictionary lookup chars.678* This function is intended for use by dictionary based break iterators.679* @return true if the category lookup for this char680* indicates that it is in the set of dictionary lookup chars.681* @internal682*/683virtual UBool isDictionaryChar(UChar32);684685/**686* Get the type of the break iterator.687* @internal688*/689virtual int32_t getBreakType() const;690#endif691692/**693* Set the type of the break iterator.694* @internal695*/696virtual void setBreakType(int32_t type);697698#ifndef U_HIDE_INTERNAL_API699/**700* Common initialization function, used by constructors and bufferClone.701* @internal702*/703void init();704#endif /* U_HIDE_INTERNAL_API */705706private:707708/**709* This method backs the iterator back up to a "safe position" in the text.710* This is a position that we know, without any context, must be a break position.711* The various calling methods then iterate forward from this safe position to712* the appropriate position to return. (For more information, see the description713* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)714* @param statetable state table used of moving backwards715* @internal716*/717int32_t handlePrevious(const RBBIStateTable *statetable);718719/**720* This method is the actual implementation of the next() method. All iteration721* vectors through here. This method initializes the state machine to state 1722* and advances through the text character by character until we reach the end723* of the text or the state machine transitions to state 0. We update our return724* value every time the state machine passes through a possible end state.725* @param statetable state table used of moving forwards726* @internal727*/728int32_t handleNext(const RBBIStateTable *statetable);729730protected:731732#ifndef U_HIDE_INTERNAL_API733/**734* This is the function that actually implements dictionary-based735* breaking. Covering at least the range from startPos to endPos,736* it checks for dictionary characters, and if it finds them determines737* the appropriate object to deal with them. It may cache found breaks in738* fCachedBreakPositions as it goes. It may well also look at text outside739* the range startPos to endPos.740* If going forward, endPos is the normal Unicode break result, and741* if goind in reverse, startPos is the normal Unicode break result742* @param startPos The start position of a range of text743* @param endPos The end position of a range of text744* @param reverse The call is for the reverse direction745* @internal746*/747int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);748#endif /* U_HIDE_INTERNAL_API */749750private:751752/**753* This function returns the appropriate LanguageBreakEngine for a754* given character c.755* @param c A character in the dictionary set756* @internal757*/758const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);759760/**761* @internal762*/763void makeRuleStatusValid();764765};766767//------------------------------------------------------------------------------768//769// Inline Functions Definitions ...770//771//------------------------------------------------------------------------------772773inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {774return !operator==(that);775}776777U_NAMESPACE_END778779#endif /* #if !UCONFIG_NO_BREAK_ITERATION */780781#endif782783784