// Path: blob/jdk8u272-b10-aarch32-20201026/jdk/src/share/native/common/unicode/rbbi.h
// 48732 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3***************************************************************************4* Copyright (C) 1999-2016 International Business Machines Corporation *5* and others. All rights reserved. *6***************************************************************************78**********************************************************************9* Date Name Description10* 10/22/99 alan Creation.11* 11/11/99 rgillam Complete port from Java.12**********************************************************************13*/1415#ifndef RBBI_H16#define RBBI_H1718#include "unicode/utypes.h"1920/**21* \file22* \brief C++ API: Rule Based Break Iterator23*/2425#if !UCONFIG_NO_BREAK_ITERATION2627#include "unicode/brkiter.h"28#include "unicode/udata.h"29#include "unicode/parseerr.h"30#include "unicode/schriter.h"3132U_NAMESPACE_BEGIN3334/** @internal */35class LanguageBreakEngine;36struct RBBIDataHeader;37class RBBIDataWrapper;38class UnhandledEngine;39class UStack;4041/**42*43* A subclass of BreakIterator whose behavior is specified using a list of rules.44* <p>Instances of this class are most commonly created by the factory methods of45* BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,46* and then used via the abstract API in class BreakIterator</p>47*48* <p>See the ICU User Guide for information on Break Iterator Rules.</p>49*50* <p>This class is not intended to be subclassed.</p>51*/52class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator {5354private:55/**56* The UText through which this BreakIterator accesses the text57* @internal (private)58*/59UText fText;6061#ifndef U_HIDE_INTERNAL_API62public:63#endif /* U_HIDE_INTERNAL_API */64/**65* The rule data for this BreakIterator instance.66* Not for general use; Public only for testing purposes.67* @internal68*/69RBBIDataWrapper *fData;70private:7172/**73* The current position of 
the iterator. Pinned, 0 < fPosition <= text.length.74* Never has the value UBRK_DONE (-1).75*/76int32_t fPosition;7778/**79* TODO:80*/81int32_t fRuleStatusIndex;8283/**84* Cache of previously determined boundary positions.85*/86class BreakCache;87BreakCache *fBreakCache;8889/**90* Cache of boundary positions within a region of text that has been91* sub-divided by dictionary based breaking.92*/93class DictionaryCache;94DictionaryCache *fDictionaryCache;9596/**97*98* If present, UStack of LanguageBreakEngine objects that might handle99* dictionary characters. Searched from top to bottom to find an object to100* handle a given character.101* @internal (private)102*/103UStack *fLanguageBreakEngines;104105/**106*107* If present, the special LanguageBreakEngine used for handling108* characters that are in the dictionary set, but not handled by any109* LanguageBreakEngine.110* @internal (private)111*/112UnhandledEngine *fUnhandledBreakEngine;113114/**115* Counter for the number of characters encountered with the "dictionary"116* flag set.117* @internal (private)118*/119uint32_t fDictionaryCharCount;120121/**122* A character iterator that refers to the same text as the UText, above.123* Only included for compatibility with old API, which was based on CharacterIterators.124* Value may be adopted from outside, or one of fSCharIter or fDCharIter, below.125*/126CharacterIterator *fCharIter;127128/**129* When the input text is provided by a UnicodeString, this will point to130* a characterIterator that wraps that data. 
Needed only for the131* implementation of getText(), a backwards compatibility issue.132*/133StringCharacterIterator fSCharIter;134135/**136* True when iteration has run off the end, and iterator functions should return UBRK_DONE.137*/138UBool fDone;139140//=======================================================================141// constructors142//=======================================================================143144/**145* Constructor from a flattened set of RBBI data in malloced memory.146* RulesBasedBreakIterators built from a custom set of rules147* are created via this constructor; the rules are compiled148* into memory, then the break iterator is constructed here.149*150* The break iterator adopts the memory, and will151* free it when done.152* @internal (private)153*/154RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);155156/** @internal */157friend class RBBIRuleBuilder;158/** @internal */159friend class BreakIterator;160161public:162163/** Default constructor. Creates an empty shell of an iterator, with no164* rules or text to iterate over. Object can subsequently be assigned to.165* @stable ICU 2.2166*/167RuleBasedBreakIterator();168169/**170* Copy constructor. 
Will produce a break iterator with the same behavior,171* and which iterates over the same text, as the one passed in.172* @param that The RuleBasedBreakIterator passed to be copied173* @stable ICU 2.0174*/175RuleBasedBreakIterator(const RuleBasedBreakIterator& that);176177/**178* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.179* @param rules The break rules to be used.180* @param parseError In the event of a syntax error in the rules, provides the location181* within the rules of the problem.182* @param status Information on any errors encountered.183* @stable ICU 2.2184*/185RuleBasedBreakIterator( const UnicodeString &rules,186UParseError &parseError,187UErrorCode &status);188189/**190* Construct a RuleBasedBreakIterator from a set of precompiled binary rules.191* Binary rules are obtained from RulesBasedBreakIterator::getBinaryRules().192* Construction of a break iterator in this way is substantially faster than193* construction from source rules.194*195* Ownership of the storage containing the compiled rules remains with the196* caller of this function. The compiled rules must not be modified or197* deleted during the life of the break iterator.198*199* The compiled rules are not compatible across different major versions of ICU.200* The compiled rules are compatible only between machines with the same201* byte ordering (little or big endian) and the same base character set family202* (ASCII or EBCDIC).203*204* @see #getBinaryRules205* @param compiledRules A pointer to the compiled break rules to be used.206* @param ruleLength The length of the compiled break rules, in bytes. 
This207* corresponds to the length value produced by getBinaryRules().208* @param status Information on any errors encountered, including invalid209* binary rules.210* @stable ICU 4.8211*/212RuleBasedBreakIterator(const uint8_t *compiledRules,213uint32_t ruleLength,214UErrorCode &status);215216/**217* This constructor uses the udata interface to create a BreakIterator218* whose internal tables live in a memory-mapped file. "image" is an219* ICU UDataMemory handle for the pre-compiled break iterator tables.220* @param image handle to the memory image for the break iterator data.221* Ownership of the UDataMemory handle passes to the Break Iterator,222* which will be responsible for closing it when it is no longer needed.223* @param status Information on any errors encountered.224* @see udata_open225* @see #getBinaryRules226* @stable ICU 2.8227*/228RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);229230/**231* Destructor232* @stable ICU 2.0233*/234virtual ~RuleBasedBreakIterator();235236/**237* Assignment operator. Sets this iterator to have the same behavior,238* and iterate over the same text, as the one passed in.239* @param that The RuleBasedBreakItertor passed in240* @return the newly created RuleBasedBreakIterator241* @stable ICU 2.0242*/243RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);244245/**246* Equality operator. Returns TRUE if both BreakIterators are of the247* same class, have the same behavior, and iterate over the same text.248* @param that The BreakIterator to be compared for equality249* @return TRUE if both BreakIterators are of the250* same class, have the same behavior, and iterate over the same text.251* @stable ICU 2.0252*/253virtual UBool operator==(const BreakIterator& that) const;254255/**256* Not-equal operator. 
If operator== returns TRUE, this returns FALSE,257* and vice versa.258* @param that The BreakIterator to be compared for inequality259* @return TRUE if both BreakIterators are not same.260* @stable ICU 2.0261*/262inline UBool operator!=(const BreakIterator& that) const;263264/**265* Returns a newly-constructed RuleBasedBreakIterator with the same266* behavior, and iterating over the same text, as this one.267* Differs from the copy constructor in that it is polymorphic, and268* will correctly clone (copy) a derived class.269* clone() is thread safe. Multiple threads may simultaneously270* clone the same source break iterator.271* @return a newly-constructed RuleBasedBreakIterator272* @stable ICU 2.0273*/274virtual BreakIterator* clone() const;275276/**277* Compute a hash code for this BreakIterator278* @return A hash code279* @stable ICU 2.0280*/281virtual int32_t hashCode(void) const;282283/**284* Returns the description used to create this iterator285* @return the description used to create this iterator286* @stable ICU 2.0287*/288virtual const UnicodeString& getRules(void) const;289290//=======================================================================291// BreakIterator overrides292//=======================================================================293294/**295* <p>296* Return a CharacterIterator over the text being analyzed.297* The returned character iterator is owned by the break iterator, and must298* not be deleted by the caller. Repeated calls to this function may299* return the same CharacterIterator.300* </p>301* <p>302* The returned character iterator must not be used concurrently with303* the break iterator. If concurrent operation is needed, clone the304* returned character iterator first and operate on the clone.305* </p>306* <p>307* When the break iterator is operating on text supplied via a UText,308* this function will fail. 
Lacking any way to signal failures, it309* returns an CharacterIterator containing no text.310* The function getUText() provides similar functionality,311* is reliable, and is more efficient.312* </p>313*314* TODO: deprecate this function?315*316* @return An iterator over the text being analyzed.317* @stable ICU 2.0318*/319virtual CharacterIterator& getText(void) const;320321322/**323* Get a UText for the text being analyzed.324* The returned UText is a shallow clone of the UText used internally325* by the break iterator implementation. It can safely be used to326* access the text without impacting any break iterator operations,327* but the underlying text itself must not be altered.328*329* @param fillIn A UText to be filled in. If NULL, a new UText will be330* allocated to hold the result.331* @param status receives any error codes.332* @return The current UText for this break iterator. If an input333* UText was provided, it will always be returned.334* @stable ICU 3.4335*/336virtual UText *getUText(UText *fillIn, UErrorCode &status) const;337338/**339* Set the iterator to analyze a new piece of text. This function resets340* the current iteration position to the beginning of the text.341* @param newText An iterator over the text to analyze. The BreakIterator342* takes ownership of the character iterator. The caller MUST NOT delete it!343* @stable ICU 2.0344*/345virtual void adoptText(CharacterIterator* newText);346347/**348* Set the iterator to analyze a new piece of text. This function resets349* the current iteration position to the beginning of the text.350*351* The BreakIterator will retain a reference to the supplied string.352* The caller must not modify or delete the text while the BreakIterator353* retains the reference.354*355* @param newText The text to analyze.356* @stable ICU 2.0357*/358virtual void setText(const UnicodeString& newText);359360/**361* Reset the break iterator to operate over the text represented by362* the UText. 
The iterator position is reset to the start.363*364* This function makes a shallow clone of the supplied UText. This means365* that the caller is free to immediately close or otherwise reuse the366* Utext that was passed as a parameter, but that the underlying text itself367* must not be altered while being referenced by the break iterator.368*369* @param text The UText used to change the text.370* @param status Receives any error codes.371* @stable ICU 3.4372*/373virtual void setText(UText *text, UErrorCode &status);374375/**376* Sets the current iteration position to the beginning of the text, position zero.377* @return The offset of the beginning of the text, zero.378* @stable ICU 2.0379*/380virtual int32_t first(void);381382/**383* Sets the current iteration position to the end of the text.384* @return The text's past-the-end offset.385* @stable ICU 2.0386*/387virtual int32_t last(void);388389/**390* Advances the iterator either forward or backward the specified number of steps.391* Negative values move backward, and positive values move forward. This is392* equivalent to repeatedly calling next() or previous().393* @param n The number of steps to move. 
The sign indicates the direction394* (negative is backwards, and positive is forwards).395* @return The character offset of the boundary position n boundaries away from396* the current one.397* @stable ICU 2.0398*/399virtual int32_t next(int32_t n);400401/**402* Advances the iterator to the next boundary position.403* @return The position of the first boundary after this one.404* @stable ICU 2.0405*/406virtual int32_t next(void);407408/**409* Moves the iterator backwards, to the last boundary preceding this one.410* @return The position of the last boundary position preceding this one.411* @stable ICU 2.0412*/413virtual int32_t previous(void);414415/**416* Sets the iterator to refer to the first boundary position following417* the specified position.418* @param offset The position from which to begin searching for a break position.419* @return The position of the first break after the current position.420* @stable ICU 2.0421*/422virtual int32_t following(int32_t offset);423424/**425* Sets the iterator to refer to the last boundary position before the426* specified position.427* @param offset The position to begin searching for a break from.428* @return The position of the last boundary before the starting position.429* @stable ICU 2.0430*/431virtual int32_t preceding(int32_t offset);432433/**434* Returns true if the specified position is a boundary position. As a side435* effect, leaves the iterator pointing to the first boundary position at436* or after "offset".437* @param offset the offset to check.438* @return True if "offset" is a boundary position.439* @stable ICU 2.0440*/441virtual UBool isBoundary(int32_t offset);442443/**444* Returns the current iteration position. 
Note that UBRK_DONE is never445* returned from this function; if iteration has run to the end of a446* string, current() will return the length of the string while447* next() will return UBRK_DONE).448* @return The current iteration position.449* @stable ICU 2.0450*/451virtual int32_t current(void) const;452453454/**455* Return the status tag from the break rule that determined the boundary at456* the current iteration position. For break rules that do not specify a457* status, a default value of 0 is returned. If more than one break rule458* would cause a boundary to be located at some position in the text,459* the numerically largest of the applicable status values is returned.460* <p>461* Of the standard types of ICU break iterators, only word break and462* line break provide status values. The values are defined in463* the header file ubrk.h. For Word breaks, the status allows distinguishing between words464* that contain alphabetic letters, "words" that appear to be numbers,465* punctuation and spaces, words containing ideographic characters, and466* more. 
For Line Break, the status distinguishes between hard (mandatory) breaks467* and soft (potential) break positions.468* <p>469* <code>getRuleStatus()</code> can be called after obtaining a boundary470* position from <code>next()</code>, <code>previous()</code>, or471* any other break iterator functions that returns a boundary position.472* <p>473* Note that <code>getRuleStatus()</code> returns the value corresponding to474* <code>current()</code> index even after <code>next()</code> has returned DONE.475* <p>476* When creating custom break rules, one is free to define whatever477* status values may be convenient for the application.478* <p>479* @return the status from the break rule that determined the boundary480* at the current iteration position.481*482* @see UWordBreak483* @stable ICU 2.2484*/485virtual int32_t getRuleStatus() const;486487/**488* Get the status (tag) values from the break rule(s) that determined the boundary489* at the current iteration position.490* <p>491* The returned status value(s) are stored into an array provided by the caller.492* The values are stored in sorted (ascending) order.493* If the capacity of the output array is insufficient to hold the data,494* the output will be truncated to the available length, and a495* U_BUFFER_OVERFLOW_ERROR will be signaled.496*497* @param fillInVec an array to be filled in with the status values.498* @param capacity the length of the supplied vector. 
A length of zero causes499* the function to return the number of status values, in the500* normal way, without attempting to store any values.501* @param status receives error codes.502* @return The number of rule status values from the rules that determined503* the boundary at the current iteration position.504* In the event of a U_BUFFER_OVERFLOW_ERROR, the return value505* is the total number of status values that were available,506* not the reduced number that were actually returned.507* @see getRuleStatus508* @stable ICU 3.0509*/510virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);511512/**513* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.514* This method is to implement a simple version of RTTI, since not all515* C++ compilers support genuine RTTI. Polymorphic operator==() and516* clone() methods call this method.517*518* @return The class ID for this object. All objects of a519* given class have the same class ID. Objects of520* other classes have different class IDs.521* @stable ICU 2.0522*/523virtual UClassID getDynamicClassID(void) const;524525/**526* Returns the class ID for this class. This is useful only for527* comparing to a return value from getDynamicClassID(). For example:528*529* Base* polymorphic_pointer = createPolymorphicObject();530* if (polymorphic_pointer->getDynamicClassID() ==531* Derived::getStaticClassID()) ...532*533* @return The class ID for all objects of this class.534* @stable ICU 2.0535*/536static UClassID U_EXPORT2 getStaticClassID(void);537538/**539* Deprecated functionality. Use clone() instead.540*541* Create a clone (copy) of this break iterator in memory provided542* by the caller. The idea is to increase performance by avoiding543* a storage allocation. Use of this function is NOT RECOMMENDED.544* Performance gains are minimal, and correct buffer management is545* tricky. 
Use clone() instead.546*547* @param stackBuffer The pointer to the memory into which the cloned object548* should be placed. If NULL, allocate heap memory549* for the cloned object.550* @param BufferSize The size of the buffer. If zero, return the required551* buffer size, but do not clone the object. If the552* size was too small (but not zero), allocate heap553* storage for the cloned object.554*555* @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be556* returned if the provided buffer was too small, and557* the clone was therefore put on the heap.558*559* @return Pointer to the clone object. This may differ from the stackBuffer560* address if the byte alignment of the stack buffer was not suitable561* or if the stackBuffer was too small to hold the clone.562* @deprecated ICU 52. Use clone() instead.563*/564virtual BreakIterator * createBufferClone(void *stackBuffer,565int32_t &BufferSize,566UErrorCode &status);567568569/**570* Return the binary form of compiled break rules,571* which can then be used to create a new break iterator at some572* time in the future. Creating a break iterator from pre-compiled rules573* is much faster than building one from the source form of the574* break rules.575*576* The binary data can only be used with the same version of ICU577* and on the same platform type (processor endian-ness)578*579* @param length Returns the length of the binary data. (Out parameter.)580*581* @return A pointer to the binary (compiled) rule data. 
The storage582* belongs to the RulesBasedBreakIterator object, not the583* caller, and must not be modified or deleted.584* @stable ICU 4.8585*/586virtual const uint8_t *getBinaryRules(uint32_t &length);587588/**589* Set the subject text string upon which the break iterator is operating590* without changing any other aspect of the matching state.591* The new and previous text strings must have the same content.592*593* This function is intended for use in environments where ICU is operating on594* strings that may move around in memory. It provides a mechanism for notifying595* ICU that the string has been relocated, and providing a new UText to access the596* string in its new position.597*598* Note that the break iterator implementation never copies the underlying text599* of a string being processed, but always operates directly on the original text600* provided by the user. Refreshing simply drops the references to the old text601* and replaces them with references to the new.602*603* Caution: this function is normally used only by very specialized,604* system-level code. 
One example use case is with garbage collection that moves605* the text in memory.606*607* @param input The new (moved) text string.608* @param status Receives errors detected by this function.609* @return *this610*611* @stable ICU 49612*/613virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);614615616private:617//=======================================================================618// implementation619//=======================================================================620/**621* Dumps caches and performs other actions associated with a complete change622* in text or iteration position.623* @internal (private)624*/625void reset(void);626627/**628* Common initialization function, used by constructors and bufferClone.629* @internal (private)630*/631void init(UErrorCode &status);632633/**634* Iterate backwards from an arbitrary position in the input text using the635* synthesized Safe Reverse rules.636* This locates a "Safe Position" from which the forward break rules637* will operate correctly. 
A Safe Position is not necessarily a boundary itself.638*639* @param fromPosition the position in the input text to begin the iteration.640* @internal (private)641*/642int32_t handleSafePrevious(int32_t fromPosition);643644/**645* Find a rule-based boundary by running the state machine.646* Input647* fPosition, the position in the text to begin from.648* Output649* fPosition: the boundary following the starting position.650* fDictionaryCharCount the number of dictionary characters encountered.651* If > 0, the segment will be further subdivided652* fRuleStatusIndex Info from the state table indicating which rules caused the boundary.653*654* @internal (private)655*/656int32_t handleNext();657658659/**660* This function returns the appropriate LanguageBreakEngine for a661* given character c.662* @param c A character in the dictionary set663* @internal (private)664*/665const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);666667public:668#ifndef U_HIDE_INTERNAL_API669/**670* Debugging function only.671* @internal672*/673void dumpCache();674675/**676* Debugging function only.677* @internal678*/679void dumpTables();680681#endif /* U_HIDE_INTERNAL_API */682};683684//------------------------------------------------------------------------------685//686// Inline Functions Definitions ...687//688//------------------------------------------------------------------------------689690inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {691return !operator==(that);692}693694U_NAMESPACE_END695696#endif /* #if !UCONFIG_NO_BREAK_ITERATION */697698#endif699700701