// Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/native/common/unicode/chariter.h
// 38827 views
/*1********************************************************************2*3* Copyright (C) 1997-2011, International Business Machines4* Corporation and others. All Rights Reserved.5*6********************************************************************7*/89#ifndef CHARITER_H10#define CHARITER_H1112#include "unicode/utypes.h"13#include "unicode/uobject.h"14#include "unicode/unistr.h"15/**16* \file17* \brief C++ API: Character Iterator18*/1920U_NAMESPACE_BEGIN21/**22* Abstract class that defines an API for forward-only iteration23* on text objects.24* This is a minimal interface for iteration without random access25* or backwards iteration. It is especially useful for wrapping26* streams with converters into an object for collation or27* normalization.28*29* <p>Characters can be accessed in two ways: as code units or as30* code points.31* Unicode code points are 21-bit integers and are the scalar values32* of Unicode characters. ICU uses the type UChar32 for them.33* Unicode code units are the storage units of a given34* Unicode/UCS Transformation Format (a character encoding scheme).35* With UTF-16, all code points can be represented with either one36* or two code units ("surrogates").37* String storage is typically based on code units, while properties38* of characters are typically determined using code point values.39* Some processes may be designed to work with sequences of code units,40* or it may be known that all characters that are important to an41* algorithm can be represented with single code units.42* Other processes will need to use the code point access functions.</p>43*44* <p>ForwardCharacterIterator provides nextPostInc() to access45* a code unit and advance an internal position into the text object,46* similar to a <code>return text[position++]</code>.<br>47* It provides next32PostInc() to access a code point and advance an internal48* position.</p>49*50* <p>next32PostInc() assumes that the current position is that of51* the beginning of a code point, 
i.e., of its first code unit.52* After next32PostInc(), this will be true again.53* In general, access to code units and code points in the same54* iteration loop should not be mixed. In UTF-16, if the current position55* is on a second code unit (Low Surrogate), then only that code unit56* is returned even by next32PostInc().</p>57*58* <p>For iteration with either function, there are two ways to59* check for the end of the iteration. When there are no more60* characters in the text object:61* <ul>62* <li>The hasNext() function returns FALSE.</li>63* <li>nextPostInc() and next32PostInc() return DONE64* when one attempts to read beyond the end of the text object.</li>65* </ul>66*67* Example:68* \code69* void function1(ForwardCharacterIterator &it) {70* UChar32 c;71* while(it.hasNext()) {72* c=it.next32PostInc();73* // use c74* }75* }76*77* void function1(ForwardCharacterIterator &it) {78* UChar c;79* while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {80* // use c81* }82* }83* \endcode84* </p>85*86* @stable ICU 2.087*/88class U_COMMON_API ForwardCharacterIterator : public UObject {89public:90/**91* Value returned by most of ForwardCharacterIterator's functions92* when the iterator has reached the limits of its iteration.93* @stable ICU 2.094*/95enum { DONE = 0xffff };9697/**98* Destructor.99* @stable ICU 2.0100*/101virtual ~ForwardCharacterIterator();102103/**104* Returns true when both iterators refer to the same105* character in the same character-storage object.106* @param that The ForwardCharacterIterator to be compared for equality107* @return true when both iterators refer to the same108* character in the same character-storage object109* @stable ICU 2.0110*/111virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;112113/**114* Returns true when the iterators refer to different115* text-storage objects, or to different characters in the116* same text-storage object.117* @param that The ForwardCharacterIterator to be compared for 
inequality118* @return true when the iterators refer to different119* text-storage objects, or to different characters in the120* same text-storage object121* @stable ICU 2.0122*/123inline UBool operator!=(const ForwardCharacterIterator& that) const;124125/**126* Generates a hash code for this iterator.127* @return the hash code.128* @stable ICU 2.0129*/130virtual int32_t hashCode(void) const = 0;131132/**133* Returns a UClassID for this ForwardCharacterIterator ("poor man's134* RTTI").<P> Despite the fact that this function is public,135* DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!136* @return a UClassID for this ForwardCharacterIterator137* @stable ICU 2.0138*/139virtual UClassID getDynamicClassID(void) const = 0;140141/**142* Gets the current code unit for returning and advances to the next code unit143* in the iteration range144* (toward endIndex()). If there are145* no more code units to return, returns DONE.146* @return the current code unit.147* @stable ICU 2.0148*/149virtual UChar nextPostInc(void) = 0;150151/**152* Gets the current code point for returning and advances to the next code point153* in the iteration range154* (toward endIndex()). If there are155* no more code points to return, returns DONE.156* @return the current code point.157* @stable ICU 2.0158*/159virtual UChar32 next32PostInc(void) = 0;160161/**162* Returns FALSE if there are no more code units or code points163* at or after the current position in the iteration range.164* This is used with nextPostInc() or next32PostInc() in forward165* iteration.166* @returns FALSE if there are no more code units or code points167* at or after the current position in the iteration range.168* @stable ICU 2.0169*/170virtual UBool hasNext() = 0;171172protected:173/** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/174ForwardCharacterIterator();175176/** Copy constructor to be overridden in the implementing class. 
@stable ICU 2.0*/177ForwardCharacterIterator(const ForwardCharacterIterator &other);178179/**180* Assignment operator to be overridden in the implementing class.181* @stable ICU 2.0182*/183ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }184};185186/**187* Abstract class that defines an API for iteration188* on text objects.189* This is an interface for forward and backward iteration190* and random access into a text object.191*192* <p>The API provides backward compatibility to the Java and older ICU193* CharacterIterator classes but extends them significantly:194* <ol>195* <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>196* <li>While the old API functions provided forward iteration with197* "pre-increment" semantics, the new one also provides functions198* with "post-increment" semantics. They are more efficient and should199* be the preferred iterator functions for new implementations.200* The backward iteration always had "pre-decrement" semantics, which201* are efficient.</li>202* <li>Just like ForwardCharacterIterator, it provides access to203* both code units and code points. Code point access versions are available204* for the old and the new iteration semantics.</li>205* <li>There are new functions for setting and moving the current position206* without returning a character, for efficiency.</li>207* </ol>208*209* See ForwardCharacterIterator for examples for using the new forward iteration210* functions. 
For backward iteration, there is also a hasPrevious() function211* that can be used analogously to hasNext().212* The old functions work as before and are shown below.</p>213*214* <p>Examples for some of the new functions:</p>215*216* Forward iteration with hasNext():217* \code218* void forward1(CharacterIterator &it) {219* UChar32 c;220* for(it.setToStart(); it.hasNext();) {221* c=it.next32PostInc();222* // use c223* }224* }225* \endcode226* Forward iteration more similar to loops with the old forward iteration,227* showing a way to convert simple for() loops:228* \code229* void forward2(CharacterIterator &it) {230* UChar c;231* for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {232* // use c233* }234* }235* \endcode236* Backward iteration with setToEnd() and hasPrevious():237* \code238* void backward1(CharacterIterator &it) {239* UChar32 c;240* for(it.setToEnd(); it.hasPrevious();) {241* c=it.previous32();242* // use c243* }244* }245* \endcode246* Backward iteration with a more traditional for() loop:247* \code248* void backward2(CharacterIterator &it) {249* UChar c;250* for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {251* // use c252* }253* }254* \endcode255*256* Example for random access:257* \code258* void random(CharacterIterator &it) {259* // set to the third code point from the beginning260* it.move32(3, CharacterIterator::kStart);261* // get a code point from here without moving the position262* UChar32 c=it.current32();263* // get the position264* int32_t pos=it.getIndex();265* // get the previous code unit266* UChar u=it.previous();267* // move back one more code unit268* it.move(-1, CharacterIterator::kCurrent);269* // set the position back to where it was270* // and read the same code point c and move beyond it271* it.setIndex(pos);272* if(c!=it.next32PostInc()) {273* exit(1); // CharacterIterator inconsistent274* }275* }276* \endcode277*278* <p>Examples, especially for the old API:</p>279*280* Function processing 
characters, in this example simple output281* <pre>282* \code283* void processChar( UChar c )284* {285* cout << " " << c;286* }287* \endcode288* </pre>289* Traverse the text from start to finish290* <pre>291* \code292* void traverseForward(CharacterIterator& iter)293* {294* for(UChar c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {295* processChar(c);296* }297* }298* \endcode299* </pre>300* Traverse the text backwards, from end to start301* <pre>302* \code303* void traverseBackward(CharacterIterator& iter)304* {305* for(UChar c = iter.last(); c != CharacterIterator.DONE; c = iter.previous()) {306* processChar(c);307* }308* }309* \endcode310* </pre>311* Traverse both forward and backward from a given position in the text.312* Calls to notBoundary() in this example represents some additional stopping criteria.313* <pre>314* \code315* void traverseOut(CharacterIterator& iter, int32_t pos)316* {317* UChar c;318* for (c = iter.setIndex(pos);319* c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));320* c = iter.next()) {}321* int32_t end = iter.getIndex();322* for (c = iter.setIndex(pos);323* c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));324* c = iter.previous()) {}325* int32_t start = iter.getIndex() + 1;326*327* cout << "start: " << start << " end: " << end << endl;328* for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {329* processChar(c);330* }331* }332* \endcode333* </pre>334* Creating a StringCharacterIterator and calling the test functions335* <pre>336* \code337* void CharacterIterator_Example( void )338* {339* cout << endl << "===== CharacterIterator_Example: =====" << endl;340* UnicodeString text("Ein kleiner Satz.");341* StringCharacterIterator iterator(text);342* cout << "----- traverseForward: -----------" << endl;343* traverseForward( iterator );344* cout << endl << endl << "----- traverseBackward: ----------" << endl;345* traverseBackward( iterator );346* cout << 
endl << endl << "----- traverseOut: ---------------" << endl;347* traverseOut( iterator, 7 );348* cout << endl << endl << "-----" << endl;349* }350* \endcode351* </pre>352*353* @stable ICU 2.0354*/355class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {356public:357/**358* Origin enumeration for the move() and move32() functions.359* @stable ICU 2.0360*/361enum EOrigin { kStart, kCurrent, kEnd };362363/**364* Destructor.365* @stable ICU 2.0366*/367virtual ~CharacterIterator();368369/**370* Returns a pointer to a new CharacterIterator of the same371* concrete class as this one, and referring to the same372* character in the same text-storage object as this one. The373* caller is responsible for deleting the new clone.374* @return a pointer to a new CharacterIterator375* @stable ICU 2.0376*/377virtual CharacterIterator* clone(void) const = 0;378379/**380* Sets the iterator to refer to the first code unit in its381* iteration range, and returns that code unit.382* This can be used to begin an iteration with next().383* @return the first code unit in its iteration range.384* @stable ICU 2.0385*/386virtual UChar first(void) = 0;387388/**389* Sets the iterator to refer to the first code unit in its390* iteration range, returns that code unit, and moves the position391* to the second code unit. 
This is an alternative to setToStart()392* for forward iteration with nextPostInc().393* @return the first code unit in its iteration range.394* @stable ICU 2.0395*/396virtual UChar firstPostInc(void);397398/**399* Sets the iterator to refer to the first code point in its400* iteration range, and returns that code unit,401* This can be used to begin an iteration with next32().402* Note that an iteration with next32PostInc(), beginning with,403* e.g., setToStart() or firstPostInc(), is more efficient.404* @return the first code point in its iteration range.405* @stable ICU 2.0406*/407virtual UChar32 first32(void) = 0;408409/**410* Sets the iterator to refer to the first code point in its411* iteration range, returns that code point, and moves the position412* to the second code point. This is an alternative to setToStart()413* for forward iteration with next32PostInc().414* @return the first code point in its iteration range.415* @stable ICU 2.0416*/417virtual UChar32 first32PostInc(void);418419/**420* Sets the iterator to refer to the first code unit or code point in its421* iteration range. This can be used to begin a forward422* iteration with nextPostInc() or next32PostInc().423* @return the start position of the iteration range424* @stable ICU 2.0425*/426inline int32_t setToStart();427428/**429* Sets the iterator to refer to the last code unit in its430* iteration range, and returns that code unit.431* This can be used to begin an iteration with previous().432* @return the last code unit.433* @stable ICU 2.0434*/435virtual UChar last(void) = 0;436437/**438* Sets the iterator to refer to the last code point in its439* iteration range, and returns that code unit.440* This can be used to begin an iteration with previous32().441* @return the last code point.442* @stable ICU 2.0443*/444virtual UChar32 last32(void) = 0;445446/**447* Sets the iterator to the end of its iteration range, just behind448* the last code unit or code point. 
This can be used to begin a backward449* iteration with previous() or previous32().450* @return the end position of the iteration range451* @stable ICU 2.0452*/453inline int32_t setToEnd();454455/**456* Sets the iterator to refer to the "position"-th code unit457* in the text-storage object the iterator refers to, and458* returns that code unit.459* @param position the "position"-th code unit in the text-storage object460* @return the "position"-th code unit.461* @stable ICU 2.0462*/463virtual UChar setIndex(int32_t position) = 0;464465/**466* Sets the iterator to refer to the beginning of the code point467* that contains the "position"-th code unit468* in the text-storage object the iterator refers to, and469* returns that code point.470* The current position is adjusted to the beginning of the code point471* (its first code unit).472* @param position the "position"-th code unit in the text-storage object473* @return the "position"-th code point.474* @stable ICU 2.0475*/476virtual UChar32 setIndex32(int32_t position) = 0;477478/**479* Returns the code unit the iterator currently refers to.480* @return the current code unit.481* @stable ICU 2.0482*/483virtual UChar current(void) const = 0;484485/**486* Returns the code point the iterator currently refers to.487* @return the current code point.488* @stable ICU 2.0489*/490virtual UChar32 current32(void) const = 0;491492/**493* Advances to the next code unit in the iteration range494* (toward endIndex()), and returns that code unit. If there are495* no more code units to return, returns DONE.496* @return the next code unit.497* @stable ICU 2.0498*/499virtual UChar next(void) = 0;500501/**502* Advances to the next code point in the iteration range503* (toward endIndex()), and returns that code point. 
If there are504* no more code points to return, returns DONE.505* Note that iteration with "pre-increment" semantics is less506* efficient than iteration with "post-increment" semantics507* that is provided by next32PostInc().508* @return the next code point.509* @stable ICU 2.0510*/511virtual UChar32 next32(void) = 0;512513/**514* Advances to the previous code unit in the iteration range515* (toward startIndex()), and returns that code unit. If there are516* no more code units to return, returns DONE.517* @return the previous code unit.518* @stable ICU 2.0519*/520virtual UChar previous(void) = 0;521522/**523* Advances to the previous code point in the iteration range524* (toward startIndex()), and returns that code point. If there are525* no more code points to return, returns DONE.526* @return the previous code point.527* @stable ICU 2.0528*/529virtual UChar32 previous32(void) = 0;530531/**532* Returns FALSE if there are no more code units or code points533* before the current position in the iteration range.534* This is used with previous() or previous32() in backward535* iteration.536* @return FALSE if there are no more code units or code points537* before the current position in the iteration range, return TRUE otherwise.538* @stable ICU 2.0539*/540virtual UBool hasPrevious() = 0;541542/**543* Returns the numeric index in the underlying text-storage544* object of the character returned by first(). 
Since it's545* possible to create an iterator that iterates across only546* part of a text-storage object, this number isn't547* necessarily 0.548* @returns the numeric index in the underlying text-storage549* object of the character returned by first().550* @stable ICU 2.0551*/552inline int32_t startIndex(void) const;553554/**555* Returns the numeric index in the underlying text-storage556* object of the position immediately BEYOND the character557* returned by last().558* @return the numeric index in the underlying text-storage559* object of the position immediately BEYOND the character560* returned by last().561* @stable ICU 2.0562*/563inline int32_t endIndex(void) const;564565/**566* Returns the numeric index in the underlying text-storage567* object of the character the iterator currently refers to568* (i.e., the character returned by current()).569* @return the numberic index in the text-storage object of570* the character the iterator currently refers to571* @stable ICU 2.0572*/573inline int32_t getIndex(void) const;574575/**576* Returns the length of the entire text in the underlying577* text-storage object.578* @return the length of the entire text in the text-storage object579* @stable ICU 2.0580*/581inline int32_t getLength() const;582583/**584* Moves the current position relative to the start or end of the585* iteration range, or relative to the current position itself.586* The movement is expressed in numbers of code units forward587* or backward by specifying a positive or negative delta.588* @param delta the position relative to origin. 
A positive delta means forward;589* a negative delta means backward.590* @param origin Origin enumeration {kStart, kCurrent, kEnd}591* @return the new position592* @stable ICU 2.0593*/594virtual int32_t move(int32_t delta, EOrigin origin) = 0;595596/**597* Moves the current position relative to the start or end of the598* iteration range, or relative to the current position itself.599* The movement is expressed in numbers of code points forward600* or backward by specifying a positive or negative delta.601* @param delta the position relative to origin. A positive delta means forward;602* a negative delta means backward.603* @param origin Origin enumeration {kStart, kCurrent, kEnd}604* @return the new position605* @stable ICU 2.0606*/607virtual int32_t move32(int32_t delta, EOrigin origin) = 0;608609/**610* Copies the text under iteration into the UnicodeString611* referred to by "result".612* @param result Receives a copy of the text under iteration.613* @stable ICU 2.0614*/615virtual void getText(UnicodeString& result) = 0;616617protected:618/**619* Empty constructor.620* @stable ICU 2.0621*/622CharacterIterator();623624/**625* Constructor, just setting the length field in this base class.626* @stable ICU 2.0627*/628CharacterIterator(int32_t length);629630/**631* Constructor, just setting the length and position fields in this base class.632* @stable ICU 2.0633*/634CharacterIterator(int32_t length, int32_t position);635636/**637* Constructor, just setting the length, start, end, and position fields in this base class.638* @stable ICU 2.0639*/640CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);641642/**643* Copy constructor.644*645* @param that The CharacterIterator to be copied646* @stable ICU 2.0647*/648CharacterIterator(const CharacterIterator &that);649650/**651* Assignment operator. 
Sets this CharacterIterator to have the same behavior,652* as the one passed in.653* @param that The CharacterIterator passed in.654* @return the newly set CharacterIterator.655* @stable ICU 2.0656*/657CharacterIterator &operator=(const CharacterIterator &that);658659/**660* Base class text length field.661* Necessary this for correct getText() and hashCode().662* @stable ICU 2.0663*/664int32_t textLength;665666/**667* Base class field for the current position.668* @stable ICU 2.0669*/670int32_t pos;671672/**673* Base class field for the start of the iteration range.674* @stable ICU 2.0675*/676int32_t begin;677678/**679* Base class field for the end of the iteration range.680* @stable ICU 2.0681*/682int32_t end;683};684685inline UBool686ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {687return !operator==(that);688}689690inline int32_t691CharacterIterator::setToStart() {692return move(0, kStart);693}694695inline int32_t696CharacterIterator::setToEnd() {697return move(0, kEnd);698}699700inline int32_t701CharacterIterator::startIndex(void) const {702return begin;703}704705inline int32_t706CharacterIterator::endIndex(void) const {707return end;708}709710inline int32_t711CharacterIterator::getIndex(void) const {712return pos;713}714715inline int32_t716CharacterIterator::getLength(void) const {717return textLength;718}719720U_NAMESPACE_END721#endif722723724