// Path: blob/jdk8u272-b10-aarch32-20201026/jdk/src/share/native/common/unicode/chariter.h
// 48729 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3********************************************************************4*5* Copyright (C) 1997-2011, International Business Machines6* Corporation and others. All Rights Reserved.7*8********************************************************************9*/1011#ifndef CHARITER_H12#define CHARITER_H1314#include "unicode/utypes.h"15#include "unicode/uobject.h"16#include "unicode/unistr.h"17/**18* \file19* \brief C++ API: Character Iterator20*/2122U_NAMESPACE_BEGIN23/**24* Abstract class that defines an API for forward-only iteration25* on text objects.26* This is a minimal interface for iteration without random access27* or backwards iteration. It is especially useful for wrapping28* streams with converters into an object for collation or29* normalization.30*31* <p>Characters can be accessed in two ways: as code units or as32* code points.33* Unicode code points are 21-bit integers and are the scalar values34* of Unicode characters. 
ICU uses the type UChar32 for them.35* Unicode code units are the storage units of a given36* Unicode/UCS Transformation Format (a character encoding scheme).37* With UTF-16, all code points can be represented with either one38* or two code units ("surrogates").39* String storage is typically based on code units, while properties40* of characters are typically determined using code point values.41* Some processes may be designed to work with sequences of code units,42* or it may be known that all characters that are important to an43* algorithm can be represented with single code units.44* Other processes will need to use the code point access functions.</p>45*46* <p>ForwardCharacterIterator provides nextPostInc() to access47* a code unit and advance an internal position into the text object,48* similar to a <code>return text[position++]</code>.<br>49* It provides next32PostInc() to access a code point and advance an internal50* position.</p>51*52* <p>next32PostInc() assumes that the current position is that of53* the beginning of a code point, i.e., of its first code unit.54* After next32PostInc(), this will be true again.55* In general, access to code units and code points in the same56* iteration loop should not be mixed. In UTF-16, if the current position57* is on a second code unit (Low Surrogate), then only that code unit58* is returned even by next32PostInc().</p>59*60* <p>For iteration with either function, there are two ways to61* check for the end of the iteration. 
When there are no more62* characters in the text object:63* <ul>64* <li>The hasNext() function returns FALSE.</li>65* <li>nextPostInc() and next32PostInc() return DONE66* when one attempts to read beyond the end of the text object.</li>67* </ul>68*69* Example:70* \code71* void function1(ForwardCharacterIterator &it) {72* UChar32 c;73* while(it.hasNext()) {74* c=it.next32PostInc();75* // use c76* }77* }78*79* void function1(ForwardCharacterIterator &it) {80* char16_t c;81* while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {82* // use c83* }84* }85* \endcode86* </p>87*88* @stable ICU 2.089*/90class U_COMMON_API ForwardCharacterIterator : public UObject {91public:92/**93* Value returned by most of ForwardCharacterIterator's functions94* when the iterator has reached the limits of its iteration.95* @stable ICU 2.096*/97enum { DONE = 0xffff };9899/**100* Destructor.101* @stable ICU 2.0102*/103virtual ~ForwardCharacterIterator();104105/**106* Returns true when both iterators refer to the same107* character in the same character-storage object.108* @param that The ForwardCharacterIterator to be compared for equality109* @return true when both iterators refer to the same110* character in the same character-storage object111* @stable ICU 2.0112*/113virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;114115/**116* Returns true when the iterators refer to different117* text-storage objects, or to different characters in the118* same text-storage object.119* @param that The ForwardCharacterIterator to be compared for inequality120* @return true when the iterators refer to different121* text-storage objects, or to different characters in the122* same text-storage object123* @stable ICU 2.0124*/125inline UBool operator!=(const ForwardCharacterIterator& that) const;126127/**128* Generates a hash code for this iterator.129* @return the hash code.130* @stable ICU 2.0131*/132virtual int32_t hashCode(void) const = 0;133134/**135* Returns a UClassID for 
this ForwardCharacterIterator ("poor man's136* RTTI").<P> Despite the fact that this function is public,137* DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!138* @return a UClassID for this ForwardCharacterIterator139* @stable ICU 2.0140*/141virtual UClassID getDynamicClassID(void) const = 0;142143/**144* Gets the current code unit for returning and advances to the next code unit145* in the iteration range146* (toward endIndex()). If there are147* no more code units to return, returns DONE.148* @return the current code unit.149* @stable ICU 2.0150*/151virtual char16_t nextPostInc(void) = 0;152153/**154* Gets the current code point for returning and advances to the next code point155* in the iteration range156* (toward endIndex()). If there are157* no more code points to return, returns DONE.158* @return the current code point.159* @stable ICU 2.0160*/161virtual UChar32 next32PostInc(void) = 0;162163/**164* Returns FALSE if there are no more code units or code points165* at or after the current position in the iteration range.166* This is used with nextPostInc() or next32PostInc() in forward167* iteration.168* @returns FALSE if there are no more code units or code points169* at or after the current position in the iteration range.170* @stable ICU 2.0171*/172virtual UBool hasNext() = 0;173174protected:175/** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/176ForwardCharacterIterator();177178/** Copy constructor to be overridden in the implementing class. 
@stable ICU 2.0*/179ForwardCharacterIterator(const ForwardCharacterIterator &other);180181/**182* Assignment operator to be overridden in the implementing class.183* @stable ICU 2.0184*/185ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }186};187188/**189* Abstract class that defines an API for iteration190* on text objects.191* This is an interface for forward and backward iteration192* and random access into a text object.193*194* <p>The API provides backward compatibility to the Java and older ICU195* CharacterIterator classes but extends them significantly:196* <ol>197* <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>198* <li>While the old API functions provided forward iteration with199* "pre-increment" semantics, the new one also provides functions200* with "post-increment" semantics. They are more efficient and should201* be the preferred iterator functions for new implementations.202* The backward iteration always had "pre-decrement" semantics, which203* are efficient.</li>204* <li>Just like ForwardCharacterIterator, it provides access to205* both code units and code points. Code point access versions are available206* for the old and the new iteration semantics.</li>207* <li>There are new functions for setting and moving the current position208* without returning a character, for efficiency.</li>209* </ol>210*211* See ForwardCharacterIterator for examples for using the new forward iteration212* functions. 
For backward iteration, there is also a hasPrevious() function213* that can be used analogously to hasNext().214* The old functions work as before and are shown below.</p>215*216* <p>Examples for some of the new functions:</p>217*218* Forward iteration with hasNext():219* \code220* void forward1(CharacterIterator &it) {221* UChar32 c;222* for(it.setToStart(); it.hasNext();) {223* c=it.next32PostInc();224* // use c225* }226* }227* \endcode228* Forward iteration more similar to loops with the old forward iteration,229* showing a way to convert simple for() loops:230* \code231* void forward2(CharacterIterator &it) {232* char16_t c;233* for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {234* // use c235* }236* }237* \endcode238* Backward iteration with setToEnd() and hasPrevious():239* \code240* void backward1(CharacterIterator &it) {241* UChar32 c;242* for(it.setToEnd(); it.hasPrevious();) {243* c=it.previous32();244* // use c245* }246* }247* \endcode248* Backward iteration with a more traditional for() loop:249* \code250* void backward2(CharacterIterator &it) {251* char16_t c;252* for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {253* // use c254* }255* }256* \endcode257*258* Example for random access:259* \code260* void random(CharacterIterator &it) {261* // set to the third code point from the beginning262* it.move32(3, CharacterIterator::kStart);263* // get a code point from here without moving the position264* UChar32 c=it.current32();265* // get the position266* int32_t pos=it.getIndex();267* // get the previous code unit268* char16_t u=it.previous();269* // move back one more code unit270* it.move(-1, CharacterIterator::kCurrent);271* // set the position back to where it was272* // and read the same code point c and move beyond it273* it.setIndex(pos);274* if(c!=it.next32PostInc()) {275* exit(1); // CharacterIterator inconsistent276* }277* }278* \endcode279*280* <p>Examples, especially for the old API:</p>281*282* Function 
processing characters, in this example simple output283* <pre>284* \code285* void processChar( char16_t c )286* {287* cout << " " << c;288* }289* \endcode290* </pre>291* Traverse the text from start to finish292* <pre>293* \code294* void traverseForward(CharacterIterator& iter)295* {296* for(char16_t c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {297* processChar(c);298* }299* }300* \endcode301* </pre>302* Traverse the text backwards, from end to start303* <pre>304* \code305* void traverseBackward(CharacterIterator& iter)306* {307* for(char16_t c = iter.last(); c != CharacterIterator.DONE; c = iter.previous()) {308* processChar(c);309* }310* }311* \endcode312* </pre>313* Traverse both forward and backward from a given position in the text.314* Calls to notBoundary() in this example represents some additional stopping criteria.315* <pre>316* \code317* void traverseOut(CharacterIterator& iter, int32_t pos)318* {319* char16_t c;320* for (c = iter.setIndex(pos);321* c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));322* c = iter.next()) {}323* int32_t end = iter.getIndex();324* for (c = iter.setIndex(pos);325* c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));326* c = iter.previous()) {}327* int32_t start = iter.getIndex() + 1;328*329* cout << "start: " << start << " end: " << end << endl;330* for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {331* processChar(c);332* }333* }334* \endcode335* </pre>336* Creating a StringCharacterIterator and calling the test functions337* <pre>338* \code339* void CharacterIterator_Example( void )340* {341* cout << endl << "===== CharacterIterator_Example: =====" << endl;342* UnicodeString text("Ein kleiner Satz.");343* StringCharacterIterator iterator(text);344* cout << "----- traverseForward: -----------" << endl;345* traverseForward( iterator );346* cout << endl << endl << "----- traverseBackward: ----------" << endl;347* traverseBackward( 
iterator );348* cout << endl << endl << "----- traverseOut: ---------------" << endl;349* traverseOut( iterator, 7 );350* cout << endl << endl << "-----" << endl;351* }352* \endcode353* </pre>354*355* @stable ICU 2.0356*/357class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {358public:359/**360* Origin enumeration for the move() and move32() functions.361* @stable ICU 2.0362*/363enum EOrigin { kStart, kCurrent, kEnd };364365/**366* Destructor.367* @stable ICU 2.0368*/369virtual ~CharacterIterator();370371/**372* Returns a pointer to a new CharacterIterator of the same373* concrete class as this one, and referring to the same374* character in the same text-storage object as this one. The375* caller is responsible for deleting the new clone.376* @return a pointer to a new CharacterIterator377* @stable ICU 2.0378*/379virtual CharacterIterator* clone(void) const = 0;380381/**382* Sets the iterator to refer to the first code unit in its383* iteration range, and returns that code unit.384* This can be used to begin an iteration with next().385* @return the first code unit in its iteration range.386* @stable ICU 2.0387*/388virtual char16_t first(void) = 0;389390/**391* Sets the iterator to refer to the first code unit in its392* iteration range, returns that code unit, and moves the position393* to the second code unit. 
This is an alternative to setToStart()394* for forward iteration with nextPostInc().395* @return the first code unit in its iteration range.396* @stable ICU 2.0397*/398virtual char16_t firstPostInc(void);399400/**401* Sets the iterator to refer to the first code point in its402* iteration range, and returns that code unit,403* This can be used to begin an iteration with next32().404* Note that an iteration with next32PostInc(), beginning with,405* e.g., setToStart() or firstPostInc(), is more efficient.406* @return the first code point in its iteration range.407* @stable ICU 2.0408*/409virtual UChar32 first32(void) = 0;410411/**412* Sets the iterator to refer to the first code point in its413* iteration range, returns that code point, and moves the position414* to the second code point. This is an alternative to setToStart()415* for forward iteration with next32PostInc().416* @return the first code point in its iteration range.417* @stable ICU 2.0418*/419virtual UChar32 first32PostInc(void);420421/**422* Sets the iterator to refer to the first code unit or code point in its423* iteration range. This can be used to begin a forward424* iteration with nextPostInc() or next32PostInc().425* @return the start position of the iteration range426* @stable ICU 2.0427*/428inline int32_t setToStart();429430/**431* Sets the iterator to refer to the last code unit in its432* iteration range, and returns that code unit.433* This can be used to begin an iteration with previous().434* @return the last code unit.435* @stable ICU 2.0436*/437virtual char16_t last(void) = 0;438439/**440* Sets the iterator to refer to the last code point in its441* iteration range, and returns that code unit.442* This can be used to begin an iteration with previous32().443* @return the last code point.444* @stable ICU 2.0445*/446virtual UChar32 last32(void) = 0;447448/**449* Sets the iterator to the end of its iteration range, just behind450* the last code unit or code point. 
This can be used to begin a backward451* iteration with previous() or previous32().452* @return the end position of the iteration range453* @stable ICU 2.0454*/455inline int32_t setToEnd();456457/**458* Sets the iterator to refer to the "position"-th code unit459* in the text-storage object the iterator refers to, and460* returns that code unit.461* @param position the "position"-th code unit in the text-storage object462* @return the "position"-th code unit.463* @stable ICU 2.0464*/465virtual char16_t setIndex(int32_t position) = 0;466467/**468* Sets the iterator to refer to the beginning of the code point469* that contains the "position"-th code unit470* in the text-storage object the iterator refers to, and471* returns that code point.472* The current position is adjusted to the beginning of the code point473* (its first code unit).474* @param position the "position"-th code unit in the text-storage object475* @return the "position"-th code point.476* @stable ICU 2.0477*/478virtual UChar32 setIndex32(int32_t position) = 0;479480/**481* Returns the code unit the iterator currently refers to.482* @return the current code unit.483* @stable ICU 2.0484*/485virtual char16_t current(void) const = 0;486487/**488* Returns the code point the iterator currently refers to.489* @return the current code point.490* @stable ICU 2.0491*/492virtual UChar32 current32(void) const = 0;493494/**495* Advances to the next code unit in the iteration range496* (toward endIndex()), and returns that code unit. If there are497* no more code units to return, returns DONE.498* @return the next code unit.499* @stable ICU 2.0500*/501virtual char16_t next(void) = 0;502503/**504* Advances to the next code point in the iteration range505* (toward endIndex()), and returns that code point. 
If there are506* no more code points to return, returns DONE.507* Note that iteration with "pre-increment" semantics is less508* efficient than iteration with "post-increment" semantics509* that is provided by next32PostInc().510* @return the next code point.511* @stable ICU 2.0512*/513virtual UChar32 next32(void) = 0;514515/**516* Advances to the previous code unit in the iteration range517* (toward startIndex()), and returns that code unit. If there are518* no more code units to return, returns DONE.519* @return the previous code unit.520* @stable ICU 2.0521*/522virtual char16_t previous(void) = 0;523524/**525* Advances to the previous code point in the iteration range526* (toward startIndex()), and returns that code point. If there are527* no more code points to return, returns DONE.528* @return the previous code point.529* @stable ICU 2.0530*/531virtual UChar32 previous32(void) = 0;532533/**534* Returns FALSE if there are no more code units or code points535* before the current position in the iteration range.536* This is used with previous() or previous32() in backward537* iteration.538* @return FALSE if there are no more code units or code points539* before the current position in the iteration range, return TRUE otherwise.540* @stable ICU 2.0541*/542virtual UBool hasPrevious() = 0;543544/**545* Returns the numeric index in the underlying text-storage546* object of the character returned by first(). 
Since it's547* possible to create an iterator that iterates across only548* part of a text-storage object, this number isn't549* necessarily 0.550* @returns the numeric index in the underlying text-storage551* object of the character returned by first().552* @stable ICU 2.0553*/554inline int32_t startIndex(void) const;555556/**557* Returns the numeric index in the underlying text-storage558* object of the position immediately BEYOND the character559* returned by last().560* @return the numeric index in the underlying text-storage561* object of the position immediately BEYOND the character562* returned by last().563* @stable ICU 2.0564*/565inline int32_t endIndex(void) const;566567/**568* Returns the numeric index in the underlying text-storage569* object of the character the iterator currently refers to570* (i.e., the character returned by current()).571* @return the numeric index in the text-storage object of572* the character the iterator currently refers to573* @stable ICU 2.0574*/575inline int32_t getIndex(void) const;576577/**578* Returns the length of the entire text in the underlying579* text-storage object.580* @return the length of the entire text in the text-storage object581* @stable ICU 2.0582*/583inline int32_t getLength() const;584585/**586* Moves the current position relative to the start or end of the587* iteration range, or relative to the current position itself.588* The movement is expressed in numbers of code units forward589* or backward by specifying a positive or negative delta.590* @param delta the position relative to origin. 
A positive delta means forward;591* a negative delta means backward.592* @param origin Origin enumeration {kStart, kCurrent, kEnd}593* @return the new position594* @stable ICU 2.0595*/596virtual int32_t move(int32_t delta, EOrigin origin) = 0;597598/**599* Moves the current position relative to the start or end of the600* iteration range, or relative to the current position itself.601* The movement is expressed in numbers of code points forward602* or backward by specifying a positive or negative delta.603* @param delta the position relative to origin. A positive delta means forward;604* a negative delta means backward.605* @param origin Origin enumeration {kStart, kCurrent, kEnd}606* @return the new position607* @stable ICU 2.0608*/609#ifdef move32610// One of the system headers right now is sometimes defining a conflicting macro we don't use611#undef move32612#endif613virtual int32_t move32(int32_t delta, EOrigin origin) = 0;614615/**616* Copies the text under iteration into the UnicodeString617* referred to by "result".618* @param result Receives a copy of the text under iteration.619* @stable ICU 2.0620*/621virtual void getText(UnicodeString& result) = 0;622623protected:624/**625* Empty constructor.626* @stable ICU 2.0627*/628CharacterIterator();629630/**631* Constructor, just setting the length field in this base class.632* @stable ICU 2.0633*/634CharacterIterator(int32_t length);635636/**637* Constructor, just setting the length and position fields in this base class.638* @stable ICU 2.0639*/640CharacterIterator(int32_t length, int32_t position);641642/**643* Constructor, just setting the length, start, end, and position fields in this base class.644* @stable ICU 2.0645*/646CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);647648/**649* Copy constructor.650*651* @param that The CharacterIterator to be copied652* @stable ICU 2.0653*/654CharacterIterator(const CharacterIterator &that);655656/**657* Assignment operator. 
Sets this CharacterIterator to have the same behavior,658* as the one passed in.659* @param that The CharacterIterator passed in.660* @return the newly set CharacterIterator.661* @stable ICU 2.0662*/663CharacterIterator &operator=(const CharacterIterator &that);664665/**666* Base class text length field.667* Necessary this for correct getText() and hashCode().668* @stable ICU 2.0669*/670int32_t textLength;671672/**673* Base class field for the current position.674* @stable ICU 2.0675*/676int32_t pos;677678/**679* Base class field for the start of the iteration range.680* @stable ICU 2.0681*/682int32_t begin;683684/**685* Base class field for the end of the iteration range.686* @stable ICU 2.0687*/688int32_t end;689};690691inline UBool692ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {693return !operator==(that);694}695696inline int32_t697CharacterIterator::setToStart() {698return move(0, kStart);699}700701inline int32_t702CharacterIterator::setToEnd() {703return move(0, kEnd);704}705706inline int32_t707CharacterIterator::startIndex(void) const {708return begin;709}710711inline int32_t712CharacterIterator::endIndex(void) const {713return end;714}715716inline int32_t717CharacterIterator::getIndex(void) const {718return pos;719}720721inline int32_t722CharacterIterator::getLength(void) const {723return textLength;724}725726U_NAMESPACE_END727#endif728729730