Path: blob/jdk8u272-b10-aarch32-20201026/jdk/src/share/native/common/unicode/edits.h
48725 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html23// edits.h4// created: 2016dec30 Markus W. Scherer56#ifndef __EDITS_H__7#define __EDITS_H__89#include "unicode/utypes.h"10#include "unicode/uobject.h"1112/**13* \file14* \brief C++ API: C++ class Edits for low-level string transformations on styled text.15*/1617U_NAMESPACE_BEGIN1819class UnicodeString;2021/**22* Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions23* in linear progression. Does not support moving/reordering of text.24*25* There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to26* instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and27* {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity,28* whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one29* mapping between code points in the source and destination strings.30*31* After all edits have been added, instances of this class should be considered immutable, and an32* {@link Edits::Iterator} can be used for queries.33*34* There are four flavors of Edits::Iterator:35*36* <ul>37* <li>{@link #getFineIterator()} retains full granularity of change edits.38* <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling39* next() on the iterator, skips over no-change edits (unchanged regions).40* <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change41* edits are automatically merged during the construction phase.)42* <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when43* calling next() on the iterator, skips over no-change edits (unchanged regions).44* </ul>45*46* For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the47* following fine edits:48* <ul>49* <li>abc ⇨ abc (no-change)50* <li>ß ⇨ ss (change)51* <li>D ⇨ d (change)52* <li>e ⇨ e (no-change)53* <li>F ⇨ f (change)54* </ul>55* and the following coarse edits (note how adjacent change edits get merged together):56* <ul>57* <li>abc ⇨ abc (no-change)58* <li>ßD ⇨ ssd (change)59* <li>e ⇨ e (no-change)60* <li>F ⇨ f (change)61* </ul>62*63* The "fine changes" and "coarse changes" iterators will step through only the change edits when their64* `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when65* their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()`66* methods are used to walk through the string.67*68* For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in69* UCharacterCaseTest.java.70*71* An Edits object tracks a separate UErrorCode, but ICU string transformation functions72* (e.g., case mapping functions) merge any such errors into their API's UErrorCode.73*74* @stable ICU 5975*/76class U_COMMON_API Edits U_FINAL : public UMemory {77public:78/**79* Constructs an empty object.80* @stable ICU 5981*/82Edits() :83array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),84errorCode_(U_ZERO_ERROR) {}85/**86* Copy constructor.87* @param other source edits88* @stable ICU 6089*/90Edits(const Edits &other) :91array(stackArray), capacity(STACK_CAPACITY), length(other.length),92delta(other.delta), numChanges(other.numChanges),93errorCode_(other.errorCode_) {94copyArray(other);95}96/**97* Move constructor, might leave src empty.98* This object will have the same contents that the source object had.99* @param src source edits100* @stable ICU 60101*/102Edits(Edits &&src) U_NOEXCEPT :103array(stackArray), capacity(STACK_CAPACITY), length(src.length),104delta(src.delta), numChanges(src.numChanges),105errorCode_(src.errorCode_) {106moveArray(src);107}108109/**110* Destructor.111* @stable ICU 59112*/113~Edits();114115/**116* Assignment operator.117* @param other source edits118* @return *this119* @stable ICU 60120*/121Edits &operator=(const Edits &other);122123/**124* Move assignment operator, might leave src empty.125* This object will have the same contents that the source object had.126* The behavior is undefined if *this and src are the same object.127* @param src source edits128* @return *this129* @stable ICU 60130*/131Edits &operator=(Edits &&src) U_NOEXCEPT;132133/**134* Resets the data but may not release memory.135* @stable ICU 59136*/137void reset() U_NOEXCEPT;138139/**140* Adds a no-change edit: a record for an unchanged segment of text.141* Normally called from inside ICU string transformation functions, not user code.142* @stable ICU 59143*/144void addUnchanged(int32_t unchangedLength);145/**146* Adds a change edit: a record for a text replacement/insertion/deletion.147* Normally called from inside ICU string transformation functions, not user code.148* @stable ICU 59149*/150void addReplace(int32_t oldLength, int32_t newLength);151/**152* Sets the UErrorCode if an error occurred while recording edits.153* Preserves older error codes in the outErrorCode.154* Normally called from inside ICU string transformation functions, not user code.155* @param outErrorCode Set to an error code if it does not contain one already156* and an error occurred while recording edits.157* Otherwise unchanged.158* @return TRUE if U_FAILURE(outErrorCode)159* @stable ICU 59160*/161UBool copyErrorTo(UErrorCode &outErrorCode);162163/**164* How much longer is the new text compared with the old text?165* @return new length minus old length166* @stable ICU 59167*/168int32_t lengthDelta() const { return delta; }169/**170* @return TRUE if there are any change edits171* @stable ICU 59172*/173UBool hasChanges() const { return numChanges != 0; }174175/**176* @return the number of change edits177* @stable ICU 60178*/179int32_t numberOfChanges() const { return numChanges; }180181/**182* Access to the list of edits.183*184* At any moment in time, an instance of this class points to a single edit: a "window" into a span185* of the source string and the corresponding span of the destination string. The source string span186* starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string187* span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars.188*189* The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`,190* and `findDestinationIndex(int32_t, UErrorCode &)` methods.191* Calling any of these methods mutates the iterator to make it point to the corresponding edit.192*193* For more information, see the documentation for {@link Edits}.194*195* @see getCoarseIterator196* @see getFineIterator197* @stable ICU 59198*/199struct U_COMMON_API Iterator U_FINAL : public UMemory {200/**201* Default constructor, empty iterator.202* @stable ICU 60203*/204Iterator() :205array(nullptr), index(0), length(0),206remaining(0), onlyChanges_(FALSE), coarse(FALSE),207dir(0), changed(FALSE), oldLength_(0), newLength_(0),208srcIndex(0), replIndex(0), destIndex(0) {}209/**210* Copy constructor.211* @stable ICU 59212*/213Iterator(const Iterator &other) = default;214/**215* Assignment operator.216* @stable ICU 59217*/218Iterator &operator=(const Iterator &other) = default;219220/**221* Advances the iterator to the next edit.222* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,223* or else the function returns immediately. Check for U_FAILURE()224* on output or use with function chaining. (See User Guide for details.)225* @return TRUE if there is another edit226* @stable ICU 59227*/228UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }229230/**231* Moves the iterator to the edit that contains the source index.232* The source index may be found in a no-change edit233* even if normal iteration would skip no-change edits.234* Normal iteration can continue from a found edit.235*236* The iterator state before this search logically does not matter.237* (It may affect the performance of the search.)238*239* The iterator state after this search is undefined240* if the source index is out of bounds for the source string.241*242* @param i source index243* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,244* or else the function returns immediately. Check for U_FAILURE()245* on output or use with function chaining. (See User Guide for details.)246* @return TRUE if the edit for the source index was found247* @stable ICU 59248*/249UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {250return findIndex(i, TRUE, errorCode) == 0;251}252253/**254* Moves the iterator to the edit that contains the destination index.255* The destination index may be found in a no-change edit256* even if normal iteration would skip no-change edits.257* Normal iteration can continue from a found edit.258*259* The iterator state before this search logically does not matter.260* (It may affect the performance of the search.)261*262* The iterator state after this search is undefined263* if the source index is out of bounds for the source string.264*265* @param i destination index266* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,267* or else the function returns immediately. Check for U_FAILURE()268* on output or use with function chaining. (See User Guide for details.)269* @return TRUE if the edit for the destination index was found270* @stable ICU 60271*/272UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {273return findIndex(i, FALSE, errorCode) == 0;274}275276/**277* Computes the destination index corresponding to the given source index.278* If the source index is inside a change edit (not at its start),279* then the destination index at the end of that edit is returned,280* since there is no information about index mapping inside a change edit.281*282* (This means that indexes to the start and middle of an edit,283* for example around a grapheme cluster, are mapped to indexes284* encompassing the entire edit.285* The alternative, mapping an interior index to the start,286* would map such an interval to an empty one.)287*288* This operation will usually but not always modify this object.289* The iterator state after this search is undefined.290*291* @param i source index292* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,293* or else the function returns immediately. Check for U_FAILURE()294* on output or use with function chaining. (See User Guide for details.)295* @return destination index; undefined if i is not 0..string length296* @stable ICU 60297*/298int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);299300/**301* Computes the source index corresponding to the given destination index.302* If the destination index is inside a change edit (not at its start),303* then the source index at the end of that edit is returned,304* since there is no information about index mapping inside a change edit.305*306* (This means that indexes to the start and middle of an edit,307* for example around a grapheme cluster, are mapped to indexes308* encompassing the entire edit.309* The alternative, mapping an interior index to the start,310* would map such an interval to an empty one.)311*312* This operation will usually but not always modify this object.313* The iterator state after this search is undefined.314*315* @param i destination index316* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,317* or else the function returns immediately. Check for U_FAILURE()318* on output or use with function chaining. (See User Guide for details.)319* @return source index; undefined if i is not 0..string length320* @stable ICU 60321*/322int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);323324/**325* Returns whether the edit currently represented by the iterator is a change edit.326*327* @return TRUE if this edit replaces oldLength() units with newLength() different ones.328* FALSE if oldLength units remain unchanged.329* @stable ICU 59330*/331UBool hasChange() const { return changed; }332333/**334* The length of the current span in the source string, which starts at {@link #sourceIndex}.335*336* @return the number of units in the original string which are replaced or remain unchanged.337* @stable ICU 59338*/339int32_t oldLength() const { return oldLength_; }340341/**342* The length of the current span in the destination string, which starts at343* {@link #destinationIndex}, or in the replacement string, which starts at344* {@link #replacementIndex}.345*346* @return the number of units in the modified string, if hasChange() is TRUE.347* Same as oldLength if hasChange() is FALSE.348* @stable ICU 59349*/350int32_t newLength() const { return newLength_; }351352/**353* The start index of the current span in the source string; the span has length354* {@link #oldLength}.355*356* @return the current index into the source string357* @stable ICU 59358*/359int32_t sourceIndex() const { return srcIndex; }360361/**362* The start index of the current span in the replacement string; the span has length363* {@link #newLength}. Well-defined only if the current edit is a change edit.364*365* The *replacement string* is the concatenation of all substrings of the destination366* string corresponding to change edits.367*368* This method is intended to be used together with operations that write only replacement369* characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option).370* The source string can then be modified in-place.371*372* @return the current index into the replacement-characters-only string,373* not counting unchanged spans374* @stable ICU 59375*/376int32_t replacementIndex() const {377// TODO: Throw an exception if we aren't in a change edit?378return replIndex;379}380381/**382* The start index of the current span in the destination string; the span has length383* {@link #newLength}.384*385* @return the current index into the full destination string386* @stable ICU 59387*/388int32_t destinationIndex() const { return destIndex; }389390#ifndef U_HIDE_INTERNAL_API391/**392* A string representation of the current edit represented by the iterator for debugging. You393* should not depend on the contents of the return string.394* @internal395*/396UnicodeString& toString(UnicodeString& appendTo) const;397#endif // U_HIDE_INTERNAL_API398399private:400friend class Edits;401402Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);403404int32_t readLength(int32_t head);405void updateNextIndexes();406void updatePreviousIndexes();407UBool noNext();408UBool next(UBool onlyChanges, UErrorCode &errorCode);409UBool previous(UErrorCode &errorCode);410/** @return -1: error or i<0; 0: found; 1: i>=string length */411int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);412413const uint16_t *array;414int32_t index, length;415// 0 if we are not within compressed equal-length changes.416// Otherwise the number of remaining changes, including the current one.417int32_t remaining;418UBool onlyChanges_, coarse;419420int8_t dir; // iteration direction: back(<0), initial(0), forward(>0)421UBool changed;422int32_t oldLength_, newLength_;423int32_t srcIndex, replIndex, destIndex;424};425426/**427* Returns an Iterator for coarse-grained change edits428* (adjacent change edits are treated as one).429* Can be used to perform simple string updates.430* Skips no-change edits.431* @return an Iterator that merges adjacent changes.432* @stable ICU 59433*/434Iterator getCoarseChangesIterator() const {435return Iterator(array, length, TRUE, TRUE);436}437438/**439* Returns an Iterator for coarse-grained change and no-change edits440* (adjacent change edits are treated as one).441* Can be used to perform simple string updates.442* Adjacent change edits are treated as one edit.443* @return an Iterator that merges adjacent changes.444* @stable ICU 59445*/446Iterator getCoarseIterator() const {447return Iterator(array, length, FALSE, TRUE);448}449450/**451* Returns an Iterator for fine-grained change edits452* (full granularity of change edits is retained).453* Can be used for modifying styled text.454* Skips no-change edits.455* @return an Iterator that separates adjacent changes.456* @stable ICU 59457*/458Iterator getFineChangesIterator() const {459return Iterator(array, length, TRUE, FALSE);460}461462/**463* Returns an Iterator for fine-grained change and no-change edits464* (full granularity of change edits is retained).465* Can be used for modifying styled text.466* @return an Iterator that separates adjacent changes.467* @stable ICU 59468*/469Iterator getFineIterator() const {470return Iterator(array, length, FALSE, FALSE);471}472473/**474* Merges the two input Edits and appends the result to this object.475*476* Consider two string transformations (for example, normalization and case mapping)477* where each records Edits in addition to writing an output string.<br>478* Edits ab reflect how substrings of input string a479* map to substrings of intermediate string b.<br>480* Edits bc reflect how substrings of intermediate string b481* map to substrings of output string c.<br>482* This function merges ab and bc such that the additional edits483* recorded in this object reflect how substrings of input string a484* map to substrings of output string c.485*486* If unrelated Edits are passed in where the output string of the first487* has a different length than the input string of the second,488* then a U_ILLEGAL_ARGUMENT_ERROR is reported.489*490* @param ab reflects how substrings of input string a491* map to substrings of intermediate string b.492* @param bc reflects how substrings of intermediate string b493* map to substrings of output string c.494* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,495* or else the function returns immediately. Check for U_FAILURE()496* on output or use with function chaining. (See User Guide for details.)497* @return *this, with the merged edits appended498* @stable ICU 60499*/500Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);501502private:503void releaseArray() U_NOEXCEPT;504Edits ©Array(const Edits &other);505Edits &moveArray(Edits &src) U_NOEXCEPT;506507void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }508int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }509510void append(int32_t r);511UBool growArray();512513static const int32_t STACK_CAPACITY = 100;514uint16_t *array;515int32_t capacity;516int32_t length;517int32_t delta;518int32_t numChanges;519UErrorCode errorCode_;520uint16_t stackArray[STACK_CAPACITY];521};522523U_NAMESPACE_END524525#endif // __EDITS_H__526527528