Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/native/common/unicode/edits.h
38827 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html23// edits.h4// created: 2016dec30 Markus W. Scherer56#ifndef __EDITS_H__7#define __EDITS_H__89#include "unicode/utypes.h"1011#if U_SHOW_CPLUSPLUS_API1213#include "unicode/uobject.h"1415/**16* \file17* \brief C++ API: C++ class Edits for low-level string transformations on styled text.18*/1920U_NAMESPACE_BEGIN2122class UnicodeString;2324/**25* Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions26* in linear progression. Does not support moving/reordering of text.27*28* There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to29* instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and30* {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity,31* whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one32* mapping between code points in the source and destination strings.33*34* After all edits have been added, instances of this class should be considered immutable, and an35* {@link Edits::Iterator} can be used for queries.36*37* There are four flavors of Edits::Iterator:38*39* <ul>40* <li>{@link #getFineIterator()} retains full granularity of change edits.41* <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling42* next() on the iterator, skips over no-change edits (unchanged regions).43* <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change44* edits are automatically merged during the construction phase.)45* <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when46* calling next() on the iterator, skips over no-change edits (unchanged regions).47* </ul>48*49* For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the50* following fine edits:51* <ul>52* <li>abc ⇨ abc (no-change)53* <li>ß ⇨ ss (change)54* <li>D ⇨ d (change)55* <li>e ⇨ e (no-change)56* <li>F ⇨ f (change)57* </ul>58* and the following coarse edits (note how adjacent change edits get merged together):59* <ul>60* <li>abc ⇨ abc (no-change)61* <li>ßD ⇨ ssd (change)62* <li>e ⇨ e (no-change)63* <li>F ⇨ f (change)64* </ul>65*66* The "fine changes" and "coarse changes" iterators will step through only the change edits when their67* `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when68* their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()`69* methods are used to walk through the string.70*71* For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in72* UCharacterCaseTest.java.73*74* An Edits object tracks a separate UErrorCode, but ICU string transformation functions75* (e.g., case mapping functions) merge any such errors into their API's UErrorCode.76*77* @stable ICU 5978*/79class U_COMMON_API Edits U_FINAL : public UMemory {80public:81/**82* Constructs an empty object.83* @stable ICU 5984*/85Edits() :86array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),87errorCode_(U_ZERO_ERROR) {}88/**89* Copy constructor.90* @param other source edits91* @stable ICU 6092*/93Edits(const Edits &other) :94array(stackArray), capacity(STACK_CAPACITY), length(other.length),95delta(other.delta), numChanges(other.numChanges),96errorCode_(other.errorCode_) {97copyArray(other);98}99/**100* Move constructor, might leave src empty.101* This object will have the same contents that the source object had.102* @param src source edits103* @stable ICU 60104*/105Edits(Edits &&src) U_NOEXCEPT :106array(stackArray), capacity(STACK_CAPACITY), length(src.length),107delta(src.delta), numChanges(src.numChanges),108errorCode_(src.errorCode_) {109moveArray(src);110}111112/**113* Destructor.114* @stable ICU 59115*/116~Edits();117118/**119* Assignment operator.120* @param other source edits121* @return *this122* @stable ICU 60123*/124Edits &operator=(const Edits &other);125126/**127* Move assignment operator, might leave src empty.128* This object will have the same contents that the source object had.129* The behavior is undefined if *this and src are the same object.130* @param src source edits131* @return *this132* @stable ICU 60133*/134Edits &operator=(Edits &&src) U_NOEXCEPT;135136/**137* Resets the data but may not release memory.138* @stable ICU 59139*/140void reset() U_NOEXCEPT;141142/**143* Adds a no-change edit: a record for an unchanged segment of text.144* Normally called from inside ICU string transformation functions, not user code.145* @stable ICU 59146*/147void addUnchanged(int32_t unchangedLength);148/**149* Adds a change edit: a record for a text replacement/insertion/deletion.150* Normally called from inside ICU string transformation functions, not user code.151* @stable ICU 59152*/153void addReplace(int32_t oldLength, int32_t newLength);154/**155* Sets the UErrorCode if an error occurred while recording edits.156* Preserves older error codes in the outErrorCode.157* Normally called from inside ICU string transformation functions, not user code.158* @param outErrorCode Set to an error code if it does not contain one already159* and an error occurred while recording edits.160* Otherwise unchanged.161* @return TRUE if U_FAILURE(outErrorCode)162* @stable ICU 59163*/164UBool copyErrorTo(UErrorCode &outErrorCode) const;165166/**167* How much longer is the new text compared with the old text?168* @return new length minus old length169* @stable ICU 59170*/171int32_t lengthDelta() const { return delta; }172/**173* @return TRUE if there are any change edits174* @stable ICU 59175*/176UBool hasChanges() const { return numChanges != 0; }177178/**179* @return the number of change edits180* @stable ICU 60181*/182int32_t numberOfChanges() const { return numChanges; }183184/**185* Access to the list of edits.186*187* At any moment in time, an instance of this class points to a single edit: a "window" into a span188* of the source string and the corresponding span of the destination string. The source string span189* starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string190* span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars.191*192* The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`,193* and `findDestinationIndex(int32_t, UErrorCode &)` methods.194* Calling any of these methods mutates the iterator to make it point to the corresponding edit.195*196* For more information, see the documentation for {@link Edits}.197*198* @see getCoarseIterator199* @see getFineIterator200* @stable ICU 59201*/202struct U_COMMON_API Iterator U_FINAL : public UMemory {203/**204* Default constructor, empty iterator.205* @stable ICU 60206*/207Iterator() :208array(nullptr), index(0), length(0),209remaining(0), onlyChanges_(FALSE), coarse(FALSE),210dir(0), changed(FALSE), oldLength_(0), newLength_(0),211srcIndex(0), replIndex(0), destIndex(0) {}212/**213* Copy constructor.214* @stable ICU 59215*/216Iterator(const Iterator &other) = default;217/**218* Assignment operator.219* @stable ICU 59220*/221Iterator &operator=(const Iterator &other) = default;222223/**224* Advances the iterator to the next edit.225* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,226* or else the function returns immediately. Check for U_FAILURE()227* on output or use with function chaining. (See User Guide for details.)228* @return TRUE if there is another edit229* @stable ICU 59230*/231UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }232233/**234* Moves the iterator to the edit that contains the source index.235* The source index may be found in a no-change edit236* even if normal iteration would skip no-change edits.237* Normal iteration can continue from a found edit.238*239* The iterator state before this search logically does not matter.240* (It may affect the performance of the search.)241*242* The iterator state after this search is undefined243* if the source index is out of bounds for the source string.244*245* @param i source index246* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,247* or else the function returns immediately. Check for U_FAILURE()248* on output or use with function chaining. (See User Guide for details.)249* @return TRUE if the edit for the source index was found250* @stable ICU 59251*/252UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {253return findIndex(i, TRUE, errorCode) == 0;254}255256/**257* Moves the iterator to the edit that contains the destination index.258* The destination index may be found in a no-change edit259* even if normal iteration would skip no-change edits.260* Normal iteration can continue from a found edit.261*262* The iterator state before this search logically does not matter.263* (It may affect the performance of the search.)264*265* The iterator state after this search is undefined266* if the source index is out of bounds for the source string.267*268* @param i destination index269* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,270* or else the function returns immediately. Check for U_FAILURE()271* on output or use with function chaining. (See User Guide for details.)272* @return TRUE if the edit for the destination index was found273* @stable ICU 60274*/275UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {276return findIndex(i, FALSE, errorCode) == 0;277}278279/**280* Computes the destination index corresponding to the given source index.281* If the source index is inside a change edit (not at its start),282* then the destination index at the end of that edit is returned,283* since there is no information about index mapping inside a change edit.284*285* (This means that indexes to the start and middle of an edit,286* for example around a grapheme cluster, are mapped to indexes287* encompassing the entire edit.288* The alternative, mapping an interior index to the start,289* would map such an interval to an empty one.)290*291* This operation will usually but not always modify this object.292* The iterator state after this search is undefined.293*294* @param i source index295* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,296* or else the function returns immediately. Check for U_FAILURE()297* on output or use with function chaining. (See User Guide for details.)298* @return destination index; undefined if i is not 0..string length299* @stable ICU 60300*/301int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);302303/**304* Computes the source index corresponding to the given destination index.305* If the destination index is inside a change edit (not at its start),306* then the source index at the end of that edit is returned,307* since there is no information about index mapping inside a change edit.308*309* (This means that indexes to the start and middle of an edit,310* for example around a grapheme cluster, are mapped to indexes311* encompassing the entire edit.312* The alternative, mapping an interior index to the start,313* would map such an interval to an empty one.)314*315* This operation will usually but not always modify this object.316* The iterator state after this search is undefined.317*318* @param i destination index319* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,320* or else the function returns immediately. Check for U_FAILURE()321* on output or use with function chaining. (See User Guide for details.)322* @return source index; undefined if i is not 0..string length323* @stable ICU 60324*/325int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);326327/**328* Returns whether the edit currently represented by the iterator is a change edit.329*330* @return TRUE if this edit replaces oldLength() units with newLength() different ones.331* FALSE if oldLength units remain unchanged.332* @stable ICU 59333*/334UBool hasChange() const { return changed; }335336/**337* The length of the current span in the source string, which starts at {@link #sourceIndex}.338*339* @return the number of units in the original string which are replaced or remain unchanged.340* @stable ICU 59341*/342int32_t oldLength() const { return oldLength_; }343344/**345* The length of the current span in the destination string, which starts at346* {@link #destinationIndex}, or in the replacement string, which starts at347* {@link #replacementIndex}.348*349* @return the number of units in the modified string, if hasChange() is TRUE.350* Same as oldLength if hasChange() is FALSE.351* @stable ICU 59352*/353int32_t newLength() const { return newLength_; }354355/**356* The start index of the current span in the source string; the span has length357* {@link #oldLength}.358*359* @return the current index into the source string360* @stable ICU 59361*/362int32_t sourceIndex() const { return srcIndex; }363364/**365* The start index of the current span in the replacement string; the span has length366* {@link #newLength}. Well-defined only if the current edit is a change edit.367*368* The *replacement string* is the concatenation of all substrings of the destination369* string corresponding to change edits.370*371* This method is intended to be used together with operations that write only replacement372* characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option).373* The source string can then be modified in-place.374*375* @return the current index into the replacement-characters-only string,376* not counting unchanged spans377* @stable ICU 59378*/379int32_t replacementIndex() const {380// TODO: Throw an exception if we aren't in a change edit?381return replIndex;382}383384/**385* The start index of the current span in the destination string; the span has length386* {@link #newLength}.387*388* @return the current index into the full destination string389* @stable ICU 59390*/391int32_t destinationIndex() const { return destIndex; }392393#ifndef U_HIDE_INTERNAL_API394/**395* A string representation of the current edit represented by the iterator for debugging. You396* should not depend on the contents of the return string.397* @internal398*/399UnicodeString& toString(UnicodeString& appendTo) const;400#endif // U_HIDE_INTERNAL_API401402private:403friend class Edits;404405Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);406407int32_t readLength(int32_t head);408void updateNextIndexes();409void updatePreviousIndexes();410UBool noNext();411UBool next(UBool onlyChanges, UErrorCode &errorCode);412UBool previous(UErrorCode &errorCode);413/** @return -1: error or i<0; 0: found; 1: i>=string length */414int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);415416const uint16_t *array;417int32_t index, length;418// 0 if we are not within compressed equal-length changes.419// Otherwise the number of remaining changes, including the current one.420int32_t remaining;421UBool onlyChanges_, coarse;422423int8_t dir; // iteration direction: back(<0), initial(0), forward(>0)424UBool changed;425int32_t oldLength_, newLength_;426int32_t srcIndex, replIndex, destIndex;427};428429/**430* Returns an Iterator for coarse-grained change edits431* (adjacent change edits are treated as one).432* Can be used to perform simple string updates.433* Skips no-change edits.434* @return an Iterator that merges adjacent changes.435* @stable ICU 59436*/437Iterator getCoarseChangesIterator() const {438return Iterator(array, length, TRUE, TRUE);439}440441/**442* Returns an Iterator for coarse-grained change and no-change edits443* (adjacent change edits are treated as one).444* Can be used to perform simple string updates.445* Adjacent change edits are treated as one edit.446* @return an Iterator that merges adjacent changes.447* @stable ICU 59448*/449Iterator getCoarseIterator() const {450return Iterator(array, length, FALSE, TRUE);451}452453/**454* Returns an Iterator for fine-grained change edits455* (full granularity of change edits is retained).456* Can be used for modifying styled text.457* Skips no-change edits.458* @return an Iterator that separates adjacent changes.459* @stable ICU 59460*/461Iterator getFineChangesIterator() const {462return Iterator(array, length, TRUE, FALSE);463}464465/**466* Returns an Iterator for fine-grained change and no-change edits467* (full granularity of change edits is retained).468* Can be used for modifying styled text.469* @return an Iterator that separates adjacent changes.470* @stable ICU 59471*/472Iterator getFineIterator() const {473return Iterator(array, length, FALSE, FALSE);474}475476/**477* Merges the two input Edits and appends the result to this object.478*479* Consider two string transformations (for example, normalization and case mapping)480* where each records Edits in addition to writing an output string.<br>481* Edits ab reflect how substrings of input string a482* map to substrings of intermediate string b.<br>483* Edits bc reflect how substrings of intermediate string b484* map to substrings of output string c.<br>485* This function merges ab and bc such that the additional edits486* recorded in this object reflect how substrings of input string a487* map to substrings of output string c.488*489* If unrelated Edits are passed in where the output string of the first490* has a different length than the input string of the second,491* then a U_ILLEGAL_ARGUMENT_ERROR is reported.492*493* @param ab reflects how substrings of input string a494* map to substrings of intermediate string b.495* @param bc reflects how substrings of intermediate string b496* map to substrings of output string c.497* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,498* or else the function returns immediately. Check for U_FAILURE()499* on output or use with function chaining. (See User Guide for details.)500* @return *this, with the merged edits appended501* @stable ICU 60502*/503Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);504505private:506void releaseArray() U_NOEXCEPT;507Edits ©Array(const Edits &other);508Edits &moveArray(Edits &src) U_NOEXCEPT;509510void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }511int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }512513void append(int32_t r);514UBool growArray();515516static const int32_t STACK_CAPACITY = 100;517uint16_t *array;518int32_t capacity;519int32_t length;520int32_t delta;521int32_t numChanges;522UErrorCode errorCode_;523uint16_t stackArray[STACK_CAPACITY];524};525526U_NAMESPACE_END527528#endif /* U_SHOW_CPLUSPLUS_API */529530#endif // __EDITS_H__531532533