Path: blob/jdk8u272-b10-aarch32-20201026/jdk/src/share/native/common/unicode/normalizer2.h
48773 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4*5* Copyright (C) 2009-2013, International Business Machines6* Corporation and others. All Rights Reserved.7*8*******************************************************************************9* file name: normalizer2.h10* encoding: UTF-811* tab size: 8 (not used)12* indentation:413*14* created on: 2009nov2215* created by: Markus W. Scherer16*/1718#ifndef __NORMALIZER2_H__19#define __NORMALIZER2_H__2021/**22* \file23* \brief C++ API: New API for Unicode Normalization.24*/2526#include "unicode/utypes.h"2728#if !UCONFIG_NO_NORMALIZATION2930#include "unicode/stringpiece.h"31#include "unicode/uniset.h"32#include "unicode/unistr.h"33#include "unicode/unorm2.h"3435U_NAMESPACE_BEGIN3637class ByteSink;3839/**40* Unicode normalization functionality for standard Unicode normalization or41* for using custom mapping tables.42* All instances of this class are unmodifiable/immutable.43* Instances returned by getInstance() are singletons that must not be deleted by the caller.44* The Normalizer2 class is not intended for public subclassing.45*46* The primary functions are to produce a normalized string and to detect whether47* a string is already normalized.48* The most commonly used normalization forms are those defined in49* http://www.unicode.org/unicode/reports/tr15/50* However, this API supports additional normalization forms for specialized purposes.51* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)52* and can be used in implementations of UTS #46.53*54* Not only are the standard compose and decompose modes supplied,55* but additional modes are provided as documented in the Mode enum.56*57* Some of the functions in this class identify normalization boundaries.58* At a normalization boundary, the portions of the string59* before it and starting from it do not interact and can be handled independently.60*61* The spanQuickCheckYes() stops at a normalization boundary.62* When the goal is a normalized string, then the text before the boundary63* can be copied, and the remainder can be processed with normalizeSecondAndAppend().64*65* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether66* a character is guaranteed to be at a normalization boundary,67* regardless of context.68* This is used for moving from one normalization boundary to the next69* or preceding boundary, and for performing iterative normalization.70*71* Iterative normalization is useful when only a small portion of a72* longer string needs to be processed.73* For example, in ICU, iterative normalization is used by the NormalizationTransliterator74* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()75* (to process only the substring for which sort key bytes are computed).76*77* The set of normalization boundaries returned by these functions may not be78* complete: There may be more boundaries that could be returned.79* Different functions may return different boundaries.80* @stable ICU 4.481*/82class U_COMMON_API Normalizer2 : public UObject {83public:84/**85* Destructor.86* @stable ICU 4.487*/88~Normalizer2();8990/**91* Returns a Normalizer2 instance for Unicode NFC normalization.92* Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).93* Returns an unmodifiable singleton instance. Do not delete it.94* @param errorCode Standard ICU error code. Its input value must95* pass the U_SUCCESS() test, or else the function returns96* immediately. Check for U_FAILURE() on output or use with97* function chaining. (See User Guide for details.)98* @return the requested Normalizer2, if successful99* @stable ICU 49100*/101static const Normalizer2 *102getNFCInstance(UErrorCode &errorCode);103104/**105* Returns a Normalizer2 instance for Unicode NFD normalization.106* Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).107* Returns an unmodifiable singleton instance. Do not delete it.108* @param errorCode Standard ICU error code. Its input value must109* pass the U_SUCCESS() test, or else the function returns110* immediately. Check for U_FAILURE() on output or use with111* function chaining. (See User Guide for details.)112* @return the requested Normalizer2, if successful113* @stable ICU 49114*/115static const Normalizer2 *116getNFDInstance(UErrorCode &errorCode);117118/**119* Returns a Normalizer2 instance for Unicode NFKC normalization.120* Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).121* Returns an unmodifiable singleton instance. Do not delete it.122* @param errorCode Standard ICU error code. Its input value must123* pass the U_SUCCESS() test, or else the function returns124* immediately. Check for U_FAILURE() on output or use with125* function chaining. (See User Guide for details.)126* @return the requested Normalizer2, if successful127* @stable ICU 49128*/129static const Normalizer2 *130getNFKCInstance(UErrorCode &errorCode);131132/**133* Returns a Normalizer2 instance for Unicode NFKD normalization.134* Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).135* Returns an unmodifiable singleton instance. Do not delete it.136* @param errorCode Standard ICU error code. Its input value must137* pass the U_SUCCESS() test, or else the function returns138* immediately. Check for U_FAILURE() on output or use with139* function chaining. (See User Guide for details.)140* @return the requested Normalizer2, if successful141* @stable ICU 49142*/143static const Normalizer2 *144getNFKDInstance(UErrorCode &errorCode);145146/**147* Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.148* Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).149* Returns an unmodifiable singleton instance. Do not delete it.150* @param errorCode Standard ICU error code. Its input value must151* pass the U_SUCCESS() test, or else the function returns152* immediately. Check for U_FAILURE() on output or use with153* function chaining. (See User Guide for details.)154* @return the requested Normalizer2, if successful155* @stable ICU 49156*/157static const Normalizer2 *158getNFKCCasefoldInstance(UErrorCode &errorCode);159160/**161* Returns a Normalizer2 instance which uses the specified data file162* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)163* and which composes or decomposes text according to the specified mode.164* Returns an unmodifiable singleton instance. Do not delete it.165*166* Use packageName=NULL for data files that are part of ICU's own data.167* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.168* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.169* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.170*171* @param packageName NULL for ICU built-in data, otherwise application data package name172* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file173* @param mode normalization mode (compose or decompose etc.)174* @param errorCode Standard ICU error code. Its input value must175* pass the U_SUCCESS() test, or else the function returns176* immediately. Check for U_FAILURE() on output or use with177* function chaining. (See User Guide for details.)178* @return the requested Normalizer2, if successful179* @stable ICU 4.4180*/181static const Normalizer2 *182getInstance(const char *packageName,183const char *name,184UNormalization2Mode mode,185UErrorCode &errorCode);186187/**188* Returns the normalized form of the source string.189* @param src source string190* @param errorCode Standard ICU error code. Its input value must191* pass the U_SUCCESS() test, or else the function returns192* immediately. Check for U_FAILURE() on output or use with193* function chaining. (See User Guide for details.)194* @return normalized src195* @stable ICU 4.4196*/197UnicodeString198normalize(const UnicodeString &src, UErrorCode &errorCode) const {199UnicodeString result;200normalize(src, result, errorCode);201return result;202}203/**204* Writes the normalized form of the source string to the destination string205* (replacing its contents) and returns the destination string.206* The source and destination strings must be different objects.207* @param src source string208* @param dest destination string; its contents is replaced with normalized src209* @param errorCode Standard ICU error code. Its input value must210* pass the U_SUCCESS() test, or else the function returns211* immediately. Check for U_FAILURE() on output or use with212* function chaining. (See User Guide for details.)213* @return dest214* @stable ICU 4.4215*/216virtual UnicodeString &217normalize(const UnicodeString &src,218UnicodeString &dest,219UErrorCode &errorCode) const = 0;220221/**222* Normalizes a UTF-8 string and optionally records how source substrings223* relate to changed and unchanged result substrings.224*225* Currently implemented completely only for "compose" modes,226* such as for NFC, NFKC, and NFKC_Casefold227* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).228* Otherwise currently converts to & from UTF-16 and does not support edits.229*230* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.231* @param src Source UTF-8 string.232* @param sink A ByteSink to which the normalized UTF-8 result string is written.233* sink.Flush() is called at the end.234* @param edits Records edits for index mapping, working with styled text,235* and getting only changes (if any).236* The Edits contents is undefined if any error occurs.237* This function calls edits->reset() first unless238* options includes U_EDITS_NO_RESET. edits can be nullptr.239* @param errorCode Standard ICU error code. Its input value must240* pass the U_SUCCESS() test, or else the function returns241* immediately. Check for U_FAILURE() on output or use with242* function chaining. (See User Guide for details.)243* @stable ICU 60244*/245virtual void246normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,247Edits *edits, UErrorCode &errorCode) const;248249/**250* Appends the normalized form of the second string to the first string251* (merging them at the boundary) and returns the first string.252* The result is normalized if the first string was normalized.253* The first and second strings must be different objects.254* @param first string, should be normalized255* @param second string, will be normalized256* @param errorCode Standard ICU error code. Its input value must257* pass the U_SUCCESS() test, or else the function returns258* immediately. Check for U_FAILURE() on output or use with259* function chaining. (See User Guide for details.)260* @return first261* @stable ICU 4.4262*/263virtual UnicodeString &264normalizeSecondAndAppend(UnicodeString &first,265const UnicodeString &second,266UErrorCode &errorCode) const = 0;267/**268* Appends the second string to the first string269* (merging them at the boundary) and returns the first string.270* The result is normalized if both the strings were normalized.271* The first and second strings must be different objects.272* @param first string, should be normalized273* @param second string, should be normalized274* @param errorCode Standard ICU error code. Its input value must275* pass the U_SUCCESS() test, or else the function returns276* immediately. Check for U_FAILURE() on output or use with277* function chaining. (See User Guide for details.)278* @return first279* @stable ICU 4.4280*/281virtual UnicodeString &282append(UnicodeString &first,283const UnicodeString &second,284UErrorCode &errorCode) const = 0;285286/**287* Gets the decomposition mapping of c.288* Roughly equivalent to normalizing the String form of c289* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function290* returns FALSE and does not write a string291* if c does not have a decomposition mapping in this instance's data.292* This function is independent of the mode of the Normalizer2.293* @param c code point294* @param decomposition String object which will be set to c's295* decomposition mapping, if there is one.296* @return TRUE if c has a decomposition, otherwise FALSE297* @stable ICU 4.6298*/299virtual UBool300getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;301302/**303* Gets the raw decomposition mapping of c.304*305* This is similar to the getDecomposition() method but returns the306* raw decomposition mapping as specified in UnicodeData.txt or307* (for custom data) in the mapping files processed by the gennorm2 tool.308* By contrast, getDecomposition() returns the processed,309* recursively-decomposed version of this mapping.310*311* When used on a standard NFKC Normalizer2 instance,312* getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.313*314* When used on a standard NFC Normalizer2 instance,315* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);316* in this case, the result contains either one or two code points (=1..4 char16_ts).317*318* This function is independent of the mode of the Normalizer2.319* The default implementation returns FALSE.320* @param c code point321* @param decomposition String object which will be set to c's322* raw decomposition mapping, if there is one.323* @return TRUE if c has a decomposition, otherwise FALSE324* @stable ICU 49325*/326virtual UBool327getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;328329/**330* Performs pairwise composition of a & b and returns the composite if there is one.331*332* Returns a composite code point c only if c has a two-way mapping to a+b.333* In standard Unicode normalization, this means that334* c has a canonical decomposition to a+b335* and c does not have the Full_Composition_Exclusion property.336*337* This function is independent of the mode of the Normalizer2.338* The default implementation returns a negative value.339* @param a A (normalization starter) code point.340* @param b Another code point.341* @return The non-negative composite code point if there is one; otherwise a negative value.342* @stable ICU 49343*/344virtual UChar32345composePair(UChar32 a, UChar32 b) const;346347/**348* Gets the combining class of c.349* The default implementation returns 0350* but all standard implementations return the Unicode Canonical_Combining_Class value.351* @param c code point352* @return c's combining class353* @stable ICU 49354*/355virtual uint8_t356getCombiningClass(UChar32 c) const;357358/**359* Tests if the string is normalized.360* Internally, in cases where the quickCheck() method would return "maybe"361* (which is only possible for the two COMPOSE modes) this method362* resolves to "yes" or "no" to provide a definitive result,363* at the cost of doing more work in those cases.364* @param s input string365* @param errorCode Standard ICU error code. Its input value must366* pass the U_SUCCESS() test, or else the function returns367* immediately. Check for U_FAILURE() on output or use with368* function chaining. (See User Guide for details.)369* @return TRUE if s is normalized370* @stable ICU 4.4371*/372virtual UBool373isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;374/**375* Tests if the UTF-8 string is normalized.376* Internally, in cases where the quickCheck() method would return "maybe"377* (which is only possible for the two COMPOSE modes) this method378* resolves to "yes" or "no" to provide a definitive result,379* at the cost of doing more work in those cases.380*381* This works for all normalization modes,382* but it is currently optimized for UTF-8 only for "compose" modes,383* such as for NFC, NFKC, and NFKC_Casefold384* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).385* For other modes it currently converts to UTF-16 and calls isNormalized().386*387* @param s UTF-8 input string388* @param errorCode Standard ICU error code. Its input value must389* pass the U_SUCCESS() test, or else the function returns390* immediately. Check for U_FAILURE() on output or use with391* function chaining. (See User Guide for details.)392* @return TRUE if s is normalized393* @stable ICU 60394*/395virtual UBool396isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;397398399/**400* Tests if the string is normalized.401* For the two COMPOSE modes, the result could be "maybe" in cases that402* would take a little more work to resolve definitively.403* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster404* combination of quick check + normalization, to avoid405* re-checking the "yes" prefix.406* @param s input string407* @param errorCode Standard ICU error code. Its input value must408* pass the U_SUCCESS() test, or else the function returns409* immediately. Check for U_FAILURE() on output or use with410* function chaining. (See User Guide for details.)411* @return UNormalizationCheckResult412* @stable ICU 4.4413*/414virtual UNormalizationCheckResult415quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;416417/**418* Returns the end of the normalized substring of the input string.419* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>420* the substring <code>UnicodeString(s, 0, end)</code>421* will pass the quick check with a "yes" result.422*423* The returned end index is usually one or more characters before the424* "no" or "maybe" character: The end index is at a normalization boundary.425* (See the class documentation for more about normalization boundaries.)426*427* When the goal is a normalized string and most input strings are expected428* to be normalized already, then call this method,429* and if it returns a prefix shorter than the input string,430* copy that prefix and use normalizeSecondAndAppend() for the remainder.431* @param s input string432* @param errorCode Standard ICU error code. Its input value must433* pass the U_SUCCESS() test, or else the function returns434* immediately. Check for U_FAILURE() on output or use with435* function chaining. (See User Guide for details.)436* @return "yes" span end index437* @stable ICU 4.4438*/439virtual int32_t440spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;441442/**443* Tests if the character always has a normalization boundary before it,444* regardless of context.445* If true, then the character does not normalization-interact with446* preceding characters.447* In other words, a string containing this character can be normalized448* by processing portions before this character and starting from this449* character independently.450* This is used for iterative normalization. See the class documentation for details.451* @param c character to test452* @return TRUE if c has a normalization boundary before it453* @stable ICU 4.4454*/455virtual UBool hasBoundaryBefore(UChar32 c) const = 0;456457/**458* Tests if the character always has a normalization boundary after it,459* regardless of context.460* If true, then the character does not normalization-interact with461* following characters.462* In other words, a string containing this character can be normalized463* by processing portions up to this character and after this464* character independently.465* This is used for iterative normalization. See the class documentation for details.466* Note that this operation may be significantly slower than hasBoundaryBefore().467* @param c character to test468* @return TRUE if c has a normalization boundary after it469* @stable ICU 4.4470*/471virtual UBool hasBoundaryAfter(UChar32 c) const = 0;472473/**474* Tests if the character is normalization-inert.475* If true, then the character does not change, nor normalization-interact with476* preceding or following characters.477* In other words, a string containing this character can be normalized478* by processing portions before this character and after this479* character independently.480* This is used for iterative normalization. See the class documentation for details.481* Note that this operation may be significantly slower than hasBoundaryBefore().482* @param c character to test483* @return TRUE if c is normalization-inert484* @stable ICU 4.4485*/486virtual UBool isInert(UChar32 c) const = 0;487};488489/**490* Normalization filtered by a UnicodeSet.491* Normalizes portions of the text contained in the filter set and leaves492* portions not contained in the filter set unchanged.493* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).494* Not-in-the-filter text is treated as "is normalized" and "quick check yes".495* This class implements all of (and only) the Normalizer2 API.496* An instance of this class is unmodifiable/immutable but is constructed and497* must be destructed by the owner.498* @stable ICU 4.4499*/500class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {501public:502/**503* Constructs a filtered normalizer wrapping any Normalizer2 instance504* and a filter set.505* Both are aliased and must not be modified or deleted while this object506* is used.507* The filter set should be frozen; otherwise the performance will suffer greatly.508* @param n2 wrapped Normalizer2 instance509* @param filterSet UnicodeSet which determines the characters to be normalized510* @stable ICU 4.4511*/512FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :513norm2(n2), set(filterSet) {}514515/**516* Destructor.517* @stable ICU 4.4518*/519~FilteredNormalizer2();520521/**522* Writes the normalized form of the source string to the destination string523* (replacing its contents) and returns the destination string.524* The source and destination strings must be different objects.525* @param src source string526* @param dest destination string; its contents is replaced with normalized src527* @param errorCode Standard ICU error code. Its input value must528* pass the U_SUCCESS() test, or else the function returns529* immediately. Check for U_FAILURE() on output or use with530* function chaining. (See User Guide for details.)531* @return dest532* @stable ICU 4.4533*/534virtual UnicodeString &535normalize(const UnicodeString &src,536UnicodeString &dest,537UErrorCode &errorCode) const U_OVERRIDE;538539/**540* Normalizes a UTF-8 string and optionally records how source substrings541* relate to changed and unchanged result substrings.542*543* Currently implemented completely only for "compose" modes,544* such as for NFC, NFKC, and NFKC_Casefold545* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).546* Otherwise currently converts to & from UTF-16 and does not support edits.547*548* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.549* @param src Source UTF-8 string.550* @param sink A ByteSink to which the normalized UTF-8 result string is written.551* sink.Flush() is called at the end.552* @param edits Records edits for index mapping, working with styled text,553* and getting only changes (if any).554* The Edits contents is undefined if any error occurs.555* This function calls edits->reset() first unless556* options includes U_EDITS_NO_RESET. edits can be nullptr.557* @param errorCode Standard ICU error code. Its input value must558* pass the U_SUCCESS() test, or else the function returns559* immediately. Check for U_FAILURE() on output or use with560* function chaining. (See User Guide for details.)561* @stable ICU 60562*/563virtual void564normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,565Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;566567/**568* Appends the normalized form of the second string to the first string569* (merging them at the boundary) and returns the first string.570* The result is normalized if the first string was normalized.571* The first and second strings must be different objects.572* @param first string, should be normalized573* @param second string, will be normalized574* @param errorCode Standard ICU error code. Its input value must575* pass the U_SUCCESS() test, or else the function returns576* immediately. Check for U_FAILURE() on output or use with577* function chaining. (See User Guide for details.)578* @return first579* @stable ICU 4.4580*/581virtual UnicodeString &582normalizeSecondAndAppend(UnicodeString &first,583const UnicodeString &second,584UErrorCode &errorCode) const U_OVERRIDE;585/**586* Appends the second string to the first string587* (merging them at the boundary) and returns the first string.588* The result is normalized if both the strings were normalized.589* The first and second strings must be different objects.590* @param first string, should be normalized591* @param second string, should be normalized592* @param errorCode Standard ICU error code. Its input value must593* pass the U_SUCCESS() test, or else the function returns594* immediately. Check for U_FAILURE() on output or use with595* function chaining. (See User Guide for details.)596* @return first597* @stable ICU 4.4598*/599virtual UnicodeString &600append(UnicodeString &first,601const UnicodeString &second,602UErrorCode &errorCode) const U_OVERRIDE;603604/**605* Gets the decomposition mapping of c.606* For details see the base class documentation.607*608* This function is independent of the mode of the Normalizer2.609* @param c code point610* @param decomposition String object which will be set to c's611* decomposition mapping, if there is one.612* @return TRUE if c has a decomposition, otherwise FALSE613* @stable ICU 4.6614*/615virtual UBool616getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;617618/**619* Gets the raw decomposition mapping of c.620* For details see the base class documentation.621*622* This function is independent of the mode of the Normalizer2.623* @param c code point624* @param decomposition String object which will be set to c's625* raw decomposition mapping, if there is one.626* @return TRUE if c has a decomposition, otherwise FALSE627* @stable ICU 49628*/629virtual UBool630getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;631632/**633* Performs pairwise composition of a & b and returns the composite if there is one.634* For details see the base class documentation.635*636* This function is independent of the mode of the Normalizer2.637* @param a A (normalization starter) code point.638* @param b Another code point.639* @return The non-negative composite code point if there is one; otherwise a negative value.640* @stable ICU 49641*/642virtual UChar32643composePair(UChar32 a, UChar32 b) const U_OVERRIDE;644645/**646* Gets the combining class of c.647* The default implementation returns 0648* but all standard implementations return the Unicode Canonical_Combining_Class value.649* @param c code point650* @return c's combining class651* @stable ICU 49652*/653virtual uint8_t654getCombiningClass(UChar32 c) const U_OVERRIDE;655656/**657* Tests if the string is normalized.658* For details see the Normalizer2 base class documentation.659* @param s input string660* @param errorCode Standard ICU error code. Its input value must661* pass the U_SUCCESS() test, or else the function returns662* immediately. Check for U_FAILURE() on output or use with663* function chaining. (See User Guide for details.)664* @return TRUE if s is normalized665* @stable ICU 4.4666*/667virtual UBool668isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;669/**670* Tests if the UTF-8 string is normalized.671* Internally, in cases where the quickCheck() method would return "maybe"672* (which is only possible for the two COMPOSE modes) this method673* resolves to "yes" or "no" to provide a definitive result,674* at the cost of doing more work in those cases.675*676* This works for all normalization modes,677* but it is currently optimized for UTF-8 only for "compose" modes,678* such as for NFC, NFKC, and NFKC_Casefold679* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).680* For other modes it currently converts to UTF-16 and calls isNormalized().681*682* @param s UTF-8 input string683* @param errorCode Standard ICU error code. Its input value must684* pass the U_SUCCESS() test, or else the function returns685* immediately. Check for U_FAILURE() on output or use with686* function chaining. (See User Guide for details.)687* @return TRUE if s is normalized688* @stable ICU 60689*/690virtual UBool691isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;692/**693* Tests if the string is normalized.694* For details see the Normalizer2 base class documentation.695* @param s input string696* @param errorCode Standard ICU error code. Its input value must697* pass the U_SUCCESS() test, or else the function returns698* immediately. Check for U_FAILURE() on output or use with699* function chaining. (See User Guide for details.)700* @return UNormalizationCheckResult701* @stable ICU 4.4702*/703virtual UNormalizationCheckResult704quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;705/**706* Returns the end of the normalized substring of the input string.707* For details see the Normalizer2 base class documentation.708* @param s input string709* @param errorCode Standard ICU error code. Its input value must710* pass the U_SUCCESS() test, or else the function returns711* immediately. Check for U_FAILURE() on output or use with712* function chaining. (See User Guide for details.)713* @return "yes" span end index714* @stable ICU 4.4715*/716virtual int32_t717spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;718719/**720* Tests if the character always has a normalization boundary before it,721* regardless of context.722* For details see the Normalizer2 base class documentation.723* @param c character to test724* @return TRUE if c has a normalization boundary before it725* @stable ICU 4.4726*/727virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;728729/**730* Tests if the character always has a normalization boundary after it,731* regardless of context.732* For details see the Normalizer2 base class documentation.733* @param c character to test734* @return TRUE if c has a normalization boundary after it735* @stable ICU 4.4736*/737virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;738739/**740* Tests if the character is normalization-inert.741* For details see the Normalizer2 base class documentation.742* @param c character to test743* @return TRUE if c is normalization-inert744* @stable ICU 4.4745*/746virtual UBool isInert(UChar32 c) const U_OVERRIDE;747private:748UnicodeString &749normalize(const UnicodeString &src,750UnicodeString &dest,751USetSpanCondition spanCondition,752UErrorCode &errorCode) const;753754void755normalizeUTF8(uint32_t options, const char *src, int32_t length,756ByteSink &sink, Edits *edits,757USetSpanCondition spanCondition,758UErrorCode &errorCode) const;759760UnicodeString &761normalizeSecondAndAppend(UnicodeString &first,762const UnicodeString &second,763UBool doNormalize,764UErrorCode &errorCode) const;765766const Normalizer2 &norm2;767const UnicodeSet &set;768};769770U_NAMESPACE_END771772#endif // !UCONFIG_NO_NORMALIZATION773#endif // __NORMALIZER2_H__774775776