Path: blob/aarch64-shenandoah-jdk8u272-b10/jdk/src/share/native/common/unicode/normalizer2.h
38827 views
/*1*******************************************************************************2*3* Copyright (C) 2009-2013, International Business Machines4* Corporation and others. All Rights Reserved.5*6*******************************************************************************7* file name: normalizer2.h8* encoding: US-ASCII9* tab size: 8 (not used)10* indentation:411*12* created on: 2009nov2213* created by: Markus W. Scherer14*/1516#ifndef __NORMALIZER2_H__17#define __NORMALIZER2_H__1819/**20* \file21* \brief C++ API: New API for Unicode Normalization.22*/2324#include "unicode/utypes.h"2526#if !UCONFIG_NO_NORMALIZATION2728#include "unicode/uniset.h"29#include "unicode/unistr.h"30#include "unicode/unorm2.h"3132U_NAMESPACE_BEGIN3334/**35* Unicode normalization functionality for standard Unicode normalization or36* for using custom mapping tables.37* All instances of this class are unmodifiable/immutable.38* Instances returned by getInstance() are singletons that must not be deleted by the caller.39* The Normalizer2 class is not intended for public subclassing.40*41* The primary functions are to produce a normalized string and to detect whether42* a string is already normalized.43* The most commonly used normalization forms are those defined in44* http://www.unicode.org/unicode/reports/tr15/45* However, this API supports additional normalization forms for specialized purposes.46* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)47* and can be used in implementations of UTS #46.48*49* Not only are the standard compose and decompose modes supplied,50* but additional modes are provided as documented in the Mode enum.51*52* Some of the functions in this class identify normalization boundaries.53* At a normalization boundary, the portions of the string54* before it and starting from it do not interact and can be handled independently.55*56* The spanQuickCheckYes() stops at a normalization boundary.57* When the goal is a normalized string, then the text before the boundary58* can be copied, and the remainder can be processed with normalizeSecondAndAppend().59*60* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether61* a character is guaranteed to be at a normalization boundary,62* regardless of context.63* This is used for moving from one normalization boundary to the next64* or preceding boundary, and for performing iterative normalization.65*66* Iterative normalization is useful when only a small portion of a67* longer string needs to be processed.68* For example, in ICU, iterative normalization is used by the NormalizationTransliterator69* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()70* (to process only the substring for which sort key bytes are computed).71*72* The set of normalization boundaries returned by these functions may not be73* complete: There may be more boundaries that could be returned.74* Different functions may return different boundaries.75* @stable ICU 4.476*/77class U_COMMON_API Normalizer2 : public UObject {78public:79/**80* Destructor.81* @stable ICU 4.482*/83~Normalizer2();8485/**86* Returns a Normalizer2 instance for Unicode NFC normalization.87* Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).88* Returns an unmodifiable singleton instance. Do not delete it.89* @param errorCode Standard ICU error code. Its input value must90* pass the U_SUCCESS() test, or else the function returns91* immediately. Check for U_FAILURE() on output or use with92* function chaining. (See User Guide for details.)93* @return the requested Normalizer2, if successful94* @stable ICU 4995*/96static const Normalizer2 *97getNFCInstance(UErrorCode &errorCode);9899/**100* Returns a Normalizer2 instance for Unicode NFD normalization.101* Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).102* Returns an unmodifiable singleton instance. Do not delete it.103* @param errorCode Standard ICU error code. Its input value must104* pass the U_SUCCESS() test, or else the function returns105* immediately. Check for U_FAILURE() on output or use with106* function chaining. (See User Guide for details.)107* @return the requested Normalizer2, if successful108* @stable ICU 49109*/110static const Normalizer2 *111getNFDInstance(UErrorCode &errorCode);112113/**114* Returns a Normalizer2 instance for Unicode NFKC normalization.115* Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).116* Returns an unmodifiable singleton instance. Do not delete it.117* @param errorCode Standard ICU error code. Its input value must118* pass the U_SUCCESS() test, or else the function returns119* immediately. Check for U_FAILURE() on output or use with120* function chaining. (See User Guide for details.)121* @return the requested Normalizer2, if successful122* @stable ICU 49123*/124static const Normalizer2 *125getNFKCInstance(UErrorCode &errorCode);126127/**128* Returns a Normalizer2 instance for Unicode NFKD normalization.129* Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).130* Returns an unmodifiable singleton instance. Do not delete it.131* @param errorCode Standard ICU error code. Its input value must132* pass the U_SUCCESS() test, or else the function returns133* immediately. Check for U_FAILURE() on output or use with134* function chaining. (See User Guide for details.)135* @return the requested Normalizer2, if successful136* @stable ICU 49137*/138static const Normalizer2 *139getNFKDInstance(UErrorCode &errorCode);140141/**142* Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.143* Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).144* Returns an unmodifiable singleton instance. Do not delete it.145* @param errorCode Standard ICU error code. Its input value must146* pass the U_SUCCESS() test, or else the function returns147* immediately. Check for U_FAILURE() on output or use with148* function chaining. (See User Guide for details.)149* @return the requested Normalizer2, if successful150* @stable ICU 49151*/152static const Normalizer2 *153getNFKCCasefoldInstance(UErrorCode &errorCode);154155/**156* Returns a Normalizer2 instance which uses the specified data file157* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)158* and which composes or decomposes text according to the specified mode.159* Returns an unmodifiable singleton instance. Do not delete it.160*161* Use packageName=NULL for data files that are part of ICU's own data.162* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.163* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.164* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.165*166* @param packageName NULL for ICU built-in data, otherwise application data package name167* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file168* @param mode normalization mode (compose or decompose etc.)169* @param errorCode Standard ICU error code. Its input value must170* pass the U_SUCCESS() test, or else the function returns171* immediately. Check for U_FAILURE() on output or use with172* function chaining. (See User Guide for details.)173* @return the requested Normalizer2, if successful174* @stable ICU 4.4175*/176static const Normalizer2 *177getInstance(const char *packageName,178const char *name,179UNormalization2Mode mode,180UErrorCode &errorCode);181182/**183* Returns the normalized form of the source string.184* @param src source string185* @param errorCode Standard ICU error code. Its input value must186* pass the U_SUCCESS() test, or else the function returns187* immediately. Check for U_FAILURE() on output or use with188* function chaining. (See User Guide for details.)189* @return normalized src190* @stable ICU 4.4191*/192UnicodeString193normalize(const UnicodeString &src, UErrorCode &errorCode) const {194UnicodeString result;195normalize(src, result, errorCode);196return result;197}198/**199* Writes the normalized form of the source string to the destination string200* (replacing its contents) and returns the destination string.201* The source and destination strings must be different objects.202* @param src source string203* @param dest destination string; its contents is replaced with normalized src204* @param errorCode Standard ICU error code. Its input value must205* pass the U_SUCCESS() test, or else the function returns206* immediately. Check for U_FAILURE() on output or use with207* function chaining. (See User Guide for details.)208* @return dest209* @stable ICU 4.4210*/211virtual UnicodeString &212normalize(const UnicodeString &src,213UnicodeString &dest,214UErrorCode &errorCode) const = 0;215/**216* Appends the normalized form of the second string to the first string217* (merging them at the boundary) and returns the first string.218* The result is normalized if the first string was normalized.219* The first and second strings must be different objects.220* @param first string, should be normalized221* @param second string, will be normalized222* @param errorCode Standard ICU error code. Its input value must223* pass the U_SUCCESS() test, or else the function returns224* immediately. Check for U_FAILURE() on output or use with225* function chaining. (See User Guide for details.)226* @return first227* @stable ICU 4.4228*/229virtual UnicodeString &230normalizeSecondAndAppend(UnicodeString &first,231const UnicodeString &second,232UErrorCode &errorCode) const = 0;233/**234* Appends the second string to the first string235* (merging them at the boundary) and returns the first string.236* The result is normalized if both the strings were normalized.237* The first and second strings must be different objects.238* @param first string, should be normalized239* @param second string, should be normalized240* @param errorCode Standard ICU error code. Its input value must241* pass the U_SUCCESS() test, or else the function returns242* immediately. Check for U_FAILURE() on output or use with243* function chaining. (See User Guide for details.)244* @return first245* @stable ICU 4.4246*/247virtual UnicodeString &248append(UnicodeString &first,249const UnicodeString &second,250UErrorCode &errorCode) const = 0;251252/**253* Gets the decomposition mapping of c.254* Roughly equivalent to normalizing the String form of c255* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function256* returns FALSE and does not write a string257* if c does not have a decomposition mapping in this instance's data.258* This function is independent of the mode of the Normalizer2.259* @param c code point260* @param decomposition String object which will be set to c's261* decomposition mapping, if there is one.262* @return TRUE if c has a decomposition, otherwise FALSE263* @stable ICU 4.6264*/265virtual UBool266getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;267268/**269* Gets the raw decomposition mapping of c.270*271* This is similar to the getDecomposition() method but returns the272* raw decomposition mapping as specified in UnicodeData.txt or273* (for custom data) in the mapping files processed by the gennorm2 tool.274* By contrast, getDecomposition() returns the processed,275* recursively-decomposed version of this mapping.276*277* When used on a standard NFKC Normalizer2 instance,278* getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.279*280* When used on a standard NFC Normalizer2 instance,281* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);282* in this case, the result contains either one or two code points (=1..4 UChars).283*284* This function is independent of the mode of the Normalizer2.285* The default implementation returns FALSE.286* @param c code point287* @param decomposition String object which will be set to c's288* raw decomposition mapping, if there is one.289* @return TRUE if c has a decomposition, otherwise FALSE290* @stable ICU 49291*/292virtual UBool293getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;294295/**296* Performs pairwise composition of a & b and returns the composite if there is one.297*298* Returns a composite code point c only if c has a two-way mapping to a+b.299* In standard Unicode normalization, this means that300* c has a canonical decomposition to a+b301* and c does not have the Full_Composition_Exclusion property.302*303* This function is independent of the mode of the Normalizer2.304* The default implementation returns a negative value.305* @param a A (normalization starter) code point.306* @param b Another code point.307* @return The non-negative composite code point if there is one; otherwise a negative value.308* @stable ICU 49309*/310virtual UChar32311composePair(UChar32 a, UChar32 b) const;312313/**314* Gets the combining class of c.315* The default implementation returns 0316* but all standard implementations return the Unicode Canonical_Combining_Class value.317* @param c code point318* @return c's combining class319* @stable ICU 49320*/321virtual uint8_t322getCombiningClass(UChar32 c) const;323324/**325* Tests if the string is normalized.326* Internally, in cases where the quickCheck() method would return "maybe"327* (which is only possible for the two COMPOSE modes) this method328* resolves to "yes" or "no" to provide a definitive result,329* at the cost of doing more work in those cases.330* @param s input string331* @param errorCode Standard ICU error code. Its input value must332* pass the U_SUCCESS() test, or else the function returns333* immediately. Check for U_FAILURE() on output or use with334* function chaining. (See User Guide for details.)335* @return TRUE if s is normalized336* @stable ICU 4.4337*/338virtual UBool339isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;340341/**342* Tests if the string is normalized.343* For the two COMPOSE modes, the result could be "maybe" in cases that344* would take a little more work to resolve definitively.345* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster346* combination of quick check + normalization, to avoid347* re-checking the "yes" prefix.348* @param s input string349* @param errorCode Standard ICU error code. Its input value must350* pass the U_SUCCESS() test, or else the function returns351* immediately. Check for U_FAILURE() on output or use with352* function chaining. (See User Guide for details.)353* @return UNormalizationCheckResult354* @stable ICU 4.4355*/356virtual UNormalizationCheckResult357quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;358359/**360* Returns the end of the normalized substring of the input string.361* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>362* the substring <code>UnicodeString(s, 0, end)</code>363* will pass the quick check with a "yes" result.364*365* The returned end index is usually one or more characters before the366* "no" or "maybe" character: The end index is at a normalization boundary.367* (See the class documentation for more about normalization boundaries.)368*369* When the goal is a normalized string and most input strings are expected370* to be normalized already, then call this method,371* and if it returns a prefix shorter than the input string,372* copy that prefix and use normalizeSecondAndAppend() for the remainder.373* @param s input string374* @param errorCode Standard ICU error code. Its input value must375* pass the U_SUCCESS() test, or else the function returns376* immediately. Check for U_FAILURE() on output or use with377* function chaining. (See User Guide for details.)378* @return "yes" span end index379* @stable ICU 4.4380*/381virtual int32_t382spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;383384/**385* Tests if the character always has a normalization boundary before it,386* regardless of context.387* If true, then the character does not normalization-interact with388* preceding characters.389* In other words, a string containing this character can be normalized390* by processing portions before this character and starting from this391* character independently.392* This is used for iterative normalization. See the class documentation for details.393* @param c character to test394* @return TRUE if c has a normalization boundary before it395* @stable ICU 4.4396*/397virtual UBool hasBoundaryBefore(UChar32 c) const = 0;398399/**400* Tests if the character always has a normalization boundary after it,401* regardless of context.402* If true, then the character does not normalization-interact with403* following characters.404* In other words, a string containing this character can be normalized405* by processing portions up to this character and after this406* character independently.407* This is used for iterative normalization. See the class documentation for details.408* Note that this operation may be significantly slower than hasBoundaryBefore().409* @param c character to test410* @return TRUE if c has a normalization boundary after it411* @stable ICU 4.4412*/413virtual UBool hasBoundaryAfter(UChar32 c) const = 0;414415/**416* Tests if the character is normalization-inert.417* If true, then the character does not change, nor normalization-interact with418* preceding or following characters.419* In other words, a string containing this character can be normalized420* by processing portions before this character and after this421* character independently.422* This is used for iterative normalization. See the class documentation for details.423* Note that this operation may be significantly slower than hasBoundaryBefore().424* @param c character to test425* @return TRUE if c is normalization-inert426* @stable ICU 4.4427*/428virtual UBool isInert(UChar32 c) const = 0;429};430431/**432* Normalization filtered by a UnicodeSet.433* Normalizes portions of the text contained in the filter set and leaves434* portions not contained in the filter set unchanged.435* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).436* Not-in-the-filter text is treated as "is normalized" and "quick check yes".437* This class implements all of (and only) the Normalizer2 API.438* An instance of this class is unmodifiable/immutable but is constructed and439* must be destructed by the owner.440* @stable ICU 4.4441*/442class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {443public:444/**445* Constructs a filtered normalizer wrapping any Normalizer2 instance446* and a filter set.447* Both are aliased and must not be modified or deleted while this object448* is used.449* The filter set should be frozen; otherwise the performance will suffer greatly.450* @param n2 wrapped Normalizer2 instance451* @param filterSet UnicodeSet which determines the characters to be normalized452* @stable ICU 4.4453*/454FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :455norm2(n2), set(filterSet) {}456457/**458* Destructor.459* @stable ICU 4.4460*/461~FilteredNormalizer2();462463/**464* Writes the normalized form of the source string to the destination string465* (replacing its contents) and returns the destination string.466* The source and destination strings must be different objects.467* @param src source string468* @param dest destination string; its contents is replaced with normalized src469* @param errorCode Standard ICU error code. Its input value must470* pass the U_SUCCESS() test, or else the function returns471* immediately. Check for U_FAILURE() on output or use with472* function chaining. (See User Guide for details.)473* @return dest474* @stable ICU 4.4475*/476virtual UnicodeString &477normalize(const UnicodeString &src,478UnicodeString &dest,479UErrorCode &errorCode) const;480/**481* Appends the normalized form of the second string to the first string482* (merging them at the boundary) and returns the first string.483* The result is normalized if the first string was normalized.484* The first and second strings must be different objects.485* @param first string, should be normalized486* @param second string, will be normalized487* @param errorCode Standard ICU error code. Its input value must488* pass the U_SUCCESS() test, or else the function returns489* immediately. Check for U_FAILURE() on output or use with490* function chaining. (See User Guide for details.)491* @return first492* @stable ICU 4.4493*/494virtual UnicodeString &495normalizeSecondAndAppend(UnicodeString &first,496const UnicodeString &second,497UErrorCode &errorCode) const;498/**499* Appends the second string to the first string500* (merging them at the boundary) and returns the first string.501* The result is normalized if both the strings were normalized.502* The first and second strings must be different objects.503* @param first string, should be normalized504* @param second string, should be normalized505* @param errorCode Standard ICU error code. Its input value must506* pass the U_SUCCESS() test, or else the function returns507* immediately. Check for U_FAILURE() on output or use with508* function chaining. (See User Guide for details.)509* @return first510* @stable ICU 4.4511*/512virtual UnicodeString &513append(UnicodeString &first,514const UnicodeString &second,515UErrorCode &errorCode) const;516517/**518* Gets the decomposition mapping of c.519* For details see the base class documentation.520*521* This function is independent of the mode of the Normalizer2.522* @param c code point523* @param decomposition String object which will be set to c's524* decomposition mapping, if there is one.525* @return TRUE if c has a decomposition, otherwise FALSE526* @stable ICU 4.6527*/528virtual UBool529getDecomposition(UChar32 c, UnicodeString &decomposition) const;530531/**532* Gets the raw decomposition mapping of c.533* For details see the base class documentation.534*535* This function is independent of the mode of the Normalizer2.536* @param c code point537* @param decomposition String object which will be set to c's538* raw decomposition mapping, if there is one.539* @return TRUE if c has a decomposition, otherwise FALSE540* @stable ICU 49541*/542virtual UBool543getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;544545/**546* Performs pairwise composition of a & b and returns the composite if there is one.547* For details see the base class documentation.548*549* This function is independent of the mode of the Normalizer2.550* @param a A (normalization starter) code point.551* @param b Another code point.552* @return The non-negative composite code point if there is one; otherwise a negative value.553* @stable ICU 49554*/555virtual UChar32556composePair(UChar32 a, UChar32 b) const;557558/**559* Gets the combining class of c.560* The default implementation returns 0561* but all standard implementations return the Unicode Canonical_Combining_Class value.562* @param c code point563* @return c's combining class564* @stable ICU 49565*/566virtual uint8_t567getCombiningClass(UChar32 c) const;568569/**570* Tests if the string is normalized.571* For details see the Normalizer2 base class documentation.572* @param s input string573* @param errorCode Standard ICU error code. Its input value must574* pass the U_SUCCESS() test, or else the function returns575* immediately. Check for U_FAILURE() on output or use with576* function chaining. (See User Guide for details.)577* @return TRUE if s is normalized578* @stable ICU 4.4579*/580virtual UBool581isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;582/**583* Tests if the string is normalized.584* For details see the Normalizer2 base class documentation.585* @param s input string586* @param errorCode Standard ICU error code. Its input value must587* pass the U_SUCCESS() test, or else the function returns588* immediately. Check for U_FAILURE() on output or use with589* function chaining. (See User Guide for details.)590* @return UNormalizationCheckResult591* @stable ICU 4.4592*/593virtual UNormalizationCheckResult594quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;595/**596* Returns the end of the normalized substring of the input string.597* For details see the Normalizer2 base class documentation.598* @param s input string599* @param errorCode Standard ICU error code. Its input value must600* pass the U_SUCCESS() test, or else the function returns601* immediately. Check for U_FAILURE() on output or use with602* function chaining. (See User Guide for details.)603* @return "yes" span end index604* @stable ICU 4.4605*/606virtual int32_t607spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;608609/**610* Tests if the character always has a normalization boundary before it,611* regardless of context.612* For details see the Normalizer2 base class documentation.613* @param c character to test614* @return TRUE if c has a normalization boundary before it615* @stable ICU 4.4616*/617virtual UBool hasBoundaryBefore(UChar32 c) const;618619/**620* Tests if the character always has a normalization boundary after it,621* regardless of context.622* For details see the Normalizer2 base class documentation.623* @param c character to test624* @return TRUE if c has a normalization boundary after it625* @stable ICU 4.4626*/627virtual UBool hasBoundaryAfter(UChar32 c) const;628629/**630* Tests if the character is normalization-inert.631* For details see the Normalizer2 base class documentation.632* @param c character to test633* @return TRUE if c is normalization-inert634* @stable ICU 4.4635*/636virtual UBool isInert(UChar32 c) const;637private:638UnicodeString &639normalize(const UnicodeString &src,640UnicodeString &dest,641USetSpanCondition spanCondition,642UErrorCode &errorCode) const;643644UnicodeString &645normalizeSecondAndAppend(UnicodeString &first,646const UnicodeString &second,647UBool doNormalize,648UErrorCode &errorCode) const;649650const Normalizer2 &norm2;651const UnicodeSet &set;652};653654U_NAMESPACE_END655656#endif // !UCONFIG_NO_NORMALIZATION657#endif // __NORMALIZER2_H__658659660