// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2012-2014, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* collationfcd.h8*9* created on: 2012aug1810* created by: Markus W. Scherer11*/1213#ifndef __COLLATIONFCD_H__14#define __COLLATIONFCD_H__1516#include "unicode/utypes.h"1718#if !UCONFIG_NO_COLLATION1920#include "unicode/utf16.h"2122U_NAMESPACE_BEGIN2324/**25* Data and functions for the FCD check fast path.26*27* The fast path looks at a pair of 16-bit code units and checks28* whether there is an FCD boundary between them;29* there is if the first unit has a trailing ccc=0 (!hasTccc(first))30* or the second unit has a leading ccc=0 (!hasLccc(second)),31* or both.32* When the fast path finds a possible non-boundary,33* then the FCD check slow path looks at the actual sequence of FCD values.34*35* This is a pure optimization.36* The fast path must at least find all possible non-boundaries.37* If the fast path is too pessimistic, it costs performance.38*39* For a pair of BMP characters, the fast path tests are precise (1 bit per character).40*41* For a supplementary code point, the two units are its lead and trail surrogates.42* We set hasTccc(lead)=true if any of its 1024 associated supplementary code points43* has lccc!=0 or tccc!=0.44* We set hasLccc(trail)=true for all trail surrogates.45* As a result, we leave the fast path if the lead surrogate might start a46* supplementary code point that is not FCD-inert.47* (So the fast path need not detect that there is a surrogate pair,48* nor look ahead to the next full code point.)49*50* hasLccc(lead)=true if any of its 1024 associated supplementary code points51* has lccc!=0, for fast boundary checking between BMP & supplementary.52*53* hasTccc(trail)=false:54* It should only be tested for unpaired trail surrogates which are FCD-inert.55*/56class U_I18N_API CollationFCD {57public:58static inline UBool hasLccc(UChar32 c) {59// assert c <= 0xffff60// c can be negative, e.g., U_SENTINEL from UCharIterator;61// that is handled in the first test.62int32_t i;63return64// U+0300 is the first character with lccc!=0.65c >= 0x300 &&66(i = lcccIndex[c >> 5]) != 0 &&67(lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;68}6970static inline UBool hasTccc(UChar32 c) {71// assert c <= 0xffff72// c can be negative, e.g., U_SENTINEL from UCharIterator;73// that is handled in the first test.74int32_t i;75return76// U+00C0 is the first character with tccc!=0.77c >= 0xc0 &&78(i = tcccIndex[c >> 5]) != 0 &&79(tcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;80}8182static inline UBool mayHaveLccc(UChar32 c) {83// Handles all of Unicode 0..10FFFF.84// c can be negative, e.g., U_SENTINEL.85// U+0300 is the first character with lccc!=0.86if(c < 0x300) { return false; }87if(c > 0xffff) { c = U16_LEAD(c); }88int32_t i;89return90(i = lcccIndex[c >> 5]) != 0 &&91(lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;92}9394/**95* Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)96* must be decomposed before reaching the core collation code,97* or else some sequences including them, even ones passing the FCD check,98* do not yield canonically equivalent results.99*100* This is a fast and imprecise test.101*102* @param c a code point103* @return true if c is U+0F73, U+0F75 or U+0F81 or one of several other Tibetan characters104*/105static inline UBool maybeTibetanCompositeVowel(UChar32 c) {106return (c & 0x1fff01) == 0xf01;107}108109/**110* Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)111* must be decomposed before reaching the core collation code,112* or else some sequences including them, even ones passing the FCD check,113* do not yield canonically equivalent results.114*115* They have distinct lccc/tccc combinations: 129/130 or 129/132.116*117* @param fcd16 the FCD value (lccc/tccc combination) of a code point118* @return true if fcd16 is from U+0F73, U+0F75 or U+0F81119*/120static inline UBool isFCD16OfTibetanCompositeVowel(uint16_t fcd16) {121return fcd16 == 0x8182 || fcd16 == 0x8184;122}123124private:125CollationFCD() = delete; // No instantiation.126127static const uint8_t lcccIndex[2048];128static const uint8_t tcccIndex[2048];129static const uint32_t lcccBits[];130static const uint32_t tcccBits[];131};132133U_NAMESPACE_END134135#endif // !UCONFIG_NO_COLLATION136#endif // __COLLATIONFCD_H__137138139