Path: blob/master/thirdparty/icu4c/common/characterproperties.cpp
9902 views
// © 2018 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html23// characterproperties.cpp4// created: 2018sep03 Markus W. Scherer56#include "unicode/utypes.h"7#include "unicode/localpointer.h"8#include "unicode/uchar.h"9#include "unicode/ucpmap.h"10#include "unicode/ucptrie.h"11#include "unicode/umutablecptrie.h"12#include "unicode/uniset.h"13#include "unicode/uscript.h"14#include "unicode/uset.h"15#include "cmemory.h"16#include "emojiprops.h"17#include "mutex.h"18#include "normalizer2impl.h"19#include "uassert.h"20#include "ubidi_props.h"21#include "ucase.h"22#include "ucln_cmn.h"23#include "umutex.h"24#include "uprops.h"2526using icu::LocalPointer;27#if !UCONFIG_NO_NORMALIZATION28using icu::Normalizer2Factory;29using icu::Normalizer2Impl;30#endif31using icu::UInitOnce;32using icu::UnicodeSet;3334namespace {3536UBool U_CALLCONV characterproperties_cleanup();3738constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);3940struct Inclusion {41UnicodeSet *fSet = nullptr;42UInitOnce fInitOnce {};43};44Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()4546UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};4748UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};4950icu::UMutex cpMutex;5152//----------------------------------------------------------------53// Inclusions list54//----------------------------------------------------------------5556// USetAdder implementation57// Does not use uset.h to reduce code dependencies58void U_CALLCONV59_set_add(USet *set, UChar32 c) {60reinterpret_cast<UnicodeSet*>(set)->add(c);61}6263void U_CALLCONV64_set_addRange(USet *set, UChar32 start, UChar32 end) {65reinterpret_cast<UnicodeSet*>(set)->add(start, end);66}6768void U_CALLCONV69_set_addString(USet *set, const char16_t *str, int32_t length) {70reinterpret_cast<UnicodeSet*>(set)->add(icu::UnicodeString(static_cast<UBool>(length < 0), str, length));71}7273UBool U_CALLCONV characterproperties_cleanup() {74for (Inclusion &in: gInclusions) {75delete in.fSet;76in.fSet = nullptr;77in.fInitOnce.reset();78}79for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {80delete sets[i];81sets[i] = nullptr;82}83for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {84ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));85maps[i] = nullptr;86}87return true;88}8990void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {91// This function is invoked only via umtx_initOnce().92U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);93if (src == UPROPS_SRC_NONE) {94errorCode = U_INTERNAL_PROGRAM_ERROR;95return;96}97U_ASSERT(gInclusions[src].fSet == nullptr);9899LocalPointer<UnicodeSet> incl(new UnicodeSet());100if (incl.isNull()) {101errorCode = U_MEMORY_ALLOCATION_ERROR;102return;103}104USetAdder sa = {105reinterpret_cast<USet*>(incl.getAlias()),106_set_add,107_set_addRange,108_set_addString,109nullptr, // don't need remove()110nullptr // don't need removeRange()111};112113switch(src) {114case UPROPS_SRC_CHAR:115uchar_addPropertyStarts(&sa, &errorCode);116break;117case UPROPS_SRC_PROPSVEC:118upropsvec_addPropertyStarts(&sa, &errorCode);119break;120case UPROPS_SRC_CHAR_AND_PROPSVEC:121uchar_addPropertyStarts(&sa, &errorCode);122upropsvec_addPropertyStarts(&sa, &errorCode);123break;124#if !UCONFIG_NO_NORMALIZATION125case UPROPS_SRC_CASE_AND_NORM: {126const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);127if(U_SUCCESS(errorCode)) {128impl->addPropertyStarts(&sa, errorCode);129}130ucase_addPropertyStarts(&sa, &errorCode);131break;132}133case UPROPS_SRC_NFC: {134const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);135if(U_SUCCESS(errorCode)) {136impl->addPropertyStarts(&sa, errorCode);137}138break;139}140case UPROPS_SRC_NFKC: {141const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);142if(U_SUCCESS(errorCode)) {143impl->addPropertyStarts(&sa, errorCode);144}145break;146}147case UPROPS_SRC_NFKC_CF: {148const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);149if(U_SUCCESS(errorCode)) {150impl->addPropertyStarts(&sa, errorCode);151}152break;153}154case UPROPS_SRC_NFC_CANON_ITER: {155const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);156if(U_SUCCESS(errorCode)) {157impl->addCanonIterPropertyStarts(&sa, errorCode);158}159break;160}161#endif162case UPROPS_SRC_CASE:163ucase_addPropertyStarts(&sa, &errorCode);164break;165case UPROPS_SRC_BIDI:166ubidi_addPropertyStarts(&sa, &errorCode);167break;168case UPROPS_SRC_INPC:169case UPROPS_SRC_INSC:170case UPROPS_SRC_VO:171uprops_addPropertyStarts(src, &sa, &errorCode);172break;173case UPROPS_SRC_EMOJI: {174const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);175if (U_SUCCESS(errorCode)) {176ep->addPropertyStarts(&sa, errorCode);177}178break;179}180case UPROPS_SRC_IDSU:181// New in Unicode 15.1 for just two characters.182sa.add(sa.set, 0x2FFE);183sa.add(sa.set, 0x2FFF + 1);184break;185case UPROPS_SRC_ID_COMPAT_MATH:186case UPROPS_SRC_MCM:187uprops_addPropertyStarts(src, &sa, &errorCode);188break;189case UPROPS_SRC_BLOCK:190ublock_addPropertyStarts(&sa, errorCode);191break;192default:193errorCode = U_INTERNAL_PROGRAM_ERROR;194break;195}196197if (U_FAILURE(errorCode)) {198return;199}200if (incl->isBogus()) {201errorCode = U_MEMORY_ALLOCATION_ERROR;202return;203}204// Compact for caching.205incl->compact();206gInclusions[src].fSet = incl.orphan();207ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);208}209210const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {211if (U_FAILURE(errorCode)) { return nullptr; }212if (src < 0 || UPROPS_SRC_COUNT <= src) {213errorCode = U_ILLEGAL_ARGUMENT_ERROR;214return nullptr;215}216Inclusion &i = gInclusions[src];217umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);218return i.fSet;219}220221void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {222// This function is invoked only via umtx_initOnce().223U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);224int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);225U_ASSERT(gInclusions[inclIndex].fSet == nullptr);226UPropertySource src = uprops_getSource(prop);227const UnicodeSet *incl = getInclusionsForSource(src, errorCode);228if (U_FAILURE(errorCode)) {229return;230}231232LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));233if (intPropIncl.isNull()) {234errorCode = U_MEMORY_ALLOCATION_ERROR;235return;236}237int32_t numRanges = incl->getRangeCount();238int32_t prevValue = 0;239for (int32_t i = 0; i < numRanges; ++i) {240UChar32 rangeEnd = incl->getRangeEnd(i);241for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {242// TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.243int32_t value = u_getIntPropertyValue(c, prop);244if (value != prevValue) {245intPropIncl->add(c);246prevValue = value;247}248}249}250251if (intPropIncl->isBogus()) {252errorCode = U_MEMORY_ALLOCATION_ERROR;253return;254}255// Compact for caching.256intPropIncl->compact();257gInclusions[inclIndex].fSet = intPropIncl.orphan();258ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);259}260261} // namespace262263U_NAMESPACE_BEGIN264265const UnicodeSet *CharacterProperties::getInclusionsForProperty(266UProperty prop, UErrorCode &errorCode) {267if (U_FAILURE(errorCode)) { return nullptr; }268if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {269int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);270Inclusion &i = gInclusions[inclIndex];271umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);272return i.fSet;273} else {274UPropertySource src = uprops_getSource(prop);275return getInclusionsForSource(src, errorCode);276}277}278279U_NAMESPACE_END280281namespace {282283UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {284if (U_FAILURE(errorCode)) { return nullptr; }285LocalPointer<UnicodeSet> set(new UnicodeSet());286if (set.isNull()) {287errorCode = U_MEMORY_ALLOCATION_ERROR;288return nullptr;289}290if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {291// property of strings292const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);293if (U_FAILURE(errorCode)) { return nullptr; }294USetAdder sa = {295reinterpret_cast<USet*>(set.getAlias()),296_set_add,297_set_addRange,298_set_addString,299nullptr, // don't need remove()300nullptr // don't need removeRange()301};302ep->addStrings(&sa, property, errorCode);303if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {304// property of _only_ strings305set->freeze();306return set.orphan();307}308}309310const UnicodeSet *inclusions =311icu::CharacterProperties::getInclusionsForProperty(property, errorCode);312if (U_FAILURE(errorCode)) { return nullptr; }313int32_t numRanges = inclusions->getRangeCount();314UChar32 startHasProperty = -1;315316for (int32_t i = 0; i < numRanges; ++i) {317UChar32 rangeEnd = inclusions->getRangeEnd(i);318for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {319// TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.320if (u_hasBinaryProperty(c, property)) {321if (startHasProperty < 0) {322// Transition from false to true.323startHasProperty = c;324}325} else if (startHasProperty >= 0) {326// Transition from true to false.327set->add(startHasProperty, c - 1);328startHasProperty = -1;329}330}331}332if (startHasProperty >= 0) {333set->add(startHasProperty, 0x10FFFF);334}335set->freeze();336return set.orphan();337}338339UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {340if (U_FAILURE(errorCode)) { return nullptr; }341uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;342icu::LocalUMutableCPTriePointer mutableTrie(343umutablecptrie_open(nullValue, nullValue, &errorCode));344const UnicodeSet *inclusions =345icu::CharacterProperties::getInclusionsForProperty(property, errorCode);346if (U_FAILURE(errorCode)) { return nullptr; }347int32_t numRanges = inclusions->getRangeCount();348UChar32 start = 0;349uint32_t value = nullValue;350351for (int32_t i = 0; i < numRanges; ++i) {352UChar32 rangeEnd = inclusions->getRangeEnd(i);353for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {354// TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.355uint32_t nextValue = u_getIntPropertyValue(c, property);356if (value != nextValue) {357if (value != nullValue) {358umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);359}360start = c;361value = nextValue;362}363}364}365if (value != 0) {366umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);367}368369UCPTrieType type;370if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {371type = UCPTRIE_TYPE_FAST;372} else {373type = UCPTRIE_TYPE_SMALL;374}375UCPTrieValueWidth valueWidth;376// TODO: UCharacterProperty.IntProperty377int32_t max = u_getIntPropertyMaxValue(property);378if (max <= 0xff) {379valueWidth = UCPTRIE_VALUE_BITS_8;380} else if (max <= 0xffff) {381valueWidth = UCPTRIE_VALUE_BITS_16;382} else {383valueWidth = UCPTRIE_VALUE_BITS_32;384}385return reinterpret_cast<UCPMap *>(386umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));387}388389} // namespace390391U_NAMESPACE_BEGIN392393const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {394if (U_FAILURE(errorCode)) { return nullptr; }395if (property < 0 || UCHAR_BINARY_LIMIT <= property) {396errorCode = U_ILLEGAL_ARGUMENT_ERROR;397return nullptr;398}399Mutex m(&cpMutex);400UnicodeSet *set = sets[property];401if (set == nullptr) {402sets[property] = set = makeSet(property, errorCode);403}404return set;405}406407U_NAMESPACE_END408409U_NAMESPACE_USE410411U_CAPI const USet * U_EXPORT2412u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {413const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);414return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;415}416417U_CAPI const UCPMap * U_EXPORT2418u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {419if (U_FAILURE(*pErrorCode)) { return nullptr; }420if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {421*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;422return nullptr;423}424Mutex m(&cpMutex);425UCPMap *map = maps[property - UCHAR_INT_START];426if (map == nullptr) {427maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);428}429return map;430}431432433