Path: blob/master/libs/icucommon/characterproperties.cpp
12343 views
// © 2018 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html23// characterproperties.cpp4// created: 2018sep03 Markus W. Scherer56#include "unicode/utypes.h"7#include "unicode/localpointer.h"8#include "unicode/uchar.h"9#include "unicode/ucpmap.h"10#include "unicode/ucptrie.h"11#include "unicode/umutablecptrie.h"12#include "unicode/uniset.h"13#include "unicode/uscript.h"14#include "unicode/uset.h"15#include "cmemory.h"16#include "emojiprops.h"17#include "mutex.h"18#include "normalizer2impl.h"19#include "uassert.h"20#include "ubidi_props.h"21#include "ucase.h"22#include "ucln_cmn.h"23#include "umutex.h"24#include "uprops.h"2526using icu::LocalPointer;27#if !UCONFIG_NO_NORMALIZATION28using icu::Normalizer2Factory;29using icu::Normalizer2Impl;30#endif31using icu::UInitOnce;32using icu::UnicodeSet;3334namespace {3536UBool U_CALLCONV characterproperties_cleanup();3738constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);3940struct Inclusion {41UnicodeSet *fSet = nullptr;42UInitOnce fInitOnce {};43};44Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()4546UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};4748UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};4950icu::UMutex cpMutex;5152//----------------------------------------------------------------53// Inclusions list54//----------------------------------------------------------------5556// USetAdder implementation57// Does not use uset.h to reduce code dependencies58void U_CALLCONV59_set_add(USet *set, UChar32 c) {60((UnicodeSet *)set)->add(c);61}6263void U_CALLCONV64_set_addRange(USet *set, UChar32 start, UChar32 end) {65((UnicodeSet *)set)->add(start, end);66}6768void U_CALLCONV69_set_addString(USet *set, const UChar *str, int32_t length) {70((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));71}7273UBool U_CALLCONV characterproperties_cleanup() {74for (Inclusion &in: gInclusions) {75delete in.fSet;76in.fSet = nullptr;77in.fInitOnce.reset();78}79for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {80delete sets[i];81sets[i] = nullptr;82}83for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {84ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));85maps[i] = nullptr;86}87return true;88}8990void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {91// This function is invoked only via umtx_initOnce().92U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);93if (src == UPROPS_SRC_NONE) {94errorCode = U_INTERNAL_PROGRAM_ERROR;95return;96}97U_ASSERT(gInclusions[src].fSet == nullptr);9899LocalPointer<UnicodeSet> incl(new UnicodeSet());100if (incl.isNull()) {101errorCode = U_MEMORY_ALLOCATION_ERROR;102return;103}104USetAdder sa = {105(USet *)incl.getAlias(),106_set_add,107_set_addRange,108_set_addString,109nullptr, // don't need remove()110nullptr // don't need removeRange()111};112113switch(src) {114case UPROPS_SRC_CHAR:115uchar_addPropertyStarts(&sa, &errorCode);116break;117case UPROPS_SRC_PROPSVEC:118upropsvec_addPropertyStarts(&sa, &errorCode);119break;120case UPROPS_SRC_CHAR_AND_PROPSVEC:121uchar_addPropertyStarts(&sa, &errorCode);122upropsvec_addPropertyStarts(&sa, &errorCode);123break;124#if !UCONFIG_NO_NORMALIZATION125case UPROPS_SRC_CASE_AND_NORM: {126const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);127if(U_SUCCESS(errorCode)) {128impl->addPropertyStarts(&sa, errorCode);129}130ucase_addPropertyStarts(&sa, &errorCode);131break;132}133case UPROPS_SRC_NFC: {134const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);135if(U_SUCCESS(errorCode)) {136impl->addPropertyStarts(&sa, errorCode);137}138break;139}140case UPROPS_SRC_NFKC: {141const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);142if(U_SUCCESS(errorCode)) {143impl->addPropertyStarts(&sa, errorCode);144}145break;146}147case UPROPS_SRC_NFKC_CF: {148const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);149if(U_SUCCESS(errorCode)) {150impl->addPropertyStarts(&sa, errorCode);151}152break;153}154case UPROPS_SRC_NFC_CANON_ITER: {155const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);156if(U_SUCCESS(errorCode)) {157impl->addCanonIterPropertyStarts(&sa, errorCode);158}159break;160}161#endif162case UPROPS_SRC_CASE:163ucase_addPropertyStarts(&sa, &errorCode);164break;165case UPROPS_SRC_BIDI:166ubidi_addPropertyStarts(&sa, &errorCode);167break;168case UPROPS_SRC_INPC:169case UPROPS_SRC_INSC:170case UPROPS_SRC_VO:171uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);172break;173case UPROPS_SRC_EMOJI: {174const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);175if (U_SUCCESS(errorCode)) {176ep->addPropertyStarts(&sa, errorCode);177}178break;179}180default:181errorCode = U_INTERNAL_PROGRAM_ERROR;182break;183}184185if (U_FAILURE(errorCode)) {186return;187}188if (incl->isBogus()) {189errorCode = U_MEMORY_ALLOCATION_ERROR;190return;191}192// Compact for caching.193incl->compact();194gInclusions[src].fSet = incl.orphan();195ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);196}197198const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {199if (U_FAILURE(errorCode)) { return nullptr; }200if (src < 0 || UPROPS_SRC_COUNT <= src) {201errorCode = U_ILLEGAL_ARGUMENT_ERROR;202return nullptr;203}204Inclusion &i = gInclusions[src];205umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);206return i.fSet;207}208209void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {210// This function is invoked only via umtx_initOnce().211U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);212int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);213U_ASSERT(gInclusions[inclIndex].fSet == nullptr);214UPropertySource src = uprops_getSource(prop);215const UnicodeSet *incl = getInclusionsForSource(src, errorCode);216if (U_FAILURE(errorCode)) {217return;218}219220LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));221if (intPropIncl.isNull()) {222errorCode = U_MEMORY_ALLOCATION_ERROR;223return;224}225int32_t numRanges = incl->getRangeCount();226int32_t prevValue = 0;227for (int32_t i = 0; i < numRanges; ++i) {228UChar32 rangeEnd = incl->getRangeEnd(i);229for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {230// TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.231int32_t value = u_getIntPropertyValue(c, prop);232if (value != prevValue) {233intPropIncl->add(c);234prevValue = value;235}236}237}238239if (intPropIncl->isBogus()) {240errorCode = U_MEMORY_ALLOCATION_ERROR;241return;242}243// Compact for caching.244intPropIncl->compact();245gInclusions[inclIndex].fSet = intPropIncl.orphan();246ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);247}248249} // namespace250251U_NAMESPACE_BEGIN252253const UnicodeSet *CharacterProperties::getInclusionsForProperty(254UProperty prop, UErrorCode &errorCode) {255if (U_FAILURE(errorCode)) { return nullptr; }256if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {257int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);258Inclusion &i = gInclusions[inclIndex];259umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);260return i.fSet;261} else {262UPropertySource src = uprops_getSource(prop);263return getInclusionsForSource(src, errorCode);264}265}266267U_NAMESPACE_END268269namespace {270271UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {272if (U_FAILURE(errorCode)) { return nullptr; }273LocalPointer<UnicodeSet> set(new UnicodeSet());274if (set.isNull()) {275errorCode = U_MEMORY_ALLOCATION_ERROR;276return nullptr;277}278if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {279// property of strings280const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);281if (U_FAILURE(errorCode)) { return nullptr; }282USetAdder sa = {283(USet *)set.getAlias(),284_set_add,285_set_addRange,286_set_addString,287nullptr, // don't need remove()288nullptr // don't need removeRange()289};290ep->addStrings(&sa, property, errorCode);291if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {292// property of _only_ strings293set->freeze();294return set.orphan();295}296}297298const UnicodeSet *inclusions =299icu::CharacterProperties::getInclusionsForProperty(property, errorCode);300if (U_FAILURE(errorCode)) { return nullptr; }301int32_t numRanges = inclusions->getRangeCount();302UChar32 startHasProperty = -1;303304for (int32_t i = 0; i < numRanges; ++i) {305UChar32 rangeEnd = inclusions->getRangeEnd(i);306for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {307// TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.308if (u_hasBinaryProperty(c, property)) {309if (startHasProperty < 0) {310// Transition from false to true.311startHasProperty = c;312}313} else if (startHasProperty >= 0) {314// Transition from true to false.315set->add(startHasProperty, c - 1);316startHasProperty = -1;317}318}319}320if (startHasProperty >= 0) {321set->add(startHasProperty, 0x10FFFF);322}323set->freeze();324return set.orphan();325}326327UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {328if (U_FAILURE(errorCode)) { return nullptr; }329uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;330icu::LocalUMutableCPTriePointer mutableTrie(331umutablecptrie_open(nullValue, nullValue, &errorCode));332const UnicodeSet *inclusions =333icu::CharacterProperties::getInclusionsForProperty(property, errorCode);334if (U_FAILURE(errorCode)) { return nullptr; }335int32_t numRanges = inclusions->getRangeCount();336UChar32 start = 0;337uint32_t value = nullValue;338339for (int32_t i = 0; i < numRanges; ++i) {340UChar32 rangeEnd = inclusions->getRangeEnd(i);341for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {342// TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.343uint32_t nextValue = u_getIntPropertyValue(c, property);344if (value != nextValue) {345if (value != nullValue) {346umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);347}348start = c;349value = nextValue;350}351}352}353if (value != 0) {354umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);355}356357UCPTrieType type;358if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {359type = UCPTRIE_TYPE_FAST;360} else {361type = UCPTRIE_TYPE_SMALL;362}363UCPTrieValueWidth valueWidth;364// TODO: UCharacterProperty.IntProperty365int32_t max = u_getIntPropertyMaxValue(property);366if (max <= 0xff) {367valueWidth = UCPTRIE_VALUE_BITS_8;368} else if (max <= 0xffff) {369valueWidth = UCPTRIE_VALUE_BITS_16;370} else {371valueWidth = UCPTRIE_VALUE_BITS_32;372}373return reinterpret_cast<UCPMap *>(374umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));375}376377} // namespace378379U_NAMESPACE_USE380381U_CAPI const USet * U_EXPORT2382u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {383if (U_FAILURE(*pErrorCode)) { return nullptr; }384if (property < 0 || UCHAR_BINARY_LIMIT <= property) {385*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;386return nullptr;387}388Mutex m(&cpMutex);389UnicodeSet *set = sets[property];390if (set == nullptr) {391sets[property] = set = makeSet(property, *pErrorCode);392}393if (U_FAILURE(*pErrorCode)) { return nullptr; }394return set->toUSet();395}396397U_CAPI const UCPMap * U_EXPORT2398u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {399if (U_FAILURE(*pErrorCode)) { return nullptr; }400if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {401*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;402return nullptr;403}404Mutex m(&cpMutex);405UCPMap *map = maps[property - UCHAR_INT_START];406if (map == nullptr) {407maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);408}409return map;410}411412413