// Path: blob/master/thirdparty/icu4c/common/emojiprops.cpp
// 9902 views
// © 2021 and later: Unicode, Inc. and others.1// License & terms of use: https://www.unicode.org/copyright.html23// emojiprops.cpp4// created: 2021sep04 Markus W. Scherer56#include "unicode/utypes.h"7#include "unicode/uchar.h"8#include "unicode/ucharstrie.h"9#include "unicode/ucptrie.h"10#include "unicode/udata.h"11#include "unicode/ustringtrie.h"12#include "unicode/utf16.h"13#include "emojiprops.h"14#include "ucln.h"15#include "ucln_cmn.h"16#include "umutex.h"17#include "uset_imp.h"1819U_NAMESPACE_BEGIN2021namespace {2223EmojiProps *singleton = nullptr;24icu::UInitOnce emojiInitOnce {};2526UBool U_CALLCONV emojiprops_cleanup() {27delete singleton;28singleton = nullptr;29emojiInitOnce.reset();30return true;31}3233void U_CALLCONV initSingleton(UErrorCode &errorCode) {34if (U_FAILURE(errorCode)) { return; }35singleton = new EmojiProps(errorCode);36if (singleton == nullptr) {37errorCode = U_MEMORY_ALLOCATION_ERROR;38} else if (U_FAILURE(errorCode)) {39delete singleton;40singleton = nullptr;41}42ucln_common_registerCleanup(UCLN_COMMON_EMOJIPROPS, emojiprops_cleanup);43}4445// TODO: turn this into a shared helper function46// Requires the major version to match, and then requires at least the minor version.47UBool udata_isAcceptableMajorMinor(48const UDataInfo &info, const char16_t *dataFormat, uint8_t major, uint8_t minor) {49return50info.size >= 20 &&51info.isBigEndian == U_IS_BIG_ENDIAN &&52info.charsetFamily == U_CHARSET_FAMILY &&53info.dataFormat[0] == dataFormat[0] &&54info.dataFormat[1] == dataFormat[1] &&55info.dataFormat[2] == dataFormat[2] &&56info.dataFormat[3] == dataFormat[3] &&57info.formatVersion[0] == major &&58info.formatVersion[1] >= minor;59}6061} // namespace6263EmojiProps::~EmojiProps() {64udata_close(memory);65ucptrie_close(cpTrie);66}6768const EmojiProps *69EmojiProps::getSingleton(UErrorCode &errorCode) {70if (U_FAILURE(errorCode)) { return nullptr; }71umtx_initOnce(emojiInitOnce, &initSingleton, errorCode);72return singleton;73}7475UBool 
U_CALLCONV76EmojiProps::isAcceptable(void * /*context*/, const char * /*type*/, const char * /*name*/,77const UDataInfo *pInfo) {78return udata_isAcceptableMajorMinor(*pInfo, u"Emoj", 1, 0);79}8081void82EmojiProps::load(UErrorCode &errorCode) {83memory = udata_openChoice(nullptr, "icu", "uemoji", isAcceptable, this, &errorCode);84if (U_FAILURE(errorCode)) { return; }85const uint8_t* inBytes = static_cast<const uint8_t*>(udata_getMemory(memory));86const int32_t* inIndexes = reinterpret_cast<const int32_t*>(inBytes);87int32_t indexesLength = inIndexes[IX_CPTRIE_OFFSET] / 4;88if (indexesLength <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET) {89errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.90return;91}9293int32_t i = IX_CPTRIE_OFFSET;94int32_t offset = inIndexes[i++];95int32_t nextOffset = inIndexes[i];96cpTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_8,97inBytes + offset, nextOffset - offset, nullptr, &errorCode);98if (U_FAILURE(errorCode)) {99return;100}101102for (i = IX_BASIC_EMOJI_TRIE_OFFSET; i <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET; ++i) {103offset = inIndexes[i];104nextOffset = inIndexes[i + 1];105// Set/leave nullptr if there is no UCharsTrie.106const char16_t* p = nextOffset > offset ? 
reinterpret_cast<const char16_t*>(inBytes + offset) : nullptr;107stringTries[getStringTrieIndex(i)] = p;108}109}110111void112EmojiProps::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {113// Add the start code point of each same-value range of the trie.114UChar32 start = 0, end;115uint32_t value;116while ((end = ucptrie_getRange(cpTrie, start, UCPMAP_RANGE_NORMAL, 0,117nullptr, nullptr, &value)) >= 0) {118sa->add(sa->set, start);119start = end + 1;120}121}122123UBool124EmojiProps::hasBinaryProperty(UChar32 c, UProperty which) {125UErrorCode errorCode = U_ZERO_ERROR;126const EmojiProps *ep = getSingleton(errorCode);127return U_SUCCESS(errorCode) && ep->hasBinaryPropertyImpl(c, which);128}129130UBool131EmojiProps::hasBinaryPropertyImpl(UChar32 c, UProperty which) const {132if (which < UCHAR_EMOJI || UCHAR_RGI_EMOJI < which) {133return false;134}135// Note: UCHAR_REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere.136static constexpr int8_t bitFlags[] = {137BIT_EMOJI, // UCHAR_EMOJI=57138BIT_EMOJI_PRESENTATION, // UCHAR_EMOJI_PRESENTATION=58139BIT_EMOJI_MODIFIER, // UCHAR_EMOJI_MODIFIER=59140BIT_EMOJI_MODIFIER_BASE, // UCHAR_EMOJI_MODIFIER_BASE=60141BIT_EMOJI_COMPONENT, // UCHAR_EMOJI_COMPONENT=61142-1, // UCHAR_REGIONAL_INDICATOR=62143-1, // UCHAR_PREPENDED_CONCATENATION_MARK=63144BIT_EXTENDED_PICTOGRAPHIC, // UCHAR_EXTENDED_PICTOGRAPHIC=64145BIT_BASIC_EMOJI, // UCHAR_BASIC_EMOJI=65146-1, // UCHAR_EMOJI_KEYCAP_SEQUENCE=66147-1, // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE=67148-1, // UCHAR_RGI_EMOJI_FLAG_SEQUENCE=68149-1, // UCHAR_RGI_EMOJI_TAG_SEQUENCE=69150-1, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE=70151BIT_BASIC_EMOJI, // UCHAR_RGI_EMOJI=71152};153int32_t bit = bitFlags[which - UCHAR_EMOJI];154if (bit < 0) {155return false; // not a property that we support in this function156}157uint8_t bits = UCPTRIE_FAST_GET(cpTrie, UCPTRIE_8, c);158return (bits >> bit) & 1;159}160161UBool162EmojiProps::hasBinaryProperty(const char16_t *s, int32_t 
length, UProperty which) {163UErrorCode errorCode = U_ZERO_ERROR;164const EmojiProps *ep = getSingleton(errorCode);165return U_SUCCESS(errorCode) && ep->hasBinaryPropertyImpl(s, length, which);166}167168UBool169EmojiProps::hasBinaryPropertyImpl(const char16_t *s, int32_t length, UProperty which) const {170if (s == nullptr && length != 0) { return false; }171if (length <= 0 && (length == 0 || *s == 0)) { return false; } // empty string172// The caller should have delegated single code points to hasBinaryProperty(c, which).173if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) {174return false;175}176UProperty firstProp = which, lastProp = which;177if (which == UCHAR_RGI_EMOJI) {178// RGI_Emoji is the union of the other emoji properties of strings.179firstProp = UCHAR_BASIC_EMOJI;180lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE;181}182for (int32_t prop = firstProp; prop <= lastProp; ++prop) {183const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];184if (trieUChars != nullptr) {185UCharsTrie trie(trieUChars);186UStringTrieResult result = trie.next(s, length);187if (USTRINGTRIE_HAS_VALUE(result)) {188return true;189}190}191}192return false;193}194195void196EmojiProps::addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const {197if (U_FAILURE(errorCode)) { return; }198if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) {199return;200}201UProperty firstProp = which, lastProp = which;202if (which == UCHAR_RGI_EMOJI) {203// RGI_Emoji is the union of the other emoji properties of strings.204firstProp = UCHAR_BASIC_EMOJI;205lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE;206}207for (int32_t prop = firstProp; prop <= lastProp; ++prop) {208const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];209if (trieUChars != nullptr) {210UCharsTrie::Iterator iter(trieUChars, 0, errorCode);211while (iter.next(errorCode)) {212const UnicodeString &s = iter.getString();213sa->addString(sa->set, s.getBuffer(), 
s.length());214}215}216}217}218219U_NAMESPACE_END220221222