Path: blob/master/thirdparty/icu4c/i18n/uspoof_impl.cpp
9912 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3**********************************************************************4* Copyright (C) 2008-2016, International Business Machines5* Corporation and others. All Rights Reserved.6**********************************************************************7*/89#include "unicode/utypes.h"10#include "unicode/uspoof.h"11#include "unicode/uchar.h"12#include "unicode/uniset.h"13#include "unicode/utf16.h"14#include "utrie2.h"15#include "cmemory.h"16#include "cstring.h"17#include "scriptset.h"18#include "umutex.h"19#include "udataswp.h"20#include "uassert.h"21#include "ucln_in.h"22#include "uspoof_impl.h"2324#if !UCONFIG_NO_NORMALIZATION252627U_NAMESPACE_BEGIN2829UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)3031SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {32construct(status);33fSpoofData = data;34}3536SpoofImpl::SpoofImpl(UErrorCode& status) {37construct(status);3839// TODO: Call this method where it is actually needed, instead of in the40// constructor, to allow for lazy data loading. See #12696.41fSpoofData = SpoofData::getDefault(status);42}4344SpoofImpl::SpoofImpl() {45UErrorCode status = U_ZERO_ERROR;46construct(status);4748// TODO: Call this method where it is actually needed, instead of in the49// constructor, to allow for lazy data loading. See #12696.50fSpoofData = SpoofData::getDefault(status);51}5253void SpoofImpl::construct(UErrorCode& status) {54fChecks = USPOOF_ALL_CHECKS;55fSpoofData = nullptr;56fAllowedCharsSet = nullptr;57fAllowedLocales = nullptr;58fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;5960if (U_FAILURE(status)) { return; }6162UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);63fAllowedCharsSet = allowedCharsSet;64fAllowedLocales = uprv_strdup("");65if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {66status = U_MEMORY_ALLOCATION_ERROR;67return;68}69allowedCharsSet->freeze();70}717273// Copy Constructor, used by the user level clone() function.74SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :75fChecks(USPOOF_ALL_CHECKS), fSpoofData(nullptr), fAllowedCharsSet(nullptr) ,76fAllowedLocales(nullptr) {77if (U_FAILURE(status)) {78return;79}80fChecks = src.fChecks;81if (src.fSpoofData != nullptr) {82fSpoofData = src.fSpoofData->addReference();83}84fAllowedCharsSet = src.fAllowedCharsSet->clone();85fAllowedLocales = uprv_strdup(src.fAllowedLocales);86if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {87status = U_MEMORY_ALLOCATION_ERROR;88}89fRestrictionLevel = src.fRestrictionLevel;90}9192SpoofImpl::~SpoofImpl() {93if (fSpoofData != nullptr) {94fSpoofData->removeReference(); // Will delete if refCount goes to zero.95}96delete fAllowedCharsSet;97uprv_free((void *)fAllowedLocales);98}99100// Cast this instance as a USpoofChecker for the C API.101USpoofChecker *SpoofImpl::asUSpoofChecker() {102return exportForC();103}104105//106// Incoming parameter check on Status and the SpoofChecker object107// received from the C API.108//109const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {110const auto* This = validate(sc, status);111if (U_FAILURE(status)) {112return nullptr;113}114if (This->fSpoofData != nullptr && !This->fSpoofData->validateDataVersion(status)) {115return nullptr;116}117return This;118}119120SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {121return const_cast<SpoofImpl *>122(SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));123}124125126void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {127UnicodeSet allowedChars;128UnicodeSet *tmpSet = nullptr;129const char *locStart = localesList;130const char *locEnd = nullptr;131const char *localesListEnd = localesList + uprv_strlen(localesList);132int32_t localeListCount = 0; // Number of locales provided by caller.133134// Loop runs once per locale from the localesList, a comma separated list of locales.135do {136locEnd = uprv_strchr(locStart, ',');137if (locEnd == nullptr) {138locEnd = localesListEnd;139}140while (*locStart == ' ') {141locStart++;142}143const char *trimmedEnd = locEnd-1;144while (trimmedEnd > locStart && *trimmedEnd == ' ') {145trimmedEnd--;146}147if (trimmedEnd <= locStart) {148break;149}150const char* locale = uprv_strndup(locStart, static_cast<int32_t>(trimmedEnd + 1 - locStart));151localeListCount++;152153// We have one locale from the locales list.154// Add the script chars for this locale to the accumulating set of allowed chars.155// If the locale is no good, we will be notified back via status.156addScriptChars(locale, &allowedChars, status);157uprv_free((void *)locale);158if (U_FAILURE(status)) {159break;160}161locStart = locEnd + 1;162} while (locStart < localesListEnd);163164// If our caller provided an empty list of locales, we disable the allowed characters checking165if (localeListCount == 0) {166uprv_free((void *)fAllowedLocales);167fAllowedLocales = uprv_strdup("");168tmpSet = new UnicodeSet(0, 0x10ffff);169if (fAllowedLocales == nullptr || tmpSet == nullptr) {170status = U_MEMORY_ALLOCATION_ERROR;171return;172}173tmpSet->freeze();174delete fAllowedCharsSet;175fAllowedCharsSet = tmpSet;176fChecks &= ~USPOOF_CHAR_LIMIT;177return;178}179180181// Add all common and inherited characters to the set of allowed chars.182UnicodeSet tempSet;183tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);184allowedChars.addAll(tempSet);185tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);186allowedChars.addAll(tempSet);187188// If anything went wrong, we bail out without changing189// the state of the spoof checker.190if (U_FAILURE(status)) {191return;192}193194// Store the updated spoof checker state.195tmpSet = allowedChars.clone();196const char *tmpLocalesList = uprv_strdup(localesList);197if (tmpSet == nullptr || tmpLocalesList == nullptr) {198status = U_MEMORY_ALLOCATION_ERROR;199return;200}201uprv_free((void *)fAllowedLocales);202fAllowedLocales = tmpLocalesList;203tmpSet->freeze();204delete fAllowedCharsSet;205fAllowedCharsSet = tmpSet;206fChecks |= USPOOF_CHAR_LIMIT;207}208209210const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {211return fAllowedLocales;212}213214215// Given a locale (a language), add all the characters from all of the scripts used with that language216// to the allowedChars UnicodeSet217218void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {219UScriptCode scripts[30];220221int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);222if (U_FAILURE(status)) {223return;224}225if (status == U_USING_DEFAULT_WARNING) {226status = U_ILLEGAL_ARGUMENT_ERROR;227return;228}229UnicodeSet tmpSet;230int32_t i;231for (i=0; i<numScripts; i++) {232tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);233allowedChars->addAll(tmpSet);234}235}236237// Computes the augmented script set for a code point, according to UTS 39 section 5.1.238void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {239result.resetAll();240result.setScriptExtensions(codePoint, status);241if (U_FAILURE(status)) { return; }242243// Section 5.1 step 1244if (result.test(USCRIPT_HAN, status)) {245result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);246result.set(USCRIPT_JAPANESE, status);247result.set(USCRIPT_KOREAN, status);248}249if (result.test(USCRIPT_HIRAGANA, status)) {250result.set(USCRIPT_JAPANESE, status);251}252if (result.test(USCRIPT_KATAKANA, status)) {253result.set(USCRIPT_JAPANESE, status);254}255if (result.test(USCRIPT_HANGUL, status)) {256result.set(USCRIPT_KOREAN, status);257}258if (result.test(USCRIPT_BOPOMOFO, status)) {259result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);260}261262// Section 5.1 step 2263if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {264result.setAll();265}266}267268// Computes the resolved script set for a string, according to UTS 39 section 5.1.269void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {270getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);271}272273// Computes the resolved script set for a string, omitting characters having the specified script.274// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.275void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {276result.setAll();277278ScriptSet temp;279UChar32 codePoint;280for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {281codePoint = input.char32At(i);282283// Compute the augmented script set for the character284getAugmentedScriptSet(codePoint, temp, status);285if (U_FAILURE(status)) { return; }286287// Intersect the augmented script set with the resolved script set, but only if the character doesn't288// have the script specified in the function call289if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {290result.intersect(temp);291}292}293}294295// Computes the set of numerics for a string, according to UTS 39 section 5.3.296void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {297result.clear();298299UChar32 codePoint;300for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {301codePoint = input.char32At(i);302303// Store a representative character for each kind of decimal digit304if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {305// Store the zero character as a representative for comparison.306// Unicode guarantees it is codePoint - value307result.add(codePoint - static_cast<UChar32>(u_getNumericValue(codePoint)));308}309}310}311312// Computes the restriction level of a string, according to UTS 39 section 5.2.313URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {314// Section 5.2 step 1:315if (!fAllowedCharsSet->containsAll(input)) {316return USPOOF_UNRESTRICTIVE;317}318319// Section 5.2 step 2320// Java use a static UnicodeSet for this test. In C++, avoid the static variable321// and just do a simple for loop.322UBool allASCII = true;323for (int32_t i=0, length=input.length(); i<length; i++) {324if (input.charAt(i) > 0x7f) {325allASCII = false;326break;327}328}329if (allASCII) {330return USPOOF_ASCII;331}332333// Section 5.2 steps 3:334ScriptSet resolvedScriptSet;335getResolvedScriptSet(input, resolvedScriptSet, status);336if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }337338// Section 5.2 step 4:339if (!resolvedScriptSet.isEmpty()) {340return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;341}342343// Section 5.2 step 5:344ScriptSet resolvedNoLatn;345getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);346if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }347348// Section 5.2 step 6:349if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)350|| resolvedNoLatn.test(USCRIPT_JAPANESE, status)351|| resolvedNoLatn.test(USCRIPT_KOREAN, status)) {352return USPOOF_HIGHLY_RESTRICTIVE;353}354355// Section 5.2 step 7:356if (!resolvedNoLatn.isEmpty()357&& !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)358&& !resolvedNoLatn.test(USCRIPT_GREEK, status)359&& !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {360return USPOOF_MODERATELY_RESTRICTIVE;361}362363// Section 5.2 step 8:364return USPOOF_MINIMALLY_RESTRICTIVE;365}366367int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {368bool sawLeadCharacter = false;369for (int32_t i=0; i<input.length();) {370UChar32 cp = input.char32At(i);371if (sawLeadCharacter && cp == 0x0307) {372return i;373}374uint8_t combiningClass = u_getCombiningClass(cp);375// Skip over characters except for those with combining class 0 (non-combining characters) or with376// combining class 230 (same class as U+0307)377U_ASSERT(u_getCombiningClass(0x0307) == 230);378if (combiningClass == 0 || combiningClass == 230) {379sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);380}381i += U16_LENGTH(cp);382}383return -1;384}385386static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {387return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' ||388u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);389}390391bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {392if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {393return true;394}395UnicodeString skelStr;396fSpoofData->confusableLookup(cp, skelStr);397UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));398if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {399return true;400}401return false;402}403404405406// Convert a text format hex number. Utility function used by builder code. Static.407// Input: char16_t *string text. Output: a UChar32408// Input has been pre-checked, and will have no non-hex chars.409// The number must fall in the code point range of 0..0x10ffff410// Static Function.411UChar32 SpoofImpl::ScanHex(const char16_t *s, int32_t start, int32_t limit, UErrorCode &status) {412if (U_FAILURE(status)) {413return 0;414}415U_ASSERT(limit-start > 0);416uint32_t val = 0;417int i;418for (i=start; i<limit; i++) {419int digitVal = s[i] - 0x30;420if (digitVal>9) {421digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'422}423if (digitVal>15) {424digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'425}426U_ASSERT(digitVal <= 0xf);427val <<= 4;428val += digitVal;429}430if (val > 0x10ffff) {431status = U_PARSE_ERROR;432val = 0;433}434return static_cast<UChar32>(val);435}436437438//-----------------------------------------439//440// class CheckResult Implementation441//442//-----------------------------------------443444CheckResult::CheckResult() {445clear();446}447448USpoofCheckResult* CheckResult::asUSpoofCheckResult() {449return exportForC();450}451452//453// Incoming parameter check on Status and the CheckResult object454// received from the C API.455//456const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {457return validate(ptr, status);458}459460CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {461return validate(ptr, status);462}463464void CheckResult::clear() {465fChecks = 0;466fNumerics.clear();467fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;468}469470int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {471if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {472return fChecks | fRestrictionLevel;473} else {474return fChecks;475}476}477478CheckResult::~CheckResult() {479}480481//----------------------------------------------------------------------------------------------482//483// class SpoofData Implementation484//485//----------------------------------------------------------------------------------------------486487488UBool SpoofData::validateDataVersion(UErrorCode &status) const {489if (U_FAILURE(status) ||490fRawData == nullptr ||491fRawData->fMagic != USPOOF_MAGIC ||492fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||493fRawData->fFormatVersion[1] != 0 ||494fRawData->fFormatVersion[2] != 0 ||495fRawData->fFormatVersion[3] != 0) {496status = U_INVALID_FORMAT_ERROR;497return false;498}499return true;500}501502static UBool U_CALLCONV503spoofDataIsAcceptable(void *context,504const char * /* type */, const char * /*name*/,505const UDataInfo *pInfo) {506if(507pInfo->size >= 20 &&508pInfo->isBigEndian == U_IS_BIG_ENDIAN &&509pInfo->charsetFamily == U_CHARSET_FAMILY &&510pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "511pInfo->dataFormat[1] == 0x66 &&512pInfo->dataFormat[2] == 0x75 &&513pInfo->dataFormat[3] == 0x20 &&514pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION515) {516UVersionInfo *version = static_cast<UVersionInfo *>(context);517if(version != nullptr) {518uprv_memcpy(version, pInfo->dataVersion, 4);519}520return true;521} else {522return false;523}524}525526// Methods for the loading of the default confusables data file. The confusable527// data is loaded only when it is needed.528//529// SpoofData::getDefault() - Return the default confusables data, and call the530// initOnce() if it is not available. Adds a reference531// to the SpoofData that the caller is responsible for532// decrementing when they are done with the data.533//534// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData535// is shared by all spoof checkers using the default data.536//537// uspoof_cleanupDefaultData - Called during cleanup.538//539540static UInitOnce gSpoofInitDefaultOnce {};541static SpoofData* gDefaultSpoofData;542543static UBool U_CALLCONV544uspoof_cleanupDefaultData() {545if (gDefaultSpoofData) {546// Will delete, assuming all user-level spoof checkers were closed.547gDefaultSpoofData->removeReference();548gDefaultSpoofData = nullptr;549gSpoofInitDefaultOnce.reset();550}551return true;552}553554static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {555UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",556spoofDataIsAcceptable,557nullptr, // context, would receive dataVersion if supplied.558&status);559if (U_FAILURE(status)) { return; }560gDefaultSpoofData = new SpoofData(udm, status);561if (U_FAILURE(status)) {562delete gDefaultSpoofData;563gDefaultSpoofData = nullptr;564return;565}566if (gDefaultSpoofData == nullptr) {567status = U_MEMORY_ALLOCATION_ERROR;568return;569}570ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);571}572573SpoofData* SpoofData::getDefault(UErrorCode& status) {574umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);575if (U_FAILURE(status)) { return nullptr; }576gDefaultSpoofData->addReference();577return gDefaultSpoofData;578}579580581582SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)583{584reset();585if (U_FAILURE(status)) {586return;587}588fUDM = udm;589// fRawData is non-const because it may be constructed by the data builder.590fRawData = reinterpret_cast<SpoofDataHeader *>(591const_cast<void *>(udata_getMemory(udm)));592validateDataVersion(status);593initPtrs(status);594}595596597SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)598{599reset();600if (U_FAILURE(status)) {601return;602}603if (static_cast<size_t>(length) < sizeof(SpoofDataHeader)) {604status = U_INVALID_FORMAT_ERROR;605return;606}607if (data == nullptr) {608status = U_ILLEGAL_ARGUMENT_ERROR;609return;610}611void *ncData = const_cast<void *>(data);612fRawData = static_cast<SpoofDataHeader *>(ncData);613if (length < fRawData->fLength) {614status = U_INVALID_FORMAT_ERROR;615return;616}617validateDataVersion(status);618initPtrs(status);619}620621622// Spoof Data constructor for use from data builder.623// Initializes a new, empty data area that will be populated later.624SpoofData::SpoofData(UErrorCode &status) {625reset();626if (U_FAILURE(status)) {627return;628}629fDataOwned = true;630631// The spoof header should already be sized to be a multiple of 16 bytes.632// Just in case it's not, round it up.633uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;634U_ASSERT(initialSize == sizeof(SpoofDataHeader));635636fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));637fMemLimit = initialSize;638if (fRawData == nullptr) {639status = U_MEMORY_ALLOCATION_ERROR;640return;641}642uprv_memset(fRawData, 0, initialSize);643644fRawData->fMagic = USPOOF_MAGIC;645fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;646fRawData->fFormatVersion[1] = 0;647fRawData->fFormatVersion[2] = 0;648fRawData->fFormatVersion[3] = 0;649initPtrs(status);650}651652// reset() - initialize all fields.653// Should be updated if any new fields are added.654// Called by constructors to put things in a known initial state.655void SpoofData::reset() {656fRawData = nullptr;657fDataOwned = false;658fUDM = nullptr;659fMemLimit = 0;660fRefCount = 1;661fCFUKeys = nullptr;662fCFUValues = nullptr;663fCFUStrings = nullptr;664}665666667// SpoofData::initPtrs()668// Initialize the pointers to the various sections of the raw data.669//670// This function is used both during the Trie building process (multiple671// times, as the individual data sections are added), and672// during the opening of a Spoof Checker from prebuilt data.673//674// The pointers for non-existent data sections (identified by an offset of 0)675// are set to nullptr.676//677// Note: During building the data, adding each new data section678// reallocs the raw data area, which likely relocates it, which679// in turn requires reinitializing all of the pointers into it, hence680// multiple calls to this function during building.681//682void SpoofData::initPtrs(UErrorCode &status) {683fCFUKeys = nullptr;684fCFUValues = nullptr;685fCFUStrings = nullptr;686if (U_FAILURE(status)) {687return;688}689if (fRawData->fCFUKeys != 0) {690fCFUKeys = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUKeys);691}692if (fRawData->fCFUStringIndex != 0) {693fCFUValues = reinterpret_cast<uint16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringIndex);694}695if (fRawData->fCFUStringTable != 0) {696fCFUStrings = reinterpret_cast<char16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringTable);697}698}699700701SpoofData::~SpoofData() {702if (fDataOwned) {703uprv_free(fRawData);704}705fRawData = nullptr;706if (fUDM != nullptr) {707udata_close(fUDM);708}709fUDM = nullptr;710}711712713void SpoofData::removeReference() {714if (umtx_atomic_dec(&fRefCount) == 0) {715delete this;716}717}718719720SpoofData *SpoofData::addReference() {721umtx_atomic_inc(&fRefCount);722return this;723}724725726void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {727if (U_FAILURE(status)) {728return nullptr;729}730if (!fDataOwned) {731UPRV_UNREACHABLE_EXIT;732}733734numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16735uint32_t returnOffset = fMemLimit;736fMemLimit += numBytes;737fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));738fRawData->fLength = fMemLimit;739uprv_memset((char *)fRawData + returnOffset, 0, numBytes);740initPtrs(status);741return reinterpret_cast<char*>(fRawData) + returnOffset;742}743744int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {745int32_t dataSize = fRawData->fLength;746if (capacity < dataSize) {747status = U_BUFFER_OVERFLOW_ERROR;748return dataSize;749}750uprv_memcpy(buf, fRawData, dataSize);751return dataSize;752}753754int32_t SpoofData::size() const {755return fRawData->fLength;756}757758//-------------------------------759//760// Front-end APIs for SpoofData761//762//-------------------------------763764int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {765// Perform a binary search.766// [lo, hi), i.e lo is inclusive, hi is exclusive.767// The result after the loop will be in lo.768int32_t lo = 0;769int32_t hi = length();770do {771int32_t mid = (lo + hi) / 2;772if (codePointAt(mid) > inChar) {773hi = mid;774} else if (codePointAt(mid) < inChar) {775lo = mid;776} else {777// Found result. Break early.778lo = mid;779break;780}781} while (hi - lo > 1);782783// Did we find an entry? If not, the char maps to itself.784if (codePointAt(lo) != inChar) {785dest.append(inChar);786return 1;787}788789// Add the element to the string builder and return.790return appendValueTo(lo, dest);791}792793int32_t SpoofData::length() const {794return fRawData->fCFUKeysSize;795}796797UChar32 SpoofData::codePointAt(int32_t index) const {798return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);799}800801int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {802int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);803804// Value is either a char (for strings of length 1) or805// an index into the string table (for longer strings)806uint16_t value = fCFUValues[index];807if (stringLength == 1) {808dest.append(static_cast<char16_t>(value));809} else {810dest.append(fCFUStrings + value, stringLength);811}812813return stringLength;814}815816817U_NAMESPACE_END818819U_NAMESPACE_USE820821//-----------------------------------------------------------------------------822//823// uspoof_swap - byte swap and char encoding swap of spoof data824//825//-----------------------------------------------------------------------------826U_CAPI int32_t U_EXPORT2827uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,828UErrorCode *status) {829830if (status == nullptr || U_FAILURE(*status)) {831return 0;832}833if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {834*status=U_ILLEGAL_ARGUMENT_ERROR;835return 0;836}837838//839// Check that the data header is for spoof data.840// (Header contents are defined in gencfu.cpp)841//842const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);843if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */844pInfo->dataFormat[1]==0x66 &&845pInfo->dataFormat[2]==0x75 &&846pInfo->dataFormat[3]==0x20 &&847pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&848pInfo->formatVersion[1]==0 &&849pInfo->formatVersion[2]==0 &&850pInfo->formatVersion[3]==0 )) {851udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "852"(format version %02x %02x %02x %02x) is not recognized\n",853pInfo->dataFormat[0], pInfo->dataFormat[1],854pInfo->dataFormat[2], pInfo->dataFormat[3],855pInfo->formatVersion[0], pInfo->formatVersion[1],856pInfo->formatVersion[2], pInfo->formatVersion[3]);857*status=U_UNSUPPORTED_ERROR;858return 0;859}860861//862// Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific863// header). This swap also conveniently gets us864// the size of the ICU d.h., which lets us locate the start865// of the uspoof specific data.866//867int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);868869870//871// Get the Spoof Data Header, and check that it appears to be OK.872//873//874const uint8_t *inBytes =(const uint8_t *)inData+headerSize;875SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;876if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||877ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))878{879udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");880*status=U_UNSUPPORTED_ERROR;881return 0;882}883884//885// Prefight operation? Just return the size886//887int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);888int32_t totalSize = headerSize + spoofDataLength;889if (length < 0) {890return totalSize;891}892893//894// Check that length passed in is consistent with length from Spoof data header.895//896if (length < totalSize) {897udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",898spoofDataLength);899*status=U_INDEX_OUTOFBOUNDS_ERROR;900return 0;901}902903904//905// Swap the Data. Do the data itself first, then the Spoof Data Header, because906// we need to reference the header to locate the data, and an907// inplace swap of the header leaves it unusable.908//909uint8_t *outBytes = (uint8_t *)outData + headerSize;910SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;911912int32_t sectionStart;913int32_t sectionLength;914915//916// If not swapping in place, zero out the output buffer before starting.917// Gaps may exist between the individual sections, and these must be zeroed in918// the output buffer. The simplest way to do that is to just zero the whole thing.919//920if (inBytes != outBytes) {921uprv_memset(outBytes, 0, spoofDataLength);922}923924// Confusables Keys Section (fCFUKeys)925sectionStart = ds->readUInt32(spoofDH->fCFUKeys);926sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;927ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);928929// String Index Section930sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);931sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;932ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);933934// String Table Section935sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);936sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;937ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);938939// And, last, swap the header itself.940// int32_t fMagic // swap this941// uint8_t fFormatVersion[4] // Do not swap this, just copy942// int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.943//944uint32_t magic = ds->readUInt32(spoofDH->fMagic);945ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);946947if (inBytes != outBytes) {948uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));949}950// swap starting at fLength951ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);952953return totalSize;954}955956#endif957958959960961