CoCalc -- uspoof

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/icu4c/i18n/uspoof_impl.cpp
21521 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2008-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*/
9

10
#include "unicode/utypes.h"
11
#include "unicode/uspoof.h"
12
#include "unicode/uchar.h"
13
#include "unicode/uniset.h"
14
#include "unicode/utf16.h"
15
#include "utrie2.h"
16
#include "cmemory.h"
17
#include "cstring.h"
18
#include "scriptset.h"
19
#include "umutex.h"
20
#include "udataswp.h"
21
#include "uassert.h"
22
#include "ucln_in.h"
23
#include "uspoof_impl.h"
24

25
#if !UCONFIG_NO_NORMALIZATION
26

27

28
U_NAMESPACE_BEGIN
29

30
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
31

32
// Builds a checker around caller-supplied confusable data.
// The pointer is stored as given; no addReference() call is made here.
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
    construct(status);   // establishes defaults, leaving fSpoofData == nullptr
    fSpoofData = data;
}
36

37
// Builds a checker that uses the shared default confusable data.
// Failures from construct() or from loading the data are reported via status.
SpoofImpl::SpoofImpl(UErrorCode& status) {
    construct(status);

    // TODO: Load the default data lazily, at the point of first use, rather
    // than eagerly in the constructor.  See #12696.
    fSpoofData = SpoofData::getDefault(status);
}
44

45
// Default constructor.  Uses a local error code, which is never inspected,
// so any failure during construction or data loading is not reported here.
SpoofImpl::SpoofImpl() {
    UErrorCode localStatus = U_ZERO_ERROR;
    construct(localStatus);

    // TODO: Load the default data lazily, at the point of first use, rather
    // than eagerly in the constructor.  See #12696.
    fSpoofData = SpoofData::getDefault(localStatus);
}
53

54
// Shared constructor helper: puts every field into its default state.
// The fields are assigned even when status already indicates failure; the
// allocations are only attempted when status is good on entry.
void SpoofImpl::construct(UErrorCode& status) {
    fChecks = USPOOF_ALL_CHECKS;
    fSpoofData = nullptr;
    fAllowedCharsSet = nullptr;
    fAllowedLocales = nullptr;
    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;

    if (U_FAILURE(status)) { return; }

    // Default: every code point is allowed, and no locales are listed.
    UnicodeSet *everything = new UnicodeSet(0, 0x10ffff);
    fAllowedCharsSet = everything;
    fAllowedLocales  = uprv_strdup("");
    if (everything != nullptr && fAllowedLocales != nullptr) {
        everything->freeze();
        return;
    }
    status = U_MEMORY_ALLOCATION_ERROR;
}
72

73

74
// Copy Constructor, used by the user level clone() function.
75
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
76
        fChecks(USPOOF_ALL_CHECKS), fSpoofData(nullptr), fAllowedCharsSet(nullptr) , 
77
        fAllowedLocales(nullptr) {
78
    if (U_FAILURE(status)) {
79
        return;
80
    }
81
    fChecks = src.fChecks;
82
    if (src.fSpoofData != nullptr) {
83
        fSpoofData = src.fSpoofData->addReference();
84
    }
85
    fAllowedCharsSet = src.fAllowedCharsSet->clone();
86
    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
87
    if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {
88
        status = U_MEMORY_ALLOCATION_ERROR;
89
    }
90
    fRestrictionLevel = src.fRestrictionLevel;
91
}
92

93
// Releases this checker's reference on the spoof data and frees the
// allowed-characters set and allowed-locales string.
SpoofImpl::~SpoofImpl() {
    uprv_free((void *)fAllowedLocales);
    delete fAllowedCharsSet;
    if (fSpoofData != nullptr) {
        // The data object deletes itself once its reference count reaches zero.
        fSpoofData->removeReference();
    }
}
100

101
//  Cast this instance as a USpoofChecker for the C API.
102
USpoofChecker *SpoofImpl::asUSpoofChecker() {
103
    return exportForC();
104
}
105

106
//
107
//  Incoming parameter check on Status and the SpoofChecker object
108
//    received from the C API.
109
//
110
const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
111
    const auto* This = validate(sc, status);
112
    if (U_FAILURE(status)) {
113
        return nullptr;
114
    }
115
    if (This->fSpoofData != nullptr && !This->fSpoofData->validateDataVersion(status)) {
116
        return nullptr;
117
    }
118
    return This;
119
}
120

121
// Non-const overload: forwards to the const overload and casts the result back.
SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    const USpoofChecker *constHandle = sc;
    const SpoofImpl *checked = SpoofImpl::validateThis(constHandle, status);
    return const_cast<SpoofImpl *>(checked);
}
125

126

127
void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
128
    UnicodeSet    allowedChars;
129
    UnicodeSet    *tmpSet = nullptr;
130
    const char    *locStart = localesList;
131
    const char    *locEnd = nullptr;
132
    const char    *localesListEnd = localesList + uprv_strlen(localesList);
133
    int32_t        localeListCount = 0;   // Number of locales provided by caller.
134

135
    // Loop runs once per locale from the localesList, a comma separated list of locales.
136
    do {
137
        locEnd = uprv_strchr(locStart, ',');
138
        if (locEnd == nullptr) {
139
            locEnd = localesListEnd;
140
        }
141
        while (*locStart == ' ') {
142
            locStart++;
143
        }
144
        const char *trimmedEnd = locEnd-1;
145
        while (trimmedEnd > locStart && *trimmedEnd == ' ') {
146
            trimmedEnd--;
147
        }
148
        if (trimmedEnd <= locStart) {
149
            break;
150
        }
151
        const char* locale = uprv_strndup(locStart, static_cast<int32_t>(trimmedEnd + 1 - locStart));
152
        localeListCount++;
153

154
        // We have one locale from the locales list.
155
        // Add the script chars for this locale to the accumulating set of allowed chars.
156
        // If the locale is no good, we will be notified back via status.
157
        addScriptChars(locale, &allowedChars, status);
158
        uprv_free((void *)locale);
159
        if (U_FAILURE(status)) {
160
            break;
161
        }
162
        locStart = locEnd + 1;
163
    } while (locStart < localesListEnd);
164

165
    // If our caller provided an empty list of locales, we disable the allowed characters checking
166
    if (localeListCount == 0) {
167
        uprv_free((void *)fAllowedLocales);
168
        fAllowedLocales = uprv_strdup("");
169
        tmpSet = new UnicodeSet(0, 0x10ffff);
170
        if (fAllowedLocales == nullptr || tmpSet == nullptr) {
171
            status = U_MEMORY_ALLOCATION_ERROR;
172
            return;
173
        } 
174
        tmpSet->freeze();
175
        delete fAllowedCharsSet;
176
        fAllowedCharsSet = tmpSet;
177
        fChecks &= ~USPOOF_CHAR_LIMIT;
178
        return;
179
    }
180

181
        
182
    // Add all common and inherited characters to the set of allowed chars.
183
    UnicodeSet tempSet;
184
    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
185
    allowedChars.addAll(tempSet);
186
    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
187
    allowedChars.addAll(tempSet);
188
    
189
    // If anything went wrong, we bail out without changing
190
    // the state of the spoof checker.
191
    if (U_FAILURE(status)) {
192
        return;
193
    }
194

195
    // Store the updated spoof checker state.
196
    tmpSet = allowedChars.clone();
197
    const char *tmpLocalesList = uprv_strdup(localesList);
198
    if (tmpSet == nullptr || tmpLocalesList == nullptr) {
199
        status = U_MEMORY_ALLOCATION_ERROR;
200
        return;
201
    }
202
    uprv_free((void *)fAllowedLocales);
203
    fAllowedLocales = tmpLocalesList;
204
    tmpSet->freeze();
205
    delete fAllowedCharsSet;
206
    fAllowedCharsSet = tmpSet;
207
    fChecks |= USPOOF_CHAR_LIMIT;
208
}
209

210

211
// Returns the stored allowed-locales string.  The status argument is unused.
const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
    const char *locales = fAllowedLocales;
    return locales;
}
214

215

216
// Given a locale (a language), add all the characters from all of the scripts used with that language
217
// to the allowedChars UnicodeSet
218

219
void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
220
    UScriptCode scripts[30];
221

222
    int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
223
    if (U_FAILURE(status)) {
224
        return;
225
    }
226
    if (status == U_USING_DEFAULT_WARNING) {
227
        status = U_ILLEGAL_ARGUMENT_ERROR;
228
        return;
229
    }
230
    UnicodeSet tmpSet;
231
    int32_t    i;
232
    for (i=0; i<numScripts; i++) {
233
        tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
234
        allowedChars->addAll(tmpSet);
235
    }
236
}
237

238
// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
239
void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
240
    result.resetAll();
241
    result.setScriptExtensions(codePoint, status);
242
    if (U_FAILURE(status)) { return; }
243

244
    // Section 5.1 step 1
245
    if (result.test(USCRIPT_HAN, status)) {
246
        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
247
        result.set(USCRIPT_JAPANESE, status);
248
        result.set(USCRIPT_KOREAN, status);
249
    }
250
    if (result.test(USCRIPT_HIRAGANA, status)) {
251
        result.set(USCRIPT_JAPANESE, status);
252
    }
253
    if (result.test(USCRIPT_KATAKANA, status)) {
254
        result.set(USCRIPT_JAPANESE, status);
255
    }
256
    if (result.test(USCRIPT_HANGUL, status)) {
257
        result.set(USCRIPT_KOREAN, status);
258
    }
259
    if (result.test(USCRIPT_BOPOMOFO, status)) {
260
        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
261
    }
262

263
    // Section 5.1 step 2
264
    if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
265
        result.setAll();
266
    }
267
}
268

269
// Computes the resolved script set for a string, according to UTS 39 section 5.1.
270
void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
271
    getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
272
}
273

274
// Computes the resolved script set for a string, omitting characters having the specified script.
275
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
276
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
277
    result.setAll();
278

279
    ScriptSet temp;
280
    UChar32 codePoint;
281
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
282
        codePoint = input.char32At(i);
283

284
        // Compute the augmented script set for the character
285
        getAugmentedScriptSet(codePoint, temp, status);
286
        if (U_FAILURE(status)) { return; }
287

288
        // Intersect the augmented script set with the resolved script set, but only if the character doesn't
289
        // have the script specified in the function call
290
        if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
291
            result.intersect(temp);
292
        }
293
    }
294
}
295

296
// Computes the set of numerics for a string, according to UTS 39 section 5.3.
297
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
298
    result.clear();
299

300
    UChar32 codePoint;
301
    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
302
        codePoint = input.char32At(i);
303

304
        // Store a representative character for each kind of decimal digit
305
        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
306
            // Store the zero character as a representative for comparison.
307
            // Unicode guarantees it is codePoint - value
308
            result.add(codePoint - static_cast<UChar32>(u_getNumericValue(codePoint)));
309
        }
310
    }
311
}
312

313
// Computes the restriction level of a string, according to UTS 39 section 5.2.
314
URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
315
    // Section 5.2 step 1:
316
    if (!fAllowedCharsSet->containsAll(input)) {
317
        return USPOOF_UNRESTRICTIVE;
318
    }
319

320
    // Section 5.2 step 2
321
    // Java use a static UnicodeSet for this test.  In C++, avoid the static variable
322
    // and just do a simple for loop.
323
    UBool allASCII = true;
324
    for (int32_t i=0, length=input.length(); i<length; i++) {
325
        if (input.charAt(i) > 0x7f) {
326
            allASCII = false;
327
            break;
328
        }
329
    }
330
    if (allASCII) {
331
        return USPOOF_ASCII;
332
    }
333

334
    // Section 5.2 steps 3:
335
    ScriptSet resolvedScriptSet;
336
    getResolvedScriptSet(input, resolvedScriptSet, status);
337
    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
338

339
    // Section 5.2 step 4:
340
    if (!resolvedScriptSet.isEmpty()) {
341
        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
342
    }
343

344
    // Section 5.2 step 5:
345
    ScriptSet resolvedNoLatn;
346
    getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
347
    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
348

349
    // Section 5.2 step 6:
350
    if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
351
            || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
352
            || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
353
        return USPOOF_HIGHLY_RESTRICTIVE;
354
    }
355

356
    // Section 5.2 step 7:
357
    if (!resolvedNoLatn.isEmpty()
358
            && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
359
            && !resolvedNoLatn.test(USCRIPT_GREEK, status)
360
            && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
361
        return USPOOF_MODERATELY_RESTRICTIVE;
362
    }
363

364
    // Section 5.2 step 8:
365
    return USPOOF_MINIMALLY_RESTRICTIVE;
366
}
367

368
// Returns the index of the first U+0307 that follows a character for which
// isIllegalCombiningDotLeadCharacter() is true, or -1 if there is none.
// Characters whose combining class is neither 0 nor 230 do not reset the
// "lead character seen" state.  The status argument is unused.
int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    bool leadSeen = false;
    int32_t pos = 0;
    while (pos < input.length()) {
        const UChar32 cp = input.char32At(pos);
        if (leadSeen && cp == 0x0307) {
            return pos;
        }
        const uint8_t ccc = u_getCombiningClass(cp);
        // Only characters with combining class 0 (non-combining characters) or
        // combining class 230 (same class as U+0307) update the state.
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        if (ccc == 0 || ccc == 230) {
            leadSeen = isIllegalCombiningDotLeadCharacter(cp);
        }
        pos += U16_LENGTH(cp);
    }
    return -1;
}
386

387
// True for the listed letters and for any code point with the
// Soft_Dotted binary property.  Does not consult the confusables data.
static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    switch (cp) {
    case u'i':
    case u'j':
    case u'ı':
    case u'ȷ':
    case u'l':
        return true;
    default:
        return u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
    }
}
391

392
// True if cp itself passes the no-lookup test, or if the last code point of
// its confusable lookup result differs from cp and passes that test.
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }
    UnicodeString skeleton;
    fSpoofData->confusableLookup(cp, skeleton);
    // Index of the final code point in the lookup result.
    const int32_t lastIndex = skeleton.moveIndex32(skeleton.length(), -1);
    const UChar32 lastCp = skeleton.char32At(lastIndex);
    return lastCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(lastCp);
}
404

405

406

407
// Convert a text format hex number.  Utility function used by builder code.  Static.
408
// Input: char16_t *string text.  Output: a UChar32
409
// Input has been pre-checked, and will have no non-hex chars.
410
// The number must fall in the code point range of 0..0x10ffff
411
// Static Function.
412
UChar32 SpoofImpl::ScanHex(const char16_t *s, int32_t start, int32_t limit, UErrorCode &status) {
413
    if (U_FAILURE(status)) {
414
        return 0;
415
    }
416
    U_ASSERT(limit-start > 0);
417
    uint32_t val = 0;
418
    int i;
419
    for (i=start; i<limit; i++) {
420
        int digitVal = s[i] - 0x30;
421
        if (digitVal>9) {
422
            digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
423
        }
424
        if (digitVal>15) {
425
            digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
426
        }
427
        U_ASSERT(digitVal <= 0xf);
428
        val <<= 4;
429
        val += digitVal;
430
    }
431
    if (val > 0x10ffff) {
432
        status = U_PARSE_ERROR;
433
        val = 0;
434
    }
435
    return static_cast<UChar32>(val);
436
}
437

438

439
//-----------------------------------------
440
//
441
//   class CheckResult Implementation
442
//
443
//-----------------------------------------
444

445
// A new CheckResult starts in the cleared state.
CheckResult::CheckResult() {
    this->clear();
}
448

449
// Returns this instance as a USpoofCheckResult handle for the C API.
USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    USpoofCheckResult *handle = exportForC();
    return handle;
}
452

453
//
454
//  Incoming parameter check on Status and the CheckResult object
455
//    received from the C API.
456
//
457
const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
458
    return validate(ptr, status);
459
}
460

461
// Non-const overload of the incoming-parameter check; forwards to validate().
CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    CheckResult *checked = validate(ptr, status);
    return checked;
}
464

465
void CheckResult::clear() {
466
    fChecks = 0;
467
    fNumerics.clear();
468
    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
469
}
470

471
// Returns fChecks, with the restriction level OR-ed in when the caller
// enabled USPOOF_AUX_INFO and a restriction level has been recorded.
int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    const bool auxInfoEnabled = (enabledChecks & USPOOF_AUX_INFO) != 0;
    const bool levelKnown = fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE;
    if (!(auxInfoEnabled && levelKnown)) {
        return fChecks;
    }
    return fChecks | fRestrictionLevel;
}
478

479
// Nothing to release beyond what the members' own destructors handle.
CheckResult::~CheckResult() = default;
481

482
//----------------------------------------------------------------------------------------------
483
//
484
//   class SpoofData Implementation
485
//
486
//----------------------------------------------------------------------------------------------
487

488

489
// Returns true when status is good on entry and the raw data is present,
// carries the spoof magic number, and has the expected format version.
// Otherwise sets status to U_INVALID_FORMAT_ERROR and returns false.
// Note that an incoming failure code is replaced by U_INVALID_FORMAT_ERROR.
UBool SpoofData::validateDataVersion(UErrorCode &status) const {
    const bool headerOk =
        !U_FAILURE(status) &&
        fRawData != nullptr &&
        fRawData->fMagic == USPOOF_MAGIC &&
        fRawData->fFormatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
        fRawData->fFormatVersion[1] == 0 &&
        fRawData->fFormatVersion[2] == 0 &&
        fRawData->fFormatVersion[3] == 0;
    if (headerOk) {
        return true;
    }
    status = U_INVALID_FORMAT_ERROR;
    return false;
}
502

503
// Acceptance callback passed to udata_openChoice().  Accepts data whose
// UDataInfo matches this platform's endianness and charset family, has
// dataFormat "Cfu ", and has the expected major format version.
// If context is non-null it is treated as a UVersionInfo* and receives the
// four dataVersion bytes.
static UBool U_CALLCONV
spoofDataIsAcceptable(void *context,
                        const char * /* type */, const char * /*name*/,
                        const UDataInfo *pInfo) {
    const bool acceptable =
        pInfo->size >= 20 &&
        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily == U_CHARSET_FAMILY &&
        pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
        pInfo->dataFormat[1] == 0x66 &&
        pInfo->dataFormat[2] == 0x75 &&
        pInfo->dataFormat[3] == 0x20 &&
        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    if (!acceptable) {
        return false;
    }

    UVersionInfo *version = static_cast<UVersionInfo *>(context);
    if (version != nullptr) {
        uprv_memcpy(version, pInfo->dataVersion, 4);
    }
    return true;
}
526

527
//  Methods for the loading of the default confusables data file.  The confusable
528
//  data is loaded only when it is needed.
529
//
530
//  SpoofData::getDefault() - Return the default confusables data, and call the
531
//                            initOnce() if it is not available.  Adds a reference
532
//                            to the SpoofData that the caller is responsible for
533
//                            decrementing when they are done with the data.
534
//
535
//  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
536
//                           is shared by all spoof checkers using the default data.
537
//
538
//  uspoof_cleanupDefaultData - Called during cleanup.
539
//
540

541
static UInitOnce gSpoofInitDefaultOnce {};
542
static SpoofData* gDefaultSpoofData;
543

544
// Cleanup callback: drops the global reference on the default spoof data and
// re-arms the init-once guard.  Always returns true.
static UBool U_CALLCONV
uspoof_cleanupDefaultData() {
    if (gDefaultSpoofData == nullptr) {
        return true;
    }
    // Will delete, assuming all user-level spoof checkers were closed.
    gDefaultSpoofData->removeReference();
    gDefaultSpoofData = nullptr;
    gSpoofInitDefaultOnce.reset();
    return true;
}
554

555
// Loads the default confusables data ("confusables.cfu") into
// gDefaultSpoofData and registers the cleanup callback.
// On failure, status is set and gDefaultSpoofData is left null.
static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
                                        spoofDataIsAcceptable,
                                        nullptr,       // context, would receive dataVersion if supplied.
                                        &status);
    if (U_FAILURE(status)) { return; }
    gDefaultSpoofData = new SpoofData(udm, status);
    if (gDefaultSpoofData == nullptr) {
        // No SpoofData took ownership of the data memory, so close it here.
        udata_close(udm);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    if (U_FAILURE(status)) {
        // The SpoofData destructor closes udm.
        delete gDefaultSpoofData;
        gDefaultSpoofData = nullptr;
        return;
    }
    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
}
573

574
// Returns the shared default spoof data with one reference added for the
// caller, loading it on first use.  Returns nullptr if status is a failure.
SpoofData* SpoofData::getDefault(UErrorCode& status) {
    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    return U_FAILURE(status) ? nullptr : gDefaultSpoofData->addReference();
}
580

581

582

583
// Wraps data opened through udata.  When status is good on entry, this object
// records udm, and its destructor later closes it.
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fUDM = udm;
    // fRawData is non-const because it may be constructed by the data builder.
    const void *memory = udata_getMemory(udm);
    fRawData = reinterpret_cast<SpoofDataHeader *>(const_cast<void *>(memory));
    validateDataVersion(status);
    initPtrs(status);
}
596

597

598
// Wraps caller-provided serialized spoof data; the data is not copied and is
// not owned by this object.
//   data    pointer to the serialized data, starting with a SpoofDataHeader
//   length  number of bytes available at data
//   status  U_INVALID_FORMAT_ERROR if length is negative, shorter than a
//           header, shorter than the length recorded in the header, or if
//           the header fails validation; U_ILLEGAL_ARGUMENT_ERROR if data
//           is null.
SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    // Reject negative lengths explicitly: cast to size_t they would become
    // huge values and slip past the header-size comparison.
    if (length < 0 || static_cast<size_t>(length) < sizeof(SpoofDataHeader)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    if (data == nullptr) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    void *ncData = const_cast<void *>(data);
    fRawData = static_cast<SpoofDataHeader *>(ncData);
    if (length < fRawData->fLength) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    validateDataVersion(status);
    initPtrs(status);
}
621

622

623
// Spoof Data constructor for use from data builder.
624
//   Initializes a new, empty data area that will be populated later.
625
SpoofData::SpoofData(UErrorCode &status) {
626
    reset();
627
    if (U_FAILURE(status)) {
628
        return;
629
    }
630
    fDataOwned = true;
631

632
    // The spoof header should already be sized to be a multiple of 16 bytes.
633
    // Just in case it's not, round it up.
634
    uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
635
    U_ASSERT(initialSize == sizeof(SpoofDataHeader));
636
    
637
    fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
638
    fMemLimit = initialSize;
639
    if (fRawData == nullptr) {
640
        status = U_MEMORY_ALLOCATION_ERROR;
641
        return;
642
    }
643
    uprv_memset(fRawData, 0, initialSize);
644

645
    fRawData->fMagic = USPOOF_MAGIC;
646
    fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
647
    fRawData->fFormatVersion[1] = 0;
648
    fRawData->fFormatVersion[2] = 0;
649
    fRawData->fFormatVersion[3] = 0;
650
    initPtrs(status);
651
}
652

653
// reset() - initialize all fields.
654
//           Should be updated if any new fields are added.
655
//           Called by constructors to put things in a known initial state.
656
void SpoofData::reset() {
657
   fRawData = nullptr;
658
   fDataOwned = false;
659
   fUDM      = nullptr;
660
   fMemLimit = 0;
661
   fRefCount = 1;
662
   fCFUKeys = nullptr;
663
   fCFUValues = nullptr;
664
   fCFUStrings = nullptr;
665
}
666

667

668
//  SpoofData::initPtrs()
669
//            Initialize the pointers to the various sections of the raw data.
670
//
671
//            This function is used both during the Trie building process (multiple
672
//            times, as the individual data sections are added), and
673
//            during the opening of a Spoof Checker from prebuilt data.
674
//
675
//            The pointers for non-existent data sections (identified by an offset of 0)
676
//            are set to nullptr.
677
//
678
//            Note:  During building the data, adding each new data section
679
//            reallocs the raw data area, which likely relocates it, which
680
//            in turn requires reinitializing all of the pointers into it, hence
681
//            multiple calls to this function during building.
682
//
683
void SpoofData::initPtrs(UErrorCode &status) {
684
    fCFUKeys = nullptr;
685
    fCFUValues = nullptr;
686
    fCFUStrings = nullptr;
687
    if (U_FAILURE(status)) {
688
        return;
689
    }
690
    if (fRawData->fCFUKeys != 0) {
691
        fCFUKeys = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUKeys);
692
    }
693
    if (fRawData->fCFUStringIndex != 0) {
694
        fCFUValues = reinterpret_cast<uint16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringIndex);
695
    }
696
    if (fRawData->fCFUStringTable != 0) {
697
        fCFUStrings = reinterpret_cast<char16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringTable);
698
    }
699
}
700

701

702
// Frees the raw data if this object owns it, and closes the UDataMemory if
// one was recorded.
SpoofData::~SpoofData() {
    if (fDataOwned) {
        uprv_free(fRawData);
    }
    fRawData = nullptr;

    if (fUDM != nullptr) {
        udata_close(fUDM);
        fUDM = nullptr;
    }
}
712

713

714
void SpoofData::removeReference() {
715
    if (umtx_atomic_dec(&fRefCount) == 0) {
716
        delete this;
717
    }
718
}
719

720

721
// Adds one reference and returns this object, so the call can be used
// directly in an assignment.
SpoofData *SpoofData::addReference() {
    umtx_atomic_inc(&fRefCount);
    SpoofData *self = this;
    return self;
}
725

726

727
// Grows the owned raw data area by numBytes (rounded up to a multiple of 16),
// zero-fills the new space, refreshes the section pointers, and returns a
// pointer to the start of the new space.
//   numBytes  number of bytes requested; must not be negative
//   status    U_ILLEGAL_ARGUMENT_ERROR for an unusable numBytes,
//             U_MEMORY_ALLOCATION_ERROR if the area cannot be grown.
// Returns nullptr on failure, in which case the existing data area and its
// recorded size are left untouched.
// Only valid on data owned by this object (builder use).
void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    if (!fDataOwned) {
        UPRV_UNREACHABLE_EXIT;
    }
    if (numBytes < 0 || numBytes > INT32_MAX - 15) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    const uint32_t returnOffset = fMemLimit;
    const uint32_t newLimit = fMemLimit + numBytes;
    // Keep the old pointer until the realloc is known to have succeeded;
    // on failure the original block is still valid and still owned.
    SpoofDataHeader *grown = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, newLimit));
    if (grown == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    fRawData = grown;
    fMemLimit = newLimit;
    fRawData->fLength = fMemLimit;
    uprv_memset(reinterpret_cast<char *>(fRawData) + returnOffset, 0, numBytes);
    initPtrs(status);   // the block may have moved
    return reinterpret_cast<char*>(fRawData) + returnOffset;
}
744

745
// Copies the raw data into buf when capacity is large enough; otherwise sets
// status to U_BUFFER_OVERFLOW_ERROR and copies nothing.
// Returns the size of the data in bytes in both cases.
int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    const int32_t bytesNeeded = fRawData->fLength;
    if (capacity >= bytesNeeded) {
        uprv_memcpy(buf, fRawData, bytesNeeded);
    } else {
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    return bytesNeeded;
}
754

755
// Returns the length recorded in the raw data header.
int32_t SpoofData::size() const {
    const int32_t totalBytes = fRawData->fLength;
    return totalBytes;
}
758

759
//-------------------------------
760
//
761
// Front-end APIs for SpoofData
762
//
763
//-------------------------------
764

765
// Appends the confusable mapping of inChar to dest and returns the number of
// code units appended.  A character with no entry in the key table, or any
// character when the key table is empty or absent, maps to itself.
int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    // With no keys there is nothing to search; probing index 0 would read
    // outside the table.
    if (fCFUKeys == nullptr || length() <= 0) {
        dest.append(inChar);
        return 1;
    }

    // Perform a binary search.
    // [lo, hi), i.e lo is inclusive, hi is exclusive.
    // The result after the loop will be in lo.
    int32_t lo = 0;
    int32_t hi = length();
    do {
        const int32_t mid = (lo + hi) / 2;
        const UChar32 midChar = codePointAt(mid);
        if (midChar > inChar) {
            hi = mid;
        } else if (midChar < inChar) {
            lo = mid;
        } else {
            // Found result.  Break early.
            lo = mid;
            break;
        }
    } while (hi - lo > 1);

    // Did we find an entry?  If not, the char maps to itself.
    if (codePointAt(lo) != inChar) {
        dest.append(inChar);
        return 1;
    }

    // Add the element to the string builder and return.
    return appendValueTo(lo, dest);
}
793

794
// Returns the key count recorded in the header (fCFUKeysSize).
int32_t SpoofData::length() const {
    const int32_t keyCount = fRawData->fCFUKeysSize;
    return keyCount;
}
797

798
// Returns the code point encoded in the key at the given index.
UChar32 SpoofData::codePointAt(int32_t index) const {
    const int32_t key = fCFUKeys[index];
    return ConfusableDataUtils::keyToCodePoint(key);
}
801

802
// Appends the mapping stored for the key at index to dest and returns the
// mapping's length as encoded in that key.
int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    const int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);

    // For a length of 1 the value is the character itself; for longer
    // strings it is an index into the string table.
    const uint16_t value = fCFUValues[index];
    if (stringLength != 1) {
        dest.append(fCFUStrings + value, stringLength);
    } else {
        dest.append(static_cast<char16_t>(value));
    }

    return stringLength;
}
816

817

818
U_NAMESPACE_END
819

820
U_NAMESPACE_USE
821

822
//-----------------------------------------------------------------------------
823
//
824
//  uspoof_swap   -  byte swap and char encoding swap of spoof data
825
//
826
//-----------------------------------------------------------------------------
827
U_CAPI int32_t U_EXPORT2
828
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
829
           UErrorCode *status) {
830

831
    if (status == nullptr || U_FAILURE(*status)) {
832
        return 0;
833
    }
834
    if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {
835
        *status=U_ILLEGAL_ARGUMENT_ERROR;
836
        return 0;
837
    }
838

839
    //
840
    //  Check that the data header is for spoof data.
841
    //    (Header contents are defined in gencfu.cpp)
842
    //
843
    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
844
    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
845
           pInfo->dataFormat[1]==0x66 &&
846
           pInfo->dataFormat[2]==0x75 &&
847
           pInfo->dataFormat[3]==0x20 &&
848
           pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
849
           pInfo->formatVersion[1]==0 &&
850
           pInfo->formatVersion[2]==0 &&
851
           pInfo->formatVersion[3]==0  )) {
852
        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
853
                             "(format version %02x %02x %02x %02x) is not recognized\n",
854
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
855
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
856
                         pInfo->formatVersion[0], pInfo->formatVersion[1],
857
                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
858
        *status=U_UNSUPPORTED_ERROR;
859
        return 0;
860
    }
861

862
    //
863
    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
864
    //                         header).  This swap also conveniently gets us
865
    //                         the size of the ICU d.h., which lets us locate the start
866
    //                         of the uspoof specific data.
867
    //
868
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
869

870

871
    //
872
    // Get the Spoof Data Header, and check that it appears to be OK.
873
    //
874
    //
875
    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
876
    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
877
    if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
878
        ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader)) 
879
    {
880
        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
881
        *status=U_UNSUPPORTED_ERROR;
882
        return 0;
883
    }
884

885
    //
886
    // Prefight operation?  Just return the size
887
    //
888
    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
889
    int32_t totalSize = headerSize + spoofDataLength;
890
    if (length < 0) {
891
        return totalSize;
892
    }
893

894
    //
895
    // Check that length passed in is consistent with length from Spoof data header.
896
    //
897
    if (length < totalSize) {
898
        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
899
                            spoofDataLength);
900
        *status=U_INDEX_OUTOFBOUNDS_ERROR;
901
        return 0;
902
        }
903

904

905
    //
906
    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
907
    //                 we need to reference the header to locate the data, and an
908
    //                 inplace swap of the header leaves it unusable.
909
    //
910
    uint8_t          *outBytes = (uint8_t *)outData + headerSize;
911
    SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;
912

913
    int32_t   sectionStart;
914
    int32_t   sectionLength;
915

916
    //
917
    // If not swapping in place, zero out the output buffer before starting.
918
    //    Gaps may exist between the individual sections, and these must be zeroed in
919
    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
920
    //
921
    if (inBytes != outBytes) {
922
        uprv_memset(outBytes, 0, spoofDataLength);
923
    }
924

925
    // Confusables Keys Section   (fCFUKeys)
926
    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
927
    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
928
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
929

930
    // String Index Section
931
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
932
    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
933
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
934

935
    // String Table Section
936
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
937
    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
938
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
939

940
    // And, last, swap the header itself.
941
    //   int32_t   fMagic             // swap this
942
    //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
943
    //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
944
    //
945
    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
946
    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
947

948
    if (inBytes != outBytes) {
949
        uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
950
    }
951
    // swap starting at fLength
952
    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
953

954
    return totalSize;
955
}
956

957
#endif
958

959

960

961
Product

Resources

Company