Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/icu4c/i18n/uspoof_impl.cpp
9912 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
* Copyright (C) 2008-2016, International Business Machines
6
* Corporation and others. All Rights Reserved.
7
**********************************************************************
8
*/
9
10
#include "unicode/utypes.h"
11
#include "unicode/uspoof.h"
12
#include "unicode/uchar.h"
13
#include "unicode/uniset.h"
14
#include "unicode/utf16.h"
15
#include "utrie2.h"
16
#include "cmemory.h"
17
#include "cstring.h"
18
#include "scriptset.h"
19
#include "umutex.h"
20
#include "udataswp.h"
21
#include "uassert.h"
22
#include "ucln_in.h"
23
#include "uspoof_impl.h"
24
25
#if !UCONFIG_NO_NORMALIZATION
26
27
28
U_NAMESPACE_BEGIN
29
30
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
31
32
// Builds a checker around caller-supplied confusable data.
//   data    stored directly in fSpoofData; no reference is added here.
//   status  in/out error code, forwarded to construct().
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
    // construct() resets fSpoofData to nullptr, so the assignment must follow it.
    construct(status);
    this->fSpoofData = data;
}
36
37
// Builds a checker that uses the default confusable data.
//   status  in/out error code, shared by construct() and the data load.
SpoofImpl::SpoofImpl(UErrorCode& status) {
    construct(status);

    // TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    SpoofData *defaultData = SpoofData::getDefault(status);
    fSpoofData = defaultData;
}
44
45
// Default constructor. Errors cannot be reported to the caller; they are
// collected in a local error code that is discarded on return.
SpoofImpl::SpoofImpl() {
    UErrorCode localStatus = U_ZERO_ERROR;
    construct(localStatus);

    // TODO: Call this method where it is actually needed, instead of in the
    // constructor, to allow for lazy data loading.  See #12696.
    SpoofData *defaultData = SpoofData::getDefault(localStatus);
    fSpoofData = defaultData;
}
53
54
// Shared initialization used by the constructors.
// Puts every field into its default state, then (if status is still OK)
// allocates a frozen "all code points" allowed set and an empty locale list.
//   status  in/out error code; set to U_MEMORY_ALLOCATION_ERROR when either
//           allocation fails.
void SpoofImpl::construct(UErrorCode& status) {
    fChecks = USPOOF_ALL_CHECKS;
    fSpoofData = nullptr;
    fAllowedCharsSet = nullptr;
    fAllowedLocales = nullptr;
    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;

    if (U_FAILURE(status)) { return; }

    // Keep a non-const handle so the set can be frozen after the null check.
    UnicodeSet *everyCodePoint = new UnicodeSet(0, 0x10ffff);
    fAllowedCharsSet = everyCodePoint;
    fAllowedLocales = uprv_strdup("");
    if (fAllowedCharsSet != nullptr && fAllowedLocales != nullptr) {
        everyCodePoint->freeze();
    } else {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
72
73
74
// Copy Constructor, used by the user level clone() function.
75
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
76
fChecks(USPOOF_ALL_CHECKS), fSpoofData(nullptr), fAllowedCharsSet(nullptr) ,
77
fAllowedLocales(nullptr) {
78
if (U_FAILURE(status)) {
79
return;
80
}
81
fChecks = src.fChecks;
82
if (src.fSpoofData != nullptr) {
83
fSpoofData = src.fSpoofData->addReference();
84
}
85
fAllowedCharsSet = src.fAllowedCharsSet->clone();
86
fAllowedLocales = uprv_strdup(src.fAllowedLocales);
87
if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {
88
status = U_MEMORY_ALLOCATION_ERROR;
89
}
90
fRestrictionLevel = src.fRestrictionLevel;
91
}
92
93
// Releases this checker's reference on the shared data and frees the
// allowed-character set and locale list it owns.
SpoofImpl::~SpoofImpl() {
    uprv_free(const_cast<char *>(fAllowedLocales));
    delete fAllowedCharsSet;
    if (fSpoofData != nullptr) {
        // Will delete if refCount goes to zero.
        fSpoofData->removeReference();
    }
}
100
101
// Cast this instance as a USpoofChecker for the C API.
102
USpoofChecker *SpoofImpl::asUSpoofChecker() {
103
return exportForC();
104
}
105
106
//
107
// Incoming parameter check on Status and the SpoofChecker object
108
// received from the C API.
109
//
110
const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
111
const auto* This = validate(sc, status);
112
if (U_FAILURE(status)) {
113
return nullptr;
114
}
115
if (This->fSpoofData != nullptr && !This->fSpoofData->validateDataVersion(status)) {
116
return nullptr;
117
}
118
return This;
119
}
120
121
// Non-const overload: delegates to the const overload and strips the
// constness from its result.
SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    const USpoofChecker *constHandle = const_cast<const USpoofChecker *>(sc);
    const SpoofImpl *constImpl = SpoofImpl::validateThis(constHandle, status);
    return const_cast<SpoofImpl *>(constImpl);
}
125
126
127
void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
128
UnicodeSet allowedChars;
129
UnicodeSet *tmpSet = nullptr;
130
const char *locStart = localesList;
131
const char *locEnd = nullptr;
132
const char *localesListEnd = localesList + uprv_strlen(localesList);
133
int32_t localeListCount = 0; // Number of locales provided by caller.
134
135
// Loop runs once per locale from the localesList, a comma separated list of locales.
136
do {
137
locEnd = uprv_strchr(locStart, ',');
138
if (locEnd == nullptr) {
139
locEnd = localesListEnd;
140
}
141
while (*locStart == ' ') {
142
locStart++;
143
}
144
const char *trimmedEnd = locEnd-1;
145
while (trimmedEnd > locStart && *trimmedEnd == ' ') {
146
trimmedEnd--;
147
}
148
if (trimmedEnd <= locStart) {
149
break;
150
}
151
const char* locale = uprv_strndup(locStart, static_cast<int32_t>(trimmedEnd + 1 - locStart));
152
localeListCount++;
153
154
// We have one locale from the locales list.
155
// Add the script chars for this locale to the accumulating set of allowed chars.
156
// If the locale is no good, we will be notified back via status.
157
addScriptChars(locale, &allowedChars, status);
158
uprv_free((void *)locale);
159
if (U_FAILURE(status)) {
160
break;
161
}
162
locStart = locEnd + 1;
163
} while (locStart < localesListEnd);
164
165
// If our caller provided an empty list of locales, we disable the allowed characters checking
166
if (localeListCount == 0) {
167
uprv_free((void *)fAllowedLocales);
168
fAllowedLocales = uprv_strdup("");
169
tmpSet = new UnicodeSet(0, 0x10ffff);
170
if (fAllowedLocales == nullptr || tmpSet == nullptr) {
171
status = U_MEMORY_ALLOCATION_ERROR;
172
return;
173
}
174
tmpSet->freeze();
175
delete fAllowedCharsSet;
176
fAllowedCharsSet = tmpSet;
177
fChecks &= ~USPOOF_CHAR_LIMIT;
178
return;
179
}
180
181
182
// Add all common and inherited characters to the set of allowed chars.
183
UnicodeSet tempSet;
184
tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
185
allowedChars.addAll(tempSet);
186
tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
187
allowedChars.addAll(tempSet);
188
189
// If anything went wrong, we bail out without changing
190
// the state of the spoof checker.
191
if (U_FAILURE(status)) {
192
return;
193
}
194
195
// Store the updated spoof checker state.
196
tmpSet = allowedChars.clone();
197
const char *tmpLocalesList = uprv_strdup(localesList);
198
if (tmpSet == nullptr || tmpLocalesList == nullptr) {
199
status = U_MEMORY_ALLOCATION_ERROR;
200
return;
201
}
202
uprv_free((void *)fAllowedLocales);
203
fAllowedLocales = tmpLocalesList;
204
tmpSet->freeze();
205
delete fAllowedCharsSet;
206
fAllowedCharsSet = tmpSet;
207
fChecks |= USPOOF_CHAR_LIMIT;
208
}
209
210
211
// Returns the stored locale list string. The pointer refers to memory owned
// by this checker; the error code parameter is unused.
const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
    const char *currentLocales = fAllowedLocales;
    return currentLocales;
}
214
215
216
// Given a locale (a language), add all the characters from all of the scripts used with that language
217
// to the allowedChars UnicodeSet
218
219
void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
220
UScriptCode scripts[30];
221
222
int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
223
if (U_FAILURE(status)) {
224
return;
225
}
226
if (status == U_USING_DEFAULT_WARNING) {
227
status = U_ILLEGAL_ARGUMENT_ERROR;
228
return;
229
}
230
UnicodeSet tmpSet;
231
int32_t i;
232
for (i=0; i<numScripts; i++) {
233
tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
234
allowedChars->addAll(tmpSet);
235
}
236
}
237
238
// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
239
void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
240
result.resetAll();
241
result.setScriptExtensions(codePoint, status);
242
if (U_FAILURE(status)) { return; }
243
244
// Section 5.1 step 1
245
if (result.test(USCRIPT_HAN, status)) {
246
result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
247
result.set(USCRIPT_JAPANESE, status);
248
result.set(USCRIPT_KOREAN, status);
249
}
250
if (result.test(USCRIPT_HIRAGANA, status)) {
251
result.set(USCRIPT_JAPANESE, status);
252
}
253
if (result.test(USCRIPT_KATAKANA, status)) {
254
result.set(USCRIPT_JAPANESE, status);
255
}
256
if (result.test(USCRIPT_HANGUL, status)) {
257
result.set(USCRIPT_KOREAN, status);
258
}
259
if (result.test(USCRIPT_BOPOMOFO, status)) {
260
result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
261
}
262
263
// Section 5.1 step 2
264
if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
265
result.setAll();
266
}
267
}
268
269
// Computes the resolved script set for a string, according to UTS 39 section 5.1.
270
void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
271
getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
272
}
273
274
// Computes the resolved script set for a string, omitting characters having the specified script.
275
// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
276
void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
277
result.setAll();
278
279
ScriptSet temp;
280
UChar32 codePoint;
281
for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
282
codePoint = input.char32At(i);
283
284
// Compute the augmented script set for the character
285
getAugmentedScriptSet(codePoint, temp, status);
286
if (U_FAILURE(status)) { return; }
287
288
// Intersect the augmented script set with the resolved script set, but only if the character doesn't
289
// have the script specified in the function call
290
if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
291
result.intersect(temp);
292
}
293
}
294
}
295
296
// Computes the set of numerics for a string, according to UTS 39 section 5.3.
297
void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
298
result.clear();
299
300
UChar32 codePoint;
301
for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
302
codePoint = input.char32At(i);
303
304
// Store a representative character for each kind of decimal digit
305
if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
306
// Store the zero character as a representative for comparison.
307
// Unicode guarantees it is codePoint - value
308
result.add(codePoint - static_cast<UChar32>(u_getNumericValue(codePoint)));
309
}
310
}
311
}
312
313
// Computes the restriction level of a string, according to UTS 39 section 5.2.
314
URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
315
// Section 5.2 step 1:
316
if (!fAllowedCharsSet->containsAll(input)) {
317
return USPOOF_UNRESTRICTIVE;
318
}
319
320
// Section 5.2 step 2
321
// Java use a static UnicodeSet for this test. In C++, avoid the static variable
322
// and just do a simple for loop.
323
UBool allASCII = true;
324
for (int32_t i=0, length=input.length(); i<length; i++) {
325
if (input.charAt(i) > 0x7f) {
326
allASCII = false;
327
break;
328
}
329
}
330
if (allASCII) {
331
return USPOOF_ASCII;
332
}
333
334
// Section 5.2 steps 3:
335
ScriptSet resolvedScriptSet;
336
getResolvedScriptSet(input, resolvedScriptSet, status);
337
if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
338
339
// Section 5.2 step 4:
340
if (!resolvedScriptSet.isEmpty()) {
341
return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
342
}
343
344
// Section 5.2 step 5:
345
ScriptSet resolvedNoLatn;
346
getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
347
if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
348
349
// Section 5.2 step 6:
350
if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
351
|| resolvedNoLatn.test(USCRIPT_JAPANESE, status)
352
|| resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
353
return USPOOF_HIGHLY_RESTRICTIVE;
354
}
355
356
// Section 5.2 step 7:
357
if (!resolvedNoLatn.isEmpty()
358
&& !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
359
&& !resolvedNoLatn.test(USCRIPT_GREEK, status)
360
&& !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
361
return USPOOF_MODERATELY_RESTRICTIVE;
362
}
363
364
// Section 5.2 step 8:
365
return USPOOF_MINIMALLY_RESTRICTIVE;
366
}
367
368
// Searches for a U+0307 that follows a character flagged by
// isIllegalCombiningDotLeadCharacter().
//   input  the string to scan, one code point at a time.
// Returns the UTF-16 index of the first such U+0307, or -1 if there is none.
// The error code parameter is unused.
int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    bool leadSeen = false;
    int32_t pos = 0;
    while (pos < input.length()) {
        const UChar32 current = input.char32At(pos);
        if (leadSeen && current == 0x0307) {
            return pos;
        }
        // Skip over characters except for those with combining class 0 (non-combining characters) or with
        // combining class 230 (same class as U+0307)
        const uint8_t ccc = u_getCombiningClass(current);
        U_ASSERT(u_getCombiningClass(0x0307) == 230);
        if (ccc == 0 || ccc == 230) {
            leadSeen = isIllegalCombiningDotLeadCharacter(current);
        }
        pos += U16_LENGTH(current);
    }
    return -1;
}
386
387
// Returns true when cp is one of i, j, dotless i (U+0131), dotless j (U+0237)
// or l, or has the Soft_Dotted binary property. No confusable lookup is done.
static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    switch (cp) {
        case 0x0069:    // i
        case 0x006A:    // j
        case 0x0131:    // dotless i
        case 0x0237:    // dotless j
        case 0x006C:    // l
            return true;
        default:
            return u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
    }
}
391
392
// Decides whether cp is a character after which findHiddenOverlay() reports
// a following U+0307.
// True when cp itself passes isIllegalCombiningDotLeadCharacterNoLookup(),
// or when the last code point of its confusable mapping differs from cp and
// passes that same test.
// When no confusable data is attached, only the direct test is applied.
bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
        return true;
    }
    // validateThis() treats a null fSpoofData as possible, so do not
    // dereference it blindly here.
    if (fSpoofData == nullptr) {
        return false;
    }
    UnicodeString skelStr;
    fSpoofData->confusableLookup(cp, skelStr);
    // Index of the final code point of the mapping (handles surrogate pairs).
    const int32_t lastIndex = skelStr.moveIndex32(skelStr.length(), -1);
    const UChar32 finalCp = skelStr.char32At(lastIndex);
    return finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp);
}
404
405
406
407
// Convert a text format hex number. Utility function used by builder code. Static.
408
// Input: char16_t *string text. Output: a UChar32
409
// Input has been pre-checked, and will have no non-hex chars.
410
// The number must fall in the code point range of 0..0x10ffff
411
// Static Function.
412
UChar32 SpoofImpl::ScanHex(const char16_t *s, int32_t start, int32_t limit, UErrorCode &status) {
413
if (U_FAILURE(status)) {
414
return 0;
415
}
416
U_ASSERT(limit-start > 0);
417
uint32_t val = 0;
418
int i;
419
for (i=start; i<limit; i++) {
420
int digitVal = s[i] - 0x30;
421
if (digitVal>9) {
422
digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
423
}
424
if (digitVal>15) {
425
digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
426
}
427
U_ASSERT(digitVal <= 0xf);
428
val <<= 4;
429
val += digitVal;
430
}
431
if (val > 0x10ffff) {
432
status = U_PARSE_ERROR;
433
val = 0;
434
}
435
return static_cast<UChar32>(val);
436
}
437
438
439
//-----------------------------------------
440
//
441
// class CheckResult Implementation
442
//
443
//-----------------------------------------
444
445
// Creates a result object in the cleared state (see clear()).
CheckResult::CheckResult() {
    this->clear();
}
448
449
// Returns this instance as the USpoofCheckResult handle used by the C API.
// Simply forwards to exportForC().
USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    USpoofCheckResult *handle = exportForC();
    return handle;
}
452
453
//
454
// Incoming parameter check on Status and the CheckResult object
455
// received from the C API.
456
//
457
const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
458
return validate(ptr, status);
459
}
460
461
// Non-const overload of the incoming parameter check; delegates to validate().
CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    CheckResult *checked = validate(ptr, status);
    return checked;
}
464
465
void CheckResult::clear() {
466
fChecks = 0;
467
fNumerics.clear();
468
fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
469
}
470
471
// Produces the integer result reported to callers.
//   enabledChecks  the checker's enabled-check bits.
// Returns fChecks, OR-ed with the restriction level when USPOOF_AUX_INFO is
// enabled and a restriction level has been recorded.
int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    const bool auxInfoRequested = (enabledChecks & USPOOF_AUX_INFO) != 0;
    const bool levelKnown = fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE;
    if (auxInfoRequested && levelKnown) {
        return fChecks | fRestrictionLevel;
    }
    return fChecks;
}
478
479
// Nothing to release explicitly; members clean up after themselves.
CheckResult::~CheckResult() = default;
481
482
//----------------------------------------------------------------------------------------------
483
//
484
// class SpoofData Implementation
485
//
486
//----------------------------------------------------------------------------------------------
487
488
489
// Checks that raw data is present and carries the expected magic number and
// format version (USPOOF_CONFUSABLE_DATA_FORMAT_VERSION.0.0.0).
//   status  in/out error code. Set to U_INVALID_FORMAT_ERROR whenever the
//           check does not pass, including when it was already a failure
//           on entry.
// Returns true only when every condition holds.
UBool SpoofData::validateDataVersion(UErrorCode &status) const {
    const bool versionOk =
        !U_FAILURE(status) &&
        fRawData != nullptr &&
        fRawData->fMagic == USPOOF_MAGIC &&
        fRawData->fFormatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
        fRawData->fFormatVersion[1] == 0 &&
        fRawData->fFormatVersion[2] == 0 &&
        fRawData->fFormatVersion[3] == 0;
    if (versionOk) {
        return true;
    }
    status = U_INVALID_FORMAT_ERROR;
    return false;
}
502
503
// Acceptance callback passed to udata_openChoice() for the confusables data.
//   context  optional UVersionInfo* that receives the data version when the
//            data is accepted; may be nullptr.
//   pInfo    header information of the candidate data item.
// Accepts data whose size, endianness, charset family, "Cfu " data format
// and major format version all match.
static UBool U_CALLCONV
spoofDataIsAcceptable(void *context,
                        const char  * /* type */, const char * /*name*/,
                        const UDataInfo *pInfo) {
    const bool acceptable =
        pInfo->size >= 20 &&
        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily == U_CHARSET_FAMILY &&
        pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
        pInfo->dataFormat[1] == 0x66 &&
        pInfo->dataFormat[2] == 0x75 &&
        pInfo->dataFormat[3] == 0x20 &&
        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    if (!acceptable) {
        return false;
    }

    UVersionInfo *versionOut = static_cast<UVersionInfo *>(context);
    if (versionOut != nullptr) {
        uprv_memcpy(versionOut, pInfo->dataVersion, 4);
    }
    return true;
}
526
527
// Methods for the loading of the default confusables data file. The confusable
528
// data is loaded only when it is needed.
529
//
530
// SpoofData::getDefault() - Return the default confusables data, and call the
531
// initOnce() if it is not available. Adds a reference
532
// to the SpoofData that the caller is responsible for
533
// decrementing when they are done with the data.
534
//
535
// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
536
// is shared by all spoof checkers using the default data.
537
//
538
// uspoof_cleanupDefaultData - Called during cleanup.
539
//
540
541
static UInitOnce gSpoofInitDefaultOnce {};
542
static SpoofData* gDefaultSpoofData;
543
544
// Cleanup callback for the default data: drops the global's reference,
// clears the global pointer and resets the init-once guard.
// Always returns true.
static UBool U_CALLCONV
uspoof_cleanupDefaultData() {
    if (gDefaultSpoofData == nullptr) {
        return true;
    }
    // Will delete, assuming all user-level spoof checkers were closed.
    gDefaultSpoofData->removeReference();
    gDefaultSpoofData = nullptr;
    gSpoofInitDefaultOnce.reset();
    return true;
}
554
555
// Loads the default confusables data ("confusables.cfu") into
// gDefaultSpoofData. Called once, through umtx_initOnce().
//   status  in/out error code. Receives any error from opening or validating
//           the data, or U_MEMORY_ALLOCATION_ERROR when the SpoofData object
//           cannot be allocated.
// On success the cleanup callback is registered; on failure the global is
// left null and the opened data is released.
static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
                                        spoofDataIsAcceptable,
                                        nullptr,       // context, would receive dataVersion if supplied.
                                        &status);
    if (U_FAILURE(status)) { return; }
    gDefaultSpoofData = new SpoofData(udm, status);
    if (gDefaultSpoofData == nullptr) {
        // No SpoofData object exists to take ownership of udm, so close it
        // here rather than leaking it.
        udata_close(udm);
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    if (U_FAILURE(status)) {
        // The SpoofData destructor closes udm.
        delete gDefaultSpoofData;
        gDefaultSpoofData = nullptr;
        return;
    }
    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
}
573
574
// Returns the shared default confusables data, loading it on first use.
//   status  in/out error code from the one-time load.
// On success a reference is added on behalf of the caller, who must call
// removeReference() when done. Returns nullptr on failure.
SpoofData* SpoofData::getDefault(UErrorCode& status) {
    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    SpoofData *shared = gDefaultSpoofData;
    shared->addReference();
    return shared;
}
580
581
582
583
// Wraps data opened through the udata API.
//   udm     the opened data; stored in fUDM and closed by the destructor.
//           Not stored when status is already a failure on entry.
//   status  in/out error code; receives the result of the version check.
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fUDM = udm;
    // fRawData is non-const because it may be constructed by the data builder.
    const void *mapped = udata_getMemory(udm);
    fRawData = reinterpret_cast<SpoofDataHeader *>(const_cast<void *>(mapped));
    validateDataVersion(status);
    initPtrs(status);
}
596
597
598
// Wraps caller-owned serialized data. The memory is not copied and not freed
// by this object.
//   data    start of the serialized SpoofDataHeader.
//   length  number of bytes available at data.
//   status  in/out error code:
//             U_INVALID_FORMAT_ERROR    length is negative or too small, the
//                                       recorded length is inconsistent, or
//                                       the version check fails;
//             U_ILLEGAL_ARGUMENT_ERROR  data is nullptr.
SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    // Test the sign explicitly: a negative length converted to size_t would
    // become huge and slip past the minimum-size comparison.
    if (length < 0 || static_cast<size_t>(length) < sizeof(SpoofDataHeader)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    if (data == nullptr) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    void *ncData = const_cast<void *>(data);
    fRawData = static_cast<SpoofDataHeader *>(ncData);
    // The recorded length must cover at least the header and must not exceed
    // what the caller supplied.
    if (fRawData->fLength < static_cast<int32_t>(sizeof(SpoofDataHeader)) ||
            length < fRawData->fLength) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    validateDataVersion(status);
    initPtrs(status);
}
621
622
623
// Spoof Data constructor for use from data builder.
624
// Initializes a new, empty data area that will be populated later.
625
SpoofData::SpoofData(UErrorCode &status) {
626
reset();
627
if (U_FAILURE(status)) {
628
return;
629
}
630
fDataOwned = true;
631
632
// The spoof header should already be sized to be a multiple of 16 bytes.
633
// Just in case it's not, round it up.
634
uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
635
U_ASSERT(initialSize == sizeof(SpoofDataHeader));
636
637
fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
638
fMemLimit = initialSize;
639
if (fRawData == nullptr) {
640
status = U_MEMORY_ALLOCATION_ERROR;
641
return;
642
}
643
uprv_memset(fRawData, 0, initialSize);
644
645
fRawData->fMagic = USPOOF_MAGIC;
646
fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
647
fRawData->fFormatVersion[1] = 0;
648
fRawData->fFormatVersion[2] = 0;
649
fRawData->fFormatVersion[3] = 0;
650
initPtrs(status);
651
}
652
653
// reset() - initialize all fields.
654
// Should be updated if any new fields are added.
655
// Called by constructors to put things in a known initial state.
656
void SpoofData::reset() {
657
fRawData = nullptr;
658
fDataOwned = false;
659
fUDM = nullptr;
660
fMemLimit = 0;
661
fRefCount = 1;
662
fCFUKeys = nullptr;
663
fCFUValues = nullptr;
664
fCFUStrings = nullptr;
665
}
666
667
668
// SpoofData::initPtrs()
669
// Initialize the pointers to the various sections of the raw data.
670
//
671
// This function is used both during the Trie building process (multiple
672
// times, as the individual data sections are added), and
673
// during the opening of a Spoof Checker from prebuilt data.
674
//
675
// The pointers for non-existent data sections (identified by an offset of 0)
676
// are set to nullptr.
677
//
678
// Note: During building the data, adding each new data section
679
// reallocs the raw data area, which likely relocates it, which
680
// in turn requires reinitializing all of the pointers into it, hence
681
// multiple calls to this function during building.
682
//
683
void SpoofData::initPtrs(UErrorCode &status) {
684
fCFUKeys = nullptr;
685
fCFUValues = nullptr;
686
fCFUStrings = nullptr;
687
if (U_FAILURE(status)) {
688
return;
689
}
690
if (fRawData->fCFUKeys != 0) {
691
fCFUKeys = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUKeys);
692
}
693
if (fRawData->fCFUStringIndex != 0) {
694
fCFUValues = reinterpret_cast<uint16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringIndex);
695
}
696
if (fRawData->fCFUStringTable != 0) {
697
fCFUStrings = reinterpret_cast<char16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringTable);
698
}
699
}
700
701
702
// Frees the raw data when this object owns it and closes the UDataMemory
// when one is attached; both pointers are nulled afterwards.
SpoofData::~SpoofData() {
    if (fDataOwned) {
        uprv_free(fRawData);
    }
    fRawData = nullptr;

    if (fUDM != nullptr) {
        udata_close(fUDM);
        fUDM = nullptr;
    }
}
712
713
714
void SpoofData::removeReference() {
715
if (umtx_atomic_dec(&fRefCount) == 0) {
716
delete this;
717
}
718
}
719
720
721
// Adds one reference and returns this object, so the call can be used
// directly in an assignment.
SpoofData *SpoofData::addReference() {
    (void)umtx_atomic_inc(&fRefCount);
    SpoofData *self = this;
    return self;
}
725
726
727
// Grows the owned raw data area and returns the newly added region.
// Used while building data; only valid when this object owns its data.
//   numBytes  bytes requested; rounded up to a multiple of 16.
//   status    in/out error code; U_MEMORY_ALLOCATION_ERROR when the area
//             cannot be grown.
// Returns a pointer to the zero-filled new region, or nullptr on failure.
// On failure the existing data area and all pointers into it are unchanged.
// On success the area may have moved, so the section pointers are refreshed.
void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    if (!fDataOwned) {
        UPRV_UNREACHABLE_EXIT;
    }

    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    const uint32_t returnOffset = fMemLimit;
    const uint32_t newLimit = fMemLimit + numBytes;

    // Keep the old pointer until the reallocation is known to have worked;
    // a failed realloc leaves the original block valid.
    SpoofDataHeader *grown = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, newLimit));
    if (grown == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    fRawData = grown;
    fMemLimit = newLimit;
    fRawData->fLength = fMemLimit;
    uprv_memset(reinterpret_cast<char *>(fRawData) + returnOffset, 0, numBytes);
    initPtrs(status);
    return reinterpret_cast<char*>(fRawData) + returnOffset;
}
744
745
// Copies the raw data into a caller-supplied buffer.
//   buf       destination buffer.
//   capacity  size of buf in bytes.
//   status    set to U_BUFFER_OVERFLOW_ERROR when capacity is too small; in
//             that case nothing is copied.
// Returns the number of bytes the data occupies, whether or not it was copied.
int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    const int32_t bytesNeeded = fRawData->fLength;
    if (capacity >= bytesNeeded) {
        uprv_memcpy(buf, fRawData, bytesNeeded);
    } else {
        status = U_BUFFER_OVERFLOW_ERROR;
    }
    return bytesNeeded;
}
754
755
// Returns the length recorded in the raw data header (fLength).
int32_t SpoofData::size() const {
    const int32_t recordedLength = fRawData->fLength;
    return recordedLength;
}
758
759
//-------------------------------
760
//
761
// Front-end APIs for SpoofData
762
//
763
//-------------------------------
764
765
// Looks up the confusable mapping of a code point and appends it to dest.
//   inChar  the code point to look up.
//   dest    receives the mapping; inChar itself when there is no entry.
// Returns the number the append reports: 1 when inChar maps to itself,
// otherwise the result of appendValueTo().
// An empty key table is treated as "no entry" for every code point.
int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    // Perform a binary search.
    // [lo, hi), i.e lo is inclusive, hi is exclusive.
    // The result after the loop will be in lo.
    int32_t lo = 0;
    int32_t hi = length();

    // With no keys there is nothing to probe; reading index 0 would be out
    // of bounds (fCFUKeys is null for freshly built, empty data).
    if (hi <= 0 || fCFUKeys == nullptr) {
        dest.append(inChar);
        return 1;
    }

    do {
        const int32_t mid = lo + (hi - lo) / 2;
        const UChar32 midCp = codePointAt(mid);
        if (midCp > inChar) {
            hi = mid;
        } else {
            lo = mid;
            if (midCp == inChar) {
                // Found result.  Break early.
                break;
            }
        }
    } while (hi - lo > 1);

    // Did we find an entry?  If not, the char maps to itself.
    if (codePointAt(lo) != inChar) {
        dest.append(inChar);
        return 1;
    }

    // Add the element to the string builder and return.
    return appendValueTo(lo, dest);
}
793
794
// Returns the key count recorded in the raw data header (fCFUKeysSize).
int32_t SpoofData::length() const {
    const int32_t keyCount = fRawData->fCFUKeysSize;
    return keyCount;
}
797
798
// Returns the code point encoded in the key at the given index.
// The index is not range-checked.
UChar32 SpoofData::codePointAt(int32_t index) const {
    const int32_t key = fCFUKeys[index];
    return ConfusableDataUtils::keyToCodePoint(key);
}
801
802
// Appends the mapping stored for the key at the given index to dest.
//   index  position in the key/value arrays; not range-checked.
//   dest   string that receives the mapping.
// Returns the mapping's length as encoded in the key.
int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    const int32_t mappingLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);

    // Value is either a char (for strings of length 1) or
    // an index into the string table (for longer strings)
    const uint16_t value = fCFUValues[index];
    if (mappingLength != 1) {
        dest.append(fCFUStrings + value, mappingLength);
    } else {
        dest.append(static_cast<char16_t>(value));
    }

    return mappingLength;
}
816
817
818
U_NAMESPACE_END
819
820
U_NAMESPACE_USE
821
822
//-----------------------------------------------------------------------------
823
//
824
// uspoof_swap - byte swap and char encoding swap of spoof data
825
//
826
//-----------------------------------------------------------------------------
827
U_CAPI int32_t U_EXPORT2
828
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
829
UErrorCode *status) {
830
831
if (status == nullptr || U_FAILURE(*status)) {
832
return 0;
833
}
834
if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {
835
*status=U_ILLEGAL_ARGUMENT_ERROR;
836
return 0;
837
}
838
839
//
840
// Check that the data header is for spoof data.
841
// (Header contents are defined in gencfu.cpp)
842
//
843
const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
844
if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
845
pInfo->dataFormat[1]==0x66 &&
846
pInfo->dataFormat[2]==0x75 &&
847
pInfo->dataFormat[3]==0x20 &&
848
pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
849
pInfo->formatVersion[1]==0 &&
850
pInfo->formatVersion[2]==0 &&
851
pInfo->formatVersion[3]==0 )) {
852
udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
853
"(format version %02x %02x %02x %02x) is not recognized\n",
854
pInfo->dataFormat[0], pInfo->dataFormat[1],
855
pInfo->dataFormat[2], pInfo->dataFormat[3],
856
pInfo->formatVersion[0], pInfo->formatVersion[1],
857
pInfo->formatVersion[2], pInfo->formatVersion[3]);
858
*status=U_UNSUPPORTED_ERROR;
859
return 0;
860
}
861
862
//
863
// Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
864
// header). This swap also conveniently gets us
865
// the size of the ICU d.h., which lets us locate the start
866
// of the uspoof specific data.
867
//
868
int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
869
870
871
//
872
// Get the Spoof Data Header, and check that it appears to be OK.
873
//
874
//
875
const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
876
SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
877
if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
878
ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
879
{
880
udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
881
*status=U_UNSUPPORTED_ERROR;
882
return 0;
883
}
884
885
//
886
// Prefight operation? Just return the size
887
//
888
int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
889
int32_t totalSize = headerSize + spoofDataLength;
890
if (length < 0) {
891
return totalSize;
892
}
893
894
//
895
// Check that length passed in is consistent with length from Spoof data header.
896
//
897
if (length < totalSize) {
898
udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
899
spoofDataLength);
900
*status=U_INDEX_OUTOFBOUNDS_ERROR;
901
return 0;
902
}
903
904
905
//
906
// Swap the Data. Do the data itself first, then the Spoof Data Header, because
907
// we need to reference the header to locate the data, and an
908
// inplace swap of the header leaves it unusable.
909
//
910
uint8_t *outBytes = (uint8_t *)outData + headerSize;
911
SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
912
913
int32_t sectionStart;
914
int32_t sectionLength;
915
916
//
917
// If not swapping in place, zero out the output buffer before starting.
918
// Gaps may exist between the individual sections, and these must be zeroed in
919
// the output buffer. The simplest way to do that is to just zero the whole thing.
920
//
921
if (inBytes != outBytes) {
922
uprv_memset(outBytes, 0, spoofDataLength);
923
}
924
925
// Confusables Keys Section (fCFUKeys)
926
sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
927
sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
928
ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
929
930
// String Index Section
931
sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
932
sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
933
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
934
935
// String Table Section
936
sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
937
sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
938
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
939
940
// And, last, swap the header itself.
941
// int32_t fMagic // swap this
942
// uint8_t fFormatVersion[4] // Do not swap this, just copy
943
// int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
944
//
945
uint32_t magic = ds->readUInt32(spoofDH->fMagic);
946
ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
947
948
if (inBytes != outBytes) {
949
uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
950
}
951
// swap starting at fLength
952
ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
953
954
return totalSize;
955
}
956
957
#endif
958
959
960
961