Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/icu4c/i18n/unicode/uspoof.h
9912 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
***************************************************************************
5
* Copyright (C) 2008-2016, International Business Machines Corporation
6
* and others. All Rights Reserved.
7
***************************************************************************
8
* file name: uspoof.h
9
* encoding: UTF-8
10
* tab size: 8 (not used)
11
* indentation:4
12
*
13
* created on: 2008Feb13
14
* created by: Andy Heninger
15
*
16
* Unicode Spoof Detection
17
*/
18
19
#ifndef USPOOF_H
20
#define USPOOF_H
21
22
#include "unicode/ubidi.h"
23
#include "unicode/utypes.h"
24
#include "unicode/uset.h"
25
#include "unicode/parseerr.h"
26
27
#if !UCONFIG_NO_NORMALIZATION
28
29
30
#if U_SHOW_CPLUSPLUS_API
31
#include "unicode/localpointer.h"
32
#include "unicode/unistr.h"
33
#include "unicode/uniset.h"
34
#endif
35
36
37
/**
38
* \file
39
* \brief C API: Unicode Security and Spoofing Detection
40
*
41
* <p>
42
* This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
43
* <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
44
*
45
* <ol>
46
* <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
47
* &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
48
* <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
49
* detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
50
* </ol>
51
*
52
* <p>
53
* Although originally designed as a method for flagging suspicious identifier strings such as URLs,
54
* <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
55
* content filters.
56
*
57
* <p>
58
* The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++.
59
*
60
* <h2>Confusables</h2>
61
*
62
* <p>
63
* The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings:
64
*
65
* \code{.c}
66
* UErrorCode status = U_ZERO_ERROR;
67
* UChar* str1 = (UChar*) u"Harvest";
68
* UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
69
*
70
* USpoofChecker* sc = uspoof_open(&status);
71
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
72
*
73
* int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
74
* UBool result = bitmask != 0;
75
* // areConfusable: 1 (status: U_ZERO_ERROR)
76
* printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
77
* uspoof_close(sc);
78
* \endcode
79
*
80
* <p>
81
* The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
82
* enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
83
* confusability test; and the following line extracts the result out of the return value. For best performance,
84
* the instance should be created once (e.g., upon application startup), and the efficient
85
* {@link uspoof_areConfusable} method can be used at runtime.
86
*
87
* If the paragraph direction used to display the strings is known, the bidi function should be used instead:
88
*
89
* \code{.c}
90
* UErrorCode status = U_ZERO_ERROR;
91
* // These strings look identical when rendered in a left-to-right context.
92
* // They look distinct in a right-to-left context.
93
* UChar* str1 = (UChar*) u"A1\u05D0"; // A1א
94
* UChar* str2 = (UChar*) u"A\u05D01"; // Aא1
95
*
96
* USpoofChecker* sc = uspoof_open(&status);
97
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
98
*
99
* int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
100
* UBool result = bitmask != 0;
101
* // areBidiConfusable: 1 (status: U_ZERO_ERROR)
102
* printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
103
* uspoof_close(sc);
104
* \endcode
105
*
106
* <p>
107
* The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
108
* {@link uspoof_close} when the object goes out of scope:
109
*
110
* \code{.cpp}
111
* UErrorCode status = U_ZERO_ERROR;
112
* LocalUSpoofCheckerPointer sc(uspoof_open(&status));
113
* uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status);
114
* // ...
115
* \endcode
116
*
117
* UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can
118
* be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so
119
* the following snippet is equivalent to the example above:
120
*
121
* \code{.c}
122
* UErrorCode status = U_ZERO_ERROR;
123
* UChar* str1 = (UChar*) u"Harvest";
124
* UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
125
*
126
* USpoofChecker* sc = uspoof_open(&status);
127
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
128
*
129
* // Get skeleton 1
130
* int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
131
* UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
132
* status = U_ZERO_ERROR;
133
* uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
134
*
135
* // Get skeleton 2
136
* int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
137
* UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
138
* status = U_ZERO_ERROR;
139
* uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
140
*
141
* // Are the skeletons the same?
142
* UBool result = u_strcmp(skel1, skel2) == 0;
143
* // areConfusable: 1 (status: U_ZERO_ERROR)
144
* printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
145
* uspoof_close(sc);
146
* free(skel1);
147
* free(skel2);
148
* \endcode
149
*
150
* If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
151
* {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below:
152
*
153
* \code{.c}
154
* UErrorCode status = U_ZERO_ERROR;
155
* #define DICTIONARY_LENGTH 2
156
* UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
157
* UChar* skeletons[DICTIONARY_LENGTH];
158
* UChar* str = (UChar*) u"1orern";
159
*
160
* // Setup:
161
* USpoofChecker* sc = uspoof_open(&status);
162
* uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
163
* for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
164
* UChar* word = dictionary[i];
165
* int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
166
* skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
167
* status = U_ZERO_ERROR;
168
* uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
169
* }
170
*
171
* // Live Check:
172
* {
173
* int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
174
* UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
175
* status = U_ZERO_ERROR;
176
* uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
177
* UBool result = false;
178
* for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
179
* result = u_strcmp(skel, skeletons[i]) == 0;
180
* if (result == true) { break; }
181
* }
182
* // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
183
* printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
184
* free(skel);
185
* }
186
*
187
* for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
188
* free(skeletons[i]);
189
* }
190
* uspoof_close(sc);
191
* \endcode
192
*
193
* <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
194
* guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
195
* at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
196
*
197
* <h2>Spoof Detection</h2>
198
*
199
* The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a
200
* string:
201
*
202
* \code{.c}
203
* UErrorCode status = U_ZERO_ERROR;
204
* UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
205
*
206
* // Get the default set of allowable characters:
207
* USet* allowed = uset_openEmpty();
208
* uset_addAll(allowed, uspoof_getRecommendedSet(&status));
209
* uset_addAll(allowed, uspoof_getInclusionSet(&status));
210
*
211
* USpoofChecker* sc = uspoof_open(&status);
212
* uspoof_setAllowedChars(sc, allowed, &status);
213
* uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
214
*
215
* int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
216
* UBool result = bitmask != 0;
217
* // fails checks: 1 (status: U_ZERO_ERROR)
218
* printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
219
* uspoof_close(sc);
220
* uset_close(allowed);
221
* \endcode
222
*
223
* As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at
224
* startup, and call the cheaper {@link uspoof_check} online. We specify the set of
225
* allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39.
226
*
227
* In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings,
228
* and {@link uspoof_checkUnicodeString} is exposed for C++ programmers.
229
*
230
* If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks
231
* is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions
232
* with a {@link USpoofCheckResult} parameter:
233
*
234
* \code{.c}
235
* UErrorCode status = U_ZERO_ERROR;
236
* UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
237
*
238
* // Get the default set of allowable characters:
239
* USet* allowed = uset_openEmpty();
240
* uset_addAll(allowed, uspoof_getRecommendedSet(&status));
241
* uset_addAll(allowed, uspoof_getInclusionSet(&status));
242
*
243
* USpoofChecker* sc = uspoof_open(&status);
244
* uspoof_setAllowedChars(sc, allowed, &status);
245
* uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
246
*
247
* USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
248
* int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status);
249
*
250
* int32_t failures1 = bitmask;
251
* int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
252
* assert(failures1 == failures2);
253
* // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
254
* printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
255
*
256
* // Cleanup:
257
* uspoof_close(sc);
258
* uset_close(allowed);
259
* uspoof_closeCheckResult(checkResult);
260
* \endcode
261
*
262
* C++ users can take advantage of a few syntactical conveniences. The following snippet is functionally
263
* equivalent to the one above:
264
*
265
* \code{.cpp}
266
* UErrorCode status = U_ZERO_ERROR;
267
* UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
268
*
269
* // Get the default set of allowable characters:
270
* UnicodeSet allowed;
271
* allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
272
* allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
273
*
274
* LocalUSpoofCheckerPointer sc(uspoof_open(&status));
275
* uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
276
* uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
277
*
278
* LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
279
* int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
280
*
281
* int32_t failures1 = bitmask;
282
* int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
283
* assert(failures1 == failures2);
284
* // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
285
* printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
286
*
287
* // Explicit cleanup not necessary.
288
* \endcode
289
*
290
* The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
291
* {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
292
*
293
* <ul>
294
* <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
295
* <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
296
* 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
297
* <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
298
* sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
299
* <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
300
* characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li>
301
* <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
302
* </ul>
303
*
304
* <p>
305
* These checks can be enabled independently of each other. For example, if you were interested in checking for only the
306
* INVISIBLE and MIXED_NUMBERS conditions, you could do:
307
*
308
* \code{.c}
309
* UErrorCode status = U_ZERO_ERROR;
310
* UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR
311
*
312
* USpoofChecker* sc = uspoof_open(&status);
313
* uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
314
*
315
* int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
316
* UBool result = bitmask != 0;
317
* // fails checks: 1 (status: U_ZERO_ERROR)
318
* printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
319
* uspoof_close(sc);
320
* \endcode
321
*
322
* Here is an example in C++ showing how to compute the restriction level of a string:
323
*
324
* \code{.cpp}
325
* UErrorCode status = U_ZERO_ERROR;
326
* UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
327
*
328
* // Get the default set of allowable characters:
329
* UnicodeSet allowed;
330
* allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
331
* allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
332
*
333
* LocalUSpoofCheckerPointer sc(uspoof_open(&status));
334
* uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
335
* uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
336
* uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status);
337
*
338
* LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
339
* int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
340
*
341
* URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
342
* // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
343
* assert((restrictionLevel & bitmask) == restrictionLevel);
344
* // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
345
* printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
346
* \endcode
347
*
348
* The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
349
* USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
350
*
351
* <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
352
* <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
353
* are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
354
* Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
355
* recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
356
* with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
357
* the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of
358
* allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code
359
* COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
360
* scripts.
361
*
362
* <h2>Advanced bidirectional usage</h2>
363
* If the paragraph direction with which the identifiers will be displayed is not known, there are
364
* multiple options for confusable detection depending on the circumstances.
365
*
366
* <p>
367
* In some circumstances, the only concern is confusion between identifiers displayed with the same
368
* paragraph direction.
369
*
370
* <p>
371
* An example is the case where identifiers are usernames prefixed with the @ symbol.
372
* That symbol will appear to the left in a left-to-right context, and to the right in a
373
* right-to-left context, so that an identifier displayed in a left-to-right context can never be
374
* confused with an identifier displayed in a right-to-left context:
375
* <ul>
376
* <li>
377
* The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
378
* would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the
379
* usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
380
* confusable, since they both appear as A_1א@ in a right-to-left context.
381
* </li>
382
* <li>
383
* The username "Mark_" would not be considered confusable with the username "_Mark",
384
* even though the latter would appear as Mark_@ in a right-to-left context, and the
385
* former as \@Mark_ in a left-to-right context.
386
* </li>
387
* </ul>
388
* <p>
389
* In that case, the caller should check for both LTR-confusability and RTL-confusability:
390
*
391
* \code{.cpp}
392
* bool confusableInEitherDirection =
393
* uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) ||
394
* uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status);
395
* \endcode
396
*
397
* If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
398
* with LTR and RTL with RTL.
399
*
400
* <p>
401
* In cases where confusability between the visual appearances of an identifier displayed in a
402
* left-to-right context with another identifier displayed in a right-to-left context is a concern,
403
* the LTR skeleton of one can be compared with the RTL skeleton of the other. However, this
404
* very broad definition of confusability may have unexpected results; for instance, it treats the
405
* ASCII identifiers "Mark_" and "_Mark" as confusable.
406
*
407
* <h2>Additional Information</h2>
408
*
409
* A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
410
*
411
* <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
412
* two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
413
* using the same USpoofChecker instance.
414
*
415
* More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
416
* thread safe. Those that take a non-const USpoofChecker are not thread safe.
417
*
418
* @stable ICU 4.6
419
*/
420
421
U_CDECL_BEGIN
422
423
struct USpoofChecker;
424
/**
* Handle type for a spoof checker. Only a forward declaration of the struct
* appears here; instances are created by functions such as {@link uspoof_open}
* and released with {@link uspoof_close}.
*
425
* @stable ICU 4.2
426
*/
427
typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */
428
429
struct USpoofCheckResult;
430
/**
* Handle type for the detailed result of a check, used with the
* uspoof_check2 family of functions. Only a forward declaration of the
* struct appears here.
*
431
* @see uspoof_openCheckResult
432
* @stable ICU 58
433
*/
434
typedef struct USpoofCheckResult USpoofCheckResult;
435
436
/**
437
* Enum for the kinds of checks that USpoofChecker can perform.
438
* These enum values are used both to select the set of checks that
439
* will be performed, and to report results from the check function.
*
* Each individual check is a distinct bit, so values can be combined with
* bitwise OR and tested with bitwise AND.
440
*
441
* @stable ICU 4.2
442
*/
443
typedef enum USpoofChecks {
444
/**
445
* When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
446
* that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
447
* 4.
448
*
449
* @see uspoof_areConfusable
450
* @stable ICU 4.2
451
*/
452
USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1,
453
454
/**
455
* When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
456
* that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
457
* 39 section 4.
458
*
459
* @see uspoof_areConfusable
460
* @stable ICU 4.2
461
*/
462
USPOOF_MIXED_SCRIPT_CONFUSABLE = 2,
463
464
/**
465
* When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
466
* that the two strings are visually confusable and that they are not from the same script but both of them are
467
* single-script strings, according to UTS 39 section 4.
468
*
469
* @see uspoof_areConfusable
470
* @stable ICU 4.2
471
*/
472
USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4,
473
474
/**
475
* Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set
476
* the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
477
* make {@link uspoof_areConfusable} return only those types of confusables.
*
* The value is the bitwise OR of the three confusable flags above (1 | 2 | 4).
478
*
479
* @see uspoof_areConfusable
480
* @see uspoof_getSkeleton
481
* @stable ICU 58
482
*/
483
USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
484
485
#ifndef U_HIDE_DEPRECATED_API
486
/**
487
* This flag is deprecated and no longer affects the behavior of SpoofChecker.
488
*
489
* @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated.
490
*/
491
USPOOF_ANY_CASE = 8,
492
#endif /* U_HIDE_DEPRECATED_API */
493
494
/**
495
* Check that an identifier is no looser than the specified RestrictionLevel.
496
* The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE.
497
*
498
* If USPOOF_AUX_INFO is enabled the actual restriction level of the
499
* identifier being tested will also be returned by uspoof_check().
500
*
501
* @see URestrictionLevel
502
* @see uspoof_setRestrictionLevel
503
* @see USPOOF_AUX_INFO
504
*
505
* @stable ICU 51
506
*/
507
USPOOF_RESTRICTION_LEVEL = 16,
508
509
#ifndef U_HIDE_DEPRECATED_API
510
/** Check that an identifier contains only characters from a
511
* single script (plus chars from the common and inherited scripts).
512
* Applies to checks of a single identifier only.
* Defined as an alias with the same value as USPOOF_RESTRICTION_LEVEL.
513
* @deprecated ICU 51 Use RESTRICTION_LEVEL instead.
514
*/
515
USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL,
516
#endif /* U_HIDE_DEPRECATED_API */
517
518
/** Check an identifier for the presence of invisible characters,
519
* such as zero-width spaces, or character sequences that are
520
* likely not to display, such as multiple occurrences of the same
521
* non-spacing mark. This check does not test the input string as a whole
522
* for conformance to any particular syntax for identifiers.
523
*/
524
USPOOF_INVISIBLE = 32,
525
526
/** Check that an identifier contains only characters from a specified set
527
* of acceptable characters. See {@link uspoof_setAllowedChars} and
528
* {@link uspoof_setAllowedLocales}. Note that a string that fails this check
529
* will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
530
*/
531
USPOOF_CHAR_LIMIT = 64,
532
533
/**
534
* Check that an identifier does not mix numbers from different numbering systems.
535
* For more information, see UTS 39 section 5.3.
536
*
537
* @stable ICU 51
538
*/
539
USPOOF_MIXED_NUMBERS = 128,
540
541
/**
542
* Check that an identifier does not have a combining character following a character in which that
543
* combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
544
*
545
* More specifically, the following characters are forbidden from preceding a U+0307:
546
* <ul>
547
* <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
548
* <li>Latin lowercase letter 'l'</li>
549
* <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
550
* <li>Any character whose confusable prototype ends with such a character
551
* (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
552
* </ul>
553
* In addition, combining characters are allowed between the above characters and U+0307 except those
554
* with combining class 0 or combining class "Above" (230, same class as U+0307).
555
*
556
* This list and the number of combining characters considered by this check may grow over time.
557
*
558
* @stable ICU 62
559
*/
560
USPOOF_HIDDEN_OVERLAY = 256,
561
562
/**
563
* Enable all spoof checks.
*
* The value 0xFFFF sets only the low 16 bits, so it does not include
* USPOOF_AUX_INFO (0x40000000), which must be requested separately.
564
*
565
* @stable ICU 4.6
566
*/
567
USPOOF_ALL_CHECKS = 0xFFFF,
568
569
/**
570
* Enable the return of auxiliary (non-error) information in the
571
* upper bits of the check results value.
572
*
573
* If this "check" is not enabled, the results of {@link uspoof_check} will be
574
* zero when an identifier passes all of the enabled checks.
575
*
576
* If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will
577
* be zero when an identifier passes all checks.
578
*
579
* @stable ICU 51
580
*/
581
USPOOF_AUX_INFO = 0x40000000
582
583
} USpoofChecks;
584
585
586
/**
587
* Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
588
* for returned identifier restriction levels in check results.
*
* The level constants are listed from most restrictive (USPOOF_ASCII) to least
* restrictive (USPOOF_UNRESTRICTIVE), with numerically increasing values.
589
*
590
* @stable ICU 51
591
*
592
* @see uspoof_setRestrictionLevel
593
* @see uspoof_check
594
*/
595
typedef enum URestrictionLevel {
596
/**
597
* All characters in the string are in the identifier profile and all characters in the string are in the
598
* ASCII range.
599
*
600
* @stable ICU 51
601
*/
602
USPOOF_ASCII = 0x10000000,
603
/**
604
* The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and
605
* the string is single-script, according to the definition in UTS 39 section 5.1.
606
*
607
* @stable ICU 53
608
*/
609
USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000,
610
/**
611
* The string classifies as Single Script, or all characters in the string are in the identifier profile and
612
* the string is covered by any of the following sets of scripts, according to the definition in UTS 39
613
* section 5.1:
614
* <ul>
615
* <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
616
* <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
617
* <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
618
* </ul>
619
* This is the default restriction in ICU.
620
*
621
* @stable ICU 51
622
*/
623
USPOOF_HIGHLY_RESTRICTIVE = 0x30000000,
624
/**
625
* The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
626
* and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
627
* Greek, and Cherokee.
628
*
629
* @stable ICU 51
630
*/
631
USPOOF_MODERATELY_RESTRICTIVE = 0x40000000,
632
/**
633
* All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts.
634
*
635
* @stable ICU 51
636
*/
637
USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000,
638
/**
639
* Any valid identifiers, including characters outside of the Identifier Profile.
640
*
641
* @stable ICU 51
642
*/
643
USPOOF_UNRESTRICTIVE = 0x60000000,
644
/**
645
* Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}.
* All level constants above (0x10000000 through 0x60000000) lie within this mask.
646
*
647
* @stable ICU 53
648
*/
649
USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000,
650
#ifndef U_HIDE_INTERNAL_API
651
/**
652
* An undefined restriction level.
653
* @internal
654
*/
655
USPOOF_UNDEFINED_RESTRICTIVE = -1
656
#endif /* U_HIDE_INTERNAL_API */
657
} URestrictionLevel;
658
659
/**
660
* Create a Unicode Spoof Checker, configured to perform all
661
* checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
662
* Note that additional checks may be added in the future,
663
* resulting in changes to the default checking behavior.
*
* NOTE(review): USPOOF_LOCALE_LIMIT is not among the USpoofChecks constants
* declared in this header section; confirm whether that reference is stale.
*
* The returned checker should be released with {@link uspoof_close}.
664
*
665
* @param status The error code, set if this function encounters a problem.
666
* @return the newly created Spoof Checker
667
* @stable ICU 4.2
668
*/
669
U_CAPI USpoofChecker * U_EXPORT2
670
uspoof_open(UErrorCode *status);
671
672
673
/**
674
* Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory.
675
* Inverse of uspoof_serialize().
676
* The memory containing the serialized data must remain valid and unchanged
677
* as long as the spoof checker, or any cloned copies of the spoof checker,
678
* are in use. Ownership of the memory remains with the caller.
679
* The spoof checker (and any clones) must be closed prior to deleting the
680
* serialized data.
681
*
682
* @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data
683
* @param length the number of bytes available at data;
684
* can be more than necessary
685
* @param pActualLength receives the actual number of bytes at data taken up by the data;
686
* can be NULL
687
* @param pErrorCode ICU error code
688
* @return the spoof checker opened from the serialized data.
689
*
690
* @see uspoof_open
691
* @see uspoof_serialize
692
* @stable ICU 4.2
693
*/
694
U_CAPI USpoofChecker * U_EXPORT2
695
uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
696
UErrorCode *pErrorCode);
697
698
/**
699
* Open a Spoof Checker from the source form of the spoof data.
700
* The input corresponds to the Unicode data file confusables.txt
701
* as described in Unicode Technical Standard #39. The syntax of the source data
702
* is as described in UTS #39 for this file, and the content of
703
* this file is acceptable input.
704
*
705
* The character encoding of the (char *) input text is UTF-8.
*
* NOTE(review): the two whole-script parameters are documented as no longer
* used; presumably NULL and 0 may be passed for them, but confirm against
* the implementation before relying on that.
706
*
707
* @param confusables a pointer to the confusable characters definitions,
708
* as found in file confusables.txt from unicode.org.
709
* @param confusablesLen The length of the confusables text, or -1 if the
710
* input string is zero terminated.
711
* @param confusablesWholeScript
712
* Deprecated in ICU 58. No longer used.
713
* @param confusablesWholeScriptLen
714
* Deprecated in ICU 58. No longer used.
715
* @param errType In the event of an error in the input, indicates
716
* which of the input files contains the error.
717
* The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or
718
* USPOOF_WHOLE_SCRIPT_CONFUSABLE, or
719
* zero if no errors are found.
720
* @param pe In the event of an error in the input, receives the position
721
* in the input text (line, offset) of the error.
722
* @param status an in/out ICU UErrorCode. Among the possible errors is
723
* U_PARSE_ERROR, which is used to report syntax errors
724
* in the input.
725
* @return A spoof checker that uses the rules from the input files.
726
* @stable ICU 4.2
727
*/
728
U_CAPI USpoofChecker * U_EXPORT2
729
uspoof_openFromSource(const char *confusables, int32_t confusablesLen,
730
const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
731
int32_t *errType, UParseError *pe, UErrorCode *status);
732
733
734
/**
735
* Close a Spoof Checker, freeing any memory that was being held by
736
* its implementation.
* @param sc The USpoofChecker to close.
737
* @stable ICU 4.2
738
*/
739
U_CAPI void U_EXPORT2
740
uspoof_close(USpoofChecker *sc);
741
742
/**
743
* Clone a Spoof Checker. The clone will be set to perform the same checks
744
* as the original source.
745
*
746
* @param sc The source USpoofChecker
747
* @param status The error code, set if this function encounters a problem.
748
* @return The cloned USpoofChecker.
749
* @stable ICU 4.2
750
*/
751
U_CAPI USpoofChecker * U_EXPORT2
752
uspoof_clone(const USpoofChecker *sc, UErrorCode *status);
753
754
755
/**
756
* Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method
757
* overwrites any checks that may have already been enabled. By default, all checks are enabled.
758
*
759
* To enable specific checks and disable all others,
760
* OR together only the bit constants for the desired checks.
761
* For example, to fail strings containing characters outside of
762
* the set specified by {@link uspoof_setAllowedChars} and
763
* also strings that contain digits from mixed numbering systems:
764
*
765
* <pre>
766
* {@code
767
* uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status);
768
* }
769
* </pre>
770
*
771
* To disable specific checks and enable all others,
772
* start with ALL_CHECKS and "AND away" the not-desired checks.
773
* For example, if you are not planning to use the {@link uspoof_areConfusable} functionality,
774
* it is good practice to disable the CONFUSABLE check:
775
*
776
* <pre>
777
* {@code
778
* uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status);
779
* }
780
* </pre>
781
*
782
* Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and
783
* {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
784
* enable onto the existing bitmask specified by this method. For more details, see the documentation of those
785
* methods.
786
*
787
* @param sc The USpoofChecker
788
* @param checks The set of checks that this spoof checker will perform.
789
* The value is a bit set, obtained by OR-ing together
790
* values from enum USpoofChecks.
791
* @param status The error code, set if this function encounters a problem.
792
* @stable ICU 4.2
793
*
794
*/
795
U_CAPI void U_EXPORT2
796
uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);
797
798
/**
799
* Get the set of checks that this Spoof Checker has been configured to perform.
800
*
801
* @param sc The USpoofChecker
802
* @param status The error code, set if this function encounters a problem.
803
* @return The set of checks that this spoof checker will perform.
804
* The value is a bit set, obtained by OR-ing together
805
* values from enum USpoofChecks.
806
* @stable ICU 4.2
807
*
808
*/
809
U_CAPI int32_t U_EXPORT2
810
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
811
812
/**
813
* Set the loosest restriction level allowed for strings. The default if this is not called is
814
* {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
815
* {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.2 and 5.3 of UTS 39. To customize which checks are
816
* to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
817
*
818
* @param sc The USpoofChecker
819
* @param restrictionLevel The loosest restriction level allowed.
820
* @see URestrictionLevel
821
* @stable ICU 51
822
*/
823
U_CAPI void U_EXPORT2
824
uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
825
826
827
/**
828
* Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}.
829
*
830
* @param sc The USpoofChecker
* @return The restriction level
831
* @see URestrictionLevel
832
* @stable ICU 51
833
*/
834
U_CAPI URestrictionLevel U_EXPORT2
835
uspoof_getRestrictionLevel(const USpoofChecker *sc);
836
837
/**
838
* Limit characters that are acceptable in identifiers being checked to those
839
* normally used with the languages associated with the specified locales.
840
* Any previously specified list of locales is replaced by the new settings.
841
*
842
* A set of languages is determined from the locale(s), and
843
* from those a set of acceptable Unicode scripts is determined.
844
* Characters from this set of scripts, along with characters from
845
* the "common" and "inherited" Unicode Script categories
846
* will be permitted.
847
*
848
* Supplying an empty string removes all restrictions;
849
* characters from any script will be allowed.
850
*
851
* The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this
852
* USpoofChecker when calling this function with a non-empty list
853
* of locales.
854
*
855
* The Unicode Set of characters that will be allowed is accessible
856
* via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales()
857
* will <i>replace</i> any previously applied set of allowed characters.
858
*
859
* Adjustments, such as additions or deletions of certain classes of characters,
860
* can be made to the result of uspoof_setAllowedLocales() by
861
* fetching the resulting set with uspoof_getAllowedChars(),
862
* manipulating it with the Unicode Set API, then resetting the
863
* spoof detector's limits with uspoof_setAllowedChars().
864
*
865
* @param sc The USpoofChecker
866
* @param localesList A list of locales, from which the language
867
* and associated script are extracted. The locales
868
* are comma-separated if there is more than one.
869
* White space may not appear within an individual locale,
870
* but is ignored otherwise.
871
* The locales are syntactically like those from the
872
* HTTP Accept-Language header.
873
* If the localesList is empty, no restrictions will be placed on
874
* the allowed characters.
875
*
876
* @param status The error code, set if this function encounters a problem.
877
* @stable ICU 4.2
878
*/
879
U_CAPI void U_EXPORT2
880
uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status);
881
882
/**
883
* Get a list of locales for the scripts that are acceptable in strings
884
* to be checked. If no limitations on scripts have been specified,
885
* an empty string will be returned.
886
*
887
* uspoof_setAllowedChars() will reset the list of allowed locales to be empty.
888
*
889
* The format of the returned list is the same as that supplied to
890
* uspoof_setAllowedLocales(), but returned list may not be identical
891
* to the originally specified string; the string may be reformatted,
892
* and information other than languages from
893
* the originally specified locales may be omitted.
894
*
895
* @param sc The USpoofChecker
896
* @param status The error code, set if this function encounters a problem.
897
* @return A string containing a list of locales corresponding
898
* to the acceptable scripts, formatted like an
899
* HTTP Accept-Language value.
900
*
901
* @stable ICU 4.2
902
*/
903
U_CAPI const char * U_EXPORT2
904
uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status);
905
906
907
/**
908
* Limit the acceptable characters to those specified by a Unicode Set.
909
* Any previously specified character limit is
910
* replaced by the new settings. This includes limits on
911
* characters that were set with the uspoof_setAllowedLocales() function.
912
*
913
* The USPOOF_CHAR_LIMIT test is automatically enabled for this
914
* USpoofChecker by this function.
915
*
916
* @param sc The USpoofChecker
917
* @param chars A Unicode Set containing the list of
918
* characters that are permitted. Ownership of the set
919
* remains with the caller. The incoming set is cloned by
920
* this function, so there are no restrictions on modifying
921
* or deleting the USet after calling this function.
922
* @param status The error code, set if this function encounters a problem.
923
* @stable ICU 4.2
924
*/
925
U_CAPI void U_EXPORT2
926
uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status);
927
928
929
/**
930
* Get a USet for the characters permitted in an identifier.
931
* This corresponds to the limits imposed by the Set Allowed Characters
932
* functions. Limitations imposed by other checks will not be
933
* reflected in the set returned by this function.
934
*
935
* The returned set will be frozen, meaning that it cannot be modified
936
* by the caller.
937
*
938
* Ownership of the returned set remains with the Spoof Detector. The
939
* returned set will become invalid if the spoof detector is closed,
940
* or if a new set of allowed characters is specified.
941
*
942
*
943
* @param sc The USpoofChecker
944
* @param status The error code, set if this function encounters a problem.
945
* @return A USet containing the characters that are permitted by
946
* the USPOOF_CHAR_LIMIT test.
947
* @stable ICU 4.2
948
*/
949
U_CAPI const USet * U_EXPORT2
950
uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
951
952
953
/**
954
* Check the specified string for possible security issues.
955
* The text to be checked will typically be an identifier of some sort.
956
* The set of checks to be performed is specified with uspoof_setChecks().
957
*
958
* \note
959
* Consider using the newer API, {@link uspoof_check2}, instead.
960
* The newer API exposes additional information from the check procedure
961
* and is otherwise identical to this method.
962
*
963
* @param sc The USpoofChecker
964
* @param id The identifier to be checked for possible security issues,
965
* in UTF-16 format.
966
* @param length the length of the string to be checked, expressed in
967
* 16 bit UTF-16 code units, or -1 if the string is
968
* zero terminated.
969
* @param position Deprecated in ICU 51. Always returns zero.
970
* Originally, an out parameter for the index of the first
971
* string position that failed a check.
972
* This parameter may be NULL.
973
* @param status The error code, set if an error occurred while attempting to
974
* perform the check.
975
* Spoofing or security issues detected with the input string are
976
* not reported here, but through the function's return value.
977
* @return An integer value with bits set for any potential security
978
* or spoofing issues detected. The bits are defined by
979
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
980
* will be zero if the input string passes all of the
981
* enabled checks.
982
* @see uspoof_check2
983
* @stable ICU 4.2
984
*/
985
U_CAPI int32_t U_EXPORT2
986
uspoof_check(const USpoofChecker *sc,
987
const UChar *id, int32_t length,
988
int32_t *position,
989
UErrorCode *status);
990
991
992
/**
993
* Check the specified string for possible security issues.
994
* The text to be checked will typically be an identifier of some sort.
995
* The set of checks to be performed is specified with uspoof_setChecks().
996
*
997
* \note
998
* Consider using the newer API, {@link uspoof_check2UTF8}, instead.
999
* The newer API exposes additional information from the check procedure
1000
* and is otherwise identical to this method.
1001
*
1002
* @param sc The USpoofChecker
1003
* @param id An identifier to be checked for possible security issues, in UTF-8 format.
1004
* @param length the length of the string to be checked, or -1 if the string is
1005
* zero terminated.
1006
* @param position Deprecated in ICU 51. Always returns zero.
1007
* Originally, an out parameter for the index of the first
1008
* string position that failed a check.
1009
* This parameter may be NULL.
1010
* @param status The error code, set if an error occurred while attempting to
1011
* perform the check.
1012
* Spoofing or security issues detected with the input string are
1013
* not reported here, but through the function's return value.
1014
* If the input contains invalid UTF-8 sequences,
1015
* a status of U_INVALID_CHAR_FOUND will be returned.
1016
* @return An integer value with bits set for any potential security
1017
* or spoofing issues detected. The bits are defined by
1018
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
1019
* will be zero if the input string passes all of the
1020
* enabled checks.
1021
* @see uspoof_check2UTF8
1022
* @stable ICU 4.2
1023
*/
1024
U_CAPI int32_t U_EXPORT2
1025
uspoof_checkUTF8(const USpoofChecker *sc,
1026
const char *id, int32_t length,
1027
int32_t *position,
1028
UErrorCode *status);
1029
1030
1031
/**
1032
* Check the specified string for possible security issues.
1033
* The text to be checked will typically be an identifier of some sort.
1034
* The set of checks to be performed is specified with uspoof_setChecks().
1035
*
1036
* @param sc The USpoofChecker
1037
* @param id The identifier to be checked for possible security issues,
1038
* in UTF-16 format.
1039
* @param length the length of the string to be checked, or -1 if the string is
1040
* zero terminated.
1041
* @param checkResult An instance of USpoofCheckResult to be filled with
1042
* details about the identifier. Can be NULL.
1043
* @param status The error code, set if an error occurred while attempting to
1044
* perform the check.
1045
* Spoofing or security issues detected with the input string are
1046
* not reported here, but through the function's return value.
1047
* @return An integer value with bits set for any potential security
1048
* or spoofing issues detected. The bits are defined by
1049
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
1050
* will be zero if the input string passes all of the
1051
* enabled checks. Any information in this bitmask will be
1052
* consistent with the information saved in the optional
1053
* checkResult parameter.
1054
* @see uspoof_openCheckResult
1055
* @see uspoof_check2UTF8
1056
* @see uspoof_check2UnicodeString
1057
* @stable ICU 58
1058
*/
1059
U_CAPI int32_t U_EXPORT2
1060
uspoof_check2(const USpoofChecker *sc,
1061
const UChar* id, int32_t length,
1062
USpoofCheckResult* checkResult,
1063
UErrorCode *status);
1064
1065
/**
1066
* Check the specified string for possible security issues.
1067
* The text to be checked will typically be an identifier of some sort.
1068
* The set of checks to be performed is specified with uspoof_setChecks().
1069
*
1070
* This version of {@link uspoof_check} accepts a USpoofCheckResult, which
1071
* returns additional information about the identifier. For more
1072
* information, see {@link uspoof_openCheckResult}.
1073
*
1074
* @param sc The USpoofChecker
1075
* @param id An identifier to be checked for possible security issues, in UTF-8 format.
1076
* @param length the length of the string to be checked, or -1 if the string is
1077
* zero terminated.
1078
* @param checkResult An instance of USpoofCheckResult to be filled with
1079
* details about the identifier. Can be NULL.
1080
* @param status The error code, set if an error occurred while attempting to
1081
* perform the check.
1082
* Spoofing or security issues detected with the input string are
1083
* not reported here, but through the function's return value.
1084
* @return An integer value with bits set for any potential security
1085
* or spoofing issues detected. The bits are defined by
1086
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
1087
* will be zero if the input string passes all of the
1088
* enabled checks. Any information in this bitmask will be
1089
* consistent with the information saved in the optional
1090
* checkResult parameter.
1091
* @see uspoof_openCheckResult
1092
* @see uspoof_check2
1093
* @see uspoof_check2UnicodeString
1094
* @stable ICU 58
1095
*/
1096
U_CAPI int32_t U_EXPORT2
1097
uspoof_check2UTF8(const USpoofChecker *sc,
1098
const char *id, int32_t length,
1099
USpoofCheckResult* checkResult,
1100
UErrorCode *status);
1101
1102
/**
1103
* Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return
1104
* information about the identifier. Information includes:
1105
* <ul>
1106
* <li>A bitmask of the checks that failed</li>
1107
* <li>The identifier's restriction level (UTS 39 section 5.2)</li>
1108
* <li>The set of numerics in the string (UTS 39 section 5.3)</li>
1109
* </ul>
1110
* The data held in a USpoofCheckResult is cleared whenever it is passed into a new call
1111
* of {@link uspoof_check2}.
1112
*
1113
* @param status The error code, set if this function encounters a problem.
1114
* @return the newly created USpoofCheckResult
1115
* @see uspoof_check2
1116
* @see uspoof_check2UTF8
1117
* @see uspoof_check2UnicodeString
1118
* @stable ICU 58
1119
*/
1120
U_CAPI USpoofCheckResult* U_EXPORT2
1121
uspoof_openCheckResult(UErrorCode *status);
1122
1123
/**
1124
* Close a USpoofCheckResult, freeing any memory that was being held by
1125
* its implementation.
1126
*
1127
* @param checkResult The instance of USpoofCheckResult to close
1128
* @stable ICU 58
1129
*/
1130
U_CAPI void U_EXPORT2
1131
uspoof_closeCheckResult(USpoofCheckResult *checkResult);
1132
1133
/**
1134
* Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
1135
* in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on.
1136
*
1137
* @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1138
* @param status The error code, set if an error occurred.
1139
* @return An integer value with bits set for any potential security
1140
* or spoofing issues detected. The bits are defined by
1141
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
1142
* will be zero if the input string passes all of the
1143
* enabled checks.
1144
* @see uspoof_setChecks
1145
* @stable ICU 58
1146
*/
1147
U_CAPI int32_t U_EXPORT2
1148
uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status);
1149
1150
/**
1151
* Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check
1152
* was enabled; otherwise, undefined.
1153
*
1154
* @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1155
* @param status The error code, set if an error occurred.
1156
* @return The restriction level contained in the USpoofCheckResult
1157
* @see uspoof_setRestrictionLevel
1158
* @stable ICU 58
1159
*/
1160
U_CAPI URestrictionLevel U_EXPORT2
1161
uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status);
1162
1163
/**
1164
* Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled;
1165
* otherwise, undefined. The set will contain the zero digit from each decimal number system found
1166
* in the input string. Ownership of the returned USet remains with the USpoofCheckResult.
1167
* The USet will be freed when {@link uspoof_closeCheckResult} is called.
1168
*
1169
* @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1170
* @return The set of numerics contained in the USpoofCheckResult
1171
* @param status The error code, set if an error occurred.
1172
* @stable ICU 58
1173
*/
1174
U_CAPI const USet* U_EXPORT2
1175
uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status);
1176
1177
1178
/**
1179
* Check whether two specified strings are visually confusable.
1180
*
1181
* If the strings are confusable, the return value will be nonzero, as long as
1182
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1183
*
1184
* The bits in the return value correspond to flags for each of the classes of
1185
* confusables applicable to the two input strings. According to UTS 39
1186
* section 4, the possible flags are:
1187
*
1188
* <ul>
1189
* <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
1190
* <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
1191
* <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
1192
* </ul>
1193
*
1194
* If one or more of the above flags were not listed in uspoof_setChecks(), this
1195
* function will never report that class of confusable. The check
1196
* {@link USPOOF_CONFUSABLE} enables all three flags.
1197
*
1198
*
1199
* @param sc The USpoofChecker
1200
* @param id1 The first of the two identifiers to be compared for
1201
* confusability. The strings are in UTF-16 format.
1202
* @param length1 the length of the first identifier, expressed in
1203
* 16 bit UTF-16 code units, or -1 if the string is
1204
* nul terminated.
1205
* @param id2 The second of the two identifiers to be compared for
1206
* confusability. The identifiers are in UTF-16 format.
1207
* @param length2 The length of the second identifier, expressed in
1208
* 16 bit UTF-16 code units, or -1 if the string is
1209
* nul terminated.
1210
* @param status The error code, set if an error occurred while attempting to
1211
* perform the check.
1212
* Confusability of the identifiers is not reported here,
1213
* but through this function's return value.
1214
* @return An integer value with bit(s) set corresponding to
1215
* the type of confusability found, as defined by
1216
* enum USpoofChecks. Zero is returned if the identifiers
1217
* are not confusable.
1218
*
1219
* @stable ICU 4.2
1220
*/
1221
U_CAPI int32_t U_EXPORT2
1222
uspoof_areConfusable(const USpoofChecker *sc,
1223
const UChar *id1, int32_t length1,
1224
const UChar *id2, int32_t length2,
1225
UErrorCode *status);
1226
1227
/**
1228
* Check whether two specified strings are visually confusable when
1229
* displayed in a context with the given paragraph direction.
1230
*
1231
* If the strings are confusable, the return value will be nonzero, as long as
1232
* {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1233
*
1234
* The bits in the return value correspond to flags for each of the classes of
1235
* confusables applicable to the two input strings. According to UTS 39
1236
* section 4, the possible flags are:
1237
*
1238
* <ul>
1239
* <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
1240
* <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
1241
* <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
1242
* </ul>
1243
*
1244
* If one or more of the above flags were not listed in uspoof_setChecks(), this
1245
* function will never report that class of confusable. The check
1246
* {@link USPOOF_CONFUSABLE} enables all three flags.
1247
*
1248
*
1249
* @param sc The USpoofChecker
1250
* @param direction The paragraph direction with which the identifiers are
1251
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
1252
* @param id1 The first of the two identifiers to be compared for
1253
* confusability. The strings are in UTF-16 format.
1254
* @param length1 the length of the first identifier, expressed in
1255
* 16 bit UTF-16 code units, or -1 if the string is
1256
* nul terminated.
1257
* @param id2 The second of the two identifiers to be compared for
1258
* confusability. The identifiers are in UTF-16 format.
1259
* @param length2 The length of the second identifier, expressed in
1260
* 16 bit UTF-16 code units, or -1 if the string is
1261
* nul terminated.
1262
* @param status The error code, set if an error occurred while attempting to
1263
* perform the check.
1264
* Confusability of the identifiers is not reported here,
1265
* but through this function's return value.
1266
* @return An integer value with bit(s) set corresponding to
1267
* the type of confusability found, as defined by
1268
* enum USpoofChecks. Zero is returned if the identifiers
1269
* are not confusable.
1270
*
1271
* @stable ICU 74
1272
*/
1273
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
1274
const UChar *id1, int32_t length1,
1275
const UChar *id2, int32_t length2,
1276
UErrorCode *status);
1277
1278
/**
1279
* A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
1280
*
1281
* @param sc The USpoofChecker
1282
* @param id1 The first of the two identifiers to be compared for
1283
* confusability. The strings are in UTF-8 format.
1284
* @param length1 the length of the first identifier, in bytes, or -1
1285
* if the string is nul terminated.
1286
* @param id2 The second of the two identifiers to be compared for
1287
* confusability. The strings are in UTF-8 format.
1288
* @param length2 The length of the second string in bytes, or -1
1289
* if the string is nul terminated.
1290
* @param status The error code, set if an error occurred while attempting to
1291
* perform the check.
1292
* Confusability of the strings is not reported here,
1293
* but through this function's return value.
1294
* @return An integer value with bit(s) set corresponding to
1295
* the type of confusability found, as defined by
1296
* enum USpoofChecks. Zero is returned if the strings
1297
* are not confusable.
1298
*
1299
* @stable ICU 4.2
1300
*
1301
* @see uspoof_areConfusable
1302
*/
1303
U_CAPI int32_t U_EXPORT2
1304
uspoof_areConfusableUTF8(const USpoofChecker *sc,
1305
const char *id1, int32_t length1,
1306
const char *id2, int32_t length2,
1307
UErrorCode *status);
1308
1309
/**
1310
* A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
1311
*
1312
* @param sc The USpoofChecker
1313
* @param direction The paragraph direction with which the identifiers are
1314
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
1315
* @param id1 The first of the two identifiers to be compared for
1316
* confusability. The strings are in UTF-8 format.
1317
* @param length1 the length of the first identifier, in bytes, or -1
1318
* if the string is nul terminated.
1319
* @param id2 The second of the two identifiers to be compared for
1320
* confusability. The strings are in UTF-8 format.
1321
* @param length2 The length of the second string in bytes, or -1
1322
* if the string is nul terminated.
1323
* @param status The error code, set if an error occurred while attempting to
1324
* perform the check.
1325
* Confusability of the strings is not reported here,
1326
* but through this function's return value.
1327
* @return An integer value with bit(s) set corresponding to
1328
* the type of confusability found, as defined by
1329
* enum USpoofChecks. Zero is returned if the strings
1330
* are not confusable.
1331
*
1332
* @stable ICU 74
1333
*
1334
* @see uspoof_areBidiConfusable
1335
*/
1336
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
1337
const char *id1, int32_t length1,
1338
const char *id2, int32_t length2,
1339
UErrorCode *status);
1340
1341
/**
1342
* Get the "skeleton" for an identifier.
1343
* Skeletons are a transformation of the input identifier.
1344
* Two identifiers are confusable if their skeletons are identical.
1345
* See Unicode Technical Standard #39 for additional information.
1346
*
1347
* Using skeletons directly makes it possible to quickly check
1348
* whether an identifier is confusable with any of some large
1349
* set of existing identifiers, by creating an efficiently
1350
* searchable collection of the skeletons.
1351
*
1352
* @param sc The USpoofChecker
1353
* @param type Deprecated in ICU 58. You may pass any number.
1354
* Originally, controlled which of the Unicode confusable data
1355
* tables to use.
1356
* @param id The input identifier whose skeleton will be computed.
1357
* @param length The length of the input identifier, expressed in 16 bit
1358
* UTF-16 code units, or -1 if the string is zero terminated.
1359
* @param dest The output buffer, to receive the skeleton string.
1360
* @param destCapacity The length of the output buffer, in 16 bit units.
1361
* The destCapacity may be zero, in which case the function will
1362
* return the actual length of the skeleton.
1363
* @param status The error code, set if an error occurred while attempting to
1364
* perform the check.
1365
* @return The length of the skeleton string. The returned length
1366
* is always that of the complete skeleton, even when the
1367
* supplied buffer is too small (or of zero length)
1368
*
1369
* @stable ICU 4.2
1370
* @see uspoof_areConfusable
1371
*/
1372
U_CAPI int32_t U_EXPORT2
1373
uspoof_getSkeleton(const USpoofChecker *sc,
1374
uint32_t type,
1375
const UChar *id, int32_t length,
1376
UChar *dest, int32_t destCapacity,
1377
UErrorCode *status);
1378
1379
/**
1380
* Get the "bidiSkeleton" for an identifier and a direction.
1381
* Skeletons are a transformation of the input identifier.
1382
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1383
* they are RTL-confusable if their RTL bidiSkeletons are identical.
1384
* See Unicode Technical Standard #39 for additional information:
1385
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
1386
*
1387
* Using skeletons directly makes it possible to quickly check
1388
* whether an identifier is confusable with any of some large
1389
* set of existing identifiers, by creating an efficiently
1390
* searchable collection of the skeletons.
1391
*
1392
* @param sc The USpoofChecker.
1393
* @param direction The context direction with which the identifier will be
1394
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
1395
* @param id The input identifier whose skeleton will be computed.
1396
* @param length The length of the input identifier, expressed in 16 bit
1397
* UTF-16 code units, or -1 if the string is zero terminated.
1398
* @param dest The output buffer, to receive the skeleton string.
1399
* @param destCapacity The length of the output buffer, in 16 bit units.
1400
* The destCapacity may be zero, in which case the function will
1401
* return the actual length of the skeleton.
1402
* @param status The error code, set if an error occurred while attempting to
1403
* perform the check.
1404
* @return The length of the skeleton string. The returned length
1405
* is always that of the complete skeleton, even when the
1406
* supplied buffer is too small (or of zero length)
1407
*
1408
* @stable ICU 74
1409
* @see uspoof_areBidiConfusable
1410
*/
1411
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
1412
UBiDiDirection direction,
1413
const UChar *id, int32_t length,
1414
UChar *dest, int32_t destCapacity, UErrorCode *status);
1415
1416
/**
1417
* Get the "skeleton" for an identifier.
1418
* Skeletons are a transformation of the input identifier.
1419
* Two identifiers are confusable if their skeletons are identical.
1420
* See Unicode Technical Standard #39 for additional information.
1421
*
1422
* Using skeletons directly makes it possible to quickly check
1423
* whether an identifier is confusable with any of some large
1424
* set of existing identifiers, by creating an efficiently
1425
* searchable collection of the skeletons.
1426
*
1427
* @param sc The USpoofChecker
1428
* @param type Deprecated in ICU 58. You may pass any number.
1429
* Originally, controlled which of the Unicode confusable data
1430
* tables to use.
1431
* @param id The UTF-8 format identifier whose skeleton will be computed.
1432
* @param length The length of the input string, in bytes,
1433
* or -1 if the string is zero terminated.
1434
* @param dest The output buffer, to receive the skeleton string.
1435
* @param destCapacity The length of the output buffer, in bytes.
1436
* The destCapacity may be zero, in which case the function will
1437
* return the actual length of the skeleton.
1438
* @param status The error code, set if an error occurred while attempting to
1439
* perform the check. Possible Errors include U_INVALID_CHAR_FOUND
1440
* for invalid UTF-8 sequences, and
1441
* U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
1442
* to hold the complete skeleton.
1443
* @return The length of the skeleton string, in bytes. The returned length
1444
* is always that of the complete skeleton, even when the
1445
* supplied buffer is too small (or of zero length)
1446
*
1447
* @stable ICU 4.2
1448
*/
1449
U_CAPI int32_t U_EXPORT2
1450
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
1451
uint32_t type,
1452
const char *id, int32_t length,
1453
char *dest, int32_t destCapacity,
1454
UErrorCode *status);
1455
1456
/**
1457
* Get the "bidiSkeleton" for an identifier and a direction.
1458
* Skeletons are a transformation of the input identifier;
1459
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1460
* they are RTL-confusable if their RTL bidiSkeletons are identical.
1461
* See Unicode Technical Standard #39 for additional information:
1462
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
1463
*
1464
* Using skeletons directly makes it possible to quickly check
1465
* whether an identifier is confusable with any of some large
1466
* set of existing identifiers, by creating an efficiently
1467
* searchable collection of the skeletons.
1468
*
1469
* @param sc The USpoofChecker
1470
* @param direction The context direction with which the identifier will be
1471
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
1472
* @param id The UTF-8 format identifier whose skeleton will be computed.
1473
* @param length The length of the input string, in bytes,
1474
* or -1 if the string is zero terminated.
1475
* @param dest The output buffer, to receive the skeleton string.
1476
* @param destCapacity The length of the output buffer, in bytes.
1477
* The destCapacity may be zero, in which case the function will
1478
* return the actual length of the skeleton.
1479
* @param status The error code, set if an error occurred while attempting to
1480
* perform the check. Possible Errors include U_INVALID_CHAR_FOUND
1481
* for invalid UTF-8 sequences, and
1482
* U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
1483
* to hold the complete skeleton.
1484
* @return The length of the skeleton string, in bytes. The returned length
1485
* is always that of the complete skeleton, even when the
1486
* supplied buffer is too small (or of zero length)
1487
*
1488
* @stable ICU 74
1489
*/
1490
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
1491
const char *id, int32_t length, char *dest,
1492
int32_t destCapacity, UErrorCode *status);
1493
1494
/**
1495
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
1496
* in http://unicode.org/Public/security/latest/xidmodifications.txt
1497
* and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1498
*
1499
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1500
* be deleted by the caller.
1501
*
1502
* @param status The error code, set if a problem occurs while creating the set.
1503
*
1504
* @stable ICU 51
1505
*/
1506
U_CAPI const USet * U_EXPORT2
1507
uspoof_getInclusionSet(UErrorCode *status);
1508
1509
/**
1510
* Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
1511
* in http://unicode.org/Public/security/latest/xidmodifications.txt
1512
* and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1513
*
1514
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1515
* be deleted by the caller.
1516
*
1517
* @param status The error code, set if a problem occurs while creating the set.
1518
*
1519
* @stable ICU 51
1520
*/
1521
U_CAPI const USet * U_EXPORT2
1522
uspoof_getRecommendedSet(UErrorCode *status);
1523
1524
/**
1525
* Serialize the data for a spoof detector into a chunk of memory.
1526
* The flattened spoof detection tables can later be used to efficiently
1527
* instantiate a new Spoof Detector.
1528
*
1529
* The serialized spoof checker includes only the data compiled from the
1530
* Unicode data tables by uspoof_openFromSource(); it does not include
1531
* any other state or configuration that may have been set.
1532
*
1533
* @param sc the Spoof Detector whose data is to be serialized.
1534
* @param data a pointer to 32-bit-aligned memory to be filled with the data,
1535
* can be NULL if capacity==0
1536
* @param capacity the number of bytes available at data,
1537
* or 0 for preflighting
1538
* @param status an in/out ICU UErrorCode; possible errors include:
1539
* - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
1540
* - U_ILLEGAL_ARGUMENT_ERROR the data or capacity parameters are bad
1541
* @return the number of bytes written or needed for the spoof data
1542
*
1543
* @see uspoof_openFromSerialized()
1544
* @stable ICU 4.2
1545
*/
1546
U_CAPI int32_t U_EXPORT2
1547
uspoof_serialize(USpoofChecker *sc,
1548
void *data, int32_t capacity,
1549
UErrorCode *status);
1550
1551
U_CDECL_END
1552
1553
#if U_SHOW_CPLUSPLUS_API
1554
1555
U_NAMESPACE_BEGIN
1556
1557
/**
1558
* \class LocalUSpoofCheckerPointer
1559
* "Smart pointer" class, closes a USpoofChecker via uspoof_close().
1560
* For most methods see the LocalPointerBase base class.
1561
*
1562
* @see LocalPointerBase
1563
* @see LocalPointer
1564
* @stable ICU 4.4
1565
*/
1566
/**
1567
* \cond
1568
* Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER.
1569
* For now, suppress with a Doxygen cond
1570
*/
1571
U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close);
1572
/** \endcond */
1573
1574
/**
1575
* \class LocalUSpoofCheckResultPointer
1576
* "Smart pointer" class, closes a USpoofCheckResult via `uspoof_closeCheckResult()`.
1577
* For most methods see the LocalPointerBase base class.
1578
*
1579
* @see LocalPointerBase
1580
* @see LocalPointer
1581
* @stable ICU 58
1582
*/
1583
1584
/**
1585
* \cond
1586
* Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER.
1587
* For now, suppress with a Doxygen cond
1588
*/
1589
U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult);
1590
/** \endcond */
1591
1592
U_NAMESPACE_END
1593
1594
/**
1595
* Limit the acceptable characters to those specified by a Unicode Set.
1596
* Any previously specified character limit is
1597
* replaced by the new settings. This includes limits on
1598
* characters that were set with the uspoof_setAllowedLocales() function.
1599
*
1600
* The USPOOF_CHAR_LIMIT test is automatically enabled for this
1601
* USpoofChecker by this function.
1602
*
1603
* @param sc The USpoofChecker
1604
* @param chars A Unicode Set containing the list of
1605
* characters that are permitted. Ownership of the set
1606
* remains with the caller. The incoming set is cloned by
1607
* this function, so there are no restrictions on modifying
1608
* or deleting the UnicodeSet after calling this function.
1609
* @param status The error code, set if this function encounters a problem.
1610
* @stable ICU 4.2
1611
*/
1612
U_CAPI void U_EXPORT2
1613
uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status);
1614
1615
1616
/**
1617
* Get a UnicodeSet for the characters permitted in an identifier.
1618
* This corresponds to the limits imposed by the Set Allowed Characters /
1619
* UnicodeSet functions. Limitations imposed by other checks will not be
1620
* reflected in the set returned by this function.
1621
*
1622
* The returned set will be frozen, meaning that it cannot be modified
1623
* by the caller.
1624
*
1625
* Ownership of the returned set remains with the Spoof Detector. The
1626
* returned set will become invalid if the spoof detector is closed,
1627
* or if a new set of allowed characters is specified.
1628
*
1629
*
1630
* @param sc The USpoofChecker
1631
* @param status The error code, set if this function encounters a problem.
1632
* @return A UnicodeSet containing the characters that are permitted by
1633
* the USPOOF_CHAR_LIMIT test.
1634
* @stable ICU 4.2
1635
*/
1636
U_CAPI const icu::UnicodeSet * U_EXPORT2
1637
uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
1638
1639
/**
1640
* Check the specified string for possible security issues.
1641
* The text to be checked will typically be an identifier of some sort.
1642
* The set of checks to be performed is specified with uspoof_setChecks().
1643
*
1644
* \note
1645
* Consider using the newer API, {@link uspoof_check2UnicodeString}, instead.
1646
* The newer API exposes additional information from the check procedure
1647
* and is otherwise identical to this method.
1648
*
1649
* @param sc The USpoofChecker
1650
* @param id An identifier to be checked for possible security issues.
1651
* @param position Deprecated in ICU 51. Always returns zero.
1652
* Originally, an out parameter for the index of the first
1653
* string position that failed a check.
1654
* This parameter may be nullptr.
1655
* @param status The error code, set if an error occurred while attempting to
1656
* perform the check.
1657
* Spoofing or security issues detected with the input string are
1658
* not reported here, but through the function's return value.
1659
* @return An integer value with bits set for any potential security
1660
* or spoofing issues detected. The bits are defined by
1661
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
1662
* will be zero if the input string passes all of the
1663
* enabled checks.
1664
* @see uspoof_check2UnicodeString
1665
* @stable ICU 4.2
1666
*/
1667
U_CAPI int32_t U_EXPORT2
1668
uspoof_checkUnicodeString(const USpoofChecker *sc,
1669
const icu::UnicodeString &id,
1670
int32_t *position,
1671
UErrorCode *status);
1672
1673
/**
1674
* Check the specified string for possible security issues.
1675
* The text to be checked will typically be an identifier of some sort.
1676
* The set of checks to be performed is specified with uspoof_setChecks().
1677
*
1678
* @param sc The USpoofChecker
1679
* @param id An identifier to be checked for possible security issues.
1680
* @param checkResult An instance of USpoofCheckResult to be filled with
1681
* details about the identifier. Can be nullptr.
1682
* @param status The error code, set if an error occurred while attempting to
1683
* perform the check.
1684
* Spoofing or security issues detected with the input string are
1685
* not reported here, but through the function's return value.
1686
* @return An integer value with bits set for any potential security
1687
* or spoofing issues detected. The bits are defined by
1688
* enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
1689
* will be zero if the input string passes all of the
1690
* enabled checks. Any information in this bitmask will be
1691
* consistent with the information saved in the optional
1692
* checkResult parameter.
1693
* @see uspoof_openCheckResult
1694
* @see uspoof_check2
1695
* @see uspoof_check2UTF8
1696
* @stable ICU 58
1697
*/
1698
U_CAPI int32_t U_EXPORT2
1699
uspoof_check2UnicodeString(const USpoofChecker *sc,
1700
const icu::UnicodeString &id,
1701
USpoofCheckResult* checkResult,
1702
UErrorCode *status);
1703
1704
/**
1705
* A version of {@link uspoof_areConfusable} accepting UnicodeStrings.
1706
*
1707
* @param sc The USpoofChecker
1708
* @param s1 The first of the two identifiers to be compared for
1709
* confusability, as an icu::UnicodeString.
1710
* @param s2 The second of the two identifiers to be compared for
1711
* confusability, as an icu::UnicodeString.
1712
* @param status The error code, set if an error occurred while attempting to
1713
* perform the check.
1714
* Confusability of the identifiers is not reported here,
1715
* but through this function's return value.
1716
* @return An integer value with bit(s) set corresponding to
1717
* the type of confusability found, as defined by
1718
* enum USpoofChecks. Zero is returned if the identifiers
1719
* are not confusable.
1720
*
1721
* @stable ICU 4.2
1722
*
1723
* @see uspoof_areConfusable
1724
*/
1725
U_CAPI int32_t U_EXPORT2
1726
uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
1727
const icu::UnicodeString &s1,
1728
const icu::UnicodeString &s2,
1729
UErrorCode *status);
1730
1731
/**
1732
* A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
1733
*
1734
* @param sc The USpoofChecker
1735
* @param direction The paragraph direction with which the identifiers are
1736
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
1737
* @param s1 The first of the two identifiers to be compared for
1738
* confusability, as an icu::UnicodeString.
1739
* @param s2 The second of the two identifiers to be compared for
1740
* confusability, as an icu::UnicodeString.
1741
* @param status The error code, set if an error occurred while attempting to
1742
* perform the check.
1743
* Confusability of the identifiers is not reported here,
1744
* but through this function's return value.
1745
* @return An integer value with bit(s) set corresponding to
1746
* the type of confusability found, as defined by
1747
* enum USpoofChecks. Zero is returned if the identifiers
1748
* are not confusable.
1749
*
1750
* @stable ICU 74
1751
*
1752
* @see uspoof_areBidiConfusable
1753
*/
1754
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
1755
UBiDiDirection direction,
1756
const icu::UnicodeString &s1,
1757
const icu::UnicodeString &s2,
1758
UErrorCode *status);
1759
1760
/**
1761
* Get the "skeleton" for an identifier.
1762
* Skeletons are a transformation of the input identifier;
1763
* Two identifiers are confusable if their skeletons are identical.
1764
* See Unicode Technical Standard #39 for additional information.
1765
*
1766
* Using skeletons directly makes it possible to quickly check
1767
* whether an identifier is confusable with any of some large
1768
* set of existing identifiers, by creating an efficiently
1769
* searchable collection of the skeletons.
1770
*
1771
* @param sc The USpoofChecker.
1772
* @param type Deprecated in ICU 58. You may pass any number.
1773
* Originally, controlled which of the Unicode confusable data
1774
* tables to use.
1775
* @param id The input identifier whose skeleton will be computed.
1776
* @param dest The output identifier, to receive the skeleton string.
1777
* @param status The error code, set if an error occurred while attempting to
1778
* perform the check.
1779
* @return A reference to the destination (skeleton) string.
1780
*
1781
* @stable ICU 4.2
1782
*/
1783
U_I18N_API icu::UnicodeString & U_EXPORT2
1784
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
1785
uint32_t type,
1786
const icu::UnicodeString &id,
1787
icu::UnicodeString &dest,
1788
UErrorCode *status);
1789
1790
/**
1791
* Get the "bidiSkeleton" for an identifier and a direction.
1792
* Skeletons are a transformation of the input identifier;
1793
* Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1794
* they are RTL-confusable if their RTL bidiSkeletons are identical.
1795
* See Unicode Technical Standard #39 for additional information:
1796
* https://www.unicode.org/reports/tr39/#Confusable_Detection.
1797
*
1798
* Using skeletons directly makes it possible to quickly check
1799
* whether an identifier is confusable with any of some large
1800
* set of existing identifiers, by creating an efficiently
1801
* searchable collection of the skeletons.
1802
*
1803
* @param sc The USpoofChecker.
1804
* @param direction The context direction with which the identifier will be
1805
* displayed. Must be either UBIDI_LTR or UBIDI_RTL.
1806
* @param id The input identifier whose bidiSkeleton will be computed.
1807
* @param dest The output identifier, to receive the skeleton string.
1808
* @param status The error code, set if an error occurred while attempting to
1809
* perform the check.
1810
* @return A reference to the destination (skeleton) string.
1811
*
1812
* @stable ICU 74
1813
*/
1814
U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
1815
const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
1816
icu::UnicodeString &dest, UErrorCode *status);
1817
1818
/**
1819
* Get the set of Candidate Characters for Inclusion in Identifiers, as defined
1820
* in http://unicode.org/Public/security/latest/xidmodifications.txt
1821
* and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1822
*
1823
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1824
* be deleted by the caller.
1825
*
1826
* @param status The error code, set if a problem occurs while creating the set.
1827
*
1828
* @stable ICU 51
1829
*/
1830
U_CAPI const icu::UnicodeSet * U_EXPORT2
1831
uspoof_getInclusionUnicodeSet(UErrorCode *status);
1832
1833
/**
1834
* Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
1835
* in http://unicode.org/Public/security/latest/xidmodifications.txt
1836
* and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1837
*
1838
* The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1839
* be deleted by the caller.
1840
*
1841
* @param status The error code, set if a problem occurs while creating the set.
1842
*
1843
* @stable ICU 51
1844
*/
1845
U_CAPI const icu::UnicodeSet * U_EXPORT2
1846
uspoof_getRecommendedUnicodeSet(UErrorCode *status);
1847
1848
#endif /* U_SHOW_CPLUSPLUS_API */
1849
1850
#endif /* UCONFIG_NO_NORMALIZATION */
1851
1852
#endif /* USPOOF_H */
1853
1854