CoCalc -- uspoof.h

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/icu4c/i18n/unicode/uspoof.h
⁹⁹¹² views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
***************************************************************************
5
* Copyright (C) 2008-2016, International Business Machines Corporation
6
* and others. All Rights Reserved.
7
***************************************************************************
8
*   file name:  uspoof.h
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2008Feb13
14
*   created by: Andy Heninger
15
*
16
*   Unicode Spoof Detection
17
*/
18

19
#ifndef USPOOF_H
20
#define USPOOF_H
21

22
#include "unicode/ubidi.h"
23
#include "unicode/utypes.h"
24
#include "unicode/uset.h"
25
#include "unicode/parseerr.h"
26

27
#if !UCONFIG_NO_NORMALIZATION
28

29

30
#if U_SHOW_CPLUSPLUS_API
31
#include "unicode/localpointer.h"
32
#include "unicode/unistr.h"
33
#include "unicode/uniset.h"
34
#endif
35

36

37
/**
38
 * \file
39
 * \brief C API: Unicode Security and Spoofing Detection
40
 *
41
 * <p>
42
 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
43
 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
44
 *
45
 * <ol>
46
 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
47
 * &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
48
 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
49
 * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
50
 * </ol>
51
 *
52
 * <p>
53
 * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
54
 * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
55
 * content filters.
56
 *
57
 * <p>
58
 * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++.
59
 *
60
 * <h2>Confusables</h2>
61
 *
62
 * <p>
63
 * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings:
64
 *
65
 * \code{.c}
66
 * UErrorCode status = U_ZERO_ERROR;
67
 * UChar* str1 = (UChar*) u"Harvest";
68
 * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
69
 *
70
 * USpoofChecker* sc = uspoof_open(&status);
71
 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
72
 *
73
 * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
74
 * UBool result = bitmask != 0;
75
 * // areConfusable: 1 (status: U_ZERO_ERROR)
76
 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
77
 * uspoof_close(sc);
78
 * \endcode
79
 *
80
 * <p>
81
 * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
82
 * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
83
 * confusability test; and the following line extracts the result out of the return value. For best performance,
84
 * the instance should be created once (e.g., upon application startup), and the efficient
85
 * {@link uspoof_areConfusable} method can be used at runtime.
86
 *
87
 * If the paragraph direction used to display the strings is known, the bidi function should be used instead:
88
 *
89
 * \code{.c}
90
 * UErrorCode status = U_ZERO_ERROR;
91
 * // These strings look identical when rendered in a left-to-right context.
92
 * // They look distinct in a right-to-left context.
93
 * UChar* str1 = (UChar*) u"A1\u05D0";  // A1א
94
 * UChar* str2 = (UChar*) u"A\u05D01";  // Aא1
95
 *
96
 * USpoofChecker* sc = uspoof_open(&status);
97
 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
98
 *
99
 * int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
100
 * UBool result = bitmask != 0;
101
 * // areBidiConfusable: 1 (status: U_ZERO_ERROR)
102
 * printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
103
 * uspoof_close(sc);
104
 * \endcode
105
 *
106
 * <p>
107
 * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers.  It will automatically call
108
 * {@link uspoof_close} when the object goes out of scope:
109
 *
110
 * \code{.cpp}
111
 * UErrorCode status = U_ZERO_ERROR;
112
 * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
113
 * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status);
114
 * // ...
115
 * \endcode
116
 *
117
 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can
118
 * be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so
119
 * the following snippet is equivalent to the example above:
120
 *
121
 * \code{.c}
122
 * UErrorCode status = U_ZERO_ERROR;
123
 * UChar* str1 = (UChar*) u"Harvest";
124
 * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
125
 *
126
 * USpoofChecker* sc = uspoof_open(&status);
127
 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
128
 *
129
 * // Get skeleton 1
130
 * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
131
 * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
132
 * status = U_ZERO_ERROR;
133
 * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
134
 *
135
 * // Get skeleton 2
136
 * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
137
 * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
138
 * status = U_ZERO_ERROR;
139
 * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
140
 *
141
 * // Are the skeletons the same?
142
 * UBool result = u_strcmp(skel1, skel2) == 0;
143
 * // areConfusable: 1 (status: U_ZERO_ERROR)
144
 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
145
 * uspoof_close(sc);
146
 * free(skel1);
147
 * free(skel2);
148
 * \endcode
149
 *
150
 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
151
 * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below:
152
 *
153
 * \code{.c}
154
 * UErrorCode status = U_ZERO_ERROR;
155
 * #define DICTIONARY_LENGTH 2
156
 * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
157
 * UChar* skeletons[DICTIONARY_LENGTH];
158
 * UChar* str = (UChar*) u"1orern";
159
 *
160
 * // Setup:
161
 * USpoofChecker* sc = uspoof_open(&status);
162
 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
163
 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
164
 *     UChar* word = dictionary[i];
165
 *     int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
166
 *     skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
167
 *     status = U_ZERO_ERROR;
168
 *     uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
169
 * }
170
 *
171
 * // Live Check:
172
 * {
173
 *     int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
174
 *     UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
175
 *     status = U_ZERO_ERROR;
176
 *     uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
177
 *     UBool result = false;
178
 *     for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
179
 *         result = u_strcmp(skel, skeletons[i]) == 0;
180
 *         if (result == true) { break; }
181
 *     }
182
 *     // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
183
 *     printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
184
 *     free(skel);
185
 * }
186
 *
187
 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
188
 *     free(skeletons[i]);
189
 * }
190
 * uspoof_close(sc);
191
 * \endcode
192
 *
193
 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
194
 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
195
 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
196
 *
197
 * <h2>Spoof Detection</h2>
198
 *
199
 * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a
200
 * string:
201
 *
202
 * \code{.c}
203
 * UErrorCode status = U_ZERO_ERROR;
204
 * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
205
 *
206
 * // Get the default set of allowable characters:
207
 * USet* allowed = uset_openEmpty();
208
 * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
209
 * uset_addAll(allowed, uspoof_getInclusionSet(&status));
210
 *
211
 * USpoofChecker* sc = uspoof_open(&status);
212
 * uspoof_setAllowedChars(sc, allowed, &status);
213
 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
214
 *
215
 * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
216
 * UBool result = bitmask != 0;
217
 * // fails checks: 1 (status: U_ZERO_ERROR)
218
 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
219
 * uspoof_close(sc);
220
 * uset_close(allowed);
221
 * \endcode
222
 *
223
 * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at
224
 * startup, and call the cheaper {@link uspoof_check} online. We specify the set of
225
 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39.
226
 *
227
 * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings,
228
 * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers.
229
 *
230
 * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks
231
 * is available in the returned bitmask.  For complete information, use the {@link uspoof_check2} class of functions
232
 * with a {@link USpoofCheckResult} parameter:
233
 *
234
 * \code{.c}
235
 * UErrorCode status = U_ZERO_ERROR;
236
 * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
237
 *
238
 * // Get the default set of allowable characters:
239
 * USet* allowed = uset_openEmpty();
240
 * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
241
 * uset_addAll(allowed, uspoof_getInclusionSet(&status));
242
 *
243
 * USpoofChecker* sc = uspoof_open(&status);
244
 * uspoof_setAllowedChars(sc, allowed, &status);
245
 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
246
 *
247
 * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
248
 * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status);
249
 *
250
 * int32_t failures1 = bitmask;
251
 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
252
 * assert(failures1 == failures2);
253
 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
254
 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
255
 *
256
 * // Cleanup:
257
 * uspoof_close(sc);
258
 * uset_close(allowed);
259
 * uspoof_closeCheckResult(checkResult);
260
 * \endcode
261
 *
262
 * C++ users can take advantage of a few syntactical conveniences.  The following snippet is functionally
263
 * equivalent to the one above:
264
 *
265
 * \code{.cpp}
266
 * UErrorCode status = U_ZERO_ERROR;
267
 * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
268
 *
269
 * // Get the default set of allowable characters:
270
 * UnicodeSet allowed;
271
 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
272
 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
273
 *
274
 * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
275
 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
276
 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
277
 *
278
 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
279
 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
280
 *
281
 * int32_t failures1 = bitmask;
282
 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
283
 * assert(failures1 == failures2);
284
 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
285
 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
286
 *
287
 * // Explicit cleanup not necessary.
288
 * \endcode
289
 *
290
 * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
291
 * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
292
 *
293
 * <ul>
294
 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
295
 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
296
 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
297
 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
298
 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
299
 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
300
 * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li>
301
 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
302
 * </ul>
303
 *
304
 * <p>
305
 * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
306
 * INVISIBLE and MIXED_NUMBERS conditions, you could do:
307
 *
308
 * \code{.c}
309
 * UErrorCode status = U_ZERO_ERROR;
310
 * UChar* str = (UChar*) u"8\u09EA";  // 8 mixed with U+09EA BENGALI DIGIT FOUR
311
 *
312
 * USpoofChecker* sc = uspoof_open(&status);
313
 * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
314
 *
315
 * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
316
 * UBool result = bitmask != 0;
317
 * // fails checks: 1 (status: U_ZERO_ERROR)
318
 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
319
 * uspoof_close(sc);
320
 * \endcode
321
 *
322
 * Here is an example in C++ showing how to compute the restriction level of a string:
323
 *
324
 * \code{.cpp}
325
 * UErrorCode status = U_ZERO_ERROR;
326
 * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
327
 *
328
 * // Get the default set of allowable characters:
329
 * UnicodeSet allowed;
330
 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
331
 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
332
 *
333
 * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
334
 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
335
 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
336
 * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status);
337
 *
338
 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
339
 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
340
 *
341
 * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
342
 * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
343
 * assert((restrictionLevel & bitmask) == restrictionLevel);
344
 * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
345
 * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
346
 * \endcode
347
 *
348
 * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE.  Since
349
 * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
350
 *
351
 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
352
 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
353
 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
354
 * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
355
 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
356
 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
357
 * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of
358
 * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code
359
 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
360
 * scripts.
361
 *
362
 * <h2>Advanced bidirectional usage</h2>
363
 * If the paragraph direction with which the identifiers will be displayed is not known, there are
364
 * multiple options for confusable detection depending on the circumstances.
365
 *
366
 * <p>
367
 * In some circumstances, the only concern is confusion between identifiers displayed with the same
368
 * paragraph direction.
369
 *
370
 * <p>
371
 * An example is the case where identifiers are usernames prefixed with the @ symbol.
372
 * That symbol will appear to the left in a left-to-right context, and to the right in a
373
 * right-to-left context, so that an identifier displayed in a left-to-right context can never be
374
 * confused with an identifier displayed in a right-to-left context:
375
 * <ul>
376
 * <li>
377
 * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
378
 * would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the
379
 * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
380
 * confusable, since they both appear as A_1א@ in a right-to-left context.
381
 * </li>
382
 * <li>
383
 * The username "Mark_" would not be considered confusable with the username "_Mark",
384
 * even though the latter would appear as Mark_@ in a right-to-left context, and the
385
 * former as \@Mark_ in a left-to-right context.
386
 * </li>
387
 * </ul>
388
 * <p>
389
 * In that case, the caller should check for both LTR-confusability and RTL-confusability:
390
 *
391
 * \code{.cpp}
392
 * bool confusableInEitherDirection =
393
 *     uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) ||
394
 *     uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status);
395
 * \endcode
396
 *
397
 * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
398
 * with LTR and RTL with RTL.
399
 *
400
 * <p>
401
 * In cases where confusability between the visual appearances of an identifier displayed in a
402
 * left-to-right context with another identifier displayed in a right-to-left context is a concern,
403
 * the LTR skeleton of one can be compared with the RTL skeleton of the other.  However, this
404
 * very broad definition of confusability may have unexpected results; for instance, it treats the
405
 * ASCII identifiers "Mark_" and "_Mark" as confusable.
406
 *
407
 * <h2>Additional Information</h2>
408
 *
409
 * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
410
 *
411
 * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
412
 * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
413
 * using the same USpoofChecker instance.
414
 *
415
 * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
416
 * thread safe. Those that take a non-const USpoofChecker are not thread safe.
417
 *
418
 * @stable ICU 4.6
419
 */
420

421
U_CDECL_BEGIN
422

423
struct USpoofChecker;
/**
 * The spoof checker object used by the uspoof_ functions.
 * The struct is only declared (not defined) in this header, so callers handle it
 * through pointers: create an instance with uspoof_open() and release it with
 * uspoof_close().
 *
 * @stable ICU 4.2
 */
typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */
428

429
struct USpoofCheckResult;
/**
 * Receives the detailed outcome of a uspoof_check2() call.
 * The struct is only declared (not defined) in this header; obtain an instance with
 * uspoof_openCheckResult() and release it with uspoof_closeCheckResult().
 *
 * @see uspoof_openCheckResult
 * @stable ICU 58
 */
typedef struct USpoofCheckResult USpoofCheckResult;
435

436
/**
 * Enum for the kinds of checks that USpoofChecker can perform.
 * These enum values are used both to select the set of checks that
 * will be performed, and to report results from the check function.
 *
 * Each check occupies its own bit, so values may be combined with bitwise OR
 * when passed to {@link uspoof_setChecks} and tested with bitwise AND in results.
 *
 * @stable ICU 4.2
 */
typedef enum USpoofChecks {
    /**
     * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
     * 4.
     *
     * @see uspoof_areConfusable
     * @stable ICU 4.2
     */
    USPOOF_SINGLE_SCRIPT_CONFUSABLE =   1,

    /**
     * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
     * 39 section 4.
     *
     * @see uspoof_areConfusable
     * @stable ICU 4.2
     */
    USPOOF_MIXED_SCRIPT_CONFUSABLE  =   2,

    /**
     * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are not from the same script but both of them are
     * single-script strings, according to UTS 39 section 4.
     *
     * @see uspoof_areConfusable
     * @stable ICU 4.2
     */
    USPOOF_WHOLE_SCRIPT_CONFUSABLE  =   4,

    /**
     * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables.  You may set
     * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
     * make {@link uspoof_areConfusable} return only those types of confusables.
     *
     * @see uspoof_areConfusable
     * @see uspoof_getSkeleton
     * @stable ICU 58
     */
    USPOOF_CONFUSABLE               =   USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,

#ifndef U_HIDE_DEPRECATED_API
    /**
     * This flag is deprecated and no longer affects the behavior of SpoofChecker.
     *
     * @deprecated ICU 58  Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated.
     */
    USPOOF_ANY_CASE                 =   8,
#endif  /* U_HIDE_DEPRECATED_API */

    /**
     * Check that an identifier is no looser than the specified RestrictionLevel.
     * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE.
     *
     * If USPOOF_AUX_INFO is enabled the actual restriction level of the
     * identifier being tested will also be returned by uspoof_check().
     *
     * @see URestrictionLevel
     * @see uspoof_setRestrictionLevel
     * @see USPOOF_AUX_INFO
     *
     * @stable ICU 51
     */
    USPOOF_RESTRICTION_LEVEL        =  16,

#ifndef U_HIDE_DEPRECATED_API
    /**
     * Check that an identifier contains only characters from a
     * single script (plus chars from the common and inherited scripts.)
     * Applies to checks of a single identifier check only.
     * Shares its value with USPOOF_RESTRICTION_LEVEL.
     * @deprecated ICU 51  Use RESTRICTION_LEVEL instead.
     */
    USPOOF_SINGLE_SCRIPT            =  USPOOF_RESTRICTION_LEVEL,
#endif  /* U_HIDE_DEPRECATED_API */

    /**
     * Check an identifier for the presence of invisible characters,
     * such as zero-width spaces, or character sequences that are
     * likely not to display, such as multiple occurrences of the same
     * non-spacing mark.  This check does not test the input string as a whole
     * for conformance to any particular syntax for identifiers.
     */
    USPOOF_INVISIBLE                =  32,

    /**
     * Check that an identifier contains only characters from a specified set
     * of acceptable characters.  See {@link uspoof_setAllowedChars} and
     * {@link uspoof_setAllowedLocales}.  Note that a string that fails this check
     * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
     */
    USPOOF_CHAR_LIMIT               =  64,

    /**
     * Check that an identifier does not mix numbers from different numbering systems.
     * For more information, see UTS 39 section 5.3.
     *
     * @stable ICU 51
     */
    USPOOF_MIXED_NUMBERS            = 128,

    /**
     * Check that an identifier does not have a combining character following a character in which that
     * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
     *
     * More specifically, the following characters are forbidden from preceding a U+0307:
     * <ul>
     * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
     * <li>Latin lowercase letter 'l'</li>
     * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
     * <li>Any character whose confusable prototype ends with such a character
     * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
     * </ul>
     * In addition, combining characters are allowed between the above characters and U+0307 except those
     * with combining class 0 or combining class "Above" (230, same class as U+0307).
     *
     * This list and the number of combining characters considered by this check may grow over time.
     *
     * @stable ICU 62
     */
    USPOOF_HIDDEN_OVERLAY           = 256,

    /**
     * Enable all spoof checks.
     *
     * @stable ICU 4.6
     */
    USPOOF_ALL_CHECKS               = 0xFFFF,

    /**
     * Enable the return of auxiliary (non-error) information in the
     * upper bits of the check results value.
     *
     * If this "check" is not enabled, the results of {@link uspoof_check} will be
     * zero when an identifier passes all of the enabled checks.
     *
     * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will
     * be zero when an identifier passes all checks.
     *
     * @stable ICU 51
     */
    USPOOF_AUX_INFO                 = 0x40000000

} USpoofChecks;
584

585

586
    /**
     * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
     * for returned identifier restriction levels in check results.
     *
     * The levels are listed from most restrictive (USPOOF_ASCII) to least restrictive
     * (USPOOF_UNRESTRICTIVE), with numerically increasing values.
     *
     * @stable ICU 51
     *
     * @see uspoof_setRestrictionLevel
     * @see uspoof_check
     */
    typedef enum URestrictionLevel {
        /**
         * All characters in the string are in the identifier profile and all characters in the string are in the
         * ASCII range.
         *
         * @stable ICU 51
         */
        USPOOF_ASCII = 0x10000000,
        /**
         * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and
         * the string is single-script, according to the definition in UTS 39 section 5.1.
         *
         * @stable ICU 53
         */
        USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000,
        /**
         * The string classifies as Single Script, or all characters in the string are in the identifier profile and
         * the string is covered by any of the following sets of scripts, according to the definition in UTS 39
         * section 5.1:
         * <ul>
         *   <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
         *   <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
         *   <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
         * </ul>
         * This is the default restriction in ICU.
         *
         * @stable ICU 51
         */
        USPOOF_HIGHLY_RESTRICTIVE = 0x30000000,
        /**
         * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
         * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
         * Greek, and Cherokee.
         *
         * @stable ICU 51
         */
        USPOOF_MODERATELY_RESTRICTIVE = 0x40000000,
        /**
         * All characters in the string are in the identifier profile.  Allow arbitrary mixtures of scripts.
         *
         * @stable ICU 51
         */
        USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000,
        /**
         * Any valid identifiers, including characters outside of the Identifier Profile.
         *
         * @stable ICU 51
         */
        USPOOF_UNRESTRICTIVE = 0x60000000,
        /**
         * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}.
         *
         * @stable ICU 53
         */
        USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000,
#ifndef U_HIDE_INTERNAL_API
        /**
         * An undefined restriction level.
         * @internal
         */
        USPOOF_UNDEFINED_RESTRICTIVE = -1
#endif  /* U_HIDE_INTERNAL_API */
    } URestrictionLevel;
658

659
/**
660
 *  Create a Unicode Spoof Checker, configured to perform all
661
 *  checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
662
 *  Note that additional checks may be added in the future,
663
 *  resulting in changes to the default checking behavior.
664
 *
665
 *  @param status  The error code, set if this function encounters a problem.
666
 *  @return        the newly created Spoof Checker
667
 *  @stable ICU 4.2
668
 */
669
U_CAPI USpoofChecker * U_EXPORT2
670
uspoof_open(UErrorCode *status);
671

672

673
/**
674
 * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory.
675
 * Inverse of uspoof_serialize().
676
 * The memory containing the serialized data must remain valid and unchanged
677
 * as long as the spoof checker, or any cloned copies of the spoof checker,
678
 * are in use.  Ownership of the memory remains with the caller.
679
 * The spoof checker (and any clones) must be closed prior to deleting the
680
 * serialized data.
681
 *
682
 * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data
683
 * @param length the number of bytes available at data;
684
 *               can be more than necessary
685
 * @param pActualLength receives the actual number of bytes at data taken up by the data;
686
 *                      can be NULL
687
 * @param pErrorCode ICU error code
688
 * @return the spoof checker.
689
 *
690
 * @see uspoof_open
691
 * @see uspoof_serialize
692
 * @stable ICU 4.2
693
 */
694
U_CAPI USpoofChecker * U_EXPORT2
695
uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
696
                          UErrorCode *pErrorCode);
697

698
/**
699
  * Open a Spoof Checker from the source form of the spoof data.
700
  * The input corresponds to the Unicode data file confusables.txt
701
  * as described in Unicode Technical Standard #39.  The syntax of the source data
702
  * is as described in UTS #39 for this file, and the content of
703
  * this file is acceptable input.
704
  *
705
  * The character encoding of the (char *) input text is UTF-8.
706
  *
707
  * @param confusables a pointer to the confusable characters definitions,
708
  *                    as found in file confusables.txt from unicode.org.
709
  * @param confusablesLen The length of the confusables text, or -1 if the
710
  *                    input string is zero terminated.
711
  * @param confusablesWholeScript
712
  *                    Deprecated in ICU 58.  No longer used.
713
  * @param confusablesWholeScriptLen
714
  *                    Deprecated in ICU 58.  No longer used.
715
  * @param errType     In the event of an error in the input, indicates
716
  *                    which of the input files contains the error.
717
  *                    The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or
718
  *                    USPOOF_WHOLE_SCRIPT_CONFUSABLE, or
719
  *                    zero if no errors are found.
720
  * @param pe          In the event of an error in the input, receives the position
721
  *                    in the input text (line, offset) of the error.
722
  * @param status      an in/out ICU UErrorCode.  Among the possible errors is
723
  *                    U_PARSE_ERROR, which is used to report syntax errors
724
  *                    in the input.
725
  * @return            A spoof checker that uses the rules from the input files.
726
  * @stable ICU 4.2
727
  */
728
U_CAPI USpoofChecker * U_EXPORT2
729
uspoof_openFromSource(const char *confusables,  int32_t confusablesLen,
730
                      const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
731
                      int32_t *errType, UParseError *pe, UErrorCode *status);
732

733

734
/**
735
  * Close a Spoof Checker, freeing any memory that was being held by
736
  *   its implementation.
737
  * @param sc   The USpoofChecker to be closed.
  * @stable ICU 4.2
738
  */
739
U_CAPI void U_EXPORT2
740
uspoof_close(USpoofChecker *sc);
741

742
/**
743
 * Clone a Spoof Checker.  The clone will be set to perform the same checks
744
 *   as the original source.
745
 *
746
 * @param sc       The source USpoofChecker
747
 * @param status   The error code, set if this function encounters a problem.
748
 * @return         The newly created clone of the Spoof Checker.
749
 * @stable ICU 4.2
750
 */
751
U_CAPI USpoofChecker * U_EXPORT2
752
uspoof_clone(const USpoofChecker *sc, UErrorCode *status);
753

754

755
/**
756
 * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method
757
 * overwrites any checks that may have already been enabled. By default, all checks are enabled.
758
 *
759
 * To enable specific checks and disable all others,
760
 * OR together only the bit constants for the desired checks.
761
 * For example, to fail strings containing characters outside of
762
 * the set specified by {@link uspoof_setAllowedChars} and
763
 * also strings that contain digits from mixed numbering systems:
764
 *
765
 * <pre>
766
 * {@code
767
 * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status);
768
 * }
769
 * </pre>
770
 *
771
 * To disable specific checks and enable all others,
772
 * start with ALL_CHECKS and "AND away" the not-desired checks.
773
 * For example, if you are not planning to use the {@link uspoof_areConfusable} functionality,
774
 * it is good practice to disable the CONFUSABLE check:
775
 *
776
 * <pre>
777
 * {@code
778
 * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status);
779
 * }
780
 * </pre>
781
 *
782
 * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and
783
 * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
784
 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
785
 * methods.
786
 *
787
 * @param sc       The USpoofChecker
788
 * @param checks   The set of checks that this spoof checker will perform.
789
 *                 The value is a bit set, obtained by OR-ing together
790
 *                 values from enum USpoofChecks.
791
 * @param status   The error code, set if this function encounters a problem.
792
 * @stable ICU 4.2
793
 *
794
 */
795
U_CAPI void U_EXPORT2
796
uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);
797

798
/**
799
 * Get the set of checks that this Spoof Checker has been configured to perform.
800
 *
801
 * @param sc       The USpoofChecker
802
 * @param status   The error code, set if this function encounters a problem.
803
 * @return         The set of checks that this spoof checker will perform.
804
 *                 The value is a bit set, obtained by OR-ing together
805
 *                 values from enum USpoofChecks.
806
 * @stable ICU 4.2
807
 *
808
 */
809
U_CAPI int32_t U_EXPORT2
810
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
811

812
/**
813
 * Set the loosest restriction level allowed for strings. The default if this is not called is
814
 * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
815
 * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
816
 * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
817
 *
818
 * @param sc       The USpoofChecker
819
 * @param restrictionLevel The loosest restriction level allowed.
820
 * @see URestrictionLevel
821
 * @stable ICU 51
822
 */
823
U_CAPI void U_EXPORT2
824
uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
825

826

827
/**
828
  * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}.
829
  *
830
  * @param sc  The USpoofChecker
  * @return The restriction level
831
  * @see URestrictionLevel
832
  * @stable ICU 51
833
  */
834
U_CAPI URestrictionLevel U_EXPORT2
835
uspoof_getRestrictionLevel(const USpoofChecker *sc);
836

837
/**
838
 * Limit characters that are acceptable in identifiers being checked to those
839
 * normally used with the languages associated with the specified locales.
840
 * Any previously specified list of locales is replaced by the new settings.
841
 *
842
 * A set of languages is determined from the locale(s), and
843
 * from those a set of acceptable Unicode scripts is determined.
844
 * Characters from this set of scripts, along with characters from
845
 * the "common" and "inherited" Unicode Script categories
846
 * will be permitted.
847
 *
848
 * Supplying an empty string removes all restrictions;
849
 * characters from any script will be allowed.
850
 *
851
 * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this
852
 * USpoofChecker when calling this function with a non-empty list
853
 * of locales.
854
 *
855
 * The Unicode Set of characters that will be allowed is accessible
856
 * via the uspoof_getAllowedChars() function.  uspoof_setAllowedLocales()
857
 * will <i>replace</i> any previously applied set of allowed characters.
858
 *
859
 * Adjustments, such as additions or deletions of certain classes of characters,
860
 * can be made to the result of uspoof_setAllowedLocales() by
861
 * fetching the resulting set with uspoof_getAllowedChars(),
862
 * manipulating it with the Unicode Set API, then resetting the
863
 * spoof detector's limits with uspoof_setAllowedChars().
864
 *
865
 * @param sc           The USpoofChecker
866
 * @param localesList  A list of locales, from which the language
867
 *                     and associated script are extracted.  The locales
868
 *                     are comma-separated if there is more than one.
869
 *                     White space may not appear within an individual locale,
870
 *                     but is ignored otherwise.
871
 *                     The locales are syntactically like those from the
872
 *                     HTTP Accept-Language header.
873
 *                     If the localesList is empty, no restrictions will be placed on
874
 *                     the allowed characters.
875
 *
876
 * @param status       The error code, set if this function encounters a problem.
877
 * @stable ICU 4.2
878
 */
879
U_CAPI void U_EXPORT2
880
uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status);
881

882
/**
883
 * Get a list of locales for the scripts that are acceptable in strings
884
 *  to be checked.  If no limitations on scripts have been specified,
885
 *  an empty string will be returned.
886
 *
887
 *  uspoof_setAllowedChars() will reset the list of allowed locales to be empty.
888
 *
889
 *  The format of the returned list is the same as that supplied to
890
 *  uspoof_setAllowedLocales(), but the returned list may not be identical
891
 *  to the originally specified string; the string may be reformatted,
892
 *  and information other than languages from
893
 *  the originally specified locales may be omitted.
894
 *
895
 * @param sc           The USpoofChecker
896
 * @param status       The error code, set if this function encounters a problem.
897
 * @return             A string containing a list of locales corresponding
898
 *                     to the acceptable scripts, formatted like an
899
 *                     HTTP Accept Language value.
900
 *
901
 * @stable ICU 4.2
902
 */
903
U_CAPI const char * U_EXPORT2
904
uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status);
905

906

907
/**
908
 * Limit the acceptable characters to those specified by a Unicode Set.
909
 *   Any previously specified character limit is
910
 *   replaced by the new settings.  This includes limits on
911
 *   characters that were set with the uspoof_setAllowedLocales() function.
912
 *
913
 * The USPOOF_CHAR_LIMIT test is automatically enabled for this
914
 * USpoofChecker by this function.
915
 *
916
 * @param sc       The USpoofChecker
917
 * @param chars    A Unicode Set containing the list of
918
 *                 characters that are permitted.  Ownership of the set
919
 *                 remains with the caller.  The incoming set is cloned by
920
 *                 this function, so there are no restrictions on modifying
921
 *                 or deleting the USet after calling this function.
922
 * @param status   The error code, set if this function encounters a problem.
923
 * @stable ICU 4.2
924
 */
925
U_CAPI void U_EXPORT2
926
uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status);
927

928

929
/**
930
 * Get a USet for the characters permitted in an identifier.
931
 * This corresponds to the limits imposed by the Set Allowed Characters
932
 * functions. Limitations imposed by other checks will not be
933
 * reflected in the set returned by this function.
934
 *
935
 * The returned set will be frozen, meaning that it cannot be modified
936
 * by the caller.
937
 *
938
 * Ownership of the returned set remains with the Spoof Detector.  The
939
 * returned set will become invalid if the spoof detector is closed,
940
 * or if a new set of allowed characters is specified.
941
 *
942
 *
943
 * @param sc       The USpoofChecker
944
 * @param status   The error code, set if this function encounters a problem.
945
 * @return         A USet containing the characters that are permitted by
946
 *                 the USPOOF_CHAR_LIMIT test.
947
 * @stable ICU 4.2
948
 */
949
U_CAPI const USet * U_EXPORT2
950
uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
951

952

953
/**
954
 * Check the specified string for possible security issues.
955
 * The text to be checked will typically be an identifier of some sort.
956
 * The set of checks to be performed is specified with uspoof_setChecks().
957
 *
958
 * \note
959
 *   Consider using the newer API, {@link uspoof_check2}, instead.
960
 *   The newer API exposes additional information from the check procedure
961
 *   and is otherwise identical to this method.
962
 *
963
 * @param sc      The USpoofChecker
964
 * @param id      The identifier to be checked for possible security issues,
965
 *                in UTF-16 format.
966
 * @param length  the length of the string to be checked, expressed in
967
 *                16 bit UTF-16 code units, or -1 if the string is
968
 *                zero terminated.
969
 * @param position  Deprecated in ICU 51.  Always returns zero.
970
 *                Originally, an out parameter for the index of the first
971
 *                string position that failed a check.
972
 *                This parameter may be NULL.
973
 * @param status  The error code, set if an error occurred while attempting to
974
 *                perform the check.
975
 *                Spoofing or security issues detected with the input string are
976
 *                not reported here, but through the function's return value.
977
 * @return        An integer value with bits set for any potential security
978
 *                or spoofing issues detected.  The bits are defined by
979
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
980
 *                will be zero if the input string passes all of the
981
 *                enabled checks.
982
 * @see uspoof_check2
983
 * @stable ICU 4.2
984
 */
985
U_CAPI int32_t U_EXPORT2
986
uspoof_check(const USpoofChecker *sc,
987
                         const UChar *id, int32_t length,
988
                         int32_t *position,
989
                         UErrorCode *status);
990

991

992
/**
993
 * Check the specified string for possible security issues.
994
 * The text to be checked will typically be an identifier of some sort.
995
 * The set of checks to be performed is specified with uspoof_setChecks().
996
 *
997
 * \note
998
 *   Consider using the newer API, {@link uspoof_check2UTF8}, instead.
999
 *   The newer API exposes additional information from the check procedure
1000
 *   and is otherwise identical to this method.
1001
 *
1002
 * @param sc      The USpoofChecker
1003
 * @param id      An identifier to be checked for possible security issues, in UTF-8 format.
1004
 * @param length  the length of the string to be checked, or -1 if the string is
1005
 *                zero terminated.
1006
 * @param position  Deprecated in ICU 51.  Always returns zero.
1007
 *                Originally, an out parameter for the index of the first
1008
 *                string position that failed a check.
1009
 *                This parameter may be NULL.
1010
 * @param status  The error code, set if an error occurred while attempting to
1011
 *                perform the check.
1012
 *                Spoofing or security issues detected with the input string are
1013
 *                not reported here, but through the function's return value.
1014
 *                If the input contains invalid UTF-8 sequences,
1015
 *                a status of U_INVALID_CHAR_FOUND will be returned.
1016
 * @return        An integer value with bits set for any potential security
1017
 *                or spoofing issues detected.  The bits are defined by
1018
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1019
 *                will be zero if the input string passes all of the
1020
 *                enabled checks.
1021
 * @see uspoof_check2UTF8
1022
 * @stable ICU 4.2
1023
 */
1024
U_CAPI int32_t U_EXPORT2
1025
uspoof_checkUTF8(const USpoofChecker *sc,
1026
                 const char *id, int32_t length,
1027
                 int32_t *position,
1028
                 UErrorCode *status);
1029

1030

1031
/**
1032
 * Check the specified string for possible security issues.
1033
 * The text to be checked will typically be an identifier of some sort.
1034
 * The set of checks to be performed is specified with uspoof_setChecks().
1035
 *
1036
 * @param sc      The USpoofChecker
1037
 * @param id      The identifier to be checked for possible security issues,
1038
 *                in UTF-16 format.
1039
 * @param length  the length of the string to be checked, or -1 if the string is
1040
 *                zero terminated.
1041
 * @param checkResult  An instance of USpoofCheckResult to be filled with
1042
 *                details about the identifier.  Can be NULL.
1043
 * @param status  The error code, set if an error occurred while attempting to
1044
 *                perform the check.
1045
 *                Spoofing or security issues detected with the input string are
1046
 *                not reported here, but through the function's return value.
1047
 * @return        An integer value with bits set for any potential security
1048
 *                or spoofing issues detected.  The bits are defined by
1049
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1050
 *                will be zero if the input string passes all of the
1051
 *                enabled checks.  Any information in this bitmask will be
1052
 *                consistent with the information saved in the optional
1053
 *                checkResult parameter.
1054
 * @see uspoof_openCheckResult
1055
 * @see uspoof_check2UTF8
1056
 * @see uspoof_check2UnicodeString
1057
 * @stable ICU 58
1058
 */
1059
U_CAPI int32_t U_EXPORT2
1060
uspoof_check2(const USpoofChecker *sc,
1061
    const UChar* id, int32_t length,
1062
    USpoofCheckResult* checkResult,
1063
    UErrorCode *status);
1064

1065
/**
1066
 * Check the specified string for possible security issues.
1067
 * The text to be checked will typically be an identifier of some sort.
1068
 * The set of checks to be performed is specified with uspoof_setChecks().
1069
 *
1070
 * This version of {@link uspoof_check} accepts a USpoofCheckResult, which
1071
 * returns additional information about the identifier.  For more
1072
 * information, see {@link uspoof_openCheckResult}.
1073
 *
1074
 * @param sc      The USpoofChecker
1075
 * @param id      An identifier to be checked for possible security issues, in UTF-8 format.
1076
 * @param length  the length of the string to be checked, or -1 if the string is
1077
 *                zero terminated.
1078
 * @param checkResult  An instance of USpoofCheckResult to be filled with
1079
 *                details about the identifier.  Can be NULL.
1080
 * @param status  The error code, set if an error occurred while attempting to
1081
 *                perform the check.
1082
 *                Spoofing or security issues detected with the input string are
1083
 *                not reported here, but through the function's return value.
1084
 * @return        An integer value with bits set for any potential security
1085
 *                or spoofing issues detected.  The bits are defined by
1086
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1087
 *                will be zero if the input string passes all of the
1088
 *                enabled checks.  Any information in this bitmask will be
1089
 *                consistent with the information saved in the optional
1090
 *                checkResult parameter.
1091
 * @see uspoof_openCheckResult
1092
 * @see uspoof_check2
1093
 * @see uspoof_check2UnicodeString
1094
 * @stable ICU 58
1095
 */
1096
U_CAPI int32_t U_EXPORT2
1097
uspoof_check2UTF8(const USpoofChecker *sc,
1098
    const char *id, int32_t length,
1099
    USpoofCheckResult* checkResult,
1100
    UErrorCode *status);
1101

1102
/**
1103
 * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return
1104
 * information about the identifier.  Information includes:
1105
 * <ul>
1106
 *   <li>A bitmask of the checks that failed</li>
1107
 *   <li>The identifier's restriction level (UTS 39 section 5.2)</li>
1108
 *   <li>The set of numerics in the string (UTS 39 section 5.3)</li>
1109
 * </ul>
1110
 * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call
1111
 * of {@link uspoof_check2}.
1112
 *
1113
 * @param status  The error code, set if this function encounters a problem.
1114
 * @return        the newly created USpoofCheckResult
1115
 * @see uspoof_check2
1116
 * @see uspoof_check2UTF8
1117
 * @see uspoof_check2UnicodeString
1118
 * @stable ICU 58
1119
 */
1120
U_CAPI USpoofCheckResult* U_EXPORT2
1121
uspoof_openCheckResult(UErrorCode *status);
1122

1123
/**
1124
 * Close a USpoofCheckResult, freeing any memory that was being held by
1125
 *   its implementation.
1126
 *
1127
 * @param checkResult  The instance of USpoofCheckResult to close
1128
 * @stable ICU 58
1129
 */
1130
U_CAPI void U_EXPORT2
1131
uspoof_closeCheckResult(USpoofCheckResult *checkResult);
1132

1133
/**
1134
 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
1135
 * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on.
1136
 *
1137
 * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1138
 * @param status       The error code, set if an error occurred.
1139
 * @return        An integer value with bits set for any potential security
1140
 *                or spoofing issues detected.  The bits are defined by
1141
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1142
 *                will be zero if the input string passes all of the
1143
 *                enabled checks.
1144
 * @see uspoof_setChecks
1145
 * @stable ICU 58
1146
 */
1147
U_CAPI int32_t U_EXPORT2
1148
uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status);
1149

1150
/**
1151
 * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check
1152
 * was enabled; otherwise, undefined.
1153
 *
1154
 * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1155
 * @param status       The error code, set if an error occurred.
1156
 * @return             The restriction level contained in the USpoofCheckResult
1157
 * @see uspoof_setRestrictionLevel
1158
 * @stable ICU 58
1159
 */
1160
U_CAPI URestrictionLevel U_EXPORT2
1161
uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status);
1162

1163
/**
1164
 * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled;
1165
 * otherwise, undefined.  The set will contain the zero digit from each decimal number system found
1166
 * in the input string.  Ownership of the returned USet remains with the USpoofCheckResult.
1167
 * The USet will be freed when {@link uspoof_closeCheckResult} is called.
1168
 *
1169
 * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1170
 * @return             The set of numerics contained in the USpoofCheckResult
1171
 * @param status       The error code, set if an error occurred.
1172
 * @stable ICU 58
1173
 */
1174
U_CAPI const USet* U_EXPORT2
1175
uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status);
1176

1177

1178
/**
1179
 * Check whether two specified strings are visually confusable.
1180
 *
1181
 * If the strings are confusable, the return value will be nonzero, as long as
1182
 * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1183
 *
1184
 * The bits in the return value correspond to flags for each of the classes of
1185
 * confusables applicable to the two input strings.  According to UTS 39
1186
 * section 4, the possible flags are:
1187
 *
1188
 * <ul>
1189
 *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
1190
 *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
1191
 *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
1192
 * </ul>
1193
 *
1194
 * If one or more of the above flags were not listed in uspoof_setChecks(), this
1195
 * function will never report that class of confusable.  The check
1196
 * {@link USPOOF_CONFUSABLE} enables all three flags.
1197
 *
1198
 *
1199
 * @param sc      The USpoofChecker
1200
 * @param id1     The first of the two identifiers to be compared for
1201
 *                confusability.  The strings are in UTF-16 format.
1202
 * @param length1 the length of the first identifier, expressed in
1203
 *                16 bit UTF-16 code units, or -1 if the string is
1204
 *                nul terminated.
1205
 * @param id2     The second of the two identifiers to be compared for
1206
 *                confusability.  The identifiers are in UTF-16 format.
1207
 * @param length2 The length of the second identifier, expressed in
1208
 *                16 bit UTF-16 code units, or -1 if the string is
1209
 *                nul terminated.
1210
 * @param status  The error code, set if an error occurred while attempting to
1211
 *                perform the check.
1212
 *                Confusability of the identifiers is not reported here,
1213
 *                but through this function's return value.
1214
 * @return        An integer value with bit(s) set corresponding to
1215
 *                the type of confusability found, as defined by
1216
 *                enum USpoofChecks.  Zero is returned if the identifiers
1217
 *                are not confusable.
1218
 *
1219
 * @stable ICU 4.2
1220
 */
1221
U_CAPI int32_t U_EXPORT2
1222
uspoof_areConfusable(const USpoofChecker *sc,
1223
                     const UChar *id1, int32_t length1,
1224
                     const UChar *id2, int32_t length2,
1225
                     UErrorCode *status);
1226

1227
/**
1228
 * Check whether two specified strings are visually confusable when
1229
 * displayed in a context with the given paragraph direction.
1230
 *
1231
 * If the strings are confusable, the return value will be nonzero, as long as
1232
 * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1233
 *
1234
 * The bits in the return value correspond to flags for each of the classes of
1235
 * confusables applicable to the two input strings.  According to UTS 39
1236
 * section 4, the possible flags are:
1237
 *
1238
 * <ul>
1239
 *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
1240
 *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
1241
 *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
1242
 * </ul>
1243
 *
1244
 * If one or more of the above flags were not listed in uspoof_setChecks(), this
1245
 * function will never report that class of confusable.  The check
1246
 * {@link USPOOF_CONFUSABLE} enables all three flags.
1247
 *
1248
 *
1249
 * @param sc      The USpoofChecker
1250
 * @param direction The paragraph direction with which the identifiers are
1251
 *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1252
 * @param id1     The first of the two identifiers to be compared for
1253
 *                confusability.  The strings are in UTF-16 format.
1254
 * @param length1 the length of the first identifier, expressed in
1255
 *                16 bit UTF-16 code units, or -1 if the string is
1256
 *                nul terminated.
1257
 * @param id2     The second of the two identifiers to be compared for
1258
 *                confusability.  The identifiers are in UTF-16 format.
1259
 * @param length2 The length of the second identifier, expressed in
1260
 *                16 bit UTF-16 code units, or -1 if the string is
1261
 *                nul terminated.
1262
 * @param status  The error code, set if an error occurred while attempting to
1263
 *                perform the check.
1264
 *                Confusability of the identifiers is not reported here,
1265
 *                but through this function's return value.
1266
 * @return        An integer value with bit(s) set corresponding to
1267
 *                the type of confusability found, as defined by
1268
 *                enum USpoofChecks.  Zero is returned if the identifiers
1269
 *                are not confusable.
1270
 *
1271
 * @stable ICU 74
1272
 */
1273
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
1274
                                                  const UChar *id1, int32_t length1,
1275
                                                  const UChar *id2, int32_t length2,
1276
                                                  UErrorCode *status);
1277

1278
/**
1279
 * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
1280
 *
1281
 * @param sc      The USpoofChecker
1282
 * @param id1     The first of the two identifiers to be compared for
1283
 *                confusability.  The strings are in UTF-8 format.
1284
 * @param length1 the length of the first identifier, in bytes, or -1
1285
 *                if the string is nul terminated.
1286
 * @param id2     The second of the two identifiers to be compared for
1287
 *                confusability.  The strings are in UTF-8 format.
1288
 * @param length2 The length of the second string in bytes, or -1
1289
 *                if the string is nul terminated.
1290
 * @param status  The error code, set if an error occurred while attempting to
1291
 *                perform the check.
1292
 *                Confusability of the strings is not reported here,
1293
 *                but through this function's return value.
1294
 * @return        An integer value with bit(s) set corresponding to
1295
 *                the type of confusability found, as defined by
1296
 *                enum USpoofChecks.  Zero is returned if the strings
1297
 *                are not confusable.
1298
 *
1299
 * @stable ICU 4.2
1300
 *
1301
 * @see uspoof_areConfusable
1302
 */
1303
U_CAPI int32_t U_EXPORT2
1304
uspoof_areConfusableUTF8(const USpoofChecker *sc,
1305
                         const char *id1, int32_t length1,
1306
                         const char *id2, int32_t length2,
1307
                         UErrorCode *status);
1308

1309
/**
1310
 * A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
1311
 *
1312
 * @param sc      The USpoofChecker
1313
 * @param direction The paragraph direction with which the identifiers are
1314
 *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1315
 * @param id1     The first of the two identifiers to be compared for
1316
 *                confusability.  The strings are in UTF-8 format.
1317
 * @param length1 The length of the first identifier, in bytes, or -1
1318
 *                if the string is nul terminated.
1319
 * @param id2     The second of the two identifiers to be compared for
1320
 *                confusability.  The strings are in UTF-8 format.
1321
 * @param length2 The length of the second string in bytes, or -1
1322
 *                if the string is nul terminated.
1323
 * @param status  The error code, set if an error occurred while attempting to
1324
 *                perform the check.
1325
 *                Confusability of the strings is not reported here,
1326
 *                but through this function's return value.
1327
 * @return        An integer value with bit(s) set corresponding to
1328
 *                the type of confusability found, as defined by
1329
 *                enum USpoofChecks.  Zero is returned if the strings
1330
 *                are not confusable.
1331
 *
1332
 * @stable ICU 74
1333
 *
1334
 * @see uspoof_areBidiConfusable
1335
 */
1336
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
1337
                                                      const char *id1, int32_t length1,
1338
                                                      const char *id2, int32_t length2,
1339
                                                      UErrorCode *status);
1340

1341
/**
1342
 *  Get the "skeleton" for an identifier.
1343
 *  Skeletons are a transformation of the input identifier;
1344
 *  Two identifiers are confusable if their skeletons are identical.
1345
 *  See Unicode Technical Standard #39 for additional information.
1346
 *
1347
 *  Using skeletons directly makes it possible to quickly check
1348
 *  whether an identifier is confusable with any of some large
1349
 *  set of existing identifiers, by creating an efficiently
1350
 *  searchable collection of the skeletons.
1351
 *
1352
 * @param sc      The USpoofChecker
1353
 * @param type    Deprecated in ICU 58.  You may pass any number.
1354
 *                Originally, controlled which of the Unicode confusable data
1355
 *                tables to use.
1356
 * @param id      The input identifier whose skeleton will be computed.
1357
 * @param length  The length of the input identifier, expressed in 16 bit
1358
 *                UTF-16 code units, or -1 if the string is zero terminated.
1359
 * @param dest    The output buffer, to receive the skeleton string.
1360
 * @param destCapacity  The length of the output buffer, in 16 bit units.
1361
 *                The destCapacity may be zero, in which case the function will
1362
 *                return the actual length of the skeleton.
1363
 * @param status  The error code, set if an error occurred while attempting to
1364
 *                perform the check.
1365
 * @return        The length of the skeleton string.  The returned length
1366
 *                is always that of the complete skeleton, even when the
1367
 *                supplied buffer is too small (or of zero length)
1368
 *
1369
 * @stable ICU 4.2
1370
 * @see uspoof_areConfusable
1371
 */
1372
U_CAPI int32_t U_EXPORT2
1373
uspoof_getSkeleton(const USpoofChecker *sc,
1374
                   uint32_t type,
1375
                   const UChar *id,  int32_t length,
1376
                   UChar *dest, int32_t destCapacity,
1377
                   UErrorCode *status);
1378

1379
/**
1380
 *  Get the "bidiSkeleton" for an identifier and a direction.
1381
 *  Skeletons are a transformation of the input identifier;
1382
 *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1383
 *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1384
 *  See Unicode Technical Standard #39 for additional information:
1385
 *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1386
 *
1387
 *  Using skeletons directly makes it possible to quickly check
1388
 *  whether an identifier is confusable with any of some large
1389
 *  set of existing identifiers, by creating an efficiently
1390
 *  searchable collection of the skeletons.
1391
 *
1392
 * @param sc      The USpoofChecker.
1393
 * @param direction The context direction with which the identifier will be
1394
 *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1395
 * @param id      The input identifier whose skeleton will be computed.
1396
 * @param length  The length of the input identifier, expressed in 16 bit
1397
 *                UTF-16 code units, or -1 if the string is zero terminated.
1398
 * @param dest    The output buffer, to receive the skeleton string.
1399
 * @param destCapacity  The length of the output buffer, in 16 bit units.
1400
 *                The destCapacity may be zero, in which case the function will
1401
 *                return the actual length of the skeleton.
1402
 * @param status  The error code, set if an error occurred while attempting to
1403
 *                perform the check.
1404
 * @return        The length of the skeleton string.  The returned length
1405
 *                is always that of the complete skeleton, even when the
1406
 *                supplied buffer is too small (or of zero length)
1407
 *
1408
 * @stable ICU 74
1409
 * @see uspoof_areBidiConfusable
1410
 */
1411
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
1412
                                                UBiDiDirection direction,
1413
                                                const UChar *id, int32_t length,
1414
                                                UChar *dest, int32_t destCapacity, UErrorCode *status);
1415

1416
/**
1417
 *  Get the "skeleton" for an identifier.
1418
 *  Skeletons are a transformation of the input identifier;
1419
 *  Two identifiers are confusable if their skeletons are identical.
1420
 *  See Unicode Technical Standard #39 for additional information.
1421
 *
1422
 *  Using skeletons directly makes it possible to quickly check
1423
 *  whether an identifier is confusable with any of some large
1424
 *  set of existing identifiers, by creating an efficiently
1425
 *  searchable collection of the skeletons.
1426
 *
1427
 * @param sc      The USpoofChecker
1428
 * @param type    Deprecated in ICU 58.  You may pass any number.
1429
 *                Originally, controlled which of the Unicode confusable data
1430
 *                tables to use.
1431
 * @param id      The UTF-8 format identifier whose skeleton will be computed.
1432
 * @param length  The length of the input string, in bytes,
1433
 *                or -1 if the string is zero terminated.
1434
 * @param dest    The output buffer, to receive the skeleton string.
1435
 * @param destCapacity  The length of the output buffer, in bytes.
1436
 *                The destCapacity may be zero, in which case the function will
1437
 *                return the actual length of the skeleton.
1438
 * @param status  The error code, set if an error occurred while attempting to
1439
 *                perform the check.  Possible Errors include U_INVALID_CHAR_FOUND
1440
 *                   for invalid UTF-8 sequences, and
1441
 *                   U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
1442
 *                   to hold the complete skeleton.
1443
 * @return        The length of the skeleton string, in bytes.  The returned length
1444
 *                is always that of the complete skeleton, even when the
1445
 *                supplied buffer is too small (or of zero length)
1446
 *
1447
 * @stable ICU 4.2
1448
 */
1449
U_CAPI int32_t U_EXPORT2
1450
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
1451
                       uint32_t type,
1452
                       const char *id,  int32_t length,
1453
                       char *dest, int32_t destCapacity,
1454
                       UErrorCode *status);
1455

1456
/**
1457
 *  Get the "bidiSkeleton" for an identifier and a direction.
1458
 *  Skeletons are a transformation of the input identifier;
1459
 *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1460
 *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1461
 *  See Unicode Technical Standard #39 for additional information:
1462
 *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1463
 *
1464
 *  Using skeletons directly makes it possible to quickly check
1465
 *  whether an identifier is confusable with any of some large
1466
 *  set of existing identifiers, by creating an efficiently
1467
 *  searchable collection of the skeletons.
1468
 *
1469
 * @param sc      The USpoofChecker
1470
 * @param direction The context direction with which the identifier will be
1471
 *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1472
 * @param id      The UTF-8 format identifier whose skeleton will be computed.
1473
 * @param length  The length of the input string, in bytes,
1474
 *                or -1 if the string is zero terminated.
1475
 * @param dest    The output buffer, to receive the skeleton string.
1476
 * @param destCapacity  The length of the output buffer, in bytes.
1477
 *                The destCapacity may be zero, in which case the function will
1478
 *                return the actual length of the skeleton.
1479
 * @param status  The error code, set if an error occurred while attempting to
1480
 *                perform the check.  Possible Errors include U_INVALID_CHAR_FOUND
1481
 *                for invalid UTF-8 sequences, and
1482
 *                U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
1483
 *                to hold the complete skeleton.
1484
 * @return        The length of the skeleton string, in bytes.  The returned length
1485
 *                is always that of the complete skeleton, even when the
1486
 *                supplied buffer is too small (or of zero length)
1487
 *
1488
 * @stable ICU 74
1489
 */
1490
U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
1491
                                                    const char *id, int32_t length, char *dest,
1492
                                                    int32_t destCapacity, UErrorCode *status);
1493

1494
/**
1495
  * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
1496
  * in http://unicode.org/Public/security/latest/xidmodifications.txt
1497
  * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1498
  *
1499
  * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1500
  * be deleted by the caller.
1501
  *
1502
  * @param status The error code, set if a problem occurs while creating the set.
1503
  *
1504
  * @stable ICU 51
1505
  */
1506
U_CAPI const USet * U_EXPORT2
1507
uspoof_getInclusionSet(UErrorCode *status);
1508

1509
/**
1510
  * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
1511
  * in http://unicode.org/Public/security/latest/xidmodifications.txt
1512
  * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1513
  *
1514
  * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1515
  * be deleted by the caller.
1516
  *
1517
  * @param status The error code, set if a problem occurs while creating the set.
1518
  *
1519
  * @stable ICU 51
1520
  */
1521
U_CAPI const USet * U_EXPORT2
1522
uspoof_getRecommendedSet(UErrorCode *status);
1523

1524
/**
1525
 * Serialize the data for a spoof detector into a chunk of memory.
1526
 * The flattened spoof detection tables can later be used to efficiently
1527
 * instantiate a new Spoof Detector.
1528
 *
1529
 * The serialized spoof checker includes only the data compiled from the
1530
 * Unicode data tables by uspoof_openFromSource(); it does not include
1531
 * any other state or configuration that may have been set.
1532
 *
1533
 * @param sc   the Spoof Detector whose data is to be serialized.
1534
 * @param data a pointer to 32-bit-aligned memory to be filled with the data,
1535
 *             can be NULL if capacity==0
1536
 * @param capacity the number of bytes available at data,
1537
 *                 or 0 for preflighting
1538
 * @param status an in/out ICU UErrorCode; possible errors include:
1539
 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
1540
 * - U_ILLEGAL_ARGUMENT_ERROR  the data or capacity parameters are bad
1541
 * @return the number of bytes written or needed for the spoof data
1542
 *
1543
 * @see uspoof_openFromSerialized()
1544
 * @stable ICU 4.2
1545
 */
1546
U_CAPI int32_t U_EXPORT2
1547
uspoof_serialize(USpoofChecker *sc,
1548
                 void *data, int32_t capacity,
1549
                 UErrorCode *status);
1550

1551
U_CDECL_END
1552

1553
#if U_SHOW_CPLUSPLUS_API
1554

1555
U_NAMESPACE_BEGIN
1556

1557
/**
1558
 * \class LocalUSpoofCheckerPointer
1559
 * "Smart pointer" class, closes a USpoofChecker via uspoof_close().
1560
 * For most methods see the LocalPointerBase base class.
1561
 *
1562
 * @see LocalPointerBase
1563
 * @see LocalPointer
1564
 * @stable ICU 4.4
1565
 */
1566
/**
1567
 * \cond
1568
 * Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER.
1569
 *       For now, suppress with a Doxygen cond
1570
 */
1571
U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close);
1572
/** \endcond */
1573

1574
/**
1575
 * \class LocalUSpoofCheckResultPointer
1576
 * "Smart pointer" class, closes a USpoofCheckResult via `uspoof_closeCheckResult()`.
1577
 * For most methods see the LocalPointerBase base class.
1578
 *
1579
 * @see LocalPointerBase
1580
 * @see LocalPointer
1581
 * @stable ICU 58
1582
 */
1583

1584
/**
1585
 * \cond
1586
 * Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER.
1587
 *       For now, suppress with a Doxygen cond
1588
 */
1589
U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult);
1590
/** \endcond */
1591

1592
U_NAMESPACE_END
1593

1594
/**
1595
 * Limit the acceptable characters to those specified by a Unicode Set.
1596
 *   Any previously specified character limit is
1597
 *   replaced by the new settings.    This includes limits on
1598
 *   characters that were set with the uspoof_setAllowedLocales() function.
1599
 *
1600
 * The USPOOF_CHAR_LIMIT test is automatically enabled for this
1601
 * USpoofChecker by this function.
1602
 *
1603
 * @param sc       The USpoofChecker
1604
 * @param chars    A Unicode Set containing the list of
1605
 *                 characters that are permitted.  Ownership of the set
1606
 *                 remains with the caller.  The incoming set is cloned by
1607
 *                 this function, so there are no restrictions on modifying
1608
 *                 or deleting the UnicodeSet after calling this function.
1609
 * @param status   The error code, set if this function encounters a problem.
1610
 * @stable ICU 4.2
1611
 */
1612
U_CAPI void U_EXPORT2
1613
uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status);
1614

1615

1616
/**
1617
 * Get a UnicodeSet for the characters permitted in an identifier.
1618
 * This corresponds to the limits imposed by the Set Allowed Characters /
1619
 * UnicodeSet functions. Limitations imposed by other checks will not be
1620
 * reflected in the set returned by this function.
1621
 *
1622
 * The returned set will be frozen, meaning that it cannot be modified
1623
 * by the caller.
1624
 *
1625
 * Ownership of the returned set remains with the Spoof Detector.  The
1626
 * returned set will become invalid if the spoof detector is closed,
1627
 * or if a new set of allowed characters is specified.
1628
 *
1629
 *
1630
 * @param sc       The USpoofChecker
1631
 * @param status   The error code, set if this function encounters a problem.
1632
 * @return         A UnicodeSet containing the characters that are permitted by
1633
 *                 the USPOOF_CHAR_LIMIT test.
1634
 * @stable ICU 4.2
1635
 */
1636
U_CAPI const icu::UnicodeSet * U_EXPORT2
1637
uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
1638

1639
/**
1640
 * Check the specified string for possible security issues.
1641
 * The text to be checked will typically be an identifier of some sort.
1642
 * The set of checks to be performed is specified with uspoof_setChecks().
1643
 *
1644
 * \note
1645
 *   Consider using the newer API, {@link uspoof_check2UnicodeString}, instead.
1646
 *   The newer API exposes additional information from the check procedure
1647
 *   and is otherwise identical to this method.
1648
 *
1649
 * @param sc      The USpoofChecker
1650
 * @param id      An identifier to be checked for possible security issues.
1651
 * @param position  Deprecated in ICU 51.  Always returns zero.
1652
 *                Originally, an out parameter for the index of the first
1653
 *                string position that failed a check.
1654
 *                This parameter may be nullptr.
1655
 * @param status  The error code, set if an error occurred while attempting to
1656
 *                perform the check.
1657
 *                Spoofing or security issues detected with the input string are
1658
 *                not reported here, but through the function's return value.
1659
 * @return        An integer value with bits set for any potential security
1660
 *                or spoofing issues detected.  The bits are defined by
1661
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1662
 *                will be zero if the input string passes all of the
1663
 *                enabled checks.
1664
 * @see uspoof_check2UnicodeString
1665
 * @stable ICU 4.2
1666
 */
1667
U_CAPI int32_t U_EXPORT2
1668
uspoof_checkUnicodeString(const USpoofChecker *sc,
1669
                          const icu::UnicodeString &id,
1670
                          int32_t *position,
1671
                          UErrorCode *status);
1672

1673
/**
1674
 * Check the specified string for possible security issues.
1675
 * The text to be checked will typically be an identifier of some sort.
1676
 * The set of checks to be performed is specified with uspoof_setChecks().
1677
 *
1678
 * @param sc      The USpoofChecker
1679
 * @param id      An identifier to be checked for possible security issues.
1680
 * @param checkResult  An instance of USpoofCheckResult to be filled with
1681
 *                details about the identifier.  Can be nullptr.
1682
 * @param status  The error code, set if an error occurred while attempting to
1683
 *                perform the check.
1684
 *                Spoofing or security issues detected with the input string are
1685
 *                not reported here, but through the function's return value.
1686
 * @return        An integer value with bits set for any potential security
1687
 *                or spoofing issues detected.  The bits are defined by
1688
 *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1689
 *                will be zero if the input string passes all of the
1690
 *                enabled checks.  Any information in this bitmask will be
1691
 *                consistent with the information saved in the optional
1692
 *                checkResult parameter.
1693
 * @see uspoof_openCheckResult
1694
 * @see uspoof_check2
1695
 * @see uspoof_check2UTF8
1696
 * @stable ICU 58
1697
 */
1698
U_CAPI int32_t U_EXPORT2
1699
uspoof_check2UnicodeString(const USpoofChecker *sc,
1700
    const icu::UnicodeString &id,
1701
    USpoofCheckResult* checkResult,
1702
    UErrorCode *status);
1703

1704
/**
1705
 * A version of {@link uspoof_areConfusable} accepting UnicodeStrings.
1706
 *
1707
 * @param sc      The USpoofChecker
1708
 * @param s1     The first of the two identifiers to be compared for
1709
 *                confusability.  The string is a UnicodeString (UTF-16).
1710
 * @param s2     The second of the two identifiers to be compared for
1711
 *                confusability.  The string is a UnicodeString (UTF-16).
1712
 * @param status  The error code, set if an error occurred while attempting to
1713
 *                perform the check.
1714
 *                Confusability of the identifiers is not reported here,
1715
 *                but through this function's return value.
1716
 * @return        An integer value with bit(s) set corresponding to
1717
 *                the type of confusability found, as defined by
1718
 *                enum USpoofChecks.  Zero is returned if the identifiers
1719
 *                are not confusable.
1720
 *
1721
 * @stable ICU 4.2
1722
 *
1723
 * @see uspoof_areConfusable
1724
 */
1725
U_CAPI int32_t U_EXPORT2
1726
uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
1727
                                  const icu::UnicodeString &s1,
1728
                                  const icu::UnicodeString &s2,
1729
                                  UErrorCode *status);
1730

1731
/**
1732
 * A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
1733
 *
1734
 * @param sc      The USpoofChecker
1735
 * @param direction The paragraph direction with which the identifiers are
1736
 *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1737
 * @param s1     The first of the two identifiers to be compared for
1738
 *                confusability.  The string is a UnicodeString (UTF-16).
1739
 * @param s2     The second of the two identifiers to be compared for
1740
 *                confusability.  The string is a UnicodeString (UTF-16).
1741
 * @param status  The error code, set if an error occurred while attempting to
1742
 *                perform the check.
1743
 *                Confusability of the identifiers is not reported here,
1744
 *                but through this function's return value.
1745
 * @return        An integer value with bit(s) set corresponding to
1746
 *                the type of confusability found, as defined by
1747
 *                enum USpoofChecks.  Zero is returned if the identifiers
1748
 *                are not confusable.
1749
 *
1750
 * @stable ICU 74
1751
 *
1752
 * @see uspoof_areBidiConfusable
1753
 */
1754
U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
1755
                                                               UBiDiDirection direction,
1756
                                                               const icu::UnicodeString &s1,
1757
                                                               const icu::UnicodeString &s2,
1758
                                                               UErrorCode *status);
1759

1760
/**
1761
 *  Get the "skeleton" for an identifier.
1762
 *  Skeletons are a transformation of the input identifier;
1763
 *  Two identifiers are confusable if their skeletons are identical.
1764
 *  See Unicode Technical Standard #39 for additional information.
1765
 *
1766
 *  Using skeletons directly makes it possible to quickly check
1767
 *  whether an identifier is confusable with any of some large
1768
 *  set of existing identifiers, by creating an efficiently
1769
 *  searchable collection of the skeletons.
1770
 *
1771
 * @param sc      The USpoofChecker.
1772
 * @param type    Deprecated in ICU 58.  You may pass any number.
1773
 *                Originally, controlled which of the Unicode confusable data
1774
 *                tables to use.
1775
 * @param id      The input identifier whose skeleton will be computed.
1776
 * @param dest    The output identifier, to receive the skeleton string.
1777
 * @param status  The error code, set if an error occurred while attempting to
1778
 *                perform the check.
1779
 * @return        A reference to the destination (skeleton) string.
1780
 *
1781
 * @stable ICU 4.2
1782
 */
1783
U_I18N_API icu::UnicodeString & U_EXPORT2
1784
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
1785
                                uint32_t type,
1786
                                const icu::UnicodeString &id,
1787
                                icu::UnicodeString &dest,
1788
                                UErrorCode *status);
1789

1790
/**
1791
 *  Get the "bidiSkeleton" for an identifier and a direction.
1792
 *  Skeletons are a transformation of the input identifier;
1793
 *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1794
 *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1795
 *  See Unicode Technical Standard #39 for additional information:
1796
 *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1797
 *
1798
 *  Using skeletons directly makes it possible to quickly check
1799
 *  whether an identifier is confusable with any of some large
1800
 *  set of existing identifiers, by creating an efficiently
1801
 *  searchable collection of the skeletons.
1802
 *
1803
 * @param sc      The USpoofChecker.
1804
 * @param direction The context direction with which the identifier will be
1805
 *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1806
 * @param id      The input identifier whose bidiSkeleton will be computed.
1807
 * @param dest    The output identifier, to receive the skeleton string.
1808
 * @param status  The error code, set if an error occurred while attempting to
1809
 *                perform the check.
1810
 * @return        A reference to the destination (skeleton) string.
1811
 *
1812
 * @stable ICU 74
1813
 */
1814
U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
1815
    const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
1816
    icu::UnicodeString &dest, UErrorCode *status);
1817

1818
/**
1819
  * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
1820
  * in http://unicode.org/Public/security/latest/xidmodifications.txt
1821
  * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1822
  *
1823
  * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1824
  * be deleted by the caller.
1825
  *
1826
  * @param status The error code, set if a problem occurs while creating the set.
1827
  *
1828
  * @stable ICU 51
1829
  */
1830
U_CAPI const icu::UnicodeSet * U_EXPORT2
1831
uspoof_getInclusionUnicodeSet(UErrorCode *status);
1832

1833
/**
1834
  * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
1835
  * in http://unicode.org/Public/security/latest/xidmodifications.txt
1836
  * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1837
  *
1838
  * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1839
  * be deleted by the caller.
1840
  *
1841
  * @param status The error code, set if a problem occurs while creating the set.
1842
  *
1843
  * @stable ICU 51
1844
  */
1845
U_CAPI const icu::UnicodeSet * U_EXPORT2
1846
uspoof_getRecommendedUnicodeSet(UErrorCode *status);
1847

1848
#endif /* U_SHOW_CPLUSPLUS_API */
1849

1850
#endif /* UCONFIG_NO_NORMALIZATION */
1851

1852
#endif   /* USPOOF_H */
1853

1854
Product

Resources

Company