CoCalc -- normalizer2.h

GitHub Repository: PojavLauncherTeam/openjdk-aarch32-jdk8u
Path: blob/jdk8u272-b10-aarch32-20201026/jdk/src/share/native/common/unicode/normalizer2.h
⁴⁸⁷⁷³ views
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2009-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  normalizer2.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/

19
#ifndef __NORMALIZER2_H__
20
#define __NORMALIZER2_H__
21

22
/**
23
 * \file
24
 * \brief C++ API: New API for Unicode Normalization.
25
 */
26

27
#include "unicode/utypes.h"
28

29
#if !UCONFIG_NO_NORMALIZATION
30

31
#include "unicode/stringpiece.h"
32
#include "unicode/uniset.h"
33
#include "unicode/unistr.h"
34
#include "unicode/unorm2.h"
35

36
U_NAMESPACE_BEGIN
37

38
class ByteSink;
39

40
/**
41
 * Unicode normalization functionality for standard Unicode normalization or
42
 * for using custom mapping tables.
43
 * All instances of this class are unmodifiable/immutable.
44
 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
45
 * The Normalizer2 class is not intended for public subclassing.
46
 *
47
 * The primary functions are to produce a normalized string and to detect whether
48
 * a string is already normalized.
49
 * The most commonly used normalization forms are those defined in
50
 * http://www.unicode.org/unicode/reports/tr15/
51
 * However, this API supports additional normalization forms for specialized purposes.
52
 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
53
 * and can be used in implementations of UTS #46.
54
 *
55
 * Not only are the standard compose and decompose modes supplied,
56
 * but additional modes are provided as documented in the Mode enum.
57
 *
58
 * Some of the functions in this class identify normalization boundaries.
59
 * At a normalization boundary, the portions of the string
60
 * before it and starting from it do not interact and can be handled independently.
61
 *
62
 * The spanQuickCheckYes() stops at a normalization boundary.
63
 * When the goal is a normalized string, then the text before the boundary
64
 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
65
 *
66
 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
67
 * a character is guaranteed to be at a normalization boundary,
68
 * regardless of context.
69
 * This is used for moving from one normalization boundary to the next
70
 * or preceding boundary, and for performing iterative normalization.
71
 *
72
 * Iterative normalization is useful when only a small portion of a
73
 * longer string needs to be processed.
74
 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
75
 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
76
 * (to process only the substring for which sort key bytes are computed).
77
 *
78
 * The set of normalization boundaries returned by these functions may not be
79
 * complete: There may be more boundaries that could be returned.
80
 * Different functions may return different boundaries.
81
 * @stable ICU 4.4
82
 */
83
class U_COMMON_API Normalizer2 : public UObject {
84
public:
85
    /**
86
     * Destructor.
87
     * @stable ICU 4.4
88
     */
89
    ~Normalizer2();
90

91
    /**
92
     * Returns a Normalizer2 instance for Unicode NFC normalization.
93
     * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
94
     * Returns an unmodifiable singleton instance. Do not delete it.
95
     * @param errorCode Standard ICU error code. Its input value must
96
     *                  pass the U_SUCCESS() test, or else the function returns
97
     *                  immediately. Check for U_FAILURE() on output or use with
98
     *                  function chaining. (See User Guide for details.)
99
     * @return the requested Normalizer2, if successful
100
     * @stable ICU 49
101
     */
102
    static const Normalizer2 *
103
    getNFCInstance(UErrorCode &errorCode);
104

105
    /**
106
     * Returns a Normalizer2 instance for Unicode NFD normalization.
107
     * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
108
     * Returns an unmodifiable singleton instance. Do not delete it.
109
     * @param errorCode Standard ICU error code. Its input value must
110
     *                  pass the U_SUCCESS() test, or else the function returns
111
     *                  immediately. Check for U_FAILURE() on output or use with
112
     *                  function chaining. (See User Guide for details.)
113
     * @return the requested Normalizer2, if successful
114
     * @stable ICU 49
115
     */
116
    static const Normalizer2 *
117
    getNFDInstance(UErrorCode &errorCode);
118

119
    /**
120
     * Returns a Normalizer2 instance for Unicode NFKC normalization.
121
     * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
122
     * Returns an unmodifiable singleton instance. Do not delete it.
123
     * @param errorCode Standard ICU error code. Its input value must
124
     *                  pass the U_SUCCESS() test, or else the function returns
125
     *                  immediately. Check for U_FAILURE() on output or use with
126
     *                  function chaining. (See User Guide for details.)
127
     * @return the requested Normalizer2, if successful
128
     * @stable ICU 49
129
     */
130
    static const Normalizer2 *
131
    getNFKCInstance(UErrorCode &errorCode);
132

133
    /**
134
     * Returns a Normalizer2 instance for Unicode NFKD normalization.
135
     * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
136
     * Returns an unmodifiable singleton instance. Do not delete it.
137
     * @param errorCode Standard ICU error code. Its input value must
138
     *                  pass the U_SUCCESS() test, or else the function returns
139
     *                  immediately. Check for U_FAILURE() on output or use with
140
     *                  function chaining. (See User Guide for details.)
141
     * @return the requested Normalizer2, if successful
142
     * @stable ICU 49
143
     */
144
    static const Normalizer2 *
145
    getNFKDInstance(UErrorCode &errorCode);
146

147
    /**
148
     * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
149
     * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
150
     * Returns an unmodifiable singleton instance. Do not delete it.
151
     * @param errorCode Standard ICU error code. Its input value must
152
     *                  pass the U_SUCCESS() test, or else the function returns
153
     *                  immediately. Check for U_FAILURE() on output or use with
154
     *                  function chaining. (See User Guide for details.)
155
     * @return the requested Normalizer2, if successful
156
     * @stable ICU 49
157
     */
158
    static const Normalizer2 *
159
    getNFKCCasefoldInstance(UErrorCode &errorCode);
160

161
    /**
162
     * Returns a Normalizer2 instance which uses the specified data file
163
     * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
164
     * and which composes or decomposes text according to the specified mode.
165
     * Returns an unmodifiable singleton instance. Do not delete it.
166
     *
167
     * Use packageName=NULL for data files that are part of ICU's own data.
168
     * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
169
     * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
170
     * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
171
     *
172
     * @param packageName NULL for ICU built-in data, otherwise application data package name
173
     * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
174
     * @param mode normalization mode (compose or decompose etc.)
175
     * @param errorCode Standard ICU error code. Its input value must
176
     *                  pass the U_SUCCESS() test, or else the function returns
177
     *                  immediately. Check for U_FAILURE() on output or use with
178
     *                  function chaining. (See User Guide for details.)
179
     * @return the requested Normalizer2, if successful
180
     * @stable ICU 4.4
181
     */
182
    static const Normalizer2 *
183
    getInstance(const char *packageName,
184
                const char *name,
185
                UNormalization2Mode mode,
186
                UErrorCode &errorCode);
187

188
    /**
189
     * Returns the normalized form of the source string.
190
     * @param src source string
191
     * @param errorCode Standard ICU error code. Its input value must
192
     *                  pass the U_SUCCESS() test, or else the function returns
193
     *                  immediately. Check for U_FAILURE() on output or use with
194
     *                  function chaining. (See User Guide for details.)
195
     * @return normalized src
196
     * @stable ICU 4.4
197
     */
198
    UnicodeString
199
    normalize(const UnicodeString &src, UErrorCode &errorCode) const {
200
        UnicodeString result;
201
        normalize(src, result, errorCode);
202
        return result;
203
    }
204
    /**
205
     * Writes the normalized form of the source string to the destination string
206
     * (replacing its contents) and returns the destination string.
207
     * The source and destination strings must be different objects.
208
     * @param src source string
209
     * @param dest destination string; its contents is replaced with normalized src
210
     * @param errorCode Standard ICU error code. Its input value must
211
     *                  pass the U_SUCCESS() test, or else the function returns
212
     *                  immediately. Check for U_FAILURE() on output or use with
213
     *                  function chaining. (See User Guide for details.)
214
     * @return dest
215
     * @stable ICU 4.4
216
     */
217
    virtual UnicodeString &
218
    normalize(const UnicodeString &src,
219
              UnicodeString &dest,
220
              UErrorCode &errorCode) const = 0;
221

222
    /**
223
     * Normalizes a UTF-8 string and optionally records how source substrings
224
     * relate to changed and unchanged result substrings.
225
     *
226
     * Currently implemented completely only for "compose" modes,
227
     * such as for NFC, NFKC, and NFKC_Casefold
228
     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
229
     * Otherwise currently converts to & from UTF-16 and does not support edits.
230
     *
231
     * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
232
     * @param src       Source UTF-8 string.
233
     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
234
     *                  sink.Flush() is called at the end.
235
     * @param edits     Records edits for index mapping, working with styled text,
236
     *                  and getting only changes (if any).
237
     *                  The Edits contents is undefined if any error occurs.
238
     *                  This function calls edits->reset() first unless
239
     *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
240
     * @param errorCode Standard ICU error code. Its input value must
241
     *                  pass the U_SUCCESS() test, or else the function returns
242
     *                  immediately. Check for U_FAILURE() on output or use with
243
     *                  function chaining. (See User Guide for details.)
244
     * @stable ICU 60
245
     */
246
    virtual void
247
    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
248
                  Edits *edits, UErrorCode &errorCode) const;
249

250
    /**
251
     * Appends the normalized form of the second string to the first string
252
     * (merging them at the boundary) and returns the first string.
253
     * The result is normalized if the first string was normalized.
254
     * The first and second strings must be different objects.
255
     * @param first string, should be normalized
256
     * @param second string, will be normalized
257
     * @param errorCode Standard ICU error code. Its input value must
258
     *                  pass the U_SUCCESS() test, or else the function returns
259
     *                  immediately. Check for U_FAILURE() on output or use with
260
     *                  function chaining. (See User Guide for details.)
261
     * @return first
262
     * @stable ICU 4.4
263
     */
264
    virtual UnicodeString &
265
    normalizeSecondAndAppend(UnicodeString &first,
266
                             const UnicodeString &second,
267
                             UErrorCode &errorCode) const = 0;
268
    /**
269
     * Appends the second string to the first string
270
     * (merging them at the boundary) and returns the first string.
271
     * The result is normalized if both the strings were normalized.
272
     * The first and second strings must be different objects.
273
     * @param first string, should be normalized
274
     * @param second string, should be normalized
275
     * @param errorCode Standard ICU error code. Its input value must
276
     *                  pass the U_SUCCESS() test, or else the function returns
277
     *                  immediately. Check for U_FAILURE() on output or use with
278
     *                  function chaining. (See User Guide for details.)
279
     * @return first
280
     * @stable ICU 4.4
281
     */
282
    virtual UnicodeString &
283
    append(UnicodeString &first,
284
           const UnicodeString &second,
285
           UErrorCode &errorCode) const = 0;
286

287
    /**
288
     * Gets the decomposition mapping of c.
289
     * Roughly equivalent to normalizing the String form of c
290
     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
291
     * returns FALSE and does not write a string
292
     * if c does not have a decomposition mapping in this instance's data.
293
     * This function is independent of the mode of the Normalizer2.
294
     * @param c code point
295
     * @param decomposition String object which will be set to c's
296
     *                      decomposition mapping, if there is one.
297
     * @return TRUE if c has a decomposition, otherwise FALSE
298
     * @stable ICU 4.6
299
     */
300
    virtual UBool
301
    getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
302

303
    /**
304
     * Gets the raw decomposition mapping of c.
305
     *
306
     * This is similar to the getDecomposition() method but returns the
307
     * raw decomposition mapping as specified in UnicodeData.txt or
308
     * (for custom data) in the mapping files processed by the gennorm2 tool.
309
     * By contrast, getDecomposition() returns the processed,
310
     * recursively-decomposed version of this mapping.
311
     *
312
     * When used on a standard NFKC Normalizer2 instance,
313
     * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
314
     *
315
     * When used on a standard NFC Normalizer2 instance,
316
     * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
317
     * in this case, the result contains either one or two code points (=1..4 char16_ts).
318
     *
319
     * This function is independent of the mode of the Normalizer2.
320
     * The default implementation returns FALSE.
321
     * @param c code point
322
     * @param decomposition String object which will be set to c's
323
     *                      raw decomposition mapping, if there is one.
324
     * @return TRUE if c has a decomposition, otherwise FALSE
325
     * @stable ICU 49
326
     */
327
    virtual UBool
328
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
329

330
    /**
331
     * Performs pairwise composition of a & b and returns the composite if there is one.
332
     *
333
     * Returns a composite code point c only if c has a two-way mapping to a+b.
334
     * In standard Unicode normalization, this means that
335
     * c has a canonical decomposition to a+b
336
     * and c does not have the Full_Composition_Exclusion property.
337
     *
338
     * This function is independent of the mode of the Normalizer2.
339
     * The default implementation returns a negative value.
340
     * @param a A (normalization starter) code point.
341
     * @param b Another code point.
342
     * @return The non-negative composite code point if there is one; otherwise a negative value.
343
     * @stable ICU 49
344
     */
345
    virtual UChar32
346
    composePair(UChar32 a, UChar32 b) const;
347

348
    /**
349
     * Gets the combining class of c.
350
     * The default implementation returns 0
351
     * but all standard implementations return the Unicode Canonical_Combining_Class value.
352
     * @param c code point
353
     * @return c's combining class
354
     * @stable ICU 49
355
     */
356
    virtual uint8_t
357
    getCombiningClass(UChar32 c) const;
358

359
    /**
360
     * Tests if the string is normalized.
361
     * Internally, in cases where the quickCheck() method would return "maybe"
362
     * (which is only possible for the two COMPOSE modes) this method
363
     * resolves to "yes" or "no" to provide a definitive result,
364
     * at the cost of doing more work in those cases.
365
     * @param s input string
366
     * @param errorCode Standard ICU error code. Its input value must
367
     *                  pass the U_SUCCESS() test, or else the function returns
368
     *                  immediately. Check for U_FAILURE() on output or use with
369
     *                  function chaining. (See User Guide for details.)
370
     * @return TRUE if s is normalized
371
     * @stable ICU 4.4
372
     */
373
    virtual UBool
374
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
375
    /**
376
     * Tests if the UTF-8 string is normalized.
377
     * Internally, in cases where the quickCheck() method would return "maybe"
378
     * (which is only possible for the two COMPOSE modes) this method
379
     * resolves to "yes" or "no" to provide a definitive result,
380
     * at the cost of doing more work in those cases.
381
     *
382
     * This works for all normalization modes,
383
     * but it is currently optimized for UTF-8 only for "compose" modes,
384
     * such as for NFC, NFKC, and NFKC_Casefold
385
     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
386
     * For other modes it currently converts to UTF-16 and calls isNormalized().
387
     *
388
     * @param s UTF-8 input string
389
     * @param errorCode Standard ICU error code. Its input value must
390
     *                  pass the U_SUCCESS() test, or else the function returns
391
     *                  immediately. Check for U_FAILURE() on output or use with
392
     *                  function chaining. (See User Guide for details.)
393
     * @return TRUE if s is normalized
394
     * @stable ICU 60
395
     */
396
    virtual UBool
397
    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
398

399

400
    /**
401
     * Tests if the string is normalized.
402
     * For the two COMPOSE modes, the result could be "maybe" in cases that
403
     * would take a little more work to resolve definitively.
404
     * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
405
     * combination of quick check + normalization, to avoid
406
     * re-checking the "yes" prefix.
407
     * @param s input string
408
     * @param errorCode Standard ICU error code. Its input value must
409
     *                  pass the U_SUCCESS() test, or else the function returns
410
     *                  immediately. Check for U_FAILURE() on output or use with
411
     *                  function chaining. (See User Guide for details.)
412
     * @return UNormalizationCheckResult
413
     * @stable ICU 4.4
414
     */
415
    virtual UNormalizationCheckResult
416
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
417

418
    /**
419
     * Returns the end of the normalized substring of the input string.
420
     * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
421
     * the substring <code>UnicodeString(s, 0, end)</code>
422
     * will pass the quick check with a "yes" result.
423
     *
424
     * The returned end index is usually one or more characters before the
425
     * "no" or "maybe" character: The end index is at a normalization boundary.
426
     * (See the class documentation for more about normalization boundaries.)
427
     *
428
     * When the goal is a normalized string and most input strings are expected
429
     * to be normalized already, then call this method,
430
     * and if it returns a prefix shorter than the input string,
431
     * copy that prefix and use normalizeSecondAndAppend() for the remainder.
432
     * @param s input string
433
     * @param errorCode Standard ICU error code. Its input value must
434
     *                  pass the U_SUCCESS() test, or else the function returns
435
     *                  immediately. Check for U_FAILURE() on output or use with
436
     *                  function chaining. (See User Guide for details.)
437
     * @return "yes" span end index
438
     * @stable ICU 4.4
439
     */
440
    virtual int32_t
441
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
442

443
    /**
444
     * Tests if the character always has a normalization boundary before it,
445
     * regardless of context.
446
     * If true, then the character does not normalization-interact with
447
     * preceding characters.
448
     * In other words, a string containing this character can be normalized
449
     * by processing portions before this character and starting from this
450
     * character independently.
451
     * This is used for iterative normalization. See the class documentation for details.
452
     * @param c character to test
453
     * @return TRUE if c has a normalization boundary before it
454
     * @stable ICU 4.4
455
     */
456
    virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
457

458
    /**
459
     * Tests if the character always has a normalization boundary after it,
460
     * regardless of context.
461
     * If true, then the character does not normalization-interact with
462
     * following characters.
463
     * In other words, a string containing this character can be normalized
464
     * by processing portions up to this character and after this
465
     * character independently.
466
     * This is used for iterative normalization. See the class documentation for details.
467
     * Note that this operation may be significantly slower than hasBoundaryBefore().
468
     * @param c character to test
469
     * @return TRUE if c has a normalization boundary after it
470
     * @stable ICU 4.4
471
     */
472
    virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
473

474
    /**
475
     * Tests if the character is normalization-inert.
476
     * If true, then the character does not change, nor normalization-interact with
477
     * preceding or following characters.
478
     * In other words, a string containing this character can be normalized
479
     * by processing portions before this character and after this
480
     * character independently.
481
     * This is used for iterative normalization. See the class documentation for details.
482
     * Note that this operation may be significantly slower than hasBoundaryBefore().
483
     * @param c character to test
484
     * @return TRUE if c is normalization-inert
485
     * @stable ICU 4.4
486
     */
487
    virtual UBool isInert(UChar32 c) const = 0;
488
};
489

490
/**
491
 * Normalization filtered by a UnicodeSet.
492
 * Normalizes portions of the text contained in the filter set and leaves
493
 * portions not contained in the filter set unchanged.
494
 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
495
 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
496
 * This class implements all of (and only) the Normalizer2 API.
497
 * An instance of this class is unmodifiable/immutable but is constructed and
498
 * must be destructed by the owner.
499
 * @stable ICU 4.4
500
 */
501
class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
502
public:
503
    /**
504
     * Constructs a filtered normalizer wrapping any Normalizer2 instance
505
     * and a filter set.
506
     * Both are aliased and must not be modified or deleted while this object
507
     * is used.
508
     * The filter set should be frozen; otherwise the performance will suffer greatly.
509
     * @param n2 wrapped Normalizer2 instance
510
     * @param filterSet UnicodeSet which determines the characters to be normalized
511
     * @stable ICU 4.4
512
     */
513
    FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
514
            norm2(n2), set(filterSet) {}
515

516
    /**
517
     * Destructor.
518
     * @stable ICU 4.4
519
     */
520
    ~FilteredNormalizer2();
521

522
    /**
523
     * Writes the normalized form of the source string to the destination string
524
     * (replacing its contents) and returns the destination string.
525
     * The source and destination strings must be different objects.
526
     * @param src source string
527
     * @param dest destination string; its contents is replaced with normalized src
528
     * @param errorCode Standard ICU error code. Its input value must
529
     *                  pass the U_SUCCESS() test, or else the function returns
530
     *                  immediately. Check for U_FAILURE() on output or use with
531
     *                  function chaining. (See User Guide for details.)
532
     * @return dest
533
     * @stable ICU 4.4
534
     */
535
    virtual UnicodeString &
536
    normalize(const UnicodeString &src,
537
              UnicodeString &dest,
538
              UErrorCode &errorCode) const U_OVERRIDE;
539

540
    /**
541
     * Normalizes a UTF-8 string and optionally records how source substrings
542
     * relate to changed and unchanged result substrings.
543
     *
544
     * Currently implemented completely only for "compose" modes,
545
     * such as for NFC, NFKC, and NFKC_Casefold
546
     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
547
     * Otherwise currently converts to & from UTF-16 and does not support edits.
548
     *
549
     * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
550
     * @param src       Source UTF-8 string.
551
     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
552
     *                  sink.Flush() is called at the end.
553
     * @param edits     Records edits for index mapping, working with styled text,
554
     *                  and getting only changes (if any).
555
     *                  The Edits contents is undefined if any error occurs.
556
     *                  This function calls edits->reset() first unless
557
     *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
558
     * @param errorCode Standard ICU error code. Its input value must
559
     *                  pass the U_SUCCESS() test, or else the function returns
560
     *                  immediately. Check for U_FAILURE() on output or use with
561
     *                  function chaining. (See User Guide for details.)
562
     * @stable ICU 60
563
     */
564
    virtual void
565
    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
566
                  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
567

568
    /**
569
     * Appends the normalized form of the second string to the first string
570
     * (merging them at the boundary) and returns the first string.
571
     * The result is normalized if the first string was normalized.
572
     * The first and second strings must be different objects.
573
     * @param first string, should be normalized
574
     * @param second string, will be normalized
575
     * @param errorCode Standard ICU error code. Its input value must
576
     *                  pass the U_SUCCESS() test, or else the function returns
577
     *                  immediately. Check for U_FAILURE() on output or use with
578
     *                  function chaining. (See User Guide for details.)
579
     * @return first
580
     * @stable ICU 4.4
581
     */
582
    virtual UnicodeString &
583
    normalizeSecondAndAppend(UnicodeString &first,
584
                             const UnicodeString &second,
585
                             UErrorCode &errorCode) const U_OVERRIDE;
586
    /**
587
     * Appends the second string to the first string
588
     * (merging them at the boundary) and returns the first string.
589
     * The result is normalized if both the strings were normalized.
590
     * The first and second strings must be different objects.
591
     * @param first string, should be normalized
592
     * @param second string, should be normalized
593
     * @param errorCode Standard ICU error code. Its input value must
594
     *                  pass the U_SUCCESS() test, or else the function returns
595
     *                  immediately. Check for U_FAILURE() on output or use with
596
     *                  function chaining. (See User Guide for details.)
597
     * @return first
598
     * @stable ICU 4.4
599
     */
600
    virtual UnicodeString &
601
    append(UnicodeString &first,
602
           const UnicodeString &second,
603
           UErrorCode &errorCode) const U_OVERRIDE;
604

605
    /**
606
     * Gets the decomposition mapping of c.
607
     * For details see the base class documentation.
608
     *
609
     * This function is independent of the mode of the Normalizer2.
610
     * @param c code point
611
     * @param decomposition String object which will be set to c's
612
     *                      decomposition mapping, if there is one.
613
     * @return TRUE if c has a decomposition, otherwise FALSE
614
     * @stable ICU 4.6
615
     */
616
    virtual UBool
617
    getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
618

619
    /**
620
     * Gets the raw decomposition mapping of c.
621
     * For details see the base class documentation.
622
     *
623
     * This function is independent of the mode of the Normalizer2.
624
     * @param c code point
625
     * @param decomposition String object which will be set to c's
626
     *                      raw decomposition mapping, if there is one.
627
     * @return TRUE if c has a decomposition, otherwise FALSE
628
     * @stable ICU 49
629
     */
630
    virtual UBool
631
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
632

633
    /**
     * Performs pairwise composition of a and b and returns the composite,
     * if there is one.
     * For details see the base class documentation.
     *
     * This function is independent of the mode of the Normalizer2.
     * @param a A (normalization starter) code point.
     * @param b Another code point.
     * @return The non-negative composite code point if there is one;
     *         otherwise a negative value.
     * @stable ICU 49
     */
643
    virtual UChar32
644
    composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
645

646
    /**
     * Gets the combining class of the code point c.
     * The default implementation returns 0,
     * but all standard implementations return the Unicode
     * Canonical_Combining_Class value.
     * @param c code point
     * @return c's combining class, as a uint8_t
     * @stable ICU 49
     */
654
    virtual uint8_t
655
    getCombiningClass(UChar32 c) const U_OVERRIDE;
656

657
    /**
     * Tests if the string is normalized.
     * This is the UnicodeString counterpart of isNormalizedUTF8().
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @stable ICU 4.4
     */
668
    virtual UBool
669
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
670
    /**
     * Tests if the UTF-8 string is normalized.
     * Internally, in cases where the quickCheck() method would return "maybe"
     * (which is only possible for the two COMPOSE modes) this method
     * resolves to "yes" or "no" to provide a definitive result,
     * at the cost of doing more work in those cases.
     *
     * This works for all normalization modes,
     * but it is currently optimized for UTF-8 only for "compose" modes,
     * such as for NFC, NFKC, and NFKC_Casefold
     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
     * For other modes it currently converts to UTF-16 and calls isNormalized().
     *
     * @param s UTF-8 input string, passed by value as a StringPiece
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @stable ICU 60
     */
691
    virtual UBool
692
    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
693
    /**
     * Quick-checks whether the string is normalized.
     * Unlike isNormalized(), which returns a definitive TRUE or FALSE, the
     * result here is a UNormalizationCheckResult and may be inconclusive
     * ("maybe"), which is only possible for the two COMPOSE modes.
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return UNormalizationCheckResult
     * @stable ICU 4.4
     */
704
    virtual UNormalizationCheckResult
705
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
706
    /**
     * Returns the end of the normalized substring of the input string,
     * that is, the end index of the "yes" span.
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return "yes" span end index
     * @stable ICU 4.4
     */
717
    virtual int32_t
718
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
719

720
    /**
     * Tests if the character always has a normalization boundary before it,
     * regardless of context.
     * For details see the Normalizer2 base class documentation.
     * @param c character (code point) to test
     * @return TRUE if c has a normalization boundary before it
     * @stable ICU 4.4
     */
728
    virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
729

730
    /**
     * Tests if the character always has a normalization boundary after it,
     * regardless of context.
     * For details see the Normalizer2 base class documentation.
     * @param c character (code point) to test
     * @return TRUE if c has a normalization boundary after it
     * @stable ICU 4.4
     */
738
    virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
739

740
    /**
     * Tests if the character is normalization-inert.
     * For details see the Normalizer2 base class documentation.
     * @param c character (code point) to test
     * @return TRUE if c is normalization-inert
     * @stable ICU 4.4
     */
747
    virtual UBool isInert(UChar32 c) const U_OVERRIDE;
748
private:
749
    // Private helper: takes a source string, a destination string, an
    // explicit USetSpanCondition, and the standard ICU error code.
    // NOTE(review): presumably the shared implementation behind the public
    // normalize() entry point, with spanCondition selecting how the first
    // span of src is matched against the member `set` -- the definition is
    // not in this header, so confirm against the implementation file.
    // Returns a UnicodeString reference (presumably dest -- TODO confirm).
    UnicodeString &
750
    normalize(const UnicodeString &src,
751
              UnicodeString &dest,
752
              USetSpanCondition spanCondition,
753
              UErrorCode &errorCode) const;
754

755
    // Private helper for byte-oriented input: takes an options word, a
    // (src, length) char buffer, a ByteSink for output, an Edits pointer,
    // an explicit USetSpanCondition, and the standard ICU error code.
    // Returns nothing; results are delivered through sink/edits/errorCode.
    // NOTE(review): presumably src is UTF-8 and edits may be NULL when no
    // edit record is wanted -- neither is stated in this header, so confirm
    // against the implementation file.
    void
756
    normalizeUTF8(uint32_t options, const char *src, int32_t length,
757
                  ByteSink &sink, Edits *edits,
758
                  USetSpanCondition spanCondition,
759
                  UErrorCode &errorCode) const;
760

761
    // Private helper with the same (first, second, errorCode) parameters as
    // the public append(), plus a doNormalize flag.
    // NOTE(review): presumably the common implementation behind the public
    // normalizeSecondAndAppend() and append(), with doNormalize choosing
    // whether `second` is normalized before being appended -- the definition
    // is not in this header, so confirm against the implementation file.
    // Returns a UnicodeString reference (presumably first -- TODO confirm).
    UnicodeString &
762
    normalizeSecondAndAppend(UnicodeString &first,
763
                             const UnicodeString &second,
764
                             UBool doNormalize,
765
                             UErrorCode &errorCode) const;
766

767
    // Normalizer2 instance referenced (not copied) by this object. Because
    // it is a reference member, the referenced object must outlive this one.
    // NOTE(review): presumably the normalizer the overrides above delegate
    // to -- confirm against the implementation file.
    const Normalizer2 &norm2;
768
    // UnicodeSet referenced (not copied) by this object; it must likewise
    // outlive this one.
    // NOTE(review): presumably the set that the USetSpanCondition arguments
    // of the private helpers are applied to -- confirm.
    const UnicodeSet &set;
769
};
770

771
U_NAMESPACE_END
772

773
#endif  // !UCONFIG_NO_NORMALIZATION
774
#endif  // __NORMALIZER2_H__
775

776
Product

Resources

Company