Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/openjdk-aarch32-jdk8u
Path: blob/jdk8u272-b10-aarch32-20201026/jdk/src/share/native/common/unicode/normalizer2.h
48773 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
* Copyright (C) 2009-2013, International Business Machines
7
* Corporation and others. All Rights Reserved.
8
*
9
*******************************************************************************
10
* file name: normalizer2.h
11
* encoding: UTF-8
12
* tab size: 8 (not used)
13
* indentation:4
14
*
15
* created on: 2009nov22
16
* created by: Markus W. Scherer
17
*/
18
19
#ifndef __NORMALIZER2_H__
20
#define __NORMALIZER2_H__
21
22
/**
23
* \file
24
* \brief C++ API: New API for Unicode Normalization.
25
*/
26
27
#include "unicode/utypes.h"
28
29
#if !UCONFIG_NO_NORMALIZATION
30
31
#include "unicode/stringpiece.h"
32
#include "unicode/uniset.h"
33
#include "unicode/unistr.h"
34
#include "unicode/unorm2.h"
35
36
U_NAMESPACE_BEGIN
37
38
class ByteSink;
39
40
/**
41
* Unicode normalization functionality for standard Unicode normalization or
42
* for using custom mapping tables.
43
* All instances of this class are unmodifiable/immutable.
44
* Instances returned by getInstance() are singletons that must not be deleted by the caller.
45
* The Normalizer2 class is not intended for public subclassing.
46
*
47
* The primary functions are to produce a normalized string and to detect whether
48
* a string is already normalized.
49
* The most commonly used normalization forms are those defined in
50
* http://www.unicode.org/unicode/reports/tr15/
51
* However, this API supports additional normalization forms for specialized purposes.
52
* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
53
* and can be used in implementations of UTS #46.
54
*
55
* Not only are the standard compose and decompose modes supplied,
56
* but additional modes are provided as documented in the Mode enum.
57
*
58
* Some of the functions in this class identify normalization boundaries.
59
* At a normalization boundary, the portions of the string
60
* before it and starting from it do not interact and can be handled independently.
61
*
62
* The spanQuickCheckYes() stops at a normalization boundary.
63
* When the goal is a normalized string, then the text before the boundary
64
* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
65
*
66
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
67
* a character is guaranteed to be at a normalization boundary,
68
* regardless of context.
69
* This is used for moving from one normalization boundary to the next
70
* or preceding boundary, and for performing iterative normalization.
71
*
72
* Iterative normalization is useful when only a small portion of a
73
* longer string needs to be processed.
74
* For example, in ICU, iterative normalization is used by the NormalizationTransliterator
75
* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
76
* (to process only the substring for which sort key bytes are computed).
77
*
78
* The set of normalization boundaries returned by these functions may not be
79
* complete: There may be more boundaries that could be returned.
80
* Different functions may return different boundaries.
81
* @stable ICU 4.4
82
*/
83
class U_COMMON_API Normalizer2 : public UObject {
public:
    /**
     * Destructor.
     * @stable ICU 4.4
     */
    ~Normalizer2();

    /**
     * Returns a Normalizer2 instance for Unicode NFC normalization.
     * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
     * Returns an unmodifiable singleton instance. Do not delete it.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    static const Normalizer2 *
    getNFCInstance(UErrorCode &errorCode);

    /**
     * Returns a Normalizer2 instance for Unicode NFD normalization.
     * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
     * (The data name "nfc" is intentional: as documented for getInstance(),
     * NFC and NFD use the same name and differ only in the mode.)
     * Returns an unmodifiable singleton instance. Do not delete it.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    static const Normalizer2 *
    getNFDInstance(UErrorCode &errorCode);

    /**
     * Returns a Normalizer2 instance for Unicode NFKC normalization.
     * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
     * Returns an unmodifiable singleton instance. Do not delete it.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    static const Normalizer2 *
    getNFKCInstance(UErrorCode &errorCode);

    /**
     * Returns a Normalizer2 instance for Unicode NFKD normalization.
     * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
     * (The data name "nfkc" is intentional: as documented for getInstance(),
     * NFKC and NFKD use the same name and differ only in the mode.)
     * Returns an unmodifiable singleton instance. Do not delete it.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    static const Normalizer2 *
    getNFKDInstance(UErrorCode &errorCode);

    /**
     * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
     * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
     * Returns an unmodifiable singleton instance. Do not delete it.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    static const Normalizer2 *
    getNFKCCasefoldInstance(UErrorCode &errorCode);

    /**
     * Returns a Normalizer2 instance which uses the specified data file
     * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
     * and which composes or decomposes text according to the specified mode.
     * Returns an unmodifiable singleton instance. Do not delete it.
     *
     * Use packageName=NULL for data files that are part of ICU's own data.
     * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
     * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
     * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
     *
     * @param packageName NULL for ICU built-in data, otherwise application data package name
     * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
     * @param mode normalization mode (compose or decompose etc.)
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the requested Normalizer2, if successful
     * @stable ICU 4.4
     */
    static const Normalizer2 *
    getInstance(const char *packageName,
                const char *name,
                UNormalization2Mode mode,
                UErrorCode &errorCode);

    /**
     * Returns the normalized form of the source string.
     * Convenience wrapper: normalizes into a local UnicodeString via
     * normalize(src, dest, errorCode) and returns that string by value.
     * @param src source string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return normalized src
     * @stable ICU 4.4
     */
    UnicodeString
    normalize(const UnicodeString &src, UErrorCode &errorCode) const {
        UnicodeString result;
        normalize(src, result, errorCode);
        return result;
    }

    /**
     * Writes the normalized form of the source string to the destination string
     * (replacing its contents) and returns the destination string.
     * The source and destination strings must be different objects.
     * @param src source string
     * @param dest destination string; its contents are replaced with normalized src
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              UErrorCode &errorCode) const = 0;

    /**
     * Normalizes a UTF-8 string and optionally records how source substrings
     * relate to changed and unchanged result substrings.
     *
     * Currently implemented completely only for "compose" modes,
     * such as for NFC, NFKC, and NFKC_Casefold
     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
     * Otherwise currently converts to & from UTF-16 and does not support edits.
     *
     * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
     * @param src     Source UTF-8 string.
     * @param sink    A ByteSink to which the normalized UTF-8 result string is written.
     *                sink.Flush() is called at the end.
     * @param edits   Records edits for index mapping, working with styled text,
     *                and getting only changes (if any).
     *                The Edits contents are undefined if any error occurs.
     *                This function calls edits->reset() first unless
     *                options includes U_EDITS_NO_RESET. edits can be nullptr.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @stable ICU 60
     */
    virtual void
    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
                  Edits *edits, UErrorCode &errorCode) const;

    /**
     * Appends the normalized form of the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if the first string was normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, will be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UErrorCode &errorCode) const = 0;

    /**
     * Appends the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if both the strings were normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, should be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    append(UnicodeString &first,
           const UnicodeString &second,
           UErrorCode &errorCode) const = 0;

    /**
     * Gets the decomposition mapping of c.
     * Roughly equivalent to normalizing the String form of c
     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
     * returns FALSE and does not write a string
     * if c does not have a decomposition mapping in this instance's data.
     * This function is independent of the mode of the Normalizer2.
     * @param c code point
     * @param decomposition String object which will be set to c's
     *                      decomposition mapping, if there is one.
     * @return TRUE if c has a decomposition, otherwise FALSE
     * @stable ICU 4.6
     */
    virtual UBool
    getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;

    /**
     * Gets the raw decomposition mapping of c.
     *
     * This is similar to the getDecomposition() method but returns the
     * raw decomposition mapping as specified in UnicodeData.txt or
     * (for custom data) in the mapping files processed by the gennorm2 tool.
     * By contrast, getDecomposition() returns the processed,
     * recursively-decomposed version of this mapping.
     *
     * When used on a standard NFKC Normalizer2 instance,
     * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
     *
     * When used on a standard NFC Normalizer2 instance,
     * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
     * in this case, the result contains either one or two code points (=1..4 char16_ts).
     *
     * This function is independent of the mode of the Normalizer2.
     * The default implementation returns FALSE.
     * @param c code point
     * @param decomposition String object which will be set to c's
     *                      raw decomposition mapping, if there is one.
     * @return TRUE if c has a decomposition, otherwise FALSE
     * @stable ICU 49
     */
    virtual UBool
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;

    /**
     * Performs pairwise composition of a & b and returns the composite if there is one.
     *
     * Returns a composite code point c only if c has a two-way mapping to a+b.
     * In standard Unicode normalization, this means that
     * c has a canonical decomposition to a+b
     * and c does not have the Full_Composition_Exclusion property.
     *
     * This function is independent of the mode of the Normalizer2.
     * The default implementation returns a negative value.
     * @param a A (normalization starter) code point.
     * @param b Another code point.
     * @return The non-negative composite code point if there is one; otherwise a negative value.
     * @stable ICU 49
     */
    virtual UChar32
    composePair(UChar32 a, UChar32 b) const;

    /**
     * Gets the combining class of c.
     * The default implementation returns 0
     * but all standard implementations return the Unicode Canonical_Combining_Class value.
     * @param c code point
     * @return c's combining class
     * @stable ICU 49
     */
    virtual uint8_t
    getCombiningClass(UChar32 c) const;

    /**
     * Tests if the string is normalized.
     * Internally, in cases where the quickCheck() method would return "maybe"
     * (which is only possible for the two COMPOSE modes) this method
     * resolves to "yes" or "no" to provide a definitive result,
     * at the cost of doing more work in those cases.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @stable ICU 4.4
     */
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Tests if the UTF-8 string is normalized.
     * Internally, in cases where the quickCheck() method would return "maybe"
     * (which is only possible for the two COMPOSE modes) this method
     * resolves to "yes" or "no" to provide a definitive result,
     * at the cost of doing more work in those cases.
     *
     * This works for all normalization modes,
     * but it is currently optimized for UTF-8 only for "compose" modes,
     * such as for NFC, NFKC, and NFKC_Casefold
     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
     * For other modes it currently converts to UTF-16 and calls isNormalized().
     *
     * @param s UTF-8 input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @stable ICU 60
     */
    virtual UBool
    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;

    /**
     * Tests if the string is normalized.
     * For the two COMPOSE modes, the result could be "maybe" in cases that
     * would take a little more work to resolve definitively.
     * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
     * combination of quick check + normalization, to avoid
     * re-checking the "yes" prefix.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return UNormalizationCheckResult
     * @stable ICU 4.4
     */
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Returns the end of the normalized substring of the input string.
     * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
     * the substring <code>UnicodeString(s, 0, end)</code>
     * will pass the quick check with a "yes" result.
     *
     * The returned end index is usually one or more characters before the
     * "no" or "maybe" character: The end index is at a normalization boundary.
     * (See the class documentation for more about normalization boundaries.)
     *
     * When the goal is a normalized string and most input strings are expected
     * to be normalized already, then call this method,
     * and if it returns a prefix shorter than the input string,
     * copy that prefix and use normalizeSecondAndAppend() for the remainder.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return "yes" span end index
     * @stable ICU 4.4
     */
    virtual int32_t
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;

    /**
     * Tests if the character always has a normalization boundary before it,
     * regardless of context.
     * If true, then the character does not normalization-interact with
     * preceding characters.
     * In other words, a string containing this character can be normalized
     * by processing portions before this character and starting from this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * @param c character to test
     * @return TRUE if c has a normalization boundary before it
     * @stable ICU 4.4
     */
    virtual UBool hasBoundaryBefore(UChar32 c) const = 0;

    /**
     * Tests if the character always has a normalization boundary after it,
     * regardless of context.
     * If true, then the character does not normalization-interact with
     * following characters.
     * In other words, a string containing this character can be normalized
     * by processing portions up to this character and after this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * Note that this operation may be significantly slower than hasBoundaryBefore().
     * @param c character to test
     * @return TRUE if c has a normalization boundary after it
     * @stable ICU 4.4
     */
    virtual UBool hasBoundaryAfter(UChar32 c) const = 0;

    /**
     * Tests if the character is normalization-inert.
     * If true, then the character does not change, nor normalization-interact with
     * preceding or following characters.
     * In other words, a string containing this character can be normalized
     * by processing portions before this character and after this
     * character independently.
     * This is used for iterative normalization. See the class documentation for details.
     * Note that this operation may be significantly slower than hasBoundaryBefore().
     * @param c character to test
     * @return TRUE if c is normalization-inert
     * @stable ICU 4.4
     */
    virtual UBool isInert(UChar32 c) const = 0;
};
489
490
/**
491
* Normalization filtered by a UnicodeSet.
492
* Normalizes portions of the text contained in the filter set and leaves
493
* portions not contained in the filter set unchanged.
494
* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
495
* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
496
* This class implements all of (and only) the Normalizer2 API.
497
* An instance of this class is unmodifiable/immutable but is constructed and
498
* must be destructed by the owner.
499
* @stable ICU 4.4
500
*/
501
class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
public:
    /**
     * Constructs a filtered normalizer wrapping any Normalizer2 instance
     * and a filter set.
     * Both are aliased and must not be modified or deleted while this object
     * is used.
     * The filter set should be frozen; otherwise the performance will suffer greatly.
     * @param n2 wrapped Normalizer2 instance
     * @param filterSet UnicodeSet which determines the characters to be normalized
     * @stable ICU 4.4
     */
    FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
            norm2(n2), set(filterSet) {}

    /**
     * Destructor.
     * @stable ICU 4.4
     */
    ~FilteredNormalizer2();

    /**
     * Makes the inherited convenience overload
     * Normalizer2::normalize(const UnicodeString &src, UErrorCode &errorCode)
     * callable on a FilteredNormalizer2 object directly.
     * Without this using-declaration the normalize() declarations in this class
     * hide every base-class overload of the same name, so the two-argument form
     * would only be reachable through a Normalizer2 reference or pointer.
     */
    using Normalizer2::normalize;

    /**
     * Writes the normalized form of the source string to the destination string
     * (replacing its contents) and returns the destination string.
     * The source and destination strings must be different objects.
     * @param src source string
     * @param dest destination string; its contents are replaced with normalized src
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              UErrorCode &errorCode) const U_OVERRIDE;

    /**
     * Normalizes a UTF-8 string and optionally records how source substrings
     * relate to changed and unchanged result substrings.
     *
     * Currently implemented completely only for "compose" modes,
     * such as for NFC, NFKC, and NFKC_Casefold
     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
     * Otherwise currently converts to & from UTF-16 and does not support edits.
     *
     * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
     * @param src     Source UTF-8 string.
     * @param sink    A ByteSink to which the normalized UTF-8 result string is written.
     *                sink.Flush() is called at the end.
     * @param edits   Records edits for index mapping, working with styled text,
     *                and getting only changes (if any).
     *                The Edits contents are undefined if any error occurs.
     *                This function calls edits->reset() first unless
     *                options includes U_EDITS_NO_RESET. edits can be nullptr.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @stable ICU 60
     */
    virtual void
    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
                  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;

    /**
     * Appends the normalized form of the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if the first string was normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, will be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UErrorCode &errorCode) const U_OVERRIDE;

    /**
     * Appends the second string to the first string
     * (merging them at the boundary) and returns the first string.
     * The result is normalized if both the strings were normalized.
     * The first and second strings must be different objects.
     * @param first string, should be normalized
     * @param second string, should be normalized
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return first
     * @stable ICU 4.4
     */
    virtual UnicodeString &
    append(UnicodeString &first,
           const UnicodeString &second,
           UErrorCode &errorCode) const U_OVERRIDE;

    /**
     * Gets the decomposition mapping of c.
     * For details see the base class documentation.
     *
     * This function is independent of the mode of the Normalizer2.
     * @param c code point
     * @param decomposition String object which will be set to c's
     *                      decomposition mapping, if there is one.
     * @return TRUE if c has a decomposition, otherwise FALSE
     * @stable ICU 4.6
     */
    virtual UBool
    getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;

    /**
     * Gets the raw decomposition mapping of c.
     * For details see the base class documentation.
     *
     * This function is independent of the mode of the Normalizer2.
     * @param c code point
     * @param decomposition String object which will be set to c's
     *                      raw decomposition mapping, if there is one.
     * @return TRUE if c has a decomposition, otherwise FALSE
     * @stable ICU 49
     */
    virtual UBool
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;

    /**
     * Performs pairwise composition of a & b and returns the composite if there is one.
     * For details see the base class documentation.
     *
     * This function is independent of the mode of the Normalizer2.
     * @param a A (normalization starter) code point.
     * @param b Another code point.
     * @return The non-negative composite code point if there is one; otherwise a negative value.
     * @stable ICU 49
     */
    virtual UChar32
    composePair(UChar32 a, UChar32 b) const U_OVERRIDE;

    /**
     * Gets the combining class of c.
     * For details see the Normalizer2 base class documentation.
     * @param c code point
     * @return c's combining class
     * @stable ICU 49
     */
    virtual uint8_t
    getCombiningClass(UChar32 c) const U_OVERRIDE;

    /**
     * Tests if the string is normalized.
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @stable ICU 4.4
     */
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;

    /**
     * Tests if the UTF-8 string is normalized.
     * Internally, in cases where the quickCheck() method would return "maybe"
     * (which is only possible for the two COMPOSE modes) this method
     * resolves to "yes" or "no" to provide a definitive result,
     * at the cost of doing more work in those cases.
     *
     * This works for all normalization modes,
     * but it is currently optimized for UTF-8 only for "compose" modes,
     * such as for NFC, NFKC, and NFKC_Casefold
     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
     * For other modes it currently converts to UTF-16 and calls isNormalized().
     *
     * @param s UTF-8 input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return TRUE if s is normalized
     * @stable ICU 60
     */
    virtual UBool
    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;

    /**
     * Tests if the string is normalized.
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return UNormalizationCheckResult
     * @stable ICU 4.4
     */
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;

    /**
     * Returns the end of the normalized substring of the input string.
     * For details see the Normalizer2 base class documentation.
     * @param s input string
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return "yes" span end index
     * @stable ICU 4.4
     */
    virtual int32_t
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;

    /**
     * Tests if the character always has a normalization boundary before it,
     * regardless of context.
     * For details see the Normalizer2 base class documentation.
     * @param c character to test
     * @return TRUE if c has a normalization boundary before it
     * @stable ICU 4.4
     */
    virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;

    /**
     * Tests if the character always has a normalization boundary after it,
     * regardless of context.
     * For details see the Normalizer2 base class documentation.
     * @param c character to test
     * @return TRUE if c has a normalization boundary after it
     * @stable ICU 4.4
     */
    virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;

    /**
     * Tests if the character is normalization-inert.
     * For details see the Normalizer2 base class documentation.
     * @param c character to test
     * @return TRUE if c is normalization-inert
     * @stable ICU 4.4
     */
    virtual UBool isInert(UChar32 c) const U_OVERRIDE;

private:
    // Internal overload of normalize() that additionally takes a
    // USetSpanCondition. Defined out of line.
    // NOTE(review): presumably spanCondition selects how the first filter-set
    // span of src is classified — confirm against the implementation file.
    UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              USetSpanCondition spanCondition,
              UErrorCode &errorCode) const;

    // Internal overload of normalizeUTF8() working on a raw (pointer, length)
    // UTF-8 buffer plus a USetSpanCondition. Defined out of line.
    // NOTE(review): spanCondition presumably plays the same role as in the
    // private normalize() above — confirm against the implementation file.
    void
    normalizeUTF8(uint32_t options, const char *src, int32_t length,
                  ByteSink &sink, Edits *edits,
                  USetSpanCondition spanCondition,
                  UErrorCode &errorCode) const;

    // Internal overload of normalizeSecondAndAppend() with an extra flag.
    // Defined out of line.
    // NOTE(review): presumably the shared implementation behind the public
    // normalizeSecondAndAppend() (doNormalize true) and append()
    // (doNormalize false) — confirm against the implementation file.
    UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UBool doNormalize,
                             UErrorCode &errorCode) const;

    // Wrapped normalizer; aliased (not owned), bound in the constructor.
    const Normalizer2 &norm2;
    // Filter set; aliased (not owned), bound in the constructor.
    const UnicodeSet &set;
};
770
771
U_NAMESPACE_END
772
773
#endif // !UCONFIG_NO_NORMALIZATION
774
#endif // __NORMALIZER2_H__
775
776