Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/icui18n/collationdatabuilder.cpp
12343 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2012-2015, International Business Machines
6
* Corporation and others. All Rights Reserved.
7
*******************************************************************************
8
* collationdatabuilder.cpp
9
*
10
* (replaced the former ucol_elm.cpp)
11
*
12
* created on: 2012apr01
13
* created by: Markus W. Scherer
14
*/
15
16
#include "unicode/utypes.h"
17
18
#if !UCONFIG_NO_COLLATION
19
20
#include "unicode/localpointer.h"
21
#include "unicode/uchar.h"
22
#include "unicode/ucharstrie.h"
23
#include "unicode/ucharstriebuilder.h"
24
#include "unicode/uniset.h"
25
#include "unicode/unistr.h"
26
#include "unicode/usetiter.h"
27
#include "unicode/utf16.h"
28
#include "cmemory.h"
29
#include "collation.h"
30
#include "collationdata.h"
31
#include "collationdatabuilder.h"
32
#include "collationfastlatinbuilder.h"
33
#include "collationiterator.h"
34
#include "normalizer2impl.h"
35
#include "utrie2.h"
36
#include "uvectr32.h"
37
#include "uvectr64.h"
38
#include "uvector.h"
39
40
U_NAMESPACE_BEGIN
41
42
// Out-of-line definition of the CEModifier destructor; it has no work to do.
CollationDataBuilder::CEModifier::~CEModifier() = default;
43
44
/**
45
* Build-time context and CE32 for a code point.
46
* If a code point has contextual mappings, then the default (no-context) mapping
47
* and all conditional mappings are stored in a singly-linked list
48
* of ConditionalCE32, sorted by context strings.
49
*
50
* Context strings sort by prefix length, then by prefix, then by contraction suffix.
51
* Context strings must be unique and in ascending order.
52
*/
53
struct ConditionalCE32 : public UMemory {
54
ConditionalCE32()
55
: context(),
56
ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
57
next(-1) {}
58
ConditionalCE32(const UnicodeString &ct, uint32_t ce)
59
: context(ct),
60
ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
61
next(-1) {}
62
63
inline UBool hasContext() const { return context.length() > 1; }
64
inline int32_t prefixLength() const { return context.charAt(0); }
65
66
/**
67
* "\0" for the first entry for any code point, with its default CE32.
68
*
69
* Otherwise one unit with the length of the prefix string,
70
* then the prefix string, then the contraction suffix.
71
*/
72
UnicodeString context;
73
/**
74
* CE32 for the code point and its context.
75
* Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
76
*/
77
uint32_t ce32;
78
/**
79
* Default CE32 for all contexts with this same prefix.
80
* Initially NO_CE32. Set only while building runtime data structures,
81
* and only on one of the nodes of a sub-list with the same prefix.
82
*/
83
uint32_t defaultCE32;
84
/**
85
* CE32 for the built contexts.
86
* When fetching CEs from the builder, the contexts are built into their runtime form
87
* so that the normal collation implementation can process them.
88
* The result is cached in the list head. It is reset when the contexts are modified.
89
* All of these builtCE32 are invalidated by clearContexts(),
90
* via incrementing the contextsEra.
91
*/
92
uint32_t builtCE32;
93
/**
94
* The "era" of building intermediate contexts when the above builtCE32 was set.
95
* When the array of cached, temporary contexts overflows, then clearContexts()
96
* removes them all and invalidates the builtCE32 that used to point to built tries.
97
*/
98
int32_t era = -1;
99
/**
100
* Index of the next ConditionalCE32.
101
* Negative for the end of the list.
102
*/
103
int32_t next;
104
// Note: We could create a separate class for all of the contextual mappings for
105
// a code point, with the builtCE32, the era, and a list of the actual mappings.
106
// The class that represents one mapping would then not need to
107
// store those fields in each element.
108
};
109
110
U_CDECL_BEGIN
111
112
void U_CALLCONV
113
uprv_deleteConditionalCE32(void *obj) {
114
delete static_cast<ConditionalCE32 *>(obj);
115
}
116
117
U_CDECL_END
118
119
/**
120
* Build-time collation element and character iterator.
121
* Uses the runtime CollationIterator for fetching CEs for a string
122
* but reads from the builder's unfinished data structures.
123
* In particular, this class reads from the unfinished trie
124
* and has to avoid CollationIterator::nextCE() and redirect other
125
* calls to data->getCE32() and data->getCE32FromSupplementary().
126
*
127
* We do this so that we need not implement the collation algorithm
128
* again for the builder and make it behave exactly like the runtime code.
129
* That would be more difficult to test and maintain than this indirection.
130
*
131
* Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
132
* so the data accesses from those code paths need not be modified.
133
*
134
* This class iterates directly over whole code points
135
* so that the CollationIterator does not need the finished trie
136
* for handling the LEAD_SURROGATE_TAG.
137
*/
138
class DataBuilderCollationIterator : public CollationIterator {
139
public:
140
DataBuilderCollationIterator(CollationDataBuilder &b);
141
142
virtual ~DataBuilderCollationIterator();
143
144
int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength);
145
146
virtual void resetToOffset(int32_t newOffset) override;
147
virtual int32_t getOffset() const override;
148
149
virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
150
virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
151
152
protected:
153
virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
154
virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
155
156
virtual uint32_t getDataCE32(UChar32 c) const override;
157
virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) override;
158
159
CollationDataBuilder &builder;
160
CollationData builderData;
161
uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
162
const UnicodeString *s;
163
int32_t pos;
164
};
165
166
/**
 * Binds the iterator to builder b: the base-class iterator works on builderData,
 * which shares the builder's base data and uses indirection CE32s for all Jamo.
 */
DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b)
        : CollationIterator(&builderData, /*numeric=*/ false),
          builder(b), builderData(b.nfcImpl),
          s(NULL), pos(0) {
    builderData.base = builder.base;
    // Every jamoCE32s[] entry becomes a BUILDER_DATA_TAG CE32 that carries the
    // Jamo code point and is flagged with IS_BUILDER_JAMO_CE32.
    int32_t index = 0;
    while(index < CollationData::JAMO_CE32S_LENGTH) {  // Count across Jamo types.
        UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(index);
        uint32_t indirection =
            Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo);
        jamoCE32s[index] = indirection | CollationDataBuilder::IS_BUILDER_JAMO_CE32;
        ++index;
    }
    builderData.jamoCE32s = jamoCE32s;
}
179
180
// The destructor body is empty.
DataBuilderCollationIterator::~DataBuilderCollationIterator() = default;
181
182
/**
 * Fetches the CEs for str, starting at code unit index start.
 * Non-zero CEs are written to ces[] beginning at index cesLength, as long as the
 * index stays below Collation::MAX_EXPANSION_LENGTH; the count keeps growing
 * beyond that limit so that the caller can detect an overflow.
 *
 * @return cesLength plus the number of non-zero CEs that were found
 */
int32_t
DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start,
                                       int64_t ces[], int32_t cesLength) {
    // Refresh the array pointers on every call, in case the builder reallocated.
    builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer());
    builderData.ces = builder.ce64s.getBuffer();
    builderData.contexts = builder.contexts.getBuffer();
    // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
    reset();
    s = &str;
    pos = start;
    UErrorCode errorCode = U_ZERO_ERROR;
    while(U_SUCCESS(errorCode) && pos < s->length()) {
        // The iterator buffer need not keep the CEs of earlier code points.
        clearCEs();
        const UChar32 c = s->char32At(pos);
        pos += U16_LENGTH(c);
        uint32_t ce32 = utrie2_get32(builder.trie, c);
        // Read from the builder data unless the builder falls back to the base.
        const CollationData *d = &builderData;
        if(ce32 == Collation::FALLBACK_CE32) {
            d = builder.base;
            ce32 = builder.base->getCE32(c);
        }
        appendCEsFromCE32(d, c, ce32, /*forward=*/ true, errorCode);
        U_ASSERT(U_SUCCESS(errorCode));
        const int32_t numCEs = getCEsLength();
        for(int32_t i = 0; i < numCEs; ++i) {
            const int64_t ce = getCE(i);
            if(ce == 0) { continue; }
            if(cesLength < Collation::MAX_EXPANSION_LENGTH) {
                ces[cesLength] = ce;
            }
            ++cesLength;
        }
    }
    return cesLength;
}
221
222
void
223
DataBuilderCollationIterator::resetToOffset(int32_t newOffset) {
224
reset();
225
pos = newOffset;
226
}
227
228
/** @return the current code unit index into the string */
int32_t
DataBuilderCollationIterator::getOffset() const {
    return this->pos;
}
232
233
/**
 * Reads the code point at the current position and moves past it.
 * @return the code point, or U_SENTINEL if the position is at the end of the string
 */
UChar32
DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    if(pos != s->length()) {
        const UChar32 cp = s->char32At(pos);
        pos += U16_LENGTH(cp);
        return cp;
    }
    return U_SENTINEL;
}
242
243
/**
 * Reads the code point just before the current position and moves back over it.
 * @return the code point, or U_SENTINEL if the position is at the start of the string
 */
UChar32
DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    if(pos != 0) {
        const UChar32 cp = s->char32At(pos - 1);
        pos -= U16_LENGTH(cp);
        return cp;
    }
    return U_SENTINEL;
}
252
253
void
254
DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
255
pos = s->moveIndex32(pos, num);
256
}
257
258
void
259
DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
260
pos = s->moveIndex32(pos, -num);
261
}
262
263
/** @return the CE32 that the builder's (unfinished) trie currently stores for c */
uint32_t
DataBuilderCollationIterator::getDataCE32(UChar32 c) const {
    const uint32_t ce32 = utrie2_get32(builder.trie, c);
    return ce32;
}
267
268
/**
 * Resolves a BUILDER_DATA_TAG CE32.
 * A Jamo indirection CE32 yields the trie value for that Jamo. Any other value
 * refers to a ConditionalCE32 list whose contexts are built into runtime form
 * on demand; the result is cached in the list head together with the contexts era.
 *
 * @return the resolved CE32, or 0 on failure
 */
uint32_t
DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return 0; }
    U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG));
    if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) {
        const UChar32 jamo = Collation::indexFromCE32(ce32);
        return utrie2_get32(builder.trie, jamo);
    }
    ConditionalCE32 *head = builder.getConditionalCE32ForCE32(ce32);
    if (head == nullptr) {
        errorCode = U_INTERNAL_PROGRAM_ERROR;
        // TODO: ICU-21531 figure out why this happens.
        return 0;
    }
    const UBool cached =
        head->builtCE32 != Collation::NO_CE32 && head->era == builder.contextsEra;
    if(cached) {
        return head->builtCE32;
    }
    // Build the context-sensitive mappings into their runtime form and cache the result.
    head->builtCE32 = builder.buildContext(head, errorCode);
    if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
        // Drop all temporary contexts and try once more.
        errorCode = U_ZERO_ERROR;
        builder.clearContexts();
        head->builtCE32 = builder.buildContext(head, errorCode);
    }
    head->era = builder.contextsEra;
    // Refresh the pointer after building contexts.
    builderData.contexts = builder.contexts.getBuffer();
    return head->builtCE32;
}
296
297
// ------------------------------------------------------------------------- ***
298
299
/**
 * Creates an empty builder without a trie; initForTailoring() opens the trie.
 * Outside of ICU4X mode the first CE32 slot is reserved for U+0000.
 */
CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
        : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
          base(NULL),
          baseSettings(NULL),
          trie(NULL),
          ce32s(errorCode),
          ce64s(errorCode),
          conditionalCE32s(errorCode),
          modified(false),
          icu4xMode(icu4xMode),
          fastLatinEnabled(false),
          fastLatinBuilder(NULL),
          collIter(NULL) {
    if (!icu4xMode) {
        // Reserve the first CE32 for U+0000.
        ce32s.addElement(0, errorCode);
    }
    // conditionalCE32s deletes its elements through this callback.
    conditionalCE32s.setDeleter(uprv_deleteConditionalCE32);
}
314
315
/** Closes the trie and deletes the fast-Latin builder and the build-time iterator. */
CollationDataBuilder::~CollationDataBuilder() {
    utrie2_close(this->trie);
    delete this->fastLatinBuilder;
    delete this->collIter;
}
320
321
void
322
CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) {
323
if(U_FAILURE(errorCode)) { return; }
324
if(trie != NULL) {
325
errorCode = U_INVALID_STATE_ERROR;
326
return;
327
}
328
if(b == NULL) {
329
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
330
return;
331
}
332
base = b;
333
334
// For a tailoring, the default is to fall back to the base.
335
// For ICU4X, use the same value for fallback as for the default
336
// to avoid having to have different blocks for the two.
337
trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode);
338
339
if (!icu4xMode) {
340
// Set the Latin-1 letters block so that it is allocated first in the data array,
341
// to try to improve locality of reference when sorting Latin-1 text.
342
// Do not use utrie2_setRange32() since that will not actually allocate blocks
343
// that are filled with the default value.
344
// ASCII (0..7F) is already preallocated anyway.
345
for(UChar32 c = 0xc0; c <= 0xff; ++c) {
346
utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
347
}
348
349
// Hangul syllables are not tailorable (except via tailoring Jamos).
350
// Always set the Hangul tag to help performance.
351
// Do this here, rather than in buildMappings(),
352
// so that we see the HANGUL_TAG in various assertions.
353
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
354
utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
355
356
// Copy the set contents but don't copy/clone the set as a whole because
357
// that would copy the isFrozen state too.
358
unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
359
}
360
361
if(U_FAILURE(errorCode)) { return; }
362
}
363
364
/**
 * Tries to map start..end to a single OFFSET_TAG CE32 whose data CE stores
 * the primary, the start code point and the step.
 *
 * @return true if the offset range was set; false if the range is not worth it
 *         or if an error occurred
 */
UBool
CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end,
                                           uint32_t primary, int32_t step,
                                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return false; }
    U_ASSERT(start <= end);
    // TODO: Do we need to check what values are currently set for start..end?
    // An offset range is worth it only if we can achieve an overlap between
    // adjacent UTrie2 blocks of 32 code points each.
    // An offset CE is also a little more expensive to look up and compute
    // than a simple CE.
    // If the range spans at least three UTrie2 block boundaries (> 64 code points),
    // then we take it.
    // If the range spans one or two block boundaries and there are
    // at least 4 code points on either side, then we take it.
    // (We could additionally require a minimum range length of, say, 16.)
    const int32_t blockDelta = (end >> 5) - (start >> 5);
    const UBool stepFits = 2 <= step && step <= 0x7f;
    const UBool spansEnough =
        blockDelta >= 3 ||
        (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3);
    if(!(stepFits && spansEnough)) {
        return false;
    }
    // Data CE layout: primary in the upper 32 bits, then start << 8, then the step;
    // bit 7 flags a compressible primary.
    int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step;
    if(isCompressiblePrimary(primary)) { dataCE |= 0x80; }
    const int32_t index = addCE(dataCE, errorCode);
    if(U_FAILURE(errorCode)) { return false; }
    if(index > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return false;
    }
    const uint32_t offsetCE32 =
        Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index);
    utrie2_setRange32(trie, start, end, offsetCE32, true, &errorCode);
    modified = true;
    return true;
}
400
401
/**
 * Maps the code points start..end to consecutive three-byte primaries that
 * begin with primary and are step apart.
 * Uses one offset range if maybeSetPrimaryRange() accepts it; otherwise sets
 * an individual long-primary CE32 for each code point.
 *
 * @return the next primary after the range, or 0 if errorCode was already a failure
 */
uint32_t
CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
                                                   uint32_t primary, int32_t step,
                                                   UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    UBool isCompressible = isCompressiblePrimary(primary);
    if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) {
        return Collation::incThreeBytePrimaryByOffset(primary, isCompressible,
                                                      (end - start + 1) * step);
    }
    // Short range: Set individual CE32s.
    // Mark the builder as modified before the loop: the loop exits only via
    // return, so a statement placed after it would never execute.
    modified = true;
    for(;;) {
        utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode);
        ++start;
        primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step);
        if(start > end) { return primary; }
    }
}
421
422
/**
 * Turns an OFFSET_TAG CE32 into the long-primary CE32 for code point c.
 * The data CE is read from the base data if fromBase, otherwise from this builder.
 */
uint32_t
CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const {
    const int32_t index = Collation::indexFromCE32(ce32);
    int64_t dataCE;
    if(fromBase) {
        dataCE = base->ces[index];
    } else {
        dataCE = ce64s.elementAti(index);
    }
    const uint32_t primary = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
    return Collation::makeLongPrimaryCE32(primary);
}
429
430
/** Forwards the question to the base data; requires base to be set. */
UBool
CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const {
    const UBool compressible = base->isCompressibleLeadByte(b);
    return compressible;
}
434
435
/** @return whether the CE32 stored in the builder's trie for c is an assigned CE32 */
UBool
CollationDataBuilder::isAssigned(UChar32 c) const {
    const uint32_t ce32 = utrie2_get32(trie, c);
    return Collation::isAssignedCE32(ce32);
}
439
440
/**
 * @return the primary weight if the trie maps c to a long-primary CE32, otherwise 0
 */
uint32_t
CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const {
    const uint32_t ce32 = utrie2_get32(trie, c);
    return Collation::isLongPrimaryCE32(ce32) ?
        Collation::primaryFromLongPrimaryCE32(ce32) : 0;
}
449
450
/**
 * Returns the single CE that c maps to, reading from the builder data or,
 * if the trie holds FALLBACK_CE32 for c, from the base data.
 *
 * Sets U_UNSUPPORTED_ERROR if c has a contextual mapping, maps to more than
 * one CE, or is a Hangul syllable or lead surrogate.
 * Sets U_INTERNAL_PROGRAM_ERROR for tags that must not occur here.
 *
 * @return the CE, or 0 on failure
 */
int64_t
CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    // Keep parallel with CollationData::getSingleCE().
    UBool fromBase = false;
    uint32_t ce32 = utrie2_get32(trie, c);
    if(ce32 == Collation::FALLBACK_CE32) {
        fromBase = true;
        ce32 = base->getCE32(c);
    }
    while(Collation::isSpecialCE32(ce32)) {
        switch(Collation::tagFromCE32(ce32)) {
        case Collation::LATIN_EXPANSION_TAG:
        case Collation::BUILDER_DATA_TAG:
        case Collation::PREFIX_TAG:
        case Collation::CONTRACTION_TAG:
        case Collation::HANGUL_TAG:
        case Collation::LEAD_SURROGATE_TAG:
            errorCode = U_UNSUPPORTED_ERROR;
            return 0;
        case Collation::FALLBACK_TAG:
        case Collation::RESERVED_TAG_3:
            errorCode = U_INTERNAL_PROGRAM_ERROR;
            return 0;
        case Collation::LONG_PRIMARY_TAG:
            return Collation::ceFromLongPrimaryCE32(ce32);
        case Collation::LONG_SECONDARY_TAG:
            return Collation::ceFromLongSecondaryCE32(ce32);
        case Collation::EXPANSION32_TAG:
            if(Collation::lengthFromCE32(ce32) == 1) {
                int32_t i = Collation::indexFromCE32(ce32);
                ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i);
                break;
            } else {
                errorCode = U_UNSUPPORTED_ERROR;
                return 0;
            }
        case Collation::EXPANSION_TAG: {
            if(Collation::lengthFromCE32(ce32) == 1) {
                int32_t i = Collation::indexFromCE32(ce32);
                return fromBase ? base->ces[i] : ce64s.elementAti(i);
            } else {
                errorCode = U_UNSUPPORTED_ERROR;
                return 0;
            }
        }
        case Collation::DIGIT_TAG: {
            // Fetch the non-numeric-collation CE32 and continue.
            // The index belongs to whichever data the CE32 came from, so a
            // base CE32 must be resolved against the base array, like the
            // EXPANSION32_TAG and U0000_TAG cases.
            int32_t i = Collation::indexFromCE32(ce32);
            ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i);
            break;
        }
        case Collation::U0000_TAG:
            U_ASSERT(c == 0);
            // Fetch the normal ce32 for U+0000 and continue.
            ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0);
            break;
        case Collation::OFFSET_TAG:
            ce32 = getCE32FromOffsetCE32(fromBase, c, ce32);
            break;
        case Collation::IMPLICIT_TAG:
            return Collation::unassignedCEFromCodePoint(c);
        }
    }
    return Collation::ceFromSimpleCE32(ce32);
}
514
515
/**
 * Returns the index of ce in ce64s, appending it first if it is not stored yet.
 * The lookup is a linear scan over the existing elements.
 */
int32_t
CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) {
    const int32_t oldSize = ce64s.size();
    int32_t index = 0;
    while(index < oldSize) {
        if(ce64s.elementAti(index) == ce) { return index; }
        ++index;
    }
    // Not found: the new element lands at the old size.
    ce64s.addElement(ce, errorCode);
    return oldSize;
}
524
525
/**
 * Returns the index of ce32 in ce32s, appending it first if it is not stored yet.
 * The lookup is a linear scan over the existing elements.
 */
int32_t
CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) {
    const int32_t oldSize = ce32s.size();
    int32_t index = 0;
    while(index < oldSize) {
        if((uint32_t)ce32s.elementAti(index) == ce32) { return index; }
        ++index;
    }
    // Not found: the new element lands at the old size.
    ce32s.addElement((int32_t)ce32, errorCode);
    return oldSize;
}
534
535
/**
 * Appends a new ConditionalCE32 for the given (non-empty) context and CE32.
 * Sets U_BUFFER_OVERFLOW_ERROR if the new index would exceed Collation::MAX_INDEX.
 *
 * @return the index of the new element in conditionalCE32s, or -1 on failure
 */
int32_t
CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32,
                                         UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return -1; }
    U_ASSERT(!context.isEmpty());
    const int32_t newIndex = conditionalCE32s.size();
    if(newIndex > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return -1;
    }
    LocalPointer<ConditionalCE32> entry(new ConditionalCE32(context, ce32), errorCode);
    // Ownership passes to the vector.
    conditionalCE32s.adoptElement(entry.orphan(), errorCode);
    return U_FAILURE(errorCode) ? -1 : newIndex;
}
552
553
void
554
CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s,
555
const int64_t ces[], int32_t cesLength,
556
UErrorCode &errorCode) {
557
uint32_t ce32 = encodeCEs(ces, cesLength, errorCode);
558
addCE32(prefix, s, ce32, errorCode);
559
}
560
561
void
562
CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s,
563
uint32_t ce32, UErrorCode &errorCode) {
564
if(U_FAILURE(errorCode)) { return; }
565
if(s.isEmpty()) {
566
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
567
return;
568
}
569
if(trie == NULL || utrie2_isFrozen(trie)) {
570
errorCode = U_INVALID_STATE_ERROR;
571
return;
572
}
573
UChar32 c = s.char32At(0);
574
int32_t cLength = U16_LENGTH(c);
575
uint32_t oldCE32 = utrie2_get32(trie, c);
576
UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
577
578
if (icu4xMode) {
579
if (base && c >= 0x1100 && c < 0x1200) {
580
// Omit jamo tailorings.
581
// TODO(https://github.com/unicode-org/icu4x/issues/1941).
582
}
583
const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode);
584
UnicodeString sInNfd;
585
nfdNormalizer->normalize(s, sInNfd, errorCode);
586
if (s != sInNfd) {
587
// s is not in NFD, so it cannot match in ICU4X, since ICU4X only
588
// does NFD lookups.
589
// Now check that we're only rejecting known cases.
590
if (s.length() == 2) {
591
char16_t second = s.charAt(1);
592
if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) {
593
// Second is a special decomposing Tibetan vowel sign.
594
// These also get added in the decomposed form, so ignoring
595
// this instance is OK.
596
return;
597
}
598
if (c == 0xFDD1 && second == 0xAC00) {
599
// This strange contraction exists in the root and
600
// doesn't have a decomposed counterpart there.
601
// This won't match in ICU4X anyway and is very strange:
602
// Unassigned Arabic presentation form contracting with
603
// the very first Hangul syllable. Let's ignore this
604
// explicitly.
605
return;
606
}
607
}
608
// Unknown case worth investigating if ever found.
609
errorCode = U_UNSUPPORTED_ERROR;
610
return;
611
}
612
613
if (!prefix.isEmpty()) {
614
UnicodeString prefixInNfd;
615
nfdNormalizer->normalize(prefix, prefixInNfd, errorCode);
616
if (prefix != prefixInNfd) {
617
errorCode = U_UNSUPPORTED_ERROR;
618
return;
619
}
620
621
int32_t count = prefix.countChar32();
622
if (count > 2) {
623
// Prefix too long for ICU4X.
624
errorCode = U_UNSUPPORTED_ERROR;
625
return;
626
}
627
UChar32 utf32[4];
628
int32_t len = prefix.toUTF32(utf32, 4, errorCode);
629
if (len != count) {
630
errorCode = U_INVALID_STATE_ERROR;
631
return;
632
}
633
UChar32 c = utf32[0];
634
if (u_getCombiningClass(c)) {
635
// Prefix must start with as starter for ICU4X.
636
errorCode = U_UNSUPPORTED_ERROR;
637
return;
638
}
639
// XXX: Korean searchjl has jamo in prefix, so commenting out this
640
// check for now. ICU4X currently ignores non-root jamo tables anyway.
641
// searchjl was added in
642
// https://unicode-org.atlassian.net/browse/CLDR-3560
643
// Contractions were changed to prefixes in
644
// https://unicode-org.atlassian.net/browse/CLDR-6546
645
//
646
// if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) {
647
// errorCode = U_UNSUPPORTED_ERROR;
648
// return;
649
// }
650
if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) {
651
// Second character in prefix, if present, must be a kana voicing mark for ICU4X.
652
errorCode = U_UNSUPPORTED_ERROR;
653
return;
654
}
655
}
656
657
if (s.length() > cLength) {
658
// Check that there's no modern Hangul in contractions.
659
for (int32_t i = 0; i < s.length(); ++i) {
660
UChar c = s.charAt(i);
661
if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) {
662
errorCode = U_UNSUPPORTED_ERROR;
663
return;
664
}
665
}
666
}
667
}
668
669
if(oldCE32 == Collation::FALLBACK_CE32) {
670
// First tailoring for c.
671
// If c has contextual base mappings or if we add a contextual mapping,
672
// then copy the base mappings.
673
// Otherwise we just override the base mapping.
674
uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c));
675
if(hasContext || Collation::ce32HasContext(baseCE32)) {
676
oldCE32 = copyFromBaseCE32(c, baseCE32, true, errorCode);
677
utrie2_set32(trie, c, oldCE32, &errorCode);
678
if(U_FAILURE(errorCode)) { return; }
679
}
680
}
681
if(!hasContext) {
682
// No prefix, no contraction.
683
if(!isBuilderContextCE32(oldCE32)) {
684
utrie2_set32(trie, c, ce32, &errorCode);
685
} else {
686
ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32);
687
cond->builtCE32 = Collation::NO_CE32;
688
cond->ce32 = ce32;
689
}
690
} else {
691
ConditionalCE32 *cond;
692
if(!isBuilderContextCE32(oldCE32)) {
693
// Replace the simple oldCE32 with a builder context CE32
694
// pointing to a new ConditionalCE32 list head.
695
int32_t index = addConditionalCE32(UnicodeString((UChar)0), oldCE32, errorCode);
696
if(U_FAILURE(errorCode)) { return; }
697
uint32_t contextCE32 = makeBuilderContextCE32(index);
698
utrie2_set32(trie, c, contextCE32, &errorCode);
699
contextChars.add(c);
700
cond = getConditionalCE32(index);
701
} else {
702
cond = getConditionalCE32ForCE32(oldCE32);
703
cond->builtCE32 = Collation::NO_CE32;
704
}
705
UnicodeString suffix(s, cLength);
706
UnicodeString context((UChar)prefix.length());
707
context.append(prefix).append(suffix);
708
unsafeBackwardSet.addAll(suffix);
709
for(;;) {
710
// invariant: context > cond->context
711
int32_t next = cond->next;
712
if(next < 0) {
713
// Append a new ConditionalCE32 after cond.
714
int32_t index = addConditionalCE32(context, ce32, errorCode);
715
if(U_FAILURE(errorCode)) { return; }
716
cond->next = index;
717
break;
718
}
719
ConditionalCE32 *nextCond = getConditionalCE32(next);
720
int8_t cmp = context.compare(nextCond->context);
721
if(cmp < 0) {
722
// Insert a new ConditionalCE32 between cond and nextCond.
723
int32_t index = addConditionalCE32(context, ce32, errorCode);
724
if(U_FAILURE(errorCode)) { return; }
725
cond->next = index;
726
getConditionalCE32(index)->next = next;
727
break;
728
} else if(cmp == 0) {
729
// Same context as before, overwrite its ce32.
730
nextCond->ce32 = ce32;
731
break;
732
}
733
cond = nextCond;
734
}
735
}
736
modified = true;
737
}
738
739
/**
 * Tries to encode one CE as one non-indirect CE32.
 * @return the CE32 in normal, long-primary or long-secondary form,
 *         or Collation::NO_CE32 if the CE does not fit any of these forms
 */
uint32_t
CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) {
    const uint32_t primary = (uint32_t)(ce >> 32);
    const uint32_t lower32 = (uint32_t)ce;
    const uint32_t tertiary = (uint32_t)(ce & 0xffff);
    U_ASSERT((tertiary & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
    if((ce & INT64_C(0xffff00ff00ff)) == 0) {
        // normal form ppppsstt
        return primary | (lower32 >> 16) | (tertiary >> 8);
    }
    if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) {
        // long-primary form ppppppC1
        return Collation::makeLongPrimaryCE32(primary);
    }
    if(primary == 0 && (tertiary & 0xff) == 0) {
        // long-secondary form ssssttC2
        return Collation::makeLongSecondaryCE32(lower32);
    }
    return Collation::NO_CE32;
}
757
758
/**
 * Encodes one CE as a CE32: directly if possible, otherwise as an
 * EXPANSION_TAG CE32 of length 1 that points to the CE stored in ce64s.
 * Sets U_BUFFER_OVERFLOW_ERROR if the index exceeds Collation::MAX_INDEX.
 *
 * @return the CE32, or 0 on failure
 */
uint32_t
CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) {
    // Try to encode one CE as one CE32.
    const uint32_t direct = encodeOneCEAsCE32(ce);
    if(direct == Collation::NO_CE32) {
        const int32_t index = addCE(ce, errorCode);
        if(U_FAILURE(errorCode)) { return 0; }
        if(index > Collation::MAX_INDEX) {
            errorCode = U_BUFFER_OVERFLOW_ERROR;
            return 0;
        }
        return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1);
    }
    return direct;
}
771
772
/**
 * Encodes a sequence of 0..Collation::MAX_EXPANSION_LENGTH CEs as one CE32.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for a bad length and U_INVALID_STATE_ERROR
 * if there is no trie or the trie is frozen.
 *
 * @return the CE32, or 0 on failure
 */
uint32_t
CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
                                UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(trie == NULL || utrie2_isFrozen(trie)) {
        errorCode = U_INVALID_STATE_ERROR;
        return 0;
    }
    if(cesLength == 0) {
        // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
        // Do this here so that callers need not do it.
        return encodeOneCEAsCE32(0);
    }
    if(cesLength == 1) {
        return encodeOneCE(ces[0], errorCode);
    }
    if(cesLength == 2 && !icu4xMode) {
        // Try to encode two CEs as one CE32.
        // Turn this off for ICU4X, because without the canonical closure
        // these are so rare that it doesn't make sense to spend a branch
        // on checking this tag when using the data.
        const int64_t first = ces[0];
        const int64_t second = ces[1];
        const uint32_t firstPrimary = (uint32_t)(first >> 32);
        const UBool firstFits =
            (first & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE;
        const UBool secondFits =
            (second & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE;
        if(firstFits && secondFits && firstPrimary != 0) {
            // Latin mini expansion
            return
                firstPrimary |
                (((uint32_t)first & 0xff00u) << 8) |
                (uint32_t)(second >> 16) |
                Collation::SPECIAL_CE32_LOW_BYTE |
                Collation::LATIN_EXPANSION_TAG;
        }
    }
    // Try to encode two or more CEs as CE32s.
    int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH];
    int32_t count = 0;
    while(count < cesLength) {
        const uint32_t ce32 = encodeOneCEAsCE32(ces[count]);
        if(ce32 == Collation::NO_CE32) {
            // At least one CE needs the full 64 bits: store the CEs themselves.
            return encodeExpansion(ces, cesLength, errorCode);
        }
        newCE32s[count] = (int32_t)ce32;
        ++count;
    }
    return encodeExpansion32(newCE32s, cesLength, errorCode);
}
822
823
/**
 * Stores a sequence of CEs in ce64s, reusing an identical stored sequence if
 * there is one, and returns an EXPANSION_TAG CE32 with its index and length.
 * Sets U_BUFFER_OVERFLOW_ERROR if the index exceeds Collation::MAX_INDEX.
 *
 * @return the CE32, or 0 on failure
 */
uint32_t
CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    // See if this sequence of CEs has already been stored.
    const int64_t first = ces[0];
    const int32_t lastStart = ce64s.size() - length;
    for(int32_t start = 0; start <= lastStart; ++start) {
        if(ce64s.elementAti(start) != first) { continue; }
        if(start > Collation::MAX_INDEX) {
            errorCode = U_BUFFER_OVERFLOW_ERROR;
            return 0;
        }
        // Compare the remaining elements of the candidate.
        int32_t matched = 1;
        while(matched != length && ce64s.elementAti(start + matched) == ces[matched]) {
            ++matched;
        }
        if(matched == length) {
            return Collation::makeCE32FromTagIndexAndLength(
                    Collation::EXPANSION_TAG, start, length);
        }
    }
    // Store the new sequence.
    const int32_t newIndex = ce64s.size();
    if(newIndex > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return 0;
    }
    int32_t added = 0;
    while(added < length) {
        ce64s.addElement(ces[added], errorCode);
        ++added;
    }
    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, newIndex, length);
}
855
856
/**
 * Stores a sequence of CE32s in ce32s, reusing an identical stored sequence if
 * there is one, and returns an EXPANSION32_TAG CE32 with its index and length.
 * Sets U_BUFFER_OVERFLOW_ERROR if the index exceeds Collation::MAX_INDEX.
 *
 * @return the CE32, or 0 on failure
 */
uint32_t
CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length,
                                        UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    // See if this sequence of CE32s has already been stored.
    const int32_t first = newCE32s[0];
    const int32_t lastStart = ce32s.size() - length;
    for(int32_t start = 0; start <= lastStart; ++start) {
        if(ce32s.elementAti(start) != first) { continue; }
        if(start > Collation::MAX_INDEX) {
            errorCode = U_BUFFER_OVERFLOW_ERROR;
            return 0;
        }
        // Compare the remaining elements of the candidate.
        int32_t matched = 1;
        while(matched != length && ce32s.elementAti(start + matched) == newCE32s[matched]) {
            ++matched;
        }
        if(matched == length) {
            return Collation::makeCE32FromTagIndexAndLength(
                    Collation::EXPANSION32_TAG, start, length);
        }
    }
    // Store the new sequence.
    const int32_t newIndex = ce32s.size();
    if(newIndex > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return 0;
    }
    int32_t added = 0;
    while(added < length) {
        ce32s.addElement(newCE32s[added], errorCode);
        ++added;
    }
    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, newIndex, length);
}
889
890
/**
 * Converts a CE32 taken from the base data into a CE32 that is valid in this builder.
 *
 * Non-special CE32s and self-contained special ones are returned unchanged.
 * Expansions are re-encoded into this builder's own arrays.
 * Prefix and contraction data is either reduced to its default mapping
 * (withContext == false) or flattened into a list of ConditionalCE32
 * (withContext == true), in which case c is also added to contextChars.
 *
 * @param c           the code point that ce32 belongs to
 * @param ce32        the CE32 from the base data
 * @param withContext whether contextual mappings are to be copied as well
 * @param errorCode   in/out ICU error code; set to U_UNSUPPORTED_ERROR for a Hangul CE32
 * @return the CE32 for this builder, or 0 if a failure is detected early
 */
uint32_t
CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext,
                                       UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(!Collation::isSpecialCE32(ce32)) { return ce32; }
    switch(Collation::tagFromCE32(ce32)) {
    case Collation::LONG_PRIMARY_TAG:
    case Collation::LONG_SECONDARY_TAG:
    case Collation::LATIN_EXPANSION_TAG:
        // Self-contained values: keep them unchanged.
        break;
    case Collation::EXPANSION32_TAG: {
        const int32_t count = Collation::lengthFromCE32(ce32);
        const uint32_t *fromBase = base->ce32s + Collation::indexFromCE32(ce32);
        ce32 = encodeExpansion32(
            reinterpret_cast<const int32_t *>(fromBase), count, errorCode);
        break;
    }
    case Collation::EXPANSION_TAG: {
        const int32_t count = Collation::lengthFromCE32(ce32);
        const int64_t *fromBase = base->ces + Collation::indexFromCE32(ce32);
        ce32 = encodeExpansion(fromBase, count, errorCode);
        break;
    }
    case Collation::PREFIX_TAG: {
        // Turn the prefixes, and any contractions nested below them,
        // into one flat list of ConditionalCE32.
        const UChar *prefixData = base->contexts + Collation::indexFromCE32(ce32);
        ce32 = CollationData::readCE32(prefixData);  // Used when no prefix matches.
        if(!withContext) {
            return copyFromBaseCE32(c, ce32, false, errorCode);
        }
        ConditionalCE32 listHead;
        UnicodeString context((UChar)0);
        int32_t lastIndex;
        if(Collation::isContractionCE32(ce32)) {
            lastIndex = copyContractionsFromBaseCE32(context, c, ce32, &listHead, errorCode);
        } else {
            ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
            lastIndex = addConditionalCE32(context, ce32, errorCode);
            listHead.next = lastIndex;
        }
        if(U_FAILURE(errorCode)) { return 0; }
        // tail always points at the most recently appended list node.
        ConditionalCE32 *tail = getConditionalCE32(lastIndex);
        UCharsTrie::Iterator prefixIter(prefixData + 2, 0, errorCode);
        while(prefixIter.next(errorCode)) {
            // Build the context string: length unit followed by the reversed trie string.
            context = prefixIter.getString();
            context.reverse();
            context.insert(0, (UChar)context.length());
            ce32 = static_cast<uint32_t>(prefixIter.getValue());
            if(Collation::isContractionCE32(ce32)) {
                lastIndex = copyContractionsFromBaseCE32(context, c, ce32, tail, errorCode);
            } else {
                ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
                lastIndex = addConditionalCE32(context, ce32, errorCode);
                tail->next = lastIndex;
            }
            if(U_FAILURE(errorCode)) { return 0; }
            tail = getConditionalCE32(lastIndex);
        }
        ce32 = makeBuilderContextCE32(listHead.next);
        contextChars.add(c);
        break;
    }
    case Collation::CONTRACTION_TAG: {
        if(!withContext) {
            const UChar *contractionData = base->contexts + Collation::indexFromCE32(ce32);
            ce32 = CollationData::readCE32(contractionData);  // Used when no suffix matches.
            return copyFromBaseCE32(c, ce32, false, errorCode);
        }
        ConditionalCE32 listHead;
        UnicodeString context((UChar)0);
        copyContractionsFromBaseCE32(context, c, ce32, &listHead, errorCode);
        ce32 = makeBuilderContextCE32(listHead.next);
        contextChars.add(c);
        break;
    }
    case Collation::HANGUL_TAG:
        // Tailoring of Hangul syllables is not permitted.
        errorCode = U_UNSUPPORTED_ERROR;
        break;
    case Collation::OFFSET_TAG:
        ce32 = getCE32FromOffsetCE32(true, c, ce32);
        break;
    case Collation::IMPLICIT_TAG:
        ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode);
        break;
    default:
        UPRV_UNREACHABLE_EXIT;  // require ce32 == base->getFinalCE32(ce32)
    }
    return ce32;
}
979
980
/**
 * Copies the contraction mappings behind a base-data contraction CE32
 * into the list of ConditionalCE32, appending after cond.
 *
 * @param context   in: the prefix part of the context string;
 *                  temporarily extended with each suffix and restored before returning
 * @param c         the code point that the mappings belong to
 * @param ce32      the contraction CE32 from the base data
 * @param cond      the list node after which new nodes are linked
 * @param errorCode in/out ICU error code
 * @return the index of the last ConditionalCE32 that was appended, or 0 on failure
 */
int32_t
CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
                                                   ConditionalCE32 *cond, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    const UChar *contractionData = base->contexts + Collation::indexFromCE32(ce32);
    int32_t lastIndex;
    if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) == 0) {
        // There is a default mapping for when no suffix matches; copy it first.
        ce32 = CollationData::readCE32(contractionData);
        U_ASSERT(!Collation::isContractionCE32(ce32));
        ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
        lastIndex = addConditionalCE32(context, ce32, errorCode);
        cond->next = lastIndex;
        if(U_FAILURE(errorCode)) { return 0; }
        cond = getConditionalCE32(lastIndex);
    } else {
        // The single code point has no mapping of its own.
        // This happens underneath a prefix, where the default mapping is merely
        // a fallback to the mappings for a shorter prefix.
        U_ASSERT(context.length() > 1);
        lastIndex = -1;
    }

    const int32_t prefixEnd = context.length();
    UCharsTrie::Iterator suffixIter(contractionData + 2, 0, errorCode);
    while(suffixIter.next(errorCode)) {
        context.append(suffixIter.getString());
        ce32 = copyFromBaseCE32(c, static_cast<uint32_t>(suffixIter.getValue()), true, errorCode);
        lastIndex = addConditionalCE32(context, ce32, errorCode);
        cond->next = lastIndex;
        if(U_FAILURE(errorCode)) { return 0; }
        // The unsafeBackwardSet needs no update here:
        // the tailoring set already starts out as a copy of the base set.
        cond = getConditionalCE32(lastIndex);
        // Strip the suffix again so that the next one is appended to the bare prefix.
        context.truncate(prefixEnd);
    }
    U_ASSERT(lastIndex >= 0);
    return lastIndex;
}
1016
1017
class CopyHelper {
1018
public:
1019
CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d,
1020
const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode)
1021
: src(s), dest(d), modifier(m),
1022
errorCode(initialErrorCode) {}
1023
1024
UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) {
1025
ce32 = copyCE32(ce32);
1026
utrie2_setRange32(dest.trie, start, end, ce32, true, &errorCode);
1027
if(CollationDataBuilder::isBuilderContextCE32(ce32)) {
1028
dest.contextChars.add(start, end);
1029
}
1030
return U_SUCCESS(errorCode);
1031
}
1032
1033
uint32_t copyCE32(uint32_t ce32) {
1034
if(!Collation::isSpecialCE32(ce32)) {
1035
int64_t ce = modifier.modifyCE32(ce32);
1036
if(ce != Collation::NO_CE) {
1037
ce32 = dest.encodeOneCE(ce, errorCode);
1038
}
1039
} else {
1040
int32_t tag = Collation::tagFromCE32(ce32);
1041
if(tag == Collation::EXPANSION32_TAG) {
1042
const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer());
1043
srcCE32s += Collation::indexFromCE32(ce32);
1044
int32_t length = Collation::lengthFromCE32(ce32);
1045
// Inspect the source CE32s. Just copy them if none are modified.
1046
// Otherwise copy to modifiedCEs, with modifications.
1047
UBool isModified = false;
1048
for(int32_t i = 0; i < length; ++i) {
1049
ce32 = srcCE32s[i];
1050
int64_t ce;
1051
if(Collation::isSpecialCE32(ce32) ||
1052
(ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) {
1053
if(isModified) {
1054
modifiedCEs[i] = Collation::ceFromCE32(ce32);
1055
}
1056
} else {
1057
if(!isModified) {
1058
for(int32_t j = 0; j < i; ++j) {
1059
modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]);
1060
}
1061
isModified = true;
1062
}
1063
modifiedCEs[i] = ce;
1064
}
1065
}
1066
if(isModified) {
1067
ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
1068
} else {
1069
ce32 = dest.encodeExpansion32(
1070
reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode);
1071
}
1072
} else if(tag == Collation::EXPANSION_TAG) {
1073
const int64_t *srcCEs = src.ce64s.getBuffer();
1074
srcCEs += Collation::indexFromCE32(ce32);
1075
int32_t length = Collation::lengthFromCE32(ce32);
1076
// Inspect the source CEs. Just copy them if none are modified.
1077
// Otherwise copy to modifiedCEs, with modifications.
1078
UBool isModified = false;
1079
for(int32_t i = 0; i < length; ++i) {
1080
int64_t srcCE = srcCEs[i];
1081
int64_t ce = modifier.modifyCE(srcCE);
1082
if(ce == Collation::NO_CE) {
1083
if(isModified) {
1084
modifiedCEs[i] = srcCE;
1085
}
1086
} else {
1087
if(!isModified) {
1088
for(int32_t j = 0; j < i; ++j) {
1089
modifiedCEs[j] = srcCEs[j];
1090
}
1091
isModified = true;
1092
}
1093
modifiedCEs[i] = ce;
1094
}
1095
}
1096
if(isModified) {
1097
ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
1098
} else {
1099
ce32 = dest.encodeExpansion(srcCEs, length, errorCode);
1100
}
1101
} else if(tag == Collation::BUILDER_DATA_TAG) {
1102
// Copy the list of ConditionalCE32.
1103
ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32);
1104
U_ASSERT(!cond->hasContext());
1105
int32_t destIndex = dest.addConditionalCE32(
1106
cond->context, copyCE32(cond->ce32), errorCode);
1107
ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex);
1108
while(cond->next >= 0) {
1109
cond = src.getConditionalCE32(cond->next);
1110
ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex);
1111
destIndex = dest.addConditionalCE32(
1112
cond->context, copyCE32(cond->ce32), errorCode);
1113
int32_t suffixStart = cond->prefixLength() + 1;
1114
dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart));
1115
prevDestCond->next = destIndex;
1116
}
1117
} else {
1118
// Just copy long CEs and Latin mini expansions (and other expected values) as is,
1119
// assuming that the modifier would not modify them.
1120
U_ASSERT(tag == Collation::LONG_PRIMARY_TAG ||
1121
tag == Collation::LONG_SECONDARY_TAG ||
1122
tag == Collation::LATIN_EXPANSION_TAG ||
1123
tag == Collation::HANGUL_TAG);
1124
}
1125
}
1126
return ce32;
1127
}
1128
1129
const CollationDataBuilder &src;
1130
CollationDataBuilder &dest;
1131
const CollationDataBuilder::CEModifier &modifier;
1132
int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH];
1133
UErrorCode errorCode;
1134
};
1135
1136
U_CDECL_BEGIN
1137
1138
/**
 * Range callback for utrie2_enum(), used by CollationDataBuilder::copyFrom().
 * Ranges that are unassigned or fall back to the base are skipped;
 * every other range is handed to the CopyHelper passed in via context.
 * @return true to continue the enumeration, false to stop it
 */
static UBool U_CALLCONV
enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    if(value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32) {
        // Nothing to copy for this range.
        return true;
    }
    CopyHelper *helper = (CopyHelper *)context;
    return helper->copyRangeCE32(start, end, value);
}
1144
1145
U_CDECL_END
1146
1147
void
1148
CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
1149
UErrorCode &errorCode) {
1150
if(U_FAILURE(errorCode)) { return; }
1151
if(trie == NULL || utrie2_isFrozen(trie)) {
1152
errorCode = U_INVALID_STATE_ERROR;
1153
return;
1154
}
1155
CopyHelper helper(src, *this, modifier, errorCode);
1156
utrie2_enum(src.trie, NULL, enumRangeForCopy, &helper);
1157
errorCode = helper.errorCode;
1158
// Update the contextChars and the unsafeBackwardSet while copying,
1159
// in case a character had conditional mappings in the source builder
1160
// and they were removed later.
1161
modified |= src.modified;
1162
}
1163
1164
void
1165
CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) {
1166
if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1167
UnicodeSetIterator iter(set);
1168
while(iter.next() && !iter.isString()) {
1169
UChar32 c = iter.getCodepoint();
1170
uint32_t ce32 = utrie2_get32(trie, c);
1171
if(ce32 == Collation::FALLBACK_CE32) {
1172
ce32 = base->getFinalCE32(base->getCE32(c));
1173
ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
1174
utrie2_set32(trie, c, ce32, &errorCode);
1175
}
1176
}
1177
modified = true;
1178
}
1179
1180
void
1181
CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) {
1182
if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1183
UnicodeSetIterator iter(set);
1184
while(iter.next() && !iter.isString()) {
1185
UChar32 c = iter.getCodepoint();
1186
uint32_t ce32 = utrie2_get32(trie, c);
1187
if(ce32 == Collation::FALLBACK_CE32) {
1188
ce32 = base->getFinalCE32(base->getCE32(c));
1189
if(Collation::ce32HasContext(ce32)) {
1190
ce32 = copyFromBaseCE32(c, ce32, false /* without context */, errorCode);
1191
utrie2_set32(trie, c, ce32, &errorCode);
1192
}
1193
} else if(isBuilderContextCE32(ce32)) {
1194
ce32 = getConditionalCE32ForCE32(ce32)->ce32;
1195
// Simply abandon the list of ConditionalCE32.
1196
// The caller will copy this builder in the end,
1197
// eliminating unreachable data.
1198
utrie2_set32(trie, c, ce32, &errorCode);
1199
contextChars.remove(c);
1200
}
1201
}
1202
modified = true;
1203
}
1204
1205
/**
 * Collects the CE32s for the conjoining Jamo into jamoCE32s.
 *
 * Mappings that would have to be copied from the base are only copied
 * if at least one Jamo has an assigned mapping in this builder
 * (or if there is no base at all).
 *
 * @param jamoCE32s output array with CollationData::JAMO_CE32S_LENGTH entries
 * @param errorCode in/out ICU error code; set to U_INTERNAL_PROGRAM_ERROR
 *                  if a Jamo has a CE32 with an unexpected tag
 * @return true if the Jamo CE32s are to be used and no error occurred
 */
UBool
CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return false; }
    UBool anyJamoAssigned = base == NULL;  // always set jamoCE32s in the base data
    UBool needToCopyFromBase = false;
    // First pass, across all Jamo types.
    for(int32_t slot = 0; slot < CollationData::JAMO_CE32S_LENGTH; ++slot) {
        const UChar32 jamo = jamoCpFromIndex(slot);
        uint32_t ce32 = utrie2_get32(trie, jamo);
        if(Collation::isAssignedCE32(ce32)) {
            anyJamoAssigned = true;
        }
        // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
        // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
        const UBool fromBase = ce32 == Collation::FALLBACK_CE32;
        if(fromBase) {
            ce32 = base->getCE32(jamo);
        }
        if(Collation::isSpecialCE32(ce32)) {
            switch(Collation::tagFromCE32(ce32)) {
            case Collation::LONG_PRIMARY_TAG:
            case Collation::LONG_SECONDARY_TAG:
            case Collation::LATIN_EXPANSION_TAG:
                // Usable as is.
                break;
            case Collation::EXPANSION32_TAG:
            case Collation::EXPANSION_TAG:
            case Collation::PREFIX_TAG:
            case Collation::CONTRACTION_TAG:
                if(fromBase) {
                    // Postpone the copy until it is known whether anyJamoAssigned.
                    ce32 = Collation::FALLBACK_CE32;
                    needToCopyFromBase = true;
                }
                break;
            case Collation::IMPLICIT_TAG:
                // An unassigned Jamo should only occur in tests with incomplete bases.
                U_ASSERT(fromBase);
                ce32 = Collation::FALLBACK_CE32;
                needToCopyFromBase = true;
                break;
            case Collation::OFFSET_TAG:
                ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
                break;
            case Collation::FALLBACK_TAG:
            case Collation::RESERVED_TAG_3:
            case Collation::BUILDER_DATA_TAG:
            case Collation::DIGIT_TAG:
            case Collation::U0000_TAG:
            case Collation::HANGUL_TAG:
            case Collation::LEAD_SURROGATE_TAG:
                errorCode = U_INTERNAL_PROGRAM_ERROR;
                return false;
            }
        }
        jamoCE32s[slot] = ce32;
    }
    // Second pass: resolve the postponed entries, marked with FALLBACK_CE32.
    if(anyJamoAssigned && needToCopyFromBase) {
        for(int32_t slot = 0; slot < CollationData::JAMO_CE32S_LENGTH; ++slot) {
            if(jamoCE32s[slot] != Collation::FALLBACK_CE32) { continue; }
            const UChar32 jamo = jamoCpFromIndex(slot);
            jamoCE32s[slot] = copyFromBaseCE32(jamo, base->getCE32(jamo),
                                               /*withContext=*/ true, errorCode);
        }
    }
    return anyJamoAssigned && U_SUCCESS(errorCode);
}
1271
1272
void
1273
CollationDataBuilder::setDigitTags(UErrorCode &errorCode) {
1274
UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode);
1275
if(U_FAILURE(errorCode)) { return; }
1276
UnicodeSetIterator iter(digits);
1277
while(iter.next()) {
1278
U_ASSERT(!iter.isString());
1279
UChar32 c = iter.getCodepoint();
1280
uint32_t ce32 = utrie2_get32(trie, c);
1281
if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) {
1282
int32_t index = addCE32(ce32, errorCode);
1283
if(U_FAILURE(errorCode)) { return; }
1284
if(index > Collation::MAX_INDEX) {
1285
errorCode = U_BUFFER_OVERFLOW_ERROR;
1286
return;
1287
}
1288
ce32 = Collation::makeCE32FromTagIndexAndLength(
1289
Collation::DIGIT_TAG, index, u_charDigitValue(c));
1290
utrie2_set32(trie, c, ce32, &errorCode);
1291
}
1292
}
1293
}
1294
1295
U_CDECL_BEGIN
1296
1297
/**
 * Range callback for utrie2_enumForLeadSurrogate().
 * context points to an int32_t summary that starts out negative and ends up as
 * LEAD_ALL_UNASSIGNED, LEAD_ALL_FALLBACK, or LEAD_MIXED.
 * @return true to continue the enumeration,
 *         false once the summary has become LEAD_MIXED
 */
static UBool U_CALLCONV
enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
    int32_t *summary = (int32_t *)context;
    // Classify this range; anything with a real mapping makes the result mixed.
    if(value == Collation::UNASSIGNED_CE32) {
        value = Collation::LEAD_ALL_UNASSIGNED;
    } else if(value == Collation::FALLBACK_CE32) {
        value = Collation::LEAD_ALL_FALLBACK;
    } else {
        *summary = Collation::LEAD_MIXED;
        return false;
    }
    const int32_t rangeKind = (int32_t)value;
    if(*summary < 0) {
        // First range seen: remember its kind.
        *summary = rangeKind;
        return true;
    }
    if(*summary == rangeKind) {
        return true;
    }
    // Two different kinds of ranges under the same lead surrogate.
    *summary = Collation::LEAD_MIXED;
    return false;
}
1316
1317
U_CDECL_END
1318
1319
void
1320
CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) {
1321
for(UChar lead = 0xd800; lead < 0xdc00; ++lead) {
1322
int32_t value = -1;
1323
utrie2_enumForLeadSurrogate(trie, lead, NULL, enumRangeLeadValue, &value);
1324
utrie2_set32ForLeadSurrogateCodeUnit(
1325
trie, lead,
1326
Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value,
1327
&errorCode);
1328
}
1329
}
1330
1331
void
1332
CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
1333
buildMappings(data, errorCode);
1334
if(base != NULL) {
1335
data.numericPrimary = base->numericPrimary;
1336
data.compressibleBytes = base->compressibleBytes;
1337
data.numScripts = base->numScripts;
1338
data.scriptsIndex = base->scriptsIndex;
1339
data.scriptStarts = base->scriptStarts;
1340
data.scriptStartsLength = base->scriptStartsLength;
1341
}
1342
buildFastLatinTable(data, errorCode);
1343
}
1344
1345
void
1346
CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) {
1347
if(U_FAILURE(errorCode)) { return; }
1348
if(trie == NULL || utrie2_isFrozen(trie)) {
1349
errorCode = U_INVALID_STATE_ERROR;
1350
return;
1351
}
1352
1353
buildContexts(errorCode);
1354
1355
uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
1356
int32_t jamoIndex = -1;
1357
if(getJamoCE32s(jamoCE32s, errorCode)) {
1358
jamoIndex = ce32s.size();
1359
for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1360
ce32s.addElement((int32_t)jamoCE32s[i], errorCode);
1361
}
1362
// Small optimization: Use a bit in the Hangul ce32
1363
// to indicate that none of the Jamo CE32s are isSpecialCE32()
1364
// (as it should be in the root collator).
1365
// It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
1366
// In order to still have good trie compression and keep this code simple,
1367
// we only set this flag if a whole block of 588 Hangul syllables starting with
1368
// a common leading consonant (Jamo L) has this property.
1369
UBool isAnyJamoVTSpecial = false;
1370
for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1371
if(Collation::isSpecialCE32(jamoCE32s[i])) {
1372
isAnyJamoVTSpecial = true;
1373
break;
1374
}
1375
}
1376
uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
1377
UChar32 c = Hangul::HANGUL_BASE;
1378
for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L
1379
uint32_t ce32 = hangulCE32;
1380
if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) {
1381
ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO;
1382
}
1383
UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1384
utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode);
1385
c = limit;
1386
}
1387
} else {
1388
// Copy the Hangul CE32s from the base in blocks per Jamo L,
1389
// assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
1390
for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) {
1391
uint32_t ce32 = base->getCE32(c);
1392
U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG));
1393
UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1394
utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode);
1395
c = limit;
1396
}
1397
}
1398
1399
setDigitTags(errorCode);
1400
setLeadSurrogates(errorCode);
1401
1402
if (!icu4xMode) {
1403
// For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1404
ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
1405
utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
1406
}
1407
1408
utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode);
1409
if(U_FAILURE(errorCode)) { return; }
1410
1411
// Mark each lead surrogate as "unsafe"
1412
// if any of its 1024 associated supplementary code points is "unsafe".
1413
UChar32 c = 0x10000;
1414
for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
1415
if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
1416
unsafeBackwardSet.add(lead);
1417
}
1418
}
1419
unsafeBackwardSet.freeze();
1420
1421
data.trie = trie;
1422
data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer());
1423
data.ces = ce64s.getBuffer();
1424
data.contexts = contexts.getBuffer();
1425
1426
data.ce32sLength = ce32s.size();
1427
data.cesLength = ce64s.size();
1428
data.contextsLength = contexts.length();
1429
1430
data.base = base;
1431
if(jamoIndex >= 0) {
1432
data.jamoCE32s = data.ce32s + jamoIndex;
1433
} else {
1434
data.jamoCE32s = base->jamoCE32s;
1435
}
1436
data.unsafeBackwardSet = &unsafeBackwardSet;
1437
}
1438
1439
void
1440
CollationDataBuilder::clearContexts() {
1441
contexts.remove();
1442
// Incrementing the contexts build "era" invalidates all of the builtCE32
1443
// from before this clearContexts() call.
1444
// Simpler than finding and resetting all of those fields.
1445
++contextsEra;
1446
}
1447
1448
void
1449
CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
1450
if(U_FAILURE(errorCode)) { return; }
1451
// Ignore abandoned lists and the cached builtCE32,
1452
// and build all contexts from scratch.
1453
clearContexts();
1454
UnicodeSetIterator iter(contextChars);
1455
while(U_SUCCESS(errorCode) && iter.next()) {
1456
U_ASSERT(!iter.isString());
1457
UChar32 c = iter.getCodepoint();
1458
uint32_t ce32 = utrie2_get32(trie, c);
1459
if(!isBuilderContextCE32(ce32)) {
1460
// Impossible: No context data for c in contextChars.
1461
errorCode = U_INTERNAL_PROGRAM_ERROR;
1462
return;
1463
}
1464
ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32);
1465
ce32 = buildContext(cond, errorCode);
1466
utrie2_set32(trie, c, ce32, &errorCode);
1467
}
1468
}
1469
1470
/**
 * Builds the runtime context data (prefix trie and contraction tries)
 * for one list of ConditionalCE32.
 *
 * @param head      the list head; it must have no context and
 *                  must be followed by at least one node
 * @param errorCode in/out ICU error code; set to U_BUFFER_OVERFLOW_ERROR
 *                  when a context index exceeds Collation::MAX_INDEX
 * @return a CONTRACTION_TAG CE32 if there are only contractions without prefixes,
 *         otherwise a PREFIX_TAG CE32; 0 on failure
 */
uint32_t
CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    // The list head must have no context.
    U_ASSERT(!head->hasContext());
    // The list head must be followed by one or more nodes that all do have context.
    U_ASSERT(head->next >= 0);
    UCharsTrieBuilder prefixBuilder(errorCode);
    UCharsTrieBuilder contractionBuilder(errorCode);
    // Each pass of this outer loop handles one prefix:
    // It determines the run of same-prefix entries (groupFirst..groupLast).
    // When a prefix has contraction suffixes,
    // an inner loop builds a contraction trie for them.
    for(ConditionalCE32 *node = head;; node = getConditionalCE32(node->next)) {
        if(U_FAILURE(errorCode)) { return 0; }  // early out for memory allocation errors
        // After the list head, the prefix or suffix can be empty, but not both.
        U_ASSERT(node == head || node->hasContext());
        const int32_t prefixLength = node->prefixLength();
        UnicodeString prefix(node->context, 0, prefixLength + 1);
        // Collect all contraction suffixes for one prefix.
        ConditionalCE32 *groupFirst = node;
        ConditionalCE32 *groupLast;
        for(;;) {
            groupLast = node;
            // Reset the defaultCE32 fields on the way.
            // They are left over from building a previous version of this list of contexts.
            //
            // One of the code paths below may copy a preceding defaultCE32
            // into its emptySuffixCE32.
            // If a new suffix has been inserted before what used to be
            // the first node for its prefix, then that previous first node could still
            // contain an outdated defaultCE32 from an earlier buildContext() and
            // result in an incorrect emptySuffixCE32.
            // Hence all defaultCE32 are reset before new values are read and set.
            node->defaultCE32 = Collation::NO_CE32;
            if(node->next < 0) { break; }
            node = getConditionalCE32(node->next);
            if(!node->context.startsWith(prefix)) { break; }
        }
        uint32_t ce32;
        const int32_t suffixStart = prefixLength + 1;  // == prefix.length()
        if(groupLast->context.length() == suffixStart) {
            // One prefix without contraction suffix.
            U_ASSERT(groupFirst == groupLast);
            ce32 = groupLast->ce32;
            node = groupLast;
        } else {
            // Build the contractions trie.
            contractionBuilder.clear();
            // Entry for an empty suffix, to be stored before the trie.
            uint32_t emptySuffixCE32 = 0;
            uint32_t flags = 0;
            if(groupFirst->context.length() == suffixStart) {
                // There is a mapping for the prefix and the single character c. (p|c)
                // It is the result when no other suffix matches.
                emptySuffixCE32 = groupFirst->ce32;
                node = getConditionalCE32(groupFirst->next);
            } else {
                // There is no mapping for the prefix and just the single character.
                // (There is no p|c, only p|cd, p|ce etc.)
                flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH;
                // When the prefix matches but none of the prefix-specific suffixes,
                // then we fall back to the mappings with the next-longest prefix,
                // and ultimately to mappings with no prefix.
                // Each fallback might be another set of contractions.
                // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
                // then in text "pch" we find the ch contraction.
                for(node = head;; node = getConditionalCE32(node->next)) {
                    const int32_t otherLength = node->prefixLength();
                    if(otherLength == prefixLength) { break; }
                    if(node->defaultCE32 != Collation::NO_CE32 &&
                            (otherLength == 0 || prefix.endsWith(node->context, 1, otherLength))) {
                        emptySuffixCE32 = node->defaultCE32;
                    }
                }
                node = groupFirst;
            }
            // Optimization: Set a flag when
            // the first character of every contraction suffix has lccc!=0.
            // Short-circuits contraction matching when a normal letter follows.
            flags |= Collation::CONTRACT_NEXT_CCC;
            // Add all of the non-empty suffixes into the contraction trie.
            for(;;) {
                UnicodeString suffix(node->context, suffixStart);
                const uint16_t firstFCD16 = nfcImpl.getFCD16(suffix.char32At(0));
                if(firstFCD16 <= 0xff) {
                    flags &= ~Collation::CONTRACT_NEXT_CCC;
                }
                const uint16_t lastFCD16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1));
                if(lastFCD16 > 0xff) {
                    // The last suffix character has lccc!=0, allowing for discontiguous contractions.
                    flags |= Collation::CONTRACT_TRAILING_CCC;
                }
                if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) {
                    // Look for a character with combining class 0 anywhere in the suffix.
                    int32_t pos = 0;
                    while (pos < suffix.length()) {
                        const UChar32 suffixChar = suffix.char32At(pos);
                        if (u_getCombiningClass(suffixChar) == 0) {
                            flags |= Collation::CONTRACT_HAS_STARTER;
                            break;
                        }
                        // Supplementary code points occupy two code units.
                        pos += (suffixChar > 0xFFFF) ? 2 : 1;
                    }
                }
                contractionBuilder.add(suffix, (int32_t)node->ce32, errorCode);
                if(node == groupLast) { break; }
                node = getConditionalCE32(node->next);
            }
            const int32_t contractionIndex =
                addContextTrie(emptySuffixCE32, contractionBuilder, errorCode);
            if(U_FAILURE(errorCode)) { return 0; }
            if(contractionIndex > Collation::MAX_INDEX) {
                errorCode = U_BUFFER_OVERFLOW_ERROR;
                return 0;
            }
            ce32 = Collation::makeCE32FromTagAndIndex(
                    Collation::CONTRACTION_TAG, contractionIndex) | flags;
        }
        U_ASSERT(node == groupLast);
        groupFirst->defaultCE32 = ce32;
        if(prefixLength == 0) {
            if(node->next < 0) {
                // No non-empty prefixes, only contractions.
                return ce32;
            }
        } else {
            prefix.remove(0, 1);  // Remove the length unit.
            prefix.reverse();
            prefixBuilder.add(prefix, (int32_t)ce32, errorCode);
            if(node->next < 0) { break; }
        }
    }
    U_ASSERT(head->defaultCE32 != Collation::NO_CE32);
    const int32_t prefixIndex = addContextTrie(head->defaultCE32, prefixBuilder, errorCode);
    if(U_FAILURE(errorCode)) { return 0; }
    if(prefixIndex > Collation::MAX_INDEX) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return 0;
    }
    return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, prefixIndex);
}
1610
1611
/**
 * Serializes a default CE32 (as two 16-bit units, upper half first)
 * followed by the trie from trieBuilder, and stores the result in contexts.
 * If contexts already contains an identical unit sequence, that one is reused.
 *
 * @param defaultCE32 the CE32 to store in front of the trie
 * @param trieBuilder builder holding the strings and values for the trie
 * @param errorCode   in/out ICU error code
 * @return the index of the serialized data in contexts, or -1 on failure
 */
int32_t
CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
                                     UErrorCode &errorCode) {
    UnicodeString serialized;
    serialized.append((UChar)(defaultCE32 >> 16));
    serialized.append((UChar)defaultCE32);
    UnicodeString trieString;
    const UnicodeString &builtTrie =
        trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode);
    serialized.append(builtTrie);
    if(U_FAILURE(errorCode)) { return -1; }
    const int32_t existing = contexts.indexOf(serialized);
    if(existing >= 0) {
        return existing;
    }
    const int32_t appendedAt = contexts.length();
    contexts.append(serialized);
    return appendedAt;
}
1626
1627
void
1628
CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) {
1629
if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; }
1630
1631
delete fastLatinBuilder;
1632
fastLatinBuilder = new CollationFastLatinBuilder(errorCode);
1633
if(fastLatinBuilder == NULL) {
1634
errorCode = U_MEMORY_ALLOCATION_ERROR;
1635
return;
1636
}
1637
if(fastLatinBuilder->forData(data, errorCode)) {
1638
const uint16_t *table = fastLatinBuilder->getTable();
1639
int32_t length = fastLatinBuilder->lengthOfTable();
1640
if(base != NULL && length == base->fastLatinTableLength &&
1641
uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) {
1642
// Same fast Latin table as in the base, use that one instead.
1643
delete fastLatinBuilder;
1644
fastLatinBuilder = NULL;
1645
table = base->fastLatinTable;
1646
}
1647
data.fastLatinTable = table;
1648
data.fastLatinTableLength = length;
1649
} else {
1650
delete fastLatinBuilder;
1651
fastLatinBuilder = NULL;
1652
}
1653
}
1654
1655
/**
 * Fetches the CEs for the string s, starting at its beginning.
 * Simply forwards to the overload that takes a start offset.
 *
 * @param s         the string to map
 * @param ces       output array for the CEs
 * @param cesLength passed through to the overload with a start offset
 * @return the value returned by that overload
 */
int32_t
CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) {
    // No prefix: CE fetching starts at offset 0.
    return getCEs(s, 0, ces, cesLength);
}
1659
1660
/**
 * Fetches the CEs for the string s when it is preceded by the given prefix.
 * With a non-empty prefix, the concatenation prefix+s is processed,
 * with CE fetching starting after the prefix.
 *
 * @param prefix    the context before s; may be empty
 * @param s         the string to map
 * @param ces       output array for the CEs
 * @param cesLength passed through to the overload with a start offset
 * @return the value returned by that overload
 */
int32_t
CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s,
                             int64_t ces[], int32_t cesLength) {
    const int32_t prefixLength = prefix.length();
    if(prefixLength != 0) {
        return getCEs(prefix + s, prefixLength, ces, cesLength);
    }
    return getCEs(s, 0, ces, cesLength);
}
1670
1671
/**
 * Fetches the CEs for the string s, starting at the given offset.
 * The collation iterator used for this is created on first use and then kept.
 *
 * @param s         the string to map
 * @param start     offset in s where CE fetching starts
 * @param ces       output array for the CEs
 * @param cesLength passed through to the iterator's fetchCEs()
 * @return the value returned by fetchCEs(), or 0 if the iterator cannot be allocated
 */
int32_t
CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start,
                             int64_t ces[], int32_t cesLength) {
    if(collIter == NULL) {
        // Lazily create the iterator over this builder's data.
        collIter = new DataBuilderCollationIterator(*this);
    }
    if(collIter == NULL) { return 0; }
    return collIter->fetchCEs(s, start, ces, cesLength);
}
1680
1681
U_NAMESPACE_END
1682
1683
#endif // !UCONFIG_NO_COLLATION
1684
1685