Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/icui18n/alphaindex.cpp
12343 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2009-2014, International Business Machines Corporation and
6
* others. All Rights Reserved.
7
*******************************************************************************
8
*/
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_COLLATION
13
14
#include "unicode/alphaindex.h"
15
#include "unicode/coll.h"
16
#include "unicode/localpointer.h"
17
#include "unicode/normalizer2.h"
18
#include "unicode/tblcoll.h"
19
#include "unicode/uchar.h"
20
#include "unicode/ulocdata.h"
21
#include "unicode/uniset.h"
22
#include "unicode/uobject.h"
23
#include "unicode/usetiter.h"
24
#include "unicode/utf16.h"
25
26
#include "cmemory.h"
27
#include "cstring.h"
28
#include "uassert.h"
29
#include "uvector.h"
30
#include "uvectr64.h"
31
32
//#include <string>
33
//#include <iostream>
34
35
U_NAMESPACE_BEGIN
36
37
namespace {
38
39
/**
40
* Prefix string for Chinese index buckets.
41
* See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
42
*/
43
const UChar BASE[1] = { 0xFDD0 };
44
const int32_t BASE_LENGTH = 1;
45
46
UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
47
const UnicodeString &one, const UnicodeString &other);
48
49
} // namespace
50
51
static int32_t U_CALLCONV
52
collatorComparator(const void *context, const void *left, const void *right);
53
54
static int32_t U_CALLCONV
55
recordCompareFn(const void *context, const void *left, const void *right);
56
57
// UVector<Record *> support function, delete a Record.
58
static void U_CALLCONV
59
alphaIndex_deleteRecord(void *obj) {
60
delete static_cast<AlphabeticIndex::Record *>(obj);
61
}
62
63
namespace {
64
65
UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
66
UErrorCode &errorCode) {
67
if (U_FAILURE(errorCode)) { return NULL; }
68
if (owned.isValid()) {
69
return owned.orphan();
70
}
71
UnicodeString *p = new UnicodeString(s);
72
if (p == NULL) {
73
errorCode = U_MEMORY_ALLOCATION_ERROR;
74
}
75
return p;
76
}
77
78
inline UnicodeString *getString(const UVector &list, int32_t i) {
79
return static_cast<UnicodeString *>(list[i]);
80
}
81
82
inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
83
return static_cast<AlphabeticIndex::Bucket *>(list[i]);
84
}
85
86
inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
87
return static_cast<AlphabeticIndex::Record *>(list[i]);
88
}
89
90
/**
91
* Like Java Collections.binarySearch(List, String, Comparator).
92
*
93
* @return the index>=0 where the item was found,
94
* or the index<0 for inserting the string at ~index in sorted order
95
*/
96
int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
97
if (list.size() == 0) { return ~0; }
98
int32_t start = 0;
99
int32_t limit = list.size();
100
for (;;) {
101
int32_t i = (start + limit) / 2;
102
const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
103
UErrorCode errorCode = U_ZERO_ERROR;
104
UCollationResult cmp = coll.compare(s, *si, errorCode);
105
if (cmp == UCOL_EQUAL) {
106
return i;
107
} else if (cmp < 0) {
108
if (i == start) {
109
return ~start; // insert s before *si
110
}
111
limit = i;
112
} else {
113
if (i == start) {
114
return ~(start + 1); // insert s after *si
115
}
116
start = i;
117
}
118
}
119
}
120
121
} // namespace
122
123
// The BucketList is not in the anonymous namespace because only Clang
124
// seems to support its use in other classes from there.
125
// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
126
class BucketList : public UObject {
127
public:
128
BucketList(UVector *bucketList, UVector *publicBucketList)
129
: bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
130
int32_t displayIndex = 0;
131
for (int32_t i = 0; i < publicBucketList->size(); ++i) {
132
getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
133
}
134
}
135
136
// The virtual destructor must not be inline.
137
// See ticket #8454 for details.
138
virtual ~BucketList();
139
140
int32_t getBucketCount() const {
141
return immutableVisibleList_->size();
142
}
143
144
int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
145
UErrorCode &errorCode) {
146
// binary search
147
int32_t start = 0;
148
int32_t limit = bucketList_->size();
149
while ((start + 1) < limit) {
150
int32_t i = (start + limit) / 2;
151
const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
152
UCollationResult nameVsBucket =
153
collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
154
if (nameVsBucket < 0) {
155
limit = i;
156
} else {
157
start = i;
158
}
159
}
160
const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
161
if (bucket->displayBucket_ != NULL) {
162
bucket = bucket->displayBucket_;
163
}
164
return bucket->displayIndex_;
165
}
166
167
/** All of the buckets, visible and invisible. */
168
UVector *bucketList_;
169
/** Just the visible buckets. */
170
UVector *immutableVisibleList_;
171
};
172
173
BucketList::~BucketList() {
174
delete bucketList_;
175
if (immutableVisibleList_ != bucketList_) {
176
delete immutableVisibleList_;
177
}
178
}
179
180
AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
181
delete buckets_;
182
delete collatorPrimaryOnly_;
183
}
184
185
int32_t
186
AlphabeticIndex::ImmutableIndex::getBucketCount() const {
187
return buckets_->getBucketCount();
188
}
189
190
int32_t
191
AlphabeticIndex::ImmutableIndex::getBucketIndex(
192
const UnicodeString &name, UErrorCode &errorCode) const {
193
return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
194
}
195
196
const AlphabeticIndex::Bucket *
197
AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
198
if (0 <= index && index < buckets_->getBucketCount()) {
199
return icu::getBucket(*buckets_->immutableVisibleList_, index);
200
} else {
201
return NULL;
202
}
203
}
204
205
AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
206
: inputList_(NULL),
207
labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
208
maxLabelCount_(99),
209
initialLabels_(NULL), firstCharsInScripts_(NULL),
210
collator_(NULL), collatorPrimaryOnly_(NULL),
211
buckets_(NULL) {
212
init(&locale, status);
213
}
214
215
216
AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
217
: inputList_(NULL),
218
labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
219
maxLabelCount_(99),
220
initialLabels_(NULL), firstCharsInScripts_(NULL),
221
collator_(collator), collatorPrimaryOnly_(NULL),
222
buckets_(NULL) {
223
init(NULL, status);
224
}
225
226
227
228
AlphabeticIndex::~AlphabeticIndex() {
229
delete collator_;
230
delete collatorPrimaryOnly_;
231
delete firstCharsInScripts_;
232
delete buckets_;
233
delete inputList_;
234
delete initialLabels_;
235
}
236
237
238
AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
239
if (U_FAILURE(status)) {
240
return *this;
241
}
242
initialLabels_->addAll(additions);
243
clearBuckets();
244
return *this;
245
}
246
247
248
AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
249
addIndexExemplars(locale, status);
250
clearBuckets();
251
return *this;
252
}
253
254
255
AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
256
if (U_FAILURE(errorCode)) { return NULL; }
257
// In C++, the ImmutableIndex must own its copy of the BucketList,
258
// even if it contains no records, for proper memory management.
259
// We could clone the buckets_ if they are not NULL,
260
// but that would be worth it only if this method is called multiple times,
261
// or called after using the old-style bucket iterator API.
262
LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
263
LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone());
264
if (immutableBucketList.isNull() || coll.isNull()) {
265
errorCode = U_MEMORY_ALLOCATION_ERROR;
266
return NULL;
267
}
268
ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
269
if (immIndex == NULL) {
270
errorCode = U_MEMORY_ALLOCATION_ERROR;
271
return NULL;
272
}
273
// The ImmutableIndex adopted its parameter objects.
274
immutableBucketList.orphan();
275
coll.orphan();
276
return immIndex;
277
}
278
279
int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
280
initBuckets(status);
281
if (U_FAILURE(status)) {
282
return 0;
283
}
284
return buckets_->getBucketCount();
285
}
286
287
288
int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
289
if (U_FAILURE(status) || inputList_ == NULL) {
290
return 0;
291
}
292
return inputList_->size();
293
}
294
295
void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
296
U_ASSERT(indexCharacters.hasDeleter());
297
const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
298
if (U_FAILURE(errorCode)) { return; }
299
300
const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
301
const UnicodeString &overflowBoundary =
302
*getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
303
304
// We make a sorted array of elements.
305
// Some of the input may be redundant.
306
// That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
307
// We filter out those cases.
308
UnicodeSetIterator iter(*initialLabels_);
309
while (U_SUCCESS(errorCode) && iter.next()) {
310
const UnicodeString *item = &iter.getString();
311
LocalPointer<UnicodeString> ownedItem;
312
UBool checkDistinct;
313
int32_t itemLength = item->length();
314
if (!item->hasMoreChar32Than(0, itemLength, 1)) {
315
checkDistinct = false;
316
} else if(item->charAt(itemLength - 1) == 0x2a && // '*'
317
item->charAt(itemLength - 2) != 0x2a) {
318
// Use a label if it is marked with one trailing star,
319
// even if the label string sorts the same when all contractions are suppressed.
320
ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
321
item = ownedItem.getAlias();
322
if (item == NULL) {
323
errorCode = U_MEMORY_ALLOCATION_ERROR;
324
return;
325
}
326
checkDistinct = false;
327
} else {
328
checkDistinct = true;
329
}
330
if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
331
// Ignore a primary-ignorable or non-alphabetic index character.
332
} else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
333
// Ignore an index character that will land in the overflow bucket.
334
} else if (checkDistinct &&
335
collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
336
// Ignore a multi-code point index character that does not sort distinctly
337
// from the sequence of its separate characters.
338
} else {
339
int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
340
if (insertionPoint < 0) {
341
indexCharacters.insertElementAt(
342
ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
343
} else {
344
const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
345
if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
346
indexCharacters.setElementAt(
347
ownedString(*item, ownedItem, errorCode), insertionPoint);
348
}
349
}
350
}
351
}
352
if (U_FAILURE(errorCode)) { return; }
353
354
// if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
355
356
int32_t size = indexCharacters.size() - 1;
357
if (size > maxLabelCount_) {
358
int32_t count = 0;
359
int32_t old = -1;
360
for (int32_t i = 0; i < indexCharacters.size();) {
361
++count;
362
int32_t bump = count * maxLabelCount_ / size;
363
if (bump == old) {
364
indexCharacters.removeElementAt(i);
365
} else {
366
old = bump;
367
++i;
368
}
369
}
370
}
371
}
372
373
namespace {
374
375
const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
376
if (!current.startsWith(BASE, BASE_LENGTH)) {
377
return current;
378
}
379
UChar rest = current.charAt(BASE_LENGTH);
380
if (0x2800 < rest && rest <= 0x28FF) { // stroke count
381
int32_t count = rest-0x2800;
382
temp.setTo((UChar)(0x30 + count % 10));
383
if (count >= 10) {
384
count /= 10;
385
temp.insert(0, (UChar)(0x30 + count % 10));
386
if (count >= 10) {
387
count /= 10;
388
temp.insert(0, (UChar)(0x30 + count));
389
}
390
}
391
return temp.append((UChar)0x5283);
392
}
393
return temp.setTo(current, BASE_LENGTH);
394
}
395
396
UBool hasMultiplePrimaryWeights(
397
const RuleBasedCollator &coll, uint32_t variableTop,
398
const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
399
ces.removeAllElements();
400
coll.internalGetCEs(s, ces, errorCode);
401
if (U_FAILURE(errorCode)) { return false; }
402
UBool seenPrimary = false;
403
for (int32_t i = 0; i < ces.size(); ++i) {
404
int64_t ce = ces.elementAti(i);
405
uint32_t p = (uint32_t)(ce >> 32);
406
if (p > variableTop) {
407
// not primary ignorable
408
if (seenPrimary) {
409
return true;
410
}
411
seenPrimary = true;
412
}
413
}
414
return false;
415
}
416
417
} // namespace
418
419
BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420
// Initialize indexCharacters.
421
UVector indexCharacters(errorCode);
422
indexCharacters.setDeleter(uprv_deleteUObject);
423
initLabels(indexCharacters, errorCode);
424
if (U_FAILURE(errorCode)) { return NULL; }
425
426
// Variables for hasMultiplePrimaryWeights().
427
UVector64 ces(errorCode);
428
uint32_t variableTop;
429
if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
430
variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
431
} else {
432
variableTop = 0;
433
}
434
UBool hasInvisibleBuckets = false;
435
436
// Helper arrays for Chinese Pinyin collation.
437
Bucket *asciiBuckets[26] = {
438
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
439
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
440
};
441
Bucket *pinyinBuckets[26] = {
442
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
443
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
444
};
445
UBool hasPinyin = false;
446
447
LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode);
448
if (U_FAILURE(errorCode)) {
449
return NULL;
450
}
451
bucketList->setDeleter(uprv_deleteUObject);
452
453
// underflow bucket
454
LocalPointer<Bucket> bucket(new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW), errorCode);
455
if (U_FAILURE(errorCode)) {
456
return NULL;
457
}
458
bucketList->adoptElement(bucket.orphan(), errorCode);
459
if (U_FAILURE(errorCode)) { return NULL; }
460
461
UnicodeString temp;
462
463
// fix up the list, adding underflow, additions, overflow
464
// Insert inflow labels as needed.
465
int32_t scriptIndex = -1;
466
const UnicodeString *scriptUpperBoundary = &emptyString_;
467
for (int32_t i = 0; i < indexCharacters.size(); ++i) {
468
UnicodeString &current = *getString(indexCharacters, i);
469
if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
470
// We crossed the script boundary into a new script.
471
const UnicodeString &inflowBoundary = *scriptUpperBoundary;
472
UBool skippedScript = false;
473
for (;;) {
474
scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
475
if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
476
break;
477
}
478
skippedScript = true;
479
}
480
if (skippedScript && bucketList->size() > 1) {
481
// We are skipping one or more scripts,
482
// and we are not just getting out of the underflow label.
483
bucket.adoptInsteadAndCheckErrorCode(
484
new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW), errorCode);
485
bucketList->adoptElement(bucket.orphan(), errorCode);
486
if (U_FAILURE(errorCode)) { return nullptr; }
487
}
488
}
489
// Add a bucket with the current label.
490
bucket.adoptInsteadAndCheckErrorCode(
491
new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL), errorCode);
492
bucketList->adoptElement(bucket.orphan(), errorCode);
493
if (U_FAILURE(errorCode)) { return nullptr; }
494
// Remember ASCII and Pinyin buckets for Pinyin redirects.
495
UChar c;
496
if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
497
asciiBuckets[c - 0x41] = (Bucket *)bucketList->lastElement();
498
} else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
499
0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
500
pinyinBuckets[c - 0x41] = (Bucket *)bucketList->lastElement();
501
hasPinyin = true;
502
}
503
// Check for multiple primary weights.
504
if (!current.startsWith(BASE, BASE_LENGTH) &&
505
hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
506
ces, errorCode) &&
507
current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
508
// "AE-ligature" or "Sch" etc.
509
for (int32_t j = bucketList->size() - 2;; --j) {
510
Bucket *singleBucket = getBucket(*bucketList, j);
511
if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
512
// There is no single-character bucket since the last
513
// underflow or inflow label.
514
break;
515
}
516
if (singleBucket->displayBucket_ == NULL &&
517
!hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
518
singleBucket->lowerBoundary_,
519
ces, errorCode)) {
520
// Add an invisible bucket that redirects strings greater than the expansion
521
// to the previous single-character bucket.
522
// For example, after ... Q R S Sch we add Sch\uFFFF->S
523
// and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
524
bucket.adoptInsteadAndCheckErrorCode(new Bucket(emptyString_,
525
UnicodeString(current).append((UChar)0xFFFF),
526
U_ALPHAINDEX_NORMAL),
527
errorCode);
528
if (U_FAILURE(errorCode)) {
529
return NULL;
530
}
531
bucket->displayBucket_ = singleBucket;
532
bucketList->adoptElement(bucket.orphan(), errorCode);
533
if (U_FAILURE(errorCode)) { return nullptr; }
534
hasInvisibleBuckets = true;
535
break;
536
}
537
}
538
}
539
}
540
if (U_FAILURE(errorCode)) { return NULL; }
541
if (bucketList->size() == 1) {
542
// No real labels, show only the underflow label.
543
BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
544
if (bl == NULL) {
545
errorCode = U_MEMORY_ALLOCATION_ERROR;
546
return NULL;
547
}
548
bucketList.orphan();
549
return bl;
550
}
551
// overflow bucket
552
bucket.adoptInsteadAndCheckErrorCode(
553
new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW), errorCode);
554
bucketList->adoptElement(bucket.orphan(), errorCode); // final
555
if (U_FAILURE(errorCode)) { return nullptr; }
556
557
if (hasPinyin) {
558
// Redirect Pinyin buckets.
559
Bucket *asciiBucket = NULL;
560
for (int32_t i = 0; i < 26; ++i) {
561
if (asciiBuckets[i] != NULL) {
562
asciiBucket = asciiBuckets[i];
563
}
564
if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
565
pinyinBuckets[i]->displayBucket_ = asciiBucket;
566
hasInvisibleBuckets = true;
567
}
568
}
569
}
570
571
if (U_FAILURE(errorCode)) { return NULL; }
572
if (!hasInvisibleBuckets) {
573
BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
574
if (bl == NULL) {
575
errorCode = U_MEMORY_ALLOCATION_ERROR;
576
return NULL;
577
}
578
bucketList.orphan();
579
return bl;
580
}
581
// Merge inflow buckets that are visually adjacent.
582
// Iterate backwards: Merge inflow into overflow rather than the other way around.
583
int32_t i = bucketList->size() - 1;
584
Bucket *nextBucket = getBucket(*bucketList, i);
585
while (--i > 0) {
586
Bucket *bucket = getBucket(*bucketList, i);
587
if (bucket->displayBucket_ != NULL) {
588
continue; // skip invisible buckets
589
}
590
if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
591
if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
592
bucket->displayBucket_ = nextBucket;
593
continue;
594
}
595
}
596
nextBucket = bucket;
597
}
598
599
LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode);
600
if (U_FAILURE(errorCode)) {
601
return NULL;
602
}
603
// Do not call publicBucketList->setDeleter():
604
// This vector shares its objects with the bucketList.
605
for (int32_t j = 0; j < bucketList->size(); ++j) {
606
Bucket *bucket = getBucket(*bucketList, j);
607
if (bucket->displayBucket_ == NULL) {
608
publicBucketList->addElement(bucket, errorCode);
609
}
610
}
611
if (U_FAILURE(errorCode)) { return NULL; }
612
BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
613
if (bl == NULL) {
614
errorCode = U_MEMORY_ALLOCATION_ERROR;
615
return NULL;
616
}
617
bucketList.orphan();
618
publicBucketList.orphan();
619
return bl;
620
}
621
622
/**
623
* Creates an index, and buckets and sorts the list of records into the index.
624
*/
625
void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
626
if (U_FAILURE(errorCode) || buckets_ != NULL) {
627
return;
628
}
629
buckets_ = createBucketList(errorCode);
630
if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
631
return;
632
}
633
634
// Sort the records by name.
635
// Stable sort preserves input order of collation duplicates.
636
inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
637
638
// Now, we traverse all of the input, which is now sorted.
639
// If the item doesn't go in the current bucket, we find the next bucket that contains it.
640
// This makes the process order n*log(n), since we just sort the list and then do a linear process.
641
// However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
642
// we need to improve it for that case.
643
644
Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
645
int32_t bucketIndex = 1;
646
Bucket *nextBucket;
647
const UnicodeString *upperBoundary;
648
if (bucketIndex < buckets_->bucketList_->size()) {
649
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
650
upperBoundary = &nextBucket->lowerBoundary_;
651
} else {
652
nextBucket = NULL;
653
upperBoundary = NULL;
654
}
655
for (int32_t i = 0; i < inputList_->size(); ++i) {
656
Record *r = getRecord(*inputList_, i);
657
// if the current bucket isn't the right one, find the one that is
658
// We have a special flag for the last bucket so that we don't look any further
659
while (upperBoundary != NULL &&
660
collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
661
currentBucket = nextBucket;
662
// now reset the boundary that we compare against
663
if (bucketIndex < buckets_->bucketList_->size()) {
664
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
665
upperBoundary = &nextBucket->lowerBoundary_;
666
} else {
667
upperBoundary = NULL;
668
}
669
}
670
// now put the record into the bucket.
671
Bucket *bucket = currentBucket;
672
if (bucket->displayBucket_ != NULL) {
673
bucket = bucket->displayBucket_;
674
}
675
if (bucket->records_ == NULL) {
676
LocalPointer<UVector> records(new UVector(errorCode), errorCode);
677
if (U_FAILURE(errorCode)) {
678
return;
679
}
680
bucket->records_ = records.orphan();
681
}
682
bucket->records_->addElement(r, errorCode);
683
}
684
}
685
686
void AlphabeticIndex::clearBuckets() {
687
if (buckets_ != NULL) {
688
delete buckets_;
689
buckets_ = NULL;
690
internalResetBucketIterator();
691
}
692
}
693
694
void AlphabeticIndex::internalResetBucketIterator() {
695
labelsIterIndex_ = -1;
696
currentBucket_ = NULL;
697
}
698
699
700
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
701
LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
702
if (U_FAILURE(status)) {
703
return;
704
}
705
706
UnicodeSet exemplars;
707
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
708
if (U_SUCCESS(status)) {
709
initialLabels_->addAll(exemplars);
710
return;
711
}
712
status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
713
714
// The locale data did not include explicit Index characters.
715
// Synthesize a set of them from the locale's standard exemplar characters.
716
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
717
if (U_FAILURE(status)) {
718
return;
719
}
720
721
// question: should we add auxiliary exemplars?
722
if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) {
723
exemplars.add(0x61, 0x7A);
724
}
725
if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
726
// cut down to small list
727
exemplars.remove(0xAC00, 0xD7A3).
728
add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
729
add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
730
add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
731
add(0xD30C).add(0xD558);
732
}
733
if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
734
// cut down to small list
735
// make use of the fact that Ethiopic is allocated in 8's, where
736
// the base is 0 mod 8.
737
UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status);
738
ethiopic.retainAll(exemplars);
739
exemplars.remove(u'ሀ', 0x137F).addAll(ethiopic);
740
}
741
742
// Upper-case any that aren't already so.
743
// (We only do this for synthesized index characters.)
744
UnicodeSetIterator it(exemplars);
745
UnicodeString upperC;
746
while (it.next()) {
747
const UnicodeString &exemplarC = it.getString();
748
upperC = exemplarC;
749
upperC.toUpper(locale);
750
initialLabels_->add(upperC);
751
}
752
}
753
754
UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
755
UnicodeSet contractions;
756
collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
757
if (U_FAILURE(errorCode) || contractions.isEmpty()) { return false; }
758
initialLabels_->addAll(contractions);
759
UnicodeSetIterator iter(contractions);
760
while (iter.next()) {
761
const UnicodeString &s = iter.getString();
762
U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
763
UChar c = s.charAt(s.length() - 1);
764
if (0x41 <= c && c <= 0x5A) { // A-Z
765
// There are Pinyin labels, add ASCII A-Z labels as well.
766
initialLabels_->add(0x41, 0x5A); // A-Z
767
break;
768
}
769
}
770
return true;
771
}
772
773
774
/*
775
* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
776
*/
777
static const UChar CGJ = 0x034F;
778
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
779
UnicodeString result;
780
if (item.length() == 0) {
781
return result;
782
}
783
int32_t i = 0;
784
for (;;) {
785
UChar32 cp = item.char32At(i);
786
result.append(cp);
787
i = item.moveIndex32(i, 1);
788
if (i >= item.length()) {
789
break;
790
}
791
result.append(CGJ);
792
}
793
return result;
794
}
795
796
797
bool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
798
return false;
799
}
800
801
802
bool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
803
return false;
804
}
805
806
807
const RuleBasedCollator &AlphabeticIndex::getCollator() const {
808
return *collator_;
809
}
810
811
812
const UnicodeString &AlphabeticIndex::getInflowLabel() const {
813
return inflowLabel_;
814
}
815
816
const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
817
return overflowLabel_;
818
}
819
820
821
const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
822
return underflowLabel_;
823
}
824
825
826
AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
827
inflowLabel_ = label;
828
clearBuckets();
829
return *this;
830
}
831
832
833
AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
834
overflowLabel_ = label;
835
clearBuckets();
836
return *this;
837
}
838
839
840
AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
841
underflowLabel_ = label;
842
clearBuckets();
843
return *this;
844
}
845
846
847
int32_t AlphabeticIndex::getMaxLabelCount() const {
848
return maxLabelCount_;
849
}
850
851
852
AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
853
if (U_FAILURE(status)) {
854
return *this;
855
}
856
if (maxLabelCount <= 0) {
857
status = U_ILLEGAL_ARGUMENT_ERROR;
858
return *this;
859
}
860
maxLabelCount_ = maxLabelCount;
861
clearBuckets();
862
return *this;
863
}
864
865
866
//
867
// init() - Common code for constructors.
868
//
869
870
void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
871
if (U_FAILURE(status)) { return; }
872
if (locale == NULL && collator_ == NULL) {
873
status = U_ILLEGAL_ARGUMENT_ERROR;
874
return;
875
}
876
877
initialLabels_ = new UnicodeSet();
878
if (initialLabels_ == NULL) {
879
status = U_MEMORY_ALLOCATION_ERROR;
880
return;
881
}
882
883
inflowLabel_.setTo((UChar)0x2026); // Ellipsis
884
overflowLabel_ = inflowLabel_;
885
underflowLabel_ = inflowLabel_;
886
887
if (collator_ == NULL) {
888
Collator *coll = Collator::createInstance(*locale, status);
889
if (U_FAILURE(status)) {
890
delete coll;
891
return;
892
}
893
if (coll == NULL) {
894
status = U_MEMORY_ALLOCATION_ERROR;
895
return;
896
}
897
collator_ = dynamic_cast<RuleBasedCollator *>(coll);
898
if (collator_ == NULL) {
899
delete coll;
900
status = U_UNSUPPORTED_ERROR;
901
return;
902
}
903
}
904
collatorPrimaryOnly_ = collator_->clone();
905
if (collatorPrimaryOnly_ == NULL) {
906
status = U_MEMORY_ALLOCATION_ERROR;
907
return;
908
}
909
collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
910
firstCharsInScripts_ = firstStringsInScript(status);
911
if (U_FAILURE(status)) { return; }
912
firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
913
// Guard against a degenerate collator where
914
// some script boundary strings are primary ignorable.
915
for (;;) {
916
if (U_FAILURE(status)) { return; }
917
if (firstCharsInScripts_->isEmpty()) {
918
// AlphabeticIndex requires some non-ignorable script boundary strings.
919
status = U_ILLEGAL_ARGUMENT_ERROR;
920
return;
921
}
922
if (collatorPrimaryOnly_->compare(
923
*static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
924
emptyString_, status) == UCOL_EQUAL) {
925
firstCharsInScripts_->removeElementAt(0);
926
} else {
927
break;
928
}
929
}
930
931
// Chinese index characters, which are specific to each of the several Chinese tailorings,
932
// take precedence over the single locale data exemplar set per language.
933
if (!addChineseIndexCharacters(status) && locale != NULL) {
934
addIndexExemplars(*locale, status);
935
}
936
}
937
938
939
//
940
// Comparison function for UVector<UnicodeString *> sorting with a collator.
941
//
942
static int32_t U_CALLCONV
943
collatorComparator(const void *context, const void *left, const void *right) {
944
const UElement *leftElement = static_cast<const UElement *>(left);
945
const UElement *rightElement = static_cast<const UElement *>(right);
946
const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
947
const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
948
949
if (leftString == rightString) {
950
// Catches case where both are NULL
951
return 0;
952
}
953
if (leftString == NULL) {
954
return 1;
955
}
956
if (rightString == NULL) {
957
return -1;
958
}
959
const Collator *col = static_cast<const Collator *>(context);
960
UErrorCode errorCode = U_ZERO_ERROR;
961
return col->compare(*leftString, *rightString, errorCode);
962
}
963
964
//
965
// Comparison function for UVector<Record *> sorting with a collator.
966
//
967
static int32_t U_CALLCONV
968
recordCompareFn(const void *context, const void *left, const void *right) {
969
const UElement *leftElement = static_cast<const UElement *>(left);
970
const UElement *rightElement = static_cast<const UElement *>(right);
971
const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
972
const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
973
const Collator *col = static_cast<const Collator *>(context);
974
UErrorCode errorCode = U_ZERO_ERROR;
975
return col->compare(leftRec->name_, rightRec->name_, errorCode);
976
}
977
978
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
979
if (U_FAILURE(status)) {
980
return NULL;
981
}
982
LocalPointer<UVector> dest(new UVector(status), status);
983
if (U_FAILURE(status)) {
984
return NULL;
985
}
986
dest->setDeleter(uprv_deleteUObject);
987
// Fetch the script-first-primary contractions which are defined in the root collator.
988
// They all start with U+FDD1.
989
UnicodeSet set;
990
collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
991
if (U_FAILURE(status)) {
992
return NULL;
993
}
994
if (set.isEmpty()) {
995
status = U_UNSUPPORTED_ERROR;
996
return NULL;
997
}
998
UnicodeSetIterator iter(set);
999
while (iter.next()) {
1000
const UnicodeString &boundary = iter.getString();
1001
uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
1002
if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
1003
// Ignore boundaries for the special reordering groups.
1004
// Take only those for "real scripts" (where the sample character is a Letter,
1005
// and the one for unassigned implicit weights (Cn).
1006
continue;
1007
}
1008
LocalPointer<UnicodeString> s(new UnicodeString(boundary), status);
1009
dest->adoptElement(s.orphan(), status);
1010
if (U_FAILURE(status)) {
1011
return nullptr;
1012
}
1013
}
1014
return dest.orphan();
1015
}
1016
1017
1018
namespace {
1019
1020
/**
1021
* Returns true if one index character string is "better" than the other.
1022
* Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1023
* better, and otherwise binary-less-than is better.
1024
*/
1025
UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1026
const UnicodeString &one, const UnicodeString &other) {
1027
// This is called with primary-equal strings, but never with one.equals(other).
1028
UErrorCode status = U_ZERO_ERROR;
1029
UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1030
UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1031
if (U_FAILURE(status)) { return false; }
1032
int32_t result = n1.countChar32() - n2.countChar32();
1033
if (result != 0) {
1034
return result < 0;
1035
}
1036
result = n1.compareCodePointOrder(n2);
1037
if (result != 0) {
1038
return result < 0;
1039
}
1040
return one.compareCodePointOrder(other) < 0;
1041
}
1042
1043
} // namespace
1044
1045
//
1046
// Constructor & Destructor for AlphabeticIndex::Record
1047
//
1048
// Records are internal only, instances are not directly surfaced in the public API.
1049
// This class is mostly struct-like, with all public fields.
1050
1051
AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1052
: name_(name), data_(data) {}
1053
1054
AlphabeticIndex::Record::~Record() {
1055
}
1056
1057
1058
AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1059
if (U_FAILURE(status)) {
1060
return *this;
1061
}
1062
if (inputList_ == NULL) {
1063
LocalPointer<UVector> inputList(new UVector(status), status);
1064
if (U_FAILURE(status)) {
1065
return *this;
1066
}
1067
inputList_ = inputList.orphan();
1068
inputList_->setDeleter(alphaIndex_deleteRecord);
1069
}
1070
LocalPointer<Record> r(new Record(name, data), status);
1071
inputList_->adoptElement(r.orphan(), status);
1072
if (U_FAILURE(status)) {
1073
return *this;
1074
}
1075
clearBuckets();
1076
//std::string ss;
1077
//std::string ss2;
1078
//std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1079
// " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1080
return *this;
1081
}
1082
1083
1084
AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1085
if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1086
inputList_->removeAllElements();
1087
clearBuckets();
1088
}
1089
return *this;
1090
}
1091
1092
int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1093
initBuckets(status);
1094
if (U_FAILURE(status)) {
1095
return 0;
1096
}
1097
return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1098
}
1099
1100
1101
int32_t AlphabeticIndex::getBucketIndex() const {
1102
return labelsIterIndex_;
1103
}
1104
1105
1106
UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1107
if (U_FAILURE(status)) {
1108
return false;
1109
}
1110
if (buckets_ == NULL && currentBucket_ != NULL) {
1111
status = U_ENUM_OUT_OF_SYNC_ERROR;
1112
return false;
1113
}
1114
initBuckets(status);
1115
if (U_FAILURE(status)) {
1116
return false;
1117
}
1118
++labelsIterIndex_;
1119
if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1120
labelsIterIndex_ = buckets_->getBucketCount();
1121
return false;
1122
}
1123
currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1124
resetRecordIterator();
1125
return true;
1126
}
1127
1128
const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1129
if (currentBucket_ != NULL) {
1130
return currentBucket_->label_;
1131
} else {
1132
return emptyString_;
1133
}
1134
}
1135
1136
1137
UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1138
if (currentBucket_ != NULL) {
1139
return currentBucket_->labelType_;
1140
} else {
1141
return U_ALPHAINDEX_NORMAL;
1142
}
1143
}
1144
1145
1146
int32_t AlphabeticIndex::getBucketRecordCount() const {
1147
if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1148
return currentBucket_->records_->size();
1149
} else {
1150
return 0;
1151
}
1152
}
1153
1154
1155
AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1156
if (U_FAILURE(status)) {
1157
return *this;
1158
}
1159
internalResetBucketIterator();
1160
return *this;
1161
}
1162
1163
1164
UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1165
if (U_FAILURE(status)) {
1166
return false;
1167
}
1168
if (currentBucket_ == NULL) {
1169
// We are trying to iterate over the items in a bucket, but there is no
1170
// current bucket from the enumeration of buckets.
1171
status = U_INVALID_STATE_ERROR;
1172
return false;
1173
}
1174
if (buckets_ == NULL) {
1175
status = U_ENUM_OUT_OF_SYNC_ERROR;
1176
return false;
1177
}
1178
if (currentBucket_->records_ == NULL) {
1179
return false;
1180
}
1181
++itemsIterIndex_;
1182
if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1183
itemsIterIndex_ = currentBucket_->records_->size();
1184
return false;
1185
}
1186
return true;
1187
}
1188
1189
1190
const UnicodeString &AlphabeticIndex::getRecordName() const {
1191
const UnicodeString *retStr = &emptyString_;
1192
if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1193
itemsIterIndex_ >= 0 &&
1194
itemsIterIndex_ < currentBucket_->records_->size()) {
1195
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1196
retStr = &item->name_;
1197
}
1198
return *retStr;
1199
}
1200
1201
const void *AlphabeticIndex::getRecordData() const {
1202
const void *retPtr = NULL;
1203
if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1204
itemsIterIndex_ >= 0 &&
1205
itemsIterIndex_ < currentBucket_->records_->size()) {
1206
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1207
retPtr = item->data_;
1208
}
1209
return retPtr;
1210
}
1211
1212
1213
AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1214
itemsIterIndex_ = -1;
1215
return *this;
1216
}
1217
1218
1219
1220
AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1221
const UnicodeString &lowerBoundary,
1222
UAlphabeticIndexLabelType type)
1223
: label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1224
displayBucket_(NULL), displayIndex_(-1),
1225
records_(NULL) {
1226
}
1227
1228
1229
AlphabeticIndex::Bucket::~Bucket() {
1230
delete records_;
1231
}
1232
1233
U_NAMESPACE_END
1234
1235
#endif // !UCONFIG_NO_COLLATION
1236
1237