Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/icui18n/collationfastlatin.cpp
12343 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2013-2015, International Business Machines
6
* Corporation and others. All Rights Reserved.
7
*******************************************************************************
8
* collationfastlatin.cpp
9
*
10
* created on: 2013aug18
11
* created by: Markus W. Scherer
12
*/
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_COLLATION
17
18
#include "unicode/ucol.h"
19
#include "collationdata.h"
20
#include "collationfastlatin.h"
21
#include "collationsettings.h"
22
#include "uassert.h"
23
24
U_NAMESPACE_BEGIN
25
26
int32_t
27
CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
28
uint16_t *primaries, int32_t capacity) {
29
const uint16_t *table = data->fastLatinTable;
30
if(table == NULL) { return -1; }
31
U_ASSERT(capacity == LATIN_LIMIT);
32
if(capacity != LATIN_LIMIT) { return -1; }
33
34
uint32_t miniVarTop;
35
if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
36
// No mini primaries are variable, set a variableTop just below the
37
// lowest long mini primary.
38
miniVarTop = MIN_LONG - 1;
39
} else {
40
int32_t headerLength = *table & 0xff;
41
int32_t i = 1 + settings.getMaxVariable();
42
if(i >= headerLength) {
43
return -1; // variableTop >= digits, should not occur
44
}
45
miniVarTop = table[i];
46
}
47
48
UBool digitsAreReordered = false;
49
if(settings.hasReordering()) {
50
uint32_t prevStart = 0;
51
uint32_t beforeDigitStart = 0;
52
uint32_t digitStart = 0;
53
uint32_t afterDigitStart = 0;
54
for(int32_t group = UCOL_REORDER_CODE_FIRST;
55
group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
56
++group) {
57
uint32_t start = data->getFirstPrimaryForGroup(group);
58
start = settings.reorder(start);
59
if(group == UCOL_REORDER_CODE_DIGIT) {
60
beforeDigitStart = prevStart;
61
digitStart = start;
62
} else if(start != 0) {
63
if(start < prevStart) {
64
// The permutation affects the groups up to Latin.
65
return -1;
66
}
67
// In the future, there might be a special group between digits & Latin.
68
if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
69
afterDigitStart = start;
70
}
71
prevStart = start;
72
}
73
}
74
uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
75
latinStart = settings.reorder(latinStart);
76
if(latinStart < prevStart) {
77
return -1;
78
}
79
if(afterDigitStart == 0) {
80
afterDigitStart = latinStart;
81
}
82
if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
83
digitsAreReordered = true;
84
}
85
}
86
87
table += (table[0] & 0xff); // skip the header
88
for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
89
uint32_t p = table[c];
90
if(p >= MIN_SHORT) {
91
p &= SHORT_PRIMARY_MASK;
92
} else if(p > miniVarTop) {
93
p &= LONG_PRIMARY_MASK;
94
} else {
95
p = 0;
96
}
97
primaries[c] = (uint16_t)p;
98
}
99
if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
100
// Bail out for digits.
101
for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
102
}
103
104
// Shift the miniVarTop above other options.
105
return ((int32_t)miniVarTop << 16) | settings.options;
106
}
107
108
int32_t
109
CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
110
const UChar *left, int32_t leftLength,
111
const UChar *right, int32_t rightLength) {
112
// This is a modified copy of CollationCompare::compareUpToQuaternary(),
113
// optimized for common Latin text.
114
// Keep them in sync!
115
// Keep compareUTF16() and compareUTF8() in sync very closely!
116
117
U_ASSERT((table[0] >> 8) == VERSION);
118
table += (table[0] & 0xff); // skip the header
119
uint32_t variableTop = (uint32_t)options >> 16; // see getOptions()
120
options &= 0xffff; // needed for CollationSettings::getStrength() to work
121
122
// Check for supported characters, fetch mini CEs, and compare primaries.
123
int32_t leftIndex = 0, rightIndex = 0;
124
/**
125
* Single mini CE or a pair.
126
* The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
127
* If there is only one, then it is in the lower bits, and the upper bits are 0.
128
*/
129
uint32_t leftPair = 0, rightPair = 0;
130
for(;;) {
131
// We fetch CEs until we get a non-ignorable primary or reach the end.
132
while(leftPair == 0) {
133
if(leftIndex == leftLength) {
134
leftPair = EOS;
135
break;
136
}
137
UChar32 c = left[leftIndex++];
138
if(c <= LATIN_MAX) {
139
leftPair = primaries[c];
140
if(leftPair != 0) { break; }
141
if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
142
return BAIL_OUT_RESULT;
143
}
144
leftPair = table[c];
145
} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
146
leftPair = table[c - PUNCT_START + LATIN_LIMIT];
147
} else {
148
leftPair = lookup(table, c);
149
}
150
if(leftPair >= MIN_SHORT) {
151
leftPair &= SHORT_PRIMARY_MASK;
152
break;
153
} else if(leftPair > variableTop) {
154
leftPair &= LONG_PRIMARY_MASK;
155
break;
156
} else {
157
leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
158
if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
159
leftPair = getPrimaries(variableTop, leftPair);
160
}
161
}
162
163
while(rightPair == 0) {
164
if(rightIndex == rightLength) {
165
rightPair = EOS;
166
break;
167
}
168
UChar32 c = right[rightIndex++];
169
if(c <= LATIN_MAX) {
170
rightPair = primaries[c];
171
if(rightPair != 0) { break; }
172
if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
173
return BAIL_OUT_RESULT;
174
}
175
rightPair = table[c];
176
} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
177
rightPair = table[c - PUNCT_START + LATIN_LIMIT];
178
} else {
179
rightPair = lookup(table, c);
180
}
181
if(rightPair >= MIN_SHORT) {
182
rightPair &= SHORT_PRIMARY_MASK;
183
break;
184
} else if(rightPair > variableTop) {
185
rightPair &= LONG_PRIMARY_MASK;
186
break;
187
} else {
188
rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
189
if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
190
rightPair = getPrimaries(variableTop, rightPair);
191
}
192
}
193
194
if(leftPair == rightPair) {
195
if(leftPair == EOS) { break; }
196
leftPair = rightPair = 0;
197
continue;
198
}
199
uint32_t leftPrimary = leftPair & 0xffff;
200
uint32_t rightPrimary = rightPair & 0xffff;
201
if(leftPrimary != rightPrimary) {
202
// Return the primary difference.
203
return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
204
}
205
if(leftPair == EOS) { break; }
206
leftPair >>= 16;
207
rightPair >>= 16;
208
}
209
// In the following, we need to re-fetch each character because we did not buffer the CEs,
210
// but we know that the string is well-formed and
211
// only contains supported characters and mappings.
212
213
// We might skip the secondary level but continue with the case level
214
// which is turned on separately.
215
if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
216
leftIndex = rightIndex = 0;
217
leftPair = rightPair = 0;
218
for(;;) {
219
while(leftPair == 0) {
220
if(leftIndex == leftLength) {
221
leftPair = EOS;
222
break;
223
}
224
UChar32 c = left[leftIndex++];
225
if(c <= LATIN_MAX) {
226
leftPair = table[c];
227
} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
228
leftPair = table[c - PUNCT_START + LATIN_LIMIT];
229
} else {
230
leftPair = lookup(table, c);
231
}
232
if(leftPair >= MIN_SHORT) {
233
leftPair = getSecondariesFromOneShortCE(leftPair);
234
break;
235
} else if(leftPair > variableTop) {
236
leftPair = COMMON_SEC_PLUS_OFFSET;
237
break;
238
} else {
239
leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
240
leftPair = getSecondaries(variableTop, leftPair);
241
}
242
}
243
244
while(rightPair == 0) {
245
if(rightIndex == rightLength) {
246
rightPair = EOS;
247
break;
248
}
249
UChar32 c = right[rightIndex++];
250
if(c <= LATIN_MAX) {
251
rightPair = table[c];
252
} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
253
rightPair = table[c - PUNCT_START + LATIN_LIMIT];
254
} else {
255
rightPair = lookup(table, c);
256
}
257
if(rightPair >= MIN_SHORT) {
258
rightPair = getSecondariesFromOneShortCE(rightPair);
259
break;
260
} else if(rightPair > variableTop) {
261
rightPair = COMMON_SEC_PLUS_OFFSET;
262
break;
263
} else {
264
rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
265
rightPair = getSecondaries(variableTop, rightPair);
266
}
267
}
268
269
if(leftPair == rightPair) {
270
if(leftPair == EOS) { break; }
271
leftPair = rightPair = 0;
272
continue;
273
}
274
uint32_t leftSecondary = leftPair & 0xffff;
275
uint32_t rightSecondary = rightPair & 0xffff;
276
if(leftSecondary != rightSecondary) {
277
if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
278
// Full support for backwards secondary requires backwards contraction matching
279
// and moving backwards between merge separators.
280
return BAIL_OUT_RESULT;
281
}
282
return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
283
}
284
if(leftPair == EOS) { break; }
285
leftPair >>= 16;
286
rightPair >>= 16;
287
}
288
}
289
290
if((options & CollationSettings::CASE_LEVEL) != 0) {
291
UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
292
leftIndex = rightIndex = 0;
293
leftPair = rightPair = 0;
294
for(;;) {
295
while(leftPair == 0) {
296
if(leftIndex == leftLength) {
297
leftPair = EOS;
298
break;
299
}
300
UChar32 c = left[leftIndex++];
301
leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
302
if(leftPair < MIN_LONG) {
303
leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
304
}
305
leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
306
}
307
308
while(rightPair == 0) {
309
if(rightIndex == rightLength) {
310
rightPair = EOS;
311
break;
312
}
313
UChar32 c = right[rightIndex++];
314
rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
315
if(rightPair < MIN_LONG) {
316
rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
317
}
318
rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
319
}
320
321
if(leftPair == rightPair) {
322
if(leftPair == EOS) { break; }
323
leftPair = rightPair = 0;
324
continue;
325
}
326
uint32_t leftCase = leftPair & 0xffff;
327
uint32_t rightCase = rightPair & 0xffff;
328
if(leftCase != rightCase) {
329
if((options & CollationSettings::UPPER_FIRST) == 0) {
330
return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
331
} else {
332
return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
333
}
334
}
335
if(leftPair == EOS) { break; }
336
leftPair >>= 16;
337
rightPair >>= 16;
338
}
339
}
340
if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
341
342
// Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
343
UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
344
345
leftIndex = rightIndex = 0;
346
leftPair = rightPair = 0;
347
for(;;) {
348
while(leftPair == 0) {
349
if(leftIndex == leftLength) {
350
leftPair = EOS;
351
break;
352
}
353
UChar32 c = left[leftIndex++];
354
leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
355
if(leftPair < MIN_LONG) {
356
leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
357
}
358
leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
359
}
360
361
while(rightPair == 0) {
362
if(rightIndex == rightLength) {
363
rightPair = EOS;
364
break;
365
}
366
UChar32 c = right[rightIndex++];
367
rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
368
if(rightPair < MIN_LONG) {
369
rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
370
}
371
rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
372
}
373
374
if(leftPair == rightPair) {
375
if(leftPair == EOS) { break; }
376
leftPair = rightPair = 0;
377
continue;
378
}
379
uint32_t leftTertiary = leftPair & 0xffff;
380
uint32_t rightTertiary = rightPair & 0xffff;
381
if(leftTertiary != rightTertiary) {
382
if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
383
// Pass through EOS and MERGE_WEIGHT
384
// and keep real tertiary weights larger than the MERGE_WEIGHT.
385
// Tertiary CEs (secondary ignorables) are not supported in fast Latin.
386
if(leftTertiary > MERGE_WEIGHT) {
387
leftTertiary ^= CASE_MASK;
388
}
389
if(rightTertiary > MERGE_WEIGHT) {
390
rightTertiary ^= CASE_MASK;
391
}
392
}
393
return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
394
}
395
if(leftPair == EOS) { break; }
396
leftPair >>= 16;
397
rightPair >>= 16;
398
}
399
if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
400
401
leftIndex = rightIndex = 0;
402
leftPair = rightPair = 0;
403
for(;;) {
404
while(leftPair == 0) {
405
if(leftIndex == leftLength) {
406
leftPair = EOS;
407
break;
408
}
409
UChar32 c = left[leftIndex++];
410
leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
411
if(leftPair < MIN_LONG) {
412
leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
413
}
414
leftPair = getQuaternaries(variableTop, leftPair);
415
}
416
417
while(rightPair == 0) {
418
if(rightIndex == rightLength) {
419
rightPair = EOS;
420
break;
421
}
422
UChar32 c = right[rightIndex++];
423
rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
424
if(rightPair < MIN_LONG) {
425
rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
426
}
427
rightPair = getQuaternaries(variableTop, rightPair);
428
}
429
430
if(leftPair == rightPair) {
431
if(leftPair == EOS) { break; }
432
leftPair = rightPair = 0;
433
continue;
434
}
435
uint32_t leftQuaternary = leftPair & 0xffff;
436
uint32_t rightQuaternary = rightPair & 0xffff;
437
if(leftQuaternary != rightQuaternary) {
438
return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
439
}
440
if(leftPair == EOS) { break; }
441
leftPair >>= 16;
442
rightPair >>= 16;
443
}
444
return UCOL_EQUAL;
445
}
446
447
int32_t
448
CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
449
const uint8_t *left, int32_t leftLength,
450
const uint8_t *right, int32_t rightLength) {
451
// Keep compareUTF16() and compareUTF8() in sync very closely!
452
453
U_ASSERT((table[0] >> 8) == VERSION);
454
table += (table[0] & 0xff); // skip the header
455
uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions()
456
options &= 0xffff; // needed for CollationSettings::getStrength() to work
457
458
// Check for supported characters, fetch mini CEs, and compare primaries.
459
int32_t leftIndex = 0, rightIndex = 0;
460
/**
461
* Single mini CE or a pair.
462
* The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
463
* If there is only one, then it is in the lower bits, and the upper bits are 0.
464
*/
465
uint32_t leftPair = 0, rightPair = 0;
466
// Note: There is no need to assemble the code point.
467
// We only need to look up the table entry for the character,
468
// and nextPair() looks for whether c==0.
469
for(;;) {
470
// We fetch CEs until we get a non-ignorable primary or reach the end.
471
while(leftPair == 0) {
472
if(leftIndex == leftLength) {
473
leftPair = EOS;
474
break;
475
}
476
UChar32 c = left[leftIndex++];
477
uint8_t t;
478
if(c <= 0x7f) {
479
leftPair = primaries[c];
480
if(leftPair != 0) { break; }
481
if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
482
return BAIL_OUT_RESULT;
483
}
484
leftPair = table[c];
485
} else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
486
0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
487
++leftIndex;
488
c = ((c - 0xc2) << 6) + t;
489
leftPair = primaries[c];
490
if(leftPair != 0) { break; }
491
leftPair = table[c];
492
} else {
493
leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
494
}
495
if(leftPair >= MIN_SHORT) {
496
leftPair &= SHORT_PRIMARY_MASK;
497
break;
498
} else if(leftPair > variableTop) {
499
leftPair &= LONG_PRIMARY_MASK;
500
break;
501
} else {
502
leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
503
if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
504
leftPair = getPrimaries(variableTop, leftPair);
505
}
506
}
507
508
while(rightPair == 0) {
509
if(rightIndex == rightLength) {
510
rightPair = EOS;
511
break;
512
}
513
UChar32 c = right[rightIndex++];
514
uint8_t t;
515
if(c <= 0x7f) {
516
rightPair = primaries[c];
517
if(rightPair != 0) { break; }
518
if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
519
return BAIL_OUT_RESULT;
520
}
521
rightPair = table[c];
522
} else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
523
0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
524
++rightIndex;
525
c = ((c - 0xc2) << 6) + t;
526
rightPair = primaries[c];
527
if(rightPair != 0) { break; }
528
rightPair = table[c];
529
} else {
530
rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
531
}
532
if(rightPair >= MIN_SHORT) {
533
rightPair &= SHORT_PRIMARY_MASK;
534
break;
535
} else if(rightPair > variableTop) {
536
rightPair &= LONG_PRIMARY_MASK;
537
break;
538
} else {
539
rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
540
if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
541
rightPair = getPrimaries(variableTop, rightPair);
542
}
543
}
544
545
if(leftPair == rightPair) {
546
if(leftPair == EOS) { break; }
547
leftPair = rightPair = 0;
548
continue;
549
}
550
uint32_t leftPrimary = leftPair & 0xffff;
551
uint32_t rightPrimary = rightPair & 0xffff;
552
if(leftPrimary != rightPrimary) {
553
// Return the primary difference.
554
return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
555
}
556
if(leftPair == EOS) { break; }
557
leftPair >>= 16;
558
rightPair >>= 16;
559
}
560
// In the following, we need to re-fetch each character because we did not buffer the CEs,
561
// but we know that the string is well-formed and
562
// only contains supported characters and mappings.
563
564
// We might skip the secondary level but continue with the case level
565
// which is turned on separately.
566
if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
567
leftIndex = rightIndex = 0;
568
leftPair = rightPair = 0;
569
for(;;) {
570
while(leftPair == 0) {
571
if(leftIndex == leftLength) {
572
leftPair = EOS;
573
break;
574
}
575
UChar32 c = left[leftIndex++];
576
if(c <= 0x7f) {
577
leftPair = table[c];
578
} else if(c <= LATIN_MAX_UTF8_LEAD) {
579
leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
580
} else {
581
leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
582
}
583
if(leftPair >= MIN_SHORT) {
584
leftPair = getSecondariesFromOneShortCE(leftPair);
585
break;
586
} else if(leftPair > variableTop) {
587
leftPair = COMMON_SEC_PLUS_OFFSET;
588
break;
589
} else {
590
leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
591
leftPair = getSecondaries(variableTop, leftPair);
592
}
593
}
594
595
while(rightPair == 0) {
596
if(rightIndex == rightLength) {
597
rightPair = EOS;
598
break;
599
}
600
UChar32 c = right[rightIndex++];
601
if(c <= 0x7f) {
602
rightPair = table[c];
603
} else if(c <= LATIN_MAX_UTF8_LEAD) {
604
rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
605
} else {
606
rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
607
}
608
if(rightPair >= MIN_SHORT) {
609
rightPair = getSecondariesFromOneShortCE(rightPair);
610
break;
611
} else if(rightPair > variableTop) {
612
rightPair = COMMON_SEC_PLUS_OFFSET;
613
break;
614
} else {
615
rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
616
rightPair = getSecondaries(variableTop, rightPair);
617
}
618
}
619
620
if(leftPair == rightPair) {
621
if(leftPair == EOS) { break; }
622
leftPair = rightPair = 0;
623
continue;
624
}
625
uint32_t leftSecondary = leftPair & 0xffff;
626
uint32_t rightSecondary = rightPair & 0xffff;
627
if(leftSecondary != rightSecondary) {
628
if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
629
// Full support for backwards secondary requires backwards contraction matching
630
// and moving backwards between merge separators.
631
return BAIL_OUT_RESULT;
632
}
633
return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
634
}
635
if(leftPair == EOS) { break; }
636
leftPair >>= 16;
637
rightPair >>= 16;
638
}
639
}
640
641
if((options & CollationSettings::CASE_LEVEL) != 0) {
642
UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
643
leftIndex = rightIndex = 0;
644
leftPair = rightPair = 0;
645
for(;;) {
646
while(leftPair == 0) {
647
if(leftIndex == leftLength) {
648
leftPair = EOS;
649
break;
650
}
651
UChar32 c = left[leftIndex++];
652
leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
653
if(leftPair < MIN_LONG) {
654
leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
655
}
656
leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
657
}
658
659
while(rightPair == 0) {
660
if(rightIndex == rightLength) {
661
rightPair = EOS;
662
break;
663
}
664
UChar32 c = right[rightIndex++];
665
rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
666
if(rightPair < MIN_LONG) {
667
rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
668
}
669
rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
670
}
671
672
if(leftPair == rightPair) {
673
if(leftPair == EOS) { break; }
674
leftPair = rightPair = 0;
675
continue;
676
}
677
uint32_t leftCase = leftPair & 0xffff;
678
uint32_t rightCase = rightPair & 0xffff;
679
if(leftCase != rightCase) {
680
if((options & CollationSettings::UPPER_FIRST) == 0) {
681
return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
682
} else {
683
return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
684
}
685
}
686
if(leftPair == EOS) { break; }
687
leftPair >>= 16;
688
rightPair >>= 16;
689
}
690
}
691
if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
692
693
// Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
694
UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
695
696
leftIndex = rightIndex = 0;
697
leftPair = rightPair = 0;
698
for(;;) {
699
while(leftPair == 0) {
700
if(leftIndex == leftLength) {
701
leftPair = EOS;
702
break;
703
}
704
UChar32 c = left[leftIndex++];
705
leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
706
if(leftPair < MIN_LONG) {
707
leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
708
}
709
leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
710
}
711
712
while(rightPair == 0) {
713
if(rightIndex == rightLength) {
714
rightPair = EOS;
715
break;
716
}
717
UChar32 c = right[rightIndex++];
718
rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
719
if(rightPair < MIN_LONG) {
720
rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
721
}
722
rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
723
}
724
725
if(leftPair == rightPair) {
726
if(leftPair == EOS) { break; }
727
leftPair = rightPair = 0;
728
continue;
729
}
730
uint32_t leftTertiary = leftPair & 0xffff;
731
uint32_t rightTertiary = rightPair & 0xffff;
732
if(leftTertiary != rightTertiary) {
733
if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
734
// Pass through EOS and MERGE_WEIGHT
735
// and keep real tertiary weights larger than the MERGE_WEIGHT.
736
// Tertiary CEs (secondary ignorables) are not supported in fast Latin.
737
if(leftTertiary > MERGE_WEIGHT) {
738
leftTertiary ^= CASE_MASK;
739
}
740
if(rightTertiary > MERGE_WEIGHT) {
741
rightTertiary ^= CASE_MASK;
742
}
743
}
744
return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
745
}
746
if(leftPair == EOS) { break; }
747
leftPair >>= 16;
748
rightPair >>= 16;
749
}
750
if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
751
752
leftIndex = rightIndex = 0;
753
leftPair = rightPair = 0;
754
for(;;) {
755
while(leftPair == 0) {
756
if(leftIndex == leftLength) {
757
leftPair = EOS;
758
break;
759
}
760
UChar32 c = left[leftIndex++];
761
leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
762
if(leftPair < MIN_LONG) {
763
leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
764
}
765
leftPair = getQuaternaries(variableTop, leftPair);
766
}
767
768
while(rightPair == 0) {
769
if(rightIndex == rightLength) {
770
rightPair = EOS;
771
break;
772
}
773
UChar32 c = right[rightIndex++];
774
rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
775
if(rightPair < MIN_LONG) {
776
rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
777
}
778
rightPair = getQuaternaries(variableTop, rightPair);
779
}
780
781
if(leftPair == rightPair) {
782
if(leftPair == EOS) { break; }
783
leftPair = rightPair = 0;
784
continue;
785
}
786
uint32_t leftQuaternary = leftPair & 0xffff;
787
uint32_t rightQuaternary = rightPair & 0xffff;
788
if(leftQuaternary != rightQuaternary) {
789
return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
790
}
791
if(leftPair == EOS) { break; }
792
leftPair >>= 16;
793
rightPair >>= 16;
794
}
795
return UCOL_EQUAL;
796
}
797
798
uint32_t
799
CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
800
U_ASSERT(c > LATIN_MAX);
801
if(PUNCT_START <= c && c < PUNCT_LIMIT) {
802
return table[c - PUNCT_START + LATIN_LIMIT];
803
} else if(c == 0xfffe) {
804
return MERGE_WEIGHT;
805
} else if(c == 0xffff) {
806
return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
807
} else {
808
return BAIL_OUT;
809
}
810
}
811
812
uint32_t
813
CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
814
const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
815
// The caller handled ASCII and valid/supported Latin.
816
U_ASSERT(c > 0x7f);
817
int32_t i2 = sIndex + 1;
818
if(i2 < sLength || sLength < 0) {
819
uint8_t t1 = s8[sIndex];
820
uint8_t t2 = s8[i2];
821
sIndex += 2;
822
if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
823
return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF
824
} else if(c == 0xef && t1 == 0xbf) {
825
if(t2 == 0xbe) {
826
return MERGE_WEIGHT; // U+FFFE
827
} else if(t2 == 0xbf) {
828
return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF
829
}
830
}
831
}
832
return BAIL_OUT;
833
}
834
835
uint32_t
836
CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
837
const uint8_t *s8, int32_t &sIndex) {
838
// The caller handled ASCII.
839
// The string is well-formed and contains only supported characters.
840
U_ASSERT(c > 0x7f);
841
if(c <= LATIN_MAX_UTF8_LEAD) {
842
return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F
843
}
844
uint8_t t2 = s8[sIndex + 1];
845
sIndex += 2;
846
if(c == 0xe2) {
847
return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF
848
} else if(t2 == 0xbe) {
849
return MERGE_WEIGHT; // U+FFFE
850
} else {
851
return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF
852
}
853
}
854
855
uint32_t
856
CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
857
const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
858
if(ce >= MIN_LONG || ce < CONTRACTION) {
859
return ce; // simple or special mini CE
860
} else if(ce >= EXPANSION) {
861
int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
862
return ((uint32_t)table[index + 1] << 16) | table[index];
863
} else /* ce >= CONTRACTION */ {
864
if(c == 0 && sLength < 0) {
865
sLength = sIndex - 1;
866
return EOS;
867
}
868
// Contraction list: Default mapping followed by
869
// 0 or more single-character contraction suffix mappings.
870
int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
871
if(sIndex != sLength) {
872
// Read the next character.
873
int32_t c2;
874
int32_t nextIndex = sIndex;
875
if(s16 != NULL) {
876
c2 = s16[nextIndex++];
877
if(c2 > LATIN_MAX) {
878
if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
879
c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF
880
} else if(c2 == 0xfffe || c2 == 0xffff) {
881
c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.
882
} else {
883
return BAIL_OUT;
884
}
885
}
886
} else {
887
c2 = s8[nextIndex++];
888
if(c2 > 0x7f) {
889
uint8_t t;
890
if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
891
0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
892
c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F
893
++nextIndex;
894
} else {
895
int32_t i2 = nextIndex + 1;
896
if(i2 < sLength || sLength < 0) {
897
if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
898
0x80 <= (t = s8[i2]) && t <= 0xbf) {
899
c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF
900
} else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
901
((t = s8[i2]) == 0xbe || t == 0xbf)) {
902
c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.
903
} else {
904
return BAIL_OUT;
905
}
906
} else {
907
return BAIL_OUT;
908
}
909
nextIndex += 2;
910
}
911
}
912
}
913
if(c2 == 0 && sLength < 0) {
914
sLength = sIndex;
915
c2 = -1;
916
}
917
// Look for the next character in the contraction suffix list,
918
// which is in ascending order of single suffix characters.
919
int32_t i = index;
920
int32_t head = table[i]; // first skip the default mapping
921
int32_t x;
922
do {
923
i += head >> CONTR_LENGTH_SHIFT;
924
head = table[i];
925
x = head & CONTR_CHAR_MASK;
926
} while(x < c2);
927
if(x == c2) {
928
index = i;
929
sIndex = nextIndex;
930
}
931
}
932
// Return the CE or CEs for the default or contraction mapping.
933
int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
934
if(length == 1) {
935
return BAIL_OUT;
936
}
937
ce = table[index + 1];
938
if(length == 2) {
939
return ce;
940
} else {
941
return ((uint32_t)table[index + 2] << 16) | ce;
942
}
943
}
944
}
945
946
uint32_t
947
CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
948
if(pair <= 0xffff) {
949
// one mini CE
950
if(pair >= MIN_SHORT) {
951
pair = getSecondariesFromOneShortCE(pair);
952
} else if(pair > variableTop) {
953
pair = COMMON_SEC_PLUS_OFFSET;
954
} else if(pair >= MIN_LONG) {
955
pair = 0; // variable
956
}
957
// else special mini CE
958
} else {
959
uint32_t ce = pair & 0xffff;
960
if(ce >= MIN_SHORT) {
961
pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
962
} else if(ce > variableTop) {
963
pair = TWO_COMMON_SEC_PLUS_OFFSET;
964
} else {
965
U_ASSERT(ce >= MIN_LONG);
966
pair = 0; // variable
967
}
968
}
969
return pair;
970
}
971
972
uint32_t
973
CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
974
// Primary+caseLevel: Ignore case level weights of primary ignorables.
975
// Otherwise: Ignore case level weights of secondary ignorables.
976
// For details see the comments in the CollationCompare class.
977
// Tertiary CEs (secondary ignorables) are not supported in fast Latin.
978
if(pair <= 0xffff) {
979
// one mini CE
980
if(pair >= MIN_SHORT) {
981
// A high secondary weight means we really have two CEs,
982
// a primary CE and a secondary CE.
983
uint32_t ce = pair;
984
pair &= CASE_MASK; // explicit weight of primary CE
985
if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
986
pair |= LOWER_CASE << 16; // implied weight of secondary CE
987
}
988
} else if(pair > variableTop) {
989
pair = LOWER_CASE;
990
} else if(pair >= MIN_LONG) {
991
pair = 0; // variable
992
}
993
// else special mini CE
994
} else {
995
// two mini CEs, same primary groups, neither expands like above
996
uint32_t ce = pair & 0xffff;
997
if(ce >= MIN_SHORT) {
998
if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
999
pair &= CASE_MASK;
1000
} else {
1001
pair &= TWO_CASES_MASK;
1002
}
1003
} else if(ce > variableTop) {
1004
pair = TWO_LOWER_CASES;
1005
} else {
1006
U_ASSERT(ce >= MIN_LONG);
1007
pair = 0; // variable
1008
}
1009
}
1010
return pair;
1011
}
1012
1013
uint32_t
1014
CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
1015
if(pair <= 0xffff) {
1016
// one mini CE
1017
if(pair >= MIN_SHORT) {
1018
// A high secondary weight means we really have two CEs,
1019
// a primary CE and a secondary CE.
1020
uint32_t ce = pair;
1021
if(withCaseBits) {
1022
pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
1023
if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1024
pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
1025
}
1026
} else {
1027
pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1028
if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1029
pair |= COMMON_TER_PLUS_OFFSET << 16;
1030
}
1031
}
1032
} else if(pair > variableTop) {
1033
pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1034
if(withCaseBits) {
1035
pair |= LOWER_CASE;
1036
}
1037
} else if(pair >= MIN_LONG) {
1038
pair = 0; // variable
1039
}
1040
// else special mini CE
1041
} else {
1042
// two mini CEs, same primary groups, neither expands like above
1043
uint32_t ce = pair & 0xffff;
1044
if(ce >= MIN_SHORT) {
1045
if(withCaseBits) {
1046
pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
1047
} else {
1048
pair &= TWO_TERTIARIES_MASK;
1049
}
1050
pair += TWO_TER_OFFSETS;
1051
} else if(ce > variableTop) {
1052
pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
1053
if(withCaseBits) {
1054
pair |= TWO_LOWER_CASES;
1055
}
1056
} else {
1057
U_ASSERT(ce >= MIN_LONG);
1058
pair = 0; // variable
1059
}
1060
}
1061
return pair;
1062
}
1063
1064
uint32_t
1065
CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
1066
// Return the primary weight of a variable CE,
1067
// or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1068
if(pair <= 0xffff) {
1069
// one mini CE
1070
if(pair >= MIN_SHORT) {
1071
// A high secondary weight means we really have two CEs,
1072
// a primary CE and a secondary CE.
1073
if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1074
pair = TWO_SHORT_PRIMARIES_MASK;
1075
} else {
1076
pair = SHORT_PRIMARY_MASK;
1077
}
1078
} else if(pair > variableTop) {
1079
pair = SHORT_PRIMARY_MASK;
1080
} else if(pair >= MIN_LONG) {
1081
pair &= LONG_PRIMARY_MASK; // variable
1082
}
1083
// else special mini CE
1084
} else {
1085
// two mini CEs, same primary groups, neither expands like above
1086
uint32_t ce = pair & 0xffff;
1087
if(ce > variableTop) {
1088
pair = TWO_SHORT_PRIMARIES_MASK;
1089
} else {
1090
U_ASSERT(ce >= MIN_LONG);
1091
pair &= TWO_LONG_PRIMARIES_MASK; // variable
1092
}
1093
}
1094
return pair;
1095
}
1096
1097
U_NAMESPACE_END
1098
1099
#endif // !UCONFIG_NO_COLLATION
1100
1101