// Path: blob/master/libs/icui18n/collationfastlatin.cpp
// 12343 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2013-2015, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* collationfastlatin.cpp8*9* created on: 2013aug1810* created by: Markus W. Scherer11*/1213#include "unicode/utypes.h"1415#if !UCONFIG_NO_COLLATION1617#include "unicode/ucol.h"18#include "collationdata.h"19#include "collationfastlatin.h"20#include "collationsettings.h"21#include "uassert.h"2223U_NAMESPACE_BEGIN2425int32_t26CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,27uint16_t *primaries, int32_t capacity) {28const uint16_t *table = data->fastLatinTable;29if(table == NULL) { return -1; }30U_ASSERT(capacity == LATIN_LIMIT);31if(capacity != LATIN_LIMIT) { return -1; }3233uint32_t miniVarTop;34if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {35// No mini primaries are variable, set a variableTop just below the36// lowest long mini primary.37miniVarTop = MIN_LONG - 1;38} else {39int32_t headerLength = *table & 0xff;40int32_t i = 1 + settings.getMaxVariable();41if(i >= headerLength) {42return -1; // variableTop >= digits, should not occur43}44miniVarTop = table[i];45}4647UBool digitsAreReordered = false;48if(settings.hasReordering()) {49uint32_t prevStart = 0;50uint32_t beforeDigitStart = 0;51uint32_t digitStart = 0;52uint32_t afterDigitStart = 0;53for(int32_t group = UCOL_REORDER_CODE_FIRST;54group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;55++group) {56uint32_t start = data->getFirstPrimaryForGroup(group);57start = settings.reorder(start);58if(group == UCOL_REORDER_CODE_DIGIT) {59beforeDigitStart = prevStart;60digitStart = start;61} else if(start != 0) {62if(start < prevStart) {63// The permutation 
affects the groups up to Latin.64return -1;65}66// In the future, there might be a special group between digits & Latin.67if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {68afterDigitStart = start;69}70prevStart = start;71}72}73uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);74latinStart = settings.reorder(latinStart);75if(latinStart < prevStart) {76return -1;77}78if(afterDigitStart == 0) {79afterDigitStart = latinStart;80}81if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {82digitsAreReordered = true;83}84}8586table += (table[0] & 0xff); // skip the header87for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {88uint32_t p = table[c];89if(p >= MIN_SHORT) {90p &= SHORT_PRIMARY_MASK;91} else if(p > miniVarTop) {92p &= LONG_PRIMARY_MASK;93} else {94p = 0;95}96primaries[c] = (uint16_t)p;97}98if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {99// Bail out for digits.100for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }101}102103// Shift the miniVarTop above other options.104return ((int32_t)miniVarTop << 16) | settings.options;105}106107int32_t108CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,109const UChar *left, int32_t leftLength,110const UChar *right, int32_t rightLength) {111// This is a modified copy of CollationCompare::compareUpToQuaternary(),112// optimized for common Latin text.113// Keep them in sync!114// Keep compareUTF16() and compareUTF8() in sync very closely!115116U_ASSERT((table[0] >> 8) == VERSION);117table += (table[0] & 0xff); // skip the header118uint32_t variableTop = (uint32_t)options >> 16; // see getOptions()119options &= 0xffff; // needed for CollationSettings::getStrength() to work120121// Check for supported characters, fetch mini CEs, and compare primaries.122int32_t leftIndex = 0, rightIndex = 0;123/**124* Single mini CE or a pair.125* The current mini CE is in the lower 16 bits, the next one 
is in the upper 16 bits.126* If there is only one, then it is in the lower bits, and the upper bits are 0.127*/128uint32_t leftPair = 0, rightPair = 0;129for(;;) {130// We fetch CEs until we get a non-ignorable primary or reach the end.131while(leftPair == 0) {132if(leftIndex == leftLength) {133leftPair = EOS;134break;135}136UChar32 c = left[leftIndex++];137if(c <= LATIN_MAX) {138leftPair = primaries[c];139if(leftPair != 0) { break; }140if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {141return BAIL_OUT_RESULT;142}143leftPair = table[c];144} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {145leftPair = table[c - PUNCT_START + LATIN_LIMIT];146} else {147leftPair = lookup(table, c);148}149if(leftPair >= MIN_SHORT) {150leftPair &= SHORT_PRIMARY_MASK;151break;152} else if(leftPair > variableTop) {153leftPair &= LONG_PRIMARY_MASK;154break;155} else {156leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);157if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }158leftPair = getPrimaries(variableTop, leftPair);159}160}161162while(rightPair == 0) {163if(rightIndex == rightLength) {164rightPair = EOS;165break;166}167UChar32 c = right[rightIndex++];168if(c <= LATIN_MAX) {169rightPair = primaries[c];170if(rightPair != 0) { break; }171if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {172return BAIL_OUT_RESULT;173}174rightPair = table[c];175} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {176rightPair = table[c - PUNCT_START + LATIN_LIMIT];177} else {178rightPair = lookup(table, c);179}180if(rightPair >= MIN_SHORT) {181rightPair &= SHORT_PRIMARY_MASK;182break;183} else if(rightPair > variableTop) {184rightPair &= LONG_PRIMARY_MASK;185break;186} else {187rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);188if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }189rightPair = getPrimaries(variableTop, rightPair);190}191}192193if(leftPair == rightPair) {194if(leftPair == EOS) { break; 
}195leftPair = rightPair = 0;196continue;197}198uint32_t leftPrimary = leftPair & 0xffff;199uint32_t rightPrimary = rightPair & 0xffff;200if(leftPrimary != rightPrimary) {201// Return the primary difference.202return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;203}204if(leftPair == EOS) { break; }205leftPair >>= 16;206rightPair >>= 16;207}208// In the following, we need to re-fetch each character because we did not buffer the CEs,209// but we know that the string is well-formed and210// only contains supported characters and mappings.211212// We might skip the secondary level but continue with the case level213// which is turned on separately.214if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {215leftIndex = rightIndex = 0;216leftPair = rightPair = 0;217for(;;) {218while(leftPair == 0) {219if(leftIndex == leftLength) {220leftPair = EOS;221break;222}223UChar32 c = left[leftIndex++];224if(c <= LATIN_MAX) {225leftPair = table[c];226} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {227leftPair = table[c - PUNCT_START + LATIN_LIMIT];228} else {229leftPair = lookup(table, c);230}231if(leftPair >= MIN_SHORT) {232leftPair = getSecondariesFromOneShortCE(leftPair);233break;234} else if(leftPair > variableTop) {235leftPair = COMMON_SEC_PLUS_OFFSET;236break;237} else {238leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);239leftPair = getSecondaries(variableTop, leftPair);240}241}242243while(rightPair == 0) {244if(rightIndex == rightLength) {245rightPair = EOS;246break;247}248UChar32 c = right[rightIndex++];249if(c <= LATIN_MAX) {250rightPair = table[c];251} else if(PUNCT_START <= c && c < PUNCT_LIMIT) {252rightPair = table[c - PUNCT_START + LATIN_LIMIT];253} else {254rightPair = lookup(table, c);255}256if(rightPair >= MIN_SHORT) {257rightPair = getSecondariesFromOneShortCE(rightPair);258break;259} else if(rightPair > variableTop) {260rightPair = COMMON_SEC_PLUS_OFFSET;261break;262} else {263rightPair = nextPair(table, c, 
rightPair, right, NULL, rightIndex, rightLength);264rightPair = getSecondaries(variableTop, rightPair);265}266}267268if(leftPair == rightPair) {269if(leftPair == EOS) { break; }270leftPair = rightPair = 0;271continue;272}273uint32_t leftSecondary = leftPair & 0xffff;274uint32_t rightSecondary = rightPair & 0xffff;275if(leftSecondary != rightSecondary) {276if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {277// Full support for backwards secondary requires backwards contraction matching278// and moving backwards between merge separators.279return BAIL_OUT_RESULT;280}281return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;282}283if(leftPair == EOS) { break; }284leftPair >>= 16;285rightPair >>= 16;286}287}288289if((options & CollationSettings::CASE_LEVEL) != 0) {290UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;291leftIndex = rightIndex = 0;292leftPair = rightPair = 0;293for(;;) {294while(leftPair == 0) {295if(leftIndex == leftLength) {296leftPair = EOS;297break;298}299UChar32 c = left[leftIndex++];300leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);301if(leftPair < MIN_LONG) {302leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);303}304leftPair = getCases(variableTop, strengthIsPrimary, leftPair);305}306307while(rightPair == 0) {308if(rightIndex == rightLength) {309rightPair = EOS;310break;311}312UChar32 c = right[rightIndex++];313rightPair = (c <= LATIN_MAX) ? 
table[c] : lookup(table, c);314if(rightPair < MIN_LONG) {315rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);316}317rightPair = getCases(variableTop, strengthIsPrimary, rightPair);318}319320if(leftPair == rightPair) {321if(leftPair == EOS) { break; }322leftPair = rightPair = 0;323continue;324}325uint32_t leftCase = leftPair & 0xffff;326uint32_t rightCase = rightPair & 0xffff;327if(leftCase != rightCase) {328if((options & CollationSettings::UPPER_FIRST) == 0) {329return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;330} else {331return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;332}333}334if(leftPair == EOS) { break; }335leftPair >>= 16;336rightPair >>= 16;337}338}339if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }340341// Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.342UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);343344leftIndex = rightIndex = 0;345leftPair = rightPair = 0;346for(;;) {347while(leftPair == 0) {348if(leftIndex == leftLength) {349leftPair = EOS;350break;351}352UChar32 c = left[leftIndex++];353leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);354if(leftPair < MIN_LONG) {355leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);356}357leftPair = getTertiaries(variableTop, withCaseBits, leftPair);358}359360while(rightPair == 0) {361if(rightIndex == rightLength) {362rightPair = EOS;363break;364}365UChar32 c = right[rightIndex++];366rightPair = (c <= LATIN_MAX) ? 
table[c] : lookup(table, c);367if(rightPair < MIN_LONG) {368rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);369}370rightPair = getTertiaries(variableTop, withCaseBits, rightPair);371}372373if(leftPair == rightPair) {374if(leftPair == EOS) { break; }375leftPair = rightPair = 0;376continue;377}378uint32_t leftTertiary = leftPair & 0xffff;379uint32_t rightTertiary = rightPair & 0xffff;380if(leftTertiary != rightTertiary) {381if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {382// Pass through EOS and MERGE_WEIGHT383// and keep real tertiary weights larger than the MERGE_WEIGHT.384// Tertiary CEs (secondary ignorables) are not supported in fast Latin.385if(leftTertiary > MERGE_WEIGHT) {386leftTertiary ^= CASE_MASK;387}388if(rightTertiary > MERGE_WEIGHT) {389rightTertiary ^= CASE_MASK;390}391}392return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;393}394if(leftPair == EOS) { break; }395leftPair >>= 16;396rightPair >>= 16;397}398if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }399400leftIndex = rightIndex = 0;401leftPair = rightPair = 0;402for(;;) {403while(leftPair == 0) {404if(leftIndex == leftLength) {405leftPair = EOS;406break;407}408UChar32 c = left[leftIndex++];409leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);410if(leftPair < MIN_LONG) {411leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);412}413leftPair = getQuaternaries(variableTop, leftPair);414}415416while(rightPair == 0) {417if(rightIndex == rightLength) {418rightPair = EOS;419break;420}421UChar32 c = right[rightIndex++];422rightPair = (c <= LATIN_MAX) ? 
table[c] : lookup(table, c);423if(rightPair < MIN_LONG) {424rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);425}426rightPair = getQuaternaries(variableTop, rightPair);427}428429if(leftPair == rightPair) {430if(leftPair == EOS) { break; }431leftPair = rightPair = 0;432continue;433}434uint32_t leftQuaternary = leftPair & 0xffff;435uint32_t rightQuaternary = rightPair & 0xffff;436if(leftQuaternary != rightQuaternary) {437return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;438}439if(leftPair == EOS) { break; }440leftPair >>= 16;441rightPair >>= 16;442}443return UCOL_EQUAL;444}445446int32_t447CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,448const uint8_t *left, int32_t leftLength,449const uint8_t *right, int32_t rightLength) {450// Keep compareUTF16() and compareUTF8() in sync very closely!451452U_ASSERT((table[0] >> 8) == VERSION);453table += (table[0] & 0xff); // skip the header454uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions()455options &= 0xffff; // needed for CollationSettings::getStrength() to work456457// Check for supported characters, fetch mini CEs, and compare primaries.458int32_t leftIndex = 0, rightIndex = 0;459/**460* Single mini CE or a pair.461* The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.462* If there is only one, then it is in the lower bits, and the upper bits are 0.463*/464uint32_t leftPair = 0, rightPair = 0;465// Note: There is no need to assemble the code point.466// We only need to look up the table entry for the character,467// and nextPair() looks for whether c==0.468for(;;) {469// We fetch CEs until we get a non-ignorable primary or reach the end.470while(leftPair == 0) {471if(leftIndex == leftLength) {472leftPair = EOS;473break;474}475UChar32 c = left[leftIndex++];476uint8_t t;477if(c <= 0x7f) {478leftPair = primaries[c];479if(leftPair != 0) { break; }480if(c 
<= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {481return BAIL_OUT_RESULT;482}483leftPair = table[c];484} else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&4850x80 <= (t = left[leftIndex]) && t <= 0xbf) {486++leftIndex;487c = ((c - 0xc2) << 6) + t;488leftPair = primaries[c];489if(leftPair != 0) { break; }490leftPair = table[c];491} else {492leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);493}494if(leftPair >= MIN_SHORT) {495leftPair &= SHORT_PRIMARY_MASK;496break;497} else if(leftPair > variableTop) {498leftPair &= LONG_PRIMARY_MASK;499break;500} else {501leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);502if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }503leftPair = getPrimaries(variableTop, leftPair);504}505}506507while(rightPair == 0) {508if(rightIndex == rightLength) {509rightPair = EOS;510break;511}512UChar32 c = right[rightIndex++];513uint8_t t;514if(c <= 0x7f) {515rightPair = primaries[c];516if(rightPair != 0) { break; }517if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {518return BAIL_OUT_RESULT;519}520rightPair = table[c];521} else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&5220x80 <= (t = right[rightIndex]) && t <= 0xbf) {523++rightIndex;524c = ((c - 0xc2) << 6) + t;525rightPair = primaries[c];526if(rightPair != 0) { break; }527rightPair = table[c];528} else {529rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);530}531if(rightPair >= MIN_SHORT) {532rightPair &= SHORT_PRIMARY_MASK;533break;534} else if(rightPair > variableTop) {535rightPair &= LONG_PRIMARY_MASK;536break;537} else {538rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);539if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }540rightPair = getPrimaries(variableTop, rightPair);541}542}543544if(leftPair == rightPair) {545if(leftPair == EOS) { break; }546leftPair = rightPair = 0;547continue;548}549uint32_t 
leftPrimary = leftPair & 0xffff;550uint32_t rightPrimary = rightPair & 0xffff;551if(leftPrimary != rightPrimary) {552// Return the primary difference.553return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;554}555if(leftPair == EOS) { break; }556leftPair >>= 16;557rightPair >>= 16;558}559// In the following, we need to re-fetch each character because we did not buffer the CEs,560// but we know that the string is well-formed and561// only contains supported characters and mappings.562563// We might skip the secondary level but continue with the case level564// which is turned on separately.565if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {566leftIndex = rightIndex = 0;567leftPair = rightPair = 0;568for(;;) {569while(leftPair == 0) {570if(leftIndex == leftLength) {571leftPair = EOS;572break;573}574UChar32 c = left[leftIndex++];575if(c <= 0x7f) {576leftPair = table[c];577} else if(c <= LATIN_MAX_UTF8_LEAD) {578leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];579} else {580leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);581}582if(leftPair >= MIN_SHORT) {583leftPair = getSecondariesFromOneShortCE(leftPair);584break;585} else if(leftPair > variableTop) {586leftPair = COMMON_SEC_PLUS_OFFSET;587break;588} else {589leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);590leftPair = getSecondaries(variableTop, leftPair);591}592}593594while(rightPair == 0) {595if(rightIndex == rightLength) {596rightPair = EOS;597break;598}599UChar32 c = right[rightIndex++];600if(c <= 0x7f) {601rightPair = table[c];602} else if(c <= LATIN_MAX_UTF8_LEAD) {603rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];604} else {605rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);606}607if(rightPair >= MIN_SHORT) {608rightPair = getSecondariesFromOneShortCE(rightPair);609break;610} else if(rightPair > variableTop) {611rightPair = COMMON_SEC_PLUS_OFFSET;612break;613} else {614rightPair = nextPair(table, c, rightPair, NULL, 
right, rightIndex, rightLength);615rightPair = getSecondaries(variableTop, rightPair);616}617}618619if(leftPair == rightPair) {620if(leftPair == EOS) { break; }621leftPair = rightPair = 0;622continue;623}624uint32_t leftSecondary = leftPair & 0xffff;625uint32_t rightSecondary = rightPair & 0xffff;626if(leftSecondary != rightSecondary) {627if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {628// Full support for backwards secondary requires backwards contraction matching629// and moving backwards between merge separators.630return BAIL_OUT_RESULT;631}632return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;633}634if(leftPair == EOS) { break; }635leftPair >>= 16;636rightPair >>= 16;637}638}639640if((options & CollationSettings::CASE_LEVEL) != 0) {641UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;642leftIndex = rightIndex = 0;643leftPair = rightPair = 0;644for(;;) {645while(leftPair == 0) {646if(leftIndex == leftLength) {647leftPair = EOS;648break;649}650UChar32 c = left[leftIndex++];651leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);652if(leftPair < MIN_LONG) {653leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);654}655leftPair = getCases(variableTop, strengthIsPrimary, leftPair);656}657658while(rightPair == 0) {659if(rightIndex == rightLength) {660rightPair = EOS;661break;662}663UChar32 c = right[rightIndex++];664rightPair = (c <= 0x7f) ? 
table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);665if(rightPair < MIN_LONG) {666rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);667}668rightPair = getCases(variableTop, strengthIsPrimary, rightPair);669}670671if(leftPair == rightPair) {672if(leftPair == EOS) { break; }673leftPair = rightPair = 0;674continue;675}676uint32_t leftCase = leftPair & 0xffff;677uint32_t rightCase = rightPair & 0xffff;678if(leftCase != rightCase) {679if((options & CollationSettings::UPPER_FIRST) == 0) {680return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;681} else {682return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;683}684}685if(leftPair == EOS) { break; }686leftPair >>= 16;687rightPair >>= 16;688}689}690if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }691692// Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.693UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);694695leftIndex = rightIndex = 0;696leftPair = rightPair = 0;697for(;;) {698while(leftPair == 0) {699if(leftIndex == leftLength) {700leftPair = EOS;701break;702}703UChar32 c = left[leftIndex++];704leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);705if(leftPair < MIN_LONG) {706leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);707}708leftPair = getTertiaries(variableTop, withCaseBits, leftPair);709}710711while(rightPair == 0) {712if(rightIndex == rightLength) {713rightPair = EOS;714break;715}716UChar32 c = right[rightIndex++];717rightPair = (c <= 0x7f) ? 
table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);718if(rightPair < MIN_LONG) {719rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);720}721rightPair = getTertiaries(variableTop, withCaseBits, rightPair);722}723724if(leftPair == rightPair) {725if(leftPair == EOS) { break; }726leftPair = rightPair = 0;727continue;728}729uint32_t leftTertiary = leftPair & 0xffff;730uint32_t rightTertiary = rightPair & 0xffff;731if(leftTertiary != rightTertiary) {732if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {733// Pass through EOS and MERGE_WEIGHT734// and keep real tertiary weights larger than the MERGE_WEIGHT.735// Tertiary CEs (secondary ignorables) are not supported in fast Latin.736if(leftTertiary > MERGE_WEIGHT) {737leftTertiary ^= CASE_MASK;738}739if(rightTertiary > MERGE_WEIGHT) {740rightTertiary ^= CASE_MASK;741}742}743return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;744}745if(leftPair == EOS) { break; }746leftPair >>= 16;747rightPair >>= 16;748}749if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }750751leftIndex = rightIndex = 0;752leftPair = rightPair = 0;753for(;;) {754while(leftPair == 0) {755if(leftIndex == leftLength) {756leftPair = EOS;757break;758}759UChar32 c = left[leftIndex++];760leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);761if(leftPair < MIN_LONG) {762leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);763}764leftPair = getQuaternaries(variableTop, leftPair);765}766767while(rightPair == 0) {768if(rightIndex == rightLength) {769rightPair = EOS;770break;771}772UChar32 c = right[rightIndex++];773rightPair = (c <= 0x7f) ? 
table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);774if(rightPair < MIN_LONG) {775rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);776}777rightPair = getQuaternaries(variableTop, rightPair);778}779780if(leftPair == rightPair) {781if(leftPair == EOS) { break; }782leftPair = rightPair = 0;783continue;784}785uint32_t leftQuaternary = leftPair & 0xffff;786uint32_t rightQuaternary = rightPair & 0xffff;787if(leftQuaternary != rightQuaternary) {788return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;789}790if(leftPair == EOS) { break; }791leftPair >>= 16;792rightPair >>= 16;793}794return UCOL_EQUAL;795}796797uint32_t798CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {799U_ASSERT(c > LATIN_MAX);800if(PUNCT_START <= c && c < PUNCT_LIMIT) {801return table[c - PUNCT_START + LATIN_LIMIT];802} else if(c == 0xfffe) {803return MERGE_WEIGHT;804} else if(c == 0xffff) {805return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;806} else {807return BAIL_OUT;808}809}810811uint32_t812CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,813const uint8_t *s8, int32_t &sIndex, int32_t sLength) {814// The caller handled ASCII and valid/supported Latin.815U_ASSERT(c > 0x7f);816int32_t i2 = sIndex + 1;817if(i2 < sLength || sLength < 0) {818uint8_t t1 = s8[sIndex];819uint8_t t2 = s8[i2];820sIndex += 2;821if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {822return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF823} else if(c == 0xef && t1 == 0xbf) {824if(t2 == 0xbe) {825return MERGE_WEIGHT; // U+FFFE826} else if(t2 == 0xbf) {827return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF828}829}830}831return BAIL_OUT;832}833834uint32_t835CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,836const uint8_t *s8, int32_t &sIndex) {837// The caller handled ASCII.838// The string is well-formed and contains only supported characters.839U_ASSERT(c > 0x7f);840if(c <= 
LATIN_MAX_UTF8_LEAD) {841return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F842}843uint8_t t2 = s8[sIndex + 1];844sIndex += 2;845if(c == 0xe2) {846return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF847} else if(t2 == 0xbe) {848return MERGE_WEIGHT; // U+FFFE849} else {850return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF851}852}853854uint32_t855CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,856const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {857if(ce >= MIN_LONG || ce < CONTRACTION) {858return ce; // simple or special mini CE859} else if(ce >= EXPANSION) {860int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);861return ((uint32_t)table[index + 1] << 16) | table[index];862} else /* ce >= CONTRACTION */ {863if(c == 0 && sLength < 0) {864sLength = sIndex - 1;865return EOS;866}867// Contraction list: Default mapping followed by868// 0 or more single-character contraction suffix mappings.869int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);870if(sIndex != sLength) {871// Read the next character.872int32_t c2;873int32_t nextIndex = sIndex;874if(s16 != NULL) {875c2 = s16[nextIndex++];876if(c2 > LATIN_MAX) {877if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {878c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF879} else if(c2 == 0xfffe || c2 == 0xffff) {880c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.881} else {882return BAIL_OUT;883}884}885} else {886c2 = s8[nextIndex++];887if(c2 > 0x7f) {888uint8_t t;889if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&8900x80 <= (t = s8[nextIndex]) && t <= 0xbf) {891c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F892++nextIndex;893} else {894int32_t i2 = nextIndex + 1;895if(i2 < sLength || sLength < 0) {896if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&8970x80 <= (t = s8[i2]) && t <= 0xbf) {898c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF899} else if(c2 == 0xef && s8[nextIndex] == 0xbf &&900((t = s8[i2]) == 0xbe || t == 
0xbf)) {901c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.902} else {903return BAIL_OUT;904}905} else {906return BAIL_OUT;907}908nextIndex += 2;909}910}911}912if(c2 == 0 && sLength < 0) {913sLength = sIndex;914c2 = -1;915}916// Look for the next character in the contraction suffix list,917// which is in ascending order of single suffix characters.918int32_t i = index;919int32_t head = table[i]; // first skip the default mapping920int32_t x;921do {922i += head >> CONTR_LENGTH_SHIFT;923head = table[i];924x = head & CONTR_CHAR_MASK;925} while(x < c2);926if(x == c2) {927index = i;928sIndex = nextIndex;929}930}931// Return the CE or CEs for the default or contraction mapping.932int32_t length = table[index] >> CONTR_LENGTH_SHIFT;933if(length == 1) {934return BAIL_OUT;935}936ce = table[index + 1];937if(length == 2) {938return ce;939} else {940return ((uint32_t)table[index + 2] << 16) | ce;941}942}943}944945uint32_t946CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {947if(pair <= 0xffff) {948// one mini CE949if(pair >= MIN_SHORT) {950pair = getSecondariesFromOneShortCE(pair);951} else if(pair > variableTop) {952pair = COMMON_SEC_PLUS_OFFSET;953} else if(pair >= MIN_LONG) {954pair = 0; // variable955}956// else special mini CE957} else {958uint32_t ce = pair & 0xffff;959if(ce >= MIN_SHORT) {960pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;961} else if(ce > variableTop) {962pair = TWO_COMMON_SEC_PLUS_OFFSET;963} else {964U_ASSERT(ce >= MIN_LONG);965pair = 0; // variable966}967}968return pair;969}970971uint32_t972CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {973// Primary+caseLevel: Ignore case level weights of primary ignorables.974// Otherwise: Ignore case level weights of secondary ignorables.975// For details see the comments in the CollationCompare class.976// Tertiary CEs (secondary ignorables) are not supported in fast Latin.977if(pair <= 0xffff) {978// one mini CE979if(pair >= 
MIN_SHORT) {980// A high secondary weight means we really have two CEs,981// a primary CE and a secondary CE.982uint32_t ce = pair;983pair &= CASE_MASK; // explicit weight of primary CE984if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {985pair |= LOWER_CASE << 16; // implied weight of secondary CE986}987} else if(pair > variableTop) {988pair = LOWER_CASE;989} else if(pair >= MIN_LONG) {990pair = 0; // variable991}992// else special mini CE993} else {994// two mini CEs, same primary groups, neither expands like above995uint32_t ce = pair & 0xffff;996if(ce >= MIN_SHORT) {997if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {998pair &= CASE_MASK;999} else {1000pair &= TWO_CASES_MASK;1001}1002} else if(ce > variableTop) {1003pair = TWO_LOWER_CASES;1004} else {1005U_ASSERT(ce >= MIN_LONG);1006pair = 0; // variable1007}1008}1009return pair;1010}10111012uint32_t1013CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {1014if(pair <= 0xffff) {1015// one mini CE1016if(pair >= MIN_SHORT) {1017// A high secondary weight means we really have two CEs,1018// a primary CE and a secondary CE.1019uint32_t ce = pair;1020if(withCaseBits) {1021pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;1022if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {1023pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;1024}1025} else {1026pair = (pair & TERTIARY_MASK) + TER_OFFSET;1027if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {1028pair |= COMMON_TER_PLUS_OFFSET << 16;1029}1030}1031} else if(pair > variableTop) {1032pair = (pair & TERTIARY_MASK) + TER_OFFSET;1033if(withCaseBits) {1034pair |= LOWER_CASE;1035}1036} else if(pair >= MIN_LONG) {1037pair = 0; // variable1038}1039// else special mini CE1040} else {1041// two mini CEs, same primary groups, neither expands like above1042uint32_t ce = pair & 0xffff;1043if(ce >= MIN_SHORT) {1044if(withCaseBits) {1045pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;1046} else {1047pair &= 
TWO_TERTIARIES_MASK;1048}1049pair += TWO_TER_OFFSETS;1050} else if(ce > variableTop) {1051pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;1052if(withCaseBits) {1053pair |= TWO_LOWER_CASES;1054}1055} else {1056U_ASSERT(ce >= MIN_LONG);1057pair = 0; // variable1058}1059}1060return pair;1061}10621063uint32_t1064CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {1065// Return the primary weight of a variable CE,1066// or the maximum primary weight for a non-variable, not-completely-ignorable CE.1067if(pair <= 0xffff) {1068// one mini CE1069if(pair >= MIN_SHORT) {1070// A high secondary weight means we really have two CEs,1071// a primary CE and a secondary CE.1072if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {1073pair = TWO_SHORT_PRIMARIES_MASK;1074} else {1075pair = SHORT_PRIMARY_MASK;1076}1077} else if(pair > variableTop) {1078pair = SHORT_PRIMARY_MASK;1079} else if(pair >= MIN_LONG) {1080pair &= LONG_PRIMARY_MASK; // variable1081}1082// else special mini CE1083} else {1084// two mini CEs, same primary groups, neither expands like above1085uint32_t ce = pair & 0xffff;1086if(ce > variableTop) {1087pair = TWO_SHORT_PRIMARIES_MASK;1088} else {1089U_ASSERT(ce >= MIN_LONG);1090pair &= TWO_LONG_PRIMARIES_MASK; // variable1091}1092}1093return pair;1094}10951096U_NAMESPACE_END10971098#endif // !UCONFIG_NO_COLLATION109911001101