// Path: blob/master/libs/icui18n/collationdatareader.cpp
// 12343 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2013-2015, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* collationdatareader.cpp8*9* created on: 2013feb0710* created by: Markus W. Scherer11*/1213#include "unicode/utypes.h"1415#if !UCONFIG_NO_COLLATION1617#include "unicode/ucol.h"18#include "unicode/udata.h"19#include "unicode/uscript.h"20#include "cmemory.h"21#include "collation.h"22#include "collationdata.h"23#include "collationdatareader.h"24#include "collationfastlatin.h"25#include "collationkeys.h"26#include "collationrootelements.h"27#include "collationsettings.h"28#include "collationtailoring.h"29#include "collunsafe.h"30#include "normalizer2impl.h"31#include "uassert.h"32#include "ucmndata.h"33#include "utrie2.h"3435U_NAMESPACE_BEGIN3637namespace {3839int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {40return (i < length) ? 
indexes[i] : -1;41}4243} // namespace4445void46CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,47CollationTailoring &tailoring, UErrorCode &errorCode) {48if(U_FAILURE(errorCode)) { return; }49if(base != NULL) {50if(inBytes == NULL || (0 <= inLength && inLength < 24)) {51errorCode = U_ILLEGAL_ARGUMENT_ERROR;52return;53}54const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);55if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&56isAcceptable(tailoring.version, NULL, NULL, &header->info))) {57errorCode = U_INVALID_FORMAT_ERROR;58return;59}60if(base->getUCAVersion() != tailoring.getUCAVersion()) {61errorCode = U_COLLATOR_VERSION_MISMATCH;62return;63}64int32_t headerLength = header->dataHeader.headerSize;65inBytes += headerLength;66if(inLength >= 0) {67inLength -= headerLength;68}69}7071if(inBytes == NULL || (0 <= inLength && inLength < 8)) {72errorCode = U_ILLEGAL_ARGUMENT_ERROR;73return;74}75const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);76int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];77if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {78errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.79return;80}8182// Assume that the tailoring data is in initial state,83// with NULL pointers and 0 lengths.8485// Set pointers to non-empty data parts.86// Do this in order of their byte offsets. 
(Should help porting to Java.)8788int32_t index; // one of the indexes[] slots89int32_t offset; // byte offset for the index part90int32_t length; // number of bytes in the index part9192if(indexesLength > IX_TOTAL_SIZE) {93length = inIndexes[IX_TOTAL_SIZE];94} else if(indexesLength > IX_REORDER_CODES_OFFSET) {95length = inIndexes[indexesLength - 1];96} else {97length = 0; // only indexes, and inLength was already checked for them98}99if(0 <= inLength && inLength < length) {100errorCode = U_INVALID_FORMAT_ERROR;101return;102}103104const CollationData *baseData = base == NULL ? NULL : base->data;105const int32_t *reorderCodes = NULL;106int32_t reorderCodesLength = 0;107const uint32_t *reorderRanges = NULL;108int32_t reorderRangesLength = 0;109index = IX_REORDER_CODES_OFFSET;110offset = getIndex(inIndexes, indexesLength, index);111length = getIndex(inIndexes, indexesLength, index + 1) - offset;112if(length >= 4) {113if(baseData == NULL) {114// We assume for collation settings that115// the base data does not have a reordering.116errorCode = U_INVALID_FORMAT_ERROR;117return;118}119reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);120reorderCodesLength = length / 4;121122// The reorderRanges (if any) are the trailing reorderCodes entries.123// Split the array at the boundary.124// Script or reorder codes do not exceed 16-bit values.125// Range limits are stored in the upper 16 bits, and are never 0.126while(reorderRangesLength < reorderCodesLength &&127(reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {128++reorderRangesLength;129}130U_ASSERT(reorderRangesLength < reorderCodesLength);131if(reorderRangesLength != 0) {132reorderCodesLength -= reorderRangesLength;133reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);134}135}136137// There should be a reorder table only if there are reorder codes.138// However, when there are reorder codes the reorder table may be omitted to reduce139// the 
data size.140const uint8_t *reorderTable = NULL;141index = IX_REORDER_TABLE_OFFSET;142offset = getIndex(inIndexes, indexesLength, index);143length = getIndex(inIndexes, indexesLength, index + 1) - offset;144if(length >= 256) {145if(reorderCodesLength == 0) {146errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes.147return;148}149reorderTable = inBytes + offset;150} else {151// If we have reorder codes, then build the reorderTable at the end,152// when the CollationData is otherwise complete.153}154155if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {156errorCode = U_INVALID_FORMAT_ERROR;157return;158}159CollationData *data = NULL; // Remains NULL if there are no mappings.160161index = IX_TRIE_OFFSET;162offset = getIndex(inIndexes, indexesLength, index);163length = getIndex(inIndexes, indexesLength, index + 1) - offset;164if(length >= 8) {165if(!tailoring.ensureOwnedData(errorCode)) { return; }166data = tailoring.ownedData;167data->base = baseData;168data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;169data->trie = tailoring.trie = utrie2_openFromSerialized(170UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,171&errorCode);172if(U_FAILURE(errorCode)) { return; }173} else if(baseData != NULL) {174// Use the base data. 
Only the settings are tailored.175tailoring.data = baseData;176} else {177errorCode = U_INVALID_FORMAT_ERROR; // No mappings.178return;179}180181index = IX_CES_OFFSET;182offset = getIndex(inIndexes, indexesLength, index);183length = getIndex(inIndexes, indexesLength, index + 1) - offset;184if(length >= 8) {185if(data == NULL) {186errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie.187return;188}189data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);190data->cesLength = length / 8;191}192193index = IX_CE32S_OFFSET;194offset = getIndex(inIndexes, indexesLength, index);195length = getIndex(inIndexes, indexesLength, index + 1) - offset;196if(length >= 4) {197if(data == NULL) {198errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie.199return;200}201data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);202data->ce32sLength = length / 4;203}204205int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);206if(jamoCE32sStart >= 0) {207if(data == NULL || data->ce32s == NULL) {208errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[].209return;210}211data->jamoCE32s = data->ce32s + jamoCE32sStart;212} else if(data == NULL) {213// Nothing to do.214} else if(baseData != NULL) {215data->jamoCE32s = baseData->jamoCE32s;216} else {217errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing.218return;219}220221index = IX_ROOT_ELEMENTS_OFFSET;222offset = getIndex(inIndexes, indexesLength, index);223length = getIndex(inIndexes, indexesLength, index + 1) - offset;224if(length >= 4) {225length /= 4;226if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {227errorCode = U_INVALID_FORMAT_ERROR;228return;229}230data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);231data->rootElementsLength = length;232uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];233if(commonSecTer != 
Collation::COMMON_SEC_AND_TER_CE) {234errorCode = U_INVALID_FORMAT_ERROR;235return;236}237uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];238if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {239// [fixed last secondary common byte] is too low,240// and secondary weights would collide with compressed common secondaries.241errorCode = U_INVALID_FORMAT_ERROR;242return;243}244}245246index = IX_CONTEXTS_OFFSET;247offset = getIndex(inIndexes, indexesLength, index);248length = getIndex(inIndexes, indexesLength, index + 1) - offset;249if(length >= 2) {250if(data == NULL) {251errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie.252return;253}254data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);255data->contextsLength = length / 2;256}257258index = IX_UNSAFE_BWD_OFFSET;259offset = getIndex(inIndexes, indexesLength, index);260length = getIndex(inIndexes, indexesLength, index + 1) - offset;261if(length >= 2) {262if(data == NULL) {263errorCode = U_INVALID_FORMAT_ERROR;264return;265}266if(baseData == NULL) {267#if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)268tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);269if(tailoring.unsafeBackwardSet == NULL) {270errorCode = U_MEMORY_ALLOCATION_ERROR;271return;272} else if (U_FAILURE(errorCode)) {273return;274}275#else276// Create the unsafe-backward set for the root collator.277// Include all non-zero combining marks and trail surrogates.278// We do this at load time, rather than at build time,279// to simplify Unicode version bootstrapping:280// The root data builder only needs the new FractionalUCA.txt data,281// but it need not be built with a version of ICU already updated to282// the corresponding new Unicode Character Database.283//284// The following is an optimized version of285// new 
UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").286// It is faster and requires fewer code dependencies.287tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates288if(tailoring.unsafeBackwardSet == NULL) {289errorCode = U_MEMORY_ALLOCATION_ERROR;290return;291}292data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);293#endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION294} else {295// Clone the root collator's set contents.296tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(297baseData->unsafeBackwardSet->cloneAsThawed());298if(tailoring.unsafeBackwardSet == NULL) {299errorCode = U_MEMORY_ALLOCATION_ERROR;300return;301}302}303// Add the ranges from the data file to the unsafe-backward set.304USerializedSet sset;305const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);306if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {307errorCode = U_INVALID_FORMAT_ERROR;308return;309}310int32_t count = uset_getSerializedRangeCount(&sset);311for(int32_t i = 0; i < count; ++i) {312UChar32 start, end;313uset_getSerializedRange(&sset, i, &start, &end);314tailoring.unsafeBackwardSet->add(start, end);315}316// Mark each lead surrogate as "unsafe"317// if any of its 1024 associated supplementary code points is "unsafe".318UChar32 c = 0x10000;319for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {320if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {321tailoring.unsafeBackwardSet->add(lead);322}323}324tailoring.unsafeBackwardSet->freeze();325data->unsafeBackwardSet = tailoring.unsafeBackwardSet;326} else if(data == NULL) {327// Nothing to do.328} else if(baseData != NULL) {329// No tailoring-specific data: Alias the root collator's set.330data->unsafeBackwardSet = baseData->unsafeBackwardSet;331} else {332errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.333return;334}335336// If the fast Latin format version is different,337// or the version is set to 0 for "no fast Latin 
table",338// then just always use the normal string comparison path.339if(data != NULL) {340data->fastLatinTable = NULL;341data->fastLatinTableLength = 0;342if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {343index = IX_FAST_LATIN_TABLE_OFFSET;344offset = getIndex(inIndexes, indexesLength, index);345length = getIndex(inIndexes, indexesLength, index + 1) - offset;346if(length >= 2) {347data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);348data->fastLatinTableLength = length / 2;349if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {350errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch351return;352}353} else if(baseData != NULL) {354data->fastLatinTable = baseData->fastLatinTable;355data->fastLatinTableLength = baseData->fastLatinTableLength;356}357}358}359360index = IX_SCRIPTS_OFFSET;361offset = getIndex(inIndexes, indexesLength, index);362length = getIndex(inIndexes, indexesLength, index + 1) - offset;363if(length >= 2) {364if(data == NULL) {365errorCode = U_INVALID_FORMAT_ERROR;366return;367}368const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);369int32_t scriptsLength = length / 2;370data->numScripts = scripts[0];371// There must be enough entries for both arrays, including more than two range starts.372data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);373if(data->scriptStartsLength <= 2 ||374CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {375errorCode = U_INVALID_FORMAT_ERROR;376return;377}378data->scriptsIndex = scripts + 1;379data->scriptStarts = scripts + 1 + data->numScripts + 16;380if(!(data->scriptStarts[0] == 0 &&381data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&382data->scriptStarts[data->scriptStartsLength - 1] ==383(Collation::TRAIL_WEIGHT_BYTE << 8))) {384errorCode = U_INVALID_FORMAT_ERROR;385return;386}387} else if(data == NULL) {388// Nothing to do.389} else if(baseData != 
NULL) {390data->numScripts = baseData->numScripts;391data->scriptsIndex = baseData->scriptsIndex;392data->scriptStarts = baseData->scriptStarts;393data->scriptStartsLength = baseData->scriptStartsLength;394}395396index = IX_COMPRESSIBLE_BYTES_OFFSET;397offset = getIndex(inIndexes, indexesLength, index);398length = getIndex(inIndexes, indexesLength, index + 1) - offset;399if(length >= 256) {400if(data == NULL) {401errorCode = U_INVALID_FORMAT_ERROR;402return;403}404data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);405} else if(data == NULL) {406// Nothing to do.407} else if(baseData != NULL) {408data->compressibleBytes = baseData->compressibleBytes;409} else {410errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].411return;412}413414const CollationSettings &ts = *tailoring.settings;415int32_t options = inIndexes[IX_OPTIONS] & 0xffff;416uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];417int32_t fastLatinOptions = CollationFastLatin::getOptions(418tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));419if(options == ts.options && ts.variableTop != 0 &&420reorderCodesLength == ts.reorderCodesLength &&421(reorderCodesLength == 0 ||422uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0) &&423fastLatinOptions == ts.fastLatinOptions &&424(fastLatinOptions < 0 ||425uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,426sizeof(fastLatinPrimaries)) == 0)) {427return;428}429430CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);431if(settings == NULL) {432errorCode = U_MEMORY_ALLOCATION_ERROR;433return;434}435settings->options = options;436// Set variableTop from options and scripts data.437settings->variableTop = tailoring.data->getLastPrimaryForGroup(438UCOL_REORDER_CODE_FIRST + int32_t{settings->getMaxVariable()});439if(settings->variableTop == 0) {440errorCode = U_INVALID_FORMAT_ERROR;441return;442}443444if(reorderCodesLength != 0) 
{445settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,446reorderRanges, reorderRangesLength,447reorderTable, errorCode);448}449450settings->fastLatinOptions = CollationFastLatin::getOptions(451tailoring.data, *settings,452settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));453}454455UBool U_CALLCONV456CollationDataReader::isAcceptable(void *context,457const char * /* type */, const char * /*name*/,458const UDataInfo *pInfo) {459if(460pInfo->size >= 20 &&461pInfo->isBigEndian == U_IS_BIG_ENDIAN &&462pInfo->charsetFamily == U_CHARSET_FAMILY &&463pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"464pInfo->dataFormat[1] == 0x43 &&465pInfo->dataFormat[2] == 0x6f &&466pInfo->dataFormat[3] == 0x6c &&467pInfo->formatVersion[0] == 5468) {469UVersionInfo *version = static_cast<UVersionInfo *>(context);470if(version != NULL) {471uprv_memcpy(version, pInfo->dataVersion, 4);472}473return true;474} else {475return false;476}477}478479U_NAMESPACE_END480481#endif // !UCONFIG_NO_COLLATION482483484