// Path: blob/master/thirdparty/icu4c/common/dictionarydata.cpp
// (web-viewer residue: 9902 views)
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2014-2016, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* dictionarydata.h8*9* created on: 2012may3110* created by: Markus W. Scherer & Maxime Serrano11*/1213#include "dictionarydata.h"14#include "unicode/ucharstrie.h"15#include "unicode/bytestrie.h"16#include "unicode/udata.h"17#include "cmemory.h"1819#if !UCONFIG_NO_BREAK_ITERATION2021U_NAMESPACE_BEGIN2223const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;24const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;25const int32_t DictionaryData::TRIE_TYPE_MASK = 7;26const int32_t DictionaryData::TRIE_HAS_VALUES = 8;2728const int32_t DictionaryData::TRANSFORM_NONE = 0;29const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;30const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;31const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;3233DictionaryMatcher::~DictionaryMatcher() {34}3536UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {37udata_close(file);38}3940int32_t UCharsDictionaryMatcher::getType() const {41return DictionaryData::TRIE_TYPE_UCHARS;42}4344int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,45int32_t *lengths, int32_t *cpLengths, int32_t *values,46int32_t *prefix) const {4748UCharsTrie uct(characters);49int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));50int32_t wordCount = 0;51int32_t codePointsMatched = 0;5253for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {54UStringTrieResult result = (codePointsMatched == 0) ? 
uct.first(c) : uct.next(c);55int32_t lengthMatched = static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;56codePointsMatched += 1;57if (USTRINGTRIE_HAS_VALUE(result)) {58if (wordCount < limit) {59if (values != nullptr) {60values[wordCount] = uct.getValue();61}62if (lengths != nullptr) {63lengths[wordCount] = lengthMatched;64}65if (cpLengths != nullptr) {66cpLengths[wordCount] = codePointsMatched;67}68++wordCount;69}70if (result == USTRINGTRIE_FINAL_VALUE) {71break;72}73}74else if (result == USTRINGTRIE_NO_MATCH) {75break;76}77if (lengthMatched >= maxLength) {78break;79}80}8182if (prefix != nullptr) {83*prefix = codePointsMatched;84}85return wordCount;86}8788BytesDictionaryMatcher::~BytesDictionaryMatcher() {89udata_close(file);90}9192UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {93if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {94if (c == 0x200D) {95return 0xFF;96} else if (c == 0x200C) {97return 0xFE;98}99int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);100if (delta < 0 || 0xFD < delta) {101return U_SENTINEL;102}103return static_cast<UChar32>(delta);104}105return c;106}107108int32_t BytesDictionaryMatcher::getType() const {109return DictionaryData::TRIE_TYPE_BYTES;110}111112int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,113int32_t *lengths, int32_t *cpLengths, int32_t *values,114int32_t *prefix) const {115BytesTrie bt(characters);116int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));117int32_t wordCount = 0;118int32_t codePointsMatched = 0;119120for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {121UStringTrieResult result = (codePointsMatched == 0) ? 
bt.first(transform(c)) : bt.next(transform(c));122int32_t lengthMatched = static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;123codePointsMatched += 1;124if (USTRINGTRIE_HAS_VALUE(result)) {125if (wordCount < limit) {126if (values != nullptr) {127values[wordCount] = bt.getValue();128}129if (lengths != nullptr) {130lengths[wordCount] = lengthMatched;131}132if (cpLengths != nullptr) {133cpLengths[wordCount] = codePointsMatched;134}135++wordCount;136}137if (result == USTRINGTRIE_FINAL_VALUE) {138break;139}140}141else if (result == USTRINGTRIE_NO_MATCH) {142break;143}144if (lengthMatched >= maxLength) {145break;146}147}148149if (prefix != nullptr) {150*prefix = codePointsMatched;151}152return wordCount;153}154155156U_NAMESPACE_END157158U_NAMESPACE_USE159160U_CAPI int32_t U_EXPORT2161udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,162void *outData, UErrorCode *pErrorCode) {163const UDataInfo *pInfo;164int32_t headerSize;165const uint8_t *inBytes;166uint8_t *outBytes;167const int32_t *inIndexes;168int32_t indexes[DictionaryData::IX_COUNT];169int32_t i, offset, size;170171headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);172if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) return 0;173pInfo = (const UDataInfo *)((const char *)inData + 4);174if (!(pInfo->dataFormat[0] == 0x44 &&175pInfo->dataFormat[1] == 0x69 &&176pInfo->dataFormat[2] == 0x63 &&177pInfo->dataFormat[3] == 0x74 &&178pInfo->formatVersion[0] == 1)) {179udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",180pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);181*pErrorCode = U_UNSUPPORTED_ERROR;182return 0;183}184185inBytes = (const uint8_t *)inData + headerSize;186outBytes = (outData == nullptr) ? 
nullptr : (uint8_t *)outData + headerSize;187188inIndexes = (const int32_t *)inBytes;189if (length >= 0) {190length -= headerSize;191if (length < (int32_t)(sizeof(indexes))) {192udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);193*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;194return 0;195}196}197198for (i = 0; i < DictionaryData::IX_COUNT; i++) {199indexes[i] = udata_readInt32(ds, inIndexes[i]);200}201202size = indexes[DictionaryData::IX_TOTAL_SIZE];203204if (length >= 0) {205if (length < size) {206udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);207*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;208return 0;209}210211if (inBytes != outBytes) {212uprv_memcpy(outBytes, inBytes, size);213}214215offset = 0;216ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);217offset = (int32_t)sizeof(indexes);218int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;219int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];220221if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {222ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);223} else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {224// nothing to do225} else {226udata_printError(ds, "udict_swap(): unknown trie type!\n");227*pErrorCode = U_UNSUPPORTED_ERROR;228return 0;229}230231// these next two sections are empty in the current format,232// but may be used later.233offset = nextOffset;234nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];235offset = nextOffset;236nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];237offset = nextOffset;238}239return headerSize + size;240}241#endif242243244