Path: blob/master/thirdparty/icu4c/common/bytestriebuilder.cpp
9903 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2010-2012, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* file name: bytestriebuilder.cpp8* encoding: UTF-89* tab size: 8 (not used)10* indentation:411*12* created on: 2010sep2513* created by: Markus W. Scherer14*/1516#include "unicode/utypes.h"17#include "unicode/bytestrie.h"18#include "unicode/bytestriebuilder.h"19#include "unicode/stringpiece.h"20#include "charstr.h"21#include "cmemory.h"22#include "uhash.h"23#include "uarrsort.h"24#include "uassert.h"25#include "ustr_imp.h"2627U_NAMESPACE_BEGIN2829/*30* Note: This builder implementation stores (bytes, value) pairs with full copies31* of the byte sequences, until the BytesTrie is built.32* It might(!) take less memory if we collected the data in a temporary, dynamic trie.33*/3435class BytesTrieElement : public UMemory {36public:37// Use compiler's default constructor, initializes nothing.3839void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);4041StringPiece getString(const CharString &strings) const {42int32_t offset=stringOffset;43int32_t length;44if(offset>=0) {45length = static_cast<uint8_t>(strings[offset++]);46} else {47offset=~offset;48length = (static_cast<int32_t>(static_cast<uint8_t>(strings[offset])) << 8) | static_cast<uint8_t>(strings[offset + 1]);49offset+=2;50}51return StringPiece(strings.data()+offset, length);52}53int32_t getStringLength(const CharString &strings) const {54int32_t offset=stringOffset;55if(offset>=0) {56return static_cast<uint8_t>(strings[offset]);57} else {58offset=~offset;59return (static_cast<int32_t>(static_cast<uint8_t>(strings[offset])) << 8) | static_cast<uint8_t>(strings[offset + 1]);60}61}6263char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }6465int32_t getValue() const { return value; }6667int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;6869private:70const char *data(const CharString &strings) const {71int32_t offset=stringOffset;72if(offset>=0) {73++offset;74} else {75offset=~offset+2;76}77return strings.data()+offset;78}7980// If the stringOffset is non-negative, then the first strings byte contains81// the string length.82// If the stringOffset is negative, then the first two strings bytes contain83// the string length (big-endian), and the offset needs to be bit-inverted.84// (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)85int32_t stringOffset;86int32_t value;87};8889void90BytesTrieElement::setTo(StringPiece s, int32_t val,91CharString &strings, UErrorCode &errorCode) {92if(U_FAILURE(errorCode)) {93return;94}95int32_t length=s.length();96if(length>0xffff) {97// Too long: We store the length in 1 or 2 bytes.98errorCode=U_INDEX_OUTOFBOUNDS_ERROR;99return;100}101int32_t offset=strings.length();102if(length>0xff) {103offset=~offset;104strings.append(static_cast<char>(length >> 8), errorCode);105}106strings.append(static_cast<char>(length), errorCode);107stringOffset=offset;108value=val;109strings.append(s, errorCode);110}111112int32_t113BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {114// TODO: add StringPiece::compare(), see ticket #8187115StringPiece thisString=getString(strings);116StringPiece otherString=other.getString(strings);117int32_t lengthDiff=thisString.length()-otherString.length();118int32_t commonLength;119if(lengthDiff<=0) {120commonLength=thisString.length();121} else {122commonLength=otherString.length();123}124int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);125return diff!=0 ? diff : lengthDiff;126}127128BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)129: strings(nullptr), elements(nullptr), elementsCapacity(0), elementsLength(0),130bytes(nullptr), bytesCapacity(0), bytesLength(0) {131if(U_FAILURE(errorCode)) {132return;133}134strings=new CharString();135if(strings==nullptr) {136errorCode=U_MEMORY_ALLOCATION_ERROR;137}138}139140BytesTrieBuilder::~BytesTrieBuilder() {141delete strings;142delete[] elements;143uprv_free(bytes);144}145146BytesTrieBuilder &147BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {148if(U_FAILURE(errorCode)) {149return *this;150}151if(bytesLength>0) {152// Cannot add elements after building.153errorCode=U_NO_WRITE_PERMISSION;154return *this;155}156if(elementsLength==elementsCapacity) {157int32_t newCapacity;158if(elementsCapacity==0) {159newCapacity=1024;160} else {161newCapacity=4*elementsCapacity;162}163BytesTrieElement *newElements=new BytesTrieElement[newCapacity];164if(newElements==nullptr) {165errorCode=U_MEMORY_ALLOCATION_ERROR;166return *this; // error instead of dereferencing null167}168if(elementsLength>0) {169uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));170}171delete[] elements;172elements=newElements;173elementsCapacity=newCapacity;174}175elements[elementsLength++].setTo(s, value, *strings, errorCode);176return *this;177}178179U_CDECL_BEGIN180181static int32_t U_CALLCONV182compareElementStrings(const void *context, const void *left, const void *right) {183const CharString *strings=static_cast<const CharString *>(context);184const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);185const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);186return leftElement->compareStringTo(*rightElement, *strings);187}188189U_CDECL_END190191BytesTrie *192BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {193buildBytes(buildOption, errorCode);194BytesTrie *newTrie=nullptr;195if(U_SUCCESS(errorCode)) {196newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));197if(newTrie==nullptr) {198errorCode=U_MEMORY_ALLOCATION_ERROR;199} else {200bytes=nullptr; // The new trie now owns the array.201bytesCapacity=0;202}203}204return newTrie;205}206207StringPiece208BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {209buildBytes(buildOption, errorCode);210StringPiece result;211if(U_SUCCESS(errorCode)) {212result.set(bytes+(bytesCapacity-bytesLength), bytesLength);213}214return result;215}216217void218BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {219if(U_FAILURE(errorCode)) {220return;221}222if(bytes!=nullptr && bytesLength>0) {223// Already built.224return;225}226if(bytesLength==0) {227if(elementsLength==0) {228errorCode=U_INDEX_OUTOFBOUNDS_ERROR;229return;230}231uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(BytesTrieElement)),232compareElementStrings, strings,233false, // need not be a stable sort234&errorCode);235if(U_FAILURE(errorCode)) {236return;237}238// Duplicate strings are not allowed.239StringPiece prev=elements[0].getString(*strings);240for(int32_t i=1; i<elementsLength; ++i) {241StringPiece current=elements[i].getString(*strings);242if(prev==current) {243errorCode=U_ILLEGAL_ARGUMENT_ERROR;244return;245}246prev=current;247}248}249// Create and byte-serialize the trie for the elements.250bytesLength=0;251int32_t capacity=strings->length();252if(capacity<1024) {253capacity=1024;254}255if(bytesCapacity<capacity) {256uprv_free(bytes);257bytes=static_cast<char *>(uprv_malloc(capacity));258if(bytes==nullptr) {259errorCode=U_MEMORY_ALLOCATION_ERROR;260bytesCapacity=0;261return;262}263bytesCapacity=capacity;264}265StringTrieBuilder::build(buildOption, elementsLength, errorCode);266if(bytes==nullptr) {267errorCode=U_MEMORY_ALLOCATION_ERROR;268}269}270271BytesTrieBuilder &272BytesTrieBuilder::clear() {273strings->clear();274elementsLength=0;275bytesLength=0;276return *this;277}278279int32_t280BytesTrieBuilder::getElementStringLength(int32_t i) const {281return elements[i].getStringLength(*strings);282}283284char16_t285BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {286return static_cast<uint8_t>(elements[i].charAt(byteIndex, *strings));287}288289int32_t290BytesTrieBuilder::getElementValue(int32_t i) const {291return elements[i].getValue();292}293294int32_t295BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {296const BytesTrieElement &firstElement=elements[first];297const BytesTrieElement &lastElement=elements[last];298int32_t minStringLength=firstElement.getStringLength(*strings);299while(++byteIndex<minStringLength &&300firstElement.charAt(byteIndex, *strings)==301lastElement.charAt(byteIndex, *strings)) {}302return byteIndex;303}304305int32_t306BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {307int32_t length=0; // Number of different bytes at byteIndex.308int32_t i=start;309do {310char byte=elements[i++].charAt(byteIndex, *strings);311while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {312++i;313}314++length;315} while(i<limit);316return length;317}318319int32_t320BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {321do {322char byte=elements[i++].charAt(byteIndex, *strings);323while(byte==elements[i].charAt(byteIndex, *strings)) {324++i;325}326} while(--count>0);327return i;328}329330int32_t331BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, char16_t byte) const {332char b = static_cast<char>(byte);333while(b==elements[i].charAt(byteIndex, *strings)) {334++i;335}336return i;337}338339BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)340: LinearMatchNode(len, nextNode), s(bytes) {341hash=static_cast<int32_t>(342static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len)));343}344345bool346BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {347if(this==&other) {348return true;349}350if(!LinearMatchNode::operator==(other)) {351return false;352}353const BTLinearMatchNode &o=static_cast<const BTLinearMatchNode &>(other);354return 0==uprv_memcmp(s, o.s, length);355}356357void358BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {359BytesTrieBuilder &b=static_cast<BytesTrieBuilder &>(builder);360next->write(builder);361b.write(s, length);362offset=b.write(b.getMinLinearMatch()+length-1);363}364365StringTrieBuilder::Node *366BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,367Node *nextNode) const {368return new BTLinearMatchNode(369elements[i].getString(*strings).data()+byteIndex,370length,371nextNode);372}373374UBool375BytesTrieBuilder::ensureCapacity(int32_t length) {376if(bytes==nullptr) {377return false; // previous memory allocation had failed378}379if(length>bytesCapacity) {380int32_t newCapacity=bytesCapacity;381do {382newCapacity*=2;383} while(newCapacity<=length);384char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));385if(newBytes==nullptr) {386// unable to allocate memory387uprv_free(bytes);388bytes=nullptr;389bytesCapacity=0;390return false;391}392uprv_memcpy(newBytes+(newCapacity-bytesLength),393bytes+(bytesCapacity-bytesLength), bytesLength);394uprv_free(bytes);395bytes=newBytes;396bytesCapacity=newCapacity;397}398return true;399}400401int32_t402BytesTrieBuilder::write(int32_t byte) {403int32_t newLength=bytesLength+1;404if(ensureCapacity(newLength)) {405bytesLength=newLength;406bytes[bytesCapacity - bytesLength] = static_cast<char>(byte);407}408return bytesLength;409}410411int32_t412BytesTrieBuilder::write(const char *b, int32_t length) {413int32_t newLength=bytesLength+length;414if(ensureCapacity(newLength)) {415bytesLength=newLength;416uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);417}418return bytesLength;419}420421int32_t422BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {423return write(elements[i].getString(*strings).data()+byteIndex, length);424}425426int32_t427BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {428if(0<=i && i<=BytesTrie::kMaxOneByteValue) {429return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);430}431char intBytes[5];432int32_t length=1;433if(i<0 || i>0xffffff) {434intBytes[0] = static_cast<char>(BytesTrie::kFiveByteValueLead);435intBytes[1] = static_cast<char>(static_cast<uint32_t>(i) >> 24);436intBytes[2] = static_cast<char>(static_cast<uint32_t>(i) >> 16);437intBytes[3] = static_cast<char>(static_cast<uint32_t>(i) >> 8);438intBytes[4] = static_cast<char>(i);439length=5;440// } else if(i<=BytesTrie::kMaxOneByteValue) {441// intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);442} else {443if(i<=BytesTrie::kMaxTwoByteValue) {444intBytes[0] = static_cast<char>(BytesTrie::kMinTwoByteValueLead + (i >> 8));445} else {446if(i<=BytesTrie::kMaxThreeByteValue) {447intBytes[0] = static_cast<char>(BytesTrie::kMinThreeByteValueLead + (i >> 16));448} else {449intBytes[0] = static_cast<char>(BytesTrie::kFourByteValueLead);450intBytes[1] = static_cast<char>(i >> 16);451length=2;452}453intBytes[length++] = static_cast<char>(i >> 8);454}455intBytes[length++] = static_cast<char>(i);456}457intBytes[0] = static_cast<char>((intBytes[0] << 1) | isFinal);458return write(intBytes, length);459}460461int32_t462BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {463int32_t offset=write(node);464if(hasValue) {465offset=writeValueAndFinal(value, false);466}467return offset;468}469470int32_t471BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {472int32_t i=bytesLength-jumpTarget;473U_ASSERT(i>=0);474if(i<=BytesTrie::kMaxOneByteDelta) {475return write(i);476} else {477char intBytes[5];478return write(intBytes, internalEncodeDelta(i, intBytes));479}480}481482int32_t483BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {484U_ASSERT(i>=0);485if(i<=BytesTrie::kMaxOneByteDelta) {486intBytes[0] = static_cast<char>(i);487return 1;488}489int32_t length=1;490if(i<=BytesTrie::kMaxTwoByteDelta) {491intBytes[0] = static_cast<char>(BytesTrie::kMinTwoByteDeltaLead + (i >> 8));492} else {493if(i<=BytesTrie::kMaxThreeByteDelta) {494intBytes[0] = static_cast<char>(BytesTrie::kMinThreeByteDeltaLead + (i >> 16));495} else {496if(i<=0xffffff) {497intBytes[0] = static_cast<char>(BytesTrie::kFourByteDeltaLead);498} else {499intBytes[0] = static_cast<char>(BytesTrie::kFiveByteDeltaLead);500intBytes[1] = static_cast<char>(i >> 24);501length=2;502}503intBytes[length++] = static_cast<char>(i >> 16);504}505intBytes[length++] = static_cast<char>(i >> 8);506}507intBytes[length++] = static_cast<char>(i);508return length;509}510511U_NAMESPACE_END512513514