Path: blob/master/thirdparty/icu4c/common/bytestrie.cpp
9904 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2010-2011, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* file name: bytestrie.cpp8* encoding: UTF-89* tab size: 8 (not used)10* indentation:411*12* created on: 2010sep2513* created by: Markus W. Scherer14*/1516#include "unicode/utypes.h"17#include "unicode/bytestream.h"18#include "unicode/bytestrie.h"19#include "unicode/uobject.h"20#include "cmemory.h"21#include "uassert.h"2223U_NAMESPACE_BEGIN2425BytesTrie::~BytesTrie() {26uprv_free(ownedArray_);27}2829// lead byte already shifted right by 1.30int32_t31BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {32int32_t value;33if(leadByte<kMinTwoByteValueLead) {34value=leadByte-kMinOneByteValueLead;35} else if(leadByte<kMinThreeByteValueLead) {36value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;37} else if(leadByte<kFourByteValueLead) {38value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];39} else if(leadByte==kFourByteValueLead) {40value=(pos[0]<<16)|(pos[1]<<8)|pos[2];41} else {42value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];43}44return value;45}4647const uint8_t *48BytesTrie::jumpByDelta(const uint8_t *pos) {49int32_t delta=*pos++;50if(delta<kMinTwoByteDeltaLead) {51// nothing to do52} else if(delta<kMinThreeByteDeltaLead) {53delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;54} else if(delta<kFourByteDeltaLead) {55delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];56pos+=2;57} else if(delta==kFourByteDeltaLead) {58delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];59pos+=3;60} else {61delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];62pos+=4;63}64return pos+delta;65}6667UStringTrieResult68BytesTrie::current() const {69const uint8_t *pos=pos_;70if(pos==nullptr) {71return USTRINGTRIE_NO_MATCH;72} else {73int32_t node;74return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?75valueResult(node) : USTRINGTRIE_NO_VALUE;76}77}7879UStringTrieResult80BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {81// Branch according to the current byte.82if(length==0) {83length=*pos++;84}85++length;86// The length of the branch is the number of bytes to select from.87// The data structure encodes a binary search.88while(length>kMaxBranchLinearSubNodeLength) {89if(inByte<*pos++) {90length>>=1;91pos=jumpByDelta(pos);92} else {93length=length-(length>>1);94pos=skipDelta(pos);95}96}97// Drop down to linear search for the last few bytes.98// length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=399// and divides length by 2.100do {101if(inByte==*pos++) {102UStringTrieResult result;103int32_t node=*pos;104U_ASSERT(node>=kMinValueLead);105if(node&kValueIsFinal) {106// Leave the final value for getValue() to read.107result=USTRINGTRIE_FINAL_VALUE;108} else {109// Use the non-final value as the jump delta.110++pos;111// int32_t delta=readValue(pos, node>>1);112node>>=1;113int32_t delta;114if(node<kMinTwoByteValueLead) {115delta=node-kMinOneByteValueLead;116} else if(node<kMinThreeByteValueLead) {117delta=((node-kMinTwoByteValueLead)<<8)|*pos++;118} else if(node<kFourByteValueLead) {119delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];120pos+=2;121} else if(node==kFourByteValueLead) {122delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];123pos+=3;124} else {125delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];126pos+=4;127}128// end readValue()129pos+=delta;130node=*pos;131result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;132}133pos_=pos;134return result;135}136--length;137pos=skipValue(pos);138} while(length>1);139if(inByte==*pos++) {140pos_=pos;141int32_t node=*pos;142return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;143} else {144stop();145return USTRINGTRIE_NO_MATCH;146}147}148149UStringTrieResult150BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {151for(;;) {152int32_t node=*pos++;153if(node<kMinLinearMatch) {154return branchNext(pos, node, inByte);155} else if(node<kMinValueLead) {156// Match the first of length+1 bytes.157int32_t length=node-kMinLinearMatch; // Actual match length minus 1.158if(inByte==*pos++) {159remainingMatchLength_=--length;160pos_=pos;161return (length<0 && (node=*pos)>=kMinValueLead) ?162valueResult(node) : USTRINGTRIE_NO_VALUE;163} else {164// No match.165break;166}167} else if(node&kValueIsFinal) {168// No further matching bytes.169break;170} else {171// Skip intermediate value.172pos=skipValue(pos, node);173// The next node must not also be a value node.174U_ASSERT(*pos<kMinValueLead);175}176}177stop();178return USTRINGTRIE_NO_MATCH;179}180181UStringTrieResult182BytesTrie::next(int32_t inByte) {183const uint8_t *pos=pos_;184if(pos==nullptr) {185return USTRINGTRIE_NO_MATCH;186}187if(inByte<0) {188inByte+=0x100;189}190int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.191if(length>=0) {192// Remaining part of a linear-match node.193if(inByte==*pos++) {194remainingMatchLength_=--length;195pos_=pos;196int32_t node;197return (length<0 && (node=*pos)>=kMinValueLead) ?198valueResult(node) : USTRINGTRIE_NO_VALUE;199} else {200stop();201return USTRINGTRIE_NO_MATCH;202}203}204return nextImpl(pos, inByte);205}206207UStringTrieResult208BytesTrie::next(const char *s, int32_t sLength) {209if(sLength<0 ? *s==0 : sLength==0) {210// Empty input.211return current();212}213const uint8_t *pos=pos_;214if(pos==nullptr) {215return USTRINGTRIE_NO_MATCH;216}217int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.218for(;;) {219// Fetch the next input byte, if there is one.220// Continue a linear-match node without rechecking sLength<0.221int32_t inByte;222if(sLength<0) {223for(;;) {224if((inByte=*s++)==0) {225remainingMatchLength_=length;226pos_=pos;227int32_t node;228return (length<0 && (node=*pos)>=kMinValueLead) ?229valueResult(node) : USTRINGTRIE_NO_VALUE;230}231if(length<0) {232remainingMatchLength_=length;233break;234}235if(inByte!=*pos) {236stop();237return USTRINGTRIE_NO_MATCH;238}239++pos;240--length;241}242} else {243for(;;) {244if(sLength==0) {245remainingMatchLength_=length;246pos_=pos;247int32_t node;248return (length<0 && (node=*pos)>=kMinValueLead) ?249valueResult(node) : USTRINGTRIE_NO_VALUE;250}251inByte=*s++;252--sLength;253if(length<0) {254remainingMatchLength_=length;255break;256}257if(inByte!=*pos) {258stop();259return USTRINGTRIE_NO_MATCH;260}261++pos;262--length;263}264}265for(;;) {266int32_t node=*pos++;267if(node<kMinLinearMatch) {268UStringTrieResult result=branchNext(pos, node, inByte);269if(result==USTRINGTRIE_NO_MATCH) {270return USTRINGTRIE_NO_MATCH;271}272// Fetch the next input byte, if there is one.273if(sLength<0) {274if((inByte=*s++)==0) {275return result;276}277} else {278if(sLength==0) {279return result;280}281inByte=*s++;282--sLength;283}284if(result==USTRINGTRIE_FINAL_VALUE) {285// No further matching bytes.286stop();287return USTRINGTRIE_NO_MATCH;288}289pos=pos_; // branchNext() advanced pos and wrote it to pos_ .290} else if(node<kMinValueLead) {291// Match length+1 bytes.292length=node-kMinLinearMatch; // Actual match length minus 1.293if(inByte!=*pos) {294stop();295return USTRINGTRIE_NO_MATCH;296}297++pos;298--length;299break;300} else if(node&kValueIsFinal) {301// No further matching bytes.302stop();303return USTRINGTRIE_NO_MATCH;304} else {305// Skip intermediate value.306pos=skipValue(pos, node);307// The next node must not also be a value node.308U_ASSERT(*pos<kMinValueLead);309}310}311}312}313314const uint8_t *315BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,316UBool haveUniqueValue, int32_t &uniqueValue) {317while(length>kMaxBranchLinearSubNodeLength) {318++pos; // ignore the comparison byte319if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {320return nullptr;321}322length=length-(length>>1);323pos=skipDelta(pos);324}325do {326++pos; // ignore a comparison byte327// handle its value328int32_t node=*pos++;329UBool isFinal = static_cast<UBool>(node & kValueIsFinal);330int32_t value=readValue(pos, node>>1);331pos=skipValue(pos, node);332if(isFinal) {333if(haveUniqueValue) {334if(value!=uniqueValue) {335return nullptr;336}337} else {338uniqueValue=value;339haveUniqueValue=true;340}341} else {342if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {343return nullptr;344}345haveUniqueValue=true;346}347} while(--length>1);348return pos+1; // ignore the last comparison byte349}350351UBool352BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {353for(;;) {354int32_t node=*pos++;355if(node<kMinLinearMatch) {356if(node==0) {357node=*pos++;358}359pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);360if(pos==nullptr) {361return false;362}363haveUniqueValue=true;364} else if(node<kMinValueLead) {365// linear-match node366pos+=node-kMinLinearMatch+1; // Ignore the match bytes.367} else {368UBool isFinal = static_cast<UBool>(node & kValueIsFinal);369int32_t value=readValue(pos, node>>1);370if(haveUniqueValue) {371if(value!=uniqueValue) {372return false;373}374} else {375uniqueValue=value;376haveUniqueValue=true;377}378if(isFinal) {379return true;380}381pos=skipValue(pos, node);382}383}384}385386int32_t387BytesTrie::getNextBytes(ByteSink &out) const {388const uint8_t *pos=pos_;389if(pos==nullptr) {390return 0;391}392if(remainingMatchLength_>=0) {393append(out, *pos); // Next byte of a pending linear-match node.394return 1;395}396int32_t node=*pos++;397if(node>=kMinValueLead) {398if(node&kValueIsFinal) {399return 0;400} else {401pos=skipValue(pos, node);402node=*pos++;403U_ASSERT(node<kMinValueLead);404}405}406if(node<kMinLinearMatch) {407if(node==0) {408node=*pos++;409}410getNextBranchBytes(pos, ++node, out);411return node;412} else {413// First byte of the linear-match node.414append(out, *pos);415return 1;416}417}418419void420BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {421while(length>kMaxBranchLinearSubNodeLength) {422++pos; // ignore the comparison byte423getNextBranchBytes(jumpByDelta(pos), length>>1, out);424length=length-(length>>1);425pos=skipDelta(pos);426}427do {428append(out, *pos++);429pos=skipValue(pos);430} while(--length>1);431append(out, *pos);432}433434void435BytesTrie::append(ByteSink &out, int c) {436char ch = static_cast<char>(c);437out.Append(&ch, 1);438}439440U_NAMESPACE_END441442443