Path: blob/jdk8u272-b10-aarch32-20201026/jdk/src/share/native/common/unicode/bytestrie.h
48729 views
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2010-2012, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* file name: bytestrie.h8* encoding: UTF-89* tab size: 8 (not used)10* indentation:411*12* created on: 2010sep2513* created by: Markus W. Scherer14*/1516#ifndef __BYTESTRIE_H__17#define __BYTESTRIE_H__1819/**20* \file21* \brief C++ API: Trie for mapping byte sequences to integer values.22*/2324#include "unicode/utypes.h"25#include "unicode/stringpiece.h"26#include "unicode/uobject.h"27#include "unicode/ustringtrie.h"2829U_NAMESPACE_BEGIN3031class ByteSink;32class BytesTrieBuilder;33class CharString;34class UVector32;3536/**37* Light-weight, non-const reader class for a BytesTrie.38* Traverses a byte-serialized data structure with minimal state,39* for mapping byte sequences to non-negative integer values.40*41* This class owns the serialized trie data only if it was constructed by42* the builder's build() method.43* The public constructor and the copy constructor only alias the data (only copy the pointer).44* There is no assignment operator.45*46* This class is not intended for public subclassing.47* @stable ICU 4.848*/49class U_COMMON_API BytesTrie : public UMemory {50public:51/**52* Constructs a BytesTrie reader instance.53*54* The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,55* starting with the first byte of that sequence.56* The BytesTrie object will not read more bytes than57* the BytesTrieBuilder generated in the corresponding build() call.58*59* The array is not copied/cloned and must not be modified while60* the BytesTrie object is in use.61*62* @param trieBytes The byte array that contains the serialized trie.63* @stable ICU 4.864*/65BytesTrie(const void *trieBytes)66: ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),67pos_(bytes_), remainingMatchLength_(-1) {}6869/**70* Destructor.71* @stable ICU 4.872*/73~BytesTrie();7475/**76* Copy constructor, copies the other trie reader object and its state,77* but not the byte array which will be shared. (Shallow copy.)78* @param other Another BytesTrie object.79* @stable ICU 4.880*/81BytesTrie(const BytesTrie &other)82: ownedArray_(NULL), bytes_(other.bytes_),83pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}8485/**86* Resets this trie to its initial state.87* @return *this88* @stable ICU 4.889*/90BytesTrie &reset() {91pos_=bytes_;92remainingMatchLength_=-1;93return *this;94}9596/**97* BytesTrie state object, for saving a trie's current state98* and resetting the trie back to this state later.99* @stable ICU 4.8100*/101class State : public UMemory {102public:103/**104* Constructs an empty State.105* @stable ICU 4.8106*/107State() { bytes=NULL; }108private:109friend class BytesTrie;110111const uint8_t *bytes;112const uint8_t *pos;113int32_t remainingMatchLength;114};115116/**117* Saves the state of this trie.118* @param state The State object to hold the trie's state.119* @return *this120* @see resetToState121* @stable ICU 4.8122*/123const BytesTrie &saveState(State &state) const {124state.bytes=bytes_;125state.pos=pos_;126state.remainingMatchLength=remainingMatchLength_;127return *this;128}129130/**131* Resets this trie to the saved state.132* If the state object contains no state, or the state of a different trie,133* then this trie remains unchanged.134* @param state The State object which holds a saved trie state.135* @return *this136* @see saveState137* @see reset138* @stable ICU 4.8139*/140BytesTrie &resetToState(const State &state) {141if(bytes_==state.bytes && bytes_!=NULL) {142pos_=state.pos;143remainingMatchLength_=state.remainingMatchLength;144}145return *this;146}147148/**149* Determines whether the byte sequence so far matches, whether it has a value,150* and whether another input byte can continue a matching byte sequence.151* @return The match/value Result.152* @stable ICU 4.8153*/154UStringTrieResult current() const;155156/**157* Traverses the trie from the initial state for this input byte.158* Equivalent to reset().next(inByte).159* @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.160* Values below -0x100 and above 0xff will never match.161* @return The match/value Result.162* @stable ICU 4.8163*/164inline UStringTrieResult first(int32_t inByte) {165remainingMatchLength_=-1;166if(inByte<0) {167inByte+=0x100;168}169return nextImpl(bytes_, inByte);170}171172/**173* Traverses the trie from the current state for this input byte.174* @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.175* Values below -0x100 and above 0xff will never match.176* @return The match/value Result.177* @stable ICU 4.8178*/179UStringTrieResult next(int32_t inByte);180181/**182* Traverses the trie from the current state for this byte sequence.183* Equivalent to184* \code185* Result result=current();186* for(each c in s)187* if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;188* result=next(c);189* return result;190* \endcode191* @param s A string or byte sequence. Can be NULL if length is 0.192* @param length The length of the byte sequence. Can be -1 if NUL-terminated.193* @return The match/value Result.194* @stable ICU 4.8195*/196UStringTrieResult next(const char *s, int32_t length);197198/**199* Returns a matching byte sequence's value if called immediately after200* current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.201* getValue() can be called multiple times.202*203* Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!204* @return The value for the byte sequence so far.205* @stable ICU 4.8206*/207inline int32_t getValue() const {208const uint8_t *pos=pos_;209int32_t leadByte=*pos++;210// U_ASSERT(leadByte>=kMinValueLead);211return readValue(pos, leadByte>>1);212}213214/**215* Determines whether all byte sequences reachable from the current state216* map to the same value.217* @param uniqueValue Receives the unique value, if this function returns TRUE.218* (output-only)219* @return TRUE if all byte sequences reachable from the current state220* map to the same value.221* @stable ICU 4.8222*/223inline UBool hasUniqueValue(int32_t &uniqueValue) const {224const uint8_t *pos=pos_;225// Skip the rest of a pending linear-match node.226return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);227}228229/**230* Finds each byte which continues the byte sequence from the current state.231* That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.232* @param out Each next byte is appended to this object.233* (Only uses the out.Append(s, length) method.)234* @return the number of bytes which continue the byte sequence from here235* @stable ICU 4.8236*/237int32_t getNextBytes(ByteSink &out) const;238239/**240* Iterator for all of the (byte sequence, value) pairs in a BytesTrie.241* @stable ICU 4.8242*/243class U_COMMON_API Iterator : public UMemory {244public:245/**246* Iterates from the root of a byte-serialized BytesTrie.247* @param trieBytes The trie bytes.248* @param maxStringLength If 0, the iterator returns full strings/byte sequences.249* Otherwise, the iterator returns strings with this maximum length.250* @param errorCode Standard ICU error code. Its input value must251* pass the U_SUCCESS() test, or else the function returns252* immediately. Check for U_FAILURE() on output or use with253* function chaining. (See User Guide for details.)254* @stable ICU 4.8255*/256Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);257258/**259* Iterates from the current state of the specified BytesTrie.260* @param trie The trie whose state will be copied for iteration.261* @param maxStringLength If 0, the iterator returns full strings/byte sequences.262* Otherwise, the iterator returns strings with this maximum length.263* @param errorCode Standard ICU error code. Its input value must264* pass the U_SUCCESS() test, or else the function returns265* immediately. Check for U_FAILURE() on output or use with266* function chaining. (See User Guide for details.)267* @stable ICU 4.8268*/269Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);270271/**272* Destructor.273* @stable ICU 4.8274*/275~Iterator();276277/**278* Resets this iterator to its initial state.279* @return *this280* @stable ICU 4.8281*/282Iterator &reset();283284/**285* @return TRUE if there are more elements.286* @stable ICU 4.8287*/288UBool hasNext() const;289290/**291* Finds the next (byte sequence, value) pair if there is one.292*293* If the byte sequence is truncated to the maximum length and does not294* have a real value, then the value is set to -1.295* In this case, this "not a real value" is indistinguishable from296* a real value of -1.297* @param errorCode Standard ICU error code. Its input value must298* pass the U_SUCCESS() test, or else the function returns299* immediately. Check for U_FAILURE() on output or use with300* function chaining. (See User Guide for details.)301* @return TRUE if there is another element.302* @stable ICU 4.8303*/304UBool next(UErrorCode &errorCode);305306/**307* @return The NUL-terminated byte sequence for the last successful next().308* @stable ICU 4.8309*/310StringPiece getString() const;311/**312* @return The value for the last successful next().313* @stable ICU 4.8314*/315int32_t getValue() const { return value_; }316317private:318UBool truncateAndStop();319320const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);321322const uint8_t *bytes_;323const uint8_t *pos_;324const uint8_t *initialPos_;325int32_t remainingMatchLength_;326int32_t initialRemainingMatchLength_;327328CharString *str_;329int32_t maxLength_;330int32_t value_;331332// The stack stores pairs of integers for backtracking to another333// outbound edge of a branch node.334// The first integer is an offset from bytes_.335// The second integer has the str_->length() from before the node in bits 15..0,336// and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)337// (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,338// but the code looks more confusing that way.)339UVector32 *stack_;340};341342private:343friend class BytesTrieBuilder;344345/**346* Constructs a BytesTrie reader instance.347* Unlike the public constructor which just aliases an array,348* this constructor adopts the builder's array.349* This constructor is only called by the builder.350*/351BytesTrie(void *adoptBytes, const void *trieBytes)352: ownedArray_(static_cast<uint8_t *>(adoptBytes)),353bytes_(static_cast<const uint8_t *>(trieBytes)),354pos_(bytes_), remainingMatchLength_(-1) {}355356// No assignment operator.357BytesTrie &operator=(const BytesTrie &other);358359inline void stop() {360pos_=NULL;361}362363// Reads a compact 32-bit integer.364// pos is already after the leadByte, and the lead byte is already shifted right by 1.365static int32_t readValue(const uint8_t *pos, int32_t leadByte);366static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {367// U_ASSERT(leadByte>=kMinValueLead);368if(leadByte>=(kMinTwoByteValueLead<<1)) {369if(leadByte<(kMinThreeByteValueLead<<1)) {370++pos;371} else if(leadByte<(kFourByteValueLead<<1)) {372pos+=2;373} else {374pos+=3+((leadByte>>1)&1);375}376}377return pos;378}379static inline const uint8_t *skipValue(const uint8_t *pos) {380int32_t leadByte=*pos++;381return skipValue(pos, leadByte);382}383384// Reads a jump delta and jumps.385static const uint8_t *jumpByDelta(const uint8_t *pos);386387static inline const uint8_t *skipDelta(const uint8_t *pos) {388int32_t delta=*pos++;389if(delta>=kMinTwoByteDeltaLead) {390if(delta<kMinThreeByteDeltaLead) {391++pos;392} else if(delta<kFourByteDeltaLead) {393pos+=2;394} else {395pos+=3+(delta&1);396}397}398return pos;399}400401static inline UStringTrieResult valueResult(int32_t node) {402return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));403}404405// Handles a branch node for both next(byte) and next(string).406UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);407408// Requires remainingLength_<0.409UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);410411// Helper functions for hasUniqueValue().412// Recursively finds a unique value (or whether there is not a unique one)413// from a branch.414static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,415UBool haveUniqueValue, int32_t &uniqueValue);416// Recursively finds a unique value (or whether there is not a unique one)417// starting from a position on a node lead byte.418static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);419420// Helper functions for getNextBytes().421// getNextBytes() when pos is on a branch node.422static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);423static void append(ByteSink &out, int c);424425// BytesTrie data structure426//427// The trie consists of a series of byte-serialized nodes for incremental428// string/byte sequence matching. The root node is at the beginning of the trie data.429//430// Types of nodes are distinguished by their node lead byte ranges.431// After each node, except a final-value node, another node follows to432// encode match values or continue matching further bytes.433//434// Node types:435// - Value node: Stores a 32-bit integer in a compact, variable-length format.436// The value is for the string/byte sequence so far.437// One node bit indicates whether the value is final or whether438// matching continues with the next node.439// - Linear-match node: Matches a number of bytes.440// - Branch node: Branches to other nodes according to the current input byte.441// The node byte is the length of the branch (number of bytes to select from)442// minus 1. It is followed by a sub-node:443// - If the length is at most kMaxBranchLinearSubNodeLength, then444// there are length-1 (key, value) pairs and then one more comparison byte.445// If one of the key bytes matches, then the value is either a final value for446// the string/byte sequence so far, or a "jump" delta to the next node.447// If the last byte matches, then matching continues with the next node.448// (Values have the same encoding as value nodes.)449// - If the length is greater than kMaxBranchLinearSubNodeLength, then450// there is one byte and one "jump" delta.451// If the input byte is less than the sub-node byte, then "jump" by delta to452// the next sub-node which will have a length of length/2.453// (The delta has its own compact encoding.)454// Otherwise, skip the "jump" delta to the next sub-node455// which will have a length of length-length/2.456457// Node lead byte values.458459// 00..0f: Branch node. If node!=0 then the length is node+1, otherwise460// the length is one more than the next byte.461462// For a branch sub-node with at most this many entries, we drop down463// to a linear search.464static const int32_t kMaxBranchLinearSubNodeLength=5;465466// 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.467static const int32_t kMinLinearMatch=0x10;468static const int32_t kMaxLinearMatchLength=0x10;469470// 20..ff: Variable-length value node.471// If odd, the value is final. (Otherwise, intermediate value or jump delta.)472// Then shift-right by 1 bit.473// The remaining lead byte value indicates the number of following bytes (0..4)474// and contains the value's top bits.475static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20476// It is a final value if bit 0 is set.477static const int32_t kValueIsFinal=1;478479// Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.480static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10481static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte.482483static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51484static const int32_t kMaxTwoByteValue=0x1aff;485486static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c487static const int32_t kFourByteValueLead=0x7e;488489// A little more than Unicode code points. (0x11ffff)490static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;491492static const int32_t kFiveByteValueLead=0x7f;493494// Compact delta integers.495static const int32_t kMaxOneByteDelta=0xbf;496static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0497static const int32_t kMinThreeByteDeltaLead=0xf0;498static const int32_t kFourByteDeltaLead=0xfe;499static const int32_t kFiveByteDeltaLead=0xff;500501static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff502static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff503504uint8_t *ownedArray_;505506// Fixed value referencing the BytesTrie bytes.507const uint8_t *bytes_;508509// Iterator variables.510511// Pointer to next trie byte to read. NULL if no more matches.512const uint8_t *pos_;513// Remaining length of a linear-match node, minus 1. Negative if not in such a node.514int32_t remainingMatchLength_;515};516517U_NAMESPACE_END518519#endif // __BYTESTRIE_H__520521522