// Path: blob/master/libs/icui18n/collationdatabuilder.h
// (source-viewer residue: 12343 views)
// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2012-2014, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* collationdatabuilder.h8*9* created on: 2012apr0110* created by: Markus W. Scherer11*/1213#ifndef __COLLATIONDATABUILDER_H__14#define __COLLATIONDATABUILDER_H__1516#include "unicode/utypes.h"1718#if !UCONFIG_NO_COLLATION1920#include "unicode/uniset.h"21#include "unicode/unistr.h"22#include "unicode/uversion.h"23#include "collation.h"24#include "collationdata.h"25#include "collationsettings.h"26#include "normalizer2impl.h"27#include "utrie2.h"28#include "uvectr32.h"29#include "uvectr64.h"30#include "uvector.h"3132U_NAMESPACE_BEGIN3334struct ConditionalCE32;3536class CollationFastLatinBuilder;37class CopyHelper;38class DataBuilderCollationIterator;39class UCharsTrieBuilder;4041/**42* Low-level CollationData builder.43* Takes (character, CE) pairs and builds them into runtime data structures.44* Supports characters with context prefixes and contraction suffixes.45*/46class U_I18N_API CollationDataBuilder : public UObject {47public:48/**49* Collation element modifier. Interface class for a modifier50* that changes a tailoring builder's temporary CEs to final CEs.51* Called for every non-special CE32 and every expansion CE.52*/53class CEModifier : public UObject {54public:55virtual ~CEModifier();56/** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */57virtual int64_t modifyCE32(uint32_t ce32) const = 0;58/** Returns a new CE to replace the input CE, or else Collation::NO_CE. 
*/59virtual int64_t modifyCE(int64_t ce) const = 0;60};6162CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode);6364virtual ~CollationDataBuilder();6566void initForTailoring(const CollationData *b, UErrorCode &errorCode);6768virtual UBool isCompressibleLeadByte(uint32_t b) const;6970inline UBool isCompressiblePrimary(uint32_t p) const {71return isCompressibleLeadByte(p >> 24);72}7374/**75* @return true if this builder has mappings (e.g., add() has been called)76*/77UBool hasMappings() const { return modified; }7879/**80* @return true if c has CEs in this builder81*/82UBool isAssigned(UChar32 c) const;8384/**85* @return the three-byte primary if c maps to a single such CE and has no context data,86* otherwise returns 0.87*/88uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;8990/**91* @return the single CE for c.92* Sets an error code if c does not have a single CE.93*/94int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;9596void add(const UnicodeString &prefix, const UnicodeString &s,97const int64_t ces[], int32_t cesLength,98UErrorCode &errorCode);99100/**101* Encodes the ces as either the returned ce32 by itself,102* or by storing an expansion, with the returned ce32 referring to that.103*104* add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))105*/106virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);107void addCE32(const UnicodeString &prefix, const UnicodeString &s,108uint32_t ce32, UErrorCode &errorCode);109110/**111* Sets three-byte-primary CEs for a range of code points in code point order,112* if it is worth doing; otherwise no change is made.113* None of the code points in the range should have complex mappings so far114* (expansions/contractions/prefixes).115* @param start first code point116* @param end last code point (inclusive)117* @param primary primary weight for 'start'118* @param step per-code point primary-weight increment119* @param errorCode ICU in/out error code120* 
@return true if an OFFSET_TAG range was used for start..end121*/122UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,123uint32_t primary, int32_t step,124UErrorCode &errorCode);125126/**127* Sets three-byte-primary CEs for a range of code points in code point order.128* Sets range values if that is worth doing, or else individual values.129* None of the code points in the range should have complex mappings so far130* (expansions/contractions/prefixes).131* @param start first code point132* @param end last code point (inclusive)133* @param primary primary weight for 'start'134* @param step per-code point primary-weight increment135* @param errorCode ICU in/out error code136* @return the next primary after 'end': start primary incremented by ((end-start)+1)*step137*/138uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,139uint32_t primary, int32_t step,140UErrorCode &errorCode);141142/**143* Copies all mappings from the src builder, with modifications.144* This builder here must not be built yet, and should be empty.145*/146void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,147UErrorCode &errorCode);148149void optimize(const UnicodeSet &set, UErrorCode &errorCode);150void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);151152void enableFastLatin() { fastLatinEnabled = true; }153virtual void build(CollationData &data, UErrorCode &errorCode);154155/**156* Looks up CEs for s and appends them to the ces array.157* Does not handle normalization: s should be in FCD form.158*159* Does not write completely ignorable CEs.160* Does not write beyond Collation::MAX_EXPANSION_LENGTH.161*162* @return incremented cesLength163*/164int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);165int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,166int64_t ces[], int32_t cesLength);167168protected:169friend class CopyHelper;170friend class DataBuilderCollationIterator;171172uint32_t 
getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;173174int32_t addCE(int64_t ce, UErrorCode &errorCode);175int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);176int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);177178inline ConditionalCE32 *getConditionalCE32(int32_t index) const {179return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);180}181inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {182return getConditionalCE32(Collation::indexFromCE32(ce32));183}184185static uint32_t makeBuilderContextCE32(int32_t index) {186return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);187}188static inline UBool isBuilderContextCE32(uint32_t ce32) {189return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);190}191192static uint32_t encodeOneCEAsCE32(int64_t ce);193uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);194uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);195uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);196197uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);198/**199* Copies base contractions to a list of ConditionalCE32.200* Sets cond->next to the index of the first new item201* and returns the index of the last new item.202*/203int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,204ConditionalCE32 *cond, UErrorCode &errorCode);205206UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);207void setDigitTags(UErrorCode &errorCode);208void setLeadSurrogates(UErrorCode &errorCode);209210void buildMappings(CollationData &data, UErrorCode &errorCode);211212void clearContexts();213void buildContexts(UErrorCode &errorCode);214uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);215int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder 
&trieBuilder,216UErrorCode &errorCode);217218void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);219220int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);221222static UChar32 jamoCpFromIndex(int32_t i) {223// 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27224if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }225i -= Hangul::JAMO_L_COUNT;226if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }227i -= Hangul::JAMO_V_COUNT;228// i < 27229return Hangul::JAMO_T_BASE + 1 + i;230}231232/** @see Collation::BUILDER_DATA_TAG */233static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;234235const Normalizer2Impl &nfcImpl;236const CollationData *base;237const CollationSettings *baseSettings;238UTrie2 *trie;239UVector32 ce32s;240UVector64 ce64s;241UVector conditionalCE32s; // vector of ConditionalCE32242// Characters that have context (prefixes or contraction suffixes).243UnicodeSet contextChars;244// Serialized UCharsTrie structures for finalized contexts.245UnicodeString contexts;246private:247/**248* The "era" of building intermediate contexts.249* When the array of cached, temporary contexts overflows, then clearContexts()250* removes them all and invalidates the builtCE32 that used to point to built tries.251* See ConditionalCE32::era.252*/253int32_t contextsEra = 0;254protected:255UnicodeSet unsafeBackwardSet;256UBool modified;257UBool icu4xMode;258259UBool fastLatinEnabled;260CollationFastLatinBuilder *fastLatinBuilder;261262DataBuilderCollationIterator *collIter;263};264265U_NAMESPACE_END266267#endif // !UCONFIG_NO_COLLATION268#endif // __COLLATIONDATABUILDER_H__269270271