// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2013-2014, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* collationbuilder.h8*9* created on: 2013may0610* created by: Markus W. Scherer11*/1213#ifndef __COLLATIONBUILDER_H__14#define __COLLATIONBUILDER_H__1516#include "unicode/utypes.h"1718#if !UCONFIG_NO_COLLATION1920#include "unicode/uniset.h"21#include "unicode/unistr.h"22#include "collationrootelements.h"23#include "collationruleparser.h"24#include "uvectr32.h"25#include "uvectr64.h"2627struct UParseError;2829U_NAMESPACE_BEGIN3031struct CollationData;32struct CollationTailoring;3334class CEFinalizer;35class CollationDataBuilder;36class Normalizer2;37class Normalizer2Impl;3839class U_I18N_API CollationBuilder : public CollationRuleParser::Sink {40public:41CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode);42CollationBuilder(const CollationTailoring *base, UErrorCode &errorCode);43virtual ~CollationBuilder();4445void disableFastLatin() { fastLatinEnabled = false; }4647CollationTailoring *parseAndBuild(const UnicodeString &ruleString,48const UVersionInfo rulesVersion,49CollationRuleParser::Importer *importer,50UParseError *outParseError,51UErrorCode &errorCode);5253const char *getErrorReason() const { return errorReason; }5455private:56friend class CEFinalizer;5758/** Implements CollationRuleParser::Sink. */59virtual void addReset(int32_t strength, const UnicodeString &str,60const char *&errorReason, UErrorCode &errorCode) override;61/**62* Returns the secondary or tertiary weight preceding the current node's weight.63* node=nodes[index].64*/65uint32_t getWeight16Before(int32_t index, int64_t node, int32_t level);6667int64_t getSpecialResetPosition(const UnicodeString &str,68const char *&parserErrorReason, UErrorCode &errorCode);6970/** Implements CollationRuleParser::Sink. */71virtual void addRelation(int32_t strength, const UnicodeString &prefix,72const UnicodeString &str, const UnicodeString &extension,73const char *&errorReason, UErrorCode &errorCode) override;7475/**76* Picks one of the current CEs and finds or inserts a node in the graph77* for the CE + strength.78*/79int32_t findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,80UErrorCode &errorCode);81int32_t findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode);82/** Finds or inserts the node for a root CE's primary weight. */83int32_t findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode);84/** Finds or inserts the node for a secondary or tertiary weight. */85int32_t findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level,86UErrorCode &errorCode);8788/**89* Makes and inserts a new tailored node into the list, after the one at index.90* Skips over nodes of weaker strength to maintain collation order91* ("postpone insertion").92* @return the new node's index93*/94int32_t insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode);9596/**97* Inserts a new node into the list, between list-adjacent items.98* The node's previous and next indexes must not be set yet.99* @return the new node's index100*/101int32_t insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,102UErrorCode &errorCode);103104/**105* Finds the node which implies or contains a common=05 weight of the given strength106* (secondary or tertiary), if the current node is stronger.107* Skips weaker nodes and tailored nodes if the current node is stronger108* and is followed by an explicit-common-weight node.109* Always returns the input index if that node is no stronger than the given strength.110*/111int32_t findCommonNode(int32_t index, int32_t strength) const;112113void setCaseBits(const UnicodeString &nfdString,114const char *&parserErrorReason, UErrorCode &errorCode);115116/** Implements CollationRuleParser::Sink. */117virtual void suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,118UErrorCode &errorCode) override;119120/** Implements CollationRuleParser::Sink. */121virtual void optimize(const UnicodeSet &set, const char *&parserErrorReason,122UErrorCode &errorCode) override;123124/**125* Adds the mapping and its canonical closure.126* Takes ce32=dataBuilder->encodeCEs(...) so that the data builder127* need not re-encode the CEs multiple times.128*/129uint32_t addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,130const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,131UErrorCode &errorCode);132uint32_t addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,133const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,134UErrorCode &errorCode);135void addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,136UErrorCode &errorCode);137UBool mergeCompositeIntoString(const UnicodeString &nfdString, int32_t indexAfterLastStarter,138UChar32 composite, const UnicodeString &decomp,139UnicodeString &newNFDString, UnicodeString &newString,140UErrorCode &errorCode) const;141142UBool ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const;143UBool ignoreString(const UnicodeString &s, UErrorCode &errorCode) const;144UBool isFCD(const UnicodeString &s, UErrorCode &errorCode) const;145146void closeOverComposites(UErrorCode &errorCode);147148uint32_t addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,149const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,150UErrorCode &errorCode);151static UBool sameCEs(const int64_t ces1[], int32_t ces1Length,152const int64_t ces2[], int32_t ces2Length);153154/**155* Walks the tailoring graph and overwrites tailored nodes with new CEs.156* After this, the graph is destroyed.157* The nodes array can then be used only as a source of tailored CEs.158*/159void makeTailoredCEs(UErrorCode &errorCode);160/**161* Counts the tailored nodes of the given strength up to the next node162* which is either stronger or has an explicit weight of this strength.163*/164static int32_t countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength);165166/** Replaces temporary CEs with the final CEs they point to. */167void finalizeCEs(UErrorCode &errorCode);168169/**170* Encodes "temporary CE" data into a CE that fits into the CE32 data structure,171* with 2-byte primary, 1-byte secondary and 6-bit tertiary,172* with valid CE byte values.173*174* The index must not exceed 20 bits (0xfffff).175* The strength must fit into 2 bits (UCOL_PRIMARY..UCOL_QUATERNARY).176*177* Temporary CEs are distinguished from real CEs by their use of178* secondary weights 06..45 which are otherwise reserved for compressed sort keys.179*180* The case bits are unused and available.181*/182static inline int64_t tempCEFromIndexAndStrength(int32_t index, int32_t strength) {183return184// CE byte offsets, to ensure valid CE bytes, and case bits 11185INT64_C(0x4040000006002000) +186// index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF)187((int64_t)(index & 0xfe000) << 43) +188// index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF)189((int64_t)(index & 0x1fc0) << 42) +190// index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45)191((index & 0x3f) << 24) +192// strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23)193(strength << 8);194}195static inline int32_t indexFromTempCE(int64_t tempCE) {196tempCE -= INT64_C(0x4040000006002000);197return198((int32_t)(tempCE >> 43) & 0xfe000) |199((int32_t)(tempCE >> 42) & 0x1fc0) |200((int32_t)(tempCE >> 24) & 0x3f);201}202static inline int32_t strengthFromTempCE(int64_t tempCE) {203return ((int32_t)tempCE >> 8) & 3;204}205static inline UBool isTempCE(int64_t ce) {206uint32_t sec = (uint32_t)ce >> 24;207return 6 <= sec && sec <= 0x45;208}209210static inline int32_t indexFromTempCE32(uint32_t tempCE32) {211tempCE32 -= 0x40400620;212return213((int32_t)(tempCE32 >> 11) & 0xfe000) |214((int32_t)(tempCE32 >> 10) & 0x1fc0) |215((int32_t)(tempCE32 >> 8) & 0x3f);216}217static inline UBool isTempCE32(uint32_t ce32) {218return219(ce32 & 0xff) >= 2 && // not a long-primary/long-secondary CE322206 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45;221}222223static int32_t ceStrength(int64_t ce);224225/** At most 1M nodes, limited by the 20 bits in node bit fields. */226static const int32_t MAX_INDEX = 0xfffff;227/**228* Node bit 6 is set on a primary node if there are nodes229* with secondary values below the common secondary weight (05).230*/231static const int32_t HAS_BEFORE2 = 0x40;232/**233* Node bit 5 is set on a primary or secondary node if there are nodes234* with tertiary values below the common tertiary weight (05).235*/236static const int32_t HAS_BEFORE3 = 0x20;237/**238* Node bit 3 distinguishes a tailored node, which has no weight value,239* from a node with an explicit (root or default) weight.240*/241static const int32_t IS_TAILORED = 8;242243static inline int64_t nodeFromWeight32(uint32_t weight32) {244return (int64_t)weight32 << 32;245}246static inline int64_t nodeFromWeight16(uint32_t weight16) {247return (int64_t)weight16 << 48;248}249static inline int64_t nodeFromPreviousIndex(int32_t previous) {250return (int64_t)previous << 28;251}252static inline int64_t nodeFromNextIndex(int32_t next) {253return next << 8;254}255static inline int64_t nodeFromStrength(int32_t strength) {256return strength;257}258259static inline uint32_t weight32FromNode(int64_t node) {260return (uint32_t)(node >> 32);261}262static inline uint32_t weight16FromNode(int64_t node) {263return (uint32_t)(node >> 48) & 0xffff;264}265static inline int32_t previousIndexFromNode(int64_t node) {266return (int32_t)(node >> 28) & MAX_INDEX;267}268static inline int32_t nextIndexFromNode(int64_t node) {269return ((int32_t)node >> 8) & MAX_INDEX;270}271static inline int32_t strengthFromNode(int64_t node) {272return (int32_t)node & 3;273}274275static inline UBool nodeHasBefore2(int64_t node) {276return (node & HAS_BEFORE2) != 0;277}278static inline UBool nodeHasBefore3(int64_t node) {279return (node & HAS_BEFORE3) != 0;280}281static inline UBool nodeHasAnyBefore(int64_t node) {282return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0;283}284static inline UBool isTailoredNode(int64_t node) {285return (node & IS_TAILORED) != 0;286}287288static inline int64_t changeNodePreviousIndex(int64_t node, int32_t previous) {289return (node & INT64_C(0xffff00000fffffff)) | nodeFromPreviousIndex(previous);290}291static inline int64_t changeNodeNextIndex(int64_t node, int32_t next) {292return (node & INT64_C(0xfffffffff00000ff)) | nodeFromNextIndex(next);293}294295const Normalizer2 &nfd, &fcd;296const Normalizer2Impl &nfcImpl;297298const CollationTailoring *base;299const CollationData *baseData;300const CollationRootElements rootElements;301uint32_t variableTop;302303CollationDataBuilder *dataBuilder;304UBool fastLatinEnabled;305UBool icu4xMode;306UnicodeSet optimizeSet;307const char *errorReason;308309int64_t ces[Collation::MAX_EXPANSION_LENGTH];310int32_t cesLength;311312/**313* Indexes of nodes with root primary weights, sorted by primary.314* Compact form of a TreeMap from root primary to node index.315*316* This is a performance optimization for finding reset positions.317* Without this, we would have to search through the entire nodes list.318* It also allows storing root primary weights in list head nodes,319* without previous index, leaving room in root primary nodes for 32-bit primary weights.320*/321UVector32 rootPrimaryIndexes;322/**323* Data structure for assigning tailored weights and CEs.324* Doubly-linked lists of nodes in mostly collation order.325* Each list starts with a root primary node and ends with a nextIndex of 0.326*327* When there are any nodes in the list, then there is always a root primary node at index 0.328* This allows some code not to have to check explicitly for nextIndex==0.329*330* Root primary nodes have 32-bit weights but do not have previous indexes.331* All other nodes have at most 16-bit weights and do have previous indexes.332*333* Nodes with explicit weights store root collator weights,334* or default weak weights (e.g., secondary 05) for stronger nodes.335* "Tailored" nodes, with the IS_TAILORED bit set,336* do not store explicit weights but rather337* create a difference of a certain strength from the preceding node.338*339* A root node is followed by either340* - a root/default node of the same strength, or341* - a root/default node of the next-weaker strength, or342* - a tailored node of the same strength.343*344* A node of a given strength normally implies "common" weights on weaker levels.345*346* A node with HAS_BEFORE2 must be immediately followed by347* a secondary node with an explicit below-common weight, then a secondary tailored node,348* and later an explicit common-secondary node.349* The below-common weight can be a root weight,350* or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight351* or before the lowest root weight.352* (&[before 2] resets to an explicit secondary node so that353* the following addRelation(secondary) tailors right after that.354* If we did not have this node and instead were to reset on the primary node,355* then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.)356*357* If the flag is not set, then there are no explicit secondary nodes358* with the common or lower weights.359*360* Same for HAS_BEFORE3 for tertiary nodes and weights.361* A node must not have both flags set.362*363* Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs364* which point to stable indexes in this list,365* and temporary CEs stored in a CollationDataBuilder only point to tailored nodes.366*367* A temporary CE in the ces[] array may point to a non-tailored reset-before-position node,368* until the next relation is added.369*370* At the end, the tailored weights are allocated as necessary,371* then the tailored nodes are replaced with final CEs,372* and the CollationData is rewritten by replacing temporary CEs with final ones.373*374* We cannot simply insert new nodes in the middle of the array375* because that would invalidate the indexes stored in existing temporary CEs.376* We need to use a linked graph with stable indexes to existing nodes.377* A doubly-linked list seems easiest to maintain.378*379* Each node is stored as an int64_t, with its fields stored as bit fields.380*381* Root primary node:382* - primary weight: 32 bits 63..32383* - reserved/unused/zero: 4 bits 31..28384*385* Weaker root nodes & tailored nodes:386* - a weight: 16 bits 63..48387* + a root or default weight for a non-tailored node388* + unused/zero for a tailored node389* - index to the previous node: 20 bits 47..28390*391* All types of nodes:392* - index to the next node: 20 bits 27..8393* + nextIndex=0 in last node per root-primary list394* - reserved/unused/zero bits: bits 7, 4, 2395* - HAS_BEFORE2: bit 6396* - HAS_BEFORE3: bit 5397* - IS_TAILORED: bit 3398* - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0399*400* We could allocate structs with pointers, but we would have to store them401* in a pointer list so that they can be indexed from temporary CEs,402* and they would require more memory allocations.403*/404UVector64 nodes;405};406407U_NAMESPACE_END408409#endif // !UCONFIG_NO_COLLATION410#endif // __COLLATIONBUILDER_H__411412413