// © 2016 and later: Unicode, Inc. and others.1// License & terms of use: http://www.unicode.org/copyright.html2/*3*******************************************************************************4* Copyright (C) 2013-2015, International Business Machines5* Corporation and others. All Rights Reserved.6*******************************************************************************7* collationdatareader.h8*9* created on: 2013feb0710* created by: Markus W. Scherer11*/1213#ifndef __COLLATIONDATAREADER_H__14#define __COLLATIONDATAREADER_H__1516#include "unicode/utypes.h"1718#if !UCONFIG_NO_COLLATION1920#include "unicode/udata.h"2122struct UDataMemory;2324U_NAMESPACE_BEGIN2526struct CollationTailoring;2728/**29* Collation binary data reader.30*/31struct U_I18N_API CollationDataReader /* all static */ {32// The following constants are also copied into source/common/ucol_swp.cpp.33// Keep them in sync!34enum {35/**36* Number of int32_t indexes.37*38* Can be 2 if there are only options.39* Can be 7 or 8 if there are only options and a script reordering.40* The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.41*/42IX_INDEXES_LENGTH, // 043/**44* Bits 31..24: numericPrimary, for numeric collation45* 23..16: fast Latin format version (0 = no fast Latin table)46* 15.. 0: options bit set47*/48IX_OPTIONS,49IX_RESERVED2,50IX_RESERVED3,5152/** Array offset to Jamo CE32s in ce32s[], or <0 if none. */53IX_JAMO_CE32S_START, // 45455// Byte offsets from the start of the data, after the generic header.56// The indexes[] are at byte offset 0, other data follows.57// Each data item is aligned properly.58// The data items should be in descending order of unit size,59// to minimize the need for padding.60// Each item's byte length is given by the difference between its offset and61// the next index/offset value.62/** Byte offset to int32_t reorderCodes[]. 
*/63IX_REORDER_CODES_OFFSET,64/**65* Byte offset to uint8_t reorderTable[].66* Empty table if <256 bytes (padding only).67* Otherwise 256 bytes or more (with padding).68*/69IX_REORDER_TABLE_OFFSET,70/** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */71IX_TRIE_OFFSET,7273IX_RESERVED8_OFFSET, // 874/** Byte offset to int64_t ces[]. */75IX_CES_OFFSET,76IX_RESERVED10_OFFSET,77/** Byte offset to uint32_t ce32s[]. */78IX_CE32S_OFFSET,7980/** Byte offset to uint32_t rootElements[]. */81IX_ROOT_ELEMENTS_OFFSET, // 1282/** Byte offset to UChar *contexts[]. */83IX_CONTEXTS_OFFSET,84/** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */85IX_UNSAFE_BWD_OFFSET,86/** Byte offset to uint16_t fastLatinTable[]. */87IX_FAST_LATIN_TABLE_OFFSET,8889/** Byte offset to uint16_t scripts[]. */90IX_SCRIPTS_OFFSET, // 1691/**92* Byte offset to UBool compressibleBytes[].93* Empty table if <256 bytes (padding only).94* Otherwise 256 bytes or more (with padding).95*/96IX_COMPRESSIBLE_BYTES_OFFSET,97IX_RESERVED18_OFFSET,98IX_TOTAL_SIZE99};100101static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,102CollationTailoring &tailoring, UErrorCode &errorCode);103104static UBool U_CALLCONV105isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);106107private:108CollationDataReader() = delete; // no constructor109};110111/*112* Format of collation data (ucadata.icu, binary data in coll/ *.res files).113* Format version 5.114*115* The root collation data is stored in the ucadata.icu file.116* Tailorings are stored inside .res resource bundle files, with a complete file header.117*118* Collation data begins with a standard ICU data file header119* (DataHeader, see ucmndata.h and unicode/udata.h).120* The UDataInfo.dataVersion field contains the UCA and other version numbers,121* see the comments for CollationTailoring.version.122*123* After the header, the file contains the following parts.124* 
* Constants are defined as enum values of the CollationDataReader class.
* See also the Collation class.
*
* int32_t indexes[indexesLength];
*      The indexes array has variable length.
*      Some tailorings only need the length and the options,
*      others only add reorderCodes and the reorderTable,
*      some need to store mappings.
*      Only as many indexes are stored as needed to read all of the data.
*
*      Index 0: indexesLength
*      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
*      Index 2..3: Unused/reserved/0.
*      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
*               are stored in a short, contiguous part of the ce32s array.
*
*      Indexes 5..19 are byte offsets in ascending order.
*      Each byte offset marks the start of the next part in the data file,
*      and the end of the previous one.
*      When two consecutive byte offsets are the same (or too short),
*      then the corresponding part is empty.
*      Byte offsets are offsets from after the header,
*      that is, from the beginning of the indexes[].
*      Each part starts at an offset with proper alignment for its data.
*      If necessary, the previous part may include padding bytes to achieve this alignment.
*      The last byte offset that is stored in the indexes indicates the total size of the data
*      (starting with the indexes).
*
* int32_t reorderCodes[]; -- empty in root
*      The list of script and reordering codes.
*
*      Beginning with format version 5, this array may optionally
*      have trailing entries with a full list of reorder ranges
*      as described for CollationSettings::reorderRanges.
*
*      Script or reorder codes are first and do not exceed 16-bit values.
*      Range limits are stored in the upper 16 bits, and are never 0.
*      Split this array into reorder codes and ranges at the first entry
*      with non-zero upper 16 bits.
*
*      If the ranges are missing but needed for split-reordered primary lead bytes,
*      then they are regenerated
*      at load time.
*
* uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
*      Primary-weight lead byte permutation table.
*      Normally present when the reorderCodes are, but can be built at load time.
*
*      Beginning with format version 5, a 0 entry at a non-zero index
*      (which is otherwise an illegal value)
*      means that the primary lead byte is "split"
*      (there are different offsets for primaries that share that lead byte)
*      and the reordering offset must be determined via the reorder ranges
*      that are either stored as part of the reorderCodes array
*      or regenerated at load time.
*
* UTrie2 trie; -- see utrie2_impl.h and utrie2.h
*      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
*      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
*      in which case it is a special CE32 and contains a 4-bit tag and further data.
*      See the Collation class for details.
*
*      The trie has a value for each lead surrogate code unit with some bits encoding
*      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
*      the lead surrogate.
*      See Collation::LEAD_SURROGATE_TAG.
*
* int64_t ces[];
*      64-bit CEs and expansions that cannot be stored in a more compact form.
*
* uint32_t ce32s[];
*      CE32s for expansions in compact form, and for characters whose trie values
*      contain special data.
*
* uint32_t rootElements[]; -- empty in all tailorings
*      Compact storage for all of the CEs that occur in the root collation.
*      See the CollationRootElements class.
*
* UChar *contexts[];
*      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
*
* uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
*      Serialized form of characters that are unsafe when iterating backwards,
*      and at the end of an identical string prefix.
*      Back up to a safe character.
*      Lead surrogates are "unsafe" when any of their corresponding supplementary
*      code points are unsafe.
*      Does not include [:^lccc=0:][:^tccc=0:].
*      For each tailoring, the root unsafeBackwardSet is subtracted.
*      (As a result, in many tailorings no set needs to be stored.)
*
* uint16_t fastLatinTable[];
*      Optional optimization for Latin text.
*      See the CollationFastLatin class.
*
* uint16_t scripts[]; -- empty in all tailorings
*      Format version 5:
*      uint16_t numScripts;
*      uint16_t scriptsIndex[numScripts+16];
*      uint16_t scriptStarts[];
*      See CollationData::numScripts etc.
*
*      Format version 4:
*      Table of the reordering groups with their first and last lead bytes,
*      and their script and reordering codes.
*      See CollationData::scripts.
*
* UBool compressibleBytes[]; -- empty in all tailorings
*      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
*
* -----------------
* Changes for formatVersion 5 (ICU 55)
*
* Reordering moves single scripts, not groups of scripts.
* Reorder ranges are optionally appended to the reorderCodes,
* and a 0 entry in the reorderTable indicates a split lead byte.
* The scripts data
* has a new format.
*
* The rootElements may contain secondary and tertiary weights below common=05.
* (Used for small Hiragana letters.)
* Where it occurs, there is also an explicit unit with common secondary & tertiary weights.
* There are no other data structure changes, but builder code needs to be able to handle such data.
*
* The collation element for the merge separator code point U+FFFE
* does not necessarily have special, unique secondary/tertiary weights any more.
*/

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
#endif  // __COLLATIONDATAREADER_H__