CoCalc -- collationdatareader.h

GitHub Repository: wine-mirror/wine
Path: blob/master/libs/icui18n/collationdatareader.h
¹²³⁴³ views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2013-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationdatareader.h
9
*
10
* created on: 2013feb07
11
* created by: Markus W. Scherer
12
*/
13

14
#ifndef __COLLATIONDATAREADER_H__
15
#define __COLLATIONDATAREADER_H__
16

17
#include "unicode/utypes.h"
18

19
#if !UCONFIG_NO_COLLATION
20

21
#include "unicode/udata.h"
22

23
struct UDataMemory;
24

25
U_NAMESPACE_BEGIN
26

27
struct CollationTailoring;
28

29
/**
30
 * Collation binary data reader.
31
 */
32
struct U_I18N_API CollationDataReader /* all static */ {
33
    // The following constants are also copied into source/common/ucol_swp.cpp.
34
    // Keep them in sync!
35
    enum {
36
        /**
37
         * Number of int32_t indexes.
38
         *
39
         * Can be 2 if there are only options.
40
         * Can be 7 or 8 if there are only options and a script reordering.
41
         * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
42
         */
43
        IX_INDEXES_LENGTH,  // 0
44
        /**
45
         * Bits 31..24: numericPrimary, for numeric collation
46
         *      23..16: fast Latin format version (0 = no fast Latin table)
47
         *      15.. 0: options bit set
48
         */
49
        IX_OPTIONS,
50
        // Indexes 2..3 are unused/reserved and 0.
        IX_RESERVED2,
51
        IX_RESERVED3,
52

53
        /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
54
        IX_JAMO_CE32S_START,  // 4
55

56
        // Byte offsets from the start of the data, after the generic header.
57
        // The indexes[] are at byte offset 0, other data follows.
58
        // Each data item is aligned properly.
59
        // The data items should be in descending order of unit size,
60
        // to minimize the need for padding.
61
        // Each item's byte length is given by the difference between its offset and
62
        // the next index/offset value.
63
        /** Byte offset to int32_t reorderCodes[]. */
64
        IX_REORDER_CODES_OFFSET,
65
        /**
66
         * Byte offset to uint8_t reorderTable[].
67
         * Empty table if <256 bytes (padding only).
68
         * Otherwise 256 bytes or more (with padding).
69
         */
70
        IX_REORDER_TABLE_OFFSET,
71
        /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
72
        IX_TRIE_OFFSET,
73

74
        /** Reserved byte-offset slot; no data item is documented for it in this header. */
        IX_RESERVED8_OFFSET,  // 8
75
        /** Byte offset to int64_t ces[]. */
76
        IX_CES_OFFSET,
77
        /** Reserved byte-offset slot (index 10); no data item is documented for it in this header. */
        IX_RESERVED10_OFFSET,
78
        /** Byte offset to uint32_t ce32s[]. */
79
        IX_CE32S_OFFSET,
80

81
        /** Byte offset to uint32_t rootElements[]. */
82
        IX_ROOT_ELEMENTS_OFFSET,  // 12
83
        /** Byte offset to UChar *contexts[]. */
84
        IX_CONTEXTS_OFFSET,
85
        /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
86
        IX_UNSAFE_BWD_OFFSET,
87
        /** Byte offset to uint16_t fastLatinTable[]. */
88
        IX_FAST_LATIN_TABLE_OFFSET,
89

90
        /** Byte offset to uint16_t scripts[]. */
91
        IX_SCRIPTS_OFFSET,  // 16
92
        /**
93
         * Byte offset to UBool compressibleBytes[].
94
         * Empty table if <256 bytes (padding only).
95
         * Otherwise 256 bytes or more (with padding).
96
         */
97
        IX_COMPRESSIBLE_BYTES_OFFSET,
98
        /** Reserved byte-offset slot (index 18); no data item is documented for it in this header. */
        IX_RESERVED18_OFFSET,
99
        /**
         * Last enumerator (index 19): the byte offset that ends the final data item.
         * When all indexes are stored, this is the total size of the data,
         * counted from the start of the indexes[].
         */
        IX_TOTAL_SIZE
100
    };
101

102
    /**
     * Reads collation binary data from inBytes into the given tailoring.
     *
     * NOTE(review): only the declaration is visible here. Presumably base is the
     * tailoring that this data builds on, inLength is the number of bytes available
     * at inBytes, and failures are reported through errorCode as a U_FAILURE value
     * -- confirm each of these against the implementation file.
     *
     * @param base      tailoring passed in read-only (role not visible in this header)
     * @param inBytes   start of the binary collation data
     * @param inLength  length associated with inBytes
     * @param tailoring output object that receives the data
     * @param errorCode ICU error code, in/out
     */
    static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
103
                     CollationTailoring &tailoring, UErrorCode &errorCode);
104

105
    /**
     * Data-acceptance predicate taking a UDataInfo header description.
     *
     * NOTE(review): the parameter list and U_CALLCONV match the shape of the
     * UDataMemoryIsAcceptable callback from unicode/udata.h, so this is presumably
     * meant to be passed to udata_openChoice() -- confirm against the callers.
     * The exact acceptance criteria are not visible in this header.
     */
    static UBool U_CALLCONV
106
    isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
107

108
private:
109
    // Deleted so that this all-static struct cannot be instantiated.
    CollationDataReader() = delete;  // no constructor
110
};
111

112
/*
113
 * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
114
 * Format version 5.
115
 *
116
 * The root collation data is stored in the ucadata.icu file.
117
 * Tailorings are stored inside .res resource bundle files, with a complete file header.
118
 *
119
 * Collation data begins with a standard ICU data file header
120
 * (DataHeader, see ucmndata.h and unicode/udata.h).
121
 * The UDataInfo.dataVersion field contains the UCA and other version numbers,
122
 * see the comments for CollationTailoring.version.
123
 *
124
 * After the header, the file contains the following parts.
125
 * Constants are defined as enum values of the CollationDataReader class.
126
 * See also the Collation class.
127
 *
128
 * int32_t indexes[indexesLength];
129
 *      The indexes array has variable length.
130
 *      Some tailorings only need the length and the options,
131
 *      others only add reorderCodes and the reorderTable,
132
 *      some need to store mappings.
133
 *      Only as many indexes are stored as needed to read all of the data.
134
 *
135
 *      Index 0: indexesLength
136
 *      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
137
 *      Index 2..3: Unused/reserved/0.
138
 *      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
139
 *               are stored in a short, contiguous part of the ce32s array.
140
 *
141
 *      Indexes 5..19 are byte offsets in ascending order.
142
 *      Each byte offset marks the start of the next part in the data file,
143
 *      and the end of the previous one.
144
 *      When two consecutive byte offsets are the same (or too short),
145
 *      then the corresponding part is empty.
146
 *      Byte offsets are offsets from after the header,
147
 *      that is, from the beginning of the indexes[].
148
 *      Each part starts at an offset with proper alignment for its data.
149
 *      If necessary, the previous part may include padding bytes to achieve this alignment.
150
 *      The last byte offset that is stored in the indexes indicates the total size of the data
151
 *      (starting with the indexes).
152
 *
153
 * int32_t reorderCodes[]; -- empty in root
154
 *      The list of script and reordering codes.
155
 *
156
 *      Beginning with format version 5, this array may optionally
157
 *      have trailing entries with a full list of reorder ranges
158
 *      as described for CollationSettings::reorderRanges.
159
 *
160
 *      Script or reorder codes are first and do not exceed 16-bit values.
161
 *      Range limits are stored in the upper 16 bits, and are never 0.
162
 *      Split this array into reorder codes and ranges at the first entry
163
 *      with non-zero upper 16 bits.
164
 *
165
 *      If the ranges are missing but needed for split-reordered primary lead bytes,
166
 *      then they are regenerated at load time.
167
 *
168
 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
169
 *      Primary-weight lead byte permutation table.
170
 *      Normally present when the reorderCodes are, but can be built at load time.
171
 *
172
 *      Beginning with format version 5, a 0 entry at a non-zero index
173
 *      (which is otherwise an illegal value)
174
 *      means that the primary lead byte is "split"
175
 *      (there are different offsets for primaries that share that lead byte)
176
 *      and the reordering offset must be determined via the reorder ranges
177
 *      that are either stored as part of the reorderCodes array
178
 *      or regenerated at load time.
179
 *
180
 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
181
 *      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
182
 *      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
183
 *      in which case it is a special CE32 and contains a 4-bit tag and further data.
184
 *      See the Collation class for details.
185
 *
186
 *      The trie has a value for each lead surrogate code unit with some bits encoding
187
 *      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
188
 *      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
189
 *
190
 * int64_t ces[];
191
 *      64-bit CEs and expansions that cannot be stored in a more compact form.
192
 *
193
 * uint32_t ce32s[];
194
 *      CE32s for expansions in compact form, and for characters whose trie values
195
 *      contain special data.
196
 *
197
 * uint32_t rootElements[]; -- empty in all tailorings
198
 *      Compact storage for all of the CEs that occur in the root collation.
199
 *      See the CollationRootElements class.
200
 *
201
 * UChar *contexts[];
202
 *      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
203
 *
204
 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
205
 *      Serialized form of characters that are unsafe when iterating backwards,
206
 *      and at the end of an identical string prefix.
207
 *      Back up to a safe character.
208
 *      Lead surrogates are "unsafe" when any of their corresponding supplementary
209
 *      code points are unsafe.
210
 *      Does not include [:^lccc=0:][:^tccc=0:].
211
 *      For each tailoring, the root unsafeBackwardSet is subtracted.
212
 *      (As a result, in many tailorings no set needs to be stored.)
213
 *
214
 * uint16_t fastLatinTable[];
215
 *      Optional optimization for Latin text.
216
 *      See the CollationFastLatin class.
217
 *
218
 * uint16_t scripts[]; -- empty in all tailorings
219
 *      Format version 5:
220
 *      uint16_t numScripts;
221
 *      uint16_t scriptsIndex[numScripts+16];
222
 *      uint16_t scriptStarts[];
223
 *      See CollationData::numScripts etc.
224
 *
225
 *      Format version 4:
226
 *      Table of the reordering groups with their first and last lead bytes,
227
 *      and their script and reordering codes.
228
 *      See CollationData::scripts.
229
 *
230
 * UBool compressibleBytes[]; -- empty in all tailorings
231
 *      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
232
 *
233
 * -----------------
234
 * Changes for formatVersion 5 (ICU 55)
235
 *
236
 * Reordering moves single scripts, not groups of scripts.
237
 * Reorder ranges are optionally appended to the reorderCodes,
238
 * and a 0 entry in the reorderTable indicates a split lead byte.
239
 * The scripts data has a new format.
240
 *
241
 * The rootElements may contain secondary and tertiary weights below common=05.
242
 * (Used for small Hiragana letters.)
243
 * Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
244
 * There are no other data structure changes, but builder code needs to be able to handle such data.
245
 *
246
 * The collation element for the merge separator code point U+FFFE
247
 * does not necessarily have special, unique secondary/tertiary weights any more.
248
 */
249

250
U_NAMESPACE_END
251

252
#endif  // !UCONFIG_NO_COLLATION
253
#endif  // __COLLATIONDATAREADER_H__
254

255
Product

Resources

Company