Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/icu4c/common/bytestriebuilder.cpp
9903 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2010-2012, International Business Machines
6
* Corporation and others. All Rights Reserved.
7
*******************************************************************************
8
* file name: bytestriebuilder.cpp
9
* encoding: UTF-8
10
* tab size: 8 (not used)
11
* indentation:4
12
*
13
* created on: 2010sep25
14
* created by: Markus W. Scherer
15
*/
16
17
#include "unicode/utypes.h"
18
#include "unicode/bytestrie.h"
19
#include "unicode/bytestriebuilder.h"
20
#include "unicode/stringpiece.h"
21
#include "charstr.h"
22
#include "cmemory.h"
23
#include "uhash.h"
24
#include "uarrsort.h"
25
#include "uassert.h"
26
#include "ustr_imp.h"
27
28
U_NAMESPACE_BEGIN
29
30
/*
31
* Note: This builder implementation stores (bytes, value) pairs with full copies
32
* of the byte sequences, until the BytesTrie is built.
33
* It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34
*/
35
36
class BytesTrieElement : public UMemory {
37
public:
38
// Use compiler's default constructor, initializes nothing.
39
40
void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
41
42
StringPiece getString(const CharString &strings) const {
43
int32_t offset=stringOffset;
44
int32_t length;
45
if(offset>=0) {
46
length = static_cast<uint8_t>(strings[offset++]);
47
} else {
48
offset=~offset;
49
length = (static_cast<int32_t>(static_cast<uint8_t>(strings[offset])) << 8) | static_cast<uint8_t>(strings[offset + 1]);
50
offset+=2;
51
}
52
return StringPiece(strings.data()+offset, length);
53
}
54
int32_t getStringLength(const CharString &strings) const {
55
int32_t offset=stringOffset;
56
if(offset>=0) {
57
return static_cast<uint8_t>(strings[offset]);
58
} else {
59
offset=~offset;
60
return (static_cast<int32_t>(static_cast<uint8_t>(strings[offset])) << 8) | static_cast<uint8_t>(strings[offset + 1]);
61
}
62
}
63
64
char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
65
66
int32_t getValue() const { return value; }
67
68
int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
69
70
private:
71
const char *data(const CharString &strings) const {
72
int32_t offset=stringOffset;
73
if(offset>=0) {
74
++offset;
75
} else {
76
offset=~offset+2;
77
}
78
return strings.data()+offset;
79
}
80
81
// If the stringOffset is non-negative, then the first strings byte contains
82
// the string length.
83
// If the stringOffset is negative, then the first two strings bytes contain
84
// the string length (big-endian), and the offset needs to be bit-inverted.
85
// (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
86
int32_t stringOffset;
87
int32_t value;
88
};
89
90
void
91
BytesTrieElement::setTo(StringPiece s, int32_t val,
92
CharString &strings, UErrorCode &errorCode) {
93
if(U_FAILURE(errorCode)) {
94
return;
95
}
96
int32_t length=s.length();
97
if(length>0xffff) {
98
// Too long: We store the length in 1 or 2 bytes.
99
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
100
return;
101
}
102
int32_t offset=strings.length();
103
if(length>0xff) {
104
offset=~offset;
105
strings.append(static_cast<char>(length >> 8), errorCode);
106
}
107
strings.append(static_cast<char>(length), errorCode);
108
stringOffset=offset;
109
value=val;
110
strings.append(s, errorCode);
111
}
112
113
int32_t
114
BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
115
// TODO: add StringPiece::compare(), see ticket #8187
116
StringPiece thisString=getString(strings);
117
StringPiece otherString=other.getString(strings);
118
int32_t lengthDiff=thisString.length()-otherString.length();
119
int32_t commonLength;
120
if(lengthDiff<=0) {
121
commonLength=thisString.length();
122
} else {
123
commonLength=otherString.length();
124
}
125
int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
126
return diff!=0 ? diff : lengthDiff;
127
}
128
129
BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
130
: strings(nullptr), elements(nullptr), elementsCapacity(0), elementsLength(0),
131
bytes(nullptr), bytesCapacity(0), bytesLength(0) {
132
if(U_FAILURE(errorCode)) {
133
return;
134
}
135
strings=new CharString();
136
if(strings==nullptr) {
137
errorCode=U_MEMORY_ALLOCATION_ERROR;
138
}
139
}
140
141
BytesTrieBuilder::~BytesTrieBuilder() {
142
delete strings;
143
delete[] elements;
144
uprv_free(bytes);
145
}
146
147
BytesTrieBuilder &
148
BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
149
if(U_FAILURE(errorCode)) {
150
return *this;
151
}
152
if(bytesLength>0) {
153
// Cannot add elements after building.
154
errorCode=U_NO_WRITE_PERMISSION;
155
return *this;
156
}
157
if(elementsLength==elementsCapacity) {
158
int32_t newCapacity;
159
if(elementsCapacity==0) {
160
newCapacity=1024;
161
} else {
162
newCapacity=4*elementsCapacity;
163
}
164
BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
165
if(newElements==nullptr) {
166
errorCode=U_MEMORY_ALLOCATION_ERROR;
167
return *this; // error instead of dereferencing null
168
}
169
if(elementsLength>0) {
170
uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
171
}
172
delete[] elements;
173
elements=newElements;
174
elementsCapacity=newCapacity;
175
}
176
elements[elementsLength++].setTo(s, value, *strings, errorCode);
177
return *this;
178
}
179
180
U_CDECL_BEGIN
181
182
static int32_t U_CALLCONV
183
compareElementStrings(const void *context, const void *left, const void *right) {
184
const CharString *strings=static_cast<const CharString *>(context);
185
const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
186
const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
187
return leftElement->compareStringTo(*rightElement, *strings);
188
}
189
190
U_CDECL_END
191
192
BytesTrie *
193
BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
194
buildBytes(buildOption, errorCode);
195
BytesTrie *newTrie=nullptr;
196
if(U_SUCCESS(errorCode)) {
197
newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
198
if(newTrie==nullptr) {
199
errorCode=U_MEMORY_ALLOCATION_ERROR;
200
} else {
201
bytes=nullptr; // The new trie now owns the array.
202
bytesCapacity=0;
203
}
204
}
205
return newTrie;
206
}
207
208
StringPiece
209
BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
210
buildBytes(buildOption, errorCode);
211
StringPiece result;
212
if(U_SUCCESS(errorCode)) {
213
result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
214
}
215
return result;
216
}
217
218
void
219
BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
220
if(U_FAILURE(errorCode)) {
221
return;
222
}
223
if(bytes!=nullptr && bytesLength>0) {
224
// Already built.
225
return;
226
}
227
if(bytesLength==0) {
228
if(elementsLength==0) {
229
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
230
return;
231
}
232
uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(BytesTrieElement)),
233
compareElementStrings, strings,
234
false, // need not be a stable sort
235
&errorCode);
236
if(U_FAILURE(errorCode)) {
237
return;
238
}
239
// Duplicate strings are not allowed.
240
StringPiece prev=elements[0].getString(*strings);
241
for(int32_t i=1; i<elementsLength; ++i) {
242
StringPiece current=elements[i].getString(*strings);
243
if(prev==current) {
244
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
245
return;
246
}
247
prev=current;
248
}
249
}
250
// Create and byte-serialize the trie for the elements.
251
bytesLength=0;
252
int32_t capacity=strings->length();
253
if(capacity<1024) {
254
capacity=1024;
255
}
256
if(bytesCapacity<capacity) {
257
uprv_free(bytes);
258
bytes=static_cast<char *>(uprv_malloc(capacity));
259
if(bytes==nullptr) {
260
errorCode=U_MEMORY_ALLOCATION_ERROR;
261
bytesCapacity=0;
262
return;
263
}
264
bytesCapacity=capacity;
265
}
266
StringTrieBuilder::build(buildOption, elementsLength, errorCode);
267
if(bytes==nullptr) {
268
errorCode=U_MEMORY_ALLOCATION_ERROR;
269
}
270
}
271
272
BytesTrieBuilder &
273
BytesTrieBuilder::clear() {
274
strings->clear();
275
elementsLength=0;
276
bytesLength=0;
277
return *this;
278
}
279
280
int32_t
281
BytesTrieBuilder::getElementStringLength(int32_t i) const {
282
return elements[i].getStringLength(*strings);
283
}
284
285
char16_t
286
BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
287
return static_cast<uint8_t>(elements[i].charAt(byteIndex, *strings));
288
}
289
290
int32_t
291
BytesTrieBuilder::getElementValue(int32_t i) const {
292
return elements[i].getValue();
293
}
294
295
int32_t
296
BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
297
const BytesTrieElement &firstElement=elements[first];
298
const BytesTrieElement &lastElement=elements[last];
299
int32_t minStringLength=firstElement.getStringLength(*strings);
300
while(++byteIndex<minStringLength &&
301
firstElement.charAt(byteIndex, *strings)==
302
lastElement.charAt(byteIndex, *strings)) {}
303
return byteIndex;
304
}
305
306
int32_t
307
BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
308
int32_t length=0; // Number of different bytes at byteIndex.
309
int32_t i=start;
310
do {
311
char byte=elements[i++].charAt(byteIndex, *strings);
312
while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
313
++i;
314
}
315
++length;
316
} while(i<limit);
317
return length;
318
}
319
320
int32_t
321
BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
322
do {
323
char byte=elements[i++].charAt(byteIndex, *strings);
324
while(byte==elements[i].charAt(byteIndex, *strings)) {
325
++i;
326
}
327
} while(--count>0);
328
return i;
329
}
330
331
int32_t
332
BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, char16_t byte) const {
333
char b = static_cast<char>(byte);
334
while(b==elements[i].charAt(byteIndex, *strings)) {
335
++i;
336
}
337
return i;
338
}
339
340
BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
341
: LinearMatchNode(len, nextNode), s(bytes) {
342
hash=static_cast<int32_t>(
343
static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len)));
344
}
345
346
bool
347
BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
348
if(this==&other) {
349
return true;
350
}
351
if(!LinearMatchNode::operator==(other)) {
352
return false;
353
}
354
const BTLinearMatchNode &o=static_cast<const BTLinearMatchNode &>(other);
355
return 0==uprv_memcmp(s, o.s, length);
356
}
357
358
void
359
BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
360
BytesTrieBuilder &b=static_cast<BytesTrieBuilder &>(builder);
361
next->write(builder);
362
b.write(s, length);
363
offset=b.write(b.getMinLinearMatch()+length-1);
364
}
365
366
StringTrieBuilder::Node *
367
BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
368
Node *nextNode) const {
369
return new BTLinearMatchNode(
370
elements[i].getString(*strings).data()+byteIndex,
371
length,
372
nextNode);
373
}
374
375
UBool
376
BytesTrieBuilder::ensureCapacity(int32_t length) {
377
if(bytes==nullptr) {
378
return false; // previous memory allocation had failed
379
}
380
if(length>bytesCapacity) {
381
int32_t newCapacity=bytesCapacity;
382
do {
383
newCapacity*=2;
384
} while(newCapacity<=length);
385
char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
386
if(newBytes==nullptr) {
387
// unable to allocate memory
388
uprv_free(bytes);
389
bytes=nullptr;
390
bytesCapacity=0;
391
return false;
392
}
393
uprv_memcpy(newBytes+(newCapacity-bytesLength),
394
bytes+(bytesCapacity-bytesLength), bytesLength);
395
uprv_free(bytes);
396
bytes=newBytes;
397
bytesCapacity=newCapacity;
398
}
399
return true;
400
}
401
402
int32_t
403
BytesTrieBuilder::write(int32_t byte) {
404
int32_t newLength=bytesLength+1;
405
if(ensureCapacity(newLength)) {
406
bytesLength=newLength;
407
bytes[bytesCapacity - bytesLength] = static_cast<char>(byte);
408
}
409
return bytesLength;
410
}
411
412
int32_t
413
BytesTrieBuilder::write(const char *b, int32_t length) {
414
int32_t newLength=bytesLength+length;
415
if(ensureCapacity(newLength)) {
416
bytesLength=newLength;
417
uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
418
}
419
return bytesLength;
420
}
421
422
int32_t
423
BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
424
return write(elements[i].getString(*strings).data()+byteIndex, length);
425
}
426
427
int32_t
428
BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
429
if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
430
return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
431
}
432
char intBytes[5];
433
int32_t length=1;
434
if(i<0 || i>0xffffff) {
435
intBytes[0] = static_cast<char>(BytesTrie::kFiveByteValueLead);
436
intBytes[1] = static_cast<char>(static_cast<uint32_t>(i) >> 24);
437
intBytes[2] = static_cast<char>(static_cast<uint32_t>(i) >> 16);
438
intBytes[3] = static_cast<char>(static_cast<uint32_t>(i) >> 8);
439
intBytes[4] = static_cast<char>(i);
440
length=5;
441
// } else if(i<=BytesTrie::kMaxOneByteValue) {
442
// intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
443
} else {
444
if(i<=BytesTrie::kMaxTwoByteValue) {
445
intBytes[0] = static_cast<char>(BytesTrie::kMinTwoByteValueLead + (i >> 8));
446
} else {
447
if(i<=BytesTrie::kMaxThreeByteValue) {
448
intBytes[0] = static_cast<char>(BytesTrie::kMinThreeByteValueLead + (i >> 16));
449
} else {
450
intBytes[0] = static_cast<char>(BytesTrie::kFourByteValueLead);
451
intBytes[1] = static_cast<char>(i >> 16);
452
length=2;
453
}
454
intBytes[length++] = static_cast<char>(i >> 8);
455
}
456
intBytes[length++] = static_cast<char>(i);
457
}
458
intBytes[0] = static_cast<char>((intBytes[0] << 1) | isFinal);
459
return write(intBytes, length);
460
}
461
462
int32_t
463
BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
464
int32_t offset=write(node);
465
if(hasValue) {
466
offset=writeValueAndFinal(value, false);
467
}
468
return offset;
469
}
470
471
int32_t
472
BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
473
int32_t i=bytesLength-jumpTarget;
474
U_ASSERT(i>=0);
475
if(i<=BytesTrie::kMaxOneByteDelta) {
476
return write(i);
477
} else {
478
char intBytes[5];
479
return write(intBytes, internalEncodeDelta(i, intBytes));
480
}
481
}
482
483
int32_t
484
BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
485
U_ASSERT(i>=0);
486
if(i<=BytesTrie::kMaxOneByteDelta) {
487
intBytes[0] = static_cast<char>(i);
488
return 1;
489
}
490
int32_t length=1;
491
if(i<=BytesTrie::kMaxTwoByteDelta) {
492
intBytes[0] = static_cast<char>(BytesTrie::kMinTwoByteDeltaLead + (i >> 8));
493
} else {
494
if(i<=BytesTrie::kMaxThreeByteDelta) {
495
intBytes[0] = static_cast<char>(BytesTrie::kMinThreeByteDeltaLead + (i >> 16));
496
} else {
497
if(i<=0xffffff) {
498
intBytes[0] = static_cast<char>(BytesTrie::kFourByteDeltaLead);
499
} else {
500
intBytes[0] = static_cast<char>(BytesTrie::kFiveByteDeltaLead);
501
intBytes[1] = static_cast<char>(i >> 24);
502
length=2;
503
}
504
intBytes[length++] = static_cast<char>(i >> 16);
505
}
506
intBytes[length++] = static_cast<char>(i >> 8);
507
}
508
intBytes[length++] = static_cast<char>(i);
509
return length;
510
}
511
512
U_NAMESPACE_END
513
514