// Source: GitHub repository godotengine/godot, path blob/master/thirdparty/icu4c/common/dictbe.h
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/**
4
*******************************************************************************
5
* Copyright (C) 2006-2014, International Business Machines Corporation *
6
* and others. All Rights Reserved. *
7
*******************************************************************************
8
*/
9
10
#ifndef DICTBE_H
11
#define DICTBE_H
12
13
#include "unicode/utypes.h"
14
#include "unicode/uniset.h"
15
#include "unicode/utext.h"
16
17
#include "brkeng.h"
18
#include "hash.h"
19
#include "mlbe.h"
20
#include "uvectr32.h"
21
22
U_NAMESPACE_BEGIN
23
24
// Forward declarations: these types are only used through pointers in the
// declarations below, so their full definitions are not required here.
class DictionaryMatcher;
class MlBreakEngine;
class Normalizer2;
27
28
/*******************************************************************
29
* DictionaryBreakEngine
30
*/
31
32
/**
 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
 * dictionary to determine language-specific breaks.</p>
 *
 * <p>After it is constructed a DictionaryBreakEngine may be shared between
 * threads without synchronization.</p>
 */
39
class DictionaryBreakEngine : public LanguageBreakEngine {
40
private:
41
/**
42
* The set of characters handled by this engine
43
* @internal
44
*/
45
46
UnicodeSet fSet;
47
48
public:
49
50
/**
51
* <p>Constructor </p>
52
*/
53
DictionaryBreakEngine();
54
55
/**
56
* <p>Virtual destructor.</p>
57
*/
58
virtual ~DictionaryBreakEngine();
59
60
/**
61
* <p>Indicate whether this engine handles a particular character for
62
* a particular kind of break.</p>
63
*
64
* @param c A character which begins a run that the engine might handle
65
* @param locale The locale.
66
* @return true if this engine handles the particular character and break
67
* type.
68
*/
69
virtual UBool handles(UChar32 c, const char* locale) const override;
70
71
/**
72
* <p>Find any breaks within a run in the supplied text.</p>
73
*
74
* @param text A UText representing the text. The iterator is left at
75
* the end of the run of characters which the engine is capable of handling
76
* that starts from the first character in the range.
77
* @param startPos The start of the run within the supplied text.
78
* @param endPos The end of the run within the supplied text.
79
* @param foundBreaks vector of int32_t to receive the break positions
80
* @param status Information on any errors encountered.
81
* @return The number of breaks found.
82
*/
83
virtual int32_t findBreaks( UText *text,
84
int32_t startPos,
85
int32_t endPos,
86
UVector32 &foundBreaks,
87
UBool isPhraseBreaking,
88
UErrorCode& status ) const override;
89
90
protected:
91
92
/**
93
* <p>Set the character set handled by this engine.</p>
94
*
95
* @param set A UnicodeSet of the set of characters handled by the engine
96
*/
97
virtual void setCharacters( const UnicodeSet &set );
98
99
/**
100
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
101
*
102
* @param text A UText representing the text
103
* @param rangeStart The start of the range of dictionary characters
104
* @param rangeEnd The end of the range of dictionary characters
105
* @param foundBreaks Output of C array of int32_t break positions, or 0
106
* @param status Information on any errors encountered.
107
* @return The number of breaks found
108
*/
109
virtual int32_t divideUpDictionaryRange( UText *text,
110
int32_t rangeStart,
111
int32_t rangeEnd,
112
UVector32 &foundBreaks,
113
UBool isPhraseBreaking,
114
UErrorCode& status) const = 0;
115
116
};
117
118
/*******************************************************************
119
* ThaiBreakEngine
120
*/
121
122
/**
 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary and heuristics to determine Thai-specific breaks.</p>
 *
 * <p>After it is constructed a ThaiBreakEngine may be shared between
 * threads without synchronization.</p>
 */
129
class ThaiBreakEngine : public DictionaryBreakEngine {
130
private:
131
/**
132
* The set of characters handled by this engine
133
* @internal
134
*/
135
136
UnicodeSet fEndWordSet;
137
UnicodeSet fBeginWordSet;
138
UnicodeSet fSuffixSet;
139
UnicodeSet fMarkSet;
140
DictionaryMatcher *fDictionary;
141
142
public:
143
144
/**
145
* <p>Default constructor.</p>
146
*
147
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
148
* engine is deleted.
149
*/
150
ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
151
152
/**
153
* <p>Virtual destructor.</p>
154
*/
155
virtual ~ThaiBreakEngine();
156
157
protected:
158
/**
159
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
160
*
161
* @param text A UText representing the text
162
* @param rangeStart The start of the range of dictionary characters
163
* @param rangeEnd The end of the range of dictionary characters
164
* @param foundBreaks Output of C array of int32_t break positions, or 0
165
* @param status Information on any errors encountered.
166
* @return The number of breaks found
167
*/
168
virtual int32_t divideUpDictionaryRange( UText *text,
169
int32_t rangeStart,
170
int32_t rangeEnd,
171
UVector32 &foundBreaks,
172
UBool isPhraseBreaking,
173
UErrorCode& status) const override;
174
175
};
176
177
/*******************************************************************
178
* LaoBreakEngine
179
*/
180
181
/**
 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary and heuristics to determine Lao-specific breaks.</p>
 *
 * <p>After it is constructed a LaoBreakEngine may be shared between
 * threads without synchronization.</p>
 */
188
class LaoBreakEngine : public DictionaryBreakEngine {
189
private:
190
/**
191
* The set of characters handled by this engine
192
* @internal
193
*/
194
195
UnicodeSet fEndWordSet;
196
UnicodeSet fBeginWordSet;
197
UnicodeSet fMarkSet;
198
DictionaryMatcher *fDictionary;
199
200
public:
201
202
/**
203
* <p>Default constructor.</p>
204
*
205
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
206
* engine is deleted.
207
*/
208
LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
209
210
/**
211
* <p>Virtual destructor.</p>
212
*/
213
virtual ~LaoBreakEngine();
214
215
protected:
216
/**
217
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
218
*
219
* @param text A UText representing the text
220
* @param rangeStart The start of the range of dictionary characters
221
* @param rangeEnd The end of the range of dictionary characters
222
* @param foundBreaks Output of C array of int32_t break positions, or 0
223
* @param status Information on any errors encountered.
224
* @return The number of breaks found
225
*/
226
virtual int32_t divideUpDictionaryRange( UText *text,
227
int32_t rangeStart,
228
int32_t rangeEnd,
229
UVector32 &foundBreaks,
230
UBool isPhraseBreaking,
231
UErrorCode& status) const override;
232
233
};
234
235
/*******************************************************************
236
* BurmeseBreakEngine
237
*/
238
239
/**
 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
 *
 * <p>After it is constructed a BurmeseBreakEngine may be shared between
 * threads without synchronization.</p>
 */
246
class BurmeseBreakEngine : public DictionaryBreakEngine {
247
private:
248
/**
249
* The set of characters handled by this engine
250
* @internal
251
*/
252
253
UnicodeSet fEndWordSet;
254
UnicodeSet fBeginWordSet;
255
UnicodeSet fMarkSet;
256
DictionaryMatcher *fDictionary;
257
258
public:
259
260
/**
261
* <p>Default constructor.</p>
262
*
263
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
264
* engine is deleted.
265
*/
266
BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
267
268
/**
269
* <p>Virtual destructor.</p>
270
*/
271
virtual ~BurmeseBreakEngine();
272
273
protected:
274
/**
275
* <p>Divide up a range of known dictionary characters.</p>
276
*
277
* @param text A UText representing the text
278
* @param rangeStart The start of the range of dictionary characters
279
* @param rangeEnd The end of the range of dictionary characters
280
* @param foundBreaks Output of C array of int32_t break positions, or 0
281
* @param status Information on any errors encountered.
282
* @return The number of breaks found
283
*/
284
virtual int32_t divideUpDictionaryRange( UText *text,
285
int32_t rangeStart,
286
int32_t rangeEnd,
287
UVector32 &foundBreaks,
288
UBool isPhraseBreaking,
289
UErrorCode& status) const override;
290
291
};
292
293
/*******************************************************************
294
* KhmerBreakEngine
295
*/
296
297
/**
 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
 *
 * <p>After it is constructed a KhmerBreakEngine may be shared between
 * threads without synchronization.</p>
 */
304
class KhmerBreakEngine : public DictionaryBreakEngine {
305
private:
306
/**
307
* The set of characters handled by this engine
308
* @internal
309
*/
310
311
UnicodeSet fEndWordSet;
312
UnicodeSet fBeginWordSet;
313
UnicodeSet fMarkSet;
314
DictionaryMatcher *fDictionary;
315
316
public:
317
318
/**
319
* <p>Default constructor.</p>
320
*
321
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
322
* engine is deleted.
323
*/
324
KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
325
326
/**
327
* <p>Virtual destructor.</p>
328
*/
329
virtual ~KhmerBreakEngine();
330
331
protected:
332
/**
333
* <p>Divide up a range of known dictionary characters.</p>
334
*
335
* @param text A UText representing the text
336
* @param rangeStart The start of the range of dictionary characters
337
* @param rangeEnd The end of the range of dictionary characters
338
* @param foundBreaks Output of C array of int32_t break positions, or 0
339
* @param status Information on any errors encountered.
340
* @return The number of breaks found
341
*/
342
virtual int32_t divideUpDictionaryRange( UText *text,
343
int32_t rangeStart,
344
int32_t rangeEnd,
345
UVector32 &foundBreaks,
346
UBool isPhraseBreaking,
347
UErrorCode& status) const override;
348
349
};
350
351
#if !UCONFIG_NO_NORMALIZATION
352
353
/*******************************************************************
354
* CjkBreakEngine
355
*/
356
357
//indicates language/script that the CjkBreakEngine will handle
358
/**
 * Language/script selector passed to the CjkBreakEngine constructor.
 * Kept as a plain (unscoped) enum so the enumerators stay usable unqualified.
 */
enum LanguageType {
    kKorean,
    kChineseJapanese
};
362
363
/**
 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary with costs associated with each word and
 * Viterbi decoding to determine CJK-specific breaks.</p>
 */
368
class CjkBreakEngine : public DictionaryBreakEngine {
369
protected:
370
/**
371
* The set of characters handled by this engine
372
* @internal
373
*/
374
UnicodeSet fHangulWordSet;
375
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
376
UnicodeSet fClosePunctuationSet;
377
378
DictionaryMatcher *fDictionary;
379
const Normalizer2 *nfkcNorm2;
380
MlBreakEngine *fMlBreakEngine;
381
bool isCj;
382
383
private:
384
// Load Japanese extensions.
385
void loadJapaneseExtensions(UErrorCode& error);
386
// Load Japanese Hiragana.
387
void loadHiragana(UErrorCode& error);
388
// Initialize fSkipSet by loading Japanese Hiragana and extensions.
389
void initJapanesePhraseParameter(UErrorCode& error);
390
391
Hashtable fSkipSet;
392
393
public:
394
395
/**
396
* <p>Default constructor.</p>
397
*
398
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
399
* engine is deleted. The DictionaryMatcher must contain costs for each word
400
* in order for the dictionary to work properly.
401
*/
402
CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
403
404
/**
405
* <p>Virtual destructor.</p>
406
*/
407
virtual ~CjkBreakEngine();
408
409
protected:
410
/**
411
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
412
*
413
* @param text A UText representing the text
414
* @param rangeStart The start of the range of dictionary characters
415
* @param rangeEnd The end of the range of dictionary characters
416
* @param foundBreaks Output of C array of int32_t break positions, or 0
417
* @param status Information on any errors encountered.
418
* @return The number of breaks found
419
*/
420
virtual int32_t divideUpDictionaryRange( UText *text,
421
int32_t rangeStart,
422
int32_t rangeEnd,
423
UVector32 &foundBreaks,
424
UBool isPhraseBreaking,
425
UErrorCode& status) const override;
426
427
};
428
429
#endif
430
431
U_NAMESPACE_END
432
433
/* DICTBE_H */
434
#endif
435
436