// Source: GitHub repository wine-mirror/wine, path blob/master/libs/icucommon/dictbe.h
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/**
4
*******************************************************************************
5
* Copyright (C) 2006-2014, International Business Machines Corporation *
6
* and others. All Rights Reserved. *
7
*******************************************************************************
8
*/
9
10
#ifndef DICTBE_H
11
#define DICTBE_H
12
13
#include "unicode/utypes.h"
14
#include "unicode/uniset.h"
15
#include "unicode/utext.h"
16
17
#include "brkeng.h"
18
#include "hash.h"
19
#include "uvectr32.h"
20
21
U_NAMESPACE_BEGIN
22
23
class DictionaryMatcher;
24
class Normalizer2;
25
26
/*******************************************************************
27
* DictionaryBreakEngine
28
*/
29
30
/**
31
* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
32
* dictionary to determine language-specific breaks.</p>
33
*
34
* <p>After it is constructed a DictionaryBreakEngine may be shared between
35
* threads without synchronization.</p>
36
*/
37
class DictionaryBreakEngine : public LanguageBreakEngine {
38
private:
39
/**
40
* The set of characters handled by this engine
41
* @internal
42
*/
43
44
UnicodeSet fSet;
45
46
public:
47
48
/**
49
* <p>Constructor </p>
50
*/
51
DictionaryBreakEngine();
52
53
/**
54
* <p>Virtual destructor.</p>
55
*/
56
virtual ~DictionaryBreakEngine();
57
58
/**
59
* <p>Indicate whether this engine handles a particular character for
60
* a particular kind of break.</p>
61
*
62
* @param c A character which begins a run that the engine might handle
63
* @return true if this engine handles the particular character and break
64
* type.
65
*/
66
virtual UBool handles(UChar32 c) const override;
67
68
/**
69
* <p>Find any breaks within a run in the supplied text.</p>
70
*
71
* @param text A UText representing the text. The iterator is left at
72
* the end of the run of characters which the engine is capable of handling
73
* that starts from the first character in the range.
74
* @param startPos The start of the run within the supplied text.
75
* @param endPos The end of the run within the supplied text.
76
* @param foundBreaks vector of int32_t to receive the break positions
77
* @param status Information on any errors encountered.
78
* @return The number of breaks found.
79
*/
80
virtual int32_t findBreaks( UText *text,
81
int32_t startPos,
82
int32_t endPos,
83
UVector32 &foundBreaks,
84
UBool isPhraseBreaking,
85
UErrorCode& status ) const override;
86
87
protected:
88
89
/**
90
* <p>Set the character set handled by this engine.</p>
91
*
92
* @param set A UnicodeSet of the set of characters handled by the engine
93
*/
94
virtual void setCharacters( const UnicodeSet &set );
95
96
/**
97
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
98
*
99
* @param text A UText representing the text
100
* @param rangeStart The start of the range of dictionary characters
101
* @param rangeEnd The end of the range of dictionary characters
102
* @param foundBreaks Output of C array of int32_t break positions, or 0
103
* @param status Information on any errors encountered.
104
* @return The number of breaks found
105
*/
106
virtual int32_t divideUpDictionaryRange( UText *text,
107
int32_t rangeStart,
108
int32_t rangeEnd,
109
UVector32 &foundBreaks,
110
UBool isPhraseBreaking,
111
UErrorCode& status) const = 0;
112
113
};
114
115
/*******************************************************************
116
* ThaiBreakEngine
117
*/
118
119
/**
120
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
121
* dictionary and heuristics to determine Thai-specific breaks.</p>
122
*
123
* <p>After it is constructed a ThaiBreakEngine may be shared between
124
* threads without synchronization.</p>
125
*/
126
class ThaiBreakEngine : public DictionaryBreakEngine {
127
private:
128
/**
129
* The set of characters handled by this engine
130
* @internal
131
*/
132
133
UnicodeSet fEndWordSet;
134
UnicodeSet fBeginWordSet;
135
UnicodeSet fSuffixSet;
136
UnicodeSet fMarkSet;
137
DictionaryMatcher *fDictionary;
138
139
public:
140
141
/**
142
* <p>Default constructor.</p>
143
*
144
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
145
* engine is deleted.
146
*/
147
ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
148
149
/**
150
* <p>Virtual destructor.</p>
151
*/
152
virtual ~ThaiBreakEngine();
153
154
protected:
155
/**
156
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
157
*
158
* @param text A UText representing the text
159
* @param rangeStart The start of the range of dictionary characters
160
* @param rangeEnd The end of the range of dictionary characters
161
* @param foundBreaks Output of C array of int32_t break positions, or 0
162
* @param status Information on any errors encountered.
163
* @return The number of breaks found
164
*/
165
virtual int32_t divideUpDictionaryRange( UText *text,
166
int32_t rangeStart,
167
int32_t rangeEnd,
168
UVector32 &foundBreaks,
169
UBool isPhraseBreaking,
170
UErrorCode& status) const override;
171
172
};
173
174
/*******************************************************************
175
* LaoBreakEngine
176
*/
177
178
/**
179
* <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
180
* dictionary and heuristics to determine Lao-specific breaks.</p>
181
*
182
* <p>After it is constructed a LaoBreakEngine may be shared between
183
* threads without synchronization.</p>
184
*/
185
class LaoBreakEngine : public DictionaryBreakEngine {
186
private:
187
/**
188
* The set of characters handled by this engine
189
* @internal
190
*/
191
192
UnicodeSet fEndWordSet;
193
UnicodeSet fBeginWordSet;
194
UnicodeSet fMarkSet;
195
DictionaryMatcher *fDictionary;
196
197
public:
198
199
/**
200
* <p>Default constructor.</p>
201
*
202
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
203
* engine is deleted.
204
*/
205
LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
206
207
/**
208
* <p>Virtual destructor.</p>
209
*/
210
virtual ~LaoBreakEngine();
211
212
protected:
213
/**
214
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
215
*
216
* @param text A UText representing the text
217
* @param rangeStart The start of the range of dictionary characters
218
* @param rangeEnd The end of the range of dictionary characters
219
* @param foundBreaks Output of C array of int32_t break positions, or 0
220
* @param status Information on any errors encountered.
221
* @return The number of breaks found
222
*/
223
virtual int32_t divideUpDictionaryRange( UText *text,
224
int32_t rangeStart,
225
int32_t rangeEnd,
226
UVector32 &foundBreaks,
227
UBool isPhraseBreaking,
228
UErrorCode& status) const override;
229
230
};
231
232
/*******************************************************************
233
* BurmeseBreakEngine
234
*/
235
236
/**
237
* <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
238
* DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
239
*
240
* <p>After it is constructed a BurmeseBreakEngine may be shared between
241
* threads without synchronization.</p>
242
*/
243
class BurmeseBreakEngine : public DictionaryBreakEngine {
244
private:
245
/**
246
* The set of characters handled by this engine
247
* @internal
248
*/
249
250
UnicodeSet fEndWordSet;
251
UnicodeSet fBeginWordSet;
252
UnicodeSet fMarkSet;
253
DictionaryMatcher *fDictionary;
254
255
public:
256
257
/**
258
* <p>Default constructor.</p>
259
*
260
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
261
* engine is deleted.
262
*/
263
BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
264
265
/**
266
* <p>Virtual destructor.</p>
267
*/
268
virtual ~BurmeseBreakEngine();
269
270
protected:
271
/**
272
* <p>Divide up a range of known dictionary characters.</p>
273
*
274
* @param text A UText representing the text
275
* @param rangeStart The start of the range of dictionary characters
276
* @param rangeEnd The end of the range of dictionary characters
277
* @param foundBreaks Output of C array of int32_t break positions, or 0
278
* @param status Information on any errors encountered.
279
* @return The number of breaks found
280
*/
281
virtual int32_t divideUpDictionaryRange( UText *text,
282
int32_t rangeStart,
283
int32_t rangeEnd,
284
UVector32 &foundBreaks,
285
UBool isPhraseBreaking,
286
UErrorCode& status) const override;
287
288
};
289
290
/*******************************************************************
291
* KhmerBreakEngine
292
*/
293
294
/**
295
* <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
296
* DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
297
*
298
* <p>After it is constructed a KhmerBreakEngine may be shared between
299
* threads without synchronization.</p>
300
*/
301
class KhmerBreakEngine : public DictionaryBreakEngine {
302
private:
303
/**
304
* The set of characters handled by this engine
305
* @internal
306
*/
307
308
UnicodeSet fEndWordSet;
309
UnicodeSet fBeginWordSet;
310
UnicodeSet fMarkSet;
311
DictionaryMatcher *fDictionary;
312
313
public:
314
315
/**
316
* <p>Default constructor.</p>
317
*
318
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
319
* engine is deleted.
320
*/
321
KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
322
323
/**
324
* <p>Virtual destructor.</p>
325
*/
326
virtual ~KhmerBreakEngine();
327
328
protected:
329
/**
330
* <p>Divide up a range of known dictionary characters.</p>
331
*
332
* @param text A UText representing the text
333
* @param rangeStart The start of the range of dictionary characters
334
* @param rangeEnd The end of the range of dictionary characters
335
* @param foundBreaks Output of C array of int32_t break positions, or 0
336
* @param status Information on any errors encountered.
337
* @return The number of breaks found
338
*/
339
virtual int32_t divideUpDictionaryRange( UText *text,
340
int32_t rangeStart,
341
int32_t rangeEnd,
342
UVector32 &foundBreaks,
343
UBool isPhraseBreaking,
344
UErrorCode& status) const override;
345
346
};
347
348
#if !UCONFIG_NO_NORMALIZATION
349
350
/*******************************************************************
351
* CjkBreakEngine
352
*/
353
354
// Indicates the language/script that the CjkBreakEngine will handle.
// Kept as a plain (unscoped) enum so that the enumerators remain usable
// without qualification.
enum LanguageType {
    kKorean,
    kChineseJapanese
};
359
360
/**
361
* <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
362
* dictionary with costs associated with each word and
363
* Viterbi decoding to determine CJK-specific breaks.</p>
364
*/
365
class CjkBreakEngine : public DictionaryBreakEngine {
366
protected:
367
/**
368
* The set of characters handled by this engine
369
* @internal
370
*/
371
UnicodeSet fHangulWordSet;
372
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
373
UnicodeSet fClosePunctuationSet;
374
375
DictionaryMatcher *fDictionary;
376
const Normalizer2 *nfkcNorm2;
377
378
private:
379
// Load Japanese extensions.
380
void loadJapaneseExtensions(UErrorCode& error);
381
// Load Japanese Hiragana.
382
void loadHiragana(UErrorCode& error);
383
// Initialize fSkipSet by loading Japanese Hiragana and extensions.
384
void initJapanesePhraseParameter(UErrorCode& error);
385
386
Hashtable fSkipSet;
387
388
public:
389
390
/**
391
* <p>Default constructor.</p>
392
*
393
* @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
394
* engine is deleted. The DictionaryMatcher must contain costs for each word
395
* in order for the dictionary to work properly.
396
*/
397
CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
398
399
/**
400
* <p>Virtual destructor.</p>
401
*/
402
virtual ~CjkBreakEngine();
403
404
protected:
405
/**
406
* <p>Divide up a range of known dictionary characters handled by this break engine.</p>
407
*
408
* @param text A UText representing the text
409
* @param rangeStart The start of the range of dictionary characters
410
* @param rangeEnd The end of the range of dictionary characters
411
* @param foundBreaks Output of C array of int32_t break positions, or 0
412
* @param status Information on any errors encountered.
413
* @return The number of breaks found
414
*/
415
virtual int32_t divideUpDictionaryRange( UText *text,
416
int32_t rangeStart,
417
int32_t rangeEnd,
418
UVector32 &foundBreaks,
419
UBool isPhraseBreaking,
420
UErrorCode& status) const override;
421
422
};
423
424
#endif
425
426
U_NAMESPACE_END
427
428
/* DICTBE_H */
429
#endif
430
431