Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/icucommon/brkeng.cpp
12343 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
************************************************************************************
5
* Copyright (C) 2006-2016, International Business Machines Corporation
6
* and others. All Rights Reserved.
7
************************************************************************************
8
*/
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_BREAK_ITERATION
13
14
#include "unicode/uchar.h"
15
#include "unicode/uniset.h"
16
#include "unicode/chariter.h"
17
#include "unicode/ures.h"
18
#include "unicode/udata.h"
19
#include "unicode/putil.h"
20
#include "unicode/ustring.h"
21
#include "unicode/uscript.h"
22
#include "unicode/ucharstrie.h"
23
#include "unicode/bytestrie.h"
24
25
#include "brkeng.h"
26
#include "cmemory.h"
27
#include "dictbe.h"
28
#include "lstmbe.h"
29
#include "charstr.h"
30
#include "dictionarydata.h"
31
#include "mutex.h"
32
#include "uvector.h"
33
#include "umutex.h"
34
#include "uresimp.h"
35
#include "ubrkimpl.h"
36
37
U_NAMESPACE_BEGIN
38
39
/*
40
******************************************************************
41
*/
42
43
// Out-of-line default constructor; there is nothing to initialize here.
LanguageBreakEngine::LanguageBreakEngine() = default;
45
46
// Out-of-line destructor; no resources are released at this level.
LanguageBreakEngine::~LanguageBreakEngine() = default;
48
49
/*
50
******************************************************************
51
*/
52
53
// Out-of-line default constructor; there is nothing to initialize here.
LanguageBreakFactory::LanguageBreakFactory() = default;
55
56
// Out-of-line destructor; no resources are released at this level.
LanguageBreakFactory::~LanguageBreakFactory() = default;
58
59
/*
60
******************************************************************
61
*/
62
63
// Starts with no handled-character set; the set is created on demand by
// handleCharacter(). The error code parameter is accepted but not used.
UnhandledEngine::UnhandledEngine(UErrorCode & /*status*/) : fHandled(nullptr) {}
66
67
// Frees the handled-character set (if one was ever created) and clears the
// member pointer.
UnhandledEngine::~UnhandledEngine() {
    auto *owned = fHandled;
    fHandled = nullptr;
    delete owned;
}
71
72
// Reports whether character c is in the set of characters this engine has
// been told to handle. Always false until handleCharacter() has created
// the set.
UBool
UnhandledEngine::handles(UChar32 c) const {
    if (fHandled == nullptr) {
        return false;
    }
    return fHandled->contains(c) ? true : false;
}
76
77
// Skips over a run of characters that this engine handles without
// reporting any breaks.
//
// Starting at the current position of `text`, the iterator is advanced
// while its native index is below `endPos` and the current character is in
// the handled set. The start position, the break vector and the
// phrase-breaking flag are not used.
//
// Returns 0 in all cases (no breaks are ever added). If `status` already
// indicates failure on entry, `text` is left untouched.
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t /* startPos */,
                             int32_t endPos,
                             UVector32 &/*foundBreaks*/,
                             UBool /* isPhraseBreaking */,
                             UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;
    // fHandled stays null until handleCharacter() is first called; with no
    // set there is nothing this engine handles, so there is nothing to skip.
    // This mirrors the null check in handles().
    if (fHandled == nullptr) return 0;
    UChar32 c = utext_current32(text);
    while ((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
        utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
        c = utext_current32(text);
    }
    return 0;
}
92
93
void
94
UnhandledEngine::handleCharacter(UChar32 c) {
95
if (fHandled == nullptr) {
96
fHandled = new UnicodeSet();
97
if (fHandled == nullptr) {
98
return;
99
}
100
}
101
if (!fHandled->contains(c)) {
102
UErrorCode status = U_ZERO_ERROR;
103
// Apply the entire script of the character.
104
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
105
fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
106
}
107
}
108
109
/*
110
******************************************************************
111
*/
112
113
// The engine cache starts out empty; it is allocated on the first call to
// getEngineFor(). The error code parameter is not used.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/)
    : fEngines(nullptr) {
}
116
117
// Destroys the engine cache. Deleting a null pointer is a no-op, so no
// explicit check is needed.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    delete fEngines;
}
122
123
U_NAMESPACE_END
124
U_CDECL_BEGIN
125
// Element deleter passed to the UStack of engines: destroys one
// LanguageBreakEngine given as an untyped pointer.
static void U_CALLCONV _deleteEngine(void *obj) {
    const icu::LanguageBreakEngine *engine =
        static_cast<const icu::LanguageBreakEngine *>(obj);
    delete engine;
}
128
U_CDECL_END
129
U_NAMESPACE_BEGIN
130
131
// Returns a break engine that handles character c, or nullptr if none is
// available or an error occurred.
//
// Previously created engines are kept in fEngines and searched from the
// most recently added to the oldest. When none of them handles c, a new
// engine is requested from loadEngineFor() and, if one is produced, pushed
// onto fEngines. The whole operation runs while holding a function-local
// static mutex.
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;

    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);

    if (fEngines != nullptr) {
        // Look through the cached engines, newest first.
        for (int32_t idx = fEngines->size() - 1; idx >= 0; --idx) {
            const LanguageBreakEngine *cached =
                static_cast<const LanguageBreakEngine *>(fEngines->elementAt(idx));
            if (cached != nullptr && cached->handles(c)) {
                return cached;
            }
        }
    } else {
        // First use: create the stack that holds the engines.
        LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
        if (U_FAILURE(status)) {
            // Note: no way to return error code to caller.
            return nullptr;
        }
        fEngines = engines.orphan();
    }

    // We didn't find an engine. Create one.
    const LanguageBreakEngine *created = loadEngineFor(c);
    if (created != nullptr) {
        fEngines->push(const_cast<LanguageBreakEngine *>(created), status);
    }
    if (U_FAILURE(status)) {
        return nullptr;
    }
    return created;
}
163
164
// Creates a new break engine for the script of character c.
//
// An LSTM-based engine is tried first. If LSTM data is unavailable or the
// LSTM engine cannot be created, the function falls back to a
// dictionary-based engine chosen by script (Thai, Lao, Myanmar, Khmer and,
// when normalization is compiled in, Hangul / Hiragana / Katakana / Han).
//
// Returns the new engine, or nullptr when the script cannot be determined,
// no dictionary matcher can be loaded, the script has no dictionary engine,
// or engine construction fails.
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;
    const UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return nullptr;
    }

    const LanguageBreakEngine *engine = nullptr;

    // Try to use LSTM first
    const LSTMData *lstmData = CreateLSTMDataForScript(code, status);
    if (U_SUCCESS(status) && lstmData != nullptr) {
        engine = CreateLSTMBreakEngine(code, lstmData, status);
        if (engine == nullptr) {
            // No engine was created, so the data is released here.
            DeleteLSTMData(lstmData);
        } else if (U_SUCCESS(status)) {
            return engine;
        } else {
            // Only the engine is deleted on failure; presumably it has
            // adopted the LSTM data -- confirm against lstmbe.
            delete engine;
            engine = nullptr;
        }
    }

    status = U_ZERO_ERROR;  // fallback to dictionary based
    DictionaryMatcher *m = loadDictionaryMatcherFor(code);
    if (m == nullptr) {
        return nullptr;
    }

    switch (code) {
    case USCRIPT_THAI:
        engine = new ThaiBreakEngine(m, status);
        break;
    case USCRIPT_LAO:
        engine = new LaoBreakEngine(m, status);
        break;
    case USCRIPT_MYANMAR:
        engine = new BurmeseBreakEngine(m, status);
        break;
    case USCRIPT_KHMER:
        engine = new KhmerBreakEngine(m, status);
        break;

#if !UCONFIG_NO_NORMALIZATION
    // CJK not available w/o normalization
    case USCRIPT_HANGUL:
        engine = new CjkBreakEngine(m, kKorean, status);
        break;

    // use same BreakEngine and dictionary for both Chinese and Japanese
    case USCRIPT_HIRAGANA:
    case USCRIPT_KATAKANA:
    case USCRIPT_HAN:
        engine = new CjkBreakEngine(m, kChineseJapanese, status);
        break;
#if 0
    // TODO: Have to get some characters with script=common handled
    // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    // them to CjkBreakEngine does not work. The engine has to
    // special-case them.
    case USCRIPT_COMMON:
    {
        UBlockCode block = ublock_getCode(code);
        if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
            engine = new CjkBreakEngine(dict, kChineseJapanese, status);
        break;
    }
#endif
#endif

    default:
        break;
    }

    if (engine == nullptr) {
        // No engine exists to take the matcher, so it is deleted here.
        delete m;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // Only the engine is deleted; presumably it has adopted the
        // matcher -- confirm against dictbe.
        delete engine;
        return nullptr;
    }
    return engine;
}
245
246
// Loads the break dictionary registered for `script` and wraps it in a
// DictionaryMatcher.
//
// The dictionary file name is looked up under "dictionaries" in the root
// bundle of the break-iterator resource tree, keyed by the script's short
// name. The name is split at its last dot into base name and extension,
// which are then used to open the data file. Depending on the trie type
// recorded in the file's index array, a BytesDictionaryMatcher or a
// UCharsDictionaryMatcher is created over the data.
//
// Returns the new matcher, or nullptr if the lookup fails, the data file
// cannot be opened, the trie type is neither of the two known kinds, or
// allocating the matcher fails.
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
    UErrorCode status = U_ZERO_ERROR;

    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "dictionaries", bundle, &status);

    int32_t nameLength = 0;
    const UChar *fileName = ures_getStringByKeyWithFallback(
        bundle, uscript_getShortName(script), &nameLength, &status);
    if (U_FAILURE(status)) {
        ures_close(bundle);
        return nullptr;
    }

    // Split "<name>.<ext>" at the last dot; with no dot the extension
    // stays empty and the whole string is the name.
    CharString baseName;
    CharString extension;
    const UChar *lastDot = u_memrchr(fileName, 0x002e, nameLength);
    if (lastDot != nullptr) {
        const int32_t stemLength = (int32_t)(lastDot - fileName);
        extension.appendInvariantChars(
            UnicodeString(false, lastDot + 1, nameLength - stemLength - 1), status);
        nameLength = stemLength;
    }
    baseName.appendInvariantChars(UnicodeString(false, fileName, nameLength), status);
    ures_close(bundle);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, extension.data(), baseName.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning nullptr here will cause us to fail to find a dictionary break engine, as expected
        return nullptr;
    }

    // build trie
    const uint8_t *bytes = (const uint8_t *)udata_getMemory(file);
    const int32_t *header = (const int32_t *)bytes;
    const int32_t trieOffset = header[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieKind = header[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = nullptr;
    if (trieKind == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = header[DictionaryData::IX_TRANSFORM];
        matcher = new BytesDictionaryMatcher((const char *)(bytes + trieOffset), transform, file);
    } else if (trieKind == DictionaryData::TRIE_TYPE_UCHARS) {
        matcher = new UCharsDictionaryMatcher((const UChar *)(bytes + trieOffset), file);
    }

    if (matcher == nullptr) {
        // no matcher exists to take ownership - either we are an invalid
        // type or memory allocation failed
        udata_close(file);
    }
    return matcher;
}
301
302
U_NAMESPACE_END
303
304
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
305
306