CoCalc -- brkeng.cpp

GitHub Repository: wine-mirror/wine
Path: blob/master/libs/icucommon/brkeng.cpp
¹²³⁴³ views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 ************************************************************************************
5
 * Copyright (C) 2006-2016, International Business Machines Corporation
6
 * and others. All Rights Reserved.
7
 ************************************************************************************
8
 */
9

10
#include "unicode/utypes.h"
11

12
#if !UCONFIG_NO_BREAK_ITERATION
13

14
#include "unicode/uchar.h"
15
#include "unicode/uniset.h"
16
#include "unicode/chariter.h"
17
#include "unicode/ures.h"
18
#include "unicode/udata.h"
19
#include "unicode/putil.h"
20
#include "unicode/ustring.h"
21
#include "unicode/uscript.h"
22
#include "unicode/ucharstrie.h"
23
#include "unicode/bytestrie.h"
24

25
#include "brkeng.h"
26
#include "cmemory.h"
27
#include "dictbe.h"
28
#include "lstmbe.h"
29
#include "charstr.h"
30
#include "dictionarydata.h"
31
#include "mutex.h"
32
#include "uvector.h"
33
#include "umutex.h"
34
#include "uresimp.h"
35
#include "ubrkimpl.h"
36

37
U_NAMESPACE_BEGIN
38

39
/*
40
 ******************************************************************
41
 */
42

43
// Base-class constructor: nothing to initialize.
LanguageBreakEngine::LanguageBreakEngine() = default;
45

46
// Base-class destructor: nothing to release.
LanguageBreakEngine::~LanguageBreakEngine() = default;
48

49
/*
50
 ******************************************************************
51
 */
52

53
// Base-class constructor: nothing to initialize.
LanguageBreakFactory::LanguageBreakFactory() = default;
55

56
// Base-class destructor: nothing to release.
LanguageBreakFactory::~LanguageBreakFactory() = default;
58

59
/*
60
 ******************************************************************
61
 */
62

63
// Starts with no set of handled characters; the set is created lazily by
// handleCharacter(). The error code argument is accepted but not used.
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/)
    : fHandled(nullptr) {
}
66

67
// Frees the set of handled characters (if one was ever created) and clears
// the pointer afterwards.
UnhandledEngine::~UnhandledEngine() {
    delete fHandled;        // deleting a null pointer is a no-op
    fHandled = nullptr;
}
71

72
// Reports whether the character c is in the set accumulated by
// handleCharacter(). Returns false while no set has been created yet.
UBool
UnhandledEngine::handles(UChar32 c) const {
    if (fHandled == nullptr) {
        return false;
    }
    return fHandled->contains(c);
}
76

77
/**
 * Advances the text past the run of characters contained in the handled set.
 * No break positions are ever recorded; the function always returns 0.
 *
 * @param text    The text to scan, starting from its current position.
 * @param endPos  Native index at which scanning stops.
 * @param status  If it already indicates failure, the text is left untouched.
 * @return        Always 0.
 *
 * The startPos, foundBreaks and isPhraseBreaking arguments are unused.
 */
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t /* startPos */,
                             int32_t endPos,
                             UVector32 &/*foundBreaks*/,
                             UBool /* isPhraseBreaking */,
                             UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;
    // fHandled is null until handleCharacter() has created the set, and it stays
    // null when that allocation fails. handles() guards against this; do the
    // same here instead of dereferencing a null pointer.
    if (fHandled == nullptr) return 0;
    UChar32 c = utext_current32(text);
    while (static_cast<int32_t>(utext_getNativeIndex(text)) < endPos && fHandled->contains(c)) {
        utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
        c = utext_current32(text);
    }
    return 0;
}
92

93
void
94
UnhandledEngine::handleCharacter(UChar32 c) {
95
    if (fHandled == nullptr) {
96
        fHandled = new UnicodeSet();
97
        if (fHandled == nullptr) {
98
            return;
99
        }
100
    }
101
    if (!fHandled->contains(c)) {
102
        UErrorCode status = U_ZERO_ERROR;
103
        // Apply the entire script of the character.
104
        int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
105
        fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
106
    }
107
}
108

109
/*
110
 ******************************************************************
111
 */
112

113
// The engine stack is created lazily by getEngineFor(); start without one.
// The error code argument is not used.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    fEngines = nullptr;
}
116

117
// Destroys the engine stack, if one was ever created.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    delete fEngines;    // no-op when fEngines is null
}
122

123
U_NAMESPACE_END
124
U_CDECL_BEGIN
125
// Element deleter handed to the UStack created in getEngineFor():
// destroys one stored LanguageBreakEngine.
static void U_CALLCONV _deleteEngine(void *obj) {
    const icu::LanguageBreakEngine *engine =
        static_cast<const icu::LanguageBreakEngine *>(obj);
    delete engine;
}
128
U_CDECL_END
129
U_NAMESPACE_BEGIN
130

131
// Returns an engine that handles the character c, or nullptr if none is
// available. Engines already held in fEngines are searched first (most
// recently pushed first); otherwise a new engine is loaded and pushed onto
// fEngines. The whole lookup runs under a function-local static mutex.
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;

    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);

    if (fEngines != nullptr) {
        // Walk the stack from the top down looking for an existing engine.
        for (int32_t idx = fEngines->size() - 1; idx >= 0; --idx) {
            const LanguageBreakEngine *cached =
                static_cast<const LanguageBreakEngine *>(fEngines->elementAt(idx));
            if (cached != nullptr && cached->handles(c)) {
                return cached;
            }
        }
    } else {
        // First call: create the stack that will hold the engines.
        LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
        if (U_FAILURE(status)) {
            // Note: no way to return error code to caller.
            return nullptr;
        }
        fEngines = engines.orphan();
    }

    // We didn't find an engine. Create one.
    const LanguageBreakEngine *created = loadEngineFor(c);
    if (created != nullptr) {
        fEngines->push(const_cast<LanguageBreakEngine *>(created), status);
    }
    if (U_FAILURE(status)) {
        return nullptr;
    }
    return created;
}
163

164
// Creates a break engine for the script of the character c.
// An LSTM-based engine is tried first; if that does not produce an engine,
// a dictionary-based engine is created for the scripts listed in the switch
// below. Returns nullptr when no engine could be created.
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;
    const UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return nullptr;
    }

    // Try to use LSTM first
    const LSTMData *data = CreateLSTMDataForScript(code, status);
    if (U_SUCCESS(status) && data != nullptr) {
        const LanguageBreakEngine *lstmEngine = CreateLSTMBreakEngine(code, data, status);
        if (lstmEngine != nullptr) {
            if (U_SUCCESS(status)) {
                return lstmEngine;
            }
            // An engine came back together with an error: discard the engine.
            delete lstmEngine;
        } else {
            // No engine was created, so release the LSTM data here.
            DeleteLSTMData(data);
        }
    }

    status = U_ZERO_ERROR;  // fallback to dictionary based
    DictionaryMatcher *m = loadDictionaryMatcherFor(code);
    if (m == nullptr) {
        return nullptr;
    }

    const LanguageBreakEngine *engine = nullptr;
    switch (code) {
    case USCRIPT_THAI:
        engine = new ThaiBreakEngine(m, status);
        break;
    case USCRIPT_LAO:
        engine = new LaoBreakEngine(m, status);
        break;
    case USCRIPT_MYANMAR:
        engine = new BurmeseBreakEngine(m, status);
        break;
    case USCRIPT_KHMER:
        engine = new KhmerBreakEngine(m, status);
        break;

#if !UCONFIG_NO_NORMALIZATION
    // CJK not available w/o normalization
    case USCRIPT_HANGUL:
        engine = new CjkBreakEngine(m, kKorean, status);
        break;

    // use same BreakEngine and dictionary for both Chinese and Japanese
    case USCRIPT_HIRAGANA:
    case USCRIPT_KATAKANA:
    case USCRIPT_HAN:
        engine = new CjkBreakEngine(m, kChineseJapanese, status);
        break;
#if 0
    // TODO: Have to get some characters with script=common handled
    // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    // them to CjkBreakEngine does not work. The engine has to
    // special-case them.
    case USCRIPT_COMMON:
    {
        UBlockCode block = ublock_getCode(code);
        if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
           engine = new CjkBreakEngine(dict, kChineseJapanese, status);
        break;
    }
#endif
#endif

    default:
        break;
    }

    if (engine == nullptr) {
        // No engine was created for this matcher, so delete the matcher here.
        delete m;
    } else if (U_FAILURE(status)) {
        // The engine was allocated but its construction reported an error.
        delete engine;
        engine = nullptr;
    }
    return engine;
}
245

246
// Loads the dictionary for the given script and wraps it in a matcher.
// The dictionary file name is looked up under "dictionaries" in the brkitr
// resource tree, keyed by the script's short name; the file is then opened
// with udata_open() and a bytes- or UChars-trie matcher is built from it.
// Returns nullptr if the name lookup fails, the file cannot be opened, the
// trie type is not one of the two handled here, or the matcher allocation
// fails.
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
    UErrorCode status = U_ZERO_ERROR;
    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "dictionaries", bundle, &status);
    int32_t nameLength = 0;
    const UChar *fileName =
        ures_getStringByKeyWithFallback(bundle, uscript_getShortName(script), &nameLength, &status);
    if (U_FAILURE(status)) {
        ures_close(bundle);
        return nullptr;
    }

    // Split "name.ext" at the last dot into a base name and an extension.
    CharString baseName;
    CharString extension;
    const UChar *lastDot = u_memrchr(fileName, 0x002e, nameLength);
    if (lastDot != nullptr) {
        const int32_t stemLength = static_cast<int32_t>(lastDot - fileName);
        extension.appendInvariantChars(
            UnicodeString(false, lastDot + 1, nameLength - stemLength - 1), status);
        nameLength = stemLength;
    }
    baseName.appendInvariantChars(UnicodeString(false, fileName, nameLength), status);
    ures_close(bundle);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, extension.data(), baseName.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning nullptr here will cause us to fail to find a dictionary break engine, as expected
        return nullptr;
    }

    // build trie
    const uint8_t *data = static_cast<const uint8_t *>(udata_getMemory(file));
    const int32_t *indexes = reinterpret_cast<const int32_t *>(data);
    const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = nullptr;
    if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
        const char *characters = reinterpret_cast<const char *>(data + offset);
        matcher = new BytesDictionaryMatcher(characters, transform, file);
    } else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
        const UChar *characters = reinterpret_cast<const UChar *>(data + offset);
        matcher = new UCharsDictionaryMatcher(characters, file);
    }

    if (matcher == nullptr) {
        // no matcher exists to take ownership - either we are an invalid
        // type or memory allocation failed
        udata_close(file);
    }
    return matcher;
}
301

302
U_NAMESPACE_END
303

304
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
305

306
Product

Resources

Company