Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/icui18n/anytrans.cpp
12343 views
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*****************************************************************
5
* Copyright (c) 2002-2014, International Business Machines Corporation
6
* and others. All Rights Reserved.
7
*****************************************************************
8
* Date Name Description
9
* 06/06/2002 aliu Creation.
10
*****************************************************************
11
*/
12
13
#include "unicode/utypes.h"
14
15
#if !UCONFIG_NO_TRANSLITERATION
16
17
#include "unicode/uobject.h"
18
#include "unicode/uscript.h"
19
20
#include "anytrans.h"
21
#include "hash.h"
22
#include "mutex.h"
23
#include "nultrans.h"
24
#include "putilimp.h"
25
#include "tridpars.h"
26
#include "uinvchar.h"
27
#include "uvector.h"
28
29
//------------------------------------------------------------
30
// Constants
31
32
// Separator characters used when composing transliterator IDs.
static const UChar TARGET_SEP  = 0x002D; // '-', placed between source and target
static const UChar VARIANT_SEP = 0x002F; // '/', placed between target and variant

// Fixed ID fragments, spelled out as NUL-terminated UTF-16 code units.
static const UChar ANY[]     = { 0x0041, 0x006E, 0x0079, 0 };         // "Any"
static const UChar NULL_ID[] = { 0x004E, 0x0075, 0x006C, 0x006C, 0 }; // "Null"
static const UChar LATIN_PIVOT[] = {                                  // "-Latn;Latn-"
    0x002D, 0x004C, 0x0061, 0x0074, 0x006E, 0x003B,
    0x004C, 0x0061, 0x0074, 0x006E, 0x002D, 0
};

// Starting capacity of the per-object cache that maps a source script to its
// script-to-target transliterator. The cache grows on demand; source text
// mixing more than 7 scripts is not expected.
#define ANY_TRANS_CACHE_INIT_SIZE 7
41
42
//------------------------------------------------------------
43
44
U_CDECL_BEGIN
45
/**
46
* Deleter function for Transliterator*.
47
*/
48
static void U_CALLCONV
49
_deleteTransliterator(void *obj) {
50
delete (icu::Transliterator*) obj;
51
}
52
U_CDECL_END
53
54
//------------------------------------------------------------
55
56
U_NAMESPACE_BEGIN
57
58
//------------------------------------------------------------
59
// ScriptRunIterator
60
61
/**
62
* Returns a series of ranges corresponding to scripts. They will be
63
* of the form:
64
*
65
* ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
66
* | | - first run (start, limit)
67
* | | - second run (start, limit)
68
*
69
* That is, the runs will overlap. The reason for this is so that a
70
* transliterator can consider common characters both before and after
71
* the scripts.
72
*/
73
class ScriptRunIterator : public UMemory {
74
private:
75
const Replaceable& text;
76
int32_t textStart;
77
int32_t textLimit;
78
79
public:
80
/**
81
* The code of the current run, valid after next() returns. May
82
* be USCRIPT_INVALID_CODE if and only if the entire text is
83
* COMMON/INHERITED.
84
*/
85
UScriptCode scriptCode;
86
87
/**
88
* The start of the run, inclusive, valid after next() returns.
89
*/
90
int32_t start;
91
92
/**
93
* The end of the run, exclusive, valid after next() returns.
94
*/
95
int32_t limit;
96
97
/**
98
* Constructs a run iterator over the given text from start
99
* (inclusive) to limit (exclusive).
100
*/
101
ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
102
103
/**
104
* Returns true if there are any more runs. true is always
105
* returned at least once. Upon return, the caller should
106
* examine scriptCode, start, and limit.
107
*/
108
UBool next();
109
110
/**
111
* Adjusts internal indices for a change in the limit index of the
112
* given delta. A positive delta means the limit has increased.
113
*/
114
void adjustLimit(int32_t delta);
115
116
private:
117
ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
118
ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
119
};
120
121
/**
 * Creates an iterator over theText from myStart (inclusive) to myLimit
 * (exclusive).
 *
 * Every member is given a defined value here. next() overwrites
 * scriptCode and start before reporting a run, but initializing them
 * means that reading them before the first next() call yields
 * USCRIPT_INVALID_CODE and myStart instead of indeterminate values.
 * limit starts at myStart because next() begins each run at the
 * previous run's limit.
 */
ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
                                     int32_t myStart, int32_t myLimit) :
    text(theText),
    textStart(myStart),
    textLimit(myLimit),
    scriptCode(USCRIPT_INVALID_CODE),
    start(myStart),
    limit(myStart)
{
}
129
130
/**
 * Advances to the next script run.
 *
 * On a true return, start/limit delimit the run and scriptCode holds its
 * script, or USCRIPT_INVALID_CODE if only COMMON/INHERITED characters
 * were seen. Returns false when the previous run already ended at the
 * end of the text.
 */
UBool ScriptRunIterator::next() {
    // uscript_getScript() needs a status argument; it is not inspected here.
    UErrorCode status = U_ZERO_ERROR;

    // A new run begins where the previous one stopped, script still unknown.
    scriptCode = USCRIPT_INVALID_CODE;
    start = limit;

    if (start == textLimit) {
        return false;   // nothing left to report
    }

    // Pull the start backwards over COMMON/INHERITED characters that
    // immediately precede the run, stopping at the start of the text.
    for (; start > textStart; --start) {
        UChar32 prevChar = text.char32At(start - 1);
        UScriptCode prevScript = uscript_getScript(prevChar, &status);
        if (prevScript != USCRIPT_COMMON && prevScript != USCRIPT_INHERITED) {
            break;
        }
    }

    // Push the limit forwards over COMMON/INHERITED characters and over
    // characters of the run's script. The first specific script met
    // becomes the run's script; a different specific script ends the run.
    for (; limit < textLimit; ++limit) {
        UChar32 nextChar = text.char32At(limit);
        UScriptCode nextScript = uscript_getScript(nextChar, &status);
        if (nextScript == USCRIPT_COMMON || nextScript == USCRIPT_INHERITED) {
            continue;
        }
        if (scriptCode == USCRIPT_INVALID_CODE) {
            scriptCode = nextScript;
        } else if (nextScript != scriptCode) {
            break;
        }
    }

    // A run is reported even when it is entirely COMMON/INHERITED; in that
    // case scriptCode is still USCRIPT_INVALID_CODE.
    return true;
}
174
175
/**
 * Moves both the end of the scanned text and the current run limit by
 * delta. A positive delta means the limit has increased.
 */
void ScriptRunIterator::adjustLimit(int32_t delta) {
    textLimit += delta;
    limit += delta;
}
179
180
//------------------------------------------------------------
181
// AnyTransliterator
182
183
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
184
185
/**
 * Constructs a transliterator with the given ID that converts text of any
 * script to theTargetScript.
 *
 * Opens an empty cache for the per-script transliterators and records the
 * target as "target" or, when theVariant is non-empty, "target/variant".
 * If opening the cache fails, ec reports the failure and the target string
 * is left unset.
 */
AnyTransliterator::AnyTransliterator(const UnicodeString& id,
                                     const UnicodeString& theTarget,
                                     const UnicodeString& theVariant,
                                     UScriptCode theTargetScript,
                                     UErrorCode& ec) :
    Transliterator(id, NULL),
    targetScript(theTargetScript)
{
    cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL,
                           ANY_TRANS_CACHE_INIT_SIZE, &ec);
    if (!U_FAILURE(ec)) {
        // Cached values are Transliterator objects.
        uhash_setValueDeleter(cache, _deleteTransliterator);

        target = theTarget;
        if (theVariant.length() != 0) {
            target.append(VARIANT_SEP);
            target.append(theVariant);
        }
    }
}
204
205
/**
 * Destructor. Closes the cache of per-script transliterators that the
 * constructors opened.
 */
AnyTransliterator::~AnyTransliterator() {
    uhash_close(cache);
}
208
209
/**
210
* Copy constructor.
211
*/
212
AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
213
Transliterator(o),
214
target(o.target),
215
targetScript(o.targetScript)
216
{
217
// Don't copy the cache contents
218
UErrorCode ec = U_ZERO_ERROR;
219
cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
220
if (U_FAILURE(ec)) {
221
return;
222
}
223
uhash_setValueDeleter(cache, _deleteTransliterator);
224
}
225
226
/**
227
* Transliterator API.
228
*/
229
AnyTransliterator* AnyTransliterator::clone() const {
230
return new AnyTransliterator(*this);
231
}
232
233
/**
234
* Implements {@link Transliterator#handleTransliterate}.
235
*/
236
void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
237
UBool isIncremental) const {
238
int32_t allStart = pos.start;
239
int32_t allLimit = pos.limit;
240
241
ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
242
243
while (it.next()) {
244
// Ignore runs in the ante context
245
if (it.limit <= allStart) continue;
246
247
// Try to instantiate transliterator from it.scriptCode to
248
// our target or target/variant
249
Transliterator* t = getTransliterator(it.scriptCode);
250
251
if (t == NULL) {
252
// We have no transliterator. Do nothing, but keep
253
// pos.start up to date.
254
pos.start = it.limit;
255
continue;
256
}
257
258
// If the run end is before the transliteration limit, do
259
// a non-incremental transliteration. Otherwise do an
260
// incremental one.
261
UBool incremental = isIncremental && (it.limit >= allLimit);
262
263
pos.start = uprv_max(allStart, it.start);
264
pos.limit = uprv_min(allLimit, it.limit);
265
int32_t limit = pos.limit;
266
t->filteredTransliterate(text, pos, incremental);
267
int32_t delta = pos.limit - limit;
268
allLimit += delta;
269
it.adjustLimit(delta);
270
271
// We're done if we enter the post context
272
if (it.limit >= allLimit) break;
273
}
274
275
// Restore limit. pos.start is fine where the last transliterator
276
// left it, or at the end of the last run.
277
pos.limit = allLimit;
278
}
279
280
/**
 * Returns the transliterator that converts text of the given source
 * script to this object's target (or target/variant), creating and
 * caching it on first use.
 *
 * The direct ID "<source>-<target>" is tried first; if that cannot be
 * instantiated, the compound ID "<source>-Latn;Latn-<target>" is tried,
 * pivoting through Latin.
 *
 * @param source  script of the run to be transliterated
 * @return the cached transliterator, owned by the cache, or NULL if
 *         source is the target script or USCRIPT_INVALID_CODE, if no
 *         cache is available, or if no transliterator can be created
 */
Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
        return NULL;
    }

    // The copy constructor cannot report a failure to open the cache;
    // without a cache there is nowhere to keep (and own) a transliterator.
    if (cache == NULL) {
        return NULL;
    }

    Transliterator* t = NULL;
    {
        Mutex m(NULL);
        t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    }
    if (t != NULL) {
        return t;
    }

    UErrorCode ec = U_ZERO_ERROR;
    UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
    UnicodeString id(sourceName);
    id.append(TARGET_SEP).append(target);

    t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
    if (U_FAILURE(ec) || t == NULL) {
        delete t;

        // Try to pivot around Latin, our most common script.
        // The status must be cleared first: createInstance() does nothing
        // when it is handed a status that already indicates failure, so
        // without the reset this retry could never succeed.
        ec = U_ZERO_ERROR;
        id = sourceName;
        id.append(LATIN_PIVOT, -1).append(target);
        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
        if (U_FAILURE(ec) || t == NULL) {
            delete t;
            return NULL;
        }
    }

    // Publish the new transliterator, unless another thread did so first.
    Transliterator *rt = NULL;
    {
        Mutex m(NULL);
        rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
        if (rt == NULL) {
            // Common case, no race to cache this new transliterator.
            uhash_iput(cache, (int32_t) source, t, &ec);
            if (U_FAILURE(ec)) {
                // The table adopts the value even when the insertion
                // fails and has already disposed of it through the
                // value deleter; do not hand out a dangling pointer.
                t = NULL;
            }
        } else {
            // Race case, some other thread beat us to caching this transliterator.
            Transliterator *temp = rt;
            rt = t;     // Our newly created transliterator that lost the race & now needs deleting.
            t = temp;   // The transliterator from the cache that we will return.
        }
    }
    delete rt;     // will be non-null only in case of races.

    return t;
}
331
332
/**
333
* Return the script code for a given name, or -1 if not found.
334
*/
335
static UScriptCode scriptNameToCode(const UnicodeString& name) {
336
char buf[128];
337
UScriptCode code;
338
UErrorCode ec = U_ZERO_ERROR;
339
int32_t nameLen = name.length();
340
UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
341
342
if (isInvariant) {
343
name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
344
buf[127] = 0; // Make sure that we NULL terminate the string.
345
}
346
if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
347
{
348
code = USCRIPT_INVALID_CODE;
349
}
350
return code;
351
}
352
353
/**
354
* Registers standard transliterators with the system. Called by
355
* Transliterator during initialization. Scan all current targets and
356
* register those that are scripts T as Any-T/V.
357
*/
358
void AnyTransliterator::registerIDs() {
359
360
UErrorCode ec = U_ZERO_ERROR;
361
Hashtable seen(true, ec);
362
363
int32_t sourceCount = Transliterator::_countAvailableSources();
364
for (int32_t s=0; s<sourceCount; ++s) {
365
UnicodeString source;
366
Transliterator::_getAvailableSource(s, source);
367
368
// Ignore the "Any" source
369
if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
370
371
int32_t targetCount = Transliterator::_countAvailableTargets(source);
372
for (int32_t t=0; t<targetCount; ++t) {
373
UnicodeString target;
374
Transliterator::_getAvailableTarget(t, source, target);
375
376
// Only process each target once
377
if (seen.geti(target) != 0) continue;
378
ec = U_ZERO_ERROR;
379
seen.puti(target, 1, ec);
380
381
// Get the script code for the target. If not a script, ignore.
382
UScriptCode targetScript = scriptNameToCode(target);
383
if (targetScript == USCRIPT_INVALID_CODE) continue;
384
385
int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
386
// assert(variantCount >= 1);
387
for (int32_t v=0; v<variantCount; ++v) {
388
UnicodeString variant;
389
Transliterator::_getAvailableVariant(v, source, target, variant);
390
391
UnicodeString id;
392
TransliteratorIDParser::STVtoID(UnicodeString(true, ANY, 3), target, variant, id);
393
ec = U_ZERO_ERROR;
394
AnyTransliterator* tl = new AnyTransliterator(id, target, variant,
395
targetScript, ec);
396
if (U_FAILURE(ec)) {
397
delete tl;
398
} else {
399
Transliterator::_registerInstance(tl);
400
Transliterator::_registerSpecialInverse(target, UnicodeString(true, NULL_ID, 4), false);
401
}
402
}
403
}
404
}
405
}
406
407
U_NAMESPACE_END
408
409
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
410
411
//eof
412
413