Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
alexbevi
GitHub Repository: alexbevi/BizHawk
Path: blob/master/psx/mednadisc/string/ConvertUTF.cpp
2 views
1
/*
2
* Copyright 2001-2004 Unicode, Inc.
3
*
4
* Disclaimer
5
*
6
* This source code is provided as is by Unicode, Inc. No claims are
7
* made as to fitness for any particular purpose. No warranties of any
8
* kind are expressed or implied. The recipient agrees to determine
9
* applicability of information provided. If this file has been
10
* purchased on magnetic or optical media from Unicode, Inc., the
11
* sole remedy for any claim will be exchange of defective media
12
* within 90 days of receipt.
13
*
14
* Limitations on Rights to Redistribute This Code
15
*
16
* Unicode, Inc. hereby grants the right to freely use the information
17
* supplied in this file in the creation of products supporting the
18
* Unicode Standard, and to make copies of this file in any form
19
* for internal or external distribution as long as this notice
20
* remains attached.
21
*/
22
23
/* ---------------------------------------------------------------------
24
25
Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26
Author: Mark E. Davis, 1994.
27
Rev History: Rick McGowan, fixes & updates May 2001.
28
Sept 2001: fixed const & error conditions per
29
mods suggested by S. Parent & A. Lillich.
30
June 2002: Tim Dodd added detection and handling of incomplete
31
source sequences, enhanced error detection, added casts
32
to eliminate compiler warnings.
33
July 2003: slight mods to back out aggressive FFFE detection.
34
Jan 2004: updated switches in from-UTF8 conversions.
35
Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37
See the header file "ConvertUTF.h" for complete documentation.
38
39
------------------------------------------------------------------------ */
40
41
42
#include "../types.h"
43
#include "ConvertUTF.h"
44
#ifdef CVTUTF_DEBUG
45
#include <stdio.h>
46
#endif
47
48
#include <string.h>
49
#include <stdlib.h>
50
51
static const int halfShift = 10; /* used for shifting by 10 bits */
52
53
static const UTF32 halfBase = 0x0010000UL;
54
static const UTF32 halfMask = 0x3FFUL;
55
56
#define UNI_SUR_HIGH_START (UTF32)0xD800
57
#define UNI_SUR_HIGH_END (UTF32)0xDBFF
58
#define UNI_SUR_LOW_START (UTF32)0xDC00
59
#define UNI_SUR_LOW_END (UTF32)0xDFFF
60
#define false 0
61
#define true 1
62
63
/* --------------------------------------------------------------------- */
64
65
/*
 * Convert the UTF-32 values in [*sourceStart, sourceEnd) to UTF-16 code
 * units stored in [*targetStart, targetEnd).
 *
 * Values up to UNI_MAX_BMP are copied as one unit; values up to
 * UNI_MAX_LEGAL_UTF32 become a surrogate pair.  Surrogate values and values
 * above UNI_MAX_LEGAL_UTF32 are illegal input: with strictConversion the
 * routine stops and returns sourceIllegal, otherwise it emits
 * UNI_REPLACEMENT_CHAR and continues.
 *
 * Returns conversionOK, sourceIllegal or targetExhausted.  On return
 * *sourceStart and *targetStart are advanced past what was converted; after
 * sourceIllegal or a surrogate-pair targetExhausted, *sourceStart points at
 * the value that was not converted.
 */
ConversionResult ConvertUTF32toUTF16 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF16* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch;

        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }
        ch = *source++;

        if (ch <= UNI_MAX_BMP) {
            /* Fits in one unit, unless it is a (reserved) surrogate value. */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    --source;               /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
                *target++ = UNI_REPLACEMENT_CHAR;
            } else {
                *target++ = (UTF16)ch;      /* normal case */
            }
        } else if (ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /*
                 * Stop at the offending value, exactly like the surrogate
                 * case above, so the caller can locate it and the error is
                 * not later masked by targetExhausted.
                 */
                --source;
                result = sourceIllegal;
                break;
            }
            *target++ = UNI_REPLACEMENT_CHAR;
        } else {
            /* Supplementary code point: needs room for two units. */
            if (target + 1 >= targetEnd) {
                --source;                   /* back up source pointer */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
111
112
/* --------------------------------------------------------------------- */
113
114
/*
 * Convert the UTF-16 code units in [*sourceStart, sourceEnd) to UTF-32
 * values stored in [*targetStart, targetEnd).
 *
 * A high surrogate followed by a low surrogate is combined into one value.
 * With strictConversion an unpaired surrogate stops the conversion with
 * sourceIllegal; otherwise the unpaired unit is copied through unchanged.
 * A high surrogate that is the last unit of the input yields
 * sourceExhausted.
 *
 * Returns conversionOK, sourceExhausted, sourceIllegal or targetExhausted.
 * On any non-OK return *sourceStart points at the first unit that was not
 * converted.
 */
ConversionResult ConvertUTF16toUTF32 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF32* target = *targetStart;
    /* Initialised so the debug trace below never reads an indeterminate value. */
    UTF32 ch = 0, ch2 = 0;

    while (source < sourceEnd) {
        const UTF16* oldSource = source;    /* in case we have to back up on target overflow */

        ch = *source++;
        ch2 = 0;                            /* no trailing unit examined yet for this character */

        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            if (source >= sourceEnd) {
                /* The unit that should follow the high surrogate is not in the buffer. */
                --source;                   /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
            ch2 = *source;
            if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                /* Proper pair: combine the two 10-bit halves. */
                ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                   + (ch2 - UNI_SUR_LOW_START) + halfBase;
                ++source;
            } else if (flags == strictConversion) {
                /* Unpaired high surrogate. */
                --source;                   /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        } else if (flags == strictConversion) {
            /* A low surrogate with no preceding high surrogate is illegal. */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source;                   /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }

        if (target >= targetEnd) {
            source = oldSource;             /* back up source pointer */
            result = targetExhausted;
            break;
        }
        *target++ = ch;
    }

    *sourceStart = source;
    *targetStart = target;
#ifdef CVTUTF_DEBUG
    if (result == sourceIllegal) {
        /* Casts make the arguments match %x whatever the width of UTF32. */
        fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n",
                (unsigned int)ch, (unsigned int)ch2);
        fflush(stderr);
    }
#endif
    return result;
}
168
169
/* --------------------------------------------------------------------- */
170
171
/*
 * Number of trailing bytes expected after a given first byte of a UTF-8
 * sequence; index the table with that first byte.
 * Legal UTF-8 never has 4 or 5 trailing bytes.  Those entries are kept for
 * anyone who wants to handle the longer forms that earlier algorithms
 * allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x10 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x20 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x30 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x40 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x50 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x60 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x70 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x80 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x90 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xA0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xB0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xC0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xD0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xE0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xF0 */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,5,5
};
188
189
/*
190
* Magic values subtracted from a buffer value during UTF8 conversion.
191
* This table contains as many values as there might be trailing bytes
192
* in a UTF-8 sequence.
193
*/
194
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
195
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
196
197
/*
198
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
199
* into the first byte, depending on how many bytes follow. There are
200
* as many entries in this table as there are UTF-8 sequence types.
201
* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
202
* for *legal* UTF-8 will be 4 or fewer bytes total.
203
*/
204
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
205
206
/* --------------------------------------------------------------------- */
207
208
/* The interface converts a whole buffer to avoid function-call overhead.
209
* Constants have been gathered. Loops & conditionals have been removed as
210
* much as possible for efficiency, in favor of drop-through switches.
211
* (See "Note A" at the bottom of the file for equivalent code.)
212
* If your compiler supports it, the "isLegalUTF8" call can be turned
213
* into an inline function.
214
*/
215
216
/* --------------------------------------------------------------------- */
217
218
/*
 * Convert the UTF-16 code units in [*sourceStart, sourceEnd) to UTF-8 bytes
 * stored in [*targetStart, targetEnd).
 *
 * Surrogate pairs are combined before encoding.  With strictConversion an
 * unpaired surrogate stops the conversion with sourceIllegal; otherwise the
 * unpaired unit is encoded as it stands.  A high surrogate that is the last
 * unit of the input yields sourceExhausted.
 *
 * Returns conversionOK, sourceExhausted, sourceIllegal or targetExhausted.
 * On any non-OK return *sourceStart points at the first unit that was not
 * converted and nothing has been written for it.
 */
ConversionResult ConvertUTF16toUTF8 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF8* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;
        const UTF16* oldSource = source;    /* in case we have to back up on target overflow */

        ch = *source++;

        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            if (source >= sourceEnd) {
                /* The unit that should follow the high surrogate is not in the buffer. */
                --source;                   /* return to the high surrogate */
                result = sourceExhausted;
                break;
            } else {
                UTF32 ch2 = *source;
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    /* Proper pair: combine the two 10-bit halves. */
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                       + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) {
                    /* Unpaired high surrogate. */
                    --source;               /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            }
        } else if (flags == strictConversion) {
            /* A low surrogate with no preceding high surrogate is illegal. */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source;                   /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }

        /* Figure out how many bytes the result will require. */
        if (ch < (UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch < (UTF32)0x110000) {
            bytesToWrite = 4;
        } else {
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
        }

        /*
         * Compare remaining capacity as a pointer difference.  Advancing
         * target first and comparing afterwards could form a pointer more
         * than one past the end of the buffer, which is undefined.
         */
        if (targetEnd - target < bytesToWrite) {
            source = oldSource;             /* back up source pointer */
            result = targetExhausted;
            break;
        }

        /* Fill from the last byte to the first; every case falls through. */
        switch (bytesToWrite) {
            case 4: target[3] = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fallthrough */
            case 3: target[2] = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fallthrough */
            case 2: target[1] = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fallthrough */
            case 1: target[0] = (UTF8)(ch | firstByteMark[bytesToWrite]);
                    break;
            default: break;
        }
        target += bytesToWrite;
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
285
286
/* --------------------------------------------------------------------- */
287
288
/*
289
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
290
* This must be called with the length pre-determined by the first byte.
291
* If not calling this from ConvertUTF8to*, then the length can be set by:
292
* length = trailingBytesForUTF8[*source]+1;
293
* and the sequence is illegal right away if there aren't that many bytes
294
* available.
295
* If presented with a length > 4, this returns false. The Unicode
296
* definition of UTF-8 goes up to 4-byte sequences.
297
*/
298
299
static bool isLegalUTF8(const UTF8 *source, int length) {
300
UTF8 a;
301
const UTF8 *srcptr = source+length;
302
switch (length) {
303
default: return false;
304
/* Everything else falls through when "true"... */
305
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
306
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
307
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
308
309
switch (*source) {
310
/* no fall-through in this inner switch */
311
case 0xE0: if (a < 0xA0) return false; break;
312
case 0xED: if (a > 0x9F) return false; break;
313
case 0xF0: if (a < 0x90) return false; break;
314
case 0xF4: if (a > 0x8F) return false; break;
315
default: if (a < 0x80) return false;
316
}
317
318
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
319
}
320
if (*source > 0xF4) return false;
321
return true;
322
}
323
324
/* --------------------------------------------------------------------- */
325
326
/*
327
* Exported function to return whether a UTF-8 sequence is legal or not.
328
* This is not used here; it's just exported.
329
*/
330
bool isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
331
int length = trailingBytesForUTF8[*source]+1;
332
if (source+length > sourceEnd) {
333
return false;
334
}
335
return isLegalUTF8(source, length);
336
}
337
338
/* --------------------------------------------------------------------- */
339
340
/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16 code units
 * stored in [*targetStart, targetEnd).
 *
 * Every sequence is validated with isLegalUTF8() whether the conversion is
 * lenient or strict.  Values above UNI_MAX_BMP are written as a surrogate
 * pair.
 *
 * Returns conversionOK, sourceExhausted (input ends inside a sequence),
 * sourceIllegal or targetExhausted.  On any non-OK return *sourceStart
 * points at the first byte of the sequence that was not converted.
 */
ConversionResult ConvertUTF8toUTF16 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF16* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        unsigned short remaining;

        /*
         * Is the whole sequence present?  Compare distances rather than
         * computing source + extraBytesToRead, which could point more than
         * one past the end of the buffer.
         */
        if (sourceEnd - source <= extraBytesToRead) {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict. */
        if (! isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }

        /* Accumulate all bytes, shifting 6 bits between consecutive ones. */
        for (remaining = extraBytesToRead; remaining > 0; --remaining) {
            ch += *source++;
            ch <<= 6;
        }
        ch += *source++;
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead+1); /* back up source pointer */
            result = targetExhausted;
            break;
        }

        if (ch <= UNI_MAX_BMP) {
            /* Fits in one unit, unless it is a surrogate value. */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
                *target++ = UNI_REPLACEMENT_CHAR;
            } else {
                *target++ = (UTF16)ch;      /* normal case */
            }
        } else if (ch > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                result = sourceIllegal;
                source -= (extraBytesToRead+1); /* return to the start */
                break;                      /* bail out; shouldn't continue */
            }
            *target++ = UNI_REPLACEMENT_CHAR;
        } else {
            /* Supplementary code point: needs room for two units. */
            if (target + 1 >= targetEnd) {
                source -= (extraBytesToRead+1); /* back up source pointer */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
410
411
/* --------------------------------------------------------------------- */
412
413
/*
 * Convert the UTF-32 values in [*sourceStart, sourceEnd) to UTF-8 bytes
 * stored in [*targetStart, targetEnd).
 *
 * With strictConversion a surrogate value stops the conversion with
 * sourceIllegal and *sourceStart left pointing at it; otherwise it is
 * encoded as it stands.  A value above UNI_MAX_LEGAL_UTF32 is, regardless
 * of flags, written as UNI_REPLACEMENT_CHAR while the result is set to
 * sourceIllegal and the conversion carries on.
 *
 * Returns conversionOK, sourceIllegal or targetExhausted.  After
 * targetExhausted *sourceStart points at the value that did not fit.
 */
ConversionResult ConvertUTF32toUTF8 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF8* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;

        ch = *source++;
        if (flags == strictConversion) {
            /* UTF-16 surrogate values are illegal in UTF-32. */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                --source;                   /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }

        /*
         * Figure out how many bytes the result will require.  Turn any
         * illegally large UTF32 things (> Plane 17) into replacement chars.
         */
        if (ch < (UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch <= UNI_MAX_LEGAL_UTF32) {
            bytesToWrite = 4;
        } else {
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
            result = sourceIllegal;
        }

        /*
         * Compare remaining capacity as a pointer difference.  Advancing
         * target first and comparing afterwards could form a pointer more
         * than one past the end of the buffer, which is undefined.
         */
        if (targetEnd - target < bytesToWrite) {
            --source;                       /* back up source pointer */
            result = targetExhausted;
            break;
        }

        /* Fill from the last byte to the first; every case falls through. */
        switch (bytesToWrite) {
            case 4: target[3] = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fallthrough */
            case 3: target[2] = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fallthrough */
            case 2: target[1] = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fallthrough */
            case 1: target[0] = (UTF8)(ch | firstByteMark[bytesToWrite]);
                    break;
            default: break;
        }
        target += bytesToWrite;
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
463
464
/* --------------------------------------------------------------------- */
465
466
/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-32 values
 * stored in [*targetStart, targetEnd).
 *
 * Every sequence is validated with isLegalUTF8() whether the conversion is
 * lenient or strict.  A decoded surrogate value stops a strict conversion
 * with sourceIllegal and is replaced by UNI_REPLACEMENT_CHAR in a lenient
 * one.  A decoded value above UNI_MAX_LEGAL_UTF32 is replaced by
 * UNI_REPLACEMENT_CHAR and sets the result to sourceIllegal without
 * stopping.
 *
 * Returns conversionOK, sourceExhausted (input ends inside a sequence),
 * sourceIllegal or targetExhausted.  When the loop stops early *sourceStart
 * points at the first byte of the sequence that was not converted.
 */
ConversionResult ConvertUTF8toUTF32 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        unsigned short remaining;

        /*
         * Is the whole sequence present?  Compare distances rather than
         * computing source + extraBytesToRead, which could point more than
         * one past the end of the buffer.
         */
        if (sourceEnd - source <= extraBytesToRead) {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict. */
        if (! isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }

        /* Accumulate all bytes, shifting 6 bits between consecutive ones. */
        for (remaining = extraBytesToRead; remaining > 0; --remaining) {
            ch += *source++;
            ch <<= 6;
        }
        ch += *source++;
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead+1); /* back up the source pointer */
            result = targetExhausted;
            break;
        }

        if (ch <= UNI_MAX_LEGAL_UTF32) {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
                *target++ = UNI_REPLACEMENT_CHAR;
            } else {
                *target++ = ch;
            }
        } else {
            /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
525
526
/*
 * Build a newly allocated, zero-terminated UTF-32 copy of the
 * NUL-terminated UTF-8 string `string`, converting leniently.
 *
 * The result of ConvertUTF8toUTF32() is not inspected: if the conversion
 * stops early, the returned string holds only what was converted up to that
 * point.
 *
 * Returns a malloc()ed buffer that the caller must free(), or NULL when the
 * buffer cannot be allocated.
 */
UTF32 *MakeUTF32FromUTF8(UTF8 *string)
{
    const UTF8 *src = string;
    size_t string_length = strlen((const char *)string);
    UTF32 *ret;
    UTF32 *dst;

    /* Reject lengths whose buffer size would overflow size_t. */
    if (string_length >= ((size_t)-1) / sizeof(UTF32)) {
        return NULL;
    }

    /*
     * One UTF-32 value per input byte is the worst case (pure ASCII), plus
     * one whole UTF32 element - not one byte - for the terminator.
     */
    ret = (UTF32 *)malloc((string_length + 1) * sizeof(UTF32));
    if (ret == NULL) {
        return NULL;
    }

    dst = ret;
    ConvertUTF8toUTF32(&src, string + string_length, &dst, ret + string_length, lenientConversion);

    /* dst never passes ret + string_length, so the terminator is in bounds. */
    *dst = 0;

    return(ret);
}
541
542
543
/* ---------------------------------------------------------------------
544
545
Note A.
546
The fall-through switches in UTF-8 reading code save a
547
temp variable, some decrements & conditionals. The switches
548
are equivalent to the following loop:
549
{
550
int tmpBytesToRead = extraBytesToRead+1;
551
do {
552
ch += *source++;
553
--tmpBytesToRead;
554
if (tmpBytesToRead) ch <<= 6;
555
} while (tmpBytesToRead > 0);
556
}
557
In UTF-8 writing code, the switches on "bytesToWrite" are
558
similarly unrolled loops.
559
560
--------------------------------------------------------------------- */
561
562