CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Common/Data/Encoding/Utf8.cpp
Views: 1401
1
/*
2
Basic UTF-8 manipulation routines
3
by Jeff Bezanson
4
placed in the public domain Fall 2005
5
6
This code is designed to provide the utilities you need to manipulate
7
UTF-8 as an internal string encoding. These functions do not perform the
8
error checking normally needed when handling UTF-8 data, so if you happen
9
to be from the Unicode Consortium you will want to flay me alive.
10
I do this because error checking can be performed at the boundaries (I/O),
11
with these routines reserved for higher performance on data known to be
12
valid.
13
*/
14
15
#ifdef _WIN32
16
#include <windows.h>
17
#undef min
18
#undef max
19
#endif
20
21
#include <cstdlib>
22
#include <cstdio>
23
#include <cstring>
24
#include <cstdarg>
25
#include <cstdint>
26
27
#include <algorithm>
28
#include <string>
29
30
#include "Common/Data/Encoding/Utf8.h"
31
#include "Common/Data/Encoding/Utf16.h"
32
#include "Common/Log.h"
33
34
// True when the byte can begin a UTF-8 sequence, i.e. it is not a
// continuation byte of the form 10xxxxxx.
inline bool isutf(char c) {
	const int topTwoBits = c & 0xC0;
	return topTwoBits != 0x80;
}
38
39
// Value to subtract from the raw accumulated bytes of a sequence, indexed by
// the number of trailing bytes, to strip the lead/continuation marker bits.
static const uint32_t offsetsFromUTF8[6] = {
	0x00000000UL,  // 1-byte sequence
	0x00003080UL,  // 2-byte sequence
	0x000E2080UL,  // 3-byte sequence
	0x03C82080UL,  // 4-byte sequence
	0xFA082080UL,  // 5-byte sequence
	0x82082080UL,  // 6-byte sequence
};
43
44
// Number of bytes that follow a given first byte, indexed by that byte's value.
static const uint8_t trailingBytesForUTF8[256] = {
	/* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	/* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
};
54
55
/* returns length of next utf-8 sequence */
56
int u8_seqlen(const char *s)
57
{
58
return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
59
}
60
61
/* conversions without error checking
62
only works for valid UTF-8, i.e. no 5- or 6-byte sequences
63
srcsz = source size in bytes, or -1 if 0-terminated
64
sz = dest size in # of wide characters
65
66
returns # characters converted
67
dest will always be L'\0'-terminated, even if there isn't enough room
68
for all the characters.
69
if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
70
*/
71
int u8_toucs(uint32_t *dest, int sz, const char *src, int srcsz)
72
{
73
uint32_t ch;
74
const char *src_end = src + srcsz;
75
int nb;
76
int i=0;
77
78
while (i < sz-1) {
79
nb = trailingBytesForUTF8[(unsigned char)*src];
80
if (srcsz == -1) {
81
if (*src == 0)
82
goto done_toucs;
83
}
84
else {
85
if (src + nb >= src_end)
86
goto done_toucs;
87
}
88
ch = 0;
89
switch (nb) {
90
/* these fall through deliberately */
91
case 3: ch += (unsigned char)*src++; ch <<= 6;
92
case 2: ch += (unsigned char)*src++; ch <<= 6;
93
case 1: ch += (unsigned char)*src++; ch <<= 6;
94
case 0: ch += (unsigned char)*src++;
95
}
96
ch -= offsetsFromUTF8[nb];
97
dest[i++] = ch;
98
}
99
done_toucs:
100
dest[i] = 0;
101
return i;
102
}
103
104
/* Encodes 32-bit code points as UTF-8.
   srcsz = number of source characters, or -1 if 0-terminated
   sz    = size of dest buffer in bytes

   returns # characters converted
   dest is only '\0'-terminated if every character fit AND there is still
   a byte left for the terminator. (With 2 bytes left and a 3-byte character
   pending we could terminate, but in general we can't when space runs out,
   so for consistency we never do.)
   Values of 0x110000 and above produce no output but are still counted.
   the destination string will never be bigger than the source string.
*/
int u8_toutf8(char *dest, int sz, const uint32_t *src, int srcsz)
{
	char *out = dest;
	char *const limit = dest + sz;
	int n = 0;

	for (;;) {
		// End of input: terminator in 0-terminated mode, count otherwise.
		if (srcsz < 0) {
			if (src[n] == 0)
				break;
		} else if (n >= srcsz) {
			break;
		}

		const uint32_t cp = src[n];
		const int room = (int)(limit - out);

		if (cp < 0x80) {
			if (room < 1)
				return n;
			out[0] = (char)cp;
			out += 1;
		} else if (cp < 0x800) {
			if (room < 2)
				return n;
			out[0] = (cp >> 6) | 0xC0;
			out[1] = (cp & 0x3F) | 0x80;
			out += 2;
		} else if (cp < 0x10000) {
			if (room < 3)
				return n;
			out[0] = (cp >> 12) | 0xE0;
			out[1] = ((cp >> 6) & 0x3F) | 0x80;
			out[2] = (cp & 0x3F) | 0x80;
			out += 3;
		} else if (cp < 0x110000) {
			if (room < 4)
				return n;
			out[0] = (cp >> 18) | 0xF0;
			out[1] = ((cp >> 12) & 0x3F) | 0x80;
			out[2] = ((cp >> 6) & 0x3F) | 0x80;
			out[3] = (cp & 0x3F) | 0x80;
			out += 4;
		}
		n++;
	}

	if (out < limit)
		*out = '\0';
	return n;
}
156
157
/* Encodes one code point into dest (which must have room for 4 bytes) and
   returns the number of bytes written; 0 for values of 0x110000 and above.
   No terminator is written. */
int u8_wc_toutf8(char *dest, uint32_t ch)
{
	int len;
	uint32_t leadMarker;

	if (ch < 0x80) {
		dest[0] = (char)ch;
		return 1;
	} else if (ch < 0x800) {
		len = 2;
		leadMarker = 0xC0;
	} else if (ch < 0x10000) {
		len = 3;
		leadMarker = 0xE0;
	} else if (ch < 0x110000) {
		len = 4;
		leadMarker = 0xF0;
	} else {
		return 0;
	}

	// Fill continuation bytes from the end, peeling off 6 bits each time;
	// what remains belongs in the lead byte.
	for (int k = len - 1; k > 0; k--) {
		dest[k] = (ch & 0x3F) | 0x80;
		ch >>= 6;
	}
	dest[0] = ch | leadMarker;
	return len;
}
183
184
/* charnum => byte offset */
185
int u8_offset(const char *str, int charnum)
186
{
187
int offs=0;
188
189
while (charnum > 0 && str[offs]) {
190
(void)(isutf(str[++offs]) || isutf(str[++offs]) ||
191
isutf(str[++offs]) || ++offs);
192
charnum--;
193
}
194
return offs;
195
}
196
197
/* byte offset => charnum */
198
int u8_charnum(const char *s, int offset)
199
{
200
int charnum = 0, offs=0;
201
202
while (offs < offset && s[offs]) {
203
(void)(isutf(s[++offs]) || isutf(s[++offs]) ||
204
isutf(s[++offs]) || ++offs);
205
charnum++;
206
}
207
return charnum;
208
}
209
210
/* reads the next utf-8 sequence out of a string, updating an index */
211
uint32_t u8_nextchar(const char *s, int *index, size_t size) {
212
uint32_t ch = 0;
213
_dbg_assert_(*index >= 0 && *index < 100000000);
214
int sz = 0;
215
int i = *index;
216
do {
217
ch = (ch << 6) + (unsigned char)s[i++];
218
sz++;
219
} while (i < size && s[i] && ((s[i]) & 0xC0) == 0x80);
220
*index = i;
221
return ch - offsetsFromUTF8[sz - 1];
222
}
223
224
// Decodes the sequence at s[*i] and advances *i past it. The length comes
// from the first byte alone; trailing bytes are read without any checks.
uint32_t u8_nextchar_unsafe(const char *s, int *i) {
	uint32_t value = (unsigned char)s[(*i)++];
	int trailing = 0;

	// Classify the first byte and keep only its payload bits.
	if (value >= 0xF0) {
		trailing = 3;
		value &= 0x0F;
	} else if (value >= 0xE0) {
		trailing = 2;
		value &= 0x1F;
	} else if (value >= 0xC0) {
		trailing = 1;
		value &= 0x3F;
	}

	// Just assume the bytes must be there. This is the logic used on the PSP.
	while (trailing-- > 0) {
		const uint32_t payload = ((unsigned char)s[(*i)++]) & 0x3F;
		value = (value << 6) + payload;
	}
	return value;
}
247
248
void u8_inc(const char *s, int *i)
249
{
250
(void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||
251
isutf(s[++(*i)]) || ++(*i));
252
}
253
254
void u8_dec(const char *s, int *i)
255
{
256
(void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) ||
257
isutf(s[--(*i)]) || --(*i));
258
}
259
260
// Returns 1 if c is one of '0'..'7', otherwise 0.
int octal_digit(char c)
{
	if (c < '0')
		return 0;
	if (c > '7')
		return 0;
	return 1;
}
264
265
// Returns 1 if c is a hexadecimal digit (0-9, A-F, a-f), otherwise 0.
int hex_digit(char c)
{
	const bool decimal = (c >= '0' && c <= '9');
	const bool upper = (c >= 'A' && c <= 'F');
	const bool lower = (c >= 'a' && c <= 'f');
	return (decimal || upper || lower) ? 1 : 0;
}
271
272
/* assumes that src points to the character after a backslash
273
returns number of input characters processed */
274
int u8_read_escape_sequence(const char *str, uint32_t *dest)
275
{
276
long ch;
277
char digs[9]="\0\0\0\0\0\0\0\0";
278
int dno=0, i=1;
279
280
ch = (uint32_t)str[0]; /* take literal character */
281
if (str[0] == 'n')
282
ch = L'\n';
283
else if (str[0] == 't')
284
ch = L'\t';
285
else if (str[0] == 'r')
286
ch = L'\r';
287
else if (str[0] == 'b')
288
ch = L'\b';
289
else if (str[0] == 'f')
290
ch = L'\f';
291
else if (str[0] == 'v')
292
ch = L'\v';
293
else if (str[0] == 'a')
294
ch = L'\a';
295
else if (octal_digit(str[0])) {
296
i = 0;
297
do {
298
digs[dno++] = str[i++];
299
} while (octal_digit(str[i]) && dno < 3);
300
ch = strtol(digs, NULL, 8);
301
}
302
else if (str[0] == 'x') {
303
while (hex_digit(str[i]) && dno < 2) {
304
digs[dno++] = str[i++];
305
}
306
if (dno > 0)
307
ch = strtol(digs, NULL, 16);
308
}
309
else if (str[0] == 'u') {
310
while (hex_digit(str[i]) && dno < 4) {
311
digs[dno++] = str[i++];
312
}
313
if (dno > 0)
314
ch = strtol(digs, NULL, 16);
315
}
316
else if (str[0] == 'U') {
317
while (hex_digit(str[i]) && dno < 8) {
318
digs[dno++] = str[i++];
319
}
320
if (dno > 0)
321
ch = strtol(digs, NULL, 16);
322
}
323
*dest = (uint32_t)ch;
324
325
return i;
326
}
327
328
/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
329
example: u8_unescape(mybuf, 256, "hello\\u220e")
330
note the double backslash is needed if called on a C string literal */
331
int u8_unescape(char *buf, int sz, char *src)
332
{
333
int c=0, amt;
334
uint32_t ch;
335
char temp[4];
336
337
while (*src && c < sz) {
338
if (*src == '\\') {
339
src++;
340
amt = u8_read_escape_sequence(src, &ch);
341
}
342
else {
343
ch = (uint32_t)*src;
344
amt = 1;
345
}
346
src += amt;
347
amt = u8_wc_toutf8(temp, ch);
348
if (amt > sz-c)
349
break;
350
memcpy(&buf[c], temp, amt);
351
c += amt;
352
}
353
if (c < sz)
354
buf[c] = '\0';
355
return c;
356
}
357
358
/* Returns 1 if the locale name's encoding part (the text after the first
   '.' and before any '@', '+' or ',') is exactly "UTF-8" or "utf8",
   otherwise 0. This code is based on libutf8. */
int u8_is_locale_utf8(const char *locale)
{
	const char *p = locale;

	// Look for the '.' that introduces the encoding; give up if a modifier
	// separator or the end of the string comes first.
	while (*p != '\0' && *p != '@' && *p != '+' && *p != ',' && *p != '.')
		p++;
	if (*p != '.')
		return 0;

	const char *encoding = p + 1;
	const char *end = encoding;
	while (*end != '\0' && *end != '@' && *end != '+' && *end != ',')
		end++;

	const long len = (long)(end - encoding);
	if (len == 5 && strncmp(encoding, "UTF-8", 5) == 0)
		return 1; /* it's UTF-8 */
	if (len == 4 && strncmp(encoding, "utf8", 4) == 0)
		return 1; /* it's UTF-8 */
	return 0;
}
376
377
bool AnyEmojiInString(std::string_view str, size_t byteCount) {
378
int i = 0;
379
while (i < byteCount) {
380
uint32_t c = u8_nextchar(str.data(), &i, str.size());
381
if (CodepointIsProbablyEmoji(c)) {
382
return true;
383
}
384
}
385
return false;
386
}
387
388
int UTF8StringNonASCIICount(std::string_view utf8string) {
389
UTF8 utf(utf8string);
390
int count = 0;
391
while (!utf.end()) {
392
int c = utf.next();
393
if (c > 127)
394
++count;
395
}
396
return count;
397
}
398
399
bool UTF8StringHasNonASCII(std::string_view utf8string) {
400
return UTF8StringNonASCIICount(utf8string) > 0;
401
}
402
403
#ifdef _WIN32
404
405
std::string ConvertWStringToUTF8(const wchar_t *wstr) {
406
int len = (int)wcslen(wstr);
407
int size = (int)WideCharToMultiByte(CP_UTF8, 0, wstr, len, 0, 0, NULL, NULL);
408
std::string s;
409
s.resize(size);
410
if (size > 0) {
411
WideCharToMultiByte(CP_UTF8, 0, wstr, len, &s[0], size, NULL, NULL);
412
}
413
return s;
414
}
415
416
std::string ConvertWStringToUTF8(const std::wstring &wstr) {
417
int len = (int)wstr.size();
418
int size = (int)WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, 0, 0, NULL, NULL);
419
std::string s;
420
s.resize(size);
421
if (size > 0) {
422
WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, &s[0], size, NULL, NULL);
423
}
424
return s;
425
}
426
427
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, std::string_view source) {
428
int len = (int)source.size();
429
destSize -= 1; // account for the \0.
430
int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.data(), len, NULL, 0);
431
MultiByteToWideChar(CP_UTF8, 0, source.data(), len, dest, std::min((int)destSize, size));
432
dest[size] = 0;
433
}
434
435
std::wstring ConvertUTF8ToWString(const std::string_view source) {
436
int len = (int)source.size();
437
int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.data(), len, NULL, 0);
438
std::wstring str;
439
str.resize(size);
440
if (size > 0) {
441
MultiByteToWideChar(CP_UTF8, 0, source.data(), (int)source.size(), &str[0], size);
442
}
443
return str;
444
}
445
446
#endif
447
448
std::string ConvertUCS2ToUTF8(const std::u16string &wstr) {
449
std::string s;
450
// Worst case.
451
s.resize(wstr.size() * 4);
452
453
size_t pos = 0;
454
for (wchar_t c : wstr) {
455
pos += UTF8::encode(&s[pos], c);
456
}
457
458
s.resize(pos);
459
return s;
460
}
461
462
std::string SanitizeUTF8(std::string_view utf8string) {
463
UTF8 utf(utf8string);
464
std::string s;
465
// Worst case.
466
s.resize(utf8string.size() * 4);
467
468
// This stops at invalid start bytes.
469
size_t pos = 0;
470
while (!utf.end() && !utf.invalid()) {
471
int c = utf.next_unsafe();
472
pos += UTF8::encode(&s[pos], c);
473
}
474
s.resize(pos);
475
return s;
476
}
477
478
static size_t ConvertUTF8ToUCS2Internal(char16_t *dest, size_t destSize, std::string_view source) {
479
const char16_t *const orig = dest;
480
const char16_t *const destEnd = dest + destSize;
481
482
UTF8 utf(source);
483
484
char16_t *destw = (char16_t *)dest;
485
const char16_t *const destwEnd = destw + destSize;
486
487
// Ignores characters outside the BMP.
488
while (uint32_t c = utf.next()) {
489
if (destw + UTF16LE::encodeUnitsUCS2(c) >= destwEnd) {
490
break;
491
}
492
destw += UTF16LE::encodeUCS2(destw, c);
493
}
494
495
// No ++ to not count the null-terminator in length.
496
if (destw < destEnd) {
497
*destw = 0;
498
}
499
500
return destw - orig;
501
}
502
503
void ConvertUTF8ToUCS2(char16_t *dest, size_t destSize, const std::string &source) {
504
ConvertUTF8ToUCS2Internal(dest, destSize, source);
505
}
506
507
std::u16string ConvertUTF8ToUCS2(std::string_view source) {
508
std::u16string dst;
509
// utf-8 won't be less bytes than there are characters.
510
dst.resize(source.size(), 0);
511
size_t realLen = ConvertUTF8ToUCS2Internal(&dst[0], source.size(), source);
512
dst.resize(realLen);
513
return dst;
514
}
515
516
std::string CodepointToUTF8(uint32_t codePoint) {
517
char temp[16]{};
518
UTF8::encode(temp, codePoint);
519
return std::string(temp);
520
}
521
522
#ifndef _WIN32
523
524
// Replacements for the Win32 wstring functions. Not to be used from emulation code!
525
526
std::string ConvertWStringToUTF8(const std::wstring &wstr) {
527
std::string s;
528
// Worst case.
529
s.resize(wstr.size() * 4);
530
531
size_t pos = 0;
532
for (wchar_t c : wstr) {
533
pos += UTF8::encode(&s[pos], c);
534
}
535
536
s.resize(pos);
537
return s;
538
}
539
540
static size_t ConvertUTF8ToWStringInternal(wchar_t *dest, size_t destSize, std::string_view source) {
541
const wchar_t *const orig = dest;
542
const wchar_t *const destEnd = dest + destSize;
543
544
UTF8 utf(source);
545
546
if (sizeof(wchar_t) == 2) {
547
char16_t *destw = (char16_t *)dest;
548
const char16_t *const destwEnd = destw + destSize;
549
while (char32_t c = utf.next()) {
550
if (destw + UTF16LE::encodeUnits(c) >= destwEnd) {
551
break;
552
}
553
destw += UTF16LE::encode(destw, c);
554
}
555
dest = (wchar_t *)destw;
556
} else {
557
while (char32_t c = utf.next()) {
558
if (dest + 1 >= destEnd) {
559
break;
560
}
561
*dest++ = c;
562
}
563
}
564
565
// No ++ to not count the terminal in length.
566
if (dest < destEnd) {
567
*dest = 0;
568
}
569
570
return dest - orig;
571
}
572
573
std::wstring ConvertUTF8ToWString(std::string_view source) {
574
std::wstring dst;
575
// conservative size estimate for wide characters from utf-8 bytes. Will always reserve too much space.
576
dst.resize(source.size());
577
size_t realLen = ConvertUTF8ToWStringInternal(&dst[0], source.size(), source);
578
dst.resize(realLen); // no need to write a NUL, it's done for us by resize.
579
return dst;
580
}
581
582
#endif
583
584