CoCalc -- Utf8.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Common/Data/Encoding/Utf8.cpp
Views: ¹⁴⁰¹
1
/*
2
  Basic UTF-8 manipulation routines
3
  by Jeff Bezanson
4
  placed in the public domain Fall 2005
5

6
  This code is designed to provide the utilities you need to manipulate
7
  UTF-8 as an internal string encoding. These functions do not perform the
8
  error checking normally needed when handling UTF-8 data, so if you happen
9
  to be from the Unicode Consortium you will want to flay me alive.
10
  I do this because error checking can be performed at the boundaries (I/O),
11
  with these routines reserved for higher performance on data known to be
12
  valid.
13
*/
14

15
#ifdef _WIN32
16
#include <windows.h>
17
#undef min
18
#undef max
19
#endif
20

21
#include <cstdlib>
22
#include <cstdio>
23
#include <cstring>
24
#include <cstdarg>
25
#include <cstdint>
26

27
#include <algorithm>
28
#include <string>
29

30
#include "Common/Data/Encoding/Utf8.h"
31
#include "Common/Data/Encoding/Utf16.h"
32
#include "Common/Log.h"
33

34
// Returns true when c can begin a UTF-8 sequence, i.e. when it is not a
// continuation byte of the form 10xxxxxx. Note that '\0' counts as a start.
inline bool isutf(char c) {
	const unsigned char topBits = static_cast<unsigned char>(c) & 0xC0;
	return topBits != 0x80;
}
38

39
// Correction values indexed by the number of trailing bytes in a sequence.
// The decoders in this file add up the raw bytes of a sequence (shifting by 6
// between bytes) and then subtract the matching entry, which removes the
// accumulated lead/continuation marker bits in one step.
// Example: 0x00003080 == (0xC0 << 6) + 0x80.
static const uint32_t offsetsFromUTF8[6] = {
	0x00000000u,  // 0 trailing bytes
	0x00003080u,  // 1 trailing byte
	0x000E2080u,  // 2 trailing bytes
	0x03C82080u,  // 3 trailing bytes
	0xFA082080u,  // 4 trailing bytes
	0x82082080u,  // 5 trailing bytes
};
43

44
// Number of bytes that follow a given lead byte, indexed by the byte value.
// Sixteen entries per row; the comment on each row is the first index in it.
static const uint8_t trailingBytesForUTF8[256] = {
	// 0x00-0xBF: ASCII and continuation bytes, nothing follows.
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x10
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x20
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x30
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x40
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x50
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x60
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x70
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x90
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xA0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xB0
	// 0xC0-0xDF: one trailing byte.
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xC0
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xD0
	// 0xE0-0xEF: two trailing bytes.
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xE0
	// 0xF0-0xF7: three, 0xF8-0xFB: four, 0xFC-0xFF: five.
	3,3,3,3,3,3,3,3, 4,4,4,4, 5,5,5,5,  // 0xF0
};
54

55
/* returns length of next utf-8 sequence */
56
int u8_seqlen(const char *s)
57
{
58
  return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
59
}
60

61
/* conversions without error checking
62
   only works for valid UTF-8, i.e. no 5- or 6-byte sequences
63
   srcsz = source size in bytes, or -1 if 0-terminated
64
   sz = dest size in # of wide characters
65

66
   returns # characters converted
67
   dest will always be L'\0'-terminated, even if there isn't enough room
68
   for all the characters.
69
   if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
70
*/
71
int u8_toucs(uint32_t *dest, int sz, const char *src, int srcsz)
72
{
73
  uint32_t ch;
74
  const char *src_end = src + srcsz;
75
  int nb;
76
  int i=0;
77

78
  while (i < sz-1) {
79
    nb = trailingBytesForUTF8[(unsigned char)*src];
80
    if (srcsz == -1) {
81
      if (*src == 0)
82
        goto done_toucs;
83
    }
84
    else {
85
      if (src + nb >= src_end)
86
        goto done_toucs;
87
    }
88
    ch = 0;
89
    switch (nb) {
90
      /* these fall through deliberately */
91
    case 3: ch += (unsigned char)*src++; ch <<= 6;
92
    case 2: ch += (unsigned char)*src++; ch <<= 6;
93
    case 1: ch += (unsigned char)*src++; ch <<= 6;
94
    case 0: ch += (unsigned char)*src++;
95
    }
96
    ch -= offsetsFromUTF8[nb];
97
    dest[i++] = ch;
98
  }
99
 done_toucs:
100
  dest[i] = 0;
101
  return i;
102
}
103

104
/* Encodes 32-bit code points as UTF-8.

   srcsz = number of source characters, or negative (conventionally -1) if
           src is 0-terminated
   sz = size of dest buffer in bytes

   returns # characters consumed from src. conversion stops at the first
   character whose encoding does not fit completely in dest. values of
   0x110000 and above produce no output but are still counted.

   dest is only '\0'-terminated if every character fit and there is still
   a byte left for the terminator.
   each source character is 4 bytes wide and encodes to at most 4 bytes, so
   the output never needs more bytes than the source occupies.
*/
int u8_toutf8(char *dest, int sz, const uint32_t *src, int srcsz)
{
  char *out = dest;
  char *const limit = dest + sz;
  int consumed = 0;

  for (;;) {
    const bool more = (srcsz < 0) ? (src[consumed] != 0) : (consumed < srcsz);
    if (!more)
      break;

    const uint32_t cp = src[consumed];

    int needed;
    if (cp < 0x80)
      needed = 1;
    else if (cp < 0x800)
      needed = 2;
    else if (cp < 0x10000)
      needed = 3;
    else if (cp < 0x110000)
      needed = 4;
    else
      needed = 0;  /* not encodable: skipped, but still counted */

    if (needed > 0 && limit - out < needed)
      return consumed;

    switch (needed) {
    case 1:
      *out++ = (char)cp;
      break;
    case 2:
      *out++ = (char)((cp >> 6) | 0xC0);
      *out++ = (char)((cp & 0x3F) | 0x80);
      break;
    case 3:
      *out++ = (char)((cp >> 12) | 0xE0);
      *out++ = (char)(((cp >> 6) & 0x3F) | 0x80);
      *out++ = (char)((cp & 0x3F) | 0x80);
      break;
    case 4:
      *out++ = (char)((cp >> 18) | 0xF0);
      *out++ = (char)(((cp >> 12) & 0x3F) | 0x80);
      *out++ = (char)(((cp >> 6) & 0x3F) | 0x80);
      *out++ = (char)((cp & 0x3F) | 0x80);
      break;
    default:
      break;
    }
    consumed++;
  }

  if (out < limit)
    *out = '\0';
  return consumed;
}
156

157
/* Encodes a single code point as UTF-8 into dest, which must have room for
   up to 4 bytes. No terminator is written.
   returns the number of bytes written, or 0 (dest untouched) when ch is
   0x110000 or above. */
int u8_wc_toutf8(char *dest, uint32_t ch)
{
  int len;
  uint32_t leadMark;

  if (ch < 0x80) {
    dest[0] = (char)ch;
    return 1;
  }
  else if (ch < 0x800) {
    len = 2;
    leadMark = 0xC0;
  }
  else if (ch < 0x10000) {
    len = 3;
    leadMark = 0xE0;
  }
  else if (ch < 0x110000) {
    len = 4;
    leadMark = 0xF0;
  }
  else {
    return 0;
  }

  /* Fill continuation bytes from the end, six payload bits at a time. */
  for (int k = len - 1; k > 0; k--) {
    dest[k] = (char)((ch & 0x3F) | 0x80);
    ch >>= 6;
  }
  /* What is left of ch is the payload of the lead byte. */
  dest[0] = (char)(ch | leadMark);
  return len;
}
183

184
/* charnum => byte offset */
185
int u8_offset(const char *str, int charnum)
186
{
187
  int offs=0;
188

189
  while (charnum > 0 && str[offs]) {
190
    (void)(isutf(str[++offs]) || isutf(str[++offs]) ||
191
         isutf(str[++offs]) || ++offs);
192
    charnum--;
193
  }
194
  return offs;
195
}
196

197
/* byte offset => charnum */
198
int u8_charnum(const char *s, int offset)
199
{
200
  int charnum = 0, offs=0;
201

202
  while (offs < offset && s[offs]) {
203
    (void)(isutf(s[++offs]) || isutf(s[++offs]) ||
204
         isutf(s[++offs]) || ++offs);
205
    charnum++;
206
  }
207
  return charnum;
208
}
209

210
/* reads the next utf-8 sequence out of a string, updating an index */
211
uint32_t u8_nextchar(const char *s, int *index, size_t size) {
212
	uint32_t ch = 0;
213
	_dbg_assert_(*index >= 0 && *index < 100000000);
214
	int sz = 0;
215
	int i = *index;
216
	do {
217
		ch = (ch << 6) + (unsigned char)s[i++];
218
		sz++;
219
	} while (i < size && s[i] && ((s[i]) & 0xC0) == 0x80);
220
	*index = i;
221
	return ch - offsetsFromUTF8[sz - 1];
222
}
223

224
// Decodes one sequence starting at s[*i] and advances *i past it.
// The length comes from the lead byte alone; the following bytes are neither
// bounds-checked nor validated, only their low six bits are used.
uint32_t u8_nextchar_unsafe(const char *s, int *i) {
	int pos = *i;
	const uint32_t lead = (unsigned char)s[pos++];

	int extra;
	uint32_t ch;
	if (lead >= 0xF0) {
		extra = 3;
		ch = lead & 0x0F;
	} else if (lead >= 0xE0) {
		extra = 2;
		ch = lead & 0x1F;
	} else if (lead >= 0xC0) {
		extra = 1;
		ch = lead & 0x3F;
	} else {
		extra = 0;
		ch = lead;
	}

	// Just assume the bytes must be there.  This is the logic used on the PSP.
	while (extra-- > 0) {
		ch = (ch << 6) + (((unsigned char)s[pos++]) & 0x3F);
	}

	*i = pos;
	return ch;
}
247

248
void u8_inc(const char *s, int *i)
249
{
250
  (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||
251
       isutf(s[++(*i)]) || ++(*i));
252
}
253

254
void u8_dec(const char *s, int *i)
255
{
256
  (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) ||
257
       isutf(s[--(*i)]) || --(*i));
258
}
259

260
/* Returns 1 if c is one of '0'..'7', otherwise 0. */
int octal_digit(char c)
{
  if (c < '0')
    return 0;
  if (c > '7')
    return 0;
  return 1;
}
264

265
/* Returns 1 if c is a hexadecimal digit (0-9, A-F, a-f), otherwise 0. */
int hex_digit(char c)
{
  const bool isDecimal = (c >= '0' && c <= '9');
  const bool isUpper = (c >= 'A' && c <= 'F');
  const bool isLower = (c >= 'a' && c <= 'f');

  return (isDecimal || isUpper || isLower) ? 1 : 0;
}
271

272
/* assumes that src points to the character after a backslash
273
   returns number of input characters processed */
274
int u8_read_escape_sequence(const char *str, uint32_t *dest)
275
{
276
  long ch;
277
  char digs[9]="\0\0\0\0\0\0\0\0";
278
  int dno=0, i=1;
279

280
  ch = (uint32_t)str[0];  /* take literal character */
281
  if (str[0] == 'n')
282
    ch = L'\n';
283
  else if (str[0] == 't')
284
    ch = L'\t';
285
  else if (str[0] == 'r')
286
    ch = L'\r';
287
  else if (str[0] == 'b')
288
    ch = L'\b';
289
  else if (str[0] == 'f')
290
    ch = L'\f';
291
  else if (str[0] == 'v')
292
    ch = L'\v';
293
  else if (str[0] == 'a')
294
    ch = L'\a';
295
  else if (octal_digit(str[0])) {
296
    i = 0;
297
    do {
298
      digs[dno++] = str[i++];
299
    } while (octal_digit(str[i]) && dno < 3);
300
    ch = strtol(digs, NULL, 8);
301
  }
302
  else if (str[0] == 'x') {
303
    while (hex_digit(str[i]) && dno < 2) {
304
      digs[dno++] = str[i++];
305
    }
306
    if (dno > 0)
307
      ch = strtol(digs, NULL, 16);
308
  }
309
  else if (str[0] == 'u') {
310
    while (hex_digit(str[i]) && dno < 4) {
311
      digs[dno++] = str[i++];
312
    }
313
    if (dno > 0)
314
      ch = strtol(digs, NULL, 16);
315
  }
316
  else if (str[0] == 'U') {
317
    while (hex_digit(str[i]) && dno < 8) {
318
      digs[dno++] = str[i++];
319
    }
320
    if (dno > 0)
321
      ch = strtol(digs, NULL, 16);
322
  }
323
  *dest = (uint32_t)ch;
324

325
  return i;
326
}
327

328
/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
329
   example: u8_unescape(mybuf, 256, "hello\\u220e")
330
   note the double backslash is needed if called on a C string literal */
331
int u8_unescape(char *buf, int sz, char *src)
332
{
333
  int c=0, amt;
334
  uint32_t ch;
335
  char temp[4];
336

337
  while (*src && c < sz) {
338
    if (*src == '\\') {
339
      src++;
340
      amt = u8_read_escape_sequence(src, &ch);
341
    }
342
    else {
343
      ch = (uint32_t)*src;
344
      amt = 1;
345
    }
346
    src += amt;
347
    amt = u8_wc_toutf8(temp, ch);
348
    if (amt > sz-c)
349
      break;
350
    memcpy(&buf[c], temp, amt);
351
    c += amt;
352
  }
353
  if (c < sz)
354
    buf[c] = '\0';
355
  return c;
356
}
357

358
/* Returns 1 if the encoding part of a locale name is exactly "UTF-8" or
   "utf8", otherwise 0. The encoding is the text after the first '.', as long
   as that '.' comes before any '@', '+' or ','; it ends at the next '@',
   '+', ',' or at the end of the string. */
int u8_is_locale_utf8(const char *locale)
{
  /* this code based on libutf8 */
  const char *stop = locale + strcspn(locale, ".@+,");
  if (*stop != '.')
    return 0;

  const char *encoding = stop + 1;
  const size_t len = strcspn(encoding, "@+,");

  if (len == 5 && strncmp(encoding, "UTF-8", 5) == 0)
    return 1; /* it's UTF-8 */
  if (len == 4 && strncmp(encoding, "utf8", 4) == 0)
    return 1; /* it's UTF-8 */
  return 0;
}
376

377
bool AnyEmojiInString(std::string_view str, size_t byteCount) {
378
	int i = 0;
379
	while (i < byteCount) {
380
		uint32_t c = u8_nextchar(str.data(), &i, str.size());
381
		if (CodepointIsProbablyEmoji(c)) {
382
			return true;
383
		}
384
	}
385
	return false;
386
}
387

388
int UTF8StringNonASCIICount(std::string_view utf8string) {
389
	UTF8 utf(utf8string);
390
	int count = 0;
391
	while (!utf.end()) {
392
		int c = utf.next();
393
		if (c > 127)
394
			++count;
395
	}
396
	return count;
397
}
398

399
bool UTF8StringHasNonASCII(std::string_view utf8string) {
400
	return UTF8StringNonASCIICount(utf8string) > 0;
401
}
402

403
#ifdef _WIN32
404

405
std::string ConvertWStringToUTF8(const wchar_t *wstr) {
406
	int len = (int)wcslen(wstr);
407
	int size = (int)WideCharToMultiByte(CP_UTF8, 0, wstr, len, 0, 0, NULL, NULL);
408
	std::string s;
409
	s.resize(size);
410
	if (size > 0) {
411
		WideCharToMultiByte(CP_UTF8, 0, wstr, len, &s[0], size, NULL, NULL);
412
	}
413
	return s;
414
}
415

416
std::string ConvertWStringToUTF8(const std::wstring &wstr) {
417
	int len = (int)wstr.size();
418
	int size = (int)WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, 0, 0, NULL, NULL);
419
	std::string s;
420
	s.resize(size);
421
	if (size > 0) {
422
		WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, &s[0], size, NULL, NULL);
423
	}
424
	return s;
425
}
426

427
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, std::string_view source) {
428
	int len = (int)source.size();
429
	destSize -= 1;  // account for the \0.
430
	int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.data(), len, NULL, 0);
431
	MultiByteToWideChar(CP_UTF8, 0, source.data(), len, dest, std::min((int)destSize, size));
432
	dest[size] = 0;
433
}
434

435
std::wstring ConvertUTF8ToWString(const std::string_view source) {
436
	int len = (int)source.size();
437
	int size = (int)MultiByteToWideChar(CP_UTF8, 0, source.data(), len, NULL, 0);
438
	std::wstring str;
439
	str.resize(size);
440
	if (size > 0) {
441
		MultiByteToWideChar(CP_UTF8, 0, source.data(), (int)source.size(), &str[0], size);
442
	}
443
	return str;
444
}
445

446
#endif
447

448
std::string ConvertUCS2ToUTF8(const std::u16string &wstr) {
449
	std::string s;
450
	// Worst case.
451
	s.resize(wstr.size() * 4);
452

453
	size_t pos = 0;
454
	for (wchar_t c : wstr) {
455
		pos += UTF8::encode(&s[pos], c);
456
	}
457

458
	s.resize(pos);
459
	return s;
460
}
461

462
std::string SanitizeUTF8(std::string_view utf8string) {
463
	UTF8 utf(utf8string);
464
	std::string s;
465
	// Worst case.
466
	s.resize(utf8string.size() * 4);
467

468
	// This stops at invalid start bytes.
469
	size_t pos = 0;
470
	while (!utf.end() && !utf.invalid()) {
471
		int c = utf.next_unsafe();
472
		pos += UTF8::encode(&s[pos], c);
473
	}
474
	s.resize(pos);
475
	return s;
476
}
477

478
static size_t ConvertUTF8ToUCS2Internal(char16_t *dest, size_t destSize, std::string_view source) {
479
	const char16_t *const orig = dest;
480
	const char16_t *const destEnd = dest + destSize;
481

482
	UTF8 utf(source);
483

484
	char16_t *destw = (char16_t *)dest;
485
	const char16_t *const destwEnd = destw + destSize;
486

487
	// Ignores characters outside the BMP.
488
	while (uint32_t c = utf.next()) {
489
		if (destw + UTF16LE::encodeUnitsUCS2(c) >= destwEnd) {
490
			break;
491
		}
492
		destw += UTF16LE::encodeUCS2(destw, c);
493
	}
494

495
	// No ++ to not count the null-terminator in length.
496
	if (destw < destEnd) {
497
		*destw = 0;
498
	}
499

500
	return destw - orig;
501
}
502

503
void ConvertUTF8ToUCS2(char16_t *dest, size_t destSize, const std::string &source) {
504
	ConvertUTF8ToUCS2Internal(dest, destSize, source);
505
}
506

507
std::u16string ConvertUTF8ToUCS2(std::string_view source) {
508
	std::u16string dst;
509
	// utf-8 won't be less bytes than there are characters.
510
	dst.resize(source.size(), 0);
511
	size_t realLen = ConvertUTF8ToUCS2Internal(&dst[0], source.size(), source);
512
	dst.resize(realLen);
513
	return dst;
514
}
515

516
std::string CodepointToUTF8(uint32_t codePoint) {
517
	char temp[16]{};
518
	UTF8::encode(temp, codePoint);
519
	return std::string(temp);
520
}
521

522
#ifndef _WIN32
523

524
// Replacements for the Win32 wstring functions. Not to be used from emulation code!
525

526
std::string ConvertWStringToUTF8(const std::wstring &wstr) {
527
	std::string s;
528
	// Worst case.
529
	s.resize(wstr.size() * 4);
530

531
	size_t pos = 0;
532
	for (wchar_t c : wstr) {
533
		pos += UTF8::encode(&s[pos], c);
534
	}
535

536
	s.resize(pos);
537
	return s;
538
}
539

540
static size_t ConvertUTF8ToWStringInternal(wchar_t *dest, size_t destSize, std::string_view source) {
541
	const wchar_t *const orig = dest;
542
	const wchar_t *const destEnd = dest + destSize;
543

544
	UTF8 utf(source);
545

546
	if (sizeof(wchar_t) == 2) {
547
		char16_t *destw = (char16_t *)dest;
548
		const char16_t *const destwEnd = destw + destSize;
549
		while (char32_t c = utf.next()) {
550
			if (destw + UTF16LE::encodeUnits(c) >= destwEnd) {
551
				break;
552
			}
553
			destw += UTF16LE::encode(destw, c);
554
		}
555
		dest = (wchar_t *)destw;
556
	} else {
557
		while (char32_t c = utf.next()) {
558
			if (dest + 1 >= destEnd) {
559
				break;
560
			}
561
			*dest++ = c;
562
		}
563
	}
564

565
	// No ++ to not count the terminal in length.
566
	if (dest < destEnd) {
567
		*dest = 0;
568
	}
569

570
	return dest - orig;
571
}
572

573
std::wstring ConvertUTF8ToWString(std::string_view source) {
574
	std::wstring dst;
575
	// conservative size estimate for wide characters from utf-8 bytes. Will always reserve too much space.
576
	dst.resize(source.size());
577
	size_t realLen = ConvertUTF8ToWStringInternal(&dst[0], source.size(), source);
578
	dst.resize(realLen);  // no need to write a NUL, it's done for us by resize.
579
	return dst;
580
}
581

582
#endif
583

584
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company