// Source: GitHub repository hrydgard/ppsspp, file Common/Data/Encoding/Utf8.h (master branch).
/*
  Basic UTF-8 manipulation routines
  by Jeff Bezanson
  placed in the public domain Fall 2005

  This code is designed to provide the utilities you need to manipulate
  UTF-8 as an internal string encoding. These functions do not perform the
  error checking normally needed when handling UTF-8 data, so if you happen
  to be from the Unicode Consortium you will want to flay me alive.
  I do this because error checking can be performed at the boundaries (I/O),
  with these routines reserved for higher performance on data known to be
  valid.
*/
14
15
// Further modified, and C++ stuff added, by [email protected].
16
17
#pragma once
18
19
#include <cstdint>
20
#include <cstring>
21
#include <string>
22
#include <string_view>
23
24
// Low-level UTF-8 helpers that work on a raw byte buffer. They are only declared in this
// header. Every `i` argument is a byte offset into `s`, passed by pointer.

// Decodes one code point starting at s[*i]; `size` is the buffer length supplied by the caller.
// NOTE(review): presumably advances *i past the decoded sequence - confirm against the definition.
uint32_t u8_nextchar(const char *s, int *i, size_t size);
// Same idea without a size argument. UTF8::next_unsafe() describes it as allowing invalid
// continuation bytes.
uint32_t u8_nextchar_unsafe(const char *s, int *i);
// Encodes ch as UTF-8 into dest.
// NOTE(review): the return value is presumably the number of bytes written - confirm.
int u8_wc_toutf8(char *dest, uint32_t ch);
// NOTE(review): presumably move *i forward / backward by one encoded character - confirm.
void u8_inc(const char *s, int *i);
void u8_dec(const char *s, int *i);
29
30
// Cheap heuristic for "this code point should be drawn with the color emoji font".
// Returns true for every code point above U+FFFF, i.e. outside the Basic Multilingual Plane.
constexpr bool CodepointIsProbablyEmoji(uint32_t c) {
	// Original check was some ranges grabbed from https://stackoverflow.com/a/62898106.
	// But let's just go with checking if outside the BMP, it's not a big deal if we accidentally
	// switch to color when not needed if someone uses a weird glyph.
	return c > 0xFFFF;
}
36
37
// NOTE(review): presumably scans the first byteCount bytes of str and reports whether any
// decoded code point passes CodepointIsProbablyEmoji() - confirm against the definition.
bool AnyEmojiInString(std::string_view str, size_t byteCount);
38
39
class UTF8 {
40
public:
41
static const uint32_t INVALID = (uint32_t)-1;
42
// TODO: Try to get rid of this constructor.
43
explicit UTF8(const char *c) : c_(c), size_((int)strlen(c)), index_(0) {}
44
explicit UTF8(std::string_view view) : c_(view.data()), size_((int)view.size()), index_(0) {}
45
explicit UTF8(std::string_view view, int index) : c_(view.data()), size_((int)view.size()), index_(index) {}
46
bool end() const { return index_ == size_; }
47
// Returns true if the next character is outside BMP and Planes 1 - 16.
48
bool invalid() const {
49
unsigned char c = (unsigned char)c_[index_];
50
return (c >= 0x80 && c <= 0xC1) || c >= 0xF5;
51
}
52
uint32_t next() {
53
return u8_nextchar(c_, &index_, size_);
54
}
55
// Allow invalid continuation bytes.
56
uint32_t next_unsafe() {
57
return u8_nextchar_unsafe(c_, &index_);
58
}
59
uint32_t peek() const {
60
int tempIndex = index_;
61
return u8_nextchar(c_, &tempIndex, size_);
62
}
63
void fwd() {
64
u8_inc(c_, &index_);
65
}
66
void bwd() {
67
u8_dec(c_, &index_);
68
}
69
int length() const {
70
return size_;
71
}
72
int byteIndex() const {
73
return index_;
74
}
75
static int encode(char *dest, uint32_t ch) {
76
return u8_wc_toutf8(dest, ch);
77
}
78
static int encodeUnits(uint32_t ch) {
79
if (ch < 0x80) {
80
return 1;
81
} else if (ch < 0x800) {
82
return 2;
83
} else if (ch < 0x10000) {
84
return 3;
85
} else if (ch < 0x110000) {
86
return 4;
87
}
88
return 0;
89
}
90
91
private:
92
const char *c_;
93
int index_;
94
int size_;
95
};
96
97
// NOTE(review): going by the name, counts the non-ASCII content of utf8string - confirm
// whether the unit is bytes or code points.
int UTF8StringNonASCIICount(std::string_view utf8string);

// NOTE(review): going by the name, true if utf8string contains anything outside ASCII - confirm.
bool UTF8StringHasNonASCII(std::string_view utf8string);

// Removes overlong encodings and similar.
std::string SanitizeUTF8(std::string_view utf8string);
// Returns codePoint as a UTF-8 encoded std::string.
// NOTE(review): behavior for values above U+10FFFF is not visible here - confirm.
std::string CodepointToUTF8(uint32_t codePoint);
105
106
107
// UTF8 to Win32 UTF-16
// Should be used when calling Win32 api calls
#ifdef _WIN32

std::string ConvertWStringToUTF8(const std::wstring &wstr);
std::string ConvertWStringToUTF8(const wchar_t *wstr);
// Writes the converted text into a caller-provided buffer.
// NOTE(review): destSize is presumably counted in wchar_t units, not bytes - confirm.
void ConvertUTF8ToWString(wchar_t *dest, size_t destSize, std::string_view source);
std::wstring ConvertUTF8ToWString(std::string_view source);

#else

// Used by SymbolMap/assembler
std::wstring ConvertUTF8ToWString(std::string_view source);
std::string ConvertWStringToUTF8(const std::wstring &wstr);

#endif
123
124
// Conversions between UTF-8 and 16-bit (char16_t) strings.
std::string ConvertUCS2ToUTF8(const std::u16string &wstr);

// Dest size in units, not bytes.
void ConvertUTF8ToUCS2(char16_t *dest, size_t destSize, std::string_view source);
std::u16string ConvertUTF8ToUCS2(std::string_view source);
129
130