Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
zmx0142857
GitHub Repository: zmx0142857/mini-games
Path: blob/master/c/include/str.h
363 views
1
#ifndef STR_H
2
#define STR_H
3
4
// TODO: fix garbled (mojibake) output on Windows. Things to try:
//   chcp 65001
//   g++ test.cpp -o test -Wall -fexec-charset=utf-8
7
8
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
11
12
/* The six byte layouts of the original UTF-8 design:
 * 0xxxxxxx
 * 110xxxxx 10xxxxxx
 * 1110xxxx 10xxxxxx 10xxxxxx
 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * Only the first three layouts (at most 3 bytes) are handled here.
 */
21
22
/**
 * A single code point stored as a 16-bit value, convertible to and from UTF-8.
 *
 * Only the 1-, 2- and 3-byte UTF-8 layouts are supported, which is exactly
 * the range that fits in the `unsigned short` member `ord`.
 *
 * Decoding never throws. Failures are reported through the stream state
 * (see decode()) and leave `ord` set to 0.
 *
 * NOTE(review): the conversion to bool is implicit (kept for backward
 * compatibility), so comparing a Char against a plain `int` literal may be
 * ambiguous on some compilers; compare against `ord` or another Char instead.
 */
class Char {

public:
    unsigned short ord; // code point value; 0 after a failed decode

    explicit Char(unsigned short d=0): ord(d) {}

    // Decode the first UTF-8 sequence of `str`; any following bytes are ignored.
    explicit Char(const std::string &str) {
        std::istringstream is(str);
        decode(is);
    }

    // Decode the next UTF-8 sequence from `is`, consuming its bytes.
    explicit Char(std::istream &is) {
        decode(is);
    }

    // True for every code point except 0.
    operator bool() const {
        return ord != 0;
    }

    bool is_ascii() const {
        return ord < 0x80;
    }

    // return count of bytes when converted to utf-8
    size_t size() const {
        if (ord < 0x80)
            return 1;
        else if (ord < 0x800)
            return 2;
        else // ord < 0x10000: a 16-bit value never needs more than 3 bytes
            return 3;
    }

    // encode with utf-8
    std::string to_string() const {
        const size_t sz = size();
        if (sz == 1)
            return std::string(1, static_cast<char>(ord));

        // The lowest six bits always go into the final continuation byte.
        const char c0 = static_cast<char>((ord & 0x3f) | 0x80);
        if (sz == 2) {
            const char c1 = static_cast<char>(((ord >> 6) & 0x1f) | 0xc0);
            return std::string({c1, c0});
        }
        const char c1 = static_cast<char>(((ord >> 6) & 0x3f) | 0x80);
        const char c2 = static_cast<char>(((ord >> 12) & 0xf) | 0xe0);
        return std::string({c2, c1, c0});
    }

    friend std::ostream &operator<<(std::ostream &os, const Char &c) {
        return os << c.to_string();
    }

    friend std::istream &operator>>(std::istream &is, Char &c) {
        c.decode(is);
        return is;
    }

    // The comparisons are const-qualified on purpose: without that, comparing
    // const Chars silently falls back to the implicit bool conversion and
    // compares "is non-zero" instead of the code points.
    bool operator==(const Char &c) const {
        return this->ord == c.ord;
    }

    bool operator!=(const Char &c) const {
        return !(*this == c);
    }

    bool operator==(unsigned short num) const {
        return this->ord == num;
    }

    bool operator!=(unsigned short num) const {
        return !(*this == num);
    }

    // Interactive check: reads whitespace-separated words from stdin and
    // prints the code point of each word's first character, followed by that
    // code point re-encoded.
    static void test() {
        using namespace std;
        string str;
        // 20013: 中 (U+4E2D)
        // 25991: 文 (U+6587)
        // 26834: 棒 (U+68D2)
        while (cin >> str) {
            Char c(str);
            cout << c.ord << ": " << Char(c.ord) << endl;
        }
    }

private:
    // Decode exactly one UTF-8 sequence from `is` into `ord`.
    //
    // On failure `ord` is 0 and the stream is left in a failed state:
    //   - no byte available (end of input, also mid-sequence): eofbit, on top
    //     of the failbit that istream::get() sets itself;
    //   - unsupported lead byte or invalid continuation byte: badbit.
    void decode(std::istream &is) {
        ord = 0;
        char c = 0; // initialised: get() leaves it untouched on failure
        if (!is || !is.get(c)) {
            bad_encoding(is, 0);
            return;
        }

        const unsigned char lead = static_cast<unsigned char>(c);
        int trailing = 0; // number of continuation bytes still expected
        if ((lead & 0x80) == 0) {           // size == 1
            ord = lead;
        } else if ((lead & 0xe0) == 0xc0) { // size == 2
            ord = lead & 0x1f;
            trailing = 1;
        } else if ((lead & 0xf0) == 0xe0) { // size == 3
            ord = lead & 0xf;
            trailing = 2;
        } else {
            // Stray continuation byte or a 4+ byte lead: unsupported, so
            // stop here instead of consuming further bytes.
            bad_encoding(is, lead);
            return;
        }

        for (int i = 0; i < trailing; ++i) {
            if (!read_next(is)) {
                ord = 0; // do not expose a half-assembled value
                return;
            }
        }
    }

    // Read one continuation byte (10xxxxxx) and append its six payload bits
    // to `ord`. Returns false, with the stream marked as failed, when the
    // byte is missing or is not a continuation byte.
    bool read_next(std::istream &is) {
        char c = 0;
        if (!is.get(c)) {
            bad_encoding(is, 0); // sequence truncated by end of input
            return false;
        }
        if ((c & 0xc0) != 0x80) {
            // Set badbit directly: a NUL byte here is invalid data, not
            // end of input, so it must not take the `c == 0` path.
            is.setstate(std::ios::badbit);
            return false;
        }
        ord = static_cast<unsigned short>((ord << 6) | (c & 0x3f));
        return true;
    }

    // Record a decode failure on the stream: `c == 0` means "no byte was
    // available" (eofbit), anything else is the offending byte (badbit).
    void bad_encoding(std::istream &is, int c) {
        if (c == 0)
            is.setstate(std::ios::eofbit);
        else
            is.setstate(std::ios::badbit);
    }

};
161
162
class String {
163
164
std::vector<Char> data;
165
166
public:
167
String() {}
168
169
String(const std::vector<Char> &data): data(data) {}
170
171
String(const std::string &str) {
172
std::istringstream is(str);
173
decode(is);
174
}
175
176
// encode with utf-8
177
std::string to_string() const {
178
std::ostringstream os;
179
for (const auto &c : data) {
180
os << c;
181
}
182
return os.str();
183
}
184
185
friend std::ostream &operator<<(std::ostream &os, const String &s) {
186
return os << s.to_string();
187
}
188
189
size_t size() const {
190
return data.size();
191
}
192
193
Char &operator[](size_t index) {
194
return data[index];
195
}
196
197
const Char &operator[](size_t index) const {
198
return data[index];
199
}
200
201
void push_back(const Char &ch) {
202
data.push_back(ch);
203
}
204
205
static void test() {
206
using namespace std;
207
string str;
208
while (cin >> str) {
209
String s(str);
210
cout << s << endl;
211
for (const auto &c : s.data) {
212
cout << c.ord << ' ';
213
}
214
cout << endl;
215
}
216
}
217
218
private:
219
void decode(std::istream &is) {
220
if (!is) return;
221
while (true) {
222
Char c(is);
223
if (!is) return;
224
data.push_back(c);
225
}
226
}
227
228
};
229
230
// 标准库的 getline 忽略空行, 而这个不会
231
std::istream &getline(std::istream &is, String &str) {
232
std::string line;
233
char c;
234
while (is.get(c) && c != '\n') {
235
line += c;
236
}
237
str = String(line);
238
return is;
239
}
240
241
#endif // STR_H
242
243