Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
alexbevi
GitHub Repository: alexbevi/BizHawk
Path: blob/master/waterbox/libc/functions/_PDCLIB/_PDCLIB_utf8.c
2 views
1
/* UTF-8 codec
2
3
This file is part of the Public Domain C Library (PDCLib).
4
Permission is granted to use, modify, and / or redistribute at will.
5
*/
6
7
#ifndef REGTEST
8
#include <stdbool.h>
9
#include <stdint.h>
10
#include <uchar.h>
11
#include <assert.h>
12
#include "_PDCLIB_encoding.h"
13
14
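/* Use of the mbstate:
 *
 * _StUC[0] is the current conversion state, shared by the decoder and the
 *          encoder; 0 is the initial state for both
 * _St32[1] is the character accumulated so far (decoding) or the character
 *          whose remaining bytes are still to be written (encoding)
 */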
19
20
static bool utf8_mbsinit( const mbstate_t *p_s )
21
{ return p_s->_StUC[0] == 0; }
22
23
/* Decoder states.
 *
 * "DecNBk" reads as: inside an N-byte sequence, the next input byte is
 * byte number k of that sequence.
 */
enum {
    DecStart = 0,   /* between characters; a lead byte is expected */

    Dec2B2   = 1,   /* 2-byte sequence: final byte expected */

    Dec3B2   = 2,   /* 3-byte sequence: middle byte expected */
    Dec3B3   = 3,   /* 3-byte sequence: final byte expected */

    Dec4B2   = 4,   /* 4-byte sequence: second byte expected */
    Dec4B3   = 5,   /* 4-byte sequence: third byte expected */
    Dec4B4   = 6    /* 4-byte sequence: final byte expected */
};
35
36
/* Shorthands for the two pieces of conversion state. They expand in terms
 * of p_s, so they only work inside a function with a parameter of that name.
 */
#define state (p_s->_StUC[0])
#define accum (p_s->_St32[1])

/* Declares the function result; goes at the top of a conversion function. */
#define START_CONVERSION \
    bool result = true;

/* Single exit point; goes at the bottom of a conversion function. */
#define END_CONVERSION  \
    end_conversion:     \
        return result

/* Leave the conversion function right away, returning _r. */
#define FINISH(_r) do {             \
        result = (_r);              \
        goto end_conversion;        \
    } while(0)

/* Deliver one decoded character: store it if an output buffer was given,
 * count it against the remaining output space, and return the decoder to
 * its start state.
 */
#define OUT32(_c) do {                      \
        if(p_outbuf)                        \
            (*((*p_outbuf)++)) = (_c);      \
        (*p_outsz)--;                       \
        _PDCLIB_UNDEFINED(accum);           \
        state = DecStart;                   \
    } while(0)

/* Fail the conversion unless the current byte c has the form 10xxxxxx. */
#define CHECK_CONTINUATION \
    do { if((c & 0xC0) != 0x80) FINISH(false); } while(0)
61
62
static bool utf8toc32(
63
char32_t *restrict *restrict p_outbuf,
64
size_t *restrict p_outsz,
65
const char *restrict *restrict p_inbuf,
66
size_t *restrict p_insz,
67
mbstate_t *restrict p_s
68
)
69
{
70
START_CONVERSION
71
while(*p_outsz && *p_insz) {
72
unsigned char c = **p_inbuf;
73
char32_t c32;
74
switch(state) {
75
case DecStart:
76
// 1 byte
77
if(c <= 0x7F) {
78
OUT32(c);
79
} else if(c <= 0xDF) {
80
accum = (c & 0x1F) << 6;
81
state = Dec2B2;
82
} else if(c <= 0xEF) {
83
accum = (c & 0x0F) << 12;
84
state = Dec3B2;
85
} else if(c <= 0xF4) {
86
accum = (c & 0x07) << 18;
87
state = Dec4B2;
88
} else {
89
// 5+byte sequence illegal
90
FINISH(false);
91
}
92
break;
93
94
case Dec2B2:
95
CHECK_CONTINUATION;
96
97
c32 = accum | (c & 0x3F);
98
99
// Overlong sequence (e.g. NUL injection)
100
if(c32 <= 0x7F)
101
FINISH(false);
102
103
OUT32(c32);
104
break;
105
106
case Dec3B2:
107
CHECK_CONTINUATION;
108
accum |= (c & 0x3F) << 6;
109
state = Dec3B3;
110
break;
111
112
case Dec3B3:
113
CHECK_CONTINUATION;
114
115
c32 = accum | (c & 0x3F);
116
117
// Overlong
118
if(c32 <= 0x07FF)
119
FINISH(false);
120
121
// Surrogate
122
if(c32 >= 0xD800 && c32 <= 0xDFFF)
123
FINISH(false);
124
125
OUT32(c32);
126
break;
127
128
case Dec4B2:
129
CHECK_CONTINUATION;
130
accum |= (c & 0x3F) << 12;
131
state = Dec4B3;
132
break;
133
134
case Dec4B3:
135
CHECK_CONTINUATION;
136
accum |= (c & 0x3F) << 6;
137
state = Dec4B4;
138
break;
139
140
case Dec4B4:
141
CHECK_CONTINUATION;
142
143
c32 = accum | (c & 0x3F);
144
145
// Overlong
146
if(c32 <= 0xFFFF) FINISH(false);
147
148
// Not in Unicode
149
if(c32 > 0x10FFFF) FINISH(false);
150
151
OUT32(c32);
152
break;
153
154
default:
155
assert(!"Invalid state");
156
}
157
158
(*p_inbuf)++;
159
(*p_insz)--;
160
}
161
END_CONVERSION;
162
}
163
164
/* Encoder states: how many continuation bytes of the current character
 * are still to be written.
 */
enum {
    EncStart = 0,   /* nothing pending; take the next input character */
    Enc1R    = 1,   /* one continuation byte remaining */
    Enc2R    = 2,   /* two continuation bytes remaining */
    Enc3R    = 3    /* three continuation bytes remaining */
};
170
171
static bool c32toutf8(
172
char *restrict *restrict p_outbuf,
173
size_t *restrict p_outsz,
174
const char32_t *restrict *restrict p_inbuf,
175
size_t *restrict p_insz,
176
mbstate_t *restrict p_s
177
)
178
{
179
START_CONVERSION
180
while(*p_outsz) {
181
unsigned char outc = 0;
182
switch(state) {
183
case Enc3R:
184
outc = 0x80 | ((accum >> 12) & 0x3F);
185
state = Enc2R;
186
break;
187
188
case Enc2R:
189
outc = 0x80 | ((accum >> 6) & 0x3F);
190
state = Enc1R;
191
break;
192
193
case Enc1R:
194
outc = 0x80 | (accum & 0x3F);
195
state = EncStart;
196
_PDCLIB_UNDEFINED(accum);
197
break;
198
199
case EncStart:
200
if(*p_insz == 0)
201
FINISH(true);
202
203
accum = **p_inbuf;
204
(*p_inbuf)++;
205
(*p_insz)--;
206
207
if(accum <= 0x7F) {
208
outc = accum;
209
state = EncStart;
210
_PDCLIB_UNDEFINED(accum);
211
} else if(accum <= 0x7FF) {
212
outc = 0xC0 | (accum >> 6);
213
state = Enc1R;
214
} else if(accum <= 0xFFFF) {
215
outc = 0xE0 | (accum >> 12);
216
state = Enc2R;
217
} else if(accum <= 0x10FFFF) {
218
outc = 0xF0 | (accum >> 18);
219
state = Enc3R;
220
} else {
221
FINISH(false);
222
}
223
break;
224
}
225
226
if(p_outbuf) {
227
**p_outbuf = outc;
228
(*p_outbuf)++;
229
}
230
(*p_outsz)--;
231
}
232
END_CONVERSION;
233
}
234
235
const struct _PDCLIB_charcodec_t _PDCLIB_utf8_codec = {
236
.__mbsinit = utf8_mbsinit,
237
.__mbstoc32s = utf8toc32,
238
.__c32stombs = c32toutf8,
239
.__mb_max = 4,
240
};
241
242
#endif
243
244
#ifdef TEST
245
#include "_PDCLIB_test.h"
246
247
int main( void )
248
{
249
#ifndef REGTEST
250
// Valid conversion & back
251
252
static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
253
"\xF4\x8F\xBF\xBF";
254
255
char32_t c32out[8];
256
257
char32_t *c32ptr = &c32out[0];
258
size_t c32rem = 8;
259
const char *chrptr = (char*) &input[0];
260
size_t chrrem = strlen(input);
261
mbstate_t mbs = { 0 };
262
263
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
264
TESTCASE(c32rem == 0);
265
TESTCASE(chrrem == 0);
266
TESTCASE(c32ptr == &c32out[8]);
267
TESTCASE(chrptr == &input[strlen(input)]);
268
TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
269
c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
270
c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
271
272
char chrout[strlen(input)];
273
c32ptr = &c32out[0];
274
c32rem = 8;
275
chrptr = &chrout[0];
276
chrrem = strlen(input);
277
TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
278
TESTCASE(c32rem == 0);
279
TESTCASE(chrrem == 0);
280
TESTCASE(c32ptr == &c32out[8]);
281
TESTCASE(chrptr == &chrout[strlen(input)]);
282
TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
283
284
// Multi-part conversion
285
static const char* mpinput = "\xDF\xBF";
286
c32ptr = &c32out[0];
287
c32rem = 8;
288
chrptr = &mpinput[0];
289
chrrem = 1;
290
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
291
TESTCASE(c32ptr == &c32out[0]);
292
TESTCASE(c32rem == 8);
293
TESTCASE(chrptr == &mpinput[1]);
294
TESTCASE(chrrem == 0);
295
chrrem = 1;
296
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
297
TESTCASE(c32ptr == &c32out[1]);
298
TESTCASE(c32rem == 7);
299
TESTCASE(chrptr == &mpinput[2]);
300
TESTCASE(chrrem == 0);
301
302
// Invalid conversions
303
304
// Overlong nuls
305
const char* nul2 = "\xC0\x80";
306
c32ptr = &c32out[0];
307
c32rem = 8;
308
chrptr = &nul2[0];
309
chrrem = 2;
310
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
311
memset(&mbs, 0, sizeof mbs);
312
const char* nul3 = "\xE0\x80\x80";
313
c32ptr = &c32out[0];
314
c32rem = 8;
315
chrptr = &nul3[0];
316
chrrem = 3;
317
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
318
memset(&mbs, 0, sizeof mbs);
319
const char* nul4 = "\xF0\x80\x80\x80";
320
c32ptr = &c32out[0];
321
c32rem = 8;
322
chrptr = &nul4[0];
323
chrrem = 4;
324
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
325
326
// Starting on a continuation
327
const char* cont = "\x80";
328
c32ptr = &c32out[0];
329
c32rem = 8;
330
chrptr = &cont[0];
331
chrrem = 1;
332
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
333
#endif
334
return TEST_RESULTS;
335
}
336
337
#endif
338
339
340