Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/phabricator
Path: blob/master/externals/figlet/utf8.c
12240 views
1
#ifdef TLF_FONTS
2
/*
3
* Copyright (c) 2007 Alexey Vatchenko <[email protected]>
4
*
5
* Permission to use, copy, modify, and/or distribute this software for any
6
* purpose with or without fee is hereby granted, provided that the above
7
* copyright notice and this permission notice appear in all copies.
8
*
9
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
*/
17
#include <sys/types.h>
18
19
#include <wchar.h>
20
#include <arpa/inet.h> /* for htonl() */
21
22
#include "utf8.h"
23
24
/* Marker bits of a UTF-8 continuation octet (10xxxxxx). */
#define	_NXT	0x80

/* Marker bits of the lead octet of a 2-, 3-, 4-, 5- and 6-octet sequence. */
#define	_SEQ2	0xc0
#define	_SEQ3	0xe0
#define	_SEQ4	0xf0
#define	_SEQ5	0xf8
#define	_SEQ6	0xfc

/* Byte order mark; dropped from the output when UTF8_SKIP_BOM is given. */
#define	_BOM	0xfeff
32
33
static int __wchar_forbitten(wchar_t sym);
34
static int __utf8_forbitten(u_char octet);
35
36
/*
 * Tell whether a wide character must not be converted.
 *
 * Returns -1 when `sym' lies in the surrogate range 0xd800..0xdfff
 * (both ends included) and 0 for every other value.
 */
static int
__wchar_forbitten(wchar_t sym)
{
	const int is_surrogate = (sym >= 0xd800 && sym <= 0xdfff);

	return (is_surrogate ? -1 : 0);
}
46
47
/*
 * Tell whether an octet is on the list of octets this converter refuses
 * to see in UTF-8 input.
 *
 * Returns -1 for 0xc0, 0xc1, 0xf5 and 0xff, and 0 for every other octet.
 */
static int
__utf8_forbitten(u_char octet)
{

	if (octet == 0xc0 || octet == 0xc1 || octet == 0xf5 || octet == 0xff)
		return (-1);

	return (0);
}
61
62
/*
63
* DESCRIPTION
64
* This function translates UTF-8 string into UCS-4 string (all symbols
65
* will be in local machine byte order).
66
*
67
* It takes the following arguments:
68
* in - input UTF-8 string. It can be null-terminated.
69
* insize - size of input string in bytes.
70
* out - result buffer for UCS-4 string. If out is NULL,
71
* function returns size of result buffer.
72
* outsize - size of out buffer in wide characters.
73
*
74
* RETURN VALUES
75
* The function returns size of result buffer (in wide characters).
76
* Zero is returned in case of error.
77
*
78
* CAVEATS
79
* 1. If UTF-8 string contains zero symbols, they will be translated
80
* as regular symbols.
81
* 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
82
* when `out' is NULL and not NULL. It's because of special UTF-8
83
* sequences which may result in forbitten (by RFC3629) UNICODE
84
* characters. So, the caller must check return value every time and
85
* not prepare buffer in advance (\0 terminate) but after calling this
86
* function.
87
*/
88
size_t
89
utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize,
90
int flags)
91
{
92
u_char *p, *lim;
93
wchar_t *wlim, high;
94
size_t n, total, i, n_bits;
95
96
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
97
return (0);
98
99
total = 0;
100
p = (u_char *)in;
101
lim = p + insize;
102
wlim = out + outsize;
103
104
for (; p < lim; p += n) {
105
if (__utf8_forbitten(*p) != 0 &&
106
(flags & UTF8_IGNORE_ERROR) == 0)
107
return (0);
108
109
/*
110
* Get number of bytes for one wide character.
111
*/
112
n = 1; /* default: 1 byte. Used when skipping bytes. */
113
if ((*p & 0x80) == 0)
114
high = (wchar_t)*p;
115
else if ((*p & 0xe0) == _SEQ2) {
116
n = 2;
117
high = (wchar_t)(*p & 0x1f);
118
} else if ((*p & 0xf0) == _SEQ3) {
119
n = 3;
120
high = (wchar_t)(*p & 0x0f);
121
} else if ((*p & 0xf8) == _SEQ4) {
122
n = 4;
123
high = (wchar_t)(*p & 0x07);
124
} else if ((*p & 0xfc) == _SEQ5) {
125
n = 5;
126
high = (wchar_t)(*p & 0x03);
127
} else if ((*p & 0xfe) == _SEQ6) {
128
n = 6;
129
high = (wchar_t)(*p & 0x01);
130
} else {
131
if ((flags & UTF8_IGNORE_ERROR) == 0)
132
return (0);
133
continue;
134
}
135
136
/* does the sequence header tell us truth about length? */
137
if (lim - p <= n - 1) {
138
if ((flags & UTF8_IGNORE_ERROR) == 0)
139
return (0);
140
n = 1;
141
continue; /* skip */
142
}
143
144
/*
145
* Validate sequence.
146
* All symbols must have higher bits set to 10xxxxxx
147
*/
148
if (n > 1) {
149
for (i = 1; i < n; i++) {
150
if ((p[i] & 0xc0) != _NXT)
151
break;
152
}
153
if (i != n) {
154
if ((flags & UTF8_IGNORE_ERROR) == 0)
155
return (0);
156
n = 1;
157
continue; /* skip */
158
}
159
}
160
161
total++;
162
163
if (out == NULL)
164
continue;
165
166
if (out >= wlim)
167
return (0); /* no space left */
168
169
*out = 0;
170
n_bits = 0;
171
for (i = 1; i < n; i++) {
172
*out |= (wchar_t)(p[n - i] & 0x3f) << n_bits;
173
n_bits += 6; /* 6 low bits in every byte */
174
}
175
*out |= high << n_bits;
176
177
if (*out == 0) /* return at end of string */
178
break;
179
180
if (__wchar_forbitten(*out) != 0) {
181
if ((flags & UTF8_IGNORE_ERROR) == 0)
182
return (0); /* forbitten character */
183
else {
184
total--;
185
out--;
186
}
187
} else if (*out == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
188
total--;
189
out--;
190
}
191
192
out++;
193
}
194
195
return (total);
196
}
197
198
/*
199
* DESCRIPTION
200
* This function translates UCS-4 symbols (given in local machine
201
* byte order) into UTF-8 string.
202
*
203
* It takes the following arguments:
204
* in - input unicode string. It can be null-terminated.
205
* insize - size of input string in wide characters.
206
* out - result buffer for utf8 string. If out is NULL,
207
* function returns size of result buffer.
208
* outsize - size of result buffer.
209
*
210
* RETURN VALUES
211
* The function returns size of result buffer (in bytes). Zero is returned
212
* in case of error.
213
*
214
* CAVEATS
215
* If UCS-4 string contains zero symbols, they will be translated
216
* as regular symbols.
217
*/
218
size_t
219
wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize,
220
int flags)
221
{
222
wchar_t *w, *wlim, ch;
223
u_char *p, *lim, *oc;
224
size_t total, n;
225
226
if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
227
return (0);
228
229
w = (wchar_t *)in;
230
wlim = w + insize;
231
p = (u_char *)out;
232
lim = p + outsize;
233
total = 0;
234
for (; w < wlim; w++) {
235
if (__wchar_forbitten(*w) != 0) {
236
if ((flags & UTF8_IGNORE_ERROR) == 0)
237
return (0);
238
else
239
continue;
240
}
241
242
if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
243
continue;
244
245
if (*w < 0) {
246
if ((flags & UTF8_IGNORE_ERROR) == 0)
247
return (0);
248
continue;
249
} else if (*w <= 0x0000007f)
250
n = 1;
251
else if (*w <= 0x000007ff)
252
n = 2;
253
else if (*w <= 0x0000ffff)
254
n = 3;
255
else if (*w <= 0x001fffff)
256
n = 4;
257
else if (*w <= 0x03ffffff)
258
n = 5;
259
else /* if (*w <= 0x7fffffff) */
260
n = 6;
261
262
total += n;
263
264
if (out == NULL)
265
continue;
266
267
if (lim - p <= n - 1)
268
return (0); /* no space left */
269
270
/* make it work under different endians */
271
ch = htonl(*w);
272
oc = (u_char *)&ch;
273
switch (n) {
274
case 1:
275
*p = oc[3];
276
break;
277
278
case 2:
279
p[1] = _NXT | (oc[3] & 0x3f);
280
p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2);
281
break;
282
283
case 3:
284
p[2] = _NXT | (oc[3] & 0x3f);
285
p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
286
p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4);
287
break;
288
289
case 4:
290
p[3] = _NXT | (oc[3] & 0x3f);
291
p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
292
p[1] = _NXT | ((oc[2] & 0xf0) >> 4) |
293
((oc[1] & 0x03) << 4);
294
p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2);
295
break;
296
297
case 5:
298
p[4] = _NXT | (oc[3] & 0x3f);
299
p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
300
p[2] = _NXT | ((oc[2] & 0xf0) >> 4) |
301
((oc[1] & 0x03) << 4);
302
p[1] = _NXT | (oc[1] >> 2);
303
p[0] = _SEQ5 | (oc[0] & 0x03);
304
break;
305
306
case 6:
307
p[5] = _NXT | (oc[3] & 0x3f);
308
p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
309
p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4);
310
p[2] = _NXT | (oc[1] >> 2);
311
p[1] = _NXT | (oc[0] & 0x3f);
312
p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6);
313
break;
314
}
315
316
/*
317
* NOTE: do not check here for forbitten UTF-8 characters.
318
* They cannot appear here because we do proper convertion.
319
*/
320
321
p += n;
322
}
323
324
return (total);
325
}
326
#endif /* TLF_FONTS */
327
328