CoCalc -- utf8.c

GitHub Repository: freebsd/phabricator
Path: blob/master/externals/figlet/utf8.c
¹³⁴⁵³ views
1
#ifdef TLF_FONTS
2
/*
3
 * Copyright (c) 2007 Alexey Vatchenko <[email protected]>
4
 *
5
 * Permission to use, copy, modify, and/or distribute this software for any
6
 * purpose with or without fee is hereby granted, provided that the above
7
 * copyright notice and this permission notice appear in all copies.
8
 *
9
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
 */
17
#include <sys/types.h>
18

19
#include <wchar.h>
20
#include <arpa/inet.h>	/* for htonl() */
21

22
#include "utf8.h"
23

24
/*
 * Bit patterns of UTF-8 octets.
 * _NXT is the marker of a continuation octet (10xxxxxx); _SEQn is the
 * marker carried by the first octet of an n-octet sequence.
 */
#define _NXT	0x80
#define _SEQ2	0xc0
#define _SEQ3	0xe0
#define _SEQ4	0xf0
#define _SEQ5	0xf8
#define _SEQ6	0xfc

/* Byte order mark; dropped by the converters when UTF8_SKIP_BOM is set. */
#define _BOM	0xfeff

/* Both helpers return -1 for a rejected value and 0 otherwise. */
static int __wchar_forbitten(wchar_t sym);
static int __utf8_forbitten(unsigned char octet);
35

36
/*
 * Tell whether a wide character must not be converted.
 *
 * Returns -1 when sym lies in the surrogate range 0xd800..0xdfff
 * (both bounds included) and 0 for every other value.
 */
static int
__wchar_forbitten(wchar_t sym)
{

	/* Surrogate pairs */
	if (sym >= 0xd800 && sym <= 0xdfff)
		return (-1);

	return (0);
}
46

47
/*
 * Tell whether an octet is one of the values this module refuses to see
 * in UTF-8 input.
 *
 * Returns -1 for 0xc0, 0xc1, 0xf5 and 0xff, and 0 for any other octet.
 */
static int
__utf8_forbitten(unsigned char octet)
{

	switch (octet) {
	case 0xc0:
	case 0xc1:
	case 0xf5:
	case 0xff:
		return (-1);
	default:
		return (0);
	}
}
61

62
/*
63
 * DESCRIPTION
64
 *	This function translates UTF-8 string into UCS-4 string (all symbols
65
 *	will be in local machine byte order).
66
 *
67
 *	It takes the following arguments:
68
 *	in	- input UTF-8 string. It can be null-terminated.
69
 *	insize	- size of input string in bytes.
70
 *	out	- result buffer for UCS-4 string. If out is NULL,
71
 *		function returns size of result buffer.
72
 *	outsize - size of out buffer in wide characters.
73
 *
74
 * RETURN VALUES
75
 *	The function returns size of result buffer (in wide characters).
76
 *	Zero is returned in case of error.
77
 *
78
 * CAVEATS
79
 *	1. If UTF-8 string contains zero symbols, they will be translated
80
 *	   as regular symbols.
81
 *	2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
82
 *	   when `out' is NULL and not NULL. It's because of special UTF-8
83
 *	   sequences which may result in forbitten (by RFC3629) UNICODE
84
 *	   characters.  So, the caller must check return value every time and
85
 *	   not prepare buffer in advance (\0 terminate) but after calling this
86
 *	   function.
87
 */
88
size_t
89
utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize,
90
    int flags)
91
{
92
	u_char *p, *lim;
93
	wchar_t *wlim, high;
94
	size_t n, total, i, n_bits;
95

96
	if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
97
		return (0);
98

99
	total = 0;
100
	p = (u_char *)in;
101
	lim = p + insize;
102
	wlim = out + outsize;
103

104
	for (; p < lim; p += n) {
105
		if (__utf8_forbitten(*p) != 0 &&
106
		    (flags & UTF8_IGNORE_ERROR) == 0)
107
			return (0);
108

109
		/*
110
		 * Get number of bytes for one wide character.
111
		 */
112
		n = 1;	/* default: 1 byte. Used when skipping bytes. */
113
		if ((*p & 0x80) == 0)
114
			high = (wchar_t)*p;
115
		else if ((*p & 0xe0) == _SEQ2) {
116
			n = 2;
117
			high = (wchar_t)(*p & 0x1f);
118
		} else if ((*p & 0xf0) == _SEQ3) {
119
			n = 3;
120
			high = (wchar_t)(*p & 0x0f);
121
		} else if ((*p & 0xf8) == _SEQ4) {
122
			n = 4;
123
			high = (wchar_t)(*p & 0x07);
124
		} else if ((*p & 0xfc) == _SEQ5) {
125
			n = 5;
126
			high = (wchar_t)(*p & 0x03);
127
		} else if ((*p & 0xfe) == _SEQ6) {
128
			n = 6;
129
			high = (wchar_t)(*p & 0x01);
130
		} else {
131
			if ((flags & UTF8_IGNORE_ERROR) == 0)
132
				return (0);
133
			continue;
134
		}
135

136
		/* does the sequence header tell us truth about length? */
137
		if (lim - p <= n - 1) {
138
			if ((flags & UTF8_IGNORE_ERROR) == 0)
139
				return (0);
140
			n = 1;
141
			continue;	/* skip */
142
		}
143

144
		/*
145
		 * Validate sequence.
146
		 * All symbols must have higher bits set to 10xxxxxx
147
		 */
148
		if (n > 1) {
149
			for (i = 1; i < n; i++) {
150
				if ((p[i] & 0xc0) != _NXT)
151
					break;
152
			}
153
			if (i != n) {
154
				if ((flags & UTF8_IGNORE_ERROR) == 0)
155
					return (0);
156
				n = 1;
157
				continue;	/* skip */
158
			}
159
		}
160

161
		total++;
162

163
		if (out == NULL)
164
			continue;
165

166
		if (out >= wlim)
167
			return (0);		/* no space left */
168

169
		*out = 0;
170
		n_bits = 0;
171
		for (i = 1; i < n; i++) {
172
			*out |= (wchar_t)(p[n - i] & 0x3f) << n_bits;
173
			n_bits += 6;		/* 6 low bits in every byte */
174
		}
175
		*out |= high << n_bits;
176

177
		if (*out == 0)			/* return at end of string */
178
			break;
179

180
		if (__wchar_forbitten(*out) != 0) {
181
			if ((flags & UTF8_IGNORE_ERROR) == 0)
182
				return (0);	/* forbitten character */
183
			else {
184
				total--;
185
				out--;
186
			}
187
		} else if (*out == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
188
			total--;
189
			out--;
190
		}
191

192
		out++;
193
	}
194

195
	return (total);
196
}
197

198
/*
199
 * DESCRIPTION
200
 *	This function translates UCS-4 symbols (given in local machine
201
 *	byte order) into UTF-8 string.
202
 *
203
 *	It takes the following arguments:
204
 *	in	- input unicode string. It can be null-terminated.
205
 *	insize	- size of input string in wide characters.
206
 *	out	- result buffer for utf8 string. If out is NULL,
207
 *		function returns size of result buffer.
208
 *	outsize - size of result buffer.
209
 *
210
 * RETURN VALUES
211
 *	The function returns size of result buffer (in bytes). Zero is returned
212
 *	in case of error.
213
 *
214
 * CAVEATS
215
 *	If UCS-4 string contains zero symbols, they will be translated
216
 *	as regular symbols.
217
 */
218
size_t
219
wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize,
220
    int flags)
221
{
222
	wchar_t *w, *wlim, ch;
223
	u_char *p, *lim, *oc;
224
	size_t total, n;
225

226
	if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
227
		return (0);
228

229
	w = (wchar_t *)in;
230
	wlim = w + insize;
231
	p = (u_char *)out;
232
	lim = p + outsize;
233
	total = 0;
234
	for (; w < wlim; w++) {
235
		if (__wchar_forbitten(*w) != 0) {
236
			if ((flags & UTF8_IGNORE_ERROR) == 0)
237
				return (0);
238
			else
239
				continue;
240
		}
241

242
		if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
243
			continue;
244

245
		if (*w < 0) {
246
			if ((flags & UTF8_IGNORE_ERROR) == 0)
247
				return (0);
248
			continue;
249
		} else if (*w <= 0x0000007f)
250
			n = 1;
251
		else if (*w <= 0x000007ff)
252
			n = 2;
253
		else if (*w <= 0x0000ffff)
254
			n = 3;
255
		else if (*w <= 0x001fffff)
256
			n = 4;
257
		else if (*w <= 0x03ffffff)
258
			n = 5;
259
		else /* if (*w <= 0x7fffffff) */
260
			n = 6;
261

262
		total += n;
263

264
		if (out == NULL)
265
			continue;
266

267
		if (lim - p <= n - 1)
268
			return (0);		/* no space left */
269

270
		/* make it work under different endians */
271
		ch = htonl(*w);
272
		oc = (u_char *)&ch;
273
		switch (n) {
274
		case 1:
275
			*p = oc[3];
276
			break;
277

278
		case 2:
279
			p[1] = _NXT | (oc[3] & 0x3f);
280
			p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2);
281
			break;
282

283
		case 3:
284
			p[2] = _NXT | (oc[3] & 0x3f);
285
			p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
286
			p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4);
287
			break;
288

289
		case 4:
290
			p[3] = _NXT | (oc[3] & 0x3f);
291
			p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
292
			p[1] = _NXT | ((oc[2] & 0xf0) >> 4) |
293
			    ((oc[1] & 0x03) << 4);
294
			p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2);
295
			break;
296

297
		case 5:
298
			p[4] = _NXT | (oc[3] & 0x3f);
299
			p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
300
			p[2] = _NXT | ((oc[2] & 0xf0) >> 4) |
301
			    ((oc[1] & 0x03) << 4);
302
			p[1] = _NXT | (oc[1] >> 2);
303
			p[0] = _SEQ5 | (oc[0] & 0x03);
304
			break;
305

306
		case 6:
307
			p[5] = _NXT | (oc[3] & 0x3f);
308
			p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
309
			p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4);
310
			p[2] = _NXT | (oc[1] >> 2);
311
			p[1] = _NXT | (oc[0] & 0x3f);
312
			p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6);
313
			break;
314
		}
315

316
		/*
317
		 * NOTE: do not check here for forbitten UTF-8 characters.
318
		 * They cannot appear here because we do proper convertion.
319
		 */
320

321
		p += n;
322
	}
323

324
	return (total);
325
}
326
#endif /* TLF_FONTS */
327

328
Product

Resources

Company