CoCalc -- iconv

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/libkern/iconv_ucs.c
34820 views
1
/*-
2
 * SPDX-License-Identifier: BSD-2-Clause
3
 *
4
 * Copyright (c) 2003, 2005 Ryuichiro Imura
5
 * All rights reserved.
6
 *
7
 * Redistribution and use in source and binary forms, with or without
8
 * modification, are permitted provided that the following conditions
9
 * are met:
10
 * 1. Redistributions of source code must retain the above copyright
11
 *    notice, this list of conditions and the following disclaimer.
12
 * 2. Redistributions in binary form must reproduce the above copyright
13
 *    notice, this list of conditions and the following disclaimer in the
14
 *    documentation and/or other materials provided with the distribution.
15
 *
16
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26
 * SUCH DAMAGE.
27
 */
28

29
#include <sys/param.h>
30
#include <sys/kernel.h>
31
#include <sys/systm.h>
32
#include <sys/malloc.h>
33
#include <sys/iconv.h>
34

35
#include "iconv_converter_if.h"
36

37
/*
38
 * "UCS" converter
39
 */
40

41
/*
 * Bits stored in iconv_ucs.convtype.  iconv_ucs_open() sets them from the
 * source/destination encoding names; iconv_ucs_conv() dispatches on them.
 */
#define	KICONV_UCS_COMBINE	(1 << 0)	/* a second cspair was supplied */
#define	KICONV_UCS_FROM_UTF8	(1 << 1)	/* source is UTF-8 */
#define	KICONV_UCS_TO_UTF8	(1 << 2)	/* destination is UTF-8 */
#define	KICONV_UCS_FROM_LE	(1 << 3)	/* source is little-endian */
#define	KICONV_UCS_TO_LE	(1 << 4)	/* destination is little-endian */
#define	KICONV_UCS_FROM_UTF16	(1 << 5)	/* source may hold surrogates */
#define	KICONV_UCS_TO_UTF16	(1 << 6)	/* destination may hold surrogates */
#define	KICONV_UCS_UCS4		(1 << 7)	/* 4-byte (surrogate) units allowed */

/* Encoding names compared against by this converter. */
#define	ENCODING_UTF16		"UTF-16BE"
#define	ENCODING_UTF8		"UTF-8"
52

53
static struct {
54
	const char *name;
55
	int from_flag, to_flag;
56
} unicode_family[] = {
57
	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
58
	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
59
	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
60
	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
61
	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
62
	{ NULL,		0,	0 }
63
};
64

65
/* Helpers defined at the end of this file. */
static uint32_t	 utf8_to_ucs4(const char *src, size_t *utf8width,
		    size_t srclen);
static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
		    size_t dstlen);
static uint32_t	 encode_surrogate(uint32_t code);
static uint32_t	 decode_surrogate(const u_char *ucs);
69

70
/*
 * Declare the dependency of iconv_ucs on libiconv, but only when the
 * MODULE_DEPEND macro is available in this build.
 * NOTE(review): the three numeric arguments are presumably the minimum,
 * preferred and maximum libiconv versions -- confirm against module(9).
 */
#if defined(MODULE_DEPEND)
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
73

74
/*
75
 * UCS converter instance
76
 */
77
struct iconv_ucs {
78
	KOBJ_FIELDS;
79
	int			convtype;
80
	struct iconv_cspair *	d_csp;
81
	struct iconv_cspair *	d_cspf;
82
	void *			f_ctp;
83
	void *			t_ctp;
84
	void *			ctype;
85
};
86

87
static int
88
iconv_ucs_open(struct iconv_converter_class *dcp,
89
	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
90
{
91
	struct iconv_ucs *dp;
92
	int i;
93
	const char *from, *to;
94

95
	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
96
	to = csp->cp_to;
97
	from = cspf ? cspf->cp_from : csp->cp_from;
98

99
	dp->convtype = 0;
100

101
	if (cspf)
102
		dp->convtype |= KICONV_UCS_COMBINE;
103
	for (i = 0; unicode_family[i].name; i++) {
104
		if (strcasecmp(from, unicode_family[i].name) == 0)
105
			dp->convtype |= unicode_family[i].from_flag;
106
		if (strcasecmp(to, unicode_family[i].name) == 0)
107
			dp->convtype |= unicode_family[i].to_flag;
108
	}
109
	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
110
		dp->convtype |= KICONV_UCS_UCS4;
111
	else
112
		dp->convtype &= ~KICONV_UCS_UCS4;
113

114
	dp->f_ctp = dp->t_ctp = NULL;
115
	if (dp->convtype & KICONV_UCS_COMBINE) {
116
		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
117
		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
118
			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
119
		}
120
		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
121
		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
122
			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
123
		}
124
	}
125

126
	dp->ctype = NULL;
127
	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
128
		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
129

130
	dp->d_csp = csp;
131
	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
132
		if (cspf) {
133
			dp->d_cspf = cspf;
134
			cspf->cp_refcount++;
135
		} else
136
			csp->cp_refcount++;
137
	}
138
	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
139
		csp->cp_refcount++;
140
	*dpp = (void*)dp;
141
	return 0;
142
}
143

144
static int
145
iconv_ucs_close(void *data)
146
{
147
	struct iconv_ucs *dp = data;
148

149
	if (dp->f_ctp)
150
		iconv_close(dp->f_ctp);
151
	if (dp->t_ctp)
152
		iconv_close(dp->t_ctp);
153
	if (dp->ctype)
154
		iconv_close(dp->ctype);
155
	if (dp->d_cspf)
156
		dp->d_cspf->cp_refcount--;
157
	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
158
		dp->d_csp->cp_refcount--;
159
	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
160
		dp->d_csp->cp_refcount--;
161
	kobj_delete((struct kobj*)data, M_ICONV);
162
	return 0;
163
}
164

165
static int
166
iconv_ucs_conv(void *d2p, const char **inbuf,
167
	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
168
	int convchar, int casetype)
169
{
170
	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
171
	int ret = 0, i;
172
	size_t in, on, ir, or, inlen, outlen, ucslen;
173
	const char *src, *p;
174
	char *dst;
175
	u_char ucs[4], *q;
176
	uint32_t code;
177

178
	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
179
		return 0;
180
	ir = in = *inbytesleft;
181
	or = on = *outbytesleft;
182
	src = *inbuf;
183
	dst = *outbuf;
184

185
	while (ir > 0 && or > 0) {
186
		/*
187
		 * The first half of conversion.
188
		 * (convert any code into ENCODING_UNICODE)
189
		 */
190
		code = 0;
191
		p = src;
192
		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
193
			/* convert UTF-8 to ENCODING_UNICODE */
194
			inlen = 0;
195
			code = utf8_to_ucs4(p, &inlen, ir);
196
			if (code == 0) {
197
				ret = -1;
198
				break;
199
			}
200

201
			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
202
				code = towlower(code, dp->ctype);
203
			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
204
				code = towupper(code, dp->ctype);
205
			}
206

207
			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
208
				/* reserved for utf-16 surrogate pair */
209
				/* invalid unicode */
210
				ret = -1;
211
				break;
212
			}
213

214
			if (inlen == 4) {
215
				if (dp->convtype & KICONV_UCS_UCS4) {
216
					ucslen = 4;
217
					code = encode_surrogate(code);
218
				} else {
219
					/* can't handle with ucs-2 */
220
					ret = -1;
221
					break;
222
				}
223
			} else {
224
				ucslen = 2;
225
			}
226

227
			/* save UCS-4 into ucs[] */
228
			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
229
				*q++ = (code >> (i << 3)) & 0xff;
230

231
		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
232
			/* convert local code to ENCODING_UNICODE */
233
			ucslen = 4;
234
			inlen = ir;
235
			q = ucs;
236
			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
237
			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
238
			if (ret)
239
				break;
240
			inlen = ir - inlen;
241
			ucslen = 4 - ucslen;
242

243
		} else {
244
			/* src code is a proper subset of ENCODING_UNICODE */
245
			q = ucs;
246
			if (dp->convtype & KICONV_UCS_FROM_LE) {
247
				*q = *(p + 1);
248
				*(q + 1) = *p;
249
				p += 2;
250
			} else {
251
				*q = *p++;
252
				*(q + 1) = *p++;
253
			}
254
			if ((*q & 0xfc) == 0xd8) {
255
				if (dp->convtype & KICONV_UCS_UCS4 &&
256
				    dp->convtype & KICONV_UCS_FROM_UTF16) {
257
					inlen = ucslen = 4;
258
				} else {
259
					/* invalid unicode */
260
					ret = -1;
261
					break;
262
				}
263
			} else {
264
				inlen = ucslen = 2;
265
			}
266
			if (ir < inlen) {
267
				ret = -1;
268
				break;
269
			}
270
			if (ucslen == 4) {
271
				q += 2;
272
				if (dp->convtype & KICONV_UCS_FROM_LE) {
273
					*q = *(p + 1);
274
					*(q + 1) = *p;
275
				} else {
276
					*q = *p++;
277
					*(q + 1) = *p;
278
				}
279
				if ((*q & 0xfc) != 0xdc) {
280
					/* invalid unicode */
281
					ret = -1;
282
					break;
283
				}
284
			}
285
		}
286

287
		/*
288
		 * The second half of conversion.
289
		 * (convert ENCODING_UNICODE into any code)
290
		 */
291
		p = ucs;
292
		if (dp->convtype & KICONV_UCS_TO_UTF8) {
293
			q = (u_char *)dst;
294
			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
295
				/* decode surrogate pair */
296
				code = decode_surrogate(p);
297
			} else {
298
				code = (ucs[0] << 8) | ucs[1];
299
			}
300

301
			if (casetype == KICONV_LOWER && dp->ctype) {
302
				code = towlower(code, dp->ctype);
303
			} else if (casetype == KICONV_UPPER && dp->ctype) {
304
				code = towupper(code, dp->ctype);
305
			}
306

307
			outlen = 0;
308
			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
309
				ret = -1;
310
				break;
311
			}
312

313
			src += inlen;
314
			ir -= inlen;
315
			dst += outlen;
316
			or -= outlen;
317

318
		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
319
			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
320
			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
321
			if (ret)
322
				break;
323

324
			src += inlen;
325
			ir -= inlen;
326

327
		} else {
328
			/* dst code is a proper subset of ENCODING_UNICODE */
329
			if (or < ucslen) {
330
				ret = -1;
331
				break;
332
			}
333
			src += inlen;
334
			ir -= inlen;
335
			or -= ucslen;
336
			if (dp->convtype & KICONV_UCS_TO_LE) {
337
				*dst++ = *(p + 1);
338
				*dst++ = *p;
339
				p += 2;
340
			} else {
341
				*dst++ = *p++;
342
				*dst++ = *p++;
343
			}
344
			if (ucslen == 4) {
345
				if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
346
				    (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
347
					ret = -1;
348
					break;
349
				}
350
				if (dp->convtype & KICONV_UCS_TO_LE) {
351
					*dst++ = *(p + 1);
352
					*dst++ = *p;
353
				} else {
354
					*dst++ = *p++;
355
					*dst++ = *p;
356
				}
357
			}
358
		}
359

360
		if (convchar == 1)
361
			break;
362
	}
363

364
	*inbuf += in - ir;
365
	*outbuf += on - or;
366
	*inbytesleft -= in - ir;
367
	*outbytesleft -= on - or;
368
	return (ret);
369
}
370

371
static int
372
iconv_ucs_init(struct iconv_converter_class *dcp)
373
{
374
	int error;
375

376
	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
377
	if (error)
378
		return (error);
379
	error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
380
	if (error)
381
		return (error);
382
	return (0);
383
}
384

385
/*
 * Class teardown hook.  There is nothing to release here, so it simply
 * reports success.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;	/* unused */

	return (0);
}
390

391
static const char *
392
iconv_ucs_name(struct iconv_converter_class *dcp)
393
{
394
	return (ENCODING_UNICODE);
395
}
396

397
static kobj_method_t iconv_ucs_methods[] = {
398
	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
399
	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
400
	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
401
	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
402
	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
403
	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
404
	{0, 0}
405
};
406

407
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
408

409
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes.
 *
 * src		start of the sequence
 * utf8width	out: number of bytes consumed; written only on success
 * srclen	number of readable bytes at src
 *
 * Returns the decoded value, or 0 on failure.  Failure covers: empty input,
 * an invalid lead byte, a sequence longer than srclen, a continuation byte
 * that is not of the form 10xxxxxx, and an overlong encoding (a value that
 * would fit in a shorter sequence).  Because 0 is the failure value, an
 * encoded U+0000 is also reported as a failure.  Surrogate values and values
 * above 0x10ffff are not filtered here; the caller checks for them.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	size_t i, w;
	uint32_t ucs4, min;
	unsigned char c;

	/* never look at *src when there is nothing to read */
	if (srclen == 0)
		return (0);

	/*
	 * Classify the lead byte: sequence width, payload bits, and the
	 * smallest value that legitimately needs that width.
	 */
	c = (unsigned char)*src;
	if ((c & 0x80) == 0) {
		/* 0xxxxxxx */
		w = 1;
		ucs4 = c & 0x7f;
		min = 0;
	} else if ((c & 0xe0) == 0xc0) {
		/* 110xxxxx 10yyyyyy */
		w = 2;
		ucs4 = c & 0x1f;
		min = 0x80;
	} else if ((c & 0xf0) == 0xe0) {
		/* 1110xxxx 10yyyyyy 10zzzzzz */
		w = 3;
		ucs4 = c & 0x0f;
		min = 0x800;
	} else if ((c & 0xf8) == 0xf0) {
		/* 11110www 10xxxxxx 10yyyyyy 10zzzzzz */
		w = 4;
		ucs4 = c & 0x07;
		min = 0x10000;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/* fold in six payload bits from every continuation byte */
	for (i = 1 ; i < w ; i++) {
		c = (unsigned char)src[i];
		if ((c & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		ucs4 = (ucs4 << 6) | (c & 0x3f);
	}

	/* overlong form, e.g. 0xc0 0xaf standing in for '/' */
	if (ucs4 < min)
		return (0);

	*utf8width = w;
	return (ucs4);
}
478

479
/*
 * Encode one code point as UTF-8.
 *
 * ucs4		value to encode
 * dst		output buffer
 * utf8width	out: number of bytes written; written only on success
 * dstlen	space available at dst
 *
 * Returns dst (as u_char *) on success.  Returns NULL, writing nothing, when
 * ucs4 is 0x110000 or larger -- the same bound the UTF-8 decoding path of
 * this file treats as invalid -- or when dstlen is smaller than the encoded
 * width.  Values in the surrogate range 0xd800-0xdfff are not filtered
 * here and are emitted as three bytes.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and the marker bits of the lead byte
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0x00;	/* 0xxxxxxx */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* 110xxxxx */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* 1110xxxx */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* 11110xxx */
	} else {
		/* beyond the last code point */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8: continuation bytes from last to first, six bits
	 * each, then whatever bits remain go into the lead byte
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	p[0] = ucs4 | lead;

	*utf8width = w;

	return (p);
}
522

523
/*
 * Pack a supplementary-plane code point into a UTF-16 surrogate pair held
 * in one 32-bit value: high surrogate in the upper 16 bits, low surrogate
 * in the lower 16 bits.  No range check is made on the argument.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	const uint32_t high = 0xd800 | ((offset >> 10) & 0x3ff);
	const uint32_t low = 0xdc00 | (offset & 0x3ff);

	return ((high << 16) | low);
}
529

530
/*
 * Rebuild a code point from a surrogate pair stored as four bytes, high
 * surrogate first and each unit most significant byte first.  Only the low
 * ten bits of each unit are used; the surrogate marker bits are not checked.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	const uint32_t high10 = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	const uint32_t low10 = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((high10 << 10) | low10) + 0x10000);
}
536

537
Product

Resources

Company