Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/libkern/iconv_ucs.c
34820 views
1
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2003, 2005 Ryuichiro Imura
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
29
#include <sys/param.h>
30
#include <sys/kernel.h>
31
#include <sys/systm.h>
32
#include <sys/malloc.h>
33
#include <sys/iconv.h>
34
35
#include "iconv_converter_if.h"
36
37
/*
 * "UCS" converter
 */
40
41
/*
 * Bits stored in iconv_ucs.convtype.  Each one occupies a distinct bit so
 * they can be OR-ed together and tested with '&'.
 */
#define KICONV_UCS_COMBINE	(1 << 0)	/* a second cspair was supplied */
#define KICONV_UCS_FROM_UTF8	(1 << 1)	/* source is UTF-8 */
#define KICONV_UCS_TO_UTF8	(1 << 2)	/* destination is UTF-8 */
#define KICONV_UCS_FROM_LE	(1 << 3)	/* source is little-endian */
#define KICONV_UCS_TO_LE	(1 << 4)	/* destination is little-endian */
#define KICONV_UCS_FROM_UTF16	(1 << 5)	/* source is UTF-16 */
#define KICONV_UCS_TO_UTF16	(1 << 6)	/* destination is UTF-16 */
#define KICONV_UCS_UCS4		(1 << 7)	/* surrogate pairs are handled */

/* Encoding names compared against by this converter. */
#define ENCODING_UTF16		"UTF-16BE"
#define ENCODING_UTF8		"UTF-8"
52
53
static struct {
54
const char *name;
55
int from_flag, to_flag;
56
} unicode_family[] = {
57
{ "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },
58
{ "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },
59
{ "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },
60
{ "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
61
KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
62
{ NULL, 0, 0 }
63
};
64
65
/* Forward declarations of the file-local encoding helpers defined below. */
static uint32_t	 utf8_to_ucs4(const char *src, size_t *utf8width,
		    size_t srclen);
static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
		    size_t dstlen);
static uint32_t	 encode_surrogate(uint32_t code);
static uint32_t	 decode_surrogate(const u_char *ucs);
69
70
#ifdef MODULE_DEPEND
71
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
72
#endif
73
74
/*
75
* UCS converter instance
76
*/
77
struct iconv_ucs {
78
KOBJ_FIELDS;
79
int convtype;
80
struct iconv_cspair * d_csp;
81
struct iconv_cspair * d_cspf;
82
void * f_ctp;
83
void * t_ctp;
84
void * ctype;
85
};
86
87
static int
88
iconv_ucs_open(struct iconv_converter_class *dcp,
89
struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
90
{
91
struct iconv_ucs *dp;
92
int i;
93
const char *from, *to;
94
95
dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
96
to = csp->cp_to;
97
from = cspf ? cspf->cp_from : csp->cp_from;
98
99
dp->convtype = 0;
100
101
if (cspf)
102
dp->convtype |= KICONV_UCS_COMBINE;
103
for (i = 0; unicode_family[i].name; i++) {
104
if (strcasecmp(from, unicode_family[i].name) == 0)
105
dp->convtype |= unicode_family[i].from_flag;
106
if (strcasecmp(to, unicode_family[i].name) == 0)
107
dp->convtype |= unicode_family[i].to_flag;
108
}
109
if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
110
dp->convtype |= KICONV_UCS_UCS4;
111
else
112
dp->convtype &= ~KICONV_UCS_UCS4;
113
114
dp->f_ctp = dp->t_ctp = NULL;
115
if (dp->convtype & KICONV_UCS_COMBINE) {
116
if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
117
(dp->convtype & KICONV_UCS_FROM_LE) == 0) {
118
iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
119
}
120
if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
121
(dp->convtype & KICONV_UCS_TO_LE) == 0) {
122
iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
123
}
124
}
125
126
dp->ctype = NULL;
127
if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
128
iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
129
130
dp->d_csp = csp;
131
if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
132
if (cspf) {
133
dp->d_cspf = cspf;
134
cspf->cp_refcount++;
135
} else
136
csp->cp_refcount++;
137
}
138
if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
139
csp->cp_refcount++;
140
*dpp = (void*)dp;
141
return 0;
142
}
143
144
static int
145
iconv_ucs_close(void *data)
146
{
147
struct iconv_ucs *dp = data;
148
149
if (dp->f_ctp)
150
iconv_close(dp->f_ctp);
151
if (dp->t_ctp)
152
iconv_close(dp->t_ctp);
153
if (dp->ctype)
154
iconv_close(dp->ctype);
155
if (dp->d_cspf)
156
dp->d_cspf->cp_refcount--;
157
else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
158
dp->d_csp->cp_refcount--;
159
if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
160
dp->d_csp->cp_refcount--;
161
kobj_delete((struct kobj*)data, M_ICONV);
162
return 0;
163
}
164
165
static int
166
iconv_ucs_conv(void *d2p, const char **inbuf,
167
size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
168
int convchar, int casetype)
169
{
170
struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
171
int ret = 0, i;
172
size_t in, on, ir, or, inlen, outlen, ucslen;
173
const char *src, *p;
174
char *dst;
175
u_char ucs[4], *q;
176
uint32_t code;
177
178
if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
179
return 0;
180
ir = in = *inbytesleft;
181
or = on = *outbytesleft;
182
src = *inbuf;
183
dst = *outbuf;
184
185
while (ir > 0 && or > 0) {
186
/*
187
* The first half of conversion.
188
* (convert any code into ENCODING_UNICODE)
189
*/
190
code = 0;
191
p = src;
192
if (dp->convtype & KICONV_UCS_FROM_UTF8) {
193
/* convert UTF-8 to ENCODING_UNICODE */
194
inlen = 0;
195
code = utf8_to_ucs4(p, &inlen, ir);
196
if (code == 0) {
197
ret = -1;
198
break;
199
}
200
201
if (casetype == KICONV_FROM_LOWER && dp->ctype) {
202
code = towlower(code, dp->ctype);
203
} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
204
code = towupper(code, dp->ctype);
205
}
206
207
if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
208
/* reserved for utf-16 surrogate pair */
209
/* invalid unicode */
210
ret = -1;
211
break;
212
}
213
214
if (inlen == 4) {
215
if (dp->convtype & KICONV_UCS_UCS4) {
216
ucslen = 4;
217
code = encode_surrogate(code);
218
} else {
219
/* can't handle with ucs-2 */
220
ret = -1;
221
break;
222
}
223
} else {
224
ucslen = 2;
225
}
226
227
/* save UCS-4 into ucs[] */
228
for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
229
*q++ = (code >> (i << 3)) & 0xff;
230
231
} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
232
/* convert local code to ENCODING_UNICODE */
233
ucslen = 4;
234
inlen = ir;
235
q = ucs;
236
ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
237
&ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
238
if (ret)
239
break;
240
inlen = ir - inlen;
241
ucslen = 4 - ucslen;
242
243
} else {
244
/* src code is a proper subset of ENCODING_UNICODE */
245
q = ucs;
246
if (dp->convtype & KICONV_UCS_FROM_LE) {
247
*q = *(p + 1);
248
*(q + 1) = *p;
249
p += 2;
250
} else {
251
*q = *p++;
252
*(q + 1) = *p++;
253
}
254
if ((*q & 0xfc) == 0xd8) {
255
if (dp->convtype & KICONV_UCS_UCS4 &&
256
dp->convtype & KICONV_UCS_FROM_UTF16) {
257
inlen = ucslen = 4;
258
} else {
259
/* invalid unicode */
260
ret = -1;
261
break;
262
}
263
} else {
264
inlen = ucslen = 2;
265
}
266
if (ir < inlen) {
267
ret = -1;
268
break;
269
}
270
if (ucslen == 4) {
271
q += 2;
272
if (dp->convtype & KICONV_UCS_FROM_LE) {
273
*q = *(p + 1);
274
*(q + 1) = *p;
275
} else {
276
*q = *p++;
277
*(q + 1) = *p;
278
}
279
if ((*q & 0xfc) != 0xdc) {
280
/* invalid unicode */
281
ret = -1;
282
break;
283
}
284
}
285
}
286
287
/*
288
* The second half of conversion.
289
* (convert ENCODING_UNICODE into any code)
290
*/
291
p = ucs;
292
if (dp->convtype & KICONV_UCS_TO_UTF8) {
293
q = (u_char *)dst;
294
if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
295
/* decode surrogate pair */
296
code = decode_surrogate(p);
297
} else {
298
code = (ucs[0] << 8) | ucs[1];
299
}
300
301
if (casetype == KICONV_LOWER && dp->ctype) {
302
code = towlower(code, dp->ctype);
303
} else if (casetype == KICONV_UPPER && dp->ctype) {
304
code = towupper(code, dp->ctype);
305
}
306
307
outlen = 0;
308
if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
309
ret = -1;
310
break;
311
}
312
313
src += inlen;
314
ir -= inlen;
315
dst += outlen;
316
or -= outlen;
317
318
} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
319
ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
320
&or, casetype & (KICONV_LOWER | KICONV_UPPER));
321
if (ret)
322
break;
323
324
src += inlen;
325
ir -= inlen;
326
327
} else {
328
/* dst code is a proper subset of ENCODING_UNICODE */
329
if (or < ucslen) {
330
ret = -1;
331
break;
332
}
333
src += inlen;
334
ir -= inlen;
335
or -= ucslen;
336
if (dp->convtype & KICONV_UCS_TO_LE) {
337
*dst++ = *(p + 1);
338
*dst++ = *p;
339
p += 2;
340
} else {
341
*dst++ = *p++;
342
*dst++ = *p++;
343
}
344
if (ucslen == 4) {
345
if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
346
(dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
347
ret = -1;
348
break;
349
}
350
if (dp->convtype & KICONV_UCS_TO_LE) {
351
*dst++ = *(p + 1);
352
*dst++ = *p;
353
} else {
354
*dst++ = *p++;
355
*dst++ = *p;
356
}
357
}
358
}
359
360
if (convchar == 1)
361
break;
362
}
363
364
*inbuf += in - ir;
365
*outbuf += on - or;
366
*inbytesleft -= in - ir;
367
*outbytesleft -= on - or;
368
return (ret);
369
}
370
371
static int
372
iconv_ucs_init(struct iconv_converter_class *dcp)
373
{
374
int error;
375
376
error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
377
if (error)
378
return (error);
379
error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
380
if (error)
381
return (error);
382
return (0);
383
}
384
385
/*
 * Class teardown hook.  Nothing to undo here; always returns 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;

	return (0);
}
390
391
static const char *
392
iconv_ucs_name(struct iconv_converter_class *dcp)
393
{
394
return (ENCODING_UNICODE);
395
}
396
397
static kobj_method_t iconv_ucs_methods[] = {
398
KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
399
KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
400
KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
401
KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
402
KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
403
KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
404
{0, 0}
405
};
406
407
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
408
409
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes.
 *
 * src:       start of the sequence.
 * utf8width: set to the number of bytes consumed on success.
 * srclen:    number of bytes available at src.
 *
 * Returns the decoded code point, or 0 on failure: no input, an illegal
 * lead byte, a sequence longer than srclen, a bad continuation byte, or an
 * overlong encoding (a value encoded in more bytes than it needs).  A NUL
 * byte also decodes to 0 (with *utf8width set to 1), so callers that test
 * the result against 0 cannot tell it apart from an error.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest code point that legitimately needs w bytes, by w */
	static const uint32_t min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
	const unsigned char *s = (const unsigned char *)src;
	size_t i, w;
	uint32_t ucs4;

	/* do not look at the lead byte when there is none */
	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	if ((s[0] & 0x80) == 0) {
		/*
		 * utf-8: 0xxxxxxx
		 * ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		ucs4 = s[0] & 0x7f;
	} else if ((s[0] & 0xe0) == 0xc0) {
		/*
		 * utf-8: 110xxxxx 10yyyyyy
		 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		ucs4 = s[0] & 0x1f;
	} else if ((s[0] & 0xf0) == 0xe0) {
		/*
		 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		ucs4 = s[0] & 0x0f;
	} else if ((s[0] & 0xf8) == 0xf0) {
		/*
		 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		ucs4 = s[0] & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1; i < w; i++) {
		if ((s[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 = (ucs4 << 6) | (s[i] & 0x3f);
	}

	/*
	 * Reject overlong forms.  Besides being invalid UTF-8, an overlong
	 * 4-byte form would yield a value below 0x10000 that the caller
	 * then feeds to encode_surrogate().
	 */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
478
479
/*
 * Encode one code point as UTF-8.
 *
 * ucs4:      code point to encode.
 * dst:       output buffer.
 * utf8width: set to the number of bytes written on success.
 * dstlen:    number of bytes available at dst.
 *
 * Returns dst (as u_char *) on success, or NULL when the value lies beyond
 * U+10FFFF or the encoded form does not fit in dstlen bytes.  Nothing is
 * written on failure.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "11" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "111" */
	} else if (ucs4 < 0x110000) {
		/* same upper bound iconv_ucs_conv() applies when decoding */
		w = 4;
		lead = 0xf0;	/* "1111" */
	} else {
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8: fill continuation bytes from the last one
	 * backwards, six bits at a time, then the lead byte.
	 */
	p = (u_char *)dst;
	for (i = w - 1; i >= 1; i--) {
		/* get trailing 6 bits and put it with leading bit as "1" */
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	p[0] = ucs4 | lead;

	*utf8width = w;

	return (p);
}
522
523
/*
 * Pack a code point into a UTF-16 surrogate pair held in one 32-bit value:
 * the high surrogate in the upper 16 bits, the low surrogate in the lower
 * 16 bits.  The 0x10000 offset is subtracted first; its upper ten bits go
 * under 0xd800 and its lower ten bits under 0xdc00.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	const uint32_t high = (offset << 6) & 0x3ff0000;
	const uint32_t low = offset & 0x3ff;

	return (high | low | 0xd800dc00);
}
529
530
/*
 * Rebuild a code point from a surrogate pair stored as four bytes, high
 * surrogate first, each unit most significant byte first.  Only the low
 * two bits of the first byte of each unit carry payload; the ten bits from
 * each unit are joined and the 0x10000 offset is added back.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t high, low;

	high = ((ucs[0] & 0x3) << 8) | ucs[1];
	low = ((ucs[2] & 0x3) << 8) | ucs[3];

	return (((high << 10) | low) + 0x10000);
}
536
537