Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/unicode/u8_textprep.c
48383 views
1
// SPDX-License-Identifier: CDDL-1.0
2
/*
3
* CDDL HEADER START
4
*
5
* The contents of this file are subject to the terms of the
6
* Common Development and Distribution License (the "License").
7
* You may not use this file except in compliance with the License.
8
*
9
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10
* or https://opensource.org/licenses/CDDL-1.0.
11
* See the License for the specific language governing permissions
12
* and limitations under the License.
13
*
14
* When distributing Covered Code, include this CDDL HEADER in each
15
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16
* If applicable, add the following below this CDDL HEADER, with the
17
* fields enclosed by brackets "[]" replaced with your own identifying
18
* information: Portions Copyright [yyyy] [name of copyright owner]
19
*
20
* CDDL HEADER END
21
*/
22
/*
23
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24
* Use is subject to license terms.
25
*/
26
27
/*
28
* Copyright 2022 MNX Cloud, Inc.
29
*/
30
31
32
33
/*
34
* UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
35
*
36
* Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
37
* u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
38
* the section 3C man pages.
39
* Interface stability: Committed.
40
*/
41
42
#include <sys/types.h>
43
#include <sys/string.h>
44
#include <sys/param.h>
45
#include <sys/sysmacros.h>
46
#include <sys/debug.h>
47
#include <sys/kmem.h>
48
#include <sys/sunddi.h>
49
#include <sys/u8_textprep.h>
50
#include <sys/byteorder.h>
51
#include <sys/errno.h>
52
#include <sys/u8_textprep_data.h>
53
#include <sys/mod.h>
54
55
/* Upper bound on the byte length of a single UTF-8 encoded character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * Upper bound on the UTF-8 byte length of any character in the range
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* Upper bound on the byte length of a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * Capacity, in characters, of a combining/conjoining sequence and the
 * largest index that is actually used within such a sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value that identifies a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
76
77
/*
 * Hangul related constants and helper macros.
 *
 * The *_FIRST and *_LAST constants are the Unicode scalar values of the
 * first and the last Hangul syllables and of the first and the last Hangul
 * Jamo leading consonants (L), vowels (V), and optional trailing
 * consonants (T).
 *
 * Note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 and not the actual U+11A8.
 * The trailing consonant is optional, so one is subtracted in advance;
 * that is also why U8_HANGUL_JAMO_T() tests with '>' instead of '>='.
 *
 * Each of the 19 modern leading consonants has 588 possible syllables
 * since Hangul has 21 modern vowels and 27 modern trailing consonants plus
 * one for the no-trailing-consonant case, i.e., 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE is only a quick pre-filter: a character whose
 * first byte has this value is likely, but not guaranteed, to be a Hangul
 * Jamo.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value 'b' as a three-byte UTF-8 sequence at s[i], s[j],
 * and s[k]. The body is wrapped in do/while (0) so that the macro expands
 * to exactly one statement and stays intact under an unbraced if/else.
 * Note that 'b' is evaluated three times.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    (((uint32_t)(b) & 0xF000U) >> 12)); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    (((uint32_t)(b) & 0x0FC0U) >> 6)); \
		(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); \
	} while (0)

/* Is the scalar value 'u' a Hangul Jamo leading consonant? */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

/* Is the scalar value 'u' a Hangul Jamo vowel? */
#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/* Is 'u' a Hangul Jamo trailing consonant? (T_FIRST itself is excluded.) */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Does 'u' fall anywhere between the first L and the last T Jamo? */
#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Is the scalar value 'u' a precomposed Hangul syllable? */
#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* Can the vowel 'u' be conjoined, given that the current state 's' is L? */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* Can the trailing consonant 'u' be conjoined, given that 's' is LV? */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
137
138
/*
 * Type markers that may appear as the first byte of an entry in the final
 * decomposition mapping table.
 */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* A table id at or above this value refers to a 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
144
145
/* The following are some convenience macros. */

/*
 * Assemble the three bytes of a three-byte UTF-8 character into a Unicode
 * scalar value and assign it to 'u'. The expansion is a single expression
 * without a trailing semicolon; the caller supplies the ';'.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	((u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
	    (((uint32_t)(b2) & 0x3F) << 6) | \
	    ((uint32_t)(b3) & 0x3F)))

/*
 * Exchange 'a' and 'b' using 't' as scratch storage. Wrapped in
 * do/while (0) so that the three assignments always travel together, even
 * under an unbraced if/else.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)

/* ASCII-only case mapping; any other value is returned unchanged. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

/* True when 'c', taken as an unsigned byte, is below 0x80. */
#define	U8_ISASCII(c)	(((uchar_t)(c)) < 0x80U)

/*
 * Swap the two combining marks with indices 'a' and 'b' in place.
 *
 * The macro assumes that the two characters are adjacent to each other and
 * that 'a' comes before 'b'; if the assumptions are not met, the macro
 * will fail.
 *
 * It relies on the following names being in scope at the point of use:
 * u8s (the byte buffer), u8t (scratch bytes), start[] and disp[] (byte
 * offset and byte length per character), comb_class[], tc (scratch), and
 * k (loop index). The body is wrapped in do/while (0) so that it behaves
 * as one statement.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
179
180
/*
 * The possible states during normalization. The enumerators are numbered
 * consecutively from zero; the resulting value is noted next to each one.
 */
typedef enum {
	U8_STATE_START = 0,		/* 0: not inside a Hangul sequence */
	U8_STATE_HANGUL_L,		/* 1: leading consonant seen */
	U8_STATE_HANGUL_LV,		/* 2: leading consonant + vowel */
	U8_STATE_HANGUL_LVT,		/* 3: L + V + trailing consonant */
	U8_STATE_HANGUL_V,		/* 4: vowel not preceded by an L */
	U8_STATE_HANGUL_T,		/* 5: trailing not preceded by LV */
	U8_STATE_COMBINING_MARK		/* 6 */
} u8_normalization_states_t;
190
191
/*
192
* The three vectors at below are used to check bytes of a given UTF-8
193
* character are valid and not containing any malformed byte values.
194
*
195
* We used to have a quite relaxed UTF-8 binary representation but then there
196
* was some security related issues and so the Unicode Consortium defined
197
* and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
198
* one more time at the Unicode 3.2. The following three tables are based on
199
* that.
200
*/
201
202
#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
203
204
#define I_ U8_ILLEGAL_CHAR
205
#define O_ U8_OUT_OF_RANGE_CHAR
206
207
static const int8_t u8_number_of_bytes[0x100] = {
208
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
209
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
210
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
211
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
212
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
214
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
216
217
/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
218
I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
219
220
/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
221
I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
222
223
/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
224
I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
225
226
/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
227
I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
228
229
/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
230
I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
231
232
/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
233
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
234
235
/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
236
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
237
238
/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
239
4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
240
};
241
242
#undef I_
243
#undef O_
244
245
/*
 * Indexed by the first byte of a character: the smallest value the second
 * byte may take. Entries for first bytes that do not start a multi-byte
 * character are zero.
 */
static const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

	/* C0 - CF */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D0 - DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E0 - EF (E0 starts at 0xa0) */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* F0 - FF (F0 starts at 0x90) */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
};
286
287
/*
 * Indexed by the first byte of a character: the largest value the second
 * byte may take. Entries for first bytes that do not start a multi-byte
 * character are zero.
 */
static const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

	/* C0 - CF */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D0 - DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E0 - EF (ED stops at 0x9f) */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

	/* F0 - FF (F4 stops at 0x8f) */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
};
328
329
330
/*
331
* The u8_validate() validates on the given UTF-8 character string and
332
* calculate the byte length. It is quite similar to mblen(3C) except that
333
* this will validate against the list of characters if required and
334
* specific to UTF-8 and Unicode.
335
*/
336
int
337
u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum)
338
{
339
uchar_t *ib;
340
uchar_t *ibtail;
341
uchar_t **p;
342
uchar_t *s1;
343
uchar_t *s2;
344
uchar_t f;
345
int sz;
346
size_t i;
347
int ret_val;
348
boolean_t second;
349
boolean_t no_need_to_validate_entire;
350
boolean_t check_additional;
351
boolean_t validate_ucs2_range_only;
352
353
if (! u8str)
354
return (0);
355
356
ib = (uchar_t *)u8str;
357
ibtail = ib + n;
358
359
ret_val = 0;
360
361
no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
362
check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
363
validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
364
365
while (ib < ibtail) {
366
/*
367
* The first byte of a UTF-8 character tells how many
368
* bytes will follow for the character. If the first byte
369
* is an illegal byte value or out of range value, we just
370
* return -1 with an appropriate error number.
371
*/
372
sz = u8_number_of_bytes[*ib];
373
if (sz == U8_ILLEGAL_CHAR) {
374
*errnum = EILSEQ;
375
return (-1);
376
}
377
378
if (sz == U8_OUT_OF_RANGE_CHAR ||
379
(validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
380
*errnum = ERANGE;
381
return (-1);
382
}
383
384
/*
385
* If we don't have enough bytes to check on, that's also
386
* an error. As you can see, we give illegal byte sequence
387
* checking higher priority then EINVAL cases.
388
*/
389
if ((ibtail - ib) < sz) {
390
*errnum = EINVAL;
391
return (-1);
392
}
393
394
if (sz == 1) {
395
ib++;
396
ret_val++;
397
} else {
398
/*
399
* Check on the multi-byte UTF-8 character. For more
400
* details on this, see comment added for the used
401
* data structures at the beginning of the file.
402
*/
403
f = *ib++;
404
ret_val++;
405
second = B_TRUE;
406
for (i = 1; i < sz; i++) {
407
if (second) {
408
if (*ib < u8_valid_min_2nd_byte[f] ||
409
*ib > u8_valid_max_2nd_byte[f]) {
410
*errnum = EILSEQ;
411
return (-1);
412
}
413
second = B_FALSE;
414
} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
415
*errnum = EILSEQ;
416
return (-1);
417
}
418
ib++;
419
ret_val++;
420
}
421
}
422
423
if (check_additional) {
424
for (p = (uchar_t **)list, i = 0; p[i]; i++) {
425
s1 = ib - sz;
426
s2 = p[i];
427
while (s1 < ib) {
428
if (*s1 != *s2 || *s2 == '\0')
429
break;
430
s1++;
431
s2++;
432
}
433
434
if (s1 >= ib && *s2 == '\0') {
435
*errnum = EBADF;
436
return (-1);
437
}
438
}
439
}
440
441
if (no_need_to_validate_entire)
442
break;
443
}
444
445
return (ret_val);
446
}
447
448
/*
449
* The do_case_conv() looks at the mapping tables and returns found
450
* bytes if any. If not found, the input bytes are returned. The function
451
* always terminate the return bytes with a null character assuming that
452
* there are plenty of room to do so.
453
*
454
* The case conversions are simple case conversions mapping a character to
455
* another character as specified in the Unicode data. The byte size of
456
* the mapped character could be different from that of the input character.
457
*
458
* The return value is the byte length of the returned character excluding
459
* the terminating null byte.
460
*/
461
static size_t
462
do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
463
{
464
size_t i;
465
uint16_t b1 = 0;
466
uint16_t b2 = 0;
467
uint16_t b3 = 0;
468
uint16_t b3_tbl;
469
uint16_t b3_base;
470
uint16_t b4 = 0;
471
size_t start_id;
472
size_t end_id;
473
474
/*
475
* At this point, the only possible values for sz are 2, 3, and 4.
476
* The u8s should point to a vector that is well beyond the size of
477
* 5 bytes.
478
*/
479
if (sz == 2) {
480
b3 = u8s[0] = s[0];
481
b4 = u8s[1] = s[1];
482
} else if (sz == 3) {
483
b2 = u8s[0] = s[0];
484
b3 = u8s[1] = s[1];
485
b4 = u8s[2] = s[2];
486
} else if (sz == 4) {
487
b1 = u8s[0] = s[0];
488
b2 = u8s[1] = s[1];
489
b3 = u8s[2] = s[2];
490
b4 = u8s[3] = s[3];
491
} else {
492
/* This is not possible but just in case as a fallback. */
493
if (is_it_toupper)
494
*u8s = U8_ASCII_TOUPPER(*s);
495
else
496
*u8s = U8_ASCII_TOLOWER(*s);
497
u8s[1] = '\0';
498
499
return (1);
500
}
501
u8s[sz] = '\0';
502
503
/*
504
* Let's find out if we have a corresponding character.
505
*/
506
b1 = u8_common_b1_tbl[uv][b1];
507
if (b1 == U8_TBL_ELEMENT_NOT_DEF)
508
return ((size_t)sz);
509
510
b2 = u8_case_common_b2_tbl[uv][b1][b2];
511
if (b2 == U8_TBL_ELEMENT_NOT_DEF)
512
return ((size_t)sz);
513
514
if (is_it_toupper) {
515
b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
516
if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
517
return ((size_t)sz);
518
519
start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
520
end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
521
522
/* Either there is no match or an error at the table. */
523
if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
524
return ((size_t)sz);
525
526
b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
527
528
for (i = 0; start_id < end_id; start_id++)
529
u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
530
} else {
531
#ifdef U8_STRCMP_CI_LOWER
532
b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
533
if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
534
return ((size_t)sz);
535
536
start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
537
end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
538
539
if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
540
return ((size_t)sz);
541
542
b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
543
544
for (i = 0; start_id < end_id; start_id++)
545
u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
546
#else
547
__builtin_unreachable();
548
#endif
549
}
550
551
/*
552
* If i is still zero, that means there is no corresponding character.
553
*/
554
if (i == 0)
555
return ((size_t)sz);
556
557
u8s[i] = '\0';
558
559
return (i);
560
}
561
562
/*
563
* The do_case_compare() function compares the two input strings, s1 and s2,
564
* one character at a time doing case conversions if applicable and return
565
* the comparison result as like strcmp().
566
*
567
* Since, in empirical sense, most of text data are 7-bit ASCII characters,
568
* we treat the 7-bit ASCII characters as a special case trying to yield
569
* faster processing time.
570
*/
571
static int
572
do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
573
size_t n2, boolean_t is_it_toupper, int *errnum)
574
{
575
int f;
576
int sz1;
577
int sz2;
578
size_t j;
579
size_t i1;
580
size_t i2;
581
uchar_t u8s1[U8_MB_CUR_MAX + 1];
582
uchar_t u8s2[U8_MB_CUR_MAX + 1];
583
584
i1 = i2 = 0;
585
while (i1 < n1 && i2 < n2) {
586
/*
587
* Find out what would be the byte length for this UTF-8
588
* character at string s1 and also find out if this is
589
* an illegal start byte or not and if so, issue a proper
590
* error number and yet treat this byte as a character.
591
*/
592
sz1 = u8_number_of_bytes[*s1];
593
if (sz1 < 0) {
594
*errnum = EILSEQ;
595
sz1 = 1;
596
}
597
598
/*
599
* For 7-bit ASCII characters mainly, we do a quick case
600
* conversion right at here.
601
*
602
* If we don't have enough bytes for this character, issue
603
* an EINVAL error and use what are available.
604
*
605
* If we have enough bytes, find out if there is
606
* a corresponding uppercase character and if so, copy over
607
* the bytes for a comparison later. If there is no
608
* corresponding uppercase character, then, use what we have
609
* for the comparison.
610
*/
611
if (sz1 == 1) {
612
if (is_it_toupper)
613
u8s1[0] = U8_ASCII_TOUPPER(*s1);
614
else
615
u8s1[0] = U8_ASCII_TOLOWER(*s1);
616
s1++;
617
u8s1[1] = '\0';
618
} else if ((i1 + sz1) > n1) {
619
*errnum = EINVAL;
620
for (j = 0; (i1 + j) < n1; )
621
u8s1[j++] = *s1++;
622
u8s1[j] = '\0';
623
} else {
624
(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
625
s1 += sz1;
626
}
627
628
/* Do the same for the string s2. */
629
sz2 = u8_number_of_bytes[*s2];
630
if (sz2 < 0) {
631
*errnum = EILSEQ;
632
sz2 = 1;
633
}
634
635
if (sz2 == 1) {
636
if (is_it_toupper)
637
u8s2[0] = U8_ASCII_TOUPPER(*s2);
638
else
639
u8s2[0] = U8_ASCII_TOLOWER(*s2);
640
s2++;
641
u8s2[1] = '\0';
642
} else if ((i2 + sz2) > n2) {
643
*errnum = EINVAL;
644
for (j = 0; (i2 + j) < n2; )
645
u8s2[j++] = *s2++;
646
u8s2[j] = '\0';
647
} else {
648
(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
649
s2 += sz2;
650
}
651
652
/* Now compare the two characters. */
653
if (sz1 == 1 && sz2 == 1) {
654
if (*u8s1 > *u8s2)
655
return (1);
656
if (*u8s1 < *u8s2)
657
return (-1);
658
} else {
659
f = strcmp((const char *)u8s1, (const char *)u8s2);
660
if (f != 0)
661
return (f);
662
}
663
664
/*
665
* They were the same. Let's move on to the next
666
* characters then.
667
*/
668
i1 += sz1;
669
i2 += sz2;
670
}
671
672
/*
673
* We compared until the end of either or both strings.
674
*
675
* If we reached to or went over the ends for the both, that means
676
* they are the same.
677
*
678
* If we reached only one of the two ends, that means the other string
679
* has something which then the fact can be used to determine
680
* the return value.
681
*/
682
if (i1 >= n1) {
683
if (i2 >= n2)
684
return (0);
685
return (-1);
686
}
687
return (1);
688
}
689
690
/*
691
* The combining_class() function checks on the given bytes and find out
692
* the corresponding Unicode combining class value. The return value 0 means
693
* it is a Starter. Any illegal UTF-8 character will also be treated as
694
* a Starter.
695
*/
696
static uchar_t
697
combining_class(size_t uv, uchar_t *s, size_t sz)
698
{
699
uint16_t b1 = 0;
700
uint16_t b2 = 0;
701
uint16_t b3 = 0;
702
uint16_t b4 = 0;
703
704
if (sz == 1 || sz > 4)
705
return (0);
706
707
if (sz == 2) {
708
b3 = s[0];
709
b4 = s[1];
710
} else if (sz == 3) {
711
b2 = s[0];
712
b3 = s[1];
713
b4 = s[2];
714
} else if (sz == 4) {
715
b1 = s[0];
716
b2 = s[1];
717
b3 = s[2];
718
b4 = s[3];
719
}
720
721
b1 = u8_common_b1_tbl[uv][b1];
722
if (b1 == U8_TBL_ELEMENT_NOT_DEF)
723
return (0);
724
725
b2 = u8_combining_class_b2_tbl[uv][b1][b2];
726
if (b2 == U8_TBL_ELEMENT_NOT_DEF)
727
return (0);
728
729
b3 = u8_combining_class_b3_tbl[uv][b2][b3];
730
if (b3 == U8_TBL_ELEMENT_NOT_DEF)
731
return (0);
732
733
return (u8_combining_class_b4_tbl[uv][b3][b4]);
734
}
735
736
/*
737
* The do_decomp() function finds out a matching decomposition if any
738
* and return. If there is no match, the input bytes are copied and returned.
739
* The function also checks if there is a Hangul, decomposes it if necessary
740
* and returns.
741
*
742
* To save time, a single byte 7-bit ASCII character should be handled by
743
* the caller.
744
*
745
* The function returns the number of bytes returned sans always terminating
746
* the null byte. It will also return a state that will tell if there was
747
* a Hangul character decomposed which then will be used by the caller.
748
*/
749
static size_t
750
do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
751
boolean_t canonical_decomposition, u8_normalization_states_t *state)
752
{
753
uint16_t b1 = 0;
754
uint16_t b2 = 0;
755
uint16_t b3 = 0;
756
uint16_t b3_tbl;
757
uint16_t b3_base;
758
uint16_t b4 = 0;
759
size_t start_id;
760
size_t end_id;
761
size_t i;
762
uint32_t u1;
763
764
if (sz == 2) {
765
b3 = u8s[0] = s[0];
766
b4 = u8s[1] = s[1];
767
u8s[2] = '\0';
768
} else if (sz == 3) {
769
/* Convert it to a Unicode scalar value. */
770
U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
771
772
/*
773
* If this is a Hangul syllable, we decompose it into
774
* a leading consonant, a vowel, and an optional trailing
775
* consonant and then return.
776
*/
777
if (U8_HANGUL_SYLLABLE(u1)) {
778
u1 -= U8_HANGUL_SYL_FIRST;
779
780
b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
781
b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
782
/ U8_HANGUL_T_COUNT;
783
b3 = u1 % U8_HANGUL_T_COUNT;
784
785
U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
786
U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
787
if (b3) {
788
b3 += U8_HANGUL_JAMO_T_FIRST;
789
U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
790
791
u8s[9] = '\0';
792
*state = U8_STATE_HANGUL_LVT;
793
return (9);
794
}
795
796
u8s[6] = '\0';
797
*state = U8_STATE_HANGUL_LV;
798
return (6);
799
}
800
801
b2 = u8s[0] = s[0];
802
b3 = u8s[1] = s[1];
803
b4 = u8s[2] = s[2];
804
u8s[3] = '\0';
805
806
/*
807
* If this is a Hangul Jamo, we know there is nothing
808
* further that we can decompose.
809
*/
810
if (U8_HANGUL_JAMO_L(u1)) {
811
*state = U8_STATE_HANGUL_L;
812
return (3);
813
}
814
815
if (U8_HANGUL_JAMO_V(u1)) {
816
if (*state == U8_STATE_HANGUL_L)
817
*state = U8_STATE_HANGUL_LV;
818
else
819
*state = U8_STATE_HANGUL_V;
820
return (3);
821
}
822
823
if (U8_HANGUL_JAMO_T(u1)) {
824
if (*state == U8_STATE_HANGUL_LV)
825
*state = U8_STATE_HANGUL_LVT;
826
else
827
*state = U8_STATE_HANGUL_T;
828
return (3);
829
}
830
} else if (sz == 4) {
831
b1 = u8s[0] = s[0];
832
b2 = u8s[1] = s[1];
833
b3 = u8s[2] = s[2];
834
b4 = u8s[3] = s[3];
835
u8s[4] = '\0';
836
} else {
837
/*
838
* This is a fallback and should not happen if the function
839
* was called properly.
840
*/
841
u8s[0] = s[0];
842
u8s[1] = '\0';
843
*state = U8_STATE_START;
844
return (1);
845
}
846
847
/*
848
* At this point, this routine does not know what it would get.
849
* The caller should sort it out if the state isn't a Hangul one.
850
*/
851
*state = U8_STATE_START;
852
853
/* Try to find matching decomposition mapping byte sequence. */
854
b1 = u8_common_b1_tbl[uv][b1];
855
if (b1 == U8_TBL_ELEMENT_NOT_DEF)
856
return ((size_t)sz);
857
858
b2 = u8_decomp_b2_tbl[uv][b1][b2];
859
if (b2 == U8_TBL_ELEMENT_NOT_DEF)
860
return ((size_t)sz);
861
862
b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
863
if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
864
return ((size_t)sz);
865
866
/*
867
* If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
868
* which is 0x8000, this means we couldn't fit the mappings into
869
* the cardinality of a unsigned byte.
870
*/
871
if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
872
b3_tbl -= U8_16BIT_TABLE_INDICATOR;
873
start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
874
end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
875
} else {
876
// cppcheck-suppress arrayIndexOutOfBoundsCond
877
start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
878
// cppcheck-suppress arrayIndexOutOfBoundsCond
879
end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
880
}
881
882
/* This also means there wasn't any matching decomposition. */
883
if (start_id >= end_id)
884
return ((size_t)sz);
885
886
/*
887
* The final table for decomposition mappings has three types of
888
* byte sequences depending on whether a mapping is for compatibility
889
* decomposition, canonical decomposition, or both like the following:
890
*
891
* (1) Compatibility decomposition mappings:
892
*
893
* +---+---+-...-+---+
894
* | B0| B1| ... | Bm|
895
* +---+---+-...-+---+
896
*
897
* The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
898
*
899
* (2) Canonical decomposition mappings:
900
*
901
* +---+---+---+-...-+---+
902
* | T | b0| b1| ... | bn|
903
* +---+---+---+-...-+---+
904
*
905
* where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
906
*
907
* (3) Both mappings:
908
*
909
* +---+---+---+---+-...-+---+---+---+-...-+---+
910
* | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
911
* +---+---+---+---+-...-+---+---+---+-...-+---+
912
*
913
* where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
914
* byte, b0 to bn are canonical mapping bytes and B0 to Bm are
915
* compatibility mapping bytes.
916
*
917
* Note that compatibility decomposition means doing recursive
918
* decompositions using both compatibility decomposition mappings and
919
* canonical decomposition mappings. On the other hand, canonical
920
* decomposition means doing recursive decompositions using only
921
* canonical decomposition mappings. Since the table we have has gone
922
* through the recursions already, we do not need to do so during
923
* runtime, i.e., the table has been completely flattened out
924
* already.
925
*/
926
927
b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
928
929
/* Get the type, T, of the byte sequence. */
930
b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
931
932
/*
933
* If necessary, adjust start_id, end_id, or both. Note that if
934
* this is compatibility decomposition mapping, there is no
935
* adjustment.
936
*/
937
if (canonical_decomposition) {
938
/* Is the mapping only for compatibility decomposition? */
939
if (b1 < U8_DECOMP_BOTH)
940
return ((size_t)sz);
941
942
start_id++;
943
944
if (b1 == U8_DECOMP_BOTH) {
945
end_id = start_id +
946
u8_decomp_final_tbl[uv][b3_base + start_id];
947
start_id++;
948
}
949
} else {
950
/*
951
* Unless this is a compatibility decomposition mapping,
952
* we adjust the start_id.
953
*/
954
if (b1 == U8_DECOMP_BOTH) {
955
start_id++;
956
start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
957
} else if (b1 == U8_DECOMP_CANONICAL) {
958
start_id++;
959
}
960
}
961
962
for (i = 0; start_id < end_id; start_id++)
963
u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
964
u8s[i] = '\0';
965
966
return (i);
967
}
968
969
/*
970
* The find_composition_start() function uses the character bytes given and
971
* find out the matching composition mappings if any and return the address
972
* to the composition mappings as explained in the do_composition().
973
*/
974
static uchar_t *
975
find_composition_start(size_t uv, uchar_t *s, size_t sz)
976
{
977
uint16_t b1 = 0;
978
uint16_t b2 = 0;
979
uint16_t b3 = 0;
980
uint16_t b3_tbl;
981
uint16_t b3_base;
982
uint16_t b4 = 0;
983
size_t start_id;
984
size_t end_id;
985
986
if (sz == 1) {
987
b4 = s[0];
988
} else if (sz == 2) {
989
b3 = s[0];
990
b4 = s[1];
991
} else if (sz == 3) {
992
b2 = s[0];
993
b3 = s[1];
994
b4 = s[2];
995
} else if (sz == 4) {
996
b1 = s[0];
997
b2 = s[1];
998
b3 = s[2];
999
b4 = s[3];
1000
} else {
1001
/*
1002
* This is a fallback and should not happen if the function
1003
* was called properly.
1004
*/
1005
return (NULL);
1006
}
1007
1008
b1 = u8_composition_b1_tbl[uv][b1];
1009
if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1010
return (NULL);
1011
1012
b2 = u8_composition_b2_tbl[uv][b1][b2];
1013
if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1014
return (NULL);
1015
1016
b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1017
if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1018
return (NULL);
1019
1020
if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1021
b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1022
start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1023
end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1024
} else {
1025
// cppcheck-suppress arrayIndexOutOfBoundsCond
1026
start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1027
// cppcheck-suppress arrayIndexOutOfBoundsCond
1028
end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1029
}
1030
1031
if (start_id >= end_id)
1032
return (NULL);
1033
1034
b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1035
1036
return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1037
}
1038
1039
/*
1040
* The blocked() function checks on the combining class values of previous
1041
* characters in this sequence and return whether it is blocked or not.
1042
*/
1043
static boolean_t
1044
blocked(uchar_t *comb_class, size_t last)
1045
{
1046
uchar_t my_comb_class;
1047
size_t i;
1048
1049
my_comb_class = comb_class[last];
1050
for (i = 1; i < last; i++)
1051
if (comb_class[i] >= my_comb_class ||
1052
comb_class[i] == U8_COMBINING_CLASS_STARTER)
1053
return (B_TRUE);
1054
1055
return (B_FALSE);
1056
}
1057
1058
/*
1059
* The do_composition() reads the character string pointed by 's' and
1060
* do necessary canonical composition and then copy over the result back to
1061
* the 's'.
1062
*
1063
* The input argument 's' cannot contain more than 32 characters.
1064
*/
1065
static size_t
1066
do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
1067
uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
1068
{
1069
uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
1070
uchar_t tc[U8_MB_CUR_MAX] = { '\0' };
1071
uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
1072
size_t saved_marks_count;
1073
uchar_t *p;
1074
uchar_t *saved_p;
1075
uchar_t *q;
1076
size_t i;
1077
size_t saved_i;
1078
size_t j;
1079
size_t k;
1080
size_t l;
1081
size_t C;
1082
size_t saved_l;
1083
size_t size;
1084
uint32_t u1;
1085
uint32_t u2;
1086
boolean_t match_not_found = B_TRUE;
1087
1088
/*
1089
* This should never happen unless the callers are doing some strange
1090
* and unexpected things.
1091
*
1092
* The "last" is the index pointing to the last character not last + 1.
1093
*/
1094
if (last >= U8_MAX_CHARS_A_SEQ)
1095
last = U8_UPPER_LIMIT_IN_A_SEQ;
1096
1097
for (i = l = 0; i <= last; i++) {
1098
/*
1099
* The last or any non-Starters at the beginning, we don't
1100
* have any chance to do composition and so we just copy them
1101
* to the temporary buffer.
1102
*/
1103
if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
1104
SAVE_THE_CHAR:
1105
p = s + start[i];
1106
size = disp[i];
1107
for (k = 0; k < size; k++)
1108
t[l++] = *p++;
1109
continue;
1110
}
1111
1112
/*
1113
* If this could be a start of Hangul Jamos, then, we try to
1114
* conjoin them.
1115
*/
1116
if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
1117
U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
1118
s[start[i] + 1], s[start[i] + 2]);
1119
U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
1120
s[start[i] + 4], s[start[i] + 5]);
1121
1122
if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
1123
u1 -= U8_HANGUL_JAMO_L_FIRST;
1124
u2 -= U8_HANGUL_JAMO_V_FIRST;
1125
u1 = U8_HANGUL_SYL_FIRST +
1126
(u1 * U8_HANGUL_V_COUNT + u2) *
1127
U8_HANGUL_T_COUNT;
1128
1129
i += 2;
1130
if (i <= last) {
1131
U8_PUT_3BYTES_INTO_UTF32(u2,
1132
s[start[i]], s[start[i] + 1],
1133
s[start[i] + 2]);
1134
1135
if (U8_HANGUL_JAMO_T(u2)) {
1136
u1 += u2 -
1137
U8_HANGUL_JAMO_T_FIRST;
1138
i++;
1139
}
1140
}
1141
1142
U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
1143
i--;
1144
l += 3;
1145
continue;
1146
}
1147
}
1148
1149
/*
1150
* Let's then find out if this Starter has composition
1151
* mapping.
1152
*/
1153
p = find_composition_start(uv, s + start[i], disp[i]);
1154
if (p == NULL)
1155
goto SAVE_THE_CHAR;
1156
1157
/*
1158
* We have a Starter with composition mapping and the next
1159
* character is a non-Starter. Let's try to find out if
1160
* we can do composition.
1161
*/
1162
1163
saved_p = p;
1164
saved_i = i;
1165
saved_l = l;
1166
saved_marks_count = 0;
1167
1168
TRY_THE_NEXT_MARK:
1169
q = s + start[++i];
1170
size = disp[i];
1171
1172
/*
1173
* The next for() loop compares the non-Starter pointed by
1174
* 'q' with the possible (joinable) characters pointed by 'p'.
1175
*
1176
* The composition final table entry pointed by the 'p'
1177
* looks like the following:
1178
*
1179
* +---+---+---+-...-+---+---+---+---+-...-+---+---+
1180
* | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
1181
* +---+---+---+-...-+---+---+---+---+-...-+---+---+
1182
*
1183
* where C is the count byte indicating the number of
1184
* mapping pairs where each pair would be look like
1185
* (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
1186
* character of a canonical decomposition and the B0-Bm are
1187
* the bytes of a matching composite character. The F is
1188
* a filler byte after each character as the separator.
1189
*/
1190
1191
match_not_found = B_TRUE;
1192
1193
for (C = *p++; C > 0; C--) {
1194
for (k = 0; k < size; p++, k++)
1195
if (*p != q[k])
1196
break;
1197
1198
/* Have we found it? */
1199
if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
1200
match_not_found = B_FALSE;
1201
1202
l = saved_l;
1203
1204
while (*++p != U8_TBL_ELEMENT_FILLER)
1205
t[l++] = *p;
1206
1207
break;
1208
}
1209
1210
/* We didn't find; skip to the next pair. */
1211
if (*p != U8_TBL_ELEMENT_FILLER)
1212
while (*++p != U8_TBL_ELEMENT_FILLER)
1213
;
1214
while (*++p != U8_TBL_ELEMENT_FILLER)
1215
;
1216
p++;
1217
}
1218
1219
/*
1220
* If there was no match, we will need to save the combining
1221
* mark for later appending. After that, if the next one
1222
* is a non-Starter and not blocked, then, we try once
1223
* again to do composition with the next non-Starter.
1224
*
1225
* If there was no match and this was a Starter, then,
1226
* this is a new start.
1227
*
1228
* If there was a match and a composition done and we have
1229
* more to check on, then, we retrieve a new composition final
1230
* table entry for the composite and then try to do the
1231
* composition again.
1232
*/
1233
1234
if (match_not_found) {
1235
if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
1236
i--;
1237
goto SAVE_THE_CHAR;
1238
}
1239
1240
saved_marks[saved_marks_count++] = i;
1241
}
1242
1243
if (saved_l == l) {
1244
while (i < last) {
1245
if (blocked(comb_class, i + 1))
1246
saved_marks[saved_marks_count++] = ++i;
1247
else
1248
break;
1249
}
1250
if (i < last) {
1251
p = saved_p;
1252
goto TRY_THE_NEXT_MARK;
1253
}
1254
} else if (i < last) {
1255
p = find_composition_start(uv, t + saved_l,
1256
l - saved_l);
1257
if (p != NULL) {
1258
saved_p = p;
1259
goto TRY_THE_NEXT_MARK;
1260
}
1261
}
1262
1263
/*
1264
* There is no more composition possible.
1265
*
1266
* If there was no composition what so ever then we copy
1267
* over the original Starter and then append any non-Starters
1268
* remaining at the target string sequentially after that.
1269
*/
1270
1271
if (saved_l == l) {
1272
p = s + start[saved_i];
1273
size = disp[saved_i];
1274
for (j = 0; j < size; j++)
1275
t[l++] = *p++;
1276
}
1277
1278
for (k = 0; k < saved_marks_count; k++) {
1279
p = s + start[saved_marks[k]];
1280
size = disp[saved_marks[k]];
1281
for (j = 0; j < size; j++)
1282
t[l++] = *p++;
1283
}
1284
}
1285
1286
/*
1287
* If the last character is a Starter and if we have a character
1288
* (possibly another Starter) that can be turned into a composite,
1289
* we do so and we do so until there is no more of composition
1290
* possible.
1291
*/
1292
if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
1293
p = *os;
1294
saved_l = l - disp[last];
1295
1296
while (p < oslast) {
1297
int8_t number_of_bytes = u8_number_of_bytes[*p];
1298
1299
if (number_of_bytes <= 1)
1300
break;
1301
size = number_of_bytes;
1302
if ((p + size) > oslast)
1303
break;
1304
1305
saved_p = p;
1306
1307
for (i = 0; i < size; i++)
1308
tc[i] = *p++;
1309
1310
q = find_composition_start(uv, t + saved_l,
1311
l - saved_l);
1312
if (q == NULL) {
1313
p = saved_p;
1314
break;
1315
}
1316
1317
match_not_found = B_TRUE;
1318
1319
for (C = *q++; C > 0; C--) {
1320
for (k = 0; k < size; q++, k++)
1321
if (*q != tc[k])
1322
break;
1323
1324
if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
1325
match_not_found = B_FALSE;
1326
1327
l = saved_l;
1328
1329
while (*++q != U8_TBL_ELEMENT_FILLER) {
1330
/*
1331
* This is practically
1332
* impossible but we don't
1333
* want to take any chances.
1334
*/
1335
if (l >=
1336
U8_STREAM_SAFE_TEXT_MAX) {
1337
p = saved_p;
1338
goto SAFE_RETURN;
1339
}
1340
t[l++] = *q;
1341
}
1342
1343
break;
1344
}
1345
1346
if (*q != U8_TBL_ELEMENT_FILLER)
1347
while (*++q != U8_TBL_ELEMENT_FILLER)
1348
;
1349
while (*++q != U8_TBL_ELEMENT_FILLER)
1350
;
1351
q++;
1352
}
1353
1354
if (match_not_found) {
1355
p = saved_p;
1356
break;
1357
}
1358
}
1359
SAFE_RETURN:
1360
*os = p;
1361
}
1362
1363
/*
1364
* Now we copy over the temporary string to the target string.
1365
* Since composition always reduces the number of characters or
1366
* the number of characters stay, we don't need to worry about
1367
* the buffer overflow here.
1368
*/
1369
for (i = 0; i < l; i++)
1370
s[i] = t[i];
1371
s[l] = '\0';
1372
1373
return (l);
1374
}
1375
1376
/*
1377
* The collect_a_seq() function checks on the given string s, collect
1378
* a sequence of characters at u8s, and return the sequence. While it collects
1379
* a sequence, it also applies case conversion, canonical or compatibility
1380
* decomposition, canonical decomposition, or some or all of them and
1381
* in that order.
1382
*
1383
* The collected sequence cannot be bigger than 32 characters since if
1384
* it is having more than 31 characters, the sequence will be terminated
1385
* with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
1386
* a Stream-Safe Text. The collected sequence is always terminated with
1387
* a null byte and the return value is the byte length of the sequence
1388
* including 0. The return value does not include the terminating
1389
* null byte.
1390
*/
1391
static size_t
1392
collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
1393
boolean_t is_it_toupper,
1394
boolean_t is_it_tolower,
1395
boolean_t canonical_decomposition,
1396
boolean_t compatibility_decomposition,
1397
boolean_t canonical_composition,
1398
int *errnum, u8_normalization_states_t *state)
1399
{
1400
uchar_t *s;
1401
int sz;
1402
int saved_sz;
1403
size_t i;
1404
size_t j;
1405
size_t k;
1406
size_t l;
1407
uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
1408
uchar_t disp[U8_MAX_CHARS_A_SEQ];
1409
uchar_t start[U8_MAX_CHARS_A_SEQ];
1410
uchar_t u8t[U8_MB_CUR_MAX] = { '\0' };
1411
uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
1412
uchar_t tc;
1413
size_t last;
1414
size_t saved_last;
1415
uint32_t u1;
1416
1417
/*
1418
* Save the source string pointer which we will return a changed
1419
* pointer if we do processing.
1420
*/
1421
s = *source;
1422
1423
/*
1424
* The following is a fallback for just in case callers are not
1425
* checking the string boundaries before the calling.
1426
*/
1427
if (s >= slast) {
1428
u8s[0] = '\0';
1429
1430
return (0);
1431
}
1432
1433
/*
1434
* As the first thing, let's collect a character and do case
1435
* conversion if necessary.
1436
*/
1437
1438
sz = u8_number_of_bytes[*s];
1439
1440
if (sz < 0) {
1441
*errnum = EILSEQ;
1442
1443
u8s[0] = *s++;
1444
u8s[1] = '\0';
1445
1446
*source = s;
1447
1448
return (1);
1449
}
1450
1451
if (sz == 1) {
1452
if (is_it_toupper)
1453
u8s[0] = U8_ASCII_TOUPPER(*s);
1454
else if (is_it_tolower)
1455
u8s[0] = U8_ASCII_TOLOWER(*s);
1456
else
1457
u8s[0] = *s;
1458
s++;
1459
u8s[1] = '\0';
1460
} else if ((s + sz) > slast) {
1461
*errnum = EINVAL;
1462
1463
for (i = 0; s < slast; )
1464
u8s[i++] = *s++;
1465
u8s[i] = '\0';
1466
1467
*source = s;
1468
1469
return (i);
1470
} else {
1471
if (is_it_toupper || is_it_tolower) {
1472
i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
1473
s += sz;
1474
sz = i;
1475
} else {
1476
for (i = 0; i < sz; )
1477
u8s[i++] = *s++;
1478
u8s[i] = '\0';
1479
}
1480
}
1481
1482
/*
1483
* And then canonical/compatibility decomposition followed by
1484
* an optional canonical composition. Please be noted that
1485
* canonical composition is done only when a decomposition is
1486
* done.
1487
*/
1488
if (canonical_decomposition || compatibility_decomposition) {
1489
if (sz == 1) {
1490
*state = U8_STATE_START;
1491
1492
saved_sz = 1;
1493
1494
comb_class[0] = 0;
1495
start[0] = 0;
1496
disp[0] = 1;
1497
1498
last = 1;
1499
} else {
1500
saved_sz = do_decomp(uv, u8s, u8s, sz,
1501
canonical_decomposition, state);
1502
1503
last = 0;
1504
1505
for (i = 0; i < saved_sz; ) {
1506
sz = u8_number_of_bytes[u8s[i]];
1507
1508
comb_class[last] = combining_class(uv,
1509
u8s + i, sz);
1510
start[last] = i;
1511
disp[last] = sz;
1512
1513
last++;
1514
i += sz;
1515
}
1516
1517
/*
1518
* Decomposition yields various Hangul related
1519
* states but not on combining marks. We need to
1520
* find out at here by checking on the last
1521
* character.
1522
*/
1523
if (*state == U8_STATE_START) {
1524
if (comb_class[last - 1])
1525
*state = U8_STATE_COMBINING_MARK;
1526
}
1527
}
1528
1529
saved_last = last;
1530
1531
while (s < slast) {
1532
sz = u8_number_of_bytes[*s];
1533
1534
/*
1535
* If this is an illegal character, an incomplete
1536
* character, or an 7-bit ASCII Starter character,
1537
* then we have collected a sequence; break and let
1538
* the next call deal with the two cases.
1539
*
1540
* Note that this is okay only if you are using this
1541
* function with a fixed length string, not on
1542
* a buffer with multiple calls of one chunk at a time.
1543
*/
1544
if (sz <= 1) {
1545
break;
1546
} else if ((s + sz) > slast) {
1547
break;
1548
} else {
1549
/*
1550
* If the previous character was a Hangul Jamo
1551
* and this character is a Hangul Jamo that
1552
* can be conjoined, we collect the Jamo.
1553
*/
1554
if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
1555
U8_PUT_3BYTES_INTO_UTF32(u1,
1556
*s, *(s + 1), *(s + 2));
1557
1558
if (U8_HANGUL_COMPOSABLE_L_V(*state,
1559
u1)) {
1560
i = 0;
1561
*state = U8_STATE_HANGUL_LV;
1562
goto COLLECT_A_HANGUL;
1563
}
1564
1565
if (U8_HANGUL_COMPOSABLE_LV_T(*state,
1566
u1)) {
1567
i = 0;
1568
*state = U8_STATE_HANGUL_LVT;
1569
goto COLLECT_A_HANGUL;
1570
}
1571
}
1572
1573
/*
1574
* Regardless of whatever it was, if this is
1575
* a Starter, we don't collect the character
1576
* since that's a new start and we will deal
1577
* with it at the next time.
1578
*/
1579
i = combining_class(uv, s, sz);
1580
if (i == U8_COMBINING_CLASS_STARTER)
1581
break;
1582
1583
/*
1584
* We know the current character is a combining
1585
* mark. If the previous character wasn't
1586
* a Starter (not Hangul) or a combining mark,
1587
* then, we don't collect this combining mark.
1588
*/
1589
if (*state != U8_STATE_START &&
1590
*state != U8_STATE_COMBINING_MARK)
1591
break;
1592
1593
*state = U8_STATE_COMBINING_MARK;
1594
COLLECT_A_HANGUL:
1595
/*
1596
* If we collected a Starter and combining
1597
* marks up to 30, i.e., total 31 characters,
1598
* then, we terminate this degenerately long
1599
* combining sequence with a U+034F COMBINING
1600
* GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
1601
* UTF-8 and turn this into a Stream-Safe
1602
* Text. This will be extremely rare but
1603
* possible.
1604
*
1605
* The following will also guarantee that
1606
* we are not writing more than 32 characters
1607
* plus a NULL at u8s[].
1608
*/
1609
if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
1610
TURN_STREAM_SAFE:
1611
*state = U8_STATE_START;
1612
comb_class[last] = 0;
1613
start[last] = saved_sz;
1614
disp[last] = 2;
1615
last++;
1616
1617
u8s[saved_sz++] = 0xCD;
1618
u8s[saved_sz++] = 0x8F;
1619
1620
break;
1621
}
1622
1623
/*
1624
* Some combining marks also do decompose into
1625
* another combining mark or marks.
1626
*/
1627
if (*state == U8_STATE_COMBINING_MARK) {
1628
k = last;
1629
l = sz;
1630
i = do_decomp(uv, uts, s, sz,
1631
canonical_decomposition, state);
1632
for (j = 0; j < i; ) {
1633
sz = u8_number_of_bytes[uts[j]];
1634
1635
comb_class[last] =
1636
combining_class(uv,
1637
uts + j, sz);
1638
start[last] = saved_sz + j;
1639
disp[last] = sz;
1640
1641
last++;
1642
if (last >=
1643
U8_UPPER_LIMIT_IN_A_SEQ) {
1644
last = k;
1645
goto TURN_STREAM_SAFE;
1646
}
1647
j += sz;
1648
}
1649
1650
*state = U8_STATE_COMBINING_MARK;
1651
sz = i;
1652
s += l;
1653
1654
for (i = 0; i < sz; i++)
1655
u8s[saved_sz++] = uts[i];
1656
} else {
1657
comb_class[last] = i;
1658
start[last] = saved_sz;
1659
disp[last] = sz;
1660
last++;
1661
1662
for (i = 0; i < sz; i++)
1663
u8s[saved_sz++] = *s++;
1664
}
1665
1666
/*
1667
* If this is U+0345 COMBINING GREEK
1668
* YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
1669
* iota subscript, and need to be converted to
1670
* uppercase letter, convert it to U+0399 GREEK
1671
* CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
1672
* i.e., convert to capital adscript form as
1673
* specified in the Unicode standard.
1674
*
1675
* This is the only special case of (ambiguous)
1676
* case conversion at combining marks and
1677
* probably the standard will never have
1678
* anything similar like this in future.
1679
*/
1680
if (is_it_toupper && sz >= 2 &&
1681
u8s[saved_sz - 2] == 0xCD &&
1682
u8s[saved_sz - 1] == 0x85) {
1683
u8s[saved_sz - 2] = 0xCE;
1684
u8s[saved_sz - 1] = 0x99;
1685
}
1686
}
1687
}
1688
1689
/*
1690
* Let's try to ensure a canonical ordering for the collected
1691
* combining marks. We do this only if we have collected
1692
* at least one more non-Starter. (The decomposition mapping
1693
* data tables have fully (and recursively) expanded and
1694
* canonically ordered decompositions.)
1695
*
1696
* The U8_SWAP_COMB_MARKS() convenience macro has some
1697
* assumptions and we are meeting the assumptions.
1698
*/
1699
last--;
1700
if (last >= saved_last) {
1701
for (i = 0; i < last; i++)
1702
for (j = last; j > i; j--)
1703
if (comb_class[j] &&
1704
comb_class[j - 1] > comb_class[j]) {
1705
U8_SWAP_COMB_MARKS(j - 1, j);
1706
}
1707
}
1708
1709
*source = s;
1710
1711
if (! canonical_composition) {
1712
u8s[saved_sz] = '\0';
1713
return (saved_sz);
1714
}
1715
1716
/*
1717
* Now do the canonical composition. Note that we do this
1718
* only after a canonical or compatibility decomposition to
1719
* finish up NFC or NFKC.
1720
*/
1721
sz = do_composition(uv, u8s, comb_class, start, disp, last,
1722
&s, slast);
1723
}
1724
1725
*source = s;
1726
1727
return ((size_t)sz);
1728
}
1729
1730
/*
1731
* The do_norm_compare() function does string comparison based on Unicode
1732
* simple case mappings and Unicode Normalization definitions.
1733
*
1734
* It does so by collecting a sequence of character at a time and comparing
1735
* the collected sequences from the strings.
1736
*
1737
* The meanings on the return values are the same as the usual strcmp().
1738
*/
1739
static int
1740
do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1741
int flag, int *errnum)
1742
{
1743
int result;
1744
size_t sz1;
1745
size_t sz2;
1746
uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1747
uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1748
uchar_t *s1last;
1749
uchar_t *s2last;
1750
boolean_t is_it_toupper;
1751
boolean_t is_it_tolower;
1752
boolean_t canonical_decomposition;
1753
boolean_t compatibility_decomposition;
1754
boolean_t canonical_composition;
1755
u8_normalization_states_t state;
1756
1757
s1last = s1 + n1;
1758
s2last = s2 + n2;
1759
1760
is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1761
#ifdef U8_STRCMP_CI_LOWER
1762
is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1763
#else
1764
is_it_tolower = 0;
1765
#endif
1766
canonical_decomposition = flag & U8_CANON_DECOMP;
1767
compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1768
canonical_composition = flag & U8_CANON_COMP;
1769
1770
while (s1 < s1last && s2 < s2last) {
1771
/*
1772
* If the current character is a 7-bit ASCII and the last
1773
* character, or, if the current character and the next
1774
* character are both some 7-bit ASCII characters then
1775
* we treat the current character as a sequence.
1776
*
1777
* In any other cases, we need to call collect_a_seq().
1778
*/
1779
1780
if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1781
((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1782
if (is_it_toupper)
1783
u8s1[0] = U8_ASCII_TOUPPER(*s1);
1784
else if (is_it_tolower)
1785
u8s1[0] = U8_ASCII_TOLOWER(*s1);
1786
else
1787
u8s1[0] = *s1;
1788
u8s1[1] = '\0';
1789
sz1 = 1;
1790
s1++;
1791
} else {
1792
state = U8_STATE_START;
1793
sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1794
is_it_toupper, is_it_tolower,
1795
canonical_decomposition,
1796
compatibility_decomposition,
1797
canonical_composition, errnum, &state);
1798
}
1799
1800
if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1801
((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1802
if (is_it_toupper)
1803
u8s2[0] = U8_ASCII_TOUPPER(*s2);
1804
else if (is_it_tolower)
1805
u8s2[0] = U8_ASCII_TOLOWER(*s2);
1806
else
1807
u8s2[0] = *s2;
1808
u8s2[1] = '\0';
1809
sz2 = 1;
1810
s2++;
1811
} else {
1812
state = U8_STATE_START;
1813
sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1814
is_it_toupper, is_it_tolower,
1815
canonical_decomposition,
1816
compatibility_decomposition,
1817
canonical_composition, errnum, &state);
1818
}
1819
1820
/*
1821
* Now compare the two characters. If they are the same,
1822
* we move on to the next character sequences.
1823
*/
1824
if (sz1 == 1 && sz2 == 1) {
1825
if (*u8s1 > *u8s2)
1826
return (1);
1827
if (*u8s1 < *u8s2)
1828
return (-1);
1829
} else {
1830
result = strcmp((const char *)u8s1, (const char *)u8s2);
1831
if (result != 0)
1832
return (result);
1833
}
1834
}
1835
1836
/*
1837
* We compared until the end of either or both strings.
1838
*
1839
* If we reached to or went over the ends for the both, that means
1840
* they are the same.
1841
*
1842
* If we reached only one end, that means the other string has
1843
* something which then can be used to determine the return value.
1844
*/
1845
if (s1 >= s1last) {
1846
if (s2 >= s2last)
1847
return (0);
1848
return (-1);
1849
}
1850
return (1);
1851
}
1852
1853
/*
1854
* The u8_strcmp() function compares two UTF-8 strings quite similar to
1855
* the strcmp(). For the comparison, however, Unicode Normalization specific
1856
* equivalency and Unicode simple case conversion mappings based equivalency
1857
* can be requested and checked against.
1858
*/
1859
int
1860
u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1861
int *errnum)
1862
{
1863
int f;
1864
size_t n1;
1865
size_t n2;
1866
1867
*errnum = 0;
1868
1869
/*
1870
* Check on the requested Unicode version, case conversion, and
1871
* normalization flag values.
1872
*/
1873
1874
if (uv > U8_UNICODE_LATEST) {
1875
*errnum = ERANGE;
1876
uv = U8_UNICODE_LATEST;
1877
}
1878
1879
if (flag == 0) {
1880
flag = U8_STRCMP_CS;
1881
} else {
1882
#ifdef U8_STRCMP_CI_LOWER
1883
f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER
1884
| U8_STRCMP_CI_LOWER);
1885
#else
1886
f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER);
1887
#endif
1888
if (f == 0) {
1889
flag |= U8_STRCMP_CS;
1890
}
1891
#ifdef U8_STRCMP_CI_LOWER
1892
else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1893
f != U8_STRCMP_CI_LOWER)
1894
#else
1895
else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER)
1896
#endif
1897
{
1898
*errnum = EBADF;
1899
flag = U8_STRCMP_CS;
1900
}
1901
1902
f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1903
if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1904
f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1905
*errnum = EBADF;
1906
flag = U8_STRCMP_CS;
1907
}
1908
}
1909
1910
if (flag == U8_STRCMP_CS) {
1911
return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1912
}
1913
1914
n1 = strlen(s1);
1915
n2 = strlen(s2);
1916
if (n != 0) {
1917
if (n < n1)
1918
n1 = n;
1919
if (n < n2)
1920
n2 = n;
1921
}
1922
1923
/*
1924
* Simple case conversion can be done much faster and so we do
1925
* them separately here.
1926
*/
1927
if (flag == U8_STRCMP_CI_UPPER) {
1928
return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1929
n1, n2, B_TRUE, errnum));
1930
}
1931
#ifdef U8_STRCMP_CI_LOWER
1932
else if (flag == U8_STRCMP_CI_LOWER) {
1933
return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1934
n1, n2, B_FALSE, errnum));
1935
}
1936
#endif
1937
1938
return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1939
flag, errnum));
1940
}
1941
1942
size_t
1943
u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1944
int flag, size_t unicode_version, int *errnum)
1945
{
1946
int f;
1947
int sz;
1948
uchar_t *ib;
1949
uchar_t *ibtail;
1950
uchar_t *ob;
1951
uchar_t *obtail;
1952
boolean_t do_not_ignore_null;
1953
boolean_t do_not_ignore_invalid;
1954
boolean_t is_it_toupper;
1955
boolean_t is_it_tolower;
1956
boolean_t canonical_decomposition;
1957
boolean_t compatibility_decomposition;
1958
boolean_t canonical_composition;
1959
size_t ret_val;
1960
size_t i;
1961
size_t j;
1962
uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1963
u8_normalization_states_t state;
1964
1965
if (unicode_version > U8_UNICODE_LATEST) {
1966
*errnum = ERANGE;
1967
return ((size_t)-1);
1968
}
1969
1970
#ifdef U8_TEXTPREP_TOLOWER
1971
f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1972
if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1973
*errnum = EBADF;
1974
return ((size_t)-1);
1975
}
1976
#endif
1977
1978
f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1979
if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1980
f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1981
*errnum = EBADF;
1982
return ((size_t)-1);
1983
}
1984
1985
if (inarray == NULL || *inlen == 0)
1986
return (0);
1987
1988
if (outarray == NULL) {
1989
*errnum = E2BIG;
1990
return ((size_t)-1);
1991
}
1992
1993
ib = (uchar_t *)inarray;
1994
ob = (uchar_t *)outarray;
1995
ibtail = ib + *inlen;
1996
obtail = ob + *outlen;
1997
1998
do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1999
do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
2000
is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
2001
#ifdef U8_TEXTPREP_TOLOWER
2002
is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
2003
#else
2004
is_it_tolower = 0;
2005
#endif
2006
2007
ret_val = 0;
2008
2009
/*
2010
* If we don't have a normalization flag set, we do the simple case
2011
* conversion based text preparation separately below. Text
2012
* preparation involving Normalization will be done in the false task
2013
* block, again, separately since it will take much more time and
2014
* resource than doing simple case conversions.
2015
*/
2016
if (f == 0) {
2017
while (ib < ibtail) {
2018
if (*ib == '\0' && do_not_ignore_null)
2019
break;
2020
2021
sz = u8_number_of_bytes[*ib];
2022
2023
if (sz < 0) {
2024
if (do_not_ignore_invalid) {
2025
*errnum = EILSEQ;
2026
ret_val = (size_t)-1;
2027
break;
2028
}
2029
2030
sz = 1;
2031
ret_val++;
2032
}
2033
2034
if (sz == 1) {
2035
if (ob >= obtail) {
2036
*errnum = E2BIG;
2037
ret_val = (size_t)-1;
2038
break;
2039
}
2040
2041
if (is_it_toupper)
2042
*ob = U8_ASCII_TOUPPER(*ib);
2043
else if (is_it_tolower)
2044
*ob = U8_ASCII_TOLOWER(*ib);
2045
else
2046
*ob = *ib;
2047
ib++;
2048
ob++;
2049
} else if ((ib + sz) > ibtail) {
2050
if (do_not_ignore_invalid) {
2051
*errnum = EINVAL;
2052
ret_val = (size_t)-1;
2053
break;
2054
}
2055
2056
if ((obtail - ob) < (ibtail - ib)) {
2057
*errnum = E2BIG;
2058
ret_val = (size_t)-1;
2059
break;
2060
}
2061
2062
/*
2063
* We treat the remaining incomplete character
2064
* bytes as a character.
2065
*/
2066
ret_val++;
2067
2068
while (ib < ibtail)
2069
*ob++ = *ib++;
2070
} else {
2071
if (is_it_toupper || is_it_tolower) {
2072
i = do_case_conv(unicode_version, u8s,
2073
ib, sz, is_it_toupper);
2074
2075
if ((obtail - ob) < i) {
2076
*errnum = E2BIG;
2077
ret_val = (size_t)-1;
2078
break;
2079
}
2080
2081
ib += sz;
2082
2083
for (sz = 0; sz < i; sz++)
2084
*ob++ = u8s[sz];
2085
} else {
2086
if ((obtail - ob) < sz) {
2087
*errnum = E2BIG;
2088
ret_val = (size_t)-1;
2089
break;
2090
}
2091
2092
for (i = 0; i < sz; i++)
2093
*ob++ = *ib++;
2094
}
2095
}
2096
}
2097
} else {
2098
canonical_decomposition = flag & U8_CANON_DECOMP;
2099
compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2100
canonical_composition = flag & U8_CANON_COMP;
2101
2102
while (ib < ibtail) {
2103
if (*ib == '\0' && do_not_ignore_null)
2104
break;
2105
2106
/*
2107
* If the current character is a 7-bit ASCII
2108
* character and it is the last character, or,
2109
* if the current character is a 7-bit ASCII
2110
* character and the next character is also a 7-bit
2111
* ASCII character, then, we copy over this
2112
* character without going through collect_a_seq().
2113
*
2114
* In any other cases, we need to look further with
2115
* the collect_a_seq() function.
2116
*/
2117
if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2118
((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2119
if (ob >= obtail) {
2120
*errnum = E2BIG;
2121
ret_val = (size_t)-1;
2122
break;
2123
}
2124
2125
if (is_it_toupper)
2126
*ob = U8_ASCII_TOUPPER(*ib);
2127
else if (is_it_tolower)
2128
*ob = U8_ASCII_TOLOWER(*ib);
2129
else
2130
*ob = *ib;
2131
ib++;
2132
ob++;
2133
} else {
2134
*errnum = 0;
2135
state = U8_STATE_START;
2136
2137
j = collect_a_seq(unicode_version, u8s,
2138
&ib, ibtail,
2139
is_it_toupper,
2140
is_it_tolower,
2141
canonical_decomposition,
2142
compatibility_decomposition,
2143
canonical_composition,
2144
errnum, &state);
2145
2146
if (*errnum && do_not_ignore_invalid) {
2147
ret_val = (size_t)-1;
2148
break;
2149
}
2150
2151
if ((obtail - ob) < j) {
2152
*errnum = E2BIG;
2153
ret_val = (size_t)-1;
2154
break;
2155
}
2156
2157
for (i = 0; i < j; i++)
2158
*ob++ = u8s[i];
2159
}
2160
}
2161
}
2162
2163
*inlen = ibtail - ib;
2164
*outlen = obtail - ob;
2165
2166
return (ret_val);
2167
}
2168
2169
/* Public entry points of this file, registered through EXPORT_SYMBOL(). */
EXPORT_SYMBOL(u8_validate);
EXPORT_SYMBOL(u8_strcmp);
EXPORT_SYMBOL(u8_textprep_str);
2172
2173