Path: blob/main/sys/contrib/openzfs/module/unicode/u8_textprep.c
48383 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright 2008 Sun Microsystems, Inc. All rights reserved.23* Use is subject to license terms.24*/2526/*27* Copyright 2022 MNX Cloud, Inc.28*/29303132/*33* UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).34*35* Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),36* u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also37* the section 3C man pages.38* Interface stability: Committed.39*/4041#include <sys/types.h>42#include <sys/string.h>43#include <sys/param.h>44#include <sys/sysmacros.h>45#include <sys/debug.h>46#include <sys/kmem.h>47#include <sys/sunddi.h>48#include <sys/u8_textprep.h>49#include <sys/byteorder.h>50#include <sys/errno.h>51#include <sys/u8_textprep_data.h>52#include <sys/mod.h>5354/* The maximum possible number of bytes in a UTF-8 character. */55#define U8_MB_CUR_MAX (4)5657/*58* The maximum number of bytes needed for a UTF-8 character to cover59* U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.60*/61#define U8_MAX_BYTES_UCS2 (3)6263/* The maximum possible number of bytes in a Stream-Safe Text. 
*/64#define U8_STREAM_SAFE_TEXT_MAX (128)6566/*67* The maximum number of characters in a combining/conjoining sequence and68* the actual upperbound limit of a combining/conjoining sequence.69*/70#define U8_MAX_CHARS_A_SEQ (32)71#define U8_UPPER_LIMIT_IN_A_SEQ (31)7273/* The combining class value for Starter. */74#define U8_COMBINING_CLASS_STARTER (0)7576/*77* Some Hangul related macros at below.78*79* The first and the last of Hangul syllables, Hangul Jamo Leading consonants,80* Vowels, and optional Trailing consonants in Unicode scalar values.81*82* Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not83* the actual U+11A8. This is due to that the trailing consonant is optional84* and thus we are doing a pre-calculation of subtracting one.85*86* Each of 19 modern leading consonants has total 588 possible syllables since87* Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for88* no trailing consonant case, i.e., 21 x 28 = 588.89*90* We also have bunch of Hangul related macros at below. 
Please bear in mind91* that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is92* a Hangul Jamo or not but the value does not guarantee that it is a Hangul93* Jamo; it just guarantee that it will be most likely.94*/95#define U8_HANGUL_SYL_FIRST (0xAC00U)96#define U8_HANGUL_SYL_LAST (0xD7A3U)9798#define U8_HANGUL_JAMO_L_FIRST (0x1100U)99#define U8_HANGUL_JAMO_L_LAST (0x1112U)100#define U8_HANGUL_JAMO_V_FIRST (0x1161U)101#define U8_HANGUL_JAMO_V_LAST (0x1175U)102#define U8_HANGUL_JAMO_T_FIRST (0x11A7U)103#define U8_HANGUL_JAMO_T_LAST (0x11C2U)104105#define U8_HANGUL_V_COUNT (21)106#define U8_HANGUL_VT_COUNT (588)107#define U8_HANGUL_T_COUNT (28)108109#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U)110111#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \112(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \113(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \114(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));115116#define U8_HANGUL_JAMO_L(u) \117((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)118119#define U8_HANGUL_JAMO_V(u) \120((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)121122#define U8_HANGUL_JAMO_T(u) \123((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)124125#define U8_HANGUL_JAMO(u) \126((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)127128#define U8_HANGUL_SYLLABLE(u) \129((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)130131#define U8_HANGUL_COMPOSABLE_L_V(s, u) \132((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))133134#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \135((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))136137/* The types of decomposition mappings. */138#define U8_DECOMP_BOTH (0xF5U)139#define U8_DECOMP_CANONICAL (0xF6U)140141/* The indicator for 16-bit table. */142#define U8_16BIT_TABLE_INDICATOR (0x8000U)143144/* The following are some convenience macros. 
*/145#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \146(u) = ((((uint32_t)(b1) & 0x0F) << 12) | \147(((uint32_t)(b2) & 0x3F) << 6) | \148((uint32_t)(b3) & 0x3F));149150#define U8_SIMPLE_SWAP(a, b, t) \151(t) = (a); \152(a) = (b); \153(b) = (t);154155#define U8_ASCII_TOUPPER(c) \156(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))157158#define U8_ASCII_TOLOWER(c) \159(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))160161#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U)162/*163* The following macro assumes that the two characters that are to be164* swapped are adjacent to each other and 'a' comes before 'b'.165*166* If the assumptions are not met, then, the macro will fail.167*/168#define U8_SWAP_COMB_MARKS(a, b) \169for (k = 0; k < disp[(a)]; k++) \170u8t[k] = u8s[start[(a)] + k]; \171for (k = 0; k < disp[(b)]; k++) \172u8s[start[(a)] + k] = u8s[start[(b)] + k]; \173start[(b)] = start[(a)] + disp[(b)]; \174for (k = 0; k < disp[(a)]; k++) \175u8s[start[(b)] + k] = u8t[k]; \176U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \177U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);178179/* The possible states during normalization. */180typedef enum {181U8_STATE_START = 0,182U8_STATE_HANGUL_L = 1,183U8_STATE_HANGUL_LV = 2,184U8_STATE_HANGUL_LVT = 3,185U8_STATE_HANGUL_V = 4,186U8_STATE_HANGUL_T = 5,187U8_STATE_COMBINING_MARK = 6188} u8_normalization_states_t;189190/*191* The three vectors at below are used to check bytes of a given UTF-8192* character are valid and not containing any malformed byte values.193*194* We used to have a quite relaxed UTF-8 binary representation but then there195* was some security related issues and so the Unicode Consortium defined196* and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it197* one more time at the Unicode 3.2. 
The following three tables are based on
 * that.
 */

/*
 * True when 'c' cannot be a UTF-8 continuation byte, i.e. it lies outside
 * 0x80..0xBF.
 */
#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)

/* Short aliases used only to keep the table below readable. */
#define	I_				U8_ILLEGAL_CHAR
#define	O_				U8_OUT_OF_RANGE_CHAR

/*
 * Indexed by the first byte of a character. Each entry is either the byte
 * length of the character that starts with that byte (1 to 4), or
 * U8_ILLEGAL_CHAR for a byte that cannot start a character (0x80..0xC1),
 * or U8_OUT_OF_RANGE_CHAR for 0xF5..0xFF.
 */
static const int8_t u8_number_of_bytes[0x100] = {
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
};

#undef	I_
#undef	O_

/*
 * Indexed by the first byte of a multi-byte character: the smallest value
 * accepted for the second byte. Only the entries for 0xC2..0xF4 are
 * non-zero; 0xE0 and 0xF0 have raised minimums (0xA0 and 0x90), every other
 * lead byte uses 0x80.
 */
static const uint8_t u8_valid_min_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7  */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8    C9    CA    CB    CC    CD    CE    CF  */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0    D1    D2    D3    D4    D5    D6    D7  */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8    D9    DA    DB    DC    DD    DE    DF  */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0    E1    E2    E3    E4    E5    E6    E7  */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8    E9    EA    EB    EC    ED    EE    EF  */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0    F1    F2    F3    F4    F5    F6    F7  */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
};

/*
 * Indexed by the first byte of a multi-byte character: the largest value
 * accepted for the second byte. It is the counterpart of
 * u8_valid_min_2nd_byte above.
 */
static const uint8_t u8_valid_max_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7  */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8    C9    CA    CB    CC    CD    CE    CF  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0    D1    D2    D3    D4    D5    D6    D7  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8    D9    DA    DB    DC    DD    DE    DF  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0    E1    E2    E3    E4    E5    E6    E7  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8
E9 EA EB EC ED EE EF */3220xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,323/* F0 F1 F2 F3 F4 F5 F6 F7 */3240xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,3250, 0, 0, 0, 0, 0, 0, 0,326};327328329/*330* The u8_validate() validates on the given UTF-8 character string and331* calculate the byte length. It is quite similar to mblen(3C) except that332* this will validate against the list of characters if required and333* specific to UTF-8 and Unicode.334*/335int336u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum)337{338uchar_t *ib;339uchar_t *ibtail;340uchar_t **p;341uchar_t *s1;342uchar_t *s2;343uchar_t f;344int sz;345size_t i;346int ret_val;347boolean_t second;348boolean_t no_need_to_validate_entire;349boolean_t check_additional;350boolean_t validate_ucs2_range_only;351352if (! u8str)353return (0);354355ib = (uchar_t *)u8str;356ibtail = ib + n;357358ret_val = 0;359360no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);361check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;362validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;363364while (ib < ibtail) {365/*366* The first byte of a UTF-8 character tells how many367* bytes will follow for the character. If the first byte368* is an illegal byte value or out of range value, we just369* return -1 with an appropriate error number.370*/371sz = u8_number_of_bytes[*ib];372if (sz == U8_ILLEGAL_CHAR) {373*errnum = EILSEQ;374return (-1);375}376377if (sz == U8_OUT_OF_RANGE_CHAR ||378(validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {379*errnum = ERANGE;380return (-1);381}382383/*384* If we don't have enough bytes to check on, that's also385* an error. As you can see, we give illegal byte sequence386* checking higher priority then EINVAL cases.387*/388if ((ibtail - ib) < sz) {389*errnum = EINVAL;390return (-1);391}392393if (sz == 1) {394ib++;395ret_val++;396} else {397/*398* Check on the multi-byte UTF-8 character. 
For more399* details on this, see comment added for the used400* data structures at the beginning of the file.401*/402f = *ib++;403ret_val++;404second = B_TRUE;405for (i = 1; i < sz; i++) {406if (second) {407if (*ib < u8_valid_min_2nd_byte[f] ||408*ib > u8_valid_max_2nd_byte[f]) {409*errnum = EILSEQ;410return (-1);411}412second = B_FALSE;413} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {414*errnum = EILSEQ;415return (-1);416}417ib++;418ret_val++;419}420}421422if (check_additional) {423for (p = (uchar_t **)list, i = 0; p[i]; i++) {424s1 = ib - sz;425s2 = p[i];426while (s1 < ib) {427if (*s1 != *s2 || *s2 == '\0')428break;429s1++;430s2++;431}432433if (s1 >= ib && *s2 == '\0') {434*errnum = EBADF;435return (-1);436}437}438}439440if (no_need_to_validate_entire)441break;442}443444return (ret_val);445}446447/*448* The do_case_conv() looks at the mapping tables and returns found449* bytes if any. If not found, the input bytes are returned. The function450* always terminate the return bytes with a null character assuming that451* there are plenty of room to do so.452*453* The case conversions are simple case conversions mapping a character to454* another character as specified in the Unicode data. 
The byte size of455* the mapped character could be different from that of the input character.456*457* The return value is the byte length of the returned character excluding458* the terminating null byte.459*/460static size_t461do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)462{463size_t i;464uint16_t b1 = 0;465uint16_t b2 = 0;466uint16_t b3 = 0;467uint16_t b3_tbl;468uint16_t b3_base;469uint16_t b4 = 0;470size_t start_id;471size_t end_id;472473/*474* At this point, the only possible values for sz are 2, 3, and 4.475* The u8s should point to a vector that is well beyond the size of476* 5 bytes.477*/478if (sz == 2) {479b3 = u8s[0] = s[0];480b4 = u8s[1] = s[1];481} else if (sz == 3) {482b2 = u8s[0] = s[0];483b3 = u8s[1] = s[1];484b4 = u8s[2] = s[2];485} else if (sz == 4) {486b1 = u8s[0] = s[0];487b2 = u8s[1] = s[1];488b3 = u8s[2] = s[2];489b4 = u8s[3] = s[3];490} else {491/* This is not possible but just in case as a fallback. */492if (is_it_toupper)493*u8s = U8_ASCII_TOUPPER(*s);494else495*u8s = U8_ASCII_TOLOWER(*s);496u8s[1] = '\0';497498return (1);499}500u8s[sz] = '\0';501502/*503* Let's find out if we have a corresponding character.504*/505b1 = u8_common_b1_tbl[uv][b1];506if (b1 == U8_TBL_ELEMENT_NOT_DEF)507return ((size_t)sz);508509b2 = u8_case_common_b2_tbl[uv][b1][b2];510if (b2 == U8_TBL_ELEMENT_NOT_DEF)511return ((size_t)sz);512513if (is_it_toupper) {514b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;515if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)516return ((size_t)sz);517518start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];519end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];520521/* Either there is no match or an error at the table. 
*/522if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)523return ((size_t)sz);524525b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;526527for (i = 0; start_id < end_id; start_id++)528u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];529} else {530#ifdef U8_STRCMP_CI_LOWER531b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;532if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)533return ((size_t)sz);534535start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];536end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];537538if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)539return ((size_t)sz);540541b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;542543for (i = 0; start_id < end_id; start_id++)544u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];545#else546__builtin_unreachable();547#endif548}549550/*551* If i is still zero, that means there is no corresponding character.552*/553if (i == 0)554return ((size_t)sz);555556u8s[i] = '\0';557558return (i);559}560561/*562* The do_case_compare() function compares the two input strings, s1 and s2,563* one character at a time doing case conversions if applicable and return564* the comparison result as like strcmp().565*566* Since, in empirical sense, most of text data are 7-bit ASCII characters,567* we treat the 7-bit ASCII characters as a special case trying to yield568* faster processing time.569*/570static int571do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,572size_t n2, boolean_t is_it_toupper, int *errnum)573{574int f;575int sz1;576int sz2;577size_t j;578size_t i1;579size_t i2;580uchar_t u8s1[U8_MB_CUR_MAX + 1];581uchar_t u8s2[U8_MB_CUR_MAX + 1];582583i1 = i2 = 0;584while (i1 < n1 && i2 < n2) {585/*586* Find out what would be the byte length for this UTF-8587* character at string s1 and also find out if this is588* an illegal start byte or not and if so, issue a proper589* error number and yet treat this byte as a character.590*/591sz1 = u8_number_of_bytes[*s1];592if (sz1 < 0) {593*errnum = EILSEQ;594sz1 = 
1;595}596597/*598* For 7-bit ASCII characters mainly, we do a quick case599* conversion right at here.600*601* If we don't have enough bytes for this character, issue602* an EINVAL error and use what are available.603*604* If we have enough bytes, find out if there is605* a corresponding uppercase character and if so, copy over606* the bytes for a comparison later. If there is no607* corresponding uppercase character, then, use what we have608* for the comparison.609*/610if (sz1 == 1) {611if (is_it_toupper)612u8s1[0] = U8_ASCII_TOUPPER(*s1);613else614u8s1[0] = U8_ASCII_TOLOWER(*s1);615s1++;616u8s1[1] = '\0';617} else if ((i1 + sz1) > n1) {618*errnum = EINVAL;619for (j = 0; (i1 + j) < n1; )620u8s1[j++] = *s1++;621u8s1[j] = '\0';622} else {623(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);624s1 += sz1;625}626627/* Do the same for the string s2. */628sz2 = u8_number_of_bytes[*s2];629if (sz2 < 0) {630*errnum = EILSEQ;631sz2 = 1;632}633634if (sz2 == 1) {635if (is_it_toupper)636u8s2[0] = U8_ASCII_TOUPPER(*s2);637else638u8s2[0] = U8_ASCII_TOLOWER(*s2);639s2++;640u8s2[1] = '\0';641} else if ((i2 + sz2) > n2) {642*errnum = EINVAL;643for (j = 0; (i2 + j) < n2; )644u8s2[j++] = *s2++;645u8s2[j] = '\0';646} else {647(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);648s2 += sz2;649}650651/* Now compare the two characters. */652if (sz1 == 1 && sz2 == 1) {653if (*u8s1 > *u8s2)654return (1);655if (*u8s1 < *u8s2)656return (-1);657} else {658f = strcmp((const char *)u8s1, (const char *)u8s2);659if (f != 0)660return (f);661}662663/*664* They were the same. 
Let's move on to the next665* characters then.666*/667i1 += sz1;668i2 += sz2;669}670671/*672* We compared until the end of either or both strings.673*674* If we reached to or went over the ends for the both, that means675* they are the same.676*677* If we reached only one of the two ends, that means the other string678* has something which then the fact can be used to determine679* the return value.680*/681if (i1 >= n1) {682if (i2 >= n2)683return (0);684return (-1);685}686return (1);687}688689/*690* The combining_class() function checks on the given bytes and find out691* the corresponding Unicode combining class value. The return value 0 means692* it is a Starter. Any illegal UTF-8 character will also be treated as693* a Starter.694*/695static uchar_t696combining_class(size_t uv, uchar_t *s, size_t sz)697{698uint16_t b1 = 0;699uint16_t b2 = 0;700uint16_t b3 = 0;701uint16_t b4 = 0;702703if (sz == 1 || sz > 4)704return (0);705706if (sz == 2) {707b3 = s[0];708b4 = s[1];709} else if (sz == 3) {710b2 = s[0];711b3 = s[1];712b4 = s[2];713} else if (sz == 4) {714b1 = s[0];715b2 = s[1];716b3 = s[2];717b4 = s[3];718}719720b1 = u8_common_b1_tbl[uv][b1];721if (b1 == U8_TBL_ELEMENT_NOT_DEF)722return (0);723724b2 = u8_combining_class_b2_tbl[uv][b1][b2];725if (b2 == U8_TBL_ELEMENT_NOT_DEF)726return (0);727728b3 = u8_combining_class_b3_tbl[uv][b2][b3];729if (b3 == U8_TBL_ELEMENT_NOT_DEF)730return (0);731732return (u8_combining_class_b4_tbl[uv][b3][b4]);733}734735/*736* The do_decomp() function finds out a matching decomposition if any737* and return. If there is no match, the input bytes are copied and returned.738* The function also checks if there is a Hangul, decomposes it if necessary739* and returns.740*741* To save time, a single byte 7-bit ASCII character should be handled by742* the caller.743*744* The function returns the number of bytes returned sans always terminating745* the null byte. 
It will also return a state that will tell if there was
 * a Hangul character decomposed which then will be used by the caller.
 *
 * Arguments:
 *	uv	first index into every lookup table used here.
 *	u8s	output buffer; the result is always null-terminated. The
 *		longest result written directly by this function is the
 *		9 bytes plus terminator of a decomposed Hangul syllable;
 *		table mappings are copied without a length check.
 *	s	the input character bytes.
 *	sz	the byte length of the input character (2, 3, or 4).
 *	canonical_decomposition
 *		B_TRUE to use only canonical mappings, B_FALSE to use the
 *		compatibility mappings.
 *	state	in: the state left by the previous character, which decides
 *		whether a Hangul vowel or trailing consonant continues a
 *		syllable; out: the state after this character.
 */
static size_t
do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
    boolean_t canonical_decomposition, u8_normalization_states_t *state)
{
	uint16_t b1 = 0;
	uint16_t b2 = 0;
	uint16_t b3 = 0;
	uint16_t b3_tbl;
	uint16_t b3_base;
	uint16_t b4 = 0;
	size_t start_id;
	size_t end_id;
	size_t i;
	uint32_t u1;

	/*
	 * Copy the input to the output and right-align its bytes into
	 * b1..b4, which serve as the table lookup keys further down.
	 */
	if (sz == 2) {
		b3 = u8s[0] = s[0];
		b4 = u8s[1] = s[1];
		u8s[2] = '\0';
	} else if (sz == 3) {
		/* Convert it to a Unicode scalar value. */
		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);

		/*
		 * If this is a Hangul syllable, we decompose it into
		 * a leading consonant, a vowel, and an optional trailing
		 * consonant and then return.
		 */
		if (U8_HANGUL_SYLLABLE(u1)) {
			u1 -= U8_HANGUL_SYL_FIRST;

			/* b1..b3 temporarily hold the L, V, and T values. */
			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
			    / U8_HANGUL_T_COUNT;
			b3 = u1 % U8_HANGUL_T_COUNT;

			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
			/* A zero remainder means no trailing consonant. */
			if (b3) {
				b3 += U8_HANGUL_JAMO_T_FIRST;
				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);

				u8s[9] = '\0';
				*state = U8_STATE_HANGUL_LVT;
				return (9);
			}

			u8s[6] = '\0';
			*state = U8_STATE_HANGUL_LV;
			return (6);
		}

		b2 = u8s[0] = s[0];
		b3 = u8s[1] = s[1];
		b4 = u8s[2] = s[2];
		u8s[3] = '\0';

		/*
		 * If this is a Hangul Jamo, we know there is nothing
		 * further that we can decompose.
		 */
		if (U8_HANGUL_JAMO_L(u1)) {
			*state = U8_STATE_HANGUL_L;
			return (3);
		}

		/* A vowel extends a pending L; otherwise it stands alone. */
		if (U8_HANGUL_JAMO_V(u1)) {
			if (*state == U8_STATE_HANGUL_L)
				*state = U8_STATE_HANGUL_LV;
			else
				*state = U8_STATE_HANGUL_V;
			return (3);
		}

		/* A trailing consonant extends a pending LV likewise. */
		if (U8_HANGUL_JAMO_T(u1)) {
			if (*state == U8_STATE_HANGUL_LV)
				*state = U8_STATE_HANGUL_LVT;
			else
				*state = U8_STATE_HANGUL_T;
			return (3);
		}
	} else if (sz == 4) {
		b1 = u8s[0] = s[0];
		b2 = u8s[1] = s[1];
		b3 = u8s[2] = s[2];
		b4 = u8s[3] = s[3];
		u8s[4] = '\0';
	} else {
		/*
		 * This is a fallback and should not happen if the function
		 * was called properly.
		 */
		u8s[0] = s[0];
		u8s[1] = '\0';
		*state = U8_STATE_START;
		return (1);
	}

	/*
	 * At this point, this routine does not know what it would get.
	 * The caller should sort it out if the state isn't a Hangul one.
	 */
	*state = U8_STATE_START;

	/* Try to find matching decomposition mapping byte sequence. */
	b1 = u8_common_b1_tbl[uv][b1];
	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
		return ((size_t)sz);

	b2 = u8_decomp_b2_tbl[uv][b1][b2];
	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
		return ((size_t)sz);

	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
		return ((size_t)sz);

	/*
	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
	 * which is 0x8000, this means we couldn't fit the mappings into
	 * the cardinality of a unsigned byte.
	 *
	 * In either table the entries at b4 and b4 + 1 delimit the mapping
	 * as the half-open range [start_id, end_id).
	 */
	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
	} else {
		// cppcheck-suppress arrayIndexOutOfBoundsCond
		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
		// cppcheck-suppress arrayIndexOutOfBoundsCond
		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
	}

	/* This also means there wasn't any matching decomposition. */
	if (start_id >= end_id)
		return ((size_t)sz);

	/*
	 * The final table for decomposition mappings has three types of
	 * byte sequences depending on whether a mapping is for compatibility
	 * decomposition, canonical decomposition, or both like the following:
	 *
	 * (1) Compatibility decomposition mappings:
	 *
	 *	+---+---+-...-+---+
	 *	| B0| B1| ... | Bm|
	 *	+---+---+-...-+---+
	 *
	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
	 *
	 * (2) Canonical decomposition mappings:
	 *
	 *	+---+---+---+-...-+---+
	 *	| T | b0| b1| ... | bn|
	 *	+---+---+---+-...-+---+
	 *
	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
	 *
	 * (3) Both mappings:
	 *
	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
	 *
	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
	 *	compatibility mapping bytes.
	 *
	 * Note that compatibility decomposition means doing recursive
	 * decompositions using both compatibility decomposition mappings and
	 * canonical decomposition mappings. On the other hand, canonical
	 * decomposition means doing recursive decompositions using only
	 * canonical decomposition mappings. Since the table we have has gone
	 * through the recursions already, we do not need to do so during
	 * runtime, i.e., the table has been completely flattened out
	 * already.
	 */

	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;

	/* Get the type, T, of the byte sequence. */
	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];

	/*
	 * If necessary, adjust start_id, end_id, or both. Note that if
	 * this is compatibility decomposition mapping, there is no
	 * adjustment.
	 */
	if (canonical_decomposition) {
		/* Is the mapping only for compatibility decomposition? */
		if (b1 < U8_DECOMP_BOTH)
			return ((size_t)sz);

		/* Step over the type byte, T. */
		start_id++;

		if (b1 == U8_DECOMP_BOTH) {
			/*
			 * The displacement byte, D, gives where the
			 * canonical bytes end; then step over D itself.
			 */
			end_id = start_id +
			    u8_decomp_final_tbl[uv][b3_base + start_id];
			start_id++;
		}
	} else {
		/*
		 * Unless this is a compatibility decomposition mapping,
		 * we adjust the start_id.
		 */
		if (b1 == U8_DECOMP_BOTH) {
			/* Step over T, then jump by D to the B0..Bm bytes. */
			start_id++;
			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
		} else if (b1 == U8_DECOMP_CANONICAL) {
			start_id++;
		}
	}

	/* Copy the selected mapping bytes over the input copy in u8s. */
	for (i = 0; start_id < end_id; start_id++)
		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
	u8s[i] = '\0';

	return (i);
}

/*
 * The find_composition_start() function uses the character bytes given and
 * find out the matching composition mappings if any and return the address
 * to the composition mappings as explained in the do_composition().
 *
 * The sz bytes at s (1 to 4) are right-aligned into four lookup keys. The
 * return value points into u8_composition_final_tbl, or is NULL when sz is
 * out of range or any table level has no entry for the character.
 */
static uchar_t *
find_composition_start(size_t uv, uchar_t *s, size_t sz)
{
	uint16_t b1 = 0;
	uint16_t b2 = 0;
	uint16_t b3 = 0;
	uint16_t b3_tbl;
	uint16_t b3_base;
	uint16_t b4 = 0;
	size_t start_id;
	size_t end_id;

	if (sz == 1) {
		b4 = s[0];
	} else if (sz == 2) {
		b3 = s[0];
		b4 = s[1];
	} else if (sz == 3) {
		b2 = s[0];
		b3 = s[1];
		b4 = s[2];
	} else if (sz == 4) {
		b1 = s[0];
		b2 = s[1];
		b3 = s[2];
		b4 = s[3];
	} else {
		/*
		 * This is a fallback and should not happen if the function
		 * was called properly.
		 */
		return (NULL);
	}

	b1 = u8_composition_b1_tbl[uv][b1];
	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
		return (NULL);

	b2 = u8_composition_b2_tbl[uv][b1][b2];
	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
		return (NULL);

	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
		return (NULL);

	/* A table id of 0x8000 or more selects the 16-bit offset table. */
	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
	} else {
		// cppcheck-suppress arrayIndexOutOfBoundsCond
		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
		// cppcheck-suppress arrayIndexOutOfBoundsCond
		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
	}

	/* An empty range means there is no mapping for this character. */
	if (start_id >= end_id)
		return (NULL);

	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;

	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
}

/*
 * The blocked() function checks on the combining class values of previous
 * characters in this sequence and return whether it is blocked or not.
 *
 * The character at index 'last' is reported as blocked (B_TRUE) when any
 * character at index 1 through last - 1 is a Starter or has a combining
 * class that is the same as or higher than that of the character at 'last'.
 * Index 0 is not examined.
 */
static boolean_t
blocked(uchar_t *comb_class, size_t last)
{
	uchar_t my_comb_class;
	size_t i;

	my_comb_class = comb_class[last];
	for (i = 1; i < last; i++)
		if (comb_class[i] >= my_comb_class ||
		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
			return (B_TRUE);

	return (B_FALSE);
}

/*
 * The do_composition() reads the character string pointed by 's' and
 * do necessary canonical composition and then copy over the result back to
 * the 's'.
 *
 * The input argument 's' cannot contain more than 32 characters.
 */
static size_t
do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
    uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
{
	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc[U8_MB_CUR_MAX] = { '\0' };
	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
	size_t saved_marks_count;
	uchar_t *p;
	uchar_t *saved_p;
	uchar_t *q;
	size_t i;
	size_t saved_i;
	size_t j;
	size_t k;
	size_t l;
	size_t C;
	size_t saved_l;
	size_t size;
	uint32_t u1;
	uint32_t u2;
	boolean_t match_not_found = B_TRUE;

	/*
	 * This should never happen unless the callers are doing some strange
	 * and unexpected things.
	 *
	 * The "last" is the index pointing to the last character not last + 1.
	 */
	if (last >= U8_MAX_CHARS_A_SEQ)
		last = U8_UPPER_LIMIT_IN_A_SEQ;

	for (i = l = 0; i <= last; i++) {
		/*
		 * The
last or any non-Starters at the beginning, we don't1099* have any chance to do composition and so we just copy them1100* to the temporary buffer.1101*/1102if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {1103SAVE_THE_CHAR:1104p = s + start[i];1105size = disp[i];1106for (k = 0; k < size; k++)1107t[l++] = *p++;1108continue;1109}11101111/*1112* If this could be a start of Hangul Jamos, then, we try to1113* conjoin them.1114*/1115if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {1116U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],1117s[start[i] + 1], s[start[i] + 2]);1118U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],1119s[start[i] + 4], s[start[i] + 5]);11201121if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {1122u1 -= U8_HANGUL_JAMO_L_FIRST;1123u2 -= U8_HANGUL_JAMO_V_FIRST;1124u1 = U8_HANGUL_SYL_FIRST +1125(u1 * U8_HANGUL_V_COUNT + u2) *1126U8_HANGUL_T_COUNT;11271128i += 2;1129if (i <= last) {1130U8_PUT_3BYTES_INTO_UTF32(u2,1131s[start[i]], s[start[i] + 1],1132s[start[i] + 2]);11331134if (U8_HANGUL_JAMO_T(u2)) {1135u1 += u2 -1136U8_HANGUL_JAMO_T_FIRST;1137i++;1138}1139}11401141U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);1142i--;1143l += 3;1144continue;1145}1146}11471148/*1149* Let's then find out if this Starter has composition1150* mapping.1151*/1152p = find_composition_start(uv, s + start[i], disp[i]);1153if (p == NULL)1154goto SAVE_THE_CHAR;11551156/*1157* We have a Starter with composition mapping and the next1158* character is a non-Starter. Let's try to find out if1159* we can do composition.1160*/11611162saved_p = p;1163saved_i = i;1164saved_l = l;1165saved_marks_count = 0;11661167TRY_THE_NEXT_MARK:1168q = s + start[++i];1169size = disp[i];11701171/*1172* The next for() loop compares the non-Starter pointed by1173* 'q' with the possible (joinable) characters pointed by 'p'.1174*1175* The composition final table entry pointed by the 'p'1176* looks like the following:1177*1178* +---+---+---+-...-+---+---+---+---+-...-+---+---+1179* | C | b0| b2| ... 
| bn| F | B0| B1| ... | Bm| F |1180* +---+---+---+-...-+---+---+---+---+-...-+---+---+1181*1182* where C is the count byte indicating the number of1183* mapping pairs where each pair would be look like1184* (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second1185* character of a canonical decomposition and the B0-Bm are1186* the bytes of a matching composite character. The F is1187* a filler byte after each character as the separator.1188*/11891190match_not_found = B_TRUE;11911192for (C = *p++; C > 0; C--) {1193for (k = 0; k < size; p++, k++)1194if (*p != q[k])1195break;11961197/* Have we found it? */1198if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {1199match_not_found = B_FALSE;12001201l = saved_l;12021203while (*++p != U8_TBL_ELEMENT_FILLER)1204t[l++] = *p;12051206break;1207}12081209/* We didn't find; skip to the next pair. */1210if (*p != U8_TBL_ELEMENT_FILLER)1211while (*++p != U8_TBL_ELEMENT_FILLER)1212;1213while (*++p != U8_TBL_ELEMENT_FILLER)1214;1215p++;1216}12171218/*1219* If there was no match, we will need to save the combining1220* mark for later appending. 
After that, if the next one1221* is a non-Starter and not blocked, then, we try once1222* again to do composition with the next non-Starter.1223*1224* If there was no match and this was a Starter, then,1225* this is a new start.1226*1227* If there was a match and a composition done and we have1228* more to check on, then, we retrieve a new composition final1229* table entry for the composite and then try to do the1230* composition again.1231*/12321233if (match_not_found) {1234if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {1235i--;1236goto SAVE_THE_CHAR;1237}12381239saved_marks[saved_marks_count++] = i;1240}12411242if (saved_l == l) {1243while (i < last) {1244if (blocked(comb_class, i + 1))1245saved_marks[saved_marks_count++] = ++i;1246else1247break;1248}1249if (i < last) {1250p = saved_p;1251goto TRY_THE_NEXT_MARK;1252}1253} else if (i < last) {1254p = find_composition_start(uv, t + saved_l,1255l - saved_l);1256if (p != NULL) {1257saved_p = p;1258goto TRY_THE_NEXT_MARK;1259}1260}12611262/*1263* There is no more composition possible.1264*1265* If there was no composition what so ever then we copy1266* over the original Starter and then append any non-Starters1267* remaining at the target string sequentially after that.1268*/12691270if (saved_l == l) {1271p = s + start[saved_i];1272size = disp[saved_i];1273for (j = 0; j < size; j++)1274t[l++] = *p++;1275}12761277for (k = 0; k < saved_marks_count; k++) {1278p = s + start[saved_marks[k]];1279size = disp[saved_marks[k]];1280for (j = 0; j < size; j++)1281t[l++] = *p++;1282}1283}12841285/*1286* If the last character is a Starter and if we have a character1287* (possibly another Starter) that can be turned into a composite,1288* we do so and we do so until there is no more of composition1289* possible.1290*/1291if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {1292p = *os;1293saved_l = l - disp[last];12941295while (p < oslast) {1296int8_t number_of_bytes = u8_number_of_bytes[*p];12971298if (number_of_bytes <= 
1)1299break;1300size = number_of_bytes;1301if ((p + size) > oslast)1302break;13031304saved_p = p;13051306for (i = 0; i < size; i++)1307tc[i] = *p++;13081309q = find_composition_start(uv, t + saved_l,1310l - saved_l);1311if (q == NULL) {1312p = saved_p;1313break;1314}13151316match_not_found = B_TRUE;13171318for (C = *q++; C > 0; C--) {1319for (k = 0; k < size; q++, k++)1320if (*q != tc[k])1321break;13221323if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {1324match_not_found = B_FALSE;13251326l = saved_l;13271328while (*++q != U8_TBL_ELEMENT_FILLER) {1329/*1330* This is practically1331* impossible but we don't1332* want to take any chances.1333*/1334if (l >=1335U8_STREAM_SAFE_TEXT_MAX) {1336p = saved_p;1337goto SAFE_RETURN;1338}1339t[l++] = *q;1340}13411342break;1343}13441345if (*q != U8_TBL_ELEMENT_FILLER)1346while (*++q != U8_TBL_ELEMENT_FILLER)1347;1348while (*++q != U8_TBL_ELEMENT_FILLER)1349;1350q++;1351}13521353if (match_not_found) {1354p = saved_p;1355break;1356}1357}1358SAFE_RETURN:1359*os = p;1360}13611362/*1363* Now we copy over the temporary string to the target string.1364* Since composition always reduces the number of characters or1365* the number of characters stay, we don't need to worry about1366* the buffer overflow here.1367*/1368for (i = 0; i < l; i++)1369s[i] = t[i];1370s[l] = '\0';13711372return (l);1373}13741375/*1376* The collect_a_seq() function checks on the given string s, collect1377* a sequence of characters at u8s, and return the sequence. While it collects1378* a sequence, it also applies case conversion, canonical or compatibility1379* decomposition, canonical decomposition, or some or all of them and1380* in that order.1381*1382* The collected sequence cannot be bigger than 32 characters since if1383* it is having more than 31 characters, the sequence will be terminated1384* with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into1385* a Stream-Safe Text. 
The collected sequence is always terminated with1386* a null byte and the return value is the byte length of the sequence1387* including 0. The return value does not include the terminating1388* null byte.1389*/1390static size_t1391collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,1392boolean_t is_it_toupper,1393boolean_t is_it_tolower,1394boolean_t canonical_decomposition,1395boolean_t compatibility_decomposition,1396boolean_t canonical_composition,1397int *errnum, u8_normalization_states_t *state)1398{1399uchar_t *s;1400int sz;1401int saved_sz;1402size_t i;1403size_t j;1404size_t k;1405size_t l;1406uchar_t comb_class[U8_MAX_CHARS_A_SEQ];1407uchar_t disp[U8_MAX_CHARS_A_SEQ];1408uchar_t start[U8_MAX_CHARS_A_SEQ];1409uchar_t u8t[U8_MB_CUR_MAX] = { '\0' };1410uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];1411uchar_t tc;1412size_t last;1413size_t saved_last;1414uint32_t u1;14151416/*1417* Save the source string pointer which we will return a changed1418* pointer if we do processing.1419*/1420s = *source;14211422/*1423* The following is a fallback for just in case callers are not1424* checking the string boundaries before the calling.1425*/1426if (s >= slast) {1427u8s[0] = '\0';14281429return (0);1430}14311432/*1433* As the first thing, let's collect a character and do case1434* conversion if necessary.1435*/14361437sz = u8_number_of_bytes[*s];14381439if (sz < 0) {1440*errnum = EILSEQ;14411442u8s[0] = *s++;1443u8s[1] = '\0';14441445*source = s;14461447return (1);1448}14491450if (sz == 1) {1451if (is_it_toupper)1452u8s[0] = U8_ASCII_TOUPPER(*s);1453else if (is_it_tolower)1454u8s[0] = U8_ASCII_TOLOWER(*s);1455else1456u8s[0] = *s;1457s++;1458u8s[1] = '\0';1459} else if ((s + sz) > slast) {1460*errnum = EINVAL;14611462for (i = 0; s < slast; )1463u8s[i++] = *s++;1464u8s[i] = '\0';14651466*source = s;14671468return (i);1469} else {1470if (is_it_toupper || is_it_tolower) {1471i = do_case_conv(uv, u8s, s, sz, is_it_toupper);1472s += sz;1473sz = i;1474} else 
{1475for (i = 0; i < sz; )1476u8s[i++] = *s++;1477u8s[i] = '\0';1478}1479}14801481/*1482* And then canonical/compatibility decomposition followed by1483* an optional canonical composition. Please be noted that1484* canonical composition is done only when a decomposition is1485* done.1486*/1487if (canonical_decomposition || compatibility_decomposition) {1488if (sz == 1) {1489*state = U8_STATE_START;14901491saved_sz = 1;14921493comb_class[0] = 0;1494start[0] = 0;1495disp[0] = 1;14961497last = 1;1498} else {1499saved_sz = do_decomp(uv, u8s, u8s, sz,1500canonical_decomposition, state);15011502last = 0;15031504for (i = 0; i < saved_sz; ) {1505sz = u8_number_of_bytes[u8s[i]];15061507comb_class[last] = combining_class(uv,1508u8s + i, sz);1509start[last] = i;1510disp[last] = sz;15111512last++;1513i += sz;1514}15151516/*1517* Decomposition yields various Hangul related1518* states but not on combining marks. We need to1519* find out at here by checking on the last1520* character.1521*/1522if (*state == U8_STATE_START) {1523if (comb_class[last - 1])1524*state = U8_STATE_COMBINING_MARK;1525}1526}15271528saved_last = last;15291530while (s < slast) {1531sz = u8_number_of_bytes[*s];15321533/*1534* If this is an illegal character, an incomplete1535* character, or an 7-bit ASCII Starter character,1536* then we have collected a sequence; break and let1537* the next call deal with the two cases.1538*1539* Note that this is okay only if you are using this1540* function with a fixed length string, not on1541* a buffer with multiple calls of one chunk at a time.1542*/1543if (sz <= 1) {1544break;1545} else if ((s + sz) > slast) {1546break;1547} else {1548/*1549* If the previous character was a Hangul Jamo1550* and this character is a Hangul Jamo that1551* can be conjoined, we collect the Jamo.1552*/1553if (*s == U8_HANGUL_JAMO_1ST_BYTE) {1554U8_PUT_3BYTES_INTO_UTF32(u1,1555*s, *(s + 1), *(s + 2));15561557if (U8_HANGUL_COMPOSABLE_L_V(*state,1558u1)) {1559i = 0;1560*state = 
U8_STATE_HANGUL_LV;1561goto COLLECT_A_HANGUL;1562}15631564if (U8_HANGUL_COMPOSABLE_LV_T(*state,1565u1)) {1566i = 0;1567*state = U8_STATE_HANGUL_LVT;1568goto COLLECT_A_HANGUL;1569}1570}15711572/*1573* Regardless of whatever it was, if this is1574* a Starter, we don't collect the character1575* since that's a new start and we will deal1576* with it at the next time.1577*/1578i = combining_class(uv, s, sz);1579if (i == U8_COMBINING_CLASS_STARTER)1580break;15811582/*1583* We know the current character is a combining1584* mark. If the previous character wasn't1585* a Starter (not Hangul) or a combining mark,1586* then, we don't collect this combining mark.1587*/1588if (*state != U8_STATE_START &&1589*state != U8_STATE_COMBINING_MARK)1590break;15911592*state = U8_STATE_COMBINING_MARK;1593COLLECT_A_HANGUL:1594/*1595* If we collected a Starter and combining1596* marks up to 30, i.e., total 31 characters,1597* then, we terminate this degenerately long1598* combining sequence with a U+034F COMBINING1599* GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in1600* UTF-8 and turn this into a Stream-Safe1601* Text. 
This will be extremely rare but1602* possible.1603*1604* The following will also guarantee that1605* we are not writing more than 32 characters1606* plus a NULL at u8s[].1607*/1608if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {1609TURN_STREAM_SAFE:1610*state = U8_STATE_START;1611comb_class[last] = 0;1612start[last] = saved_sz;1613disp[last] = 2;1614last++;16151616u8s[saved_sz++] = 0xCD;1617u8s[saved_sz++] = 0x8F;16181619break;1620}16211622/*1623* Some combining marks also do decompose into1624* another combining mark or marks.1625*/1626if (*state == U8_STATE_COMBINING_MARK) {1627k = last;1628l = sz;1629i = do_decomp(uv, uts, s, sz,1630canonical_decomposition, state);1631for (j = 0; j < i; ) {1632sz = u8_number_of_bytes[uts[j]];16331634comb_class[last] =1635combining_class(uv,1636uts + j, sz);1637start[last] = saved_sz + j;1638disp[last] = sz;16391640last++;1641if (last >=1642U8_UPPER_LIMIT_IN_A_SEQ) {1643last = k;1644goto TURN_STREAM_SAFE;1645}1646j += sz;1647}16481649*state = U8_STATE_COMBINING_MARK;1650sz = i;1651s += l;16521653for (i = 0; i < sz; i++)1654u8s[saved_sz++] = uts[i];1655} else {1656comb_class[last] = i;1657start[last] = saved_sz;1658disp[last] = sz;1659last++;16601661for (i = 0; i < sz; i++)1662u8s[saved_sz++] = *s++;1663}16641665/*1666* If this is U+0345 COMBINING GREEK1667* YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,1668* iota subscript, and need to be converted to1669* uppercase letter, convert it to U+0399 GREEK1670* CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),1671* i.e., convert to capital adscript form as1672* specified in the Unicode standard.1673*1674* This is the only special case of (ambiguous)1675* case conversion at combining marks and1676* probably the standard will never have1677* anything similar like this in future.1678*/1679if (is_it_toupper && sz >= 2 &&1680u8s[saved_sz - 2] == 0xCD &&1681u8s[saved_sz - 1] == 0x85) {1682u8s[saved_sz - 2] = 0xCE;1683u8s[saved_sz - 1] = 0x99;1684}1685}1686}16871688/*1689* Let's try to ensure a canonical ordering 
for the collected1690* combining marks. We do this only if we have collected1691* at least one more non-Starter. (The decomposition mapping1692* data tables have fully (and recursively) expanded and1693* canonically ordered decompositions.)1694*1695* The U8_SWAP_COMB_MARKS() convenience macro has some1696* assumptions and we are meeting the assumptions.1697*/1698last--;1699if (last >= saved_last) {1700for (i = 0; i < last; i++)1701for (j = last; j > i; j--)1702if (comb_class[j] &&1703comb_class[j - 1] > comb_class[j]) {1704U8_SWAP_COMB_MARKS(j - 1, j);1705}1706}17071708*source = s;17091710if (! canonical_composition) {1711u8s[saved_sz] = '\0';1712return (saved_sz);1713}17141715/*1716* Now do the canonical composition. Note that we do this1717* only after a canonical or compatibility decomposition to1718* finish up NFC or NFKC.1719*/1720sz = do_composition(uv, u8s, comb_class, start, disp, last,1721&s, slast);1722}17231724*source = s;17251726return ((size_t)sz);1727}17281729/*1730* The do_norm_compare() function does string comparison based on Unicode1731* simple case mappings and Unicode Normalization definitions.1732*1733* It does so by collecting a sequence of character at a time and comparing1734* the collected sequences from the strings.1735*1736* The meanings on the return values are the same as the usual strcmp().1737*/1738static int1739do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,1740int flag, int *errnum)1741{1742int result;1743size_t sz1;1744size_t sz2;1745uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];1746uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];1747uchar_t *s1last;1748uchar_t *s2last;1749boolean_t is_it_toupper;1750boolean_t is_it_tolower;1751boolean_t canonical_decomposition;1752boolean_t compatibility_decomposition;1753boolean_t canonical_composition;1754u8_normalization_states_t state;17551756s1last = s1 + n1;1757s2last = s2 + n2;17581759is_it_toupper = flag & U8_TEXTPREP_TOUPPER;1760#ifdef U8_STRCMP_CI_LOWER1761is_it_tolower 
= flag & U8_TEXTPREP_TOLOWER;1762#else1763is_it_tolower = 0;1764#endif1765canonical_decomposition = flag & U8_CANON_DECOMP;1766compatibility_decomposition = flag & U8_COMPAT_DECOMP;1767canonical_composition = flag & U8_CANON_COMP;17681769while (s1 < s1last && s2 < s2last) {1770/*1771* If the current character is a 7-bit ASCII and the last1772* character, or, if the current character and the next1773* character are both some 7-bit ASCII characters then1774* we treat the current character as a sequence.1775*1776* In any other cases, we need to call collect_a_seq().1777*/17781779if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||1780((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {1781if (is_it_toupper)1782u8s1[0] = U8_ASCII_TOUPPER(*s1);1783else if (is_it_tolower)1784u8s1[0] = U8_ASCII_TOLOWER(*s1);1785else1786u8s1[0] = *s1;1787u8s1[1] = '\0';1788sz1 = 1;1789s1++;1790} else {1791state = U8_STATE_START;1792sz1 = collect_a_seq(uv, u8s1, &s1, s1last,1793is_it_toupper, is_it_tolower,1794canonical_decomposition,1795compatibility_decomposition,1796canonical_composition, errnum, &state);1797}17981799if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||1800((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {1801if (is_it_toupper)1802u8s2[0] = U8_ASCII_TOUPPER(*s2);1803else if (is_it_tolower)1804u8s2[0] = U8_ASCII_TOLOWER(*s2);1805else1806u8s2[0] = *s2;1807u8s2[1] = '\0';1808sz2 = 1;1809s2++;1810} else {1811state = U8_STATE_START;1812sz2 = collect_a_seq(uv, u8s2, &s2, s2last,1813is_it_toupper, is_it_tolower,1814canonical_decomposition,1815compatibility_decomposition,1816canonical_composition, errnum, &state);1817}18181819/*1820* Now compare the two characters. 
If they are the same,1821* we move on to the next character sequences.1822*/1823if (sz1 == 1 && sz2 == 1) {1824if (*u8s1 > *u8s2)1825return (1);1826if (*u8s1 < *u8s2)1827return (-1);1828} else {1829result = strcmp((const char *)u8s1, (const char *)u8s2);1830if (result != 0)1831return (result);1832}1833}18341835/*1836* We compared until the end of either or both strings.1837*1838* If we reached to or went over the ends for the both, that means1839* they are the same.1840*1841* If we reached only one end, that means the other string has1842* something which then can be used to determine the return value.1843*/1844if (s1 >= s1last) {1845if (s2 >= s2last)1846return (0);1847return (-1);1848}1849return (1);1850}18511852/*1853* The u8_strcmp() function compares two UTF-8 strings quite similar to1854* the strcmp(). For the comparison, however, Unicode Normalization specific1855* equivalency and Unicode simple case conversion mappings based equivalency1856* can be requested and checked against.1857*/1858int1859u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,1860int *errnum)1861{1862int f;1863size_t n1;1864size_t n2;18651866*errnum = 0;18671868/*1869* Check on the requested Unicode version, case conversion, and1870* normalization flag values.1871*/18721873if (uv > U8_UNICODE_LATEST) {1874*errnum = ERANGE;1875uv = U8_UNICODE_LATEST;1876}18771878if (flag == 0) {1879flag = U8_STRCMP_CS;1880} else {1881#ifdef U8_STRCMP_CI_LOWER1882f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER1883| U8_STRCMP_CI_LOWER);1884#else1885f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER);1886#endif1887if (f == 0) {1888flag |= U8_STRCMP_CS;1889}1890#ifdef U8_STRCMP_CI_LOWER1891else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&1892f != U8_STRCMP_CI_LOWER)1893#else1894else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER)1895#endif1896{1897*errnum = EBADF;1898flag = U8_STRCMP_CS;1899}19001901f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);1902if (f && f != 
U8_STRCMP_NFD && f != U8_STRCMP_NFC &&1903f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {1904*errnum = EBADF;1905flag = U8_STRCMP_CS;1906}1907}19081909if (flag == U8_STRCMP_CS) {1910return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));1911}19121913n1 = strlen(s1);1914n2 = strlen(s2);1915if (n != 0) {1916if (n < n1)1917n1 = n;1918if (n < n2)1919n2 = n;1920}19211922/*1923* Simple case conversion can be done much faster and so we do1924* them separately here.1925*/1926if (flag == U8_STRCMP_CI_UPPER) {1927return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,1928n1, n2, B_TRUE, errnum));1929}1930#ifdef U8_STRCMP_CI_LOWER1931else if (flag == U8_STRCMP_CI_LOWER) {1932return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,1933n1, n2, B_FALSE, errnum));1934}1935#endif19361937return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,1938flag, errnum));1939}19401941size_t1942u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,1943int flag, size_t unicode_version, int *errnum)1944{1945int f;1946int sz;1947uchar_t *ib;1948uchar_t *ibtail;1949uchar_t *ob;1950uchar_t *obtail;1951boolean_t do_not_ignore_null;1952boolean_t do_not_ignore_invalid;1953boolean_t is_it_toupper;1954boolean_t is_it_tolower;1955boolean_t canonical_decomposition;1956boolean_t compatibility_decomposition;1957boolean_t canonical_composition;1958size_t ret_val;1959size_t i;1960size_t j;1961uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];1962u8_normalization_states_t state;19631964if (unicode_version > U8_UNICODE_LATEST) {1965*errnum = ERANGE;1966return ((size_t)-1);1967}19681969#ifdef U8_TEXTPREP_TOLOWER1970f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);1971if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {1972*errnum = EBADF;1973return ((size_t)-1);1974}1975#endif19761977f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);1978if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&1979f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {1980*errnum = 
EBADF;1981return ((size_t)-1);1982}19831984if (inarray == NULL || *inlen == 0)1985return (0);19861987if (outarray == NULL) {1988*errnum = E2BIG;1989return ((size_t)-1);1990}19911992ib = (uchar_t *)inarray;1993ob = (uchar_t *)outarray;1994ibtail = ib + *inlen;1995obtail = ob + *outlen;19961997do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);1998do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);1999is_it_toupper = flag & U8_TEXTPREP_TOUPPER;2000#ifdef U8_TEXTPREP_TOLOWER2001is_it_tolower = flag & U8_TEXTPREP_TOLOWER;2002#else2003is_it_tolower = 0;2004#endif20052006ret_val = 0;20072008/*2009* If we don't have a normalization flag set, we do the simple case2010* conversion based text preparation separately below. Text2011* preparation involving Normalization will be done in the false task2012* block, again, separately since it will take much more time and2013* resource than doing simple case conversions.2014*/2015if (f == 0) {2016while (ib < ibtail) {2017if (*ib == '\0' && do_not_ignore_null)2018break;20192020sz = u8_number_of_bytes[*ib];20212022if (sz < 0) {2023if (do_not_ignore_invalid) {2024*errnum = EILSEQ;2025ret_val = (size_t)-1;2026break;2027}20282029sz = 1;2030ret_val++;2031}20322033if (sz == 1) {2034if (ob >= obtail) {2035*errnum = E2BIG;2036ret_val = (size_t)-1;2037break;2038}20392040if (is_it_toupper)2041*ob = U8_ASCII_TOUPPER(*ib);2042else if (is_it_tolower)2043*ob = U8_ASCII_TOLOWER(*ib);2044else2045*ob = *ib;2046ib++;2047ob++;2048} else if ((ib + sz) > ibtail) {2049if (do_not_ignore_invalid) {2050*errnum = EINVAL;2051ret_val = (size_t)-1;2052break;2053}20542055if ((obtail - ob) < (ibtail - ib)) {2056*errnum = E2BIG;2057ret_val = (size_t)-1;2058break;2059}20602061/*2062* We treat the remaining incomplete character2063* bytes as a character.2064*/2065ret_val++;20662067while (ib < ibtail)2068*ob++ = *ib++;2069} else {2070if (is_it_toupper || is_it_tolower) {2071i = do_case_conv(unicode_version, u8s,2072ib, sz, is_it_toupper);20732074if 
((obtail - ob) < i) {2075*errnum = E2BIG;2076ret_val = (size_t)-1;2077break;2078}20792080ib += sz;20812082for (sz = 0; sz < i; sz++)2083*ob++ = u8s[sz];2084} else {2085if ((obtail - ob) < sz) {2086*errnum = E2BIG;2087ret_val = (size_t)-1;2088break;2089}20902091for (i = 0; i < sz; i++)2092*ob++ = *ib++;2093}2094}2095}2096} else {2097canonical_decomposition = flag & U8_CANON_DECOMP;2098compatibility_decomposition = flag & U8_COMPAT_DECOMP;2099canonical_composition = flag & U8_CANON_COMP;21002101while (ib < ibtail) {2102if (*ib == '\0' && do_not_ignore_null)2103break;21042105/*2106* If the current character is a 7-bit ASCII2107* character and it is the last character, or,2108* if the current character is a 7-bit ASCII2109* character and the next character is also a 7-bit2110* ASCII character, then, we copy over this2111* character without going through collect_a_seq().2112*2113* In any other cases, we need to look further with2114* the collect_a_seq() function.2115*/2116if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||2117((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {2118if (ob >= obtail) {2119*errnum = E2BIG;2120ret_val = (size_t)-1;2121break;2122}21232124if (is_it_toupper)2125*ob = U8_ASCII_TOUPPER(*ib);2126else if (is_it_tolower)2127*ob = U8_ASCII_TOLOWER(*ib);2128else2129*ob = *ib;2130ib++;2131ob++;2132} else {2133*errnum = 0;2134state = U8_STATE_START;21352136j = collect_a_seq(unicode_version, u8s,2137&ib, ibtail,2138is_it_toupper,2139is_it_tolower,2140canonical_decomposition,2141compatibility_decomposition,2142canonical_composition,2143errnum, &state);21442145if (*errnum && do_not_ignore_invalid) {2146ret_val = (size_t)-1;2147break;2148}21492150if ((obtail - ob) < j) {2151*errnum = E2BIG;2152ret_val = (size_t)-1;2153break;2154}21552156for (i = 0; i < j; i++)2157*ob++ = u8s[i];2158}2159}2160}21612162*inlen = ibtail - ib;2163*outlen = obtail - ob;21642165return 
(ret_val);2166}21672168EXPORT_SYMBOL(u8_validate);2169EXPORT_SYMBOL(u8_strcmp);2170EXPORT_SYMBOL(u8_textprep_str);217121722173