Path: blob/main/crypto/heimdal/lib/wind/normalize.c
34889 views
/*1* Copyright (c) 2004 Kungliga Tekniska Högskolan2* (Royal Institute of Technology, Stockholm, Sweden).3* All rights reserved.4*5* Redistribution and use in source and binary forms, with or without6* modification, are permitted provided that the following conditions7* are met:8*9* 1. Redistributions of source code must retain the above copyright10* notice, this list of conditions and the following disclaimer.11*12* 2. Redistributions in binary form must reproduce the above copyright13* notice, this list of conditions and the following disclaimer in the14* documentation and/or other materials provided with the distribution.15*16* 3. Neither the name of the Institute nor the names of its contributors17* may be used to endorse or promote products derived from this software18* without specific prior written permission.19*20* THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND21* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE22* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE23* ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE24* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL25* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS26* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)27* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT28* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY29* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF30* SUCH DAMAGE.31*/3233#ifdef HAVE_CONFIG_H34#include <config.h>35#endif36#include "windlocl.h"3738#include <assert.h>39#include <stdlib.h>40#include <errno.h>41#include <stdio.h>4243#include "roken.h"4445#include "normalize_table.h"4647static int48translation_cmp(const void *key, const void *data)49{50const struct translation *t1 = (const struct translation *)key;51const struct translation *t2 = (const struct translation *)data;5253return t1->key - t2->key;54}5556enum { s_base = 0xAC00};57enum { s_count = 11172};58enum { l_base = 0x1100};59enum { l_count = 19};60enum { v_base = 0x1161};61enum { v_count = 21};62enum { t_base = 0x11A7};63enum { t_count = 28};64enum { n_count = v_count * t_count};6566static int67hangul_decomp(const uint32_t *in, size_t in_len,68uint32_t *out, size_t *out_len)69{70uint32_t u = *in;71unsigned s_index;72unsigned l, v, t;73unsigned o;7475if (u < s_base || u >= s_base + s_count)76return 0;77s_index = u - s_base;78l = l_base + s_index / n_count;79v = v_base + (s_index % n_count) / t_count;80t = t_base + s_index % t_count;81o = 2;82if (t != t_base)83++o;84if (*out_len < o)85return WIND_ERR_OVERRUN;86out[0] = l;87out[1] = v;88if (t != t_base)89out[2] = t;90*out_len = o;91return 1;92}9394static uint32_t95hangul_composition(const uint32_t *in, size_t in_len)96{97if (in_len < 2)98return 0;99if (in[0] >= l_base && in[0] < l_base + l_count) {100unsigned l_index = in[0] - l_base;101unsigned v_index;102103if (in[1] < v_base || in[1] >= v_base + v_count)104return 0;105v_index = in[1] - v_base;106return (l_index * v_count + v_index) * t_count + s_base;107} else if (in[0] >= s_base && in[0] < s_base + s_count) {108unsigned s_index = in[0] - s_base;109unsigned t_index;110111if (s_index % t_count != 0)112return 0;113if (in[1] < t_base || in[1] >= t_base + t_count)114return 0;115t_index = in[1] - t_base;116return in[0] + t_index;117}118return 0;119}120121static int122compat_decomp(const uint32_t *in, size_t in_len,123uint32_t *out, size_t *out_len)124{125unsigned i;126unsigned o = 0;127128for (i = 0; i < in_len; ++i) {129struct translation ts = {in[i]};130size_t sub_len = *out_len - o;131int ret;132133ret = hangul_decomp(in + i, in_len - i,134out + o, &sub_len);135if (ret) {136if (ret == WIND_ERR_OVERRUN)137return ret;138o += sub_len;139} else {140void *s = bsearch(&ts,141_wind_normalize_table,142_wind_normalize_table_size,143sizeof(_wind_normalize_table[0]),144translation_cmp);145if (s != NULL) {146const struct translation *t = (const struct translation *)s;147148ret = compat_decomp(_wind_normalize_val_table + t->val_offset,149t->val_len,150out + o, &sub_len);151if (ret)152return ret;153o += sub_len;154} else {155if (o >= *out_len)156return WIND_ERR_OVERRUN;157out[o++] = in[i];158159}160}161}162*out_len = o;163return 0;164}165166static void167swap_char(uint32_t * a, uint32_t * b)168{169uint32_t t;170t = *a;171*a = *b;172*b = t;173}174175/* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points176* that all have Canonical_Combining_Class > 0 */177static void178canonical_reorder_sequence(uint32_t * a, size_t len)179{180size_t i, j;181182if (len <= 1)183return;184185for (i = 1; i < len; i++) {186for (j = i;187j > 0 &&188_wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);189j--)190swap_char(&a[j], &a[j-1]);191}192}193194static void195canonical_reorder(uint32_t *tmp, size_t tmp_len)196{197size_t i;198199for (i = 0; i < tmp_len; ++i) {200int cc = _wind_combining_class(tmp[i]);201if (cc) {202size_t j;203for (j = i + 1;204j < tmp_len && _wind_combining_class(tmp[j]);205++j)206;207canonical_reorder_sequence(&tmp[i], j - i);208i = j;209}210}211}212213static uint32_t214find_composition(const uint32_t *in, unsigned in_len)215{216unsigned short canon_index = 0;217uint32_t cur;218unsigned n = 0;219220cur = hangul_composition(in, in_len);221if (cur)222return cur;223224do {225const struct canon_node *c = &_wind_canon_table[canon_index];226unsigned i;227228if (n % 5 == 0) {229if (in_len-- == 0)230return c->val;231cur = *in++;232}233234i = cur >> 16;235if (i < c->next_start || i >= c->next_end)236canon_index = 0;237else238canon_index =239_wind_canon_next_table[c->next_offset + i - c->next_start];240if (canon_index != 0) {241cur = (cur << 4) & 0xFFFFF;242++n;243}244} while (canon_index != 0);245return 0;246}247248static int249combine(const uint32_t *in, size_t in_len,250uint32_t *out, size_t *out_len)251{252unsigned i;253int ostarter;254unsigned o = 0;255int old_cc;256257for (i = 0; i < in_len;) {258while (i < in_len && _wind_combining_class(in[i]) != 0) {259out[o++] = in[i++];260}261if (i < in_len) {262if (o >= *out_len)263return WIND_ERR_OVERRUN;264ostarter = o;265out[o++] = in[i++];266old_cc = -1;267268while (i < in_len) {269uint32_t comb;270uint32_t v[2];271int cc;272273v[0] = out[ostarter];274v[1] = in[i];275276cc = _wind_combining_class(in[i]);277if (old_cc != cc && (comb = find_composition(v, 2))) {278out[ostarter] = comb;279} else if (cc == 0) {280break;281} else {282if (o >= *out_len)283return WIND_ERR_OVERRUN;284out[o++] = in[i];285old_cc = cc;286}287++i;288}289}290}291*out_len = o;292return 0;293}294295int296_wind_stringprep_normalize(const uint32_t *in, size_t in_len,297uint32_t *out, size_t *out_len)298{299size_t tmp_len;300uint32_t *tmp;301int ret;302303if (in_len == 0) {304*out_len = 0;305return 0;306}307308tmp_len = in_len * 4;309if (tmp_len < MAX_LENGTH_CANON)310tmp_len = MAX_LENGTH_CANON;311tmp = malloc(tmp_len * sizeof(uint32_t));312if (tmp == NULL)313return ENOMEM;314315ret = compat_decomp(in, in_len, tmp, &tmp_len);316if (ret) {317free(tmp);318return ret;319}320canonical_reorder(tmp, tmp_len);321ret = combine(tmp, tmp_len, out, out_len);322free(tmp);323return ret;324}325326327