Path: blob/main/crypto/krb5/src/util/support/utf8_conv.c
34889 views
/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */1/* util/support/utf8_conv.c */2/*3* Copyright 2008, 2017 by the Massachusetts Institute of Technology.4* All Rights Reserved.5*6* Export of this software from the United States of America may7* require a specific license from the United States Government.8* It is the responsibility of any person or organization contemplating9* export to obtain such a license before exporting.10*11* WITHIN THAT CONSTRAINT, permission to use, copy, modify, and12* distribute this software and its documentation for any purpose and13* without fee is hereby granted, provided that the above copyright14* notice appear in all copies and that both that copyright notice and15* this permission notice appear in supporting documentation, and that16* the name of M.I.T. not be used in advertising or publicity pertaining17* to distribution of the software without specific, written prior18* permission. Furthermore if you modify this software you must label19* your software as modified software and not distribute it in such a20* fashion that it might be confused with the original M.I.T. software.21* M.I.T. makes no representations about the suitability of22* this software for any purpose. It is provided "as is" without express23* or implied warranty.24*/25/*26* Copyright 1998-2008 The OpenLDAP Foundation.27* All rights reserved.28*29* Redistribution and use in source and binary forms, with or without30* modification, are permitted only as authorized by the OpenLDAP31* Public License.32*33* A copy of this license is available in the file LICENSE in the34* top-level directory of the distribution or, alternatively, at35* <https://www.OpenLDAP.org/license.html>.36*/37/* Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.38*39* THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND40* TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT41* TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS42* AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"43* IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION44* OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP45* PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT46* THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.47*/4849/* This work is based on OpenLDAP Software <https://www.openldap.org/>. */5051/*52* These routines convert between UTF-16 and UTF-8. UTF-16 encodes a Unicode53* character in either two or four bytes. Characters in the Basic Multilingual54* Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.55* Characters in the Supplementary Planes (10000..10FFFF) are split into a high56* surrogate and a low surrogate, each containing ten bits of the character57* value, and encoded in four bytes.58*/5960#include "k5-platform.h"61#include "k5-utf8.h"62#include "k5-buf.h"63#include "k5-input.h"64#include "supp-int.h"6566static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };6768/* A high surrogate is ten bits masked with 0xD800. */69#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)7071/* A low surrogate is ten bits masked with 0xDC00. */72#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)7374/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate75* value. */76#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)77#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))7879/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a80* surrogate value. */81#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))8283/* Characters in the Supplementary Planes have a base value subtracted from84* their code points to form a 20-bit value; ten bits go in each surrogate. */85#define BASE 0x1000086#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))87#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))88#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))8990int91k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)92{93struct k5buf buf;94krb5_ucs4 ch;95size_t chlen, i;9697*utf16_out = NULL;98*nbytes_out = 0;99100/* UTF-16 conversion is used for RC4 string-to-key, so treat this data as101* sensitive. */102k5_buf_init_dynamic_zap(&buf);103104/* Examine next UTF-8 character. */105while (*utf8 != '\0') {106/* Get UTF-8 sequence length from first byte. */107chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);108if (chlen == 0)109goto invalid;110111/* First byte minus length tag */112ch = (krb5_ucs4)(utf8[0] & mask[chlen]);113114for (i = 1; i < chlen; i++) {115/* Subsequent bytes must start with 10. */116if ((utf8[i] & 0xc0) != 0x80)117goto invalid;118119/* 6 bits of data in each subsequent byte */120ch <<= 6;121ch |= (krb5_ucs4)(utf8[i] & 0x3f);122}123if (!IS_VALID_UNICODE(ch))124goto invalid;125126/* Characters in the basic multilingual plane are encoded using two127* bytes; other characters are encoded using four bytes. */128if (IS_BMP(ch)) {129k5_buf_add_uint16_le(&buf, ch);130} else {131/* 0x10000 is subtracted from ch; then the high ten bits plus132* 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */133k5_buf_add_uint16_le(&buf, HIGH_SURROGATE(ch));134k5_buf_add_uint16_le(&buf, LOW_SURROGATE(ch));135}136137/* Move to next UTF-8 character. */138utf8 += chlen;139}140141*utf16_out = buf.data;142*nbytes_out = buf.len;143return 0;144145invalid:146k5_buf_free(&buf);147return EINVAL;148}149150int151k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)152{153struct k5buf buf;154struct k5input in;155uint16_t ch1, ch2;156krb5_ucs4 ch;157size_t chlen;158void *p;159160*utf8_out = NULL;161162if (nbytes % 2 != 0)163return EINVAL;164165k5_buf_init_dynamic(&buf);166k5_input_init(&in, utf16bytes, nbytes);167while (!in.status && in.len > 0) {168/* Get the next character or high surrogate. A low surrogate without a169* preceding high surrogate is invalid. */170ch1 = k5_input_get_uint16_le(&in);171if (IS_LOW_SURROGATE(ch1))172goto invalid;173if (IS_HIGH_SURROGATE(ch1)) {174/* Get the low surrogate and combine the pair. */175ch2 = k5_input_get_uint16_le(&in);176if (!IS_LOW_SURROGATE(ch2))177goto invalid;178ch = COMPOSE(ch1, ch2);179} else {180ch = ch1;181}182183chlen = krb5int_ucs4_to_utf8(ch, NULL);184p = k5_buf_get_space(&buf, chlen);185if (p == NULL)186return ENOMEM;187(void)krb5int_ucs4_to_utf8(ch, p);188}189190if (in.status)191goto invalid;192193*utf8_out = k5_buf_cstring(&buf);194return (*utf8_out == NULL) ? ENOMEM : 0;195196invalid:197k5_buf_free(&buf);198return EINVAL;199}200201202