Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/crypto/krb5/src/util/support/utf8_conv.c
34889 views
1
/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
/* util/support/utf8_conv.c */
3
/*
4
* Copyright 2008, 2017 by the Massachusetts Institute of Technology.
5
* All Rights Reserved.
6
*
7
* Export of this software from the United States of America may
8
* require a specific license from the United States Government.
9
* It is the responsibility of any person or organization contemplating
10
* export to obtain such a license before exporting.
11
*
12
* WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13
* distribute this software and its documentation for any purpose and
14
* without fee is hereby granted, provided that the above copyright
15
* notice appear in all copies and that both that copyright notice and
16
* this permission notice appear in supporting documentation, and that
17
* the name of M.I.T. not be used in advertising or publicity pertaining
18
* to distribution of the software without specific, written prior
19
* permission. Furthermore if you modify this software you must label
20
* your software as modified software and not distribute it in such a
21
* fashion that it might be confused with the original M.I.T. software.
22
* M.I.T. makes no representations about the suitability of
23
* this software for any purpose. It is provided "as is" without express
24
* or implied warranty.
25
*/
26
/*
27
* Copyright 1998-2008 The OpenLDAP Foundation.
28
* All rights reserved.
29
*
30
* Redistribution and use in source and binary forms, with or without
31
* modification, are permitted only as authorized by the OpenLDAP
32
* Public License.
33
*
34
* A copy of this license is available in the file LICENSE in the
35
* top-level directory of the distribution or, alternatively, at
36
* <https://www.OpenLDAP.org/license.html>.
37
*/
38
/* Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
39
*
40
* THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
41
* TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
42
* TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
43
* AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
44
* IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
45
* OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
46
* PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
47
* THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
48
*/
49
50
/* This work is based on OpenLDAP Software <https://www.openldap.org/>. */
51
52
/*
53
* These routines convert between UTF-16 and UTF-8. UTF-16 encodes a Unicode
54
* character in either two or four bytes. Characters in the Basic Multilingual
55
* Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
56
* Characters in the Supplementary Planes (10000..10FFFF) are split into a high
57
* surrogate and a low surrogate, each containing ten bits of the character
58
* value, and encoded in four bytes.
59
*/
60
61
#include "k5-platform.h"
62
#include "k5-utf8.h"
63
#include "k5-buf.h"
64
#include "k5-input.h"
65
#include "supp-int.h"
66
67
/* Bit masks that strip the length tag from the first byte of a UTF-8
 * sequence, indexed by the sequence length in bytes.  Index 0 is a
 * placeholder; a length of 0 is rejected before the table is consulted.  The
 * table is read-only, so it is declared const. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
68
69
/* High surrogates occupy D800..DBFF: the prefix 0xD800 combined with ten bits
 * of payload. */
#define IS_HIGH_SURROGATE(c) (0xD800 <= (c) && (c) <= 0xDBFF)

/* Low surrogates occupy DC00..DFFF: the prefix 0xDC00 combined with ten bits
 * of payload. */
#define IS_LOW_SURROGATE(c) (0xDC00 <= (c) && (c) <= 0xDFFF)

/* The surrogate range as a whole is D800..DFFF.  A code point is valid
 * Unicode when it does not exceed 10FFFF and is not a surrogate. */
#define IS_SURROGATE(c) (0xD800 <= (c) && (c) <= 0xDFFF)
#define IS_VALID_UNICODE(c) (!((c) > 0x10FFFF || IS_SURROGATE(c)))

/* A Basic Multilingual Plane character does not exceed FFFF and is not a
 * surrogate. */
#define IS_BMP(c) (!((c) > 0xFFFF || IS_SURROGATE(c)))

/* Supplementary-plane code points are reduced by BASE to a 20-bit value.  The
 * upper ten bits are carried by the high surrogate and the lower ten bits by
 * the low surrogate; COMPOSE reverses the split. */
#define BASE 0x10000
#define HIGH_SURROGATE(c) ((((c) - BASE) >> 10) | 0xD800)
#define LOW_SURROGATE(c) ((((c) - BASE) & 0x3FF) | 0xDC00)
#define COMPOSE(c1, c2) (((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)) + BASE)
90
91
int
92
k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
93
{
94
struct k5buf buf;
95
krb5_ucs4 ch;
96
size_t chlen, i;
97
98
*utf16_out = NULL;
99
*nbytes_out = 0;
100
101
/* UTF-16 conversion is used for RC4 string-to-key, so treat this data as
102
* sensitive. */
103
k5_buf_init_dynamic_zap(&buf);
104
105
/* Examine next UTF-8 character. */
106
while (*utf8 != '\0') {
107
/* Get UTF-8 sequence length from first byte. */
108
chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
109
if (chlen == 0)
110
goto invalid;
111
112
/* First byte minus length tag */
113
ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
114
115
for (i = 1; i < chlen; i++) {
116
/* Subsequent bytes must start with 10. */
117
if ((utf8[i] & 0xc0) != 0x80)
118
goto invalid;
119
120
/* 6 bits of data in each subsequent byte */
121
ch <<= 6;
122
ch |= (krb5_ucs4)(utf8[i] & 0x3f);
123
}
124
if (!IS_VALID_UNICODE(ch))
125
goto invalid;
126
127
/* Characters in the basic multilingual plane are encoded using two
128
* bytes; other characters are encoded using four bytes. */
129
if (IS_BMP(ch)) {
130
k5_buf_add_uint16_le(&buf, ch);
131
} else {
132
/* 0x10000 is subtracted from ch; then the high ten bits plus
133
* 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
134
k5_buf_add_uint16_le(&buf, HIGH_SURROGATE(ch));
135
k5_buf_add_uint16_le(&buf, LOW_SURROGATE(ch));
136
}
137
138
/* Move to next UTF-8 character. */
139
utf8 += chlen;
140
}
141
142
*utf16_out = buf.data;
143
*nbytes_out = buf.len;
144
return 0;
145
146
invalid:
147
k5_buf_free(&buf);
148
return EINVAL;
149
}
150
151
int
152
k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
153
{
154
struct k5buf buf;
155
struct k5input in;
156
uint16_t ch1, ch2;
157
krb5_ucs4 ch;
158
size_t chlen;
159
void *p;
160
161
*utf8_out = NULL;
162
163
if (nbytes % 2 != 0)
164
return EINVAL;
165
166
k5_buf_init_dynamic(&buf);
167
k5_input_init(&in, utf16bytes, nbytes);
168
while (!in.status && in.len > 0) {
169
/* Get the next character or high surrogate. A low surrogate without a
170
* preceding high surrogate is invalid. */
171
ch1 = k5_input_get_uint16_le(&in);
172
if (IS_LOW_SURROGATE(ch1))
173
goto invalid;
174
if (IS_HIGH_SURROGATE(ch1)) {
175
/* Get the low surrogate and combine the pair. */
176
ch2 = k5_input_get_uint16_le(&in);
177
if (!IS_LOW_SURROGATE(ch2))
178
goto invalid;
179
ch = COMPOSE(ch1, ch2);
180
} else {
181
ch = ch1;
182
}
183
184
chlen = krb5int_ucs4_to_utf8(ch, NULL);
185
p = k5_buf_get_space(&buf, chlen);
186
if (p == NULL)
187
return ENOMEM;
188
(void)krb5int_ucs4_to_utf8(ch, p);
189
}
190
191
if (in.status)
192
goto invalid;
193
194
*utf8_out = k5_buf_cstring(&buf);
195
return (*utf8_out == NULL) ? ENOMEM : 0;
196
197
invalid:
198
k5_buf_free(&buf);
199
return EINVAL;
200
}
201
202