Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
wine-mirror
GitHub Repository: wine-mirror/wine
Path: blob/master/libs/ldap/libldap/utf-8.c
4394 views
1
/* utf-8.c -- Basic UTF-8 routines */
/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
 *
 * Copyright 1998-2024 The OpenLDAP Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */
/* Basic UTF-8 routines
 *
 * These routines are "dumb".  Though they understand UTF-8,
 * they don't grok Unicode.  That is, they can push bits,
 * but don't have a clue what the bits represent.  That's
 * good enough for use with the LDAP Client SDK.
 *
 * These routines are not optimized.
 */
25
26
#include "portable.h"
27
28
#include <stdio.h>
29
30
#include <ac/stdlib.h>
31
32
#include <ac/socket.h>
33
#include <ac/string.h>
34
#include <ac/time.h>
35
36
#include "ldap_utf8.h"
37
38
#include "ldap-int.h"
39
#include "ldap_defaults.h"
40
41
/*
42
* return the number of bytes required to hold the
43
* NULL-terminated UTF-8 string NOT INCLUDING the
44
* termination.
45
*/
46
ber_len_t ldap_utf8_bytes( const char * p )
47
{
48
ber_len_t bytes;
49
50
for( bytes=0; p[bytes]; bytes++ ) {
51
/* EMPTY */ ;
52
}
53
54
return bytes;
55
}
56
57
/*
 * Return the number of characters in the NUL-terminated UTF-8
 * string p.  The string is walked one character at a time with
 * LDAP_UTF8_INCR(); invalid sequences are not detected.
 */
ber_len_t ldap_utf8_chars( const char * p )
{
	/* could be optimized and could check for invalid sequences */
	ber_len_t count = 0;

	while ( *p ) {
		count++;
		LDAP_UTF8_INCR(p);
	}

	return count;
}
68
69
/*
 * Return the offset, in bytes, from p to the next character as
 * located by LDAP_UTF8_NEXT().
 */
int ldap_utf8_offset( const char * p )
{
	const char *next = LDAP_UTF8_NEXT(p);

	return next - p;
}
74
75
/*
 * Sequence length indicated by a non-ASCII first byte.
 *
 * The table is indexed by (byte ^ 0x80), so it covers the octets
 * 0x80..0xFF.  An entry of 0 marks an octet that cannot start a
 * character: the continuation octets 0x80..0xBF, the lead octets
 * 0xC0 and 0xC1, and 0xFE/0xFF.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0-0xCF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

/*
 * Returns length indicated by first byte: 1 for an ASCII octet,
 * otherwise the ldap_utf8_lentab[] entry for that octet (0 when the
 * octet cannot start a character).  Only the first byte is examined.
 */
int ldap_utf8_charlen( const char * p )
{
	/* read the octet as unsigned so plain-char signedness is irrelevant */
	const unsigned char lead = *(const unsigned char *) p;

	if ( ( lead & 0x80 ) == 0 )
		return 1;

	return ldap_utf8_lentab[lead ^ 0x80];
}
95
96
/*
 * Make sure the UTF-8 char used the shortest possible encoding
 * returns charlen if valid, 0 if not.
 *
 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
 * The table is slightly modified from that of the RFC.
 *
 * UCS-4 range (hex)      UTF-8 sequence (binary)
 * 0000 0000-0000 007F    0.......
 * 0000 0080-0000 07FF    110++++. 10......
 * 0000 0800-0000 FFFF    1110++++ 10+..... 10......
 * 0001 0000-001F FFFF    11110+++ 10++.... 10...... 10......
 * 0020 0000-03FF FFFF    111110++ 10+++... 10...... 10...... 10......
 * 0400 0000-7FFF FFFF    1111110+ 10++++.. 10...... 10...... 10...... 10......
 *
 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 * at least one of the '+' bits must be set, otherwise the character
 * should have been encoded in fewer octets. Note that in the two-octet
 * case, only the first octet needs to be validated, and this is done
 * in the ldap_utf8_lentab[] above.
 */

/*
 * Mask of required bits in second octet, indexed by the low five
 * bits of the first octet.  At least one masked bit must be set in
 * the second octet for the encoding to be accepted.
 */
#undef c
#define c const char
c ldap_utf8_mintab[] = {
	/* 0x00-0x07 */
	(c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	/* 0x08-0x0F */
	(c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	/* 0x10-0x17 */
	(c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	/* 0x18-0x1F */
	(c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
#undef c

/*
 * Like LDAP_UTF8_CHARLEN(), but for lengths above 2 the second octet
 * is also checked against ldap_utf8_mintab[]; 0 is returned when none
 * of the required bits is present.
 */
int ldap_utf8_charlen2( const char * p )
{
	int len = LDAP_UTF8_CHARLEN( p );

	/* one- and two-octet forms need no second-octet check */
	if ( len <= 2 )
		return len;

	return ( ldap_utf8_mintab[*p & 0x1f] & p[1] ) ? len : 0;
}
138
139
/* conv UTF-8 to UCS-4, useful for comparisons */
140
ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
141
{
142
const unsigned char *c = (const unsigned char *) p;
143
ldap_ucs4_t ch;
144
int len, i;
145
static unsigned char mask[] = {
146
0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147
148
len = LDAP_UTF8_CHARLEN2(p, len);
149
150
if( len == 0 ) return LDAP_UCS4_INVALID;
151
152
ch = c[0] & mask[len];
153
154
for(i=1; i < len; i++) {
155
if ((c[i] & 0xc0) != 0x80) {
156
return LDAP_UCS4_INVALID;
157
}
158
159
ch <<= 6;
160
ch |= c[i] & 0x3f;
161
}
162
163
return ch;
164
}
165
166
/* conv UCS-4 to UTF-8, not used */
167
int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
168
{
169
int len=0;
170
unsigned char* p = (unsigned char *) buf;
171
172
/* not a valid Unicode character */
173
if ( c < 0 ) return 0;
174
175
/* Just return length, don't convert */
176
if(buf == NULL) {
177
if( c < 0x80 ) return 1;
178
else if( c < 0x800 ) return 2;
179
else if( c < 0x10000 ) return 3;
180
else if( c < 0x200000 ) return 4;
181
else if( c < 0x4000000 ) return 5;
182
else return 6;
183
}
184
185
if( c < 0x80 ) {
186
p[len++] = c;
187
188
} else if( c < 0x800 ) {
189
p[len++] = 0xc0 | ( c >> 6 );
190
p[len++] = 0x80 | ( c & 0x3f );
191
192
} else if( c < 0x10000 ) {
193
p[len++] = 0xe0 | ( c >> 12 );
194
p[len++] = 0x80 | ( (c >> 6) & 0x3f );
195
p[len++] = 0x80 | ( c & 0x3f );
196
197
} else if( c < 0x200000 ) {
198
p[len++] = 0xf0 | ( c >> 18 );
199
p[len++] = 0x80 | ( (c >> 12) & 0x3f );
200
p[len++] = 0x80 | ( (c >> 6) & 0x3f );
201
p[len++] = 0x80 | ( c & 0x3f );
202
203
} else if( c < 0x4000000 ) {
204
p[len++] = 0xf8 | ( c >> 24 );
205
p[len++] = 0x80 | ( (c >> 18) & 0x3f );
206
p[len++] = 0x80 | ( (c >> 12) & 0x3f );
207
p[len++] = 0x80 | ( (c >> 6) & 0x3f );
208
p[len++] = 0x80 | ( c & 0x3f );
209
210
} else /* if( c < 0x80000000 ) */ {
211
p[len++] = 0xfc | ( c >> 30 );
212
p[len++] = 0x80 | ( (c >> 24) & 0x3f );
213
p[len++] = 0x80 | ( (c >> 18) & 0x3f );
214
p[len++] = 0x80 | ( (c >> 12) & 0x3f );
215
p[len++] = 0x80 | ( (c >> 6) & 0x3f );
216
p[len++] = 0x80 | ( c & 0x3f );
217
}
218
219
return len;
220
}
221
222
/*
 * Number of UTF-8 octets needed to encode the UCS-4 value c:
 * 0 if c is negative, otherwise 1..6.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions.  c is evaluated several
 * times and therefore must not have side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
225
226
/* Convert a string to UTF-8 format. The input string is expected to
227
* have characters of 1, 2, or 4 octets (in network byte order)
228
* corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
229
* types respectively. (Here T61STRING just means that there is one
230
* octet per character and characters may use the high bit of the octet.
231
* The characters are assumed to use ISO mappings, no provision is made
232
* for converting from T.61 coding rules to Unicode.)
233
*/
234
235
int
236
ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
237
{
238
unsigned char *in, *end;
239
char *ptr;
240
ldap_ucs4_t u;
241
int i, l = 0;
242
243
utf8s->bv_val = NULL;
244
utf8s->bv_len = 0;
245
246
in = (unsigned char *)ucs->bv_val;
247
248
/* Make sure we stop at an even multiple of csize */
249
end = in + ( ucs->bv_len & ~(csize-1) );
250
251
for (; in < end; ) {
252
u = *in++;
253
if (csize > 1) {
254
u <<= 8;
255
u |= *in++;
256
}
257
if (csize > 2) {
258
u <<= 8;
259
u |= *in++;
260
u <<= 8;
261
u |= *in++;
262
}
263
i = LDAP_UCS_UTF8LEN(u);
264
if (i == 0)
265
return LDAP_INVALID_SYNTAX;
266
l += i;
267
}
268
269
utf8s->bv_val = LDAP_MALLOC( l+1 );
270
if (utf8s->bv_val == NULL)
271
return LDAP_NO_MEMORY;
272
utf8s->bv_len = l;
273
274
ptr = utf8s->bv_val;
275
for (in = (unsigned char *)ucs->bv_val; in < end; ) {
276
u = *in++;
277
if (csize > 1) {
278
u <<= 8;
279
u |= *in++;
280
}
281
if (csize > 2) {
282
u <<= 8;
283
u |= *in++;
284
u <<= 8;
285
u |= *in++;
286
}
287
ptr += ldap_x_ucs4_to_utf8(u, ptr);
288
}
289
*ptr = '\0';
290
return LDAP_SUCCESS;
291
}
292
293
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length of the multibyte character and instead relies
 * on continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are provided,
 * provided the start of the next character appears within the
 * 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int i = 1;

	if( LDAP_UTF8_ISASCII(u) ) {
		return (char *) &p[1];
	}

	/* skip at most five 10xxxxxx octets after the first byte */
	while ( i < 6 && ( u[i] & 0xc0 ) == 0x80 ) {
		i++;
	}

	return (char *) &p[i];
}
319
320
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores the length of the multibyte character and instead relies
 * on continuation markers to find the start of the previous
 * character.  This allows for "resyncing" when invalid characters
 * are provided, provided the start of the previous character appears
 * within the bytes examined.
 *
 * Up to five bytes before p are read, so the caller must ensure p is
 * not at the start of its buffer.  If all five are continuation
 * octets, a pointer six bytes before p is returned.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int i = -1;

	/* walk backwards over 10xxxxxx octets; the bound check runs first */
	while ( i > -6 && ( u[i] & 0xc0 ) == 0x80 ) {
		i--;
	}

	return (char *) &p[i];
}
342
343
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * Ignores the length of the multibyte character and instead relies
 * on continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are provided,
 * provided the start of the next character appears within the
 * 6 bytes examined.
 *
 * No terminating NUL is written to dst.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *u = (const unsigned char *) src;
	int n = 1;

	/* the first byte is always copied */
	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(u) ) {
		return 1;
	}

	/* then at most five trailing 10xxxxxx octets */
	while ( n < 6 && ( u[n] & 0xc0 ) == 0x80 ) {
		dst[n] = src[n];
		n++;
	}

	return n;
}
373
374
#ifndef UTF8_ALPHA_CTYPE
375
/*
 * UTF-8 ctype routines
 * Only deal with characters < 0x80 (i.e. US-ASCII)
 */
379
380
/* Apply LDAP_ASCII() to the first octet at p and return the result. */
int ldap_utf8_isascii( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet);
}
385
386
/*
 * Return LDAP_DIGIT() of the first octet at p, or 0 when that octet
 * does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isdigit( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_DIGIT( octet );
	}

	return 0;
}
394
395
/*
 * Return LDAP_HEX() of the first octet at p, or 0 when that octet
 * does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isxdigit( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_HEX(octet);
	}

	return 0;
}
403
404
/*
 * Return 1 when the first octet at p is a space, tab, newline,
 * carriage return, vertical tab or form feed; 0 otherwise.
 */
int ldap_utf8_isspace( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	if(!LDAP_ASCII(octet)) return 0;

	switch(octet) {
	case ' ':
	case '\t':
	case '\n':
	case '\r':
	case '\v':
	case '\f':
		return 1;
	default:
		return 0;
	}
}
422
423
/*
 * These are not needed by the C SDK and are
 * not "good enough" for general use.
 */
427
/*
 * Return LDAP_ALPHA() of the first octet at p, or 0 when that octet
 * does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isalpha( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALPHA(octet);
	}

	return 0;
}
435
436
/*
 * Return LDAP_ALNUM() of the first octet at p, or 0 when that octet
 * does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isalnum( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALNUM(octet);
	}

	return 0;
}
444
445
/*
 * Return LDAP_LOWER() of the first octet at p, or 0 when that octet
 * does not satisfy LDAP_ASCII().
 */
int ldap_utf8_islower( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_LOWER(octet);
	}

	return 0;
}
453
454
/*
 * Return LDAP_UPPER() of the first octet at p, or 0 when that octet
 * does not satisfy LDAP_ASCII().
 */
int ldap_utf8_isupper( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_UPPER(octet);
	}

	return 0;
}
462
#endif
463
464
465
/*
 * UTF-8 string routines
 */
468
469
/* like strchr() */
470
char * (ldap_utf8_strchr)( const char *str, const char *chr )
471
{
472
for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
473
if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
474
return (char *) str;
475
}
476
}
477
478
return NULL;
479
}
480
481
/* like strcspn() but returns number of bytes, not characters */
482
ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
483
{
484
const char *cstr;
485
const char *cset;
486
487
for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
488
for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
489
if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
490
return cstr - str;
491
}
492
}
493
}
494
495
return cstr - str;
496
}
497
498
/* like strspn() but returns number of bytes, not characters */
499
ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
500
{
501
const char *cstr;
502
const char *cset;
503
504
for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
505
for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
506
if( *cset == '\0' ) {
507
return cstr - str;
508
}
509
510
if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
511
break;
512
}
513
}
514
}
515
516
return cstr - str;
517
}
518
519
/* like strpbrk(), replaces strchr() as well */
520
char *(ldap_utf8_strpbrk)( const char *str, const char *set )
521
{
522
for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
523
const char *cset;
524
525
for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
526
if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
527
return (char *) str;
528
}
529
}
530
}
531
532
return NULL;
533
}
534
535
/*
 * like strtok_r(), not strtok()
 *
 * str  - string to tokenize on the first call, NULL to continue with
 *        the position saved in *last
 * sep  - set of separator characters
 * last - caller-provided storage for the scan position; must not be
 *        NULL
 *
 * Returns the next token, NUL-terminated in place (str is modified),
 * or NULL when no token remains.  Once the tokens are exhausted *last
 * is NULL and further continuation calls keep returning NULL.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/* a previous call ran out of tokens and cleared *last */
	if( begin == NULL ) return NULL;

	/* skip leading separators */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/* find the resume point before the separator is overwritten */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}
563
564