CoCalc -- utf-8.c

GitHub Repository: wine-mirror/wine
Path: blob/master/libs/ldap/libldap/utf-8.c
⁴³⁹⁴ views
1
/* utf-8.c -- Basic UTF-8 routines */
2
/* $OpenLDAP$ */
3
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4
 *
5
 * Copyright 1998-2024 The OpenLDAP Foundation.
6
 * All rights reserved.
7
 *
8
 * Redistribution and use in source and binary forms, with or without
9
 * modification, are permitted only as authorized by the OpenLDAP
10
 * Public License.
11
 *
12
 * A copy of this license is available in the file LICENSE in the
13
 * top-level directory of the distribution or, alternatively, at
14
 * <http://www.OpenLDAP.org/license.html>.
15
 */
16
/* Basic UTF-8 routines
17
 *
18
 * These routines are "dumb".  Though they understand UTF-8,
19
 * they don't grok Unicode.  That is, they can push bits,
20
 * but don't have a clue what the bits represent.  That's
21
 * good enough for use with the LDAP Client SDK.
22
 *
23
 * These routines are not optimized.
24
 */
25

26
#include "portable.h"
27

28
#include <stdio.h>
29

30
#include <ac/stdlib.h>
31

32
#include <ac/socket.h>
33
#include <ac/string.h>
34
#include <ac/time.h>
35

36
#include "ldap_utf8.h"
37

38
#include "ldap-int.h"
39
#include "ldap_defaults.h"
40

41
/*
42
 * return the number of bytes required to hold the
43
 * NULL-terminated UTF-8 string NOT INCLUDING the
44
 * termination.
45
 */
46
ber_len_t ldap_utf8_bytes( const char * p )
47
{
48
	ber_len_t bytes;
49

50
	for( bytes=0; p[bytes]; bytes++ ) {
51
		/* EMPTY */ ;
52
	}
53

54
	return bytes;
55
}
56

57
/*
 * Count the characters in the NUL-terminated string p by stepping
 * through it with LDAP_UTF8_INCR.
 *
 * Could be optimized and could check for invalid sequences.
 */
ber_len_t ldap_utf8_chars( const char * p )
{
	ber_len_t count = 0;

	while ( *p ) {
		LDAP_UTF8_INCR(p);
		count++;
	}

	return count;
}
68

69
/*
 * Return the offset, in bytes, from p to the next character as
 * located by LDAP_UTF8_NEXT.
 */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
74

75
/*
 * Returns length indicated by first byte.
 *
 * ldap_utf8_charlen() indexes this table with (first byte ^ 0x80),
 * so entry N describes the byte 0x80 + N.  An entry of 0 means the
 * byte does not start a sequence this table accepts.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0 - 0xcf */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
87

88
int ldap_utf8_charlen( const char * p )
89
{
90
	if (!(*p & 0x80))
91
		return 1;
92

93
	return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
94
}
95

96
/*
97
 * Make sure the UTF-8 char used the shortest possible encoding
98
 * returns charlen if valid, 0 if not.
99
 *
100
 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
101
 * The table is slightly modified from that of the RFC.
102
 *
103
 * UCS-4 range (hex)      UTF-8 sequence (binary)
104
 * 0000 0000-0000 007F   0.......
105
 * 0000 0080-0000 07FF   110++++. 10......
106
 * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
107
 * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
108
 * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
109
 * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
110
 *
111
 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
112
 * at least one of the '+' bits must be set, otherwise the character
113
 * should have been encoded in fewer octets. Note that in the two-octet
114
 * case, only the first octet needs to be validated, and this is done
115
 * in the ldap_utf8_lentab[] above.
116
 */
117

118
/*
 * Mask of required bits in second octet.
 *
 * ldap_utf8_charlen2() indexes this table with the low five bits of
 * the first octet and ANDs the entry with the second octet; a zero
 * result marks the sequence as invalid.
 */
#undef c	/* as before, no macro named 'c' may survive past this table */
#undef LDAP_UTF8_MINTAB_OCTET
/* the cast keeps values above 0x7f usable as 'char' initializers */
#define LDAP_UTF8_MINTAB_OCTET(v)	((const char)(v))
const char ldap_utf8_mintab[] = {
	/* first octet & 0x1f == 0x00 .. 0x07 */
	LDAP_UTF8_MINTAB_OCTET(0x20), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	/* 0x08 .. 0x0f */
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	/* 0x10 .. 0x17 */
	LDAP_UTF8_MINTAB_OCTET(0x30), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	/* 0x18 .. 0x1f */
	LDAP_UTF8_MINTAB_OCTET(0x38), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x80), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x3c), LDAP_UTF8_MINTAB_OCTET(0x80),
	LDAP_UTF8_MINTAB_OCTET(0x00), LDAP_UTF8_MINTAB_OCTET(0x00)
};
#undef LDAP_UTF8_MINTAB_OCTET
127

128
int ldap_utf8_charlen2( const char * p )
129
{
130
	int i = LDAP_UTF8_CHARLEN( p );
131

132
	if ( i > 2 ) {
133
		if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
134
			i = 0;
135
	}
136
	return i;
137
}
138

139
/* conv UTF-8 to UCS-4, useful for comparisons */
140
ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
141
{
142
    const unsigned char *c = (const unsigned char *) p;
143
    ldap_ucs4_t ch;
144
	int len, i;
145
	static unsigned char mask[] = {
146
		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147

148
	len = LDAP_UTF8_CHARLEN2(p, len);
149

150
	if( len == 0 ) return LDAP_UCS4_INVALID;
151

152
	ch = c[0] & mask[len];
153

154
	for(i=1; i < len; i++) {
155
		if ((c[i] & 0xc0) != 0x80) {
156
			return LDAP_UCS4_INVALID;
157
		}
158

159
		ch <<= 6;
160
		ch |= c[i] & 0x3f;
161
	}
162

163
	return ch;
164
}
165

166
/* conv UCS-4 to UTF-8, not used */
167
int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
168
{
169
	int len=0;
170
	unsigned char* p = (unsigned char *) buf;
171

172
	/* not a valid Unicode character */
173
	if ( c < 0 ) return 0;
174

175
	/* Just return length, don't convert */
176
	if(buf == NULL) {
177
		if( c < 0x80 ) return 1;
178
		else if( c < 0x800 ) return 2;
179
		else if( c < 0x10000 ) return 3;
180
		else if( c < 0x200000 ) return 4;
181
		else if( c < 0x4000000 ) return 5;
182
		else return 6;
183
	}
184

185
	if( c < 0x80 ) {
186
		p[len++] = c;
187

188
	} else if( c < 0x800 ) {
189
		p[len++] = 0xc0 | ( c >> 6 );
190
		p[len++] = 0x80 | ( c & 0x3f );
191

192
	} else if( c < 0x10000 ) {
193
		p[len++] = 0xe0 | ( c >> 12 );
194
		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
195
		p[len++] = 0x80 | ( c & 0x3f );
196

197
	} else if( c < 0x200000 ) {
198
		p[len++] = 0xf0 | ( c >> 18 );
199
		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
200
		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
201
		p[len++] = 0x80 | ( c & 0x3f );
202

203
	} else if( c < 0x4000000 ) {
204
		p[len++] = 0xf8 | ( c >> 24 );
205
		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
206
		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
207
		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
208
		p[len++] = 0x80 | ( c & 0x3f );
209

210
	} else /* if( c < 0x80000000 ) */ {
211
		p[len++] = 0xfc | ( c >> 30 );
212
		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
213
		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
214
		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
215
		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
216
		p[len++] = 0x80 | ( c & 0x3f );
217
	}
218

219
	return len;
220
}
221

222
/*
 * Number of octets needed to encode c, or 0 when c is negative.
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with compound arguments.
 * NOTE: c is evaluated several times; do not pass an expression with
 * side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
225

226
/* Convert a string to UTF-8 format. The input string is expected to
227
 * have characters of 1, 2, or 4 octets (in network byte order)
228
 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
229
 * types respectively. (Here T61STRING just means that there is one
230
 * octet per character and characters may use the high bit of the octet.
231
 * The characters are assumed to use ISO mappings, no provision is made
232
 * for converting from T.61 coding rules to Unicode.)
233
 */
234

235
int
236
ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
237
{
238
	unsigned char *in, *end;
239
	char *ptr;
240
	ldap_ucs4_t u;
241
	int i, l = 0;
242

243
	utf8s->bv_val = NULL;
244
	utf8s->bv_len = 0;
245

246
	in = (unsigned char *)ucs->bv_val;
247

248
	/* Make sure we stop at an even multiple of csize */
249
	end = in + ( ucs->bv_len & ~(csize-1) );
250

251
	for (; in < end; ) {
252
		u = *in++;
253
		if (csize > 1) {
254
			u <<= 8;
255
			u |= *in++;
256
		}
257
		if (csize > 2) {
258
			u <<= 8;
259
			u |= *in++;
260
			u <<= 8;
261
			u |= *in++;
262
		}
263
		i = LDAP_UCS_UTF8LEN(u);
264
		if (i == 0)
265
			return LDAP_INVALID_SYNTAX;
266
		l += i;
267
	}
268

269
	utf8s->bv_val = LDAP_MALLOC( l+1 );
270
	if (utf8s->bv_val == NULL)
271
		return LDAP_NO_MEMORY;
272
	utf8s->bv_len = l;
273

274
	ptr = utf8s->bv_val;
275
	for (in = (unsigned char *)ucs->bv_val; in < end; ) {
276
		u = *in++;
277
		if (csize > 1) {
278
			u <<= 8;
279
			u |= *in++;
280
		}
281
		if (csize > 2) {
282
			u <<= 8;
283
			u |= *in++;
284
			u <<= 8;
285
			u |= *in++;
286
		}
287
		ptr += ldap_x_ucs4_to_utf8(u, ptr);
288
	}
289
	*ptr = '\0';
290
	return LDAP_SUCCESS;
291
}
292

293
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length of a multibyte character and instead relies on
 * continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are provided,
 * provided the start of the next character appears within the
 * 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *octets = (const unsigned char *) p;
	int n = 1;

	if ( !LDAP_UTF8_ISASCII(octets) ) {
		/* skip continuation octets, never moving more than 6 bytes */
		while ( n < 6 && ( octets[n] & 0xc0 ) == 0x80 ) {
			n++;
		}
	}

	return (char *) &p[n];
}
319

320
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores the length of a multibyte character and instead relies on
 * continuation markers to find the start of the previous character.
 * At most 6 bytes are stepped over; the bytes before p must be
 * readable, no lower bound is checked here.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *octets = (const unsigned char *) p;
	int back = 1;

	/* p[-1] .. p[-5] are inspected; p[-6] is returned unexamined */
	while ( back < 6 && ( octets[-back] & 0xc0 ) == 0x80 ) {
		back++;
	}

	return (char *) ( p - back );
}
342

343
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * Ignores the length of a multibyte character and instead relies on
 * continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are provided,
 * provided the start of the next character appears within the
 * 6 bytes examined.  No terminator is written to dst.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *octets = (const unsigned char *) src;
	int n = 1;

	/* the first byte is copied unconditionally */
	dst[0] = src[0];

	if ( !LDAP_UTF8_ISASCII(octets) ) {
		/* then every continuation octet, up to 6 bytes in total */
		while ( n < 6 && ( octets[n] & 0xc0 ) == 0x80 ) {
			dst[n] = src[n];
			n++;
		}
	}

	return n;
}
373

374
#ifndef UTF8_ALPHA_CTYPE
375
/*
376
 * UTF-8 ctype routines
377
 * Only deals with characters < 0x80 (ie: US-ASCII)
378
 */
379

380
/* Return the LDAP_ASCII() result for the first byte at p. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned char *octet = (const unsigned char *) p;
	unsigned ch = *octet;

	return LDAP_ASCII(ch);
}
385

386
/*
 * Return the LDAP_DIGIT() result for the first byte at p, or 0
 * when that byte does not pass LDAP_ASCII().
 */
int ldap_utf8_isdigit( const char * p )
{
	unsigned ch = * (const unsigned char *) p;

	if ( LDAP_ASCII(ch) ) {
		return LDAP_DIGIT( ch );
	}

	return 0;
}
394

395
/*
 * Return the LDAP_HEX() result for the first byte at p, or 0
 * when that byte does not pass LDAP_ASCII().
 */
int ldap_utf8_isxdigit( const char * p )
{
	unsigned ch = * (const unsigned char *) p;

	if ( LDAP_ASCII(ch) ) {
		return LDAP_HEX(ch);
	}

	return 0;
}
403

404
/*
 * Return 1 when the first byte at p is a space, horizontal tab,
 * newline, carriage return, vertical tab or form feed; 0 otherwise
 * (including for any byte that does not pass LDAP_ASCII()).
 */
int ldap_utf8_isspace( const char * p )
{
	unsigned ch = * (const unsigned char *) p;

	if ( !LDAP_ASCII(ch) ) {
		return 0;
	}

	return ch == ' ' || ch == '\t' || ch == '\n'
		|| ch == '\r' || ch == '\v' || ch == '\f';
}
422

423
/*
424
 * These are not needed by the C SDK and are
425
 * not "good enough" for general use.
426
 */
427
/*
 * Return the LDAP_ALPHA() result for the first byte at p, or 0
 * when that byte does not pass LDAP_ASCII().
 */
int ldap_utf8_isalpha( const char * p )
{
	unsigned ch = * (const unsigned char *) p;

	if ( LDAP_ASCII(ch) ) {
		return LDAP_ALPHA(ch);
	}

	return 0;
}
435

436
/*
 * Return the LDAP_ALNUM() result for the first byte at p, or 0
 * when that byte does not pass LDAP_ASCII().
 */
int ldap_utf8_isalnum( const char * p )
{
	unsigned ch = * (const unsigned char *) p;

	if ( LDAP_ASCII(ch) ) {
		return LDAP_ALNUM(ch);
	}

	return 0;
}
444

445
/*
 * Return the LDAP_LOWER() result for the first byte at p, or 0
 * when that byte does not pass LDAP_ASCII().
 */
int ldap_utf8_islower( const char * p )
{
	unsigned ch = * (const unsigned char *) p;

	if ( LDAP_ASCII(ch) ) {
		return LDAP_LOWER(ch);
	}

	return 0;
}
453

454
/*
 * Return the LDAP_UPPER() result for the first byte at p, or 0
 * when that byte does not pass LDAP_ASCII().
 */
int ldap_utf8_isupper( const char * p )
{
	unsigned ch = * (const unsigned char *) p;

	if ( LDAP_ASCII(ch) ) {
		return LDAP_UPPER(ch);
	}

	return 0;
}
462
#endif
463

464

465
/*
466
 * UTF-8 string routines
467
 */
468

469
/* like strchr() */
470
char * (ldap_utf8_strchr)( const char *str, const char *chr )
471
{
472
	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
473
		if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
474
			return (char *) str;
475
		}
476
	}
477

478
	return NULL;
479
}
480

481
/* like strcspn() but returns number of bytes, not characters */
482
ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
483
{
484
	const char *cstr;
485
	const char *cset;
486

487
	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
488
		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
489
			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
490
				return cstr - str;
491
			}
492
		}
493
	}
494

495
	return cstr - str;
496
}
497

498
/* like strspn() but returns number of bytes, not characters */
499
ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
500
{
501
	const char *cstr;
502
	const char *cset;
503

504
	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
505
		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
506
			if( *cset == '\0' ) {
507
				return cstr - str;
508
			}
509

510
			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
511
				break;
512
			}
513
		}
514
	}
515

516
	return cstr - str;
517
}
518

519
/* like strpbrk(), replaces strchr() as well */
520
char *(ldap_utf8_strpbrk)( const char *str, const char *set )
521
{
522
	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
523
		const char *cset;
524

525
		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
526
			if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
527
				return (char *) str;
528
			}
529
		}
530
	}
531

532
	return NULL;
533
}
534

535
/* like strtok_r(), not strtok() */
536
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
537
{
538
	char *begin;
539
	char *end;
540

541
	if( last == NULL ) return NULL;
542

543
	begin = str ? str : *last;
544

545
	begin += ldap_utf8_strspn( begin, sep );
546

547
	if( *begin == '\0' ) {
548
		*last = NULL;
549
		return NULL;
550
	}
551

552
	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
553

554
	if( *end != '\0' ) {
555
		char *next = LDAP_UTF8_NEXT( end );
556
		*end = '\0';
557
		end = next;
558
	}
559

560
	*last = end;
561
	return begin;
562
}
563

564
Product

Resources

Company