CoCalc -- ConvertUTF.cpp

GitHub Repository: alexbevi/BizHawk
Path: blob/master/psx/mednadisc/string/ConvertUTF.cpp
² views
1
/*
2
 * Copyright 2001-2004 Unicode, Inc.
3
 * 
4
 * Disclaimer
5
 * 
6
 * This source code is provided as is by Unicode, Inc. No claims are
7
 * made as to fitness for any particular purpose. No warranties of any
8
 * kind are expressed or implied. The recipient agrees to determine
9
 * applicability of information provided. If this file has been
10
 * purchased on magnetic or optical media from Unicode, Inc., the
11
 * sole remedy for any claim will be exchange of defective media
12
 * within 90 days of receipt.
13
 * 
14
 * Limitations on Rights to Redistribute This Code
15
 * 
16
 * Unicode, Inc. hereby grants the right to freely use the information
17
 * supplied in this file in the creation of products supporting the
18
 * Unicode Standard, and to make copies of this file in any form
19
 * for internal or external distribution as long as this notice
20
 * remains attached.
21
 */
22

23
/* ---------------------------------------------------------------------
24

25
    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26
    Author: Mark E. Davis, 1994.
27
    Rev History: Rick McGowan, fixes & updates May 2001.
28
    Sept 2001: fixed const & error conditions per
29
	mods suggested by S. Parent & A. Lillich.
30
    June 2002: Tim Dodd added detection and handling of incomplete
31
	source sequences, enhanced error detection, added casts
32
	to eliminate compiler warnings.
33
    July 2003: slight mods to back out aggressive FFFE detection.
34
    Jan 2004: updated switches in from-UTF8 conversions.
35
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36

37
    See the header file "ConvertUTF.h" for complete documentation.
38

39
------------------------------------------------------------------------ */
40

41

42
#include "../types.h"
43
#include "ConvertUTF.h"
44
#ifdef CVTUTF_DEBUG
45
#include <stdio.h>
46
#endif
47

48
#include <string.h>
49
#include <stdlib.h>
50

51
static const int halfShift  = 10; /* used for shifting by 10 bits */
52

53
static const UTF32 halfBase = 0x0010000UL;
54
static const UTF32 halfMask = 0x3FFUL;
55

56
#define UNI_SUR_HIGH_START  (UTF32)0xD800
57
#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
58
#define UNI_SUR_LOW_START   (UTF32)0xDC00
59
#define UNI_SUR_LOW_END     (UTF32)0xDFFF
60
#define false	   0
61
#define true	    1
62

63
/* --------------------------------------------------------------------- */
64

65
/*
 * Convert the UTF-32 values in [*sourceStart, sourceEnd) to UTF-16, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart have been advanced past what was
 * consumed and produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   targetExhausted  - the target filled up; *sourceStart points at the first
 *                      value that was not converted.
 *   sourceIllegal    - (strictConversion only) a surrogate value or a value
 *                      above UNI_MAX_LEGAL_UTF32 was met; *sourceStart points
 *                      at that value.
 *
 * When flags is not strictConversion, illegal values are replaced by
 * UNI_REPLACEMENT_CHAR and conversion continues.
 */
ConversionResult ConvertUTF32toUTF16 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF16* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch;

        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }
        ch = *source++;

        if (ch <= UNI_MAX_BMP) {
            /* BMP value: one UTF-16 unit, unless it is a surrogate (illegal in UTF-32). */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
                *target++ = UNI_REPLACEMENT_CHAR;
            } else {
                *target++ = (UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /*
                 * Stop at the offending value, exactly as the surrogate case
                 * above does, so the caller can locate it and the error
                 * cannot be overwritten by a later result.
                 */
                --source;
                result = sourceIllegal;
                break;
            }
            *target++ = UNI_REPLACEMENT_CHAR;
        } else {
            /* Value in 0x10000..0x10FFFF: needs room for a surrogate pair. */
            if (target + 1 >= targetEnd) {
                --source; /* Back up source pointer! */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
111

112
/* --------------------------------------------------------------------- */
113

114
/*
 * Convert the UTF-16 units in [*sourceStart, sourceEnd) to UTF-32, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart have been advanced past what was
 * consumed and produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the source ends with a high surrogate; *sourceStart
 *                      points at it.
 *   targetExhausted  - the target filled up; *sourceStart points at the start
 *                      of the character that did not fit.
 *   sourceIllegal    - (strictConversion only) an unpaired surrogate was met;
 *                      *sourceStart points at it.
 *
 * When flags is not strictConversion, unpaired surrogates are copied to the
 * target unchanged.
 */
ConversionResult ConvertUTF16toUTF32 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF32* target = *targetStart;
    /* Initialized so the debug report below never reads an indeterminate value. */
    UTF32 ch = 0, ch2 = 0;

    while (source < sourceEnd) {
        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */

        ch = *source++;
        ch2 = 0;

        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* A high surrogate needs the following unit to be decoded. */
            if (source >= sourceEnd) {
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
            ch2 = *source;
            if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                /* Proper pair: combine both halves into one UTF-32 value. */
                ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                    + (ch2 - UNI_SUR_LOW_START) + halfBase;
                ++source;
            } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        } else if (flags == strictConversion
                   && ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
            /* A low surrogate with no preceding high surrogate. */
            --source; /* return to the illegal value itself */
            result = sourceIllegal;
            break;
        }

        if (target >= targetEnd) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted;
            break;
        }
        *target++ = ch;
    }

    *sourceStart = source;
    *targetStart = target;
#ifdef CVTUTF_DEBUG
    if (result == sourceIllegal) {
        /* Cast so the arguments match %x whatever the width of UTF32 is. */
        fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n",
                (unsigned int)ch, (unsigned int)ch2);
        fflush(stderr);
    }
#endif
    return result;
}
168

169
/* --------------------------------------------------------------------- */
170

171
/*
172
 * Index into the table below with the first byte of a UTF-8 sequence to
173
 * get the number of trailing bytes that are supposed to follow it.
174
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
175
 * left as-is for anyone who may want to do such conversion, which was
176
 * allowed in earlier algorithms.
177
 */
178
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0xBF: no trailing bytes */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xDF: one trailing byte */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0-0xEF: two trailing bytes */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0-0xF7: three; 0xF8-0xFB: four; 0xFC-0xFF: five */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
188

189
/*
190
 * Magic values subtracted from a buffer value during UTF8 conversion.
191
 * This table contains as many values as there might be trailing bytes
192
 * in a UTF-8 sequence.
193
 */
194
static const UTF32 offsetsFromUTF8[6] = {
    0x00000000UL,   /* 0 trailing bytes */
    0x00003080UL,   /* 1: (0xC0 << 6) + 0x80 */
    0x000E2080UL,   /* 2: (0xE0 << 12) + (0x80 << 6) + 0x80 */
    0x03C82080UL,   /* 3: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    0xFA082080UL,   /* 4: (0xF8 << 24) + the four 0x80 continuation marks */
    0x82082080UL    /* 5: as above with a 0xFC lead, taken modulo 2^32 */
};
196

197
/*
198
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
199
 * into the first byte, depending on how many bytes follow.  There are
200
 * as many entries in this table as there are UTF-8 sequence types.
201
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
202
 * for *legal* UTF-8 will be 4 or fewer bytes total.
203
 */
204
static const UTF8 firstByteMark[7] = {
    0x00,   /* index 0: the writers in this file only index 1..4 */
    0x00,   /* 1-byte sequence */
    0xC0,   /* 2-byte sequence */
    0xE0,   /* 3-byte sequence */
    0xF0,   /* 4-byte sequence */
    0xF8,   /* 5-byte sequence */
    0xFC    /* 6-byte sequence */
};
205

206
/* --------------------------------------------------------------------- */
207

208
/* The interface converts a whole buffer to avoid function-call overhead.
209
 * Constants have been gathered. Loops & conditionals have been removed as
210
 * much as possible for efficiency, in favor of drop-through switches.
211
 * (See "Note A" at the bottom of the file for equivalent code.)
212
 * If your compiler supports it, the "isLegalUTF8" call can be turned
213
 * into an inline function.
214
 */
215

216
/* --------------------------------------------------------------------- */
217

218
/*
 * Convert the UTF-16 units in [*sourceStart, sourceEnd) to UTF-8, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart have been advanced past what was
 * consumed and produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the source ends with a high surrogate; *sourceStart
 *                      points at it.
 *   targetExhausted  - the next character does not fit; *sourceStart points
 *                      at the start of that character.
 *   sourceIllegal    - (strictConversion only) an unpaired surrogate was met;
 *                      *sourceStart points at it.
 *
 * When flags is not strictConversion, an unpaired surrogate is encoded as a
 * three-byte sequence like any other value below 0x10000.
 */
ConversionResult ConvertUTF16toUTF8 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF8* target = *targetStart;

    while (source < sourceEnd) {
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;
        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
        unsigned short bytesToWrite = 0;
        UTF32 ch = *source++;

        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            UTF32 ch2;

            if (source >= sourceEnd) {
                /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
            ch2 = *source;
            if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                    + (ch2 - UNI_SUR_LOW_START) + halfBase;
                ++source;
            } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        } else if (flags == strictConversion
                   && ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
            /* A low surrogate with no preceding high surrogate. */
            --source; /* return to the illegal value itself */
            result = sourceIllegal;
            break;
        }

        /* Figure out how many bytes the result will require. */
        if (ch < (UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch < (UTF32)0x110000) {
            bytesToWrite = 4;
        } else {
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
        }

        /*
         * Check the remaining room BEFORE advancing: forming a pointer more
         * than one element past the end of the target buffer is undefined.
         */
        if (targetEnd - target < bytesToWrite) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted;
            break;
        }

        /* Fill the sequence from its last byte back to its first. */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
285

286
/* --------------------------------------------------------------------- */
287

288
/*
289
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
290
 * This must be called with the length pre-determined by the first byte.
291
 * If not calling this from ConvertUTF8to*, then the length can be set by:
292
 *  length = trailingBytesForUTF8[*source]+1;
293
 * and the sequence is illegal right away if there aren't that many bytes
294
 * available.
295
 * If presented with a length > 4, this returns false.  The Unicode
296
 * definition of UTF-8 goes up to 4-byte sequences.
297
 */
298

299
/*
 * Return true when the `length` bytes starting at `source` form one legal
 * UTF-8 sequence.  `length` must already have been derived from the first
 * byte (trailingBytesForUTF8[*source] + 1) and the caller must guarantee
 * that many bytes are readable.  Any length outside 1..4 is rejected.
 *
 * Every trailing byte must lie in 0x80..0xBF.  The second byte is narrowed
 * further for the lead bytes 0xE0, 0xED, 0xF0 and 0xF4.  The lower bound on
 * the second byte is enforced for every lead byte, so that sequences such as
 * ED 41 80 or F4 41 80 80 are rejected.
 */
static bool isLegalUTF8(const UTF8 *source, int length) {
    UTF8 lead;
    UTF8 second;

    if (length < 1 || length > 4) return false;

    /* Third and fourth bytes are plain continuation bytes. */
    if (length == 4 && (source[3] < 0x80 || source[3] > 0xBF)) return false;
    if (length >= 3 && (source[2] < 0x80 || source[2] > 0xBF)) return false;

    lead = source[0];
    if (length >= 2) {
        second = source[1];
        if (second < 0x80 || second > 0xBF) return false;

        /* Lead-byte specific narrowing of the second byte. */
        switch (lead) {
            case 0xE0: if (second < 0xA0) return false; break;
            case 0xED: if (second > 0x9F) return false; break;
            case 0xF0: if (second < 0x90) return false; break;
            case 0xF4: if (second > 0x8F) return false; break;
            default:   break;
        }
    }

    /* Lead byte: 0x80..0xC1 and anything above 0xF4 can never start a sequence. */
    if (lead >= 0x80 && lead < 0xC2) return false;
    if (lead > 0xF4) return false;
    return true;
}
323

324
/* --------------------------------------------------------------------- */
325

326
/*
327
 * Exported function to return whether a UTF-8 sequence is legal or not.
328
 * This is not used here; it's just exported.
329
 */
330
bool isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
331
    int length = trailingBytesForUTF8[*source]+1;
332
    if (source+length > sourceEnd) {
333
	return false;
334
    }
335
    return isLegalUTF8(source, length);
336
}
337

338
/* --------------------------------------------------------------------- */
339

340
/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart have been advanced past what was
 * consumed and produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the source ends in the middle of a sequence;
 *                      *sourceStart points at the start of that sequence.
 *   sourceIllegal    - isLegalUTF8() rejected a sequence (checked whether
 *                      strict or lenient), or, under strictConversion, the
 *                      decoded value was a surrogate or above UNI_MAX_UTF16;
 *                      *sourceStart points at the start of the sequence.
 *   targetExhausted  - the next character does not fit; *sourceStart points
 *                      at the start of its sequence.
 */
ConversionResult ConvertUTF8toUTF16 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF16* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        unsigned short pending;

        /*
         * Compare against the number of bytes left instead of computing
         * source + extraBytesToRead, which could point beyond the buffer.
         */
        if (extraBytesToRead >= sourceEnd - source) {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if (! isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }

        /* Accumulate the raw bytes six bits at a time; the marker bits are removed below. */
        for (pending = extraBytesToRead; pending > 0; --pending) {
            ch += *source++;
            ch <<= 6;
        }
        ch += *source++;
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead+1); /* Back up source pointer! */
            result = targetExhausted;
            break;
        }

        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                /* A surrogate value is not a legal decoded character. */
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
                *target++ = UNI_REPLACEMENT_CHAR;
            } else {
                *target++ = (UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                result = sourceIllegal;
                source -= (extraBytesToRead+1); /* return to the start */
                break; /* Bail out; shouldn't continue */
            }
            *target++ = UNI_REPLACEMENT_CHAR;
        } else {
            /* Value above the BMP: needs room for a surrogate pair. */
            if (target + 1 >= targetEnd) {
                source -= (extraBytesToRead+1); /* Back up source pointer! */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
410

411
/* --------------------------------------------------------------------- */
412

413
/*
 * Convert the UTF-32 values in [*sourceStart, sourceEnd) to UTF-8, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart have been advanced past what was
 * consumed and produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   targetExhausted  - the next character does not fit; *sourceStart points
 *                      at it.
 *   sourceIllegal    - under strictConversion a surrogate value was met
 *                      (*sourceStart points at it), or, in either mode, a
 *                      value above UNI_MAX_LEGAL_UTF32 was met.  In the
 *                      latter case UNI_REPLACEMENT_CHAR is written and
 *                      conversion carries on.
 *
 * When flags is not strictConversion, surrogate values are encoded as
 * three-byte sequences like any other value below 0x10000.
 */
ConversionResult ConvertUTF32toUTF8 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF8* target = *targetStart;

    while (source < sourceEnd) {
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;
        unsigned short bytesToWrite = 0;
        UTF32 ch = *source++;

        /* UTF-16 surrogate values are illegal in UTF-32 */
        if (flags == strictConversion
            && ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
            --source; /* return to the illegal value itself */
            result = sourceIllegal;
            break;
        }

        /*
         * Figure out how many bytes the result will require. Turn any
         * illegally large UTF32 things (> Plane 17) into replacement chars.
         */
        if (ch < (UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch <= UNI_MAX_LEGAL_UTF32) {
            bytesToWrite = 4;
        } else {
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
            result = sourceIllegal;
        }

        /*
         * Check the remaining room BEFORE advancing: forming a pointer more
         * than one element past the end of the target buffer is undefined.
         */
        if (targetEnd - target < bytesToWrite) {
            --source; /* Back up source pointer! */
            result = targetExhausted;
            break;
        }

        /* Fill the sequence from its last byte back to its first. */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
463

464
/* --------------------------------------------------------------------- */
465

466
/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-32, writing
 * into [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart have been advanced past what was
 * consumed and produced.
 *
 * Returns:
 *   conversionOK     - the whole source range was converted.
 *   sourceExhausted  - the source ends in the middle of a sequence;
 *                      *sourceStart points at the start of that sequence.
 *   sourceIllegal    - isLegalUTF8() rejected a sequence (checked whether
 *                      strict or lenient), or, under strictConversion, the
 *                      decoded value was a surrogate; *sourceStart points at
 *                      the start of the sequence.  Also reported when a
 *                      decoded value exceeds UNI_MAX_LEGAL_UTF32; in that
 *                      case UNI_REPLACEMENT_CHAR is written and conversion
 *                      carries on.
 *   targetExhausted  - the target filled up; *sourceStart points at the start
 *                      of the sequence that did not fit.
 */
ConversionResult ConvertUTF8toUTF32 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;

    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        unsigned short pending;

        /*
         * Compare against the number of bytes left instead of computing
         * source + extraBytesToRead, which could point beyond the buffer.
         */
        if (extraBytesToRead >= sourceEnd - source) {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if (! isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }

        /* Accumulate the raw bytes six bits at a time; the marker bits are removed below. */
        for (pending = extraBytesToRead; pending > 0; --pending) {
            ch += *source++;
            ch <<= 6;
        }
        ch += *source++;
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead+1); /* Back up the source pointer! */
            result = targetExhausted;
            break;
        }

        if (ch > UNI_MAX_LEGAL_UTF32) {
            /* Anything over Plane 17 (> 0x10FFFF) is illegal. */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        } else if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
            /* UTF-16 surrogate values are illegal in UTF-32. */
            if (flags == strictConversion) {
                source -= (extraBytesToRead+1); /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
            *target++ = UNI_REPLACEMENT_CHAR;
        } else {
            *target++ = ch;
        }
    }

    *sourceStart = source;
    *targetStart = target;
    return result;
}
525

526
/*
 * Convert a NUL-terminated UTF-8 string into a newly malloc()ed,
 * zero-terminated UTF-32 string using lenient conversion.
 *
 * The buffer holds strlen(string) + 1 UTF32 elements: a UTF-8 string of N
 * bytes decodes to at most N code points, plus one element for the
 * terminator.  The conversion result is not examined; whatever was decoded
 * before the converter stopped is terminated and returned.
 *
 * Returns NULL if the buffer cannot be allocated.  The caller owns the
 * returned buffer and releases it with free().
 */
UTF32 *MakeUTF32FromUTF8(UTF8 *string)
{
 UTF32 *ret, *tstart;
 const UTF8 *tstring = string;
 size_t string_length = strlen((char *)string);

 /* Guard the element-count multiplication against wrapping around. */
 if(string_length >= (size_t)-1 / sizeof(UTF32))
  return(NULL);

 /* Room for every decoded element AND a full-width terminator. */
 tstart = ret = (UTF32 *)malloc((string_length + 1) * sizeof(UTF32));
 if(ret == NULL)
  return(NULL);

 ConvertUTF8toUTF32(&tstring, &string[string_length], &tstart, &tstart[string_length], lenientConversion);

 /* tstart now points just past the last element written. */
 *tstart = 0;

 return(ret);
}
541

542

543
/* ---------------------------------------------------------------------
544

545
    Note A.
546
    The fall-through switches in UTF-8 reading code save a
547
    temp variable, some decrements & conditionals.  The switches
548
    are equivalent to the following loop:
549
	{
550
	    int tmpBytesToRead = extraBytesToRead+1;
551
	    do {
552
		ch += *source++;
553
		--tmpBytesToRead;
554
		if (tmpBytesToRead) ch <<= 6;
555
	    } while (tmpBytesToRead > 0);
556
	}
557
    In UTF-8 writing code, the switches on "bytesToWrite" are
558
    similarly unrolled loops.
559

560
   --------------------------------------------------------------------- */
561

562
Product

Resources

Company