Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libast/comp/iconv.c
1810 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1985-2012 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <[email protected]> *
18
* David Korn <[email protected]> *
19
* Phong Vo <[email protected]> *
20
* *
21
***********************************************************************/
22
#pragma prototyped
23
24
/*
25
* Glenn Fowler
26
* AT&T Research
27
*
28
* iconv intercept
29
* minimally provides { utf*<=>bin ascii<=>ebcdic* }
30
*/
31
32
#include <ast.h>
33
#include <dirent.h>
34
35
#define DEBUG_TRACE 0
36
#define _ICONV_LIST_PRIVATE_
37
38
#include <ccode.h>
39
#include <ctype.h>
40
#include <iconv.h>
41
42
#include "lclib.h"
43
44
/*
 * when _lib_iconv_open is not set the _ast_ prefixed names used in
 * this file are spelled as the plain iconv names
 * NOTE(review): presumably so this file supplies iconv itself when
 * there is no native implementation -- confirm against the build
 */

#if !_lib_iconv_open

#define _ast_iconv_t		iconv_t
#define _ast_iconv_f		iconv_f
#define _ast_iconv_list_t	iconv_list_t
#define _ast_iconv_open		iconv_open
#define _ast_iconv		iconv
#define _ast_iconv_close	iconv_close
#define _ast_iconv_list		iconv_list
#define _ast_iconv_move		iconv_move
#define _ast_iconv_name		iconv_name
#define _ast_iconv_write	iconv_write

#endif
58
59
/*
 * fallbacks for systems whose <errno.h> lacks these codes
 */

#ifndef E2BIG
#define E2BIG		ENOMEM
#endif
#ifndef EILSEQ
#define EILSEQ		EIO
#endif

/*
 * common exit for the converters below
 *
 *	e	pending errno value, 0 if none
 *	n	byte count to return on success
 *	fn	pointer to the remaining input byte count
 *
 * input left over with no other error is reported as E2BIG
 * on error errno is set and (size_t)(-1) is returned
 * wrapped in do/while(0) so that it behaves as a single statement
 */

#define RETURN(e,n,fn) \
	do \
	{ \
		if (*(fn) && !(e)) \
			(e) = E2BIG; \
		if (e) \
		{ \
			errno = (e); \
			return (size_t)(-1); \
		} \
		return (n); \
	} while (0)
70
71
typedef struct Map_s
72
{
73
char* name;
74
const unsigned char* map;
75
_ast_iconv_f fun;
76
int index;
77
} Map_t;
78
79
typedef struct Conv_s
80
{
81
iconv_t cvt;
82
char* buf;
83
size_t size;
84
Map_t from;
85
Map_t to;
86
} Conv_t;
87
88
/*
 * closed descriptors are parked here for reuse by _ast_iconv_open
 * freeindex is the slot most recently filled by _ast_iconv_close
 */

static Conv_t*		freelist[4];
static int		freeindex;

/*
 * _ast_iconv_open maps "local" and "native" (any case), the empty
 * string, "-" and a null name to name_native and then compares by address
 */

static const char	name_local[] = "local";
static const char	name_native[] = "native";
93
94
static const _ast_iconv_list_t codes[] =
95
{
96
{
97
"utf",
98
"un|unicode|utf",
99
"multibyte 8-bit unicode",
100
"UTF-%s",
101
"8",
102
CC_UTF,
103
},
104
105
{
106
"ume",
107
"um|ume|utf?(-)7",
108
"multibyte 7-bit unicode",
109
"UTF-7",
110
0,
111
CC_UME,
112
},
113
114
{
115
"euc",
116
"(big|euc)*",
117
"euc family",
118
0,
119
0,
120
CC_ICONV,
121
},
122
123
{
124
"dos",
125
"dos?(-)?(855)",
126
"dos code page",
127
"DOS855",
128
0,
129
CC_ICONV,
130
},
131
132
{
133
"ucs",
134
"ucs?(-)?(2)?(be)|utf-16?(be)",
135
"unicode runes",
136
"UCS-%s",
137
"2",
138
CC_UCS,
139
},
140
141
{
142
"ucs-le",
143
"ucs?(-)?(2)le|utf-16le",
144
"little endian unicode runes",
145
"UCS-%sLE",
146
"2",
147
CC_SCU,
148
},
149
150
{ 0 },
151
};
152
153
#if _UWIN
154
155
#include <ast_windows.h>
156
157
/*
 * code page value used in this file to mean ucs-2
 */

#ifndef CP_UCS2
#define CP_UCS2		0x0000
#endif

/*
 * directory of charset descriptions read by _win_codeset
 * and listed by _ast_iconv_list
 */

static char	_win_maps[] = "/reg/local_machine/SOFTWARE/Classes/MIME/Database/Charset";
162
163
/*
164
* return the codeset index given its name or alias
165
* the map is in the what? oh, the registry
166
*/
167
168
static int
169
_win_codeset(const char* name)
170
{
171
register char* s;
172
char* e;
173
int n;
174
Sfio_t* sp;
175
char aka[128];
176
char tmp[128];
177
178
#if DEBUG_TRACE
179
error(DEBUG_TRACE, "AHA#%d _win_codeset name=%s", __LINE__, name);
180
#endif
181
if (name == name_native)
182
return CP_ACP;
183
if (!strcasecmp(name, "utf") || !strcasecmp(name, "utf8") || !strcasecmp(name, "utf-8"))
184
return CP_UTF8;
185
if (!strcasecmp(name, "ucs") || !strcasecmp(name, "ucs2") || !strcasecmp(name, "ucs-2"))
186
return CP_UCS2;
187
if (name[0] == '0' && name[1] == 'x' && (n = strtol(name, &e, 0)) > 0 && !*e)
188
return n;
189
for (;;)
190
{
191
sfsprintf(tmp, sizeof(tmp), "%s/%s", _win_maps, name);
192
if (!(sp = sfopen(0, tmp, "r")))
193
{
194
s = (char*)name;
195
if ((s[0] == 'c' || s[0] == 'C') && (s[1] == 'p' || s[1] == 'P'))
196
s += 2;
197
if (!isdigit(s[0]))
198
break;
199
sfsprintf(tmp, sizeof(tmp), "%s/windows-%s", _win_maps, s);
200
if (!(sp = sfopen(0, tmp, "r")))
201
break;
202
}
203
for (;;)
204
{
205
if (!(s = sfgetr(sp, '\n', 0)))
206
{
207
sfclose(sp);
208
return -1;
209
}
210
if (!strncasecmp(s, "AliasForCharSet=", 16))
211
{
212
n = sfvalue(sp) - 17;
213
s += 16;
214
if (n >= sizeof(aka))
215
n = sizeof(aka) - 1;
216
memcpy(aka, s, n);
217
aka[n] = 0;
218
sfclose(sp);
219
name = (const char*)aka;
220
break;
221
}
222
if (!strncasecmp(s, "CodePage=", 9))
223
{
224
s += 9;
225
n = strtol(s, 0, 0);
226
sfclose(sp);
227
return n;
228
}
229
}
230
}
231
return -1;
232
}
233
234
/*
235
* get and check the codeset indices
236
*/
237
238
static _ast_iconv_t
239
_win_iconv_open(register Conv_t* cc, const char* t, const char* f)
240
{
241
#if DEBUG_TRACE
242
error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=%s t=%s\n", __LINE__, f, t);
243
#endif
244
if ((cc->from.index = _win_codeset(f)) < 0)
245
return (_ast_iconv_t)(-1);
246
if ((cc->to.index = _win_codeset(t)) < 0)
247
return (_ast_iconv_t)(-1);
248
#if DEBUG_TRACE
249
error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=0x%04x t=0x%04x\n", __LINE__, cc->from.index, cc->to.index);
250
#endif
251
return (_ast_iconv_t)cc;
252
}
253
254
/*
255
* even though the indices already check out
256
* they could still be rejected
257
*/
258
259
static size_t
260
_win_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
261
{
262
Conv_t* cc = (Conv_t*)cd;
263
size_t un;
264
size_t tz;
265
size_t fz;
266
size_t bz;
267
size_t pz;
268
size_t oz;
269
LPWSTR ub;
270
271
#if DEBUG_TRACE
272
error(DEBUG_TRACE, "AHA#%d _win_iconv from=0x%04x to=0x%04x\n", __LINE__, cc->from.index, cc->to.index);
273
#endif
274
if (cc->from.index == cc->to.index || cc->from.index != CP_UCS2 && cc->to.index == 0)
275
{
276
/*
277
* easy
278
*/
279
280
fz = tz = (*fn < *tn) ? *fn : *tn;
281
memcpy(*tb, *fb, fz);
282
}
283
else
284
{
285
ub = 0;
286
un = *fn;
287
288
/*
289
* from => ucs-2
290
*/
291
292
if (cc->to.index == CP_UCS2)
293
{
294
if ((tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, *tn)) && tz <= *tn)
295
{
296
fz = *fn;
297
tz *= sizeof(WCHAR);
298
}
299
else
300
{
301
/*
302
* target too small
303
* binary search on input size to make it fit
304
*/
305
306
oz = 0;
307
pz = *fn / 2;
308
fz = *fn - pz;
309
for (;;)
310
{
311
while (!(tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)fz, (LPWSTR)*tb, 0)))
312
if (++fz >= *fn)
313
goto nope;
314
tz *= sizeof(WCHAR);
315
if (tz == *tn)
316
break;
317
if (!(pz /= 2))
318
{
319
if (!(fz = oz))
320
goto nope;
321
break;
322
}
323
if (tz > *tn)
324
fz -= pz;
325
else
326
{
327
oz = fz;
328
fz += pz;
329
}
330
}
331
}
332
}
333
else
334
{
335
if (cc->from.index == CP_UCS2)
336
{
337
un = *fn / sizeof(WCHAR);
338
ub = (LPWSTR)*fb;
339
}
340
else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, 0)))
341
goto nope;
342
else if (!(ub = (LPWSTR)malloc(un * sizeof(WCHAR))))
343
goto nope;
344
else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)ub, un)))
345
goto nope;
346
347
/*
348
* ucs-2 => to
349
*/
350
351
if (tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, un, *tb, *tn, 0, 0))
352
fz = *fn;
353
else
354
{
355
/*
356
* target too small
357
* binary search on input size to make it fit
358
*/
359
360
oz = 0;
361
pz = *fn / 2;
362
bz = *fn - pz;
363
for (;;)
364
{
365
while (!(fz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)bz, (LPWSTR)ub, un)))
366
if (++bz > *fn)
367
goto nope;
368
if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, 0, 0, 0)))
369
goto nope;
370
if (tz == *tn)
371
break;
372
if (!(pz /= 2))
373
{
374
if (!(fz = oz))
375
goto nope;
376
break;
377
}
378
if (tz > *tn)
379
bz -= pz;
380
else
381
{
382
oz = bz;
383
bz += pz;
384
}
385
}
386
if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, tz, 0, 0)))
387
goto nope;
388
#if DEBUG_TRACE
389
error(DEBUG_TRACE, "AHA#%d _win_iconv *fn=%u fz=%u[%u] *tn=%u tz=%u\n", __LINE__, *fn, fz, fz * sizeof(WCHAR), *tn, tz);
390
#endif
391
#if 0
392
fz *= sizeof(WCHAR);
393
#endif
394
}
395
if (ub != (LPWSTR)*fb)
396
free(ub);
397
}
398
}
399
*fb += fz;
400
*fn -= fz;
401
*tb += tz;
402
*tn -= tz;
403
return fz;
404
nope:
405
if (ub && ub != (LPWSTR)*fb)
406
free(ub);
407
errno = EINVAL;
408
return (size_t)(-1);
409
}
410
411
#endif
412
413
/*
414
* return canonical character code set name for m
415
* if b!=0 then canonical name placed in b of size n
416
* <ccode.h> index returned
417
*/
418
419
int
420
_ast_iconv_name(register const char* m, register char* b, size_t n)
421
{
422
register const _ast_iconv_list_t* cp;
423
const _ast_iconv_list_t* bp;
424
register int c;
425
register char* e;
426
ssize_t sub[2];
427
char buf[16];
428
#if DEBUG_TRACE
429
char* o;
430
#endif
431
432
if (!b)
433
{
434
b = buf;
435
n = sizeof(buf);
436
}
437
#if DEBUG_TRACE
438
o = b;
439
#endif
440
e = b + n - 1;
441
bp = 0;
442
n = 0;
443
cp = ccmaplist(NiL);
444
#if DEBUG_TRACE
445
if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name m=\"%s\"\n", error_info.id, error_info.trace, __LINE__, m);
446
#endif
447
for (;;)
448
{
449
#if DEBUG_TRACE
450
if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name n=%d bp=%p cp=%p ccode=%d name=\"%s\"\n", error_info.id, error_info.trace, __LINE__, n, bp, cp, cp->ccode, cp->name);
451
#endif
452
if (strgrpmatch(m, cp->match, sub, elementsof(sub) / 2, STR_MAXIMAL|STR_LEFT|STR_ICASE))
453
{
454
if (!(c = m[sub[1]]))
455
{
456
bp = cp;
457
break;
458
}
459
if (sub[1] > n && !isalpha(c))
460
{
461
bp = cp;
462
n = sub[1];
463
}
464
}
465
if (cp->ccode < 0)
466
{
467
if (!(++cp)->name)
468
break;
469
}
470
else if (!(cp = (const _ast_iconv_list_t*)ccmaplist((_ast_iconv_list_t*)cp)))
471
cp = codes;
472
}
473
if (cp = bp)
474
{
475
if (cp->canon)
476
{
477
if (cp->index)
478
{
479
for (m += sub[1]; *m && !isalnum(*m); m++);
480
if (!isdigit(*m))
481
m = cp->index;
482
}
483
else
484
m = "1";
485
b += sfsprintf(b, e - b, cp->canon, m);
486
}
487
else if (cp->ccode == CC_NATIVE)
488
{
489
if ((locales[AST_LC_CTYPE]->flags & LC_default) || !locales[AST_LC_CTYPE]->charset || !(m = locales[AST_LC_CTYPE]->charset->code) || streq(m, "iso8859-1"))
490
switch (CC_NATIVE)
491
{
492
case CC_EBCDIC:
493
m = (const char*)"EBCDIC";
494
break;
495
case CC_EBCDIC_I:
496
m = (const char*)"EBCDIC-I";
497
break;
498
case CC_EBCDIC_O:
499
m = (const char*)"EBCDIC-O";
500
break;
501
default:
502
m = (const char*)"ISO-8859-1";
503
break;
504
}
505
b += sfsprintf(b, e - b, "%s", m);
506
}
507
*b = 0;
508
#if DEBUG_TRACE
509
if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, cp->ccode, o);
510
#endif
511
return cp->ccode;
512
}
513
while (b < e && (c = *m++))
514
{
515
if (islower(c))
516
c = toupper(c);
517
*b++ = c;
518
}
519
*b = 0;
520
#if DEBUG_TRACE
521
if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, CC_ICONV, o);
522
#endif
523
return CC_ICONV;
524
}
525
526
/*
527
* convert utf-8 to bin
528
*/
529
530
static size_t
531
utf2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
532
{
533
register unsigned char* f;
534
register unsigned char* fe;
535
register unsigned char* t;
536
register unsigned char* te;
537
register unsigned char* p;
538
register int c;
539
register int w;
540
size_t n;
541
int e;
542
543
e = 0;
544
f = (unsigned char*)(*fb);
545
fe = f + (*fn);
546
t = (unsigned char*)(*tb);
547
te = t + (*tn);
548
while (t < te && f < fe)
549
{
550
p = f;
551
c = *f++;
552
if (c & 0x80)
553
{
554
if (!(c & 0x40))
555
{
556
f = p;
557
e = EILSEQ;
558
break;
559
}
560
if (c & 0x20)
561
{
562
w = (c & 0x0F) << 12;
563
if (f >= fe)
564
{
565
f = p;
566
e = EINVAL;
567
break;
568
}
569
c = *f++;
570
if (c & 0x40)
571
{
572
f = p;
573
e = EILSEQ;
574
break;
575
}
576
w |= (c & 0x3F) << 6;
577
}
578
else
579
w = (c & 0x1F) << 6;
580
if (f >= fe)
581
{
582
f = p;
583
e = EINVAL;
584
break;
585
}
586
c = *f++;
587
w |= (c & 0x3F);
588
}
589
else
590
w = c;
591
*t++ = w;
592
}
593
*fn -= (char*)f - (*fb);
594
*fb = (char*)f;
595
*tn -= (n = (char*)t - (*tb));
596
*tb = (char*)t;
597
RETURN(e, n, fn);
598
}
599
600
/*
601
* convert bin to utf-8
602
*/
603
604
static size_t
605
bin2utf(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
606
{
607
register unsigned char* f;
608
register unsigned char* fe;
609
register unsigned char* t;
610
register unsigned char* te;
611
register int c;
612
wchar_t w;
613
size_t n;
614
int e;
615
616
e = 0;
617
f = (unsigned char*)(*fb);
618
fe = f + (*fn);
619
t = (unsigned char*)(*tb);
620
te = t + (*tn);
621
while (f < fe && t < te)
622
{
623
if (!mbwide())
624
{
625
c = 1;
626
w = *f;
627
}
628
else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
629
{
630
e = EINVAL;
631
break;
632
}
633
else if (!c)
634
c = 1;
635
if (!(w & ~0x7F))
636
*t++ = w;
637
else
638
{
639
if (!(w & ~0x7FF))
640
{
641
if (t >= (te - 2))
642
{
643
e = E2BIG;
644
break;
645
}
646
*t++ = 0xC0 + (w >> 6);
647
}
648
else if (!(w & ~0xffff))
649
{
650
if (t >= (te - 3))
651
{
652
e = E2BIG;
653
break;
654
}
655
*t++ = 0xE0 + (w >> 12);
656
*t++ = 0x80 + ((w >> 6 ) & 0x3F);
657
}
658
else
659
{
660
e = EILSEQ;
661
break;
662
}
663
*t++ = 0x80 + (w & 0x3F);
664
}
665
f += c;
666
}
667
*fn -= (n = (char*)f - (*fb));
668
*fb = (char*)f;
669
*tn -= (char*)t - (*tb);
670
*tb = (char*)t;
671
RETURN(e, n, fn);
672
}
673
674
/*
 * ume_D lists the characters that bin2ume emits as themselves
 * ume_M is the 64 character alphabet used inside a +...- run
 */

static const unsigned char ume_D[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?!\"#$%&*;<=>@[]^_`{|} \t\n";

static const unsigned char ume_M[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/*
 * lookup tables built by umeinit()
 *	ume_d[c]	1 if c is in ume_D, 0 otherwise
 *	ume_m[c]	index of c in ume_M, NOE if c is not in ume_M
 */

static unsigned char ume_d[UCHAR_MAX+1];

static unsigned char ume_m[UCHAR_MAX+1];

#define NOE			0xFF
#define UMEINIT()		(ume_d[ume_D[0]]?0:umeinit())

/*
 * initialize the ume tables
 * a non-zero ume_d[ume_D[0]] means the tables are already built
 * always returns 0
 */

static int
umeinit(void)
{
	const unsigned char*	s;
	int			i;
	int			c;

	if (!ume_d[ume_D[0]])
	{
		s = ume_D;
		while ((c = *s++))
			ume_d[c] = 1;
		memset(ume_m, NOE, sizeof(ume_m));
		for (i = 0; (c = ume_M[i]); i++)
			ume_m[c] = i;
	}
	return 0;
}
709
710
/*
711
* convert utf-7 to bin
712
*/
713
714
static size_t
715
ume2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
716
{
717
register unsigned char* f;
718
register unsigned char* fe;
719
register unsigned char* t;
720
register unsigned char* te;
721
register unsigned char* p;
722
register int s;
723
register int c;
724
register int w;
725
size_t n;
726
int e;
727
728
e = 0;
729
UMEINIT();
730
f = (unsigned char*)(*fb);
731
fe = f + (*fn);
732
t = (unsigned char*)(*tb);
733
te = t + (*tn);
734
s = 0;
735
while (f < fe && t < te)
736
{
737
p = f;
738
c = *f++;
739
if (s)
740
{
741
if (c == '-' && s > 1)
742
s = 0;
743
else if ((w = ume_m[c]) == NOE)
744
{
745
s = 0;
746
*t++ = c;
747
}
748
else if (f >= (fe - 2))
749
{
750
f = p;
751
e = EINVAL;
752
break;
753
}
754
else
755
{
756
s = 2;
757
w = (w << 6) | ume_m[*f++];
758
w = (w << 6) | ume_m[*f++];
759
if (!(w & ~0xFF))
760
*t++ = w;
761
else if (t >= (te - 1))
762
{
763
f = p;
764
e = E2BIG;
765
break;
766
}
767
else
768
{
769
*t++ = (w >> 8) & 0xFF;
770
*t++ = w & 0xFF;
771
}
772
}
773
}
774
else if (c == '+')
775
s = 1;
776
else
777
*t++ = c;
778
}
779
*fn -= (char*)f - (*fb);
780
*fb = (char*)f;
781
*tn -= (n = (char*)t - (*tb));
782
*tb = (char*)t;
783
RETURN(e, n, fn);
784
}
785
786
/*
787
* convert bin to utf-7
788
*/
789
790
static size_t
791
bin2ume(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
792
{
793
register unsigned char* f;
794
register unsigned char* fe;
795
register unsigned char* t;
796
register unsigned char* te;
797
register int c;
798
register int s;
799
wchar_t w;
800
size_t n;
801
int e;
802
803
e = 0;
804
UMEINIT();
805
f = (unsigned char*)(*fb);
806
fe = f + (*fn);
807
t = (unsigned char*)(*tb);
808
te = t + (*tn);
809
s = 0;
810
while (f < fe && t < (te - s))
811
{
812
if (!mbwide())
813
{
814
c = 1;
815
w = *f;
816
}
817
else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
818
{
819
e = EINVAL;
820
break;
821
}
822
else if (!c)
823
c = 1;
824
if (!(w & ~0x7F) && ume_d[w])
825
{
826
if (s)
827
{
828
s = 0;
829
*t++ = '-';
830
}
831
*t++ = w;
832
}
833
else if (t >= (te - (4 + s)))
834
{
835
e = E2BIG;
836
break;
837
}
838
else
839
{
840
if (!s)
841
{
842
s = 1;
843
*t++ = '+';
844
}
845
*t++ = ume_M[(w >> 12) & 0x3F];
846
*t++ = ume_M[(w >> 6) & 0x3F];
847
*t++ = ume_M[w & 0x3F];
848
}
849
f += c;
850
}
851
if (s)
852
*t++ = '-';
853
*fn -= (n = (char*)f - (*fb));
854
*fb = (char*)f;
855
*tn -= (char*)t - (*tb);
856
*tb = (char*)t;
857
RETURN(e, n, fn);
858
}
859
860
/*
861
* convert ucs-2 to bin with no byte swap
862
*/
863
864
static size_t
865
ucs2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
866
{
867
register unsigned char* f;
868
register unsigned char* fe;
869
register unsigned char* t;
870
register unsigned char* te;
871
register int w;
872
size_t n;
873
int e;
874
875
e = 0;
876
f = (unsigned char*)(*fb);
877
fe = f + (*fn);
878
t = (unsigned char*)(*tb);
879
te = t + (*tn);
880
while (f < (fe - 1) && t < te)
881
{
882
w = *f++;
883
w = (w << 8) | *f++;
884
if (!(w & ~0xFF))
885
*t++ = w;
886
else if (t >= (te - 1))
887
{
888
f -= 2;
889
e = E2BIG;
890
break;
891
}
892
else
893
{
894
*t++ = (w >> 8) & 0xFF;
895
*t++ = w & 0xFF;
896
}
897
}
898
*fn -= (char*)f - (*fb);
899
*fb = (char*)f;
900
*tn -= (n = (char*)t - (*tb));
901
*tb = (char*)t;
902
RETURN(e, n, fn);
903
}
904
905
/*
906
* convert bin to ucs-2 with no byte swap
907
*/
908
909
static size_t
910
bin2ucs(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
911
{
912
register unsigned char* f;
913
register unsigned char* fe;
914
register unsigned char* t;
915
register unsigned char* te;
916
register int c;
917
wchar_t w;
918
size_t n;
919
int e;
920
921
e = 0;
922
f = (unsigned char*)(*fb);
923
fe = f + (*fn);
924
t = (unsigned char*)(*tb);
925
te = t + (*tn);
926
while (f < fe && t < (te - 1))
927
{
928
if (!mbwide())
929
{
930
c = 1;
931
w = *f;
932
}
933
if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
934
{
935
e = EINVAL;
936
break;
937
}
938
else if (!c)
939
c = 1;
940
*t++ = (w >> 8) & 0xFF;
941
*t++ = w & 0xFF;
942
f += c;
943
}
944
*fn -= (n = (char*)f - (*fb));
945
*fb = (char*)f;
946
*tn -= (char*)t - (*tb);
947
*tb = (char*)t;
948
RETURN(e, n, fn);
949
}
950
951
/*
952
* convert ucs-2 to bin with byte swap
953
*/
954
955
static size_t
956
scu2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
957
{
958
register unsigned char* f;
959
register unsigned char* fe;
960
register unsigned char* t;
961
register unsigned char* te;
962
register int w;
963
size_t n;
964
int e;
965
966
e = 0;
967
f = (unsigned char*)(*fb);
968
fe = f + (*fn);
969
t = (unsigned char*)(*tb);
970
te = t + (*tn);
971
while (f < (fe - 1) && t < te)
972
{
973
w = *f++;
974
w = w | (*f++ << 8);
975
if (!(w & ~0xFF))
976
*t++ = w;
977
else if (t >= (te - 1))
978
{
979
f -= 2;
980
e = E2BIG;
981
break;
982
}
983
else
984
{
985
*t++ = (w >> 8) & 0xFF;
986
*t++ = w & 0xFF;
987
}
988
}
989
*fn -= (char*)f - (*fb);
990
*fb = (char*)f;
991
*tn -= (n = (char*)t - (*tb));
992
*tb = (char*)t;
993
RETURN(e, n, fn);
994
}
995
996
/*
997
* convert bin to ucs-2 with byte swap
998
*/
999
1000
static size_t
1001
bin2scu(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
1002
{
1003
register unsigned char* f;
1004
register unsigned char* fe;
1005
register unsigned char* t;
1006
register unsigned char* te;
1007
register int c;
1008
wchar_t w;
1009
size_t n;
1010
int e;
1011
1012
e = 0;
1013
f = (unsigned char*)(*fb);
1014
fe = f + (*fn);
1015
t = (unsigned char*)(*tb);
1016
te = t + (*tn);
1017
while (f < fe && t < (te - 1))
1018
{
1019
if (!mbwide())
1020
{
1021
c = 1;
1022
w = *f;
1023
}
1024
else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
1025
{
1026
e = EINVAL;
1027
break;
1028
}
1029
else if (!c)
1030
c = 1;
1031
*t++ = w & 0xFF;
1032
*t++ = (w >> 8) & 0xFF;
1033
f += c;
1034
}
1035
*fn -= (n = (char*)f - (*fb));
1036
*fb = (char*)f;
1037
*tn -= (char*)t - (*tb);
1038
*tb = (char*)t;
1039
RETURN(e, n, fn);
1040
}
1041
1042
/*
1043
* open a character code conversion map from f to t
1044
*/
1045
1046
_ast_iconv_t
1047
_ast_iconv_open(const char* t, const char* f)
1048
{
1049
register Conv_t* cc;
1050
int fc;
1051
int tc;
1052
int i;
1053
1054
char fr[64];
1055
char to[64];
1056
1057
#if DEBUG_TRACE
1058
error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s t=%s\n", __LINE__, f, t);
1059
#endif
1060
if (!t || !*t || *t == '-' && !*(t + 1) || !strcasecmp(t, name_local) || !strcasecmp(t, name_native))
1061
t = name_native;
1062
if (!f || !*f || *f == '-' && !*(f + 1) || !strcasecmp(t, name_local) || !strcasecmp(f, name_native))
1063
f = name_native;
1064
1065
/*
1066
* the ast identify is always (iconv_t)(0)
1067
*/
1068
1069
if (t == f)
1070
return (iconv_t)(0);
1071
fc = _ast_iconv_name(f, fr, sizeof(fr));
1072
tc = _ast_iconv_name(t, to, sizeof(to));
1073
#if DEBUG_TRACE
1074
error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s:%s:%d t=%s:%s:%d\n", __LINE__, f, fr, fc, t, to, tc);
1075
#endif
1076
if (fc != CC_ICONV && fc == tc || streq(fr, to))
1077
return (iconv_t)(0);
1078
1079
/*
1080
* first check the free list
1081
*/
1082
1083
for (i = 0; i < elementsof(freelist); i++)
1084
if ((cc = freelist[i]) && streq(to, cc->to.name) && streq(fr, cc->from.name))
1085
{
1086
freelist[i] = 0;
1087
#if _lib_iconv_open
1088
/*
1089
* reset the shift state if any
1090
*/
1091
1092
if (cc->cvt != (iconv_t)(-1))
1093
iconv(cc->cvt, NiL, NiL, NiL, NiL);
1094
#endif
1095
return cc;
1096
}
1097
1098
/*
1099
* allocate a new one
1100
*/
1101
1102
if (!(cc = newof(0, Conv_t, 1, strlen(to) + strlen(fr) + 2)))
1103
return (iconv_t)(-1);
1104
cc->to.name = (char*)(cc + 1);
1105
cc->from.name = strcopy(cc->to.name, to) + 1;
1106
strcpy(cc->from.name, fr);
1107
cc->cvt = (iconv_t)(-1);
1108
1109
/*
1110
* 8 bit maps are the easiest
1111
*/
1112
1113
if (fc >= 0 && tc >= 0)
1114
cc->from.map = ccmap(fc, tc);
1115
#if _lib_iconv_open
1116
else if ((cc->cvt = iconv_open(t, f)) != (iconv_t)(-1) || (cc->cvt = iconv_open(to, fr)) != (iconv_t)(-1))
1117
cc->from.fun = (_ast_iconv_f)iconv;
1118
#endif
1119
#if _UWIN
1120
else if ((cc->cvt = _win_iconv_open(cc, t, f)) != (_ast_iconv_t)(-1) || (cc->cvt = _win_iconv_open(cc, to, fr)) != (_ast_iconv_t)(-1))
1121
cc->from.fun = (_ast_iconv_f)_win_iconv;
1122
#endif
1123
else
1124
{
1125
switch (fc)
1126
{
1127
case CC_UTF:
1128
cc->from.fun = utf2bin;
1129
break;
1130
case CC_UME:
1131
cc->from.fun = ume2bin;
1132
break;
1133
case CC_UCS:
1134
cc->from.fun = ucs2bin;
1135
break;
1136
case CC_SCU:
1137
cc->from.fun = scu2bin;
1138
break;
1139
case CC_ASCII:
1140
break;
1141
default:
1142
if (fc < 0)
1143
goto nope;
1144
cc->from.map = ccmap(fc, CC_ASCII);
1145
break;
1146
}
1147
switch (tc)
1148
{
1149
case CC_UTF:
1150
cc->to.fun = bin2utf;
1151
break;
1152
case CC_UME:
1153
cc->to.fun = bin2ume;
1154
break;
1155
case CC_UCS:
1156
cc->to.fun = bin2ucs;
1157
break;
1158
case CC_SCU:
1159
cc->to.fun = bin2scu;
1160
break;
1161
case CC_ASCII:
1162
break;
1163
default:
1164
if (tc < 0)
1165
goto nope;
1166
cc->to.map = ccmap(CC_ASCII, tc);
1167
break;
1168
}
1169
}
1170
return (iconv_t)cc;
1171
nope:
1172
return (iconv_t)(-1);
1173
}
1174
1175
/*
1176
* close a character code conversion map
1177
*/
1178
1179
int
1180
_ast_iconv_close(_ast_iconv_t cd)
1181
{
1182
Conv_t* cc;
1183
Conv_t* oc;
1184
int i;
1185
int r = 0;
1186
1187
if (cd == (_ast_iconv_t)(-1))
1188
return -1;
1189
if (!(cc = (Conv_t*)cd))
1190
return 0;
1191
1192
/*
1193
* add to the free list
1194
*/
1195
1196
i = freeindex;
1197
for (;;)
1198
{
1199
if (++ i >= elementsof(freelist))
1200
i = 0;
1201
if (!freelist[i])
1202
break;
1203
if (i == freeindex)
1204
{
1205
if (++ i >= elementsof(freelist))
1206
i = 0;
1207
1208
/*
1209
* close the oldest
1210
*/
1211
1212
if (oc = freelist[i])
1213
{
1214
#if _lib_iconv_open
1215
if (oc->cvt != (iconv_t)(-1))
1216
r = iconv_close(oc->cvt);
1217
#endif
1218
if (oc->buf)
1219
free(oc->buf);
1220
free(oc);
1221
}
1222
break;
1223
}
1224
}
1225
freelist[freeindex = i] = cc;
1226
return r;
1227
}
1228
1229
/*
1230
* copy *fb size *fn to *tb size *tn
1231
* fb,fn tb,tn updated on return
1232
*/
1233
1234
size_t
1235
_ast_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
1236
{
1237
Conv_t* cc = (Conv_t*)cd;
1238
register unsigned char* f;
1239
register unsigned char* t;
1240
register unsigned char* e;
1241
register const unsigned char* m;
1242
register size_t n;
1243
char* b;
1244
char* tfb;
1245
size_t tfn;
1246
size_t i;
1247
1248
if (!fb || !*fb)
1249
{
1250
/* TODO: reset to the initial state */
1251
if (!tb || !*tb)
1252
return 0;
1253
/* TODO: write the initial state shift sequence */
1254
return 0;
1255
}
1256
n = *tn;
1257
if (cc)
1258
{
1259
if (cc->from.fun)
1260
{
1261
if (cc->to.fun)
1262
{
1263
if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0)))
1264
{
1265
errno = ENOMEM;
1266
return -1;
1267
}
1268
b = cc->buf;
1269
i = cc->size;
1270
tfb = *fb;
1271
tfn = *fn;
1272
if ((*cc->from.fun)(cc->cvt, &tfb, &tfn, &b, &i) == (size_t)(-1))
1273
return -1;
1274
tfn = b - cc->buf;
1275
tfb = cc->buf;
1276
n = (*cc->to.fun)(cc->cvt, &tfb, &tfn, tb, tn);
1277
i = tfb - cc->buf;
1278
*fb += i;
1279
*fn -= i;
1280
return n;
1281
}
1282
if ((*cc->from.fun)(cc->cvt, fb, fn, tb, tn) == (size_t)(-1))
1283
return -1;
1284
n -= *tn;
1285
if (m = cc->to.map)
1286
{
1287
e = (unsigned char*)(*tb);
1288
for (t = e - n; t < e; t++)
1289
*t = m[*t];
1290
}
1291
return n;
1292
}
1293
else if (cc->to.fun)
1294
{
1295
if (!(m = cc->from.map))
1296
return (*cc->to.fun)(cc->cvt, fb, fn, tb, tn);
1297
if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0)))
1298
{
1299
errno = ENOMEM;
1300
return -1;
1301
}
1302
if ((n = *fn) > cc->size)
1303
n = cc->size;
1304
f = (unsigned char*)(*fb);
1305
e = f + n;
1306
t = (unsigned char*)(b = cc->buf);
1307
while (f < e)
1308
*t++ = m[*f++];
1309
n = (*cc->to.fun)(cc->cvt, &b, fn, tb, tn);
1310
*fb += b - cc->buf;
1311
return n;
1312
}
1313
}
1314
if (n > *fn)
1315
n = *fn;
1316
if (cc && (m = cc->from.map))
1317
{
1318
f = (unsigned char*)(*fb);
1319
e = f + n;
1320
t = (unsigned char*)(*tb);
1321
while (f < e)
1322
*t++ = m[*f++];
1323
}
1324
else
1325
memcpy(*tb, *fb, n);
1326
*fb += n;
1327
*fn -= n;
1328
*tb += n;
1329
*tn -= n;
1330
return n;
1331
}
1332
1333
/*
 * "no offset recorded" value for fe in _ast_iconv_move
 */

#define OK		((size_t)-1)
1334
1335
/*
1336
* write *fb size *fn to op
1337
* fb,fn updated on return
1338
* total bytes written to op returned
1339
*/
1340
1341
ssize_t
1342
_ast_iconv_write(_ast_iconv_t cd, Sfio_t* op, char** fb, size_t* fn, Iconv_disc_t* disc)
1343
{
1344
char* fo = *fb;
1345
char* tb;
1346
char* ts;
1347
size_t* e;
1348
size_t tn;
1349
size_t r;
1350
int ok;
1351
Iconv_disc_t compat;
1352
1353
/*
1354
* the old api had optional size_t* instead of Iconv_disc_t*
1355
*/
1356
1357
if (!disc || disc->version < 20110101L || disc->version >= 30000101L)
1358
{
1359
e = (size_t*)disc;
1360
disc = &compat;
1361
iconv_init(disc, 0);
1362
}
1363
else
1364
e = 0;
1365
r = 0;
1366
tn = 0;
1367
ok = 1;
1368
while (ok && *fn > 0)
1369
{
1370
if (!(tb = (char*)sfreserve(op, -(tn + 1), SF_WRITE|SF_LOCKR)) || !(tn = sfvalue(op)))
1371
{
1372
if (!r)
1373
r = -1;
1374
break;
1375
}
1376
ts = tb;
1377
#if DEBUG_TRACE
1378
error(DEBUG_TRACE, "AHA#%d iconv_write ts=%p tn=%d", __LINE__, ts, tn);
1379
for (;;)
1380
#else
1381
while (*fn > 0 && _ast_iconv(cd, fb, fn, &ts, &tn) == (size_t)(-1))
1382
#endif
1383
{
1384
#if DEBUG_TRACE
1385
ssize_t _r;
1386
error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d `%-.*s'", __LINE__, *fn, tn, *fn, *fb);
1387
_r = _ast_iconv(cd, fb, fn, &ts, &tn);
1388
error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d [%d]", __LINE__, *fn, tn, _r);
1389
if (_r != (size_t)(-1) || !fn)
1390
break;
1391
#endif
1392
switch (errno)
1393
{
1394
case E2BIG:
1395
break;
1396
case EINVAL:
1397
if (disc->errorf)
1398
(*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "incomplete multibyte sequence at offset %I*u", sizeof(fo), *fb - fo);
1399
goto bad;
1400
default:
1401
if (disc->errorf)
1402
(*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "invalid multibyte sequence at offset %I*u", sizeof(fo), *fb - fo);
1403
bad:
1404
disc->errors++;
1405
if (!(disc->flags & ICONV_FATAL))
1406
{
1407
if (!(disc->flags & ICONV_OMIT) && tn > 0)
1408
{
1409
*ts++ = (disc->fill >= 0) ? disc->fill : **fb;
1410
tn--;
1411
}
1412
(*fb)++;
1413
(*fn)--;
1414
continue;
1415
}
1416
ok = 0;
1417
break;
1418
}
1419
break;
1420
}
1421
#if DEBUG_TRACE
1422
error(DEBUG_TRACE, "AHA#%d iconv_write %d", __LINE__, ts - tb);
1423
#endif
1424
sfwrite(op, tb, ts - tb);
1425
r += ts - tb;
1426
}
1427
if (e)
1428
*e = disc->errors;
1429
return r;
1430
}
1431
1432
/*
1433
* move n bytes from ip to op
1434
*/
1435
1436
ssize_t
1437
_ast_iconv_move(_ast_iconv_t cd, Sfio_t* ip, Sfio_t* op, size_t n, Iconv_disc_t* disc)
1438
{
1439
char* fb;
1440
char* fs;
1441
char* tb;
1442
char* ts;
1443
size_t* e;
1444
size_t fe;
1445
size_t fn;
1446
size_t fo;
1447
size_t ft;
1448
size_t tn;
1449
size_t i;
1450
ssize_t r = 0;
1451
int ok = 1;
1452
int locked;
1453
Iconv_disc_t compat;
1454
1455
/*
1456
* the old api had optional size_t* instead of Iconv_disc_t*
1457
*/
1458
1459
if (!disc || disc->version < 20110101L || disc->version >= 30000101L)
1460
{
1461
e = (size_t*)disc;
1462
disc = &compat;
1463
iconv_init(disc, 0);
1464
}
1465
else
1466
e = 0;
1467
tb = 0;
1468
fe = OK;
1469
ft = 0;
1470
fn = n;
1471
do
1472
{
1473
if (n != SF_UNBOUND)
1474
n = -((ssize_t)(n & (((size_t)(~0))>>1)));
1475
if ((!(fb = (char*)sfreserve(ip, n, locked = SF_LOCKR)) || !(fo = sfvalue(ip))) &&
1476
(!(fb = (char*)sfreserve(ip, n, locked = 0)) || !(fo = sfvalue(ip))))
1477
break;
1478
fs = fb;
1479
fn = fo;
1480
if (!(tb = (char*)sfreserve(op, SF_UNBOUND, SF_WRITE|SF_LOCKR)))
1481
{
1482
if (!r)
1483
r = -1;
1484
break;
1485
}
1486
ts = tb;
1487
tn = sfvalue(op);
1488
while (fn > 0 && _ast_iconv(cd, &fs, &fn, &ts, &tn) == (size_t)(-1))
1489
{
1490
switch (errno)
1491
{
1492
case E2BIG:
1493
break;
1494
case EINVAL:
1495
if (fe == ft + (fo - fn))
1496
{
1497
fe = OK;
1498
if (disc->errorf)
1499
(*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "incomplete multibyte sequence at offset %I*u", sizeof(ft), ft + (fo - fn));
1500
goto bad;
1501
}
1502
fe = ft;
1503
break;
1504
default:
1505
if (disc->errorf)
1506
(*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "invalid multibyte sequence at offset %I*u", sizeof(ft), ft + (fo - fn));
1507
bad:
1508
disc->errors++;
1509
if (!(disc->flags & ICONV_FATAL))
1510
{
1511
if (!(disc->flags & ICONV_OMIT) && tn > 0)
1512
{
1513
*ts++ = (disc->fill >= 0) ? disc->fill : *fs;
1514
tn--;
1515
}
1516
fs++;
1517
fn--;
1518
continue;
1519
}
1520
ok = 0;
1521
break;
1522
}
1523
break;
1524
}
1525
sfwrite(op, tb, ts - tb);
1526
r += ts - tb;
1527
ts = tb;
1528
if (locked)
1529
sfread(ip, fb, fs - fb);
1530
else
1531
for (i = fn; --i >= (fs - fb);)
1532
sfungetc(ip, fb[i]);
1533
if (n != SF_UNBOUND)
1534
{
1535
if (n <= (fs - fb))
1536
break;
1537
n -= fs - fb;
1538
}
1539
ft += (fs - fb);
1540
if (fn == fo)
1541
fn++;
1542
} while (ok);
1543
if (fb && locked)
1544
sfread(ip, fb, 0);
1545
if (tb)
1546
{
1547
sfwrite(op, tb, 0);
1548
if (ts > tb)
1549
{
1550
sfwrite(op, tb, ts - tb);
1551
r += ts - tb;
1552
}
1553
}
1554
if (e)
1555
*e = disc->errors;
1556
return r;
1557
}
1558
1559
/*
1560
* iconv_list_t iterator
1561
* call with arg 0 to start
1562
* prev return value is current arg
1563
*/
1564
1565
_ast_iconv_list_t*
1566
_ast_iconv_list(_ast_iconv_list_t* cp)
1567
{
1568
#if _UWIN
1569
struct dirent* ent;
1570
1571
if (!cp)
1572
{
1573
if (!(cp = newof(0, _ast_iconv_list_t, 1, 0)))
1574
return ccmaplist(NiL);
1575
if (!(cp->data = opendir(_win_maps)))
1576
{
1577
free(cp);
1578
return ccmaplist(NiL);
1579
}
1580
}
1581
if (cp->data)
1582
{
1583
if (ent = readdir((DIR*)cp->data))
1584
{
1585
cp->name = cp->match = cp->desc = (const char*)ent->d_name;
1586
return cp;
1587
}
1588
closedir((DIR*)cp->data);
1589
free(cp);
1590
return ccmaplist(NiL);
1591
}
1592
#else
1593
if (!cp)
1594
return ccmaplist(NiL);
1595
#endif
1596
if (cp->ccode >= 0)
1597
return (cp = ccmaplist(cp)) ? cp : (_ast_iconv_list_t*)codes;
1598
return (++cp)->name ? cp : (_ast_iconv_list_t*)0;
1599
}
1600
1601