Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/cmd/builtin/tr.c
1808 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1992-2012 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <[email protected]> *
18
* David Korn <[email protected]> *
19
* *
20
***********************************************************************/
21
#pragma prototyped
22
/*
23
* David Korn
24
* Glenn Fowler
25
* AT&T Research
26
*
27
* tr
28
*/
29
30
static const char usage[] =
31
"[-?\n@(#)$Id: tr (AT&T Research) 2012-05-31 $\n]"
32
USAGE_LICENSE
33
"[+NAME?tr - translate, squeeze, and/or delete characters]"
34
"[+DESCRIPTION?\btr\b copies the standard input to the standard output"
35
" with substitution or deletion of selected characters. Input"
36
" characters in \aset1\a are mapped to corresponding characters"
37
" in \aset2\a.]"
38
39
"[c:complement?Complement \aset1\a.]"
40
"[d:delete?Delete characters in \aset1\a but do not translate.]"
41
"[s:squeeze-repeats?Replace sequences of the same character with one.]"
42
"[t:truncate-set1?Truncate \aset1\a to the length of \aset2\a.]"
43
44
"[+?\asets\a are specified as strings of characters. Most represent"
45
" themselves. Interpreted sequences are:]{"
46
" [+\\nnn?character with octal value \annn\a]"
47
" [+\\xnn?character with hexadecimal value \ann\a]"
48
" [+\\\\?backslash]"
49
" [+\\a?alert]"
50
" [+\\b?backpace]"
51
" [+\\f?form feed]"
52
" [+\\r?return]"
53
" [+\\t?horizontal tab]"
54
" [+\\v?vertical tab]"
55
" [+\\E?escape]"
56
" [+c1-c2?all characters from \ac1\a to \ac2\a in ascending order]"
57
" [+[c1-c2]]?same as \ac1-c2\a if both \asets\a use this form]"
58
" [+[[c*]]]]?in \aset2\a, copies of \\ac\\a until length of \aset1\a]"
59
" [+[[c*n]]]]?\\an\\a copies of \\ac\\a]"
60
" [+[[::alnum::]]]]?all letters and digits]"
61
" [+[[::alpha::]]]]?all letters]"
62
" [+[[::blank::]]]]?all horizontal whitespace]"
63
" [+[[::cntrl::]]]]?all control characters]"
64
" [+[[::digit::]]]]?all digits]"
65
" [+[[::graph::]]]]?all printable characters, not including space]"
66
" [+[[::lower::]]]]?all lower case letters]"
67
" [+[[::print::]]]]?all printable characters, including space]"
68
" [+[[::punct::]]]]?all punctuation characters]"
69
" [+[[::space::]]]]?all horizontal or vertical whitespace]"
70
" [+[[::upper::]]]]?all upper case letters]"
71
" [+[[::xdigit::]]]]?all hexadecimal digits]"
72
" [+[[=c=]]]]?all characters which are equivalent to \\ac\\a]"
73
" }"
74
"[+?Translation occurs if \b-d\b is not given and both \aset1\a"
75
" and \aset2\a appear. \b-t\b may be used only when translating."
76
" \aset2\a is extended to the length of \aset1\a by repeating its last"
77
" character as necessary. Excess characters in \aset2\a are ignored."
78
" Only [:lower:]] and [:upper:]] are guaranteed to expand in ascending"
79
" order. They may only be used in pairs to specify case conversion."
80
" \b-s\b uses \aset1\a if neither translating nor deleting, otherwise"
81
" squeeze uses \aset2\a and occurs after translation or deletion.]"
82
83
"\n"
84
"\n[ set1 [ set2 ] ]\n"
85
"\n"
86
"[+SEE ALSO?\bsed\b(1), \bascii\b(5)]"
87
;
88
89
#include <cmd.h>
90
#include <ctype.h>
91
#include <error.h>
92
#include <regex.h>
93
94
/*
 * option flags accepted by tropen(); b_tr() sets one per command line option
 */

#define TR_COMPLEMENT	(1<<0)
#define TR_DELETE	(1<<1)
#define TR_SQUEEZE	(1<<2)
#define TR_TRUNCATE	(1<<3)

/*
 * flag bits kept in Tr_t.code[] entries above the character value
 * (bit CHAR_BIT itself is left unused)
 *
 *	HITBIT	tropen() marks every character produced by set1
 *	DELBIT	trcopy() drops input characters whose entry has it
 *	ONEBIT	squeeze: trcopy() remembers (code|ONEBIT) as the previous
 *		output, so only an entry that carries ONEBIT can compare
 *		equal to it and be suppressed when repeated
 */

#define HITBIT		(1<<(CHAR_BIT+1))
#define DELBIT		(1<<(CHAR_BIT+2))
#define ONEBIT		(1<<(CHAR_BIT+3))

/*
 * reset the string scanning state of handle p to start at string s
 * t is stored in p->type (0 source, 1 destination, see typename[])
 * the value of the expression is s
 */

#define setchar(p,s,t)	((p)->type=(t),(p)->prev=(p)->last=(-1),(p)->isit=0,(p)->count=0,(p)->base=(p)->next=(s))
104
105
typedef struct
106
{
107
int code[1<<CHAR_BIT];
108
int convert;
109
int count;
110
int prev;
111
int last;
112
int level;
113
int position;
114
int src;
115
int dst;
116
int type;
117
int truncate;
118
regclass_t isit;
119
unsigned char* base;
120
unsigned char* next;
121
unsigned char* hold;
122
} Tr_t;
123
124
static const char* typename[] = { "source", "destination" };
125
126
/*
127
* return next string character
128
* the string pointer is advanced
129
* returns -1 for end of string
130
* returns -2 for string format error
131
*/
132
133
static int
134
nextchar(register Tr_t* tr)
135
{
136
register int c;
137
int q;
138
unsigned char* e;
139
regclass_t f;
140
wchar_t wc;
141
char buf[32];
142
143
/*
144
* tr.count>0 when tr.type==1 string contains x*count
145
*/
146
147
if (tr->count)
148
{
149
if (tr->count > 0)
150
tr->count--;
151
return tr->prev;
152
}
153
154
/*
155
* tr.last>=0 when string contains char class
156
*/
157
158
next:
159
if (tr->last >= 0)
160
{
161
while (++tr->prev <= tr->last)
162
if (!tr->isit || (*tr->isit)(tr->prev))
163
return (!tr->type || !tr->convert) ? tr->prev : tr->convert == 'l' ? tolower(tr->prev) : toupper(tr->prev);
164
tr->last = -1;
165
tr->hold = tr->next + 1;
166
}
167
switch (c = *tr->next++)
168
{
169
case 0:
170
tr->next--;
171
c = tr->level ? -2 : tr->type && !tr->truncate ? tr->prev : -1;
172
break;
173
case '\\':
174
c = chresc((char*)tr->next - 1, (char**)&tr->next);
175
break;
176
case '[':
177
switch (*tr->next)
178
{
179
case ':':
180
f = 0;
181
if (tr->convert)
182
{
183
c = *(tr->next + 1);
184
if (tr->convert == c || tr->type && !tr->position)
185
{
186
c = *tr->next;
187
goto member;
188
}
189
else if (!strncmp((char*)tr->next, ":lower:", 7) || !strncmp((char*)tr->next, ":upper:", 7))
190
{
191
f = tr->isit;
192
tr->convert = c;
193
c = tr->next - tr->base;
194
if (!tr->type)
195
tr->position = c;
196
else if (tr->position != c)
197
return -2;
198
}
199
}
200
if (!(tr->isit = regclass((char*)tr->next, (char**)&e)))
201
{
202
if (f)
203
tr->isit = f;
204
c = ':';
205
goto member;
206
}
207
tr->next = e;
208
if (f)
209
tr->isit = f;
210
tr->prev = -1;
211
tr->last = UCHAR_MAX + 1;
212
return nextchar(tr);
213
case '.':
214
case '=':
215
if ((q = regcollate((char*)tr->next, (char**)&e, buf, sizeof(buf), &wc)) >= 0)
216
{
217
tr->next = e;
218
c = q ? buf[0] : 0;
219
break;
220
}
221
/*FALLTHROUGH*/
222
member:
223
if (*(e = tr->next + 1))
224
{
225
while (*++e && *e != c && *e != ']');
226
if (*e != ']' && *++e == ']')
227
return -2;
228
}
229
default:
230
if (!tr->level)
231
{
232
tr->level++;
233
c = nextchar(tr);
234
if (*tr->next == '*')
235
{
236
e = tr->next + 1;
237
if (!(tr->count = (int)strtol((char*)tr->next + 1, (char**)&tr->next, 0)) && tr->next == e)
238
{
239
if (tr->type == 0)
240
return -2;
241
tr->count = -1;
242
}
243
if (*tr->next++ != ']')
244
return -2;
245
if (tr->count < 0)
246
{
247
/*
248
* tr->src chars total
249
* tr->dst chars so far
250
* count what's left
251
*/
252
253
Tr_t peek;
254
255
peek = *tr;
256
peek.count = 0;
257
peek.last = -1;
258
while (nextchar(&peek) >= 0)
259
peek.dst++;
260
tr->count = tr->src - peek.dst;
261
}
262
else if (tr->count > (1<<CHAR_BIT))
263
tr->count = (1<<CHAR_BIT);
264
if (!tr->count)
265
goto next;
266
tr->count--;
267
tr->level--;
268
}
269
}
270
break;
271
}
272
break;
273
case '-':
274
if (tr->prev >= 0 && tr->next != tr->hold && *tr->next)
275
{
276
c = tr->prev;
277
tr->last = nextchar(tr);
278
if (c > tr->last)
279
return -2;
280
tr->prev = c;
281
goto next;
282
}
283
break;
284
case ']':
285
if (tr->level > 0 && tr->next > tr->base + 2)
286
{
287
tr->level--;
288
c = nextchar(tr);
289
}
290
break;
291
}
292
return tr->prev = c;
293
}
294
295
/*
296
* return a tr handle for <src,dst>
297
*/
298
299
static Tr_t*
300
tropen(unsigned char* src, unsigned char* dst, int flags)
301
{
302
register Tr_t* tr;
303
register int c;
304
register int n;
305
register int x;
306
register int squeeze;
307
unsigned int set[1<<(CHAR_BIT+1)];
308
309
if (!(tr = newof(0, Tr_t, 1, 0)))
310
{
311
error(2, "out of space [code]");
312
return 0;
313
}
314
switch (flags & (TR_DELETE|TR_SQUEEZE))
315
{
316
case TR_DELETE:
317
case TR_SQUEEZE:
318
case TR_DELETE|TR_SQUEEZE:
319
break;
320
default:
321
tr->convert = '?';
322
break;
323
}
324
tr->truncate = flags & TR_TRUNCATE;
325
if (dst && !*dst)
326
dst = 0;
327
squeeze = (flags & TR_SQUEEZE) ? ONEBIT : 0;
328
for (n = 0; n < (1<<CHAR_BIT); n++)
329
tr->code[n] = n;
330
n = 0;
331
if (src)
332
{
333
setchar(tr, src, 0);
334
while ((c = nextchar(tr)) >=0 && n < elementsof(set))
335
{
336
tr->code[c] |= HITBIT;
337
#if DEBUG_TRACE
338
error(-1, "src %d '%c'", n, c);
339
#endif
340
set[n++] = c;
341
}
342
if (c < -1)
343
goto bad;
344
}
345
tr->src = n;
346
if (flags & TR_COMPLEMENT)
347
{
348
for (n = c = 0; n < (1<<CHAR_BIT); n++)
349
if (!(tr->code[n] & HITBIT))
350
set[c++] = n;
351
tr->src = c;
352
}
353
if (tr->convert == '?')
354
tr->convert = 0;
355
setchar(tr, dst, 1);
356
for (tr->dst = 0; tr->dst < tr->src; tr->dst++)
357
{
358
c = set[tr->dst];
359
if (flags & TR_DELETE)
360
tr->code[c] |= DELBIT;
361
else if (dst)
362
{
363
if ((x = nextchar(tr)) >= 0)
364
{
365
#if DEBUG_TRACE
366
error(-1, "dst %d '%c' => '%c'", tr->dst, c, x);
367
#endif
368
tr->code[c] = x | squeeze;
369
}
370
else if (x < -1)
371
goto bad;
372
else if (tr->truncate)
373
{
374
while (tr->dst < tr->src)
375
{
376
c = set[tr->dst++];
377
tr->code[c] = c | squeeze;
378
}
379
break;
380
}
381
}
382
else
383
{
384
x = squeeze ? c : 0;
385
tr->code[c] = x | squeeze;
386
}
387
}
388
if ((flags & (TR_DELETE|TR_SQUEEZE)) == (TR_DELETE|TR_SQUEEZE))
389
{
390
tr->truncate = 1;
391
for (tr->dst = 0; (x = nextchar(tr)) >= 0; tr->dst++)
392
if (!(tr->code[x] & DELBIT))
393
{
394
#if DEBUG_TRACE
395
error(-1, "dst %d '%c'", tr->dst, x);
396
#endif
397
tr->code[x] = x | ONEBIT;
398
}
399
if (x < -1)
400
goto bad;
401
}
402
return tr;
403
bad:
404
error(2, "%s: invalid %s string", tr->base, typename[tr->type]);
405
free(tr);
406
return 0;
407
}
408
409
/*
410
* close a tr handle
411
*/
412
413
void
414
trclose(Tr_t* tr)
415
{
416
free(tr);
417
}
418
419
/*
420
* tr each char of ip and put results to op
421
* stop after <ncopy> bytes are written
422
*/
423
424
static ssize_t
425
trcopy(Tr_t* tr, Sfio_t* ip, Sfio_t* op, ssize_t ncopy)
426
{
427
register int c;
428
register int oldc = -1;
429
register int* code = tr->code;
430
register unsigned char* inp = 0;
431
register unsigned char* outp = 0;
432
register unsigned char* inend;
433
register unsigned char* outend = 0;
434
register ssize_t nwrite = 0;
435
unsigned char* inbuff = 0;
436
unsigned char* outbuff = 0;
437
438
while (nwrite != ncopy)
439
{
440
if (!(inbuff = (unsigned char*)sfreserve(ip, SF_UNBOUND, SF_LOCKR)))
441
{
442
if (sfvalue(ip))
443
{
444
error(2, ERROR_SYSTEM|2, "read error");
445
return -1;
446
}
447
break;
448
}
449
c = sfvalue(ip);
450
inend = (inp = inbuff) + c;
451
452
/*
453
* process the next input buffer
454
*/
455
456
while (inp < inend)
457
{
458
if (outp >= outend)
459
{
460
/*
461
* write out current buffer
462
*/
463
464
if ((c = outp - outbuff) > 0)
465
{
466
if ((nwrite += c) == ncopy)
467
break;
468
sfwrite(op, outbuff, c);
469
}
470
471
/*
472
* get write buffer space
473
*/
474
475
if (!(outbuff = (unsigned char*)sfreserve(op, (ncopy < 0) ? SF_UNBOUND : (ncopy - nwrite), SF_LOCKR)))
476
break;
477
outend = (outp = outbuff) + sfvalue(op);
478
}
479
c = code[*inp++];
480
if (!(c & DELBIT) && c != oldc)
481
{
482
*outp++ = c;
483
oldc = c | ONEBIT;
484
}
485
}
486
sfread(ip, inbuff, inp - inbuff);
487
inp = inbuff;
488
}
489
if (inbuff && (c = inp - inbuff) > 0)
490
sfread(ip, inbuff, c);
491
if (outbuff && (c = outp - outbuff) >= 0)
492
sfwrite(op, outbuff, c);
493
if (sfsync(op))
494
{
495
if (!ERROR_PIPE(errno))
496
error(ERROR_SYSTEM|2, "write error [%d]", c);
497
return -1;
498
}
499
return nwrite;
500
}
501
502
int
503
b_tr(int argc, char** argv, Shbltin_t* context)
504
{
505
register int flags = 0;
506
Tr_t* tr;
507
508
cmdinit(argc, argv, context, ERROR_CATALOG, 0);
509
flags = 0;
510
for (;;)
511
{
512
switch (optget(argv, usage))
513
{
514
case 'c':
515
flags |= TR_COMPLEMENT;
516
continue;
517
case 'd':
518
flags |= TR_DELETE;
519
continue;
520
case 's':
521
flags |= TR_SQUEEZE;
522
continue;
523
case 't':
524
flags |= TR_TRUNCATE;
525
continue;
526
case ':':
527
error(2, "%s", opt_info.arg);
528
continue;
529
case '?':
530
error(ERROR_USAGE|4, "%s", opt_info.arg);
531
continue;
532
}
533
break;
534
}
535
argv += opt_info.index;
536
if (error_info.errors)
537
error(ERROR_USAGE|4, "%s", optusage(NiL));
538
if (tr = tropen((unsigned char*)argv[0], (unsigned char*)argv[0] ? (unsigned char*)argv[1] : (unsigned char*)0, flags))
539
{
540
trcopy(tr, sfstdin, sfstdout, SF_UNBOUND);
541
trclose(tr);
542
}
543
return error_info.errors != 0;
544
}
545
546