Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libcmd/cut.c
1808 views
1
/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1992-2012 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                 Eclipse Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*          http://www.eclipse.org/org/documents/epl-v10.html           *
*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                                                                      *
***********************************************************************/
21
#pragma prototyped
22
/*
 * David Korn
 * AT&T Bell Laboratories
 *
 * cut fields or columns from a file
 */
28
29
static const char usage[] =
30
"[-?\n@(#)$Id: cut (AT&T Research) 2010-08-11 $\n]"
31
USAGE_LICENSE
32
"[+NAME?cut - cut out selected columns or fields of each line of a file]"
33
"[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34
"from one or more files, contatenating them on standard output.]"
35
"[+?The option argument \alist\a is a comma-separated or blank-separated "
36
"list of positive numbers and ranges. Ranges can be of three "
37
"forms. The first is two positive integers separated by a hyphen "
38
"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39
"\ahigh\a. The second is a positive number preceded by a hyphen "
40
"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41
"\ahigh\a. The last is a positive number followed by a hyphen "
42
"(\alow\a\b-\b), which represents all fields from \alow\a to the "
43
"last field, inclusive. Elements in the \alist\a can be repeated, "
44
"can overlap, and can appear in any order. The order of the "
45
"output is that of the input.]"
46
"[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47
"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48
"cuts from standard input. The start of the file is defined "
49
"as the current offset.]"
50
"[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51
"[c:characters]:[list?\bcut\b based on a list of character counts.]"
52
"[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53
"to \adelim\a. The default is the \btab\b character.]"
54
"[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55
"character specified with the \b-d\b optiion.]"
56
"[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57
"[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58
"records of length \areclen\a when used with the \b-b\b or \b-c\b "
59
"option.]"
60
"[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61
"when used with the \b-f\b option. By default, lines with no "
62
"delimiters will be passsed in untouched.]"
63
"[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64
"the \b-f\b option is set to \aldelim\a. The default is the "
65
"\bnewline\b character.]"
66
"[N!:newline?Output new-lines at end of each record when used "
67
"with the \b-b\b or \b-c\b option.]"
68
"\n"
69
"\n[file ...]\n"
70
"\n"
71
"[+EXIT STATUS?]{"
72
"[+0?All files processed successfully.]"
73
"[+>0?One or more files failed to open or could not be read.]"
74
"}"
75
"[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76
;
77
78
#include <cmd.h>
79
#include <ctype.h>
80
81
/*
 * a field (-d) or line (-D) delimiter
 */
typedef struct Delim_s
{
	char*		str;	/* delimiter text as given on the command line */
	int		len;	/* number of bytes in the delimiter, 1 if single byte */
	int		chr;	/* delimiter character code */
} Delim_t;
87
88
typedef struct Cut_s
89
{
90
int mb;
91
int eob;
92
int cflag;
93
int nosplit;
94
int sflag;
95
int nlflag;
96
int reclen;
97
Delim_t wdelim;
98
Delim_t ldelim;
99
unsigned char space[UCHAR_MAX+1];
100
int list[2]; /* NOTE: must be last member */
101
} Cut_t;
102
103
#define HUGE		INT_MAX		/* terminates Cut_t.list */
#define BLOCK		(8*1024)	/* size handed to sftmp() by cutfields() */

/* mode bits collected by b_cut() */
#define C_BYTES		1		/* -b */
#define C_CHARS		2		/* -c */
#define C_FIELDS	4		/* -f */
#define C_SUPRESS	8		/* -s */
#define C_NOSPLIT	16		/* -n */
#define C_NONEWLINE	32		/* -N */

/* byte classes stored in Cut_t.space */
#define SP_LINE		1		/* line delimiter */
#define SP_WORD		2		/* field delimiter */
#define SP_WIDE		3		/* byte with the high bit set when mbwide() */
115
116
/*
 * qsort() comparison function
 * compares only the first int of each element
 * returns -1, 0 or 1
 */

static int
mycomp(const void* a, const void* b)
{
	const int	x = *(const int*)a;
	const int	y = *(const int*)b;

	return (x > y) - (x < y);
}
129
130
static Cut_t*
131
cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
132
{
133
register int* lp;
134
register int c;
135
register int n = 0;
136
register int range = 0;
137
register char* cp = str;
138
Cut_t* cut;
139
140
if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
141
error(ERROR_exit(1), "out of space");
142
if (cut->mb = mbwide())
143
{
144
memset(cut->space, 0, sizeof(cut->space) / 2);
145
memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
146
}
147
else
148
memset(cut->space, 0, sizeof(cut->space));
149
cut->wdelim = *wdelim;
150
if (wdelim->len == 1)
151
cut->space[wdelim->chr] = SP_WORD;
152
cut->ldelim = *ldelim;
153
cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
154
cut->space[cut->eob] = SP_LINE;
155
cut->cflag = (mode&C_CHARS) && cut->mb;
156
cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
157
cut->sflag = (mode&C_SUPRESS) != 0;
158
cut->nlflag = (mode&C_NONEWLINE) != 0;
159
cut->reclen = reclen;
160
lp = cut->list;
161
for (;;)
162
switch(c = *cp++)
163
{
164
case ' ':
165
case '\t':
166
while(*cp==' ' || *cp=='\t')
167
cp++;
168
/*FALLTHROUGH*/
169
case 0:
170
case ',':
171
if(range)
172
{
173
--range;
174
if((n = (n ? (n-range) : (HUGE-1))) < 0)
175
error(ERROR_exit(1),"invalid range for c/f option");
176
*lp++ = range;
177
*lp++ = n;
178
}
179
else
180
{
181
*lp++ = --n;
182
*lp++ = 1;
183
}
184
if(c==0)
185
{
186
register int *dp;
187
*lp = HUGE;
188
n = 1 + (lp-cut->list)/2;
189
qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
190
/* eliminate overlapping regions */
191
for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
192
{
193
if(lp[0] <= range)
194
{
195
if(lp[1]==HUGE)
196
{
197
dp[-1] = HUGE;
198
break;
199
}
200
if((c = lp[0]+lp[1]-range)>0)
201
{
202
range += c;
203
dp[-1] += c;
204
}
205
}
206
else
207
{
208
range = *dp++ = lp[0];
209
if(lp[1]==HUGE)
210
{
211
*dp++ = HUGE;
212
break;
213
}
214
range += (*dp++ = lp[1]);
215
}
216
}
217
*dp = HUGE;
218
lp = cut->list;
219
/* convert ranges into gaps */
220
for(n=0; *lp!=HUGE; lp+=2)
221
{
222
c = *lp;
223
*lp -= n;
224
n = c+lp[1];
225
}
226
return cut;
227
}
228
n = range = 0;
229
break;
230
231
case '-':
232
if(range)
233
error(ERROR_exit(1),"bad list for c/f option");
234
range = n?n:1;
235
n = 0;
236
break;
237
238
default:
239
if(!isdigit(c))
240
error(ERROR_exit(1),"bad list for c/f option");
241
n = 10*n + (c-'0');
242
break;
243
}
244
/* NOTREACHED */
245
}
246
247
/*
248
* cut each line of file <fdin> and put results to <fdout> using list <list>
249
*/
250
251
static void
252
cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
253
{
254
register int c;
255
register int len;
256
register int ncol = 0;
257
register const int* lp = cut->list;
258
register char* bp;
259
register int skip; /* non-zero for don't copy */
260
int must;
261
const char* xx;
262
263
for (;;)
264
{
265
if (len = cut->reclen)
266
bp = sfreserve(fdin, len, -1);
267
else
268
bp = sfgetr(fdin, '\n', 0);
269
if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
270
break;
271
len = sfvalue(fdin);
272
xx = 0;
273
if (!(ncol = skip = *(lp = cut->list)))
274
ncol = *++lp;
275
must = 1;
276
do
277
{
278
if (cut->nosplit)
279
{
280
register const char* s = bp;
281
register int w = len < ncol ? len : ncol;
282
register int z;
283
284
while (w > 0)
285
{
286
if (!(*s & 0x80))
287
z = 1;
288
else if ((z = mbnsize(s, w)) <= 0)
289
{
290
if (s == bp && xx)
291
{
292
w += s - xx;
293
bp = (char*)(s = xx);
294
xx = 0;
295
continue;
296
}
297
xx = s;
298
if (skip)
299
s += w;
300
w = 0;
301
break;
302
}
303
s += z;
304
w -= z;
305
}
306
c = s - bp;
307
ncol = !w && ncol >= len;
308
}
309
else if (cut->cflag)
310
{
311
register const char* s = bp;
312
register int w = len;
313
register int z;
314
315
while (w > 0 && ncol > 0)
316
{
317
ncol--;
318
if (!(*s & 0x80) || (z = mbnsize(s, w)) <= 0)
319
z = 1;
320
s += z;
321
w -= z;
322
323
}
324
c = s - bp;
325
ncol = !w && (ncol || !skip);
326
}
327
else
328
{
329
if ((c = ncol) > len)
330
c = len;
331
else if (c == len && !skip)
332
ncol++;
333
ncol -= c;
334
}
335
if (!skip && c)
336
{
337
if (sfwrite(fdout, (char*)bp, c) < 0)
338
return;
339
must = 0;
340
}
341
bp += c;
342
if (ncol)
343
break;
344
len -= c;
345
ncol = *++lp;
346
skip = !skip;
347
} while (ncol != HUGE);
348
if (!cut->nlflag && (skip || must || cut->reclen))
349
{
350
if (cut->ldelim.len > 1)
351
sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
352
else
353
sfputc(fdout, cut->ldelim.chr);
354
}
355
}
356
}
357
358
/*
359
* cut each line of file <fdin> and put results to <fdout> using list <list>
360
* stream <fdin> must be line buffered
361
*/
362
363
static void
364
cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
365
{
366
register unsigned char *sp = cut->space;
367
register unsigned char *cp;
368
register unsigned char *wp;
369
register int c, nfields;
370
register const int *lp = cut->list;
371
register unsigned char *copy;
372
register int nodelim, empty, inword=0;
373
register unsigned char *ep;
374
unsigned char *bp, *first;
375
int lastchar;
376
wchar_t w;
377
Sfio_t *fdtmp = 0;
378
long offset = 0;
379
unsigned char mb[8];
380
/* process each buffer */
381
while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
382
{
383
cp = bp;
384
ep = cp + --c;
385
if((lastchar = cp[c]) != cut->eob)
386
*ep = cut->eob;
387
/* process each line in the buffer */
388
while (cp <= ep)
389
{
390
first = cp;
391
if (!inword)
392
{
393
nodelim = empty = 1;
394
copy = cp;
395
if (nfields = *(lp = cut->list))
396
copy = 0;
397
else
398
nfields = *++lp;
399
}
400
else if (copy)
401
copy = cp;
402
inword = 0;
403
do
404
{
405
/* skip over non-delimiter characters */
406
if (cut->mb)
407
for (;;)
408
{
409
switch (c = sp[*(unsigned char*)cp++])
410
{
411
case 0:
412
continue;
413
case SP_WIDE:
414
wp = --cp;
415
while ((c = mb2wc(w, cp, ep - cp)) <= 0)
416
{
417
/* mb char possibly spanning buffer boundary -- fun stuff */
418
if ((ep - cp) < mbmax())
419
{
420
int i;
421
int j;
422
int k;
423
424
if (lastchar != cut->eob)
425
{
426
*ep = lastchar;
427
if ((c = mb2wc(w, cp, ep - cp)) > 0)
428
break;
429
}
430
if (copy)
431
{
432
empty = 0;
433
if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
434
goto failed;
435
}
436
for (i = 0; i <= (ep - cp); i++)
437
mb[i] = cp[i];
438
if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
439
goto failed;
440
cp = bp;
441
ep = cp + --c;
442
if ((lastchar = cp[c]) != cut->eob)
443
*ep = cut->eob;
444
j = i;
445
k = 0;
446
while (j < mbmax())
447
mb[j++] = cp[k++];
448
if ((c = mb2wc(w, (char*)mb, j)) <= 0)
449
{
450
c = i;
451
w = 0;
452
}
453
first = bp = cp += c - i;
454
if (copy)
455
{
456
copy = bp;
457
if (w == cut->ldelim.chr)
458
lastchar = cut->ldelim.chr;
459
else if (w != cut->wdelim.chr)
460
{
461
empty = 0;
462
if (sfwrite(fdout, (char*)mb, c) < 0)
463
goto failed;
464
}
465
}
466
c = 0;
467
}
468
else
469
{
470
w = *cp;
471
c = 1;
472
}
473
break;
474
}
475
cp += c;
476
c = w;
477
if (c == cut->wdelim.chr)
478
{
479
c = SP_WORD;
480
break;
481
}
482
if (c == cut->ldelim.chr)
483
{
484
c = SP_LINE;
485
break;
486
}
487
continue;
488
default:
489
wp = cp - 1;
490
break;
491
}
492
break;
493
}
494
else
495
{
496
while (!(c = sp[*cp++]));
497
wp = cp - 1;
498
}
499
/* check for end-of-line */
500
if (c == SP_LINE)
501
{
502
if (cp <= ep)
503
break;
504
if (lastchar == cut->ldelim.chr)
505
break;
506
/* restore cut->last character */
507
if (lastchar != cut->eob)
508
*ep = lastchar;
509
inword++;
510
if (!sp[lastchar])
511
break;
512
}
513
nodelim = 0;
514
if (--nfields > 0)
515
continue;
516
nfields = *++lp;
517
if (copy)
518
{
519
empty = 0;
520
if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
521
goto failed;
522
copy = 0;
523
}
524
else
525
/* set to delimiter unless the first field */
526
copy = empty ? cp : wp;
527
} while (!inword);
528
if (!inword)
529
{
530
if (!copy)
531
{
532
if (nodelim)
533
{
534
if (!cut->sflag)
535
{
536
if (offset)
537
{
538
sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
539
sfmove(fdtmp,fdout,offset,-1);
540
}
541
copy = first;
542
}
543
}
544
else
545
sfputc(fdout,'\n');
546
}
547
if (offset)
548
sfseek(fdtmp,offset=0,SEEK_SET);
549
}
550
if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
551
goto failed;
552
}
553
/* see whether to save in tmp file */
554
if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
555
{
556
/* copy line to tmpfile in case no fields */
557
if(!fdtmp)
558
fdtmp = sftmp(BLOCK);
559
sfwrite(fdtmp,(char*)first,c);
560
offset +=c;
561
}
562
}
563
failed:
564
if(fdtmp)
565
sfclose(fdtmp);
566
}
567
568
int
569
b_cut(int argc, char** argv, Shbltin_t* context)
570
{
571
register char* cp = 0;
572
register Sfio_t* fp;
573
char* s;
574
int n;
575
Cut_t* cut;
576
int mode = 0;
577
Delim_t wdelim;
578
Delim_t ldelim;
579
size_t reclen = 0;
580
581
cmdinit(argc, argv, context, ERROR_CATALOG, 0);
582
wdelim.chr = '\t';
583
ldelim.chr = '\n';
584
wdelim.len = ldelim.len = 1;
585
for (;;)
586
{
587
switch (optget(argv, usage))
588
{
589
case 0:
590
break;
591
case 'b':
592
case 'c':
593
if(mode&C_FIELDS)
594
{
595
error(2, "f option already specified");
596
continue;
597
}
598
cp = opt_info.arg;
599
if(opt_info.option[1]=='b')
600
mode |= C_BYTES;
601
else
602
mode |= C_CHARS;
603
continue;
604
case 'D':
605
ldelim.str = opt_info.arg;
606
if (mbwide())
607
{
608
s = opt_info.arg;
609
ldelim.chr = mbchar(s);
610
if ((n = s - opt_info.arg) > 1)
611
{
612
ldelim.len = n;
613
continue;
614
}
615
}
616
ldelim.chr = *(unsigned char*)opt_info.arg;
617
ldelim.len = 1;
618
continue;
619
case 'd':
620
wdelim.str = opt_info.arg;
621
if (mbwide())
622
{
623
s = opt_info.arg;
624
wdelim.chr = mbchar(s);
625
if ((n = s - opt_info.arg) > 1)
626
{
627
wdelim.len = n;
628
continue;
629
}
630
}
631
wdelim.chr = *(unsigned char*)opt_info.arg;
632
wdelim.len = 1;
633
continue;
634
case 'f':
635
if(mode&(C_CHARS|C_BYTES))
636
{
637
error(2, "c option already specified");
638
continue;
639
}
640
cp = opt_info.arg;
641
mode |= C_FIELDS;
642
continue;
643
case 'n':
644
mode |= C_NOSPLIT;
645
continue;
646
case 'N':
647
mode |= C_NONEWLINE;
648
continue;
649
case 'R':
650
case 'r':
651
if(opt_info.num>0)
652
reclen = opt_info.num;
653
continue;
654
case 's':
655
mode |= C_SUPRESS;
656
continue;
657
case ':':
658
error(2, "%s", opt_info.arg);
659
break;
660
case '?':
661
error(ERROR_usage(2), "%s", opt_info.arg);
662
break;
663
}
664
break;
665
}
666
argv += opt_info.index;
667
if (error_info.errors)
668
error(ERROR_usage(2), "%s",optusage(NiL));
669
if(!cp)
670
{
671
error(2, "b, c or f option must be specified");
672
error(ERROR_usage(2), "%s", optusage(NiL));
673
}
674
if(!*cp)
675
error(3, "non-empty b, c or f option must be specified");
676
if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
677
error(3, "s option requires f option");
678
cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
679
if(cp = *argv)
680
argv++;
681
do
682
{
683
if(!cp || streq(cp,"-"))
684
fp = sfstdin;
685
else if(!(fp = sfopen(NiL,cp,"r")))
686
{
687
error(ERROR_system(0),"%s: cannot open",cp);
688
continue;
689
}
690
if(mode&C_FIELDS)
691
cutfields(cut,fp,sfstdout);
692
else
693
cutcols(cut,fp,sfstdout);
694
if(fp!=sfstdin)
695
sfclose(fp);
696
} while(cp = *argv++);
697
if (sfsync(sfstdout))
698
error(ERROR_system(0), "write error");
699
return error_info.errors != 0;
700
}
701
702