Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/cmd/std/split.c
1808 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1989-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <[email protected]> *
18
* *
19
***********************************************************************/
20
#pragma prototyped
21
/*
22
* split.c
23
* David Korn
24
* AT&T Research
25
*/
26
27
static const char split_usage[] =
28
"[-?\n@(#)$Id: split (AT&T Research) 2006-09-19 $\n]"
29
USAGE_LICENSE
30
"[+NAME?split - split files into pieces]"
31
"[+DESCRIPTION?\bsplit\b reads an input file and writes one or more"
32
" output files so that \bcat\b(1) on these files will produce"
33
" the input file. The default size for each piece is 1000 lines."
34
" The suffix consists of \asuffix_len\a lower case characters"
35
" from the POSIX locale.]"
36
"[+?If \aprefix\a is specified it will be used as a prefix for each"
37
" of the resulting files from the split operation. If \aprefix\a"
38
" is specified, the prefix \bx\b will be used.]"
39
"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bsplit\b"
40
" copies from standard input starting at the current location.]"
41
"[+?The option arguments for \b-b\b and \b-C\b can optionally be followed"
42
" by one of the following characters to specify a different"
43
" unit other than a single byte:]{"
44
" [+b?512 bytes.]"
45
" [+k?1-killobytes.]"
46
" [+m?1-megabyte.]"
47
" [+g?1-gigabyte.]"
48
" [+t?1-terabyte.]"
49
" }"
50
"[+?For backwards compatibility, \b-\b\aline_count\a is equivalent to"
51
" \b-l\b \aline_count\a.]"
52
"[l:lines]#[line_count:=1000?\aline_count\a specified the number of lines"
53
" for each piece except the last. If the input does not end in"
54
" a newline, the partial line is included in the last piece.]"
55
"[a|n:suffix-length]#[suffix_len:=2?\asuffix_len\a defines the number of"
56
" letters that form the suffix portion of the file names for each of"
57
" the pieces that the file is split into.]"
58
"[b:bytes]#[n?Splits the file into byte size pieces defined by \an\a"
59
" rather than lines.]"
60
"[C:line-bytes]#[n?Splits the file into lines totaling a most \an\a bytes.]"
61
"\n"
62
"\n[ file [ name ] ]\n"
63
"\n"
64
"[+EXIT STATUS]{"
65
" [+0?Successful completion.]"
66
" [+>0?An error occurred.]"
67
"}"
68
"[+SEE ALSO? \bcsplit\b(1), \bcat\b(1)]"
69
;
70
71
static const char csplit_usage[] =
72
"[-?\n@(#)$Id: csplit (AT&T Research) 2003-08-21 $\n]"
73
USAGE_LICENSE
74
"[+NAME?csplit - split a file into sections determined by context lines]"
75
"[+DESCRIPTION?\bcsplit\b creates zero or more output files containing"
76
" sections of the given input \afile\a, or the standard input if the"
77
" name \b-\b is given. By default, \bcsplit\b prints the number of"
78
" bytes written to each output file after it has been created.]"
79
"[+?The contents of the output files are determined by the \apattern\a"
80
" arguments. An error occurs if a pattern argument refers to a"
81
" nonexistent line of the input file, such as if no remaining line"
82
" matches a given regular expression. After all the given patterns have"
83
" been matched, any remaining output is copied into one last output"
84
" file. The types of pattern arguments are:]{"
85
" [+line?Create an output file containing the current line up"
86
" to (but not including) line \aline\a (a positive"
87
" integer) of the input file. If followed by a repeat"
88
" count, also create an output file containing the"
89
" next \aline\a lines of the input file once for each"
90
" repeat.]"
91
" [+/regexp/[offset]]?Create an output file containing the"
92
" current line up to (but not including) the next line"
93
" of the input file that contains a match for"
94
" \aregexp\a. The optional \aoffset\a is a \b+\b or"
95
" \b-\b followed by a positive integer. If it is given,"
96
" the input up to the matching line plus or minus"
97
" \aoffset\a is put into the output file, and the line"
98
" after that begins the next section of input.]"
99
" [+%regexp%[offset]]?Like the previous type, except that it"
100
" does not create an output file, so that section of"
101
" the input file is effectively ignored.]"
102
" [+{repeat-count}?Repeat the previous pattern \arepeat-count\a"
103
" (a positive integer) additional times. An asterisk"
104
" may be given in place of the (integer) repeat count,"
105
" in which case the preceeding pattern is repeated as"
106
" many times as necessary until the input is exausted.]"
107
" }"
108
"[+?The output file names consist of a prefix followed by a suffix. By"
109
" default, the suffix is merely an ascending linear sequence of two-digit"
110
" decimal numbers starting with 00 and ranging up to 99, however this"
111
" default may be overridden by either the \b--digits\b option or by the"
112
" \b--suffix-format\b option (see below.) In any case, concatenating"
113
" the output files in sorted order by file name produces the original"
114
" input file, in order. The default output file name prefix is \bxx\b.]"
115
"[+?By default, if \bcsplit\b encounters an error or receives a hangup,"
116
" interrupt, quit, or terminate signal, it removes any output files"
117
" that it has created so far before it exits.]"
118
"[b:suffix-format?Use the \bprintf\b(3) \aformat\a to generate the file"
119
" name suffix.]:[format:=\b%02d\b]"
120
"[f:prefix?Use \aprefix\a to generate the file name prefix.]:[prefix:=\bxx\b]"
121
"[k:keep-files?Do not remove output files on errors.]"
122
"[a|n:digits?Use \adigits\a in the generated file name suffixes.]#[digits:=2]"
123
"[s:silent|quiet?Do not print output file counts and sizes.]"
124
"[z:elide-empty-files?Remove empty output files.]"
125
"\n"
126
"\nfile arg ...\n"
127
"\n"
128
"[+EXIT STATUS?]{"
129
" [+0?Successful completion.]"
130
" [+>0?An error occurred.]"
131
"}"
132
"[+SEE ALSO? \bsplit\b(1), \bcat\b(1)]"
133
;
134
135
#include <cmd.h>
136
#include <regex.h>
137
138
/*
 * bits kept in the `flags' word shared by main() and split()
 */
#define S_FLAG		001	/* do not print piece sizes (always set for split) */
#define K_FLAG		002	/* keep output files when an error occurs */
#define C_FLAG		004	/* running as csplit */
#define B_FLAG		010	/* sizes are in bytes rather than lines */
#define Z_FLAG		020	/* remove output files that end up empty */
#define M_FLAG		040	/* -C: size each piece with msize() */

/*
 * operation kinds stored in struct op.flags
 */
#define OP_LINES	0	/* copy op->size lines (or bytes) per piece */
#define OP_SEARCH	1	/* copy up to the next line matching op->re */
#define OP_SKIP		2	/* like OP_SEARCH but no output file is created */
#define OP_ABSOLUTE	3	/* copy up to (not including) line op->size */

/*
 * number of bytes msize() examines per step while looking for a newline
 */
#define BLK_SIZE	2048
151
152
/*
 * output file name generator state, built by setfname() and advanced
 * by getfname()
 */
struct fname
{
	char*	fname;		/* current name: prefix followed by the suffix */
	char*	format;		/* prefix followed by a printf format, or 0 */
	char*	suffix;		/* first suffix character inside fname */
	char*	last;		/* last suffix character inside fname */
	char	low;		/* lowest character used in the suffix */
	char	high;		/* highest character used in the suffix */
	int	count;		/* number of names handed out so far */
};
162
163
struct op
164
{
165
struct op* next;
166
Sfoff_t size;
167
size_t repeat;
168
int flags;
169
regex_t* re;
170
};
171
172
/*
173
* create an operation structure
174
*/
175
static struct op*
176
getop(struct op** prev, Sfoff_t size, size_t repeat, int flags, int re)
177
{
178
struct op* op;
179
180
if (op = newof(0, struct op, 1, re ? sizeof(regex_t) : 0))
181
{
182
op->repeat = repeat;
183
op->flags = flags;
184
op->size = size;
185
op->next = 0;
186
if (re)
187
op->re = (regex_t*)(op + 1);
188
*prev = op;
189
}
190
return op;
191
}
192
193
/*
194
* process /expr/offset arguments
195
* returns new operation structure which is added to linked list
196
*/
197
198
static struct op*
199
getexpr(struct op** prev, const char* arg)
200
{
201
char* cp = (char*)arg;
202
char* ep;
203
int n;
204
struct op* op;
205
206
if (op = getop(prev, 0, 1, *cp == '/' ? OP_SEARCH : OP_SKIP, 1))
207
{
208
if (n = regcomp(op->re, cp, REG_DELIMITED|REG_NOSUB))
209
{
210
regfatal(op->re, 2, n);
211
return 0;
212
}
213
cp += op->re->re_npat;
214
if (*cp)
215
{
216
op->size = strtoll(cp, &ep, 10);
217
if (*ep)
218
error(ERROR_exit(1), "%s: invalid offset", cp);
219
}
220
}
221
return op;
222
}
223
224
/*
225
* set up file name generator whose form is <prefix>... where ... is
226
* suflen characters from low..high
227
* returns a pointer to a structure that can be used to create
228
* file names
229
*/
230
231
static struct fname*
232
setfname(const char* prefix, char* format, int suflen, int low, int high)
233
{
234
struct fname* fp;
235
int flen;
236
int slen;
237
int len;
238
char* cp;
239
240
flen = strlen(prefix);
241
len = flen + suflen + 1;
242
if (format)
243
{
244
slen = strlen(format);
245
len += flen + slen + 1;
246
}
247
else
248
slen = 0;
249
if (fp = newof(0, struct fname, 1, len))
250
{
251
cp = (char*)(fp + 1);
252
if (format)
253
{
254
strcpy(fp->format = cp, prefix);
255
cp += flen;
256
strcpy(cp, format);
257
cp += slen + 1;
258
}
259
fp->low = low;
260
fp->high = high;
261
fp->count = 0;
262
strcpy(fp->fname = cp, prefix);
263
cp += flen;
264
fp->suffix = cp;
265
while (suflen-- > 0)
266
*cp++ = low;
267
*cp-- = 0;
268
fp->last = cp;
269
(*cp)--;
270
flen = _POSIX_NAME_MAX;
271
if (cp = strrchr(fp->fname, '/'))
272
{
273
cp++;
274
len = strlen(cp);
275
if (len > flen)
276
{
277
*(cp - 1) = 0;
278
flen = (int)strtol(astconf("NAME_MAX", fp->fname, NiL), NiL, 0);
279
*(cp - 1) = '/';
280
}
281
}
282
else
283
{
284
cp = fp->fname;
285
if (len > flen)
286
flen = (int)strtol(astconf("NAME_MAX", ".", NiL), NiL, 0);
287
}
288
if (len > flen)
289
error(ERROR_exit(1), "%s: filename too long", prefix);
290
}
291
return fp;
292
}
293
294
/*
295
* return next sequential file name
296
*/
297
298
static char*
299
getfname(struct fname* fp)
300
{
301
register char* cp = fp->last;
302
303
if (fp->format)
304
return sfprints(fp->format, fp->count++);
305
while (++(*cp) > fp->high)
306
{
307
*cp-- = fp->low;
308
if (cp < fp->suffix)
309
{
310
error(0, "file limit reached");
311
return 0;
312
}
313
}
314
fp->count++;
315
return fp->fname;
316
}
317
318
/*
319
* remove all generated files
320
*/
321
322
static void
323
removeall(struct fname* fp)
324
{
325
register char* cp = fp->suffix;
326
327
while (*cp)
328
*cp++ = fp->low;
329
*(cp - 1) -= 1;
330
while (fp->count-- > 0)
331
{
332
remove(getfname(fp));
333
fp->count--;
334
}
335
fp->count = 0;
336
}
337
338
static int
339
msize(Sfio_t* in, long len)
340
{
341
Sfoff_t off = sftell(in);
342
register char* cp;
343
register char* dp;
344
register long m;
345
register long n = len;
346
register long nlen = 0;
347
348
if (sfsize(in) - off <= len)
349
return len;
350
while (nlen == 0 && n > 0)
351
{
352
n -= BLK_SIZE;
353
if (n < 0)
354
n = 0;
355
sfseek(in, off + n, SEEK_SET);
356
if (!(dp = cp = sfreserve(in, BLK_SIZE, 0)))
357
return len;
358
m = BLK_SIZE;
359
while (m-- > 0)
360
{
361
if (*cp++ == '\n')
362
nlen = n + (cp - dp);
363
}
364
}
365
if (n > 0)
366
sfseek(in, off, SEEK_SET);
367
return nlen ? nlen : len;
368
}
369
370
static int
371
split(Sfio_t* in, struct fname* fp, struct op* op, int flags)
372
{
373
register char* cp;
374
register char* s;
375
Sfoff_t len;
376
Sfoff_t z;
377
Sfoff_t size;
378
size_t repeat;
379
int c;
380
381
register Sfio_t* out = 0;
382
register char* peek = 0;
383
register long n = 0;
384
int delim = (flags & B_FLAG) ? -1 : '\n';
385
size_t lineno = 1;
386
387
while (op)
388
{
389
if (op->flags == OP_LINES)
390
len = op->size;
391
repeat = op->repeat;
392
do
393
{
394
if (op->flags != OP_SKIP)
395
{
396
if (!(cp = getfname(fp)))
397
goto err;
398
if (!(out = sfopen(NiL, cp, "w")))
399
{
400
fp->count--;
401
error(ERROR_SYSTEM|2, "%s: cannot create", cp);
402
goto err;
403
}
404
}
405
if (op->flags == OP_ABSOLUTE || op->flags == OP_LINES)
406
{
407
if (op->flags == OP_ABSOLUTE)
408
len = op->size - lineno;
409
if (peek)
410
{
411
if ((n = sfputr(out, peek, delim)) <= 0)
412
goto done;
413
peek = 0;
414
if (len > 0)
415
len--;
416
lineno++;
417
}
418
if (len)
419
{
420
z = (flags & M_FLAG) ? msize(in, len) : len;
421
if ((n = sfmove(in, out, z, delim)) < z || n < 0)
422
goto done;
423
lineno += n;
424
}
425
}
426
else
427
{
428
if (peek)
429
{
430
if (out && (n = sfputr(out, peek, delim)) <= 0)
431
goto done;
432
lineno++;
433
peek = 0;
434
}
435
while (s = sfgetr(in, delim, 1))
436
{
437
if (!(c = regexec(op->re, s, 0, NiL, 0)))
438
break;
439
lineno++;
440
if (c != REG_NOMATCH)
441
{
442
regfatal(op->re, 2, c);
443
goto err;
444
}
445
if (out && (n = sfputr(out, s, delim)) <= 0)
446
goto done;
447
}
448
if (!(peek = s))
449
{
450
while (op->next)
451
op = op->next;
452
repeat = 1;
453
}
454
}
455
if (out)
456
{
457
size = sfseek(out, (Sfoff_t)0, SEEK_END);
458
if (!(flags & S_FLAG))
459
sfprintf(sfstdout, "%I*d\n", sizeof(size), size);
460
sfclose(out);
461
out = 0;
462
if ((flags & Z_FLAG) && size <= 0)
463
remove(cp);
464
}
465
} while (!repeat || --repeat);
466
op = op->next;
467
}
468
done:
469
if (out)
470
{
471
sfclose(out);
472
if (n <= 0)
473
remove(cp);
474
}
475
if (n >= 0)
476
return 0;
477
err:
478
if (!(flags & K_FLAG))
479
removeall(fp);
480
return 1;
481
}
482
483
int
484
main(int argc, char** argv)
485
{
486
struct fname* fp;
487
struct op* top;
488
char* cp;
489
char* prefix;
490
const char* usage;
491
Sfio_t* in;
492
int flags;
493
ssize_t n;
494
495
char* format = 0;
496
Sfoff_t size = 10000;
497
int suflen = 2;
498
499
if (cp = strrchr(*argv, '/'))
500
cp++;
501
else
502
cp = *argv;
503
error_info.id = cp;
504
if (streq(cp, "split"))
505
{
506
usage = split_usage;
507
flags = S_FLAG|K_FLAG;
508
prefix = "x";
509
}
510
else
511
{
512
usage = csplit_usage;
513
flags = C_FLAG;
514
prefix = "xx";
515
}
516
for (;;)
517
{
518
switch (optget(argv, usage))
519
{
520
case 0:
521
break;
522
case 'l':
523
flags &= ~(B_FLAG|M_FLAG);
524
if ((size = opt_info.number) <= 0)
525
error(1, "%s: invalid size", opt_info.arg);
526
continue;
527
case 'k':
528
flags |= K_FLAG;
529
continue;
530
case 's':
531
flags |= S_FLAG;
532
continue;
533
case 'z':
534
flags |= Z_FLAG;
535
continue;
536
case 'f':
537
prefix = opt_info.arg;
538
continue;
539
case 'a':
540
case 'n':
541
suflen = opt_info.num;
542
continue;
543
case 'C':
544
flags |= M_FLAG;
545
case 'b':
546
if (flags & S_FLAG)
547
{
548
if ((size = opt_info.number) <= 0)
549
error(1, "%s: invalid size", opt_info.arg);
550
flags |= B_FLAG;
551
}
552
else
553
format = opt_info.arg;
554
continue;
555
case ':':
556
error(2, "%s", opt_info.arg);
557
break;
558
case '?':
559
error(ERROR_usage(2), "%s", opt_info.arg);
560
break;
561
}
562
break;
563
}
564
argv += opt_info.index;
565
argc -= opt_info.index;
566
if (error_info.errors || !(flags & C_FLAG) && argc > 2 || (flags & C_FLAG) && argc < 2)
567
error(ERROR_usage(2), "%s", optusage(NiL));
568
cp = *argv++;
569
if (flags & C_FLAG)
570
{
571
struct op* op = 0;
572
char* sp;
573
574
while (sp = *argv++)
575
{
576
switch (*sp)
577
{
578
case '/':
579
case '?':
580
case '%':
581
op = getexpr(op ? &op->next : &top, sp);
582
break;
583
case '{':
584
if (!op)
585
error(ERROR_exit(1), "%s: pattern expected for repeat count", *(argv - 1));
586
if (*++sp == '*' && *(sp + 1) == '}' && !*(sp + 2))
587
op->repeat = 0;
588
else
589
{
590
if ((n = strtol(sp, &sp, 10)) <= 0 || *sp != '}' || *(sp + 1))
591
error(ERROR_exit(1), "%s: invalid repeat count", *(argv - 1));
592
op->repeat = n + 1;
593
}
594
if (op->flags == OP_ABSOLUTE)
595
op->flags = OP_LINES;
596
break;
597
default:
598
if ((size = strtoll(sp, &sp, 10)) <= 0 || *sp)
599
error(ERROR_exit(1), "%s: invalid line number", *(argv - 1));
600
op = getop(op ? &op->next : &top, size, 1, OP_ABSOLUTE, 0);
601
break;
602
}
603
}
604
op = getop(op ? &op->next : &top, SF_UNBOUND, 1, OP_LINES, 0);
605
fp = setfname(prefix, format, suflen, '0', '9');
606
}
607
else
608
{
609
if (cp && *argv)
610
prefix = *argv;
611
getop(&top, size, SF_UNBOUND, OP_LINES, 0);
612
fp = setfname(prefix, format, suflen, 'a', 'z');
613
}
614
if (!cp || streq(cp, "-"))
615
in = sfstdin;
616
else if (!(in = sfopen(NiL, cp, "r")))
617
error(ERROR_system(1), "%s: cannot open", cp);
618
n = split(in, fp, top, flags);
619
if (in != sfstdin)
620
sfclose(in);
621
return n;
622
}
623
624