/*
 * CoCalc -- split.c
 * GitHub Repository: att/ast
 * Path: blob/master/src/cmd/std/split.c
 */
/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1989-2011 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                 Eclipse Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*          http://www.eclipse.org/org/documents/epl-v10.html           *
*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                                                                      *
***********************************************************************/
#pragma prototyped
/*
 * split.c
 * David Korn
 * AT&T Research
 *
 * split and csplit: the behavior is selected by the name the
 * program is invoked as
 */

static const char split_usage[] =
28
"[-?\n@(#)$Id: split (AT&T Research) 2006-09-19 $\n]"
29
USAGE_LICENSE
30
"[+NAME?split - split files into pieces]"
31
"[+DESCRIPTION?\bsplit\b reads an input file and writes one or more"
32
"	output files so that \bcat\b(1) on these files will produce"
33
"	the input file. The default size for each piece is 1000 lines." 
34
"	The suffix consists of \asuffix_len\a lower case characters"
35
"	from the POSIX locale.]"
36
"[+?If \aprefix\a is specified it will be used as a prefix for each"
37
"	of the resulting files from the split operation. If \aprefix\a"
38
"	is specified, the prefix \bx\b will be used.]"
39
"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bsplit\b"
40
"	copies from standard input starting at the current location.]"
41
"[+?The option arguments for \b-b\b and \b-C\b can optionally be followed"
42
"	by one of the following characters to specify a different"
43
"	unit other than a single byte:]{"
44
"		[+b?512 bytes.]"
45
"		[+k?1-killobytes.]"
46
"		[+m?1-megabyte.]"
47
"		[+g?1-gigabyte.]"
48
"		[+t?1-terabyte.]"
49
"	}"
50
"[+?For backwards compatibility, \b-\b\aline_count\a is equivalent to"
51
"	 \b-l\b \aline_count\a.]"
52
"[l:lines]#[line_count:=1000?\aline_count\a specified the number of lines"
53
"	for each piece except the last. If the input does not end in"
54
"	a newline, the partial line is included in the last piece.]"
55
"[a|n:suffix-length]#[suffix_len:=2?\asuffix_len\a defines the number of"
56
"	letters that form the suffix portion of the file names for each of"
57
"	the pieces that the file is split into.]"
58
"[b:bytes]#[n?Splits the file into byte size pieces defined by \an\a"
59
"	rather than lines.]"
60
"[C:line-bytes]#[n?Splits the file into lines totaling a most \an\a bytes.]"
61
"\n"
62
"\n[ file [ name ] ]\n"
63
"\n"
64
"[+EXIT STATUS]{"
65
"	[+0?Successful completion.]"
66
"	[+>0?An error occurred.]"
67
"}"
68
"[+SEE ALSO? \bcsplit\b(1), \bcat\b(1)]"
69
;
70

71
static const char csplit_usage[] =
72
"[-?\n@(#)$Id: csplit (AT&T Research) 2003-08-21 $\n]"
73
USAGE_LICENSE
74
"[+NAME?csplit - split a file into sections determined by context lines]"
75
"[+DESCRIPTION?\bcsplit\b creates zero or more output files containing"
76
"	sections of the given input \afile\a, or the standard input if the"
77
"	name \b-\b is given. By default, \bcsplit\b prints the number of"
78
"	bytes written to each output file after it has been created.]"
79
"[+?The contents of the output files are determined by the \apattern\a"
80
"	arguments. An error occurs if a pattern argument refers to a"
81
"	nonexistent line of the input file, such as if no remaining line"
82
"	matches a given regular expression.  After all the given patterns have"
83
"	been matched, any remaining output is copied into one last output"
84
"	file. The types of pattern arguments are:]{"
85
"		[+line?Create an output file containing the current line up"
86
"			to (but not including) line \aline\a (a positive"
87
"			integer) of the input file. If followed by a repeat"
88
"			count, also create an output file containing the"
89
"			next \aline\a lines of the input file once for each"
90
"			repeat.]"
91
"		[+/regexp/[offset]]?Create an output file containing the"
92
"			current line up to (but not including) the next line"
93
"			of the input file that contains a match for"
94
"			\aregexp\a. The optional \aoffset\a is a \b+\b or"
95
"			\b-\b followed by a positive integer. If it is given,"
96
"			the input up to the matching line plus or minus"
97
"			\aoffset\a is put into the output file, and the line"
98
"			after that begins the next section of input.]"
99
"		[+%regexp%[offset]]?Like the previous type, except that it"
100
"			does not create an output file, so that section of"
101
"			the input file is effectively ignored.]"
102
"		[+{repeat-count}?Repeat the previous pattern \arepeat-count\a"
103
"			(a positive integer) additional times. An asterisk"
104
"			may be given in place of the (integer) repeat count,"
105
"			in which case the preceeding pattern is repeated as"
106
"			many times as necessary until the input is exausted.]"
107
"	}"
108
"[+?The output file names consist of a prefix followed by a suffix. By"
109
"	default, the suffix is merely an ascending linear sequence of two-digit"
110
"	decimal numbers starting with 00 and ranging up to 99, however this"
111
"	default may be overridden by either the \b--digits\b option or by the"
112
"	\b--suffix-format\b option (see below.) In any case, concatenating"
113
"	the output files in sorted order by file name produces the original"
114
"	input file, in order. The default output file name prefix is \bxx\b.]"
115
"[+?By default, if \bcsplit\b encounters an error or receives a hangup,"
116
"	interrupt, quit, or terminate signal, it removes any output files"
117
"	that it has created so far before it exits.]"
118
"[b:suffix-format?Use the \bprintf\b(3) \aformat\a to generate the file"
119
"	name suffix.]:[format:=\b%02d\b]"
120
"[f:prefix?Use \aprefix\a to generate the file name prefix.]:[prefix:=\bxx\b]"
121
"[k:keep-files?Do not remove output files on errors.]"
122
"[a|n:digits?Use \adigits\a in the generated file name suffixes.]#[digits:=2]"
123
"[s:silent|quiet?Do not print output file counts and sizes.]"
124
"[z:elide-empty-files?Remove empty output files.]"
125
"\n"
126
"\nfile arg ...\n"
127
"\n"
128
"[+EXIT STATUS?]{"
129
"	[+0?Successful completion.]"
130
"	[+>0?An error occurred.]"
131
"}"
132
"[+SEE ALSO? \bsplit\b(1), \bcat\b(1)]"
133
;
134

135
#include <cmd.h>
#include <regex.h>

/*
 * option/mode bits kept in the flags word passed to split()
 */
#define	S_FLAG		001	/* do not print the size of each piece	*/
#define	K_FLAG		002	/* keep output files on error		*/
#define	C_FLAG		004	/* running as csplit			*/
#define	B_FLAG		010	/* sizes are bytes, not lines		*/
#define	Z_FLAG		020	/* remove empty output files		*/
#define	M_FLAG		040	/* -C: end byte sized pieces on a line	*/

/*
 * operation codes stored in struct op.flags
 */
#define OP_LINES	0	/* copy op->size lines (or bytes)	*/
#define OP_SEARCH	1	/* copy up to the line matching op->re	*/
#define OP_SKIP		2	/* like OP_SEARCH but no output file	*/
#define OP_ABSOLUTE	3	/* copy up to input line op->size	*/

#define BLK_SIZE	2048	/* msize() newline scan block size	*/

/*
 * output file name generator state, see setfname() and getfname()
 */
struct fname
{
	char*		fname;		/* <prefix><suffix> name buffer		*/
	char*		format;		/* <prefix><printf format> or 0		*/
	char*		suffix;		/* first suffix character in fname	*/
	char*		last;		/* last suffix character in fname	*/
	char		low;		/* lowest suffix character		*/
	char		high;		/* highest suffix character		*/
	int		count;		/* number of names handed out		*/
};

struct op
164
{
165
	struct op*	next;
166
	Sfoff_t		size;
167
	size_t		repeat;
168
	int		flags;
169
	regex_t*	re;
170
};
171

172
/*
173
 * create an operation structure
174
 */
175
static struct op*
176
getop(struct op** prev, Sfoff_t size, size_t repeat, int flags, int re)
177
{
178
	struct op*	op;
179

180
	if (op = newof(0, struct op, 1, re ? sizeof(regex_t) : 0))
181
	{
182
		op->repeat = repeat;
183
		op->flags = flags;
184
		op->size = size;
185
		op->next = 0;
186
		if (re)
187
			op->re = (regex_t*)(op + 1);
188
		*prev = op;
189
	}
190
	return op;
191
}
192

193
/*
194
 * process /expr/offset arguments
195
 * returns new operation structure which is added to linked list
196
 */
197

198
static struct op*
199
getexpr(struct op** prev, const char* arg)
200
{
201
	char*		cp = (char*)arg;
202
	char*		ep;
203
	int		n;
204
	struct op*	op;
205

206
	if (op = getop(prev, 0, 1, *cp == '/' ? OP_SEARCH : OP_SKIP, 1))
207
	{
208
		if (n = regcomp(op->re, cp, REG_DELIMITED|REG_NOSUB))
209
		{
210
			regfatal(op->re, 2, n);
211
			return 0;
212
		}
213
		cp += op->re->re_npat;
214
		if (*cp)
215
		{
216
			op->size = strtoll(cp, &ep, 10);
217
			if (*ep)
218
				error(ERROR_exit(1), "%s: invalid offset", cp);
219
		}
220
	}
221
	return op;
222
}
223

224
/*
225
 * set up file name generator whose form is <prefix>... where ... is
226
 * suflen characters from low..high
227
 * returns a pointer to a structure that can be used to create
228
 * file names
229
 */
230

231
static struct fname*
232
setfname(const char* prefix, char* format, int suflen, int low, int high)
233
{
234
	struct fname*	fp;
235
	int		flen;
236
	int		slen;
237
	int		len;
238
	char*		cp;
239

240
	flen = strlen(prefix);
241
	len = flen + suflen + 1;
242
	if (format)
243
	{
244
		slen = strlen(format);
245
		len += flen + slen + 1;
246
	}
247
	else
248
		slen = 0;
249
	if (fp = newof(0, struct fname, 1, len))
250
	{
251
		cp = (char*)(fp + 1);
252
		if (format)
253
		{
254
			strcpy(fp->format = cp, prefix);
255
			cp += flen;
256
			strcpy(cp, format);
257
			cp += slen + 1;
258
		}
259
		fp->low = low;
260
		fp->high = high;
261
		fp->count = 0;
262
		strcpy(fp->fname = cp, prefix);
263
		cp += flen;
264
		fp->suffix = cp;
265
		while (suflen-- > 0)
266
			*cp++ = low;
267
		*cp-- = 0;
268
		fp->last = cp;
269
		(*cp)--;
270
		flen = _POSIX_NAME_MAX;
271
		if (cp = strrchr(fp->fname, '/'))
272
		{
273
			cp++;
274
			len = strlen(cp);
275
			if (len > flen)
276
			{
277
				*(cp - 1) = 0;
278
				flen = (int)strtol(astconf("NAME_MAX", fp->fname, NiL), NiL, 0);
279
				*(cp - 1) = '/';
280
			}
281
		}
282
		else
283
		{
284
			cp = fp->fname;
285
			if (len > flen)
286
				flen = (int)strtol(astconf("NAME_MAX", ".", NiL), NiL, 0);
287
		}
288
		if (len > flen)
289
			error(ERROR_exit(1), "%s: filename too long", prefix);
290
	}
291
	return fp;
292
}
293

294
/*
295
 * return next sequential file name
296
 */
297

298
static char*
299
getfname(struct fname* fp)
300
{
301
	register char*	cp = fp->last;
302

303
	if (fp->format)
304
		return sfprints(fp->format, fp->count++);
305
	while (++(*cp) > fp->high)
306
	{
307
		*cp-- = fp->low;
308
		if (cp < fp->suffix)
309
		{
310
			error(0, "file limit reached");
311
			return 0;
312
		}
313
	}
314
	fp->count++;
315
	return fp->fname;
316
}
317

318
/*
319
 * remove all generated files
320
 */
321

322
static void
323
removeall(struct fname* fp)
324
{
325
	register char*	cp = fp->suffix;
326

327
	while (*cp)
328
		*cp++ = fp->low;
329
	*(cp - 1) -= 1;
330
	while (fp->count-- > 0)
331
	{
332
		remove(getfname(fp));
333
		fp->count--;
334
	}
335
	fp->count = 0;
336
}
337

338
static int
339
msize(Sfio_t* in, long len)
340
{
341
	Sfoff_t		off = sftell(in);
342
	register char*	cp;
343
	register char*	dp;
344
	register long	m;
345
	register long	n = len;
346
	register long	nlen = 0;
347

348
	if (sfsize(in) - off <= len)
349
		return len;
350
	while (nlen == 0 && n > 0)
351
	{
352
		n -= BLK_SIZE;
353
		if (n < 0)
354
			n = 0;
355
		sfseek(in, off + n, SEEK_SET);
356
		if (!(dp = cp = sfreserve(in, BLK_SIZE, 0)))
357
			return len;
358
		m = BLK_SIZE;
359
		while (m-- > 0)
360
		{
361
			if (*cp++ == '\n')
362
				nlen = n + (cp - dp);
363
		}
364
	}
365
	if (n > 0)
366
		sfseek(in, off, SEEK_SET);
367
	return nlen ? nlen : len;
368
}
369

370
static int
371
split(Sfio_t* in, struct fname* fp, struct op* op, int flags)
372
{
373
	register char*		cp;
374
	register char*		s;
375
	Sfoff_t			len;
376
	Sfoff_t			z;
377
	Sfoff_t			size;
378
	size_t			repeat;
379
	int			c;
380

381
	register Sfio_t*	out = 0;
382
	register char*		peek = 0;
383
	register long		n = 0;
384
	int			delim = (flags & B_FLAG) ? -1 : '\n';
385
	size_t			lineno = 1;
386

387
	while (op)
388
	{
389
		if (op->flags == OP_LINES)
390
			len = op->size;
391
		repeat = op->repeat;
392
		do
393
		{
394
			if (op->flags != OP_SKIP)
395
			{
396
				if (!(cp = getfname(fp)))
397
					goto err;
398
				if (!(out = sfopen(NiL, cp, "w")))
399
				{
400
					fp->count--;
401
					error(ERROR_SYSTEM|2, "%s: cannot create", cp);
402
					goto err;
403
				}
404
			}
405
			if (op->flags == OP_ABSOLUTE || op->flags == OP_LINES)
406
			{
407
				if (op->flags == OP_ABSOLUTE)
408
					len = op->size - lineno;
409
				if (peek)
410
				{
411
					if ((n = sfputr(out, peek, delim)) <= 0)
412
						goto done;
413
					peek = 0;
414
					if (len > 0)
415
						len--;
416
					lineno++;
417
				}
418
				if (len)
419
				{
420
					z = (flags & M_FLAG) ? msize(in, len) : len;
421
					if ((n = sfmove(in, out, z, delim)) < z || n < 0)
422
						goto done;
423
					lineno += n;
424
				}
425
			}
426
			else
427
			{
428
				if (peek)
429
				{
430
					if (out && (n = sfputr(out, peek, delim)) <= 0)
431
						goto done;
432
					lineno++;
433
					peek = 0;
434
				}
435
				while (s = sfgetr(in, delim, 1))
436
				{
437
					if (!(c = regexec(op->re, s, 0, NiL, 0)))
438
						break;
439
					lineno++;
440
					if (c != REG_NOMATCH)
441
					{
442
						regfatal(op->re, 2, c);
443
						goto err;
444
					}
445
					if (out && (n = sfputr(out, s, delim)) <= 0)
446
						goto done;
447
				}
448
				if (!(peek = s))
449
				{
450
					while (op->next)
451
						op = op->next;
452
					repeat = 1;
453
				}
454
			}
455
			if (out)
456
			{
457
				size = sfseek(out, (Sfoff_t)0, SEEK_END);
458
				if (!(flags & S_FLAG))
459
					sfprintf(sfstdout, "%I*d\n", sizeof(size), size);
460
				sfclose(out);
461
				out = 0;
462
				if ((flags & Z_FLAG) && size <= 0)
463
					remove(cp);
464
			}
465
		} while (!repeat || --repeat);
466
		op = op->next;
467
	}
468
 done:
469
	if (out)
470
	{
471
		sfclose(out);
472
		if (n <= 0)
473
			remove(cp);
474
	}
475
	if (n >= 0)
476
		return 0;
477
 err:
478
	if (!(flags & K_FLAG))
479
		removeall(fp);
480
	return 1;
481
}
482

483
int
484
main(int argc, char** argv)
485
{
486
	struct fname*	fp;
487
	struct op*	top;
488
	char*		cp;
489
	char*		prefix;
490
	const char*	usage;
491
	Sfio_t*		in;
492
	int		flags;
493
	ssize_t		n;
494

495
	char*		format = 0;
496
	Sfoff_t		size = 10000;
497
	int		suflen = 2;
498

499
	if (cp = strrchr(*argv, '/'))
500
		cp++;
501
	else
502
		cp = *argv;
503
	error_info.id = cp;
504
	if (streq(cp, "split"))
505
	{
506
		usage = split_usage;
507
		flags = S_FLAG|K_FLAG;
508
		prefix = "x";
509
	}
510
	else
511
	{
512
		usage = csplit_usage;
513
		flags = C_FLAG;
514
		prefix = "xx";
515
	}
516
	for (;;)
517
	{
518
		switch (optget(argv, usage))
519
		{
520
		case 0:
521
			break;
522
		case 'l':
523
			flags &= ~(B_FLAG|M_FLAG);
524
			if ((size = opt_info.number) <= 0)
525
				error(1, "%s: invalid size", opt_info.arg);
526
			continue;
527
		case 'k':
528
			flags |= K_FLAG;
529
			continue;
530
		case 's':
531
			flags |= S_FLAG;
532
			continue;
533
		case 'z':
534
			flags |= Z_FLAG;
535
			continue;
536
		case 'f':
537
			prefix = opt_info.arg;
538
			continue;
539
		case 'a':
540
		case 'n':
541
			suflen = opt_info.num;
542
			continue;
543
		case 'C':
544
			flags |= M_FLAG;
545
		case 'b':
546
			if (flags & S_FLAG)
547
			{
548
				if ((size = opt_info.number) <= 0)
549
					error(1, "%s: invalid size", opt_info.arg);
550
				flags |= B_FLAG;
551
			}
552
			else
553
				format = opt_info.arg;
554
			continue;
555
		case ':':
556
			error(2, "%s", opt_info.arg);
557
			break;
558
		case '?':
559
			error(ERROR_usage(2), "%s", opt_info.arg);
560
			break;
561
		}
562
		break;
563
	}
564
	argv += opt_info.index;
565
	argc -= opt_info.index;
566
	if (error_info.errors || !(flags & C_FLAG) && argc > 2 || (flags & C_FLAG) && argc < 2)
567
		error(ERROR_usage(2), "%s", optusage(NiL));
568
	cp = *argv++;
569
	if (flags & C_FLAG)
570
	{
571
		struct op*	op = 0;
572
		char*		sp;
573

574
		while (sp = *argv++)
575
		{
576
			switch (*sp)
577
			{
578
			case '/':
579
			case '?':
580
			case '%':
581
				op = getexpr(op ? &op->next : &top, sp);
582
				break;
583
			case '{':
584
				if (!op)
585
					error(ERROR_exit(1), "%s: pattern expected for repeat count", *(argv - 1));
586
				if (*++sp == '*' && *(sp + 1) == '}' && !*(sp + 2))
587
					op->repeat = 0;
588
				else
589
				{
590
					if ((n = strtol(sp, &sp, 10)) <= 0 || *sp != '}' || *(sp + 1))
591
						error(ERROR_exit(1), "%s: invalid repeat count", *(argv - 1));
592
					op->repeat = n + 1;
593
				}
594
				if (op->flags == OP_ABSOLUTE)
595
					op->flags = OP_LINES;
596
				break;
597
			default:
598
				if ((size = strtoll(sp, &sp, 10)) <= 0 || *sp)
599
					error(ERROR_exit(1), "%s: invalid line number", *(argv - 1));
600
				op = getop(op ? &op->next : &top, size, 1, OP_ABSOLUTE, 0);
601
				break;
602
			}
603
		}
604
		op = getop(op ? &op->next : &top, SF_UNBOUND, 1, OP_LINES, 0);
605
		fp = setfname(prefix, format, suflen, '0', '9');
606
	}
607
	else
608
	{
609
		if (cp && *argv)
610
			prefix = *argv;
611
		getop(&top, size, SF_UNBOUND, OP_LINES, 0);
612
		fp = setfname(prefix, format, suflen, 'a', 'z');
613
	}
614
	if (!cp || streq(cp, "-"))
615
		in = sfstdin;
616
	else if (!(in = sfopen(NiL, cp, "r")))
617
		error(ERROR_system(1), "%s: cannot open", cp);
618
	n = split(in, fp, top, flags);
619
	if (in != sfstdin)
620
		sfclose(in);
621
	return n;
622
}
623

624
/* CoCalc page footer: Product, Resources, Company */