CoCalc -- pzip.c

GitHub Repository: att/ast
Path: blob/master/src/cmd/pzip/pzip.c
¹⁸⁰⁸ views
1
/***********************************************************************
2
*                                                                      *
3
*               This software is part of the ast package               *
4
*          Copyright (c) 1998-2011 AT&T Intellectual Property          *
5
*                      and is licensed under the                       *
6
*                 Eclipse Public License, Version 1.0                  *
7
*                    by AT&T Intellectual Property                     *
8
*                                                                      *
9
*                A copy of the License is available at                 *
10
*          http://www.eclipse.org/org/documents/epl-v10.html           *
11
*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12
*                                                                      *
13
*              Information and Software Systems Research               *
14
*                            AT&T Research                             *
15
*                           Florham Park NJ                            *
16
*                                                                      *
17
*                 Glenn Fowler <gsf@research.att.com>                  *
18
*                                                                      *
19
***********************************************************************/
20
#pragma prototyped
21

22
/*
23
 * partitioned fixed record zip
24
 */
25

26
static const char usage[] =
27
"[-?\n@(#)$Id: pzip (AT&T Research) 2003-07-17 $\n]"
28
USAGE_LICENSE
29
"[+NAME?\f?\f - fixed record partition compress/decompress]"
30
"[+DESCRIPTION?\b\f?\f\b compresses and decompresses data files of fixed"
31
"	length rows (records) and columns (fields). It performs better than"
32
"	\bgzip\b(1) in space/time on data that has many (typically > 50%)"
33
"	columns that change at a low rate (columns with a low rate of change"
34
"	are low frequency; columns with a high rate of change are high"
35
"	frequency).]"
36
"[+?The \apzip\a compress format is itself \agzipped\a; decompressed data"
37
"	is reorganized according to the user-specified \apartition\a file"
38
"	(see the \b--partition\b option below) before being passed to"
39
"	\agzip\a. Low frequency columns are difference encoded and high"
40
"	frequency column groups are transposed to column-major order."
41
"	The \agzip\a tables are flushed between each column partition group."
42
"	This has a positive space/time effect on the \agzip\a string match"
43
"	and huffman tables.]"
44
"[+?If a \apartition\a file is specified then \apzip\a compresses the input"
45
"	\afile\a to the standard output, otherwise \apzip\a decompresses"
46
"	the input \afile\a to the standard ouput. If \afile\a is omitted"
47
"	then the standard input is used. If the standard input is a tty"
48
"	then \b/dev/null\b is silently used.]"
49
"[+?\afile\a may be \apzip\a compressed, \agzip\a compressed, or raw."
50
"	\apzip\a files self-identify; the row size and partition can be"
51
"	determined from the \apzip\a header. For \agzip\a and raw data,"
52
"	the following are done to determine the row size:]{"
53
"		[+(1)?The row size is taken from the \b--row\b option"
54
"			if specified.]"
55
"		[+(2)?If the \b--partition\b option is specified then the"
56
"			row size is taken from the \apartition\a file.]"
57
"		[+(3)?If the data is newline-terminated and if it contains"
58
"			at least two lines and if the first two data lines"
59
"			have the same length then that length is taken"
60
"			to be the row size.]"
61
"		[+(4)?Otherwise the row size cannot be determined and"
62
"			\apzip\a exits with a diagnostic.]"
63
"}"
64

65
"[a:append?Sets the \bPZ_APPEND\b flag that may be used by some disciplines.]"
66
"[b:bzip?Use \bbzip\b(1) compression instead of the default \bgzip\b(1)."
67
"	\abzip\a is not fully supported, pending further investigation.]"
68
"[c:comment?Place \acomment\a in the output \apzip\a file header when"
69
"	compressing. The comment is listed by the \b--header\b option.]:"
70
"		[comment]"
71
"[x:crc?Enable \agzip\a crc32 cyclic redundancy checking for decompress."
72
"	On some systems this can double the execution wall time."
73
"	Most data corruption errors are still caught even with \bnocrc\b.]"
74
"[d:dump?Enable detailed tracing.]"
75
"[B:bufsize?Set the output buffer size to \asize\a -- for debugging.]#[size]"
76
"[D:debug?Set the debug trace level to \alevel\a. Higher levels produce"
77
"	more output.]#[level]"
78
"[O:dio?Push the \bsfdcdio\b(3) direct io discipline on the input streams."
79
"	Silently ignored on systems that do not support direct io.]"
80
"[G!:gzip?\b--nogzip\b disables \agzip\a compression. Most often used for"
81
"	conversion or debugging.]"
82
"[h:header?List header information on the input \apzip\a file and exit."
83
"	This output is compatible with the \b--partition\b file format.]"
84
"[l:library?Loads the dll \alibrary\a via the \apzlib\a() call."
85
"	\alibrary\a must contain the function"
86
"	\bint pz_init(Pz_t* pz, Pzdisc_t* disc)\b"
87
"	which is called during \apzip\a stream initialization. \bpz_init\b"
88
"	allows run time modification to \adisc\a: most often it supplies"
89
"	alternate discipline functions. Runtime libraries may interpret"
90
"	options specific to the library; library usage and description"
91
"	will be appended to online help output if the help options"
92
"	appear after the \b--library\b option. Runtime libraries may"
93
"	provide additional diagnostics and tracing when \b--summary\b,"
94
"	\b--verbose\b or \b--dump\b are specified. In general, runtime"
95
"	libraries are not needed for decompression. The \b--header\b"
96
"	option lists the runtime libraries used to compress the input"
97
"	file.]:[library]"
98
"[z:lzw?Use \bcompress\b(1) lzw compression instead of the default \bgzip\b(1)."
99
"	\alzw\a is not fully supported, pending further investigation.]"
100
"[o:override?Override the column partition. Currently only fixed value"
101
"	columns may be specified. The syntax is"
102
"	\abegin\a[-\aend\a]]='\avalue\a' where \abegin\a is the beginning"
103
"	column offset (starting at 0), \aend\a is the ending column offset"
104
"	for an inclusive range, and \avalue\a is the fixed column value."
105
"	Uncompress time is improved when high frequency columns are given"
106
"	fixed values (see the \b--partition\b option).]:[begin[-end]]=value]"
107
"[p:partition?\afile\a specifies the data row size and the high frequency"
108
"	column partition groups and permutation. \afile\a may contain URL-like"
109
"	components: \apath\a\b?name=\b\apart\a or \apath\a\b#\b\apart\a"
110
"	reads the partition file \apath\a and uses the partition named"
111
"	\apart\a. Other options may be set by separating each with , or space."
112
"	The partition file is a sequence of lines. Comments start with # and"
113
"	continue to the end of the line. The first non-comment line specifies"
114
"	the optional name string in \"...\". The next non-comment line"
115
"	specifies the row size. The remaining lines operate on column offset"
116
"	ranges of the form: \abegin\a[-\aend\a]] where \abegin\a is the"
117
"	beginning column offset (starting at 0), and \aend\a is the ending"
118
"	column offset for an inclusive range. The file name \b//\b or"
119
"	\b/gzip/\b disables \bpzip\b partitioning and applies only"
120
"	\bgzip\b compression. The operators are:]:[file]{"
121
"		[+range [...]]?places all columns in the specified \arange\a"
122
"			list in the same high frequency partition group."
123
"			Each high frequency partition group is processed as"
124
"			a separate block by the underlying compressor"
125
"			(\bgzip\b(1) by default).]"
126
"		[+range='value'?specifies that each column in \arange\a"
127
"			has the fixed character value \avalue\a. C-style"
128
"			character escapes are valid for \avalue\a.]"
129
"}"
130
"[Z:push?Push the \bsfdcpzip\b(3) io discipline rather than direct library"
131
"	calls. Used for debugging and performance testing.]"
132
"[P!:pzip?\b--nopzip\b disables \apzip\a compression. Most often used for"
133
"	conversion or debugging.]"
134
"[Q:regress?Generate output for regression testing, such that identical"
135
"	invocations with identical input files will generate the same output.]"
136
"[r:row?Specifies the input row size (number of byte columns) for data"
137
"	that does not self-identify.]#[row-size]"
138
"[S:split?Instead of compressing, the input split discipline, which must be"
139
"	specified by a subsequent \b--library\b option, splits the input"
140
"	data into files named \aid\a. \aid\a is determined by the split"
141
"	discipline \bnamef\b function. The optional \apattern\a is a \bksh\b(1)"
142
"	file match pattern that limits the split to \aid\a's matching"
143
"	\apattern\a (e.g., \b--split='1234|98765'\b.) If \b--append\b is also"
144
"	specified then the data is appened to any pre-existing \aid\a files;"
145
"	otherwise each file is truncated when the first record containing"
146
"	\aid\a data is read. If there are no records with \aid\a data then"
147
"	the \aid\a file is not modified. \b--split\b should be used in a"
148
"	separate directory, and the directory should be cleared when"
149
"	\b--append\b is not specified to avoid mixing inconsistent"
150
"	data. No records will be written to a split file with size"
151
"	>= \b--window\b bytes.]:?[pattern]"
152
"[s:summary?Enable summary tracing to the standard error. Runtime libraries"
153
"	may add addtional information to the default \bpzip\b(3) library"
154
"	summary output. Compression summary includes the compression rate,"
155
"	bytes per record, and compression wall time. This option also"
156
"	enables split discipline warnings about \aid\a partitions that"
157
"	should be generated by \bpin\b(1) and added to the partition"
158
"	file to improve compression. The \bpin\b(1) output, with an additional"
159
"	\"\aid\a\" line manually prepended, can then be appended to an existing"
160
"	partition file.]"
161
"[T:test?Enable \bpzip\b(3) implementation-specific tests and tracing.]#"
162
"		[mask]"
163
"[v:verbose?Enable intermediate tracing.]"
164
"[w:window?Each chunk of \awindow\a bytes is compressed separately. The"
165
"	window size may be silently decreased to accomodate an integral"
166
"	number of complete rows.]#[window-size:=4M]"
167
"[W:write-test?Loop on \asfread\a()/\bpzwrite\b(3) in chunks of \agroup\a"
168
"	records rather than a single \apzdeflate\a() call for compression."
169
"	Used for debugging and performance testing.]#[group]"
170
"[X:prefix?Uncompressed data contains a prefix that is defined by \acount\a"
171
"	and an optional \aterminator\a. This data is preserved but is not"
172
"	\bpzip\b compressed. If \acount\a is \b0\b on uncompress then the"
173
"	header is not copied to the output. \aterminator\a may be one"
174
"	of:]:[count[*terminator]]]{"
175
"		[+\aomitted\a?\acount\a bytes.]"
176
"		[+L?\acount\a \bnewline\b terminated records.]"
177
"		[+'\achar\a'?\acount\a \achar\a terminated records.]"
178
"}"
179

180
"\n"
181
"\nfile\n"
182
"\n"
183
"[+SEE ALSO?\bbzip\b(1), \bgzip\b(1), \bpin\b(1), \bpop\b(1), \bpzip\b(3)]"
184
"[+BUGS?\bpzip\b decompress currently fails if the standard input is a pipe."
185
"	This will be addressed in a future release.]"
186
;
187

188
#include <ast.h>
189
#include <error.h>
190
#include <pzip.h>
191
#include <sfdcbzip.h>
192

193
/*
 * alternate compression method discipline push function
 *
 * main() points this at sfdcbzip or sfdclzw and calls it with the
 * stream to push on and 0; a negative return is treated as failure
 */

typedef int (*Method_f)(Sfio_t*, int);
194

195
int
196
main(int argc, char** argv)
197
{
198
	register Pz_t*	pz;
199
	Pzdisc_t	disc;
200
	Sfio_t*		dp;
201
	char*		s;
202

203
	Method_f	method = 0;
204
	ssize_t		bufsize = 0;
205
	int		push = 0;
206
	int		testwrite = 0;
207
	unsigned long	flags = PZ_READ|PZ_FORCE;
208

209
	if (s = strrchr(*argv, '/'))
210
		s++;
211
	else
212
		s = *argv;
213
	error_info.id = s;
214
	memset(&disc, 0, sizeof(disc));
215
	disc.version = PZ_VERSION;
216
	disc.errorf = errorf;
217
	if (!(dp = sfstropen()))
218
		error(ERROR_SYSTEM|3, "out of space");
219
	for (;;)
220
	{
221
		switch (optget(argv, usage))
222
		{
223
		case 'a':
224
			flags |= PZ_APPEND;
225
			continue;
226
		case 'b':
227
			method = sfdcbzip;
228
			continue;
229
		case 'c':
230
			disc.comment = opt_info.arg;
231
			continue;
232
		case 'd':
233
			flags |= PZ_DUMP;
234
			continue;
235
		case 'h':
236
			flags &= ~(PZ_READ|PZ_WRITE|PZ_FORCE);
237
			flags |= PZ_STAT|PZ_DUMP;
238
			continue;
239
		case 'l':
240
			sfprintf(dp, "library=%s\n", opt_info.arg);
241
			continue;
242
		case 'o':
243
			sfputr(dp, opt_info.arg, '\n');
244
			continue;
245
		case 'p':
246
			disc.partition = opt_info.arg;
247
			flags &= ~(PZ_READ|PZ_FORCE);
248
			flags |= PZ_WRITE;
249
			continue;
250
		case 'r':
251
			sfprintf(dp, "row=%d\n", opt_info.num);
252
			continue;
253
		case 's':
254
			flags |= PZ_SUMMARY;
255
			continue;
256
		case 'S':
257
			flags |= PZ_SPLIT;
258
			sfprintf(dp, "split%s%s\n", opt_info.arg ? "=" : "", opt_info.arg ? opt_info.arg : "");
259
			continue;
260
		case 'v':
261
			flags |= PZ_VERBOSE;
262
			continue;
263
		case 'w':
264
			disc.window = opt_info.num;
265
			continue;
266
		case 'x':
267
			flags |= PZ_CRC;
268
			continue;
269
		case 'z':
270
			method = sfdclzw;
271
			continue;
272
		case 'B':
273
			bufsize = opt_info.num;
274
			continue;
275
		case 'D':
276
			error_info.trace = -opt_info.num;
277
			continue;
278
		case 'O':
279
			flags |= PZ_DIO;
280
			continue;
281
		case 'G':
282
			flags |= PZ_NOGZIP;
283
			continue;
284
		case 'P':
285
			flags |= PZ_NOPZIP;
286
			continue;
287
		case 'Q':
288
			sfprintf(dp, "regress\n");
289
			continue;
290
		case 'T':
291
			sfprintf(dp, "test=%s\n", opt_info.arg);
292
			continue;
293
		case 'W':
294
			testwrite = opt_info.num;
295
			continue;
296
		case 'X':
297
			sfprintf(dp, "prefix=%s\n", opt_info.arg);
298
			continue;
299
		case 'Z':
300
			push = 1;
301
			continue;
302
		case '?':
303
			if (opt_info.name[0] == '-' && opt_info.name[1] == '-')
304
				sfputr(dp, opt_info.name + 2, '\n');
305
			else
306
				sfputr(dp, "??short", '\n');
307
			continue;
308
		case ':':
309
			if (!opt_info.option[0] && opt_info.name[0] == opt_info.name[1] || opt_info.option[0] == '-' && opt_info.option[1] == '?')
310
				sfputr(dp, &argv[opt_info.index - 1][2], '\n');
311
			else
312
				error(2, "%s", opt_info.arg);
313
			continue;
314
		}
315
		break;
316
	}
317
	argv += opt_info.index;
318
	if (error_info.errors || *argv && *(argv + 1))
319
		error(ERROR_USAGE|4, "%s", optusage(NiL));
320
	if (sftell(dp) && !(disc.options = strdup(sfstruse(dp))))
321
		error(ERROR_SYSTEM|3, "out of space");
322
	sfclose(dp);
323
	if (flags & PZ_SPLIT)
324
	{
325
		flags &= ~PZ_WRITE;
326
		flags |= PZ_READ;
327
	}
328
	if (push)
329
	{
330
		if (*argv)
331
			error(3, "%s: file argument not expected for sfdcpzip discipline test", *argv);
332
		if (sfdcpzip(sfstdin, NiL, flags, &disc) < 0)
333
			error(3, "sfdcpzip discipline push error");
334
		if (sfmove(sfstdin, sfstdout, SF_UNBOUND, -1) < 0 || sfclose(sfstdout))
335
			error(ERROR_SYSTEM|3, "sfdcpzip io error");
336
		return 0;
337
	}
338
	if (method)
339
	{
340
		flags |= PZ_NOGZIP;
341
		if ((*method)((flags & PZ_WRITE) ? sfstdout : sfstdin, 0) < 0)
342
			error(3, "compression method discipline push error");
343
	}
344
	if (bufsize)
345
	{
346
		sfset(sfstdout, SF_SHARE|SF_LINE, 0);
347
		sfsetbuf(sfstdout, NiL, bufsize);
348
	}
349
	if (pz = pzopen(&disc, *argv, flags))
350
	{
351
		if (testwrite && (flags & PZ_WRITE))
352
		{
353
			unsigned char*	buf;
354
			size_t		n;
355
			ssize_t		r;
356

357
			n = pz->part->row * testwrite;
358
			error(1, "pzwrite test %d bytes per chunk", n);
359
			if (!(buf = newof(0, unsigned char, n, 0)))
360
				error(ERROR_SYSTEM|3, "out of space [buf]");
361
			while ((r = sfread(pz->io, buf, n)) > 0)
362
				if (pzwrite(pz, sfstdout, buf, r) != r)
363
					return 1;
364
			if (r < 0)
365
				error(ERROR_SYSTEM|3, "%s: read error", pz->path);
366
		}
367
		return ((flags & PZ_WRITE) && pzdeflate(pz, sfstdout) || (flags & PZ_READ) && pzinflate(pz, sfstdout)) || pzclose(pz) || error_info.errors;
368
	}
369
	return 1;
370
}
371

372
Product

Resources

Company