Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/cmd/pzip/pzip.c
1808 views
/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1998-2011 AT&T Intellectual Property *
* and is licensed under the *
* Eclipse Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* http://www.eclipse.org/org/documents/epl-v10.html *
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <[email protected]> *
* *
***********************************************************************/
#pragma prototyped
21
22
/*
 * partitioned fixed record zip
 */
25
26
static const char usage[] =
27
"[-?\n@(#)$Id: pzip (AT&T Research) 2003-07-17 $\n]"
28
USAGE_LICENSE
29
"[+NAME?\f?\f - fixed record partition compress/decompress]"
30
"[+DESCRIPTION?\b\f?\f\b compresses and decompresses data files of fixed"
31
" length rows (records) and columns (fields). It performs better than"
32
" \bgzip\b(1) in space/time on data that has many (typically > 50%)"
33
" columns that change at a low rate (columns with a low rate of change"
34
" are low frequency; columns with a high rate of change are high"
35
" frequency).]"
36
"[+?The \apzip\a compress format is itself \agzipped\a; decompressed data"
37
" is reorganized according to the user-specified \apartition\a file"
38
" (see the \b--partition\b option below) before being passed to"
39
" \agzip\a. Low frequency columns are difference encoded and high"
40
" frequency column groups are transposed to column-major order."
41
" The \agzip\a tables are flushed between each column partition group."
42
" This has a positive space/time effect on the \agzip\a string match"
43
" and huffman tables.]"
44
"[+?If a \apartition\a file is specified then \apzip\a compresses the input"
45
" \afile\a to the standard output, otherwise \apzip\a decompresses"
46
" the input \afile\a to the standard ouput. If \afile\a is omitted"
47
" then the standard input is used. If the standard input is a tty"
48
" then \b/dev/null\b is silently used.]"
49
"[+?\afile\a may be \apzip\a compressed, \agzip\a compressed, or raw."
50
" \apzip\a files self-identify; the row size and partition can be"
51
" determined from the \apzip\a header. For \agzip\a and raw data,"
52
" the following are done to determine the row size:]{"
53
" [+(1)?The row size is taken from the \b--row\b option"
54
" if specified.]"
55
" [+(2)?If the \b--partition\b option is specified then the"
56
" row size is taken from the \apartition\a file.]"
57
" [+(3)?If the data is newline-terminated and if it contains"
58
" at least two lines and if the first two data lines"
59
" have the same length then that length is taken"
60
" to be the row size.]"
61
" [+(4)?Otherwise the row size cannot be determined and"
62
" \apzip\a exits with a diagnostic.]"
63
"}"
64
65
"[a:append?Sets the \bPZ_APPEND\b flag that may be used by some disciplines.]"
66
"[b:bzip?Use \bbzip\b(1) compression instead of the default \bgzip\b(1)."
67
" \abzip\a is not fully supported, pending further investigation.]"
68
"[c:comment?Place \acomment\a in the output \apzip\a file header when"
69
" compressing. The comment is listed by the \b--header\b option.]:"
70
" [comment]"
71
"[x:crc?Enable \agzip\a crc32 cyclic redundancy checking for decompress."
72
" On some systems this can double the execution wall time."
73
" Most data corruption errors are still caught even with \bnocrc\b.]"
74
"[d:dump?Enable detailed tracing.]"
75
"[B:bufsize?Set the output buffer size to \asize\a -- for debugging.]#[size]"
76
"[D:debug?Set the debug trace level to \alevel\a. Higher levels produce"
77
" more output.]#[level]"
78
"[O:dio?Push the \bsfdcdio\b(3) direct io discipline on the input streams."
79
" Silently ignored on systems that do not support direct io.]"
80
"[G!:gzip?\b--nogzip\b disables \agzip\a compression. Most often used for"
81
" conversion or debugging.]"
82
"[h:header?List header information on the input \apzip\a file and exit."
83
" This output is compatible with the \b--partition\b file format.]"
84
"[l:library?Loads the dll \alibrary\a via the \apzlib\a() call."
85
" \alibrary\a must contain the function"
86
" \bint pz_init(Pz_t* pz, Pzdisc_t* disc)\b"
87
" which is called during \apzip\a stream initialization. \bpz_init\b"
88
" allows run time modification to \adisc\a: most often it supplies"
89
" alternate discipline functions. Runtime libraries may interpret"
90
" options specific to the library; library usage and description"
91
" will be appended to online help output if the help options"
92
" appear after the \b--library\b option. Runtime libraries may"
93
" provide additional diagnostics and tracing when \b--summary\b,"
94
" \b--verbose\b or \b--dump\b are specified. In general, runtime"
95
" libraries are not needed for decompression. The \b--header\b"
96
" option lists the runtime libraries used to compress the input"
97
" file.]:[library]"
98
"[z:lzw?Use \bcompress\b(1) lzw compression instead of the default \bgzip\b(1)."
99
" \alzw\a is not fully supported, pending further investigation.]"
100
"[o:override?Override the column partition. Currently only fixed value"
101
" columns may be specified. The syntax is"
102
" \abegin\a[-\aend\a]]='\avalue\a' where \abegin\a is the beginning"
103
" column offset (starting at 0), \aend\a is the ending column offset"
104
" for an inclusive range, and \avalue\a is the fixed column value."
105
" Uncompress time is improved when high frequency columns are given"
106
" fixed values (see the \b--partition\b option).]:[begin[-end]]=value]"
107
"[p:partition?\afile\a specifies the data row size and the high frequency"
108
" column partition groups and permutation. \afile\a may contain URL-like"
109
" components: \apath\a\b?name=\b\apart\a or \apath\a\b#\b\apart\a"
110
" reads the partition file \apath\a and uses the partition named"
111
" \apart\a. Other options may be set by separating each with , or space."
112
" The partition file is a sequence of lines. Comments start with # and"
113
" continue to the end of the line. The first non-comment line specifies"
114
" the optional name string in \"...\". The next non-comment line"
115
" specifies the row size. The remaining lines operate on column offset"
116
" ranges of the form: \abegin\a[-\aend\a]] where \abegin\a is the"
117
" beginning column offset (starting at 0), and \aend\a is the ending"
118
" column offset for an inclusive range. The file name \b//\b or"
119
" \b/gzip/\b disables \bpzip\b partitioning and applies only"
120
" \bgzip\b compression. The operators are:]:[file]{"
121
" [+range [...]]?places all columns in the specified \arange\a"
122
" list in the same high frequency partition group."
123
" Each high frequency partition group is processed as"
124
" a separate block by the underlying compressor"
125
" (\bgzip\b(1) by default).]"
126
" [+range='value'?specifies that each column in \arange\a"
127
" has the fixed character value \avalue\a. C-style"
128
" character escapes are valid for \avalue\a.]"
129
"}"
130
"[Z:push?Push the \bsfdcpzip\b(3) io discipline rather than direct library"
131
" calls. Used for debugging and performance testing.]"
132
"[P!:pzip?\b--nopzip\b disables \apzip\a compression. Most often used for"
133
" conversion or debugging.]"
134
"[Q:regress?Generate output for regression testing, such that identical"
135
" invocations with identical input files will generate the same output.]"
136
"[r:row?Specifies the input row size (number of byte columns) for data"
137
" that does not self-identify.]#[row-size]"
138
"[S:split?Instead of compressing, the input split discipline, which must be"
139
" specified by a subsequent \b--library\b option, splits the input"
140
" data into files named \aid\a. \aid\a is determined by the split"
141
" discipline \bnamef\b function. The optional \apattern\a is a \bksh\b(1)"
142
" file match pattern that limits the split to \aid\a's matching"
143
" \apattern\a (e.g., \b--split='1234|98765'\b.) If \b--append\b is also"
144
" specified then the data is appened to any pre-existing \aid\a files;"
145
" otherwise each file is truncated when the first record containing"
146
" \aid\a data is read. If there are no records with \aid\a data then"
147
" the \aid\a file is not modified. \b--split\b should be used in a"
148
" separate directory, and the directory should be cleared when"
149
" \b--append\b is not specified to avoid mixing inconsistent"
150
" data. No records will be written to a split file with size"
151
" >= \b--window\b bytes.]:?[pattern]"
152
"[s:summary?Enable summary tracing to the standard error. Runtime libraries"
153
" may add addtional information to the default \bpzip\b(3) library"
154
" summary output. Compression summary includes the compression rate,"
155
" bytes per record, and compression wall time. This option also"
156
" enables split discipline warnings about \aid\a partitions that"
157
" should be generated by \bpin\b(1) and added to the partition"
158
" file to improve compression. The \bpin\b(1) output, with an additional"
159
" \"\aid\a\" line manually prepended, can then be appended to an existing"
160
" partition file.]"
161
"[T:test?Enable \bpzip\b(3) implementation-specific tests and tracing.]#"
162
" [mask]"
163
"[v:verbose?Enable intermediate tracing.]"
164
"[w:window?Each chunk of \awindow\a bytes is compressed separately. The"
165
" window size may be silently decreased to accomodate an integral"
166
" number of complete rows.]#[window-size:=4M]"
167
"[W:write-test?Loop on \asfread\a()/\bpzwrite\b(3) in chunks of \agroup\a"
168
" records rather than a single \apzdeflate\a() call for compression."
169
" Used for debugging and performance testing.]#[group]"
170
"[X:prefix?Uncompressed data contains a prefix that is defined by \acount\a"
171
" and an optional \aterminator\a. This data is preserved but is not"
172
" \bpzip\b compressed. If \acount\a is \b0\b on uncompress then the"
173
" header is not copied to the output. \aterminator\a may be one"
174
" of:]:[count[*terminator]]]{"
175
" [+\aomitted\a?\acount\a bytes.]"
176
" [+L?\acount\a \bnewline\b terminated records.]"
177
" [+'\achar\a'?\acount\a \achar\a terminated records.]"
178
"}"
179
180
"\n"
181
"\nfile\n"
182
"\n"
183
"[+SEE ALSO?\bbzip\b(1), \bgzip\b(1), \bpin\b(1), \bpop\b(1), \bpzip\b(3)]"
184
"[+BUGS?\bpzip\b decompress currently fails if the standard input is a pipe."
185
" This will be addressed in a future release.]"
186
;
187
188
#include <ast.h>
189
#include <error.h>
190
#include <pzip.h>
191
#include <sfdcbzip.h>
192
193
/*
 * Alternate compression method hook: main() stores sfdcbzip or sfdclzw
 * here and calls it with a stream and 0, treating a negative return as
 * a discipline push error.
 */
typedef int (*Method_f)(Sfio_t*, int);
194
195
int
196
main(int argc, char** argv)
197
{
198
register Pz_t* pz;
199
Pzdisc_t disc;
200
Sfio_t* dp;
201
char* s;
202
203
Method_f method = 0;
204
ssize_t bufsize = 0;
205
int push = 0;
206
int testwrite = 0;
207
unsigned long flags = PZ_READ|PZ_FORCE;
208
209
if (s = strrchr(*argv, '/'))
210
s++;
211
else
212
s = *argv;
213
error_info.id = s;
214
memset(&disc, 0, sizeof(disc));
215
disc.version = PZ_VERSION;
216
disc.errorf = errorf;
217
if (!(dp = sfstropen()))
218
error(ERROR_SYSTEM|3, "out of space");
219
for (;;)
220
{
221
switch (optget(argv, usage))
222
{
223
case 'a':
224
flags |= PZ_APPEND;
225
continue;
226
case 'b':
227
method = sfdcbzip;
228
continue;
229
case 'c':
230
disc.comment = opt_info.arg;
231
continue;
232
case 'd':
233
flags |= PZ_DUMP;
234
continue;
235
case 'h':
236
flags &= ~(PZ_READ|PZ_WRITE|PZ_FORCE);
237
flags |= PZ_STAT|PZ_DUMP;
238
continue;
239
case 'l':
240
sfprintf(dp, "library=%s\n", opt_info.arg);
241
continue;
242
case 'o':
243
sfputr(dp, opt_info.arg, '\n');
244
continue;
245
case 'p':
246
disc.partition = opt_info.arg;
247
flags &= ~(PZ_READ|PZ_FORCE);
248
flags |= PZ_WRITE;
249
continue;
250
case 'r':
251
sfprintf(dp, "row=%d\n", opt_info.num);
252
continue;
253
case 's':
254
flags |= PZ_SUMMARY;
255
continue;
256
case 'S':
257
flags |= PZ_SPLIT;
258
sfprintf(dp, "split%s%s\n", opt_info.arg ? "=" : "", opt_info.arg ? opt_info.arg : "");
259
continue;
260
case 'v':
261
flags |= PZ_VERBOSE;
262
continue;
263
case 'w':
264
disc.window = opt_info.num;
265
continue;
266
case 'x':
267
flags |= PZ_CRC;
268
continue;
269
case 'z':
270
method = sfdclzw;
271
continue;
272
case 'B':
273
bufsize = opt_info.num;
274
continue;
275
case 'D':
276
error_info.trace = -opt_info.num;
277
continue;
278
case 'O':
279
flags |= PZ_DIO;
280
continue;
281
case 'G':
282
flags |= PZ_NOGZIP;
283
continue;
284
case 'P':
285
flags |= PZ_NOPZIP;
286
continue;
287
case 'Q':
288
sfprintf(dp, "regress\n");
289
continue;
290
case 'T':
291
sfprintf(dp, "test=%s\n", opt_info.arg);
292
continue;
293
case 'W':
294
testwrite = opt_info.num;
295
continue;
296
case 'X':
297
sfprintf(dp, "prefix=%s\n", opt_info.arg);
298
continue;
299
case 'Z':
300
push = 1;
301
continue;
302
case '?':
303
if (opt_info.name[0] == '-' && opt_info.name[1] == '-')
304
sfputr(dp, opt_info.name + 2, '\n');
305
else
306
sfputr(dp, "??short", '\n');
307
continue;
308
case ':':
309
if (!opt_info.option[0] && opt_info.name[0] == opt_info.name[1] || opt_info.option[0] == '-' && opt_info.option[1] == '?')
310
sfputr(dp, &argv[opt_info.index - 1][2], '\n');
311
else
312
error(2, "%s", opt_info.arg);
313
continue;
314
}
315
break;
316
}
317
argv += opt_info.index;
318
if (error_info.errors || *argv && *(argv + 1))
319
error(ERROR_USAGE|4, "%s", optusage(NiL));
320
if (sftell(dp) && !(disc.options = strdup(sfstruse(dp))))
321
error(ERROR_SYSTEM|3, "out of space");
322
sfclose(dp);
323
if (flags & PZ_SPLIT)
324
{
325
flags &= ~PZ_WRITE;
326
flags |= PZ_READ;
327
}
328
if (push)
329
{
330
if (*argv)
331
error(3, "%s: file argument not expected for sfdcpzip discipline test", *argv);
332
if (sfdcpzip(sfstdin, NiL, flags, &disc) < 0)
333
error(3, "sfdcpzip discipline push error");
334
if (sfmove(sfstdin, sfstdout, SF_UNBOUND, -1) < 0 || sfclose(sfstdout))
335
error(ERROR_SYSTEM|3, "sfdcpzip io error");
336
return 0;
337
}
338
if (method)
339
{
340
flags |= PZ_NOGZIP;
341
if ((*method)((flags & PZ_WRITE) ? sfstdout : sfstdin, 0) < 0)
342
error(3, "compression method discipline push error");
343
}
344
if (bufsize)
345
{
346
sfset(sfstdout, SF_SHARE|SF_LINE, 0);
347
sfsetbuf(sfstdout, NiL, bufsize);
348
}
349
if (pz = pzopen(&disc, *argv, flags))
350
{
351
if (testwrite && (flags & PZ_WRITE))
352
{
353
unsigned char* buf;
354
size_t n;
355
ssize_t r;
356
357
n = pz->part->row * testwrite;
358
error(1, "pzwrite test %d bytes per chunk", n);
359
if (!(buf = newof(0, unsigned char, n, 0)))
360
error(ERROR_SYSTEM|3, "out of space [buf]");
361
while ((r = sfread(pz->io, buf, n)) > 0)
362
if (pzwrite(pz, sfstdout, buf, r) != r)
363
return 1;
364
if (r < 0)
365
error(ERROR_SYSTEM|3, "%s: read error", pz->path);
366
}
367
return ((flags & PZ_WRITE) && pzdeflate(pz, sfstdout) || (flags & PZ_READ) && pzinflate(pz, sfstdout)) || pzclose(pz) || error_info.errors;
368
}
369
return 1;
370
}
371
372