Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/cmd/pzip/pop.c
1808 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1998-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <[email protected]> *
18
* *
19
***********************************************************************/
20
#pragma prototyped
21
22
/*
23
* partitioned fixed ops
24
*/
25
26
static const char usage[] =
27
"[-?\n@(#)$Id: pop (AT&T Research) 2003-04-05 $\n]"
28
USAGE_LICENSE
29
"[+NAME?pop - operate on partioned fixed row and column data]"
30
"[+DESCRIPTION?\bpop\b operates on partitioned fixed row and column data files."
31
" It can cut high or low frequency partition columns, list format field"
32
" names for partition columns, and list the partition column frequencies."
33
" See \bpzip\b(1) for a detailed description of file partitions"
34
" and column frequencies.]"
35
36
"[c:cut?Copy selected columns from the input rows to the standard output.]"
37
"[e:endiff?Copy the row-by-row difference to the standard output.]"
38
"[f:format?Specifies the data format (schema) file. Two input styles"
39
" are accepted. The first style lists field names and sizes in"
40
" consecutive order: `\bname\b,\asize\a[,\acomment\a...]]'. The second"
41
" style lists the field offset range and name:"
42
" `\abegin\a[-\aend\a]] \bname\b'. Column offsets start at 0."
43
" Names are used to label partition group listings on the standard"
44
" output, with partition groups separated by an empty line.]:[file]"
45
"[h:high?List information on high frequency columns only. This is"
46
" the default.]"
47
"[i:information?List the selected column frequency information on the"
48
" standard output.]"
49
"[l:low?List information on low frequency columns only.]"
50
"[m:map?List the partition file with the row size equal to the number of"
51
" high frequency columns and the high frequency columns renumbered"
52
" in order from 0. This partition file can then be used on high"
53
" frequency data produced by the \b--cut\b option.]"
54
"[n:newline?Append a newline to each cut output row.]"
55
"[o:override?Override the column partition. Currently only fixed value"
56
" columns may be specified. The syntax is"
57
" \abegin\a[-\aend\a]]='\avalue\a' where \abegin\a is the beginning"
58
" column offset (starting at 0), \aend\a is the ending column offset"
59
" for an inclusive range, and \avalue\a is the fixed column value."
60
" Uncompress time is improved when high frequency columns are given"
61
" fixed values (see the \b--partition\b option).]:[name=value]"
62
"[p:partition?Specifies the data row size and the high frequency column"
63
" partition groups and permutation. The partition file is a sequence"
64
" of lines. Comments start with # and continue to the end of the line."
65
" The first non-comment line specifies the optional name string"
66
" in \"...\". The next non-comment line specifies the row size."
67
" The remaining lines operate on column offset ranges of the form:"
68
" \abegin\a[-\aend\a]] where \abegin\a is the beginning column offset"
69
" (starting at 0), and \aend\a is the ending column offset for an"
70
" inclusive range. The operators are:]:[file]{"
71
" [+range [...]]?places all columns in the specified \arange\a"
72
" list in the same high frequency partition group."
73
" Each high frequency partition group is processed as"
74
" a separate block by the underlying compressor"
75
" (\bgzip\b(1) by default).]"
76
" [+range='value'?specifies that each column in \arange\a"
77
" has the fixed character value \avalue\a. C-style"
78
" character escapes are valid for \avalue\a.]"
79
"}"
80
"[r:row?Specifies the input row size (number of byte columns). Exactly"
81
" one of \b--row\b or \b--partition\b must be specified.]#[row-size]"
82
"[u:undiff?The inverse of the \b--endiff\b difference encoding.]"
83
"[v:verbose?List header information on the input \apzip\a file or"
84
" \apartition-file\a and continue processing.]"
85
"[x:identify?Identify output information columns with labels from the"
86
" \b--format\b file.]"
87
"[Q:regress?Generate output for regression testing, such that identical"
88
" invocations with identical input files will generate the same output.]"
89
"[T:test?Enable implementation-specific tests and tracing.]#[test-mask]"
90
"[X:prefix?Uncompressed data contains a prefix that is defined by \acount\a"
91
" and an optional \aterminator\a. This data is not \bpzip\b compressed."
92
" \aterminator\a may be one of:]:[count[*terminator]]]{"
93
" [+\aomitted\a?\acount\a bytes.]"
94
" [+L?\acount\a \bnewline\b terminated records.]"
95
" [+'\achar\a'?\acount\a \achar\a terminated records.]"
96
97
"\n"
98
"\n[ file ]\n"
99
"\n"
100
"[+SEE ALSO?\bgzip\b(1), \bpin\b(1), \bpzip\b(1), \bpzip\b(3)]"
101
;
102
103
#include <ast.h>
104
#include <ctype.h>
105
#include <error.h>
106
#include <pzip.h>
107
#include <tok.h>
108
109
/*
 * operation flags: main() accumulates these in `op' from the command
 * line options and passes them to the worker functions
 */
#define OP_CUT		0x0001	/* -c: copy selected columns		*/
#define OP_ENDIFF	0x0002	/* -e: row-by-row difference encode	*/
#define OP_ID		0x0004	/* -x: identify output			*/
#define OP_INFO		0x0008	/* -i: list frequency information	*/
#define OP_LO		0x0010	/* -l: low frequency columns		*/
#define OP_MAP		0x0020	/* -m: list the renumbered partition	*/
#define OP_NL		0x0040	/* -n: newline after each cut row	*/
#define OP_UNDIFF	0x0100	/* -u: row-by-row difference decode	*/
#define OP_VERBOSE	0x0200	/* -v: verbose				*/
118
119
/*
 * one field read from the --format file
 */
typedef struct
{
	char*		name;		/* field name			*/
	int		beg;		/* first column offset		*/
	int		end;		/* last column offset, inclusive */
} Label_t;
125
126
/*
 * per-column statistics accumulated by gather()
 */
typedef struct
{
	unsigned char	hit[UCHAR_MAX+1];	/* values seen			*/
	unsigned long	changes;		/* number of changes		*/
	unsigned int	values;			/* # different values		*/
	int		prev;			/* prev row value		*/
} Info_t;
133
134
/*
135
* gather stats from sp into ip
136
*/
137
138
static ssize_t
139
gather(register Pz_t* pz, register Pzpart_t* pp, Sfio_t* sp, register Info_t* ip, size_t* map, size_t m)
140
{
141
register int i;
142
register int j;
143
register unsigned char* buf;
144
register size_t n;
145
register ssize_t r;
146
register size_t rows;
147
148
for (i = 0; i < m; i++)
149
ip[i].prev = -1;
150
rows = 0;
151
for (;;)
152
{
153
buf = pz->buf;
154
if ((r = sfread(sp, buf, pz->win)) < (ssize_t)pp->row)
155
{
156
if (r < 0)
157
{
158
error(ERROR_SYSTEM|2, "read error");
159
return -1;
160
}
161
if (r > 0)
162
error(1, "last record incomplete");
163
break;
164
}
165
for (rows += (n = r / pp->row); n--; buf += pp->row)
166
for (i = 0; i < m; i++)
167
if (ip[i].prev != buf[j = map[i]])
168
{
169
ip[i].hit[ip[i].prev = buf[j]] = 1;
170
ip[i].changes++;
171
}
172
}
173
for (i = 0; i < m; i++)
174
for (j = 0; j < elementsof(ip[i].hit); j++)
175
if (ip[i].hit[j])
176
ip[i].values++;
177
return rows;
178
}
179
180
/*
181
* cut hi (default) or lo cols from stdin to stdout
182
*/
183
184
static int
185
cut(register Pz_t* pz, register Pzpart_t* pp, int op, register size_t* map, size_t m)
186
{
187
register int i;
188
register int j;
189
register size_t n;
190
register ssize_t r;
191
register unsigned char* ib;
192
register unsigned char* ob;
193
194
if (op & OP_VERBOSE)
195
for (n = 0; n < m; n++)
196
error(0, "map %3d => %3d", map[n], n);
197
if (!(pz->wrk = vmnewof(pz->vm, 0, unsigned char, pz->win, 0)))
198
error(ERROR_SYSTEM|3, "out of space");
199
for (;;)
200
{
201
ib = pz->buf;
202
ob = pz->wrk;
203
if ((r = sfread(pz->io, ib, pz->win)) < (ssize_t)pp->row)
204
{
205
if (r > 0)
206
error(1, "last record incomplete");
207
break;
208
}
209
n = r / pp->row;
210
for (i = 0; i < n; i++)
211
{
212
if (op & OP_ID)
213
{
214
*ob++ = i >> 8;
215
*ob++ = i;
216
}
217
for (j = 0; j < m; j++)
218
*ob++ = ib[map[j]];
219
if (op & OP_NL)
220
*ob++ = '\n';
221
ib += pp->row;
222
}
223
n = ob - pz->wrk;
224
if (sfwrite(sfstdout, pz->wrk, n) != (ssize_t)n)
225
error(ERROR_SYSTEM|3, "write error");
226
}
227
return 0;
228
}
229
230
/*
231
* label the mapped format fields
232
*/
233
234
static int
235
label(register Pz_t* pz, Pzpart_t* pp, int op, register size_t* map, size_t m, char* format)
236
{
237
register char* s;
238
register int i;
239
register int g;
240
ssize_t rows;
241
Sfio_t* sp;
242
Label_t* lv;
243
Label_t* lp;
244
Label_t** xv;
245
Info_t* ip;
246
247
if (!(sp = pzfind(pz, format, "fmt", "r")))
248
error(ERROR_SYSTEM|3, "%s: cannot read format file", format);
249
if (!(lv = vmnewof(pz->vm, 0, Label_t, pp->row + 1, 0)))
250
error(ERROR_SYSTEM|3, "out of space");
251
if (!(xv = vmnewof(pz->vm, 0, Label_t*, pp->row, 0)))
252
error(ERROR_SYSTEM|3, "out of space");
253
error_info.file = format;
254
lv->end = -1;
255
lp = ++lv;
256
while (s = sfgetr(sp, '\n', 1))
257
{
258
error_info.line++;
259
for (; isspace(*s); s++);
260
if (!*s || *s == '#' || *s == '"')
261
continue;
262
if (!isdigit(*s))
263
{
264
if (tokscan(s, NiL, "%s, %d,", &lp->name, &lp->end) != 2)
265
continue;
266
lp->beg = (lp-1)->end + 1;
267
lp->end += lp->beg - 1;
268
}
269
else if (tokscan(s, NiL, "%d-%d %s", &lp->beg, &lp->end, &lp->name) != 3)
270
continue;
271
if (streq(lp->name, "variable_ascii"))
272
continue;
273
if (streq(lp->name, "Newline"))
274
break;
275
if (lp->end >= pp->row)
276
error(3, "format entry extends beyond %I*d row size", sizeof(pp->row), pp->row);
277
if (!(lp->name = vmstrdup(pz->vm, lp->name)))
278
error(ERROR_SYSTEM|3, "out of space");
279
for (i = lp->beg; i <= lp->end; i++)
280
xv[i] = lp;
281
if (pz->test & 0x0010)
282
error(2, "%d-%d\t%s", lp->beg, lp->end, lp->name);
283
lp++;
284
}
285
sfclose(sp);
286
lp->name = "Newline";
287
lp->beg = lp->end = (lp-1)->end + 1;
288
if (lp->end != (pp->row - 1))
289
error(3, "format file row size %d does not match expected %I*d", lp->end + 1, sizeof(pp->row), pp->row);
290
xv[lp->beg] = lp;
291
error_info.file = 0;
292
error_info.line = 0;
293
if (op & OP_INFO)
294
{
295
if (!(ip = vmnewof(pz->vm, 0, Info_t, m, 0)))
296
error(ERROR_SYSTEM|3, "out of space");
297
if ((rows = gather(pz, pp, pz->io, ip, map, m)) < 0)
298
return 1;
299
sfprintf(sfstdout, "%s frequency info over %I*d rows\n\n", (op & OP_LO) ? "low" : "high", sizeof(rows), rows);
300
sfprintf(sfstdout, "%33s %3s %6s %3s\n\n", "FIELD", "COL", "FREQ", "VAL");
301
if (op & OP_LO)
302
g = map[0];
303
else
304
g = 0;
305
for (i = g = 0; i < m; i++)
306
{
307
if (op & OP_LO)
308
{
309
if (g != map[i])
310
sfprintf(sfstdout, "\n");
311
g = map[i] + 1;
312
}
313
else if (g != pp->lab[i])
314
{
315
g = pp->lab[i];
316
sfprintf(sfstdout, "\n");
317
}
318
sfprintf(sfstdout, "%33s %3d %6lu %3d\n", xv[map[i]]->name, map[i], ip[i].changes, ip[i].values);
319
}
320
}
321
else
322
for (i = 0; i < m;)
323
{
324
lp = xv[map[i]];
325
if (op & OP_LO)
326
g = map[i] + 1;
327
else
328
g = pp->lab[i];
329
sfprintf(sfstdout, "%33s %3d", lp->name, map[i]);
330
while (++i < m)
331
{
332
if (op & OP_LO)
333
{
334
if (g != map[i])
335
{
336
sfprintf(sfstdout, "\n");
337
break;
338
}
339
g = map[i] + 1;
340
}
341
else if (g != pp->lab[i])
342
{
343
sfprintf(sfstdout, "\n");
344
break;
345
}
346
if (xv[map[i]] != lp)
347
break;
348
sfprintf(sfstdout, " %3d", map[i]);
349
}
350
sfprintf(sfstdout, "\n");
351
}
352
return 0;
353
}
354
355
/*
356
* list info on the mapped fields
357
*/
358
359
static int
360
info(register Pz_t* pz, register Pzpart_t* pp, int op, register size_t* map, size_t m)
361
{
362
register int i;
363
register int g;
364
ssize_t rows;
365
Info_t* ip;
366
367
if (!(ip = vmnewof(pz->vm, 0, Info_t, m, 0)))
368
error(ERROR_SYSTEM|3, "out of space");
369
if ((rows = gather(pz, pp, pz->io, ip, map, m)) < 0)
370
return 1;
371
sfprintf(sfstdout, "%s frequency info over %I*d rows\n\n", (op & OP_LO) ? "low" : "high", sizeof(rows), rows);
372
sfprintf(sfstdout, "%3s %6s %3s\n\n", "COL", "FREQ", "VAL");
373
if (op & OP_LO)
374
g = map[0];
375
else
376
g = 0;
377
for (i = g = 0; i < m; i++)
378
{
379
if (op & OP_LO)
380
{
381
if (g != map[i])
382
sfprintf(sfstdout, "\n");
383
g = map[i] + 1;
384
}
385
else if (g != pp->lab[i])
386
{
387
g = pp->lab[i];
388
sfprintf(sfstdout, "\n");
389
}
390
sfprintf(sfstdout, "%3d %6lu %3d\n", map[i], ip[i].changes, ip[i].values);
391
}
392
return 0;
393
}
394
395
/*
396
* copy the row by row diff of path to sfstdout
397
*/
398
399
static int
400
diff(int op, const char* path, size_t row)
401
{
402
register int i;
403
register int j;
404
register int k;
405
ssize_t r;
406
unsigned char* buf[2];
407
unsigned char* dif;
408
Sfio_t* sp;
409
410
if (!(buf[0] = newof(0, unsigned char, row, 0)) || !(buf[1] = newof(0, unsigned char, row, 0)) || !(dif = newof(0, unsigned char, row, 0)))
411
{
412
error(ERROR_SYSTEM|2, "out of space");
413
return 1;
414
}
415
if (!(sp = sfopen(NiL, path, "r")))
416
{
417
error(ERROR_SYSTEM|2, "%s: cannot read", path);
418
return 1;
419
}
420
if (op & OP_ENDIFF)
421
{
422
for (i = 0; (r = sfread(sp, buf[i], row)) == row; i = k)
423
{
424
k = !i;
425
for (j = 0; j < row; j++)
426
dif[j] = buf[i][j] - buf[k][j];
427
if (sfwrite(sfstdout, dif, row) != row)
428
break;
429
}
430
}
431
else
432
{
433
for (i = 0; (r = sfread(sp, dif, row)) == row; i = k)
434
{
435
k = !i;
436
for (j = 0; j < row; j++)
437
buf[i][j] = dif[j] + buf[k][j];
438
if (sfwrite(sfstdout, buf[i], row) != row)
439
break;
440
}
441
}
442
sfclose(sp);
443
if (sfsync(sfstdout))
444
{
445
error(ERROR_SYSTEM|2, "write error");
446
return 1;
447
}
448
if (r < 0)
449
{
450
error(ERROR_SYSTEM|2, "%s: read error", path);
451
return 1;
452
}
453
if (r)
454
error(1, "%s: last record incomplete", path);
455
return 0;
456
}
457
458
int
459
main(int argc, char** argv)
460
{
461
register Pz_t* pz;
462
register Pzpart_t* pp;
463
register int i;
464
int m;
465
size_t* map;
466
Pzdisc_t disc;
467
Sfio_t* dp;
468
469
int flags = 0;
470
char* format = 0;
471
int op = 0;
472
size_t row = 0;
473
474
error_info.id = "pop";
475
memset(&disc, 0, sizeof(disc));
476
disc.version = PZ_VERSION;
477
disc.errorf = errorf;
478
if (!(dp = sfstropen()))
479
error(ERROR_SYSTEM|3, "out of space [options]");
480
for (;;)
481
{
482
switch (optget(argv, usage))
483
{
484
case 'c':
485
op |= OP_CUT;
486
continue;
487
case 'e':
488
op |= OP_ENDIFF;
489
continue;
490
case 'f':
491
format = opt_info.arg;
492
continue;
493
case 'h':
494
op &= ~OP_LO;
495
continue;
496
case 'i':
497
op |= OP_INFO;
498
continue;
499
case 'l':
500
op |= OP_LO;
501
continue;
502
case 'm':
503
op |= OP_MAP;
504
continue;
505
case 'n':
506
op |= OP_NL;
507
continue;
508
case 'o':
509
sfputr(dp, opt_info.arg, '\n');
510
continue;
511
case 'p':
512
disc.partition = opt_info.arg;
513
continue;
514
case 'r':
515
row = opt_info.num;
516
continue;
517
case 'u':
518
op |= OP_UNDIFF;
519
continue;
520
case 'v':
521
op |= OP_VERBOSE;
522
flags |= PZ_VERBOSE;
523
continue;
524
case 'x':
525
op |= OP_ID;
526
continue;
527
case 'Q':
528
sfprintf(dp, "regress\n");
529
continue;
530
case 'T':
531
sfprintf(dp, "test=%s\n", opt_info.arg);
532
continue;
533
case 'X':
534
sfprintf(dp, "prefix=%s\n", opt_info.arg);
535
continue;
536
case '?':
537
error(ERROR_USAGE|4, "%s", opt_info.arg);
538
continue;
539
case ':':
540
if (!opt_info.option[0])
541
sfputr(dp, &argv[opt_info.index - 1][2], '\n');
542
else
543
error(2, "%s", opt_info.arg);
544
continue;
545
}
546
break;
547
}
548
argv += opt_info.index;
549
if (error_info.errors || *argv && *(argv + 1))
550
error(ERROR_USAGE|4, "%s", optusage(NiL));
551
if (sfstrtell(dp) && !(disc.options = strdup(sfstruse(dp))))
552
error(ERROR_SYSTEM|3, "out of space [options]");
553
sfstrclose(dp);
554
if (op & (OP_ENDIFF|OP_UNDIFF))
555
{
556
if (!row)
557
error(3, "-r row-size required for -e");
558
return diff(op, *argv, row);
559
}
560
if (row)
561
{
562
if (disc.partition)
563
error(3, "only one of -r and -p may be specified");
564
if (!(disc.partition = strdup(sfprints("/%I*u/", sizeof(row), row))))
565
error(ERROR_SYSTEM|3, "out of space");
566
}
567
if (!disc.partition)
568
flags |= PZ_READ;
569
else if (op & OP_INFO)
570
flags |= PZ_WRITE;
571
if (!(pz = pzopen(&disc, *argv, flags)))
572
return 1;
573
pp = pz->part;
574
if (!disc.partition && (op & OP_INFO))
575
{
576
sfprintf(sfstdout, "row size %d\n", pp->row);
577
op |= OP_LO;
578
}
579
pz->win = (pz->win / pp->row) * pp->row;
580
if (op & OP_LO)
581
{
582
if (!(map = vmnewof(pz->vm, 0, size_t, pp->row - pp->nmap, 0)))
583
error(ERROR_SYSTEM|3, "out of space");
584
m = 0;
585
for (i = 0; i < pp->row; i++)
586
if (pp->low[i])
587
map[m++] = i;
588
}
589
else
590
{
591
map = pp->map;
592
m = pp->nmap;
593
}
594
if (op & OP_CUT)
595
i = cut(pz, pp, op, map, m);
596
else if (format)
597
i = label(pz, pp, op, map, m, format);
598
else if (op & OP_INFO)
599
i = info(pz, pp, op, map, m);
600
else if (op & OP_MAP)
601
{
602
pp->row = pp->nmap;
603
for (i = 0; i < pp->nmap; i++)
604
pp->map[i] = i;
605
pzpartprint(pz, pp, sfstdout);
606
}
607
pzclose(pz);
608
return i;
609
}
610
611