Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/cmd/pzip/pin.c
1808 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1998-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Glenn Fowler <[email protected]> *
18
* *
19
***********************************************************************/
20
#pragma prototyped
21
22
/*
23
* Glenn Fowler
24
* Adam Buchsbaum
25
* AT&T Research
26
*
27
* induce a column partition on fixed row data
28
*/
29
30
static const char usage[] =
31
"[-?\n@(#)$Id: pin (AT&T Research) 2003-07-17 $\n]"
32
USAGE_LICENSE
33
"[+NAME?pin - induce a pzip partition on fixed record data]"
34
"[+DESCRIPTION?\bpin\b induces a \bpzip\b(1) column partition on data files"
35
" of fixed length rows (records) and columns (fields). If a partition"
36
" file is specified then that partition is refined. A partition file,"
37
" suitable for use by \bpin\b, \bpzip\b(1) or \bpop\b(1) is listed"
38
" on the standard output. The input \afile\a is referred to as"
39
" \atraining data\a. \b--size\b allows more than one \afile\a"
40
" argument, otherwise exactly one file must be specified.]"
41
"[+?Partitions are induced in a three step process:]{"
42
" [+(1)?If \b--partition\b is not specified then a"
43
" subset of columns are filtered from the training data for"
44
" partitioning. This filtering usually gathers high frequency columns"
45
" from the data. A high frequency column has data with a high rate of"
46
" change between rows. High frequency cutoff rates, specified by"
47
" \b--high\b, typically range"
48
" from 10% to 50%, depending on the data. \bpzip\b(1) compresses"
49
" low frequency columns much more efficiently than \bgzip\b(1).]"
50
" [+(2)?A heuristic search determines an initial ordering of high"
51
" frequency columns to present to step (3). An optimal solution for"
52
" both ordering and partitioning is NP-complete.]"
53
" [+(3)?The optimal partition for the ordering from step (2) is"
54
" determined by a dynamic program that computes the compressed size"
55
" for all partitions that preserve order."
56
" Alternative greedy methods are also available that work more"
57
" quickly but do not guarantee optimality.]"
58
"}"
59
"[+?See \bpzip\b(1) for a detailed description of file partitions and"
60
" column frequencies. \bpin\b can run for a long time on some data"
61
" (e.g., 10 minutes on a 400Mhz Pentium with \b--high 80 --window 4M\b)."
62
" Use \b--verbose\b possibly with \b--test=010\b to monitor progress.]"
63
64
"[b:bzip?Use \bbzip\b(1) compression instead of the default \bgzip\b(1)."
65
" \abzip\a is not fully supported by \apzip\a, pending further"
66
" investigation.]"
67
"[c:cache?Generate some information on \afile\a that can be reused"
68
" on another invocation; this information is saved in \bpin-\b\abase\a,"
69
" where \abase\a is the base name (no directory) of \afile\a. Saved"
70
" information includes column frequencies and singleton and pairwise"
71
" column \agzip\a rates.]"
72
"[g:group?Sets the maximum number of columns in any partition group. Lower"
73
" values speed up the the dynamic program but also may produce"
74
" sub-optimal solutions.]#[columns:=row-size]"
75
"[h:high?Select this number of columns with the highest frequencies for the"
76
" columns in the partition. If \acolumns\a is followed by `%' then"
77
" columns with frequencies larger than this percentage are"
78
" selected.]:[columns:=10%]"
79
"[l:level?Sets the \agzip\a compression level to \alevel\a. Levels range"
80
" from 1 (fastest, worst compression) to 9 (slowest, best compression).]#"
81
" [level:=6]"
82
"[m:maxhigh?Exit with exit code 3 if the number of high frequency columns"
83
" exceeds \bmaxhigh\b. If \bmaxhigh\b is followed by `%' then the limit"
84
" is \bmaxhigh\b percent of the total number of columns.]:[maxhigh:=40%]"
85
"[o:sort?Sort the window data by row before inducing the partition.]"
86
"[p:partition?Specifies the data row size and the high frequency column"
87
" partition groups and permutation. The partition file is a sequence"
88
" of lines. Comments start with # and continue to the end of the line."
89
" The first non-comment line specifies the optional name string"
90
" in \"...\". The next non-comment line specifies the row size."
91
" The remaining lines operate on column offset ranges of the form:"
92
" \abegin\a[-\aend\a]] where \abegin\a is the beginning column offset"
93
" (starting at 0), and \aend\a is the ending column offset for an"
94
" inclusive range. The operators are:]:[file]{"
95
" [+range [...]]?places all columns in the specified \arange\a"
96
" list in the same high frequency partition group."
97
" Each high frequency partition group is processed as"
98
" a separate block by the underlying compressor"
99
" (\bgzip\b(1) by default).]"
100
" [+range='value'?specifies that each column in \arange\a"
101
" has the fixed character value \avalue\a. C-style"
102
" character escapes are valid for \avalue\a.]"
103
"}"
104
"[r:row?Specifies the input row size (number of byte columns). The row size"
105
" is determined by sampling the input if not specified.]#[row-size]"
106
"[v:verbose?List partition search details on the standard error.]"
107
"[w:window?Limit the number of training data rows used to induce the"
108
" partition. The window size may be decreased to accomodate an"
109
" integral number of complete rows.]#[window-size:=4M]"
110
"[O:optimize?Choose the optimization (partitioning) method for step (3)"
111
" above. The methods"
112
" are:]:[method:=\foptimize_default\f]{\foptimize_methods\f}"
113
"[Q:regress?Generate output for regression testing, such that identical"
114
" invocations with identical input files will generate the same output.]"
115
"[R:reorder?Choose the reordering method for step (2) above. The methods"
116
" are:]:[method:=\freorder_default\f]{\freorder_methods\f}"
117
"[S:size?Ignore \b--row\b, determine the fixed record size based on a window"
118
" of sampled data, print it on the standard output, and exit. If more"
119
" than one \afile\a is specified then the record size and name are"
120
" printed for each file. If the sample is insufficient, or if"
121
" \b--verify\b is specified, then all of the data read to determine"
122
" the row size. A \b0\b size means the record size could not be"
123
" determined.]"
124
"[T:test?Enable implementation-specific tests and tracing.]#[test-mask]{"
125
" [+0x0010?Enable reorder keep trace.]"
126
" [+0x0020?Enable reorder skip/cost trace.]"
127
" [+0x0040?Enable reorder permutation trace.]"
128
" [+0x0080?Enable reorder level 2 merge prune.]"
129
" [+0x0100?Disable reorder merge prune.]"
130
" [+0x0200?Partition using initial tsp cycles.]"
131
"}"
132
"[V:verify?Verify \b--size\b by reading all data instead of the window sample.]"
133
"[X:prefix?Uncompressed data contains a prefix that is defined by \acount\a"
134
" and an optional \aterminator\a. This data is not \bpzip\b compressed."
135
" \aterminator\a may be one of:]:[count[*terminator]]]{"
136
" [+\aomitted\a?\acount\a bytes.]"
137
" [+L?\acount\a \bnewline\b terminated records.]"
138
" [+'\achar\a'?\acount\a \achar\a terminated records.]"
139
"}"
140
141
"\n"
142
"\nfile ...\n"
143
"\n"
144
"[+SEE ALSO?\bbzip\b(1), \bgzip\b(1), \bpop\b(1), \bpzip\b(1), \bpzip\b(3)]"
145
;
146
147
#include <ast.h>
148
#include <ctype.h>
149
#include <error.h>
150
#include <ls.h>
151
#include <pzip.h>
152
#include <zlib.h>
153
#include <bzlib.h>
154
155
#include "FEATURE/local"
156
157
#if _hdr_tsp && _lib_tspopen
158
#include <tsp.h>
159
#endif
160
161
#define OP_optimize 0x01
162
#define OP_reorder 0x02
163
#define OP_size 0x04
164
#define OP_verify 0x10
165
166
#define minof(x,y) ((x)<(y)?(x):(y))
167
168
typedef struct
169
{
170
size_t elements;
171
size_t size;
172
size_t rate;
173
size_t hit;
174
size_t skip;
175
size_t* member;
176
} Part_t;
177
178
typedef struct
179
{
180
unsigned long frequency;
181
int prev;
182
int column;
183
} Stats_t;
184
185
struct Optimize_method_s;
186
187
typedef void (*Optimize_f)(struct Optimize_method_s*, unsigned char*, unsigned char*, size_t**, size_t*, size_t*, size_t, size_t, size_t);
188
189
typedef struct Optimize_method_s
190
{
191
const char* name;
192
const char* description;
193
Optimize_f fun;
194
} Optimize_method_t;
195
196
struct Reorder_method_s;
197
198
typedef void (*Reorder_f)(struct Reorder_method_s*, unsigned char*, unsigned char*, int*, size_t, size_t);
199
200
typedef struct Reorder_method_s
201
{
202
const char* name;
203
const char* description;
204
Reorder_f fun;
205
} Reorder_method_t;
206
207
static struct
208
{
209
Stats_t* stats;
210
size_t* pam;
211
size_t* map;
212
char* input;
213
char* cachefile;
214
int bzip;
215
int cache;
216
int level;
217
int pairs;
218
int sort;
219
int test;
220
int window;
221
int verbose;
222
} state;
223
224
/*
225
* allocate an i-length vector of size_t
226
*/
227
228
static size_t*
229
vector(int i)
230
{
231
register size_t* p;
232
233
if (!(p = newof(0, size_t, i, 0)))
234
error(ERROR_SYSTEM|3, "out of space [%d vector]", i);
235
return p;
236
}
237
238
/*
239
* allocate an i X i matrix of size_t
240
*/
241
242
static size_t**
243
matrix(int i)
244
{
245
register size_t** p;
246
register size_t* v;
247
register int k;
248
size_t n;
249
250
n = i * i;
251
if (!(p = newof(0, size_t*, i, n * sizeof(size_t))))
252
error(ERROR_SYSTEM|3, "out of space [%d X %d matrix]", i, i);
253
v = (size_t*)(p + i);
254
for (k = 0; k < i; k++)
255
{
256
p[k] = v;
257
v += i;
258
}
259
return p;
260
}
261
262
/*
263
* allocate an i X i partition workspace
264
*/
265
266
static Part_t*
267
partition(int i)
268
{
269
register Part_t* p;
270
register size_t* v;
271
register int k;
272
size_t n;
273
274
n = i * i;
275
if (!(p = newof(0, Part_t, n, n * i * sizeof(size_t))))
276
error(ERROR_SYSTEM|3, "out of space [%d partition]", i);
277
v = (size_t*)(p + n);
278
for (k = 0; k < n; k++)
279
{
280
p[k].member = v;
281
v += i;
282
}
283
return p;
284
}
285
286
/*
287
* order frequencies hi to lo
288
*/
289
290
static int
291
byfrequency(const void* va, const void* vb)
292
{
293
register Stats_t* a = (Stats_t*)va;
294
register Stats_t* b = (Stats_t*)vb;
295
296
if (a->frequency < b->frequency)
297
return 1;
298
if (a->frequency > b->frequency)
299
return -1;
300
if (a->column < b->column)
301
return -1;
302
if (a->column > b->column)
303
return 1;
304
return 0;
305
}
306
307
/*
308
* order column lo to hi
309
*/
310
311
static int
312
bycolumn(const void* va, const void* vb)
313
{
314
register Stats_t* a = (Stats_t*)va;
315
register Stats_t* b = (Stats_t*)vb;
316
317
if (a->column < b->column)
318
return -1;
319
if (a->column > b->column)
320
return 1;
321
return 0;
322
}
323
324
/*
325
* order partition rate lo to hi
326
*/
327
328
static int
329
byrate(const void* va, const void* vb)
330
{
331
register Part_t* a = (Part_t*)va;
332
register Part_t* b = (Part_t*)vb;
333
334
if (a->rate < b->rate)
335
return -1;
336
if (a->rate > b->rate)
337
return 1;
338
if (a->member < b->member)
339
return -1;
340
if (a->member > b->member)
341
return 1;
342
return 0;
343
}
344
345
/*
346
* order row lo to hi
347
*/
348
349
static int
350
byrow(const void* va, const void* vb)
351
{
352
return memcmp(va, vb, state.sort);
353
}
354
355
/*
356
* dump one partition line with label to sp
357
*/
358
359
static void
360
dumppart(Sfio_t* sp, const char* label, register Part_t* pp)
361
{
362
register int i;
363
364
sfprintf(sp, "reorder %s %5u %5u :", label, pp->rate, pp->size);
365
for (i = 0; i < pp->elements; i++)
366
sfprintf(sp, " %3u", state.map[pp->member[i]]);
367
sfprintf(sp, "\n");
368
}
369
370
/*
371
* return the compressed size of buffer b of size bytes
372
*/
373
374
static size_t
375
zip(unsigned char* b, size_t size)
376
{
377
unsigned long used;
378
unsigned int dest;
379
size_t safe;
380
381
static unsigned char* buf;
382
static int bufsize;
383
384
safe = 4 * roundof(size, 8 * 1024);
385
if (bufsize < safe)
386
{
387
bufsize = safe;
388
if (!(buf = newof(buf, unsigned char, bufsize, 0)))
389
error(ERROR_SYSTEM|3, "out of space [buf]");
390
}
391
if (state.bzip)
392
{
393
dest = bufsize;
394
if (bzBuffToBuffCompress((char*)buf, &dest, (char*)b, size, state.level, 0, 30) != BZ_OK)
395
error(3, "bzip compress failed");
396
used = dest;
397
}
398
else
399
{
400
used = bufsize;
401
if (compress2(buf, &used, b, size, state.level) != Z_OK)
402
error(3, "gzip compress failed");
403
}
404
return used;
405
}
406
407
/*
408
* copy col i through col j from s into t
409
* and return the compressed size
410
*/
411
412
static size_t
413
field(unsigned char* t, register unsigned char* s, int i, int j, register int row, size_t tot)
414
{
415
register unsigned char* b;
416
register unsigned char* e;
417
register int n;
418
419
b = t;
420
e = s + tot;
421
n = j - i + 1;
422
for (s += i; s < e; s += row)
423
{
424
memcpy(b, s, n);
425
b += n;
426
}
427
return zip(t, b - t);
428
}
429
430
/*
431
* copy col i and col j from s into t
432
* and return the compressed size
433
*/
434
435
static size_t
436
pair(unsigned char* t, register unsigned char* s, int i, register int j, size_t row, size_t tot)
437
{
438
register unsigned char* b;
439
register unsigned char* e;
440
441
b = t;
442
e = s + tot;
443
j -= i;
444
for (s += i; s < e; s += row)
445
{
446
*b++ = *s;
447
*b++ = *(s + j);
448
}
449
return zip(t, b - t);
450
}
451
452
/*
453
* return the compressed size for partition pp
454
*/
455
456
static size_t
457
part(unsigned char* t, register unsigned char* s, register Part_t* pp, size_t row, size_t tot)
458
{
459
register unsigned char* b;
460
register unsigned char* e;
461
register int i;
462
463
b = t;
464
e = s + tot;
465
for (; s < e; s += row)
466
for (i = 0; i < pp->elements; i++)
467
*b++ = s[pp->member[i]];
468
return zip(t, b - t);
469
}
470
471
/*
472
* merge col i with part pp and keep the best compression in np
473
* (np+1) is used as tmp workspace
474
* return 1 if the best merge is better than i and pp separately
475
*/
476
477
static int
478
merge(unsigned char* t, unsigned char* s, int i, Part_t* pp, register Part_t* np, size_t** siz, size_t row, size_t tot)
479
{
480
register int j;
481
register int k;
482
register Part_t* bp;
483
size_t x;
484
size_t z;
485
486
k = 0;
487
np->member[k++] = i;
488
for (j = 0; j < pp->elements; j++)
489
np->member[k++] = pp->member[j];
490
np->elements = k;
491
bp = 0;
492
x = siz[i][i] + pp->size;
493
for (i = 0;;)
494
{
495
z = part(t, s, np, row, tot);
496
if (z < x)
497
{
498
bp = np + 1;
499
bp->size = x = z;
500
for (j = 0; j < k; j++)
501
bp->member[j] = np->member[j];
502
}
503
if (++i >= k)
504
break;
505
j = np->member[i];
506
np->member[i] = np->member[i - 1];
507
np->member[i - 1] = j;
508
}
509
if (bp)
510
{
511
np->size = bp->size;
512
np->rate = np->size / k;
513
for (j = 0; j < k; j++)
514
np->member[j] = bp->member[j];
515
if (state.test & 0x0040)
516
dumppart(sfstderr, "merg", np);
517
return 1;
518
}
519
return 0;
520
}
521
522
/*
523
* filter out the high frequency columns from dat
524
*/
525
526
static size_t
527
filter(Sfio_t* ip, unsigned char** bufp, unsigned char** datp, Pz_t* pz, int high, int maxhigh, size_t row, size_t tot)
528
{
529
register int i;
530
register int j;
531
char* q;
532
unsigned char* s;
533
unsigned char* t;
534
unsigned char* dat;
535
unsigned char* buf;
536
int c;
537
size_t rows;
538
size_t n;
539
ssize_t r;
540
unsigned long freq;
541
Pzpart_t* pp;
542
Sfio_t* sp;
543
544
if (pz || high)
545
{
546
buf = *bufp;
547
dat = *datp;
548
rows = tot / row;
549
if (pz)
550
{
551
pp = pz->part;
552
high = pp->nmap;
553
if (!(state.map = newof(state.map, size_t, high, 0)))
554
error(ERROR_SYSTEM|3, "out of space [map]");
555
memcpy(state.map, pp->map, high * sizeof(state.map[0]));
556
pzclose(pz);
557
}
558
else
559
{
560
if (!(state.map = oldof(state.map, size_t, row, 0)))
561
error(ERROR_SYSTEM|3, "out of space [map]");
562
if (!(state.stats = oldof(state.stats, Stats_t, row, 0)))
563
error(ERROR_SYSTEM|3, "out of space [stats]");
564
memset(state.stats, 0, row * sizeof(state.stats[0]));
565
if (state.cache && (sp = sfopen(NiL, state.cachefile, "r")))
566
{
567
int column;
568
unsigned long frequency;
569
570
if (state.verbose)
571
sfprintf(sfstderr, "filter using %s for frequencies\n", state.cachefile);
572
error_info.file = state.cachefile;
573
error_info.line = 0;
574
while (q = sfgetr(sp, '\n', 1))
575
{
576
error_info.line++;
577
if (*q == 'f')
578
{
579
if (sfsscanf(q + 1, "%d %d %lu", &c, &column, &frequency) != 3 || c < 0 || c >= row || column < 0 || column >= row)
580
error(3, "%s: `%s' is invalid", buf, q);
581
state.stats[c].prev = 1;
582
state.stats[c].column = column;
583
state.stats[c].frequency = frequency;
584
}
585
}
586
for (j = 0; j < row; j++)
587
if (!state.stats[j].prev)
588
error(3, "invalid cache -- column %d frequency omitted", j);
589
sfclose(sp);
590
error_info.file = 0;
591
error_info.line = 0;
592
}
593
else
594
{
595
if (state.verbose)
596
{
597
if (high < 0)
598
sfprintf(sfstderr, "filter top %d%% high frequency columns\n", -high);
599
else if (high > 0)
600
sfprintf(sfstderr, "filter top %d high frequency columns\n", high);
601
}
602
s = dat;
603
for (i = 0; i < rows; i++)
604
for (j = 0; j < row; j++)
605
if (state.stats[j].prev != (c = *s++))
606
{
607
state.stats[j].prev = c;
608
state.stats[j].frequency++;
609
}
610
n = rows;
611
while ((r = sfread(ip, s = buf, state.window)) > 0)
612
{
613
r /= row;
614
if (state.sort)
615
qsort(s, r, row, byrow);
616
for (i = 0; i < r; i++)
617
for (j = 0; j < row; j++)
618
if (state.stats[j].prev != (c = *s++))
619
{
620
state.stats[j].prev = c;
621
state.stats[j].frequency++;
622
}
623
if ((n += r) > 10000000L)
624
{
625
/*
626
* that'll do pig
627
*/
628
629
r = 0;
630
break;
631
}
632
}
633
if (r < 0)
634
error(ERROR_SYSTEM|3, "data read error");
635
else if (r)
636
error(1, "last record incomplete");
637
for (j = 0; j < row; j++)
638
state.stats[j].column = j;
639
qsort(state.stats, row, sizeof(Stats_t), byfrequency);
640
if (state.cache)
641
{
642
error_info.file = state.cachefile;
643
error_info.line = 0;
644
if (!(sp = sfopen(NiL, state.cachefile, "w")))
645
error(ERROR_SYSTEM|1, "cannot write cache");
646
else
647
{
648
sfprintf(sp, "# %s cache for %s\n", error_info.id, state.input);
649
for (j = 0; j < row; j++)
650
{
651
sfprintf(sp, "f %d %d %lu\n", j, state.stats[j].column, state.stats[j].frequency);
652
error_info.line++;
653
}
654
if (sfclose(sp))
655
error(ERROR_SYSTEM|3, "cache write error");
656
}
657
error_info.file = 0;
658
error_info.line = 0;
659
}
660
if (state.verbose)
661
sfprintf(sfstderr, "filter done -- %u rows\n", n);
662
}
663
if (high < 0)
664
{
665
high = -high;
666
freq = state.stats[0].frequency * high / 100;
667
for (j = 0; j < row; j++)
668
if (state.stats[j].frequency <= freq)
669
break;
670
high = j;
671
if (state.verbose)
672
sfprintf(sfstderr, "%d high frequency column%s out of %d\n", high, high == 1 ? "" : "s", row);
673
}
674
if (maxhigh < 0)
675
{
676
maxhigh = -maxhigh;
677
if (high > (row * maxhigh / 100))
678
error(6, "high frequency count %d exceeds %d%% of %d", high, maxhigh, row);
679
}
680
else if (maxhigh > 0)
681
{
682
if (high > maxhigh)
683
error(6, "high frequency count %d exceeds %d", high, maxhigh);
684
}
685
686
/*
687
* cut the original data layout some slack
688
* by using the original column order
689
*/
690
691
qsort(state.stats, high, sizeof(Stats_t), bycolumn);
692
for (j = 0; j < high; j++)
693
state.map[j] = state.stats[j].column;
694
state.pairs = row;
695
if (!(state.pam = newof(state.pam, size_t, row, 0)))
696
error(ERROR_SYSTEM|3, "out of space [pam]");
697
for (j = 0; j < row; j++)
698
state.pam[j] = row;
699
for (j = 0; j < high; j++)
700
state.pam[state.map[j]] = j;
701
}
702
s = dat;
703
t = buf;
704
for (i = 0; i < rows; i++)
705
{
706
for (j = 0; j < high; j++)
707
*t++ = s[state.map[j]];
708
s += row;
709
}
710
row = high;
711
*bufp = dat;
712
*datp = buf;
713
}
714
else
715
{
716
if (!(state.map = newof(0, size_t, row, 0)))
717
error(ERROR_SYSTEM|3, "out of space [map]");
718
for (i = 0; i < row; i++)
719
state.map[i] = i;
720
}
721
return row;
722
}
723
724
/*
725
* permute state.map and dat according to a new order
726
*/
727
728
static void
729
permute(unsigned char* buf, unsigned char* dat, size_t* ord, size_t row, size_t tot)
730
{
731
register int i;
732
size_t* tmap;
733
unsigned char* end;
734
735
tmap = vector(row);
736
for (i = 0; i < row; i++)
737
tmap[i] = state.map[i];
738
for (i = 0; i < row; i++)
739
state.map[i] = tmap[ord[i]];
740
for (end = dat + tot; dat < end; dat += row)
741
{
742
memcpy(buf, dat, row);
743
for (i = 0; i < row; i++)
744
dat[i] = buf[ord[i]];
745
}
746
free(tmap);
747
}
748
749
/*
750
* stuff the partion group labels in lab
751
*/
752
753
static int
754
solution(int* lab, int label, size_t* sol, int beg, int end)
755
{
756
int i;
757
758
i = sol[end];
759
if (i >= beg)
760
label = solution(lab, label, sol, beg, i);
761
for (label++, beg = i + 1; beg <= end; beg++)
762
lab[beg] = label;
763
return label;
764
}
765
766
/*
767
* list the final partition on sp
768
*/
769
770
static void
771
list(Sfio_t* sp, int* lab, size_t row)
772
{
773
register int i;
774
register int j;
775
register int g;
776
777
g = -1;
778
for (i = 0; i < row; i++)
779
{
780
if (g != lab[i])
781
{
782
g = lab[i];
783
sfprintf(sp, "\n");
784
}
785
else
786
sfprintf(sp, " ");
787
for (j = i + 1; j < row && lab[j] == g && state.map[j] == (state.map[j - 1] + 1); j++);
788
sfprintf(sp, "%d", state.map[i]);
789
if (j > (i + 2))
790
{
791
i = j - 1;
792
sfprintf(sp, "-%d", state.map[i]);
793
}
794
}
795
sfprintf(sp, "\n");
796
}
797
798
/*
799
* search for a good initial ordering
800
*/
801
802
static void
803
reorder_heuristic(Reorder_method_t* method, unsigned char* buf, unsigned char* dat, int* lab, size_t row, size_t tot)
804
{
805
register int i;
806
register int j;
807
register int k;
808
register Part_t* cp;
809
register Part_t* np;
810
register Part_t* tp;
811
register Part_t* bp;
812
Part_t* fp;
813
size_t* hit;
814
size_t* ord;
815
ssize_t* cst;
816
char* s;
817
int ii;
818
int jj;
819
size_t y;
820
size_t z;
821
ssize_t cy;
822
Part_t* cur;
823
Part_t* nxt;
824
Part_t* fin;
825
size_t** siz;
826
Sfio_t* sp;
827
828
siz = matrix(row);
829
cur = partition(row);
830
nxt = partition(row);
831
fin = partition(row);
832
hit = vector(row);
833
ord = vector(row);
834
cst = (ssize_t*)vector(row);
835
836
/*
837
* fill in the pair compression size matrix
838
* and the initial partition candidate list
839
*/
840
841
k = 2;
842
cp = cur;
843
844
/*
845
* check the cache
846
*/
847
848
if (state.cache && (sp = sfopen(NiL, state.cachefile, "r")))
849
{
850
if (state.verbose)
851
sfprintf(sfstderr, "reorder using %s for pair compress sizes\n", state.cachefile);
852
error_info.file = state.cachefile;
853
error_info.line = 0;
854
while (s = sfgetr(sp, '\n', 1))
855
{
856
error_info.line++;
857
if (*s == 'p')
858
{
859
if (sfsscanf(s + 1, "%d %d %I*u", &ii, &jj, sizeof(z), &z) != 3 || ii < 0 || jj < 0 || state.pairs && (ii >= state.pairs || jj >= state.pairs))
860
error(3, "%s: `%s' is invalid", buf, s);
861
if (state.pairs)
862
{
863
ii = state.pam[ii];
864
jj = state.pam[jj];
865
}
866
if (ii >= row || jj >= row)
867
continue;
868
siz[ii][jj] = z;
869
if (ii == jj)
870
hit[ii] = 1;
871
else
872
{
873
hit[ii] = hit[jj] = k;
874
cp->size = siz[ii][jj];
875
cp->rate = cp->size / 2;
876
cp->elements = 2;
877
cp->member[0] = ii;
878
cp->member[1] = jj;
879
cp++;
880
}
881
}
882
}
883
sfclose(sp);
884
error_info.file = 0;
885
error_info.line = 0;
886
}
887
sp = 0;
888
for (i = 0; i < row; i++)
889
if (!hit[i])
890
{
891
hit[i] = 1;
892
if (state.cache && !error_info.file)
893
{
894
error_info.file = state.cachefile;
895
error_info.line = 0;
896
if (!(sp = sfopen(NiL, error_info.file, "a")))
897
error(ERROR_SYSTEM|1, "cannot update cache");
898
else if (!sfseek(sp, (Sfoff_t)0, SEEK_END))
899
{
900
sfprintf(sp, "# %s cache for %s\n", error_info.id, state.input);
901
error_info.line++;
902
}
903
}
904
siz[i][i] = field(buf, dat, i, i, row, tot);
905
if (sp)
906
{
907
sfprintf(sp, "p %d %d %I*u\n", state.map[i], state.map[i], sizeof(siz[i][i]), siz[i][i]);
908
error_info.line++;
909
}
910
if (state.verbose)
911
sfprintf(sfstderr, "reorder pairs for %d [%d]\n", i, state.map[i]);
912
for (j = 0; j < i; j++)
913
{
914
z = pair(buf, dat, i, j, row, tot);
915
y = pair(buf, dat, j, i, row, tot);
916
if (z <= y)
917
{
918
ii = i;
919
jj = j;
920
}
921
else
922
{
923
z = y;
924
ii = j;
925
jj = i;
926
}
927
if (z < (siz[i][i] + siz[j][j]))
928
{
929
hit[i] = hit[j] = k;
930
cp->size = siz[ii][jj] = z;
931
cp->rate = cp->size / 2;
932
cp->elements = 2;
933
cp->member[0] = ii;
934
cp->member[1] = jj;
935
cp++;
936
if (sp)
937
{
938
sfprintf(sp, "p %d %d %I*u\n", state.map[ii], state.map[jj], sizeof(z), z);
939
error_info.line++;
940
}
941
}
942
}
943
}
944
if (sp && sfclose(sp))
945
error(ERROR_SYSTEM|1, "cache write error");
946
error_info.file = 0;
947
error_info.line = 0;
948
949
/*
950
* coalesce the partitions
951
*/
952
953
fp = fin;
954
for (;;)
955
{
956
if (state.verbose)
957
sfprintf(sfstderr, "reorder part %d\n", k + 1);
958
qsort(cur, cp - cur, sizeof(Part_t), byrate);
959
np = nxt;
960
if (k == 2)
961
{
962
for (i = 0; i < row; i++)
963
cst[i] = state.window;
964
for (tp = cur; tp < cp; tp++)
965
{
966
i = tp->member[0];
967
j = tp->member[1];
968
if (!(z = siz[i][j]))
969
z = siz[j][i];
970
if (z)
971
{
972
cy = z - siz[i][i];
973
if (cy < cst[j])
974
cst[j] = cy;
975
cy = z - siz[j][j];
976
if (cy < cst[i])
977
cst[i] = cy;
978
}
979
}
980
}
981
else
982
for (tp = cur; tp < cp; tp++)
983
for (j = 0; j < tp->elements; j++)
984
{
985
ii = 0;
986
for (i = 0; i < tp->elements; i++)
987
if (i != j)
988
np->member[ii++] = tp->member[i];
989
np->elements = ii;
990
cst[tp->member[j]] = tp->size - part(buf, dat, np, row, tot);
991
if (state.test & 0x0020)
992
sfprintf(sfstderr, "reorder cost %6u %u\n", state.map[tp->member[j]], cst[tp->member[j]]);
993
}
994
for (tp = cur; tp < cp; tp++)
995
{
996
bp = 0;
997
for (i = 0; i < row; i++)
998
{
999
if (hit[i] > k)
1000
continue;
1001
z = 0;
1002
jj = 0;
1003
for (j = 0; j < tp->elements; j++)
1004
if (hit[tp->member[j]] > k)
1005
z = 2;
1006
else
1007
{
1008
tp->member[jj++] = tp->member[j];
1009
if (i == tp->member[j])
1010
z = 2;
1011
else if (!z && (siz[i][tp->member[j]] || siz[tp->member[j]][i]))
1012
z = 1;
1013
}
1014
tp->elements = jj;
1015
if (z != 1)
1016
continue;
1017
if (!merge(buf, dat, i, tp, np, siz, row, tot))
1018
{
1019
if (!(state.test & 0x0100) && (k > 2 || (state.test & 0x0080)))
1020
for (j = 0; j < tp->elements; j++)
1021
siz[i][tp->member[j]] = siz[tp->member[j]][i] = 0;
1022
}
1023
else if ((ssize_t)(np->size - tp->size) < cst[i] && (!bp || np->size < bp->size))
1024
{
1025
bp = np + 2;
1026
bp->size = np->size;
1027
bp->elements = np->elements;
1028
for (j = 0; j < np->elements; j++)
1029
bp->member[j] = np->member[j];
1030
}
1031
}
1032
if (bp)
1033
{
1034
np->size = bp->size;
1035
np->elements = bp->elements;
1036
for (j = 0; j < bp->elements; j++)
1037
hit[np->member[j] = bp->member[j]] = k + 1;
1038
if (state.test & 0x0010)
1039
dumppart(sfstderr, "keep", np);
1040
np++;
1041
}
1042
else
1043
tp->skip = 1;
1044
}
1045
for (tp = cur; tp < cp; tp++)
1046
if (tp->skip)
1047
{
1048
if (state.test & 0x0020)
1049
dumppart(sfstderr, "skip", tp);
1050
tp->skip = 0;
1051
if (k > 2 && tp->elements > 1)
1052
{
1053
fp->hit = k;
1054
fp->size = tp->size;
1055
fp->elements = tp->elements;
1056
for (j = 0; j < tp->elements; j++)
1057
fp->member[j] = tp->member[j];
1058
fp++;
1059
}
1060
}
1061
if (np == nxt)
1062
break;
1063
tp = nxt;
1064
nxt = cur;
1065
cur = tp;
1066
cp = np;
1067
k++;
1068
}
1069
1070
/*
1071
* collect the order in ord and the partition labels in lab
1072
*/
1073
1074
if (state.verbose)
1075
sfprintf(sfstderr, "reorder done\n");
1076
j = 0;
1077
jj = 0;
1078
k = 0;
1079
while (fp-- > fin)
1080
for (i = 0; i < fp->elements; i++)
1081
if (hit[fp->member[i]] == fp->hit)
1082
{
1083
if (!jj)
1084
{
1085
jj = 1;
1086
j++;
1087
}
1088
hit[fp->member[i]] = 0;
1089
ord[k++] = fp->member[i];
1090
lab[fp->member[i]] = j;
1091
}
1092
for (i = 0; i < row; i++)
1093
if (hit[i])
1094
{
1095
ord[k++] = i;
1096
lab[i] = ++j;
1097
}
1098
1099
/*
1100
* permute state.map and dat according to the new order
1101
*/
1102
1103
permute(buf, dat, ord, row, tot);
1104
1105
/*
1106
* clean up
1107
*/
1108
1109
free(siz);
1110
free(cur);
1111
free(nxt);
1112
free(fin);
1113
free(hit);
1114
free(cst);
1115
free(ord);
1116
}
1117
1118
/*
1119
* tsp ordering
1120
*/
1121
1122
static void
1123
reorder_tsp(Reorder_method_t* method, unsigned char* buf, unsigned char* dat, int* lab, size_t row, size_t tot)
1124
{
1125
#if TSP_VERSION
1126
size_t i;
1127
size_t j;
1128
size_t end;
1129
size_t breakat;
1130
size_t* self;
1131
size_t** apart;
1132
size_t** together;
1133
Tsp_t* tsp;
1134
Tsp_index_t* tour;
1135
Tsp_index_t* cycle;
1136
Tsp_cost_t breakval;
1137
Tsp_cost_t** cost;
1138
Tsp_cost_t* v;
1139
Tsp_disc_t disc;
1140
1141
if (!(cost = newof(0, Tsp_cost_t*, row, row * row * sizeof(Tsp_cost_t))))
1142
error(ERROR_SYSTEM|3, "out of space [%d X %d cost matrix]", row, row);
1143
v = (Tsp_cost_t*)(cost + row);
1144
for (i = 0; i < row; i++)
1145
{
1146
cost[i] = v;
1147
v += row;
1148
}
1149
self = vector(row);
1150
apart = matrix(row);
1151
together = matrix(row);
1152
1153
/*
1154
* compute the Tsp_cost_t matrix
1155
*/
1156
1157
if (state.verbose)
1158
sfprintf(sfstderr, "compute the tsp cost matrix\n");
1159
for (i = 0; i < row; i++)
1160
self[i] = field(buf, dat, i, i, row, tot);
1161
for (i = 0; i < row; i++)
1162
for (j = 0; j < row; j++)
1163
if (i != j)
1164
{
1165
together[i][j] = pair(buf, dat, i, j, row, tot);
1166
apart[i][j] = self[i] + self[j];
1167
cost[i][j] = minof(together[i][j], apart[i][j]);
1168
}
1169
1170
/*
1171
* generate a tour
1172
*/
1173
1174
if (state.verbose)
1175
sfprintf(sfstderr, "generate a tour\n");
1176
memset(&disc, 0, sizeof(disc));
1177
disc.version = TSP_VERSION;
1178
disc.errorf = errorf;
1179
if (!(tsp = tspopen(&disc, cost, row, TSP_DFS|(state.verbose ? TSP_VERBOSE : 0))))
1180
error(3, "tspopen error");
1181
if (!(tour = tsptour(tsp)))
1182
error(3, "tsptour error");
1183
1184
/*
1185
* break tour at most expensive link; put order into self
1186
*/
1187
1188
breakat = end = row-1;
1189
breakval = cost[tour[end]][tour[0]];
1190
for (i = 0; i < end; i++)
1191
if (cost[tour[i]][tour[i+1]] > breakval)
1192
{
1193
breakat = i;
1194
breakval = cost[tour[i]][tour[i+1]];
1195
}
1196
j = 0;
1197
for (i = breakat + 1; i < row; i++)
1198
self[j++] = tour[i];
1199
for (i = 0; i <= breakat; i++)
1200
self[j++] = tour[i];
1201
1202
/*
1203
* permute state.map and dat according to the new order
1204
*/
1205
1206
permute(buf, dat, self, row, tot);
1207
1208
/*
1209
* partition
1210
*/
1211
1212
if (state.test & 0x0200)
1213
{
1214
/*
1215
* partition according to the initial tsp cycles
1216
*/
1217
1218
if (!(cycle = tspcycle(tsp)))
1219
error(3, "tspcycle error");
1220
for (i = 0; i < row; i++)
1221
lab[i] = cycle[self[i]];
1222
}
1223
else
1224
{
1225
/*
1226
* partition according to dependence along tour
1227
*/
1228
1229
for (i = j = 0; i < end; i++)
1230
{
1231
lab[i] = j;
1232
if (together[self[i]][self[i+1]] > apart[self[i]][self[i+1]])
1233
j++;
1234
}
1235
lab[i] = j;
1236
}
1237
1238
/*
1239
* clean up
1240
*/
1241
1242
free(self);
1243
free(apart);
1244
free(together);
1245
free(cost);
1246
tspclose(tsp);
1247
#else
1248
error(3, "%s ordering requires -l%s", method->name, method->name);
1249
#endif
1250
}
1251
1252
/*
1253
* dynamic program to find optimal partition based on zip sizes in val
1254
*/
1255
1256
static void
1257
optimize_dynamic(Optimize_method_t* method, unsigned char* buf, unsigned char* dat, size_t** val, size_t* cst, size_t* sol, size_t row, size_t tot, size_t grp)
1258
{
1259
int i;
1260
int j;
1261
int k;
1262
size_t new;
1263
1264
/*
1265
* fill in the field compress value matrix
1266
*/
1267
1268
for (i = 0; i < row; i++)
1269
{
1270
if (grp <= 0)
1271
k = row;
1272
else if ((k = i + grp) > row)
1273
k = row;
1274
if (state.verbose)
1275
sfprintf(sfstderr, "%s %d..%d\n", method->name, i, k - 1);
1276
for (j = i; j < k; j++)
1277
val[i][j] = field(buf, dat, i, j, row, tot);
1278
for (; j < row; j++)
1279
val[i][j] = ~0;
1280
}
1281
1282
/*
1283
* now run the dynamic program
1284
*/
1285
1286
for (i = 0; i < row; i++)
1287
{
1288
cst[i] = val[0][i];
1289
sol[i] = -1;
1290
for (j = 0; j < i; j++)
1291
{
1292
new = cst[j] + val[j+1][i];
1293
if (new < cst[i])
1294
{
1295
cst[i] = new;
1296
sol[i] = j;
1297
}
1298
}
1299
}
1300
}
1301
1302
/*
1303
* greedy algorithm to approximate optimal partition based on zip sizes in val
1304
*/
1305
1306
static void
1307
optimize_greedy(Optimize_method_t* method, unsigned char* buf, unsigned char* dat, size_t** val, size_t* cst, size_t* sol, size_t row, size_t tot, size_t grp)
1308
{
1309
int i;
1310
size_t expand;
1311
size_t new;
1312
1313
cst[0] = val[0][0] = field(buf, dat, 0, 0, row, tot);
1314
sol[0] = -1;
1315
for (i = 1; i < row; i++)
1316
{
1317
val[sol[i-1] + 1][i] = field(buf, dat, sol[i-1]+1, i, row, tot);
1318
val[i][i] = field(buf, dat, i, i, row, tot);
1319
expand = val[sol[i-1] + 1][i];
1320
new = cst[i-1] + val[i][i];
1321
if (new < expand) {
1322
cst[i] = val[i][i];
1323
sol[i] = i-1;
1324
}
1325
else {
1326
cst[i] = expand;
1327
sol[i] = sol[i-1];
1328
}
1329
}
1330
}
1331
1332
/*
1333
* transitive greedy algorithm to approximate optimal partition
1334
* based on zip sizes in val
1335
*/
1336
1337
static void
1338
optimize_transitive(Optimize_method_t* method, unsigned char* buf, unsigned char* dat, size_t** val, size_t* cst, size_t* sol, size_t row, size_t tot, size_t grp)
1339
{
1340
int i;
1341
size_t expand;
1342
1343
cst[0] = val[0][0] = field(buf, dat, 0, 0, row, tot);
1344
sol[0] = -1;
1345
for (i = 1; i < row; i++)
1346
{
1347
val[i][i] = field(buf, dat, i, i, row, tot);
1348
expand = field(buf, dat, i-1, i, row, tot);
1349
if (expand <= val[i-1][i-1] + val[i][i])
1350
{
1351
cst[i] = expand;
1352
sol[i] = sol[i-1];
1353
}
1354
else
1355
{
1356
cst[i] = val[i][i];
1357
sol[i] = i-1;
1358
}
1359
}
1360
}
1361
1362
/*
1363
* optimization (partitioning) method table
1364
* the first entry is the default
1365
*/
1366
1367
static Optimize_method_t optimize_methods[] =
1368
{
1369
{
1370
"dynamic",
1371
"dynamic program optimal partition",
1372
optimize_dynamic
1373
},
1374
{
1375
"greedy",
1376
"greedy approximation partition",
1377
optimize_greedy
1378
},
1379
{
1380
"none",
1381
"no partition",
1382
0
1383
},
1384
{
1385
"transitive",
1386
"transitive greedy approximation partition",
1387
optimize_transitive
1388
},
1389
};
1390
1391
/*
1392
* reorder method table
1393
* the first entry is the default
1394
*/
1395
1396
static Reorder_method_t reorder_methods[] =
1397
{
1398
{
1399
"heuristic",
1400
"heuristic reorder",
1401
reorder_heuristic
1402
},
1403
{
1404
"none",
1405
"no reordering",
1406
0
1407
},
1408
{
1409
"tsp",
1410
"tsp reordering",
1411
reorder_tsp
1412
},
1413
};
1414
1415
/*
1416
* optget() info discipline function
1417
*/
1418
1419
static int
1420
optinfo(Opt_t* op, Sfio_t* sp, const char* s, Optdisc_t* dp)
1421
{
1422
register int i;
1423
register int n;
1424
1425
n = 0;
1426
if (streq(s, "optimize_default"))
1427
n += sfprintf(sp, "%s", optimize_methods[0].name);
1428
else if (streq(s, "optimize_methods"))
1429
for (i = 0; i < elementsof(optimize_methods); i++)
1430
n += sfprintf(sp, "[+%s?%s]", optimize_methods[i].name, optimize_methods[i].description);
1431
else if (streq(s, "reorder_default"))
1432
n += sfprintf(sp, "%s", reorder_methods[0].name);
1433
else if (streq(s, "reorder_methods"))
1434
for (i = 0; i < elementsof(reorder_methods); i++)
1435
n += sfprintf(sp, "[+%s?%s]", reorder_methods[i].name, reorder_methods[i].description);
1436
return n;
1437
}
1438
1439
int
1440
main(int argc, char** argv)
1441
{
1442
unsigned char* dat;
1443
unsigned char* buf;
1444
char* s;
1445
int i;
1446
ssize_t n;
1447
ssize_t m;
1448
size_t rows;
1449
size_t tot;
1450
size_t rec;
1451
size_t win;
1452
int* lab;
1453
Sfio_t* ip;
1454
Sfio_t* dp;
1455
Pzdisc_t disc;
1456
struct stat is;
1457
struct stat cs;
1458
Optdisc_t optdisc;
1459
1460
int op = 0;
1461
int high = -10;
1462
int maxhigh = -40;
1463
int maxgrp = 0;
1464
size_t row = 0;
1465
Pz_t* pz = 0;
1466
char* partition = 0;
1467
1468
Optimize_method_t* optimize_method = &optimize_methods[0];
1469
Reorder_method_t* reorder_method = &reorder_methods[0];
1470
1471
error_info.id = "pin";
1472
optinit(&optdisc, optinfo);
1473
state.level = 6;
1474
state.window = PZ_WINDOW;
1475
if (!(dp = sfstropen()))
1476
error(ERROR_SYSTEM|3, "out of space");
1477
for (;;)
1478
{
1479
switch (optget(argv, usage))
1480
{
1481
case 'b':
1482
state.bzip = 1;
1483
continue;
1484
case 'c':
1485
state.cache = opt_info.num;
1486
continue;
1487
case 'g':
1488
maxgrp = opt_info.num;
1489
continue;
1490
case 'h':
1491
high = strtol(opt_info.arg, &s, 0);
1492
if (*s == '%')
1493
{
1494
s++;
1495
high = -high;
1496
}
1497
if (*s)
1498
error(3, "%s: %s: invalid number", opt_info.name, opt_info.arg);
1499
continue;
1500
case 'l':
1501
state.level = opt_info.num;
1502
continue;
1503
case 'm':
1504
maxhigh = strtol(opt_info.arg, &s, 0);
1505
if (*s == '%')
1506
{
1507
s++;
1508
maxhigh = -maxhigh;
1509
}
1510
if (*s)
1511
error(3, "%s: %s: invalid number", opt_info.name, opt_info.arg);
1512
continue;
1513
case 'o':
1514
state.sort = 1;
1515
continue;
1516
case 'p':
1517
partition = opt_info.arg;
1518
continue;
1519
case 'r':
1520
row = opt_info.num;
1521
continue;
1522
case 'v':
1523
state.verbose = 1;
1524
continue;
1525
case 'w':
1526
state.window = opt_info.num;
1527
continue;
1528
case 'O':
1529
n = strlen(opt_info.arg);
1530
for (i = 0;; i++)
1531
{
1532
if (i >= elementsof(optimize_methods))
1533
{
1534
error(2, "%s: unknown partition method", opt_info.arg);
1535
break;
1536
}
1537
if (strneq(opt_info.arg, optimize_methods[i].name, n))
1538
{
1539
optimize_method = &optimize_methods[i];
1540
break;
1541
}
1542
}
1543
continue;
1544
case 'Q':
1545
sfprintf(dp, "regress\n");
1546
continue;
1547
case 'R':
1548
n = strlen(opt_info.arg);
1549
for (i = 0;; i++)
1550
{
1551
if (i >= elementsof(reorder_methods))
1552
{
1553
error(2, "%s: unknown reorder method", opt_info.arg);
1554
break;
1555
}
1556
if (strneq(opt_info.arg, reorder_methods[i].name, n))
1557
{
1558
reorder_method = &reorder_methods[i];
1559
break;
1560
}
1561
}
1562
continue;
1563
case 'S':
1564
op |= OP_size;
1565
continue;
1566
case 'T':
1567
sfprintf(dp, "test=%s\n", opt_info.arg);
1568
continue;
1569
case 'V':
1570
op |= OP_verify;
1571
continue;
1572
case 'X':
1573
sfprintf(dp, "prefix=%s\n", opt_info.arg);
1574
continue;
1575
case '?':
1576
error(ERROR_USAGE|4, "%s", opt_info.arg);
1577
continue;
1578
case ':':
1579
if (!opt_info.option[0] && opt_info.name[0] == opt_info.name[1])
1580
sfputr(dp, &argv[opt_info.index - 1][2], '\n');
1581
else
1582
error(2, "%s", opt_info.arg);
1583
continue;
1584
}
1585
break;
1586
}
1587
argv += opt_info.index;
1588
if (error_info.errors || !(state.input = *argv++) || (n = *argv != 0) && !(op & OP_size))
1589
error(ERROR_USAGE|4, "%s", optusage(NiL));
1590
1591
/*
1592
* set up the workspace
1593
*/
1594
1595
if (optimize_method->fun)
1596
op |= OP_optimize;
1597
if (reorder_method->fun)
1598
op |= OP_reorder;
1599
memset(&disc, 0, sizeof(disc));
1600
disc.version = PZ_VERSION;
1601
disc.errorf = errorf;
1602
disc.partition = partition;
1603
disc.window = 1;
1604
if (sftell(dp) && !(disc.options = strdup(sfstruse(dp))))
1605
error(ERROR_SYSTEM|3, "out of space");
1606
sfclose(dp);
1607
if (op & OP_size)
1608
{
1609
row = 0;
1610
dat = 0;
1611
buf = 0;
1612
}
1613
for (;;)
1614
{
1615
ip = 0;
1616
if (pz = pzopen(&disc, state.input, 0))
1617
{
1618
state.test = pz->test;
1619
ip = pz->io;
1620
if (partition)
1621
{
1622
row = pz->part->row;
1623
high = pz->part->nmap;
1624
}
1625
else
1626
{
1627
m = pz->part->row;
1628
if (!row)
1629
{
1630
if (m > 0)
1631
{
1632
row = m;
1633
if (state.verbose && !(op & OP_size))
1634
sfprintf(sfstderr, "row size %I*u\n", sizeof(row), row);
1635
}
1636
else if (!(op & OP_size))
1637
error(1, "could not determine row size");
1638
}
1639
else if (m > 0 && (row < m || row > m && (row % m)))
1640
error(1, "row size %I*d may be invalid -- try %I*u next time", sizeof(row), row, sizeof(m), m);
1641
pz->io = 0;
1642
pzclose(pz);
1643
pz = 0;
1644
}
1645
}
1646
else if (!(op & OP_size))
1647
return 1;
1648
if (!(op & OP_size))
1649
break;
1650
if (row && (op & OP_verify))
1651
{
1652
win = (state.window / row) * row;
1653
if (!(dat = newof(dat, unsigned char, win, 0)))
1654
error(ERROR_SYSTEM|3, "out of space [dat]");
1655
if (!(buf = newof(buf, unsigned char, win, 0)))
1656
error(ERROR_SYSTEM|3, "out of space [buf]");
1657
if ((m = sfread(ip, dat, win)) < 0)
1658
m = 0;
1659
rows = m / row;
1660
if (filter(ip, &buf, &dat, pz, high, maxhigh, row, row * rows) == row)
1661
row = 0;
1662
}
1663
if (!n)
1664
{
1665
sfprintf(sfstdout, "%I*u\n", sizeof(row), row);
1666
return 0;
1667
}
1668
sfprintf(sfstdout, "%8I*u %s\n", sizeof(row), row, state.input);
1669
if (pz)
1670
pzclose(pz);
1671
else if (ip)
1672
sfclose(ip);
1673
if (!(state.input = *argv++))
1674
return 0;
1675
row = 0;
1676
}
1677
if (!(rec = row))
1678
error(3, "-r row-size is required");
1679
if (high > (int)row)
1680
error(3, "-h col-count must be <= -r row-size");
1681
state.window = (state.window / row) * row;
1682
if (!(dat = newof(0, unsigned char, state.window, 0)))
1683
error(ERROR_SYSTEM|3, "out of space [dat]");
1684
if (!(buf = newof(0, unsigned char, state.window, 0)))
1685
error(ERROR_SYSTEM|3, "out of space [buf]");
1686
if ((n = sfread(ip, dat, state.window)) <= 0)
1687
error(3, "input empty");
1688
rows = n / row;
1689
state.window = rows * row;
1690
if (state.sort)
1691
{
1692
state.sort = row;
1693
qsort(dat, rows, row, byrow);
1694
}
1695
1696
/*
1697
* set up the cache file
1698
*/
1699
1700
if (stat(state.input, &is))
1701
error(ERROR_SYSTEM|3, "%s: cannot stat", state.input);
1702
if (s = strrchr(state.input, '/'))
1703
s++;
1704
else
1705
s = state.input;
1706
if (state.cache)
1707
{
1708
if (!(state.cachefile = strdup(sfprints("%s-%s", error_info.id, s))))
1709
error(ERROR_SYSTEM|3, "out of space");
1710
if (!stat(state.cachefile, &cs) && cs.st_mtime < is.st_mtime && remove(state.cachefile))
1711
error(ERROR_SYSTEM|3, "%s: cannot update intermediate data cache", state.cachefile);
1712
}
1713
1714
/*
1715
* filter out the high frequency columns
1716
*/
1717
1718
if (row = filter(ip, &buf, &dat, pz, high, maxhigh, row, row * rows))
1719
{
1720
tot = row * rows;
1721
if (!(lab = newof(0, int, row, 0)))
1722
error(ERROR_SYSTEM|3, "out of space [lab]");
1723
if (pz && !(op & (OP_reorder|OP_optimize)))
1724
for (n = 0; n < pz->part->nmap; n++)
1725
lab[n] = pz->part->lab[n];
1726
}
1727
else
1728
op &= ~(OP_optimize|OP_reorder);
1729
1730
/*
1731
* determine a better ordering
1732
*/
1733
1734
if (op & OP_reorder)
1735
(*reorder_method->fun)(reorder_method, buf, dat, lab, row, tot);
1736
1737
/*
1738
* generate a partition based on the ordering
1739
*/
1740
1741
if (op & OP_optimize)
1742
{
1743
size_t* cst;
1744
size_t* sol;
1745
size_t** val;
1746
1747
/*
1748
* allocate the tmp workspace
1749
*/
1750
1751
val = matrix(row);
1752
cst = vector(row);
1753
sol = vector(row);
1754
1755
/*
1756
* partition
1757
*/
1758
1759
(*optimize_method->fun)(optimize_method, buf, dat, val, cst, sol, row, tot, maxgrp);
1760
1761
/*
1762
* gather the optimal partition group labels in lab
1763
*/
1764
1765
solution(lab, 0, sol, 0, row - 1);
1766
if (state.verbose)
1767
sfprintf(sfstderr, "%s done\n", optimize_method->name);
1768
1769
/*
1770
* clean up
1771
*/
1772
1773
free(val);
1774
free(cst);
1775
free(sol);
1776
}
1777
1778
/*
1779
* finished -- list the partition
1780
*/
1781
1782
sfprintf(sfstdout, "# pzip partition\n");
1783
sfprintf(sfstdout, "# %s\n", fmtident(usage));
1784
if (!(op & OP_reorder))
1785
sfprintf(sfstdout, "# group coalescing limited to adjacent columns\n");
1786
if (maxgrp)
1787
sfprintf(sfstdout, "# group size limited to %d columns\n", maxgrp);
1788
sfprintf(sfstdout, "# row %d window %d compression level %d\n", rec, state.window, state.level);
1789
if (state.sort)
1790
sfprintf(sfstdout, "\nsort=1\n");
1791
if (high)
1792
sfprintf(sfstdout, "\n%d\t# high frequency %d\n", rec, row);
1793
else
1794
sfprintf(sfstdout, "\n%d\n", rec);
1795
list(sfstdout, lab, row);
1796
return 0;
1797
}
1798
1799