Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libvcodex/Vcwindow/vcwprefix.c
1810 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 2003-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Phong Vo <[email protected]> *
18
* *
19
***********************************************************************/
20
#include "vcwhdr.h"
21
22
/* Compute window using block matching.
23
**
24
** Written by Kiem-Phong Vo.
25
*/
26
27
#define PF_MINSIZE (16) /* minimum block size */
28
#define PF_DFLTSIZE (16*PF_MINSIZE) /* default block size */
29
#define PF_MAXSIZE (64*PF_MINSIZE) /* maximum block size */
30
31
#define PFMERGE(bz) (16*(bz)) /* merge if overlapped by this */
32
#define PFSMALL(n,bz) ((n) <= 3*(bz)) /* window too small to use */
33
34
#define PF_COEF ((Vchash_t)17) /* linear congruential hash */
35
36
#define PF_HUGE (~((Vchash_t)0))
37
38
typedef struct _pfseg_s Pfseg_t;
39
typedef struct _pffile_s Pffile_t;
40
typedef struct _prefix_s Prefix_t;
41
typedef struct _pfobj_s Pfobj_t;
42
typedef struct _pftab_s Pftab_t;
43
44
struct _pfseg_s
45
{ Sfoff_t ldt; /* data interval */
46
Sfoff_t rdt;
47
Sfoff_t lmt; /* matched data interval */
48
Sfoff_t rmt;
49
ssize_t mtch; /* number of matched bytes */
50
};
51
52
struct _pfobj_s
53
{ Vchash_t key;
54
Pfobj_t* next;
55
};
56
struct _pftab_s
57
{ Pfobj_t* list;
58
};
59
60
struct _pffile_s
61
{ Pfobj_t* obj; /* keys representing blocks */
62
Pftab_t* tab;
63
size_t mask;
64
65
Vcchar_t* bloom; /* Bloom filtering of unmatches */
66
size_t bmask;
67
68
Sfio_t* sf; /* stream matching against */
69
Sfoff_t sfsz; /* max file size */
70
ssize_t maxo; /* max object with signature */
71
72
Sfoff_t dtpos; /* position/size of parsed data */
73
ssize_t dtsz;
74
ssize_t cseg; /* current unprocessed segment */
75
ssize_t nseg; /* number of parsed segments */
76
ssize_t sseg; /* size of segment array */
77
Pfseg_t* seg; /* parsed segments in data */
78
};
79
80
struct _prefix_s
81
{ Pffile_t* srcf;
82
Pffile_t* tarf;
83
ssize_t blksz; /* block size for signatures */
84
Vchash_t coef; /* high coefficient of hashf */
85
Vchash_t* key; /* temp space for data keys */
86
ssize_t nkey; /* current size of key[] */
87
Sfoff_t keyb; /* base of calculated keys */
88
Sfoff_t here; /* data processed to here */
89
};
90
91
/* functions to set and test bits from a Bloom filter */
92
#define BLOOMDEF(pf,ky,b1,b2) (((b1) = (ky)&(pf)->bmask), \
93
((b2) = VCHASH(ky)&(pf)->bmask) )
94
#define BLOOMSET(pf,ky,b1,b2) (BLOOMDEF(pf,ky,b1,b2), \
95
((pf)->bloom[(b1) >> 3] |= (1 << ((b1)&7)) ), \
96
((pf)->bloom[(b2) >> 3] |= (1 << ((b2)&7)) ) )
97
#define Bloomtest(pf,ky,b1,b2) (BLOOMDEF(pf,ky,b1,b2), \
98
(((pf)->bloom[(b1) >> 3] & (1 << ((b1)&7)) ) && \
99
((pf)->bloom[(b2) >> 3] & (1 << ((b2)&7)) ) ) )
100
#define BLOOMTEST(pf,ky,b1,b2) ((pf)->bloom ? Bloomtest(pf,ky,b1,b2) : 1)
101
102
#define pfindex(pf,ky) (((ky)>>3) & (pf)->mask)
103
104
/* compute the linear congruential hash of a block of data */
105
#if __STD_C
106
static Vchash_t pfhash(Vcchar_t* data, ssize_t dtsz)
107
#else
108
static Vchash_t pfhash(data, dtsz)
109
Vcchar_t* data;
110
ssize_t dtsz;
111
#endif
112
{
113
Vchash_t key;
114
Vcchar_t *endd;
115
116
for(key = 0, endd = data+dtsz; data < endd; ++data)
117
key = key*PF_COEF + *data;
118
return key;
119
}
120
121
/* compute hash value of dt given the hash value of (dt-1) */
122
#define pfnext(dt,sz,ky,cf) \
123
( ((ky) - *((dt)-1)*(cf))*PF_COEF + *((dt)+sz-1) )
124
125
/* create the structure to search file sf */
126
#if __STD_C
127
static Pffile_t* pfopen(Sfio_t* sf, ssize_t sz, int srcfile)
128
#else
129
static Pffile_t* pfopen(sf, sz, srcfile)
130
Sfio_t* sf;
131
ssize_t sz;
132
int srcfile;
133
#endif
134
{
135
ssize_t k, n, blsz;
136
Sfoff_t sfsz;
137
Pffile_t *pf;
138
139
if((sfsz = sfsize(sf)) < 0 )
140
return NIL(Pffile_t*);
141
if(sz > 0 && (n = (ssize_t)(sfsz/sz)) > 0)
142
{ for(k = 1024; k*2 < n; k *= 2)
143
;
144
blsz = srcfile ? 2*k : 0; /* Bloom filter size */
145
}
146
else n = k = blsz = 0;
147
148
sz = sizeof(Pffile_t) + n*sizeof(Pfobj_t) + k*sizeof(Pftab_t) + blsz;
149
if(!(pf = (Pffile_t*)calloc(1,sz)) )
150
return NIL(Pffile_t*);
151
152
if(n > 0)
153
{ pf->obj = (Pfobj_t*)(pf + 1);
154
pf->tab = (Pftab_t*)(pf->obj+n);
155
pf->mask = k-1;
156
157
pf->bloom = srcfile ? (Vcchar_t*)(pf->tab + k) : NIL(Vcchar_t*);
158
pf->bmask = 8*blsz - 1;
159
}
160
161
pf->sf = sf;
162
pf->sfsz = sfsz;
163
pf->maxo = n <= 0 ? -1 : 0;
164
165
return pf;
166
}
167
168
/* make blocks with addresses < maxp searchable */
169
#if __STD_C
170
static int pfbuild(Pffile_t* pf, ssize_t blksz)
171
#else
172
static int pfbuild(pf, blksz)
173
Pffile_t* pf;
174
ssize_t blksz;
175
#endif
176
{
177
Vchash_t k, r, n;
178
Pfobj_t *obj, *endo;
179
Pftab_t *tab;
180
Void_t *dt;
181
182
if(pf->maxo < 0) /* no construction possible */
183
return 0;
184
185
/* construct keys to represent blocks */
186
if(sfseek(pf->sf, (Sfoff_t)0, 0) != 0)
187
return -1;
188
for(endo = (obj = pf->obj) + (size_t)(pf->sfsz/blksz); obj < endo; ++obj)
189
{ if(!(dt = sfreserve(pf->sf, blksz, 0)) || sfvalue(pf->sf) < blksz)
190
return -1;
191
if((k = pfhash(dt, blksz)) == PF_HUGE)
192
k = PF_HUGE-1;
193
194
obj->key = k;
195
if(pf->bloom)
196
BLOOMSET(pf, k, n, r);
197
tab = pf->tab + pfindex(pf, k);
198
obj->next = tab->list; tab->list = obj;
199
}
200
201
return 0;
202
}
203
204
#if __STD_C
205
static ssize_t pfsegment(Pffile_t* pf, ssize_t ld, ssize_t lm, ssize_t mn, ssize_t blksz)
206
#else
207
static ssize_t pfsegment(pf, ld, lm, mn, blksz)
208
Pffile_t* pf;
209
ssize_t ld;
210
ssize_t lm;
211
ssize_t mn;
212
ssize_t blksz;
213
#endif
214
{
215
Pfseg_t *seg;
216
Sfoff_t ldt, rdt, lmt, rmt;
217
218
ldt = ld + pf->dtpos; rdt = ldt + mn;
219
lmt = ((Sfoff_t)lm)*blksz; rmt = lmt + mn;
220
221
/* merge with last segment if overlapping */
222
if((seg = pf->nseg > 0 ? pf->seg + pf->nseg-1 : 0) &&
223
lmt <= (seg->rmt+PFMERGE(blksz)) && rmt >= (seg->lmt-PFMERGE(blksz)) )
224
{ if(seg->lmt > lmt)
225
seg->lmt = lmt;
226
if(seg->rmt < rmt)
227
seg->rmt = rmt;
228
seg->rdt = rdt;
229
seg->mtch += mn;
230
return mn;
231
}
232
233
if(PFSMALL(mn,blksz)) /* too small to make a segment */
234
return 0;
235
236
/* creating a new segment */
237
if(pf->nseg >= pf->sseg)
238
{ pf->seg = (Pfseg_t*)realloc(pf->seg, (pf->nseg+8)*sizeof(Pfseg_t));
239
if(!pf->seg)
240
return -1;
241
pf->sseg = pf->nseg + 8;
242
}
243
seg = pf->seg + pf->nseg; pf->nseg += 1;
244
seg->ldt = ldt; seg->rdt = rdt;
245
seg->lmt = lmt; seg->rmt = rmt;
246
seg->mtch = mn;
247
return mn;
248
}
249
250
/* parse a string of keys into matchable and unmatched data */
251
#if __STD_C
252
static ssize_t pfparse(Pffile_t* pf, Vchash_t* dt, ssize_t n, Sfoff_t pos, ssize_t blksz)
253
#else
254
static ssize_t pfparse(pf, dt, n, pos, blksz)
255
Pffile_t* pf;
256
Vchash_t* dt;
257
ssize_t n;
258
Sfoff_t pos;
259
ssize_t blksz;
260
#endif
261
{
262
ssize_t i, k, d, dd, mm, mn, total, ucnt;
263
Pfobj_t *o;
264
265
pf->cseg = pf->nseg = 0;
266
pf->dtpos = pos;
267
pf->dtsz = n;
268
269
total = 0;
270
for(d = 0, n -= (blksz-1); d < n; d = dd+mn)
271
{ for(mn = 0, mm = -1, dd = d; dd < n; ++dd)
272
{ if(!BLOOMTEST(pf, dt[dd], i, k))
273
continue;
274
for(o = pf->tab[pfindex(pf, dt[dd])].list; o; o = o->next)
275
{ /* find a stretch of approximate matches */
276
k = dd; i = o - pf->obj; ucnt = 0;
277
for(; k < n && i < pf->maxo; k += blksz, i += 1)
278
{ if(dt[k] == pf->obj[i].key)
279
ucnt = 0;
280
else if(k == dd || ucnt >= 2)
281
break; /* too many misses */
282
else ucnt += 1;
283
}
284
if((k = (k-dd) - ucnt*blksz) > mn)
285
{ mn = k; /* current best match length */
286
mm = o - pf->obj;
287
}
288
}
289
if(mn > 0)
290
break;
291
}
292
293
if(mn <= 0) /* done with search for windows */
294
break;
295
else if((k = pfsegment(pf, dd, mm, mn, blksz)) < 0)
296
return -1;
297
else total += k; /* good match */
298
}
299
300
return total;
301
}
302
303
/* compute a window to return to application for delta-ing */
304
#if __STD_C
305
static int pfwindow(Pffile_t* pf, Vcwmatch_t* wm, ssize_t blksz)
306
#else
307
static int pfwindow(pf, wm, blksz)
308
Pffile_t* pf;
309
Vcwmatch_t* wm;
310
ssize_t blksz;
311
#endif
312
{
313
Pfseg_t *seg;
314
ssize_t sz;
315
316
seg = pf->cseg >= pf->nseg ? NIL(Pfseg_t*) : pf->seg+pf->cseg;
317
318
/* absorb left unmatched data if any */
319
sz = seg ? (ssize_t)(seg->ldt - pf->dtpos) : pf->dtsz;
320
if(seg && sz > 0 && sz < (seg->rdt - seg->ldt) )
321
{ seg->ldt -= sz;
322
if((seg->lmt -= sz) < 0)
323
seg->lmt = 0;
324
sz = 0;
325
}
326
327
if(sz > 0) /* returning an unmatchable window of data */
328
{ wm->wpos = -1;
329
wm->wsize = 0;
330
wm->wdata = 0;
331
wm->msize = sz;
332
pf->dtpos += sz; pf->dtsz -= sz;
333
wm->more = pf->dtsz > 0 ? 1 : 0;
334
return 0;
335
}
336
337
pf->cseg += 1; /* will be returning whatever is in seg */
338
339
/* absorb any right unmatched block data */
340
if(pf->cseg < pf->nseg)
341
sz = (ssize_t)((seg+1)->ldt - seg->rdt);
342
else sz = (ssize_t)((pf->dtpos+pf->dtsz) - seg->rdt);
343
if(sz > 0)
344
{ seg->rdt += sz;
345
if((seg->rmt += sz) > pf->maxo*blksz)
346
seg->rmt = ((Sfoff_t)pf->maxo)*blksz;
347
}
348
349
/* set window */
350
wm->wpos = seg->lmt;
351
wm->wsize = (ssize_t)(seg->rmt - seg->lmt);
352
if(sfseek(pf->sf, seg->lmt, 0) != seg->lmt ||
353
!(wm->wdata = sfreserve(pf->sf, wm->wsize, 0)) ||
354
sfvalue(pf->sf) < wm->wsize)
355
{ pf->cseg = pf->nseg = 0;
356
pf->dtsz = 0;
357
return -1;
358
}
359
wm->msize = (ssize_t)(seg->rdt - seg->ldt);
360
361
pf->dtpos += wm->msize; pf->dtsz -= wm->msize;
362
wm->more = pf->dtsz > 0 ? 1 : 0;
363
364
return 1;
365
}
366
367
#if __STD_C
368
static Vcwmatch_t* pfmatch(Vcwindow_t* vcw, Void_t* data, size_t dtsz, Sfoff_t here)
369
#else
370
static Vcwmatch_t* pfmatch(vcw, data, dtsz, here)
371
Vcwindow_t* vcw;
372
Void_t* data; /* target data to be matched */
373
size_t dtsz; /* data size */
374
Sfoff_t here; /* current target position */
375
#endif
376
{
377
int rv;
378
Vcchar_t *dt, *endd;
379
Vchash_t k, *key;
380
Pffile_t *pf;
381
Vcwmatch_t *wm = &vcw->match;
382
Prefix_t *pfx = (Prefix_t*)vcw->mtdata;
383
384
if(!pfx->srcf && !pfx->tarf )
385
return NIL(Vcwmatch_t*);
386
387
if((pf = pfx->srcf) && ((Sfoff_t)dtsz >= pf->sfsz/2 || dtsz < pfx->blksz) )
388
{ wm->type = VCD_SOURCEFILE;
389
if(dtsz < pfx->blksz) /* small data, use matching window */
390
{ wm->wpos = here;
391
wm->wsize = here+dtsz > pf->sfsz ? (ssize_t)(pf->sfsz-here) : dtsz;
392
}
393
else /* too large, use all source file */
394
{ wm->wpos = 0;
395
wm->wsize = (ssize_t)pf->sfsz;
396
}
397
if(sfseek(pf->sf, (Sfoff_t)0, 0) != (Sfoff_t)0 ||
398
!(wm->wdata = sfreserve(pf->sf, wm->wsize, 0)) )
399
return NIL(Vcwmatch_t*);
400
wm->msize = dtsz;
401
wm->more = 0;
402
return wm;
403
}
404
405
target_file: /* returning computed windows in target file */
406
if((pf = pfx->tarf) )
407
{ if(pf->dtpos == here && dtsz <= pf->dtsz )
408
{ if((rv = pfwindow(pf, wm, pfx->blksz)) < 0)
409
return NIL(Vcwmatch_t*);
410
wm->type = rv == 0 ? 0 : VCD_TARGETFILE;
411
return wm;
412
}
413
414
pf->cseg = pf->nseg = 0;
415
pf->maxo = here/pfx->blksz; /* can now match to here */
416
}
417
418
pf = pfx->srcf;
419
if(dtsz > pfx->nkey) /* make sure we have space for keys */
420
{ pfx->key = (Vchash_t*)realloc(pfx->key, dtsz*sizeof(Vchash_t));
421
if((pfx->nkey = pfx->key ? dtsz : 0) <= 0)
422
return NIL(Vcwmatch_t*);
423
}
424
if(here != pfx->here || !pf || pf->dtsz <= 0 )
425
{ /* compute all keys for this set of data */
426
endd = (dt = (Vcchar_t*)data) + dtsz - (pfx->blksz-1);
427
k = pfhash(dt, pfx->blksz);
428
*(key = pfx->key) = k == PF_HUGE ? (k>>1) : k;
429
for(dt += 1; dt < endd; ++dt)
430
{ k = pfnext(dt, pfx->blksz, k, pfx->coef);
431
*++key = k == PF_HUGE ? (k>>1) : k;
432
/**/DEBUG_ASSERT(k == pfhash(dt, pfx->blksz));
433
}
434
pfx->keyb = pfx->here = here; /* set base of calculated keys */
435
436
/* match with source data to parse into segments */
437
if((pf ? pfparse(pf, pfx->key, dtsz, here, pfx->blksz) : 0) <= 0 )
438
{ /* fail source, try target */
439
if((pf = pfx->tarf) &&
440
pfparse(pf, pfx->key, dtsz, here, pfx->blksz) > 0 )
441
goto target_file;
442
else return NIL(Vcwmatch_t*);
443
}
444
}
445
446
/**/DEBUG_ASSERT(pf && pf == pfx->srcf && pf->dtsz > 0);
447
/**/DEBUG_ASSERT(pfx->here == here);
448
if((rv = pfwindow(pf, wm, pfx->blksz)) < 0)
449
return NIL(Vcwmatch_t*);
450
else
451
{ pfx->here = here + wm->msize;
452
/**/DEBUG_PRINT(2,"here=%8d ",(ssize_t)here);
453
/**/DEBUG_PRINT(2,"dtsz=%8d ",(ssize_t)dtsz);
454
/**/DEBUG_PRINT(2,"mtch=%8d ",(ssize_t)wm->msize);
455
/**/DEBUG_PRINT(2,"wpos=%8d ",(ssize_t)wm->wpos);
456
/**/DEBUG_PRINT(2,"wsiz=%8d \n",(ssize_t)wm->wsize);
457
if(rv > 0)
458
{ wm->type = VCD_SOURCEFILE;
459
return wm;
460
}
461
else if(!pfx->tarf || wm->msize <= 2*pfx->blksz)
462
{ wm->type = 0;
463
return wm;
464
}
465
else /* try matching this segment with target data */
466
{ key = pfx->key + (size_t)(here - pfx->keyb);
467
dtsz = wm->msize; /* set this for possible tail recursion */
468
if(pfparse(pfx->tarf, key, dtsz, here, pfx->blksz) > 0 )
469
goto target_file;
470
else
471
{ wm->type = 0;
472
return wm;
473
}
474
}
475
}
476
}
477
478
/* Event handler */
479
#if __STD_C
480
static int pfevent(Vcwindow_t* vcw, int type)
481
#else
482
static int pfevent(vcw, type)
483
Vcwindow_t* vcw;
484
int type;
485
#endif
486
{
487
Sfoff_t srcsz, tarsz, maxsz;
488
ssize_t sz;
489
Prefix_t *pfx;
490
int rv = -1;
491
492
if(type == VCW_OPENING)
493
{
494
if(!vcw->disc || (!vcw->disc->srcf && !vcw->disc->tarf) )
495
return -1;
496
497
if(!(pfx = (Prefix_t*)calloc(1,sizeof(Prefix_t))) )
498
return -1;
499
500
/* compute a suitable block size between [MINSIZE,MAXSIZE] */
501
srcsz = vcw->disc->srcf ? sfsize(vcw->disc->srcf) : 0;
502
tarsz = vcw->disc->tarf ? sfsize(vcw->disc->tarf) : 0;
503
maxsz = srcsz > tarsz ? srcsz : tarsz;
504
if(maxsz < PF_MINSIZE)
505
sz = 0;
506
else
507
{ sz = PF_DFLTSIZE;
508
#define PF_MINBLKCNT (8*1024) /* aim for at least this many */
509
#define PF_MAXBLKCNT (16*1024*1024) /* likewise for upper bound */
510
while(sz < PF_MAXSIZE && maxsz/sz > PF_MAXBLKCNT)
511
sz += PF_MINSIZE;
512
while(sz > PF_MINSIZE && maxsz/sz < PF_MINBLKCNT)
513
sz -= PF_MINSIZE;
514
}
515
pfx->blksz = sz; /**/ DEBUG_PRINT(2,"blksz=%d\n",pfx->blksz);
516
517
pfx->coef = 1; /* highest coef in hashing blocks */
518
for(; sz > 1; --sz)
519
pfx->coef *= PF_COEF;
520
521
if(srcsz > 0)
522
{ if(!(pfx->srcf = pfopen(vcw->disc->srcf, pfx->blksz, 1)) )
523
goto do_closing;
524
if(pfx->blksz > 0)
525
{ if(pfbuild(pfx->srcf, pfx->blksz) < 0)
526
goto do_closing;
527
pfx->srcf->maxo = pfx->srcf->sfsz/pfx->blksz;
528
}
529
}
530
531
if(tarsz > 0 && pfx->blksz > 0)
532
{ if(!(pfx->tarf = pfopen(vcw->disc->tarf, pfx->blksz, 0)) )
533
goto do_closing;
534
if(pfbuild(pfx->tarf, pfx->blksz) < 0)
535
goto do_closing;
536
pfx->tarf->maxo = 0;
537
}
538
539
vcw->mtdata = (Void_t*)pfx;
540
return 0;
541
}
542
else if(type == VCW_CLOSING)
543
{ rv = 0;
544
do_closing:
545
if((pfx = (Prefix_t*)vcw->mtdata) )
546
{ if(pfx->srcf)
547
{ if(pfx->srcf->seg && pfx->srcf->sseg > 0)
548
free(pfx->srcf->seg);
549
free(pfx->srcf);
550
}
551
if(pfx->tarf)
552
{ if(pfx->tarf->seg && pfx->tarf->sseg > 0)
553
free(pfx->tarf->seg);
554
free(pfx->tarf);
555
}
556
if(pfx->key && pfx->nkey > 0)
557
free(pfx->key);
558
free(pfx);
559
}
560
vcw->mtdata = NIL(Void_t*);
561
return rv;
562
}
563
else return 0;
564
}
565
566
Vcwmethod_t _Vcwprefix =
567
{ pfmatch,
568
pfevent,
569
"prefix",
570
"Find windows with matching prefixes.",
571
"[-version?window::prefix (AT&T Research) 2003-01-01]" USAGE_LICENSE,
572
};
573
574
Vcwmethod_t* Vcwprefix = &_Vcwprefix;
575
576