Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libvcodex/vclzparse.c
1808 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 2003-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Phong Vo <[email protected]> *
18
* *
19
***********************************************************************/
20
#include "vchdr.h"
21
22
/* Generalized block-move parsing of a pair of strings.
23
**
24
** Written by Kiem-Phong Vo ([email protected])
25
*/
26
27
#define HMIN 3
28
#define MMIN(vcpa) ((vcpa)->mmin >= HMIN ? (vcpa)->mmin : HMIN)
29
30
#define MAXPAIR 256 /* max # of extended matching pairs */
31
typedef struct _pair_s
32
{ ssize_t size; /* # of pairs found */
33
Vclzmatch_t mtch[MAXPAIR]; /* storage for pairs */
34
} Pair_t;
35
#define NEXTPAIR(pr) ((pr)->size < MAXPAIR-1 ? ((pr)->mtch + (pr)->size++) : NIL(Vclzmatch_t*))
36
37
#define ALPHA 5 /* multiplier for hashing polynomial */
38
typedef struct _obj_s
39
{ struct _obj_s* next;
40
} Obj_t;
41
typedef struct _hash_s
42
{ Vchash_t mask; /* mask to get hash table index */
43
Obj_t** htab; /* hash table of size (mask+1) */
44
Obj_t* src; /* objects representing source */
45
Obj_t* tar; /* objects representing target */
46
Vchash_t coef; /* max polynomial coefficient */
47
ssize_t poly; /* number of polynomial terms */
48
} Hash_t;
49
#define ENTRY(hs,ky) ((hs)->htab + ((ssize_t)((ky) & (hs)->mask)) )
50
#define INSERT(e,ob) ((ob)->next = *(e), *(e) = (ob) )
51
#define DELETE(e,p,ob) ((p) ? ((p)->next = (ob)->next) : (*(e) = (ob)->next) )
52
53
#if __STD_C
54
static Hash_t* hashtable(ssize_t mmin, ssize_t nsrc, ssize_t ntar)
55
#else
56
static Hash_t* hashtable(mmin, nsrc, ntar)
57
ssize_t mmin; /* segment size to be hashed */
58
ssize_t nsrc; /* length of source data */
59
ssize_t ntar; /* length of target data */
60
#endif
61
{ ssize_t n, size;
62
Hash_t *hs;
63
64
/* compute size to allocate */
65
for(n = 1<<10; n < (nsrc+ntar)/4; n *= 2) /* size of hash table */
66
;
67
size = sizeof(Hash_t) + n*sizeof(Obj_t*) + (nsrc+ntar)*sizeof(Obj_t);
68
69
if(!(hs = (Hash_t*)calloc(1, size)) )
70
return NIL(Hash_t*);
71
72
hs->htab = (Obj_t**)(hs+1);
73
hs->src = (Obj_t*)(hs->htab + n);
74
hs->tar = hs->src + nsrc;
75
hs->mask = n - 1;
76
77
/* hashing polynomial data */
78
hs->poly = (mmin = mmin >= HMIN ? mmin : HMIN);
79
for(hs->coef = 1, n = 1; n < mmin; ++n)
80
hs->coef *= ALPHA;
81
82
return hs;
83
}
84
85
#if __STD_C
86
static Vchash_t fkhash(Hash_t* hs, Vcchar_t* ks, Vcchar_t* cm)
87
#else
88
static Vchash_t fkhash(hs, ks, cm)
89
Hash_t* hs;
90
Vcchar_t* ks; /* string to compute hash key */
91
Vcchar_t* cm; /* byte mapping before matching */
92
#endif
93
{ int k;
94
Vchash_t ky;
95
96
ky = cm ? cm[ks[0]] : ks[0];
97
for(k = 1; k < hs->poly; ++k)
98
ky = ky*ALPHA + (cm ? cm[ks[k]] : ks[k]);
99
100
return ky;
101
}
102
#define fknext(cf,ks,ns,ky) (((ky) - (ks)[0]*(cf) )*ALPHA + (ns)[0] )
103
#define fknextm(cf,ks,ns,ky,cm) (((ky) - (cm)[(ks)[0]]*(cf) )*ALPHA + (cm)[(ns)[0]] )
104
105
#if __STD_C
106
static Vchash_t rkhash(Hash_t* hs, Vcchar_t* ks, Vcchar_t* cm)
107
#else
108
static Vchash_t rkhash(hs, ks, cm)
109
Hash_t* hs;
110
Vcchar_t* ks;
111
Vcchar_t* cm;
112
#endif
113
{ int k;
114
Vchash_t ky;
115
116
ky = cm ? cm[ks[hs->poly - 1]] : ks[hs->poly - 1];
117
for(k = hs->poly-2; k >= 0; --k)
118
ky = ky*ALPHA + (cm ? cm[ks[k]] : ks[k]);
119
120
return ky;
121
}
122
123
/* extend to match nearby segments */
124
#if __STD_C
125
static ssize_t extend(Vclzparse_t* vcpa, Pair_t* pr, int type)
126
#else
127
static ssize_t extend(vcpa, pr, type)
128
Vclzparse_t* vcpa;
129
Pair_t* pr;
130
int type;
131
#endif
132
{
133
ssize_t m, em, n, en, u, mmin, umax, mbeg;
134
Vcchar_t *ms, *ts, *et, *mstr;
135
Vclzmatch_t *mt = pr->mtch + pr->size - 1;
136
Vcchar_t *cmap = ((vcpa->type&type&VCLZ_MAP) && vcpa->cmap) ? vcpa->cmap : 0;
137
138
/* amounts of matches required and unmatches allowed */
139
mmin = vcpa->mmin <= 1 ? 1 : vcpa->mmin;
140
umax = mmin*(2*vclogi(mmin) + 1);
141
142
if((m = mt->mpos - vcpa->nsrc) >= 0 )
143
{ mstr = vcpa->tar; /* matching in target data */
144
mbeg = vcpa->nsrc; /* origin of data */
145
ms = mstr + m; /* start of match */
146
n = vcpa->ntar - m; /* amount of matchable */
147
}
148
else
149
{ mstr = vcpa->src; /* matching in source data */
150
mbeg = 0;
151
ms = mstr + mt->mpos;
152
n = vcpa->nsrc - mt->mpos;
153
}
154
155
/* bounds of matchable data */
156
ts = vcpa->tar + (m = mt->tpos - vcpa->nsrc);
157
et = ts + (n <= (m = vcpa->ntar-m) ? n : m);
158
159
type &= VCLZ_REVERSE;
160
ts += mt->size; ms += (type ? -mt->size : mt->size);
161
for(; ts < et; ts += n, ms += (type ? -n : n))
162
{ for(u = mmin, n = 0, en = et-ts; n < en; )
163
{ /* count unmatches */
164
em = (em = n+u) > en ? en : em;
165
for(m = n; m < em; ++m)
166
if(ms[type ? -m : m] == (cmap ? cmap[ts[m]] : ts[m]))
167
break;
168
if((u -= (m-n)) <= 0)
169
goto done;
170
171
/* count matches */
172
for(n = m, m = n+1; m < en; ++m)
173
if(ms[type ? -m : m] != (cmap ? cmap[ts[m]] : ts[m]))
174
break;
175
if((m-n) < mmin) /* not big enough */
176
{ if((u += m-n) >= umax)
177
goto done;
178
n = m;
179
}
180
else if(!(mt = NEXTPAIR(pr)) )
181
goto done;
182
else
183
{ mt->tpos = (ts - vcpa->tar) + vcpa->nsrc + n;
184
mt->mpos = (ms - mstr) + mbeg + (type ? -n : n);
185
mt->size = m - n;
186
n = m;
187
break;
188
}
189
}
190
}
191
192
done: mt = pr->mtch+pr->size-1;
193
return (mt->tpos + mt->size) - pr->mtch->tpos;
194
}
195
196
#if __STD_C
197
static int hashparse(Vclzparse_t* vcpa, ssize_t prune)
198
#else
199
static int hashparse(vcpa, prune)
200
Vclzparse_t* vcpa;
201
ssize_t prune; /* prune target by this amount */
202
#endif
203
{
204
Vcchar_t *ks, *ns, *ts, *et, *sm, *em;
205
Obj_t *m, *p, *obj, *add, *endo, **ht;
206
ssize_t n, tp;
207
Vchash_t fk, fm, ky;
208
Obj_t *fpos, *rpos;
209
ssize_t flen, ftyp, rlen, rtyp;
210
Pair_t fpair, rpair, *pair;
211
Hash_t *hs;
212
Vchash_t coef;
213
int reverse = vcpa->type&VCLZ_REVERSE;
214
Vcchar_t *cmap = ((vcpa->type&VCLZ_MAP) && vcpa->cmap) ? vcpa->cmap : NIL(Vcchar_t*);
215
ssize_t mmin = MMIN(vcpa); /* minimum match length */
216
/**/DEBUG_ASSERT(vcpa->ntar >= mmin);
217
218
/* initialize hash table */
219
if(!(hs = hashtable(mmin, vcpa->nsrc, vcpa->ntar)))
220
return -1;
221
mmin = hs->poly;
222
coef = hs->coef;
223
224
if(vcpa->nsrc >= hs->poly) /* add source data into hash table */
225
{ fk = fkhash(hs, vcpa->src, NIL(Vcchar_t*)); ky = fk+1;
226
ns = (ks = vcpa->src) + mmin;
227
for(endo = (m = hs->src) + vcpa->nsrc - (mmin-1); ; ++ks, ++ns )
228
{ if(fk != ky )
229
{ ht = ENTRY(hs, fk); INSERT(ht, m); }
230
if((m += 1) >= endo)
231
break;
232
ky = fk; fk = fknext(coef, ks, ns, fk);
233
}
234
}
235
236
ns = (ks = vcpa->tar) + mmin; et = ks + vcpa->ntar; /* bounds of target data */
237
fk = fkhash(hs, ks, NIL(Vcchar_t*));
238
fm = cmap ? fkhash(hs, ks, cmap) : 0;
239
for(add = obj = hs->tar, endo = obj + vcpa->ntar - (mmin-1); ; )
240
{ /**/DEBUG_ASSERT(fkhash(hs, ks, 0) == fk);
241
/**/DEBUG_ASSERT(!cmap || fkhash(hs, ks, cmap) == fm);
242
243
fpos = rpos = NIL(Obj_t*); /* match position */
244
flen = rlen = vcpa->mmin - 1; /* match length */
245
ftyp = rtyp = 0; /* match type */
246
247
/* forward: matching twice for plain and mapped data */
248
for(tp = 0; tp <= VCLZ_MAP; tp += VCLZ_MAP)
249
{ if(tp == 0) /* plain matching */
250
ky = fk;
251
else if(!cmap)
252
continue;
253
else ky = fm;
254
255
ht = ENTRY(hs, ky);
256
if(prune > 0) /* prune target data outside of search window */
257
{ for(p = NIL(Obj_t*), m = *ht; m && m >= hs->tar; )
258
{ if((obj - m) > prune)
259
{ DELETE(ht,p,m); m = p ? p->next : *ht; }
260
else { p = m; m = m->next; }
261
}
262
}
263
264
for(m = *ht; m; m = m->next)
265
{ ts = ks; /* starting point of string */
266
267
if(m >= hs->tar) /* possible target match */
268
{ sm = vcpa->tar + (m - hs->tar);
269
em = vcpa->tar + vcpa->ntar;
270
}
271
else /* possible source match */
272
{ sm = vcpa->src + (m - hs->src);
273
em = vcpa->src + vcpa->nsrc;
274
}
275
if((em-sm) > (et-ts) ) /* matchable bound */
276
em = sm + (et-ts);
277
if((em-sm) <= flen )
278
continue;
279
280
if(tp == 0) /* match without mapping */
281
{ if(sm[flen] != ts[flen] || sm[flen>>1] != ts[flen>>1] )
282
continue;
283
for(; sm < em; ++sm, ++ts)
284
if(*sm != *ts)
285
break;
286
}
287
else /* match with mapping */
288
{ if(sm[flen] != cmap[ts[flen]] ||
289
sm[flen>>1] != cmap[ts[flen>>1]] )
290
continue;
291
for(; sm < em; ++sm, ++ts)
292
if(*sm != cmap[*ts])
293
break;
294
}
295
296
if((n = ts - ks) > flen)
297
{ fpos = m;
298
flen = n;
299
ftyp = tp;
300
}
301
}
302
}
303
if(flen >= vcpa->mmin)
304
{ n = 0;
305
if(add < obj)
306
{ fpair.mtch[n].tpos = add - hs->src;
307
fpair.mtch[n].mpos = -1;
308
fpair.mtch[n].size = obj - add;
309
n += 1;
310
}
311
fpair.mtch[n].tpos = obj - hs->src;
312
fpair.mtch[n].mpos = fpos - hs->src;
313
fpair.mtch[n].size = flen;
314
fpair.size = n+1;
315
flen = extend(vcpa, &fpair, ftyp);
316
}
317
318
/* reverse matching */
319
for(tp = 0; reverse && tp <= VCLZ_MAP; tp += VCLZ_MAP)
320
{ if(tp == 0)
321
ky = rkhash(hs, ks, 0);
322
else if(!cmap)
323
continue;
324
else ky = rkhash(hs, ks, cmap);
325
326
ht = ENTRY(hs, ky );
327
if(prune > 0) /* prune target data outside of search window */
328
{ for(p = NIL(Obj_t*), m = *ht; m && m >= hs->tar; )
329
{ if((obj - m) > prune)
330
{ DELETE(ht,p,m); m = p ? p->next : *ht; }
331
else { p = m; m = m->next; }
332
}
333
}
334
335
for(m = *ht; m; m = m->next)
336
{ ts = ks; /* starting point of string */
337
338
if(m >= hs->tar) /* possible target match */
339
{ sm = vcpa->tar + ((m + mmin-1) - hs->tar);
340
em = vcpa->tar;
341
if(sm >= ts)
342
continue;
343
}
344
else /* possible source match */
345
{ sm = vcpa->src + ((m + mmin-1) - hs->src);
346
em = vcpa->src;
347
}
348
if((sm-em) > (et-ts) ) /* matchable bound */
349
em = sm - (et-ts);
350
if((sm-em+1) <= rlen )
351
continue;
352
353
if(tp == 0) /* match without cmapping */
354
{ if(sm[-rlen] != ts[rlen] || sm[-(rlen>>1)] != ts[rlen>>1] )
355
continue;
356
for(; sm >= em; --sm, ++ts)
357
if(*sm != *ts)
358
break;
359
}
360
else /* match with cmapping */
361
{ if(sm[-rlen] != cmap[ts[rlen]] ||
362
sm[-(rlen>>1)] != cmap[ts[rlen>>1]] )
363
continue;
364
for(; sm >= em; --sm, ++ts)
365
if(*sm != cmap[*ts])
366
break;
367
}
368
369
if((n = ts - ks) > rlen)
370
{ rpos = m + (mmin-1);
371
rlen = n;
372
rtyp = tp;
373
}
374
}
375
}
376
if(rlen >= vcpa->mmin)
377
{ n = 0;
378
if(add < obj)
379
{ rpair.mtch[n].tpos = add - hs->src;
380
rpair.mtch[n].mpos = -1;
381
rpair.mtch[n].size = obj - add;
382
n += 1;
383
}
384
rpair.mtch[n].tpos = obj - hs->src;
385
rpair.mtch[n].mpos = rpos - hs->src;
386
rpair.mtch[n].size = rlen;
387
rpair.size = n+1;
388
rlen = extend(vcpa, &rpair, rtyp|VCLZ_REVERSE);
389
}
390
391
if(flen >= rlen)
392
{ n = flen; pair = &fpair; tp = ftyp; }
393
else { n = rlen; pair = &rpair; tp = rtyp|VCLZ_REVERSE; }
394
395
if(n >= vcpa->mmin) /* a potential match */
396
{ if((n = (*vcpa->parsef)(vcpa, tp, pair->mtch, pair->size)) < 0)
397
goto done;
398
else if(n == 0) /* treat everything as unmatched */
399
goto nope;
400
else if((add += n) >= endo)
401
break;
402
403
for(ky = fk+1;;) /* add parsed data to htab */
404
{ if(fk != ky )
405
{ ht = ENTRY(hs, fk); INSERT(ht, obj); }
406
ky = fk;
407
fk = fknext(coef, ks, ns, fk);
408
fm = cmap ? fknextm(coef, ks, ns, fm, cmap) : 0;
409
ks += 1; ns += 1;
410
if((obj += 1) >= add)
411
break;
412
}
413
}
414
else
415
{ nope: ht = ENTRY(hs, fk); INSERT(ht, obj);
416
if((obj += 1) >= endo)
417
break;
418
fk = fknext(coef, ks, ns, fk);
419
fm = cmap ? fknextm(coef, ks, ns, fm, cmap) : 0;
420
ks += 1; ns += 1;
421
}
422
}
423
424
if((n = (hs->tar + vcpa->ntar) - add) > 0 )
425
{ fpair.mtch[0].tpos = add - hs->src;
426
fpair.mtch[0].mpos = -1;
427
fpair.mtch[0].size = n;
428
n = (*vcpa->parsef)(vcpa, 0, fpair.mtch, 1);
429
}
430
431
done: free(hs);
432
return n >= 0 ? 0 : -1;
433
}
434
435
#if __STD_C
436
static int sfxparse(Vclzparse_t* vcpa)
437
#else
438
static int sfxparse(vcpa)
439
Vclzparse_t* vcpa;
440
#endif
441
{
442
ssize_t p, r, ad;
443
ssize_t lp, lz, rp, rz, savp, savlp, savlz;
444
ssize_t nsrc, nstr;
445
Vcsfxint_t *inv, *idx;
446
Vcchar_t *s1, *s2, *ends, *str = NIL(Vcchar_t*);
447
Pair_t pair;
448
Vcsfx_t *sfx = NIL(Vcsfx_t*);
449
ssize_t mmin = MMIN(vcpa);
450
int rv = -1;
451
452
/* catenate source and target strings into a superstring */
453
if((nsrc = vcpa->nsrc) == 0)
454
{ nstr = vcpa->ntar;
455
str = vcpa->tar;
456
}
457
else if(vcpa->tar == (vcpa->src + nsrc) )
458
{ nstr = nsrc + vcpa->ntar;
459
str = vcpa->src;
460
}
461
else
462
{ nstr = nsrc + vcpa->ntar;
463
if(!(str = (Vcchar_t*)malloc(nstr)) )
464
return -1;
465
memcpy(str, vcpa->src, nsrc);
466
memcpy(str+nsrc, vcpa->tar, vcpa->ntar);
467
}
468
ends = str+nstr;
469
470
if(!(sfx = vcsfxsort(str,nstr)) ) /* compute suffix array */
471
goto done;
472
idx = sfx->idx; inv = sfx->inv;
473
474
for(savlz = savlp = savp = 0, ad = p = nsrc; p < nstr; )
475
{ for(lz = lp = 0, r = inv[p]-1; r >= 0; --r)
476
{ if((lp = idx[r]) < p) /* best left match */
477
{ s1 = str+p; s2 = str+lp;
478
for(; s1 < ends && *s1 == *s2; ++s1, ++s2 )
479
lz += 1;
480
if(lp < nsrc && (lp+lz) > nsrc)
481
if((lz = nsrc-lp) < mmin )
482
lp = lz = 0;
483
break;
484
}
485
}
486
487
for(rz = rp = 0, r = inv[p]+1; r < nstr; ++r)
488
{ if((rp = idx[r]) < p) /* best right match */
489
{ s1 = str+p; s2 = str+rp;
490
for(; s1 < ends && *s1 == *s2; ++s1, ++s2 )
491
rz += 1;
492
if(rp < nsrc && (rp+rz) > nsrc)
493
if((rz = nsrc-rp) < mmin )
494
rp = rz = 0;
495
break;
496
}
497
}
498
499
if(rz > lz || (rz == lz && rp > lp) )
500
{ lp = rp; lz = rz; } /* best match over all */
501
502
#define SEARCH 4 /* neighborhood to search for an improved match */
503
if(savlz == 0 || (p-savp) < SEARCH)
504
{ if(lz > savlz )
505
{ savlp = lp; savlz = lz; savp = p;
506
p += 1; /* improve! go again */
507
continue;
508
}
509
if(savlz > 0 && lz >= (savlz-2) )
510
{ p += 1; /* not a bad short, go again */
511
continue;
512
}
513
}
514
if(savlz > 0) /* no more improvement, use current best */
515
{ lz = savlz; lp = savlp; p = savp;
516
savlz = savlp = savp = 0;
517
}
518
519
if(lz >= mmin)
520
{ r = 0;
521
if(ad < p)
522
{ pair.mtch[r].tpos = ad;
523
pair.mtch[r].mpos = -1;
524
pair.mtch[r].size = p-ad;
525
r += 1;
526
}
527
pair.mtch[r].tpos = p;
528
pair.mtch[r].mpos = lp;
529
pair.mtch[r].size = lz;
530
pair.size = r+1;
531
extend(vcpa, &pair, 0);
532
if((r = (*vcpa->parsef)(vcpa, 0, pair.mtch, pair.size)) < 0 )
533
goto done;
534
else if(r == 0)
535
goto nope;
536
else ad += r; /* advance by amount processed */
537
}
538
else
539
{ nope: p += 1; /* skip over an unmatched position */
540
}
541
} /**/DEBUG_ASSERT(p == nstr);
542
543
if(ad < p)
544
{ pair.mtch[0].tpos = ad;
545
pair.mtch[0].mpos = -1;
546
pair.mtch[0].size = p-ad;
547
if((*vcpa->parsef)(vcpa, 0, pair.mtch, 1) <= 0 )
548
goto done;
549
}
550
551
rv = 0; /* if got here, data successfully parsed */
552
553
done: if(str && str != vcpa->tar && str != vcpa->src)
554
free(str);
555
if(sfx)
556
free(sfx);
557
return rv;
558
}
559
560
#if __STD_C
561
int vclzparse(Vclzparse_t* vcpa, ssize_t bound)
562
#else
563
int vclzparse(vcpa, bound )
564
Vclzparse_t* vcpa;
565
ssize_t bound; /* <0: sfxsort, >=0: hashing with pruning if >0 */
566
#endif
567
{
568
/* error conditions */
569
if(!vcpa->parsef)
570
return -1;
571
if(vcpa->nsrc < 0 || (vcpa->nsrc > 0 && !vcpa->src) )
572
return -1;
573
if(vcpa->ntar < 0 || (vcpa->ntar > 0 && !vcpa->tar) )
574
return -1;
575
576
if(vcpa->ntar == 0 ) /* nothing to do */
577
return 0;
578
579
if(vcpa->ntar < MMIN(vcpa) ) /* no match possible */
580
{ Vclzmatch_t mtch;
581
mtch.tpos = 0;
582
mtch.mpos = -1;
583
mtch.size = vcpa->ntar;
584
if((*vcpa->parsef)(vcpa, 0, &mtch, 1) != vcpa->ntar)
585
return -1;
586
else return 0;
587
}
588
589
if((vcpa->type & (VCLZ_REVERSE|VCLZ_MAP)) || bound >= 0 || vcpa->cmap )
590
return hashparse(vcpa, bound);
591
else return sfxparse(vcpa);
592
}
593
594