Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libvcodex/Vcwindow/vcwvote.c
1810 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 2003-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Phong Vo <[email protected]> *
18
* *
19
***********************************************************************/
20
#include "vcwhdr.h"
21
22
/* Method to compute a matching window in source data using
23
** n-gram frequencies to determine similarity.
24
**
25
** Written by Kiem-Phong Vo ([email protected])
26
*/
27
28
/* structure to search windows using frequency matching */
29
typedef struct _freq_s
30
{ Vcwfile_t* srcf; /* to find matches in source file */
31
Sfio_t* tarf; /* target file if it is seekable */
32
33
Sfoff_t next; /* next sequential match location */
34
int dtsz; /* size of last data set */
35
36
double bestd; /* best match distance so far */
37
38
int ntar; /* # of times checking target data */
39
int star; /* # of successes at getting a window */
40
} Freq_t;
41
42
#define CHKTARGET 32 /* minimum # of target checking */
43
#define ENDTARGET(fr) (fr->ntar > CHKTARGET && fr->ntar > 8*fr->star)
44
45
#define SEQSEARCH 0.25 /* check sequential if compressed well */
46
47
#define SEQITVL (8*1024) /* search around next location */
48
#define IDXITVL (2*1024) /* search around each candidate index */
49
50
#define SEQMATCH 0.16 /* close enough for a sequential match */
51
#define TARMATCH 0.08 /* close enough to prune target search */
52
#define SRCHMATCH 0.04 /* close enough to prune search */
53
#define NGRAMMATCH 0.01 /* prune in ngram frequency matching */
54
55
#define NOTARMATCH 0.20 /* do not use as a target file match */
56
#define NOSRCMATCH 0.40 /* do not use as a source file match */
57
58
#if __STD_C
59
static double frinterval(Vcwindow_t* vcw, size_t* dfreq, size_t size, Sfoff_t l, Sfoff_t r)
60
#else
61
static double frinterval(vcw, dfreq, size, l, r)
62
Vcwindow_t* vcw;
63
size_t* dfreq; /* frequencies of data n-grams */
64
size_t size; /* size of data */
65
Sfoff_t l, r; /* interval to search */
66
#endif
67
{
68
Vcchar_t *data;
69
size_t dtsz;
70
double dif;
71
int mtch;
72
Freq_t *fr = (Freq_t*)vcw->mtdata;
73
Vcwmatch_t *wm = &vcw->match;
74
75
if(l < 0)
76
l = 0;
77
if((r += size) > fr->srcf->size)
78
r = fr->srcf->size;
79
if((dtsz = (size_t)(r-l)) < size)
80
return 1.;
81
82
if(sfseek(fr->srcf->file, l, 0) != l ||
83
!(data = sfreserve(fr->srcf->file, dtsz, 0)) )
84
return 1.;
85
86
if((dif = vcwngmatch(&mtch, dfreq, size, data, dtsz, 0, NGRAMMATCH)) < fr->bestd)
87
{ fr->bestd = dif;
88
wm->type = VCD_SOURCEFILE;
89
wm->wpos = l + mtch;
90
wm->wsize = size;
91
}
92
93
return fr->bestd;
94
}
95
96
#if __STD_C
97
static double frsearch(Vcwindow_t* vcw, size_t* dfreq, size_t size)
98
#else
99
static double frsearch(vcw, dfreq, size)
100
Vcwindow_t* vcw;
101
size_t* dfreq; /* frequencies of data n-grams */
102
size_t size; /* size of data */
103
#endif
104
{
105
Sfoff_t pos, l, r, max;
106
int i;
107
Freq_t *fr = (Freq_t*)vcw->mtdata;
108
109
max = fr->srcf->size - size;
110
for(i = 0; i < fr->srcf->nidx; )
111
{
112
pos = ((Sfoff_t)fr->srcf->idx[i])*((Sfoff_t)NG_SIZE);
113
if((l = pos - IDXITVL) < 0)
114
l = 0;
115
if((r = pos + IDXITVL) > max)
116
r = max;
117
118
/* glom together all overlapping intervals */
119
for(i = i+1; i < fr->srcf->nidx; ++i)
120
{ pos = ((Sfoff_t)fr->srcf->idx[i])*((Sfoff_t)NG_SIZE);
121
if(pos-IDXITVL >= r)
122
break;
123
if((r = pos+IDXITVL) > max)
124
r = max;
125
}
126
127
if(frinterval(vcw, dfreq, size, l, r) >= 1. )
128
return 1.;
129
if(fr->bestd < SRCHMATCH)
130
break;
131
}
132
return fr->bestd;
133
}
134
135
#if __STD_C
136
static double frtarget(Vcwindow_t* vcw, size_t* dfreq, size_t size, Sfoff_t here)
137
#else
138
static double frtarget(vcw, dfreq, size, here)
139
Vcwindow_t* vcw;
140
size_t* dfreq; /* frequencies of data n-grams */
141
size_t size; /* size of data */
142
Sfoff_t here; /* current location */
143
#endif
144
{
145
Sfoff_t pos, cpos;
146
size_t dtsz;
147
int mtch;
148
Vcchar_t *data;
149
double dif;
150
Freq_t *fr = (Freq_t*)vcw->mtdata;
151
Vcwmatch_t *wm = &vcw->match;
152
153
/* if has not been successful, stop doing it */
154
if(ENDTARGET(fr))
155
return 1.;
156
157
fr->ntar += 1;
158
159
/* unseekable */
160
if((cpos = sfseek(fr->tarf, (Sfoff_t)0, 1)) < 0)
161
goto f_err;
162
163
/* search a neighborhood in target file */
164
if((pos = here - (size+size/8)) < 0)
165
pos = 0;
166
if((dtsz = (size_t)(here - pos)) < size)
167
return 1.;
168
169
if(sfseek(fr->tarf, pos, 0) != pos )
170
goto f_err;
171
if(!(data = sfreserve(fr->tarf, dtsz, 0)) )
172
{ sfseek(fr->tarf, cpos, 0);
173
174
f_err: /* will never try searching target file again */
175
fr->ntar = CHKTARGET;
176
fr->star = 0;
177
return 1.;
178
}
179
180
dif = vcwngmatch(&mtch, dfreq, size, data, dtsz, 0, NGRAMMATCH);
181
if(dif < fr->bestd )
182
{ fr->bestd = dif;
183
wm->type = VCD_TARGETFILE;
184
wm->wpos = pos + mtch;
185
wm->wsize = size;
186
}
187
188
sfseek(fr->tarf, cpos, 0);
189
return fr->bestd;
190
}
191
192
#if __STD_C
193
static Vcwmatch_t* frmatch(Vcwindow_t* vcw, Void_t* data, size_t dtsz, Sfoff_t here)
194
#else
195
static Vcwmatch_t* frmatch(vcw, data, dtsz, here)
196
Vcwindow_t* vcw;
197
Void_t* data; /* target data to be matched */
198
size_t dtsz; /* data size */
199
Sfoff_t here; /* current target position */
200
#endif
201
{
202
size_t dfreq[NG_FREQ];
203
ssize_t comp;
204
Sfoff_t high;
205
Sfio_t *sf;
206
Freq_t *fr;
207
Vcwmatch_t *wm = &vcw->match;
208
209
if(!vcw || !(fr = (Freq_t*)vcw->mtdata) || (!fr->srcf && !fr->tarf) )
210
return NIL(Vcwmatch_t*);
211
212
/* size of result from last compression */
213
if((comp = vcw->cmpsz) <= 0)
214
comp = fr->dtsz;
215
vcw->cmpsz = 0;
216
217
fr->bestd = 1.;
218
wm->type = 0;
219
vcwngfreq(dfreq, data, dtsz); /* n-gram frequencies of given data */
220
221
/* search back an area in target file for matches */
222
if(fr->tarf && here > (Sfoff_t)dtsz &&
223
frtarget(vcw, dfreq, dtsz, here) < TARMATCH )
224
goto done;
225
226
/* if last compression result is good, try next sequential position */
227
if(fr->srcf && (fr->dtsz == 0 || (comp/(double)fr->dtsz) < SEQSEARCH) &&
228
frinterval(vcw,dfreq,dtsz,fr->next-SEQITVL,fr->next+SEQITVL) < SEQMATCH )
229
goto done;
230
231
/* find places in source file likely to have matches */
232
if(fr->srcf && vcwfsearch(fr->srcf, (Vcchar_t*)data, dtsz) > 0 &&
233
frsearch(vcw, dfreq, dtsz) < SRCHMATCH )
234
goto done;
235
236
/* target file matches should be more stringent */
237
if((wm->type == VCD_TARGETFILE && fr->bestd > NOTARMATCH) ||
238
(wm->type == VCD_SOURCEFILE && fr->bestd > NOSRCMATCH) )
239
wm->type = 0;
240
241
done: if(wm->type == 0)
242
{ if(!fr->srcf)
243
return NIL(Vcwmatch_t*);
244
245
wm->type = VCD_SOURCEFILE;
246
wm->wpos = here+dtsz < fr->srcf->size ? here : fr->srcf->size - dtsz;
247
if(wm->wpos < 0)
248
wm->wpos = 0;
249
wm->wsize = dtsz;
250
}
251
252
if(wm->type == VCD_SOURCEFILE)
253
{ fr->dtsz = dtsz;
254
fr->next = wm->wpos + dtsz;
255
high = fr->srcf->size;
256
}
257
else
258
{ fr->star += 1; /* success at using target data */
259
high = here;
260
}
261
262
/* add a little extra around the computed window */
263
wm->wsize += 2*VCWEXTRA(dtsz);
264
if((wm->wpos -= VCWEXTRA(dtsz)) < 0)
265
wm->wpos = 0;
266
if((wm->wpos + wm->wsize) > high && (wm->wpos = high - wm->wsize) < 0 )
267
{ wm->wpos = 0;
268
wm->wsize = (ssize_t)high;
269
}
270
271
/* get window data */
272
sf = wm->type == VCD_SOURCEFILE ? vcw->disc->srcf : vcw->disc->tarf;
273
if(!sf || sfseek(sf, wm->wpos, 0) != wm->wpos ||
274
!(wm->wdata = sfreserve(sf, wm->wsize, 0)) ||
275
sfvalue(sf) < wm->wsize )
276
return NIL(Vcwmatch_t*);
277
278
wm->msize = dtsz;
279
wm->more = 0;
280
281
/**/DEBUG_PRINT(2,"here=%8d ",(ssize_t)here);
282
/**/DEBUG_PRINT(2,"dtsz=%8d ",(ssize_t)dtsz);
283
/**/DEBUG_PRINT(2,"mtch=%8d ",(ssize_t)wm->msize);
284
/**/DEBUG_PRINT(2,"wpos=%8d ",(ssize_t)wm->wpos);
285
/**/DEBUG_PRINT(2,"wsiz=%8d \n",(ssize_t)wm->wsize);
286
287
return wm;
288
}
289
290
/* Event handler */
291
#if __STD_C
292
static int frevent(Vcwindow_t* vcw, int type)
293
#else
294
static int frevent(vcw, type)
295
Vcwindow_t* vcw;
296
int type;
297
#endif
298
{
299
Freq_t *fr;
300
301
switch(type)
302
{
303
case VCW_OPENING:
304
if(!(fr = (Freq_t*)calloc(1,sizeof(Freq_t))) )
305
return -1;
306
307
if(vcw->disc && vcw->disc->srcf )
308
fr->srcf = vcwfopen(vcw->disc->srcf);
309
else fr->srcf = NIL(Vcwfile_t*);
310
311
if(vcw->disc && vcw->disc->tarf &&
312
sfseek(vcw->disc->tarf, (Sfoff_t)0, 1) >= 0)
313
fr->tarf = vcw->disc->tarf;
314
else fr->tarf = NIL(Sfio_t*);
315
316
if(!fr->srcf && !fr->tarf)
317
{ free(fr);
318
return -1;
319
}
320
321
fr->dtsz = 0;
322
fr->next = 0;
323
fr->bestd = 1.;
324
fr->ntar = fr->star = 0;
325
326
vcw->mtdata = (Void_t*)fr;
327
break;
328
329
case VCW_CLOSING:
330
if((fr = (Freq_t*)vcw->mtdata) )
331
{ if(fr->srcf)
332
vcwfclose(fr->srcf);
333
free(fr);
334
}
335
336
vcw->mtdata = NIL(Void_t*);
337
break;
338
}
339
340
return 0;
341
}
342
343
Vcwmethod_t _Vcwvote =
344
{ frmatch,
345
frevent,
346
"vote",
347
"Find windows by voting for matches.",
348
"[-version?window::vote (AT&T Research) 2003-01-01]" USAGE_LICENSE,
349
};
350
351
Vcwmethod_t* Vcwvote = &_Vcwvote;
352
353