#include "vcwhdr.h"
/* Per-window-handle state of the "vote" matching method.
** One instance is allocated when the method is opened and kept in vcw->mtdata.
*/
typedef struct _freq_s
{ Vcwfile_t* srcf; /* source file handle from vcwfopen(), or NIL if unavailable */
Sfio_t* tarf; /* target stream, or NIL if absent or its position cannot be queried */
Sfoff_t next; /* source offset just past the last source match (before widening) */
int dtsz; /* data size recorded at the last source match; 0 before the first */
double bestd; /* smallest match distance found so far for the current data */
int ntar; /* count of target probes attempted (reset to CHKTARGET on I/O failure) */
int star; /* count of windows that ended up being taken from the target */
} Freq_t;
/* Tuning constants for the "vote" window matcher.
** All "distance" thresholds are compared against the value returned by
** vcwngmatch(), where smaller means a closer match and 1. is used by this
** file as the "no match" value.
*/

/* Target probing stops once more than CHKTARGET probes have been made and
** the probe count exceeds eight times the number of target windows chosen.
** The argument is parenthesized so any pointer expression may be passed.
*/
#define CHKTARGET 32
#define ENDTARGET(fr) ((fr)->ntar > CHKTARGET && (fr)->ntar > 8*(fr)->star)

/* The neighborhood of the previous source match is probed only when the
** ratio of vcw->cmpsz (or the last data size if unset) to the last data
** size is below SEQSEARCH.
*/
#define SEQSEARCH 0.25
#define SEQITVL (8*1024)	/* bytes probed on each side of the expected next position */
#define IDXITVL (2*1024)	/* bytes probed on each side of an indexed source position */

/* Distances below these values end the search at the corresponding stage. */
#define SEQMATCH 0.16		/* neighborhood of the previous source match */
#define TARMATCH 0.08		/* data just before the current target position */
#define SRCHMATCH 0.04		/* index-driven search of the source */

#define NGRAMMATCH 0.01		/* passed through as the last argument of vcwngmatch() */

/* A best candidate whose distance exceeds these values is discarded. */
#define NOTARMATCH 0.20		/* candidate came from the target file */
#define NOSRCMATCH 0.40		/* candidate came from the source file */
#if __STD_C
static double frinterval(Vcwindow_t* vcw, size_t* dfreq, size_t size, Sfoff_t l, Sfoff_t r)
#else
static double frinterval(vcw, dfreq, size, l, r)
Vcwindow_t* vcw;
size_t* dfreq;
size_t size;
Sfoff_t l, r;
#endif
{
Vcchar_t *data;
size_t dtsz;
double dif;
int mtch;
Freq_t *fr = (Freq_t*)vcw->mtdata;
Vcwmatch_t *wm = &vcw->match;
if(l < 0)
l = 0;
if((r += size) > fr->srcf->size)
r = fr->srcf->size;
if((dtsz = (size_t)(r-l)) < size)
return 1.;
if(sfseek(fr->srcf->file, l, 0) != l ||
!(data = sfreserve(fr->srcf->file, dtsz, 0)) )
return 1.;
if((dif = vcwngmatch(&mtch, dfreq, size, data, dtsz, 0, NGRAMMATCH)) < fr->bestd)
{ fr->bestd = dif;
wm->type = VCD_SOURCEFILE;
wm->wpos = l + mtch;
wm->wsize = size;
}
return fr->bestd;
}
#if __STD_C
static double frsearch(Vcwindow_t* vcw, size_t* dfreq, size_t size)
#else
static double frsearch(vcw, dfreq, size)
Vcwindow_t* vcw;
size_t* dfreq;
size_t size;
#endif
{
Sfoff_t pos, l, r, max;
int i;
Freq_t *fr = (Freq_t*)vcw->mtdata;
max = fr->srcf->size - size;
for(i = 0; i < fr->srcf->nidx; )
{
pos = ((Sfoff_t)fr->srcf->idx[i])*((Sfoff_t)NG_SIZE);
if((l = pos - IDXITVL) < 0)
l = 0;
if((r = pos + IDXITVL) > max)
r = max;
for(i = i+1; i < fr->srcf->nidx; ++i)
{ pos = ((Sfoff_t)fr->srcf->idx[i])*((Sfoff_t)NG_SIZE);
if(pos-IDXITVL >= r)
break;
if((r = pos+IDXITVL) > max)
r = max;
}
if(frinterval(vcw, dfreq, size, l, r) >= 1. )
return 1.;
if(fr->bestd < SRCHMATCH)
break;
}
return fr->bestd;
}
/* Probe the target data that immediately precedes the current position.
**
** vcw:   window handle; vcw->match receives the candidate window.
** dfreq: n-gram frequency table of the data being matched.
** size:  size of the data being matched.
** here:  current position in the target data.
**
** The region examined starts size+size/8 bytes before here (clamped to 0)
** and ends at here. Returns 1. when probing has been given up (ENDTARGET),
** when that region is shorter than size, or on a stream failure; failures
** also reset the probe counters. Otherwise returns fr->bestd, lowered when
** this region yields a smaller distance. The stream position found on
** entry is restored after the region has been read or could not be reserved.
*/
#if __STD_C
static double frtarget(Vcwindow_t* vcw, size_t* dfreq, size_t size, Sfoff_t here)
#else
static double frtarget(vcw, dfreq, size, here)
Vcwindow_t* vcw;
size_t* dfreq;
size_t size;
Sfoff_t here;
#endif
{
	Freq_t		*fr = (Freq_t*)vcw->mtdata;
	Vcwmatch_t	*wm = &vcw->match;
	Sfoff_t		saved, start;
	size_t		span;
	Vcchar_t	*buf;
	int		off;
	double		d;

	if(ENDTARGET(fr))
		return 1.;
	fr->ntar += 1;

	/* remember where the stream currently is */
	saved = sfseek(fr->tarf, (Sfoff_t)0, 1);
	if(saved < 0)
		goto failed;

	start = here - (size+size/8);
	if(start < 0)
		start = 0;
	span = (size_t)(here - start);
	if(span < size)
		return 1.;

	if(sfseek(fr->tarf, start, 0) != start )
		goto failed;

	buf = sfreserve(fr->tarf, span, 0);
	if(!buf)
	{	sfseek(fr->tarf, saved, 0);
		goto failed;
	}

	d = vcwngmatch(&off, dfreq, size, buf, span, 0, NGRAMMATCH);
	if(d < fr->bestd )
	{	fr->bestd = d;
		wm->type = VCD_TARGETFILE;
		wm->wpos = start + off;
		wm->wsize = size;
	}

	sfseek(fr->tarf, saved, 0);
	return fr->bestd;

failed:	/* stream trouble: reset the counters used by ENDTARGET */
	fr->ntar = CHKTARGET;
	fr->star = 0;
	return 1.;
}
/* Select the source or target window that best resembles the given data.
**
** vcw:   window handle; the result is built in vcw->match.
** data:  data for which a matching window is wanted.
** dtsz:  size of data.
** here:  current position in the target data.
**
** Stages, each ending the search when its distance threshold is met:
**   1. the target data just before here (frtarget),
**   2. the neighborhood of the previous source match (frinterval),
**   3. an index-driven search of the source (frsearch).
** If no acceptable candidate remains, the source window at the same
** position as here is used, provided a source file exists.
**
** Returns a pointer to vcw->match with wdata/wsize/wpos/msize filled in,
** or NIL if the handle is unusable, no window can be chosen, or the window
** data cannot be read.
*/
#if __STD_C
static Vcwmatch_t* frmatch(Vcwindow_t* vcw, Void_t* data, size_t dtsz, Sfoff_t here)
#else
static Vcwmatch_t* frmatch(vcw, data, dtsz, here)
Vcwindow_t* vcw;
Void_t* data;
size_t dtsz;
Sfoff_t here;
#endif
{
	size_t		dfreq[NG_FREQ];
	ssize_t		comp;
	Sfoff_t		high;
	Sfio_t		*sf;
	Freq_t		*fr;
	Vcwmatch_t	*wm;

	if(!vcw || !(fr = (Freq_t*)vcw->mtdata) || (!fr->srcf && !fr->tarf) )
		return NIL(Vcwmatch_t*);

	/* taken only after vcw has been validated; forming &vcw->match from
	** a NULL handle is undefined and could let the check be optimized away
	*/
	wm = &vcw->match;

	/* vcw->cmpsz is consumed here; fall back to the last data size if unset */
	if((comp = vcw->cmpsz) <= 0)
		comp = fr->dtsz;
	vcw->cmpsz = 0;

	fr->bestd = 1.;
	wm->type = 0;
	vcwngfreq(dfreq, data, dtsz);

	/* stage 1: target data preceding the current position */
	if(fr->tarf && here > (Sfoff_t)dtsz &&
	   frtarget(vcw, dfreq, dtsz, here) < TARMATCH )
		goto done;

	/* stage 2: around where the next sequential source match is expected */
	if(fr->srcf && (fr->dtsz == 0 || (comp/(double)fr->dtsz) < SEQSEARCH) &&
	   frinterval(vcw,dfreq,dtsz,fr->next-SEQITVL,fr->next+SEQITVL) < SEQMATCH )
		goto done;

	/* stage 3: positions suggested by the source index */
	if(fr->srcf && vcwfsearch(fr->srcf, (Vcchar_t*)data, dtsz) > 0 &&
	   frsearch(vcw, dfreq, dtsz) < SRCHMATCH )
		goto done;

	/* no stage was conclusive: drop a best candidate that is too distant */
	if((wm->type == VCD_TARGETFILE && fr->bestd > NOTARMATCH) ||
	   (wm->type == VCD_SOURCEFILE && fr->bestd > NOSRCMATCH) )
		wm->type = 0;

done:	if(wm->type == 0)
	{	/* default to the source window aligned with the current position */
		if(!fr->srcf)
			return NIL(Vcwmatch_t*);
		wm->type = VCD_SOURCEFILE;
		wm->wpos = here+dtsz < fr->srcf->size ? here : fr->srcf->size - dtsz;
		if(wm->wpos < 0)
			wm->wpos = 0;
		wm->wsize = dtsz;
	}

	if(wm->type == VCD_SOURCEFILE)
	{	/* remember this match for the next sequential probe */
		fr->dtsz = dtsz;
		fr->next = wm->wpos + dtsz;
		high = fr->srcf->size;
	}
	else
	{	/* a target window must end at or before the current position */
		fr->star += 1;
		high = here;
	}

	/* widen the window by VCWEXTRA(dtsz) on each side, kept within [0,high] */
	wm->wsize += 2*VCWEXTRA(dtsz);
	if((wm->wpos -= VCWEXTRA(dtsz)) < 0)
		wm->wpos = 0;
	if((wm->wpos + wm->wsize) > high && (wm->wpos = high - wm->wsize) < 0 )
	{	wm->wpos = 0;
		wm->wsize = (ssize_t)high;
	}

	/* fetch the window data from the stream the window belongs to */
	sf = wm->type == VCD_SOURCEFILE ? vcw->disc->srcf : vcw->disc->tarf;
	if(!sf || sfseek(sf, wm->wpos, 0) != wm->wpos ||
	   !(wm->wdata = sfreserve(sf, wm->wsize, 0)) ||
	   sfvalue(sf) < wm->wsize )
		return NIL(Vcwmatch_t*);

	wm->msize = dtsz;
	wm->more = 0;

	DEBUG_PRINT(2,"here=%8d ",(ssize_t)here);
	DEBUG_PRINT(2,"dtsz=%8d ",(ssize_t)dtsz);
	DEBUG_PRINT(2,"mtch=%8d ",(ssize_t)wm->msize);
	DEBUG_PRINT(2,"wpos=%8d ",(ssize_t)wm->wpos);
	DEBUG_PRINT(2,"wsiz=%8d \n",(ssize_t)wm->wsize);

	return wm;
}
/* Event handler of the "vote" method.
**
** VCW_OPENING: allocate the Freq_t state, open the source through
**   vcwfopen() if the discipline supplies one, adopt the target stream if
**   its position can be queried, and store the state in vcw->mtdata.
**   Returns -1 if allocation fails or neither stream is usable.
** VCW_CLOSING: close the source handle, free the state and clear
**   vcw->mtdata.
** Any other event is ignored. Returns 0 unless stated otherwise.
*/
#if __STD_C
static int frevent(Vcwindow_t* vcw, int type)
#else
static int frevent(vcw, type)
Vcwindow_t* vcw;
int type;
#endif
{
	Freq_t	*fr;

	if(type == VCW_OPENING)
	{
		fr = (Freq_t*)calloc(1,sizeof(Freq_t));
		if(!fr)
			return -1;

		/* source side: none unless the discipline provides a stream */
		fr->srcf = NIL(Vcwfile_t*);
		if(vcw->disc && vcw->disc->srcf )
			fr->srcf = vcwfopen(vcw->disc->srcf);

		/* target side: usable only if its current position can be obtained */
		fr->tarf = NIL(Sfio_t*);
		if(vcw->disc && vcw->disc->tarf &&
		   sfseek(vcw->disc->tarf, (Sfoff_t)0, 1) >= 0)
			fr->tarf = vcw->disc->tarf;

		/* nothing to match against */
		if(!fr->srcf && !fr->tarf)
		{	free(fr);
			return -1;
		}

		fr->next = 0;
		fr->dtsz = 0;
		fr->bestd = 1.;
		fr->star = 0;
		fr->ntar = 0;

		vcw->mtdata = (Void_t*)fr;
	}
	else if(type == VCW_CLOSING)
	{
		fr = (Freq_t*)vcw->mtdata;
		if(fr)
		{	if(fr->srcf)
				vcwfclose(fr->srcf);
			free(fr);
		}
		vcw->mtdata = NIL(Void_t*);
	}

	return 0;
}
/* Descriptor of the "vote" windowing method: the matching function, the
** event handler, the method name, a one-line description and a
** version/usage string.
** NOTE(review): the roles are inferred from the initializer order; confirm
** against the declaration of Vcwmethod_t.
*/
Vcwmethod_t _Vcwvote =
{ frmatch,
frevent,
"vote",
"Find windows by voting for matches.",
"[-version?window::vote (AT&T Research) 2003-01-01]" USAGE_LICENSE,
};
/* Pointer through which the descriptor defined just above is exported. */
Vcwmethod_t* Vcwvote = &_Vcwvote;