Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libvcodex/Vcwindow/vcwngram.c
1810 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 2003-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Phong Vo <[email protected]> *
18
* *
19
***********************************************************************/
20
#include "vcwhdr.h"
21
22
/* Functions to compute n-gram frequencies, etc.
23
**
24
** Written by Kiem-Phong Vo ([email protected])
25
*/
26
27
/* compute n-gram frequencies */
28
#if __STD_C
29
int vcwngfreq(size_t* freq, Vcchar_t* data, size_t size)
30
#else
31
int vcwngfreq(freq, data, size)
32
size_t* freq; /* frequency array size NG_FREQ */
33
Vcchar_t* data; /* data to be aggregated */
34
size_t size; /* size of data in bytes */
35
#endif
36
{
37
size_t n, gram;
38
39
for(n = 0; n < NG_FREQ; ++n)
40
freq[n] = 0;
41
42
if(size < NG_BYTE)
43
return 0;
44
45
size -= NG_BYTE;
46
for(n = 0, NGINIT(data,gram); ; ++n, ++data, NGNEXT(data,gram) )
47
{ freq[NGINDEX(gram)] += 1;
48
if(n >= size)
49
return 0;
50
}
51
}
52
53
/* compute the position in "data" whose frequency vector sfreq
54
** is closest to the frequency vector dfreq.
55
*/
56
#if __STD_C
57
double vcwngmatch(int* mtch, size_t* dfreq, size_t size,
58
Vcchar_t* data, size_t dtsz, size_t pos, double stop)
59
#else
60
double vcwngmatch(mtch, dfreq, size, data, dtsz, pos, stop)
61
int* mtch; /* to return the best match */
62
size_t* dfreq; /* vector of data frequency */
63
size_t size; /* size of data to be matched */
64
Vcchar_t* data; /* source data array */
65
size_t dtsz; /* size of source data */
66
size_t pos; /* starting position of search */
67
double stop; /* stop search criterion */
68
#endif
69
{
70
size_t lfreq[NG_FREQ], rfreq[NG_FREQ]; /* frequency vectors */
71
Vcchar_t *lldt, *lrdt; /* boundaries of left data segment */
72
Grint_t llgr, lrgr; /* left,right n-grams of left segment */
73
size_t ldif, lmax; /* running statistics of left segment */
74
Vcchar_t *rldt, *rrdt; /* boundaries of right data segment */
75
Grint_t rlgr, rrgr; /* left,right n-grams of right segment */
76
size_t rdif, rmax; /* running statistics of right segment */
77
size_t bestp, l, r;
78
double bestd;
79
Vcchar_t *edata;
80
81
if(dtsz < size) /* worst possible match */
82
return 1.;
83
84
if(pos > (dtsz -= size) )
85
pos = dtsz;
86
87
vcwngfreq(rfreq, data+pos, size);
88
89
/* initial frequency vector */
90
rdif = rmax = 0;
91
for(r = 0; r < NG_FREQ; ++r)
92
{ if(dfreq[r] < rfreq[r])
93
{ rdif += rfreq[r] - dfreq[r];
94
rmax += rfreq[r];
95
}
96
else
97
{ rdif += dfreq[r] - rfreq[r];
98
rmax += dfreq[r];
99
}
100
}
101
102
bestp = pos;
103
bestd = rdif/(double)rmax;
104
if(bestd < stop || dtsz == 0)
105
goto done;
106
107
/* starting boundaries of data */
108
edata = data + dtsz;
109
lldt = rldt = data + pos;
110
lrdt = rrdt = data + pos + size - NG_BYTE;
111
112
if(lldt > data)
113
{ for(l = 0; l < NG_FREQ; ++l)
114
lfreq[l] = rfreq[l];
115
ldif = rdif;
116
lmax = rmax;
117
118
NGINIT(lldt,llgr);
119
NGINIT(lrdt,lrgr);
120
}
121
122
if(rldt < edata)
123
{ NGINIT(rldt, rlgr);
124
NGINIT(rrdt, rrgr);
125
}
126
127
while(lldt > data || rldt < edata)
128
{ if(lldt > data)
129
{ lldt -= 1; NGINIT(lldt, llgr); l = NGINDEX(llgr); /* l coming */
130
r = NGINDEX(lrgr); lrdt -= 1; NGINIT(lrdt, lrgr); /* r leaving */
131
if(l != r)
132
{ if((lfreq[r] -= 1) < dfreq[r])
133
ldif += 1;
134
else { ldif -= 1; lmax -= 1; }
135
136
if((lfreq[l] += 1) > dfreq[l])
137
{ ldif += 1; lmax += 1; }
138
else ldif -= 1;
139
140
if((ldif/(double)lmax) < bestd)
141
{ bestd = ldif/(double)lmax;
142
bestp = lldt-data;
143
if(bestd < stop)
144
goto done;
145
}
146
}
147
}
148
149
if(rldt < edata)
150
{ l = NGINDEX(rlgr); rldt += 1; NGNEXT(rldt, rlgr); /* l leaving */
151
rrdt += 1; NGNEXT(rrdt, rrgr); r = NGINDEX(rrgr); /* r coming */
152
if(l != r)
153
{ if((rfreq[l] -= 1) < dfreq[l])
154
rdif += 1;
155
else { rdif -= 1; rmax -= 1; }
156
157
if((rfreq[r] += 1) > dfreq[r])
158
{ rdif += 1; rmax += 1; }
159
else rdif -= 1;
160
161
if((rdif/(double)rmax) < bestd)
162
{ bestd = rdif/(double)rmax;
163
bestp = rldt-data;
164
if(bestd < stop)
165
goto done;
166
}
167
}
168
}
169
}
170
171
done: *mtch = bestp;
172
return bestd;
173
}
174
175
/* The signature of a segment of data is the sum of all its n-grams.
176
Thus, each 1K (2^10) segment has a maximum signature of size 2^25.
177
*/
178
#if __STD_C
179
Grint_t vcwngsig(Vcchar_t* data, size_t n)
180
#else
181
Grint_t vcwngsig(data, n)
182
Vcchar_t* data;
183
size_t n;
184
#endif
185
{
186
Grint_t sig, key;
187
Vcchar_t* endd;
188
189
endd = data + n - (NG_BYTE-1);
190
for(sig = 0, NGINIT(data,key), data += 1; data < endd; ++data)
191
{ sig += NGVALUE(key);
192
NGNEXT(data,key);
193
}
194
195
return sig;
196
}
197
198