Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
att
GitHub Repository: att/ast
Path: blob/master/src/lib/libodelta/suftree.c
1808 views
1
/***********************************************************************
2
* *
3
* This software is part of the ast package *
4
* Copyright (c) 1989-2011 AT&T Intellectual Property *
5
* and is licensed under the *
6
* Eclipse Public License, Version 1.0 *
7
* by AT&T Intellectual Property *
8
* *
9
* A copy of the License is available at *
10
* http://www.eclipse.org/org/documents/epl-v10.html *
11
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12
* *
13
* Information and Software Systems Research *
14
* AT&T Research *
15
* Florham Park NJ *
16
* *
17
* Phong Vo <[email protected]> *
18
* *
19
***********************************************************************/
20
#pragma prototyped
21
#define _IN_SUF_TREE
22
#include "suftree.h"
23
24
/*
25
** Construct a suffix tree for a source string
26
** and perform string matching of various input strings.
27
** This is based on the suffix tree algorithm of E. McCreight.
28
** I extended the algorithm to remove the restriction that the
29
** last element of the string has to be different from the rest
30
** of the string. Note also that since the alphabet is large (256),
31
** instead of being stored in an array, the children of a node
32
** are kept in a linked list which is managed by a move-to-front
33
** heuristic.
34
**
35
** For details, see the paper:
36
** "A Space-Economical Suffix Tree Construction Algorithm"
37
** E.M. McCreight, JACM, v.23, No.2, 1976, pp.262-272.
38
**
39
** BldSuftree returns either NULL or the pointer to the root of the
40
** tree. In the latter case, the tree can be destroyed by free()-ing
41
** the root.
42
**
43
** Written by Kiem-Phong Vo, 5/20/88.
44
*/
45
46
/* Delete a suffix tree */
47
void delsuftree(Suftree* root)
48
{
49
if(!root)
50
return;
51
root -= 1;
52
while(root)
53
{
54
register Suftree *next;
55
next = NEXT(root);
56
free(root);
57
root = next;
58
}
59
}
60
61
/* Find a child whose label string starts with a given character */
62
static Suftree *child_find(Suftree* node, Element c)
63
{
64
register Suftree *np, *last;
65
66
last = 0;
67
for(np = CHILD(node); np; np = SIBLING(np))
68
if(LENGTH(np) > 0 && *LABEL(np) == c)
69
break;
70
else last = np;
71
72
if(np && last)
73
{
74
/* move-to-front heuristic */
75
SIBLING(last) = SIBLING(np);
76
SIBLING(np) = CHILD(node);
77
CHILD(node) = np;
78
}
79
return np;
80
}
81
82
/* Get space for tree nodes. */
83
static Suftree *getmem(Suftree* root, int n)
84
{
85
Suftree *list;
86
87
if(!(list = (Suftree*)malloc((n+1)*sizeof(Suftree))))
88
{
89
if(root)
90
delsuftree(root);
91
return 0;
92
}
93
94
/* store where this list is for later deletion */
95
NEXT(list) = 0;
96
if(root)
97
{
98
for(root -= 1; NEXT(root); root = NEXT(root))
99
;
100
NEXT(root) = list;
101
}
102
103
return list+1;
104
}
105
106
/* Build the tree.
107
** Following are the meaning of a few important variables:
108
** clocus: contracted locus, this variable defines the
109
** tree node that points to the longest prefix
110
** that terminates at a node in the current tree.
111
** locus: defines a tree node to be constructed that
112
** points to the longest prefix that can be defined.
113
** Unless both clocus and locus equal the root,
114
** we maintain the invariance that clocus is the
115
** parent of locus.
116
** link: defines the sublink of the locus of the prefix
117
** of the previously inserted suffix.
118
*/
119
Suftree *bldsuftree(Element* src, long len)
120
{
121
register Element *sp, *mp, *rescan, *endmatch, *endsrc;
122
register Suftree *match, *clocus, *locus, *link;
123
register long mtlen, relen;
124
register int n;
125
Suftree *root, *list, *endlist;
126
127
if(len == 0)
128
return 0;
129
130
/* get initial space for the tree nodes */
131
root = 0;
132
133
/* 2*len+1 is the maximum number of nodes that can be created */
134
n = 2*len+1;
135
if(n > ALLOCSIZE)
136
n = ALLOCSIZE;
137
if(!(list = getmem(root,n)))
138
return 0;
139
endlist = list + n;
140
141
/* make root node */
142
root = list++;
143
LABEL(root) = 0;
144
CHILD(root) = 0;
145
LINK(root) = root;
146
147
/* locus and contracted locus of the empty substring */
148
locus = clocus = root;
149
150
/* the current match length */
151
mtlen = 0;
152
153
/* the end of the source string */
154
endsrc = src+len;
155
156
/* now build the tree */
157
for(; src < endsrc; ++src)
158
{
159
/* prepare for scanning the current suffix */
160
if(locus != root)
161
{
162
/* define the string to be rescanned */
163
rescan = LABEL(locus);
164
relen = LENGTH(locus);
165
166
/* minus the first character of the previous prefix */
167
if(clocus == root)
168
{
169
rescan++;
170
if(relen > 0)
171
--relen;
172
}
173
}
174
else mtlen = relen = 0;
175
176
/* the length of the known-to-be-matched part */
177
if(mtlen > 0)
178
--mtlen;
179
/**/ ASSERT(relen <= mtlen)
180
181
/* use sublink to rescan */
182
link = LINK(clocus);
183
184
/* rescan */
185
while(relen > 0)
186
{
187
/* find a child of link that starts with the
188
first character of rescan. We then know that
189
rescan must match a prefix of that child.
190
*/
191
match = child_find(link,*rescan);
192
/**/ ASSERT(match != NULL)
193
194
/* clocus will be the parent of the new link */
195
clocus = link;
196
197
/* rescan contains LABEL(match) */
198
if(relen >= LENGTH(match))
199
{
200
link = match;
201
relen -= LENGTH(match);
202
rescan += LENGTH(match);
203
}
204
/* rescan is a proper prefix of LABEL(match) */
205
else
206
{
207
if(list >= endlist)
208
{
209
if(!(list = getmem(root,ALLOCSIZE)))
210
return 0;
211
else endlist = list+ALLOCSIZE;
212
}
213
214
/* make an internal node labeled with rescan */
215
LABEL(list) = rescan;
216
LENGTH(list) = relen;
217
CHILD(list) = match;
218
SIBLING(list) = SIBLING(match);
219
LINK(list) = root;
220
221
/* adjust label and pointer of old node */
222
SIBLING(match) = 0;
223
LABEL(match) += relen;
224
LENGTH(match) -= relen;
225
226
CHILD(link) = list;
227
link = list++;
228
break;
229
}
230
}
231
232
/* define sublink for the prefix of the last suffix */
233
if(locus != root)
234
LINK(locus) = link;
235
236
/* scan to match as much as possible */
237
locus = link;
238
sp = src + mtlen;
239
while(sp < endsrc)
240
{
241
/* see if it matches some child of clocus */
242
if(!(match = child_find(locus,*sp)))
243
break;
244
245
/* clocus will be the parent of the new locus */
246
clocus = locus;
247
248
/* find the extend of the match */
249
mp = LABEL(match);
250
endmatch = mp + LENGTH(match);
251
for(; sp < endsrc && mp < endmatch; ++sp, ++mp)
252
if(*sp != *mp)
253
break;
254
255
/* the whole node is matched */
256
if(mp >= endmatch)
257
{
258
locus = match;
259
mtlen += LENGTH(match);
260
}
261
/* found the extended locus of this suffix */
262
else
263
{
264
if(list >= endlist)
265
{
266
if(!(list = getmem(root,ALLOCSIZE)))
267
return 0;
268
else endlist = list+ALLOCSIZE;
269
}
270
271
/* make a new internal node */
272
LABEL(list) = LABEL(match);
273
LENGTH(list) = mp - LABEL(match);
274
CHILD(list) = match;
275
SIBLING(list) = SIBLING(match);
276
LINK(list) = root;
277
278
SIBLING(match) = 0;
279
LABEL(match) += LENGTH(list);
280
LENGTH(match) -= LENGTH(list);
281
mtlen += LENGTH(list);
282
283
/* the new node is the locus for this suffix */
284
CHILD(locus) = list;
285
locus = list++;
286
break;
287
}
288
}
289
290
if(list >= endlist)
291
{
292
if(!(list = getmem(root,ALLOCSIZE)))
293
return 0;
294
else endlist = list+ALLOCSIZE;
295
}
296
297
/* make a new external node for the suffix */
298
SUFFIX(list) = src;
299
LABEL(list) = sp;
300
LENGTH(list) = endsrc-sp;
301
CHILD(list) = 0;
302
303
/* hook it in as the first child of locus */
304
SIBLING(list) = CHILD(locus);
305
CHILD(locus) = list++;
306
}
307
308
return root;
309
}
310
311
/* Given a raw string and a string represented in a suffix tree,
312
match the string against the tree to find a longest matching
313
prefix of the string.
314
Return the length of the match and where it occurs in the
315
string represented by the tree.
316
*/
317
long mtchsuftree(Suftree* tree, Element* str, long len, Element** mtchp)
318
{
319
register Suftree *match;
320
register Element *sp, *mp, *endmp, *endstr;
321
register long mlen;
322
323
mlen = 0;
324
endstr = str + len;
325
while(1)
326
{
327
if(!(match = child_find(tree,*str)))
328
break;
329
330
/* find the extent of the match */
331
mp = LABEL(match);
332
endmp = mp + LENGTH(match);
333
for(sp = str; sp < endstr && mp < endmp; ++sp, ++mp)
334
if(*sp != *mp)
335
break;
336
337
/* update the length of the match */
338
mlen += sp-str;
339
340
/* prepare for next iteration */
341
tree = match;
342
str = sp;
343
344
/* see if we have to work any more */
345
if(mp < endmp || str >= endstr)
346
break;
347
}
348
349
if(mlen == 0) /* no match */
350
*mtchp = 0;
351
else
352
{
353
/* find where the match starts */
354
while(CHILD(tree))
355
tree = CHILD(tree);
356
*mtchp = SUFFIX(tree);
357
}
358
359
return mlen;
360
}
361
362