/***********************************************************************1* *2* This software is part of the ast package *3* Copyright (c) 1989-2011 AT&T Intellectual Property *4* and is licensed under the *5* Eclipse Public License, Version 1.0 *6* by AT&T Intellectual Property *7* *8* A copy of the License is available at *9* http://www.eclipse.org/org/documents/epl-v10.html *10* (with md5 checksum b35adb5213ca9657e911e9befb180842) *11* *12* Information and Software Systems Research *13* AT&T Research *14* Florham Park NJ *15* *16* Phong Vo <[email protected]> *17* *18***********************************************************************/19#pragma prototyped20#define _IN_SUF_TREE21#include "suftree.h"2223/*24** Construct a suffix tree for a source string25** and perform string matching of various input strings.26** This is based on the suffix tree algorithm of E. McCreight.27** I extended the algorithm to remove the restriction that the28** last element of the string has to be different from the rest29** of the string. Note also that since the alphabet is large (256),30** instead of being stored in an array, the children of a node31** are kept in a linked list which is managed by a move-to-front32** heuristic.33**34** For details, see the paper:35** "A Space-Economical Suffix Tree Construction Algorithm"36** E.M. McCreight, JACM, v.23, No.2, 1976, pp.262-272.37**38** BldSuftree returns either NULL or the pointer to the root of the39** tree. In the latter case, the tree can be destroyed by free()-ing40** the root.41**42** Written by Kiem-Phong Vo, 5/20/88.43*/4445/* Delete a suffix tree */46void delsuftree(Suftree* root)47{48if(!root)49return;50root -= 1;51while(root)52{53register Suftree *next;54next = NEXT(root);55free(root);56root = next;57}58}5960/* Find a child whose label string starts with a given character */61static Suftree *child_find(Suftree* node, Element c)62{63register Suftree *np, *last;6465last = 0;66for(np = CHILD(node); np; np = SIBLING(np))67if(LENGTH(np) > 0 && *LABEL(np) == c)68break;69else last = np;7071if(np && last)72{73/* move-to-front heuristic */74SIBLING(last) = SIBLING(np);75SIBLING(np) = CHILD(node);76CHILD(node) = np;77}78return np;79}8081/* Get space for tree nodes. */82static Suftree *getmem(Suftree* root, int n)83{84Suftree *list;8586if(!(list = (Suftree*)malloc((n+1)*sizeof(Suftree))))87{88if(root)89delsuftree(root);90return 0;91}9293/* store where this list is for later deletion */94NEXT(list) = 0;95if(root)96{97for(root -= 1; NEXT(root); root = NEXT(root))98;99NEXT(root) = list;100}101102return list+1;103}104105/* Build the tree.106** Following are the meaning of a few important variables:107** clocus: contracted locus, this variable defines the108** tree node that points to the longest prefix109** that terminates at a node in the current tree.110** locus: defines a tree node to be constructed that111** points to the longest prefix that can be defined.112** Unless both clocus and locus equal the root,113** we maintain the invariance that clocus is the114** parent of locus.115** link: defines the sublink of the locus of the prefix116** of the previously inserted suffix.117*/118Suftree *bldsuftree(Element* src, long len)119{120register Element *sp, *mp, *rescan, *endmatch, *endsrc;121register Suftree *match, *clocus, *locus, *link;122register long mtlen, relen;123register int n;124Suftree *root, *list, *endlist;125126if(len == 0)127return 0;128129/* get initial space for the tree nodes */130root = 0;131132/* 2*len+1 is the maximum number of nodes that can be created */133n = 2*len+1;134if(n > ALLOCSIZE)135n = ALLOCSIZE;136if(!(list = getmem(root,n)))137return 0;138endlist = list + n;139140/* make root node */141root = list++;142LABEL(root) = 0;143CHILD(root) = 0;144LINK(root) = root;145146/* locus and contracted locus of the empty substring */147locus = clocus = root;148149/* the current match length */150mtlen = 0;151152/* the end of the source string */153endsrc = src+len;154155/* now build the tree */156for(; src < endsrc; ++src)157{158/* prepare for scanning the current suffix */159if(locus != root)160{161/* define the string to be rescanned */162rescan = LABEL(locus);163relen = LENGTH(locus);164165/* minus the first character of the previous prefix */166if(clocus == root)167{168rescan++;169if(relen > 0)170--relen;171}172}173else mtlen = relen = 0;174175/* the length of the known-to-be-matched part */176if(mtlen > 0)177--mtlen;178/**/ ASSERT(relen <= mtlen)179180/* use sublink to rescan */181link = LINK(clocus);182183/* rescan */184while(relen > 0)185{186/* find a child of link that starts with the187first character of rescan. We then know that188rescan must match a prefix of that child.189*/190match = child_find(link,*rescan);191/**/ ASSERT(match != NULL)192193/* clocus will be the parent of the new link */194clocus = link;195196/* rescan contains LABEL(match) */197if(relen >= LENGTH(match))198{199link = match;200relen -= LENGTH(match);201rescan += LENGTH(match);202}203/* rescan is a proper prefix of LABEL(match) */204else205{206if(list >= endlist)207{208if(!(list = getmem(root,ALLOCSIZE)))209return 0;210else endlist = list+ALLOCSIZE;211}212213/* make an internal node labeled with rescan */214LABEL(list) = rescan;215LENGTH(list) = relen;216CHILD(list) = match;217SIBLING(list) = SIBLING(match);218LINK(list) = root;219220/* adjust label and pointer of old node */221SIBLING(match) = 0;222LABEL(match) += relen;223LENGTH(match) -= relen;224225CHILD(link) = list;226link = list++;227break;228}229}230231/* define sublink for the prefix of the last suffix */232if(locus != root)233LINK(locus) = link;234235/* scan to match as much as possible */236locus = link;237sp = src + mtlen;238while(sp < endsrc)239{240/* see if it matches some child of clocus */241if(!(match = child_find(locus,*sp)))242break;243244/* clocus will be the parent of the new locus */245clocus = locus;246247/* find the extend of the match */248mp = LABEL(match);249endmatch = mp + LENGTH(match);250for(; sp < endsrc && mp < endmatch; ++sp, ++mp)251if(*sp != *mp)252break;253254/* the whole node is matched */255if(mp >= endmatch)256{257locus = match;258mtlen += LENGTH(match);259}260/* found the extended locus of this suffix */261else262{263if(list >= endlist)264{265if(!(list = getmem(root,ALLOCSIZE)))266return 0;267else endlist = list+ALLOCSIZE;268}269270/* make a new internal node */271LABEL(list) = LABEL(match);272LENGTH(list) = mp - LABEL(match);273CHILD(list) = match;274SIBLING(list) = SIBLING(match);275LINK(list) = root;276277SIBLING(match) = 0;278LABEL(match) += LENGTH(list);279LENGTH(match) -= LENGTH(list);280mtlen += LENGTH(list);281282/* the new node is the locus for this suffix */283CHILD(locus) = list;284locus = list++;285break;286}287}288289if(list >= endlist)290{291if(!(list = getmem(root,ALLOCSIZE)))292return 0;293else endlist = list+ALLOCSIZE;294}295296/* make a new external node for the suffix */297SUFFIX(list) = src;298LABEL(list) = sp;299LENGTH(list) = endsrc-sp;300CHILD(list) = 0;301302/* hook it in as the first child of locus */303SIBLING(list) = CHILD(locus);304CHILD(locus) = list++;305}306307return root;308}309310/* Given a raw string and a string represented in a suffix tree,311match the string against the tree to find a longest matching312prefix of the string.313Return the length of the match and where it occurs in the314string represented by the tree.315*/316long mtchsuftree(Suftree* tree, Element* str, long len, Element** mtchp)317{318register Suftree *match;319register Element *sp, *mp, *endmp, *endstr;320register long mlen;321322mlen = 0;323endstr = str + len;324while(1)325{326if(!(match = child_find(tree,*str)))327break;328329/* find the extent of the match */330mp = LABEL(match);331endmp = mp + LENGTH(match);332for(sp = str; sp < endstr && mp < endmp; ++sp, ++mp)333if(*sp != *mp)334break;335336/* update the length of the match */337mlen += sp-str;338339/* prepare for next iteration */340tree = match;341str = sp;342343/* see if we have to work any more */344if(mp < endmp || str >= endstr)345break;346}347348if(mlen == 0) /* no match */349*mtchp = 0;350else351{352/* find where the match starts */353while(CHILD(tree))354tree = CHILD(tree);355*mtchp = SUFFIX(tree);356}357358return mlen;359}360361362