Path: blob/master/Week 7/Programming Assignment - 6/ex6/porterStemmer.m
863 views
function stem = porterStemmer(inString)1% Applies the Porter Stemming algorithm as presented in the following2% paper:3% Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,4% no. 3, pp 130-13756% Original code modeled after the C version provided at:7% http://www.tartarus.org/~martin/PorterStemmer/c.txt89% The main part of the stemming algorithm starts here. b is an array of10% characters, holding the word to be stemmed. The letters are in b[k0],11% b[k0+1] ending at b[k]. In fact k0 = 1 in this demo program (since12% matlab begins indexing by 1 instead of 0). k is readjusted downwards as13% the stemming progresses. Zero termination is not in fact used in the14% algorithm.1516% To call this function, use the string to be stemmed as the input17% argument. This function returns the stemmed word as a string.1819% Lower-case string20inString = lower(inString);2122global j;23b = inString;24k = length(b);25k0 = 1;26j = k;27282930% With this if statement, strings of length 1 or 2 don't go through the31% stemming process. Remove this conditional to match the published32% algorithm.33stem = b;34if k > 235% Output displays per step are commented out.36%disp(sprintf('Word to stem: %s', b));37x = step1ab(b, k, k0);38%disp(sprintf('Steps 1A and B yield: %s', x{1}));39x = step1c(x{1}, x{2}, k0);40%disp(sprintf('Step 1C yields: %s', x{1}));41x = step2(x{1}, x{2}, k0);42%disp(sprintf('Step 2 yields: %s', x{1}));43x = step3(x{1}, x{2}, k0);44%disp(sprintf('Step 3 yields: %s', x{1}));45x = step4(x{1}, x{2}, k0);46%disp(sprintf('Step 4 yields: %s', x{1}));47x = step5(x{1}, x{2}, k0);48%disp(sprintf('Step 5 yields: %s', x{1}));49stem = x{1};50end5152% cons(j) is TRUE <=> b[j] is a consonant.53function c = cons(i, b, k0)54c = true;55switch(b(i))56case {'a', 'e', 'i', 'o', 'u'}57c = false;58case 'y'59if i == k060c = true;61else62c = ~cons(i - 1, b, k0);63end64end6566% mseq() measures the number of consonant sequences between k0 and j. If67% c is a consonant sequence and v a vowel sequence, and <..> indicates68% arbitrary presence,6970% <c><v> gives 071% <c>vc<v> gives 172% <c>vcvc<v> gives 273% <c>vcvcvc<v> gives 374% ....75function n = measure(b, k0)76global j;77n = 0;78i = k0;79while true80if i > j81return82end83if ~cons(i, b, k0)84break;85end86i = i + 1;87end88i = i + 1;89while true90while true91if i > j92return93end94if cons(i, b, k0)95break;96end97i = i + 1;98end99i = i + 1;100n = n + 1;101while true102if i > j103return104end105if ~cons(i, b, k0)106break;107end108i = i + 1;109end110i = i + 1;111end112113114% vowelinstem() is TRUE <=> k0,...j contains a vowel115function vis = vowelinstem(b, k0)116global j;117for i = k0:j,118if ~cons(i, b, k0)119vis = true;120return121end122end123vis = false;124125%doublec(i) is TRUE <=> i,(i-1) contain a double consonant.126function dc = doublec(i, b, k0)127if i < k0+1128dc = false;129return130end131if b(i) ~= b(i-1)132dc = false;133return134end135dc = cons(i, b, k0);136137138% cvc(j) is TRUE <=> j-2,j-1,j has the form consonant - vowel - consonant139% and also if the second c is not w,x or y. this is used when trying to140% restore an e at the end of a short word. e.g.141%142% cav(e), lov(e), hop(e), crim(e), but143% snow, box, tray.144145function c1 = cvc(i, b, k0)146if ((i < (k0+2)) || ~cons(i, b, k0) || cons(i-1, b, k0) || ~cons(i-2, b, k0))147c1 = false;148else149if (b(i) == 'w' || b(i) == 'x' || b(i) == 'y')150c1 = false;151return152end153c1 = true;154end155156% ends(s) is TRUE <=> k0,...k ends with the string s.157function s = ends(str, b, k)158global j;159if (str(length(str)) ~= b(k))160s = false;161return162end % tiny speed-up163if (length(str) > k)164s = false;165return166end167if strcmp(b(k-length(str)+1:k), str)168s = true;169j = k - length(str);170return171else172s = false;173end174175% setto(s) sets (j+1),...k to the characters in the string s, readjusting176% k accordingly.177178function so = setto(s, b, k)179global j;180for i = j+1:(j+length(s))181b(i) = s(i-j);182end183if k > j+length(s)184b((j+length(s)+1):k) = '';185end186k = length(b);187so = {b, k};188189% rs(s) is used further down.190% [Note: possible null/value for r if rs is called]191function r = rs(str, b, k, k0)192r = {b, k};193if measure(b, k0) > 0194r = setto(str, b, k);195end196197% step1ab() gets rid of plurals and -ed or -ing. e.g.198199% caresses -> caress200% ponies -> poni201% ties -> ti202% caress -> caress203% cats -> cat204205% feed -> feed206% agreed -> agree207% disabled -> disable208209% matting -> mat210% mating -> mate211% meeting -> meet212% milling -> mill213% messing -> mess214215% meetings -> meet216217function s1ab = step1ab(b, k, k0)218global j;219if b(k) == 's'220if ends('sses', b, k)221k = k-2;222elseif ends('ies', b, k)223retVal = setto('i', b, k);224b = retVal{1};225k = retVal{2};226elseif (b(k-1) ~= 's')227k = k-1;228end229end230if ends('eed', b, k)231if measure(b, k0) > 0;232k = k-1;233end234elseif (ends('ed', b, k) || ends('ing', b, k)) && vowelinstem(b, k0)235k = j;236retVal = {b, k};237if ends('at', b, k)238retVal = setto('ate', b(k0:k), k);239elseif ends('bl', b, k)240retVal = setto('ble', b(k0:k), k);241elseif ends('iz', b, k)242retVal = setto('ize', b(k0:k), k);243elseif doublec(k, b, k0)244retVal = {b, k-1};245if b(retVal{2}) == 'l' || b(retVal{2}) == 's' || ...246b(retVal{2}) == 'z'247retVal = {retVal{1}, retVal{2}+1};248end249elseif measure(b, k0) == 1 && cvc(k, b, k0)250retVal = setto('e', b(k0:k), k);251end252k = retVal{2};253b = retVal{1}(k0:k);254end255j = k;256s1ab = {b(k0:k), k};257258% step1c() turns terminal y to i when there is another vowel in the stem.259function s1c = step1c(b, k, k0)260global j;261if ends('y', b, k) && vowelinstem(b, k0)262b(k) = 'i';263end264j = k;265s1c = {b, k};266267% step2() maps double suffices to single ones. so -ization ( = -ize plus268% -ation) maps to -ize etc. note that the string before the suffix must give269% m() > 0.270function s2 = step2(b, k, k0)271global j;272s2 = {b, k};273switch b(k-1)274case {'a'}275if ends('ational', b, k) s2 = rs('ate', b, k, k0);276elseif ends('tional', b, k) s2 = rs('tion', b, k, k0); end;277case {'c'}278if ends('enci', b, k) s2 = rs('ence', b, k, k0);279elseif ends('anci', b, k) s2 = rs('ance', b, k, k0); end;280case {'e'}281if ends('izer', b, k) s2 = rs('ize', b, k, k0); end;282case {'l'}283if ends('bli', b, k) s2 = rs('ble', b, k, k0);284elseif ends('alli', b, k) s2 = rs('al', b, k, k0);285elseif ends('entli', b, k) s2 = rs('ent', b, k, k0);286elseif ends('eli', b, k) s2 = rs('e', b, k, k0);287elseif ends('ousli', b, k) s2 = rs('ous', b, k, k0); end;288case {'o'}289if ends('ization', b, k) s2 = rs('ize', b, k, k0);290elseif ends('ation', b, k) s2 = rs('ate', b, k, k0);291elseif ends('ator', b, k) s2 = rs('ate', b, k, k0); end;292case {'s'}293if ends('alism', b, k) s2 = rs('al', b, k, k0);294elseif ends('iveness', b, k) s2 = rs('ive', b, k, k0);295elseif ends('fulness', b, k) s2 = rs('ful', b, k, k0);296elseif ends('ousness', b, k) s2 = rs('ous', b, k, k0); end;297case {'t'}298if ends('aliti', b, k) s2 = rs('al', b, k, k0);299elseif ends('iviti', b, k) s2 = rs('ive', b, k, k0);300elseif ends('biliti', b, k) s2 = rs('ble', b, k, k0); end;301case {'g'}302if ends('logi', b, k) s2 = rs('log', b, k, k0); end;303end304j = s2{2};305306% step3() deals with -ic-, -full, -ness etc. similar strategy to step2.307function s3 = step3(b, k, k0)308global j;309s3 = {b, k};310switch b(k)311case {'e'}312if ends('icate', b, k) s3 = rs('ic', b, k, k0);313elseif ends('ative', b, k) s3 = rs('', b, k, k0);314elseif ends('alize', b, k) s3 = rs('al', b, k, k0); end;315case {'i'}316if ends('iciti', b, k) s3 = rs('ic', b, k, k0); end;317case {'l'}318if ends('ical', b, k) s3 = rs('ic', b, k, k0);319elseif ends('ful', b, k) s3 = rs('', b, k, k0); end;320case {'s'}321if ends('ness', b, k) s3 = rs('', b, k, k0); end;322end323j = s3{2};324325% step4() takes off -ant, -ence etc., in context <c>vcvc<v>.326function s4 = step4(b, k, k0)327global j;328switch b(k-1)329case {'a'}330if ends('al', b, k) end;331case {'c'}332if ends('ance', b, k)333elseif ends('ence', b, k) end;334case {'e'}335if ends('er', b, k) end;336case {'i'}337if ends('ic', b, k) end;338case {'l'}339if ends('able', b, k)340elseif ends('ible', b, k) end;341case {'n'}342if ends('ant', b, k)343elseif ends('ement', b, k)344elseif ends('ment', b, k)345elseif ends('ent', b, k) end;346case {'o'}347if ends('ion', b, k)348if j == 0349elseif ~(strcmp(b(j),'s') || strcmp(b(j),'t'))350j = k;351end352elseif ends('ou', b, k) end;353case {'s'}354if ends('ism', b, k) end;355case {'t'}356if ends('ate', b, k)357elseif ends('iti', b, k) end;358case {'u'}359if ends('ous', b, k) end;360case {'v'}361if ends('ive', b, k) end;362case {'z'}363if ends('ize', b, k) end;364end365if measure(b, k0) > 1366s4 = {b(k0:j), j};367else368s4 = {b(k0:k), k};369end370371% step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.372function s5 = step5(b, k, k0)373global j;374j = k;375if b(k) == 'e'376a = measure(b, k0);377if (a > 1) || ((a == 1) && ~cvc(k-1, b, k0))378k = k-1;379end380end381if (b(k) == 'l') && doublec(k, b, k0) && (measure(b, k0) > 1)382k = k-1;383end384s5 = {b(k0:k), k};385386387