CoCalc -- porterStemmer.m

GitHub Repository: hackassin/Coursera-Machine-Learning
Path: blob/master/Week 7/Programming Assignment - 6/ex6/porterStemmer.m
⁸⁶³ views
1
function stem = porterStemmer(inString)
2
% Applies the Porter Stemming algorithm as presented in the following
3
% paper:
4
% Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
5
%   no. 3, pp 130-137
6

7
% Original code modeled after the C version provided at:
8
% http://www.tartarus.org/~martin/PorterStemmer/c.txt
9

10
% The main part of the stemming algorithm starts here. b is an array of
11
% characters, holding the word to be stemmed. The letters are in b[k0],
12
% b[k0+1] ending at b[k]. In fact k0 = 1 in this demo program (since
13
% matlab begins indexing by 1 instead of 0). k is readjusted downwards as
14
% the stemming progresses. Zero termination is not in fact used in the
15
% algorithm.
16

17
% To call this function, use the string to be stemmed as the input
18
% argument.  This function returns the stemmed word as a string.
19

20
% Lower-case string
21
inString = lower(inString);
22

23
global j;
24
b = inString;
25
k = length(b);
26
k0 = 1;
27
j = k;
28

29

30

31
% With this if statement, strings of length 1 or 2 don't go through the
32
% stemming process. Remove this conditional to match the published
33
% algorithm.
34
stem = b;
35
if k > 2
36
    % Output displays per step are commented out.
37
    %disp(sprintf('Word to stem: %s', b));
38
    x = step1ab(b, k, k0);
39
    %disp(sprintf('Steps 1A and B yield: %s', x{1}));
40
    x = step1c(x{1}, x{2}, k0);
41
    %disp(sprintf('Step 1C yields: %s', x{1}));
42
    x = step2(x{1}, x{2}, k0);
43
    %disp(sprintf('Step 2 yields: %s', x{1}));
44
    x = step3(x{1}, x{2}, k0);
45
    %disp(sprintf('Step 3 yields: %s', x{1}));
46
    x = step4(x{1}, x{2}, k0);
47
    %disp(sprintf('Step 4 yields: %s', x{1}));
48
    x = step5(x{1}, x{2}, k0);
49
    %disp(sprintf('Step 5 yields: %s', x{1}));
50
    stem = x{1};
51
end
52

53
% cons(j) is TRUE <=> b[j] is a consonant.
54
function c = cons(i, b, k0)
55
c = true;
56
switch(b(i))
57
    case {'a', 'e', 'i', 'o', 'u'}
58
        c = false;
59
    case 'y'
60
        if i == k0
61
            c = true;
62
        else
63
            c = ~cons(i - 1, b, k0);
64
        end
65
end
66

67
% mseq() measures the number of consonant sequences between k0 and j.  If
68
% c is a consonant sequence and v a vowel sequence, and <..> indicates
69
% arbitrary presence,
70

71
%      <c><v>       gives 0
72
%      <c>vc<v>     gives 1
73
%      <c>vcvc<v>   gives 2
74
%      <c>vcvcvc<v> gives 3
75
%      ....
76
function n = measure(b, k0)
77
global j;
78
n = 0;
79
i = k0;
80
while true
81
    if i > j
82
        return
83
    end
84
    if ~cons(i, b, k0)
85
        break;
86
    end
87
    i = i + 1;
88
end
89
i = i + 1;
90
while true
91
    while true
92
        if i > j
93
            return
94
        end
95
        if cons(i, b, k0)
96
            break;
97
        end
98
        i = i + 1;
99
    end
100
    i = i + 1;
101
    n = n + 1;
102
    while true
103
        if i > j
104
            return
105
        end
106
        if ~cons(i, b, k0)
107
            break;
108
        end
109
        i = i + 1;
110
    end
111
    i = i + 1;
112
end
113

114

115
% vowelinstem() is TRUE <=> k0,...j contains a vowel
116
function vis = vowelinstem(b, k0)
117
global j;
118
for i = k0:j,
119
    if ~cons(i, b, k0)
120
        vis = true;
121
        return
122
    end
123
end
124
vis = false;
125

126
%doublec(i) is TRUE <=> i,(i-1) contain a double consonant.
127
function dc = doublec(i, b, k0)
128
if i < k0+1
129
    dc = false;
130
    return
131
end
132
if b(i) ~= b(i-1)
133
    dc = false;
134
    return
135
end
136
dc = cons(i, b, k0);
137

138

139
% cvc(j) is TRUE <=> j-2,j-1,j has the form consonant - vowel - consonant
140
% and also if the second c is not w,x or y. this is used when trying to
141
% restore an e at the end of a short word. e.g.
142
%
143
%      cav(e), lov(e), hop(e), crim(e), but
144
%      snow, box, tray.
145

146
function c1 = cvc(i, b, k0)
147
if ((i < (k0+2)) || ~cons(i, b, k0) || cons(i-1, b, k0) || ~cons(i-2, b, k0))
148
    c1 = false;
149
else
150
    if (b(i) == 'w' || b(i) == 'x' || b(i) == 'y')
151
        c1 = false;
152
        return
153
    end
154
    c1 = true;
155
end
156

157
% ends(s) is TRUE <=> k0,...k ends with the string s.
158
function s = ends(str, b, k)
159
global j;
160
if (str(length(str)) ~= b(k))
161
    s = false;
162
    return
163
end % tiny speed-up
164
if (length(str) > k)
165
    s = false;
166
    return
167
end
168
if strcmp(b(k-length(str)+1:k), str)
169
    s = true;
170
    j = k - length(str);
171
    return
172
else
173
    s = false;
174
end
175

176
% setto(s) sets (j+1),...k to the characters in the string s, readjusting
177
% k accordingly.
178

179
function so = setto(s, b, k)
180
global j;
181
for i = j+1:(j+length(s))
182
    b(i) = s(i-j);
183
end
184
if k > j+length(s)
185
    b((j+length(s)+1):k) = '';
186
end
187
k = length(b);
188
so = {b, k};
189

190
% rs(s) is used further down.
191
% [Note: possible null/value for r if rs is called]
192
function r = rs(str, b, k, k0)
193
r = {b, k};
194
if measure(b, k0) > 0
195
    r = setto(str, b, k);
196
end
197

198
% step1ab() gets rid of plurals and -ed or -ing. e.g.
199

200
%       caresses  ->  caress
201
%       ponies    ->  poni
202
%       ties      ->  ti
203
%       caress    ->  caress
204
%       cats      ->  cat
205

206
%       feed      ->  feed
207
%       agreed    ->  agree
208
%       disabled  ->  disable
209

210
%       matting   ->  mat
211
%       mating    ->  mate
212
%       meeting   ->  meet
213
%       milling   ->  mill
214
%       messing   ->  mess
215

216
%       meetings  ->  meet
217

218
function s1ab = step1ab(b, k, k0)
219
global j;
220
if b(k) == 's'
221
    if ends('sses', b, k)
222
        k = k-2;
223
    elseif ends('ies', b, k)
224
        retVal = setto('i', b, k);
225
        b = retVal{1};
226
        k = retVal{2};
227
    elseif (b(k-1) ~= 's')
228
        k = k-1;
229
    end
230
end
231
if ends('eed', b, k)
232
    if measure(b, k0) > 0;
233
        k = k-1;
234
    end
235
elseif (ends('ed', b, k) || ends('ing', b, k)) && vowelinstem(b, k0)
236
    k = j;
237
    retVal = {b, k};
238
    if ends('at', b, k)
239
        retVal = setto('ate', b(k0:k), k);
240
    elseif ends('bl', b, k)
241
        retVal = setto('ble', b(k0:k), k);
242
    elseif ends('iz', b, k)
243
        retVal = setto('ize', b(k0:k), k);
244
    elseif doublec(k, b, k0)
245
        retVal = {b, k-1};
246
        if b(retVal{2}) == 'l' || b(retVal{2}) == 's' || ...
247
                b(retVal{2}) == 'z'
248
            retVal = {retVal{1}, retVal{2}+1};
249
        end
250
    elseif measure(b, k0) == 1 && cvc(k, b, k0)
251
        retVal = setto('e', b(k0:k), k);
252
    end
253
    k = retVal{2};
254
    b = retVal{1}(k0:k);
255
end
256
j = k;
257
s1ab = {b(k0:k), k};
258

259
%  step1c() turns terminal y to i when there is another vowel in the stem.
260
function s1c = step1c(b, k, k0)
261
global j;
262
if ends('y', b, k) && vowelinstem(b, k0)
263
    b(k) = 'i';
264
end
265
j = k;
266
s1c = {b, k};
267

268
% step2() maps double suffices to single ones. so -ization ( = -ize plus
269
% -ation) maps to -ize etc. note that the string before the suffix must give
270
% m() > 0.
271
function s2 = step2(b, k, k0)
272
global j;
273
s2 = {b, k};
274
switch b(k-1)
275
    case {'a'}
276
        if ends('ational', b, k) s2 = rs('ate', b, k, k0);
277
        elseif ends('tional', b, k) s2 = rs('tion', b, k, k0); end;
278
    case {'c'}
279
        if ends('enci', b, k) s2 = rs('ence', b, k, k0);
280
        elseif ends('anci', b, k) s2 = rs('ance', b, k, k0); end;
281
    case {'e'}
282
        if ends('izer', b, k) s2 = rs('ize', b, k, k0); end;
283
    case {'l'}
284
        if ends('bli', b, k) s2 = rs('ble', b, k, k0);
285
        elseif ends('alli', b, k) s2 = rs('al', b, k, k0);
286
        elseif ends('entli', b, k) s2 = rs('ent', b, k, k0);
287
        elseif ends('eli', b, k) s2 = rs('e', b, k, k0);
288
        elseif ends('ousli', b, k) s2 = rs('ous', b, k, k0); end;
289
    case {'o'}
290
        if ends('ization', b, k) s2 = rs('ize', b, k, k0);
291
        elseif ends('ation', b, k) s2 = rs('ate', b, k, k0);
292
        elseif ends('ator', b, k) s2 = rs('ate', b, k, k0); end;
293
    case {'s'}
294
        if ends('alism', b, k) s2 = rs('al', b, k, k0);
295
        elseif ends('iveness', b, k) s2 = rs('ive', b, k, k0);
296
        elseif ends('fulness', b, k) s2 = rs('ful', b, k, k0);
297
        elseif ends('ousness', b, k) s2 = rs('ous', b, k, k0); end;
298
    case {'t'}
299
        if ends('aliti', b, k) s2 = rs('al', b, k, k0);
300
        elseif ends('iviti', b, k) s2 = rs('ive', b, k, k0);
301
        elseif ends('biliti', b, k) s2 = rs('ble', b, k, k0); end;
302
    case {'g'}
303
        if ends('logi', b, k) s2 = rs('log', b, k, k0); end;
304
end
305
j = s2{2};
306

307
% step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
308
function s3 = step3(b, k, k0)
309
global j;
310
s3 = {b, k};
311
switch b(k)
312
    case {'e'}
313
        if ends('icate', b, k) s3 = rs('ic', b, k, k0);
314
        elseif ends('ative', b, k) s3 = rs('', b, k, k0);
315
        elseif ends('alize', b, k) s3 = rs('al', b, k, k0); end;
316
    case {'i'}
317
        if ends('iciti', b, k) s3 = rs('ic', b, k, k0); end;
318
    case {'l'}
319
        if ends('ical', b, k) s3 = rs('ic', b, k, k0);
320
        elseif ends('ful', b, k) s3 = rs('', b, k, k0); end;
321
    case {'s'}
322
        if ends('ness', b, k) s3 = rs('', b, k, k0); end;
323
end
324
j = s3{2};
325

326
% step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
327
function s4 = step4(b, k, k0)
328
global j;
329
switch b(k-1)
330
    case {'a'}
331
        if ends('al', b, k) end;
332
    case {'c'}
333
        if ends('ance', b, k)
334
        elseif ends('ence', b, k) end;
335
    case {'e'}
336
        if ends('er', b, k) end;
337
    case {'i'}
338
        if ends('ic', b, k) end;
339
    case {'l'}
340
        if ends('able', b, k)
341
        elseif ends('ible', b, k) end;
342
    case {'n'}
343
        if ends('ant', b, k)
344
        elseif ends('ement', b, k)
345
        elseif ends('ment', b, k)
346
        elseif ends('ent', b, k) end;
347
    case {'o'}
348
        if ends('ion', b, k)
349
            if j == 0
350
            elseif ~(strcmp(b(j),'s') || strcmp(b(j),'t'))
351
                j = k;
352
            end
353
        elseif ends('ou', b, k) end;
354
    case {'s'}
355
        if ends('ism', b, k) end;
356
    case {'t'}
357
        if ends('ate', b, k)
358
        elseif ends('iti', b, k) end;
359
    case {'u'}
360
        if ends('ous', b, k) end;
361
    case {'v'}
362
        if ends('ive', b, k) end;
363
    case {'z'}
364
        if ends('ize', b, k) end;
365
end
366
if measure(b, k0) > 1
367
    s4 = {b(k0:j), j};
368
else
369
    s4 = {b(k0:k), k};
370
end
371

372
% step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
373
function s5 = step5(b, k, k0)
374
global j;
375
j = k;
376
if b(k) == 'e'
377
    a = measure(b, k0);
378
    if (a > 1) || ((a == 1) && ~cvc(k-1, b, k0))
379
        k = k-1;
380
    end
381
end
382
if (b(k) == 'l') && doublec(k, b, k0) && (measure(b, k0) > 1)
383
    k = k-1;
384
end
385
s5 = {b(k0:k), k};
386

387
Product

Resources

Company