Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hackassin
GitHub Repository: hackassin/Coursera-Machine-Learning
Path: blob/master/Week 7/Programming Assignment - 6/ex6/porterStemmer.m
863 views
1
function stem = porterStemmer(inString)
2
% Applies the Porter Stemming algorithm as presented in the following
3
% paper:
4
% Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
5
% no. 3, pp 130-137
6
7
% Original code modeled after the C version provided at:
8
% http://www.tartarus.org/~martin/PorterStemmer/c.txt
9
10
% The main part of the stemming algorithm starts here. b is an array of
11
% characters, holding the word to be stemmed. The letters are in b[k0],
12
% b[k0+1] ending at b[k]. In fact k0 = 1 in this demo program (since
13
% matlab begins indexing by 1 instead of 0). k is readjusted downwards as
14
% the stemming progresses. Zero termination is not in fact used in the
15
% algorithm.
16
17
% To call this function, use the string to be stemmed as the input
18
% argument. This function returns the stemmed word as a string.
19
20
% Lower-case string
21
inString = lower(inString);
22
23
global j;
24
b = inString;
25
k = length(b);
26
k0 = 1;
27
j = k;
28
29
30
31
% With this if statement, strings of length 1 or 2 don't go through the
32
% stemming process. Remove this conditional to match the published
33
% algorithm.
34
stem = b;
35
if k > 2
36
% Output displays per step are commented out.
37
%disp(sprintf('Word to stem: %s', b));
38
x = step1ab(b, k, k0);
39
%disp(sprintf('Steps 1A and B yield: %s', x{1}));
40
x = step1c(x{1}, x{2}, k0);
41
%disp(sprintf('Step 1C yields: %s', x{1}));
42
x = step2(x{1}, x{2}, k0);
43
%disp(sprintf('Step 2 yields: %s', x{1}));
44
x = step3(x{1}, x{2}, k0);
45
%disp(sprintf('Step 3 yields: %s', x{1}));
46
x = step4(x{1}, x{2}, k0);
47
%disp(sprintf('Step 4 yields: %s', x{1}));
48
x = step5(x{1}, x{2}, k0);
49
%disp(sprintf('Step 5 yields: %s', x{1}));
50
stem = x{1};
51
end
52
53
% cons(j) is TRUE <=> b[j] is a consonant.
54
function c = cons(i, b, k0)
55
c = true;
56
switch(b(i))
57
case {'a', 'e', 'i', 'o', 'u'}
58
c = false;
59
case 'y'
60
if i == k0
61
c = true;
62
else
63
c = ~cons(i - 1, b, k0);
64
end
65
end
66
67
% mseq() measures the number of consonant sequences between k0 and j. If
68
% c is a consonant sequence and v a vowel sequence, and <..> indicates
69
% arbitrary presence,
70
71
% <c><v> gives 0
72
% <c>vc<v> gives 1
73
% <c>vcvc<v> gives 2
74
% <c>vcvcvc<v> gives 3
75
% ....
76
function n = measure(b, k0)
77
global j;
78
n = 0;
79
i = k0;
80
while true
81
if i > j
82
return
83
end
84
if ~cons(i, b, k0)
85
break;
86
end
87
i = i + 1;
88
end
89
i = i + 1;
90
while true
91
while true
92
if i > j
93
return
94
end
95
if cons(i, b, k0)
96
break;
97
end
98
i = i + 1;
99
end
100
i = i + 1;
101
n = n + 1;
102
while true
103
if i > j
104
return
105
end
106
if ~cons(i, b, k0)
107
break;
108
end
109
i = i + 1;
110
end
111
i = i + 1;
112
end
113
114
115
% vowelinstem() is TRUE <=> k0,...j contains a vowel
116
function vis = vowelinstem(b, k0)
117
global j;
118
for i = k0:j,
119
if ~cons(i, b, k0)
120
vis = true;
121
return
122
end
123
end
124
vis = false;
125
126
%doublec(i) is TRUE <=> i,(i-1) contain a double consonant.
127
function dc = doublec(i, b, k0)
128
if i < k0+1
129
dc = false;
130
return
131
end
132
if b(i) ~= b(i-1)
133
dc = false;
134
return
135
end
136
dc = cons(i, b, k0);
137
138
139
% cvc(j) is TRUE <=> j-2,j-1,j has the form consonant - vowel - consonant
140
% and also if the second c is not w,x or y. this is used when trying to
141
% restore an e at the end of a short word. e.g.
142
%
143
% cav(e), lov(e), hop(e), crim(e), but
144
% snow, box, tray.
145
146
function c1 = cvc(i, b, k0)
147
if ((i < (k0+2)) || ~cons(i, b, k0) || cons(i-1, b, k0) || ~cons(i-2, b, k0))
148
c1 = false;
149
else
150
if (b(i) == 'w' || b(i) == 'x' || b(i) == 'y')
151
c1 = false;
152
return
153
end
154
c1 = true;
155
end
156
157
% ends(s) is TRUE <=> k0,...k ends with the string s.
158
function s = ends(str, b, k)
159
global j;
160
if (str(length(str)) ~= b(k))
161
s = false;
162
return
163
end % tiny speed-up
164
if (length(str) > k)
165
s = false;
166
return
167
end
168
if strcmp(b(k-length(str)+1:k), str)
169
s = true;
170
j = k - length(str);
171
return
172
else
173
s = false;
174
end
175
176
% setto(s) sets (j+1),...k to the characters in the string s, readjusting
177
% k accordingly.
178
179
function so = setto(s, b, k)
180
global j;
181
for i = j+1:(j+length(s))
182
b(i) = s(i-j);
183
end
184
if k > j+length(s)
185
b((j+length(s)+1):k) = '';
186
end
187
k = length(b);
188
so = {b, k};
189
190
% rs(s) is used further down.
191
% [Note: possible null/value for r if rs is called]
192
function r = rs(str, b, k, k0)
193
r = {b, k};
194
if measure(b, k0) > 0
195
r = setto(str, b, k);
196
end
197
198
% step1ab() gets rid of plurals and -ed or -ing. e.g.
199
200
% caresses -> caress
201
% ponies -> poni
202
% ties -> ti
203
% caress -> caress
204
% cats -> cat
205
206
% feed -> feed
207
% agreed -> agree
208
% disabled -> disable
209
210
% matting -> mat
211
% mating -> mate
212
% meeting -> meet
213
% milling -> mill
214
% messing -> mess
215
216
% meetings -> meet
217
218
function s1ab = step1ab(b, k, k0)
219
global j;
220
if b(k) == 's'
221
if ends('sses', b, k)
222
k = k-2;
223
elseif ends('ies', b, k)
224
retVal = setto('i', b, k);
225
b = retVal{1};
226
k = retVal{2};
227
elseif (b(k-1) ~= 's')
228
k = k-1;
229
end
230
end
231
if ends('eed', b, k)
232
if measure(b, k0) > 0;
233
k = k-1;
234
end
235
elseif (ends('ed', b, k) || ends('ing', b, k)) && vowelinstem(b, k0)
236
k = j;
237
retVal = {b, k};
238
if ends('at', b, k)
239
retVal = setto('ate', b(k0:k), k);
240
elseif ends('bl', b, k)
241
retVal = setto('ble', b(k0:k), k);
242
elseif ends('iz', b, k)
243
retVal = setto('ize', b(k0:k), k);
244
elseif doublec(k, b, k0)
245
retVal = {b, k-1};
246
if b(retVal{2}) == 'l' || b(retVal{2}) == 's' || ...
247
b(retVal{2}) == 'z'
248
retVal = {retVal{1}, retVal{2}+1};
249
end
250
elseif measure(b, k0) == 1 && cvc(k, b, k0)
251
retVal = setto('e', b(k0:k), k);
252
end
253
k = retVal{2};
254
b = retVal{1}(k0:k);
255
end
256
j = k;
257
s1ab = {b(k0:k), k};
258
259
% step1c() turns terminal y to i when there is another vowel in the stem.
260
function s1c = step1c(b, k, k0)
261
global j;
262
if ends('y', b, k) && vowelinstem(b, k0)
263
b(k) = 'i';
264
end
265
j = k;
266
s1c = {b, k};
267
268
% step2() maps double suffices to single ones. so -ization ( = -ize plus
269
% -ation) maps to -ize etc. note that the string before the suffix must give
270
% m() > 0.
271
function s2 = step2(b, k, k0)
272
global j;
273
s2 = {b, k};
274
switch b(k-1)
275
case {'a'}
276
if ends('ational', b, k) s2 = rs('ate', b, k, k0);
277
elseif ends('tional', b, k) s2 = rs('tion', b, k, k0); end;
278
case {'c'}
279
if ends('enci', b, k) s2 = rs('ence', b, k, k0);
280
elseif ends('anci', b, k) s2 = rs('ance', b, k, k0); end;
281
case {'e'}
282
if ends('izer', b, k) s2 = rs('ize', b, k, k0); end;
283
case {'l'}
284
if ends('bli', b, k) s2 = rs('ble', b, k, k0);
285
elseif ends('alli', b, k) s2 = rs('al', b, k, k0);
286
elseif ends('entli', b, k) s2 = rs('ent', b, k, k0);
287
elseif ends('eli', b, k) s2 = rs('e', b, k, k0);
288
elseif ends('ousli', b, k) s2 = rs('ous', b, k, k0); end;
289
case {'o'}
290
if ends('ization', b, k) s2 = rs('ize', b, k, k0);
291
elseif ends('ation', b, k) s2 = rs('ate', b, k, k0);
292
elseif ends('ator', b, k) s2 = rs('ate', b, k, k0); end;
293
case {'s'}
294
if ends('alism', b, k) s2 = rs('al', b, k, k0);
295
elseif ends('iveness', b, k) s2 = rs('ive', b, k, k0);
296
elseif ends('fulness', b, k) s2 = rs('ful', b, k, k0);
297
elseif ends('ousness', b, k) s2 = rs('ous', b, k, k0); end;
298
case {'t'}
299
if ends('aliti', b, k) s2 = rs('al', b, k, k0);
300
elseif ends('iviti', b, k) s2 = rs('ive', b, k, k0);
301
elseif ends('biliti', b, k) s2 = rs('ble', b, k, k0); end;
302
case {'g'}
303
if ends('logi', b, k) s2 = rs('log', b, k, k0); end;
304
end
305
j = s2{2};
306
307
% step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
308
function s3 = step3(b, k, k0)
309
global j;
310
s3 = {b, k};
311
switch b(k)
312
case {'e'}
313
if ends('icate', b, k) s3 = rs('ic', b, k, k0);
314
elseif ends('ative', b, k) s3 = rs('', b, k, k0);
315
elseif ends('alize', b, k) s3 = rs('al', b, k, k0); end;
316
case {'i'}
317
if ends('iciti', b, k) s3 = rs('ic', b, k, k0); end;
318
case {'l'}
319
if ends('ical', b, k) s3 = rs('ic', b, k, k0);
320
elseif ends('ful', b, k) s3 = rs('', b, k, k0); end;
321
case {'s'}
322
if ends('ness', b, k) s3 = rs('', b, k, k0); end;
323
end
324
j = s3{2};
325
326
% step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
327
function s4 = step4(b, k, k0)
328
global j;
329
switch b(k-1)
330
case {'a'}
331
if ends('al', b, k) end;
332
case {'c'}
333
if ends('ance', b, k)
334
elseif ends('ence', b, k) end;
335
case {'e'}
336
if ends('er', b, k) end;
337
case {'i'}
338
if ends('ic', b, k) end;
339
case {'l'}
340
if ends('able', b, k)
341
elseif ends('ible', b, k) end;
342
case {'n'}
343
if ends('ant', b, k)
344
elseif ends('ement', b, k)
345
elseif ends('ment', b, k)
346
elseif ends('ent', b, k) end;
347
case {'o'}
348
if ends('ion', b, k)
349
if j == 0
350
elseif ~(strcmp(b(j),'s') || strcmp(b(j),'t'))
351
j = k;
352
end
353
elseif ends('ou', b, k) end;
354
case {'s'}
355
if ends('ism', b, k) end;
356
case {'t'}
357
if ends('ate', b, k)
358
elseif ends('iti', b, k) end;
359
case {'u'}
360
if ends('ous', b, k) end;
361
case {'v'}
362
if ends('ive', b, k) end;
363
case {'z'}
364
if ends('ize', b, k) end;
365
end
366
if measure(b, k0) > 1
367
s4 = {b(k0:j), j};
368
else
369
s4 = {b(k0:k), k};
370
end
371
372
% step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
373
function s5 = step5(b, k, k0)
374
global j;
375
j = k;
376
if b(k) == 'e'
377
a = measure(b, k0);
378
if (a > 1) || ((a == 1) && ~cvc(k-1, b, k0))
379
k = k-1;
380
end
381
end
382
if (b(k) == 'l') && doublec(k, b, k0) && (measure(b, k0) > 1)
383
k = k-1;
384
end
385
s5 = {b(k0:k), k};
386
387