Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/phabricator
Path: blob/master/externals/porter-stemmer/src/Porter.php
12241 views
1
<?php

# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:

/**
 * Copyright (c) 2005-2016 Richard Heyes (http://www.phpguru.org/)
 *
 * Portions Copyright 2003-2007 Jon Abernathy <[email protected]>
 *
 * Originally available under the GPL 2 or greater. Relicensed with permission
 * of original authors under the MIT License in 2016.
 *
 * All rights reserved.
 *
 * @package PorterStemmer
 * @author Richard Heyes
 * @author Jon Abernathy <[email protected]>
 * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
 * @license http://www.opensource.org/licenses/mit-license.html MIT License
 */

/**
 * PHP 5 implementation of the Porter stemming algorithm. Certain elements
 * were borrowed from the (broken) implementation by Jon Abernathy.
 *
 * See http://tartarus.org/~martin/PorterStemmer/ for a description of the
 * algorithm.
 *
 * The character classes used below contain lowercase ASCII letters only and
 * the regexes are case-sensitive, so words should be lowercased before they
 * are passed in.
 *
 * Usage:
 *
 *  $stem = Porter::Stem($word);
 *
 * @package PorterStemmer
 * @author Richard Heyes
 * @author Jon Abernathy <[email protected]>
 * @copyright 2005-2016 Richard Heyes (http://www.phpguru.org/)
 * @license http://www.opensource.org/licenses/mit-license.html MIT License
 */
class Porter
{
    /**
     * Regex fragment matching one consonant: any letter other than
     * a, e, i, o, u, plus "y" when it follows one of a/e/i/o/u or starts
     * the string.
     *
     * @var string
     */
    private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /**
     * Regex fragment matching one vowel: a, e, i, o, u, plus "y" when it is
     * preceded by a character that is not one of a/e/i/o/u.
     *
     * The lookbehind is positive on purpose: a "y" at the very start of the
     * string has no preceding character and must NOT count as a vowel (the
     * consonant fragment claims it through "^y"). Every helper in this class
     * applies the fragment to the word itself or to a prefix of it, so
     * "start of string" always means "start of word".
     *
     * @var string
     */
    private static $regex_vowel = '(?:[aeiou]|(?<=[^aeiou])y)';

    /**
     * Stems a word by running it through steps 1 to 5 in order.
     *
     * Words of two bytes or fewer are returned unchanged.
     *
     * @param string $word Word to stem
     *
     * @return string Stemmed word
     */
    public static function Stem($word)
    {
        if (strlen($word) <= 2) {
            return $word;
        }

        $word = self::step1ab($word);
        $word = self::step1c($word);
        $word = self::step2($word);
        $word = self::step3($word);
        $word = self::step4($word);
        $word = self::step5($word);

        return $word;
    }

    /**
     * Step 1a (plural endings) and step 1b ("eed", "ed", "ing").
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 1a/1b
     */
    private static function step1ab($word)
    {
        // Part a: the first ending that matches wins, longest first.
        if (substr($word, -1) === 's') {
            self::replace($word, 'sses', 'ss')
                || self::replace($word, 'ies', 'i')
                || self::replace($word, 'ss', 'ss')
                || self::replace($word, 's', '');
        }

        // Part b, first rule: a word ending in "eed" is handled by this rule
        // only. replace() reports true for the suffix match even when the
        // m() condition prevented the rewrite, so "ed" is never tried on it.
        if (substr($word, -2, 1) === 'e' && self::replace($word, 'eed', 'ee', 0)) {
            return $word;
        }

        // Part b, "ing" / "ed": only removed when the remaining stem
        // contains a vowel.
        $v = self::$regex_vowel;
        $removed =
            (preg_match('#' . $v . '#', substr($word, 0, -3)) && self::replace($word, 'ing', ''))
            || (preg_match('#' . $v . '#', substr($word, 0, -2)) && self::replace($word, 'ed', ''));

        if (!$removed) {
            return $word;
        }

        // Clean-up after a removal: restore a final "e" for at/bl/iz ...
        if (self::replace($word, 'at', 'ate')
            || self::replace($word, 'bl', 'ble')
            || self::replace($word, 'iz', 'ize')) {
            return $word;
        }

        // ... otherwise undouble a final double consonant (except ll/ss/zz),
        // or append "e" to a short stem (m == 1 ending in cvc).
        $tail = substr($word, -2);
        if (self::doubleConsonant($word)
            && $tail !== 'll'
            && $tail !== 'ss'
            && $tail !== 'zz') {
            $word = substr($word, 0, -1);
        } elseif (self::m($word) === 1 && self::cvc($word)) {
            $word .= 'e';
        }

        return $word;
    }

    /**
     * Step 1c: turns a final "y" into "i" when the rest of the word
     * contains a vowel.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 1c
     */
    private static function step1c($word)
    {
        $v = self::$regex_vowel;

        if (substr($word, -1) === 'y') {
            $stem = substr($word, 0, -1);
            if (preg_match('#' . $v . '#', $stem)) {
                $word = $stem . 'i';
            }
        }

        return $word;
    }

    /**
     * Step 2: maps double suffixes to single ones when m(stem) > 0.
     *
     * The switch on the penultimate letter only narrows down which suffixes
     * can possibly match; within a case the first matching suffix wins.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 2
     */
    private static function step2($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ational', 'ate', 0)
                    || self::replace($word, 'tional', 'tion', 0);
                break;

            case 'c':
                self::replace($word, 'enci', 'ence', 0)
                    || self::replace($word, 'anci', 'ance', 0);
                break;

            case 'e':
                self::replace($word, 'izer', 'ize', 0);
                break;

            case 'g':
                self::replace($word, 'logi', 'log', 0);
                break;

            case 'l':
                self::replace($word, 'entli', 'ent', 0)
                    || self::replace($word, 'ousli', 'ous', 0)
                    || self::replace($word, 'alli', 'al', 0)
                    || self::replace($word, 'bli', 'ble', 0)
                    || self::replace($word, 'eli', 'e', 0);
                break;

            case 'o':
                self::replace($word, 'ization', 'ize', 0)
                    || self::replace($word, 'ation', 'ate', 0)
                    || self::replace($word, 'ator', 'ate', 0);
                break;

            case 's':
                self::replace($word, 'iveness', 'ive', 0)
                    || self::replace($word, 'fulness', 'ful', 0)
                    || self::replace($word, 'ousness', 'ous', 0)
                    || self::replace($word, 'alism', 'al', 0);
                break;

            case 't':
                self::replace($word, 'biliti', 'ble', 0)
                    || self::replace($word, 'aliti', 'al', 0)
                    || self::replace($word, 'iviti', 'ive', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 3: handles -ic-, -ful, -ness etc. when m(stem) > 0.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 3
     */
    private static function step3($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'ical', 'ic', 0);
                break;

            case 's':
                self::replace($word, 'ness', '', 0);
                break;

            case 't':
                self::replace($word, 'icate', 'ic', 0)
                    || self::replace($word, 'iciti', 'ic', 0);
                break;

            case 'u':
                self::replace($word, 'ful', '', 0);
                break;

            case 'v':
                self::replace($word, 'ative', '', 0);
                break;

            case 'z':
                self::replace($word, 'alize', 'al', 0);
                break;
        }

        return $word;
    }

    /**
     * Step 4: strips -ant, -ence etc. when m(stem) > 1.
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 4
     */
    private static function step4($word)
    {
        switch (substr($word, -2, 1)) {
            case 'a':
                self::replace($word, 'al', '', 1);
                break;

            case 'c':
                self::replace($word, 'ance', '', 1)
                    || self::replace($word, 'ence', '', 1);
                break;

            case 'e':
                self::replace($word, 'er', '', 1);
                break;

            case 'i':
                self::replace($word, 'ic', '', 1);
                break;

            case 'l':
                self::replace($word, 'able', '', 1)
                    || self::replace($word, 'ible', '', 1);
                break;

            case 'n':
                self::replace($word, 'ant', '', 1)
                    || self::replace($word, 'ement', '', 1)
                    || self::replace($word, 'ment', '', 1)
                    || self::replace($word, 'ent', '', 1);
                break;

            case 'o':
                // "ion" is only removed when it follows "s" or "t".
                $tail = substr($word, -4);
                if ($tail === 'tion' || $tail === 'sion') {
                    self::replace($word, 'ion', '', 1);
                } else {
                    self::replace($word, 'ou', '', 1);
                }
                break;

            case 's':
                self::replace($word, 'ism', '', 1);
                break;

            case 't':
                self::replace($word, 'ate', '', 1)
                    || self::replace($word, 'iti', '', 1);
                break;

            case 'u':
                self::replace($word, 'ous', '', 1);
                break;

            case 'v':
                self::replace($word, 'ive', '', 1);
                break;

            case 'z':
                self::replace($word, 'ize', '', 1);
                break;
        }

        return $word;
    }

    /**
     * Step 5: removes a final "e" and undoubles a final "ll".
     *
     * @param string $word Word to stem
     *
     * @return string Word after step 5
     */
    private static function step5($word)
    {
        // Part a: drop the final "e" if m(stem) > 1, or if m(stem) == 1 and
        // the stem does not end in cvc.
        if (substr($word, -1) === 'e') {
            $stem = substr($word, 0, -1);
            $measure = self::m($stem);

            if ($measure > 1 || ($measure === 1 && !self::cvc($stem))) {
                $word = $stem;
            }
        }

        // Part b: "ll" -> "l" when m > 1.
        if (substr($word, -1) === 'l'
            && self::m($word) > 1
            && self::doubleConsonant($word)) {
            $word = substr($word, 0, -1);
        }

        return $word;
    }

    /**
     * Replaces the ending $check of $str with $repl.
     *
     * If $m is given, the replacement only happens when m() of the part
     * before the ending is strictly greater than $m.
     *
     * @param string   $str   String to check; modified in place on replacement
     * @param string   $check Ending to check for
     * @param string   $repl  Replacement string
     * @param int|null $m     Optional m() threshold the stem has to exceed
     *
     * @return bool Whether the $check string was at the end of the $str
     *              string. True does not necessarily mean that it was
     *              replaced.
     */
    private static function replace(&$str, $check, $repl, $m = null)
    {
        $len = strlen($check);

        if (substr($str, -$len) !== $check) {
            return false;
        }

        $stem = substr($str, 0, -$len);
        if ($m === null || self::m($stem) > $m) {
            $str = $stem . $repl;
        }

        return true;
    }

    /**
     * Returns the "measure" m of $str: the number of vowel-sequence /
     * consonant-sequence pairs it contains. If c is a consonant sequence,
     * v a vowel sequence, and <..> indicates arbitrary presence,
     *
     *  <c><v>       gives 0
     *  <c>vc<v>     gives 1
     *  <c>vcvc<v>   gives 2
     *  <c>vcvcvc<v> gives 3
     *
     * The count is taken on the unmodified string so the lookbehinds and the
     * "^y" alternative in the regex fragments see the real context; leading
     * consonants and trailing vowels simply never form part of a "vc" match.
     *
     * @param string $str The string to return the m count for
     *
     * @return int The m count
     */
    private static function m($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        preg_match_all('#' . $v . '+' . $c . '+#', $str, $matches);

        return count($matches[0]);
    }

    /**
     * Returns true/false as to whether the given string ends in two
     * identical consonants.
     *
     * @param string $str String to check
     *
     * @return bool Result
     */
    private static function doubleConsonant($str)
    {
        $c = self::$regex_consonant;

        if (!preg_match('#' . $c . '{2}$#', $str, $matches)) {
            return false;
        }

        return $matches[0][0] === $matches[0][1];
    }

    /**
     * Checks for an ending consonant-vowel-consonant sequence where the
     * final consonant is not w, x or y.
     *
     * @param string $str String to check
     *
     * @return bool Result
     */
    private static function cvc($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        if (!preg_match('#(' . $c . $v . $c . ')$#', $str, $matches)) {
            return false;
        }

        $triple = $matches[1];

        return strlen($triple) === 3
            && $triple[2] !== 'w'
            && $triple[2] !== 'x'
            && $triple[2] !== 'y';
    }
}
427
428