CWIS Developer Documentation
PorterStemmer.php
Go to the documentation of this file.
1 <?php
26  {
/**
 * Regex fragment matching a single consonant: any letter of
 * "bcdfghjklmnpqrstvwxz", or a "y" that follows one of "aeiou", or a "y"
 * at the very start of the string.
 */
31  private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
32 
33 
/**
 * Regex fragment matching a single vowel: one of "aeiou", or a "y" that is
 * not preceded by one of "aeiou".
 */
38  private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
39 
40 
/**
 * Stems computed so far, indexed by word. Only read and written by Stem()
 * when it is called with $cache set to true.
 */
46  private static $cache = array();
47 
48 
/**
 * Stems a word with the Porter stemming algorithm.
 *
 * Words of two bytes or fewer are returned untouched. For anything longer a
 * trailing "'ve", "n't" or "'d" is removed and the remainder is run through
 * steps 1ab, 1c, 2, 3, 4 and 5, in that order.
 *
 * @param  string $word  Word to stem.
 * @param  bool   $cache When true, previously computed stems are served from
 *                       (and new ones stored in) the class-level cache.
 * @return string        The stemmed word.
 */
public static function Stem($word, $cache = false)
{
    if (strlen($word) <= 2) {
        return $word;
    }

    // Keep the word exactly as supplied: it is the cache key for both the
    // lookup and the store. Keying the store on the contraction-stripped
    // word would mean that e.g. "they've" is never found again.
    $key = $word;

    // isset() rather than !empty(), so a cached stem that happens to be
    // falsy still counts as a hit.
    if ($cache AND isset(self::$cache[$key])) {
        return self::$cache[$key];
    }

    // Drop common English contractions before stemming.
    $word = preg_replace("/('ve|n't|'d)$/", '', $word);

    $stem = self::step1ab($word);
    $stem = self::step1c($stem);
    $stem = self::step2($stem);
    $stem = self::step3($stem);
    $stem = self::step4($stem);
    $stem = self::step5($stem);

    if ($cache) {
        self::$cache[$key] = $stem;
    }

    return $stem;
}
86 
87 
/**
 * Steps 1a and 1b: handles plural "s" endings and the "eed" / "ed" / "ing"
 * endings, including the clean-up applied after "ed" or "ing" is removed.
 *
 * @param  string $word Word to process.
 * @return string       Word after steps 1a and 1b.
 */
private static function step1ab($word)
{
    // Part a: the first matching plural rule wins.
    if (substr($word, -1) === 's') {
        $plurals = array('sses' => 'ss', 'ies' => 'i', 'ss' => 'ss', 's' => '');

        foreach ($plurals as $suffix => $replacement) {
            if (self::replace($word, $suffix, $replacement)) {
                break;
            }
        }
    }

    // Part b, first rule: a word ending in "eed" is fully handled by
    // replace() (whether or not it was rewritten) and skips the rest.
    if (substr($word, -2, 1) === 'e' && self::replace($word, 'eed', 'ee', 0)) {
        return $word;
    }

    // "ing" / "ed" are only removed when the part before them has a vowel.
    $vowels = '#' . self::$regex_vowel . '+#';
    $stripped = (preg_match($vowels, substr($word, 0, -3)) && self::replace($word, 'ing', ''))
        || (preg_match($vowels, substr($word, 0, -2)) && self::replace($word, 'ed', ''));

    if (!$stripped) {
        return $word;
    }

    // After stripping: restore a trailing "e" on at/bl/iz ...
    if (self::replace($word, 'at', 'ate')
        || self::replace($word, 'bl', 'ble')
        || self::replace($word, 'iz', 'ize')
    ) {
        return $word;
    }

    // ... or collapse a doubled consonant (except ll, ss, zz) ...
    $tail = substr($word, -2);
    if (self::doubleConsonant($word) && $tail != 'll' && $tail != 'ss' && $tail != 'zz') {
        return substr($word, 0, -1);
    }

    // ... or append "e" to a short consonant-vowel-consonant word.
    if (self::m($word) == 1 && self::cvc($word)) {
        $word .= 'e';
    }

    return $word;
}
132 
133 
/**
 * Step 1c: turns a final "y" into "i" when the rest of the word contains a
 * vowel.
 *
 * @param  string $word Word to process.
 * @return string       Word after step 1c.
 */
private static function step1c($word)
{
    if (substr($word, -1) != 'y') {
        return $word;
    }

    $beforeY = substr($word, 0, -1);

    if (preg_match('#' . self::$regex_vowel . '+#', $beforeY)) {
        self::replace($word, 'y', 'i');
    }

    return $word;
}
149 
150 
/**
 * Step 2: maps double suffixes onto single ones (e.g. "ational" to "ate").
 *
 * Rules are grouped by the word's penultimate letter. Within a group the
 * first rule whose suffix matches ends the search, even if replace() leaves
 * the word unchanged because the remaining stem's measure is not above 0.
 *
 * @param  string $word Word to process.
 * @return string       Word after step 2.
 */
private static function step2($word)
{
    $rules = array(
        'a' => array('ational' => 'ate', 'tional' => 'tion'),
        'c' => array('enci' => 'ence', 'anci' => 'ance'),
        'e' => array('izer' => 'ize'),
        'g' => array('logi' => 'log'),
        'l' => array(
            'entli' => 'ent',
            'ousli' => 'ous',
            'alli'  => 'al',
            'bli'   => 'ble',
            'eli'   => 'e',
        ),
        'o' => array('ization' => 'ize', 'ation' => 'ate', 'ator' => 'ate'),
        's' => array(
            'iveness' => 'ive',
            'fulness' => 'ful',
            'ousness' => 'ous',
            'alism'   => 'al',
        ),
        't' => array('biliti' => 'ble', 'aliti' => 'al', 'iviti' => 'ive'),
    );

    $penultimate = (string) substr($word, -2, 1);

    if (isset($rules[$penultimate])) {
        foreach ($rules[$penultimate] as $suffix => $replacement) {
            if (self::replace($word, $suffix, $replacement, 0)) {
                break;
            }
        }
    }

    return $word;
}
207 
208 
/**
 * Step 3: shortens or removes a further set of suffixes ("ical", "ness",
 * "icate", "ful", "ative", "alize", ...).
 *
 * Rules are grouped by the word's penultimate letter; the first rule whose
 * suffix matches ends the search for that word.
 *
 * @param  string $word Word to process.
 * @return string       Word after step 3.
 */
private static function step3($word)
{
    $rules = array(
        'a' => array('ical' => 'ic'),
        's' => array('alise' => 'al', 'ness' => ''),
        't' => array('icate' => 'ic', 'iciti' => 'ic'),
        'u' => array('ful' => ''),
        'v' => array('ative' => ''),
        'z' => array('alize' => 'al'),
    );

    $penultimate = (string) substr($word, -2, 1);

    if (isset($rules[$penultimate])) {
        foreach ($rules[$penultimate] as $suffix => $replacement) {
            if (self::replace($word, $suffix, $replacement, 0)) {
                break;
            }
        }
    }

    return $word;
}
246 
247 
/**
 * Step 4: strips a residual suffix ("al", "ance", "er", "ment", "ion", ...)
 * when the stem left behind has a measure greater than 1.
 *
 * Candidate suffixes are grouped by the word's penultimate letter; the first
 * suffix that matches ends the search for that word.
 *
 * @param  string $word Word to process.
 * @return string       Word after step 4.
 */
private static function step4($word)
{
    $penultimate = (string) substr($word, -2, 1);

    // "ion" is only a candidate after "t" or "s"; otherwise try "ou".
    if ($penultimate === 'o') {
        $ending = substr($word, -4);
        $suffix = ($ending == 'tion' || $ending == 'sion') ? 'ion' : 'ou';
        self::replace($word, $suffix, '', 1);

        return $word;
    }

    $suffixes = array(
        'a' => array('al'),
        'c' => array('ance', 'ence'),
        'e' => array('er'),
        'i' => array('ic'),
        'l' => array('able', 'ible'),
        'n' => array('ant', 'ement', 'ment', 'ent'),
        's' => array('ism'),
        't' => array('ate', 'iti'),
        'u' => array('ous'),
        'v' => array('ive'),
        'z' => array('ize'),
    );

    if (isset($suffixes[$penultimate])) {
        foreach ($suffixes[$penultimate] as $suffix) {
            if (self::replace($word, $suffix, '', 1)) {
                break;
            }
        }
    }

    return $word;
}
317 
318 
/**
 * Step 5: removes a final "e" and collapses a final "ll" where the measure
 * of the word allows it.
 *
 * @param  string $word Word to process.
 * @return string       Word after step 5.
 */
private static function step5($word)
{
    // Part a: drop a trailing "e" when the rest has measure > 1, or measure
    // exactly 1 without ending in a consonant-vowel-consonant pattern.
    if (substr($word, -1) == 'e') {
        $base    = substr($word, 0, -1);
        $measure = self::m($base);

        if ($measure > 1 || ($measure == 1 && !self::cvc($base))) {
            self::replace($word, 'e', '');
        }
    }

    // Part b: "ll" becomes "l" when the measure is above 1.
    $endsInDoubleL = self::m($word) > 1
        && self::doubleConsonant($word)
        && substr($word, -1) == 'l';

    if ($endsInDoubleL) {
        $word = substr($word, 0, -1);
    }

    return $word;
}
346 
347 
/**
 * Replaces a suffix of $str in place.
 *
 * The rewrite happens only when $str ends in $check and, if $m is given,
 * the measure of the part before the suffix is greater than $m.
 *
 * @param  string   $str   String to modify (passed by reference).
 * @param  string   $check Suffix to look for.
 * @param  string   $repl  Text that takes the suffix's place.
 * @param  int|null $m     Measure the remaining stem must exceed; null to
 *                         skip the measure test.
 * @return bool            True when $str ends in $check, whether or not the
 *                         rewrite was carried out; false otherwise.
 */
private static function replace(&$str, $check, $repl, $m = null)
{
    $offset = -strlen($check);

    if (substr($str, $offset) != $check) {
        return false;
    }

    $stem = substr($str, 0, $offset);

    if ($m === null || self::m($stem) > $m) {
        $str = $stem . $repl;
    }

    return true;
}
375 
376 
/**
 * Computes the measure of a string: the number of vowel-run followed by
 * consonant-run sequences that remain once leading consonants and trailing
 * vowels have been removed.
 *
 * @param  string $str String to measure.
 * @return int         Number of vowel-consonant sequences.
 */
private static function m($str)
{
    $consonant = self::$regex_consonant;
    $vowel     = self::$regex_vowel;

    // Patterns are applied in order: leading consonants first, then
    // trailing vowels.
    $core = preg_replace(
        array('#^' . $consonant . '+#', '#' . $vowel . '+$#'),
        '',
        $str
    );

    preg_match_all('#(' . $vowel . '+' . $consonant . '+)#', $core, $found);

    return count($found[1]);
}
404 
405 
/**
 * Tells whether a string ends in a doubled consonant, i.e. its last two
 * characters both match the consonant pattern and are the same character.
 *
 * @param  string $str String to inspect.
 * @return bool        True when the string ends in a doubled consonant.
 */
private static function doubleConsonant($str)
{
    $c = self::$regex_consonant;

    // Square brackets for string offsets: the curly-brace form ($s{0}) is
    // deprecated in PHP 7.4 and a parse error from PHP 8.0 onwards.
    return preg_match('#' . $c . '{2}$#', $str, $matches)
        AND $matches[0][0] == $matches[0][1];
}
419 
420 
/**
 * Tells whether a string ends in consonant-vowel-consonant where the final
 * consonant is not "w", "x" or "y".
 *
 * @param  string $str String to inspect.
 * @return bool        True when the ending matches that pattern.
 */
private static function cvc($str)
{
    $c = self::$regex_consonant;
    $v = self::$regex_vowel;

    if (!preg_match("#($c$v$c)$#", $str, $matches) || strlen($matches[1]) != 3) {
        return false;
    }

    // Square brackets for the string offset: the curly-brace form ($s{2}) is
    // deprecated in PHP 7.4 and a parse error from PHP 8.0 onwards.
    $last = $matches[1][2];

    return !in_array($last, array('w', 'x', 'y'), true);
}
438  }
439 ?>