Source for file class.porterstemmer.php

Documentation is available at class.porterstemmer.php

  1. <?php
  2.     /**
  3.     * Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
  4.     *
  5.     * All rights reserved.
  6.     *
  7.     * This script is free software.
  8.     */
  9.  
  /**
  * Implementation of the Porter stemming algorithm for lowercase English words.
  *
  * Usage:
  *
  *  $stem = PorterStemmer::Stem($word);
  *
  * The class is stateless: every method is static and the only entry point
  * is Stem(). The input is not lowercased here; the letter classes below only
  * contain lowercase letters, so callers should lowercase the word first.
  */
  class PorterStemmer
  {
      /**
      * Regex fragment matching one consonant: any letter of the listed set,
      * or a "y" that follows one of a/e/i/o/u, or a "y" at the start of the subject.
      * @var string
      */
      private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

      /**
      * Regex fragment matching one vowel: a/e/i/o/u, or a "y" that does not
      * follow one of a/e/i/o/u.
      * @var string
      */
      private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

      /**
      * Step 2 rules, keyed by the penultimate letter of the word. Within a
      * group the rules are tried in order and the first matching suffix wins.
      * @var array
      */
      private static $step2_rules = array(
          'a' => array('ational' => 'ate', 'tional' => 'tion'),
          'c' => array('enci' => 'ence', 'anci' => 'ance'),
          'e' => array('izer' => 'ize'),
          'g' => array('logi' => 'log'),
          'l' => array('entli' => 'ent', 'ousli' => 'ous', 'alli' => 'al', 'bli' => 'ble', 'eli' => 'e'),
          'o' => array('ization' => 'ize', 'ation' => 'ate', 'ator' => 'ate'),
          's' => array('iveness' => 'ive', 'fulness' => 'ful', 'ousness' => 'ous', 'alism' => 'al'),
          't' => array('biliti' => 'ble', 'aliti' => 'al', 'iviti' => 'ive'),
      );

      /**
      * Step 3 rules, keyed by the penultimate letter of the word.
      * @var array
      */
      private static $step3_rules = array(
          'a' => array('ical' => 'ic'),
          's' => array('ness' => ''),
          't' => array('icate' => 'ic', 'iciti' => 'ic'),
          'u' => array('ful' => ''),
          'v' => array('ative' => ''),
          'z' => array('alize' => 'al'),
      );

      /**
      * Step 4 rules, keyed by the penultimate letter of the word. The "o"
      * group is handled separately in step4() because it depends on the
      * letters preceding the suffix.
      * @var array
      */
      private static $step4_rules = array(
          'a' => array('al' => ''),
          'c' => array('ance' => '', 'ence' => ''),
          'e' => array('er' => ''),
          'i' => array('ic' => ''),
          'l' => array('able' => '', 'ible' => ''),
          'n' => array('ant' => '', 'ement' => '', 'ment' => '', 'ent' => ''),
          's' => array('ism' => ''),
          't' => array('ate' => '', 'iti' => ''),
          'u' => array('ous' => ''),
          'v' => array('ive' => ''),
          'z' => array('ize' => ''),
      );


      /**
      * Stems a word by running it through steps 1ab, 1c, 2, 3, 4 and 5 in order.
      *
      * @param  string $word Word to stem
      * @return string       Stemmed word; words of two characters or fewer
      *                      are returned untouched
      */
      public static function Stem($word)
      {
          if (strlen($word) <= 2) {
              return $word;
          }

          $word = self::step1ab($word);
          $word = self::step1c($word);
          $word = self::step2($word);
          $word = self::step3($word);
          $word = self::step4($word);
          $word = self::step5($word);

          return $word;
      }


      /**
      * Step 1a (plural endings) and step 1b (-eed, -ed, -ing endings).
      *
      * @param  string $word Word to stem
      * @return string       Word after step 1a/1b
      */
      private static function step1ab($word)
      {
          // Part a: only the first matching plural suffix is rewritten.
          if (substr($word, -1) === 's') {
              self::replaceFirst($word, array('sses' => 'ss', 'ies' => 'i', 'ss' => 'ss', 's' => ''));
          }

          // Part b: a word ending in "eed" is handled by the first rule alone
          // (replace() reports the match even when the measure is too small).
          if (substr($word, -2, 1) !== 'e' || !self::replace($word, 'eed', 'ee', 0)) {
              $has_vowel = '#' . self::$regex_vowel . '+#';

              // "ing" / "ed" are removed only when the remaining stem contains a vowel.
              $stripped = (preg_match($has_vowel, (string) substr($word, 0, -3)) && self::replace($word, 'ing', ''))
                       || (preg_match($has_vowel, (string) substr($word, 0, -2)) && self::replace($word, 'ed', ''));

              if ($stripped) {
                  $tidied = self::replaceFirst($word, array('at' => 'ate', 'bl' => 'ble', 'iz' => 'ize'));

                  if (!$tidied) {
                      $tail = substr($word, -2);

                      if (self::doubleConsonant($word) && $tail !== 'll' && $tail !== 'ss' && $tail !== 'zz') {
                          // Double consonant ending (other than ll/ss/zz): drop one letter.
                          $word = substr($word, 0, -1);
                      } elseif (self::m($word) === 1 && self::cvc($word)) {
                          // Short stem ending consonant-vowel-consonant: restore the "e".
                          $word .= 'e';
                      }
                  }
              }
          }

          return $word;
      }


      /**
      * Step 1c: turns a final "y" into "i" when the rest of the word contains a vowel.
      *
      * @param  string $word Word to stem
      * @return string       Word after step 1c
      */
      private static function step1c($word)
      {
          $has_vowel = '#' . self::$regex_vowel . '+#';

          if (substr($word, -1) === 'y' && preg_match($has_vowel, (string) substr($word, 0, -1))) {
              self::replace($word, 'y', 'i');
          }

          return $word;
      }


      /**
      * Step 2: rewrites the suffixes listed in $step2_rules when the stem
      * before the suffix has a measure greater than 0.
      *
      * @param  string $word Word to stem
      * @return string       Word after step 2
      */
      private static function step2($word)
      {
          $key = (string) substr($word, -2, 1);

          if (isset(self::$step2_rules[$key])) {
              self::replaceFirst($word, self::$step2_rules[$key], 0);
          }

          return $word;
      }


      /**
      * Step 3: rewrites the suffixes listed in $step3_rules when the stem
      * before the suffix has a measure greater than 0.
      *
      * @param  string $word Word to stem
      * @return string       Word after step 3
      */
      private static function step3($word)
      {
          $key = (string) substr($word, -2, 1);

          if (isset(self::$step3_rules[$key])) {
              self::replaceFirst($word, self::$step3_rules[$key], 0);
          }

          return $word;
      }


      /**
      * Step 4: removes the suffixes listed in $step4_rules (plus "ion"/"ou")
      * when the stem before the suffix has a measure greater than 1.
      *
      * @param  string $word Word to stem
      * @return string       Word after step 4
      */
      private static function step4($word)
      {
          $key = (string) substr($word, -2, 1);

          if ($key === 'o') {
              // "ion" is only a candidate when it follows "t" or "s"; otherwise try "ou".
              $tail = substr($word, -4);

              if ($tail === 'tion' || $tail === 'sion') {
                  self::replace($word, 'ion', '', 1);
              } else {
                  self::replace($word, 'ou', '', 1);
              }
          } elseif (isset(self::$step4_rules[$key])) {
              self::replaceFirst($word, self::$step4_rules[$key], 1);
          }

          return $word;
      }


      /**
      * Step 5: drops a final "e" and reduces a final "ll" for long enough stems.
      *
      * @param  string $word Word to stem
      * @return string       Word after step 5
      */
      private static function step5($word)
      {
          // Part a: remove the final "e" when the stem measure is above 1, or
          // exactly 1 and the stem does not end consonant-vowel-consonant.
          if (substr($word, -1) === 'e') {
              $stem    = (string) substr($word, 0, -1);
              $measure = self::m($stem);

              if ($measure > 1 || ($measure === 1 && !self::cvc($stem))) {
                  $word = $stem;
              }
          }

          // Part b: "ll" -> "l" when the measure is above 1.
          if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) === 'l') {
              $word = substr($word, 0, -1);
          }

          return $word;
      }


      /**
      * Tries each suffix rule in order and stops at the first suffix that is
      * present at the end of the word (whether or not it was actually replaced).
      *
      * @param  string $word  Word to modify in place
      * @param  array  $rules Map of suffix => replacement, in priority order
      * @param  int    $m     Optional measure the stem must exceed, passed to replace()
      * @return bool          Whether any suffix of $rules was found
      */
      private static function replaceFirst(&$word, array $rules, $m = null)
      {
          foreach ($rules as $suffix => $replacement) {
              if (self::replace($word, $suffix, $replacement, $m)) {
                  return true;
              }
          }

          return false;
      }


      /**
      * Replaces the ending $check with $repl. If $m is given, the replacement
      * only happens when the measure of the remaining stem is greater than $m.
      *
      * @param  string $str   String to modify in place
      * @param  string $check Ending to check for
      * @param  string $repl  Replacement string
      * @param  int    $m     Optional measure the stem must exceed
      * @return bool          Whether $check was at the end of $str. True does
      *                       not necessarily mean that it was replaced.
      */
      private static function replace(&$str, $check, $repl, $m = null)
      {
          $len = strlen($check);

          // Negative offset: compare against the last strlen($check) characters.
          if (substr($str, -$len) !== $check) {
              return false;
          }

          $stem = (string) substr($str, 0, -$len);

          if ($m === null || self::m($stem) > $m) {
              $str = $stem . $repl;
          }

          return true;
      }


      /**
      * Returns the measure of $str: the number of vowel-sequence/consonant-sequence
      * pairs. If c is a consonant sequence, v a vowel sequence and <..> indicates
      * optional presence,
      *
      * <c><v>       gives 0
      * <c>vc<v>     gives 1
      * <c>vcvc<v>   gives 2
      * <c>vcvcvc<v> gives 3
      *
      * @param  string $str The string to return the m count for
      * @return int         The m count
      */
      private static function m($str)
      {
          $c = self::$regex_consonant;
          $v = self::$regex_vowel;

          // Strip the optional leading consonants and trailing vowels, then
          // count the remaining vowel+consonant groups.
          $str = preg_replace('#^' . $c . '+#', '', $str);
          $str = preg_replace('#' . $v . '+$#', '', $str);

          preg_match_all('#(' . $v . '+' . $c . '+)#', $str, $matches);

          return count($matches[1]);
      }


      /**
      * Returns whether the string ends with the same consonant twice in a row.
      *
      * @param  string $str String to check
      * @return bool        Result
      */
      private static function doubleConsonant($str)
      {
          $pattern = '#' . self::$regex_consonant . '{2}$#';

          return preg_match($pattern, $str, $matches) === 1
              && $matches[0][0] === $matches[0][1];
      }


      /**
      * Checks for an ending consonant-vowel-consonant sequence whose final
      * consonant is not w, x or y.
      *
      * @param  string $str String to check
      * @return bool        Result
      */
      private static function cvc($str)
      {
          $c = self::$regex_consonant;
          $v = self::$regex_vowel;

          if (preg_match('#(' . $c . $v . $c . ')$#', $str, $matches) !== 1) {
              return false;
          }

          if (strlen($matches[1]) !== 3) {
              return false;
          }

          $last = $matches[1][2];

          return $last !== 'w' && $last !== 'x' && $last !== 'y';
      }
  }
  410. ?>

Documentation generated on Sun, 13 Dec 2009 19:39:33 +0000 by phpDocumentor 1.4.3