Current File : /home/pacjaorg/public_html/dnpsom/libraries/vendor/wamania/php-stemmer/src/Stemmer/English.php |
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
* English Porter 2
*
* @link http://snowball.tartarus.org/algorithms/english/stemmer.html
* @author wamania
*
*/
class English extends Stem
{
/**
* All english vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');
protected static $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
protected static $liEnding = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
/**
* {@inheritdoc}
*/
public function stem($word)
{
// we do ALL in UTF-8
if (!UTF8::is_utf8($word)) {
throw new \Exception('Word must be in UTF-8');
}
if (Utf8::strlen($word) < 3) {
return $word;
}
$this->word = UTF8::strtolower($word);
// exceptions
if (null !== ($word = $this->exception1())) {
return $word;
}
$this->plainVowels = implode('', self::$vowels);
// Remove initial ', if present.
$first = UTF8::substr($this->word, 0, 1);
if ($first == "'") {
$this->word = UTF8::substr($this->word, 1);
}
// Set initial y, or y after a vowel, to Y
if ($first == 'y') {
$this->word = preg_replace('#^y#u', 'Y', $this->word);
}
$this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);
$this->r1();
$this->exceptionR1();
$this->r2();
$this->step0();
$this->step1a();
// exceptions 2
if (null !== ($word = $this->exception2())) {
return $word;
}
$this->step1b();
$this->step1c();
$this->step2();
$this->step3();
$this->step4();
$this->step5();
$this->finish();
return $this->word;
}
/**
* Step 0
* Remove ', 's, 's'
*/
private function step0()
{
if ( ($position = $this->search(array("'s'", "'s", "'"))) !== false) {
$this->word = UTF8::substr($this->word, 0, $position);
}
}
private function step1a()
{
// sses
// replace by ss
if ( ($position = $this->search(array('sses'))) !== false) {
$this->word = preg_replace('#(sses)$#u', 'ss', $this->word);
return true;
}
// ied+ ies*
// replace by i if preceded by more than one letter, otherwise by ie (so ties -> tie, cries -> cri)
if ( ($position = $this->search(array('ied', 'ies'))) !== false) {
if ($position > 1) {
$this->word = preg_replace('#(ied|ies)$#u', 'i', $this->word);
} else {
$this->word = preg_replace('#(ied|ies)$#u', 'ie', $this->word);
}
return true;
}
// us+ ss
// do nothing
if ( ($position = $this->search(array('us', 'ss'))) !== false) {
return true;
}
// s
// delete if the preceding word part contains a vowel not immediately before the s (so gas and this retain the s, gaps and kiwis lose it)
if ( ($position = $this->search(array('s'))) !== false) {
for ($i=0; $i<$position-1; $i++) {
$letter = UTF8::substr($this->word, $i, 1);
if (in_array($letter, self::$vowels)) {
$this->word = UTF8::substr($this->word, 0, $position);
return true;
}
}
return true;
}
return false;
}
/**
* Step 1b
*/
private function step1b()
{
// eed eedly+
// replace by ee if in R1
if ( ($position = $this->search(array('eedly', 'eed'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(eedly|eed)$#u', 'ee', $this->word);
}
return true;
}
// ed edly+ ing ingly+
// delete if the preceding word part contains a vowel, and after the deletion:
// if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
// if the word ends with a double remove the last letter (so hopp -> hop), or
// if the word is short, add e (so hop -> hope)
if ( ($position = $this->search(array('edly', 'ingly', 'ed', 'ing'))) !== false) {
for ($i=0; $i<$position; $i++) {
$letter = UTF8::substr($this->word, $i, 1);
if (in_array($letter, self::$vowels)) {
$this->word = UTF8::substr($this->word, 0, $position);
if ($this->search(array('at', 'bl', 'iz')) !== false) {
$this->word .= 'e';
} elseif ( ($position2 = $this->search(self::$doubles)) !== false) {
$this->word = UTF8::substr($this->word, 0, ($position2+1));
} elseif ($this->isShort()) {
$this->word .= 'e';
}
return true;
}
}
return true;
}
return false;
}
/**
* Step 1c: *
*/
private function step1c()
{
// replace suffix y or Y by i if preceded by a non-vowel
// which is not the first letter of the word (so cry -> cri, by -> by, say -> say)
$length = UTF8::strlen($this->word);
if ($length < 3) {
return true;
}
if ( ($position = $this->search(array('y', 'Y'))) !== false) {
$before = $position - 1;
$letter = UTF8::substr($this->word, $before, 1);
if (! in_array($letter, self::$vowels)) {
$this->word = preg_replace('#(y|Y)$#u', 'i', $this->word);
}
return true;
}
return false;
}
/**
* Step 2
* Search for the longest among the following suffixes, and, if found and in R1, perform the action indicated.
*/
private function step2()
{
// iveness iviti: replace by ive
if ( ($position = $this->search(array('iveness', 'iviti'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(iveness|iviti)$#u', 'ive', $this->word);
}
return true;
}
// ousli ousness: replace by ous
if ( ($position = $this->search(array('ousli', 'ousness'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(ousli|ousness)$#u', 'ous', $this->word);
}
return true;
}
// izer ization: replace by ize
if ( ($position = $this->search(array('izer', 'ization'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(izer|ization)$#u', 'ize', $this->word);
}
return true;
}
// ational ation ator: replace by ate
if ( ($position = $this->search(array('ational', 'ation', 'ator'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(ational|ation|ator)$#u', 'ate', $this->word);
}
return true;
}
// biliti bli+: replace by ble
if ( ($position = $this->search(array('biliti', 'bli'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(biliti|bli)$#u', 'ble', $this->word);
}
return true;
}
// lessli+: replace by less
if ( ($position = $this->search(array('lessli'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(lessli)$#u', 'less', $this->word);
}
return true;
}
// fulness: replace by ful
if ( ($position = $this->search(array('fulness', 'fulli'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(fulness|fulli)$#u', 'ful', $this->word);
}
return true;
}
// tional: replace by tion
if ( ($position = $this->search(array('tional'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(tional)$#u', 'tion', $this->word);
}
return true;
}
// alism aliti alli: replace by al
if ( ($position = $this->search(array('alism', 'aliti', 'alli'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(alism|aliti|alli)$#u', 'al', $this->word);
}
return true;
}
// enci: replace by ence
if ( ($position = $this->search(array('enci'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(enci)$#u', 'ence', $this->word);
}
return true;
}
// anci: replace by ance
if ( ($position = $this->search(array('anci'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(anci)$#u', 'ance', $this->word);
}
return true;
}
// abli: replace by able
if ( ($position = $this->search(array('abli'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(abli)$#u', 'able', $this->word);
}
return true;
}
// entli: replace by ent
if ( ($position = $this->search(array('entli'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(entli)$#u', 'ent', $this->word);
}
return true;
}
// ogi+: replace by og if preceded by l
if ( ($position = $this->search(array('ogi'))) !== false) {
if ($this->inR1($position)) {
$before = $position - 1;
$letter = UTF8::substr($this->word, $before, 1);
if ($letter == 'l') {
$this->word = preg_replace('#(ogi)$#u', 'og', $this->word);
}
}
return true;
}
// li+: delete if preceded by a valid li-ending
if ( ($position = $this->search(array('li'))) !== false) {
if ($this->inR1($position)) {
// a letter for you
$letter = UTF8::substr($this->word, ($position-1), 1);
if (in_array($letter, self::$liEnding)) {
$this->word = UTF8::substr($this->word, 0, $position);
}
}
return true;
}
return false;
}
/**
* Step 3:
* Search for the longest among the following suffixes, and, if found and in R1, perform the action indicated.
*/
private function step3()
{
// ational+: replace by ate
if ($this->searchIfInR1(array('ational')) !== false) {
$this->word = preg_replace('#(ational)$#u', 'ate', $this->word);
return true;
}
// tional+: replace by tion
if ($this->searchIfInR1(array('tional')) !== false) {
$this->word = preg_replace('#(tional)$#u', 'tion', $this->word);
return true;
}
// alize: replace by al
if ($this->searchIfInR1(array('alize')) !== false) {
$this->word = preg_replace('#(alize)$#u', 'al', $this->word);
return true;
}
// icate iciti ical: replace by ic
if ($this->searchIfInR1(array('icate', 'iciti', 'ical')) !== false) {
$this->word = preg_replace('#(icate|iciti|ical)$#u', 'ic', $this->word);
return true;
}
// ful ness: delete
if ( ($position = $this->searchIfInR1(array('ful', 'ness'))) !== false) {
$this->word = UTF8::substr($this->word, 0, $position);
return true;
}
// ative*: delete if in R2
if ( (($position = $this->searchIfInR1(array('ative'))) !== false) && ($this->inR2($position)) ) {
$this->word = UTF8::substr($this->word, 0, $position);
return true;
}
return false;
}
/**
* Step 4
* Search for the longest among the following suffixes, and, if found and in R2, perform the action indicated.
*/
private function step4()
{
// ement ance ence able ible ant ment ent ism ate iti ous ive ize al er ic
// delete
if ( ($position = $this->search(array(
'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism',
'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic'))) !== false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
// ion
// delete if preceded by s or t
if ( ($position = $this->searchIfInR2(array('ion'))) !== false) {
$before = $position - 1;
$letter = UTF8::substr($this->word, $before, 1);
if ($letter == 's' || $letter == 't') {
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
return false;
}
/**
* Step 5: *
* Search for the the following suffixes, and, if found, perform the action indicated.
*/
private function step5()
{
// e
// delete if in R2, or in R1 and not preceded by a short syllable
if ( ($position = $this->search(array('e'))) !== false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
} elseif ($this->inR1($position)) {
if ( (! $this->searchShortSyllabe(-4, 3)) && (! $this->searchShortSyllabe(-3, 2)) ) {
$this->word = UTF8::substr($this->word, 0, $position);
}
}
return true;
}
// l
// delete if in R2 and preceded by l
if ( ($position = $this->searchIfInR2(array('l'))) !== false) {
$before = $position - 1;
$letter = UTF8::substr($this->word, $before, 1);
if ($letter == 'l') {
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
return false;
}
private function finish()
{
$this->word = UTF8::str_replace('Y', 'y', $this->word);
}
private function exceptionR1()
{
if (Utf8::strpos($this->word, 'gener') === 0) {
$this->r1 = UTF8::substr($this->word, 5);
$this->r1Index = 5;
} elseif (Utf8::strpos($this->word, 'commun') === 0) {
$this->r1 = UTF8::substr($this->word, 6);
$this->r1Index = 6;
} elseif (Utf8::strpos($this->word, 'arsen') === 0) {
$this->r1 = UTF8::substr($this->word, 5);
$this->r1Index = 5;
}
}
/**
* 1/ Stem certain special words as follows,
* 2/ If one of the following is found, leave it invariant,
*/
private function exception1()
{
$exceptions = array(
'skis' => 'ski',
'skies' => 'sky',
'dying' => 'die',
'lying' => 'lie',
'tying' => 'tie',
'idly' => 'idl',
'gently' => 'gentl',
'ugly' => 'ugli',
'early' => 'earli',
'only' => 'onli',
'singly' => 'singl',
// invariants
'sky' => 'sky',
'news' => 'news',
'howe' => 'howe',
'atlas' => 'atlas',
'cosmos' => 'cosmos',
'bias' => 'bias',
'andes' => 'andes'
);
if (isset($exceptions[$this->word])) {
return $exceptions[$this->word];
}
return null;
}
/**
* Following step 1a, leave the following invariant,
*/
private function exception2()
{
$exceptions = array(
'inning' => 'inning',
'outing' => 'outing',
'canning' => 'canning',
'herring' => 'herring',
'earring' => 'earring',
'proceed' => 'proceed',
'exceed' => 'exceed',
'succeed' => 'succeed'
);
if (isset($exceptions[$this->word])) {
return $exceptions[$this->word];
}
return null;
}
/**
* A word is called short if it ends in a short syllable, and if R1 is null.
* Note : R1 not really null, but the word at this state must be smaller than r1 index
*
* @return boolean
*/
private function isShort()
{
$length = UTF8::strlen($this->word);
return ( ($this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2)) && ($length == $this->r1Index) );
}
/**
* Define a short syllable in a word as either (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel,
* or * (b) a vowel at the beginning of the word followed by a non-vowel.
*
* So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables.
* But uproot, bestow, disturb do not end with a short syllable.
*/
private function searchShortSyllabe($from, $nbLetters)
{
$length = UTF8::strlen($this->word);
if ($from < 0) {
$from = $length + $from;
}
if ($from < 0) {
$from = 0;
}
// (a) is just for beginning of the word
if ( ($nbLetters == 2) && ($from != 0) ) {
return false;
}
$first = UTF8::substr($this->word, $from, 1);
$second = UTF8::substr($this->word, ($from+1), 1);
if ($nbLetters == 2) {
if ( (in_array($first, self::$vowels)) && (!in_array($second, self::$vowels)) ) {
return true;
}
}
$third = UTF8::substr($this->word, ($from+2), 1);
if ( (!in_array($first, self::$vowels)) && (in_array($second, self::$vowels))
&& (!in_array($third, array_merge(self::$vowels, array('x', 'Y', 'w'))))) {
return true;
}
return false;
}
}