Current File : /home/pacjaorg/public_html/dnpsom/libraries/vendor/wamania/php-stemmer/src/Stemmer/Dutch.php |
<?php
namespace Wamania\Snowball\Stemmer;
use voku\helper\UTF8;
/**
*
* @link http://snowball.tartarus.org/algorithms/dutch/stemmer.html
* @author wamania
*
*/
class Dutch extends Stem
{
/**
* All dutch vowels
*/
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'è');
/**
* {@inheritdoc}
*/
public function stem($word)
{
// we do ALL in UTF-8
if (!UTF8::is_utf8($word)) {
throw new \Exception('Word must be in UTF-8');
}
$this->word = UTF8::strtolower($word);
// First, remove all umlaut and acute accents.
$this->word = UTF8::str_replace(
array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
$this->word);
$this->plainVowels = implode('', self::$vowels);
// Put initial y, y after a vowel, and i between vowels into upper case.
$this->word = preg_replace('#^y#u', 'Y', $this->word);
$this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);
$this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word);
// R1 and R2 (see the note on R1 and R2) are then defined as in German.
// R1 and R2 are first set up in the standard way
$this->r1();
$this->r2();
// but then R1 is adjusted so that the region before it contains at least 3 letters.
if ($this->r1Index < 3) {
$this->r1Index = 3;
$this->r1 = UTF8::substr($this->word, 3);
}
// Do each of steps 1, 2 3 and 4.
$this->step1();
$removedE = $this->step2();
$this->step3a();
$this->step3b($removedE);
$this->step4();
$this->finish();
return $this->word;
}
/**
* Define a valid s-ending as a non-vowel other than j.
* @param string $ending
* @return boolean
*/
private function hasValidSEnding($word)
{
$lastLetter = UTF8::substr($word, -1, 1);
return !in_array($lastLetter, array_merge(self::$vowels, array('j')));
}
/**
* Define a valid en-ending as a non-vowel, and not gem.
* @param string $ending
* @return boolean
*/
private function hasValidEnEnding($word)
{
$lastLetter = UTF8::substr($word, -1, 1);
if (in_array($lastLetter, self::$vowels)) {
return false;
}
$threeLastLetters = UTF8::substr($word, -3, 3);
if ($threeLastLetters == 'gem') {
return false;
}
return true;
}
/**
* Define undoubling the ending as removing the last letter if the word ends kk, dd or tt.
*/
private function unDoubling()
{
if ($this->search(array('kk', 'dd', 'tt')) !== false) {
$this->word = UTF8::substr($this->word, 0, -1);
}
}
/**
* Step 1
* Search for the longest among the following suffixes, and perform the action indicated
*/
private function step1()
{
// heden
// replace with heid if in R1
if ( ($position = $this->search(array('heden'))) !== false) {
if ($this->inR1($position)) {
$this->word = preg_replace('#(heden)$#u', 'heid', $this->word);
}
return true;
}
// en ene
// delete if in R1 and preceded by a valid en-ending, and then undouble the ending
if ( ($position = $this->search(array('ene', 'en'))) !== false) {
if ($this->inR1($position)) {
$word = UTF8::substr($this->word, 0, $position);
if ($this->hasValidEnEnding($word)) {
$this->word = $word;
$this->unDoubling();
}
}
return true;
}
// s se
// delete if in R1 and preceded by a valid s-ending
if ( ($position = $this->search(array('se', 's'))) !== false) {
if ($this->inR1($position)) {
$word = UTF8::substr($this->word, 0, $position);
if ($this->hasValidSEnding($word)) {
$this->word = $word;
}
}
return true;
}
return false;
}
/**
* Step 2
* Delete suffix e if in R1 and preceded by a non-vowel, and then undouble the ending
*/
private function step2()
{
if ( ($position = $this->search(array('e'))) !== false) {
if ($this->inR1($position)) {
$letter = UTF8::substr($this->word, -2, 1);
if (!in_array($letter, self::$vowels)) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->unDoubling();
return true;
}
}
}
return false;
}
/**
* Step 3a: heid
* delete heid if in R2 and not preceded by c, and treat a preceding en as in step 1(b)
*/
private function step3a()
{
if ( ($position = $this->search(array('heid'))) !== false) {
if ($this->inR2($position)) {
$letter = UTF8::substr($this->word, -5, 1);
if ($letter !== 'c') {
$this->word = UTF8::substr($this->word, 0, $position);
if ( ($position = $this->search(array('en'))) !== false) {
if ($this->inR1($position)) {
$word = UTF8::substr($this->word, 0, $position);
if ($this->hasValidEnEnding($word)) {
$this->word = $word;
$this->unDoubling();
}
}
}
}
}
}
}
/**
* Step 3b: d-suffixe
* Search for the longest among the following suffixes, and perform the action indicated.
*/
private function step3b($removedE)
{
// end ing
// delete if in R2
// if preceded by ig, delete if in R2 and not preceded by e, otherwise undouble the ending
if ( ($position = $this->search(array('end', 'ing'))) !== false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
if ( ($position2 = $this->searchIfInR2(array('ig'))) !== false) {
$letter = UTF8::substr($this->word, -3, 1);
if ($letter !== 'e') {
$this->word = UTF8::substr($this->word, 0, $position2);
}
} else {
$this->unDoubling();
}
}
return true;
}
// ig
// delete if in R2 and not preceded by e
if ( ($position = $this->search(array('ig'))) !== false) {
if ($this->inR2($position)) {
$letter = UTF8::substr($this->word, -3, 1);
if ($letter !== 'e') {
$this->word = UTF8::substr($this->word, 0, $position);
}
}
return true;
}
// lijk
// delete if in R2, and then repeat step 2
if ( ($position = $this->search(array('lijk'))) !== false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
$this->step2();
}
return true;
}
// baar
// delete if in R2
if ( ($position = $this->search(array('baar'))) !== false) {
if ($this->inR2($position)) {
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
// bar
// delete if in R2 and if step 2 actually removed an e
if ( ($position = $this->search(array('bar'))) !== false) {
if ($this->inR2($position) && $removedE) {
$this->word = UTF8::substr($this->word, 0, $position);
}
return true;
}
return false;
}
/**
* Step 4: undouble vowel
* If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u,
* remove one of the vowels from V (for example, maan -> man, brood -> brod).
*/
private function step4()
{
// D is a non-vowel other than I
$d = UTF8::substr($this->word, -1, 1);
if (in_array($d, array_merge(self::$vowels, array('I')))) {
return false;
}
// V is double a, e, o or u
$v = UTF8::substr($this->word, -3, 2);
if (!in_array($v, array('aa', 'ee', 'oo', 'uu'))) {
return false;
}
$singleV = UTF8::substr($v, 0, 1);
// C is a non-vowel
$c = UTF8::substr($this->word, -4, 1);
if (in_array($c, self::$vowels)) {
return false;
}
$this->word = UTF8::substr($this->word, 0, -4);
$this->word .= $c . $singleV .$d;
}
/**
* Finally
* Turn I and Y back into lower case.
*/
private function finish()
{
$this->word = UTF8::str_replace(array('I', 'Y'), array('i', 'y'), $this->word);
}
}