add shortcut methods for stemming and rake

yooper · yooper · commit e520cf285451 · 2018-05-17T21:43:54.000-04:00
diff --git a/README.md b/README.md
@@ -71,5 +71,20 @@ Customize the ngrams
 $trigrams = ngrams($tokens,3, '|');
 ```
  
+### Stemming
+By default, the `stem` method uses the Porter Stemmer.
+```php
+$stemmedTokens = stem($tokens);
+```
+You can customize which stemmer to use by passing in the stemmer's class name:
+```php
+$stemmedTokens = stem($tokens, \TextAnalysis\Stemmers\MorphStemmer::class);
+```
 
-
+### Keyword Extraction with Rake
+There is a shortcut method for using the Rake algorithm. You will need to clean
+your data before using it. The second parameter is the ngram size of the keywords to extract.
+```php
+$rake = rake($tokens, 3);
+$results = $rake->getKeywordScores();
+```
diff --git a/src/Analysis/DateAnalysis.php b/src/Analysis/DateAnalysis.php
@@ -28,7 +28,7 @@ class DateAnalysis
      * 
      * @param type $text
      */
-    public function __construct($text) 
+    public function __construct(string $text) 
     {
         $tokenizer = new SentenceTokenizer();
         $this->sentences = $tokenizer->tokenize( $this->normalize($text)) ;        
@@ -39,7 +39,7 @@ public function __construct($text)
      * ie Mar. to March
      * @param string $text
      */
-    protected function normalize($text)
+    protected function normalize(string $text) : string
     {
         $search = ['jan.','feb.','mar.','apr.','may.','jun.','jul.','aug.','sep.','oct.','nov.','dec.'];
         $replace = [
@@ -62,7 +62,7 @@ protected function normalize($text)
     /**
      * @return DateTime[]
      */
-    public function getDates()
+    public function getDates() : array
     {        
         // return the cached copy
         if(empty($this->dates)) {
diff --git a/src/Collocations/CollocationFinder.php b/src/Collocations/CollocationFinder.php
@@ -2,20 +2,25 @@
 
 namespace TextAnalysis\Collocations;
 
-use TextAnalysis\NGrams\NGramFactory;
-use TextAnalysis\Analysis\FreqDist;
-
 /**
  * Helps find popular phrases in the given set of tokens
  * @author yooper
  */
 class CollocationFinder 
 {
+    /**
+     * The ngram size 
+     * @var int
+     */
     protected $nGramSize = 2;
     
+    /**
+     *
+     * @var array
+     */
     protected $tokens = [];
     
-    public function __construct(array $tokens, $nGramSize = 2) 
+    public function __construct(array $tokens, int $nGramSize = 2) 
     {
         $this->tokens = $tokens;
         $this->nGramSize = $nGramSize;
diff --git a/src/Tokenizers/SentimentTokenizer.php b/src/Tokenizers/SentimentTokenizer.php
diff --git a/src/helpers/helpers.php b/src/helpers/helpers.php
@@ -111,6 +111,38 @@ function text( string $text ): \TextAnalysis\Corpus\TextCorpus {
 	}
 }
 
+
+if (! function_exists('rake')) {
+    /**
+    * Wraps the tokens in a TokensDocument and returns a Rake instance built on it.
+    *
+    * @param array $tokens tokens handed to the TokensDocument constructor
+    * @param int $ngramSize forwarded as Rake's second constructor argument; defaults to 3
+    * @return \TextAnalysis\Analysis\Keywords\Rake
+    */
+    function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keywords\Rake 
+    {
+        return new \TextAnalysis\Analysis\Keywords\Rake(new \TextAnalysis\Documents\TokensDocument($tokens), $ngramSize);
+    }
+}
+
+if (! function_exists('stem')) {
+    /**
+    * Stems every token using a single instance of the given stemmer class.
+    *
+    * @param array $tokens tokens to stem; array_map keeps their keys in the result
+    * @param string $stemmerClassName class instantiated with no arguments; its stem() method is called per token
+    * @return array the value returned by stem() for each token
+    */
+    function stem(array $tokens, string $stemmerClassName = \TextAnalysis\Stemmers\PorterStemmer::class): array 
+    {
+	$stemmer = new $stemmerClassName();
+        return array_map(function($token) use($stemmer){ return $stemmer->stem($token); }, $tokens);
+    }
+}
+
+
+
 /**
  * Check if the given array has the given needle, using a case insensitive search. 
  * Keeps a local copy of the normalized haystack for quicker lookup on the same array
diff --git a/tests/TextAnalysis/Analysis/Keywords/RakeTest.php b/tests/TextAnalysis/Analysis/Keywords/RakeTest.php
@@ -36,6 +36,25 @@ public function testRake()
         $this->assertArrayHasKey('8/8/2016 5:51 pm', $results);         
     }
     
+    public function testSimplifiedRake()
+    {
+        $stopwords = array_map('trim', file(VENDOR_DIR.'yooper/stop-words/data/stop-words_english_1_en.txt')); // one entry per file line, trimmed
+        // All punctuation must be moved one over; this fixes issues with sentences.
+        $testData = (new SpacePunctuationFilter([':','\/']))->transform($this->getTestData());
+        // Rake input MUST be split on whitespace and newlines only.
+        $tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);        
+        $tokenDoc = new TokensDocument($tokens);
+        $tokenDoc->applyTransformation(new LowerCaseFilter())
+                ->applyTransformation(new StopWordsFilter($stopwords), false)
+                ->applyTransformation(new PunctuationFilter(['@',':','\/']), false)
+                ->applyTransformation(new CharFilter(), false);
+
+        $rake = rake($tokenDoc->toArray(), 3); // build Rake through the rake() helper with an ngram size of 3
+        $results = $rake->getKeywordScores();
+        $this->assertArrayHasKey('minimal generating sets', $results); 
+        $this->assertArrayHasKey('8/8/2016 5:51 pm', $results);                 
+    }
+    
     /**
      * Sample test data 
      * @return string
diff --git a/tests/TextAnalysis/Stemmers/PorterStemmerTest.php b/tests/TextAnalysis/Stemmers/PorterStemmerTest.php
@@ -18,4 +18,10 @@ public function testStemmer()
         $this->assertEquals('univers', $stemmer->stem('universities'));
         $this->assertEquals('judg',$stemmer->stem('judges'));
     }
+    
+    public function testSimplifiedStemmer()
+    {
+        $this->assertEquals(['univers','judg'], stem(['universities', 'judges'])); // no class name passed, so stem() uses its default stemmer
+    }    
+    
 }

Original file line number	Diff line number	Diff line change
`@@ -18,4 +18,10 @@ public function testStemmer()`
`18`	`18`	`$this->assertEquals('univers', $stemmer->stem('universities'));`
`19`	`19`	`$this->assertEquals('judg',$stemmer->stem('judges'));`
`20`	`20`	`}`
	`21`	`+`
	`22`	`+ public function testSimplifiedStemmer()`
	`23`	`+ {`
	`24`	`+ $this->assertEquals(['univers','judg'], stem(['universities', 'judges']));`
	`25`	`+ }`
	`26`	`+`
`21`	`27`	`}`