working on sentence summarizer

yooper · yooper · commit c061bf6520d1 · 2018-06-29T12:06:37.000-04:00
diff --git a/composer.json b/composer.json
@@ -29,7 +29,8 @@
         "symfony/console": "~2.7|~3.4|~4.0",
         "camspiers/porter-stemmer": "~1",
         "wamania/php-stemmer": "~1",
-        "yooper/nicknames": "~1"
+        "yooper/nicknames": "~1",
+        "vanderlee/php-sentence": "^1.0"
     },
     "require-dev": {
         "phpunit/phpunit": "~5",
diff --git a/src/Analysis/Summarize/Simple.php b/src/Analysis/Summarize/Simple.php
@@ -0,0 +1,46 @@
+<?php
+
+namespace TextAnalysis\Analysis\Summarize;
+
+use TextAnalysis\Models\ScoreKeeper;
+
+/**
+ * A simple algorithm based off of frequency counts for finding the best
+ * sentence to summarize the text
+ * @author yooper
+ */
+class Simple
+{
+    /**
+     * Score each sentence by summing the frequencies of the word tokens found in it.
+     * @param array $wordTokens word tokens; repeated tokens raise that token's frequency
+     * @param array $sentenceTokens sentences to score; each key becomes the ScoreKeeper index
+     * @return ScoreKeeper[] one entry per sentence, ordered by score_keeper_sort()
+     */
+    public function summarize(array $wordTokens, array $sentenceTokens) : array
+    {
+        $tokenCounts = array_count_values($wordTokens);
+        $scoreKeepers = [];
+        // Walk the real keys: a filtered (sparse) array has gaps that 0..count()-1 would hit.
+        foreach($sentenceTokens as $index => $sentence)
+        {
+            $scoreKeepers[] = new ScoreKeeper($sentence, $index);
+        }
+
+        foreach($tokenCounts as $token => $freq)
+        {
+            // Numeric tokens come back from array_count_values() as int keys; cast them so
+            // strpos() looks for the text itself. Empty needles are skipped, they match nothing useful.
+            $needle = (string) $token;
+            foreach($scoreKeepers as $sentenceKeeper)
+            {
+                if($needle !== '' && strpos($sentenceKeeper->getToken(), $needle) !== false) {
+                    $sentenceKeeper->addToScore($freq);
+                }
+            }
+        }
+
+        usort($scoreKeepers, 'score_keeper_sort');
+        return $scoreKeepers;
+    }
+}
diff --git a/src/Classifiers/NaiveBayes.php b/src/Classifiers/NaiveBayes.php
@@ -23,7 +23,7 @@ class NaiveBayes implements \TextAnalysis\Interfaces\IClassifier
     
     /**
      * Track the token counts
-     * @var array[int]
+     * @var int[]
      */
     protected $tokenCount = [];
             
diff --git a/src/Filters/TrimFilter.php b/src/Filters/TrimFilter.php
@@ -0,0 +1,20 @@
+<?php
+
+namespace TextAnalysis\Filters;
+
+use TextAnalysis\Interfaces\ITokenTransformation;
+
+
+/**
+ *
+ * @author yooper
+ */
+class TrimFilter implements ITokenTransformation
+{
+    /** Strip leading and trailing whitespace from a single token. */
+    public function transform($word)
+    {
+        $trimmed = trim($word);
+        return $trimmed;
+    }
+}
diff --git a/src/Models/ScoreKeeper.php b/src/Models/ScoreKeeper.php
@@ -0,0 +1,55 @@
+<?php
+
+namespace TextAnalysis\Models;
+
+/**
+ * Track metrics of tokenization
+ * @author yooper
+ */
+class ScoreKeeper
+{
+    /**
+     * Token being scored
+     * @var string
+     */
+    protected $token;
+
+    /**
+     * Accumulated score, grown by addToScore()
+     * @var mixed
+     */
+    protected $score;
+
+    /**
+     * Index value supplied by the caller
+     * @var mixed
+     */
+    protected $index;
+
+    /** Start tracking $token at $index with an initial $score. */
+    public function __construct(string $token, $index, $score = 0)
+    {
+        list($this->token, $this->index, $this->score) = [$token, $index, $score];
+    }
+
+    public function getToken() : string
+    {
+        return $this->token;
+    }
+
+    public function getIndex()
+    {
+        return $this->index;
+    }
+
+    public function getScore()
+    {
+        return $this->score;
+    }
+
+    /** Increase the running score by $score. */
+    public function addToScore($score)
+    {
+        $this->score = $this->score + $score;
+    }
+}
diff --git a/src/Tokenizers/VanderleeTokenizer.php b/src/Tokenizers/VanderleeTokenizer.php
@@ -0,0 +1,40 @@
+<?php
+
+namespace TextAnalysis\Tokenizers;
+
+use Sentence;
+
+/**
+ * A wrapper around the sentence tokenizer written by 
+ * vanderlee/php-sentence
+ * @author yooper
+ */
+class VanderleeTokenizer extends TokenizerAbstract
+{
+    /**
+     * Sentence splitter from vanderlee/php-sentence
+     * @var Sentence
+     */
+    protected $sentence = null;
+
+    public function __construct()
+    {
+        $this->sentence = new Sentence;
+    }
+
+    /**
+     * Split the text into sentences, dropping empty ones
+     * @param string $string
+     * @return array sentences re-indexed from 0, filter_empty() alone leaves key gaps
+     */
+    public function tokenize($string): array
+    {
+        return array_values(filter_empty($this->sentence->split($string)));
+    }
+
+    public function __destruct()
+    {
+        unset($this->sentence);
+    }
+
+}
diff --git a/src/helpers/helpers.php b/src/helpers/helpers.php
@@ -228,36 +228,32 @@ function naive_bayes() : \TextAnalysis\Classifiers\NaiveBayes
 }
 
 /**
- * Return an array of filtered tokens
+ * Run every token through the named filter, modifying $tokens in place (by reference)
  * @param array $tokens
  * @param string $filterType
- * @return string[]
  */
-function filter_tokens(array &$tokens, string $filterType) : array
+function filter_tokens(array &$tokens, string $filterType)
 {
     $className = "\\TextAnalysis\\Filters\\{$filterType}";
     $filter = new $className();
     foreach($tokens as &$token)
     {
         $token = $filter->transform($token);
     }
-    return array_values($tokens);
 }
 
 /**
  * Filter out stop words
  * @param array $tokens
  * @param array $stopwords
- * @return array
  */
-function filter_stopwords(array &$tokens, array &$stopwords) : array
+function filter_stopwords(array &$tokens, array &$stopwords)
 {
     $filter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);      
     foreach($tokens as &$token)
     {
         $token = $filter->transform($token);
     }
-    return array_values($tokens);
 }
 
 /**
@@ -298,4 +294,58 @@ function filter_empty(array $tokens) : array
     return array_filter($tokens);
 }
 
+function score_keeper_sort($a, $b)
+{
+    // usort() comparator: higher scores sort first, equal scores compare as 0
+    list($left, $right) = [$a->getScore(), $b->getScore()];
+    $ordering = ($left < $right) ? 1 : -1;
+    return ($left == $right) ? 0 : $ordering;
+}
+
+/**
+ * Rank the sentences of a text with the Simple frequency summarizer.
+ * @param string $text text to summarize
+ * @param array $stopwords replaced by a space inside sentences and passed to filter_stopwords() for word tokens
+ * @return array lower-cased sentences in the order returned by Simple::summarize()
+ */
+function summary_simple(string $text, array $stopwords = []) : array
+{
+    $sentenceTokensOriginal = (new \TextAnalysis\Tokenizers\VanderleeTokenizer())->tokenize(strtolower($text));
+
+    // Work on a copy so the untouched sentences can be returned at the end.
+    $sentenceTokens = $sentenceTokensOriginal;
+    if(!empty($stopwords)) {
+        // str_replace() maps over an array subject and keeps its keys; this avoids
+        // a by-reference foreach that would leave a dangling reference behind.
+        $sentenceTokens = str_replace($stopwords, " ", $sentenceTokens);
+    }
+
+    foreach(['TrimFilter','QuotesFilter','CharFilter'] as $filterType)
+    {
+        filter_tokens($sentenceTokens, $filterType);
+    }
+
+    $wordTokens = tokenize($text);
+    foreach(['LowerCaseFilter','PunctuationFilter','QuotesFilter','PossessiveNounFilter','CharFilter'] as $filterType)
+    {
+        filter_tokens($wordTokens, $filterType);
+    }
+
+    if(!empty($stopwords)) {
+        filter_stopwords($wordTokens, $stopwords);
+    }
+
+    $summarizer = new \TextAnalysis\Analysis\Summarize\Simple();
+    $scores = $summarizer->summarize(filter_empty( $wordTokens ), $sentenceTokens);
+
+    // Map each score back to its original (unfiltered) sentence, in scored order.
+    $bestSentences = [];
+    foreach($scores as $score)
+    {
+        $bestSentences[] = $sentenceTokensOriginal[$score->getIndex()];
+    }
+    return $bestSentences;
+}
+
+
 
diff --git a/tests/TextAnalysis/Analysis/Summarize/SimpleTest.php b/tests/TextAnalysis/Analysis/Summarize/SimpleTest.php
@@ -0,0 +1,54 @@
+<?php
+
+namespace Tests\TextAnalysis\Analysis\Summarize;
+
+/**
+ * Test out the simple summary algorithm
+ * @author yooper
+ */
+class SimpleTest extends \PHPUnit_Framework_TestCase
+{
+    public function testSimpleWithStopwords()
+    {
+        $stopwords = get_stop_words(VENDOR_DIR."yooper/stop-words/data/stop-words_english_1_en.txt"); 
+        $stopwords = array_map(function($word){ return " {$word} ";}, $stopwords); // space-padded so the str_replace() in summary_simple() matches space-delimited words only
+        $bestSentences = summary_simple($this->getArticle(), $stopwords);     
+        $this->assertCount(13, $bestSentences);
+        $this->assertEquals($this->getTopSentence(), $bestSentences[0]);
+                      
+    }
+    
+    public function testSimpleWithoutStopwords()
+    {
+        $bestSentences = summary_simple($this->getArticle());     
+        $this->assertCount(13, $bestSentences); // NOTE(review): the next assertion is assertNotEquals despite the helper's name; confirm intent
+        $this->assertNotEquals($this->getTopSentenceWithoutStopwords(), $bestSentences[0]);              
+    }    
+    
+    public function getArticle() : string
+    {
+        return <<<TEXT
+According to a Tuesday news release, Houghton County leaders are asking for a slowing of supply donations.
+
+Volunteers and financial donations are still needed, along with dehumidifiers, box fans or large equipment that can be used for excavating, demolition or reconstruction.
+
+"The response to our recent flood disaster has been overwhelming, and the Copper Country cannot be thankful enough for the support that’s been received," said Michael Babcock, the director of marketing and communications at Finlandia University. "However, as of now, volunteers have reached a point where enough normal supplies are on hand. Officials are now asking that the flow of general donations be reduced or stopped with a few exceptions. We know of several additional semi loads coming, but we’re now asking that any additional large deliveries that are planned please be put on hold, unless the items being donated are dehumidifiers, box fans or large equipment that can be used for excavating, demolition or reconstruction."
+
+Volunteers are still needed. The recovery effort is transitioning from initial clean-up to the rehab and reconstruction phase, and additional volunteers are a vital part of that effort.
+
+To donate money, please go to coppercountrystrong.com/donate.
+
+On Friday at 4 p.m. the Flood Relief Supply Distribution at Dee Stadium will be closing. Those in need of supplies are asked to stop by before it closes to get what’s needed for the weekend. Next steps for the distribution center are being evaluated and will be announced as soon as possible.        
+TEXT;
+    }
+    
+    public function getTopSentence()
+    {
+        return '"the response to our recent flood disaster has been overwhelming, and the copper country cannot be thankful enough for the support that\'s been received," said michael babcock, the director of marketing and communications at finlandia university.';
+    }
+    
+    public function getTopSentenceWithoutStopwords()
+    {
+        return 'we know of several additional semi loads coming, but we’re now asking that any additional large deliveries that are planned please be put on hold, unless the items being donated are dehumidifiers, box fans or large equipment that can be used for excavating, demolition or reconstruction."';
+    }
+}
diff --git a/tests/TextAnalysis/Classifiers/NaiveBayesTest.php b/tests/TextAnalysis/Classifiers/NaiveBayesTest.php
@@ -41,9 +41,9 @@ public function testMovieReviews()
         
         $movieReviewTokens = tokenize($this->getMovieReview());
         $stopWords = get_stop_words(VENDOR_DIR."yooper/stop-words/data/stop-words_english_1_en.txt");
-        $movieReviewTokens = filter_stopwords($movieReviewTokens, $stopWords);
-        $movieReviewTokens = filter_tokens($movieReviewTokens, 'PunctuationFilter');
-        $movieReviewTokens = filter_tokens($movieReviewTokens, 'QuotesFilter');
+        filter_stopwords($movieReviewTokens, $stopWords);
+        filter_tokens($movieReviewTokens, 'PunctuationFilter');
+        filter_tokens($movieReviewTokens, 'QuotesFilter');
         $movieReviewTokens = stem($movieReviewTokens);                   
         $this->assertEquals('positive', array_keys($nb->predict($movieReviewTokens))[0]);
         
@@ -58,9 +58,9 @@ protected function getTokenizedReviews(string $filePath) : array
         }
         
         $tokens = tokenize(file_get_contents($filePath));
-        $tokens = filter_tokens($tokens, 'PunctuationFilter');
-        $tokens = filter_tokens($tokens, 'QuotesFilter');
-        $tokens = filter_stopwords($tokens, $stopWords);        
+        filter_tokens($tokens, 'PunctuationFilter');
+        filter_tokens($tokens, 'QuotesFilter');
+        filter_stopwords($tokens, $stopWords);        
         $tokens = stem($tokens);
         $tokens = filter_empty($tokens);
         return $tokens;
diff --git a/tests/TextAnalysis/Tokenizers/VanderleeTokenizerTest.php b/tests/TextAnalysis/Tokenizers/VanderleeTokenizerTest.php
@@ -0,0 +1,25 @@
+<?php
+namespace Tests\TextAnalysis\Tokenizers;
+
+use TextAnalysis\Tokenizers\VanderleeTokenizer;
+
+/**
+ *
+ * @author yooper
+ */
+class VanderleeTokenizerTest extends \PHPUnit_Framework_TestCase
+{
+    /** The sample text should be split into five sentences. */
+    public function testTokenizer()
+    {
+        $sampleText = $this->getText();
+        $this->assertCount(5, (new VanderleeTokenizer())->tokenize($sampleText));
+    }
+
+    protected function getText()
+    {
+        return <<<TEXT
+Hello there, Mr. Smith. What're you doing today... Smith, my friend?\n\nI hope it's good. This last sentence will cost you $2.50! Just kidding :)
+TEXT;
+    }
+}