Merge pull request #40 from yooper/sentence_summarizer

yooper · web-flow · commit 892d543f94c2 · 2018-07-19T13:03:54.000-04:00
Sentence summarizer
diff --git a/README.md b/README.md
@@ -7,6 +7,19 @@ php-text-analysis
 [![Total Downloads](https://poser.pugx.org/yooper/php-text-analysis/downloads)](https://packagist.org/packages/yooper/php-text-analysis)
 
 PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language. 
+There are tools in this library that can perform:
+
+* document classification
+* sentiment analysis
+* document comparison
+* frequency analysis
+* tokenization
+* stemming
+* collocations with Pointwise Mutual Information
+* lexical diversity
+* corpus analysis
+* text summarization
+
 All the documentation for this project can be found in the book and wiki. 
 
 PHP Text Analysis Book & Wiki
@@ -88,3 +101,25 @@ your data prior to using. Second parameter is the ngram size of your keywords to
 $rake = rake($tokens, 3);
 $results = $rake->getKeywordScores();
 ```
+
+### Sentiment Analysis with Vader
+Need sentiment analysis with PHP? Use Vader: https://github.com/cjhutto/vaderSentiment.
+The PHP implementation can be invoked easily. Just normalize your data beforehand.
+```php
+$sentimentScores = vader($tokens);
+```
+
+### Document Classification with Naive Bayes
+Need to do some document classification with PHP? Try using the Naive Bayes
+implementation. An example of classifying movie reviews can be found in the unit
+tests.
+
+```php
+$nb = naive_bayes();
+$nb->train('mexican', tokenize('taco nacho enchilada burrito'));        
+$nb->train('american', tokenize('hamburger burger fries pop'));  
+$nb->predict(tokenize('my favorite food is a burrito'));
+```
+
+
+
diff --git a/composer.json b/composer.json
@@ -29,7 +29,8 @@
         "symfony/console": "~2.7|~3.4|~4.0",
         "camspiers/porter-stemmer": "~1",
         "wamania/php-stemmer": "~1",
-        "yooper/nicknames": "~1"
+        "yooper/nicknames": "~1",
+        "vanderlee/php-sentence": "^1.0"
     },
     "require-dev": {
         "phpunit/phpunit": "~5",
diff --git a/src/Analysis/Summarize/Simple.php b/src/Analysis/Summarize/Simple.php
@@ -0,0 +1,44 @@
+<?php
+
+namespace TextAnalysis\Analysis\Summarize;
+
+use TextAnalysis\Models\ScoreKeeper;
+
+/**
+ * Scores sentences by the frequency counts of the word tokens they contain
+ * @author yooper
+ */
+class Simple
+{
+    /**
+     * Score each sentence, highest first. Sentence keys (gaps allowed, e.g.
+     * after array_filter) are kept as the ScoreKeeper index.
+     * @param array $wordTokens normalized word tokens of the whole text
+     * @param array $sentenceTokens normalized sentences to be ranked
+     * @return ScoreKeeper[] ordered by score, descending
+     */
+    public function summarize(array $wordTokens, array $sentenceTokens) : array
+    {
+        $tokenCounts = array_count_values($wordTokens);
+        $scoreKeepers = [];
+        foreach($sentenceTokens as $index => $sentence)
+        {
+            $scoreKeepers[] = new ScoreKeeper($sentence, $index);
+        }
+
+        foreach($tokenCounts as $token => $freq)
+        {
+            // numeric tokens become int keys; strpos needs a string needle
+            $needle = (string)$token;
+            foreach($scoreKeepers as $sentenceKeeper)
+            {
+                if(strpos($sentenceKeeper->getToken(), $needle) !== false) {
+                    $sentenceKeeper->addToScore($freq);
+                }
+            }
+        }
+
+        usort($scoreKeepers, 'score_keeper_sort');
+        return $scoreKeepers;
+    }
+}
diff --git a/src/Classifiers/NaiveBayes.php b/src/Classifiers/NaiveBayes.php
@@ -23,7 +23,7 @@ class NaiveBayes implements \TextAnalysis\Interfaces\IClassifier
     
     /**
      * Track the token counts
-     * @var array[int]
+     * @var int[]
      */
     protected $tokenCount = [];
             
diff --git a/src/Filters/TrimFilter.php b/src/Filters/TrimFilter.php
@@ -0,0 +1,20 @@
+<?php
+
+namespace TextAnalysis\Filters;
+
+use TextAnalysis\Interfaces\ITokenTransformation;
+
+
+/**
+ * Token transformation that passes each token through PHP's trim()
+ * @author yooper
+ */
+class TrimFilter implements ITokenTransformation
+{
+    // Returns $word with trim()'s default characters stripped from both ends
+    public function transform($word) 
+    {
+        return trim($word);
+    }
+
+}
diff --git a/src/Models/ScoreKeeper.php b/src/Models/ScoreKeeper.php
@@ -0,0 +1,55 @@
+<?php
+
+namespace TextAnalysis\Models;
+
+/**
+ * Pairs a token with a caller-supplied index and a running score
+ * @author yooper
+ */
+class ScoreKeeper
+{
+    /**
+     * The text being scored
+     * @var string
+     */
+    protected $token;
+
+    /**
+     * Accumulated score, grows through addToScore()
+     * @var mixed
+     */
+    protected $score;
+
+    /**
+     * Position or key the caller associated with the token
+     * @var mixed
+     */
+    protected $index;
+
+    public function __construct(string $token, $index, $score = 0)
+    {
+        $this->score = $score;
+        $this->index = $index;
+        $this->token = $token;
+    }
+
+    public function addToScore($score)
+    {
+        $this->score = $this->score + $score;
+    }
+
+    public function getScore()
+    {
+        return $this->score;
+    }
+
+    public function getIndex()
+    {
+        return $this->index;
+    }
+
+    public function getToken() : string
+    {
+        return $this->token;
+    }
+}
diff --git a/src/Tokenizers/VanderleeTokenizer.php b/src/Tokenizers/VanderleeTokenizer.php
@@ -0,0 +1,40 @@
+<?php
+
+namespace TextAnalysis\Tokenizers;
+
+use Sentence;
+
+/**
+ * Sentence tokenizer backed by the Sentence class from
+ * vanderlee/php-sentence
+ * @author yooper
+ */
+class VanderleeTokenizer extends TokenizerAbstract
+{
+    /**
+     * Splitter instance, created in the constructor and reused by tokenize()
+     * @var Sentence
+     */
+    protected $sentence = null;
+
+    public function __construct()
+    {
+        $this->sentence = new Sentence;
+    }
+
+    /**
+     * Split the text into sentences, dropping blank ones and reindexing from 0
+     * @param string $string
+     * @return array
+     */
+    public function tokenize($string): array
+    {
+        return array_values(filter_empty($this->sentence->split($string)));
+    }
+
+    public function __destruct()
+    {
+        unset($this->sentence);
+    }
+
+}
diff --git a/src/helpers/helpers.php b/src/helpers/helpers.php
@@ -133,7 +133,11 @@ function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keyword
     function stem(array $tokens, string $stemmerClassName = \TextAnalysis\Stemmers\PorterStemmer::class): array 
     {
 	$stemmer = new $stemmerClassName();
-        return array_map(function($token) use($stemmer){ return $stemmer->stem($token); }, $tokens);
+        foreach($tokens as &$token)
+        {
+            $token = $stemmer->stem($token);
+        }
+        return $tokens;
     }
 }
 
@@ -224,28 +228,32 @@ function naive_bayes() : \TextAnalysis\Classifiers\NaiveBayes
 }
 
 /**
- * Return an array of filtered tokens
+ * Pass the tokens in by reference and modify them
  * @param array $tokens
  * @param string $filterType
- * @return string[]
  */
-function filter_tokens(array $tokens, string $filterType) : array
+function filter_tokens(array &$tokens, string $filterType)
 {
     $className = "\\TextAnalysis\\Filters\\{$filterType}";
     $filter = new $className();
-    return array_values( array_map(function($token) use($filter){ return $filter->transform($token);}, $tokens));
+    foreach($tokens as &$token)
+    {
+        $token = $filter->transform($token);
+    }
 }
 
 /**
  * Filter out stop words
  * @param array $tokens
  * @param array $stopwords
- * @return array
  */
-function filter_stopwords(array $tokens, array $stopwords) : array
+function filter_stopwords(array &$tokens, array &$stopwords)
 {
-    $filter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);
-    return array_values( array_map(function($token) use($filter){ return $filter->transform($token);}, $tokens));    
+    $filter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);      
+    foreach($tokens as &$token)
+    {
+        $token = $filter->transform($token);
+    }
 }
 
 /**
@@ -255,9 +263,89 @@ function filter_stopwords(array $tokens, array $stopwords) : array
  */
 function get_stop_words(string $filePath) : array
 {
-    return array_map('trim', file($filePath));    
+    $rows = file($filePath);
+    array_walk($rows, function(&$value){ $value = trim($value); });
+    return $rows;
 }
 
+/**
+ * Score the tokens with the Vader sentiment analyzer
+ * @return array the polarity scores
+ */
+function vader(array $tokens) : array
+{
+    $analyzer = new \TextAnalysis\Sentiment\Vader();
+    return $analyzer->getPolarityScores($tokens);
+}
+
+/**
+ * Filter out all null and blank (empty or whitespace-only) tokens.
+ * Surviving tokens keep their original keys and are not trimmed.
+ * @param array $tokens
+ * @return string[]
+ */
+function filter_empty(array $tokens) : array
+{
+    return array_filter($tokens, function($token)
+    {
+        // compare against '' explicitly: empty() would also discard the
+        // legitimate token "0"
+        return $token !== null && trim($token) !== '';
+    });
+}
+
+function score_keeper_sort($a, $b)
+{
+    // usort() comparator: the higher score sorts first
+    $left = $a->getScore();
+    $right = $b->getScore();
+    return ($left < $right) ? 1 : (($left == $right) ? 0 : -1);
+}
+
+/**
+ * Rank the sentences of $text for use as a summary, best sentence first.
+ * Stop words are stripped from the sentences as whole words before scoring.
+ * @param string $text
+ * @param array $stopwords
+ * @return array lower-cased sentences, highest scoring first
+ */
+function summary_simple(string $text, array $stopwords = []) : array
+{
+    // reindex, filter_empty() can leave gaps and sentences are looked up by position
+    $sentenceTokensOriginal = array_values((new \TextAnalysis\Tokenizers\VanderleeTokenizer())->tokenize(strtolower($text)));
+
+    //create copy
+    $sentenceTokens = $sentenceTokensOriginal;
+    $quoted = array_map(function($word){ return preg_quote($word, '/'); }, array_filter($stopwords, 'strlen'));
+    if(!empty($quoted)) {
+        // \b keeps a stop word such as "a" from being cut out of longer words
+        $pattern = '/\b(?:'.implode('|', $quoted).')\b/u';
+        foreach($sentenceTokens as $key => $sentence)
+        {
+            // preg_replace yields null on failure (e.g. invalid UTF-8); keep the sentence then
+            $sentenceTokens[$key] = preg_replace($pattern, ' ', $sentence) ?? $sentence;
+        }
+    }
+
+    filter_tokens($sentenceTokens, 'TrimFilter');
+    filter_tokens($sentenceTokens, 'QuotesFilter');
+    filter_tokens($sentenceTokens, 'CharFilter');
+
+    $wordTokens = tokenize($text);
+    foreach(['LowerCaseFilter','PunctuationFilter','QuotesFilter','PossessiveNounFilter','CharFilter'] as $filterType)
+    {
+        filter_tokens($wordTokens, $filterType);
+    }
+    if(!empty($stopwords)) {
+        filter_stopwords($wordTokens, $stopwords);
+    }
+
+    $scores = (new \TextAnalysis\Analysis\Summarize\Simple())->summarize(filter_empty( $wordTokens ), $sentenceTokens);
+    // map each score back to its original sentence, best first
+    return array_map(function($score) use($sentenceTokensOriginal){
+        return $sentenceTokensOriginal[$score->getIndex()];
+    }, $scores);
+}
 
 
 
diff --git a/src/helpers/storage.php b/src/helpers/storage.php
diff --git a/tests/TextAnalysis/Analysis/Summarize/SimpleTest.php b/tests/TextAnalysis/Analysis/Summarize/SimpleTest.php
diff --git a/tests/TextAnalysis/Classifiers/NaiveBayesTest.php b/tests/TextAnalysis/Classifiers/NaiveBayesTest.php
diff --git a/tests/TextAnalysis/Tokenizers/VanderleeTokenizerTest.php b/tests/TextAnalysis/Tokenizers/VanderleeTokenizerTest.php