yooper
diff --git a/‎README.md
Lines changed: 35 additions & 0 deletions b/‎README.md
Lines changed: 35 additions & 0 deletions
diff --git a/‎composer.json
Lines changed: 3 additions & 2 deletions b/‎composer.json
Lines changed: 3 additions & 2 deletions
diff --git a/‎src/Classifiers/NaiveBayes.php
Lines changed: 88 additions & 0 deletions b/‎src/Classifiers/NaiveBayes.php
Lines changed: 88 additions & 0 deletions
diff --git a/‎src/Filters/TrimFilter.php
Lines changed: 20 additions & 0 deletions b/‎src/Filters/TrimFilter.php
Lines changed: 20 additions & 0 deletions
diff --git a/‎src/Interfaces/IClassifier.php
Lines changed: 13 additions & 0 deletions b/‎src/Interfaces/IClassifier.php
Lines changed: 13 additions & 0 deletions
diff --git a/‎src/Models/ScoreKeeper.php
Lines changed: 55 additions & 0 deletions b/‎src/Models/ScoreKeeper.php
Lines changed: 55 additions & 0 deletions
diff --git a/‎src/Sentiment/Vader.php
Lines changed: 4 additions & 0 deletions b/‎src/Sentiment/Vader.php
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/Tokenizers/VanderleeTokenizer.php
Lines changed: 40 additions & 0 deletions b/‎src/Tokenizers/VanderleeTokenizer.php
Lines changed: 40 additions & 0 deletions
@@ -7,6 +7,19 @@ php-text-analysis
 [![Total Downloads](https://poser.pugx.org/yooper/php-text-analysis/downloads)](https://packagist.org/packages/yooper/php-text-analysis)
 
 PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language. 
+There are tools in this library that can perform:
+
+* document classification
+* sentiment analysis
+* document comparison
+* frequency analysis
+* tokenization
+* stemming
+* collocations with Pointwise Mutual Information
+* lexical diversity
+* corpus analysis
+* text summarization
+
 All the documentation for this project can be found in the book and wiki. 
 
 PHP Text Analysis Book & Wiki
@@ -88,3 +101,25 @@ your data prior to using. Second parameter is the ngram size of your keywords to
 $rake = rake($tokens, 3);
 $results = $rake->getKeywordScores();
 ```
+
+### Sentiment Analysis with Vader
+Need sentiment analysis with PHP? Use Vader, https://github.com/cjhutto/vaderSentiment.
+The PHP implementation can be invoked easily. Just normalize your data beforehand.
+```php
+$sentimentScores = vader($tokens);
+```
+
+### Document Classification with Naive Bayes
+Need to do some document classification with PHP? Try using the Naive Bayes
+implementation. An example of classifying movie reviews can be found in the unit
+tests.
+
+```php
+$nb = naive_bayes();
+$nb->train('mexican', tokenize('taco nacho enchilada burrito'));        
+$nb->train('american', tokenize('hamburger burger fries pop'));  
+$nb->predict(tokenize('my favorite food is a burrito'));
+```
+
+
+
@@ -1,7 +1,7 @@
 {
     "name": "yooper/php-text-analysis",
     "description": "PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language",
-    "keywords": ["nlp","ir","text analysis","natural language processing"],
+    "keywords": ["nlp","ir","text analysis","natural language processing", "text classification"],
     "license": "MIT",
     "authors": [
     {
@@ -29,7 +29,8 @@
         "symfony/console": "~2.7|~3.4|~4.0",
         "camspiers/porter-stemmer": "~1",
         "wamania/php-stemmer": "~1",
-        "yooper/nicknames": "~1"
+        "yooper/nicknames": "~1",
+        "vanderlee/php-sentence": "^1.0"
     },
     "require-dev": {
         "phpunit/phpunit": "~5",
 
@@ -0,0 +1,88 @@
+<?php
+
+namespace TextAnalysis\Classifiers;
+
+/**
+ * Implementation of Naive Bayes algorithm, borrowed heavily from 
+ * https://github.com/fieg/bayes
+ * @author yooper
+ */
+class NaiveBayes implements \TextAnalysis\Interfaces\IClassifier
+{
+    /**
+     * Token frequencies per label: label => [token => count]
+     * @var array
+     */
+    protected $labels = [];
+
+    /**
+     * Number of training documents seen per label: label => count
+     * @var int[]
+     */
+    protected $labelCount = [];
+
+    /**
+     * Token frequencies across all labels: token => count
+     * @var int[]
+     */
+    protected $tokenCount = [];
+
+    /**
+     * Add one training document to the model.
+     *
+     * Each call counts as a single document for the label; every token
+     * occurrence is added to both the per-label and the global counters.
+     *
+     * @param string $label class the document belongs to
+     * @param array $tokens tokens of the document (strings or integers)
+     */
+    public function train(string $label, array $tokens)
+    {
+        if (!isset($this->labels[$label])) {
+            $this->labels[$label] = [];
+            $this->labelCount[$label] = 0;
+        }
+
+        $this->labelCount[$label]++;
+
+        foreach (array_count_values($tokens) as $token => $count) {
+            $this->tokenCount[$token] = ($this->tokenCount[$token] ?? 0) + $count;
+            $this->labels[$label][$token] = ($this->labels[$label][$token] ?? 0) + $count;
+        }
+    }
+
+    /**
+     * Score the given tokens against every trained label.
+     *
+     * A label is only scored when at least one training document exists
+     * under a different label; otherwise there is nothing to compare
+     * against and the label is left out of the result. Consequently a model
+     * trained on fewer than two labels returns an empty array.
+     *
+     * @param array $tokens tokens of the document to classify
+     * @return array label => score between 0 and 1, highest score first
+     */
+    public function predict(array $tokens)
+    {
+        $totalDocs = $this->getDocCount();
+        $scores = [];
+
+        foreach ($this->labelCount as $label => $docCount) {
+            $inversedDocCount = $totalDocs - $docCount;
+
+            // Every training document carries this label, so the reciprocal
+            // below would divide by zero. Skip the label instead.
+            if ($docCount <= 0 || $inversedDocCount <= 0) {
+                continue;
+            }
+
+            $docCountReciprocal = 1 / $docCount;
+            $inversedDocCountReciprocal = 1 / $inversedDocCount;
+            $sum = 0;
+
+            foreach ($tokens as $token) {
+                // Unseen tokens count as 1 so the ratio below never divides by zero
+                $totalTokenCount = $this->tokenCount[$token] ?? 1;
+                $tokenCount = $this->labels[$label][$token] ?? 0;
+                $inversedTokenCount = $totalTokenCount - $tokenCount;
+
+                $tokenProbabilityPositive = $tokenCount * $docCountReciprocal;
+                $tokenProbabilityNegative = $inversedTokenCount * $inversedDocCountReciprocal;
+                $probability = $tokenProbabilityPositive / ($tokenProbabilityPositive + $tokenProbabilityNegative);
+
+                // Pull the estimate toward 0.5; the more often the token was
+                // seen, the less it moves. Keeps the value strictly inside
+                // (0, 1) so both logarithms below are defined.
+                $probability = (0.5 + ($totalTokenCount * $probability)) / (1 + $totalTokenCount);
+                $sum += log(1 - $probability) - log($probability);
+            }
+
+            // Map the accumulated log ratio back into a 0..1 score
+            $scores[$label] = 1 / (1 + exp($sum));
+        }
+
+        arsort($scores, SORT_NUMERIC);
+        return $scores;
+    }
+
+    /**
+     * Total number of training documents across all labels.
+     *
+     * @return int 0 when nothing has been trained yet
+     */
+    public function getDocCount() : int
+    {
+        return array_sum($this->labelCount);
+    }
+
+    public function __destruct()
+    {
+        unset($this->labelCount);
+        unset($this->labels);
+        unset($this->tokenCount);
+    }
+}
@@ -0,0 +1,20 @@
+<?php
+
+namespace TextAnalysis\Filters;
+
+use TextAnalysis\Interfaces\ITokenTransformation;
+
+
+/**
+ *
+ * @author yooper
+ */
+class TrimFilter implements ITokenTransformation
+{
+    /**
+     * Strip leading and trailing characters from a token using PHP's
+     * default trim() character set.
+     *
+     * @param string $word token to clean up
+     * @return string the trimmed token
+     */
+    public function transform($word)
+    {
+        $trimmed = trim($word);
+
+        return $trimmed;
+    }
+}
@@ -0,0 +1,13 @@
+<?php
+namespace TextAnalysis\Interfaces;
+
+/**
+ * Used by classifier algorithms
+ * @author yooper
+ */
+interface IClassifier 
+{
+    /**
+     * Feed one labelled example to the classifier.
+     *
+     * @param string $label class the example belongs to
+     * @param array $tokens tokens of the example
+     */
+    public function train(string $label, array $tokens);
+    
+    /**
+     * Classify the given tokens.
+     *
+     * The return value is not constrained by this interface; the NaiveBayes
+     * implementation returns label => score pairs sorted best first.
+     *
+     * @param array $tokens tokens of the document to classify
+     */
+    public function predict(array $tokens);
+}
@@ -0,0 +1,55 @@
+<?php
+
+namespace TextAnalysis\Models;
+
+/**
+ * Track metrics of tokenization
+ * @author yooper
+ */
+class ScoreKeeper
+{
+    /**
+     * The token being tracked
+     * @var string
+     */
+    protected $token;
+
+    /**
+     * Running score for the token
+     * @var mixed
+     */
+    protected $score;
+
+    /**
+     * Index supplied at construction time
+     * @var mixed
+     */
+    protected $index;
+
+    /**
+     * @param string $token token to track
+     * @param mixed $index index associated with the token
+     * @param mixed $score starting score, 0 unless given
+     */
+    public function __construct(string $token, $index, $score = 0)
+    {
+        $this->score = $score;
+        $this->index = $index;
+        $this->token = $token;
+    }
+
+    /**
+     * @return string the tracked token
+     */
+    public function getToken() : string
+    {
+        return $this->token;
+    }
+
+    /**
+     * @return mixed the index given to the constructor
+     */
+    public function getIndex()
+    {
+        return $this->index;
+    }
+
+    /**
+     * @return mixed the current score
+     */
+    public function getScore()
+    {
+        return $this->score;
+    }
+
+    /**
+     * Increase the current score by the given amount.
+     *
+     * @param mixed $score amount to add
+     */
+    public function addToScore($score)
+    {
+        $this->score = $this->score + $score;
+    }
+}
@@ -285,6 +285,10 @@ public function idiomsCheck(float $valence, array $tokens, int $index)
      */
     public function leastCheck(float $valence, array $tokens, int $index) : float
     {
+        if($index === 0) {
+            return $valence;
+        }
+        
         $inLexicon = isset($this->getLexicon()[strtolower($tokens[$index-1])]);
 
         if($inLexicon) {
 
@@ -0,0 +1,40 @@
+<?php
+
+namespace TextAnalysis\Tokenizers;
+
+use Sentence;
+
+/**
+ * A wrapper around the sentence tokenizer written by 
+ * vanderlee/php-sentence
+ * @author yooper
+ */
+class VanderleeTokenizer extends TokenizerAbstract
+{
+    /**
+     * Sentence splitter this tokenizer delegates to
+     * @var Sentence
+     */
+    protected $sentence = null;
+
+    /**
+     * Create the underlying sentence splitter.
+     */
+    public function __construct()
+    {
+        $this->sentence = new Sentence();
+    }
+
+    /**
+     * Break the text into sentences.
+     *
+     * The splitter's output is passed through the filter_empty() helper
+     * before being returned (presumably to drop empty entries; see that
+     * helper for the exact rules).
+     *
+     * @param string $string text to split
+     * @return array
+     */
+    public function tokenize($string): array
+    {
+        $sentences = $this->sentence->split($string);
+
+        return filter_empty($sentences);
+    }
+
+    /**
+     * Release the splitter instance.
+     */
+    public function __destruct()
+    {
+        unset($this->sentence);
+    }
+}