Skip to content

Commit 9619882

Browse files
committed
Merge branch 'master' of github.com:yooper/php-text-analysis
2 parents ad08210 + b436f21 commit 9619882

File tree

14 files changed

+549
-29
lines changed

14 files changed

+549
-29
lines changed

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@ php-text-analysis
77
[![Total Downloads](https://poser.pugx.org/yooper/php-text-analysis/downloads)](https://packagist.org/packages/yooper/php-text-analysis)
88

99
PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language.
10+
There are tools in this library that can perform:
11+
12+
* document classification
13+
* sentiment analysis
14+
* compare documents
15+
* frequency analysis
16+
* tokenization
17+
* stemming
18+
* collocations with Pointwise Mutual Information
19+
* lexical diversity
20+
* corpus analysis
21+
* text summarization
22+
1023
All the documentation for this project can be found in the book and wiki.
1124

1225
PHP Text Analysis Book & Wiki
@@ -88,3 +101,25 @@ your data prior to using. Second parameter is the ngram size of your keywords to
88101
$rake = rake($tokens, 3);
89102
$results = $rake->getKeywordScores();
90103
```
104+
105+
### Sentiment Analysis with Vader
106+
Need sentiment analysis with PHP? Use Vader, https://github.com/cjhutto/vaderSentiment.
107+
The PHP implementation can be invoked easily. Just normalize your data beforehand.
108+
```php
109+
$sentimentScores = vader($tokens);
110+
```
111+
112+
### Document Classification with Naive Bayes
113+
Need to do some document classification with PHP? Try using the Naive Bayes
114+
implementation. An example of classifying movie reviews can be found in the unit
115+
tests
116+
117+
```php
118+
$nb = naive_bayes();
119+
$nb->train('mexican', tokenize('taco nacho enchilada burrito'));
120+
$nb->train('american', tokenize('hamburger burger fries pop'));
121+
$nb->predict(tokenize('my favorite food is a burrito'));
122+
```
123+
124+
125+

composer.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "yooper/php-text-analysis",
33
"description": "PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language",
4-
"keywords": ["nlp","ir","text analysis","natural language processing"],
4+
"keywords": ["nlp","ir","text analysis","natural language processing", "text classification"],
55
"license": "MIT",
66
"authors": [
77
{
@@ -29,7 +29,8 @@
2929
"symfony/console": "~2.7|~3.4|~4.0",
3030
"camspiers/porter-stemmer": "~1",
3131
"wamania/php-stemmer": "~1",
32-
"yooper/nicknames": "~1"
32+
"yooper/nicknames": "~1",
33+
"vanderlee/php-sentence": "^1.0"
3334
},
3435
"require-dev": {
3536
"phpunit/phpunit": "~5",

src/Classifiers/NaiveBayes.php

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
<?php
2+
3+
namespace TextAnalysis\Classifiers;
4+
5+
/**
6+
* Implementation of Naive Bayes algorithm, borrowed heavily from
7+
* https://github.com/fieg/bayes
8+
* @author yooper
9+
*/
10+
class NaiveBayes implements \TextAnalysis\Interfaces\IClassifier
{
    /**
     * Per-label token frequencies: label => [token => count]
     * @var array
     */
    protected $labels = [];

    /**
     * Number of training documents seen for each label: label => count
     * @var int[]
     */
    protected $labelCount = [];

    /**
     * Token frequencies across every label: token => count
     * @var int[]
     */
    protected $tokenCount = [];

    /**
     * Add one training document to the model.
     *
     * @param string $label the class the document belongs to
     * @param array $tokens the document's tokens (strings or ints, as
     * required by array_count_values)
     */
    public function train(string $label, array $tokens)
    {
        $freqDist = array_count_values($tokens);

        if (!isset($this->labels[$label])) {
            $this->labels[$label] = [];
            $this->labelCount[$label] = 0;
        }

        $this->labelCount[$label]++;

        foreach ($freqDist as $token => $count) {
            $this->tokenCount[$token] = ($this->tokenCount[$token] ?? 0) + $count;
            $this->labels[$label][$token] = ($this->labels[$label][$token] ?? 0) + $count;
        }
    }

    /**
     * Score every trained label against the given tokens.
     *
     * @param array $tokens tokens of the document to classify
     * @return array label => score, sorted from highest to lowest score;
     * empty when nothing has been trained
     */
    public function predict(array $tokens)
    {
        $totalDocs = $this->getDocCount();
        $scores = [];

        foreach ($this->labelCount as $label => $docCount) {
            $sum = 0;
            $inversedDocCount = $totalDocs - $docCount;
            // train() always increments the count when it creates a label,
            // so $docCount is at least 1 here
            $docCountReciprocal = 1 / $docCount;
            // When only one label has been trained there are no documents
            // outside of it; use 0 instead of dividing by zero
            $inversedDocCountReciprocal = $inversedDocCount > 0 ? 1 / $inversedDocCount : 0;

            foreach ($tokens as $token) {
                $totalTokenCount = $this->tokenCount[$token] ?? 1; // prevent division by zero
                $tokenCount = $this->labels[$label][$token] ?? 0;
                $inversedTokenCount = $totalTokenCount - $tokenCount;
                $tokenProbabilityPositive = $tokenCount * $docCountReciprocal;
                $tokenProbabilityNegative = $inversedTokenCount * $inversedDocCountReciprocal;

                $evidence = $tokenProbabilityPositive + $tokenProbabilityNegative;
                if ($evidence <= 0) {
                    // The token says nothing for or against this label
                    // (only reachable with a single trained label); skip it
                    // rather than computing 0 / 0
                    continue;
                }

                $probability = $tokenProbabilityPositive / $evidence;
                // Pull the estimate toward 0.5, weighted by how often the
                // token was seen; this keeps it strictly between 0 and 1 so
                // both log() calls below are defined
                $probability = (0.5 + ($totalTokenCount * $probability)) / (1 + $totalTokenCount);
                $sum += log(1 - $probability) - log($probability);
            }

            $scores[$label] = 1 / (1 + exp($sum));
        }

        arsort($scores, SORT_NUMERIC);

        return $scores;
    }

    /**
     * Total number of documents passed to train(), across all labels.
     *
     * @return int
     */
    public function getDocCount() : int
    {
        // array_sum() returns 0 for an empty array, so no fallback is needed
        return array_sum($this->labelCount);
    }

    public function __destruct()
    {
        unset($this->labelCount);
        unset($this->labels);
        unset($this->tokenCount);
    }
}

src/Filters/TrimFilter.php

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
namespace TextAnalysis\Filters;
4+
5+
use TextAnalysis\Interfaces\ITokenTransformation;
6+
7+
8+
/**
9+
*
10+
* @author yooper
11+
*/
12+
class TrimFilter implements ITokenTransformation
{
    /**
     * Strip leading and trailing characters from a token using trim()'s
     * default character list.
     *
     * @param string $word
     * @return string
     */
    public function transform($word)
    {
        $trimmed = trim($word);

        return $trimmed;
    }
}

src/Interfaces/IClassifier.php

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<?php
2+
namespace TextAnalysis\Interfaces;
3+
4+
/**
5+
* Used by classifier algorithms
6+
* @author yooper
7+
*/
8+
interface IClassifier
{
    /**
     * Feed one labeled, tokenized document to the classifier.
     *
     * @param string $label the class the tokens belong to
     * @param array $tokens
     */
    public function train(string $label, array $tokens);

    /**
     * Classify a tokenized document.
     *
     * @param array $tokens
     * @return mixed the result format is defined by the implementation
     */
    public function predict(array $tokens);
}

src/Models/ScoreKeeper.php

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
<?php
2+
3+
namespace TextAnalysis\Models;
4+
5+
/**
6+
* Track metrics of tokenization
7+
* @author yooper
8+
*/
9+
class ScoreKeeper
{
    /**
     * The token being scored
     * @var string
     */
    protected $token;

    /**
     * Running score for the token
     * @var mixed
     */
    protected $score;

    /**
     * Caller-supplied index associated with the token
     * @var mixed
     */
    protected $index;

    /**
     * @param string $token
     * @param mixed $index
     * @param mixed $score starting score, defaults to 0
     */
    public function __construct(string $token, $index, $score = 0)
    {
        $this->score = $score;
        $this->index = $index;
        $this->token = $token;
    }

    /**
     * Add the given amount to the running score.
     *
     * @param mixed $score
     */
    public function addToScore($score)
    {
        $this->score = $this->score + $score;
    }

    /**
     * @return mixed the current score
     */
    public function getScore()
    {
        return $this->score;
    }

    /**
     * @return mixed the index given at construction
     */
    public function getIndex()
    {
        return $this->index;
    }

    /**
     * @return string the token given at construction
     */
    public function getToken() : string
    {
        return $this->token;
    }
}

src/Sentiment/Vader.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,10 @@ public function idiomsCheck(float $valence, array $tokens, int $index)
285285
*/
286286
public function leastCheck(float $valence, array $tokens, int $index) : float
287287
{
288+
if($index === 0) {
289+
return $valence;
290+
}
291+
288292
$inLexicon = isset($this->getLexicon()[strtolower($tokens[$index-1])]);
289293

290294
if($inLexicon) {

src/Tokenizers/VanderleeTokenizer.php

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<?php
2+
3+
namespace TextAnalysis\Tokenizers;
4+
5+
use Sentence;
6+
7+
/**
8+
* A wrapper around the sentence tokenizer written by
9+
* vanderlee/php-sentence
10+
* @author yooper
11+
*/
12+
class VanderleeTokenizer extends TokenizerAbstract
{
    /**
     * The wrapped vanderlee/php-sentence splitter
     * @var Sentence
     */
    protected $sentence = null;

    public function __construct()
    {
        $this->sentence = new Sentence();
    }

    /**
     * Split the text into sentences.
     *
     * The splitter's output is passed through the filter_empty() helper,
     * which is defined elsewhere (presumably it drops empty entries; confirm
     * against its definition).
     *
     * @param string $string
     * @return array
     */
    public function tokenize($string): array
    {
        $sentences = $this->sentence->split($string);

        return filter_empty($sentences);
    }

    public function __destruct()
    {
        unset($this->sentence);
    }
}

0 commit comments

Comments
 (0)