Skip to content

Commit f0abcbd

Browse files
authored
Merge pull request #39 from yooper/bayes
Added Bayes Algorithm
2 parents e520cf2 + 9bec646 commit f0abcbd

File tree

5 files changed

+241
-9
lines changed

5 files changed

+241
-9
lines changed

composer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "yooper/php-text-analysis",
33
"description": "PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language",
4-
"keywords": ["nlp","ir","text analysis","natural language processing"],
4+
"keywords": ["nlp","ir","text analysis","natural language processing", "text classification"],
55
"license": "MIT",
66
"authors": [
77
{

src/Classifiers/NaiveBayes.php

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
<?php
2+
3+
namespace TextAnalysis\Classifiers;
4+
5+
/**
 * Naive Bayes text classifier, borrowed heavily from
 * https://github.com/fieg/bayes
 *
 * The model is trained one document at a time with train() and queried with
 * predict(), which returns a score per known label, best match first.
 *
 * @author yooper
 */
class NaiveBayes implements \TextAnalysis\Interfaces\IClassifier
{
    /**
     * Token frequencies per label: label => [token => count]
     * @var array
     */
    protected $labels = [];

    /**
     * Number of training documents seen per label: label => count
     * @var int[]
     */
    protected $labelCount = [];

    /**
     * Token frequencies across all labels: token => count
     * @var int[]
     */
    protected $tokenCount = [];

    /**
     * Add a single training document to the model.
     *
     * @param string $label the class the document belongs to
     * @param array $tokens the document's tokens; they are counted with
     * array_count_values(), so they must be strings or integers
     */
    public function train(string $label, array $tokens)
    {
        $freqDist = array_count_values($tokens);
        if (!isset($this->labels[$label])) {
            $this->labels[$label] = [];
            $this->labelCount[$label] = 0;
        }

        $this->labelCount[$label]++;
        foreach ($freqDist as $token => $count) {
            $this->tokenCount[$token] = ($this->tokenCount[$token] ?? 0) + $count;
            $this->labels[$label][$token] = ($this->labels[$label][$token] ?? 0) + $count;
        }
    }

    /**
     * Score every trained label against the given tokens.
     *
     * @param array $tokens tokens of the document to classify
     * @return array label => score, sorted by descending score; empty when
     * the model has not been trained
     */
    public function predict(array $tokens)
    {
        $totalDocs = $this->getDocCount();
        $scores = [];

        foreach ($this->labelCount as $label => $docCount) {
            $sum = 0;
            $inversedDocCount = $totalDocs - $docCount;
            $docCountReciprocal = 1 / $docCount;
            // When only one label has been trained there are no documents
            // outside of it; use 0 instead of dividing by zero.
            $inversedDocCountReciprocal = $inversedDocCount > 0 ? 1 / $inversedDocCount : 0;

            foreach ($tokens as $token) {
                $totalTokenCount = $this->tokenCount[$token] ?? 1; // prevent division by zero
                $tokenCount = $this->labels[$label][$token] ?? 0;
                $inversedTokenCount = $totalTokenCount - $tokenCount;
                $tokenProbabilityPositive = $tokenCount * $docCountReciprocal;
                $tokenProbabilityNegative = $inversedTokenCount * $inversedDocCountReciprocal;

                // The denominator can only be zero in the single-label case
                // for a token the label never saw; treat that as neutral.
                $evidence = $tokenProbabilityPositive + $tokenProbabilityNegative;
                $probability = $evidence > 0 ? $tokenProbabilityPositive / $evidence : 0.5;

                // Smooth towards 0.5 so the value stays strictly between 0
                // and 1 and both log() calls below are defined.
                $probability = (0.5 + ($totalTokenCount * $probability)) / (1 + $totalTokenCount);
                $sum += log(1 - $probability) - log($probability);
            }
            $scores[$label] = 1 / (1 + exp($sum));
        }
        arsort($scores, SORT_NUMERIC);
        return $scores;
    }

    /**
     * Total number of documents the model was trained with.
     *
     * @return int
     */
    public function getDocCount() : int
    {
        // array_sum() returns 0 for an empty array, no fallback is needed
        return (int) array_sum($this->labelCount);
    }

    /**
     * Drop the training data when the classifier is destroyed.
     */
    public function __destruct()
    {
        unset($this->labelCount);
        unset($this->labels);
        unset($this->tokenCount);
    }
}

src/Interfaces/IClassifier.php

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<?php
2+
namespace TextAnalysis\Interfaces;
3+
4+
/**
 * Contract shared by the classifier algorithms
 *
 * @author yooper
 */
interface IClassifier
{
    /**
     * Feed one labelled set of tokens to the classifier.
     *
     * @param string $label
     * @param array $tokens
     */
    public function train(string $label, array $tokens);

    /**
     * Ask the classifier about a set of tokens.
     *
     * @param array $tokens
     */
    public function predict(array $tokens);
}

src/helpers/helpers.php

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@ function starts_with( string $haystack, string $needle ): bool {
9090
/**
9191
* @param string $haystack
9292
* @param string $needle
93-
*
9493
* @return bool
9594
*/
9695
function ends_with( string $haystack, string $needle ): bool {
@@ -103,7 +102,6 @@ function ends_with( string $haystack, string $needle ): bool {
103102
* Returns an instance of the TextCorpus
104103
*
105104
* @param string $text
106-
*
107105
* @return \TextAnalysis\Corpus\TextCorpus
108106
*/
109107
function text( string $text ): \TextAnalysis\Corpus\TextCorpus {
@@ -116,8 +114,7 @@ function text( string $text ): \TextAnalysis\Corpus\TextCorpus {
116114
/**
117115
* Returns an instance of the Rake
118116
*
119-
* @param array $tokens
120-
*
117+
* @param string[] $tokens
121118
* @return \TextAnalysis\Analysis\Keywords\Rake
122119
*/
123120
function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keywords\Rake
@@ -130,9 +127,8 @@ function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keyword
130127
/**
131128
* Returns an array of stemmed tokens
132129
*
133-
* @param array $tokens
134-
*
135-
* @return \TextAnalysis\Analysis\Keywords\Rake
130+
* @param string[] $tokens
131+
* @return string[]
136132
*/
137133
function stem(array $tokens, string $stemmerClassName = \TextAnalysis\Stemmers\PorterStemmer::class): array
138134
{
@@ -214,7 +210,52 @@ function gutenberg_list() : array
214210
*/
215211
function scan_dir(string $dir) : array
{
    // scandir() returns false for a missing or unreadable directory; fail
    // with a clear message instead of passing false on to the array functions.
    if (!is_dir($dir)) {
        throw new \RuntimeException("Not a directory: {$dir}");
    }

    $entries = scandir($dir);
    if ($entries === false) {
        throw new \RuntimeException("Unable to read directory: {$dir}");
    }

    // Drop the dot entries and resolve every remaining name to an absolute
    // path. The keys assigned by scandir() are kept as they are.
    $filePaths = array_diff($entries, ['..', '.']);
    return array_map(function ($filePath) use ($dir) {
        return realpath($dir . DIRECTORY_SEPARATOR . $filePath);
    }, $filePaths);
}
216+
217+
/**
 * Convenience factory for the naive bayes classifier
 * @return \TextAnalysis\Classifiers\NaiveBayes a fresh, untrained instance
 */
function naive_bayes() : \TextAnalysis\Classifiers\NaiveBayes
{
    $classifier = new \TextAnalysis\Classifiers\NaiveBayes();
    return $classifier;
}
225+
226+
/**
 * Run every token through the named filter.
 *
 * The filter class is looked up in the \TextAnalysis\Filters namespace and
 * its transform() result is stored for each token, so the returned list has
 * one entry per input token, re-indexed from zero.
 *
 * @param array $tokens
 * @param string $filterType short class name of the filter, e.g. as used by callers: 'PunctuationFilter'
 * @return string[]
 */
function filter_tokens(array $tokens, string $filterType) : array
{
    $filterClass = "\\TextAnalysis\\Filters\\{$filterType}";
    $filter = new $filterClass();

    $transformed = [];
    foreach ($tokens as $token) {
        $transformed[] = $filter->transform($token);
    }
    return $transformed;
}
238+
239+
/**
 * Run every token through a StopWordsFilter built from the given stop words.
 *
 * The result holds whatever the filter's transform() returns for each token,
 * re-indexed from zero.
 *
 * @param array $tokens
 * @param array $stopwords
 * @return array
 */
function filter_stopwords(array $tokens, array $stopwords) : array
{
    $stopWordsFilter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);

    $transformed = [];
    foreach ($tokens as $token) {
        $transformed[] = $stopWordsFilter->transform($token);
    }
    return $transformed;
}
250+
251+
/**
 * Read a new line delimited file into memory, one trimmed entry per line
 * @param string $filePath
 * @return array
 * @throws \RuntimeException when the file cannot be read
 */
function get_stop_words(string $filePath) : array
{
    $lines = file($filePath);
    // file() returns false on failure; array_map() must not receive that
    if ($lines === false) {
        throw new \RuntimeException("Unable to read stop words file: {$filePath}");
    }

    return array_map('trim', $lines);
}
219260

220261

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
<?php
2+
3+
namespace Tests\TextAnalysis\Classifiers;
4+
5+
/**
 * Tests for the naive bayes classifier: a small hand-made training set and,
 * when the movie review corpus is installed, a sentiment check on a real review.
 *
 * @author yooper
 */
class NaiveBayesTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Stop words, loaded once and reused for every review
     * @var array|null
     */
    private $stopWords = null;

    public function testNaiveBayes()
    {
        $nb = naive_bayes();
        $nb->train('mexican', tokenize('taco nacho enchilada burrito'));
        $nb->train('american', tokenize('hamburger burger fries pop'));
        $this->assertSame(['mexican', 'american'], array_keys($nb->predict(tokenize('my favorite food is a burrito'))));
        $this->assertSame(['american', 'mexican'], array_keys($nb->predict(tokenize('my favorite food is pop and fries'))));
    }

    public function testMovieReviews()
    {
        if (getenv('SKIP_TEST') || !is_dir(get_storage_path('corpora/movie_reviews'))) {
            // Report the test as skipped instead of silently passing
            $this->markTestSkipped('The movie_reviews corpus is not available or SKIP_TEST is set');
        }

        $nb = naive_bayes();

        $posFilePaths = scan_dir(get_storage_path('corpora/movie_reviews/pos'));
        foreach ($posFilePaths as $filePath) {
            $nb->train('positive', $this->getTokenizedReviews($filePath));
        }

        $negFilePaths = scan_dir(get_storage_path('corpora/movie_reviews/neg'));
        foreach ($negFilePaths as $filePath) {
            $nb->train('negative', $this->getTokenizedReviews($filePath));
        }

        // The review under test goes through the same pipeline as the training data
        $movieReviewTokens = $this->prepareTokens($this->getMovieReview());
        $this->assertSame('positive', array_keys($nb->predict($movieReviewTokens))[0]);
    }

    /**
     * Load a review file and turn it into normalized tokens.
     *
     * @param string $filePath
     * @return array
     */
    protected function getTokenizedReviews(string $filePath) : array
    {
        return $this->prepareTokens(file_get_contents($filePath));
    }

    /**
     * Tokenize the text, then apply the stop word, punctuation and quote
     * filters followed by stemming.
     *
     * @param string $text
     * @return array
     */
    private function prepareTokens(string $text) : array
    {
        if ($this->stopWords === null) {
            // Read the stop word list once instead of once per review
            $this->stopWords = get_stop_words(VENDOR_DIR."yooper/stop-words/data/stop-words_english_1_en.txt");
        }

        $tokens = tokenize($text);
        $tokens = filter_stopwords($tokens, $this->stopWords);
        $tokens = filter_tokens($tokens, 'PunctuationFilter');
        $tokens = filter_tokens($tokens, 'QuotesFilter');
        return stem($tokens);
    }

    /**
     * Taken from https://www.rollingstone.com/movies/reviews/incredibles-2-movie-review-pixar-w521419
     * @return string
     */
    protected function getMovieReview() : string
    {
        return <<<TEXT
It really is incredible. Yes, the sequel to Brad Bird's 2004 classic is not the groundbreaker that stormed the multiplex 14 years ago – you only get to be shiny new once. Pixar's animated miracle didn't look or sound like anything else, being about a family of superheroes forced into retirement by a legal system that didn't care for the collateral damage caused by their do-gooding antics. How many family films dealt with midlife crisis, marital dysfunction, child neglect, impotence fears, fashion faux pas and existential angst? But this follow-up is every bit the start-to-finish sensation as the original, and you'll be happy to know that Bird's subversive spirit is alive and thriving. The kiddies probably won't notice – they'll be too distracted by all the whooshing derring-do – but like its Oscar-winning predecessor, The Incredibles 2 doesn't ring cartoonish. It rings true.

RELATED

25 Best Pixar Movie Characters
From Buzz Lightyear to Bing Bong, the most memorable heroes and villains from groundbreaking animation giants

It may have taken years for Bird & co. to get this sequel together, but the action picks up right where the original left off, as if it were yesterday. The Parr family – mom Helen (voiced by Holly Hunter), dad Bob (Craig T. Nelson), 14-year-old Violet (Sarah Vowell), 10-year-old Dash (Huckleberry Milner) and baby Jack-Jack (Eli Fucile) – is still in exile, massively frustrated by being forced to keep their powers in check. All of which goes out the window when a villain named the Underminer (John Ratzenberger) starts raising hell in Municiberg. Nothing like heroics to get the family out of its funk.

But there's a difference this time. Helen, a.k.a. Elastigirl, takes charge, leaving Mr. Incredible to stay home with his teen daughter and the tots. Female empowerment suits Mom, as she stops a runaway train from wreaking havoc thanks to her quick thinking and stretchable arms. It's a great action sequence – and some great voicework from Hunter, whose vocals can stretch from subtle to pow and all stops in between. This is her show and she makes the newly emboldened character resonate onscreen like nobody's business.

Elastigirl clearly likes getting costumed up again and back into the thick of it. She's not alone: Telecommunications tycoon Winston Deavor (Bob Odenkirk) thinks the time is now to get the Incredibles back into the public's good graces. With the help of his tech-nerd sister Evelyn (Catherine Keener – talk about great voices!), he launches a campaign to make superheroes popular again. Dad is stuck playing Mr. Mom at home, totally unable to cope with Violet's boy problems, Dash's adolescent rebellion and a baby who'sshowing power-potential that's both deeply funny and scary. Jack-Jack's transformation is a riot. It's the cue for the return of fashion guru Edna Mode (again voiced hilariously by Bird) to take the demon baby in hand for a supersuit fitting-slash-overnight sleepover. It's a wildly comic duel of scenestealers.

The villain of the piece is a diabolically clever entity named Screenslaver, which seeks to control the minds of citizens through screens – not a hard job, since damn near everyone is already enslaved to the screens on their devices. Unlike other filmmakers who are bound to formulas and black-and-white conceptions of heroes and villains, Bird works outside the box. The bad guy wants to destroy the Incredibles because citizens would rather have fantasy figures save the world instead of getting off their lazy asses and doing something about it themselves. In other words, Screenslaver – despite using nefarious means to an end – has a point. And the film is richer for the character's ambiguity.

Of course, nothing stops the fun, set to another rousing score by the ever-fantastic Michael Giacchino. All the stops are pulled out in the rousing climax that brings the characters together, including family friend Lucius Best/Frozone (Samuel L. Jackson). The setting is a mega-yacht, owned by the Deavors, where Screenslaver maneuvers to turn our heroes to the dark side. No spoilers, except to say that Bird is peerless at playing with our feelings while never veering from his heartfelt tribute to the Parrs as the core resilient American family. The Incredibles 2 is more than peak summer entertainment. It's an exhilarating gift.

TEXT;
    }
}

0 commit comments

Comments
 (0)