Skip to content

Commit 892d543

Browse files
authored
Merge pull request #40 from yooper/sentence_summarizer
Sentence summarizer
2 parents f0abcbd + 1efe428 commit 892d543

File tree

12 files changed

+410
-36
lines changed

12 files changed

+410
-36
lines changed

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@ php-text-analysis
77
[![Total Downloads](https://poser.pugx.org/yooper/php-text-analysis/downloads)](https://packagist.org/packages/yooper/php-text-analysis)
88

99
PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language.
10+
There are tools in this library that can perform:
11+
12+
* document classification
13+
* sentiment analysis
14+
* compare documents
15+
* frequency analysis
16+
* tokenization
17+
* stemming
18+
* collocations with Pointwise Mutual Information
19+
* lexical diversity
20+
* corpus analysis
21+
* text summarization
22+
1023
All the documentation for this project can be found in the book and wiki.
1124

1225
PHP Text Analysis Book & Wiki
@@ -88,3 +101,25 @@ your data prior to using. Second parameter is the ngram size of your keywords to
88101
$rake = rake($tokens, 3);
89102
$results = $rake->getKeywordScores();
90103
```
104+
105+
### Sentiment Analysis with Vader
106+
Need Sentiment Analysis with PHP? Use Vader, https://github.com/cjhutto/vaderSentiment .
107+
The PHP implementation can be invoked easily. Just normalize your data beforehand.
108+
```php
109+
$sentimentScores = vader($tokens);
110+
```
111+
112+
### Document Classification with Naive Bayes
113+
Need to do some document classification with PHP? Try using the Naive Bayes
114+
implementation. An example of classifying movie reviews can be found in the unit
115+
tests
116+
117+
```php
118+
$nb = naive_bayes();
119+
$nb->train('mexican', tokenize('taco nacho enchilada burrito'));
120+
$nb->train('american', tokenize('hamburger burger fries pop'));
121+
$nb->predict(tokenize('my favorite food is a burrito'));
122+
```
123+
124+
125+

composer.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
"symfony/console": "~2.7|~3.4|~4.0",
3030
"camspiers/porter-stemmer": "~1",
3131
"wamania/php-stemmer": "~1",
32-
"yooper/nicknames": "~1"
32+
"yooper/nicknames": "~1",
33+
"vanderlee/php-sentence": "^1.0"
3334
},
3435
"require-dev": {
3536
"phpunit/phpunit": "~5",

src/Analysis/Summarize/Simple.php

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?php
2+
3+
namespace TextAnalysis\Analysis\Summarize;
4+
5+
use TextAnalysis\Models\ScoreKeeper;
6+
7+
/**
8+
* A simple algorithm based off of frequency counts for finding the best
9+
* sentence to summarize the text
10+
* @author yooper
11+
*/
12+
class Simple
{
    /**
     * Score every sentence by the frequency of the words it contains.
     *
     * Each distinct word token adds its total occurrence count to the score
     * of every sentence whose text contains that token (substring match).
     * The result is sorted with score_keeper_sort(), i.e. highest score first.
     *
     * @param array $wordTokens     word tokens of the whole text
     * @param array $sentenceTokens sentence strings; each array key becomes the
     *                              index stored on the matching ScoreKeeper
     * @return array one ScoreKeeper per sentence
     */
    public function summarize(array $wordTokens, array $sentenceTokens) : array
    {
        $tokenCounts = array_count_values($wordTokens);

        // Walk the real keys instead of 0..count-1: key-preserving filters
        // (array_filter) can leave gaps, and a positional loop would then read
        // undefined offsets.
        $scoreKeepers = [];
        foreach ($sentenceTokens as $index => $sentence) {
            $scoreKeepers[] = new ScoreKeeper($sentence, $index);
        }

        foreach ($tokenCounts as $token => $freq) {
            // Numeric tokens come back from array_count_values() as integer
            // keys; an int needle is treated as a character ordinal by
            // strpos() before PHP 8, so always search for the string form.
            $needle = (string) $token;
            if ($needle === '') {
                // An empty needle warns on PHP 7 and matches everything on PHP 8.
                continue;
            }

            foreach ($scoreKeepers as $sentenceKeeper) {
                if (strpos($sentenceKeeper->getToken(), $needle) !== false) {
                    $sentenceKeeper->addToScore($freq);
                }
            }
        }

        usort($scoreKeepers, 'score_keeper_sort');
        return $scoreKeepers;
    }
}

src/Classifiers/NaiveBayes.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class NaiveBayes implements \TextAnalysis\Interfaces\IClassifier
2323

2424
/**
2525
* Track the token counts
26-
* @var array[int]
26+
* @var int[]
2727
*/
2828
protected $tokenCount = [];
2929

src/Filters/TrimFilter.php

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
namespace TextAnalysis\Filters;
4+
5+
use TextAnalysis\Interfaces\ITokenTransformation;
6+
7+
8+
/**
9+
*
10+
* @author yooper
11+
*/
12+
class TrimFilter implements ITokenTransformation
{
    /**
     * Strip the characters PHP's trim() removes by default (whitespace and
     * NUL bytes) from both ends of the token.
     *
     * @param string $word
     * @return string
     */
    public function transform($word)
    {
        $trimmed = trim($word);

        return $trimmed;
    }
}

src/Models/ScoreKeeper.php

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
<?php
2+
3+
namespace TextAnalysis\Models;
4+
5+
/**
6+
* Track metrics of tokenization
7+
* @author yooper
8+
*/
9+
class ScoreKeeper
{
    /**
     * The text that is being scored
     * @var string
     */
    protected $token;

    /**
     * Running total, grown through addToScore()
     * @var mixed
     */
    protected $score;

    /**
     * Caller-supplied position or identifier of the token
     * @var mixed
     */
    protected $index;

    /**
     * @param string $token the text to keep a score for
     * @param mixed $index  position or identifier handed back by getIndex()
     * @param mixed $score  starting score, 0 unless given
     */
    public function __construct(string $token, $index, $score = 0)
    {
        $this->score = $score;
        $this->index = $index;
        $this->token = $token;
    }

    /**
     * @return string the text this keeper was created for
     */
    public function getToken() : string
    {
        return $this->token;
    }

    /**
     * @return mixed the index given to the constructor
     */
    public function getIndex()
    {
        return $this->index;
    }

    /**
     * @return mixed the starting score plus everything added so far
     */
    public function getScore()
    {
        return $this->score;
    }

    /**
     * Increase the running score.
     * @param mixed $score amount to add
     */
    public function addToScore($score)
    {
        $this->score = $this->score + $score;
    }
}

src/Tokenizers/VanderleeTokenizer.php

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<?php
2+
3+
namespace TextAnalysis\Tokenizers;
4+
5+
use Sentence;
6+
7+
/**
8+
* A wrapper around the sentence tokenizer written by
9+
* vanderlee/php-sentence
10+
* @author yooper
11+
*/
12+
class VanderleeTokenizer extends TokenizerAbstract
{
    /**
     * The wrapped sentence splitter
     * @var Sentence
     */
    protected $sentence = null;

    /**
     * Create the underlying Sentence splitter.
     */
    public function __construct()
    {
        $this->sentence = new Sentence();
    }

    /**
     * Break the text into sentences and discard the blank ones.
     * @param string $string
     * @return array the sentences that survive filter_empty()
     */
    public function tokenize($string): array
    {
        $sentences = $this->sentence->split($string);

        return filter_empty($sentences);
    }

    /**
     * Drop the splitter when the tokenizer is destroyed.
     */
    public function __destruct()
    {
        unset($this->sentence);
    }
}

src/helpers/helpers.php

Lines changed: 98 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,11 @@ function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keyword
133133
function stem(array $tokens, string $stemmerClassName = \TextAnalysis\Stemmers\PorterStemmer::class): array
{
    // One stemmer instance is created per call and reused for every token.
    $stemmer = new $stemmerClassName();

    // Overwrite each slot with its stem; keys and order stay as given.
    foreach (array_keys($tokens) as $position) {
        $tokens[$position] = $stemmer->stem($tokens[$position]);
    }

    return $tokens;
}
138142
}
139143

@@ -224,28 +228,32 @@ function naive_bayes() : \TextAnalysis\Classifiers\NaiveBayes
224228
}
225229

226230
/**
 * Run one filter over every token, changing the caller's array in place.
 *
 * The filter class is looked up as \TextAnalysis\Filters\{$filterType} and
 * each token is replaced by whatever its transform() method returns. Keys are
 * left as they are and nothing is returned.
 *
 * @param array $tokens      passed by reference and modified
 * @param string $filterType short class name inside the Filters namespace
 */
function filter_tokens(array &$tokens, string $filterType)
{
    $filterClass = "\\TextAnalysis\\Filters\\{$filterType}";
    $transformer = new $filterClass();

    foreach (array_keys($tokens) as $position) {
        $tokens[$position] = $transformer->transform($tokens[$position]);
    }
}
238244

239245
/**
 * Pass every token through a StopWordsFilter, changing the array in place.
 *
 * Each token is replaced by the filter's transform() result; keys are left as
 * they are and nothing is returned. NOTE(review): the filter presumably
 * returns null for a stop word (summary_simple() cleans up with
 * filter_empty() afterwards) - confirm against StopWordsFilter.
 *
 * @param array $tokens    passed by reference and modified
 * @param array $stopwords words handed to the StopWordsFilter constructor
 */
function filter_stopwords(array &$tokens, array &$stopwords)
{
    $stopWordFilter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);

    foreach (array_keys($tokens) as $position) {
        $tokens[$position] = $stopWordFilter->transform($tokens[$position]);
    }
}
250258

251259
/**
@@ -255,9 +263,89 @@ function filter_stopwords(array $tokens, array $stopwords) : array
255263
*/
256264
function get_stop_words(string $filePath) : array
{
    // file() gives one entry per line with the line ending still attached.
    $lines = file($filePath);

    // Trim in place so no second copy of the list is built.
    foreach (array_keys($lines) as $lineNumber) {
        $lines[$lineNumber] = trim($lines[$lineNumber]);
    }

    return $lines;
}
260270

271+
/**
 * Score the tokens with the Vader sentiment implementation.
 * @param array $tokens
 * @return array the polarity scores, exactly as Vader::getPolarityScores() returns them
 */
function vader(array $tokens) : array
{
    $analyzer = new \TextAnalysis\Sentiment\Vader();

    return $analyzer->getPolarityScores($tokens);
}
280+
281+
/**
 * Filter out all null, empty and whitespace-only tokens.
 *
 * Surviving tokens keep their original value (they are not trimmed) and their
 * original array key, so the result may have gaps in its keys. The token "0"
 * is real content and is kept.
 *
 * @param array $tokens
 * @return string[]
 */
function filter_empty(array $tokens) : array
{
    // Compare against '' explicitly: empty() and a callback-less
    // array_filter() both treat "0" as empty and would silently drop it.
    // The string cast also keeps null away from trim().
    return array_filter($tokens, function ($token) {
        return trim((string) $token) !== '';
    });
}
296+
297+
/**
 * usort() comparator that orders score keepers from highest to lowest score.
 * @param object $a anything exposing getScore()
 * @param object $b anything exposing getScore()
 * @return int 1 when $a scores lower, -1 when it scores higher, 0 on a tie
 */
function score_keeper_sort($a, $b)
{
    $left = $a->getScore();
    $right = $b->getScore();

    if ($left < $right) {
        return 1;
    }

    return ($left == $right) ? 0 : -1;
}
304+
305+
/**
 * Rank the sentences of a text, best summary candidates first.
 *
 * The text is lower-cased and split into sentences. A cleaned copy of every
 * sentence (stop words removed, then the Trim, Quotes and Char filters) is
 * scored by the Simple summarizer against the normalized word tokens of the
 * whole text, and the lower-cased sentences are returned in score order.
 *
 * @param string $text
 * @param array $stopwords optional words to ignore while scoring
 * @return array the lower-cased sentences, highest score first
 */
function summary_simple(string $text, array $stopwords = []) : array
{
    // Re-index so sentence positions are contiguous: the tokenizer filters
    // blanks with a key-preserving array_filter(), and the summarizer addresses
    // sentences by position.
    $sentenceTokensOriginal = array_values(
        (new \TextAnalysis\Tokenizers\VanderleeTokenizer())->tokenize(strtolower($text))
    );

    // Scoring works on a cleaned copy; the untouched sentences are returned.
    $sentenceTokens = $sentenceTokensOriginal;
    if (!empty($stopwords)) {
        $quoted = [];
        foreach ($stopwords as $stopword) {
            $stopword = (string) $stopword;
            if ($stopword !== '') {
                $quoted[] = preg_quote($stopword, '/');
            }
        }

        // Remove stop words only where they stand as whole words. A plain
        // str_replace() also cuts them out of longer words ("is" inside
        // "this"), which corrupts the text the scorer searches in.
        // Chunking keeps each compiled pattern small for large word lists.
        $patterns = [];
        foreach (array_chunk($quoted, 200) as $chunk) {
            $patterns[] = '/(?<![\p{L}\p{N}_])(?:' . implode('|', $chunk) . ')(?![\p{L}\p{N}_])/u';
        }

        if (!empty($patterns)) {
            foreach ($sentenceTokens as $position => $sentence) {
                $cleaned = preg_replace($patterns, ' ', $sentence);
                // null signals a PCRE failure (e.g. invalid UTF-8); keep the
                // sentence as it was rather than losing it.
                if ($cleaned !== null) {
                    $sentenceTokens[$position] = $cleaned;
                }
            }
        }
    }

    filter_tokens($sentenceTokens, 'TrimFilter');
    filter_tokens($sentenceTokens, 'QuotesFilter');
    filter_tokens($sentenceTokens, 'CharFilter');

    $wordTokens = tokenize($text);
    foreach (['LowerCaseFilter','PunctuationFilter','QuotesFilter','PossessiveNounFilter','CharFilter'] as $filterType) {
        filter_tokens($wordTokens, $filterType);
    }

    if (!empty($stopwords)) {
        filter_stopwords($wordTokens, $stopwords);
    }

    $summarizer = new \TextAnalysis\Analysis\Summarize\Simple();
    $scores = $summarizer->summarize(filter_empty($wordTokens), $sentenceTokens);

    // Map each score back to its sentence, in the order the summarizer returned.
    $bestSentences = [];
    foreach ($scores as $score) {
        $bestSentences[] = $sentenceTokensOriginal[$score->getIndex()];
    }

    return $bestSentences;
}
261349

262350

263351

0 commit comments

Comments
 (0)