Skip to content

Commit c061bf6

Browse files
committed
working on sentence summarizer
1 parent 7dafd4d commit c061bf6

File tree

10 files changed

+306
-15
lines changed

10 files changed

+306
-15
lines changed

composer.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
"symfony/console": "~2.7|~3.4|~4.0",
3030
"camspiers/porter-stemmer": "~1",
3131
"wamania/php-stemmer": "~1",
32-
"yooper/nicknames": "~1"
32+
"yooper/nicknames": "~1",
33+
"vanderlee/php-sentence": "^1.0"
3334
},
3435
"require-dev": {
3536
"phpunit/phpunit": "~5",

src/Analysis/Summarize/Simple.php

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
<?php
2+
3+
namespace TextAnalysis\Analysis\Summarize;
4+
5+
use TextAnalysis\Models\ScoreKeeper;
6+
7+
/**
 * A simple algorithm based off of frequency counts for finding the best
 * sentence to summarize the text
 * @author yooper
 */
class Simple
{
    /**
     * Score every sentence and return the scores in sorted order.
     *
     * Each distinct word token adds its total frequency (taken from
     * $wordTokens) once to every sentence that contains it as a substring.
     *
     * @param array $wordTokens word tokens of the whole text
     * @param array $sentenceTokens sentences to score; the array key of each
     * sentence is stored as the index of its ScoreKeeper
     * @return array ScoreKeeper instances ordered by the score_keeper_sort comparator
     */
    public function summarize(array $wordTokens, array $sentenceTokens) : array
    {
        $tokenCounts = array_count_values($wordTokens);

        // Iterate by key rather than by position so that sentence arrays
        // with gaps in their keys (e.g. after array_filter) are handled and
        // the stored index still points at the caller's own element.
        $scoreKeepers = [];
        foreach ($sentenceTokens as $index => $sentence) {
            $scoreKeepers[] = new ScoreKeeper($sentence, $index);
        }

        foreach ($tokenCounts as $token => $freq) {
            // Integer-like tokens come back from array_count_values as int
            // keys; before PHP 8 strpos() treats an int needle as a character
            // code, so always search for the string form.
            $needle = (string) $token;

            // An empty needle warns on PHP 7 and matches everything on PHP 8.
            if ($needle === '') {
                continue;
            }

            foreach ($scoreKeepers as $sentenceKeeper) {
                if (strpos($sentenceKeeper->getToken(), $needle) !== false) {
                    $sentenceKeeper->addToScore($freq);
                }
            }
        }

        usort($scoreKeepers, 'score_keeper_sort');
        return $scoreKeepers;
    }
}

src/Classifiers/NaiveBayes.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class NaiveBayes implements \TextAnalysis\Interfaces\IClassifier
2323

2424
/**
2525
* Track the token counts
26-
* @var array[int]
26+
* @var int[]
2727
*/
2828
protected $tokenCount = [];
2929

src/Filters/TrimFilter.php

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
namespace TextAnalysis\Filters;
4+
5+
use TextAnalysis\Interfaces\ITokenTransformation;
6+
7+
8+
/**
 * Token transformation that applies PHP's trim() to a token
 * @author yooper
 */
class TrimFilter implements ITokenTransformation
{
    /**
     * Strip the characters removed by trim()'s default character list from
     * both ends of the token.
     * @param string $word
     * @return string
     */
    public function transform($word)
    {
        $trimmed = trim($word);

        return $trimmed;
    }
}

src/Models/ScoreKeeper.php

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
<?php
2+
3+
namespace TextAnalysis\Models;
4+
5+
/**
 * Holds a token together with an index and a running score
 * @author yooper
 */
class ScoreKeeper
{
    /**
     * The token being scored
     * @var string
     */
    protected $token;

    /**
     * The accumulated score
     * @var mixed
     */
    protected $score;

    /**
     * Caller supplied index associated with the token
     * @var mixed
     */
    protected $index;

    /**
     * @param string $token
     * @param mixed $index
     * @param mixed $score starting score, 0 when omitted
     */
    public function __construct(string $token, $index, $score = 0)
    {
        $this->score = $score;
        $this->index = $index;
        $this->token = $token;
    }

    /**
     * Increase the running score by the given amount
     * @param mixed $score
     */
    public function addToScore($score)
    {
        $this->score = $this->score + $score;
    }

    /**
     * @return mixed the accumulated score
     */
    public function getScore()
    {
        return $this->score;
    }

    /**
     * @return mixed the index given at construction
     */
    public function getIndex()
    {
        return $this->index;
    }

    /**
     * @return string the token given at construction
     */
    public function getToken() : string
    {
        return $this->token;
    }
}

src/Tokenizers/VanderleeTokenizer.php

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<?php
2+
3+
namespace TextAnalysis\Tokenizers;
4+
5+
use Sentence;
6+
7+
/**
 * A wrapper around the sentence tokenizer written by
 * vanderlee/php-sentence
 * @author yooper
 */
class VanderleeTokenizer extends TokenizerAbstract
{
    /**
     * The wrapped sentence splitter
     * @var Sentence
     */
    protected $sentence = null;

    public function __construct()
    {
        $this->sentence = new Sentence;
    }

    /**
     * Split the text into sentences
     * @param string $string
     * @return array the non-empty sentences, indexed consecutively from 0
     */
    public function tokenize($string): array
    {
        // filter_empty() uses array_filter(), which keeps the original keys;
        // re-index so callers can address sentences by position.
        return array_values(filter_empty($this->sentence->split($string)));
    }

    public function __destruct()
    {
        unset($this->sentence);
    }
}

src/helpers/helpers.php

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -228,36 +228,32 @@ function naive_bayes() : \TextAnalysis\Classifiers\NaiveBayes
228228
}
229229

230230
/**
231-
* Return an array of filtered tokens
231+
* Pass the tokens in by reference and modify them
232232
* @param array $tokens
233233
* @param string $filterType
234-
* @return string[]
235234
*/
236-
function filter_tokens(array &$tokens, string $filterType) : array
235+
function filter_tokens(array &$tokens, string $filterType)
237236
{
238237
$className = "\\TextAnalysis\\Filters\\{$filterType}";
239238
$filter = new $className();
240239
foreach($tokens as &$token)
241240
{
242241
$token = $filter->transform($token);
243242
}
244-
return array_values($tokens);
245243
}
246244

247245
/**
248246
* Filter out stop words
249247
* @param array $tokens
250248
* @param array $stopwords
251-
* @return array
252249
*/
253-
function filter_stopwords(array &$tokens, array &$stopwords) : array
250+
function filter_stopwords(array &$tokens, array &$stopwords)
254251
{
255252
$filter = new \TextAnalysis\Filters\StopWordsFilter($stopwords);
256253
foreach($tokens as &$token)
257254
{
258255
$token = $filter->transform($token);
259256
}
260-
return array_values($tokens);
261257
}
262258

263259
/**
@@ -298,4 +294,58 @@ function filter_empty(array $tokens) : array
298294
return array_filter($tokens);
299295
}
300296

297+
/**
 * Comparator for usort() that orders score keepers from the highest
 * score to the lowest.
 * @param object $a anything exposing getScore()
 * @param object $b anything exposing getScore()
 * @return int 0 when the scores are equal, 1 when $a scores lower, otherwise -1
 */
function score_keeper_sort($a, $b)
{
    $left = $a->getScore();
    $right = $b->getScore();

    if ($left == $right) {
        return 0;
    }

    if ($left < $right) {
        return 1;
    }

    return -1;
}
304+
305+
/**
306+
* Apply common filters and
307+
* @param string $text
308+
* @param array $stopwords
309+
* @return array
310+
*/
311+
function summary_simple(string $text, array $stopwords = []) : array
312+
{
313+
$sentenceTokensOriginal = (new \TextAnalysis\Tokenizers\VanderleeTokenizer())->tokenize(strtolower($text));
314+
315+
//create copy
316+
$sentenceTokens = $sentenceTokensOriginal;
317+
if(!empty($stopwords)) {
318+
foreach($sentenceTokens as &$sentence)
319+
{
320+
$sentence = str_replace($stopwords, " ", $sentence);
321+
}
322+
}
323+
324+
filter_tokens($sentenceTokens, 'TrimFilter');
325+
filter_tokens($sentenceTokens, 'QuotesFilter');
326+
filter_tokens($sentenceTokens, 'CharFilter');
327+
328+
$wordTokens = tokenize($text);
329+
foreach(['LowerCaseFilter','PunctuationFilter','QuotesFilter','PossessiveNounFilter','CharFilter'] as $filterType)
330+
{
331+
filter_tokens($wordTokens, $filterType);
332+
}
333+
334+
if(!empty($stopwords)) {
335+
filter_stopwords($wordTokens, $stopwords);
336+
}
337+
338+
$summarizer = new \TextAnalysis\Analysis\Summarize\Simple();
339+
$scores = $summarizer->summarize(filter_empty( $wordTokens ), $sentenceTokens);
340+
341+
// reorder sentences in the best order
342+
$bestSentences = [];
343+
foreach($scores as $score)
344+
{
345+
$bestSentences[] = $sentenceTokensOriginal[$score->getIndex()];
346+
}
347+
return $bestSentences;
348+
}
349+
350+
301351

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
<?php
2+
3+
namespace Tests\TextAnalysis\Analysis\Summarize;
4+
5+
/**
 * Exercises summary_simple() with and without a stop word list
 * @author yooper
 */
class SimpleTest extends \PHPUnit_Framework_TestCase
{
    /**
     * With space padded stop words, 13 sentences are expected and the top
     * one must equal getTopSentence().
     */
    public function testSimpleWithStopwords()
    {
        $padded = [];
        foreach (get_stop_words(VENDOR_DIR."yooper/stop-words/data/stop-words_english_1_en.txt") as $key => $word) {
            $padded[$key] = " {$word} ";
        }

        $ranked = summary_simple($this->getArticle(), $padded);

        self::assertCount(13, $ranked);
        self::assertEquals($this->getTopSentence(), $ranked[0]);
    }

    /**
     * Without stop words, 13 sentences are expected and the top one must
     * differ from getTopSentenceWithoutStopwords().
     */
    public function testSimpleWithoutStopwords()
    {
        $ranked = summary_simple($this->getArticle());

        self::assertCount(13, $ranked);
        self::assertNotEquals($this->getTopSentenceWithoutStopwords(), $ranked[0]);
    }

    /**
     * The article used as input for both tests
     * @return string
     */
    public function getArticle() : string
    {
        return <<<TEXT
According to a Tuesday news release, Houghton County leaders are asking for a slowing of supply donations.

Volunteers and financial donations are still needed, along with dehumidifiers, box fans or large equipment that can be used for excavating, demolition or reconstruction.

"The response to our recent flood disaster has been overwhelming, and the Copper Country cannot be thankful enough for the support that’s been received," said Michael Babcock, the director of marketing and communications at Finlandia University. "However, as of now, volunteers have reached a point where enough normal supplies are on hand. Officials are now asking that the flow of general donations be reduced or stopped with a few exceptions. We know of several additional semi loads coming, but we’re now asking that any additional large deliveries that are planned please be put on hold, unless the items being donated are dehumidifiers, box fans or large equipment that can be used for excavating, demolition or reconstruction."

Volunteers are still needed. The recovery effort is transitioning from initial clean-up to the rehab and reconstruction phase, and additional volunteers are a vital part of that effort.

To donate money, please go to coppercountrystrong.com/donate.

On Friday at 4 p.m. the Flood Relief Supply Distribution at Dee Stadium will be closing. Those in need of supplies are asked to stop by before it closes to get what’s needed for the weekend. Next steps for the distribution center are being evaluated and will be announced as soon as possible.
TEXT;
    }

    /**
     * Sentence expected at the top when stop words are supplied
     */
    public function getTopSentence()
    {
        // NOTE(review): this literal has a straight apostrophe in "that's"
        // while getArticle() uses a typographic one - confirm the expected value.
        return '"the response to our recent flood disaster has been overwhelming, and the copper country cannot be thankful enough for the support that\'s been received," said michael babcock, the director of marketing and communications at finlandia university.';
    }

    /**
     * Sentence that must not be at the top when no stop words are supplied
     */
    public function getTopSentenceWithoutStopwords()
    {
        return 'we know of several additional semi loads coming, but we’re now asking that any additional large deliveries that are planned please be put on hold, unless the items being donated are dehumidifiers, box fans or large equipment that can be used for excavating, demolition or reconstruction."';
    }
}

tests/TextAnalysis/Classifiers/NaiveBayesTest.php

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ public function testMovieReviews()
4141

4242
$movieReviewTokens = tokenize($this->getMovieReview());
4343
$stopWords = get_stop_words(VENDOR_DIR."yooper/stop-words/data/stop-words_english_1_en.txt");
44-
$movieReviewTokens = filter_stopwords($movieReviewTokens, $stopWords);
45-
$movieReviewTokens = filter_tokens($movieReviewTokens, 'PunctuationFilter');
46-
$movieReviewTokens = filter_tokens($movieReviewTokens, 'QuotesFilter');
44+
filter_stopwords($movieReviewTokens, $stopWords);
45+
filter_tokens($movieReviewTokens, 'PunctuationFilter');
46+
filter_tokens($movieReviewTokens, 'QuotesFilter');
4747
$movieReviewTokens = stem($movieReviewTokens);
4848
$this->assertEquals('positive', array_keys($nb->predict($movieReviewTokens))[0]);
4949

@@ -58,9 +58,9 @@ protected function getTokenizedReviews(string $filePath) : array
5858
}
5959

6060
$tokens = tokenize(file_get_contents($filePath));
61-
$tokens = filter_tokens($tokens, 'PunctuationFilter');
62-
$tokens = filter_tokens($tokens, 'QuotesFilter');
63-
$tokens = filter_stopwords($tokens, $stopWords);
61+
filter_tokens($tokens, 'PunctuationFilter');
62+
filter_tokens($tokens, 'QuotesFilter');
63+
filter_stopwords($tokens, $stopWords);
6464
$tokens = stem($tokens);
6565
$tokens = filter_empty($tokens);
6666
return $tokens;
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<?php
2+
namespace Tests\TextAnalysis\Tokenizers;
3+
4+
use TextAnalysis\Tokenizers\VanderleeTokenizer;
5+
6+
/**
 * Checks the sentence count produced by VanderleeTokenizer
 * @author yooper
 */
class VanderleeTokenizerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * The sample text is expected to split into five sentences
     */
    public function testTokenizer()
    {
        $sentences = (new VanderleeTokenizer())->tokenize($this->getText());

        self::assertCount(5, $sentences);
    }

    /**
     * Sample text; the \n escapes are expanded by the heredoc
     */
    protected function getText()
    {
        return <<<TEXT
Hello there, Mr. Smith. What're you doing today... Smith, my friend?\n\nI hope it's good. This last sentence will cost you $2.50! Just kidding :)
TEXT;
    }
}

0 commit comments

Comments
 (0)