Skip to content

Commit e520cf2

Browse files
committed
add short cut methods for stemming and rake
1 parent 91453b0 commit e520cf2

File tree

7 files changed

+85
-27
lines changed

7 files changed

+85
-27
lines changed

README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,5 +71,20 @@ Customize the ngrams
7171
$trigrams = ngrams($tokens,3, '|');
7272
```
7373

74+
### Stemming
75+
By default, the `stem` function uses the Porter Stemmer.
76+
```php
77+
$stemmedTokens = stem($tokens);
78+
```
79+
You can choose a different stemmer by passing its class name as the second argument:
80+
```php
81+
$stemmedTokens = stem($tokens, \TextAnalysis\Stemmers\MorphStemmer::class);
82+
```
7483

75-
84+
### Keyword Extract with Rake
85+
There is a shortcut function for using the Rake algorithm. You will need to clean
86+
your data before using it. The second parameter is the n-gram size of the keywords to extract.
87+
```php
88+
$rake = rake($tokens, 3);
89+
$results = $rake->getKeywordScores();
90+
```

src/Analysis/DateAnalysis.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class DateAnalysis
2828
*
2929
* @param string $text
3030
*/
31-
public function __construct($text)
31+
public function __construct(string $text)
3232
{
3333
$tokenizer = new SentenceTokenizer();
3434
$this->sentences = $tokenizer->tokenize( $this->normalize($text)) ;
@@ -39,7 +39,7 @@ public function __construct($text)
3939
* ie Mar. to March
4040
* @param string $text
4141
*/
42-
protected function normalize($text)
42+
protected function normalize(string $text) : string
4343
{
4444
$search = ['jan.','feb.','mar.','apr.','may.','jun.','jul.','aug.','sep.','oct.','nov.','dec.'];
4545
$replace = [
@@ -62,7 +62,7 @@ protected function normalize($text)
6262
/**
6363
* @return DateTime[]
6464
*/
65-
public function getDates()
65+
public function getDates() : array
6666
{
6767
// return the cached copy
6868
if(empty($this->dates)) {

src/Collocations/CollocationFinder.php

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,25 @@
22

33
namespace TextAnalysis\Collocations;
44

5-
use TextAnalysis\NGrams\NGramFactory;
6-
use TextAnalysis\Analysis\FreqDist;
7-
85
/**
96
* Helps find popular phrases in the given set of tokens
107
* @author yooper
118
*/
129
class CollocationFinder
1310
{
11+
/**
12+
* The ngram size
13+
* @var int
14+
*/
1415
protected $nGramSize = 2;
1516

17+
/**
18+
* The set of tokens passed to the constructor
19+
* @var array
20+
*/
1621
protected $tokens = [];
1722

18-
public function __construct(array $tokens, $nGramSize = 2)
23+
public function __construct(array $tokens, int $nGramSize = 2)
1924
{
2025
$this->tokens = $tokens;
2126
$this->nGramSize = $nGramSize;

src/Tokenizers/SentimentTokenizer.php

Lines changed: 0 additions & 19 deletions
This file was deleted.

src/helpers/helpers.php

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,38 @@ function text( string $text ): \TextAnalysis\Corpus\TextCorpus {
111111
}
112112
}
113113

114+
115+
if (! function_exists('rake')) {
116+
/**
117+
* Shortcut that wraps the tokens in a TokensDocument and returns a new Rake instance built from it
118+
*
119+
* @param array $tokens tokens handed to the TokensDocument constructor
120+
* @param int $ngramSize second argument passed to the Rake constructor, defaults to 3
121+
* @return \TextAnalysis\Analysis\Keywords\Rake
122+
*/
123+
function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keywords\Rake
124+
{
125+
return new \TextAnalysis\Analysis\Keywords\Rake(new \TextAnalysis\Documents\TokensDocument($tokens), $ngramSize);
126+
}
127+
}
128+
129+
if (! function_exists('stem')) {
130+
/**
131+
* Returns an array of stemmed tokens, produced by calling the stemmer's stem() on each token
132+
*
133+
* @param array $tokens tokens to stem
134+
* @param string $stemmerClassName class instantiated without constructor arguments; defaults to the PorterStemmer
135+
* @return array the result of stem() for every token
136+
*/
137+
function stem(array $tokens, string $stemmerClassName = \TextAnalysis\Stemmers\PorterStemmer::class): array
138+
{
139+
$stemmer = new $stemmerClassName();
140+
return array_map(function($token) use($stemmer){ return $stemmer->stem($token); }, $tokens);
141+
}
142+
}
143+
144+
145+
114146
/**
115147
* Check if the given array has the given needle, using a case insensitive search.
116148
* Keeps a local copy of the normalized haystack for quicker lookup on the same array

tests/TextAnalysis/Analysis/Keywords/RakeTest.php

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,25 @@ public function testRake()
3636
$this->assertArrayHasKey('8/8/2016 5:51 pm', $results);
3737
}
3838

39+
/**
 * Exercises the rake() helper: the test data is filtered and tokenized,
 * the resulting token array is passed to rake() with an ngram size of 3,
 * and two expected keys are asserted in the keyword scores.
 */
public function testSimplifiedRake()
40+
{
41+
$stopwords = array_map('trim', file(VENDOR_DIR.'yooper/stop-words/data/stop-words_english_1_en.txt'));
42+
// All punctuation must be moved one position over before tokenizing; this fixes issues with sentences
43+
$testData = (new SpacePunctuationFilter([':','\/']))->transform($this->getTestData());
44+
// Rake input MUST be split on whitespace and new lines only
45+
$tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);
46+
$tokenDoc = new TokensDocument($tokens);
47+
$tokenDoc->applyTransformation(new LowerCaseFilter())
48+
->applyTransformation(new StopWordsFilter($stopwords), false)
49+
->applyTransformation(new PunctuationFilter(['@',':','\/']), false)
50+
->applyTransformation(new CharFilter(), false);
51+
52+
$rake = rake($tokenDoc->toArray(), 3);
53+
$results = $rake->getKeywordScores();
54+
$this->assertArrayHasKey('minimal generating sets', $results);
55+
$this->assertArrayHasKey('8/8/2016 5:51 pm', $results);
56+
}
57+
3958
/**
4059
* Sample test data
4160
* @return string

tests/TextAnalysis/Stemmers/PorterStemmerTest.php

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,10 @@ public function testStemmer()
1818
$this->assertEquals('univers', $stemmer->stem('universities'));
1919
$this->assertEquals('judg',$stemmer->stem('judges'));
2020
}
21+
22+
/**
 * Checks that the stem() helper, called with only a token array,
 * turns ['universities', 'judges'] into ['univers', 'judg'].
 */
public function testSimplifiedStemmer()
23+
{
24+
$this->assertEquals(['univers','judg'], stem(['universities', 'judges']));
25+
}
26+
2127
}

0 commit comments

Comments
 (0)