Skip to content

Commit 93152b3

Browse files
committed
Merge branch 'master' into bayes
2 parents 8bfb370 + e520cf2 commit 93152b3

File tree

10 files changed

+152
-36
lines changed

10 files changed

+152
-36
lines changed

README.md

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,19 @@ php-text-analysis
66

77
[![Total Downloads](https://poser.pugx.org/yooper/php-text-analysis/downloads)](https://packagist.org/packages/yooper/php-text-analysis)
88

9+
PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language.
10+
All the documentation for this project can be found in the book and wiki.
11+
12+
PHP Text Analysis Book & Wiki
13+
=============
14+
15+
A book is in the works and your contributions are needed. You can find the book
16+
at https://github.com/yooper/php-text-analysis-book
17+
18+
19+
Documentation for the library also resides in the wiki.
20+
https://github.com/yooper/php-text-analysis/wiki
921

10-
PHP Text Analysis is a library for performing Information Retrieval (IR) and Natural Language Processing (NLP) tasks using the PHP language. All the documentation for this project can be found in the wiki.
1122

1223
Installation Instructions
1324
=============
@@ -16,9 +27,6 @@ Add PHP Text Analysis to your project
1627
```
1728
composer require yooper/php-text-analysis
1829
```
19-
Documentation for the library resides in the wiki.
20-
https://github.com/yooper/php-text-analysis/wiki
21-
2230

2331
### Tokenization
2432
```php
@@ -63,9 +71,20 @@ Customize the ngrams
6371
$trigrams = ngrams($tokens,3, '|');
6472
```
6573

66-
Dictionary Installation
67-
=============
68-
69-
To do
70-
74+
### Stemming
75+
By default, the stem method uses the Porter Stemmer.
76+
```php
77+
$stemmedTokens = stem($tokens);
78+
```
79+
You can choose which stemmer to use by passing in the stemmer's class name.
80+
```php
81+
$stemmedTokens = stem($tokens, \TextAnalysis\Stemmers\MorphStemmer::class);
82+
```
7183

84+
### Keyword Extraction with Rake
85+
There is a shortcut function for using the Rake algorithm. You will need to clean
86+
your data prior to using it. The second parameter is the ngram size of the keywords to extract.
87+
```php
88+
$rake = rake($tokens, 3);
89+
$results = $rake->getKeywordScores();
90+
```

composer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
"psr-4": {
1414
"TextAnalysis\\": "src/"
1515
},
16-
"files": ["src/helpers/storage.php", "src/helpers/print.php", "src/helpers/simplified.php", "src/helpers/helpers.php"]
16+
"files": ["src/helpers/storage.php", "src/helpers/print.php", "src/helpers/simplified.php", "src/helpers/helpers.php", "src/helpers/interactive_help.php"]
1717
},
1818
"autoload-dev": {
1919
"files": ["tests/TestBaseCase.php"]

interactive

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
2+
# run the library in an interactive mode for doing analysis
3+
echo "Welcome to the PHP Text Analysis Interactive Console"
4+
echo "Type help(); to get a list of available functions"
5+
php -a -d auto_prepend_file=./vendor/autoload.php
6+

src/Analysis/DateAnalysis.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class DateAnalysis
2828
*
2929
* @param string $text
3030
*/
31-
public function __construct($text)
31+
public function __construct(string $text)
3232
{
3333
$tokenizer = new SentenceTokenizer();
3434
$this->sentences = $tokenizer->tokenize( $this->normalize($text)) ;
@@ -39,7 +39,7 @@ public function __construct($text)
3939
* ie Mar. to March
4040
* @param string $text
4141
*/
42-
protected function normalize($text)
42+
protected function normalize(string $text) : string
4343
{
4444
$search = ['jan.','feb.','mar.','apr.','may.','jun.','jul.','aug.','sep.','oct.','nov.','dec.'];
4545
$replace = [
@@ -62,7 +62,7 @@ protected function normalize($text)
6262
/**
6363
* @return DateTime[]
6464
*/
65-
public function getDates()
65+
public function getDates() : array
6666
{
6767
// return the cached copy
6868
if(empty($this->dates)) {

src/Collocations/CollocationFinder.php

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,25 @@
22

33
namespace TextAnalysis\Collocations;
44

5-
use TextAnalysis\NGrams\NGramFactory;
6-
use TextAnalysis\Analysis\FreqDist;
7-
85
/**
96
* Helps find popular phrases in the given set of tokens
107
* @author yooper
118
*/
129
class CollocationFinder
1310
{
11+
/**
12+
* The ngram size
13+
* @var int
14+
*/
1415
protected $nGramSize = 2;
1516

17+
/**
18+
*
19+
* @var array
20+
*/
1621
protected $tokens = [];
1722

18-
public function __construct(array $tokens, $nGramSize = 2)
23+
public function __construct(array $tokens, int $nGramSize = 2)
1924
{
2025
$this->tokens = $tokens;
2126
$this->nGramSize = $nGramSize;

src/Tokenizers/SentimentTokenizer.php

Lines changed: 0 additions & 19 deletions
This file was deleted.

src/helpers/helpers.php

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,38 @@ function text( string $text ): \TextAnalysis\Corpus\TextCorpus {
111111
}
112112
}
113113

114+
115+
if (! function_exists('rake')) {
116+
/**
117+
* Returns an instance of the Rake keyword extractor for the given tokens
118+
*
119+
* @param array $tokens
120+
*
121+
* @return \TextAnalysis\Analysis\Keywords\Rake
122+
*/
123+
function rake(array $tokens, int $ngramSize = 3): \TextAnalysis\Analysis\Keywords\Rake
124+
{
125+
return new \TextAnalysis\Analysis\Keywords\Rake(new \TextAnalysis\Documents\TokensDocument($tokens), $ngramSize);
126+
}
127+
}
128+
129+
if (! function_exists('stem')) {
130+
/**
131+
* Returns an array of stemmed tokens
132+
*
133+
* @param array $tokens
134+
*
135+
* @return array
136+
*/
137+
function stem(array $tokens, string $stemmerClassName = \TextAnalysis\Stemmers\PorterStemmer::class): array
138+
{
139+
$stemmer = new $stemmerClassName();
140+
return array_map(function($token) use($stemmer){ return $stemmer->stem($token); }, $tokens);
141+
}
142+
}
143+
144+
145+
114146
/**
115147
* Check if the given array has the given needle, using a case insensitive search.
116148
* Keeps a local copy of the normalized haystack for quicker lookup on the same array
@@ -155,3 +187,36 @@ function array_searchi(string $needle, array $haystack)
155187
return array_search($needle, $localCopy);
156188
}
157189

190+
191+
/**
192+
* Load a book into memory
193+
* @param string $filename
194+
* @return string
195+
*/
196+
function gutenberg(string $filename) : string
197+
{
198+
return file_get_contents(get_storage_path("corpora/gutenberg").$filename);
199+
}
200+
201+
/**
202+
* Return a list of books available
203+
* @return array
204+
*/
205+
function gutenberg_list() : array
206+
{
207+
return scan_dir(get_storage_path("corpora/gutenberg/"));
208+
}
209+
210+
/**
211+
* Shortcut function for getting contents of directory
212+
* @param string $dir
213+
* @return array
214+
*/
215+
function scan_dir(string $dir) : array
216+
{
217+
return array_diff(scandir($dir), ['..', '.']);
218+
}
219+
220+
221+
222+

src/helpers/interactive_help.php

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<?php
2+
3+
/**
4+
* List out a help menu in the interactive console
5+
*/
6+
function help()
7+
{
8+
$menu = [
9+
'text(string $text) -> Return a TextCorpus object',
10+
'normalize(string $text) -> Normalize text to lower case',
11+
'todo ....'
12+
];
13+
print_array($menu);
14+
}
15+

tests/TextAnalysis/Analysis/Keywords/RakeTest.php

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,25 @@ public function testRake()
3636
$this->assertArrayHasKey('8/8/2016 5:51 pm', $results);
3737
}
3838

39+
public function testSimplifiedRake()
40+
{
41+
$stopwords = array_map('trim', file(VENDOR_DIR.'yooper/stop-words/data/stop-words_english_1_en.txt'));
42+
// all punctuation must be moved 1 over. Fixes issues with sentences
43+
$testData = (new SpacePunctuationFilter([':','\/']))->transform($this->getTestData());
44+
//rake MUST be split on whitespace and new lines only
45+
$tokens = (new GeneralTokenizer(" \n\t\r"))->tokenize($testData);
46+
$tokenDoc = new TokensDocument($tokens);
47+
$tokenDoc->applyTransformation(new LowerCaseFilter())
48+
->applyTransformation(new StopWordsFilter($stopwords), false)
49+
->applyTransformation(new PunctuationFilter(['@',':','\/']), false)
50+
->applyTransformation(new CharFilter(), false);
51+
52+
$rake = rake($tokenDoc->toArray(), 3);
53+
$results = $rake->getKeywordScores();
54+
$this->assertArrayHasKey('minimal generating sets', $results);
55+
$this->assertArrayHasKey('8/8/2016 5:51 pm', $results);
56+
}
57+
3958
/**
4059
* Sample test data
4160
* @return string

tests/TextAnalysis/Stemmers/PorterStemmerTest.php

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,10 @@ public function testStemmer()
1818
$this->assertEquals('univers', $stemmer->stem('universities'));
1919
$this->assertEquals('judg',$stemmer->stem('judges'));
2020
}
21+
22+
public function testSimplifiedStemmer()
23+
{
24+
$this->assertEquals(['univers','judg'], stem(['universities', 'judges']));
25+
}
26+
2127
}

0 commit comments

Comments
 (0)