-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathexample_02_document_collections.php
84 lines (60 loc) · 2.22 KB
/
example_02_document_collections.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
<?php
/**
 * An example of creating a document collection.
 *
 * Document collections allow you to work with a group of documents easily:
 * a list of transformations can be applied to every document in one call.
 *
 * The script reads two books, tokenizes them, filters the tokens, computes a
 * frequency distribution for the first document and writes an HTML bar chart
 * of its 10 most frequent tokens.
 *
 * All paths are relative to the current working directory.
 *
 * @throws \RuntimeException when a book cannot be read or the page cannot be written
 */
require_once('vendor/autoload.php');
// Used to generate a chart from the output of PHP Text Analysis
require_once('utils/BarPageBuilder.php');

$tomSawyerPath = 'data/books/pg74.txt';
$huckFinnPath = 'data/books/pg76.txt';
$outputPath = 'pub/pages/example_02_document_collections.html';

/**
 * Load the raw text of both books.
 * file_get_contents() returns false on failure; stop here instead of
 * tokenizing a boolean and producing an empty chart.
 */
$tomSawyerBook = file_get_contents($tomSawyerPath);
if ($tomSawyerBook === false) {
    throw new \RuntimeException('Unable to read book file: ' . $tomSawyerPath);
}

$huckFinnBook = file_get_contents($huckFinnPath);
if ($huckFinnBook === false) {
    throw new \RuntimeException('Unable to read book file: ' . $huckFinnPath);
}

/**
 * Create a tokenizer object to parse each book into a set of tokens
 */
$tokenizer = new \TextAnalysis\Tokenizers\GeneralTokenizer();

/**
 * Get the set of tokens generated by the tokenizer and
 * create a token document from the tokens of each book
 */
$tomSawyerDocument = new \TextAnalysis\Documents\TokensDocument($tokenizer->tokenize($tomSawyerBook));
$huckFinnDocument = new \TextAnalysis\Documents\TokensDocument($tokenizer->tokenize($huckFinnBook));

/**
 * Create a document collection that can have filters or further analysis done.
 * Tom Sawyer is added first, Huck Finn second.
 */
$docCollection = new \TextAnalysis\Collections\DocumentArrayCollection(array($tomSawyerDocument, $huckFinnDocument));

/**
 * Filters to apply to the document collection:
 * lower case the documents, remove quotes and remove stop words
 */
$filters = array(
    new \TextAnalysis\Filters\LowerCaseFilter(),
    new \TextAnalysis\Filters\QuotesFilter(),
    new \TextAnalysis\Filters\EnglishStopWordsFilter()
);

/**
 * Applies the filters to all the documents
 */
$docCollection->applyTransformations($filters);

/**
 * See how the top 10 keyword frequency has changed by applying the filters compared to example 01.
 * Index 0 is the first document passed to the collection above (Tom Sawyer).
 */
$freqDist = new \TextAnalysis\Analysis\FreqDist($docCollection[0]->getDocumentData());

/**
 * Get the top 10 most used words in Tom Sawyer.
 *
 * array_slice() is used instead of array_splice(): array_splice() takes its
 * array by reference, so handing it a method's return value raises
 * "Only variables should be passed by reference". The extracted slice is the
 * same (string keys kept, integer keys renumbered).
 * NOTE(review): presumably getKeyValuesByFrequency() is ordered from most to
 * least frequent - confirm against the FreqDist implementation.
 */
$top10 = array_slice($freqDist->getKeyValuesByFrequency(), 0, 10);

/**
 * Use High Charts to visualize the data
 */
$pageBuilder = new BarPageBuilder($top10);
$html = $pageBuilder->getHtmlPage();

// Only tell the user to open the page if it was actually written.
if (file_put_contents($outputPath, $html) === false) {
    throw new \RuntimeException('Unable to write chart page: ' . $outputPath);
}

echo 'go to the directory pub/pages/example_02_document_collections.html and open the file with your web browser'.PHP_EOL;
/**
 * go to the directory in this project and open the file with your web browser
 */