Skip to content

Commit ab9147b

Browse files
authored
Merge pull request #24 from yooper/twitter_tokenizer
Twitter tokenizer
2 parents bae31c6 + e4fe186 commit ab9147b

File tree

2 files changed

+169
-0
lines changed

2 files changed

+169
-0
lines changed

src/Tokenizers/TwitterTokenizer.php

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
<?php
2+
3+
4+
namespace TextAnalysis\Tokenizers;
5+
6+
/**
 * Tokenize tweets, based on http://www.nltk.org/_modules/nltk/tokenize/casual.html
 *
 * All patterns returned by getRegexes() are joined into one alternation and
 * every match found in the input is returned as a token, in order of
 * appearance. Whitespace is never part of a token unless one of the patterns
 * explicitly allows it (phone numbers, ellipsis dots).
 *
 * @author yooper
 */
class TwitterTokenizer extends TokenizerAbstract
{
    /**
     * Split a string into tweet-aware tokens.
     *
     * The joined pattern is applied with the "u" (UTF-8), "i" (case
     * insensitive) and "m" (multi-line anchors) modifiers.
     *
     * @param string $string the text to tokenize
     * @return array list of matched tokens; empty when nothing matched or
     *               when the regex engine reported a failure
     */
    public function tokenize($string): array
    {
        $matches = [];
        $found = preg_match_all("~{$this->getJoinedRegexes()}~uim", $string, $matches);

        // preg_match_all() returns false on failure (for example when the
        // subject is not valid UTF-8 under the "u" modifier). Treat that the
        // same as "no tokens" so the declared array return type always holds.
        if ($found === false || $found === 0) {
            return [];
        }

        return $matches[0];
    }

    /**
     * Return an array of regexes, ordered by priority.
     *
     * At any given position in the input the first pattern in this list that
     * matches wins, so more specific patterns must come before generic ones.
     * getHtmlTags() and getEmailAddress() are not part of this list.
     *
     * @return array list of regex fragments (without delimiters)
     */
    public function getRegexes() : array
    {
        return [
            $this->getPhoneNumbers(),
            $this->getUrls(),
            $this->getEmoticons(),
            $this->getHashTags(),
            $this->getAsciiArrows(),
            $this->getUsernames(),
            $this->getWordsWith(),
            $this->getNumbers(),
            $this->getWordsWithout(),
            $this->getEllipsisDots(),
            $this->getEverythingElse(),
        ];
    }

    /**
     * Join all patterns from getRegexes() into a single alternation.
     *
     * @return string regex body without delimiters or modifiers
     */
    public function getJoinedRegexes() : string
    {
        return implode("|", $this->getRegexes());
    }

    /**
     * Pattern for an HTML-like tag: "<", one or more characters that are
     * neither ">" nor whitespace, then ">".
     *
     * @return string
     */
    public function getHtmlTags() : string
    {
        return '<[^>\s]+>';
    }

    /**
     * Pattern for ASCII arrows: one or more dashes followed by ">", or "<"
     * followed by one or more dashes.
     *
     * @return string
     */
    public function getAsciiArrows() : string
    {
        return '[\-]+>|<[\-]+';
    }

    /**
     * Pattern for @mentions: "@" followed by one or more word characters.
     *
     * @return string
     */
    public function getUsernames() : string
    {
        return '(?:@[\w_]+)';
    }

    /**
     * Pattern for hashtags: one or more "#" followed by word characters,
     * optionally containing apostrophes or dashes in the middle. The tag
     * body must start and end with a word character and is at least two
     * characters long.
     *
     * @return string
     */
    public function getHashTags() : string
    {
        return '(?:\#+[\w_]+[\w\'_\-]*[\w_]+)';
    }

    /**
     * Words with apostrophes or dashes.
     *
     * Matches a letter, then one or more letters, apostrophes, dashes or
     * underscores, then a letter. Plain words of three or more letters match
     * this pattern as well.
     *
     * @return string
     */
    public function getWordsWith() : string
    {
        return "(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])";
    }

    /**
     * Pattern for an e-mail address (local part, "@", dotted domain).
     *
     * @return string
     */
    public function getEmailAddress() : string
    {
        return '[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]';
    }

    /**
     * Pattern for numbers made of two digit groups joined by one of
     * "," "/" "." ":" "-", with an optional sign before and after
     * (for example 1,000 or 3.14 or 12:30). Digit runs without a separator
     * do not match here.
     *
     * @return string
     */
    public function getNumbers() : string
    {
        return '(?:[+\-]?\d+[,/.:-]\d+[+\-]?)';
    }

    /**
     * Words without apostrophes or dashes.
     *
     * Matches any run of word characters, which includes digits and
     * underscores.
     *
     * @return string
     */
    public function getWordsWithout() : string
    {
        return '(?:[\w_]+)';
    }

    /**
     * Pattern for an ellipsis: a dot followed by one or more further dots,
     * each optionally preceded by whitespace.
     *
     * @return string
     */
    public function getEllipsisDots() : string
    {
        return '(?:\.(?:\s*\.){1,})';
    }

    /**
     * Everything else that isn't whitespace.
     *
     * Single-character fallback, used when no other pattern matches.
     *
     * @return string
     */
    public function getEverythingElse() : string
    {
        return '(?:\S)';
    }

    /**
     * Pattern for phone numbers with optional country code, area code and
     * extension.
     *
     * Taken from https://stackoverflow.com/questions/2113908/what-regular-expression-will-match-valid-international-phone-numbers
     *
     * @return string
     */
    public function getPhoneNumbers() : string
    {
        return '(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?';
    }

    /**
     * Pattern for URLs with or without a scheme.
     *
     * Every part except "host.tld" is optional. Because the optional
     * user-info part ends in "@", strings of the form user@host.tld match
     * this pattern too. The top-level domain is limited to 2-4 letters.
     *
     * Taken from https://stackoverflow.com/questions/6427530/regular-expression-pattern-to-match-url-with-or-without-http-www
     *
     * @return string
     */
    public function getUrls() : string
    {
        return implode('', [
            '((https?|ftp)://)?', // SCHEME
            '([a-z0-9+!*(),;?&=$_.-]+(:[a-z0-9+!*(),;?&=$_.-]+)?@)?', // User and Pass
            '([a-z0-9\-\.]*)\.(([a-z]{2,4})|([0-9]{1,3}\.([0-9]{1,3})\.([0-9]{1,3})))', // Host or IP
            "(:[0-9]{2,5})?", // Port
            '(/([a-z0-9+$_%-]\.?)+)*/?', // Path
            '(\?[a-z+&\$_.-][a-z0-9;:@&%=+/$_.-]*)?', // GET Query
            '(#[a-z_.-][a-z0-9+$%_.-]*)?', // Anchor
        ]);
    }

    /**
     * Pattern for emoticons.
     *
     * Alternatives, in order: ":shortcode:" style names, hearts ("<3",
     * "</3", "<\3"), reversed emoticons (mouth first, e.g. "(-:") and
     * regular emoticons (eyes first, e.g. ":-)"). A match must be followed
     * by whitespace, one of "!" "." "?", or the end of the line.
     *
     * @return string
     */
    public function getEmoticons() : string
    {
        // A literal backslash inside a PCRE character class needs "\\" in the
        // pattern, which is "\\\\" in a single-quoted PHP string. With only
        // two backslashes PHP hands PCRE a single one, which then escapes the
        // closing bracket and merges the heart and reversed-emoticon
        // alternatives into one character class.
        return '(\:\w+\:|\<[\/\\\\]?3|[\(\)\\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\\\)\(\/\|])(?=\s|[\!\.\?]|$)';
    }
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<?php
2+
3+
namespace Tests\TextAnalysis\Tokenizers;
4+
5+
use TextAnalysis\Tokenizers\TwitterTokenizer;
6+
7+
/**
 * Tests for the tweet tokenizer.
 *
 * @author yooper
 */
class TwitterTokenizerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * A noisy tweet with hashtags, a mention, repeated punctuation and
     * non-ASCII symbols is split into the expected number of tokens.
     */
    public function testTokenizer()
    {
        $tweet = 'This is a common Tweet #format where @mentions and.errors!!!!like this:-))))) might #appear❤ ❤☺❤#ThisIsAHashtag!?!';

        $tokens = (new TwitterTokenizer())->tokenize($tweet);

        $this->assertCount(33, $tokens);
        $this->assertContains('@mentions', $tokens);
        $this->assertContains('#ThisIsAHashtag', $tokens);
    }

    /**
     * A URL, a phone number, an e-mail address and an emoticon are each kept
     * together as a single token.
     */
    public function testForUrlAndEmail()
    {
        // The address uses the reserved example.com domain; the fixture
        // needs a real user@host.tld form for the expected count to hold.
        $text = 'Custom Software Development http://redbeardtechnologies.com/ 906-555-5555 or contact support at support@example.com :-)';

        $tokens = (new TwitterTokenizer())->tokenize($text);

        $this->assertCount(11, $tokens);
        $this->assertContains('http://redbeardtechnologies.com/', $tokens);
        $this->assertContains('906-555-5555', $tokens);
        $this->assertContains('support@example.com', $tokens);
        $this->assertContains(':-)', $tokens);
    }
}

0 commit comments

Comments
 (0)