|
| 1 | +<?php |
| 2 | + |
| 3 | + |
| 4 | +namespace TextAnalysis\Tokenizers; |
| 5 | + |
/**
 * Tokenize tweets, based on http://www.nltk.org/_modules/nltk/tokenize/casual.html
 *
 * The tokenizer joins a prioritised list of regular-expression fragments
 * (see getRegexes()) into a single alternation and returns every match of
 * that alternation as one token.
 *
 * @author yooper
 */
class TwitterTokenizer extends TokenizerAbstract
{
    /**
     * Split a string into tweet-aware tokens.
     *
     * The joined pattern is delimited with "~" and run with the modifiers
     * u (UTF-8), i (case-insensitive) and m (multi-line), so fragments must
     * not contain an unescaped "~".
     *
     * @param string $string text to tokenize
     * @return array matched tokens in order of appearance; an empty array when
     *               nothing matches or when PCRE reports an error (for example
     *               malformed UTF-8 input) - inspect preg_last_error() to tell
     *               the two apart
     */
    public function tokenize($string): array
    {
        $matches = [];
        $found = preg_match_all("~{$this->getJoinedRegexes()}~uim", $string, $matches);

        // preg_match_all() returns 0 for "no match" and false on a PCRE error.
        // Treat both as "no tokens" instead of reading $matches after a failure.
        if (!$found) {
            return [];
        }

        return $matches[0];
    }

    /**
     * Return an array of regexes, ordered by priority.
     *
     * Earlier fragments win when several could match at the same position,
     * so specific patterns come first and the single-character catch-all last.
     *
     * NOTE(review): getHtmlTags() and getEmailAddress() are defined on this
     * class but are not part of this list - confirm whether that is intended.
     *
     * @return array list of regex fragments (without delimiters)
     */
    public function getRegexes(): array
    {
        return [
            $this->getPhoneNumbers(),
            $this->getUrls(),
            $this->getEmoticons(),
            $this->getHashTags(),
            $this->getAsciiArrows(),
            $this->getUsernames(),
            $this->getWordsWith(),
            $this->getNumbers(),
            $this->getWordsWithout(),
            $this->getEllipsisDots(),
            $this->getEverythingElse(),
        ];
    }

    /**
     * Join all fragments from getRegexes() into one alternation.
     *
     * @return string the fragments separated by "|", without delimiters or modifiers
     */
    public function getJoinedRegexes(): string
    {
        return implode("|", $this->getRegexes());
    }

    /**
     * Fragment for an HTML-like tag: "<", one or more characters that are
     * neither ">" nor whitespace, then ">".
     *
     * @return string
     */
    public function getHtmlTags(): string
    {
        return '<[^>\s]+>';
    }

    /**
     * Fragment for ASCII arrows: one or more hyphens followed by ">", or "<"
     * followed by one or more hyphens.
     *
     * @return string
     */
    public function getAsciiArrows(): string
    {
        return '[\-]+>|<[\-]+';
    }

    /**
     * Fragment for @-mentions: "@" followed by one or more word characters.
     *
     * @return string
     */
    public function getUsernames(): string
    {
        return '(?:@[\w_]+)';
    }

    /**
     * Fragment for hashtags: one or more "#", then word characters that may
     * contain apostrophes or hyphens in the interior. The pattern needs at
     * least two word characters after the "#".
     *
     * @return string
     */
    public function getHashTags(): string
    {
        return '(?:\#+[\w_]+[\w\'_\-]*[\w_]+)';
    }

    /**
     * Words with apostrophes or dashes: a letter, one or more letters,
     * apostrophes, hyphens or underscores, then a closing letter.
     *
     * @return string
     */
    public function getWordsWith(): string
    {
        return "(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])";
    }

    /**
     * Fragment for e-mail addresses (local part, "@", dotted domain).
     *
     * @return string
     */
    public function getEmailAddress(): string
    {
        return '[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]';
    }

    /**
     * Fragment for compound numbers: optional sign, digits, exactly one
     * separator out of , / . : - and more digits, optionally followed by a sign.
     * Plain integers are not matched here; they fall through to getWordsWithout().
     *
     * @return string
     */
    public function getNumbers(): string
    {
        return '(?:[+\-]?\d+[,/.:-]\d+[+\-]?)';
    }

    /**
     * Words without apostrophes or dashes: a run of word characters.
     *
     * @return string
     */
    public function getWordsWithout(): string
    {
        return '(?:[\w_]+)';
    }

    /**
     * Fragment for ellipses: a dot followed by one or more further dots, each
     * of which may be preceded by whitespace.
     *
     * @return string
     */
    public function getEllipsisDots(): string
    {
        return '(?:\.(?:\s*\.){1,})';
    }

    /**
     * Everything else that isn't whitespace, one character at a time.
     *
     * @return string
     */
    public function getEverythingElse(): string
    {
        return '(?:\S)';
    }

    /**
     * Fragment for phone numbers: optional leading "1" (with optional "+"),
     * optional three-digit area code (bare or in parentheses), three-digit
     * exchange, four-digit line number and an optional extension introduced by
     * "#", "x", "ext" or "extension".
     *
     * Taken from https://stackoverflow.com/questions/2113908/what-regular-expression-will-match-valid-international-phone-numbers
     *
     * @return string
     */
    public function getPhoneNumbers(): string
    {
        return '(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?';
    }

    /**
     * Fragment for URLs. Only the host part is mandatory; scheme, credentials,
     * port, path, query and anchor are all optional, so bare host names such
     * as example.com match without a scheme.
     *
     * Taken from https://stackoverflow.com/questions/6427530/regular-expression-pattern-to-match-url-with-or-without-http-www
     *
     * @return string
     */
    public function getUrls(): string
    {
        $regex = '((https?|ftp)://)?'; // scheme
        $regex .= '([a-z0-9+!*(),;?&=$_.-]+(:[a-z0-9+!*(),;?&=$_.-]+)?@)?'; // user and password
        $regex .= '([a-z0-9\-\.]*)\.(([a-z]{2,4})|([0-9]{1,3}\.([0-9]{1,3})\.([0-9]{1,3})))'; // host or IP
        $regex .= "(:[0-9]{2,5})?"; // port
        $regex .= '(/([a-z0-9+$_%-]\.?)+)*/?'; // path
        $regex .= '(\?[a-z+&\$_.-][a-z0-9;:@&%=+/$_.-]*)?'; // GET query
        $regex .= '(#[a-z_.-][a-z0-9+$%_.-]*)?'; // anchor

        return $regex;
    }

    /**
     * Fragment for emoticons. An emoticon only counts when it is followed by
     * whitespace, one of ! . ? or the end of the line.
     *
     * A literal backslash inside a PCRE character class has to be written as
     * four backslashes here: PHP single-quoted strings collapse each "\\" to a
     * single backslash before PCRE sees the pattern. Writing only two turned
     * the following "]" into an escaped bracket, which fused the heart and
     * reversed-face alternatives into one pattern and kept "<3" from matching.
     *
     * @return string
     */
    public function getEmoticons(): string
    {
        $alternatives = [
            // :shortcode:
            '\:\w+\:',
            // hearts: <3, </3 and <\3
            '\<[\/\\\\]?3',
            // reversed faces, mouth first: (: , D-: , |;
            '[\(\)\\\\D|\*\$][\-\^]?[\:\;\=]',
            // regular faces, eyes first: :-) , ;P , 8-D
            '[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\\\)\(\/\|]',
        ];

        return '(' . implode('|', $alternatives) . ')(?=\s|[\!\.\?]|$)';
    }
}
0 commit comments