Skip to content

Commit ba5c76f

Browse files
authored
Merge pull request #45 from thiagogomesverissimo/improving-concordance
Improving concordance method
2 parents ad5b139 + 0436a62 commit ba5c76f

File tree

4 files changed

+5705
-18
lines changed

4 files changed

+5705
-18
lines changed

src/Corpus/TextCorpus.php

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -73,29 +73,63 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
7373
/**
7474
* See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
7575
* @param string $needle
76-
* @param int $spacing The amount of space left and right of the found needle
76+
* @param int $contextLength The number of characters of context to keep on each side of the found needle
77+
* @param bool $ignorecase Whether to match the needle case-insensitively (adds the regex "i" modifier)
78+
* @param string $position Where the needle must occur within a word. Available options: contain, begin, end, equal. Any other value is treated as contain.
7779
* @return array
7880
*/
79-
public function concordance(string $needle, int $spacing = 20) : array
81+
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
8082
{
81-
$position = 0;
83+
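// Temporary workaround for multibyte characters: convert the text and the needle from UTF-8 to ISO-8859-1 before the byte-based strlen/substr calls below. NOTE: this overwrites $this->text.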
84+
$this->text = utf8_decode($this->text);
85+
$needle = utf8_decode($needle);
86+
8287
$found = [];
83-
$text = trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $this->text));
88+
$text = ' ' . trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $this->text)) . ' ';
8489
$needleLength = strlen($needle);
8590
$textLength = strlen($text);
86-
$bufferLength = $needleLength + 2 * $spacing;
87-
88-
while (($position = stripos($text, $needle, $position))!== false)
89-
{
90-
$left = max($position - $spacing, 0);
91-
if($needleLength + $spacing + $position > $textLength) {
92-
$tmp = substr($text, $left);
93-
} else {
91+
$bufferLength = $needleLength + 2 * $contextLength;
92+
93+
// \p{L} or \p{Letter}: any kind of letter from any language.
94+
95+
$special_chars = "\/\-_\'";
96+
$word_part = '\p{L}'.$special_chars;
97+
98+
switch ($position) {
99+
case 'equal':
100+
$pattern = "/[^$word_part]($needle)[^$word_part]/";
101+
break;
102+
case 'begin':
103+
$pattern = "/[^$word_part]($needle)[$special_chars]?[\p{L}]*|^($needle)/";
104+
break;
105+
case 'end':
106+
$pattern = "/[\p{L}]*[$special_chars]?[\p{L}]*($needle)[^$word_part]/";
107+
break;
108+
case 'contain':
109+
$pattern = "/($needle)/";
110+
break;
111+
default:
112+
$pattern = "/($needle)/";
113+
break;
114+
}
115+
116+
$case = $ignorecase ? 'i' : '';
117+
preg_match_all($pattern.$case, $text, $matches, PREG_OFFSET_CAPTURE);
118+
119+
// Getting excerpts
120+
foreach($matches[1] as $match) {
121+
122+
$needlePosition = $match[1];
123+
$left = max($needlePosition - $contextLength, 0);
124+
125+
if($needleLength + $contextLength + $needlePosition > $textLength) {
126+
$tmp = substr($text, $left);
127+
} else {
94128
$tmp = substr($text, $left, $bufferLength);
95-
}
96-
$found[] = $tmp;
97-
$position += $needleLength;
129+
}
130+
$found[] = utf8_encode($tmp);
98131
}
132+
99133
return $found;
100134
}
101135

tests/TestBaseCase.php

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class TestBaseCase extends \PHPUnit_Framework_TestCase
1717
* @var string
1818
*/
1919
static protected $text = null;
20+
static protected $text_ptbr = null;
2021

2122
/**
2223
*
@@ -33,14 +34,25 @@ public function setUp()
3334
//load the text file
3435
if(is_null(self::$text)) {
3536
self::$text = file_get_contents(TESTS_PATH.DS.'data'.DS.'books'.DS.'tom_sawyer.txt');
37+
self::$text_ptbr = file_get_contents(TESTS_PATH.DS.'data'.DS.'books'.DS.'/ptbr/Dom_Casmurro.txt');
3638
}
3739
}
3840

39-
public function getText() : string
41+
public function getText(string $language = 'en') : string
4042
{
41-
return self::$text;
43+
switch($language) {
44+
case 'ptbr':
45+
return self::$text_ptbr;
46+
break;
47+
case 'en':
48+
return self::$text;
49+
break;
50+
default:
51+
return self::$text;
52+
break;
53+
}
4254
}
43-
55+
4456
/**
4557
*
4658
* @param string $className

tests/TextAnalysis/Corpus/TextCorpusTest.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ public function testConcordance()
2424
$this->assertCount(34, $results);
2525
}
2626

27+
public function testConcordancePtBr()
28+
{
29+
$corpus = new TextCorpus($this->getText('ptbr'));
30+
$results = $corpus->concordance("José",20, true, 'equal');
31+
$this->assertCount(160, $results);
32+
}
33+
2734
public function testTokenizer()
2835
{
2936
$corpus = new TextCorpus($this->getText());

0 commit comments

Comments
 (0)