Merge pull request #45 from thiagogomesverissimo/improving-concordance

yooper · web-flow · commit ba5c76f7a60f · 2018-12-19T07:54:08.000-05:00
Improving concordance method
diff --git a/src/Corpus/TextCorpus.php b/src/Corpus/TextCorpus.php
@@ -73,29 +73,63 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
     /**
      * See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
      * @param string $needle
-     * @param int $spacing The amount of space left and right of the found needle
+     * @param int $contextLength The amount of space left and right of the found needle
+     * @param bool $ignorecase Whether matching is case-insensitive (adds the PCRE "i" modifier)
+     * @param string $position How the needle must sit within a word. Available options: contain, begin, end, equal (any other value behaves like contain).
      * @return array
      */
-    public function concordance(string $needle, int $spacing = 20) : array
+    public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
     {
-        $position = 0;
+        // temporary solution to handle unicode chars
+        $this->text = utf8_decode($this->text);
+        $needle = utf8_decode($needle);
+        
         $found = [];
-        $text = trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $this->text));
+        $text = ' ' . trim(preg_replace('/[\s\t\n\r\s]+/', ' ', $this->text)) . ' ';
         $needleLength = strlen($needle);
         $textLength = strlen($text);
-        $bufferLength = $needleLength + 2 * $spacing;
-                        
-        while (($position = stripos($text, $needle, $position))!== false) 
-        {
-            $left = max($position - $spacing, 0);                        
-            if($needleLength + $spacing + $position > $textLength) {
-                $tmp = substr($text, $left); 
-            } else { 
+        $bufferLength = $needleLength + 2 * $contextLength;
+
+        // \p{L} or \p{Letter}: any kind of letter from any language.
+
+        $special_chars = "\/\-_\'";
+        $word_part = '\p{L}'.$special_chars;
+
+        switch ($position) {
+            case 'equal':
+                $pattern = "/[^$word_part]($needle)[^$word_part]/";
+                break;
+            case 'begin':
+                $pattern = "/[^$word_part]($needle)[$special_chars]?[\p{L}]*|^($needle)/";
+                break;
+            case 'end':
+                $pattern = "/[\p{L}]*[$special_chars]?[\p{L}]*($needle)[^$word_part]/";
+                break;
+            case 'contain':
+                $pattern = "/($needle)/";
+                break;
+            default:
+                $pattern = "/($needle)/";
+                break;
+        }
+
+        $case = $ignorecase ? 'i' : '';
+        preg_match_all($pattern.$case, $text, $matches, PREG_OFFSET_CAPTURE);
+
+        // Getting excerpts
+        foreach($matches[1] as $match) {
+
+            $needlePosition = $match[1];
+            $left = max($needlePosition - $contextLength, 0);
+
+            if($needleLength + $contextLength + $needlePosition > $textLength) {
+                $tmp = substr($text, $left);
+            } else {
                 $tmp = substr($text, $left, $bufferLength);
-            }            
-            $found[] = $tmp;
-            $position += $needleLength;
+            }
+            $found[] = utf8_encode($tmp);
         }
+
         return $found;
     }
     
diff --git a/tests/TestBaseCase.php b/tests/TestBaseCase.php
@@ -17,6 +17,7 @@ class TestBaseCase extends \PHPUnit_Framework_TestCase
      * @var string 
      */
     static protected $text = null;
+    static protected $text_ptbr = null;
 
     /**
      *
@@ -33,14 +34,25 @@ public function setUp()
         //load the text file
         if(is_null(self::$text)) { 
             self::$text = file_get_contents(TESTS_PATH.DS.'data'.DS.'books'.DS.'tom_sawyer.txt');
+            self::$text_ptbr = file_get_contents(TESTS_PATH.DS.'data'.DS.'books'.DS.'/ptbr/Dom_Casmurro.txt');
         }        
     }
     
-    public function getText() : string
+    public function getText(string $language = 'en') : string
     {
-        return self::$text;
+        switch($language) {
+            case 'ptbr':
+                return self::$text_ptbr;
+                break;
+            case 'en':
+                return self::$text;
+                break;
+            default:
+                return self::$text;
+                break;
+        }
     }
-    
+
     /**
      * 
      * @param string $className
diff --git a/tests/TextAnalysis/Corpus/TextCorpusTest.php b/tests/TextAnalysis/Corpus/TextCorpusTest.php
@@ -24,6 +24,13 @@ public function testConcordance()
         $this->assertCount(34, $results);
     }
     
+    public function testConcordancePtBr()
+    {
+        $corpus = new TextCorpus($this->getText('ptbr'));
+        $results = $corpus->concordance("José",20, true, 'equal');
+        $this->assertCount(160, $results);
+    }
+
     public function testTokenizer()
     {
         $corpus = new TextCorpus($this->getText());
diff --git a/tests/data/books/ptbr/Dom_Casmurro.txt b/tests/data/books/ptbr/Dom_Casmurro.txt

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ class TestBaseCase extends \PHPUnit_Framework_TestCase`
`17`	`17`	`* @var string`
`18`	`18`	`*/`
`19`	`19`	`static protected $text = null;`
	`20`	`+ static protected $text_ptbr = null;`
`20`	`21`
`21`	`22`	`/**`
`22`	`23`	`*`
`@@ -33,14 +34,25 @@ public function setUp()`
`33`	`34`	`//load the text file`
`34`	`35`	`if(is_null(self::$text)) {`
`35`	`36`	`self::$text = file_get_contents(TESTS_PATH.DS.'data'.DS.'books'.DS.'tom_sawyer.txt');`
	`37`	`+ self::$text_ptbr = file_get_contents(TESTS_PATH.DS.'data'.DS.'books'.DS.'/ptbr/Dom_Casmurro.txt');`
`36`	`38`	`}`
`37`	`39`	`}`
`38`	`40`
`39`		`- public function getText() : string`
	`41`	`+ public function getText(string $language = 'en') : string`
`40`	`42`	`{`
`41`		`- return self::$text;`
	`43`	`+ switch($language) {`
	`44`	`+ case 'ptbr':`
	`45`	`+ return self::$text_ptbr;`
	`46`	`+ break;`
	`47`	`+ case 'en':`
	`48`	`+ return self::$text;`
	`49`	`+ break;`
	`50`	`+ default:`
	`51`	`+ return self::$text;`
	`52`	`+ break;`
	`53`	`+ }`
`42`	`54`	`}`
`43`		`-`
	`55`	`+`
`44`	`56`	`/**`
`45`	`57`	`*`
`46`	`58`	`* @param string $className`
Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,13 @@ public function testConcordance()`
`24`	`24`	`$this->assertCount(34, $results);`
`25`	`25`	`}`
`26`	`26`
	`27`	`+ public function testConcordancePtBr()`
	`28`	`+ {`
	`29`	`+ $corpus = new TextCorpus($this->getText('ptbr'));`
	`30`	`+ $results = $corpus->concordance("José",20, true, 'equal');`
	`31`	`+ $this->assertCount(160, $results);`
	`32`	`+ }`
	`33`	`+`
`27`	`34`	`public function testTokenizer()`
`28`	`35`	`{`
`29`	`36`	`$corpus = new TextCorpus($this->getText());`