@@ -73,29 +73,63 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
73
73
/**
 * Return an excerpt of the text around every occurrence of $needle.
 *
 * The text is whitespace-normalised (runs of whitespace collapse to a single
 * space) and padded with one space on each side before searching, so excerpts
 * taken at the very start or end of the text include that padding space.
 * The needle is always matched literally; regular-expression metacharacters
 * in it have no special meaning. All lengths are counted in characters, not
 * bytes, and the stored text of the object is never modified.
 *
 * See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
 *
 * @param string $needle        Literal text to search for; an empty needle yields no results.
 * @param int    $contextLength Number of characters kept left and right of the found needle
 *                              (negative values are treated as 0).
 * @param bool   $ignorecase    Whether matching is case-insensitive.
 * @param string $position      Where the needle must sit inside a word. Available options:
 *                              contain, begin, end, equal. Any other value behaves like "contain".
 * @return string[] One excerpt per occurrence, in order of appearance.
 */
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain'): array
{
    // An empty pattern would match at every offset of the text.
    if ($needle === '') {
        return [];
    }
    $contextLength = max($contextLength, 0);

    // Work on local copies only: overwriting $this->text would corrupt the
    // object for every later call. Converting UTF-8 to UTF-8 replaces any
    // invalid byte sequence, so the /u patterns below cannot fail on bad input.
    $needle = mb_convert_encoding($needle, 'UTF-8', 'UTF-8');
    $source = mb_convert_encoding((string) $this->text, 'UTF-8', 'UTF-8');

    // ASCII-only whitespace class: safe to apply byte-wise to UTF-8 data.
    $collapsed = preg_replace('/[ \t\n\r\f\x0B]+/', ' ', $source);
    $text = ' ' . trim($collapsed ?? $source) . ' ';

    $needleLength = mb_strlen($needle, 'UTF-8');
    $bufferLength = $needleLength + 2 * $contextLength;

    // \p{L} or \p{Letter}: any kind of letter from any language.
    // A "word" is made of letters plus the characters / - _ '
    $wordChars = '\p{L}\/\-_\'';
    $noWordCharBefore = "(?<![{$wordChars}])";
    $noWordCharAfter = "(?![{$wordChars}])";
    $quotedNeedle = preg_quote($needle, '/');

    // Look-arounds assert the word boundary without consuming it, so two
    // occurrences separated by a single character are both reported.
    switch ($position) {
        case 'equal':
            $pattern = $noWordCharBefore . $quotedNeedle . $noWordCharAfter;
            break;
        case 'begin':
            $pattern = $noWordCharBefore . $quotedNeedle;
            break;
        case 'end':
            $pattern = $quotedNeedle . $noWordCharAfter;
            break;
        case 'contain':
        default:
            $pattern = $quotedNeedle;
            break;
    }

    $modifiers = $ignorecase ? 'iu' : 'u';
    $matches = [];
    $matchCount = preg_match_all('/' . $pattern . '/' . $modifiers, $text, $matches, PREG_OFFSET_CAPTURE);
    if ($matchCount === false || $matchCount === 0) {
        return [];
    }

    // Getting excerpts. PREG_OFFSET_CAPTURE reports byte offsets; they are
    // converted to character offsets incrementally (matches arrive in
    // ascending order) so that multi-byte characters are never split.
    $found = [];
    $byteCursor = 0;
    $charCursor = 0;
    foreach ($matches[0] as $match) {
        $byteOffset = $match[1];
        $charCursor += mb_strlen(substr($text, $byteCursor, $byteOffset - $byteCursor), 'UTF-8');
        $byteCursor = $byteOffset;

        $left = max($charCursor - $contextLength, 0);
        // mb_substr() stops at the end of the text by itself.
        $found[] = mb_substr($text, $left, $bufferLength, 'UTF-8');
    }

    return $found;
}
101
135
0 commit comments