Basic wildcard impl. (issue #26)

haukex · Oct 27, 2024 · d7888d5 · d7888d5
1 parent cceb79a
commit d7888d5
Showing 1 changed file with 20 additions and 30 deletions.
diff --git a/src/workers/equiv.ts b/src/workers/equiv.ts
@@ -23,7 +23,8 @@
 
 import escapeStringRegexp from 'escape-string-regexp'
 import {assert} from '../js/common'
-import {default as alphabet} from './alphabet.json'
+//import {default as alphabet} from './alphabet.json'
+//const WORD_RE = new RegExp(alphabet.re.word)
 
 /**
  * The following is the list of character equivalencies.
@@ -121,6 +122,7 @@ for (const [l,r] of EQUIV) {
   }
 }
 const _pats = Array.from(_pat_dict.keys()).sort().sort((a,b) => b.length-a.length)
+// add `*` to the following so we split on wildcards too (special treatment below)
 const EQUIV_PAT = new RegExp( '(' + _pats.map(escapeStringRegexp).join('|') + '|\\*)', 'g')
 const EQUIV_REPL :Map<string, string> = new Map()
 for (const pat of _pats) {
@@ -138,46 +140,34 @@ for (const pat of _pats) {
   )
 }
 //console.debug(EQUIV_PAT, EQUIV_REPL)
-// make sure * isn't in dict because it gets special treatment as a wildcard:
+// make sure `*` isn't in dict because it gets special treatment as a wildcard:
 assert(!EQUIV_REPL.has('*'))
-const WORD_RE = new RegExp(alphabet.re.word)
 
 /**
  * This function takes a search word and turns it into a string suitable for use in a regular expression.
  * Two strings are returned: The first is the search word simply turned into a regular expression
  * (a stricter version, intended for use in giving exact matches a higher score), while the second string
  * contains mappings for the character equivalents (a looser pattern that will result in more matches).
+ * The latter is what will actually be used in the search.
  *
  * Callers are expected to have cleaned the search term with `cleanSearchTerm`.
  *
  * **Internal Note:** Our caller expects us to return a pattern **WITHOUT** anchors or capturing groups!
  */
 export function makeSearchPattern(what :string) : [string, string] {
-  // tuples of: 0. orig string, 1. escaped string, 2. with replacements
-  let prevItem :string|undefined = undefined
-  const parts :[string, string, string][] = what.split(EQUIV_PAT).filter(s => s.length)
-    .flatMap(v=>{  // strip duplicate '*'
-      const rv = prevItem==='*' && v==='*' ? [] : [v]
-      prevItem = v
-      return rv
-    })
-    .map(v=>{
-      const esc = escapeStringRegexp(v)
-      return [ v, esc, EQUIV_REPL.get(v)??esc ]
-    })
-  // loop over by index instead of `for(const part of parts)` because I need to look ahead/behind in `parts`
-  for ( let i=0; i<parts.length; i++ ) {
-    const part = parts[i]
-    assert(part)
-    if (part[0] === '*') {  // wildcard
-      const prevWord = !!( i>0 && parts[i-1]?.[0]?.at(-1)?.match(WORD_RE) )
-      const nextWord = !!( i+1<parts.length && parts[i+1]?.[0]?.[0]?.match(WORD_RE) )
-      //TODO: If the goal is to allow things like 'durchbruch*' to mean '\bdurchbruch.*', I'll need to inject that \b ...
-      console.debug(`prevWord=${prevWord} nextWord=${nextWord}`)  //TODO: debug, remove
-      part[1] = '.*?'
-      part[2] = '.*?'
-    }
-  }
-  console.debug(parts)  //TODO: Debug, remove
-  return [ parts.map(v=>v[1]).join(''), parts.map(v=>v[2]).join('') ]
+  // `parts` is tuples of: [ 0. escaped string, 1. with replacements ]
+  const parts :[string, string][] =
+    what.split(EQUIV_PAT).filter(s => s.length)
+      .flatMap( (v,i,arr) =>
+        v==='*' && (
+          i && arr[i-1]==='*'         // strip duplicate '*'
+          // a search for "*blah*" is equivalent to a search for "blah" since there are no anchors, so:
+          || i==0 || i==arr.length-1  // strip '*' at beginning or end
+        ) ? [] : [v] )
+      .map( v => {
+        const replacement = v==='*' ? '.*?' : escapeStringRegexp(v)
+        return [ replacement, EQUIV_REPL.get(v)??replacement ]
+      } )
+  //console.debug(parts)
+  return [ parts.map(v=>v[0]).join(''), parts.map(v=>v[1]).join('') ]
 }