Skip to content

Commit

Permalink
Basic wildcard impl. (issue #26)
Browse files Browse the repository at this point in the history
  • Loading branch information
haukex committed Oct 27, 2024
1 parent cceb79a commit d7888d5
Showing 1 changed file with 20 additions and 30 deletions.
50 changes: 20 additions & 30 deletions src/workers/equiv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@

import escapeStringRegexp from 'escape-string-regexp'
import {assert} from '../js/common'
import {default as alphabet} from './alphabet.json'
//import {default as alphabet} from './alphabet.json'
//const WORD_RE = new RegExp(alphabet.re.word)

/**
* The following is the list of character equivalencies.
Expand Down Expand Up @@ -121,6 +122,7 @@ for (const [l,r] of EQUIV) {
}
}
const _pats = Array.from(_pat_dict.keys()).sort().sort((a,b) => b.length-a.length)
// add `*` to the following so we split on wildcards too (special treatment below)
const EQUIV_PAT = new RegExp( '(' + _pats.map(escapeStringRegexp).join('|') + '|\\*)', 'g')
const EQUIV_REPL :Map<string, string> = new Map()
for (const pat of _pats) {
Expand All @@ -138,46 +140,34 @@ for (const pat of _pats) {
)
}
//console.debug(EQUIV_PAT, EQUIV_REPL)
// make sure * isn't in dict because it gets special treatment as a wildcard:
// make sure `*` isn't in dict because it gets special treatment as a wildcard:
assert(!EQUIV_REPL.has('*'))
const WORD_RE = new RegExp(alphabet.re.word)

/**
* This function takes a search word and turns it into a string suitable for use in a regular expression.
* Two strings are returned: The first is the search word simply turned into a regular expression
* (a stricter version, intended for use in giving exact matches a higher score), while the second string
* contains mappings for the character equivalents (a looser pattern that will result in more matches).
* The latter is what will actually be used in the search.
*
* Callers are expected to have cleaned the search term with `cleanSearchTerm`.
*
* **Internal Note:** Our caller expects us to return a pattern **WITHOUT** anchors or capturing groups!
*/
export function makeSearchPattern(what :string) : [string, string] {
// tuples of: 0. orig string, 1. escaped string, 2. with replacements
let prevItem :string|undefined = undefined
const parts :[string, string, string][] = what.split(EQUIV_PAT).filter(s => s.length)
.flatMap(v=>{ // strip duplicate '*'
const rv = prevItem==='*' && v==='*' ? [] : [v]
prevItem = v
return rv
})
.map(v=>{
const esc = escapeStringRegexp(v)
return [ v, esc, EQUIV_REPL.get(v)??esc ]
})
// loop over by index instead of `for(const part of parts)` because I need to look ahead/behind in `parts`
for ( let i=0; i<parts.length; i++ ) {
const part = parts[i]
assert(part)
if (part[0] === '*') { // wildcard
const prevWord = !!( i>0 && parts[i-1]?.[0]?.at(-1)?.match(WORD_RE) )
const nextWord = !!( i+1<parts.length && parts[i+1]?.[0]?.[0]?.match(WORD_RE) )
//TODO: If the goal is to allow things like 'durchbruch*' to mean '\bdurchbruch.*', I'll need to inject that \b ...
console.debug(`prevWord=${prevWord} nextWord=${nextWord}`) //TODO: debug, remove
part[1] = '.*?'
part[2] = '.*?'
}
}
console.debug(parts) //TODO: Debug, remove
return [ parts.map(v=>v[1]).join(''), parts.map(v=>v[2]).join('') ]
// `parts` is tuples of: [ 0. escaped string, 1. with replacements ]
const parts :[string, string][] =
what.split(EQUIV_PAT).filter(s => s.length)
.flatMap( (v,i,arr) =>
v==='*' && (
i && arr[i-1]==='*' // strip duplicate '*'
// a search for "*blah*" is equivalent to a search for "blah" since there are no anchors, so:
|| i==0 || i==arr.length-1 // strip '*' at beginning or end
) ? [] : [v] )
.map( v => {
const replacement = v==='*' ? '.*?' : escapeStringRegexp(v)
return [ replacement, EQUIV_REPL.get(v)??replacement ]
} )
//console.debug(parts)
return [ parts.map(v=>v[0]).join(''), parts.map(v=>v[1]).join('') ]
}

0 comments on commit d7888d5

Please sign in to comment.