diff --git a/model/labels/regexp.go b/model/labels/regexp.go deleted file mode 100644 index d2151d83ddb..00000000000 --- a/model/labels/regexp.go +++ /dev/null @@ -1,1086 +0,0 @@ -// Copyright 2020 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package labels - -import ( - "slices" - "strings" - "unicode" - "unicode/utf8" - - "github.com/grafana/regexp" - "github.com/grafana/regexp/syntax" - "golang.org/x/text/unicode/norm" -) - -const ( - maxSetMatches = 256 - - // The minimum number of alternate values a regex should have to trigger - // the optimization done by optimizeEqualOrPrefixStringMatchers() and so use a map - // to match values instead of iterating over a list. This value has - // been computed running BenchmarkOptimizeEqualStringMatchers. - minEqualMultiStringMatcherMapThreshold = 16 -) - -type FastRegexMatcher struct { - // Under some conditions, re is nil because the expression is never parsed. - // We store the original string to be able to return it in GetRegexString(). - reString string - re *regexp.Regexp - - setMatches []string - stringMatcher StringMatcher - prefix string - suffix string - contains []string - - // matchString is the "compiled" function to run by MatchString(). - matchString func(string) bool -} - -func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) { - m := &FastRegexMatcher{ - reString: v, - } - - m.stringMatcher, m.setMatches = optimizeAlternatingLiterals(v) - if m.stringMatcher != nil { - // If we already have a string matcher, we don't need to parse the regex - // or compile the matchString function. This also avoids the behavior in - // compileMatchStringFunction where it prefers to use setMatches when - // available, even if the string matcher is faster. - m.matchString = m.stringMatcher.Matches - } else { - parsed, err := syntax.Parse(v, syntax.Perl) - if err != nil { - return nil, err - } - // Simplify the syntax tree to run faster. - parsed = parsed.Simplify() - m.re, err = regexp.Compile("^(?:" + parsed.String() + ")$") - if err != nil { - return nil, err - } - if parsed.Op == syntax.OpConcat { - m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed) - } - if matches, caseSensitive := findSetMatches(parsed); caseSensitive { - m.setMatches = matches - } - m.stringMatcher = stringMatcherFromRegexp(parsed) - m.matchString = m.compileMatchStringFunction() - } - - return m, nil -} - -// compileMatchStringFunction returns the function to run by MatchString(). -func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool { - // If the only optimization available is the string matcher, then we can just run it. - if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && len(m.contains) == 0 && m.stringMatcher != nil { - return m.stringMatcher.Matches - } - - return func(s string) bool { - if len(m.setMatches) != 0 { - for _, match := range m.setMatches { - if match == s { - return true - } - } - return false - } - if m.prefix != "" && !strings.HasPrefix(s, m.prefix) { - return false - } - if m.suffix != "" && !strings.HasSuffix(s, m.suffix) { - return false - } - if len(m.contains) > 0 && !containsInOrder(s, m.contains) { - return false - } - if m.stringMatcher != nil { - return m.stringMatcher.Matches(s) - } - return m.re.MatchString(s) - } -} - -// IsOptimized returns true if any fast-path optimization is applied to the -// regex matcher. -func (m *FastRegexMatcher) IsOptimized() bool { - return len(m.setMatches) > 0 || m.stringMatcher != nil || m.prefix != "" || m.suffix != "" || len(m.contains) > 0 -} - -// findSetMatches extract equality matches from a regexp. -// Returns nil if we can't replace the regexp by only equality matchers or the regexp contains -// a mix of case sensitive and case insensitive matchers. -func findSetMatches(re *syntax.Regexp) (matches []string, caseSensitive bool) { - clearBeginEndText(re) - - return findSetMatchesInternal(re, "") -} - -func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) { - switch re.Op { - case syntax.OpBeginText: - // Correctly handling the begin text operator inside a regex is tricky, - // so in this case we fallback to the regex engine. - return nil, false - case syntax.OpEndText: - // Correctly handling the end text operator inside a regex is tricky, - // so in this case we fallback to the regex engine. - return nil, false - case syntax.OpLiteral: - return []string{base + string(re.Rune)}, isCaseSensitive(re) - case syntax.OpEmptyMatch: - if base != "" { - return []string{base}, isCaseSensitive(re) - } - case syntax.OpAlternate: - return findSetMatchesFromAlternate(re, base) - case syntax.OpCapture: - clearCapture(re) - return findSetMatchesInternal(re, base) - case syntax.OpConcat: - return findSetMatchesFromConcat(re, base) - case syntax.OpCharClass: - if len(re.Rune)%2 != 0 { - return nil, false - } - var matches []string - var totalSet int - for i := 0; i+1 < len(re.Rune); i += 2 { - totalSet += int(re.Rune[i+1]-re.Rune[i]) + 1 - } - // limits the total characters that can be used to create matches. - // In some case like negation [^0-9] a lot of possibilities exists and that - // can create thousands of possible matches at which points we're better off using regexp. - if totalSet > maxSetMatches { - return nil, false - } - for i := 0; i+1 < len(re.Rune); i += 2 { - lo, hi := re.Rune[i], re.Rune[i+1] - for c := lo; c <= hi; c++ { - matches = append(matches, base+string(c)) - } - } - return matches, isCaseSensitive(re) - default: - return nil, false - } - return nil, false -} - -func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) { - if len(re.Sub) == 0 { - return nil, false - } - clearCapture(re.Sub...) - - matches = []string{base} - - for i := 0; i < len(re.Sub); i++ { - var newMatches []string - for j, b := range matches { - m, caseSensitive := findSetMatchesInternal(re.Sub[i], b) - if m == nil { - return nil, false - } - if tooManyMatches(newMatches, m...) { - return nil, false - } - - // All matches must have the same case sensitivity. If it's the first set of matches - // returned, we store its sensitivity as the expected case, and then we'll check all - // other ones. - if i == 0 && j == 0 { - matchesCaseSensitive = caseSensitive - } - if matchesCaseSensitive != caseSensitive { - return nil, false - } - - newMatches = append(newMatches, m...) - } - matches = newMatches - } - - return matches, matchesCaseSensitive -} - -func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) { - for i, sub := range re.Sub { - found, caseSensitive := findSetMatchesInternal(sub, base) - if found == nil { - return nil, false - } - if tooManyMatches(matches, found...) { - return nil, false - } - - // All matches must have the same case sensitivity. If it's the first set of matches - // returned, we store its sensitivity as the expected case, and then we'll check all - // other ones. - if i == 0 { - matchesCaseSensitive = caseSensitive - } - if matchesCaseSensitive != caseSensitive { - return nil, false - } - - matches = append(matches, found...) - } - - return matches, matchesCaseSensitive -} - -// clearCapture removes capture operation as they are not used for matching. -func clearCapture(regs ...*syntax.Regexp) { - for _, r := range regs { - // Iterate on the regexp because capture groups could be nested. - for r.Op == syntax.OpCapture { - *r = *r.Sub[0] - } - } -} - -// clearBeginEndText removes the begin and end text from the regexp. Prometheus regexp are anchored to the beginning and end of the string. -func clearBeginEndText(re *syntax.Regexp) { - // Do not clear begin/end text from an alternate operator because it could - // change the actual regexp properties. - if re.Op == syntax.OpAlternate { - return - } - - if len(re.Sub) == 0 { - return - } - if len(re.Sub) == 1 { - if re.Sub[0].Op == syntax.OpBeginText || re.Sub[0].Op == syntax.OpEndText { - // We need to remove this element. Since it's the only one, we convert into a matcher of an empty string. - // OpEmptyMatch is regexp's nop operator. - re.Op = syntax.OpEmptyMatch - re.Sub = nil - return - } - } - if re.Sub[0].Op == syntax.OpBeginText { - re.Sub = re.Sub[1:] - } - if re.Sub[len(re.Sub)-1].Op == syntax.OpEndText { - re.Sub = re.Sub[:len(re.Sub)-1] - } -} - -// isCaseInsensitive tells if a regexp is case insensitive. -// The flag should be check at each level of the syntax tree. -func isCaseInsensitive(reg *syntax.Regexp) bool { - return (reg.Flags & syntax.FoldCase) != 0 -} - -// isCaseSensitive tells if a regexp is case sensitive. -// The flag should be check at each level of the syntax tree. -func isCaseSensitive(reg *syntax.Regexp) bool { - return !isCaseInsensitive(reg) -} - -// tooManyMatches guards against creating too many set matches. -func tooManyMatches(matches []string, added ...string) bool { - return len(matches)+len(added) > maxSetMatches -} - -func (m *FastRegexMatcher) MatchString(s string) bool { - return m.matchString(s) -} - -func (m *FastRegexMatcher) SetMatches() []string { - // IMPORTANT: always return a copy, otherwise if the caller manipulate this slice it will - // also get manipulated in the cached FastRegexMatcher instance. - return slices.Clone(m.setMatches) -} - -func (m *FastRegexMatcher) GetRegexString() string { - return m.reString -} - -// optimizeAlternatingLiterals optimizes a regex of the form -// -// `literal1|literal2|literal3|...` -// -// this function returns an optimized StringMatcher or nil if the regex -// cannot be optimized in this way, and a list of setMatches up to maxSetMatches. -func optimizeAlternatingLiterals(s string) (StringMatcher, []string) { - if len(s) == 0 { - return emptyStringMatcher{}, nil - } - - estimatedAlternates := strings.Count(s, "|") + 1 - - // If there are no alternates, check if the string is a literal - if estimatedAlternates == 1 { - if regexp.QuoteMeta(s) == s { - return &equalStringMatcher{s: s, caseSensitive: true}, []string{s} - } - return nil, nil - } - - multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates, 0, 0) - - for end := strings.IndexByte(s, '|'); end > -1; end = strings.IndexByte(s, '|') { - // Split the string into the next literal and the remainder - subMatch := s[:end] - s = s[end+1:] - - // break if any of the submatches are not literals - if regexp.QuoteMeta(subMatch) != subMatch { - return nil, nil - } - - multiMatcher.add(subMatch) - } - - // break if the remainder is not a literal - if regexp.QuoteMeta(s) != s { - return nil, nil - } - multiMatcher.add(s) - - return multiMatcher, multiMatcher.setMatches() -} - -// optimizeConcatRegex returns literal prefix/suffix text that can be safely -// checked against the label value before running the regexp matcher. -func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix string, contains []string) { - sub := r.Sub - clearCapture(sub...) - - // We can safely remove begin and end text matchers respectively - // at the beginning and end of the regexp. - if len(sub) > 0 && sub[0].Op == syntax.OpBeginText { - sub = sub[1:] - } - if len(sub) > 0 && sub[len(sub)-1].Op == syntax.OpEndText { - sub = sub[:len(sub)-1] - } - - if len(sub) == 0 { - return - } - - // Given Prometheus regex matchers are always anchored to the begin/end - // of the text, if the first/last operations are literals, we can safely - // treat them as prefix/suffix. - if sub[0].Op == syntax.OpLiteral && (sub[0].Flags&syntax.FoldCase) == 0 { - prefix = string(sub[0].Rune) - } - if last := len(sub) - 1; sub[last].Op == syntax.OpLiteral && (sub[last].Flags&syntax.FoldCase) == 0 { - suffix = string(sub[last].Rune) - } - - // If contains any literal which is not a prefix/suffix, we keep track of - // all the ones which are case-sensitive. - for i := 1; i < len(sub)-1; i++ { - if sub[i].Op == syntax.OpLiteral && (sub[i].Flags&syntax.FoldCase) == 0 { - contains = append(contains, string(sub[i].Rune)) - } - } - - return -} - -// StringMatcher is a matcher that matches a string in place of a regular expression. -type StringMatcher interface { - Matches(s string) bool -} - -// stringMatcherFromRegexp attempts to replace a common regexp with a string matcher. -// It returns nil if the regexp is not supported. -func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher { - clearBeginEndText(re) - - m := stringMatcherFromRegexpInternal(re) - m = optimizeEqualOrPrefixStringMatchers(m, minEqualMultiStringMatcherMapThreshold) - - return m -} - -func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher { - clearCapture(re) - - switch re.Op { - case syntax.OpBeginText: - // Correctly handling the begin text operator inside a regex is tricky, - // so in this case we fallback to the regex engine. - return nil - case syntax.OpEndText: - // Correctly handling the end text operator inside a regex is tricky, - // so in this case we fallback to the regex engine. - return nil - case syntax.OpPlus: - if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { - return nil - } - return &anyNonEmptyStringMatcher{ - matchNL: re.Sub[0].Op == syntax.OpAnyChar, - } - case syntax.OpStar: - if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { - return nil - } - - // If the newline is valid, than this matcher literally match any string (even empty). - if re.Sub[0].Op == syntax.OpAnyChar { - return trueMatcher{} - } - - // Any string is fine (including an empty one), as far as it doesn't contain any newline. - return anyStringWithoutNewlineMatcher{} - case syntax.OpQuest: - // Only optimize for ".?". - if len(re.Sub) != 1 || (re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL) { - return nil - } - - return &zeroOrOneCharacterStringMatcher{ - matchNL: re.Sub[0].Op == syntax.OpAnyChar, - } - case syntax.OpEmptyMatch: - return emptyStringMatcher{} - - case syntax.OpLiteral: - return &equalStringMatcher{ - s: string(re.Rune), - caseSensitive: !isCaseInsensitive(re), - } - case syntax.OpAlternate: - or := make([]StringMatcher, 0, len(re.Sub)) - for _, sub := range re.Sub { - m := stringMatcherFromRegexpInternal(sub) - if m == nil { - return nil - } - or = append(or, m) - } - return orStringMatcher(or) - case syntax.OpConcat: - clearCapture(re.Sub...) - - if len(re.Sub) == 0 { - return emptyStringMatcher{} - } - if len(re.Sub) == 1 { - return stringMatcherFromRegexpInternal(re.Sub[0]) - } - - var left, right StringMatcher - - // Let's try to find if there's a first and last any matchers. - if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar || re.Sub[0].Op == syntax.OpQuest { - left = stringMatcherFromRegexpInternal(re.Sub[0]) - if left == nil { - return nil - } - re.Sub = re.Sub[1:] - } - if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar || re.Sub[len(re.Sub)-1].Op == syntax.OpQuest { - right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1]) - if right == nil { - return nil - } - re.Sub = re.Sub[:len(re.Sub)-1] - } - - matches, matchesCaseSensitive := findSetMatchesInternal(re, "") - - if len(matches) == 0 && len(re.Sub) == 2 { - // We have not find fixed set matches. We look for other known cases that - // we can optimize. - switch { - // Prefix is literal. - case right == nil && re.Sub[0].Op == syntax.OpLiteral: - right = stringMatcherFromRegexpInternal(re.Sub[1]) - if right != nil { - matches = []string{string(re.Sub[0].Rune)} - matchesCaseSensitive = !isCaseInsensitive(re.Sub[0]) - } - - // Suffix is literal. - case left == nil && re.Sub[1].Op == syntax.OpLiteral: - left = stringMatcherFromRegexpInternal(re.Sub[0]) - if left != nil { - matches = []string{string(re.Sub[1].Rune)} - matchesCaseSensitive = !isCaseInsensitive(re.Sub[1]) - } - } - } - - // Ensure we've found some literals to match (optionally with a left and/or right matcher). - // If not, then this optimization doesn't trigger. - if len(matches) == 0 { - return nil - } - - // Use the right (and best) matcher based on what we've found. - switch { - // No left and right matchers (only fixed set matches). - case left == nil && right == nil: - // if there's no any matchers on both side it's a concat of literals - or := make([]StringMatcher, 0, len(matches)) - for _, match := range matches { - or = append(or, &equalStringMatcher{ - s: match, - caseSensitive: matchesCaseSensitive, - }) - } - return orStringMatcher(or) - - // Right matcher with 1 fixed set match. - case left == nil && len(matches) == 1: - return newLiteralPrefixStringMatcher(matches[0], matchesCaseSensitive, right) - - // Left matcher with 1 fixed set match. - case right == nil && len(matches) == 1: - return &literalSuffixStringMatcher{ - left: left, - suffix: matches[0], - suffixCaseSensitive: matchesCaseSensitive, - } - - // We found literals in the middle. We can trigger the fast path only if - // the matches are case sensitive because containsStringMatcher doesn't - // support case insensitive. - case matchesCaseSensitive: - return &containsStringMatcher{ - substrings: matches, - left: left, - right: right, - } - } - } - return nil -} - -// containsStringMatcher matches a string if it contains any of the substrings. -// If left and right are not nil, it's a contains operation where left and right must match. -// If left is nil, it's a hasPrefix operation and right must match. -// Finally, if right is nil it's a hasSuffix operation and left must match. -type containsStringMatcher struct { - // The matcher that must match the left side. Can be nil. - left StringMatcher - - // At least one of these strings must match in the "middle", between left and right matchers. - substrings []string - - // The matcher that must match the right side. Can be nil. - right StringMatcher -} - -func (m *containsStringMatcher) Matches(s string) bool { - for _, substr := range m.substrings { - switch { - case m.right != nil && m.left != nil: - searchStartPos := 0 - - for { - pos := strings.Index(s[searchStartPos:], substr) - if pos < 0 { - break - } - - // Since we started searching from searchStartPos, we have to add that offset - // to get the actual position of the substring inside the text. - pos += searchStartPos - - // If both the left and right matchers match, then we can stop searching because - // we've found a match. - if m.left.Matches(s[:pos]) && m.right.Matches(s[pos+len(substr):]) { - return true - } - - // Continue searching for another occurrence of the substring inside the text. - searchStartPos = pos + 1 - } - case m.left != nil: - // If we have to check for characters on the left then we need to match a suffix. - if strings.HasSuffix(s, substr) && m.left.Matches(s[:len(s)-len(substr)]) { - return true - } - case m.right != nil: - if strings.HasPrefix(s, substr) && m.right.Matches(s[len(substr):]) { - return true - } - } - } - return false -} - -func newLiteralPrefixStringMatcher(prefix string, prefixCaseSensitive bool, right StringMatcher) StringMatcher { - if prefixCaseSensitive { - return &literalPrefixSensitiveStringMatcher{ - prefix: prefix, - right: right, - } - } - - return &literalPrefixInsensitiveStringMatcher{ - prefix: prefix, - right: right, - } -} - -// literalPrefixSensitiveStringMatcher matches a string with the given literal case-sensitive prefix and right side matcher. -type literalPrefixSensitiveStringMatcher struct { - prefix string - - // The matcher that must match the right side. Can be nil. - right StringMatcher -} - -func (m *literalPrefixSensitiveStringMatcher) Matches(s string) bool { - if !strings.HasPrefix(s, m.prefix) { - return false - } - - // Ensure the right side matches. - return m.right.Matches(s[len(m.prefix):]) -} - -// literalPrefixInsensitiveStringMatcher matches a string with the given literal case-insensitive prefix and right side matcher. -type literalPrefixInsensitiveStringMatcher struct { - prefix string - - // The matcher that must match the right side. Can be nil. - right StringMatcher -} - -func (m *literalPrefixInsensitiveStringMatcher) Matches(s string) bool { - if !hasPrefixCaseInsensitive(s, m.prefix) { - return false - } - - // Ensure the right side matches. - return m.right.Matches(s[len(m.prefix):]) -} - -// literalSuffixStringMatcher matches a string with the given literal suffix and left side matcher. -type literalSuffixStringMatcher struct { - // The matcher that must match the left side. Can be nil. - left StringMatcher - - suffix string - suffixCaseSensitive bool -} - -func (m *literalSuffixStringMatcher) Matches(s string) bool { - // Ensure the suffix matches. - if m.suffixCaseSensitive && !strings.HasSuffix(s, m.suffix) { - return false - } - if !m.suffixCaseSensitive && !hasSuffixCaseInsensitive(s, m.suffix) { - return false - } - - // Ensure the left side matches. - return m.left.Matches(s[:len(s)-len(m.suffix)]) -} - -// emptyStringMatcher matches an empty string. -type emptyStringMatcher struct{} - -func (m emptyStringMatcher) Matches(s string) bool { - return len(s) == 0 -} - -// orStringMatcher matches any of the sub-matchers. -type orStringMatcher []StringMatcher - -func (m orStringMatcher) Matches(s string) bool { - for _, matcher := range m { - if matcher.Matches(s) { - return true - } - } - return false -} - -// equalStringMatcher matches a string exactly and support case insensitive. -type equalStringMatcher struct { - s string - caseSensitive bool -} - -func (m *equalStringMatcher) Matches(s string) bool { - if m.caseSensitive { - return m.s == s - } - return strings.EqualFold(m.s, s) -} - -type multiStringMatcherBuilder interface { - StringMatcher - add(s string) - addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher) - setMatches() []string -} - -func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize, estimatedPrefixes, minPrefixLength int) multiStringMatcherBuilder { - // If the estimated size is low enough, it's faster to use a slice instead of a map. - if estimatedSize < minEqualMultiStringMatcherMapThreshold && estimatedPrefixes == 0 { - return &equalMultiStringSliceMatcher{caseSensitive: caseSensitive, values: make([]string, 0, estimatedSize)} - } - - return &equalMultiStringMapMatcher{ - values: make(map[string]struct{}, estimatedSize), - prefixes: make(map[string][]StringMatcher, estimatedPrefixes), - minPrefixLen: minPrefixLength, - caseSensitive: caseSensitive, - } -} - -// equalMultiStringSliceMatcher matches a string exactly against a slice of valid values. -type equalMultiStringSliceMatcher struct { - values []string - - caseSensitive bool -} - -func (m *equalMultiStringSliceMatcher) add(s string) { - m.values = append(m.values, s) -} - -func (m *equalMultiStringSliceMatcher) addPrefix(_ string, _ bool, _ StringMatcher) { - panic("not implemented") -} - -func (m *equalMultiStringSliceMatcher) setMatches() []string { - return m.values -} - -func (m *equalMultiStringSliceMatcher) Matches(s string) bool { - if m.caseSensitive { - for _, v := range m.values { - if s == v { - return true - } - } - } else { - for _, v := range m.values { - if strings.EqualFold(s, v) { - return true - } - } - } - return false -} - -// equalMultiStringMapMatcher matches a string exactly against a map of valid values -// or against a set of prefix matchers. -type equalMultiStringMapMatcher struct { - // values contains values to match a string against. If the matching is case insensitive, - // the values here must be lowercase. - values map[string]struct{} - // prefixes maps strings, all of length minPrefixLen, to sets of matchers to check the rest of the string. - // If the matching is case insensitive, prefixes are all lowercase. - prefixes map[string][]StringMatcher - // minPrefixLen can be zero, meaning there are no prefix matchers. - minPrefixLen int - caseSensitive bool -} - -func (m *equalMultiStringMapMatcher) add(s string) { - if !m.caseSensitive { - s = toNormalisedLower(s) - } - - m.values[s] = struct{}{} -} - -func (m *equalMultiStringMapMatcher) addPrefix(prefix string, prefixCaseSensitive bool, matcher StringMatcher) { - if m.minPrefixLen == 0 { - panic("addPrefix called when no prefix length defined") - } - if len(prefix) < m.minPrefixLen { - panic("addPrefix called with a too short prefix") - } - if m.caseSensitive != prefixCaseSensitive { - panic("addPrefix called with a prefix whose case sensitivity is different than the expected one") - } - - s := prefix[:m.minPrefixLen] - if !m.caseSensitive { - s = strings.ToLower(s) - } - - m.prefixes[s] = append(m.prefixes[s], matcher) -} - -func (m *equalMultiStringMapMatcher) setMatches() []string { - if len(m.values) >= maxSetMatches || len(m.prefixes) > 0 { - return nil - } - - matches := make([]string, 0, len(m.values)) - for s := range m.values { - matches = append(matches, s) - } - return matches -} - -func (m *equalMultiStringMapMatcher) Matches(s string) bool { - if !m.caseSensitive { - s = toNormalisedLower(s) - } - - if _, ok := m.values[s]; ok { - return true - } - if m.minPrefixLen > 0 && len(s) >= m.minPrefixLen { - for _, matcher := range m.prefixes[s[:m.minPrefixLen]] { - if matcher.Matches(s) { - return true - } - } - } - return false -} - -// toNormalisedLower normalise the input string using "Unicode Normalization Form D" and then convert -// it to lower case. -func toNormalisedLower(s string) string { - var buf []byte - for i := 0; i < len(s); i++ { - c := s[i] - if c >= utf8.RuneSelf { - return strings.Map(unicode.ToLower, norm.NFKD.String(s)) - } - if 'A' <= c && c <= 'Z' { - if buf == nil { - buf = []byte(s) - } - buf[i] = c + 'a' - 'A' - } - } - if buf == nil { - return s - } - return yoloString(buf) -} - -// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string -// (including an empty one) as far as it doesn't contain any newline character. -type anyStringWithoutNewlineMatcher struct{} - -func (m anyStringWithoutNewlineMatcher) Matches(s string) bool { - // We need to make sure it doesn't contain a newline. Since the newline is - // an ASCII character, we can use strings.IndexByte(). - return strings.IndexByte(s, '\n') == -1 -} - -// anyNonEmptyStringMatcher is a stringMatcher which matches any non-empty string. -type anyNonEmptyStringMatcher struct { - matchNL bool -} - -func (m *anyNonEmptyStringMatcher) Matches(s string) bool { - if m.matchNL { - // It's OK if the string contains a newline so we just need to make - // sure it's non-empty. - return len(s) > 0 - } - - // We need to make sure it non-empty and doesn't contain a newline. - // Since the newline is an ASCII character, we can use strings.IndexByte(). - return len(s) > 0 && strings.IndexByte(s, '\n') == -1 -} - -// zeroOrOneCharacterStringMatcher is a StringMatcher which matches zero or one occurrence -// of any character. The newline character is matches only if matchNL is set to true. -type zeroOrOneCharacterStringMatcher struct { - matchNL bool -} - -func (m *zeroOrOneCharacterStringMatcher) Matches(s string) bool { - // If there's more than one rune in the string, then it can't match. - if r, size := utf8.DecodeRuneInString(s); r == utf8.RuneError { - // Size is 0 for empty strings, 1 for invalid rune. - // Empty string matches, invalid rune matches if there isn't anything else. - return size == len(s) - } else if size < len(s) { - return false - } - - // No need to check for the newline if the string is empty or matching a newline is OK. - if m.matchNL || len(s) == 0 { - return true - } - - return s[0] != '\n' -} - -// trueMatcher is a stringMatcher which matches any string (always returns true). -type trueMatcher struct{} - -func (m trueMatcher) Matches(_ string) bool { - return true -} - -// optimizeEqualOrPrefixStringMatchers optimize a specific case where all matchers are made by an -// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher) or -// with a literal prefix (literalPrefixSensitiveStringMatcher or literalPrefixInsensitiveStringMatcher). -// -// In this specific case, when we have many strings to match against we can use a map instead -// of iterating over the list of strings. -func optimizeEqualOrPrefixStringMatchers(input StringMatcher, threshold int) StringMatcher { - var ( - caseSensitive bool - caseSensitiveSet bool - numValues int - numPrefixes int - minPrefixLength int - ) - - // Analyse the input StringMatcher to count the number of occurrences - // and ensure all of them have the same case sensitivity. - analyseEqualMatcherCallback := func(matcher *equalStringMatcher) bool { - // Ensure we don't have mixed case sensitivity. - if caseSensitiveSet && caseSensitive != matcher.caseSensitive { - return false - } else if !caseSensitiveSet { - caseSensitive = matcher.caseSensitive - caseSensitiveSet = true - } - - numValues++ - return true - } - - analysePrefixMatcherCallback := func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool { - // Ensure we don't have mixed case sensitivity. - if caseSensitiveSet && caseSensitive != prefixCaseSensitive { - return false - } else if !caseSensitiveSet { - caseSensitive = prefixCaseSensitive - caseSensitiveSet = true - } - if numPrefixes == 0 || len(prefix) < minPrefixLength { - minPrefixLength = len(prefix) - } - - numPrefixes++ - return true - } - - if !findEqualOrPrefixStringMatchers(input, analyseEqualMatcherCallback, analysePrefixMatcherCallback) { - return input - } - - // If the number of values and prefixes found is less than the threshold, then we should skip the optimization. - if (numValues + numPrefixes) < threshold { - return input - } - - // Parse again the input StringMatcher to extract all values and storing them. - // We can skip the case sensitivity check because we've already checked it and - // if the code reach this point then it means all matchers have the same case sensitivity. - multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues, numPrefixes, minPrefixLength) - - // Ignore the return value because we already iterated over the input StringMatcher - // and it was all good. - findEqualOrPrefixStringMatchers(input, func(matcher *equalStringMatcher) bool { - multiMatcher.add(matcher.s) - return true - }, func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool { - multiMatcher.addPrefix(prefix, caseSensitive, matcher) - return true - }) - - return multiMatcher -} - -// findEqualOrPrefixStringMatchers analyze the input StringMatcher and calls the equalMatcherCallback for each -// equalStringMatcher found, and prefixMatcherCallback for each literalPrefixSensitiveStringMatcher and literalPrefixInsensitiveStringMatcher found. -// -// Returns true if and only if the input StringMatcher is *only* composed by an alternation of equalStringMatcher and/or -// literal prefix matcher. Returns false if prefixMatcherCallback is nil and a literal prefix matcher is encountered. -func findEqualOrPrefixStringMatchers(input StringMatcher, equalMatcherCallback func(matcher *equalStringMatcher) bool, prefixMatcherCallback func(prefix string, prefixCaseSensitive bool, matcher StringMatcher) bool) bool { - orInput, ok := input.(orStringMatcher) - if !ok { - return false - } - - for _, m := range orInput { - switch casted := m.(type) { - case orStringMatcher: - if !findEqualOrPrefixStringMatchers(m, equalMatcherCallback, prefixMatcherCallback) { - return false - } - - case *equalStringMatcher: - if !equalMatcherCallback(casted) { - return false - } - - case *literalPrefixSensitiveStringMatcher: - if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, true, casted) { - return false - } - - case *literalPrefixInsensitiveStringMatcher: - if prefixMatcherCallback == nil || !prefixMatcherCallback(casted.prefix, false, casted) { - return false - } - - default: - // It's not an equal or prefix string matcher, so we have to stop searching - // cause this optimization can't be applied. - return false - } - } - - return true -} - -func hasPrefixCaseInsensitive(s, prefix string) bool { - return len(s) >= len(prefix) && strings.EqualFold(s[0:len(prefix)], prefix) -} - -func hasSuffixCaseInsensitive(s, suffix string) bool { - return len(s) >= len(suffix) && strings.EqualFold(s[len(s)-len(suffix):], suffix) -} - -func containsInOrder(s string, contains []string) bool { - // Optimization for the case we only have to look for 1 substring. - if len(contains) == 1 { - return strings.Contains(s, contains[0]) - } - - return containsInOrderMulti(s, contains) -} - -func containsInOrderMulti(s string, contains []string) bool { - offset := 0 - - for _, substr := range contains { - at := strings.Index(s[offset:], substr) - if at == -1 { - return false - } - - offset += at + len(substr) - } - - return true -}