diff --git a/internal/setup/setup.go b/internal/setup/setup.go index 6ed24abe..23d0746f 100644 --- a/internal/setup/setup.go +++ b/internal/setup/setup.go @@ -3,6 +3,7 @@ package setup import ( "errors" "fmt" + "maps" "os" "path/filepath" "slices" @@ -273,14 +274,46 @@ func (r *Release) validate() error { } // Check for glob and generate conflicts. + // + // This is the most time-consuming part of every command because checking + // each pair of paths uses an n^2 algorithm; specifically due to glob + // handling. We can speed up the process by exploiting the fact that + // conflicts are not allowed which means that in valid releases globs will + // appear at only the specific parts of the paths. There will always be a + // non-trivial prefix of the path that doesn't contain globs. + // + // We will speed up the search by only looking for conflicts in paths that + // share this same prefix. By collecting all paths and doing binary search + // we can find the start of the prefix. This is much simpler than other + // data structures like tries or radix trees which have similar algorithmic + // complexities. + allPaths := slices.Collect(maps.Keys(paths)) + slices.Sort(allPaths) for oldPath, oldSlices := range paths { for _, old := range oldSlices { oldInfo := old.Contents[oldPath] if oldInfo.Kind != GeneratePath && oldInfo.Kind != GlobPath { break } - for newPath, newSlices := range paths { - if oldPath == newPath { + + prefixLen := strings.IndexAny(oldPath, "*?") + if prefixLen == -1 { + return fmt.Errorf("internal error: invalid path: generate or glob path does not contain ' ?' or '*': %q", oldPath) + } + searchKey := oldPath[:prefixLen] + // startIndex is the position of the prefix or the position where + // the prefix would have been found. + startIndex, _ := slices.BinarySearch(allPaths, searchKey) + for i := startIndex; i < len(allPaths); i++ { + newPath := allPaths[i] + if !strings.HasPrefix(newPath, searchKey) { + // Iterate until the prefix no longer matches, which means + // all other paths will fail the comparison below. + break + } + newSlices := paths[newPath] + // We know prefixes match, we can skip that part of the string. + if oldPath[len(searchKey)-1:] == newPath[len(searchKey)-1:] { // Identical paths have been filtered earlier. continue } @@ -291,13 +324,16 @@ func (r *Release) validate() error { continue } } - if strdist.GlobPath(newPath, oldPath) { + // We know prefixes match, we can skip that part of the string. + if strdist.GlobPath(newPath[len(searchKey)-1:], oldPath[len(searchKey)-1:]) { if (old.Package > new.Package) || (old.Package == new.Package && old.Name > new.Name) || (old.Package == new.Package && old.Name == new.Name && oldPath > newPath) { old, new = new, old oldPath, newPath = newPath, oldPath } return fmt.Errorf("slices %s and %s conflict on %s and %s", old, new, oldPath, newPath) + } else { + break } } } diff --git a/internal/setup/setup_test.go b/internal/setup/setup_test.go index d669943f..4456ce0b 100644 --- a/internal/setup/setup_test.go +++ b/internal/setup/setup_test.go @@ -518,6 +518,86 @@ var setupTests = []setupTest{{ `, }, relerror: "slices mypkg1_myslice1 and mypkg2_myslice1 conflict on /path1", +}, { + summary: "When multiple prefixes match all are tested", + input: map[string]string{ + "slices/mydir/mypkg1.yaml": ` + package: mypkg1 + slices: + myslice1: + contents: + /path/*/foo: + `, + "slices/mydir/mypkg2.yaml": ` + package: mypkg2 + slices: + myslice1: + contents: + /path/bar/foo1: + /path/bar/f*: + `, + }, + relerror: "slices mypkg1_myslice1 and mypkg2_myslice1 conflict on /path/\\*/foo and /path/bar/f\\*", +}, { + summary: "Later matching paths are tested after several non-matches", + input: map[string]string{ + "slices/mydir/mypkg1.yaml": ` + package: mypkg1 + slices: + myslice1: + contents: + /path/*/foo: + `, + "slices/mydir/mypkg2.yaml": ` + package: mypkg2 + slices: + myslice1: + contents: + /path/bar/f: + /path/bar/fa: + /path/bar/fb: + /path/bar/f*: + `, + }, + relerror: "slices mypkg1_myslice1 and mypkg2_myslice1 conflict on /path/\\*/foo and /path/bar/f\\*", +}, { + summary: "Conflicting globs with wildcard in the middle of the segment", + input: map[string]string{ + "slices/mydir/mypkg1.yaml": ` + package: mypkg1 + slices: + myslice: + contents: + /path/ab*cd/file: + `, + "slices/mydir/mypkg2.yaml": ` + package: mypkg2 + slices: + myslice: + contents: + /path/ab12cd/file: + `, + }, + relerror: `slices mypkg1_myslice and mypkg2_myslice conflict on /path/ab\*cd/file and /path/ab12cd/file`, +}, { + summary: "Conflicting globs when one side uses double-star", + input: map[string]string{ + "slices/mydir/mypkg1.yaml": ` + package: mypkg1 + slices: + myslice: + contents: + /path/**/file: + `, + "slices/mydir/mypkg2.yaml": ` + package: mypkg2 + slices: + myslice: + contents: + /path/*/file: + `, + }, + relerror: `slices mypkg1_myslice and mypkg2_myslice conflict on /path/\*\*/file and /path/\*/file`, }, { summary: "Directories must be suffixed with /", input: map[string]string{ @@ -1943,6 +2023,25 @@ var setupTests = []setupTest{{ `, }, relerror: `slices mypkg_myslice1 and mypkg_myslice2 conflict on /path/subdir/\*\* and /path/\*\*`, +}, { + summary: "Generate paths conflict with glob paths sharing the prefix", + input: map[string]string{ + "slices/mydir/mypkg1.yaml": ` + package: mypkg1 + slices: + myslice: + contents: + /path/subdir/**: {generate: manifest} + `, + "slices/mydir/mypkg2.yaml": ` + package: mypkg2 + slices: + myslice: + contents: + /path/subdir/f*: + `, + }, + relerror: `slices mypkg1_myslice and mypkg2_myslice conflict on /path/subdir/\*\* and /path/subdir/f\*`, }, { summary: `No other options in "generate" paths`, input: map[string]string{ diff --git a/internal/strdist/strdist.go b/internal/strdist/strdist.go index d5b640ef..baff1c38 100644 --- a/internal/strdist/strdist.go +++ b/internal/strdist/strdist.go @@ -34,14 +34,14 @@ func Distance(a, b string, f CostFunc, cut int64) int64 { } lst := make([]CostInt, len(b)+1) bl := 0 - for bi, br := range b { - bl++ + for _, br := range b { cost := f(-1, br) - if cost.InsertB == Inhibit || lst[bi] == Inhibit { - lst[bi+1] = Inhibit + if cost.InsertB == Inhibit || lst[bl] == Inhibit { + lst[bl+1] = Inhibit } else { - lst[bi+1] = lst[bi] + cost.InsertB + lst[bl+1] = lst[bl] + cost.InsertB } + bl++ } lst = lst[:bl+1] // Not required, but caching means preventing the fast path @@ -87,8 +87,7 @@ func Distance(a, b string, f CostFunc, cut int64) int64 { if debug { debugf("... %v", lst) } - _ = stop - if cut != 0 && stop { + if cut != 0 && len(b) > 0 && stop { break } } @@ -105,66 +104,96 @@ func Distance(a, b string, f CostFunc, cut int64) int64 { // * - Any zero or more characters, except for / // ** - Any zero or more characters, including / func GlobPath(a, b string) bool { - if !wildcardPrefixMatch(a, b) { - // Fast path. - return false + // Computing the actual distance is slow as its complexity is + // O(len(a) * len(b)). If the paths contain globs there is no way around + // it, but we can be clever about creating segments from the paths and only + // calling the distance function when necessary. If there is no glob the + // complexity becomes O(len(a) + len(b)). + // + // The algorithm works by separating the path into segments delimited by + // '/'. We compare the segments in order for a and b, we have three cases: + // 1) No segment uses globs, comparison is memcmp of both strings. + // 2) One of the strings uses a single "*" or "?". We call the distance + // function only with the segment which reduces the algorithmic + // complexity by reducing the length. + // 3) One of the strings uses "**". We need to call distance on the + // the rest of both strings and we can no longer rely on segments. + // + // Crucially, this optimization works because: + // * There are few paths in the releases that use "**", because it results + // in conflict easily. In fact, it is usually used to extract a whole + // directory, meaning the prefix segements should be unique and we can + // avoid computing "**" completely. + distance := func(a, b string) bool { + a = strings.ReplaceAll(a, "**", "⁑") + b = strings.ReplaceAll(b, "**", "⁑") + return Distance(a, b, globCost, 1) == 0 } - if !wildcardSuffixMatch(a, b) { - // Fast path. - return false + + // Returns the index where the segment ends (next '/' or the end of the + // string) and whether the string has a "*" or "?". This function will only + // traverse the string once. + segmentEnd := func(s string) (end int, hasGlob bool) { + end = strings.IndexAny(s, "*?/") + if end == -1 { + end = len(s) - 1 + } else if s[end] == '*' || s[end] == '?' { + slash := strings.IndexRune(s[end:], '/') + if slash != -1 { + end = end + slash + } else { + end = len(s) - 1 + } + hasGlob = true + } + return end, hasGlob + } + + for len(a) > 0 && len(b) > 0 { + endA, globA := segmentEnd(a) + endB, globB := segmentEnd(b) + + segmentA := a[:endA+1] + segmentB := b[:endB+1] + if strings.Contains(segmentA, "**") || strings.Contains(segmentB, "**") { + // We need to match the rest of the string with the slow path, no + // other way around it. + return distance(a, b) + } else if globA || globB { + if !distance(segmentA, segmentB) { + return false + } + } else { + if segmentA != segmentB { + return false + } + } + + a = a[endA+1:] + b = b[endB+1:] } - a = strings.ReplaceAll(a, "**", "⁑") - b = strings.ReplaceAll(b, "**", "⁑") - return Distance(a, b, globCost, 1) == 0 + // One string is empty, this call is linear. + return distance(a, b) } func globCost(ar, br rune) Cost { if ar == '⁑' || br == '⁑' { return Cost{SwapAB: 0, DeleteA: 0, InsertB: 0} } - if ar == '/' || br == '/' { - return Cost{SwapAB: Inhibit, DeleteA: Inhibit, InsertB: Inhibit} - } if ar == '*' || br == '*' { + if ar == '*' && br == '/' { + return Cost{SwapAB: Inhibit, DeleteA: 0, InsertB: Inhibit} + } else if ar == '/' && br == '*' { + return Cost{SwapAB: Inhibit, DeleteA: Inhibit, InsertB: 0} + } return Cost{SwapAB: 0, DeleteA: 0, InsertB: 0} } + if ar == '/' || br == '/' { + return Cost{SwapAB: Inhibit, DeleteA: Inhibit, InsertB: Inhibit} + } if ar == '?' || br == '?' { return Cost{SwapAB: 0, DeleteA: 1, InsertB: 1} } return Cost{SwapAB: 1, DeleteA: 1, InsertB: 1} } - -// wildcardPrefixMatch compares whether the prefixes of a and b are equal up -// to the shortest one. The prefix is defined as the longest substring that -// starts at index 0 and does not contain a wildcard. -func wildcardPrefixMatch(a, b string) bool { - ai := strings.IndexAny(a, "*?") - bi := strings.IndexAny(b, "*?") - if ai == -1 { - ai = len(a) - } - if bi == -1 { - bi = len(b) - } - mini := min(ai, bi) - return a[:mini] == b[:mini] -} - -// wildcardSuffixMatch compares whether the suffixes of a and b are equal up -// to the shortest one. The suffix is defined as the longest substring that ends -// at the string length and does not contain a wildcard. -func wildcardSuffixMatch(a, b string) bool { - ai := strings.LastIndexAny(a, "*?") - la := 0 - if ai != -1 { - la = len(a) - ai - 1 - } - lb := 0 - bi := strings.LastIndexAny(b, "*?") - if bi != -1 { - lb = len(b) - bi - 1 - } - minl := min(la, lb) - return a[len(a)-minl:] == b[len(b)-minl:] -} diff --git a/internal/strdist/strdist_test.go b/internal/strdist/strdist_test.go index 7f8975a0..d337edf8 100644 --- a/internal/strdist/strdist_test.go +++ b/internal/strdist/strdist_test.go @@ -58,6 +58,12 @@ var distanceTests = []distanceTest{ {f: strdist.GlobCost, r: 1, a: "a**f/hij", b: "abc/def/hik"}, {f: strdist.GlobCost, r: 2, a: "a**fg", b: "abc/def/hik"}, {f: strdist.GlobCost, r: 0, a: "a**f/hij/klm", b: "abc/d**m"}, + {f: strdist.GlobCost, r: 1, a: "**a", b: ""}, + {f: strdist.GlobCost, r: 0, a: "/*foo/", b: "/foo/"}, + {f: strdist.GlobCost, r: 1, a: "*f", b: ""}, + {f: strdist.GlobCost, r: 1, a: "", b: "*f"}, + {f: strdist.GlobCost, r: 0, a: "?**", b: "f/foo"}, + {f: strdist.GlobCost, r: 0, a: "/aa/a", b: "/a?**"}, } func (s *S) TestDistance(c *C) {