diff --git a/Sources/RegexBenchmark/Benchmark.swift b/Sources/RegexBenchmark/Benchmark.swift index 2622639fb..f807a8e55 100644 --- a/Sources/RegexBenchmark/Benchmark.swift +++ b/Sources/RegexBenchmark/Benchmark.swift @@ -71,6 +71,13 @@ struct NSBenchmark: RegexBenchmark { enum NSMatchType { case allMatches case first + + init(_ type: Benchmark.MatchType) { + switch type { + case .whole, .first: self = .first + case .allMatches: self = .allMatches + } + } } func run() { @@ -126,7 +133,7 @@ struct CrossBenchmark { /// The base name of the benchmark var baseName: String - /// The string to compile in differnet engines + /// The string to compile in different engines var regex: String /// The text to search @@ -143,57 +150,32 @@ struct CrossBenchmark { /// Whether or not to do firstMatch as well or just allMatches var includeFirst: Bool = false + /// Whether to also run scalar-semantic mode + var alsoRunScalarSemantic: Bool = true + func register(_ runner: inout BenchmarkRunner) { - let swiftRegex = try! Regex(regex) - let nsRegex: NSRegularExpression if isWhole { - nsRegex = try! NSRegularExpression(pattern: "^" + regex + "$") + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .whole, + alsoRunScalarSemantic: alsoRunScalarSemantic) } else { - nsRegex = try! NSRegularExpression(pattern: regex) - } + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .allMatches, + alsoRunScalarSemantic: alsoRunScalarSemantic) - if isWhole { - runner.register( - Benchmark( - name: baseName + "Whole", - regex: swiftRegex, - pattern: regex, - type: .whole, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "Whole" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .first, - target: input)) - } else { - runner.register( - Benchmark( - name: baseName + "All", - regex: swiftRegex, - pattern: regex, - type: .allMatches, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "All" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .allMatches, - target: input)) if includeFirst || runner.includeFirstOverride { - runner.register( - Benchmark( - name: baseName + "First", - regex: swiftRegex, - pattern: regex, - type: .first, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "First" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .first, - target: input)) + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .first, + alsoRunScalarSemantic: alsoRunScalarSemantic) } } } @@ -209,20 +191,16 @@ struct CrossInputListBenchmark { /// The list of strings to search var inputs: [String] + + /// Also run in scalar-semantic mode + var alsoRunScalarSemantic: Bool = true func register(_ runner: inout BenchmarkRunner) { - let swiftRegex = try! Regex(regex) - runner.register(InputListBenchmark( + runner.registerCrossBenchmark( name: baseName, - regex: swiftRegex, + inputList: inputs, pattern: regex, - targets: inputs - )) - runner.register(InputListNSBenchmark( - name: baseName + CrossBenchmark.nsSuffix, - regex: regex, - targets: inputs - )) + alsoRunScalarSemantic: alsoRunScalarSemantic) } } diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift index 641c03224..b067b9679 100644 --- a/Sources/RegexBenchmark/BenchmarkRunner.swift +++ b/Sources/RegexBenchmark/BenchmarkRunner.swift @@ -4,6 +4,16 @@ import Foundation /// The number of times to re-run the benchmark if results are too varying private var rerunCount: Int { 3 } +extension Benchmark.MatchType { + fileprivate var nameSuffix: String { + switch self { + case .whole: return "_Whole" + case .first: return "_First" + case .allMatches: return "_All" + } + } +} + struct BenchmarkRunner { let suiteName: String var suite: [any RegexBenchmark] = [] @@ -16,12 +26,141 @@ struct BenchmarkRunner { // Forcibly include firstMatch benchmarks for all CrossBenchmarks let includeFirstOverride: Bool + + // Register a cross-benchmark + mutating func registerCrossBenchmark( + nameBase: String, + input: String, + pattern: String, + _ type: Benchmark.MatchType, + alsoRunScalarSemantic: Bool = true + ) { + let swiftRegex = try! Regex(pattern) + let nsRegex: NSRegularExpression + if type == .whole { + nsRegex = try! NSRegularExpression(pattern: "^" + pattern + "$") + } else { + nsRegex = try! NSRegularExpression(pattern: pattern) + } + let nameSuffix = type.nameSuffix + + register( + Benchmark( + name: nameBase + nameSuffix, + regex: swiftRegex, + pattern: pattern, + type: type, + target: input)) + register( + NSBenchmark( + name: nameBase + nameSuffix + CrossBenchmark.nsSuffix, + regex: nsRegex, + type: .init(type), + target: input)) + + if alsoRunScalarSemantic { + register( + Benchmark( + name: nameBase + nameSuffix + "_Scalar", + regex: swiftRegex.matchingSemantics(.unicodeScalar), + pattern: pattern, + type: type, + target: input)) + register( + NSBenchmark( + name: nameBase + nameSuffix + "_Scalar" + CrossBenchmark.nsSuffix, + regex: nsRegex, + type: .init(type), + target: input)) + } + } + + // Register a cross-benchmark list + mutating func registerCrossBenchmark( + name: String, + inputList: [String], + pattern: String, + alsoRunScalarSemantic: Bool = true + ) { + let swiftRegex = try! Regex(pattern) + register(InputListBenchmark( + name: name, + regex: swiftRegex, + pattern: pattern, + targets: inputList + )) + register(InputListNSBenchmark( + name: name + CrossBenchmark.nsSuffix, + regex: pattern, + targets: inputList + )) + + if alsoRunScalarSemantic { + register(InputListBenchmark( + name: name + "_Scalar", + regex: swiftRegex.matchingSemantics(.unicodeScalar), + pattern: pattern, + targets: inputList + )) + register(InputListNSBenchmark( + name: name + "_Scalar" + CrossBenchmark.nsSuffix, + regex: pattern, + targets: inputList + )) + } + + } + + // Register a swift-only benchmark + mutating func register( + nameBase: String, + input: String, + pattern: String, + _ swiftRegex: Regex, + _ type: Benchmark.MatchType, + alsoRunScalarSemantic: Bool = true + ) { + let nameSuffix = type.nameSuffix + + register( + Benchmark( + name: nameBase + nameSuffix, + regex: swiftRegex, + pattern: pattern, + type: type, + target: input)) + + if alsoRunScalarSemantic { + register( + Benchmark( + name: nameBase + nameSuffix + "_Scalar", + regex: swiftRegex, + pattern: pattern, + type: type, + target: input)) + } + } - mutating func register(_ benchmark: some RegexBenchmark) { + private mutating func register(_ benchmark: NSBenchmark) { suite.append(benchmark) } - mutating func register(_ benchmark: some SwiftRegexBenchmark) { + private mutating func register(_ benchmark: Benchmark) { + var benchmark = benchmark + if enableTracing { + benchmark.enableTracing() + } + if enableMetrics { + benchmark.enableMetrics() + } + suite.append(benchmark) + } + + private mutating func register(_ benchmark: InputListNSBenchmark) { + suite.append(benchmark) + } + + private mutating func register(_ benchmark: InputListBenchmark) { var benchmark = benchmark if enableTracing { benchmark.enableTracing() diff --git a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift index 61d7b197f..27b2b07b4 100644 --- a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift +++ b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift @@ -12,53 +12,55 @@ extension BenchmarkRunner { let input = Inputs.graphemeBreakData - register(Benchmark( - name: "BasicCCC", - regex: try! Regex(basic), + // TODO: Which of these can be cross-benchmarks? + + register( + nameBase: "BasicCCC", + input: input, pattern: basic, - type: .allMatches, - target: input)) + try! Regex(basic), + .allMatches) - register(Benchmark( - name: "BasicRangeCCC", - regex: try! Regex(basicRange), + register( + nameBase: "BasicRangeCCC", + input: input, pattern: basicRange, - type: .allMatches, - target: input)) + try! Regex(basicRange), + .allMatches) - register(Benchmark( - name: "CaseInsensitiveCCC", - regex: try! Regex(caseInsensitive), + register( + nameBase: "CaseInsensitiveCCC", + input: input, pattern: caseInsensitive, - type: .allMatches, - target: input)) + try! Regex(caseInsensitive), + .allMatches) - register(Benchmark( - name: "InvertedCCC", - regex: try! Regex(inverted), + register( + nameBase: "InvertedCCC", + input: input, pattern: inverted, - type: .allMatches, - target: input)) + try! Regex(inverted), + .allMatches) - register(Benchmark( - name: "SubtractionCCC", - regex: try! Regex(subtraction), + register( + nameBase: "SubtractionCCC", + input: input, pattern: subtraction, - type: .allMatches, - target: input)) + try! Regex(subtraction), + .allMatches) - register(Benchmark( - name: "IntersectionCCC", - regex: try! Regex(intersection), + register( + nameBase: "IntersectionCCC", + input: input, pattern: intersection, - type: .allMatches, - target: input)) + try! Regex(intersection), + .allMatches) - register(Benchmark( - name: "symDiffCCC", - regex: try! Regex(symmetricDifference), + register( + nameBase: "symDiffCCC", + input: input, pattern: symmetricDifference, - type: .allMatches, - target: input)) + try! Regex(symmetricDifference), + .allMatches) } } diff --git a/Sources/RegexBenchmark/Utils/Stats.swift b/Sources/RegexBenchmark/Utils/Stats.swift index 0cc9156a4..175826a0b 100644 --- a/Sources/RegexBenchmark/Utils/Stats.swift +++ b/Sources/RegexBenchmark/Utils/Stats.swift @@ -3,8 +3,8 @@ import Foundation enum Stats {} extension Stats { - // Maximum allowed standard deviation is 5% of the median runtime - static let maxAllowedStdev = 0.05 + // Maximum allowed standard deviation is 7.5% of the median runtime + static let maxAllowedStdev = 0.075 static func tTest(_ a: Measurement, _ b: Measurement) -> Bool { // Student's t-test diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift index 484ff4648..d0fb8673d 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift @@ -50,11 +50,8 @@ extension BidirectionalCollection where Element: Comparable { public func firstRange( of other: C ) -> Range? where C.Element == Element { - let searcher = PatternOrEmpty( - searcher: TwoWaySearcher(pattern: Array(other))) - let slice = self[...] - var state = searcher.state(for: slice, in: startIndex..(pattern: Array(other), by: ==) + return searcher.search(self[...], in: startIndex.. Bool, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection> { - split(by: PredicateConsumer(predicate: predicate), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } -} - -// MARK: Single element algorithms - -extension Collection where Element: Equatable { - func split( - by separator: Element, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection> { - split(whereSeparator: { $0 == separator }, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } -} - // MARK: Fixed pattern algorithms extension Collection where Element: Equatable { @@ -180,41 +155,6 @@ extension Collection where Element: Equatable { } } -extension BidirectionalCollection where Element: Equatable { - // FIXME -// public func splitFromBack( -// separator: S -// ) -> ReversedSplitCollection> -// where S.Element == Element -// { -// splitFromBack(separator: ZSearcher(pattern: Array(separator), by: ==)) -// } -} - -extension BidirectionalCollection where Element: Comparable { - func split( - by separator: C, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection>> - where C.Element == Element - { - split( - by: PatternOrEmpty(searcher: TwoWaySearcher(pattern: Array(separator))), - maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } - - // FIXME -// public func splitFromBack( -// separator: S -// ) -> ReversedSplitCollection>> -// where S.Element == Element -// { -// splitFromBack(separator: PatternOrEmpty( -// searcher: TwoWaySearcher(pattern: Array(separator)))) -// } -} - // String split overload breakers // // These are underscored and marked as SPI so that the *actual* public overloads diff --git a/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift b/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift deleted file mode 100644 index 2428f89cd..000000000 --- a/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift +++ /dev/null @@ -1,197 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -extension BidirectionalCollection where Element: Equatable { - fileprivate func _ends(with suffix: C) -> Bool - where C.Element == Element - { - FixedPatternConsumer(pattern: suffix).consumingBack(self[...]) != nil - } - } - -struct TwoWaySearcher - where Searched.Element: Comparable -{ - // TODO: Be generic over the pattern? - let pattern: [Searched.Element] - let criticalIndex: Int - let period: Int - let periodIsExact: Bool - - init?(pattern: [Searched.Element]) { - guard !pattern.isEmpty else { return nil } - - let (criticalIndex, periodOfSecondPart) = pattern._criticalFactorization(<) - let periodIsExact = pattern[criticalIndex...] - .prefix(periodOfSecondPart) - ._ends(with: pattern[.. - ) -> State { - // FIXME: Is this 'limitedBy' requirement a sign of error? - let criticalIndex = searched.index( - range.lowerBound, offsetBy: criticalIndex, limitedBy: range.upperBound) - ?? range.upperBound - return State( - end: range.upperBound, - index: range.lowerBound, - criticalIndex: - criticalIndex, - memory: nil) - } - - func search( - _ searched: Searched, - _ state: inout State - ) -> Range? { - while state.criticalIndex != searched.endIndex { - if let end = _searchRight(searched, &state), - let start = _searchLeft(searched, &state, end) - { - state.index = end - // FIXME: Is this 'limitedBy' requirement a sign of error? - state.criticalIndex = searched.index( - end, offsetBy: criticalIndex, limitedBy: searched.endIndex) - ?? searched.endIndex - state.memory = nil - return start.. Searched.Index? { - let rStart: Int - var rIndex: Searched.Index - - if let memory = state.memory, memory.offset > criticalIndex { - rStart = memory.offset - rIndex = memory.index - } else { - rStart = criticalIndex - rIndex = state.criticalIndex - } - - for i in rStart.. Searched.Index? { - let lStart = min(state.memory?.offset ?? 0, criticalIndex) - var lIndex = state.criticalIndex - - for i in (lStart.. Bool - ) -> (index: Int, periodOfSecondPart: Int) { - let less = _maximalSuffix(isOrderedBefore) - let greater = _maximalSuffix({ isOrderedBefore($1, $0) }) - return less.index > greater.index ? less : greater - } - - func _maximalSuffix( - _ isOrderedBefore: (Element, Element) -> Bool - ) -> (index: Int, periodOfSecondPart: Int) { - var left = 0 - var right = 1 - var offset = 0 - var period = 1 - - while right + offset < count { - let a = self[right + offset] - let b = self[left + offset] - - if isOrderedBefore(a, b) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1 - offset = 0 - period = right - left - } else if isOrderedBefore(b, a) { - // Suffix is larger, start over from current location. - left = right - right += 1 - offset = 0 - period = 1 - } else { - // Advance through repetition of the current period. - offset += 1 - if offset + 1 == period { - right += offset + 1 - offset = 0 - } else { - offset += 1 - } - } - } - - return (left, period) - } -} diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt index ef4aeb6ef..f7ccd7ab9 100644 --- a/Sources/_StringProcessing/CMakeLists.txt +++ b/Sources/_StringProcessing/CMakeLists.txt @@ -23,7 +23,6 @@ add_library(_StringProcessing Algorithms/Searchers/NaivePatternSearcher.swift Algorithms/Searchers/PatternOrEmpty.swift Algorithms/Searchers/PredicateSearcher.swift - Algorithms/Searchers/TwoWaySearcher.swift Algorithms/Searchers/ZSearcher.swift Engine/Backtracking.swift Engine/Consume.swift diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index a3d864165..5e446b472 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -15,7 +15,7 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard let next = input._matchBuiltinCC( + guard let next = input.matchBuiltinCC( cc, at: currentPosition, isInverted: isInverted, @@ -30,6 +30,7 @@ extension Processor { } func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.lowerBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -40,6 +41,7 @@ extension Processor { } func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.upperBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -50,6 +52,8 @@ extension Processor { } mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { + // TODO: needs benchmark coverage + // Future work: Optimize layout and dispatch switch payload.kind { case .startOfSubject: return currentPosition == subjectBounds.lowerBound @@ -59,10 +63,10 @@ extension Processor { switch payload.semanticLevel { case .graphemeCluster: return input.index(after: currentPosition) == subjectBounds.upperBound - && input[currentPosition].isNewline + && input[currentPosition].isNewline case .unicodeScalar: return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound - && input.unicodeScalars[currentPosition].isNewline + && input.unicodeScalars[currentPosition].isNewline } case .endOfSubject: return currentPosition == subjectBounds.upperBound @@ -115,12 +119,75 @@ extension Processor { } } -// MARK: Built-in character class matching +// MARK: Matching `.` +extension String { + // TODO: Should the below have a `limitedBy` parameter? + + func matchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> String.Index? { + guard currentPosition < endIndex else { + return nil + } + if case .definite(let result) = _quickMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics) + } + + @inline(__always) + private func _quickMatchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> QuickResult { + assert(currentPosition < endIndex) + guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( + at: currentPosition + ) else { + return .unknown + } + switch asciiValue { + case (._lineFeed)...(._carriageReturn): + return .definite(nil) + default: + assert(!isCRLF) + return .definite(next) + } + } + + @inline(never) + private func _thoroughMatchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> String.Index? { + assert(currentPosition < endIndex) + if isScalarSemantics { + let scalar = unicodeScalars[currentPosition] + guard !scalar.isNewline else { return nil } + return unicodeScalars.index(after: currentPosition) + } + + let char = self[currentPosition] + guard !char.isNewline else { return nil } + return index(after: currentPosition) + } +} +// MARK: - Built-in character class matching extension String { + // TODO: Should the below have a `limitedBy` parameter? // Mentioned in ProgrammersManual.md, update docs if redesigned - func _matchBuiltinCC( + func matchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, @@ -155,7 +222,7 @@ extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(__always) - func _quickMatchBuiltinCC( + private func _quickMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, @@ -173,7 +240,7 @@ extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) - func _thoroughMatchBuiltinCC( + private func _thoroughMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index fa68b8b76..7ca3ae84a 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,26 +1,43 @@ extension Processor { func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { - var next: Input.Index? + // TODO: This optimization is only enabled for grapheme cluster semantics, + // we want these for scalar semantics as well. + switch payload.type { case .bitset: - next = _doMatchBitset(registers[payload.bitset]) + return input.matchBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: end, + isScalarSemantics: false) case .asciiChar: - next = _doMatchScalar( - UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) + return input.matchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), + at: currentPosition, + limitedBy: end, + boundaryCheck: true, + isCaseInsensitive: false) case .builtin: + // FIXME: bounds check? endIndex or end? + // We only emit .quantify if it consumes a single character - next = input._matchBuiltinCC( + return input.matchBuiltinCC( payload.builtin, at: currentPosition, isInverted: payload.builtinIsInverted, isStrictASCII: payload.builtinIsStrict, isScalarSemantics: false) case .any: - let matched = currentPosition != input.endIndex - && (!input[currentPosition].isNewline || payload.anyMatchesNewline) - next = matched ? input.index(after: currentPosition) : nil + // FIXME: endIndex or end? + guard currentPosition < input.endIndex else { return nil } + + if payload.anyMatchesNewline { + return input.index(after: currentPosition) + } + + return input.matchAnyNonNewline( + at: currentPosition, isScalarSemantics: false) } - return next } /// Generic quantify instruction interpreter @@ -30,7 +47,7 @@ extension Processor { var trips = 0 var extraTrips = payload.extraTrips var savePoint = startQuantifierSavePoint() - + while true { if trips >= payload.minTrips { if extraTrips == 0 { break } @@ -40,7 +57,14 @@ extension Processor { } } let next = _doQuantifyMatch(payload) - guard let idx = next else { break } + guard let idx = next else { + if !savePoint.rangeIsEmpty { + // The last save point has saved the current, non-matching position, + // so it's unneeded. + savePoint.shrinkRange(input) + } + break + } currentPosition = idx trips += 1 } @@ -50,12 +74,8 @@ extension Processor { return false } - if payload.quantKind == .eager && !savePoint.rangeIsEmpty { - // The last save point has saved the current position, so it's unneeded - savePoint.shrinkRange(input) - if !savePoint.rangeIsEmpty { - savePoints.append(savePoint) - } + if !savePoint.rangeIsEmpty { + savePoints.append(savePoint) } return true } diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift index 753c3c3d1..372a7e1b4 100644 --- a/Sources/_StringProcessing/Engine/Metrics.swift +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -1,13 +1,71 @@ extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED struct ProcessorMetrics { var instructionCounts: [Instruction.OpCode: Int] = [:] var backtracks: Int = 0 var resets: Int = 0 + var cycleCount: Int = 0 + + var isTracingEnabled: Bool = false + var shouldMeasureMetrics: Bool = false + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { + self.isTracingEnabled = isTracingEnabled + self.shouldMeasureMetrics = shouldMeasureMetrics + } } - +#else + struct ProcessorMetrics { + var isTracingEnabled: Bool { false } + var shouldMeasureMetrics: Bool { false } + var cycleCount: Int { 0 } + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { } + } +#endif +} + +extension Processor { + + mutating func startCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + if metrics.cycleCount == 0 { + trace() + measureMetrics() + } +#endif + } + + mutating func endCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + metrics.cycleCount += 1 + trace() + measureMetrics() + _checkInvariants() +#endif + } +} + +extension Processor.ProcessorMetrics { + + mutating func addReset() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.resets += 1 +#endif + } + + mutating func addBacktrack() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.backtracks += 1 +#endif + } +} + +extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED func printMetrics() { print("===") - print("Total cycle count: \(cycleCount)") + print("Total cycle count: \(metrics.cycleCount)") print("Backtracks: \(metrics.backtracks)") print("Resets: \(metrics.resets)") print("Instructions:") @@ -21,7 +79,7 @@ extension Processor { } mutating func measure() { - let (opcode, _) = fetch().destructure + let (opcode, _) = fetch() if metrics.instructionCounts.keys.contains(opcode) { metrics.instructionCounts[opcode]! += 1 } else { @@ -30,8 +88,9 @@ extension Processor { } mutating func measureMetrics() { - if shouldMeasureMetrics { + if metrics.shouldMeasureMetrics { measure() } } +#endif } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 25cad5c8c..0350a37db 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -35,7 +35,7 @@ struct Processor { /// of the search. `input` can be a "supersequence" of the subject, while /// `input[subjectBounds]` is the logical entity that is being searched. let input: Input - + /// The bounds of the logical subject in `input`. /// /// `subjectBounds` represents the bounds of the string or substring that a @@ -46,7 +46,7 @@ struct Processor { /// `subjectBounds` is always equal to or a subrange of /// `input.startIndex.. - + /// The bounds within the subject for an individual search. /// /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a @@ -62,7 +62,7 @@ struct Processor { let instructions: InstructionList // MARK: Resettable state - + /// The current search position while processing. /// /// `currentPosition` must always be in the range `subjectBounds` or equal @@ -81,16 +81,12 @@ struct Processor { var wordIndexCache: Set? = nil var wordIndexMaxIndex: String.Index? = nil - + var state: State = .inProgress var failureReason: Error? = nil - // MARK: Metrics, debugging, etc. - var cycleCount = 0 - var isTracingEnabled: Bool - let shouldMeasureMetrics: Bool - var metrics: ProcessorMetrics = ProcessorMetrics() + var metrics: ProcessorMetrics } extension Processor { @@ -116,14 +112,17 @@ extension Processor { self.subjectBounds = subjectBounds self.searchBounds = searchBounds self.matchMode = matchMode - self.isTracingEnabled = isTracingEnabled - self.shouldMeasureMetrics = shouldMeasureMetrics + + self.metrics = ProcessorMetrics( + isTracingEnabled: isTracingEnabled, + shouldMeasureMetrics: shouldMeasureMetrics) + self.currentPosition = searchBounds.lowerBound // Initialize registers with end of search bounds self.registers = Registers(program, searchBounds.upperBound) self.storedCaptures = Array( - repeating: .init(), count: program.registerInfo.captures) + repeating: .init(), count: program.registerInfo.captures) _checkInvariants() } @@ -144,8 +143,8 @@ extension Processor { self.state = .inProgress self.failureReason = nil - - if shouldMeasureMetrics { metrics.resets += 1 } + + metrics.addReset() _checkInvariants() } @@ -160,23 +159,22 @@ extension Processor { } extension Processor { + func fetch() -> (Instruction.OpCode, Instruction.Payload) { + instructions[controller.pc].destructure + } + var slice: Input.SubSequence { // TODO: Should we whole-scale switch to slices, or // does that depend on options for some anchors? input[searchBounds] } - // Advance in our input, without any checks or failure signalling - mutating func _uncheckedForcedConsumeOne() { - assert(currentPosition != end) - input.formIndex(after: ¤tPosition) - } - // Advance in our input // // Returns whether the advance succeeded. On failure, our // save point was restored mutating func consume(_ n: Distance) -> Bool { + // TODO: needs benchmark coverage guard let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -186,9 +184,10 @@ extension Processor { currentPosition = idx return true } - + // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { + // TODO: needs benchmark coverage guard let idx = input.unicodeScalars.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -220,29 +219,29 @@ extension Processor { func load() -> Element? { currentPosition < end ? input[currentPosition] : nil } - func load(count: Int) -> Input.SubSequence? { - let slice = self.slice[currentPosition...].prefix(count) - guard slice.count == count else { return nil } - return slice - } + + // MARK: Match functions + // + // TODO: refactor these such that `cycle()` calls the corresponding String + // method directly, and all the step, signalFailure, and + // currentPosition logic is collected into a single place inside + // cycle(). // Match against the current input element. Returns whether // it succeeded vs signaling an error. - mutating func match(_ e: Element) -> Bool { - guard let cur = load(), cur == e else { - signalFailure() - return false - } - _uncheckedForcedConsumeOne() - return true - } - - mutating func matchCaseInsensitive(_ e: Element) -> Bool { - guard let cur = load(), cur.lowercased() == e.lowercased() else { + mutating func match( + _ e: Element, isCaseInsensitive: Bool + ) -> Bool { + guard let next = input.match( + e, + at: currentPosition, + limitedBy: end, + isCaseInsensitive: isCaseInsensitive + ) else { signalFailure() return false } - _uncheckedForcedConsumeOne() + currentPosition = next return true } @@ -250,81 +249,54 @@ extension Processor { // it succeeded vs signaling an error. mutating func matchSeq( _ seq: Substring, - isScalarMode: Bool + isScalarSemantics: Bool ) -> Bool { - if isScalarMode { - for s in seq.unicodeScalars { - guard matchScalar(s, boundaryCheck: false) else { return false } - } - return true - } - - for e in seq { - guard match(e) else { return false } - } - return true - } - - func loadScalar() -> Unicode.Scalar? { - currentPosition < end ? input.unicodeScalars[currentPosition] : nil - } - - func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Input.Index? { - if s == loadScalar(), - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { - return idx - } else { - return nil - } - } - - mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - guard let next = _doMatchScalar(s, boundaryCheck) else { + guard let next = input.matchSeq( + seq, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics + ) else { signalFailure() return false } + currentPosition = next return true } - mutating func matchScalarCaseInsensitive( + mutating func matchScalar( _ s: Unicode.Scalar, - boundaryCheck: Bool + boundaryCheck: Bool, + isCaseInsensitive: Bool ) -> Bool { - guard let curScalar = loadScalar(), - s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) - else { + guard let next = input.matchScalar( + s, + at: currentPosition, + limitedBy: end, + boundaryCheck: boundaryCheck, + isCaseInsensitive: isCaseInsensitive + ) else { signalFailure() return false } - currentPosition = idx + currentPosition = next return true } - func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Input.Index? { - if let cur = load(), bitset.matches(char: cur) { - return input.index(after: currentPosition) - } else { - return nil - } - } - // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset mutating func matchBitset( - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + isScalarSemantics: Bool ) -> Bool { - guard let next = _doMatchBitset(bitset) else { + guard let next = input.matchBitset( + bitset, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics + ) else { signalFailure() return false } @@ -332,37 +304,18 @@ extension Processor { return true } - // Equivalent of matchBitset but emitted when in unicode scalar semantic mode - mutating func matchBitsetScalar( - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + // Matches the next character/scalar if it is not a newline + mutating func matchAnyNonNewline( + isScalarSemantics: Bool ) -> Bool { - guard let curScalar = loadScalar(), - bitset.matches(scalar: curScalar), - let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { - signalFailure() - return false - } - currentPosition = idx - return true - } - - // Matches the next character if it is not a newline - mutating func matchAnyNonNewline() -> Bool { - guard let c = load(), !c.isNewline else { - signalFailure() - return false - } - _uncheckedForcedConsumeOne() - return true - } - - // Matches the next scalar if it is not a newline - mutating func matchAnyNonNewlineScalar() -> Bool { - guard let s = loadScalar(), !s.isNewline else { + guard let next = input.matchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics + ) else { signalFailure() return false } - input.unicodeScalars.formIndex(after: ¤tPosition) + currentPosition = next return true } @@ -401,8 +354,8 @@ extension Processor { storedCaptures = capEnds registers.ints = intRegisters registers.positions = posRegisters - - if shouldMeasureMetrics { metrics.backtracks += 1 } + + metrics.addBacktrack() } mutating func abort(_ e: Error? = nil) { @@ -436,25 +389,15 @@ extension Processor { // TODO: What should we do here? fatalError("Invalid code: Tried to clear save points when empty") } - + mutating func cycle() { _checkInvariants() assert(state == .inProgress) -#if PROCESSOR_MEASUREMENTS_ENABLED - if cycleCount == 0 { - trace() - measureMetrics() - } - defer { - cycleCount += 1 - trace() - measureMetrics() - _checkInvariants() - } -#endif + startCycleMetrics() + defer { endCycleMetrics() } - let (opcode, payload) = fetch().destructure + let (opcode, payload) = fetch() switch opcode { case .invalid: fatalError("Invalid program") @@ -535,50 +478,30 @@ extension Processor { } } case .matchAnyNonNewline: - if payload.isScalar { - if matchAnyNonNewlineScalar() { - controller.step() - } - } else { - if matchAnyNonNewline() { - controller.step() - } + if matchAnyNonNewline(isScalarSemantics: payload.isScalar) { + controller.step() } case .match: let (isCaseInsensitive, reg) = payload.elementPayload - if isCaseInsensitive { - if matchCaseInsensitive(registers[reg]) { - controller.step() - } - } else { - if match(registers[reg]) { - controller.step() - } + if match(registers[reg], isCaseInsensitive: isCaseInsensitive) { + controller.step() } case .matchScalar: let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload - if caseInsensitive { - if matchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) { - controller.step() - } - } else { - if matchScalar(scalar, boundaryCheck: boundaryCheck) { - controller.step() - } + if matchScalar( + scalar, + boundaryCheck: boundaryCheck, + isCaseInsensitive: caseInsensitive + ) { + controller.step() } case .matchBitset: let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] - if isScalar { - if matchBitsetScalar(bitset) { - controller.step() - } - } else { - if matchBitset(bitset) { - controller.step() - } + if matchBitset(bitset, isScalarSemantics: isScalar) { + controller.step() } case .matchBuiltin: @@ -669,7 +592,7 @@ extension Processor { signalFailure() return } - if matchSeq(input[range], isScalarMode: isScalarMode) { + if matchSeq(input[range], isScalarSemantics: isScalarMode) { controller.step() } @@ -714,3 +637,122 @@ extension Processor { } } } + +// MARK: String matchers +// +// TODO: Refactor into separate file, formalize patterns + +extension String { + + func match( + _ char: Character, + at pos: Index, + limitedBy end: String.Index, + isCaseInsensitive: Bool + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + guard pos < end else { return nil } + + if isCaseInsensitive { + guard self[pos].lowercased() == char.lowercased() else { return nil } + } else { + guard self[pos] == char else { return nil } + } + + let idx = index(after: pos) + guard idx <= end else { return nil } + + return idx + } + + func matchSeq( + _ seq: Substring, + at pos: Index, + limitedBy end: Index, + isScalarSemantics: Bool + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + var cur = pos + + if isScalarSemantics { + for e in seq.unicodeScalars { + guard cur < end, unicodeScalars[cur] == e else { return nil } + self.unicodeScalars.formIndex(after: &cur) + } + } else { + for e in seq { + guard cur < end, self[cur] == e else { return nil } + self.formIndex(after: &cur) + } + } + + guard cur <= end else { return nil } + return cur + } + + func matchScalar( + _ scalar: Unicode.Scalar, + at pos: Index, + limitedBy end: String.Index, + boundaryCheck: Bool, + isCaseInsensitive: Bool + ) -> Index? { + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + assert(end <= endIndex) + + guard pos < end else { return nil } + let curScalar = unicodeScalars[pos] + + if isCaseInsensitive { + guard curScalar.properties.lowercaseMapping == scalar.properties.lowercaseMapping + else { + return nil + } + } else { + guard curScalar == scalar else { return nil } + } + + let idx = unicodeScalars.index(after: pos) + guard idx <= end else { return nil } + + if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { + return nil + } + + return idx + } + + func matchBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + at pos: Index, + limitedBy end: Index, + isScalarSemantics: Bool + ) -> Index? { + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + assert(end <= endIndex) + + guard pos < end else { return nil } + + let idx: String.Index + if isScalarSemantics { + guard bitset.matches(unicodeScalars[pos]) else { return nil } + idx = unicodeScalars.index(after: pos) + } else { + guard bitset.matches(self[pos]) else { return nil } + idx = index(after: pos) + } + + guard idx <= end else { return nil } + return idx + } + + +} diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 725319b00..b0ce67555 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -9,7 +9,12 @@ // //===----------------------------------------------------------------------===// + +// TODO: Remove this protocol (and/or reuse it for something like a FastProcessor) extension Processor: TracedProcessor { + var cycleCount: Int { metrics.cycleCount } + var isTracingEnabled: Bool { metrics.isTracingEnabled } + var isFailState: Bool { state == .fail } var isAcceptState: Bool { state == .accept } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 253858d1f..0453fcd80 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -31,7 +31,7 @@ struct Executor { subjectBounds: subjectBounds, searchBounds: searchBounds) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif var low = searchBounds.lowerBound let high = searchBounds.upperBound @@ -60,7 +60,7 @@ struct Executor { var cpu = engine.makeProcessor( input: input, bounds: subjectBounds, matchMode: mode) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif return try _match(input, from: subjectBounds.lowerBound, using: &cpu) } diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 5150e18cc..de13e340a 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -9,26 +9,25 @@ // //===----------------------------------------------------------------------===// -private var _lineFeed: UInt8 { 0x0A } -private var _carriageReturn: UInt8 { 0x0D } -private var _lineTab: UInt8 { 0x0B } -private var _formFeed: UInt8 { 0x0C } -private var _space: UInt8 { 0x20 } -private var _tab: UInt8 { 0x09 } +extension UInt8 { + static var _lineFeed: UInt8 { 0x0A } + static var _carriageReturn: UInt8 { 0x0D } + static var _lineTab: UInt8 { 0x0B } + static var _formFeed: UInt8 { 0x0C } + static var _space: UInt8 { 0x20 } + static var _tab: UInt8 { 0x09 } + + static var _underscore: UInt8 { 0x5F } +} private var _0: UInt8 { 0x30 } private var _9: UInt8 { 0x39 } -private func _isASCIINumber(_ x: UInt8) -> Bool { - return (_0..._9).contains(x) -} private var _a: UInt8 { 0x61 } private var _z: UInt8 { 0x7A } private var _A: UInt8 { 0x41 } private var _Z: UInt8 { 0x5A } -private var _underscore: UInt8 { 0x5F } - extension UInt8 { var _isASCII: Bool { self < 0x80 } @@ -43,14 +42,14 @@ extension UInt8 { /// Assuming we're ASCII, whether we match `\h` var _asciiIsHorizontalWhitespace: Bool { assert(_isASCII) - return self == _space || self == _tab + return self == ._space || self == ._tab } /// Assuming we're ASCII, whether we match `\v` var _asciiIsVerticalWhitespace: Bool { assert(_isASCII) switch self { - case _lineFeed, _carriageReturn, _lineTab, _formFeed: + case ._lineFeed, ._carriageReturn, ._lineTab, ._formFeed: return true default: return false @@ -61,7 +60,7 @@ extension UInt8 { var _asciiIsWhitespace: Bool { assert(_isASCII) switch self { - case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: + case ._space, ._tab, ._lineFeed, ._lineTab, ._formFeed, ._carriageReturn: return true default: return false @@ -77,11 +76,13 @@ extension UInt8 { /// Assuming we're ASCII, whether we match `\w` var _asciiIsWord: Bool { assert(_isASCII) - return _asciiIsDigit || _asciiIsLetter || self == _underscore + return _asciiIsDigit || _asciiIsLetter || self == ._underscore } } extension String { + /// TODO: better to take isScalarSemantics parameter, we can return more results + /// and we can give the right `next` index, not requiring the caller to re-adjust it /// TODO: detailed description of nuanced semantics func _quickASCIICharacter( at idx: Index @@ -107,7 +108,7 @@ extension String { guard tail._isSub300StartingByte else { return nil } // Handle CR-LF: - if base == _carriageReturn && tail == _lineFeed { + if base == ._carriageReturn && tail == ._lineFeed { utf8.formIndex(after: &next) guard next == endIndex || utf8[next]._isSub300StartingByte else { return nil @@ -165,5 +166,6 @@ extension String { return (next, asciiValue._asciiIsWord) } } + } diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 50da079f6..f1f9573c1 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -12,6 +12,7 @@ @_spi(_Unicode) import Swift +// TODO: Sink onto String extension Processor { func atSimpleBoundary( _ usesAsciiWord: Bool, @@ -20,9 +21,11 @@ extension Processor { func matchesWord(at i: Input.Index) -> Bool { switch semanticLevel { case .graphemeCluster: + // TODO: needs benchmark coverage let c = input[i] return c.isWordCharacter && (c.isASCII || !usesAsciiWord) case .unicodeScalar: + // TODO: needs benchmark coverage let c = input.unicodeScalars[i] return (c.properties.isAlphabetic || c == "_") && (c.isASCII || !usesAsciiWord) } @@ -51,6 +54,7 @@ extension String { using cache: inout Set?, _ maxIndex: inout String.Index? ) -> Bool { + // TODO: needs benchmark coverage guard i != startIndex, i != endIndex else { return true } diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift index ad3159820..e063447a0 100644 --- a/Sources/_StringProcessing/Utility/AsciiBitset.swift +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -57,7 +57,7 @@ extension DSLTree.CustomCharacterClass { } } - internal func matches(char: Character) -> Bool { + internal func matches(_ char: Character) -> Bool { let matched: Bool if let val = char._singleScalarAsciiValue { matched = matches(val) @@ -71,7 +71,7 @@ extension DSLTree.CustomCharacterClass { return matched } - internal func matches(scalar: Unicode.Scalar) -> Bool { + internal func matches(_ scalar: Unicode.Scalar) -> Bool { let matched: Bool if scalar.isASCII { let val = UInt8(ascii: scalar) diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 7542a17dd..24ffbcf70 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -44,13 +44,3 @@ protocol ProcessorProtocol { var registers: Registers { get } } -extension ProcessorProtocol { - func fetch() -> Instruction { - instructions[currentPC] - } - - var callStack: Array { [] } -// var savePoints: Array { [] } - var registers: Array { [] } - -} diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index 112a601b1..198564fe1 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -13,7 +13,7 @@ // TODO: Place shared formatting and trace infrastructure here protocol Traced { - var isTracingEnabled: Bool { get set } + var isTracingEnabled: Bool { get } } protocol TracedProcessor: ProcessorProtocol, Traced { diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index cdee66ddb..c053b31e4 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -79,7 +79,7 @@ struct _CharacterClassModel: Hashable { let isScalarSemantics = matchLevel == .unicodeScalar - return input._matchBuiltinCC( + return input.matchBuiltinCC( cc, at: currentPosition, isInverted: isInverted, diff --git a/Tests/RegexBuilderTests/AlgorithmsTests.swift b/Tests/RegexBuilderTests/AlgorithmsTests.swift index 115941070..b85395d19 100644 --- a/Tests/RegexBuilderTests/AlgorithmsTests.swift +++ b/Tests/RegexBuilderTests/AlgorithmsTests.swift @@ -16,7 +16,7 @@ import RegexBuilder @available(SwiftStdlib 5.7, *) class RegexConsumerTests: XCTestCase { func testMatches() { - let regex = Capture(OneOrMore(.digit)) { 2 * Int($0)! } + let regex = Capture<(Substring, Int)>(OneOrMore(.digit)) { 2 * Int($0)! } let str = "foo 160 bar 99 baz" XCTAssertEqual(str.matches(of: regex).map(\.output.1), [320, 198]) } diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index 26746d613..848ef4626 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -64,7 +64,7 @@ private struct IntParser: CustomConsumingRegexComponent { guard index != bounds.upperBound else { return nil } let r = Regex { - Capture(OneOrMore(.digit)) { Int($0) } + Capture<(Substring, Int?)>(OneOrMore(.digit)) { Int($0) } } guard let match = input[index..(Repeat(.digit, count: 2)) { Int($0) } } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 5e85ad26c..19ac675dc 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1091,7 +1091,7 @@ class RegexDSLTests: XCTestCase { OneOrMore("a") Capture { TryCapture("b", transform: { Int($0) }) - ZeroOrMore( + ZeroOrMore<(Substring, Double?)>( TryCapture("c", transform: { Double($0) }) ) Optionally("e") @@ -1542,12 +1542,12 @@ class RegexDSLTests: XCTestCase { in bounds: Range ) throws -> (upperBound: String.Index, output: SemanticVersion)? { let regex = Regex { - TryCapture(OneOrMore(.digit)) { Int($0) } + TryCapture<(Substring, Int)>(OneOrMore(.digit)) { Int($0) } "." - TryCapture(OneOrMore(.digit)) { Int($0) } + TryCapture<(Substring, Int)>(OneOrMore(.digit)) { Int($0) } Optionally { "." - TryCapture(OneOrMore(.digit)) { Int($0) } + TryCapture<(Substring, Int)>(OneOrMore(.digit)) { Int($0) } } Optionally { "-" @@ -1876,7 +1876,7 @@ extension RegexDSLTests { ":" regexWithTooManyCaptures ":" - TryCapture(OneOrMore(.word)) { Int($0) } + TryCapture<(Substring, Int)>(OneOrMore(.word)) { Int($0) } #/:(\d+):/# } XCTAssert(type(of: dslWithTooManyCaptures).self diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index d5419fe9c..60548a0a2 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -165,8 +165,12 @@ class AlgorithmTests: XCTestCase { let actualCol: [Range] = input.ranges(of: pattern)[...].map(input.offsets(of:)) XCTAssertEqual(actualCol, expected, file: file, line: line) - let firstRange = input.firstRange(of: pattern).map(input.offsets(of:)) - XCTAssertEqual(firstRange, expected.first, file: file, line: line) + let firstRange = input.firstRange(of: pattern) + XCTAssertEqual(firstRange.map(input.offsets(of:)), expected.first, file: file, line: line) + if let upperBound = firstRange?.upperBound, !pattern.isEmpty { + let secondRange = input[upperBound...].firstRange(of: pattern).map(input.offsets(of:)) + XCTAssertEqual(secondRange, expected.dropFirst().first, file: file, line: line) + } } expectRanges("", "", [0..<0]) @@ -176,6 +180,9 @@ class AlgorithmTests: XCTestCase { expectRanges("abcde", "bcd", [1..<4]) expectRanges("ababacabababa", "abababa", [6..<13]) expectRanges("ababacabababa", "aba", [0..<3, 6..<9, 10..<13]) + + // Test for rdar://92794248 + expectRanges("ADACBADADACBADACB", "ADACB", [0..<5, 7..<12, 12..<17]) } // rdar://105154010 diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a1bf0e76f..a6c9babbe 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1891,6 +1891,11 @@ extension RegexTests { func testSingleLineMode() { firstMatchTest(#".+"#, input: "a\nb", match: "a") firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") + + // We recognize LF, line tab, FF, and CR as newlines by default + firstMatchTest(#"."#, input: "\u{A}\u{B}\u{C}\u{D}\nb", match: "b") + firstMatchTest(#".+"#, input: "\u{A}\u{B}\u{C}\u{D}\nbb", match: "bb") + } func testMatchNewlines() { @@ -2574,4 +2579,35 @@ extension RegexTests { func testFuzzerArtifacts() throws { expectCompletion(regex: #"(b?)\1*"#, in: "a") } + + func testIssue640() throws { + // Original report from https://github.com/apple/swift-experimental-string-processing/issues/640 + let original = try Regex("[1-9][0-9]{0,2}(?:,?[0-9]{3})*") + XCTAssertNotNil("36,769".wholeMatch(of: original)) + XCTAssertNotNil("36769".wholeMatch(of: original)) + + // Simplified case + let simplified = try Regex("a{0,2}a") + XCTAssertNotNil("aaa".wholeMatch(of: simplified)) + + for max in 1...8 { + let patternEager = "a{0,\(max)}a" + let regexEager = try Regex(patternEager) + let patternReluctant = "a{0,\(max)}?a" + let regexReluctant = try Regex(patternReluctant) + for length in 1...(max + 1) { + let str = String(repeating: "a", count: length) + if str.wholeMatch(of: regexEager) == nil { + XCTFail("Didn't match '\(patternEager)' in '\(str)' (\(max),\(length)).") + } + if str.wholeMatch(of: regexReluctant) == nil { + XCTFail("Didn't match '\(patternReluctant)' in '\(str)' (\(max),\(length)).") + } + } + + let possessiveRegex = try Regex("a{0,\(max)}+a") + let str = String(repeating: "a", count: max + 1) + XCTAssertNotNil(str.wholeMatch(of: possessiveRegex)) + } + } }