diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift index c61c37fd..33001887 100644 --- a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -12,7 +12,7 @@ internal import _RegexParser extension Compiler.ByteCodeGen { - mutating func emitRoot(_ root: DSLList) throws -> MEProgram { + mutating func emitRoot(_ root: inout DSLList) throws -> MEProgram { // If the whole regex is a matcher, then the whole-match value // is the constructed value. Denote that the current value // register is the processor's value output. @@ -22,7 +22,11 @@ extension Compiler.ByteCodeGen { default: break } - + + if optimizationsEnabled { + root.autoPossessify() + } + var list = root.nodes[...] try emitNode(&list) @@ -352,15 +356,7 @@ fileprivate extension Compiler.ByteCodeGen { _ kind: DSLTree.QuantificationKind, _ list: inout ArraySlice ) throws { - let updatedKind: AST.Quantification.Kind - switch kind { - case .explicit(let kind): - updatedKind = kind.ast - case .syntax(let kind): - updatedKind = kind.ast.applying(options) - case .default: - updatedKind = options.defaultQuantificationKind - } + let updatedKind = kind.applying(options: options) let (low, high) = amount.bounds guard let low = low else { diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d6ec4d71..24c94da1 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -506,15 +506,7 @@ extension Compiler.ByteCodeGen { _ kind: DSLTree.QuantificationKind, _ child: DSLTree.Node ) throws { - let updatedKind: AST.Quantification.Kind - switch kind { - case .explicit(let kind): - updatedKind = kind.ast - case .syntax(let kind): - updatedKind = kind.ast.applying(options) - case .default: - updatedKind = options.defaultQuantificationKind - } + let updatedKind = kind.applying(options: options) let (low, high) = amount.bounds guard let low = low else { diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index e2fd2a28..25e6e4cf 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -47,13 +47,13 @@ class Compiler { __consuming func emitViaList() throws -> MEProgram { // TODO: Handle global options - let dslList = DSLList(tree: tree) + var dslList = DSLList(tree: tree) var codegen = ByteCodeGen( options: options, compileOptions: compileOptions, captureList: tree.captureList) - return try codegen.emitRoot(dslList) + return try codegen.emitRoot(&dslList) } } diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index fa80f032..e1dc3fa2 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -224,13 +224,16 @@ extension LiteralPrinter { } mutating func outputQuantificationKind(_ kind: DSLTree.QuantificationKind) { - switch kind { - case .`default`: + guard let astKind = kind.quantificationKind?.ast else { // We can treat this as if the current default had been given explicity. outputQuantificationKind( .explicit(.init(ast: options.defaultQuantificationKind))) - case let .explicit(kind): - switch kind.ast { + return + } + + if kind.isExplicit { + // Explicitly provided modifiers need to match the current option state. + switch astKind { case .eager: output(options.isReluctantByDefault ? "?" : "") case .reluctant: @@ -242,9 +245,9 @@ extension LiteralPrinter { fatalError() #endif } - case let .syntax(kind): + } else { // Syntactically-specified quantification modifiers can stay as-is. - switch kind.ast { + switch astKind { case .eager: output("") case .reluctant: diff --git a/Sources/_StringProcessing/Optimizations/AutoPossessification.swift b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift new file mode 100644 index 00000000..03993940 --- /dev/null +++ b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift @@ -0,0 +1,397 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +extension DSLList { + private func _requiredAtomImpl( + _ position: inout Int, + options: inout MatchingOptions, + allowOptionsChanges: Bool + ) -> DSLTree.Atom?? { + guard position < nodes.count else { + return nil + } + + switch nodes[position] { + case .atom(let atom): + switch atom { + case .changeMatchingOptions(let seq): + // Exit early if an atom changes the matching options. + // TODO: Allow some/all options changes. + if allowOptionsChanges { + options.apply(seq.ast) + return nil + } else { + return .some(nil) + } + default: + return atom + } + + // In a concatenation, the first definitive child provides the answer, + // and then we need to skip past (in some cases at least) the remaining + // concatenation elements. + case .concatenation(let children): + var result: DSLTree.Atom?? = nil + var i = 0 + while i < children.count { + i += 1 + position += 1 + if let r = _requiredAtomImpl(&position, options: &options, allowOptionsChanges: allowOptionsChanges) { + result = r + break + } + } + + for _ in i.. DSLTree.Atom? { + var position = 0 + var options = MatchingOptions() + return _requiredAtomImpl(&position, options: &options, allowOptionsChanges: allowOptionsChanges) ?? nil + } + + internal mutating func autoPossessifyNextQuantification( + _ position: inout Int, + options: inout MatchingOptions + ) -> (Int, DSLTree.Atom)? { + guard position < nodes.count else { + return nil + } + + switch nodes[position] { + case .quantification(_, _, _): + let quantPosition = position + position += 1 + + // Do a search within this quantification's contents + // FIXME: How to handle an inner quantification surfacing here? + var innerPosition = position + _ = autoPossessifyNextQuantification(&innerPosition, options: &options) + + switch _requiredAtomImpl(&position, options: &options, allowOptionsChanges: false) { + case .some(let atom?): + return (quantPosition, atom) + case .none, .some(.none): + return nil + } + + case .concatenation(let children): + // If we find a valid quantification among this concatenation's components, + // we must look for a required atom in the sibling. If a definitive result + // is not found, pop up the recursion stack to find a sibling at a higher + // level. + var foundQuantification: (Int, DSLTree.Atom)? = nil + var foundNextAtom: DSLTree.Atom? = nil + var i = 0 + position += 1 + while i < children.count { + i += 1 + if let result = autoPossessifyNextQuantification(&position, options: &options) { + foundQuantification = result + break + } + } + + while i < children.count { + i += 1 + position += 1 + if let result = _requiredAtomImpl(&position, options: &options, allowOptionsChanges: false) { + foundNextAtom = result + break + } + } + + for _ in i.. Bool { + switch (self, other) { + case (.char(let a), .char(let b)): + // Two characters are mutually exclusive if one does not match against + // the other. + // + // Relevant options: + // - semantic level + // - case insensitivity + + if options.semanticLevel == .graphemeCluster { + // Just call String.match(Character, ...) + let s = String(a) + return nil == s.match( + b, at: s.startIndex, + limitedBy: s.endIndex, + isCaseInsensitive: options.isCaseInsensitive) + } else { + // Call String.matchScalar(Scalar, ...) for each in scalar sequence + let s = String(a) + var i = s.startIndex + var j = b.unicodeScalars.startIndex + while i < s.endIndex { + guard j < b.unicodeScalars.endIndex else { return true } + guard let nextIndex = s.matchScalar(b.unicodeScalars[j], at: i, limitedBy: s.endIndex, boundaryCheck: false, isCaseInsensitive: options.isCaseInsensitive) else { + return true + } + i = nextIndex + b.unicodeScalars.formIndex(after: &j) + } + return false + } + + case (.scalar(let a), .scalar(let b)): + // Two scalars are mutually exclusive if one does not match against + // the other. + // + // Relevant options: + // - case insensitivity + let s = String(a) + return nil == s.matchScalar( + b, at: s.startIndex, + limitedBy: s.endIndex, + boundaryCheck: false, + isCaseInsensitive: options.isCaseInsensitive) + + case (.characterClass(let a), .characterClass(let b)): + // Certain character classes are mutually exclusive of each other. + return a.excludes(b, options: options) + + // For character class and char/scalar, we can test against the class's model. + case (.characterClass(let a), .char(let b)), (.char(let b), .characterClass(let a)): + let s = "\(b)" + return nil == a.asRuntimeModel(options).matches(in: s, at: s.startIndex, limitedBy: s.endIndex) + case (.characterClass(let a), .scalar(let b)), (.scalar(let b), .characterClass(let a)): + let s = "\(b)" + return nil == a.asRuntimeModel(options).matches(in: s, at: s.startIndex, limitedBy: s.endIndex) + + default: + return false + } + } +} + +extension DSLTree.Atom.CharacterClass { + func excludes(_ other: Self, options: MatchingOptions) -> Bool { + if other == .anyGrapheme || other == .anyUnicodeScalar { + return false + } + + return switch self { + case .anyGrapheme, .anyUnicodeScalar: + false + + case .digit: + switch other { + case .whitespace, .horizontalWhitespace, .verticalWhitespace, .newlineSequence, + .notWord, .notDigit: true + default: false + } + case .notDigit: + other == .digit + + case .horizontalWhitespace: + switch other { + case .word, .digit, .verticalWhitespace, .newlineSequence, + .notWhitespace, .notHorizontalWhitespace: true + default: false + } + case .notHorizontalWhitespace: + other == .horizontalWhitespace + + case .newlineSequence: + switch other { + case .word, .digit, .horizontalWhitespace, .notNewline: true + default: false + } + case .notNewline: + other == .newlineSequence + + case .whitespace: + switch other { + case .word, .digit, .notWhitespace: true + default: false + } + case .notWhitespace: + other == .whitespace + + case .verticalWhitespace: + switch other { + case .word, .digit, .notWhitespace, .notVerticalWhitespace: true + default: false + } + case .notVerticalWhitespace: + other == .verticalWhitespace + + case .word: + switch other { + case .whitespace, .horizontalWhitespace, .verticalWhitespace, .newlineSequence, + .notWord: true + default: false + } + case .notWord: + other == .word + } + } +} diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift index 1bbb0c9c..f8d09a95 100644 --- a/Sources/_StringProcessing/Regex/DSLList.swift +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -94,3 +94,35 @@ extension DSLTree { }) } } + +extension DSLList { + internal func skipNode(_ position: inout Int) { + guard position < nodes.count else { + return + } + switch nodes[position] { + case let .orderedChoice(children): + let n = children.count + for _ in 0.. Self { + .init(quantificationKind: kind, isExplicit: true, canAutoPossessify: nil) + } + /// A kind set via syntax, which can be affected by options. - case syntax(_AST.QuantificationKind) + static func syntax(_ kind: _AST.QuantificationKind) -> Self { + .init(quantificationKind: kind, isExplicit: false, canAutoPossessify: nil) + } var ast: AST.Quantification.Kind? { - switch self { - case .default: return nil - case .explicit(let kind), .syntax(let kind): - return kind.ast + quantificationKind?.ast + } + + func applying(options: MatchingOptions) -> AST.Quantification.Kind { + guard let kind = quantificationKind?.ast else { + return options.defaultQuantificationKind + } + return if isExplicit { + kind + } else { + kind.applying(options) } } } @@ -889,6 +908,146 @@ extension DSLTree.Node { } } +// MARK: Required first and last atoms + +extension DSLTree.Node { + private func _requiredAtomImpl(forward: Bool) -> DSLTree.Atom?? { + switch self { + case .atom(let atom): + return switch atom { + case .changeMatchingOptions: + nil + default: + atom + } + + // In a concatenation, the first definitive child provides the answer. + case .concatenation(let children): + if forward { + for child in children { + if let result = child._requiredAtomImpl(forward: forward) { + return result + } + } + } else { + for child in children.reversed() { + if let result = child._requiredAtomImpl(forward: forward) { + return result + } + } + } + return nil + + // For a quoted literal, we can look at the first char + // TODO: matching semantics??? + case .quotedLiteral(let str): + return str.first.map(DSLTree.Atom.char) + + // TODO: custom character classes could/should participate here somehow + case .customCharacterClass: + return .some(nil) + + // Trivia/empty have no effect. + case .trivia, .empty: + return nil + + // For alternation and conditional, no required first (this could change + // if we identify the _same_ required first atom across all possibilities). + case .orderedChoice, .conditional: + return .some(nil) + + // Groups (and other parent nodes) defer to the child. + case .nonCapturingGroup(_, let child), .capture(_, _, let child, _), + .ignoreCapturesInTypedOutput(let child), + .limitCaptureNesting(let child): + return child._requiredAtomImpl(forward: forward) + + // A quantification that doesn't require its child to exist can still + // allow a start-only match. (e.g. `/(foo)?^bar/`) + case .quantification(let amount, _, let child): + return amount.requiresAtLeastOne + ? child._requiredAtomImpl(forward: forward) + : .some(nil) + + // Extended behavior isn't known, so we return `false` for safety. + case .consumer, .matcher, .characterPredicate, .absentFunction: + return .some(nil) + } + } + + internal func requiredFirstAtom() -> DSLTree.Atom? { + self._requiredAtomImpl(forward: true) ?? nil + } + + internal func requiredLastAtom() -> DSLTree.Atom? { + self._requiredAtomImpl(forward: false) ?? nil + } +} + + +private func _requiredAtomImpl(_ list: inout ArraySlice) -> DSLTree.Atom?? { + guard let node = list.popFirst() else { + return nil + } + switch node { + case .atom(let atom): + return switch atom { + case .changeMatchingOptions: + nil + default: + atom + } + + // In a concatenation, the first definitive child provides the answer. + case .concatenation(let children): + for _ in 0..) -> DSLTree.Atom? { + _requiredAtomImpl(&list) ?? nil +} + // MARK: AST wrapper types // // These wrapper types are required because even @_spi-marked public APIs can't @@ -952,6 +1111,14 @@ extension DSLTree { internal var isNegativeLookahead: Bool { self.ast == .negativeLookahead } + + internal var isChangeMatchingOptions: Bool { + if case let .changeMatchingOptions = ast { + return true + } else { + return false + } + } } @_spi(RegexBuilder) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e36285ae..a87112b9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -37,12 +37,15 @@ func _roundTripLiteral( return remadeRegex } +// Validate that the given regex compiles to the same instructions whether +// as a tree (original) or a list (new). We need to compile with optimizations +// disabled, since new optimizations are primarily landing in list compilation. func _validateListCompilation( _ regex: Regex ) throws -> Bool { - let treeCompiler = Compiler(tree: regex.program.tree) + let treeCompiler = Compiler(tree: regex.program.tree, compileOptions: .disableOptimizations) let treeProgram = try treeCompiler.emitViaTree() - let listCompiler = Compiler(tree: regex.program.tree) + let listCompiler = Compiler(tree: regex.program.tree, compileOptions: .disableOptimizations) let listProgram = try listCompiler.emitViaList() return treeProgram.instructions == listProgram.instructions } @@ -734,6 +737,31 @@ extension RegexTests { ("baaaaabc", nil), ("baaaaaaaabc", nil)) + // Auto-possessification tests: + // - case sensitive + firstMatchTests( + "a+A", + ("aaaaA", "aaaaA"), + ("aaaaa", nil), + ("aaAaa", "aaA")) + // - case insensitive + firstMatchTests( + "(?i:a+A)", + ("aaaaA", "aaaaA"), + ("aaaaa", "aaaaa")) + firstMatchTests( + "(?i)a+A", + ("aaaaA", "aaaaA"), + ("aaaaa", "aaaaa")) + firstMatchTests( + "a+(?i:A)", + ("aaaaA", "aaaaA"), + ("aaaaa", "aaaaa")) + firstMatchTests( + "a+(?:(?i)A)", + ("aaaaA", "aaaaA"), + ("aaaaa", "aaaaa")) + // XFAIL'd possessive tests firstMatchTests( "a?+a", diff --git a/Tests/RegexTests/OptimizationTests.swift b/Tests/RegexTests/OptimizationTests.swift new file mode 100644 index 00000000..40bcd201 --- /dev/null +++ b/Tests/RegexTests/OptimizationTests.swift @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import Testing +@testable @_spi(RegexBuilder) import _StringProcessing +@testable import _RegexParser + +@Suite struct OptimizationTests { + @available(macOS 9999, *) + @Test(arguments: [#/a/#, #/a+/#, #/(?:a+)/#, #/(?:a)+/#, #/(?m)a+/#, #/ab?c/#, #/(?:a+)+$/#, #/(?:(?:a+b)+b)/#]) + func requiredFirstAtom(pattern: Regex) throws { + let list = DSLList(tree: pattern.program.tree) + let atom = list.requiredFirstAtom(allowOptionsChanges: true) + #expect(atom?.literalCharacterValue == "a", "Missing first character atom in '\(pattern._literalPattern!)'") + } + + @available(macOS 9999, *) + @Test(arguments: [#/a?/#, #/(?:a|b)/#, #/[a]/#, #/a?bc/#]) + func noRequiredFirstAtom(pattern: Regex) throws { + let list = DSLList(tree: pattern.program.tree) + let atom = list.requiredFirstAtom(allowOptionsChanges: true) + #expect(atom == nil, "Unexpected required first atom in '\(pattern._literalPattern!)'") + } + + @available(macOS 9999, *) + @Test(arguments: [#/a+b/#, #/a*b/#, #/\w+\s/#, #/(?:a+b|b+a)/#, #/(?:(?:a+b)+b)/#, #/\d+a/#, #/a+A/#]) + func autoPossessify(pattern: Regex) throws { + var list = DSLList(tree: pattern.program.tree) + list.autoPossessify() + for node in list.nodes { + switch node { + case .quantification(_, let kind, _): + #expect( + kind.isExplicit && kind.quantificationKind?.ast == .possessive, + "Expected possessification in '\(pattern._literalPattern!)'") + default: break + } + } + } + + @available(macOS 9999, *) + @Test(arguments: [ + #/a?/#, #/a+a/#, #/a+(?:b|c)/#, #/(?:a+|b+)/#, #/[a]/#, #/a?a/#, + #/(?i)a+A/#, #/(?i:a+A)/# // case insensitivity when checking exclusion + ]) + func noAutoPossessify(pattern: Regex) throws { + var list = DSLList(tree: pattern.program.tree) + list.autoPossessify() + for node in list.nodes { + switch node { + case .quantification(_, let kind, _): + #expect( + kind.quantificationKind?.ast != .possessive, + "Unexpected possessification in '\(pattern._literalPattern!)'") + default: break + } + } + } +}