From 5c9e6e8665178726ef6c3c82c86de34923d5f0a4 Mon Sep 17 00:00:00 2001 From: Richard Wei Date: Mon, 20 Jun 2022 11:34:47 -0700 Subject: [PATCH] Allow `Regex<AnyRegexOutput>` to be used in the DSL. Apply a workaround to allow `Regex<AnyRegexOutput>` to be used in the DSL. This workaround emits each nested `Regex<AnyRegexOutput>` as a custom matcher so that it's essentially treated as a separate compilation unit. A proper fix for this is to introduce scoped type erasure in the matching engine so that all type erasure (including the top-level one) goes through this model. I left some stubs in (`beginTypeErase`, `endTypeErase`) which I'll implement in a follow-up. Since this implementation is using `Executor.match(...) -> Regex.Match` in the regex compiler, we need to add availability annotations to the `Executor` and `Compiler` types. Resolves rdar://94320030. --- Sources/_StringProcessing/ByteCodeGen.swift | 31 ++++++++ Sources/_StringProcessing/Compiler.swift | 2 + .../_StringProcessing/ConsumerInterface.swift | 2 +- .../_StringProcessing/Engine/Instruction.swift | 9 +++ .../_StringProcessing/Engine/MEBuilder.swift | 8 +++ .../_StringProcessing/Engine/Processor.swift | 6 ++ Sources/_StringProcessing/Executor.swift | 1 + .../_StringProcessing/PrintAsPattern.swift | 3 + Sources/_StringProcessing/Regex/Core.swift | 8 ++- Sources/_StringProcessing/Regex/DSLTree.swift | 27 +++++-- Tests/RegexBuilderTests/RegexDSLTests.swift | 72 +++++++++++++++++++ 11 files changed, 163 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index cff0df57e..086041d11 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -1,5 +1,6 @@ @_implementationOnly import _RegexParser +@available(SwiftStdlib 5.7, *) extension Compiler { struct ByteCodeGen { var options: MatchingOptions @@ -15,8 +16,15 @@ extension Compiler { } } +@available(SwiftStdlib 5.7, *) extension Compiler.ByteCodeGen { mutating func emitRoot(_ root: 
DSLTree.Node) throws -> MEProgram { + // FIXME: Remove once output type erasure is represented in the matching + // engine. This workaround is to prevent a top-level `Regex<AnyRegexOutput>` + // from being emitted as a matcher, which would be an infinite recursion. + if case let .typeErase(child) = root { + return try emitRoot(child) + } // The whole match (`.0` element of output) is equivalent to an implicit // capture over the entire regex. try emitNode(.capture(name: nil, reference: nil, root)) @@ -25,6 +33,7 @@ extension Compiler.ByteCodeGen { } } +@available(SwiftStdlib 5.7, *) fileprivate extension Compiler.ByteCodeGen { mutating func emitAtom(_ a: DSLTree.Atom) throws { defer { @@ -765,6 +774,28 @@ fileprivate extension Compiler.ByteCodeGen { case .characterPredicate: throw Unsupported("character predicates") + case .typeErase(let child): + // FIXME: This is a workaround for `Regex<AnyRegexOutput>` not working in + // the DSL. This separates any `Regex<AnyRegexOutput>` into its own + // compilation unit, but is less efficient. We should instead represent + // output type erasure in the matching engine (`beginTypeErase`, + // `endTypeErase`). + // + // Long-term design: + // beginTypeErase + // <code for child> + // endTypeErase + let program = try Compiler(tree: DSLTree(child)).emit() + let executor = Executor(program: program) + return emitMatcher { input, startIndex, range in + guard let match: Regex.Match = try executor.match( + input, in: startIndex.. 
Executor { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 07757eb6a..9bfdfc409 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -39,7 +39,7 @@ extension DSLTree.Node { case .orderedChoice, .conditional, .concatenation, .capture, .nonCapturingGroup, .quantification, .trivia, .empty, - .absentFunction: return nil + .absentFunction, .typeErase: return nil case .consumer: fatalError("FIXME: Is this where we handle them?") diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 9d2ae5a69..efd82ecd5 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -193,6 +193,15 @@ extension Instruction { /// case backreference + /// Push a new type erasure scope into the capture stack. + case beginTypeErase + + /// Pop the last type erasure scope, create an `AnyRegexOutput` from that + /// scope, and store it in a value register. + /// + /// endTypeErase(_: ValReg) + case endTypeErase + // MARK: Matching: State transitions // TODO: State transitions need more work. 
We want diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index f278b7328..e84124b7c 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -187,6 +187,14 @@ extension MEProgram.Builder { .init(capture: cap, transform: trans))) } + mutating func buildBeginTypeErase() { + instructions.append(.init(.beginTypeErase)) + } + + mutating func buildEndTypeErase() { + instructions.append(.init(.endTypeErase)) + } + mutating func buildMatcher( _ fun: MatcherRegister, into reg: ValueRegister ) { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 8471ef861..bfa0d2be1 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -442,6 +442,12 @@ extension Processor { value, overwriteInitial: sp) controller.step() + case .beginTypeErase: + fatalError("Unimplemented") + + case .endTypeErase: + fatalError("Unimplemented") + case .builtinAssertion: builtinAssertion() diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index f8d10001e..7966974b6 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -11,6 +11,7 @@ @_implementationOnly import _RegexParser +@available(SwiftStdlib 5.7, *) struct Executor { // TODO: consider let, for now lets us toggle tracing var engine: Engine diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 2fe7c6ccc..275400ea9 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -280,6 +280,9 @@ extension PrettyPrinter { case .absentFunction: print("/* TODO: absent function */") + + case .typeErase: + print("/* TODO: type erasure */") } } diff --git a/Sources/_StringProcessing/Regex/Core.swift 
b/Sources/_StringProcessing/Regex/Core.swift index b96ccda58..0e328efd7 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -56,7 +56,13 @@ public struct Regex: RegexComponent { } public var regex: Regex { - self + if Output.self == AnyRegexOutput.self { + if case .typeErase = root { + return self + } + return .init(node: .typeErase(root)) + } + return self } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 72c5f1526..e45d8120f 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -93,6 +93,10 @@ extension DSLTree { case matcher(Any.Type, _MatcherInterface) + // MARK: - Type erasure + + case typeErase(Node) + // TODO: Would this just boil down to a consumer? case characterPredicate(_CharacterPredicateInterface) } @@ -265,6 +269,7 @@ extension DSLTree.Node { case let .capture(_, _, n, _): return [n] case let .nonCapturingGroup(_, n): return [n] case let .quantification(_, _, n): return [n] + case let .typeErase(n): return [n] case let .conditional(_, t, f): return [t,f] @@ -486,6 +491,7 @@ public struct CaptureTransform: Hashable, CustomStringConvertible { // These wrapper types are required because even @_spi-marked public APIs can't // include symbols from implementation-only dependencies. 
+@available(SwiftStdlib 5.7, *) extension DSLTree.Node { func _addCaptures( to list: inout CaptureList, @@ -551,7 +557,7 @@ extension DSLTree.Node { break case .customCharacterClass, .atom, .trivia, .empty, - .quotedLiteral, .consumer, .characterPredicate: + .quotedLiteral, .consumer, .characterPredicate, .typeErase: break } } @@ -566,7 +572,7 @@ extension DSLTree.Node { .conditional, .quantification, .customCharacterClass, .atom, .trivia, .empty, .quotedLiteral, .regexLiteral, .absentFunction, .convertedRegexLiteral, .consumer, - .characterPredicate, .matcher: + .characterPredicate, .matcher, .typeErase: return false } } @@ -583,16 +589,28 @@ extension DSLTree.Node { /// Returns the type of the whole match, i.e. `.0` element type of the output. var wholeMatchType: Any.Type { - if case .matcher(let type, _) = outputDefiningNode { + switch outputDefiningNode { + case .matcher(let type, _): return type + case .typeErase: + return AnyRegexOutput.self + default: + return Substring.self } - return Substring.self } } extension DSLTree { + @available(SwiftStdlib 5.7, *) var captureList: CaptureList { var list = CaptureList() + // FIXME: This is peering through any top-level `.typeErase`. Once type + // erasure is handled in the engine, this can be simplified to using `root` + // directly. 
+ var root = root + while case let .typeErase(child) = root { + root = child + } list.append(.init(type: root.wholeMatchType, optionalDepth: 0, .fake)) root._addCaptures(to: &list, optionalNesting: 0) return list @@ -620,6 +638,7 @@ extension DSLTree { case let .capture(_, _, n, _): return [_Tree(n)] case let .nonCapturingGroup(_, n): return [_Tree(n)] case let .quantification(_, _, n): return [_Tree(n)] + case let .typeErase(n): return [_Tree(n)] case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index fc31e575f..c90e04a4e 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1071,6 +1071,78 @@ class RegexDSLTests: XCTestCase { } } } + + func testTypeErasedRegexInDSL() throws { + do { + let input = "johnappleseed: 12." + let numberRegex = try! Regex(#"(\d+)\.?"#) + let regex = Regex { + Capture { + OneOrMore(.word) + } + ZeroOrMore(.whitespace) + ":" + ZeroOrMore(.whitespace) + numberRegex + } + let match = try XCTUnwrap(input.wholeMatch(of: regex)) + XCTAssertEqual(match.0, input[...]) + XCTAssertEqual(match.1, "johnappleseed") + } + do { + let input = "johnappleseed: 12." + let numberRegex = try! Regex(#"(\d+)\.?"#) + let regex = Regex { + Capture { + OneOrMore(.word) + } + ZeroOrMore(.whitespace) + ":" + ZeroOrMore(.whitespace) + Capture { numberRegex } + } + let match = try XCTUnwrap(input.wholeMatch(of: regex)) + XCTAssertEqual(match.0, input[...]) + XCTAssertEqual(match.1, "johnappleseed") + XCTAssertEqual(match.2[0].value as? Substring, "12.") + XCTAssertEqual(match.2[1].value as? Substring, "12") + } + do { + let input = "johnappleseed: 12." + // Anchors should be with respect to the entire input. + let numberRegex = try! 
Regex(#"^(\d+)\.?"#) + let regex = Regex { + Capture { + OneOrMore(.word) + } + ZeroOrMore(.whitespace) + ":" + ZeroOrMore(.whitespace) + Capture { numberRegex } + } + XCTAssertNil(input.wholeMatch(of: regex)) + } + do { + let input = "johnappleseed: 12.[12]" + // Backreferences in a type-erased regex are scoped to the type-erased + // regex itself. `\1` here should refer to "12", not "johnappleseed" + let numberRegex = try! Regex(#"(\d+)\.?\[\1\]"#) + let regex = Regex { + Capture { + OneOrMore(.word) + } + ZeroOrMore(.whitespace) + ":" + ZeroOrMore(.whitespace) + Capture { numberRegex } + } + let match = try XCTUnwrap(input.wholeMatch(of: regex)) + XCTAssertEqual(match.0, input[...]) + XCTAssertEqual(match.1, "johnappleseed") + XCTAssertEqual(match.2[0].value as? Substring, "12.[12]") + XCTAssertEqual(match.2[1].value as? Substring, "12") + } + } } extension Unicode.Scalar {