Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 5c9e6e8

Browse files
committed Jun 21, 2022
Allow Regex<AnyRegexOutput> to be used in the DSL.
Apply a workaround to allow `Regex<AnyRegexOutput>` to be used in the DSL. This workaround emits each nested `Regex<AnyRegexOutput>` as a custom matcher so that it's essentially treated as a separate compilation unit. A proper fix for this is to introduce scoped type erasure in the matching engine so that all type erasure (including the top-level one) goes through this model. I left some stubs in (`beginTypeErase`, `endTypeErase`) which I'll implement in a follow-up. Since this implementation is using `Executor.match(...) -> Regex.Match` in the regex compiler, we need to add availability annotations to the `Executor` and `Compiler` types. Resolves rdar://94320030.
1 parent 61e979c commit 5c9e6e8

11 files changed

+163
-6
lines changed
 

‎Sources/_StringProcessing/ByteCodeGen.swift

+31
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
@_implementationOnly import _RegexParser
22

3+
@available(SwiftStdlib 5.7, *)
34
extension Compiler {
45
struct ByteCodeGen {
56
var options: MatchingOptions
@@ -15,8 +16,15 @@ extension Compiler {
1516
}
1617
}
1718

19+
@available(SwiftStdlib 5.7, *)
1820
extension Compiler.ByteCodeGen {
1921
mutating func emitRoot(_ root: DSLTree.Node) throws -> MEProgram {
22+
// FIXME: Remove once output type erasure is represented in the matching
23+
// engine. This workaround is to prevent a top-level `Regex<AnyRegexOutput>`
24+
// from being emitted as a matcher, which would be an infinite recursion.
25+
if case let .typeErase(child) = root {
26+
return try emitRoot(child)
27+
}
2028
// The whole match (`.0` element of output) is equivalent to an implicit
2129
// capture over the entire regex.
2230
try emitNode(.capture(name: nil, reference: nil, root))
@@ -25,6 +33,7 @@ extension Compiler.ByteCodeGen {
2533
}
2634
}
2735

36+
@available(SwiftStdlib 5.7, *)
2837
fileprivate extension Compiler.ByteCodeGen {
2938
mutating func emitAtom(_ a: DSLTree.Atom) throws {
3039
defer {
@@ -765,6 +774,28 @@ fileprivate extension Compiler.ByteCodeGen {
765774
case .characterPredicate:
766775
throw Unsupported("character predicates")
767776

777+
case .typeErase(let child):
778+
// FIXME: This is a workaround for `Regex<AnyRegexOutput>` not working in
779+
// the DSL. This separates any `Regex<AnyRegexOutput>` into its own
780+
// compilation unit, but is less efficient. We should instead represent
781+
// output type erasure in the matching engine (`beginTypeErase`,
782+
// `endTypeErase`).
783+
//
784+
// Long-term design:
785+
// beginTypeErase
786+
// <code for child>
787+
// endTypeErase
788+
let program = try Compiler(tree: DSLTree(child)).emit()
789+
let executor = Executor(program: program)
790+
return emitMatcher { input, startIndex, range in
791+
guard let match: Regex<AnyRegexOutput>.Match = try executor.match(
792+
input, in: startIndex..<range.upperBound, .partialFromFront
793+
) else {
794+
return nil
795+
}
796+
return (match.range.upperBound, match.output)
797+
}
798+
768799
case .trivia, .empty:
769800
return nil
770801
}

‎Sources/_StringProcessing/Compiler.swift

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
@_implementationOnly import _RegexParser
1313

14+
@available(SwiftStdlib 5.7, *)
1415
class Compiler {
1516
let tree: DSLTree
1617

@@ -34,6 +35,7 @@ class Compiler {
3435
}
3536
}
3637

38+
@available(SwiftStdlib 5.7, *)
3739
func _compileRegex(
3840
_ regex: String, _ syntax: SyntaxOptions = .traditional
3941
) throws -> Executor {

‎Sources/_StringProcessing/ConsumerInterface.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ extension DSLTree.Node {
3939
case .orderedChoice, .conditional, .concatenation,
4040
.capture, .nonCapturingGroup,
4141
.quantification, .trivia, .empty,
42-
.absentFunction: return nil
42+
.absentFunction, .typeErase: return nil
4343

4444
case .consumer:
4545
fatalError("FIXME: Is this where we handle them?")

‎Sources/_StringProcessing/Engine/Instruction.swift

+9
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,15 @@ extension Instruction {
193193
///
194194
case backreference
195195

196+
/// Push a new type erasure scope into the capture stack.
197+
case beginTypeErase
198+
199+
/// Pop the last type erasure scope, create an `AnyRegexOutput` from that
200+
/// scope, and store it in a value register.
201+
///
202+
/// endTypeErase(_: ValReg)
203+
case endTypeErase
204+
196205
// MARK: Matching: State transitions
197206

198207
// TODO: State transitions need more work. We want

‎Sources/_StringProcessing/Engine/MEBuilder.swift

+8
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,14 @@ extension MEProgram.Builder {
187187
.init(capture: cap, transform: trans)))
188188
}
189189

190+
mutating func buildBeginTypeErase() {
191+
instructions.append(.init(.beginTypeErase))
192+
}
193+
194+
mutating func buildEndTypeErase() {
195+
instructions.append(.init(.endTypeErase))
196+
}
197+
190198
mutating func buildMatcher(
191199
_ fun: MatcherRegister, into reg: ValueRegister
192200
) {

‎Sources/_StringProcessing/Engine/Processor.swift

+6
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,12 @@ extension Processor {
442442
value, overwriteInitial: sp)
443443
controller.step()
444444

445+
case .beginTypeErase:
446+
fatalError("Unimplemented")
447+
448+
case .endTypeErase:
449+
fatalError("Unimplemented")
450+
445451
case .builtinAssertion:
446452
builtinAssertion()
447453

‎Sources/_StringProcessing/Executor.swift

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
@_implementationOnly import _RegexParser
1313

14+
@available(SwiftStdlib 5.7, *)
1415
struct Executor {
1516
// TODO: consider let, for now lets us toggle tracing
1617
var engine: Engine

‎Sources/_StringProcessing/PrintAsPattern.swift

+3
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,9 @@ extension PrettyPrinter {
280280

281281
case .absentFunction:
282282
print("/* TODO: absent function */")
283+
284+
case .typeErase:
285+
print("/* TODO: type erasure */")
283286
}
284287
}
285288

‎Sources/_StringProcessing/Regex/Core.swift

+7-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,13 @@ public struct Regex<Output>: RegexComponent {
5656
}
5757

5858
public var regex: Regex<Output> {
59-
self
59+
if Output.self == AnyRegexOutput.self {
60+
if case .typeErase = root {
61+
return self
62+
}
63+
return .init(node: .typeErase(root))
64+
}
65+
return self
6066
}
6167
}
6268

‎Sources/_StringProcessing/Regex/DSLTree.swift

+23-4
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ extension DSLTree {
9393

9494
case matcher(Any.Type, _MatcherInterface)
9595

96+
// MARK: - Type erasure
97+
98+
case typeErase(Node)
99+
96100
// TODO: Would this just boil down to a consumer?
97101
case characterPredicate(_CharacterPredicateInterface)
98102
}
@@ -265,6 +269,7 @@ extension DSLTree.Node {
265269
case let .capture(_, _, n, _): return [n]
266270
case let .nonCapturingGroup(_, n): return [n]
267271
case let .quantification(_, _, n): return [n]
272+
case let .typeErase(n): return [n]
268273

269274
case let .conditional(_, t, f): return [t,f]
270275

@@ -486,6 +491,7 @@ public struct CaptureTransform: Hashable, CustomStringConvertible {
486491
// These wrapper types are required because even @_spi-marked public APIs can't
487492
// include symbols from implementation-only dependencies.
488493

494+
@available(SwiftStdlib 5.7, *)
489495
extension DSLTree.Node {
490496
func _addCaptures(
491497
to list: inout CaptureList,
@@ -551,7 +557,7 @@ extension DSLTree.Node {
551557
break
552558

553559
case .customCharacterClass, .atom, .trivia, .empty,
554-
.quotedLiteral, .consumer, .characterPredicate:
560+
.quotedLiteral, .consumer, .characterPredicate, .typeErase:
555561
break
556562
}
557563
}
@@ -566,7 +572,7 @@ extension DSLTree.Node {
566572
.conditional, .quantification, .customCharacterClass, .atom,
567573
.trivia, .empty, .quotedLiteral, .regexLiteral, .absentFunction,
568574
.convertedRegexLiteral, .consumer,
569-
.characterPredicate, .matcher:
575+
.characterPredicate, .matcher, .typeErase:
570576
return false
571577
}
572578
}
@@ -583,16 +589,28 @@ extension DSLTree.Node {
583589

584590
/// Returns the type of the whole match, i.e. `.0` element type of the output.
585591
var wholeMatchType: Any.Type {
586-
if case .matcher(let type, _) = outputDefiningNode {
592+
switch outputDefiningNode {
593+
case .matcher(let type, _):
587594
return type
595+
case .typeErase:
596+
return AnyRegexOutput.self
597+
default:
598+
return Substring.self
588599
}
589-
return Substring.self
590600
}
591601
}
592602

593603
extension DSLTree {
604+
@available(SwiftStdlib 5.7, *)
594605
var captureList: CaptureList {
595606
var list = CaptureList()
607+
// FIXME: This is peering through any top-level `.typeErase`. Once type
608+
// erasure is handled in the engine, this can be simplified to using `root`
609+
// directly.
610+
var root = root
611+
while case let .typeErase(child) = root {
612+
root = child
613+
}
596614
list.append(.init(type: root.wholeMatchType, optionalDepth: 0, .fake))
597615
root._addCaptures(to: &list, optionalNesting: 0)
598616
return list
@@ -620,6 +638,7 @@ extension DSLTree {
620638
case let .capture(_, _, n, _): return [_Tree(n)]
621639
case let .nonCapturingGroup(_, n): return [_Tree(n)]
622640
case let .quantification(_, _, n): return [_Tree(n)]
641+
case let .typeErase(n): return [_Tree(n)]
623642

624643
case let .conditional(_, t, f): return [_Tree(t), _Tree(f)]
625644

‎Tests/RegexBuilderTests/RegexDSLTests.swift

+72
Original file line numberDiff line numberDiff line change
@@ -1071,6 +1071,78 @@ class RegexDSLTests: XCTestCase {
10711071
}
10721072
}
10731073
}
1074+
1075+
func testTypeErasedRegexInDSL() throws {
1076+
do {
1077+
let input = "johnappleseed: 12."
1078+
let numberRegex = try! Regex(#"(\d+)\.?"#)
1079+
let regex = Regex {
1080+
Capture {
1081+
OneOrMore(.word)
1082+
}
1083+
ZeroOrMore(.whitespace)
1084+
":"
1085+
ZeroOrMore(.whitespace)
1086+
numberRegex
1087+
}
1088+
let match = try XCTUnwrap(input.wholeMatch(of: regex))
1089+
XCTAssertEqual(match.0, input[...])
1090+
XCTAssertEqual(match.1, "johnappleseed")
1091+
}
1092+
do {
1093+
let input = "johnappleseed: 12."
1094+
let numberRegex = try! Regex(#"(\d+)\.?"#)
1095+
let regex = Regex {
1096+
Capture {
1097+
OneOrMore(.word)
1098+
}
1099+
ZeroOrMore(.whitespace)
1100+
":"
1101+
ZeroOrMore(.whitespace)
1102+
Capture { numberRegex }
1103+
}
1104+
let match = try XCTUnwrap(input.wholeMatch(of: regex))
1105+
XCTAssertEqual(match.0, input[...])
1106+
XCTAssertEqual(match.1, "johnappleseed")
1107+
XCTAssertEqual(match.2[0].value as? Substring, "12.")
1108+
XCTAssertEqual(match.2[1].value as? Substring, "12")
1109+
}
1110+
do {
1111+
let input = "johnappleseed: 12."
1112+
// Anchors should be with respect to the entire input.
1113+
let numberRegex = try! Regex(#"^(\d+)\.?"#)
1114+
let regex = Regex {
1115+
Capture {
1116+
OneOrMore(.word)
1117+
}
1118+
ZeroOrMore(.whitespace)
1119+
":"
1120+
ZeroOrMore(.whitespace)
1121+
Capture { numberRegex }
1122+
}
1123+
XCTAssertNil(input.wholeMatch(of: regex))
1124+
}
1125+
do {
1126+
let input = "johnappleseed: 12.[12]"
1127+
// Backreferences in a type-erased regex are scoped to the type-erased
1128+
// regex itself. `\1` here should refer to "12", not "johnappleseed".
1129+
let numberRegex = try! Regex(#"(\d+)\.?\[\1\]"#)
1130+
let regex = Regex {
1131+
Capture {
1132+
OneOrMore(.word)
1133+
}
1134+
ZeroOrMore(.whitespace)
1135+
":"
1136+
ZeroOrMore(.whitespace)
1137+
Capture { numberRegex }
1138+
}
1139+
let match = try XCTUnwrap(input.wholeMatch(of: regex))
1140+
XCTAssertEqual(match.0, input[...])
1141+
XCTAssertEqual(match.1, "johnappleseed")
1142+
XCTAssertEqual(match.2[0].value as? Substring, "12.[12]")
1143+
XCTAssertEqual(match.2[1].value as? Substring, "12")
1144+
}
1145+
}
10741146
}
10751147

10761148
extension Unicode.Scalar {

0 commit comments

Comments
 (0)
Please sign in to comment.