Skip to content

Commit 4cdb58a

Browse files
authored
[perf] Create a flattened representation of DSLTree (#831)
This change simplifies the pre-compilation representation of the regex by storing a pre-order traversal of the syntax tree in an array instead of an indirect enum. This enables an optimization pass that can index into and mutate the tree more easily. The change includes tests verifying that list-based compilation generates the same instructions as the original tree-based compilation, and it switches the compiler to list-based compilation. It also eliminates the "literal wrapper" node (`convertedRegexLiteral`) that preserved the AST: the saved AST was not being used, so the wrapper only added unneeded links in the chain. Because parsing still generates an AST that is converted to a DSLTree before the new list is built, regex compilation may be slower until the intermediate DSLTree is fully removed.
1 parent 157f03c commit 4cdb58a

File tree

10 files changed

+1132
-102
lines changed

10 files changed

+1132
-102
lines changed

Sources/_StringProcessing/ByteCodeGen+DSLList.swift

Lines changed: 855 additions & 0 deletions
Large diffs are not rendered by default.

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ extension Compiler {
2323
var hasEmittedFirstMatchableAtom = false
2424

2525
private let compileOptions: _CompileOptions
26-
fileprivate var optimizationsEnabled: Bool {
26+
internal var optimizationsEnabled: Bool {
2727
!compileOptions.contains(.disableOptimizations)
2828
}
2929

@@ -61,7 +61,7 @@ extension Compiler.ByteCodeGen {
6161
}
6262
}
6363

64-
fileprivate extension Compiler.ByteCodeGen {
64+
extension Compiler.ByteCodeGen {
6565
mutating func emitAtom(_ a: DSLTree.Atom) throws {
6666
defer {
6767
if a.isMatchable {
@@ -809,7 +809,7 @@ fileprivate extension Compiler.ByteCodeGen {
809809
default:
810810
return false
811811
}
812-
case .convertedRegexLiteral(let node, _):
812+
case .limitCaptureNesting(let node):
813813
return tryEmitFastQuant(node, kind, minTrips, maxExtraTrips)
814814
case .nonCapturingGroup(let groupKind, let node):
815815
// .nonCapture nonCapturingGroups are ignored during compilation
@@ -1203,7 +1203,7 @@ fileprivate extension Compiler.ByteCodeGen {
12031203
switch node {
12041204
case .concatenation(let ch):
12051205
return ch.flatMap(flatten)
1206-
case .convertedRegexLiteral(let n, _), .ignoreCapturesInTypedOutput(let n):
1206+
case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n):
12071207
return flatten(n)
12081208
default:
12091209
return [node]
@@ -1283,6 +1283,9 @@ fileprivate extension Compiler.ByteCodeGen {
12831283
case let .ignoreCapturesInTypedOutput(child):
12841284
try emitNode(child)
12851285

1286+
case let .limitCaptureNesting(child):
1287+
return try emitNode(child)
1288+
12861289
case .conditional:
12871290
throw Unsupported("Conditionals")
12881291

@@ -1306,9 +1309,6 @@ fileprivate extension Compiler.ByteCodeGen {
13061309
case let .quotedLiteral(s):
13071310
emitQuotedLiteral(s)
13081311

1309-
case let .convertedRegexLiteral(n, _):
1310-
return try emitNode(n)
1311-
13121312
case .absentFunction:
13131313
throw Unsupported("absent function")
13141314
case .consumer:
@@ -1359,8 +1359,6 @@ extension DSLTree.Node {
13591359
return false
13601360
case .quotedLiteral(let string):
13611361
return !string.isEmpty
1362-
case .convertedRegexLiteral(let node, _):
1363-
return node.guaranteesForwardProgress
13641362
case .consumer, .matcher:
13651363
// Allow zero width consumers and matchers
13661364
return false
@@ -1369,6 +1367,8 @@ extension DSLTree.Node {
13691367
case .quantification(let amount, _, let child):
13701368
let (atLeast, _) = amount.ast.bounds
13711369
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
1370+
case .limitCaptureNesting(let node), .ignoreCapturesInTypedOutput(let node):
1371+
return node.guaranteesForwardProgress
13721372
default: return false
13731373
}
13741374
}

Sources/_StringProcessing/Compiler.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ class Compiler {
3232
}
3333

3434
/// Compiles the regex into an `MEProgram`.
///
/// Delegates to `emitViaList()`, so the list-based compilation path is the
/// one used by default.
///
/// - Returns: The program produced by `emitViaList()`.
/// - Throws: Any error thrown by `emitViaList()`.
__consuming func emit() throws -> MEProgram {
35+
try emitViaList()
36+
}
37+
38+
__consuming func emitViaTree() throws -> MEProgram {
3539
// TODO: Handle global options
3640
var codegen = ByteCodeGen(
3741
options: options,
@@ -40,6 +44,17 @@ class Compiler {
4044
captureList: tree.captureList)
4145
return try codegen.emitRoot(tree.root)
4246
}
47+
48+
/// Compiles the regex by flattening `tree` into a `DSLList` and emitting the
/// program from that list.
///
/// - Returns: The program produced by `ByteCodeGen.emitRoot` for the list.
/// - Throws: Any error thrown by `emitRoot`.
__consuming func emitViaList() throws -> MEProgram {
49+
// TODO: Handle global options
50+
// Flatten the tree into an array of nodes before code generation.
let dslList = DSLList(tree: tree)
51+
var codegen = ByteCodeGen(
52+
options: options,
53+
compileOptions:
54+
compileOptions,
55+
captureList: tree.captureList)
56+
return try codegen.emitRoot(dslList)
57+
}
4358
}
4459

4560
/// Hashable wrapper for `Any.Type`.

Sources/_StringProcessing/LiteralPrinter.swift

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,9 @@ extension LiteralPrinter {
116116
outputNode(child)
117117
output(")")
118118

119-
case let .ignoreCapturesInTypedOutput(child):
119+
case let .ignoreCapturesInTypedOutput(child),
120+
let .limitCaptureNesting(child):
120121
outputNode(child)
121-
case .convertedRegexLiteral(let node, _):
122-
outputNode(node)
123-
124122
case let .quantification(amount, kind, node):
125123
outputQuantification(amount, kind, node)
126124
case let .customCharacterClass(charClass):

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ extension PrettyPrinter {
179179
case let .ignoreCapturesInTypedOutput(child):
180180
printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel)
181181

182+
case let .limitCaptureNesting(child):
183+
printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel)
184+
182185
case .conditional:
183186
print("/* TODO: conditional */")
184187

@@ -258,20 +261,6 @@ extension PrettyPrinter {
258261

259262
break
260263

261-
case let .convertedRegexLiteral(.atom(a), _):
262-
if let pattern = a._patternBase(&self), pattern.canBeWrapped {
263-
printAtom(pattern.0)
264-
return
265-
}
266-
267-
break
268-
case let .convertedRegexLiteral(.customCharacterClass(ccc), _):
269-
if ccc.isSimplePrint {
270-
printSimpleCCC(ccc)
271-
return
272-
}
273-
274-
break
275264
default:
276265
break
277266
}
@@ -305,13 +294,6 @@ extension PrettyPrinter {
305294
case let .quotedLiteral(v):
306295
print(v._quoted)
307296

308-
case let .convertedRegexLiteral(n, _):
309-
// FIXME: This recursion coordinates with back-off
310-
// check above, so it should work out. Need a
311-
// cleaner way to do this. This means the argument
312-
// label is a lie.
313-
printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel)
314-
315297
case let .customCharacterClass(ccc):
316298
printAsPattern(ccc)
317299

@@ -1431,9 +1413,6 @@ extension DSLTree.Node {
14311413
result += node.getNamedCaptures()
14321414
}
14331415

1434-
case .convertedRegexLiteral(let node, _):
1435-
result += node.getNamedCaptures()
1436-
14371416
case .quantification(_, _, let node):
14381417
result += node.getNamedCaptures()
14391418

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,28 +13,13 @@ internal import _RegexParser
1313

1414
extension AST {
1515
var dslTree: DSLTree {
16-
return DSLTree(root.dslTreeNode)
16+
return DSLTree(.limitCaptureNesting(root.dslTreeNode))
1717
}
1818
}
1919

2020
extension AST.Node {
2121
/// Converts an AST node to the corresponding `DSLTree.Node`.
2222
var dslTreeNode: DSLTree.Node {
23-
func wrap(_ node: DSLTree.Node) -> DSLTree.Node {
24-
switch node {
25-
case .convertedRegexLiteral:
26-
// FIXME: DSL can have one item concats
27-
// assertionFailure("Double wrapping?")
28-
return node
29-
default:
30-
break
31-
}
32-
// TODO: Should we do this for the
33-
// single-concatenation child too, or should?
34-
// we wrap _that_?
35-
return .convertedRegexLiteral(node, .init(ast: self))
36-
}
37-
3823
// Convert the top-level node without wrapping
3924
func convert() throws -> DSLTree.Node {
4025
switch self {
@@ -105,9 +90,8 @@ extension AST.Node {
10590
}
10691
}
10792

108-
// FIXME: make total function again
10993
let converted = try! convert()
110-
return wrap(converted)
94+
return converted
11195
}
11296
}
11397

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2025 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
/// A flat, array-backed list of `DSLTree.Node` values.
///
/// When built with `init(tree:)`, `nodes` holds the elements produced by the
/// tree's `depthFirst` sequence, in iteration order.
struct DSLList {
13+
/// The nodes stored in this list.
var nodes: [DSLTree.Node]
14+
15+
/// Creates a list containing only `initial`.
init(_ initial: DSLTree.Node) {
16+
self.nodes = [initial]
17+
}
18+
19+
/// Creates a list that stores `nodes` as given.
init(_ nodes: [DSLTree.Node]) {
20+
self.nodes = nodes
21+
}
22+
23+
/// Creates a list by collecting the `depthFirst` traversal of `tree`.
init(tree: DSLTree) {
24+
self.nodes = Array(tree.depthFirst)
25+
}
26+
}
27+
28+
extension DSLTree.Node {
29+
/// The number of direct children reported for this node.
///
/// Leaf-like cases report `0`, `.orderedChoice` and `.concatenation` report
/// the count of their stored child array, and the single-child wrapper
/// cases report `1`.
var directChildren: Int {
30+
switch self {
31+
// Cases that report no children.
case .trivia, .empty, .quotedLiteral,
32+
.consumer, .matcher, .characterPredicate,
33+
.customCharacterClass, .atom:
34+
return 0
35+
36+
// Cases whose child count is the size of their stored array.
case .orderedChoice(let c), .concatenation(let c):
37+
return c.count
38+
39+
// Cases that report exactly one child.
// NOTE(review): `.conditional` is counted as a single child here; confirm
// this matches how conditionals are represented in the flattened list.
case .capture, .nonCapturingGroup,
40+
.quantification, .ignoreCapturesInTypedOutput,
41+
.limitCaptureNesting, .conditional:
42+
return 1
43+
44+
// NOTE(review): `.absentFunction` reports zero children; verify against
// how absent functions are traversed elsewhere.
case .absentFunction:
45+
return 0
46+
}
47+
}
48+
}
}
49+
50+
extension DSLTree {
51+
/// A sequence and iterator that yields tree nodes in pre-order: each node
/// is returned before its children, and children are visited left to right.
///
/// The children of a node are supplied by the `getChildren` closure given at
/// initialization.
struct DepthFirst: Sequence, IteratorProtocol {
52+
typealias Element = DSLTree.Node
53+
/// Pending frames; the last element is the next node to be returned.
private var stack: [Frame]
54+
/// Returns the children to traverse for a given node.
private let getChildren: (Element) -> [Element]
55+
56+
/// A node paired with the children computed for it when it was pushed.
private struct Frame {
57+
let node: Element
58+
let children: [Element]
59+
// NOTE(review): `nextIndex` is not read anywhere in the code shown here.
var nextIndex: Int = 0
60+
}
61+
62+
/// Creates a traversal that starts at `root`.
///
/// - Parameters:
///   - root: The node returned first by `next()`.
///   - getChildren: Produces the children to visit for a node.
fileprivate init(
63+
root: Element,
64+
getChildren: @escaping (Element) -> [Element]
65+
) {
66+
self.getChildren = getChildren
67+
self.stack = [Frame(node: root, children: getChildren(root))]
68+
}
69+
70+
/// Returns the next node in pre-order, or `nil` once the stack is empty.
///
/// A `.concatenation` node is returned rebuilt from its
/// `coalescedChildren`; every other node is returned unchanged.
mutating func next() -> Element? {
71+
guard let top = stack.popLast() else { return nil }
72+
// Push children in reverse so leftmost comes out first.
73+
for child in top.children.reversed() {
74+
stack.append(Frame(node: child, children: getChildren(child)))
75+
}
76+
77+
// Since we coalesce the children before adding them to the stack,
78+
// we need an exact matching number of children in the list's
79+
// concatenation node, so that it can provide the correct component
80+
// count. This will go away/change when .concatenation only stores
81+
// a count.
82+
return switch top.node {
83+
case .concatenation:
84+
.concatenation(top.node.coalescedChildren)
85+
default:
86+
top.node
87+
}
88+
}
89+
}
90+
91+
/// A pre-order traversal of this tree starting at `root`, using each node's
/// `coalescedChildren` as the children to visit.
var depthFirst: DepthFirst {
92+
DepthFirst(root: root, getChildren: {
93+
$0.coalescedChildren
94+
})
95+
}
96+
}

0 commit comments

Comments
 (0)