Skip to content

Commit 69f406c

Browse files
authored
Adds SPI for an NSRE compatibility mode option (#698)
NSRegularExpression matches at the Unicode scalar level, but also matches `\r\n` sequences with a single `.` when single-line mode is enabled. This adds a `_nsreCompatibility` property that enables both of those behaviors, and implements support for the special case handling of `.`.
1 parent 45fd8ec commit 69f406c

File tree

6 files changed

+71
-6
lines changed

6 files changed

+71
-6
lines changed

Diff for: Sources/_RegexParser/Regex/AST/MatchingOptions.swift

+3
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ extension AST {
4444

4545
// Swift-only default possessive quantifier
4646
case possessiveByDefault // t.b.d.
47+
48+
// NSRegularExpression compatibility special-case
49+
case nsreCompatibleDot // no AST representation
4750
}
4851

4952
public var kind: Kind

Diff for: Sources/_RegexParser/Regex/Parse/Sema.swift

+2-1
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ extension RegexValidator {
142142

143143
case .caseInsensitive, .possessiveByDefault, .reluctantByDefault,
144144
.singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended,
145-
.asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps:
145+
.asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps,
146+
.nsreCompatibleDot:
146147
break
147148
}
148149
}

Diff for: Sources/_StringProcessing/ByteCodeGen.swift

+10-4
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ fileprivate extension Compiler.ByteCodeGen {
6767
emitAnyNonNewline()
6868

6969
case .dot:
70-
emitDot()
70+
try emitDot()
7171

7272
case let .char(c):
7373
emitCharacter(c)
@@ -238,9 +238,15 @@ fileprivate extension Compiler.ByteCodeGen {
238238
}
239239
}
240240

241-
mutating func emitDot() {
241+
mutating func emitDot() throws {
242242
if options.dotMatchesNewline {
243-
emitAny()
243+
if options.usesNSRECompatibleDot {
244+
try emitAlternation([
245+
.atom(.characterClass(.newlineSequence)),
246+
.atom(.anyNonNewline)])
247+
} else {
248+
emitAny()
249+
}
244250
} else {
245251
emitAnyNonNewline()
246252
}
@@ -964,7 +970,7 @@ fileprivate extension Compiler.ByteCodeGen {
964970
case let .customCharacterClass(ccc):
965971
if ccc.containsDot {
966972
if !ccc.isInverted {
967-
emitDot()
973+
try emitDot()
968974
} else {
969975
throw Unsupported("Inverted any")
970976
}

Diff for: Sources/_StringProcessing/MatchingOptions.swift

+7
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,10 @@ extension MatchingOptions {
120120
? .graphemeCluster
121121
: .unicodeScalar
122122
}
123+
124+
var usesNSRECompatibleDot: Bool {
125+
stack.last!.contains(.nsreCompatibleDot)
126+
}
123127
}
124128

125129
// MARK: - Implementation
@@ -141,6 +145,7 @@ extension MatchingOptions {
141145
// Not available via regex literal flags
142146
case transparentBounds
143147
case withoutAnchoringBounds
148+
case nsreCompatibleDot
144149

145150
// Oniguruma options
146151
case asciiOnlyDigit
@@ -197,6 +202,8 @@ extension MatchingOptions {
197202
self = .byteSemantics
198203
case .possessiveByDefault:
199204
self = .possessiveByDefault
205+
case .nsreCompatibleDot:
206+
self = .nsreCompatibleDot
200207

201208
// Whitespace options are only relevant during parsing, not compilation.
202209
case .extended, .extraExtended:

Diff for: Sources/_StringProcessing/Regex/Options.swift

+12
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,18 @@ extension Regex {
159159
return wrapInOption(.unicodeScalarSemantics, addingIf: true)
160160
}
161161
}
162+
163+
/// Returns a regular expression that uses an NSRegularExpression
164+
/// compatibility mode.
165+
///
166+
/// This mode includes using Unicode scalar semantics and treating a `dot`
167+
/// as matching newline sequences (when in the separately enabled dot-matches-newlines
168+
/// mode).
169+
@_spi(Foundation)
170+
public var _nsreCompatibility: Regex<RegexOutput> {
171+
wrapInOption(.nsreCompatibleDot, addingIf: true)
172+
.wrapInOption(.unicodeScalarSemantics, addingIf: true)
173+
}
162174
}
163175

164176
/// A semantic level to use during regex matching.

Diff for: Tests/RegexTests/MatchTests.swift

+37-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import XCTest
1313
@testable import _RegexParser
14-
@testable @_spi(RegexBenchmark) import _StringProcessing
14+
@testable @_spi(RegexBenchmark) @_spi(Foundation) import _StringProcessing
1515
import TestSupport
1616

1717
struct MatchError: Error {
@@ -2717,4 +2717,40 @@ extension RegexTests {
27172717
XCTAssertNotNil(str.wholeMatch(of: possessiveRegex))
27182718
}
27192719
}
2720+
2721+
func testNSRECompatibility() throws {
2722+
// NSRE-compatibility includes scalar matching, so `[\r\n]` should match
2723+
// either `\r` or `\n`.
2724+
let text = #"""
2725+
y=sin(x)+sin(2x)+sin(3x);\#rText "This is a function of x.";\r
2726+
"""#
2727+
let lineTerminationRegex = try Regex(#";[\r\n]"#)
2728+
._nsreCompatibility
2729+
2730+
let afterLine = try XCTUnwrap(text.firstRange(of: "Text"))
2731+
let match = try lineTerminationRegex.firstMatch(in: text)
2732+
XCTAssert(match?.range.upperBound == afterLine.lowerBound)
2733+
2734+
// NSRE-compatibility treats "dot" as special, in that it can match a
2735+
// newline sequence as well as a single Unicode scalar.
2736+
let aDotBRegex = try Regex(#"a.b"#)
2737+
._nsreCompatibility
2738+
.dotMatchesNewlines()
2739+
for input in ["a\rb", "a\nb", "a\r\nb"] {
2740+
XCTAssertNotNil(try aDotBRegex.wholeMatch(in: input))
2741+
}
2742+
2743+
// NSRE-compatibility doesn't give special treatment to newline sequences
2744+
// when matching other "match everything" regex patterns, like `[[^z]z]`,
2745+
// so this pattern doesn't match "a\r\nb".
2746+
let aCCBRegex = try Regex(#"a[[^z]z]b"#)
2747+
._nsreCompatibility
2748+
for input in ["a\rb", "a\nb", "a\r\nb"] {
2749+
if input.unicodeScalars.count == 3 {
2750+
XCTAssertNotNil(try aCCBRegex.wholeMatch(in: input))
2751+
} else {
2752+
XCTAssertNil(try aCCBRegex.wholeMatch(in: input))
2753+
}
2754+
}
2755+
}
27202756
}

0 commit comments

Comments
 (0)