Skip to content

Commit 355027f

Browse files
authored
[swift/main] Substring boundaries during matching (#697)
* Handle boundaries when matching in substrings (#675) * Handle boundaries when matching in substrings Some of our existing matching routines use the start/endIndex of the input, which is basically never the right thing to do. This change revises those checks to use the search bounds, by either moving the boundary check out of the matching method, or if the boundary is a part of what needs to be matched (e.g. word boundaries have different behavior at the start/end than in the middle of a string) the search bounds are passed into the matching method. Testing is currently handled by piggy-backing on the existing match tests; we should add more tests to handle substring- specific edge cases. * Handle sub-character substring boundaries This change passes the end boundary down into matching methods, and uses it to find the actual character that is part of the input substring, even if the substring's end boundary is in the middle of a grapheme cluster. Substrings cannot have sub-Unicode scalar boundaries as of Swift 5.7; we can remove a check for this when matching an individual scalar. * Add test for substring replacement
1 parent cc96bb5 commit 355027f

File tree

9 files changed

+209
-91
lines changed

9 files changed

+209
-91
lines changed

Sources/_StringProcessing/ConsumerInterface.swift

+9-7
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,16 @@ extension Character {
6868
switch opts.semanticLevel {
6969
case .graphemeCluster:
7070
return { input, bounds in
71-
let low = bounds.lowerBound
71+
guard let (char, next) = input.characterAndEnd(
72+
at: bounds.lowerBound, limitedBy: bounds.upperBound)
73+
else { return nil }
7274
if isCaseInsensitive && isCased {
73-
return input[low].lowercased() == lowercased()
74-
? input.index(after: low)
75+
return char.lowercased() == self.lowercased()
76+
? next
7577
: nil
7678
} else {
77-
return input[low] == self
78-
? input.index(after: low)
79+
return char == self
80+
? next
7981
: nil
8082
}
8183
}
@@ -188,7 +190,7 @@ extension DSLTree.Atom.CharacterClass {
188190
func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
189191
let model = asRuntimeModel(opts)
190192
return { input, bounds in
191-
model.matches(in: input, at: bounds.lowerBound)
193+
model.matches(in: input, at: bounds.lowerBound, limitedBy: bounds.upperBound)
192194
}
193195
}
194196
}
@@ -517,7 +519,7 @@ extension DSLTree.CustomCharacterClass {
517519
}
518520
if isInverted {
519521
return opts.semanticLevel == .graphemeCluster
520-
? input.index(after: bounds.lowerBound)
522+
? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound)
521523
: input.unicodeScalars.index(after: bounds.lowerBound)
522524
}
523525
return nil

Sources/_StringProcessing/Engine/MEBuiltins.swift

+77-32
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@ extension Processor {
1515
isStrictASCII: Bool,
1616
isScalarSemantics: Bool
1717
) -> Bool {
18-
guard let next = input.matchBuiltinCC(
18+
guard currentPosition < end, let next = input.matchBuiltinCC(
1919
cc,
2020
at: currentPosition,
21+
limitedBy: end,
2122
isInverted: isInverted,
2223
isStrictASCII: isStrictASCII,
2324
isScalarSemantics: isScalarSemantics
@@ -102,56 +103,96 @@ extension Processor {
102103

103104
case .wordBoundary:
104105
if payload.usesSimpleUnicodeBoundaries {
105-
// TODO: How should we handle bounds?
106106
return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
107107
} else {
108-
return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
108+
return input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
109109
}
110110

111111
case .notWordBoundary:
112112
if payload.usesSimpleUnicodeBoundaries {
113-
// TODO: How should we handle bounds?
114113
return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
115114
} else {
116-
return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
115+
return !input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
117116
}
118117
}
119118
}
120119
}
121120

122121
// MARK: Matching `.`
123122
extension String {
124-
// TODO: Should the below have a `limitedBy` parameter?
123+
/// Returns the character at `pos`, bounded by `end`, as well as the upper
124+
/// boundary of the returned character.
125+
///
126+
/// This function handles loading a character from a string while respecting
127+
/// an end boundary, even if that end boundary is sub-character or sub-scalar.
128+
///
129+
/// - If `pos` is at or past `end`, this function returns `nil`.
130+
/// - If `end` is between `pos` and the next grapheme cluster boundary (i.e.,
131+
/// `end` is before `self.index(after: pos)`, then the returned character
132+
/// is smaller than the one that would be produced by `self[pos]` and the
133+
/// returned index is at the end of that character.
134+
/// - If `end` is between `pos` and the next grapheme cluster boundary, and
135+
/// is not on a Unicode scalar boundary, the partial scalar is dropped. This
136+
/// can result in a `nil` return or a character that includes only part of
137+
/// the `self[pos]` character.
138+
///
139+
/// - Parameters:
140+
/// - pos: The position to load a character from.
141+
/// - end: The limit for the character at `pos`.
142+
/// - Returns: The character at `pos`, bounded by `end`, if it exists, along
143+
/// with the upper bound of that character. The upper bound is always
144+
/// scalar-aligned.
145+
func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
146+
// FIXME: Sink into the stdlib to avoid multiple boundary calculations
147+
guard pos < end else { return nil }
148+
let next = index(after: pos)
149+
if next <= end {
150+
return (self[pos], next)
151+
}
125152

153+
// `end` must be a sub-character position that is between `pos` and the
154+
// next grapheme boundary. This is okay if `end` is on a Unicode scalar
155+
// boundary, but if it's in the middle of a scalar's code units, there
156+
// may not be a character to return at all after rounding down. Use
157+
// `Substring`'s rounding to determine what we can return.
158+
let substr = self[pos..<end]
159+
return substr.isEmpty
160+
? nil
161+
: (substr.first!, substr.endIndex)
162+
}
163+
126164
func matchAnyNonNewline(
127165
at currentPosition: String.Index,
166+
limitedBy end: String.Index,
128167
isScalarSemantics: Bool
129168
) -> String.Index? {
130-
guard currentPosition < endIndex else {
131-
return nil
132-
}
169+
guard currentPosition < end else { return nil }
133170
if case .definite(let result) = _quickMatchAnyNonNewline(
134171
at: currentPosition,
172+
limitedBy: end,
135173
isScalarSemantics: isScalarSemantics
136174
) {
137175
assert(result == _thoroughMatchAnyNonNewline(
138176
at: currentPosition,
177+
limitedBy: end,
139178
isScalarSemantics: isScalarSemantics))
140179
return result
141180
}
142181
return _thoroughMatchAnyNonNewline(
143182
at: currentPosition,
183+
limitedBy: end,
144184
isScalarSemantics: isScalarSemantics)
145185
}
146186

147187
@inline(__always)
148188
private func _quickMatchAnyNonNewline(
149189
at currentPosition: String.Index,
190+
limitedBy end: String.Index,
150191
isScalarSemantics: Bool
151192
) -> QuickResult<String.Index?> {
152-
assert(currentPosition < endIndex)
193+
assert(currentPosition < end)
153194
guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
154-
at: currentPosition
195+
at: currentPosition, limitedBy: end
155196
) else {
156197
return .unknown
157198
}
@@ -167,46 +208,47 @@ extension String {
167208
@inline(never)
168209
private func _thoroughMatchAnyNonNewline(
169210
at currentPosition: String.Index,
211+
limitedBy end: String.Index,
170212
isScalarSemantics: Bool
171213
) -> String.Index? {
172-
assert(currentPosition < endIndex)
173214
if isScalarSemantics {
215+
guard currentPosition < end else { return nil }
174216
let scalar = unicodeScalars[currentPosition]
175217
guard !scalar.isNewline else { return nil }
176218
return unicodeScalars.index(after: currentPosition)
177219
}
178220

179-
let char = self[currentPosition]
180-
guard !char.isNewline else { return nil }
181-
return index(after: currentPosition)
221+
guard let (char, next) = characterAndEnd(at: currentPosition, limitedBy: end),
222+
!char.isNewline
223+
else { return nil }
224+
return next
182225
}
183226
}
184227

185228
// MARK: - Built-in character class matching
186229
extension String {
187-
// TODO: Should the below have a `limitedBy` parameter?
188-
189230
// Mentioned in ProgrammersManual.md, update docs if redesigned
190231
func matchBuiltinCC(
191232
_ cc: _CharacterClassModel.Representation,
192233
at currentPosition: String.Index,
234+
limitedBy end: String.Index,
193235
isInverted: Bool,
194236
isStrictASCII: Bool,
195237
isScalarSemantics: Bool
196238
) -> String.Index? {
197-
guard currentPosition < endIndex else {
198-
return nil
199-
}
239+
guard currentPosition < end else { return nil }
200240
if case .definite(let result) = _quickMatchBuiltinCC(
201241
cc,
202242
at: currentPosition,
243+
limitedBy: end,
203244
isInverted: isInverted,
204245
isStrictASCII: isStrictASCII,
205246
isScalarSemantics: isScalarSemantics
206247
) {
207248
assert(result == _thoroughMatchBuiltinCC(
208249
cc,
209250
at: currentPosition,
251+
limitedBy: end,
210252
isInverted: isInverted,
211253
isStrictASCII: isStrictASCII,
212254
isScalarSemantics: isScalarSemantics))
@@ -215,6 +257,7 @@ extension String {
215257
return _thoroughMatchBuiltinCC(
216258
cc,
217259
at: currentPosition,
260+
limitedBy: end,
218261
isInverted: isInverted,
219262
isStrictASCII: isStrictASCII,
220263
isScalarSemantics: isScalarSemantics)
@@ -225,13 +268,17 @@ extension String {
225268
private func _quickMatchBuiltinCC(
226269
_ cc: _CharacterClassModel.Representation,
227270
at currentPosition: String.Index,
271+
limitedBy end: String.Index,
228272
isInverted: Bool,
229273
isStrictASCII: Bool,
230274
isScalarSemantics: Bool
231275
) -> QuickResult<String.Index?> {
232-
assert(currentPosition < endIndex)
276+
assert(currentPosition < end)
233277
guard let (next, result) = _quickMatch(
234-
cc, at: currentPosition, isScalarSemantics: isScalarSemantics
278+
cc,
279+
at: currentPosition,
280+
limitedBy: end,
281+
isScalarSemantics: isScalarSemantics
235282
) else {
236283
return .unknown
237284
}
@@ -243,27 +290,25 @@ extension String {
243290
private func _thoroughMatchBuiltinCC(
244291
_ cc: _CharacterClassModel.Representation,
245292
at currentPosition: String.Index,
293+
limitedBy end: String.Index,
246294
isInverted: Bool,
247295
isStrictASCII: Bool,
248296
isScalarSemantics: Bool
249297
) -> String.Index? {
250-
assert(currentPosition < endIndex)
251-
let char = self[currentPosition]
298+
// TODO: Branch here on scalar semantics
299+
// Don't want to pay character cost if unnecessary
300+
guard var (char, next) =
301+
characterAndEnd(at: currentPosition, limitedBy: end)
302+
else { return nil }
252303
let scalar = unicodeScalars[currentPosition]
253304

254305
let asciiCheck = !isStrictASCII
255306
|| (scalar.isASCII && isScalarSemantics)
256307
|| char.isASCII
257308

258309
var matched: Bool
259-
var next: String.Index
260-
switch (isScalarSemantics, cc) {
261-
case (_, .anyGrapheme):
262-
next = index(after: currentPosition)
263-
case (true, _):
310+
if isScalarSemantics && cc != .anyGrapheme {
264311
next = unicodeScalars.index(after: currentPosition)
265-
case (false, _):
266-
next = index(after: currentPosition)
267312
}
268313

269314
switch cc {
@@ -291,7 +336,7 @@ extension String {
291336
if isScalarSemantics {
292337
matched = scalar.isNewline && asciiCheck
293338
if matched && scalar == "\r"
294-
&& next != endIndex && unicodeScalars[next] == "\n" {
339+
&& next < end && unicodeScalars[next] == "\n" {
295340
// Match a full CR-LF sequence even in scalar semantics
296341
unicodeScalars.formIndex(after: &next)
297342
}

Sources/_StringProcessing/Engine/MEQuantify.swift

+6-4
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,18 @@ extension Processor {
1717
boundaryCheck: !isScalarSemantics,
1818
isCaseInsensitive: false)
1919
case .builtin:
20-
// FIXME: bounds check? endIndex or end?
20+
guard currentPosition < end else { return nil }
2121

2222
// We only emit .quantify if it consumes a single character
2323
return input.matchBuiltinCC(
2424
payload.builtin,
2525
at: currentPosition,
26+
limitedBy: end,
2627
isInverted: payload.builtinIsInverted,
2728
isStrictASCII: payload.builtinIsStrict,
2829
isScalarSemantics: isScalarSemantics)
2930
case .any:
30-
// FIXME: endIndex or end?
31-
guard currentPosition < input.endIndex else { return nil }
31+
guard currentPosition < end else { return nil }
3232

3333
if payload.anyMatchesNewline {
3434
if isScalarSemantics {
@@ -38,7 +38,9 @@ extension Processor {
3838
}
3939

4040
return input.matchAnyNonNewline(
41-
at: currentPosition, isScalarSemantics: isScalarSemantics)
41+
at: currentPosition,
42+
limitedBy: end,
43+
isScalarSemantics: isScalarSemantics)
4244
}
4345
}
4446

0 commit comments

Comments
 (0)