@@ -15,9 +15,10 @@ extension Processor {
15
15
isStrictASCII: Bool ,
16
16
isScalarSemantics: Bool
17
17
) -> Bool {
18
- guard let next = input. matchBuiltinCC (
18
+ guard currentPosition < end , let next = input. matchBuiltinCC (
19
19
cc,
20
20
at: currentPosition,
21
+ limitedBy: end,
21
22
isInverted: isInverted,
22
23
isStrictASCII: isStrictASCII,
23
24
isScalarSemantics: isScalarSemantics
@@ -102,56 +103,96 @@ extension Processor {
102
103
103
104
case . wordBoundary:
104
105
if payload. usesSimpleUnicodeBoundaries {
105
- // TODO: How should we handle bounds?
106
106
return atSimpleBoundary ( payload. usesASCIIWord, payload. semanticLevel)
107
107
} else {
108
- return input. isOnWordBoundary ( at: currentPosition, using: & wordIndexCache, & wordIndexMaxIndex)
108
+ return input. isOnWordBoundary ( at: currentPosition, in : searchBounds , using: & wordIndexCache, & wordIndexMaxIndex)
109
109
}
110
110
111
111
case . notWordBoundary:
112
112
if payload. usesSimpleUnicodeBoundaries {
113
- // TODO: How should we handle bounds?
114
113
return !atSimpleBoundary( payload. usesASCIIWord, payload. semanticLevel)
115
114
} else {
116
- return !input. isOnWordBoundary ( at: currentPosition, using: & wordIndexCache, & wordIndexMaxIndex)
115
+ return !input. isOnWordBoundary ( at: currentPosition, in : searchBounds , using: & wordIndexCache, & wordIndexMaxIndex)
117
116
}
118
117
}
119
118
}
120
119
}
121
120
122
121
// MARK: Matching `.`
123
122
extension String {
124
- // TODO: Should the below have a `limitedBy` parameter?
123
/// Loads the character that starts at `pos` without reading past `end`, and
/// reports the position where that character stops.
///
/// The limit `end` does not have to fall on a grapheme-cluster boundary, or
/// even on a Unicode scalar boundary:
///
/// - If `pos` is at or past `end`, the result is `nil`.
/// - If `end` lies between `pos` and the next grapheme cluster boundary
///   (i.e., `end` is before `self.index(after: pos)`), the returned character
///   is smaller than the one produced by `self[pos]`, and the returned index
///   is the end of that smaller character.
/// - If `end` additionally is not on a Unicode scalar boundary, the partial
///   scalar is dropped. That can leave nothing to return (`nil`), or a
///   character made of only part of `self[pos]`.
///
/// - Parameters:
///   - pos: The position to load a character from.
///   - end: The limit for the character at `pos`.
/// - Returns: The character at `pos`, bounded by `end`, if it exists, paired
///   with the upper bound of that character. The upper bound is always
///   scalar-aligned.
func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
  // FIXME: Sink into the stdlib to avoid multiple boundary calculations
  if pos >= end { return nil }

  let graphemeEnd = index(after: pos)
  guard graphemeEnd > end else {
    // The whole grapheme cluster fits below the limit.
    return (self[pos], graphemeEnd)
  }

  // The limit falls strictly inside the grapheme cluster that starts at
  // `pos`. Slicing relies on `Substring`'s rounding of its bounds: a limit in
  // the middle of a scalar's code units is rounded down, which may leave an
  // empty slice, in which case there is no character to return.
  let clipped = self[pos..<end]
  return clipped.first.map { ($0, clipped.endIndex) }
}
163
+
126
164
/// Tries to match any single non-newline element at `currentPosition`,
/// never reading at or beyond `end`.
///
/// The quick path is consulted first. When it yields a definite answer, that
/// answer is returned (and, in debug builds, cross-checked against the
/// thorough path). Otherwise the thorough path decides.
///
/// - Parameters:
///   - currentPosition: The position to match at.
///   - end: The upper limit for matching.
///   - isScalarSemantics: Forwarded to both paths to select scalar-level
///     rather than character-level matching.
/// - Returns: The index produced by the quick or thorough path, or `nil`
///   when `currentPosition` is not below `end` or the chosen path yields
///   `nil`.
func matchAnyNonNewline(
  at currentPosition: String.Index,
  limitedBy end: String.Index,
  isScalarSemantics: Bool
) -> String.Index? {
  if currentPosition >= end { return nil }

  let quickOutcome = _quickMatchAnyNonNewline(
    at: currentPosition,
    limitedBy: end,
    isScalarSemantics: isScalarSemantics)

  guard case .definite(let answer) = quickOutcome else {
    // The quick path could not decide; defer to the thorough path.
    return _thoroughMatchAnyNonNewline(
      at: currentPosition,
      limitedBy: end,
      isScalarSemantics: isScalarSemantics)
  }

  // Debug-only consistency check between the two paths.
  assert(answer == _thoroughMatchAnyNonNewline(
    at: currentPosition,
    limitedBy: end,
    isScalarSemantics: isScalarSemantics))
  return answer
}
146
186
147
187
@inline ( __always)
148
188
private func _quickMatchAnyNonNewline(
149
189
at currentPosition: String . Index ,
190
+ limitedBy end: String . Index ,
150
191
isScalarSemantics: Bool
151
192
) -> QuickResult < String . Index ? > {
152
- assert ( currentPosition < endIndex )
193
+ assert ( currentPosition < end )
153
194
guard let ( asciiValue, next, isCRLF) = _quickASCIICharacter (
154
- at: currentPosition
195
+ at: currentPosition, limitedBy : end
155
196
) else {
156
197
return . unknown
157
198
}
@@ -167,46 +208,47 @@ extension String {
167
208
/// Thorough (non-fast-path) matcher for a single non-newline element at
/// `currentPosition`, bounded by `end`.
///
/// - Under scalar semantics, the Unicode scalar at `currentPosition` is
///   examined and, if it is not a newline, the index after that scalar is
///   returned.
/// - Otherwise, the character is loaded with
///   `characterAndEnd(at:limitedBy:)` and, if it is not a newline, that
///   character's upper bound is returned.
///
/// - Returns: The index following the matched element, or `nil` when there
///   is nothing to match below `end` or the element is a newline.
@inline(never)
private func _thoroughMatchAnyNonNewline(
  at currentPosition: String.Index,
  limitedBy end: String.Index,
  isScalarSemantics: Bool
) -> String.Index? {
  guard isScalarSemantics else {
    // Character semantics: load the (possibly `end`-clipped) character.
    guard let (character, upperBound) =
            characterAndEnd(at: currentPosition, limitedBy: end)
    else { return nil }
    return character.isNewline ? nil : upperBound
  }

  // Scalar semantics: look only at the single scalar at the position.
  if currentPosition >= end { return nil }
  if unicodeScalars[currentPosition].isNewline { return nil }
  return unicodeScalars.index(after: currentPosition)
}
183
226
}
184
227
185
228
// MARK: - Built-in character class matching
186
229
extension String {
187
- // TODO: Should the below have a `limitedBy` parameter?
188
-
189
230
// Mentioned in ProgrammersManual.md, update docs if redesigned
190
231
func matchBuiltinCC(
191
232
_ cc: _CharacterClassModel . Representation ,
192
233
at currentPosition: String . Index ,
234
+ limitedBy end: String . Index ,
193
235
isInverted: Bool ,
194
236
isStrictASCII: Bool ,
195
237
isScalarSemantics: Bool
196
238
) -> String . Index ? {
197
- guard currentPosition < endIndex else {
198
- return nil
199
- }
239
+ guard currentPosition < end else { return nil }
200
240
if case . definite( let result) = _quickMatchBuiltinCC (
201
241
cc,
202
242
at: currentPosition,
243
+ limitedBy: end,
203
244
isInverted: isInverted,
204
245
isStrictASCII: isStrictASCII,
205
246
isScalarSemantics: isScalarSemantics
206
247
) {
207
248
assert ( result == _thoroughMatchBuiltinCC (
208
249
cc,
209
250
at: currentPosition,
251
+ limitedBy: end,
210
252
isInverted: isInverted,
211
253
isStrictASCII: isStrictASCII,
212
254
isScalarSemantics: isScalarSemantics) )
@@ -215,6 +257,7 @@ extension String {
215
257
return _thoroughMatchBuiltinCC (
216
258
cc,
217
259
at: currentPosition,
260
+ limitedBy: end,
218
261
isInverted: isInverted,
219
262
isStrictASCII: isStrictASCII,
220
263
isScalarSemantics: isScalarSemantics)
@@ -225,13 +268,17 @@ extension String {
225
268
private func _quickMatchBuiltinCC(
226
269
_ cc: _CharacterClassModel . Representation ,
227
270
at currentPosition: String . Index ,
271
+ limitedBy end: String . Index ,
228
272
isInverted: Bool ,
229
273
isStrictASCII: Bool ,
230
274
isScalarSemantics: Bool
231
275
) -> QuickResult < String . Index ? > {
232
- assert ( currentPosition < endIndex )
276
+ assert ( currentPosition < end )
233
277
guard let ( next, result) = _quickMatch (
234
- cc, at: currentPosition, isScalarSemantics: isScalarSemantics
278
+ cc,
279
+ at: currentPosition,
280
+ limitedBy: end,
281
+ isScalarSemantics: isScalarSemantics
235
282
) else {
236
283
return . unknown
237
284
}
@@ -243,27 +290,25 @@ extension String {
243
290
private func _thoroughMatchBuiltinCC(
244
291
_ cc: _CharacterClassModel . Representation ,
245
292
at currentPosition: String . Index ,
293
+ limitedBy end: String . Index ,
246
294
isInverted: Bool ,
247
295
isStrictASCII: Bool ,
248
296
isScalarSemantics: Bool
249
297
) -> String . Index ? {
250
- assert ( currentPosition < endIndex)
251
- let char = self [ currentPosition]
298
+ // TODO: Branch here on scalar semantics
299
+ // Don't want to pay character cost if unnecessary
300
+ guard var ( char, next) =
301
+ characterAndEnd ( at: currentPosition, limitedBy: end)
302
+ else { return nil }
252
303
let scalar = unicodeScalars [ currentPosition]
253
304
254
305
let asciiCheck = !isStrictASCII
255
306
|| ( scalar. isASCII && isScalarSemantics)
256
307
|| char. isASCII
257
308
258
309
var matched : Bool
259
- var next : String . Index
260
- switch ( isScalarSemantics, cc) {
261
- case ( _, . anyGrapheme) :
262
- next = index ( after: currentPosition)
263
- case ( true , _) :
310
+ if isScalarSemantics && cc != . anyGrapheme {
264
311
next = unicodeScalars. index ( after: currentPosition)
265
- case ( false , _) :
266
- next = index ( after: currentPosition)
267
312
}
268
313
269
314
switch cc {
@@ -291,7 +336,7 @@ extension String {
291
336
if isScalarSemantics {
292
337
matched = scalar. isNewline && asciiCheck
293
338
if matched && scalar == " \r "
294
- && next != endIndex && unicodeScalars [ next] == " \n " {
339
+ && next < end && unicodeScalars [ next] == " \n " {
295
340
// Match a full CR-LF sequence even in scalar semantics
296
341
unicodeScalars. formIndex ( after: & next)
297
342
}
0 commit comments