Skip to content

Commit 4e742b4

Browse files
authored
Add ASCII fast-path for character class matching (#690)
Uses quickASCIICharacter to speed up ASCII character class matching. 2x speedup for EmailLookahead_All and many, many others. 10% regression in AnchoredNotFound_First and related.
1 parent 69f406c commit 4e742b4

File tree

4 files changed

+56
-19
lines changed

4 files changed

+56
-19
lines changed

Sources/_StringProcessing/Engine/InstPayload.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ extension Instruction.Payload {
378378
struct QuantifyPayload: RawRepresentable {
379379
let rawValue: UInt64
380380
enum PayloadType: UInt64 {
381-
case bitset = 0
381+
case asciiBitset = 0
382382
case asciiChar = 1
383383
case any = 2
384384
case builtin = 4
@@ -448,7 +448,7 @@ struct QuantifyPayload: RawRepresentable {
448448
) {
449449
assert(bitset.bits <= _payloadMask)
450450
self.rawValue = bitset.bits
451-
+ QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .bitset, isScalarSemantics: isScalarSemantics)
451+
+ QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .asciiBitset, isScalarSemantics: isScalarSemantics)
452452
}
453453

454454
init(

Sources/_StringProcessing/Engine/MEQuantify.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ extension Processor {
33
let isScalarSemantics = payload.isScalarSemantics
44

55
switch payload.type {
6-
case .bitset:
7-
return input.matchBitset(
6+
case .asciiBitset:
7+
return input.matchASCIIBitset(
88
registers[payload.bitset],
99
at: currentPosition,
1010
limitedBy: end,

Sources/_StringProcessing/Engine/Processor.swift

+43-12
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ extension Processor {
291291
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
292292
isScalarSemantics: Bool
293293
) -> Bool {
294-
guard let next = input.matchBitset(
294+
guard let next = input.matchASCIIBitset(
295295
bitset,
296296
at: currentPosition,
297297
limitedBy: end,
@@ -723,22 +723,53 @@ extension String {
723723
return idx
724724
}
725725

726-
func matchBitset(
726+
func matchASCIIBitset(
727727
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
728728
at pos: Index,
729729
limitedBy end: Index,
730730
isScalarSemantics: Bool
731731
) -> Index? {
732-
// TODO: extremely quick-check-able
733-
// TODO: can be sped up with string internals
734-
if isScalarSemantics {
735-
guard pos < end else { return nil }
736-
guard bitset.matches(unicodeScalars[pos]) else { return nil }
737-
return unicodeScalars.index(after: pos)
738-
} else {
739-
guard let (char, next) = characterAndEnd(at: pos, limitedBy: end),
740-
bitset.matches(char) else { return nil }
741-
return next
732+
733+
// FIXME: Inversion should be tracked and handled in only one place.
734+
// That is, we should probably store it as a bit in the instruction, so that
735+
// bitset matching and bitset inversion are bit-based rather than semantically
736+
// inverting the notion of a match or not. As-is, we need to track both
737+
// meanings in some code paths.
738+
let isInverted = bitset.isInverted
739+
740+
// TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment
741+
// there
742+
guard let (asciiByte, next, isCRLF) = _quickASCIICharacter(
743+
at: pos,
744+
limitedBy: end
745+
) else {
746+
if isScalarSemantics {
747+
guard pos < end else { return nil }
748+
guard bitset.matches(unicodeScalars[pos]) else { return nil }
749+
return unicodeScalars.index(after: pos)
750+
} else {
751+
guard let (char, next) = characterAndEnd(at: pos, limitedBy: end),
752+
bitset.matches(char) else { return nil }
753+
return next
754+
}
755+
}
756+
757+
guard bitset.matches(asciiByte) else {
758+
// FIXME: check inversion here after refactored out of bitset
759+
return nil
742760
}
761+
762+
// CR-LF should only match `[\r]` in scalar semantic mode or if inverted
763+
if isCRLF {
764+
if isScalarSemantics {
765+
return self.unicodeScalars.index(before: next)
766+
}
767+
if isInverted {
768+
return next
769+
}
770+
return nil
771+
}
772+
773+
return next
743774
}
744775
}

Sources/_StringProcessing/Utility/AsciiBitset.swift

+9-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// TODO: Probably refactor out of DSLTree
12
extension DSLTree.CustomCharacterClass {
23
internal struct AsciiBitset {
34
let isInverted: Bool
@@ -49,18 +50,23 @@ extension DSLTree.CustomCharacterClass {
4950
}
5051
}
5152

52-
private func matches(_ val: UInt8) -> Bool {
53+
private func _matchesWithoutInversionCheck(_ val: UInt8) -> Bool {
5354
if val < 64 {
5455
return (a >> val) & 1 == 1
5556
} else {
5657
return (b >> (val - 64)) & 1 == 1
5758
}
5859
}
5960

61+
internal func matches(_ byte: UInt8) -> Bool {
62+
guard byte < 128 else { return isInverted }
63+
return _matchesWithoutInversionCheck(byte) == !isInverted
64+
}
65+
6066
internal func matches(_ char: Character) -> Bool {
6167
let matched: Bool
6268
if let val = char._singleScalarAsciiValue {
63-
matched = matches(val)
69+
matched = _matchesWithoutInversionCheck(val)
6470
} else {
6571
matched = false
6672
}
@@ -75,7 +81,7 @@ extension DSLTree.CustomCharacterClass {
7581
let matched: Bool
7682
if scalar.isASCII {
7783
let val = UInt8(ascii: scalar)
78-
matched = matches(val)
84+
matched = _matchesWithoutInversionCheck(val)
7985
} else {
8086
matched = false
8187
}

0 commit comments

Comments
 (0)