From eee2721794d1646ba6fbdd7151c839ff7eb7c0ab Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sun, 10 Dec 2023 11:26:44 -0700 Subject: [PATCH 1/2] Add ASCII fast-path ASCII character class matching Uses quickASCIICharacter to speed up ASCII character class matching. 2x speedup for EmailLookahead_All and many, many others. 10% regression in AnchoredNotFound_First and related. --- .../Engine/InstPayload.swift | 4 +- .../_StringProcessing/Engine/MEQuantify.swift | 4 +- .../_StringProcessing/Engine/Processor.swift | 55 +++++++++++++++---- .../Utility/AsciiBitset.swift | 12 +++- 4 files changed, 56 insertions(+), 19 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index d569fcd32..78baf9ce1 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -378,7 +378,7 @@ extension Instruction.Payload { struct QuantifyPayload: RawRepresentable { let rawValue: UInt64 enum PayloadType: UInt64 { - case bitset = 0 + case asciiBitset = 0 case asciiChar = 1 case any = 2 case builtin = 4 @@ -448,7 +448,7 @@ struct QuantifyPayload: RawRepresentable { ) { assert(bitset.bits <= _payloadMask) self.rawValue = bitset.bits - + QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .bitset, isScalarSemantics: isScalarSemantics) + + QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .asciiBitset, isScalarSemantics: isScalarSemantics) } init( diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 2d187607c..a0480cde6 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -3,8 +3,8 @@ extension Processor { let isScalarSemantics = payload.isScalarSemantics switch payload.type { - case .bitset: - return input.matchBitset( + case .asciiBitset: + return input.matchASCIIBitset( 
registers[payload.bitset], at: currentPosition, limitedBy: end, diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 6e0a7774c..86365322b 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -291,7 +291,7 @@ extension Processor { _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, isScalarSemantics: Bool ) -> Bool { - guard let next = input.matchBitset( + guard let next = input.matchASCIIBitset( bitset, at: currentPosition, limitedBy: end, @@ -723,22 +723,53 @@ extension String { return idx } - func matchBitset( + func matchASCIIBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, at pos: Index, limitedBy end: Index, isScalarSemantics: Bool ) -> Index? { - // TODO: extremely quick-check-able - // TODO: can be sped up with string internals - if isScalarSemantics { - guard pos < end else { return nil } - guard bitset.matches(unicodeScalars[pos]) else { return nil } - return unicodeScalars.index(after: pos) - } else { - guard let (char, next) = characterAndEnd(at: pos, limitedBy: end), - bitset.matches(char) else { return nil } - return next + + // FIXME: Inversion should be tracked and handled in only one place. + // That is, we should probably store it as a bit in the instruction, so that + // bitset matching and bitset inversion are bit-based rather than semantically + // inverting the notion of a match or not. As-is, we need to track both + // meanings in some code paths. 
+ let isInverted = bitset.isInverted + + // TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment + // there + guard let (asciiByte, next, isCRLF) = _quickASCIICharacter( + at: pos, + limitedBy: end + ) else { + if isScalarSemantics { + guard pos < end else { return nil } + guard bitset.matches(unicodeScalars[pos]) else { return nil } + return unicodeScalars.index(after: pos) + } else { + guard let (char, next) = characterAndEnd(at: pos, limitedBy: end), + bitset.matches(char) else { return nil } + return next + } + } + + guard bitset.matches(asciiByte) else { + // FIXME: check inversion here after refactored out of bitset + return nil } + + // CR-LF should only match `[\r]` in scalar semantic mode or if inverted + if isCRLF { + if isScalarSemantics { + return self.unicodeScalars.index(before: next) + } + if isInverted { + return next + } + return nil + } + + return next } } diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift index e063447a0..2b0217cdc 100644 --- a/Sources/_StringProcessing/Utility/AsciiBitset.swift +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -1,3 +1,4 @@ +// TODO: Probably refactor out of DSLTree extension DSLTree.CustomCharacterClass { internal struct AsciiBitset { let isInverted: Bool @@ -49,7 +50,7 @@ extension DSLTree.CustomCharacterClass { } } - private func matches(_ val: UInt8) -> Bool { + private func _matchesWithoutInversionCheck(_ val: UInt8) -> Bool { if val < 64 { return (a >> val) & 1 == 1 } else { @@ -57,10 +58,15 @@ extension DSLTree.CustomCharacterClass { } } + internal func matches(_ byte: UInt8) -> Bool { + guard byte < 128 else { return isInverted } + return _matchesWithoutInversionCheck(byte) == !isInverted + } + internal func matches(_ char: Character) -> Bool { let matched: Bool if let val = char._singleScalarAsciiValue { - matched = matches(val) + matched = _matchesWithoutInversionCheck(val) } else { matched = false } @@ 
-75,7 +81,7 @@ extension DSLTree.CustomCharacterClass { let matched: Bool if scalar.isASCII { let val = UInt8(ascii: scalar) - matched = matches(val) + matched = _matchesWithoutInversionCheck(val) } else { matched = false } From 80e92af70261d69fcced6188b8e56c5632c8de2a Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sun, 10 Dec 2023 13:00:32 -0700 Subject: [PATCH 2/2] Refactor _quickASCIICharacter --- .../_StringProcessing/Engine/MEBuiltins.swift | 6 +- .../_StringProcessing/Engine/Processor.swift | 15 +--- Sources/_StringProcessing/Unicode/ASCII.swift | 84 +++++++++++-------- 3 files changed, 56 insertions(+), 49 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 0dafd6720..49eb8bc69 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -191,8 +191,10 @@ extension String { isScalarSemantics: Bool ) -> QuickResult { assert(currentPosition < end) - guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( - at: currentPosition, limitedBy: end + guard let (asciiValue, isCRLF: isCRLF, next) = _quickASCIICharacter( + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics ) else { return .unknown } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 86365322b..69a32e25c 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -737,11 +737,10 @@ extension String { // meanings in some code paths. 
let isInverted = bitset.isInverted - // TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment - // there - guard let (asciiByte, next, isCRLF) = _quickASCIICharacter( + guard let (asciiByte, isCRLF: isCRLF, next) = _quickASCIICharacter( at: pos, - limitedBy: end + limitedBy: end, + isScalarSemantics: isScalarSemantics ) else { if isScalarSemantics { guard pos < end else { return nil } @@ -760,13 +759,7 @@ extension String { } // CR-LF should only match `[\r]` in scalar semantic mode or if inverted - if isCRLF { - if isScalarSemantics { - return self.unicodeScalars.index(before: next) - } - if isInverted { - return next - } + if isCRLF && !isScalarSemantics && !isInverted { return nil } diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 53dfe652d..2e6eb9b35 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -81,13 +81,22 @@ extension UInt8 { } extension String { - /// TODO: better to take isScalarSemantics parameter, we can return more results - /// and we can give the right `next` index, not requiring the caller to re-adjust it - /// TODO: detailed description of nuanced semantics + /// + /// If the position in the input is not definitely a full-ASCII character (uses sub-`0x300` quick check + /// on next byte), returns `nil`. + /// + /// Otherwise, returns: + /// 1. The first ASCII byte for a character (or scalar if `isScalarSemantics`) + /// 2. Whether the ASCII character is CR-LF (always `false` if `isScalarSemantics`) + /// 3. The index for the end of that particular ASCII character: + /// If the character is not CR-LF (or `isScalarSemantics` is true), the index of the next byte. + /// If `isScalarSemantics` is false and the ASCII character is CR-LF, the index after the CR-LF sequence. + /// func _quickASCIICharacter( at idx: Index, - limitedBy end: Index - ) -> (first: UInt8, next: Index, crLF: Bool)? 
{ + limitedBy end: Index, + isScalarSemantics: Bool + ) -> (firstASCIIByte: UInt8, isCRLF: Bool, asciiCharacterEnd: Index)? { // TODO: fastUTF8 version assert(String.Index(idx, within: unicodeScalars) != nil) assert(idx <= end) @@ -101,25 +110,25 @@ extension String { return nil } - var next = utf8.index(after: idx) - if next == end { - return (first: base, next: next, crLF: false) + let byteEnd = utf8.index(after: idx) + if isScalarSemantics || byteEnd == end { + return (firstASCIIByte: base, isCRLF: false, asciiCharacterEnd: byteEnd) } - let tail = utf8[next] + let tail = utf8[byteEnd] guard tail._isSub300StartingByte else { return nil } // Handle CR-LF: if base == ._carriageReturn && tail == ._lineFeed { - utf8.formIndex(after: &next) - guard next == end || utf8[next]._isSub300StartingByte else { + let crLFEnd = utf8.index(after: byteEnd) + guard crLFEnd == end || utf8[crLFEnd]._isSub300StartingByte else { return nil } - return (first: base, next: next, crLF: true) + return (firstASCIIByte: base, isCRLF: true, asciiCharacterEnd: crLFEnd) } assert(self[idx].isASCII && self[idx] != "\r\n") - return (first: base, next: next, crLF: false) + return (firstASCIIByte: base, isCRLF: false, asciiCharacterEnd: byteEnd) } func _quickMatch( @@ -128,44 +137,47 @@ extension String { limitedBy end: Index, isScalarSemantics: Bool ) -> (next: Index, matchResult: Bool)? { + // Don't use scalar semantics in this quick path for anyGrapheme cluster or + // newline sequences, which are not scalar character classes. 
+ let useScalarSemantics = isScalarSemantics && cc != .anyGrapheme && cc != .newlineSequence /// ASCII fast-paths - guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( - at: idx, limitedBy: end + guard let (asciiValue, isCRLF: isCRLF, charEnd) = _quickASCIICharacter( + at: idx, + limitedBy: end, + isScalarSemantics: useScalarSemantics ) else { return nil } // TODO: bitvectors switch cc { - case .any, .anyGrapheme: - return (next, true) + case .any: + return (charEnd, true) + + case .anyGrapheme: + // _quickASCIICharacter call handled CR-LF for us + _ = isCRLF + return (charEnd, true) case .digit: - return (next, asciiValue._asciiIsDigit) + return (charEnd, asciiValue._asciiIsDigit) case .horizontalWhitespace: - return (next, asciiValue._asciiIsHorizontalWhitespace) - - case .verticalWhitespace, .newlineSequence: - if asciiValue._asciiIsVerticalWhitespace { - if isScalarSemantics && isCRLF && cc == .verticalWhitespace { - return (utf8.index(before: next), true) - } - return (next, true) - } - return (next, false) + return (charEnd, asciiValue._asciiIsHorizontalWhitespace) + + case .verticalWhitespace: + return (charEnd, asciiValue._asciiIsVerticalWhitespace) + + case .newlineSequence: + // _quickASCIICharacter call handled CR-LF for us + _ = isCRLF + return (charEnd, asciiValue._asciiIsVerticalWhitespace) case .whitespace: - if asciiValue._asciiIsWhitespace { - if isScalarSemantics && isCRLF { - return (utf8.index(before: next), true) - } - return (next, true) - } - return (next, false) + return (charEnd, asciiValue._asciiIsWhitespace) case .word: - return (next, asciiValue._asciiIsWord) + return (charEnd, asciiValue._asciiIsWord) } }