From eee2721794d1646ba6fbdd7151c839ff7eb7c0ab Mon Sep 17 00:00:00 2001
From: Michael Ilseman <michael.ilseman@gmail.com>
Date: Sun, 10 Dec 2023 11:26:44 -0700
Subject: [PATCH 1/2] Add ASCII fast-path for ASCII character class matching

Uses `_quickASCIICharacter` to speed up ASCII character class matching.

2x speedup for EmailLookahead_All and many, many others. 10% regression in AnchoredNotFound_First and related.
---
 .../Engine/InstPayload.swift                  |  4 +-
 .../_StringProcessing/Engine/MEQuantify.swift |  4 +-
 .../_StringProcessing/Engine/Processor.swift  | 55 +++++++++++++++----
 .../Utility/AsciiBitset.swift                 | 12 +++-
 4 files changed, 56 insertions(+), 19 deletions(-)

diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift
index d569fcd32..78baf9ce1 100644
--- a/Sources/_StringProcessing/Engine/InstPayload.swift
+++ b/Sources/_StringProcessing/Engine/InstPayload.swift
@@ -378,7 +378,7 @@ extension Instruction.Payload {
 struct QuantifyPayload: RawRepresentable {
   let rawValue: UInt64
   enum PayloadType: UInt64 {
-    case bitset = 0
+    case asciiBitset = 0
     case asciiChar = 1
     case any = 2
     case builtin = 4
@@ -448,7 +448,7 @@ struct QuantifyPayload: RawRepresentable {
   ) {
     assert(bitset.bits <= _payloadMask)
     self.rawValue = bitset.bits
-      + QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .bitset, isScalarSemantics: isScalarSemantics)
+      + QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .asciiBitset, isScalarSemantics: isScalarSemantics)
   }
 
   init(
diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift
index 2d187607c..a0480cde6 100644
--- a/Sources/_StringProcessing/Engine/MEQuantify.swift
+++ b/Sources/_StringProcessing/Engine/MEQuantify.swift
@@ -3,8 +3,8 @@ extension Processor {
     let isScalarSemantics = payload.isScalarSemantics
 
     switch payload.type {
-    case .bitset:
-      return input.matchBitset(
+    case .asciiBitset:
+      return input.matchASCIIBitset(
         registers[payload.bitset],
         at: currentPosition,
         limitedBy: end,
diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
index 6e0a7774c..86365322b 100644
--- a/Sources/_StringProcessing/Engine/Processor.swift
+++ b/Sources/_StringProcessing/Engine/Processor.swift
@@ -291,7 +291,7 @@ extension Processor {
     _ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
     isScalarSemantics: Bool
   ) -> Bool {
-    guard let next = input.matchBitset(
+    guard let next = input.matchASCIIBitset(
       bitset,
       at: currentPosition,
       limitedBy: end,
@@ -723,22 +723,53 @@ extension String {
     return idx
   }
 
-  func matchBitset(
+  func matchASCIIBitset(
     _ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
     at pos: Index,
     limitedBy end: Index,
     isScalarSemantics: Bool
   ) -> Index? {
-    // TODO: extremely quick-check-able
-    // TODO: can be sped up with string internals
-    if isScalarSemantics {
-      guard pos < end else { return nil }
-      guard bitset.matches(unicodeScalars[pos]) else { return nil }
-      return unicodeScalars.index(after: pos)
-    } else {
-      guard let (char, next) = characterAndEnd(at: pos, limitedBy: end),
-            bitset.matches(char) else { return nil }
-      return next
+
+    // FIXME: Inversion should be tracked and handled in only one place.
+    // That is, we should probably store it as a bit in the instruction, so that
+    // bitset matching and bitset inversion is bit-based rather than semantically
+    // inverting the notion of a match or not. As-is, we need to track both
+    // meanings in some code paths.
+    let isInverted = bitset.isInverted
+
+    // TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment 
+    // there
+    guard let (asciiByte, next, isCRLF) = _quickASCIICharacter(
+      at: pos,
+      limitedBy: end
+    ) else {
+      if isScalarSemantics {
+        guard pos < end else { return nil }
+        guard bitset.matches(unicodeScalars[pos]) else { return nil }
+        return unicodeScalars.index(after: pos)
+      } else {
+        guard let (char, next) = characterAndEnd(at: pos, limitedBy: end),
+              bitset.matches(char) else { return nil }
+        return next
+      }
+    }
+
+    guard bitset.matches(asciiByte) else {
+      // FIXME: check inversion here after refactored out of bitset
+      return nil
     }
+
+    // CR-LF should only match `[\r]` in scalar semantic mode or if inverted
+    if isCRLF {
+      if isScalarSemantics {
+        return self.unicodeScalars.index(before: next)
+      }
+      if isInverted {
+        return next
+      }
+      return nil
+    }
+
+    return next
   }
 }
diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift
index e063447a0..2b0217cdc 100644
--- a/Sources/_StringProcessing/Utility/AsciiBitset.swift
+++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift
@@ -1,3 +1,4 @@
+// TODO: Probably refactor out of DSLTree
 extension DSLTree.CustomCharacterClass {
   internal struct AsciiBitset {
     let isInverted: Bool
@@ -49,7 +50,7 @@ extension DSLTree.CustomCharacterClass {
       }
     }
 
-    private func matches(_ val: UInt8) -> Bool {
+    private func _matchesWithoutInversionCheck(_ val: UInt8) -> Bool {
       if val < 64 {
         return (a >> val) & 1 == 1
       } else {
@@ -57,10 +58,15 @@ extension DSLTree.CustomCharacterClass {
       }
     }
 
+    internal func matches(_ byte: UInt8) -> Bool {
+      guard byte < 128 else { return isInverted }
+      return _matchesWithoutInversionCheck(byte) == !isInverted
+    }
+
     internal func matches(_ char: Character) -> Bool {
       let matched: Bool
       if let val = char._singleScalarAsciiValue {
-        matched = matches(val)
+        matched = _matchesWithoutInversionCheck(val)
       } else {
         matched = false
       }
@@ -75,7 +81,7 @@ extension DSLTree.CustomCharacterClass {
       let matched: Bool
       if scalar.isASCII {
         let val = UInt8(ascii: scalar)
-        matched = matches(val)
+        matched = _matchesWithoutInversionCheck(val)
       } else {
         matched = false
       }

From 80e92af70261d69fcced6188b8e56c5632c8de2a Mon Sep 17 00:00:00 2001
From: Michael Ilseman <michael.ilseman@gmail.com>
Date: Sun, 10 Dec 2023 13:00:32 -0700
Subject: [PATCH 2/2] Refactor _quickASCIICharacter

---
 .../_StringProcessing/Engine/MEBuiltins.swift |  6 +-
 .../_StringProcessing/Engine/Processor.swift  | 15 +---
 Sources/_StringProcessing/Unicode/ASCII.swift | 84 +++++++++++--------
 3 files changed, 56 insertions(+), 49 deletions(-)

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index 0dafd6720..49eb8bc69 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -191,8 +191,10 @@ extension String {
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
     assert(currentPosition < end)
-    guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
-      at: currentPosition, limitedBy: end
+    guard let (asciiValue, isCRLF: isCRLF, next) = _quickASCIICharacter(
+      at: currentPosition,
+      limitedBy: end,
+      isScalarSemantics: isScalarSemantics
     ) else {
       return .unknown
     }
diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
index 86365322b..69a32e25c 100644
--- a/Sources/_StringProcessing/Engine/Processor.swift
+++ b/Sources/_StringProcessing/Engine/Processor.swift
@@ -737,11 +737,10 @@ extension String {
     // meanings in some code paths.
     let isInverted = bitset.isInverted
 
-    // TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment 
-    // there
-    guard let (asciiByte, next, isCRLF) = _quickASCIICharacter(
+    guard let (asciiByte, isCRLF: isCRLF, next) = _quickASCIICharacter(
       at: pos,
-      limitedBy: end
+      limitedBy: end,
+      isScalarSemantics: isScalarSemantics
     ) else {
       if isScalarSemantics {
         guard pos < end else { return nil }
@@ -760,13 +759,7 @@ extension String {
     }
 
     // CR-LF should only match `[\r]` in scalar semantic mode or if inverted
-    if isCRLF {
-      if isScalarSemantics {
-        return self.unicodeScalars.index(before: next)
-      }
-      if isInverted {
-        return next
-      }
+    if isCRLF && !isScalarSemantics && !isInverted {
       return nil
     }
 
diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift
index 53dfe652d..2e6eb9b35 100644
--- a/Sources/_StringProcessing/Unicode/ASCII.swift
+++ b/Sources/_StringProcessing/Unicode/ASCII.swift
@@ -81,13 +81,22 @@ extension UInt8 {
 }
 
 extension String {
-  /// TODO: better to take isScalarSemantics parameter, we can return more results
-  /// and we can give the right `next` index, not requiring the caller to re-adjust it
-  /// TODO: detailed description of nuanced semantics
+  ///
+  /// If the position in the input is not definitely a full-ASCII character (using a sub-`0x300` quick
+  /// check on the next byte), returns `nil`.
+  ///
+  /// Otherwise, returns:
+  ///   1. The first ASCII byte for a character (or scalar if `isScalarSemantics`)
+  ///   2. Whether the ASCII character is CR-LF (always `false` if `isScalarSemantics`)
+  ///   3. The index for the end of that particular ASCII character:
+  ///     If the character is not CR-LF, the index of the next byte.
+  ///     If `isScalarSemantics` is false and the ASCII character is CR-LF, the index after the CR-LF sequence.
+  ///
   func _quickASCIICharacter(
     at idx: Index,
-    limitedBy end: Index
-  ) -> (first: UInt8, next: Index, crLF: Bool)? {
+    limitedBy end: Index,
+    isScalarSemantics: Bool
+  ) -> (firstASCIIByte: UInt8, isCRLF: Bool, asciiCharacterEnd: Index)? {
     // TODO: fastUTF8 version
     assert(String.Index(idx, within: unicodeScalars) != nil)
     assert(idx <= end)
@@ -101,25 +110,25 @@ extension String {
       return nil
     }
 
-    var next = utf8.index(after: idx)
-    if next == end {
-      return (first: base, next: next, crLF: false)
+    let byteEnd = utf8.index(after: idx)
+    if isScalarSemantics || byteEnd == end {
+      return (firstASCIIByte: base, isCRLF: false, asciiCharacterEnd: byteEnd)
     }
 
-    let tail = utf8[next]
+    let tail = utf8[byteEnd]
     guard tail._isSub300StartingByte else { return nil }
 
     // Handle CR-LF:
     if base == ._carriageReturn && tail == ._lineFeed {
-      utf8.formIndex(after: &next)
-      guard next == end || utf8[next]._isSub300StartingByte else {
+      let crLFEnd = utf8.index(after: byteEnd)
+      guard crLFEnd == end || utf8[crLFEnd]._isSub300StartingByte else {
         return nil
       }
-      return (first: base, next: next, crLF: true)
+      return (firstASCIIByte: base, isCRLF: true, asciiCharacterEnd: crLFEnd)
     }
 
     assert(self[idx].isASCII && self[idx] != "\r\n")
-    return (first: base, next: next, crLF: false)
+    return (firstASCIIByte: base, isCRLF: false, asciiCharacterEnd: byteEnd)
   }
 
   func _quickMatch(
@@ -128,44 +137,47 @@ extension String {
     limitedBy end: Index,
     isScalarSemantics: Bool
   ) -> (next: Index, matchResult: Bool)? {
+    // Don't use scalar semantics in this quick path for `.anyGrapheme` or
+    // `.newlineSequence`, which are not scalar character classes.
+    let useScalarSemantics = isScalarSemantics && cc != .anyGrapheme && cc != .newlineSequence
     /// ASCII fast-paths
-    guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
-      at: idx, limitedBy: end
+    guard let (asciiValue, isCRLF: isCRLF, charEnd) = _quickASCIICharacter(
+      at: idx,
+      limitedBy: end,
+      isScalarSemantics: useScalarSemantics
     ) else {
       return nil
     }
 
     // TODO: bitvectors
     switch cc {
-    case .any, .anyGrapheme:
-      return (next, true)
+    case .any:
+      return (charEnd, true)
+
+    case .anyGrapheme:
+      // _quickASCIICharacter call handled CR-LF for us
+      _ = isCRLF
+      return (charEnd, true)
 
     case .digit:
-      return (next, asciiValue._asciiIsDigit)
+      return (charEnd, asciiValue._asciiIsDigit)
 
     case .horizontalWhitespace:
-      return (next, asciiValue._asciiIsHorizontalWhitespace)
-
-    case .verticalWhitespace, .newlineSequence:
-      if asciiValue._asciiIsVerticalWhitespace {
-        if isScalarSemantics && isCRLF && cc == .verticalWhitespace {
-          return (utf8.index(before: next), true)
-        }
-        return (next, true)
-      }
-      return (next, false)
+      return (charEnd, asciiValue._asciiIsHorizontalWhitespace)
+
+    case .verticalWhitespace:
+      return (charEnd, asciiValue._asciiIsVerticalWhitespace)
+
+    case .newlineSequence:
+      // _quickASCIICharacter call handled CR-LF for us
+      _ = isCRLF
+      return (charEnd, asciiValue._asciiIsVerticalWhitespace)
 
     case .whitespace:
-      if asciiValue._asciiIsWhitespace {
-        if isScalarSemantics && isCRLF {
-          return (utf8.index(before: next), true)
-        }
-        return (next, true)
-      }
-      return (next, false)
+      return (charEnd, asciiValue._asciiIsWhitespace)
 
     case .word:
-      return (next, asciiValue._asciiIsWord)
+      return (charEnd, asciiValue._asciiIsWord)
     }
   }