Limit custom character class ranges to single scalars #422
Base branch: main
@@ -19,6 +19,13 @@ extension Substring {
   var string: String { String(self) }
 }
 
+extension Character {
+  /// Whether this character is made up of exactly one Unicode scalar value.
+  public var hasExactlyOneScalar: Bool {
+    unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
+  }
+}
+
 extension CustomStringConvertible {
   @_alwaysEmitIntoClient
   public var halfWidthCornerQuoted: String {

Review comment (on hasExactlyOneScalar): Why is this public?

Reply: This was previously in the _StringProcessing module; we need it in _RegexParser for the compile-time validation.
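For context, a quick sketch of what the new property reports on a few characters. This is a standalone snippet assuming the Character extension above; the sample characters are illustrative and not taken from the PR:

let plain: Character = "a"             // one scalar: U+0061
let decomposed: Character = "e\u{301}" // two scalars: "e" + combining acute accent
let flag: Character = "🇰🇷"             // two scalars: regional indicators K and R

plain.hasExactlyOneScalar       // true
decomposed.hasExactlyOneScalar  // false
flag.hasExactlyOneScalar        // false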
@@ -60,24 +60,53 @@ extension DSLTree.Atom {
     _ opts: MatchingOptions
   ) throws -> MEProgram<String>.ConsumeFunction? {
     let isCaseInsensitive = opts.isCaseInsensitive
+    let isCharacterSemantics = opts.semanticLevel == .graphemeCluster
 
     switch self {
     case let .char(c):
       // TODO: Match level?
       return { input, bounds in
-        let low = bounds.lowerBound
+        let nextIndex = isCharacterSemantics
+          ? input.index(after: bounds.lowerBound)
+          : input.unicodeScalars.index(after: bounds.lowerBound)
+
+        var curIdx = bounds.lowerBound
         if isCaseInsensitive && c.isCased {
-          return input[low].lowercased() == c.lowercased()
-            ? input.index(after: low)
-            : nil
+          if isCharacterSemantics {
+            return input[curIdx].lowercased() == c.lowercased()
+              ? nextIndex
+              : nil
+          } else {

Review comment: Remove else

+            // FIXME: How do multi-scalar characters match in case insensitive mode?

Review comment: Do you have an issue or something to clarify this?

+            return input.unicodeScalars[curIdx].properties.lowercaseMapping == c.lowercased()
+              ? nextIndex
+              : nil
+          }
         } else {
-          return input[low] == c
-            ? input.index(after: low)
-            : nil
+          if isCharacterSemantics {
+            return input[curIdx] == c
+              ? nextIndex
+              : nil
+          } else {

Review comment: Remove else

+            // Try to match the sequence of unicodeScalars in `input` and `c`
+            var patternIndex = c.unicodeScalars.startIndex
+            while curIdx < input.endIndex, patternIndex < c.unicodeScalars.endIndex {
+              if input.unicodeScalars[curIdx] != c.unicodeScalars[patternIndex] {
+                return nil
+              }
+              input.unicodeScalars.formIndex(after: &curIdx)
+              c.unicodeScalars.formIndex(after: &patternIndex)
+            }
+
+            // Match succeeded if all scalars in `c.unicodeScalars` matched
+            return patternIndex == c.unicodeScalars.endIndex
+              ? curIdx
+              : nil
+          }
         }
       }
     case let .scalar(s):
-      return consumeScalar {
+      let consume = consumeFunction(for: opts)
+      return consume {
         isCaseInsensitive
           ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
           : $0 == s
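As a standalone illustration of the scalar-sequence matching used in the Unicode-scalar-semantics branch of the .char case above, here is a minimal sketch. The helper name matchScalars and the sample inputs are mine, not part of the PR:

/// Matches every scalar of `c` starting at `start` in `input`'s
/// unicodeScalars view; returns the index just past the match, or nil.
func matchScalars(
  of c: Character, in input: String, from start: String.Index
) -> String.Index? {
  var curIdx = start
  var patternIndex = c.unicodeScalars.startIndex
  while curIdx < input.endIndex, patternIndex < c.unicodeScalars.endIndex {
    guard input.unicodeScalars[curIdx] == c.unicodeScalars[patternIndex] else {
      return nil
    }
    input.unicodeScalars.formIndex(after: &curIdx)
    c.unicodeScalars.formIndex(after: &patternIndex)
  }
  // Succeed only if the whole pattern was consumed.
  return patternIndex == c.unicodeScalars.endIndex ? curIdx : nil
}

let text = "🇰🇷abc"
matchScalars(of: "🇰🇷", in: text, from: text.startIndex) // index after both regional indicator scalars
matchScalars(of: "🇰", in: text, from: text.startIndex)  // index after the first scalar only, mid-grapheme

Under scalar semantics a returned index can land in the middle of a grapheme cluster, which is exactly the behavior the grapheme-semantics branch avoids.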
@@ -247,40 +276,48 @@ extension DSLTree.CustomCharacterClass.Member {
       }
       return c
     case let .range(low, high):
-      // TODO:
-      guard let lhs = low.literalCharacterValue else {
+      guard let lhs = low.literalCharacterValue, lhs.hasExactlyOneScalar else {
         throw Unsupported("\(low) in range")
       }
-      guard let rhs = high.literalCharacterValue else {
+      guard let rhs = high.literalCharacterValue, rhs.hasExactlyOneScalar else {
         throw Unsupported("\(high) in range")
       }
+      guard lhs <= rhs else {
+        throw Unsupported("Invalid range \(low)-\(high)")
+      }
 
-      if opts.isCaseInsensitive {
-        let lhsLower = lhs.lowercased()
-        let rhsLower = rhs.lowercased()
-        guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
-        return { input, bounds in
-          // TODO: check for out of bounds?
-          let curIdx = bounds.lowerBound
-          if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
-            // TODO: semantic level
-            return input.index(after: curIdx)
-          }
-          return nil
-        }
-      } else {
-        guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
-        return { input, bounds in
-          // TODO: check for out of bounds?
-          let curIdx = bounds.lowerBound
-          if (lhs...rhs).contains(input[curIdx]) {
-            // TODO: semantic level
-            return input.index(after: curIdx)
-          }
-          return nil
-        }
-      }
+      let isCaseInsensitive = opts.isCaseInsensitive
+      let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
+
+      return { input, bounds in
+        // TODO: check for out of bounds?
+        let curIdx = bounds.lowerBound
+        let nextIndex = isCharacterSemantic
+          ? input.index(after: curIdx)
+          : input.unicodeScalars.index(after: curIdx)

Review comment (on the nextIndex computation above): Really feel like this should be a helper function or fully refactored. It seems ripe for bugs if vigilance is required to remember to support scalar semantics.

Reply: I'll make it a helper function, but the real issue is that the two views share the same index type, and we can't enforce that you call the helper function. Maybe we could look at designing a wrapper type that would distinguish between the two? Seems like it adds a lot of friction:

struct ViewLockedIndex<View> {
  var index: String.Index
}
extension String {
  func index(after i: ViewLockedIndex<String>) -> ViewLockedIndex<String> {
    .init(index: index(after: i.index))
  }
  func index(after i: ViewLockedIndex<String.UnicodeScalarView>) -> ViewLockedIndex<String.UnicodeScalarView> {
    .init(index: self.unicodeScalars.index(after: i.index))
  }
}

Reply: What I mean is: rather than have a bunch of innermost ifs, what would the code look like with an outer if? That is a more bottom-up refactoring.

+
+        if isCharacterSemantic && !input[curIdx].hasExactlyOneScalar {

Review comment: What does this do to non-NFC characters?

+          return nil
+        }
+        let scalar = input.unicodeScalars[curIdx]
+        let scalarRange = lhs.unicodeScalars.first! ... rhs.unicodeScalars.first!
+        if scalarRange.contains(scalar) {
+          return nextIndex
+        }
+        if !isCaseInsensitive {
+          return nil
+        }
+
+        let stringRange = String(lhs)...String(rhs)
+        if (scalar.properties.changesWhenLowercased
+            && stringRange.contains(scalar.properties.lowercaseMapping))
+          || (scalar.properties.changesWhenUppercased
+              && stringRange.contains(scalar.properties.uppercaseMapping)) {
+          return nextIndex

Review comment: Isn't this going back to lexicographical contains?

+        }
+
+        return nil
+      }
 
     case let .custom(ccc):
       return try ccc.generateConsumer(opts)
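To make the reviewer's concern concrete: ClosedRange<String>.contains compares strings lexicographically, so a case mapping that is longer than one character can fall inside the range even though it isn't "between" the endpoints character-wise. A purely illustrative, ASCII-only check (not from the PR):

let stringRange = "X"..."c"
stringRange.contains("Ya")  // true: "X" < "Ya" < "c" lexicographically
stringRange.contains("Y")   // true
stringRange.contains("d")   // false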
@@ -103,6 +103,11 @@ public struct _CharacterClassModel: Hashable {
         return c == character
       }
     case .range(let range):
+      // Ranges can be formed with single-scalar characters, and can only
+      // match as such.
+      // FIXME: Convert to canonical composed version before testing?
+      guard character.hasExactlyOneScalar else { return false }
+
       if options.isCaseInsensitive {
         let newLower = range.lowerBound.lowercased()
         let newUpper = range.upperBound.lowercased()

Review comment: This is just wrong and breaks canonical equivalence.
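A short illustration of the objection, using the composed and decomposed spellings of "é". The variable names are mine; the test suite uses similar fixtures:

let eComposed: Character = "\u{E9}"     // "é" as a single scalar
let eDecomposed: Character = "e\u{301}" // "e" + U+0301, two scalars

eComposed == eDecomposed          // true: one Character under canonical equivalence
eComposed.unicodeScalars.count    // 1 — passes the guard above
eDecomposed.unicodeScalars.count  // 2 — rejected by the guard above

With the guard in place, two canonically equal inputs get different range-matching results, which is what "breaks canonical equivalence" refers to.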
@@ -673,6 +673,11 @@ extension RegexTests {
     }
     firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
 
+    firstMatchTest(#"[12]"#, input: "1️⃣", match: nil)
+    firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil)
+    firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣")
+    firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil)
+
     // Currently not supported in the matching engine.
     for c: UnicodeScalar in ["a", "b", "c"] {
       firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",

@@ -726,6 +731,33 @@ extension RegexTests {
     firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc",
                    syntax: .experimental)
     firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#)
+
+    // Case sensitivity and ranges.
+    for ch in "abcD" {
+      firstMatchTest("[a-cD]", input: String(ch), match: String(ch))
+    }
+    for ch in "ABCd" {
+      firstMatchTest("[a-cD]", input: String(ch), match: nil)
+    }
+
+    for ch in "abcABCdD" {
+      firstMatchTest("(?i)[a-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?i)[A-CD]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[a-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[A-CD]", input: String(ch), match: String(ch))
+    }
+
+    for ch in "XYZ[\\]^_`abcd" {
+      firstMatchTest("[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?u)[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?u)[X-cd]", input: String(ch), match: String(ch))
+    }
+
+    for ch in "XYZ[\\]^_`abcxyzABCdD" {
+      firstMatchTest("(?i)[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[X-cD]", input: String(ch), match: String(ch))
+    }
   }
 
   func testCharacterProperties() {
@@ -1507,31 +1539,28 @@ extension RegexTests {
   }
 
   func testCanonicalEquivalenceCustomCharacterClass() throws {
-    // Expectation: Concatenations with custom character classes should be able
-    // to match within a grapheme cluster. That is, a regex should be able to
-    // match the scalar values that comprise a grapheme cluster in separate,
-    // or repeated, custom character classes.
+    // Expectation: Custom character class matches do not cross grapheme
+    // character boundaries by default. When matching with Unicode scalar
+    // semantics, grapheme cluster boundaries are ignored, so matching
+    // sequences of custom character classes can succeed.
 
     matchTest(
       #"[áéíóú]$"#,
       (eComposed, true),
       (eDecomposed, true))
 
-    // FIXME: Custom char classes don't use canonical equivalence with composed characters
-    firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
-    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
-    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
+    // Unicode scalar semantics
+    firstMatchTest(#"(?u)e[\u{301}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[e-e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
 
-    // FIXME: Custom char classes don't match decomposed characters
-    firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
-    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
-    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
+    // Grapheme cluster semantics
+    firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: nil)

Review comment: We also need positive tests for grapheme semantics under canonical equivalence.

 
     let flag = "🇰🇷"
     firstMatchTest(#"🇰🇷"#, input: flag, match: flag)
@@ -1540,27 +1569,15 @@
     firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag)
 
     // First Unicode scalar followed by CCC of regional indicators
-    firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag,
-                   xfail: true)
-
-    // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
+    firstMatchTest(#"(?u)^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag)
+    // A CCC of regional indicators followed by the second Unicode scalar
+    firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag)
     // A CCC of regional indicators x 2
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag,
-                   xfail: true)
+    firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag)
 
-    // FIXME: A single CCC of regional indicators matches the whole flag character
-    // A CCC of regional indicators followed by the second Unicode scalar
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag,
-                   xfail: true)
-    // A single CCC of regional indicators
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil,
-                   xfail: true)
+    // A single CCC of actual flag emojis / combined regional indicators
+    firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag)
+    // This succeeds (correctly) because \u{1F1F0} is lexicographically
+    // within the CCC range
+    firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}")
+    firstMatchTest(#"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
+    firstMatchTest(#"^(?u)[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
   }
 
   func testAnyChar() throws {
Review comment: Does normalization affect this?

Reply: It does for sure; it would help if we normalized all characters as we stored them in the custom character class model.

Review comment: Which normalization?

Reply: NFC would allow us to permit as many characters as possible to act as endpoints.
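For reference, a minimal sketch of NFC-normalizing a string in Swift via Foundation's precomposedStringWithCanonicalMapping, shown only to illustrate what normalizing endpoints and inputs would involve; the PR itself does not do this:

import Foundation

let decomposed = "e\u{301}"                                 // two scalars
let nfc = decomposed.precomposedStringWithCanonicalMapping  // "é", one scalar

decomposed.unicodeScalars.count  // 2
nfc.unicodeScalars.count         // 1
nfc == decomposed                // true: String equality is canonical in Swift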