Limit custom character class ranges to single scalars #422
Base branch: main
@@ -19,6 +19,13 @@ extension Substring {
   var string: String { String(self) }
 }
 
+extension Character {
+  /// Whether this character is made up of exactly one Unicode scalar value.
+  public var hasExactlyOneScalar: Bool {
+    unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
+  }
+}
+
 extension CustomStringConvertible {
   @_alwaysEmitIntoClient
   public var halfWidthCornerQuoted: String {

Review comment (on hasExactlyOneScalar): Why is this public?

Reply: This was previously in the _StringProcessing module; we need it in _RegexParser for the compile-time validation.
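For context, a quick sketch of what the new property reports on a few characters. This is a standalone snippet assuming the Character extension above; the sample characters are illustrative and not taken from the PR:

let plain: Character = "a"             // one scalar: U+0061
let decomposed: Character = "e\u{301}" // two scalars: "e" + combining acute accent
let flag: Character = "🇰🇷"             // two scalars: regional indicators K and R

plain.hasExactlyOneScalar       // true
decomposed.hasExactlyOneScalar  // false
flag.hasExactlyOneScalar        // false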
@@ -60,24 +60,53 @@ extension DSLTree.Atom {
     _ opts: MatchingOptions
   ) throws -> MEProgram<String>.ConsumeFunction? {
     let isCaseInsensitive = opts.isCaseInsensitive
+    let isCharacterSemantics = opts.semanticLevel == .graphemeCluster
 
     switch self {
     case let .char(c):
       // TODO: Match level?
       return { input, bounds in
-        let low = bounds.lowerBound
+        let nextIndex = isCharacterSemantics
+          ? input.index(after: bounds.lowerBound)
+          : input.unicodeScalars.index(after: bounds.lowerBound)
+
+        var curIdx = bounds.lowerBound
         if isCaseInsensitive && c.isCased {
-          return input[low].lowercased() == c.lowercased()
-            ? input.index(after: low)
-            : nil
+          if isCharacterSemantics {
+            return input[curIdx].lowercased() == c.lowercased()
+              ? nextIndex
+              : nil
+          } else {

Review comment: Remove else

+            // FIXME: How do multi-scalar characters match in case insensitive mode?

Review comment: Do you have an issue or something to clarify this?

+            return input.unicodeScalars[curIdx].properties.lowercaseMapping == c.lowercased()
+              ? nextIndex
+              : nil
+          }
         } else {
-          return input[low] == c
-            ? input.index(after: low)
-            : nil
+          if isCharacterSemantics {
+            return input[curIdx] == c
+              ? nextIndex
+              : nil
+          } else {

Review comment: Remove else

+            // Try to match the sequence of unicodeScalars in `input` and `c`
+            var patternIndex = c.unicodeScalars.startIndex
+            while curIdx < input.endIndex, patternIndex < c.unicodeScalars.endIndex {
+              if input.unicodeScalars[curIdx] != c.unicodeScalars[patternIndex] {
+                return nil
+              }
+              input.unicodeScalars.formIndex(after: &curIdx)
+              c.unicodeScalars.formIndex(after: &patternIndex)
+            }
+
+            // Match succeeded if all scalars in `c.unicodeScalars` matched
+            return patternIndex == c.unicodeScalars.endIndex
+              ? curIdx
+              : nil
+          }
         }
       }
     case let .scalar(s):
-      return consumeScalar {
+      let consume = consumeFunction(for: opts)
+      return consume {
         isCaseInsensitive
           ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
           : $0 == s
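As a standalone illustration of the scalar-sequence matching used in the Unicode-scalar-semantics branch of the .char case above, here is a minimal sketch. The helper name matchScalars and the sample inputs are mine, not part of the PR:

/// Matches every scalar of `c` starting at `start` in `input`'s
/// unicodeScalars view; returns the index just past the match, or nil.
func matchScalars(
  of c: Character, in input: String, from start: String.Index
) -> String.Index? {
  var curIdx = start
  var patternIndex = c.unicodeScalars.startIndex
  while curIdx < input.endIndex, patternIndex < c.unicodeScalars.endIndex {
    guard input.unicodeScalars[curIdx] == c.unicodeScalars[patternIndex] else {
      return nil
    }
    input.unicodeScalars.formIndex(after: &curIdx)
    c.unicodeScalars.formIndex(after: &patternIndex)
  }
  // Succeed only if the whole pattern was consumed.
  return patternIndex == c.unicodeScalars.endIndex ? curIdx : nil
}

let text = "🇰🇷abc"
matchScalars(of: "🇰🇷", in: text, from: text.startIndex) // index after both regional indicator scalars
matchScalars(of: "🇰", in: text, from: text.startIndex)  // index after the first scalar only, mid-grapheme

Under scalar semantics a returned index can land in the middle of a grapheme cluster, which is exactly the behavior the grapheme-semantics branch avoids.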
@@ -247,40 +276,48 @@ extension DSLTree.CustomCharacterClass.Member {
       }
       return c
     case let .range(low, high):
-      // TODO:
-      guard let lhs = low.literalCharacterValue else {
+      guard let lhs = low.literalCharacterValue, lhs.hasExactlyOneScalar else {
         throw Unsupported("\(low) in range")
       }
-      guard let rhs = high.literalCharacterValue else {
+      guard let rhs = high.literalCharacterValue, rhs.hasExactlyOneScalar else {
         throw Unsupported("\(high) in range")
       }
+      guard lhs <= rhs else {
+        throw Unsupported("Invalid range \(low)-\(high)")
+      }
 
-      if opts.isCaseInsensitive {
-        let lhsLower = lhs.lowercased()
-        let rhsLower = rhs.lowercased()
-        guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
-        return { input, bounds in
-          // TODO: check for out of bounds?
-          let curIdx = bounds.lowerBound
-          if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
-            // TODO: semantic level
-            return input.index(after: curIdx)
-          }
-          return nil
-        }
-      } else {
-        guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
-        return { input, bounds in
-          // TODO: check for out of bounds?
-          let curIdx = bounds.lowerBound
-          if (lhs...rhs).contains(input[curIdx]) {
-            // TODO: semantic level
-            return input.index(after: curIdx)
-          }
-          return nil
-        }
-      }
+      let isCaseInsensitive = opts.isCaseInsensitive
+      let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
+
+      return { input, bounds in
+        // TODO: check for out of bounds?
+        let curIdx = bounds.lowerBound
+        let nextIndex = isCharacterSemantic
+          ? input.index(after: curIdx)
+          : input.unicodeScalars.index(after: curIdx)

Review comment (on the nextIndex computation above): Really feel like this should be a helper function or fully refactored. It seems ripe for bugs if vigilance is required to remember to support scalar semantics.

Reply: I'll make it a helper function, but the real issue is that the two views share the same index type, and we can't enforce that you call the helper function. Maybe we could look at designing a wrapper type that would distinguish between the two? Seems like it adds a lot of friction:

struct ViewLockedIndex<View> {
  var index: String.Index
}
extension String {
  func index(after i: ViewLockedIndex<String>) -> ViewLockedIndex<String> {
    .init(index: index(after: i.index))
  }
  func index(after i: ViewLockedIndex<String.UnicodeScalarView>) -> ViewLockedIndex<String.UnicodeScalarView> {
    .init(index: self.unicodeScalars.index(after: i.index))
  }
}

Reply: What I mean is: rather than have a bunch of innermost ifs, what would the code look like with an outer if? That is a more bottom-up refactoring.

+
+        if isCharacterSemantic && !input[curIdx].hasExactlyOneScalar {

Review comment: What does this do to non-NFC characters?

+          return nil
+        }
+        let scalar = input.unicodeScalars[curIdx]
+        let scalarRange = lhs.unicodeScalars.first! ... rhs.unicodeScalars.first!
+        if scalarRange.contains(scalar) {
+          return nextIndex
+        }
+        if !isCaseInsensitive {
+          return nil
+        }
+
+        let stringRange = String(lhs)...String(rhs)
+        if (scalar.properties.changesWhenLowercased
+            && stringRange.contains(scalar.properties.lowercaseMapping))
+          || (scalar.properties.changesWhenUppercased
+              && stringRange.contains(scalar.properties.uppercaseMapping)) {
+          return nextIndex

Review comment: Isn't this going back to lexicographical contains?

+        }
+
+        return nil
+      }
 
     case let .custom(ccc):
       return try ccc.generateConsumer(opts)
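To make the reviewer's concern concrete: ClosedRange<String>.contains compares strings lexicographically, so a case mapping that is longer than one character can fall inside the range even though it isn't "between" the endpoints character-wise. A purely illustrative, ASCII-only check (not from the PR):

let stringRange = "X"..."c"
stringRange.contains("Ya")  // true: "X" < "Ya" < "c" lexicographically
stringRange.contains("Y")   // true
stringRange.contains("d")   // false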
@@ -103,6 +103,11 @@ public struct _CharacterClassModel: Hashable {
         return c == character
       }
     case .range(let range):
+      // Ranges can be formed with single-scalar characters, and can only
+      // match as such.
+      // FIXME: Convert to canonical composed version before testing?
+      guard character.hasExactlyOneScalar else { return false }
+
       if options.isCaseInsensitive {
         let newLower = range.lowerBound.lowercased()
         let newUpper = range.upperBound.lowercased()

Review comment: This is just wrong and breaks canonical equivalence.
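A short illustration of the objection, using the composed and decomposed spellings of "é". The variable names are mine; the test suite uses similar fixtures:

let eComposed: Character = "\u{E9}"     // "é" as a single scalar
let eDecomposed: Character = "e\u{301}" // "e" + U+0301, two scalars

eComposed == eDecomposed          // true: one Character under canonical equivalence
eComposed.unicodeScalars.count    // 1 — passes the guard above
eDecomposed.unicodeScalars.count  // 2 — rejected by the guard above

With the guard in place, two canonically equal inputs get different range-matching results, which is what "breaks canonical equivalence" refers to.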
@@ -673,6 +673,11 @@ extension RegexTests {
     }
     firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
 
+    firstMatchTest(#"[12]"#, input: "1️⃣", match: nil)
+    firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil)
+    firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣")
+    firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil)
+
     // Currently not supported in the matching engine.
     for c: UnicodeScalar in ["a", "b", "c"] {
       firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",

@@ -726,6 +731,33 @@ extension RegexTests {
     firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc",
                    syntax: .experimental)
     firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#)
+
+    // Case sensitivity and ranges.
+    for ch in "abcD" {
+      firstMatchTest("[a-cD]", input: String(ch), match: String(ch))
+    }
+    for ch in "ABCd" {
+      firstMatchTest("[a-cD]", input: String(ch), match: nil)
+    }
+
+    for ch in "abcABCdD" {
+      firstMatchTest("(?i)[a-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?i)[A-CD]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[a-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[A-CD]", input: String(ch), match: String(ch))
+    }
+
+    for ch in "XYZ[\\]^_`abcd" {
+      firstMatchTest("[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?u)[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?u)[X-cd]", input: String(ch), match: String(ch))
+    }
+
+    for ch in "XYZ[\\]^_`abcxyzABCdD" {
+      firstMatchTest("(?i)[X-cd]", input: String(ch), match: String(ch))
+      firstMatchTest("(?iu)[X-cD]", input: String(ch), match: String(ch))
+    }
   }
 
   func testCharacterProperties() {
@@ -1507,31 +1539,28 @@ extension RegexTests {
   }
 
   func testCanonicalEquivalenceCustomCharacterClass() throws {
-    // Expectation: Concatenations with custom character classes should be able
-    // to match within a grapheme cluster. That is, a regex should be able to
-    // match the scalar values that comprise a grapheme cluster in separate,
-    // or repeated, custom character classes.
+    // Expectation: Custom character class matches do not cross grapheme
+    // character boundaries by default. When matching with Unicode scalar
+    // semantics, grapheme cluster boundaries are ignored, so matching
+    // sequences of custom character classes can succeed.
 
     matchTest(
       #"[áéíóú]$"#,
       (eComposed, true),
       (eDecomposed, true))
 
-    // FIXME: Custom char classes don't use canonical equivalence with composed characters
-    firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
-    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
-    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
+    // Unicode scalar semantics
+    firstMatchTest(#"(?u)e[\u{301}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[e-e][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
+    firstMatchTest(#"(?u)[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed)
 
-    // FIXME: Custom char classes don't match decomposed characters
-    firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
-    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
-    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
+    // Grapheme cluster semantics
+    firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: eComposed, match: nil)
+    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: nil)

Review comment: We also need positive tests for grapheme semantics under canonical equivalence.

 
     let flag = "🇰🇷"
     firstMatchTest(#"🇰🇷"#, input: flag, match: flag)
@@ -1540,27 +1569,15 @@
     firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag)
 
     // First Unicode scalar followed by CCC of regional indicators
-    firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag,
-                   xfail: true)
-
-    // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
+    firstMatchTest(#"(?u)^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag)
+    // A CCC of regional indicators followed by the second Unicode scalar
+    firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag)
     // A CCC of regional indicators x 2
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag,
-                   xfail: true)
+    firstMatchTest(#"(?u)^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag)
 
-    // FIXME: A single CCC of regional indicators matches the whole flag character
-    // A CCC of regional indicators followed by the second Unicode scalar
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag,
-                   xfail: true)
-    // A single CCC of regional indicators
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil,
-                   xfail: true)
+    // A single CCC of actual flag emojis / combined regional indicators
+    firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag)
+    // This succeeds (correctly) because \u{1F1F0} is lexicographically
+    // within the CCC range
+    firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}")
+    firstMatchTest(#"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
+    firstMatchTest(#"^(?u)[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
   }
 
   func testAnyChar() throws {
Review comment: Does normalization affect this?

Reply: It does for sure; it would help if we normalized all characters as we stored them in the custom character class model.

Review comment: Which normalization?

Reply: NFC would allow us to permit as many characters as possible to act as endpoints.
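For reference, a minimal sketch of NFC-normalizing a string in Swift via Foundation's precomposedStringWithCanonicalMapping, shown only to illustrate what normalizing endpoints and inputs would involve; the PR itself does not do this:

import Foundation

let decomposed = "e\u{301}"                                 // two scalars
let nfc = decomposed.precomposedStringWithCanonicalMapping  // "é", one scalar

decomposed.unicodeScalars.count  // 2
nfc.unicodeScalars.count         // 1
nfc == decomposed                // true: String equality is canonical in Swift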