diff --git a/Documentation/ProgrammersManual.md b/Documentation/ProgrammersManual.md new file mode 100644 index 000000000..67021f44c --- /dev/null +++ b/Documentation/ProgrammersManual.md @@ -0,0 +1,30 @@ +# Programmer's Manual + +## Programming patterns + +### Engine quick checks and fast paths + +In the engine nomenclature, a quick check results in a yes/no/maybe answer, while a thorough check always results in a definite answer. + +The nature of quick checks and fast paths is that they bifurcate testing coverage. One easy way to prevent this in simple cases is to assert that a definite quick result matches the thorough result. + +One example of this pattern is matching against a builtin character class. The engine has a `_matchBuiltinCC` function that is structured as follows: + +```swift + func _matchBuiltinCC(...) -> Input.Index? { + // Calls _quickMatchBuiltinCC; if that gives a definite result, + // asserts that it is the same as the result of + // _thoroughMatchBuiltinCC and returns it. Otherwise returns the + // result of _thoroughMatchBuiltinCC. + } + + @inline(__always) + func _quickMatchBuiltinCC(...) -> QuickResult<Input.Index?> + + @inline(never) + func _thoroughMatchBuiltinCC(...) -> Input.Index? +``` + +The thorough check is never inlined, as it is a lot of cold code. Note that quick and thorough functions should be pure; that is, they shouldn't update processor state. 
+ + diff --git a/Package.swift b/Package.swift index 5d45950db..f02ef1828 100644 --- a/Package.swift +++ b/Package.swift @@ -7,7 +7,7 @@ let availabilityDefinition = PackageDescription.SwiftSetting.unsafeFlags([ "-Xfrontend", "-define-availability", "-Xfrontend", - "SwiftStdlib 5.7:macOS 9999, iOS 9999, watchOS 9999, tvOS 9999", + "SwiftStdlib 5.7:macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0", "-Xfrontend", "-define-availability", "-Xfrontend", diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift index 1a62858c1..641c03224 100644 --- a/Sources/RegexBenchmark/BenchmarkRunner.swift +++ b/Sources/RegexBenchmark/BenchmarkRunner.swift @@ -1,6 +1,9 @@ import Foundation @_spi(RegexBenchmark) import _StringProcessing +/// The number of times to re-run the benchmark if results are too varying +private var rerunCount: Int { 3 } + struct BenchmarkRunner { let suiteName: String var suite: [any RegexBenchmark] = [] @@ -82,11 +85,16 @@ struct BenchmarkRunner { for b in suite { var result = measure(benchmark: b, samples: samples) if result.runtimeIsTooVariant { - print("Warning: Standard deviation > \(Stats.maxAllowedStdev*100)% for \(b.name)") - print(result.runtime) - print("Rerunning \(b.name)") - result = measure(benchmark: b, samples: result.runtime.samples*2) - print(result.runtime) + for _ in 0..<rerunCount { + print("Warning: Standard deviation > 
\(Stats.maxAllowedStdev*100)% for \(b.name)") + print(result.runtime) + print("Rerunning \(b.name)") + result = measure(benchmark: b, samples: result.runtime.samples*2) + print(result.runtime) + if !result.runtimeIsTooVariant { + break + } + } if result.runtimeIsTooVariant { fatalError("Benchmark \(b.name) is too variant") } diff --git a/Sources/TestSupport/TestSupport.swift b/Sources/TestSupport/TestSupport.swift index b60adb63f..b562f9255 100644 --- a/Sources/TestSupport/TestSupport.swift +++ b/Sources/TestSupport/TestSupport.swift @@ -15,7 +15,7 @@ import XCTest // *without* `-disable-availability-checking` to ensure the #available check is // not compiled into a no-op. -#if os(Linux) +#if os(Linux) || os(Android) public func XCTExpectFailure( _ message: String? = nil, body: () throws -> Void ) rethrows {} diff --git a/Sources/VariadicsGenerator/VariadicsGenerator.swift b/Sources/VariadicsGenerator/VariadicsGenerator.swift index 606402393..91f1682dd 100644 --- a/Sources/VariadicsGenerator/VariadicsGenerator.swift +++ b/Sources/VariadicsGenerator/VariadicsGenerator.swift @@ -14,7 +14,7 @@ import ArgumentParser #if os(macOS) import Darwin -#elseif os(Linux) +#elseif canImport(Glibc) import Glibc #elseif os(Windows) import CRT diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 15e052901..d4c91bd63 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -702,9 +702,6 @@ fileprivate extension Compiler.ByteCodeGen { case .characterClass(let cc): // Custom character class that consumes a single grapheme let model = cc.asRuntimeModel(options) - guard model.consumesSingleGrapheme else { - return false - } builder.buildQuantify( model: model, kind, diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt index ba3e2e03c..ef4aeb6ef 100644 --- a/Sources/_StringProcessing/CMakeLists.txt +++ 
b/Sources/_StringProcessing/CMakeLists.txt @@ -47,6 +47,7 @@ add_library(_StringProcessing Regex/DSLTree.swift Regex/Match.swift Regex/Options.swift + Unicode/ASCII.swift Unicode/CaseConversion.swift Unicode/CharacterProps.swift Unicode/Comparison.swift diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 36a6043fe..a3d864165 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -9,17 +9,18 @@ extension Character { } extension Processor { - mutating func matchBuiltin( + mutating func matchBuiltinCC( _ cc: _CharacterClassModel.Representation, - _ isInverted: Bool, - _ isStrictASCII: Bool, - _ isScalarSemantics: Bool + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool ) -> Bool { - guard let next = _doMatchBuiltin( + guard let next = input._matchBuiltinCC( cc, - isInverted, - isStrictASCII, - isScalarSemantics + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics ) else { signalFailure() return false @@ -27,96 +28,7 @@ extension Processor { currentPosition = next return true } - - func _doMatchBuiltin( - _ cc: _CharacterClassModel.Representation, - _ isInverted: Bool, - _ isStrictASCII: Bool, - _ isScalarSemantics: Bool - ) -> Input.Index? 
{ - guard let char = load(), let scalar = loadScalar() else { - return nil - } - let asciiCheck = (char.isASCII && !isScalarSemantics) - || (scalar.isASCII && isScalarSemantics) - || !isStrictASCII - - var matched: Bool - var next: Input.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = input.index(after: currentPosition) - case (_, .anyScalar): - next = input.unicodeScalars.index(after: currentPosition) - case (true, _): - next = input.unicodeScalars.index(after: currentPosition) - case (false, _): - next = input.index(after: currentPosition) - } - - switch cc { - case .any, .anyGrapheme: - matched = true - case .anyScalar: - if isScalarSemantics { - matched = true - } else { - matched = input.isOnGraphemeClusterBoundary(next) - } - case .digit: - if isScalarSemantics { - matched = scalar.properties.numericType != nil && asciiCheck - } else { - matched = char.isNumber && asciiCheck - } - case .horizontalWhitespace: - if isScalarSemantics { - matched = scalar.isHorizontalWhitespace && asciiCheck - } else { - matched = char._isHorizontalWhitespace && asciiCheck - } - case .verticalWhitespace: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - } else { - matched = char._isNewline && asciiCheck - } - case .newlineSequence: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - if matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar semantics - input.unicodeScalars.formIndex(after: &next) - } - } else { - matched = char._isNewline && asciiCheck - } - case .whitespace: - if isScalarSemantics { - matched = scalar.properties.isWhitespace && asciiCheck - } else { - matched = char.isWhitespace && asciiCheck - } - case .word: - if isScalarSemantics { - matched = scalar.properties.isAlphabetic && asciiCheck - } else { - matched = char.isWordCharacter && asciiCheck - } - } - - if isInverted { - matched.toggle() - } - - guard 
matched else { - return nil - } - return next - } - func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { if currentPosition == subjectBounds.lowerBound { return true } switch payload.semanticLevel { @@ -126,7 +38,7 @@ extension Processor { return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline } } - + func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { if currentPosition == subjectBounds.upperBound { return true } switch payload.semanticLevel { @@ -169,7 +81,7 @@ extension Processor { return isAtStartOfLine(payload) case .endOfLine: return isAtEndOfLine(payload) - + case .caretAnchor: if payload.anchorsMatchNewlines { return isAtStartOfLine(payload) @@ -202,3 +114,144 @@ extension Processor { } } } + +// MARK: Built-in character class matching + +extension String { + + // Mentioned in ProgrammersManual.md, update docs if redesigned + func _matchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? 
{ + guard currentPosition < endIndex else { + return nil + } + if case .definite(let result) = _quickMatchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughMatchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughMatchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } + + // Mentioned in ProgrammersManual.md, update docs if redesigned + @inline(__always) + func _quickMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> QuickResult<String.Index?> { + assert(currentPosition < endIndex) + guard let (next, result) = _quickMatch( + cc, at: currentPosition, isScalarSemantics: isScalarSemantics + ) else { + return .unknown + } + return .definite(result == isInverted ? nil : next) + } + + // Mentioned in ProgrammersManual.md, update docs if redesigned + @inline(never) + func _thoroughMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? 
{ + assert(currentPosition < endIndex) + let char = self[currentPosition] + let scalar = unicodeScalars[currentPosition] + + let asciiCheck = !isStrictASCII + || (scalar.isASCII && isScalarSemantics) + || char.isASCII + + var matched: Bool + var next: String.Index + switch (isScalarSemantics, cc) { + case (_, .anyGrapheme): + next = index(after: currentPosition) + case (true, _): + next = unicodeScalars.index(after: currentPosition) + case (false, _): + next = index(after: currentPosition) + } + + switch cc { + case .any, .anyGrapheme: + matched = true + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil && asciiCheck + } else { + matched = char.isNumber && asciiCheck + } + case .horizontalWhitespace: + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } + case .verticalWhitespace: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } + case .newlineSequence: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && next != endIndex && unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics + unicodeScalars.formIndex(after: &next) + } + } else { + matched = char._isNewline && asciiCheck + } + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck + } + case .word: + if isScalarSemantics { + matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck + } + } + + if isInverted { + matched.toggle() + } + + guard matched else { + return nil + } + return next + } +} diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 9d17dc9bd..fa68b8b76 100644 --- 
a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -9,11 +9,12 @@ extension Processor { UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) case .builtin: // We only emit .quantify if it consumes a single character - next = _doMatchBuiltin( + next = input._matchBuiltinCC( payload.builtin, - payload.builtinIsInverted, - payload.builtinIsStrict, - false) + at: currentPosition, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: false) case .any: let matched = currentPosition != input.endIndex && (!input[currentPosition].isNewline || payload.anyMatchesNewline) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 18e355fb5..25cad5c8c 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -583,11 +583,11 @@ extension Processor { case .matchBuiltin: let payload = payload.characterClassPayload - if matchBuiltin( + if matchBuiltinCC( payload.cc, - payload.isInverted, - payload.isStrictASCII, - payload.isScalarSemantics + isInverted: payload.isInverted, + isStrictASCII: payload.isStrictASCII, + isScalarSemantics: payload.isScalarSemantics ) { controller.step() } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 3d2d38842..08c40157a 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -760,8 +760,6 @@ extension DSLTree.Atom.CharacterClass { switch self { case .anyGrapheme: return ".anyGraphemeCluster" - case .anyUnicodeScalar: - return ".anyUnicodeScalar" case .digit: return ".digit" case .notDigit: @@ -786,6 +784,8 @@ extension DSLTree.Atom.CharacterClass { return ".whitespace" case .notWhitespace: return ".whitespace.inverted" + case .anyUnicodeScalar: + fatalError("Unsupported") } } } diff --git 
a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index f5b08dd6d..6af0924cb 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -183,7 +183,6 @@ extension AST.Atom.EscapedBuiltin { case .wordCharacter: return .word case .notWordCharacter: return .notWord case .graphemeCluster: return .anyGrapheme - case .trueAnychar: return .anyUnicodeScalar default: return nil } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 93e86c607..b784e2382 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -260,7 +260,6 @@ extension DSLTree.Atom.CharacterClass { public var inverted: DSLTree.Atom.CharacterClass? { switch self { case .anyGrapheme: return nil - case .anyUnicodeScalar: return nil case .digit: return .notDigit case .notDigit: return .digit case .word: return .notWord @@ -273,6 +272,8 @@ extension DSLTree.Atom.CharacterClass { case .notVerticalWhitespace: return .verticalWhitespace case .whitespace: return .notWhitespace case .notWhitespace: return .whitespace + case .anyUnicodeScalar: + fatalError("Unsupported") } } } diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift new file mode 100644 index 000000000..5150e18cc --- /dev/null +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -0,0 +1,169 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2023 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +private var _lineFeed: UInt8 { 0x0A } +private var _carriageReturn: UInt8 { 0x0D } +private var _lineTab: UInt8 { 0x0B } +private var _formFeed: UInt8 { 0x0C } +private var _space: UInt8 { 0x20 } +private var _tab: UInt8 { 0x09 } + +private var _0: UInt8 { 0x30 } +private var _9: UInt8 { 0x39 } +private func _isASCIINumber(_ x: UInt8) -> Bool { + return (_0..._9).contains(x) +} + +private var _a: UInt8 { 0x61 } +private var _z: UInt8 { 0x7A } +private var _A: UInt8 { 0x41 } +private var _Z: UInt8 { 0x5A } + +private var _underscore: UInt8 { 0x5F } + +extension UInt8 { + var _isASCII: Bool { self < 0x80 } + + // TODO: Bitvectors for the below + + /// Assuming we're ASCII, whether we match `\d` + var _asciiIsDigit: Bool { + assert(_isASCII) + return(_0..._9).contains(self) + } + + /// Assuming we're ASCII, whether we match `\h` + var _asciiIsHorizontalWhitespace: Bool { + assert(_isASCII) + return self == _space || self == _tab + } + + /// Assuming we're ASCII, whether we match `\v` + var _asciiIsVerticalWhitespace: Bool { + assert(_isASCII) + switch self { + case _lineFeed, _carriageReturn, _lineTab, _formFeed: + return true + default: + return false + } + } + + /// Assuming we're ASCII, whether we match `\s` + var _asciiIsWhitespace: Bool { + assert(_isASCII) + switch self { + case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: + return true + default: + return false + } + } + + /// Assuming we're ASCII, whether we match `[a-zA-Z]` + var _asciiIsLetter: Bool { + assert(_isASCII) + return (_a..._z).contains(self) || (_A..._Z).contains(self) + } + + /// Assuming we're ASCII, whether we match `\w` + var _asciiIsWord: Bool { + assert(_isASCII) + return _asciiIsDigit || _asciiIsLetter || self == 
_underscore + } +} + +extension String { + /// TODO: detailed description of nuanced semantics + func _quickASCIICharacter( + at idx: Index + ) -> (first: UInt8, next: Index, crLF: Bool)? { + // TODO: fastUTF8 version + + if idx == endIndex { + return nil + } + let base = utf8[idx] + guard base._isASCII else { + assert(!self[idx].isASCII) + return nil + } + + var next = utf8.index(after: idx) + if next == utf8.endIndex { + assert(self[idx].isASCII) + return (first: base, next: next, crLF: false) + } + + let tail = utf8[next] + guard tail._isSub300StartingByte else { return nil } + + // Handle CR-LF: + if base == _carriageReturn && tail == _lineFeed { + utf8.formIndex(after: &next) + guard next == endIndex || utf8[next]._isSub300StartingByte else { + return nil + } + assert(self[idx] == "\r\n") + return (first: base, next: next, crLF: true) + } + + assert(self[idx].isASCII && self[idx] != "\r\n") + return (first: base, next: next, crLF: false) + } + + /// Attempts to match `cc` at `idx` using only the ASCII fast path. + /// + /// Returns `nil` when the fast path cannot decide; otherwise returns the + /// index after the consumed input and whether `cc` (before inversion) matched. + func _quickMatch( + _ cc: _CharacterClassModel.Representation, + at idx: Index, + isScalarSemantics: Bool + ) -> (next: Index, matchResult: Bool)? 
{ + // ASCII fast path + guard let (asciiValue, graphemeEnd, isCRLF) = _quickASCIICharacter( + at: idx + ) else { + return nil + } + + // Under scalar semantics only `.newlineSequence` and `.anyGrapheme` consume + // CR-LF as a unit; every other class advances past the CR alone. Doing this + // up front keeps `next` in agreement with `_thoroughMatchBuiltinCC` for + // `.any` and for inverted classes, where a non-match still returns `next`. + let splitsCRLF = isScalarSemantics && isCRLF + && cc != .newlineSequence && cc != .anyGrapheme + let next = splitsCRLF ? utf8.index(before: graphemeEnd) : graphemeEnd + + // TODO: bitvectors + switch cc { + case .any, .anyGrapheme: + return (next, true) + + case .digit: + return (next, asciiValue._asciiIsDigit) + + case .horizontalWhitespace: + return (next, asciiValue._asciiIsHorizontalWhitespace) + + case .verticalWhitespace, .newlineSequence: + return (next, asciiValue._asciiIsVerticalWhitespace) + + case .whitespace: + return (next, asciiValue._asciiIsWhitespace) + + case .word: + return (next, asciiValue._asciiIsWord) + } + } +} + diff --git a/Sources/_StringProcessing/Unicode/NFC.swift b/Sources/_StringProcessing/Unicode/NFC.swift index 5c2c4aa48..59d195bb6 100644 --- a/Sources/_StringProcessing/Unicode/NFC.swift +++ b/Sources/_StringProcessing/Unicode/NFC.swift @@ -12,6 +12,11 @@ @_spi(_Unicode) import Swift +extension UInt8 { + /// Whether this is the starting byte of a sub-300 (i.e. pre-combining) scalar + var _isSub300StartingByte: Bool { self < 0xCC } +} + extension UnicodeScalar { /// Checks whether the scalar is in NFC form. var isNFC: Bool { Character(self).singleNFCScalar == self } diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift index 8a9cbe325..8555ec85c 100644 --- a/Sources/_StringProcessing/Utility/Misc.swift +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -57,3 +57,11 @@ extension Array { with: initialAccumulator, into: { [finish($0) ]}, accumulate: accumulate) } } + +/// The result of a quick-check function: either a definite answer or unknown. +/// We use a separate type because often the answer itself is optional. 
+enum QuickResult<R> { + case definite(_ r: R) + case unknown +} + diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c5f1f8ecd..cdee66ddb 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -18,10 +18,10 @@ struct _CharacterClassModel: Hashable { /// The actual character class to match. let cc: Representation - + /// The level (character or Unicode scalar) at which to match. let matchLevel: MatchingOptions.SemanticLevel - + /// If this character character class only matches ascii characters let isStrictASCII: Bool @@ -45,8 +45,6 @@ struct _CharacterClassModel: Hashable { case any = 0 /// Any grapheme cluster case anyGrapheme - /// Any Unicode scalar - case anyScalar /// Character.isDigit case digit /// Horizontal whitespace: `[:blank:]`, i.e @@ -61,7 +59,7 @@ struct _CharacterClassModel: Hashable { /// Character.isLetter or Character.isDigit or Character == "_" case word } - + /// Returns the end of the match of this character class in the string. /// /// - Parameter str: The string to match against. @@ -78,90 +76,15 @@ struct _CharacterClassModel: Hashable { guard currentPosition != input.endIndex else { return nil } - let char = input[currentPosition] - let scalar = input.unicodeScalars[currentPosition] - let isScalarSemantics = matchLevel == .unicodeScalar - let asciiCheck = (char.isASCII && !isScalarSemantics) - || (scalar.isASCII && isScalarSemantics) - || !isStrictASCII - - var matched: Bool - var next: String.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = input.index(after: currentPosition) - case (_, .anyScalar): - // FIXME: This allows us to be not-scalar aligned when in grapheme mode - // Should this even be allowed? 
- next = input.unicodeScalars.index(after: currentPosition) - case (true, _): - next = input.unicodeScalars.index(after: currentPosition) - case (false, _): - next = input.index(after: currentPosition) - } - switch cc { - case .any, .anyGrapheme, .anyScalar: - matched = true - case .digit: - if isScalarSemantics { - matched = scalar.properties.numericType != nil && asciiCheck - } else { - matched = char.isNumber && asciiCheck - } - case .horizontalWhitespace: - if isScalarSemantics { - matched = scalar.isHorizontalWhitespace && asciiCheck - } else { - matched = char._isHorizontalWhitespace && asciiCheck - } - case .verticalWhitespace: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - } else { - matched = char._isNewline && asciiCheck - } - case .newlineSequence: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - if matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar sematnics - input.unicodeScalars.formIndex(after: &next) - } - } else { - matched = char._isNewline && asciiCheck - } - case .whitespace: - if isScalarSemantics { - matched = scalar.properties.isWhitespace && asciiCheck - } else { - matched = char.isWhitespace && asciiCheck - } - case .word: - if isScalarSemantics { - matched = scalar.properties.isAlphabetic && asciiCheck - } else { - matched = char.isWordCharacter && asciiCheck - } - } - if isInverted { - matched.toggle() - } - if matched { - return next - } else { - return nil - } - } -} + let isScalarSemantics = matchLevel == .unicodeScalar -extension _CharacterClassModel { - var consumesSingleGrapheme: Bool { - switch self.cc { - case .anyScalar: return false - default: return true - } + return input._matchBuiltinCC( + cc, + at: currentPosition, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) } } @@ -185,7 +108,6 @@ extension _CharacterClassModel.Representation: 
CustomStringConvertible { switch self { case .any: return "" case .anyGrapheme: return "" - case .anyScalar: return "" case .digit: return "" case .horizontalWhitespace: return "" case .newlineSequence: return "" @@ -251,8 +173,10 @@ extension DSLTree.Atom.CharacterClass { case .anyGrapheme: cc = .anyGrapheme case .anyUnicodeScalar: - cc = .anyScalar + fatalError("Unsupported") } return _CharacterClassModel(cc: cc, options: options, isInverted: inverted) } } + + diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 0e529368a..5e85ad26c 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1099,15 +1099,26 @@ class RegexDSLTests: XCTestCase { } let _: (Substring, Substring, Int, Double?).Type = type(of: regex3).RegexOutput.self - - let regex4 = Regex { + + // FIXME: Remove explicit type and `subregex1` and `subregex2` when type checker regression is fixed + let subregex1: Regex<(Substring, Substring?)> = Regex { + ZeroOrMore(Capture("d")) + } + let subregex2: Regex<( + Substring, Substring, Substring, Substring? + )> = Regex { + Capture(OneOrMore("b")) + Capture(ZeroOrMore("c")) + subregex1 + Optionally("e") + } + let regex4: Regex<( + Substring, Substring, Substring, Substring, Substring? + )> = Regex { OneOrMore("a") Capture { OneOrMore { - Capture(OneOrMore("b")) - Capture(ZeroOrMore("c")) - ZeroOrMore(Capture("d")) - Optionally("e") + subregex2 } } } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 11479bfb6..ae7aa2aa0 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -62,7 +62,7 @@ fileprivate func expectFirstMatch( XCTAssertEqual(input.firstMatch(of: r)?.output, output, file: file, line: line) } -#if os(Linux) +#if os(Linux) || os(Android) func XCTExpectFailure(_ message: String? 
= nil, body: () throws -> Void) rethrows {} #endif @@ -286,6 +286,9 @@ extension UTS18Tests { // Test \v - vertical space lines = lineInput.matches(of: regex(#"\d{2}\v^"#).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) + // Test \s - whitespace + lines = lineInput.matches(of: regex(#"\d{2}\s^"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) // Test anchors as line boundaries lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 12) @@ -301,6 +304,10 @@ extension UTS18Tests { lines = lineInput.matches( of: regex(#"\d{2}\v(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 10) + // Unicode scalar semantics - \s matches all except for \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\s(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 10) // Does not contain an empty line XCTAssertFalse(lineInput.contains(regex(#"^$"#)))