diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5eb38518e..beba6101b 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -59,10 +59,14 @@ fileprivate extension Compiler.ByteCodeGen { emitAny() case let .char(c): - try emitCharacter(c) + emitCharacter(c) case let .scalar(s): - try emitScalar(s) + if options.semanticLevel == .graphemeCluster { + emitCharacter(Character(s)) + } else { + emitMatchScalar(s) + } case let .assertion(kind): try emitAssertion(kind.ast) @@ -88,6 +92,34 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitQuotedLiteral(_ s: String) { + guard options.semanticLevel == .graphemeCluster else { + for char in s { + for scalar in char.unicodeScalars { + emitMatchScalar(scalar) + } + } + return + } + + // Fast path for eliding boundary checks for an all ascii quoted literal + if optimizationsEnabled && s.allSatisfy(\.isASCII) { + let lastIdx = s.unicodeScalars.indices.last! + for idx in s.unicodeScalars.indices { + let boundaryCheck = idx == lastIdx + let scalar = s.unicodeScalars[idx] + if options.isCaseInsensitive && scalar.properties.isCased { + builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) + } else { + builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck) + } + } + return + } + + for c in s { emitCharacter(c) } + } + mutating func emitBackreference( _ ref: AST.Reference ) throws { @@ -245,41 +277,47 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitScalar(_ s: UnicodeScalar) throws { - // TODO: Native instruction buildMatchScalar(s) - if options.isCaseInsensitive { - // TODO: e.g. 
buildCaseInsensitiveMatchScalar(s) - builder.buildConsume(by: consumeScalar { - $0.properties.lowercaseMapping == s.properties.lowercaseMapping - }) + mutating func emitMatchScalar(_ s: UnicodeScalar) { + assert(options.semanticLevel == .unicodeScalar) + if options.isCaseInsensitive && s.properties.isCased { + builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false) } else { - builder.buildConsume(by: consumeScalar { - $0 == s - }) + builder.buildMatchScalar(s, boundaryCheck: false) } } - mutating func emitCharacter(_ c: Character) throws { - // Unicode scalar matches the specific scalars that comprise a character + mutating func emitCharacter(_ c: Character) { + // Unicode scalar mode matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { for scalar in c.unicodeScalars { - try emitScalar(scalar) + emitMatchScalar(scalar) } return } if options.isCaseInsensitive && c.isCased { - // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) - builder.buildConsume { input, bounds in - let inputChar = input[bounds.lowerBound].lowercased() - let matchChar = c.lowercased() - return inputChar == matchChar - ? input.index(after: bounds.lowerBound) - : nil + if optimizationsEnabled && c.isASCII { + // c.isCased ensures that c is not CR-LF, + // so we know that c is a single scalar + assert(c.unicodeScalars.count == 1) + builder.buildMatchScalarCaseInsensitive( + c.unicodeScalars.last!, + boundaryCheck: true) + } else { + builder.buildMatch(c, isCaseInsensitive: true) } - } else { - builder.buildMatch(c) + return } + + if optimizationsEnabled && c.isASCII { + let lastIdx = c.unicodeScalars.indices.last! 
+ for idx in c.unicodeScalars.indices { + builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx) + } + return + } + + builder.buildMatch(c, isCaseInsensitive: false) } mutating func emitAny() { @@ -717,11 +755,12 @@ fileprivate extension Compiler.ByteCodeGen { _ ccc: DSLTree.CustomCharacterClass ) throws { if let asciiBitset = ccc.asAsciiBitset(options), - options.semanticLevel == .graphemeCluster, optimizationsEnabled { - // future work: add a bit to .matchBitset to consume either a character - // or a scalar so we can have this optimization in scalar mode - builder.buildMatchAsciiBitset(asciiBitset) + if options.semanticLevel == .unicodeScalar { + builder.buildScalarMatchAsciiBitset(asciiBitset) + } else { + builder.buildMatchAsciiBitset(asciiBitset) + } } else { let consumer = try ccc.generateConsumer(options) builder.buildConsume(by: consumer) @@ -798,45 +837,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAtom(a) case let .quotedLiteral(s): - if options.semanticLevel == .graphemeCluster { - if options.isCaseInsensitive { - // TODO: buildCaseInsensitiveMatchSequence(c) or alternative - builder.buildConsume { input, bounds in - var iterator = s.makeIterator() - var currentIndex = bounds.lowerBound - while let ch = iterator.next() { - guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() - else { return nil } - input.formIndex(after: &currentIndex) - } - return currentIndex - } - } else { - builder.buildMatchSequence(s) - } - } else { - builder.buildConsume { - [caseInsensitive = options.isCaseInsensitive] input, bounds in - // TODO: Case folding - var iterator = s.unicodeScalars.makeIterator() - var currentIndex = bounds.lowerBound - while let scalar = iterator.next() { - guard currentIndex < bounds.upperBound else { return nil } - if caseInsensitive { - if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { - return nil - } - } else { - if 
scalar != input.unicodeScalars[currentIndex] { - return nil - } - } - input.unicodeScalars.formIndex(after: &currentIndex) - } - return currentIndex - } - } + emitQuotedLiteral(s) case let .convertedRegexLiteral(n, _): return try emitNode(n) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index dbb324b67..4b98bc17c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -11,6 +11,13 @@ @_implementationOnly import _RegexParser +extension Character { + var _singleScalarAsciiValue: UInt8? { + guard self != "\r\n" else { return nil } + return asciiValue + } +} + extension DSLTree.Node { /// Attempt to generate a consumer from this AST node /// @@ -53,11 +60,50 @@ extension DSLTree._AST.Atom { } } +extension Character { + func generateConsumer( + _ opts: MatchingOptions + ) throws -> MEProgram.ConsumeFunction? { + let isCaseInsensitive = opts.isCaseInsensitive + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + let low = bounds.lowerBound + if isCaseInsensitive && isCased { + return input[low].lowercased() == lowercased() + ? input.index(after: low) + : nil + } else { + return input[low] == self + ? input.index(after: low) + : nil + } + } + case .unicodeScalar: + // TODO: This should only be reachable from character class emission, can + // we guarantee that? Otherwise we'd want a different matching behavior. + let consumers = unicodeScalars.map { s in consumeScalar { + isCaseInsensitive + ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping + : $0 == s + }} + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx + } + } + return nil + } + } + } +} + extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? 
{ switch self { - case let .char(c) where c != "\r\n": - return c.asciiValue + case let .char(c): + return c._singleScalarAsciiValue case let .scalar(s) where s.isASCII: return UInt8(ascii: s) case let .unconverted(atom): @@ -72,44 +118,15 @@ extension DSLTree.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - let isCaseInsensitive = opts.isCaseInsensitive - switch self { case let .char(c): - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - let low = bounds.lowerBound - if isCaseInsensitive && c.isCased { - return input[low].lowercased() == c.lowercased() - ? input.index(after: low) - : nil - } else { - return input[low] == c - ? input.index(after: low) - : nil - } - } - } else { - let consumers = c.unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } + return try c.generateConsumer(opts) + case let .scalar(s): - return consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - } + // A scalar always matches the same as a single scalar character. This + // means it must match a whole grapheme in grapheme semantic mode, but + // can match a single scalar in scalar semantic mode. + return try Character(s).generateConsumer(opts) case .any: // FIXME: Should this be a total ordering? @@ -211,16 +228,20 @@ extension AST.Atom { var singleScalar: UnicodeScalar? { switch kind { case .scalar(let s): return s.value + case .escaped(let e): + guard let s = e.scalarValue else { return nil } + return s default: return nil } } var singleScalarASCIIValue: UInt8? 
{ + if let s = singleScalar, s.isASCII { + return UInt8(ascii: s) + } switch kind { - case let .char(c) where c != "\r\n": - return c.asciiValue - case let .scalar(s) where s.value.isASCII: - return UInt8(ascii: s.value) + case let .char(c): + return c._singleScalarAsciiValue default: return nil } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 21c647a3b..42fb86913 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -147,6 +147,26 @@ extension Instruction.Payload { var string: StringRegister { interpret() } + + init(scalar: Unicode.Scalar) { + self.init(UInt64(scalar.value)) + } + var scalar: Unicode.Scalar { + return Unicode.Scalar(_value: UInt32(self.rawValue)) + } + + init(scalar: Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) { + let raw = UInt64(scalar.value) + + (caseInsensitive ? 1 << 55: 0) + + (boundaryCheck ? 1 << 54 : 0) + self.init(raw) + } + var scalarPayload: (Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) { + let caseInsensitive = (self.rawValue >> 55) & 1 == 1 + let boundaryCheck = (self.rawValue >> 54) & 1 == 1 + let scalar = Unicode.Scalar(_value: UInt32(self.rawValue & 0xFFFF_FFFF)) + return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck) + } init(sequence: SequenceRegister) { self.init(sequence) @@ -190,18 +210,20 @@ extension Instruction.Payload { interpret() } - init(element: ElementRegister) { - self.init(element) + init(element: ElementRegister, isCaseInsensitive: Bool) { + self.init(isCaseInsensitive ? 
1 : 0, element) } - var element: ElementRegister { - interpret() + var elementPayload: (isCaseInsensitive: Bool, ElementRegister) { + let pair: (UInt64, ElementRegister) = interpretPair() + return (isCaseInsensitive: pair.0 == 1, pair.1) } - init(bitset: AsciiBitsetRegister) { - self.init(bitset) + init(bitset: AsciiBitsetRegister, isScalar: Bool) { + self.init(isScalar ? 1 : 0, bitset) } - var bitset: AsciiBitsetRegister { - interpret() + var bitsetPayload: (isScalar: Bool, AsciiBitsetRegister) { + let pair: (UInt64, AsciiBitsetRegister) = interpretPair() + return (isScalar: pair.0 == 1, pair.1) } init(consumer: ConsumeFunctionRegister) { diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 4cc810138..8e1a1f294 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -90,20 +90,27 @@ extension Instruction { /// Composite assert-advance else restore. /// - /// match(_: EltReg) + /// match(_: EltReg, isCaseInsensitive: Bool) /// - /// Operand: Element register to compare against. + /// Operands: + /// - Element register to compare against. + /// - Boolean for if we should match in a case insensitive way case match - /// Match against a sequence of elements + /// Match against a scalar and possibly perform a boundary check or match in a case insensitive way /// - /// matchSequence(_: SeqReg) + /// matchScalar(_: Unicode.Scalar, isCaseInsensitive: Bool, boundaryCheck: Bool) /// - /// Operand: Sequence register to compare against. 
- case matchSequence + /// Operands: Scalar value to match against and booleans + case matchScalar - /// Match against a set of valid ascii values stored in a bitset - /// Operand: Ascii bitset register containing the bitset + /// Match a character or a scalar against a set of valid ascii values stored in a bitset + /// + /// matchBitset(_: AsciiBitsetRegister, isScalar: Bool) + /// + /// Operand: + /// - Ascii bitset register containing the bitset + /// - Boolean for if we should match by scalar value case matchBitset /// TODO: builtin assertions and anchors @@ -324,7 +331,7 @@ extension Instruction { var elementRegister: ElementRegister? { switch opcode { case .match: - return payload.element + return payload.elementPayload.1 default: return nil } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 84b80489f..0b9a91726 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -144,24 +144,32 @@ extension MEProgram.Builder { instructions.append(.init(.advance, .init(distance: n))) } - mutating func buildMatch(_ e: Character) { + mutating func buildMatch(_ e: Character, isCaseInsensitive: Bool) { instructions.append(.init( - .match, .init(element: elements.store(e)))) + .match, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) } - mutating func buildMatchSequence( - _ s: S - ) where S.Element == Character { - instructions.append(.init( - .matchSequence, - .init(sequence: sequences.store(.init(s))))) + mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) { + instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck))) + } + + mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool) { + instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: true, boundaryCheck: boundaryCheck))) } + mutating func 
buildMatchAsciiBitset( _ b: DSLTree.CustomCharacterClass.AsciiBitset ) { instructions.append(.init( - .matchBitset, .init(bitset: makeAsciiBitset(b)))) + .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: false))) + } + + mutating func buildScalarMatchAsciiBitset( + _ b: DSLTree.CustomCharacterClass.AsciiBitset + ) { + instructions.append(.init( + .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true))) } mutating func buildConsume( diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index d19da01e5..2be918294 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -219,6 +219,15 @@ extension Processor { return true } + mutating func matchCaseInsensitive(_ e: Element) -> Bool { + guard let cur = load(), cur.lowercased() == e.lowercased() else { + signalFailure() + return false + } + _uncheckedForcedConsumeOne() + return true + } + // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. mutating func matchSeq( @@ -230,6 +239,44 @@ extension Processor { return true } + func loadScalar() -> Unicode.Scalar? { + currentPosition < end ? 
input.unicodeScalars[currentPosition] : nil + } + + mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { + guard s == loadScalar(), + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) + else { + signalFailure() + return false + } + currentPosition = idx + return true + } + + mutating func matchScalarCaseInsensitive( + _ s: Unicode.Scalar, + boundaryCheck: Bool + ) -> Bool { + guard let curScalar = loadScalar(), + s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) + else { + signalFailure() + return false + } + currentPosition = idx + return true + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset @@ -244,6 +291,20 @@ extension Processor { return true } + // Equivalent of matchBitset but emitted when in unicode scalar semantic mode + mutating func matchBitsetScalar( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + ) -> Bool { + guard let curScalar = loadScalar(), + bitset.matches(scalar: curScalar), + let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { + signalFailure() + return false + } + currentPosition = idx + return true + } + mutating func signalFailure() { guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.popLast()?.destructure @@ -379,23 +440,40 @@ extension Processor { } case .match: - let reg = payload.element - if match(registers[reg]) { - controller.step() + let (isCaseInsensitive, reg) = payload.elementPayload + if isCaseInsensitive { + if matchCaseInsensitive(registers[reg]) { + controller.step() + } + } else { + if match(registers[reg]) { + 
controller.step() + } } - case .matchSequence: - let reg = payload.sequence - let seq = registers[reg] - if matchSeq(seq) { - controller.step() + case .matchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if matchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) { + controller.step() + } + } else { + if matchScalar(scalar, boundaryCheck: boundaryCheck) { + controller.step() + } } case .matchBitset: - let reg = payload.bitset + let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] - if matchBitset(bitset) { - controller.step() + if isScalar { + if matchBitsetScalar(bitset) { + controller.step() + } + } else { + if matchBitset(bitset) { + controller.step() + } } case .consumeBy: diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 4237eda33..119a5d14f 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -315,8 +315,7 @@ extension PrettyPrinter { return } - var charMembers = "" - + var charMembers = StringLiteralBuilder() // This iterates through all of the character class members collecting all // of the members who can be stuffed into a singular '.anyOf(...)' vs. @@ -340,14 +339,10 @@ extension PrettyPrinter { switch a { case let .char(c): charMembers.append(c) - - if c == "\\" { - charMembers.append(c) - } - return false case let .scalar(s): - charMembers += "\\u{\(String(s.value, radix: 16, uppercase: true))}" + charMembers.append( + unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}") return false case .unconverted(_): return true @@ -356,7 +351,7 @@ extension PrettyPrinter { } case let .quotedLiteral(s): - charMembers += s + charMembers.append(s) return false case .trivia(_): @@ -370,7 +365,7 @@ extension PrettyPrinter { // Also in the same vein, if we have a few atom members but no // nonAtomMembers, then we can emit a single .anyOf(...) 
for them. if !charMembers.isEmpty, nonCharMembers.isEmpty { - let anyOf = ".anyOf(\(charMembers._quoted))" + let anyOf = ".anyOf(\(charMembers))" indent() @@ -393,7 +388,7 @@ extension PrettyPrinter { printer.indent() if !charMembers.isEmpty { - printer.output(".anyOf(\(charMembers._quoted))") + printer.output(".anyOf(\(charMembers))") if nonCharMembers.count > 0 { printer.output(",") @@ -617,10 +612,39 @@ extension PrettyPrinter { } extension String { - // TODO: Escaping? + fileprivate var _escaped: String { + _replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#) + } + fileprivate var _quoted: String { - "\"\(self._replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#))\"" + _escaped._bareQuoted + } + + fileprivate var _bareQuoted: String { + #""\#(self)""# + } +} + +/// A helper for building string literals, which handles escaping the contents +/// appended. +fileprivate struct StringLiteralBuilder { + private var contents = "" + + var result: String { contents._bareQuoted } + var isEmpty: Bool { contents.isEmpty } + + mutating func append(_ str: String) { + contents += str._escaped + } + mutating func append(_ c: Character) { + contents += String(c)._escaped } + mutating func append(unescaped str: String) { + contents += str + } +} +extension StringLiteralBuilder: CustomStringConvertible { + var description: String { result } } extension AST.Atom.AssertionKind { @@ -1107,8 +1131,8 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return ("\\u{\(hex)}"._quoted, false) - + return ("\\u{\(hex)}"._bareQuoted, false) + case let .unconverted(a): if a.ast.isUnprintableAtom { return ("#/\(a.ast._regexBase)/#", false) @@ -1149,7 +1173,7 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}"._quoted + return "\\u{\(hex)}"._bareQuoted case let .unconverted(a): return a.ast._regexBase diff --git 
a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 320d10897..8e58280c0 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -216,7 +216,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .char(Character(s.value)) + case let .scalar(s): return .scalar(s.value) case .any: return .any case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 740bdcb8d..c251dded7 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -159,86 +159,6 @@ extension DSLTree { indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) } - - internal struct AsciiBitset { - let isInverted: Bool - var a: UInt64 = 0 - var b: UInt64 = 0 - - init(isInverted: Bool) { - self.isInverted = isInverted - } - - init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { - self.isInverted = isInverted - add(val, isCaseInsensitive) - } - - init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { - self.isInverted = isInverted - for val in low...high { - add(val, isCaseInsensitive) - } - } - - internal init( - a: UInt64, - b: UInt64, - isInverted: Bool - ) { - self.isInverted = isInverted - self.a = a - self.b = b - } - - internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { - setBit(val) - if isCaseInsensitive { - switch val { - case 64...90: setBit(val + 32) - case 97...122: setBit(val - 32) - default: break - } - } - } - - internal mutating func setBit(_ val: UInt8) { - if val < 64 { - a = a | 1 << val - } else { - b = b | 1 
<< (val - 64) - } - } - - internal func matches(char: Character) -> Bool { - let ret: Bool - if let val = char.asciiValue { - if val < 64 { - ret = (a >> val) & 1 == 1 - } else { - ret = (b >> (val - 64)) & 1 == 1 - } - } else { - ret = false - } - - if isInverted { - return !ret - } - - return ret - } - - /// Joins another bitset from a Member of the same CustomCharacterClass - internal func union(_ other: AsciiBitset) -> AsciiBitset { - precondition(self.isInverted == other.isInverted) - return AsciiBitset( - a: self.a | other.a, - b: self.b | other.b, - isInverted: self.isInverted - ) - } - } } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift new file mode 100644 index 000000000..ad3159820 --- /dev/null +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -0,0 +1,99 @@ +extension DSLTree.CustomCharacterClass { + internal struct AsciiBitset { + let isInverted: Bool + var a: UInt64 = 0 + var b: UInt64 = 0 + + init(isInverted: Bool) { + self.isInverted = isInverted + } + + init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { + self.isInverted = isInverted + add(val, isCaseInsensitive) + } + + init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { + self.isInverted = isInverted + for val in low...high { + add(val, isCaseInsensitive) + } + } + + internal init( + a: UInt64, + b: UInt64, + isInverted: Bool + ) { + self.isInverted = isInverted + self.a = a + self.b = b + } + + internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { + setBit(val) + if isCaseInsensitive { + switch val { + case 64...90: setBit(val + 32) + case 97...122: setBit(val - 32) + default: break + } + } + } + + internal mutating func setBit(_ val: UInt8) { + if val < 64 { + a = a | 1 << val + } else { + b = b | 1 << (val - 64) + } + } + + private func matches(_ val: UInt8) -> Bool { + if val < 64 { + return (a >> val) & 1 == 1 + } else { + return (b >> 
(val - 64)) & 1 == 1 + } + } + + internal func matches(char: Character) -> Bool { + let matched: Bool + if let val = char._singleScalarAsciiValue { + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + internal func matches(scalar: Unicode.Scalar) -> Bool { + let matched: Bool + if scalar.isASCII { + let val = UInt8(ascii: scalar) + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + /// Joins another bitset from a Member of the same CustomCharacterClass + internal func union(_ other: AsciiBitset) -> AsciiBitset { + precondition(self.isInverted == other.isInverted) + return AsciiBitset( + a: self.a | other.a, + b: self.b | other.b, + isInverted: self.isInverted + ) + } + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index b67c6c242..bf6e48607 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -13,6 +13,10 @@ import XCTest import _StringProcessing import RegexBuilder +#if os(Linux) +func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {} +#endif + class RegexDSLTests: XCTestCase { func _testDSLCaptures( _ tests: (input: String, expectedCaptures: MatchType?)..., @@ -1120,6 +1124,66 @@ class RegexDSLTests: XCTestCase { } } + func testScalarMatching() throws { + // RegexBuilder provides a RegexComponent conformance for UnicodeScalar. In + // grapheme cluster mode, it should only match entire graphemes. It may + // match a single scalar of a grapheme cluster in scalar semantic mode. 
+ XCTAssertNotNil("a".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNil("a\u{301}".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNotNil("a\u{301}".firstMatch( + of: ("a" as UnicodeScalar).regex.matchingSemantics(.unicodeScalar))) + + let r1 = Regex { + "a" as UnicodeScalar + } + XCTAssertNil(try r1.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r1.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r2 = Regex { + CharacterClass.anyOf(["a" as UnicodeScalar, "👍"]) + } + XCTAssertNil(try r2.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r2.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r3 = Regex { + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👧" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👦" as UnicodeScalar + } + XCTAssertNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + let r4 = Regex { "é" as UnicodeScalar } + XCTAssertNotNil( + try r4.firstMatch(in: "e\u{301}") + ) + XCTAssertNotNil( + try r4.firstMatch(in: "é") + ) + + try XCTExpectFailure("Need stronger scalar coalescing logic") { + let r5 = Regex { + "e" + "\u{301}" as UnicodeScalar + } + XCTAssertNotNil( + try r5.firstMatch(in: "e\u{301}") + ) + XCTAssertNotNil( + try r5.firstMatch(in: "é") + ) + } + } + struct SemanticVersion: Equatable { var major: Int var minor: Int diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 712808184..6c8f66e10 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -14,6 +14,131 @@ import XCTest +enum DecodedInstr { + case invalid + case moveImmediate + case moveCurrentPosition + case branch + case condBranchZeroElseDecrement + case 
condBranchSamePosition + case save + case saveAddress + case splitSaving + case clear + case clearThrough + case accept + case fail + case advance + case match + case matchCaseInsensitive + case matchScalar + case matchScalarCaseInsensitiveUnchecked + case matchScalarCaseInsensitive + case matchScalarUnchecked + case matchBitsetScalar + case matchBitset + case consumeBy + case assertBy + case matchBy + case backreference + case beginCapture + case endCapture + case transformCapture + case captureValue + case builtinAssertion + case builtinCharacterClass +} + +extension DecodedInstr { + /// Decode the given instruction by looking at the opcode and payload, expanding out certain instructions + /// like matchScalar and match into their variants + /// + /// Must stay in sync with Processor.cycle + static func decode(_ instruction: Instruction) -> DecodedInstr { + let (opcode, payload) = instruction.destructure + + switch opcode { + case .invalid: + fatalError("Invalid program") + case .moveImmediate: + return .moveImmediate + case .moveCurrentPosition: + return .moveCurrentPosition + case .branch: + return .branch + case .condBranchZeroElseDecrement: + return .condBranchZeroElseDecrement + case .condBranchSamePosition: + return .condBranchSamePosition + case .save: + return .save + case .saveAddress: + return .saveAddress + case .splitSaving: + return .splitSaving + case .clear: + return .clear + case .clearThrough: + return .clearThrough + case .accept: + return .accept + case .fail: + return .fail + case .advance: + return .advance + case .match: + let (isCaseInsensitive, _) = payload.elementPayload + if isCaseInsensitive { + return .matchCaseInsensitive + } else { + return .match + } + case .matchScalar: + let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if boundaryCheck { + return .matchScalarCaseInsensitive + } else { + return .matchScalarCaseInsensitiveUnchecked + } + } else { + if boundaryCheck { + return .matchScalar + } 
else { + return .matchScalarUnchecked + } + } + case .matchBitset: + let (isScalar, _) = payload.bitsetPayload + if isScalar { + return .matchBitsetScalar + } else { + return .matchBitset + } + case .consumeBy: + return consumeBy + case .assertBy: + return .assertBy + case .matchBy: + return .matchBy + case .backreference: + return .backreference + case .beginCapture: + return .beginCapture + case .endCapture: + return .endCapture + case .transformCapture: + return .transformCapture + case .captureValue: + return .captureValue + case .builtinAssertion: + return .builtinAssertion + case .builtinCharacterClass: + return .builtinCharacterClass +} + } +} + extension RegexTests { private func testCompilationEquivalence( @@ -147,16 +272,24 @@ extension RegexTests { for regex: String, syntax: SyntaxOptions = .traditional, semanticLevel: RegexSemanticLevel? = nil, - contains targets: Set, + contains targets: Set = [], + doesNotContain invalid: Set = [], file: StaticString = #file, line: UInt = #line ) { do { let prog = try _compileRegex(regex, syntax, semanticLevel) - var found: Set = [] + var found: Set = [] for inst in prog.engine.instructions { - if targets.contains(inst.opcode) { - found.insert(inst.opcode) + let decoded = DecodedInstr.decode(inst) + found.insert(decoded) + + if invalid.contains(decoded) { + XCTFail( + "Compiled regex '\(regex)' contains incorrect opcode \(decoded)", + file: file, + line: line) + return } } @@ -174,39 +307,95 @@ extension RegexTests { } } - private func expectProgram( - for regex: String, - syntax: SyntaxOptions = .traditional, - semanticLevel: RegexSemanticLevel? 
= nil, - doesNotContain targets: Set, - file: StaticString = #file, - line: UInt = #line - ) { - do { - let prog = try _compileRegex(regex, syntax, semanticLevel) - for inst in prog.engine.instructions { - if targets.contains(inst.opcode) { - XCTFail( - "Compiled regex '\(regex)' contains incorrect opcode \(inst.opcode)", - file: file, - line: line) - return - } - } - } catch { - XCTFail( - "Failed to compile regex '\(regex)': \(error)", - file: file, - line: line) - } - } - func testBitsetCompile() { - expectProgram(for: "[abc]", contains: [.matchBitset]) - expectProgram(for: "[abc]", doesNotContain: [.consumeBy]) + expectProgram( + for: "[abc]", + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: "[abc]", + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) + } - expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset]) - expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy]) + func testScalarOptimizeCompilation() { + // all ascii quoted literal -> elide boundary checks + expectProgram( + for: "abcd", + contains: [.matchScalar, .matchScalarUnchecked], + doesNotContain: [.match, .consumeBy]) + // ascii character -> matchScalar with boundary check + expectProgram( + for: "a", + contains: [.matchScalar], + doesNotContain: [.match, .consumeBy, .matchScalarUnchecked]) + // quoted literal is not all ascii -> match scalar when possible, always do boundary checks + expectProgram( + for: "aaa\u{301}", + contains: [.match, .matchScalar], + doesNotContain: [.consumeBy, .matchScalarUnchecked]) + // scalar mode -> always emit match scalar without boundary checks + expectProgram( + for: "abcd", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + expectProgram( + for: "a", + semanticLevel: .unicodeScalar, + contains: 
[.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + expectProgram( + for: "aaa\u{301}", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + } + + func testCaseInsensitivityCompilation() { + // quoted literal is all ascii -> match scalar case insensitive and skip + // boundary checks + expectProgram( + for: "(?i)abcd", + contains: [.matchScalarCaseInsensitiveUnchecked, .matchScalarCaseInsensitive], + doesNotContain: [.match, .matchCaseInsensitive, .matchScalar, .matchScalarUnchecked]) + // quoted literal is all non-cased ascii -> emit match scalar instructions + expectProgram( + for: "(?i)&&&&", + contains: [.matchScalar, .matchScalarUnchecked], + doesNotContain: [.match, .matchCaseInsensitive, + .matchScalarCaseInsensitive, .matchScalarCaseInsensitiveUnchecked]) + // quoted literal is not all ascii -> match scalar case insensitive when + // possible, match character case insensitive when needed, always perform + // boundary check + expectProgram( + for: "(?i)abcd\u{301}", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match, .matchScalar]) + // same as before but contains ascii non cased characters -> emit matchScalar for them + expectProgram( + for: "(?i)abcd\u{301};.'!", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match]) + // contains non-ascii non-cased characters -> emit match + expectProgram( + for: "(?i)abcd\u{301};.'!💖", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar, .match], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked]) + + // scalar mode -> emit unchecked scalar match only, emit case insensitive + // only if the scalar is cased + expectProgram( + for: "(?i);.'!💖", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + 
doesNotContain: [.matchScalarCaseInsensitiveUnchecked]) + expectProgram( + for: "(?i)abcdé", + semanticLevel: .unicodeScalar, + contains: [.matchScalarCaseInsensitiveUnchecked], + doesNotContain: [.matchScalarUnchecked]) } func testQuantificationForwardProgressCompile() { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a1bce20fd..f2715eac1 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -24,9 +24,10 @@ func _firstMatch( _ regexStr: String, input: String, validateOptimizations: Bool, + semanticLevel: RegexSemanticLevel = .graphemeCluster, syntax: SyntaxOptions = .traditional ) throws -> (String, [String?])? { - var regex = try Regex(regexStr, syntax: syntax) + var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) let result = try regex.firstMatch(in: input) if validateOptimizations { @@ -64,6 +65,7 @@ func flatCaptureTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -73,6 +75,7 @@ func flatCaptureTest( regex, input: test, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax ) else { if expect == nil { @@ -123,6 +126,7 @@ func matchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -136,6 +140,7 @@ func matchTest( dumpAST: dumpAST, xfail: xfail, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, file: file, line: line) } @@ -153,6 +158,7 @@ func firstMatchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -161,12 +167,13 @@ func firstMatchTest( regex, input: 
input, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax)?.0 if xfail { XCTAssertNotEqual(found, match, file: file, line: line) } else { - XCTAssertEqual(found, match, file: file, line: line) + XCTAssertEqual(found, match, "Incorrect match", file: file, line: line) } } catch { if !xfail { @@ -599,6 +606,12 @@ extension RegexTests { ("A", true), ("a", false)) + matchTest(#"(?i)[a]"#, + ("💿", false), + ("a\u{301}", false), + ("A", true), + ("a", true)) + matchTest("[a]", ("a\u{301}", false)) @@ -613,14 +626,12 @@ extension RegexTests { // interpreted as matching the scalars "\r" or "\n". // It does not fully match the character "\r\n" because the character class // in scalar mode will only match one scalar - do { - let regex = try Regex("[\r\n]").matchingSemantics(.unicodeScalar) - XCTAssertEqual("\r", try regex.wholeMatch(in: "\r")?.0) - XCTAssertEqual("\n", try regex.wholeMatch(in: "\n")?.0) - XCTAssertEqual(nil, try regex.wholeMatch(in: "\r\n")?.0) - } catch { - XCTFail("\(error)", file: #filePath, line: #line) - } + matchTest( + "^[\r\n]$", + ("\r", true), + ("\n", true), + ("\r\n", false), + semanticLevel: .unicodeScalar) matchTest("[^\r\n]", ("\r\n", false), @@ -628,7 +639,17 @@ extension RegexTests { ("\r", true)) matchTest("[\n\r]", ("\n", true), - ("\r", true)) + ("\r", true), + ("\r\n", false)) + + matchTest( + #"[a]\u0301"#, + ("a\u{301}", false), + semanticLevel: .graphemeCluster) + matchTest( + #"[a]\u0301"#, + ("a\u{301}", true), + semanticLevel: .unicodeScalar) firstMatchTest("[-]", input: "123-abcxyz", match: "-") @@ -1855,6 +1876,19 @@ extension RegexTests { // TODO: Add test for grapheme boundaries at start/end of match + // Testing the matchScalar optimization for ascii quoted literals and characters + func testScalarOptimization() throws { + // check that we are correctly doing the boundary check after matchScalar + firstMatchTest("a", input: "a\u{301}", match: nil) + firstMatchTest("aa", input: 
"aa\u{301}", match: nil) + + firstMatchTest("a", input: "a\u{301}", match: "a", semanticLevel: .unicodeScalar) + firstMatchTest("aa", input: "aa\u{301}", match: "aa", semanticLevel: .unicodeScalar) + + // case insensitive tests + firstMatchTest(#"(?i)abc\u{301}d"#, input: "AbC\u{301}d", match: "AbC\u{301}d", semanticLevel: .unicodeScalar) + } + func testCase() { let regex = try! Regex(#".\N{SPARKLING HEART}."#) let input = "🧟‍♀️💖🧠 or 🧠💖☕️" diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 97ba3e333..6822330f3 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -117,4 +117,34 @@ extension RenderDSLTests { } """#) } + + func testScalar() throws { + try testConversion(#"\u{B4}"#, #""" + Regex { + "\u{B4}" + } + """#) + try testConversion(#"\u{301}"#, #""" + Regex { + "\u{301}" + } + """#) + try testConversion(#"[\u{301}]"#, #""" + Regex { + One(.anyOf("\u{301}")) + } + """#) + try testConversion(#"[abc\u{301}]"#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + + // TODO: We ought to try and preserve the scalar syntax here. + try testConversion(#"a\u{301}"#, #""" + Regex { + "á" + } + """#) + } }