diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index ee3d5c2f8..48b2ce540 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -226,3 +226,64 @@ public struct NegativeLookahead: _BuiltinRegexComponent { self.init(_RegexFactory().negativeLookaheadNonCapturing(component())) } } + +/// A regex component that allows a match to continue only if its contents +/// match at the given location. +/// +/// A lookbehind is a zero-length assertion that its included regex matches at +/// a particular position. Lookbehinds do not advance the overall matching +/// position in the input string — once a lookbehind succeeds, matching continues +/// in the regex from the same position. +@available(SwiftStdlib 5.7, *) // TODO: How should this be gated? +public struct Lookbehind: _BuiltinRegexComponent { + public var regex: Regex + + init(_ regex: Regex) { + self.regex = regex + } + + /// Creates a lookbehind from the given regex component. + public init( + _ component: R + ) where R.RegexOutput == Output { + self.init(_RegexFactory().lookbehindNonCapturing(component)) + } + + /// Creates a lookbehind from the regex generated by the given builder closure. + public init( + @RegexComponentBuilder _ component: () -> R + ) where R.RegexOutput == Output { + self.init(_RegexFactory().lookbehindNonCapturing(component())) + } +} + +/// A regex component that allows a match to continue only if its contents +/// do not match at the given location. +/// +/// A negative lookbehind is a zero-length assertion that its included regex +/// does not match at a particular position. Lookbehinds do not advance the +/// overall matching position in the input string — once a lookbehind succeeds, +/// matching continues in the regex from the same position. +@available(SwiftStdlib 5.7, *) // TODO: How should this be gated? 
+public struct NegativeLookbehind: _BuiltinRegexComponent { + public var regex: Regex + + init(_ regex: Regex) { + self.regex = regex + } + + /// Creates a negative lookbehind from the given regex component. + public init( + _ component: R + ) where R.RegexOutput == Output { + self.init(_RegexFactory().negativeLookbehindNonCapturing(component)) + } + + /// Creates a negative lookbehind from the regex generated by the given builder + /// closure. + public init( + @RegexComponentBuilder _ component: () -> R + ) where R.RegexOutput == Output { + self.init(_RegexFactory().negativeLookbehindNonCapturing(component())) + } +} diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index be288491d..41aca8504 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -47,6 +47,9 @@ extension AST { // NSRegularExpression compatibility special-case case nsreCompatibleDot // no AST representation + + // Lookbehind support + case reverse // no AST representation } public var kind: Kind diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 1fdadd8de..3ec852aa8 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -523,6 +523,19 @@ extension Parser { mutating func parseCustomCharacterClass( _ start: Source.Located ) -> CustomCC { + // Excessively nested recursion is a common DOS attack, so limit + // our recursion. 
+ context.parseDepth += 1 + defer { context.parseDepth -= 1 } + guard context.parseDepth < context.maxParseDepth else { + self.errorAtCurrentPosition(.nestingTooDeep) + + // This is not generally recoverable and further errors will be + // incorrect + diags.suppressFurtherDiagnostics = true + return .init(start, [], start.location) + } + let alreadyInCCC = context.isInCustomCharacterClass context.isInCustomCharacterClass = true defer { context.isInCustomCharacterClass = alreadyInCCC } diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index d2f7c622d..1ae001101 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -143,7 +143,7 @@ extension RegexValidator { case .caseInsensitive, .possessiveByDefault, .reluctantByDefault, .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended, .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps, - .nsreCompatibleDot: + .nsreCompatibleDot, .reverse: break } } @@ -370,7 +370,7 @@ extension RegexValidator { } switch kind.value { case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead, - .atomicNonCapturing: + .atomicNonCapturing, .lookbehind, .negativeLookbehind: break case .balancedCapture: @@ -384,8 +384,8 @@ extension RegexValidator { case .nonAtomicLookahead: error(.unsupported("non-atomic lookahead"), at: kind.location) - case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: - error(.unsupported("lookbehind"), at: kind.location) + case .nonAtomicLookbehind: + error(.unsupported("non-atomic lookbehind"), at: kind.location) case .scriptRun, .atomicScriptRun: error(.unsupported("script run"), at: kind.location) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 6a00a0dfd..7569f7489 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -16,6 +16,9 @@ 
internal import _RegexParser extension Compiler { struct ByteCodeGen { + var reverse: Bool { + options.reversed + } var options: MatchingOptions var builder = MEProgram.Builder() /// A Boolean indicating whether the first matchable atom has been emitted. @@ -136,14 +139,19 @@ fileprivate extension Compiler.ByteCodeGen { // ASCII value) if s.utf8.count >= longThreshold, !options.isCaseInsensitive { let boundaryCheck = options.semanticLevel == .graphemeCluster - builder.buildMatchUTF8(Array(s.utf8), boundaryCheck: boundaryCheck) + builder.buildMatchUTF8( + Array(s.utf8), + boundaryCheck: boundaryCheck, + reverse: reverse + ) return } } guard options.semanticLevel == .graphemeCluster else { for char in s { - for scalar in char.unicodeScalars { + let scalars: any Collection = reverse ? char.unicodeScalars.reversed() : char.unicodeScalars + for scalar in scalars { emitMatchScalar(scalar) } } @@ -152,20 +160,27 @@ fileprivate extension Compiler.ByteCodeGen { // Fast path for eliding boundary checks for an all ascii quoted literal if optimizationsEnabled && s.allSatisfy(\.isASCII) && !s.isEmpty { - let lastIdx = s.unicodeScalars.indices.last! - for idx in s.unicodeScalars.indices { - let boundaryCheck = idx == lastIdx + let boundaryIdx = reverse ? s.unicodeScalars.indices.first! : s.unicodeScalars.indices.last! + let indices: any Collection = reverse + ? 
s.unicodeScalars.indices.reversed() + : s.unicodeScalars.indices + + for idx in indices { + let boundaryCheck = idx == boundaryIdx let scalar = s.unicodeScalars[idx] if options.isCaseInsensitive && scalar.properties.isCased { - builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) + builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck, reverse: reverse) } else { - builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck) + builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck, reverse: reverse) } } return } - for c in s { emitCharacter(c) } + let chars: any Collection = reverse ? s.reversed() : s + for char in chars { + emitCharacter(char) + } } mutating func emitBackreference( @@ -212,18 +227,18 @@ fileprivate extension Compiler.ByteCodeGen { } mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) { - builder.buildMatchBuiltin(model: cc.asRuntimeModel(options)) + builder.buildMatchBuiltin(model: cc.asRuntimeModel(options), reverse: reverse) } mutating func emitMatchScalar(_ s: UnicodeScalar) { assert(options.semanticLevel == .unicodeScalar) if options.isCaseInsensitive && s.properties.isCased { - builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false) + builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false, reverse: reverse) } else { - builder.buildMatchScalar(s, boundaryCheck: false) + builder.buildMatchScalar(s, boundaryCheck: false, reverse: reverse) } } - + mutating func emitCharacter(_ c: Character) { // Unicode scalar mode matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { @@ -232,7 +247,7 @@ fileprivate extension Compiler.ByteCodeGen { } return } - + if options.isCaseInsensitive && c.isCased { if optimizationsEnabled && c.isASCII { // c.isCased ensures that c is not CR-LF, @@ -240,22 +255,25 @@ fileprivate extension Compiler.ByteCodeGen { assert(c.unicodeScalars.count == 1) builder.buildMatchScalarCaseInsensitive( 
c.unicodeScalars.last!, - boundaryCheck: true) + boundaryCheck: true, + reverse: reverse) } else { - builder.buildMatch(c, isCaseInsensitive: true) + builder.buildMatch(c, isCaseInsensitive: true, reverse: reverse) } return } - + if optimizationsEnabled && c.isASCII { let lastIdx = c.unicodeScalars.indices.last! for idx in c.unicodeScalars.indices { - builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx) + let scalar = c.unicodeScalars[idx] + let boundaryCheck = idx == lastIdx + builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck, reverse: reverse) } return } - - builder.buildMatch(c, isCaseInsensitive: false) + + builder.buildMatch(c, isCaseInsensitive: false, reverse: reverse) } mutating func emitAny() { @@ -270,9 +288,9 @@ fileprivate extension Compiler.ByteCodeGen { mutating func emitAnyNonNewline() { switch options.semanticLevel { case .graphemeCluster: - builder.buildConsumeNonNewline() + builder.buildConsumeNonNewline(reverse: reverse) case .unicodeScalar: - builder.buildConsumeScalarNonNewline() + builder.buildConsumeScalarNonNewline(reverse: reverse) } } @@ -341,20 +359,42 @@ fileprivate extension Compiler.ByteCodeGen { // be glueing sub-grapheme components together? try emitNode(node) } + + mutating func emitLookaround( + _ kind: (forwards: Bool, positive: Bool), + _ child: DSLTree.Node + ) throws { + guard !child.containsCustomConsumer else { + throw Unsupported("Lookarounds with custom consumers") + } + + options.beginScope() + defer { options.endScope() } + if !kind.forwards { + // The scope is opened at function level so `.reverse` stays applied while the child is emitted. TODO: JH - Is it okay to use .fake here? 
+ options.apply(.init(adding: [.init(.reverse, location: .fake)])) + } + + if kind.positive { + try emitPositiveLookaround(child) + } else { + try emitNegativeLookaround(child) + } + } - mutating func emitPositiveLookahead(_ child: DSLTree.Node) throws { + mutating func emitPositiveLookaround(_ child: DSLTree.Node) throws { /* - save(restoringAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from fail(preservingCaptures: true) // ->success - intercept: - clearSavePoint // remove success - fail // propagate failure - success: - ... - */ + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... + */ let intercept = builder.makeAddress() let success = builder.makeAddress() @@ -370,8 +410,8 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(success) } - - mutating func emitNegativeLookahead(_ child: DSLTree.Node) throws { + + mutating func emitNegativeLookaround(_ child: DSLTree.Node) throws { /* save(restoringAt: success) save(restoringAt: intercept) @@ -399,20 +439,6 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(success) } - - mutating func emitLookaround( - _ kind: (forwards: Bool, positive: Bool), - _ child: DSLTree.Node - ) throws { - guard kind.forwards else { - throw Unsupported("backwards assertions") - } - if kind.positive { - try emitPositiveLookahead(child) - } else { - try emitNegativeLookahead(child) - } - } mutating func emitAtomicNoncapturingGroup( _ child: DSLTree.Node @@ -472,15 +498,14 @@ fileprivate extension Compiler.ByteCodeGen { options.beginScope() defer { options.endScope() } - if let lookaround = kind.lookaroundKind { - try emitLookaround(lookaround, child) - return - } - switch kind { case .lookahead, .negativeLookahead, 
.lookbehind, .negativeLookbehind: - throw Unreachable("TODO: reason") + guard let lookaround = kind.lookaroundKind else { + throw Unreachable("TODO: reason") + } + + try emitLookaround(lookaround, child) case .capture, .namedCapture, .balancedCapture: throw Unreachable("These should produce a capture node") @@ -491,7 +516,7 @@ fileprivate extension Compiler.ByteCodeGen { } options.apply(optionSequence) try emitNode(child) - + case .atomicNonCapturing: try emitAtomicNoncapturingGroup(child) @@ -768,7 +793,8 @@ fileprivate extension Compiler.ByteCodeGen { guard let bitset = ccc.asAsciiBitset(options) else { return false } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) case .atom(let atom): switch atom { @@ -778,24 +804,24 @@ fileprivate extension Compiler.ByteCodeGen { guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { return false } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) } else { // Uncased character OR case-sensitive matching; match as a single scalar ascii value character guard let val = c._singleScalarAsciiValue else { return false } - builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) } case .any: builder.buildQuantifyAny( - matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) case .anyNonNewline: 
builder.buildQuantifyAny( - matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) case .dot: builder.buildQuantifyAny( - matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) case .characterClass(let cc): // Custom character class that consumes a single grapheme @@ -805,7 +831,9 @@ fileprivate extension Compiler.ByteCodeGen { kind, minTrips, maxExtraTrips, - isScalarSemantics: isScalarSemantics) + isScalarSemantics: isScalarSemantics, + reverse: reverse + ) default: return false } @@ -1119,9 +1147,9 @@ fileprivate extension Compiler.ByteCodeGen { if let asciiBitset = ccc.asAsciiBitset(options), optimizationsEnabled { if options.semanticLevel == .unicodeScalar { - builder.buildScalarMatchAsciiBitset(asciiBitset) + builder.buildScalarMatchAsciiBitset(asciiBitset, reverse: reverse) } else { - builder.buildMatchAsciiBitset(asciiBitset) + builder.buildMatchAsciiBitset(asciiBitset, reverse: reverse) } return } @@ -1203,7 +1231,7 @@ fileprivate extension Compiler.ByteCodeGen { return [node] } } - let children = children + var children = children .flatMap(flatten) .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in switch node { @@ -1222,6 +1250,9 @@ fileprivate extension Compiler.ByteCodeGen { return false } } + if reverse { + children.reverse() + } for child in children { try emitConcatenationComponent(child) } @@ -1230,7 +1261,6 @@ fileprivate extension Compiler.ByteCodeGen { @discardableResult mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? 
{ switch node { - case let .orderedChoice(children): try emitAlternation(children) @@ -1389,3 +1419,28 @@ extension DSLTree.CustomCharacterClass { return false } } + +extension DSLTree.Node { + var containsCustomConsumer: Bool { + switch self { + case .orderedChoice(let array), .concatenation(let array): + array.contains { $0.containsCustomConsumer } + case .capture(_, _, let node, _): + node.containsCustomConsumer + case .nonCapturingGroup(_, let node): + node.containsCustomConsumer + case .ignoreCapturesInTypedOutput(let node): + node.containsCustomConsumer + case .conditional(_, let node, let node2): + node.containsCustomConsumer || node2.containsCustomConsumer + case .quantification(_, _, let node): + node.containsCustomConsumer + case .convertedRegexLiteral(let node, _): + node.containsCustomConsumer + case .customCharacterClass, .atom, .trivia, .empty, .quotedLiteral, .absentFunction, .characterPredicate: + false + case .consumer, .matcher: + true + } + } +} diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 80bfd9b05..9adc23da7 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -94,6 +94,13 @@ extension Instruction { /// Operand: Amount to advance by. case advance + /// Reverse the input position. + /// + /// reverse(_ amount: Distance) + /// + /// Operand: Amount to reverse by. + case reverse + // TODO: Is the amount useful here? Is it commonly more than 1? /// Composite assert-advance else restore. @@ -105,6 +112,15 @@ extension Instruction { /// - Boolean for if we should match in a case insensitive way case match + /// Composite reverse-assert else restore. + /// + /// match(_: EltReg, isCaseInsensitive: Bool) + /// + /// Operands: + /// - Element register to compare against. 
+ /// - Boolean for if we should match in a case insensitive way + case reverseMatch + /// Match against a scalar and possibly perform a boundary check or match in a case insensitive way /// /// matchScalar(_: Unicode.Scalar, isCaseInsensitive: Bool, boundaryCheck: Bool) @@ -112,6 +128,13 @@ extension Instruction { /// Operands: Scalar value to match against and booleans case matchScalar + /// Reverse match against a scalar and possibly perform a boundary check or reverse match in a case insensitive way + /// + /// reverseMatchScalar(_: Unicode.Scalar, isCaseInsensitive: Bool, boundaryCheck: Bool) + /// + /// Operands: Scalar value to match against and booleans + case reverseMatchScalar + /// Match directly (binary semantics) against a series of UTF-8 bytes /// /// NOTE: Compiler should ensure to only emit this instruction when normalization @@ -123,6 +146,17 @@ extension Instruction { /// matchUTF8(_: UTF8Register, boundaryCheck: Bool) case matchUTF8 + /// Reverse match directly (binary semantics) against a series of UTF-8 bytes + /// + /// NOTE: Compiler should ensure to only emit this instruction when normalization + /// is not required. E.g., scalar-semantic mode or when the matched portion is entirely ASCII + /// (which is invariant under NFC). Similarly, this is case-sensitive. + /// + /// TODO: should we add case-insensitive? 
+ /// + /// reverseMatchUTF8(_: UTF8Register, boundaryCheck: Bool) + case reverseMatchUTF8 + /// Match a character or a scalar against a set of valid ascii values stored in a bitset /// /// matchBitset(_: AsciiBitsetRegister, isScalar: Bool) @@ -132,6 +166,15 @@ extension Instruction { /// - Boolean for if we should match by scalar value case matchBitset + /// Reverse match a character or a scalar against a set of valid ascii values stored in a bitset + /// + /// reverseMatchBitset(_: AsciiBitsetRegister, isScalar: Bool) + /// + /// Operand: + /// - Ascii bitset register containing the bitset + /// - Boolean for if we should match by scalar value + case reverseMatchBitset + /// Match against a built-in character class /// /// matchBuiltin(_: CharacterClassPayload) @@ -141,11 +184,25 @@ extension Instruction { /// - If it is inverted /// - If it strictly matches only ascii values case matchBuiltin - + + /// Reverse match against a built-in character class + /// + /// reverseMatchBuiltin(_: CharacterClassPayload) + /// + /// Operand: the payload contains + /// - The character class + /// - If it is inverted + /// - If it strictly matches only ascii values + case reverseMatchBuiltin + /// Matches any non newline character /// Operand: If we are in scalar mode or not case matchAnyNonNewline + /// Reverse matches any non newline character + /// Operand: If we are in scalar mode or not + case reverseMatchAnyNonNewline + // MARK: Extension points /// Advance the input position based on the result by calling the consume @@ -212,7 +269,7 @@ extension Instruction { /// Fused save-and-branch. 
/// - /// split(to: target, saving: backtrackPoint) + /// split(to: target, saving: backtrackPoint) /// case splitSaving @@ -223,6 +280,13 @@ extension Instruction { /// quantify(_:QuantifyPayload) /// case quantify + /// Fused reverse quantify, execute, save instruction + /// Quantifies the stored instruction in an inner loop instead of looping through instructions in processor + /// Only quantifies specific nodes + /// + /// reverseQuantify(_:QuantifyPayload) + /// + case reverseQuantify /// Begin the given capture /// /// beginCapture(_:CapReg) @@ -266,7 +330,6 @@ extension Instruction { // TODO: Fused assertions. It seems like we often want to // branch based on assertion fail or success. - } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 1a26421eb..07a685007 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -14,7 +14,7 @@ internal import _RegexParser // For errors extension MEProgram { struct Builder { var instructions: [Instruction] = [] - + // Tracing var enableTracing = false var enableMetrics = false @@ -179,56 +179,74 @@ extension MEProgram.Builder { mutating func buildAdvance(_ n: Distance) { instructions.append(.init(.advance, .init(distance: n))) } - + + mutating func buildReverse(_ n: Distance) { + instructions.append(.init(.reverse, .init(distance: n))) + } + + mutating func buildReverseUnicodeScalar(_ n: Distance) { + instructions.append(.init(.reverse, .init(distance: n, isScalarDistance: true))) + } + mutating func buildAdvanceUnicodeScalar(_ n: Distance) { instructions.append( .init(.advance, .init(distance: n, isScalarDistance: true))) } - - mutating func buildConsumeNonNewline() { - instructions.append(.init(.matchAnyNonNewline, .init(isScalar: false))) + + mutating func buildConsumeNonNewline(reverse: Bool) { + let opcode = reverse ? 
Instruction.OpCode.reverseMatchAnyNonNewline : .matchAnyNonNewline + instructions.append(.init(opcode, .init(isScalar: false))) } - - mutating func buildConsumeScalarNonNewline() { - instructions.append(.init(.matchAnyNonNewline, .init(isScalar: true))) + + mutating func buildConsumeScalarNonNewline(reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatchAnyNonNewline : .matchAnyNonNewline + instructions.append(.init(opcode, .init(isScalar: true))) } - mutating func buildMatch(_ e: Character, isCaseInsensitive: Bool) { + mutating func buildMatch(_ e: Character, isCaseInsensitive: Bool, reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatch : .match instructions.append(.init( - .match, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) + opcode, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) } - mutating func buildMatchUTF8(_ utf8: Array, boundaryCheck: Bool) { - instructions.append(.init(.matchUTF8, .init( + mutating func buildMatchUTF8(_ utf8: Array, boundaryCheck: Bool, reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatchUTF8 : .matchUTF8 + instructions.append(.init(opcode, .init( utf8: utf8Contents.store(utf8), boundaryCheck: boundaryCheck))) } - mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) { - instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck))) - } - - mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool) { - instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: true, boundaryCheck: boundaryCheck))) + mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool, reverse: Bool) { + let opcode = reverse ? 
Instruction.OpCode.reverseMatchScalar : .matchScalar + instructions.append(.init(opcode, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck))) } + mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool, reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatchScalar : .matchScalar + instructions.append(.init(opcode, .init(scalar: s, caseInsensitive: true, boundaryCheck: boundaryCheck))) + } mutating func buildMatchAsciiBitset( - _ b: DSLTree.CustomCharacterClass.AsciiBitset + _ b: DSLTree.CustomCharacterClass.AsciiBitset, + reverse: Bool ) { + let opcode = reverse ? Instruction.OpCode.reverseMatchBitset : .matchBitset instructions.append(.init( - .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: false))) + opcode, .init(bitset: makeAsciiBitset(b), isScalar: false))) } mutating func buildScalarMatchAsciiBitset( - _ b: DSLTree.CustomCharacterClass.AsciiBitset + _ b: DSLTree.CustomCharacterClass.AsciiBitset, + reverse: Bool ) { + let opcode = reverse ? Instruction.OpCode.reverseMatchBitset : .matchBitset instructions.append(.init( - .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true))) + opcode, .init(bitset: makeAsciiBitset(b), isScalar: true))) } - - mutating func buildMatchBuiltin(model: _CharacterClassModel) { + + mutating func buildMatchBuiltin(model: _CharacterClassModel, reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatchBuiltin : .matchBuiltin instructions.append(.init( - .matchBuiltin, .init(model))) + opcode, .init(model))) } mutating func buildConsume( @@ -261,10 +279,12 @@ extension MEProgram.Builder { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ maxExtraTrips: Int?, - isScalarSemantics: Bool + isScalarSemantics: Bool, + reverse: Bool ) { + let opcode = reverse ? 
Instruction.OpCode.reverseQuantify : .quantify instructions.append(.init( - .quantify, + opcode, .init(quantify: .init(bitset: makeAsciiBitset(bitset), kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics)))) } @@ -273,10 +293,12 @@ extension MEProgram.Builder { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ maxExtraTrips: Int?, - isScalarSemantics: Bool + isScalarSemantics: Bool, + reverse: Bool ) { + let opcode = reverse ? Instruction.OpCode.reverseQuantify : .quantify instructions.append(.init( - .quantify, + opcode, .init(quantify: .init(asciiChar: asciiChar, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics)))) } @@ -285,10 +307,12 @@ extension MEProgram.Builder { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ maxExtraTrips: Int?, - isScalarSemantics: Bool + isScalarSemantics: Bool, + reverse: Bool ) { + let opcode = reverse ? Instruction.OpCode.reverseQuantify : .quantify instructions.append(.init( - .quantify, + opcode, .init(quantify: .init(matchesNewlines: matchesNewlines, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics)))) } @@ -297,10 +321,12 @@ extension MEProgram.Builder { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ maxExtraTrips: Int?, - isScalarSemantics: Bool + isScalarSemantics: Bool, + reverse: Bool ) { + let opcode = reverse ? 
Instruction.OpCode.reverseQuantify : .quantify instructions.append(.init( - .quantify, + opcode, .init(quantify: .init(model: model,kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics)))) } @@ -583,7 +609,7 @@ extension MEProgram.Builder { defer { asciiBitsets.append(b) } return AsciiBitsetRegister(asciiBitsets.count) } - + mutating func makeConsumeFunction( _ f: @escaping MEProgram.ConsumeFunction ) -> ConsumeFunctionRegister { diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index ab47a1a5f..691de6ef7 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -30,6 +30,27 @@ extension Processor { return true } + mutating func reverseMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> Bool { + guard currentPosition >= start, let previous = input.reverseMatchBuiltinCC( + cc, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) else { + signalFailure() + return false + } + currentPosition = previous + return true + } + func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { // TODO: needs benchmark coverage if currentPosition == subjectBounds.lowerBound { return true } @@ -160,7 +181,49 @@ extension String { ? nil : (substr.first!, substr.endIndex) } - + + /// Returns the character at `pos`, bounded by `start`, as well as the lower + /// boundary of the returned character. + /// + /// This function handles loading a character from a string while respecting + /// a start boundary, even if that start boundary is sub-character or sub-scalar. + /// + /// - If `pos` is at or before `start`, this function returns `nil`. 
+ /// - If `start` is between `pos` and the next grapheme cluster boundary (i.e., + /// `start` is before `self.index(after: pos)`, then the returned character + /// is smaller than the one that would be produced by `self[pos]` and the + /// returned index is at the start of that character. + /// - If `start` is between `pos` and the next grapheme cluster boundary, and + /// is not on a Unicode scalar boundary, the partial scalar is dropped. This + /// can result in a `nil` return or a character that includes only part of + /// the `self[pos]` character. + /// + /// - Parameters: + /// - pos: The position to load a character from. + /// - start: The limit for the character at `pos`. + /// - Returns: The character at `pos`, bounded by `start`, if it exists, along + /// with the lower bound of that character. The lower bound is always + /// scalar-aligned. + func characterAndStart( + at pos: String.Index, + limitedBy start: String.Index + ) -> (Character, characterStart: String.Index)? { + // FIXME: Sink into the stdlib to avoid multiple boundary calculations + guard pos > start else { return nil } + let previous = index(before: pos) + if previous >= start { + return (self[pos], previous) + } + + // `start` must be a sub-character position that is between `pos` and the + // next grapheme boundary. This is okay if `start` is on a Unicode scalar + // boundary, but if it's in the middle of a scalar's code units, there + // may not be a character to return at all after rounding down. Use + // `Substring`'s rounding to determine what we can return. + let substr = self[start.. String.Index? 
{ + guard currentPosition > start else { return nil } + if case .definite(let result) = _quickReverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughReverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughReverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) + } + @inline(__always) private func _quickMatchAnyNonNewline( at currentPosition: String.Index, @@ -205,6 +291,27 @@ extension String { } } + @inline(__always) + private func _quickReverseMatchAnyNonNewline( + at currentPosition: String.Index, + limitedBy start: String.Index, + isScalarSemantics: Bool + ) -> QuickResult { + assert(currentPosition > start) + guard let (asciiValue, previous, isCRLF) = _quickReverseASCIICharacter( + at: currentPosition, limitedBy: start + ) else { + return .unknown + } + switch asciiValue { + case (._lineFeed)...(._carriageReturn): + return .definite(nil) + default: + assert(!isCRLF) + return .definite(previous) + } + } + @inline(never) private func _thoroughMatchAnyNonNewline( at currentPosition: String.Index, @@ -224,6 +331,25 @@ extension String { return next } + @inline(never) + private func _thoroughReverseMatchAnyNonNewline( + at currentPosition: String.Index, + limitedBy start: String.Index, + isScalarSemantics: Bool + ) -> String.Index? 
{ + if isScalarSemantics { + guard currentPosition > start else { return nil } + let scalar = unicodeScalars[currentPosition] + guard !scalar.isNewline else { return nil } + return unicodeScalars.index(before: currentPosition) + } + + guard let (char, previous) = characterAndStart(at: currentPosition, limitedBy: start), + !char.isNewline + else { return nil } + return previous + } + internal func matchRegexDot( at currentPosition: Index, limitedBy end: Index, @@ -282,6 +408,42 @@ extension String { isScalarSemantics: isScalarSemantics) } + func reverseMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + limitedBy start: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? { + guard currentPosition > start else { return nil } + if case .definite(let result) = _quickReverseMatchBuiltinCC( + cc, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughReverseMatchBuiltinCC( + cc, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughReverseMatchBuiltinCC( + cc, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } + + // TODO: JH - Is there any value in testing this? How would it be tested? // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(__always) private func _quickMatchBuiltinCC( @@ -289,7 +451,7 @@ extension String { at currentPosition: String.Index, limitedBy end: String.Index, isInverted: Bool, - isStrictASCII: Bool, + isStrictASCII: Bool, // TODO: JH - Is this just reserved for future use? A relic of the past? 
isScalarSemantics: Bool ) -> QuickResult { assert(currentPosition < end) @@ -304,6 +466,28 @@ extension String { return .definite(result == isInverted ? nil : next) } + @inline(__always) + private func _quickReverseMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + limitedBy start: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> QuickResult { + assert(currentPosition > start) + guard let (previous, result) = _quickReverseMatch( + cc, + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics + ) else { + return .unknown + } + return .definite(result == isInverted ? nil : previous) + } + + // TODO: JH - How can this be unit tested? // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) private func _thoroughMatchBuiltinCC( @@ -386,4 +570,87 @@ extension String { } return next } + + @inline(never) + private func _thoroughReverseMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + limitedBy start: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? 
{ + // TODO: Branch here on scalar semantics + // Don't want to pay character cost if unnecessary + guard let (char, previousIndex) = + characterAndStart(at: currentPosition, limitedBy: start) + else { return nil } + var previous = previousIndex + let scalar = unicodeScalars[currentPosition] + + let asciiCheck = !isStrictASCII + || (scalar.isASCII && isScalarSemantics) + || char.isASCII + + var matched: Bool + if isScalarSemantics && cc != .anyGrapheme { + previous = unicodeScalars.index(before: currentPosition) + } + + switch cc { + case .any, .anyGrapheme: + matched = true + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil && asciiCheck + } else { + matched = char.isNumber && asciiCheck + } + case .horizontalWhitespace: + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } + case .verticalWhitespace: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } + case .newlineSequence: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && previous >= start && unicodeScalars[previous] == "\n" { + // Match a full CR-LF sequence even in scalar semantics + unicodeScalars.formIndex(after: &previous) + } + } else { + matched = char._isNewline && asciiCheck + } + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck + } + case .word: + if isScalarSemantics { + matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck + } + } + + if isInverted { + matched.toggle() + } + + guard matched else { + return nil + } + + return previous + } } diff --git a/Sources/_StringProcessing/Engine/MEReverseQuantify.swift b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift new file mode 
100644 index 000000000..5f1afb1bc --- /dev/null +++ b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift @@ -0,0 +1,177 @@ +extension Processor { + func _doReverseQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { + let isScalarSemantics = payload.isScalarSemantics + + switch payload.type { + case .asciiBitset: + return input.reverseMatchASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) + case .asciiChar: + return input.reverseMatchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), + at: currentPosition, + limitedBy: start, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) + case .builtinCC: + guard currentPosition >= start else { return nil } + + // We only emit .quantify if it consumes a single character + return input.reverseMatchBuiltinCC( + payload.builtinCC, + at: currentPosition, + limitedBy: start, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) + case .any: + guard currentPosition >= start else { return nil } + + if payload.anyMatchesNewline { + if isScalarSemantics { + return input.unicodeScalars.index(before: currentPosition) + } + return input.index(before: currentPosition) + } + + return input.reverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) + } + } + + /// Generic bounded reverseQuantify instruction interpreter + /// - Handles .eager and .posessive + /// - Handles arbitrary minTrips and maxExtraTrips + mutating func runReverseQuantify(_ payload: QuantifyPayload) -> Bool { + assert(payload.quantKind != .reluctant) + + var trips = 0 + var maxExtraTrips = payload.maxExtraTrips + + while trips < payload.minTrips { + guard let previous = _doReverseQuantifyMatch(payload) else { + signalFailure() + return false + } + + currentPosition = previous + + // If we've reached the start of the string but still have 
more trips, fail + if currentPosition == start, trips < payload.minTrips { + signalFailure() + return false + } + + trips += 1 + } + + // If we don't have any more trips to take: + if maxExtraTrips == 0 { + // We're done + return true + } + + // We've already consumed the minimum number of characters, + // If we can't get another match, the reverse quantify was successful + guard let previous = _doReverseQuantifyMatch(payload) else { + return true + } + maxExtraTrips = maxExtraTrips.map { $0 - 1 } + + // Remember the range of valid positions in case we can create a quantified + // save point + var rangeStart = currentPosition + let rangeEnd = currentPosition + currentPosition = previous + + while true { + if maxExtraTrips == 0 { break } + + guard let previous = _doReverseQuantifyMatch(payload) else { + break + } + maxExtraTrips = maxExtraTrips.map({$0 - 1}) + rangeStart = currentPosition + currentPosition = previous + } + + if payload.quantKind == .eager { + savePoints.append(makeQuantifiedSavePoint( + rangeStart.. Bool { + assert(payload.quantKind == .eager + && payload.minTrips == 1 + && payload.maxExtraTrips == nil) + + // Match at least once + guard let previous = _doReverseQuantifyMatch(payload) else { + signalFailure() + return false + } + + // Run `a+` as `aa*` + currentPosition = previous + _doRunEagerZeroOrMoreReverseQuantify(payload) + return true + } + + /// Specialized quantify instruction interpreter for ? 
+ mutating func runZeroOrOneReverseQuantify(_ payload: QuantifyPayload) -> Bool { + assert(payload.minTrips == 0 + && payload.maxExtraTrips == 1) + let previous = _doReverseQuantifyMatch(payload) + guard let idx = previous else { + return true // matched zero times + } + if payload.quantKind != .possessive { + // Save the zero match + savePoints.append(makeSavePoint(resumingAt: currentPC+1)) + } + currentPosition = idx + return true + } +} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 0bf19b829..6a5244793 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -219,6 +219,33 @@ extension Processor { return false } + // Reverse in our input + // + // Returns whether the reverse succeeded. On failure, our + // save point was restored + mutating func reverseConsume(_ n: Distance) -> Bool { + // TODO: needs benchmark coverage + if let idx = input.index( + currentPosition, offsetBy: -n.rawValue, limitedBy: start + ) { + currentPosition = idx + return true + } + + // If `start` falls in the middle of a character, and we are trying to reverse + // by one "character", then we should max out at `start` even though the above + // reversal will result in `nil`. 
+ if n == 1, let idx = input.unicodeScalars.index( + currentPosition, offsetBy: -n.rawValue, limitedBy: start + ) { + currentPosition = idx + return true + } + + signalFailure() + return false + } + // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { // TODO: needs benchmark coverage @@ -232,6 +259,19 @@ extension Processor { return true } + // Reverses in unicode scalar view + mutating func reverseConsumeScalar(_ n: Distance) -> Bool { + // TODO: needs benchmark coverage + guard let idx = input.unicodeScalars.index( + currentPosition, offsetBy: -n.rawValue, limitedBy: start + ) else { + signalFailure() + return false + } + currentPosition = idx + return true + } + /// Continue matching at the specified index. /// /// - Precondition: `bounds.contains(index) || index == bounds.upperBound` @@ -279,6 +319,33 @@ extension Processor { return true } + // Reverse match against the current input element. Returns whether + // it succeeded vs signaling an error. + mutating func reverseMatch( + _ e: Element, isCaseInsensitive: Bool + ) -> Bool { + let previous = input.reverseMatch( + e, + at: currentPosition, + limitedBy: start, + isCaseInsensitive: isCaseInsensitive + ) + + guard let previous else { + guard currentPosition == start else { + // If there's no previous character, and we're not + // at the start of the string, the match has failed + signalFailure() + return false + } + + return true + } + + currentPosition = previous + return true + } + // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. 
mutating func matchSeq( @@ -318,6 +385,32 @@ extension Processor { return true } + mutating func reverseMatchScalar( + _ s: Unicode.Scalar, + boundaryCheck: Bool, + isCaseInsensitive: Bool + ) -> Bool { + let previous = input.reverseMatchScalar( + s, + at: currentPosition, + limitedBy: start, + boundaryCheck: boundaryCheck, + isCaseInsensitive: isCaseInsensitive + ) + + guard let previous else { + guard currentPosition == start else { + signalFailure() + return false + } + + return true + } + + currentPosition = previous + return true + } + // TODO: bytes should be a Span or RawSpan mutating func matchUTF8( _ bytes: Array, @@ -336,6 +429,24 @@ extension Processor { return true } + // TODO: bytes should be a Span or RawSpan + mutating func reverseMatchUTF8( + _ bytes: Array, + boundaryCheck: Bool + ) -> Bool { + guard let previous = input.reverseMatchUTF8( + bytes, + at: currentPosition, + limitedBy: start, + boundaryCheck: boundaryCheck + ) else { + signalFailure() + return false + } + currentPosition = previous + return true + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset @@ -356,6 +467,26 @@ extension Processor { return true } + // If we have a bitset we know that the CharacterClass only matches against + // ascii characters, so check if the current input element is ascii then + // check if it is set in the bitset + mutating func reverseMatchBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + isScalarSemantics: Bool + ) -> Bool { + guard let previous = input.reverseMatchASCIIBitset( + bitset, + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics + ) else { + signalFailure() + return false + } + currentPosition = previous + return true + } + // Matches the next character/scalar if it is not a newline mutating func matchAnyNonNewline( isScalarSemantics: Bool @@ -372,6 +503,22 @@ 
extension Processor { return true } + // Matches the previous character/scalar if it is not a newline + mutating func reverseMatchAnyNonNewline( + isScalarSemantics: Bool + ) -> Bool { + guard let previous = input.reverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics + ) else { + signalFailure() + return false + } + currentPosition = previous + return true + } + mutating func signalFailure(preservingCaptures: Bool = false) { guard !savePoints.isEmpty else { state = .fail @@ -535,16 +682,35 @@ extension Processor { controller.step() } } + case .reverse: + let (isScalar, distance) = payload.distance + if isScalar { + if reverseConsumeScalar(distance) { + controller.step() + } + } else { + if reverseConsume(distance) { + controller.step() + } + } case .matchAnyNonNewline: if matchAnyNonNewline(isScalarSemantics: payload.isScalar) { controller.step() } + case .reverseMatchAnyNonNewline: + if reverseMatchAnyNonNewline(isScalarSemantics: payload.isScalar) { + controller.step() + } case .match: let (isCaseInsensitive, reg) = payload.elementPayload if match(registers[reg], isCaseInsensitive: isCaseInsensitive) { controller.step() } - + case .reverseMatch: + let (isCaseInsensitive, reg) = payload.elementPayload + if reverseMatch(registers[reg], isCaseInsensitive: isCaseInsensitive) { + controller.step() + } case .matchScalar: let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload if matchScalar( @@ -554,6 +720,15 @@ extension Processor { ) { controller.step() } + case .reverseMatchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if reverseMatchScalar( + scalar, + boundaryCheck: boundaryCheck, + isCaseInsensitive: caseInsensitive + ) { + controller.step() + } case .matchUTF8: let (utf8Reg, boundaryCheck) = payload.matchUTF8Payload @@ -564,13 +739,27 @@ extension Processor { controller.step() } + case .reverseMatchUTF8: + let (utf8Reg, boundaryCheck) = payload.matchUTF8Payload 
+ let utf8Content = registers[utf8Reg] + if reverseMatchUTF8( + utf8Content, boundaryCheck: boundaryCheck + ) { + controller.step() + } + case .matchBitset: let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] if matchBitset(bitset, isScalarSemantics: isScalar) { controller.step() } - + case .reverseMatchBitset: + let (isScalar, reg) = payload.bitsetPayload + let bitset = registers[reg] + if reverseMatchBitset(bitset, isScalarSemantics: isScalar) { + controller.step() + } case .matchBuiltin: let payload = payload.characterClassPayload if matchBuiltinCC( @@ -581,10 +770,40 @@ extension Processor { ) { controller.step() } + case .reverseMatchBuiltin: + let payload = payload.characterClassPayload + if reverseMatchBuiltinCC( + payload.cc, + isInverted: payload.isInverted, + isStrictASCII: payload.isStrictASCII, + isScalarSemantics: payload.isScalarSemantics + ) { + controller.step() + } case .quantify: if runQuantify(payload.quantify) { controller.step() } + case .reverseQuantify: + let quantPayload = payload.quantify + let matched: Bool + switch (quantPayload.quantKind, quantPayload.minTrips, quantPayload.maxExtraTrips) { + case (.reluctant, _, _): + assertionFailure(".reluctant is not supported by .quantify") + return + case (.eager, 0, nil): + runEagerZeroOrMoreReverseQuantify(quantPayload) + matched = true + case (.eager, 1, nil): + matched = runEagerOneOrMoreReverseQuantify(quantPayload) + case (_, 0, 1): + matched = runZeroOrOneReverseQuantify(quantPayload) + default: + matched = runReverseQuantify(quantPayload) + } + if matched { + controller.step() + } case .consumeBy: let reg = payload.consumer @@ -715,6 +934,25 @@ extension String { return next } + func reverseMatch( + _ char: Character, + at pos: Index, + limitedBy start: String.Index, + isCaseInsensitive: Bool + ) -> Index? 
{ + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + guard let (stringChar, next) = characterAndStart(at: pos, limitedBy: start) else { return nil } + + if isCaseInsensitive { + guard stringChar.lowercased() == char.lowercased() else { return nil } + } else { + guard stringChar == char else { return nil } + } + + return next + } + func matchSeq( _ seq: Substring, at pos: Index, @@ -774,6 +1012,37 @@ extension String { return idx } + func reverseMatchScalar( + _ scalar: Unicode.Scalar, + at pos: Index, + limitedBy start: String.Index, + boundaryCheck: Bool, + isCaseInsensitive: Bool + ) -> Index? { + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + guard pos > start else { return nil } + let curScalar = unicodeScalars[pos] + + if isCaseInsensitive { + guard curScalar.properties.lowercaseMapping == scalar.properties.lowercaseMapping + else { + return nil + } + } else { + guard curScalar == scalar else { return nil } + } + + let idx = unicodeScalars.index(before: pos) + assert(idx >= start, "Input is a substring with a sub-scalar startIndex.") + + if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { + return nil + } + + return idx + } + func matchUTF8( _ bytes: Array, at pos: Index, @@ -786,7 +1055,28 @@ extension String { self.utf8.formIndex(after: &cur) } - guard cur <= end else { return nil } + assert(cur <= end) + + if boundaryCheck && !isOnGraphemeClusterBoundary(cur) { + return nil + } + + return cur + } + + func reverseMatchUTF8( + _ bytes: Array, + at pos: Index, + limitedBy start: Index, + boundaryCheck: Bool + ) -> Index? 
{ + var cur = pos + for b in bytes.reversed() { + guard cur > start, self.utf8[cur] == b else { return nil } + self.utf8.formIndex(before: &cur) + } + + assert(cur > start) if boundaryCheck && !isOnGraphemeClusterBoundary(cur) { return nil @@ -844,4 +1134,54 @@ extension String { return next } + + func reverseMatchASCIIBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + at pos: Index, + limitedBy start: Index, + isScalarSemantics: Bool + ) -> Index? { + + // FIXME: Inversion should be tracked and handled in only one place. + // That is, we should probably store it as a bit in the instruction, so that + // bitset matching and bitset inversion is bit-based rather that semantically + // inverting the notion of a match or not. As-is, we need to track both + // meanings in some code paths. + let isInverted = bitset.isInverted + + // TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment + // there + guard let (asciiByte, previous, isCRLF) = _quickReverseASCIICharacter( + at: pos, + limitedBy: start + ) else { + if isScalarSemantics { + guard pos >= start else { return nil } + guard bitset.matches(unicodeScalars[pos]) else { return nil } + return unicodeScalars.index(before: pos) + } else { + guard let (char, previous) = characterAndStart(at: pos, limitedBy: start), + bitset.matches(char) else { return nil } + return previous + } + } + + guard bitset.matches(asciiByte) else { + // FIXME: check inversion here after refactored out of bitset + return nil + } + + // CR-LF should only match `[\r]` in scalar semantic mode or if inverted + if isCRLF { + if isScalarSemantics { + return self.unicodeScalars.index(after: previous) + } + if isInverted { + return previous + } + return nil + } + + return previous + } } diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index b67cbb6a5..e43b79264 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ 
b/Sources/_StringProcessing/Engine/Tracing.swift @@ -65,6 +65,13 @@ extension Instruction: CustomStringConvertible { } else { return "match char[\(reg)]" } + case .reverseMatch: + let (isCaseInsensitive, reg) = payload.elementPayload + if isCaseInsensitive { + return "reverseMatchCaseInsensitive char[\(reg)]" + } else { + return "reverseMatch char[\(reg)]" + } case .matchBitset: let (isScalar, reg) = payload.bitsetPayload if isScalar { @@ -72,9 +79,19 @@ extension Instruction: CustomStringConvertible { } else { return "matchBitset bitset[\(reg)]" } + case .reverseMatchBitset: + let (isScalar, reg) = payload.bitsetPayload + if isScalar { + return "reverseMatchBitsetScalar bitset[\(reg)]" + } else { + return "reverseMatchBitset bitset[\(reg)]" + } case .matchBuiltin: let payload = payload.characterClassPayload return "matchBuiltin \(payload.cc) (\(payload.isInverted))" + case .reverseMatchBuiltin: + let payload = payload.characterClassPayload + return "\(opcode) \(payload.cc) (\(payload.isInverted))" case .matchBy: let (matcherReg, valReg) = payload.pairedMatcherValue return "\(opcode) match[\(matcherReg)] -> val[\(valReg)]" @@ -85,6 +102,13 @@ extension Instruction: CustomStringConvertible { } else { return "matchScalar '\(scalar)' boundaryCheck: \(boundaryCheck)" } + case .reverseMatchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + return "reverseMatchScalarCaseInsensitive '\(scalar)' boundaryCheck: \(boundaryCheck)" + } else { + return "reverseMatchScalar '\(scalar)' boundaryCheck: \(boundaryCheck)" + } case .moveCurrentPosition: let reg = payload.position return "\(opcode) -> pos[\(reg)]" @@ -94,6 +118,9 @@ extension Instruction: CustomStringConvertible { case .quantify: let payload = payload.quantify return "\(opcode) \(payload.type) \(payload.minTrips) \(payload.maxExtraTrips?.description ?? 
"unbounded" )" + case .reverseQuantify: + let payload = payload.quantify + return "\(opcode) \(payload.type) \(payload.minTrips) \(payload.maxExtraTrips?.description ?? "unbounded" )" case .save: let resumeAddr = payload.addr return "\(opcode) \(resumeAddr)" @@ -106,6 +133,8 @@ extension Instruction: CustomStringConvertible { case .transformCapture: let (cap, trans) = payload.pairedCaptureTransform return "\(opcode) trans[\(trans)](\(cap))" + case .reverse: + return "\(opcode) \(payload.distance)" default: return "\(opcode)" } diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index 5c136827c..afae5b6fb 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -575,6 +575,9 @@ extension AST.MatchingOption.Kind { // NSRE Compatibility option; no literal representation case .nsreCompatibleDot: return nil + // Reverse option for lookbehinds; no literal representation + case .reverse: return nil + #if RESILIENT_LIBRARIES @unknown default: fatalError() diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 793c6c82d..226451870 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -133,10 +133,14 @@ extension MatchingOptions { var usesCanonicalEquivalence: Bool { semanticLevel == .graphemeCluster } - + var usesNSRECompatibleDot: Bool { stack.last!.contains(.nsreCompatibleDot) } + + var reversed: Bool { + stack.last!.contains(.reverse) + } } // MARK: - Implementation @@ -160,6 +164,9 @@ extension MatchingOptions { case withoutAnchoringBounds case nsreCompatibleDot + // Not available via regex literal flags + case reverse + // Oniguruma options case asciiOnlyDigit case asciiOnlyPOSIXProps @@ -225,6 +232,8 @@ extension MatchingOptions { self = .extended case .extraExtended: self = .extraExtended + case .reverse: + self = .reverse #if 
RESILIENT_LIBRARIES @unknown default: fatalError() diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 8d6a5fbc7..6b20d5e17 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -105,7 +105,7 @@ extension DSLTree { case explicit(_AST.QuantificationKind) /// A kind set via syntax, which can be affected by options. case syntax(_AST.QuantificationKind) - + var ast: AST.Quantification.Kind? { switch self { case .default: return nil @@ -114,12 +114,12 @@ extension DSLTree { } } } - + @_spi(RegexBuilder) public struct CustomCharacterClass { var members: [Member] var isInverted: Bool - + var containsDot: Bool { members.contains { member in switch member { @@ -152,13 +152,13 @@ extension DSLTree { self.members = members self.isInverted = isInverted } - + public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self { let property = AST.Atom.CharacterProperty(.generalCategory(category.extendedGeneralCategory!), isInverted: false, isPOSIX: false) let astAtom = AST.Atom(.property(property), .fake) return .init(members: [.atom(.unconverted(.init(ast: astAtom)))]) } - + public var inverted: CustomCharacterClass { var result = self result.isInverted.toggle() @@ -263,7 +263,7 @@ extension DSLTree.Atom { /// \B case notWordBoundary } - + @_spi(RegexBuilder) public enum CharacterClass: Hashable { case digit @@ -396,7 +396,7 @@ extension DSLTree.Node { @_spi(RegexBuilder) public var children: [DSLTree.Node] { switch self { - + case let .orderedChoice(v): return v case let .concatenation(v): return v @@ -504,12 +504,12 @@ public struct ReferenceID: Hashable { public var _raw: Int { base } - + public init() { base = Self.counter Self.counter += 1 } - + init(_ base: Int) { self.base = base } @@ -854,14 +854,14 @@ extension DSLTree { /// `_TreeNode` conformance. 
struct _Tree: _TreeNode { var node: DSLTree.Node - + init(_ node: DSLTree.Node) { self.node = node } - + var children: [_Tree]? { switch node { - + case let .orderedChoice(v): return v.map(_Tree.init) case let .concatenation(v): return v.map(_Tree.init) @@ -892,7 +892,7 @@ extension DSLTree { @_spi(RegexBuilder) public struct GroupKind { internal var ast: AST.Group.Kind - + public static var atomicNonCapturing: Self { .init(ast: .atomicNonCapturing) } @@ -902,17 +902,23 @@ extension DSLTree { public static var negativeLookahead: Self { .init(ast: .negativeLookahead) } + public static var lookbehind: Self { + .init(ast: .lookbehind) + } + public static var negativeLookbehind: Self { + .init(ast: .negativeLookbehind) + } } @_spi(RegexBuilder) public struct ConditionKind { internal var ast: AST.Conditional.Condition.Kind } - + @_spi(RegexBuilder) public struct QuantificationKind { internal var ast: AST.Quantification.Kind - + public static var eager: Self { .init(ast: .eager) } @@ -923,11 +929,11 @@ extension DSLTree { .init(ast: .possessive) } } - + @_spi(RegexBuilder) public struct QuantificationAmount { internal var ast: AST.Quantification.Amount - + public static var zeroOrMore: Self { .init(ast: .zeroOrMore) } @@ -965,27 +971,27 @@ extension DSLTree { } } } - + @_spi(RegexBuilder) public struct ASTNode { internal var ast: AST.Node } - + @_spi(RegexBuilder) public struct AbsentFunction { internal var ast: AST.AbsentFunction } - + @_spi(RegexBuilder) public struct Reference { internal var ast: AST.Reference } - + @_spi(RegexBuilder) public struct MatchingOptionSequence { internal var ast: AST.MatchingOptionSequence } - + public struct Atom { internal var ast: AST.Atom } diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 53dfe652d..efbe406e3 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -109,7 +109,7 @@ extension String { let tail = 
utf8[next] guard tail._isSub300StartingByte else { return nil } - // Handle CR-LF: + // Handle CR-LF by advancing past the sequence if both characters are present if base == ._carriageReturn && tail == ._lineFeed { utf8.formIndex(after: &next) guard next == end || utf8[next]._isSub300StartingByte else { @@ -122,6 +122,49 @@ extension String { return (first: base, next: next, crLF: false) } + /// TODO: better to take isScalarSemantics parameter, we can return more results + /// and we can give the right `next` index, not requiring the caller to re-adjust it + /// TODO: detailed description of nuanced semantics + func _quickReverseASCIICharacter( + at idx: Index, + limitedBy start: Index + ) -> (first: UInt8, previous: Index, crLF: Bool)? { + // TODO: fastUTF8 version + assert(String.Index(idx, within: unicodeScalars) != nil) + assert(idx >= start) + + // If we're already at the start, there is no previous character + if idx == start { + return nil + } + + let char = utf8[idx] + guard char._isASCII else { + assert(!self[idx].isASCII) + return nil + } + + var previous = utf8.index(before: idx) + if previous == start { + return (first: char, previous: previous, crLF: false) + } + + let head = utf8[previous] + guard head._isSub300StartingByte else { return nil } + + // Handle CR-LF by reversing past the sequence if both characters are present + if char == ._lineFeed && head == ._carriageReturn { + utf8.formIndex(before: &previous) + guard previous == start || utf8[previous]._isSub300StartingByte else { + return nil + } + return (first: char, previous: previous, crLF: true) + } + + assert(self[idx].isASCII && self[idx] != "\r\n") + return (first: char, previous: previous, crLF: false) + } + func _quickMatch( _ cc: _CharacterClassModel.Representation, at idx: Index, @@ -169,5 +212,50 @@ extension String { } } -} + func _quickReverseMatch( + _ cc: _CharacterClassModel.Representation, + at idx: Index, + limitedBy start: Index, + isScalarSemantics: Bool + ) -> (previous: 
Index, matchResult: Bool)? { + /// ASCII fast-paths + guard let (asciiValue, previous, isCRLF) = _quickReverseASCIICharacter( + at: idx, limitedBy: start + ) else { + return nil + } + + // TODO: bitvectors + switch cc { + case .any, .anyGrapheme: + return (previous, true) + + case .digit: + return (previous, asciiValue._asciiIsDigit) + + case .horizontalWhitespace: + return (previous, asciiValue._asciiIsHorizontalWhitespace) + + case .verticalWhitespace, .newlineSequence: + if asciiValue._asciiIsVerticalWhitespace { + if isScalarSemantics && isCRLF && cc == .verticalWhitespace { + return (utf8.index(after: previous), true) + } + return (previous, true) + } + return (previous, false) + + case .whitespace: + if asciiValue._asciiIsWhitespace { + if isScalarSemantics && isCRLF { + return (utf8.index(after: previous), true) + } + return (previous, true) + } + return (previous, false) + case .word: + return (previous, asciiValue._asciiIsWord) + } + } +} diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 0c224e159..3cce8a80b 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -167,7 +167,25 @@ public struct _RegexFactory { ) -> Regex { .init(node: .nonCapturingGroup(.negativeLookahead, component.regex.root)) } - + + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func lookbehindNonCapturing( + _ component: some RegexComponent + ) -> Regex { + // TODO: Compiler error if component contains a custom consumer? + .init(node: .nonCapturingGroup(.lookbehind, component.regex.root)) + } + + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func negativeLookbehindNonCapturing( + _ component: some RegexComponent + ) -> Regex { + // TODO: Compiler error if component contains a custom consumer? 
+ .init(node: .nonCapturingGroup(.negativeLookbehind, component.regex.root)) + } + @available(SwiftStdlib 5.7, *) public func orderedChoice( _ component: some RegexComponent diff --git a/Tests/DocumentationTests/RegexBuilderTests.swift b/Tests/DocumentationTests/RegexBuilderTests.swift index d0ae36e01..e9535cdf3 100644 --- a/Tests/DocumentationTests/RegexBuilderTests.swift +++ b/Tests/DocumentationTests/RegexBuilderTests.swift @@ -205,3 +205,32 @@ extension RegexBuilderTests { XCTAssertEqual(matches[0].1, 121.54) } } + +@available(SwiftStdlib 5.10, *) +extension RegexBuilderTests { + func testPositiveLookbehind() throws { + let regex = Regex { + Lookbehind { "foo" } + "bar" + } + + let matching = try regex.firstMatch(in: "foobar")?.output // == "bar" + let nonMatching = try regex.firstMatch(in: "fuubar")?.output // == nil + + try XCTAssertEqual(XCTUnwrap(matching), "bar") + XCTAssertNil(nonMatching) + } + + func testNegativeLookbehind() throws { + let regex = Regex { + NegativeLookbehind { "buzz" } + "baz" + } + + let matching = try regex.firstMatch(in: "foobaz")?.output // == "baz" + let nonMatching = try regex.firstMatch(in: "buzzbaz")?.output // == nil + + try XCTAssertEqual(XCTUnwrap(matching), "baz") + XCTAssertNil(nonMatching) + } +} diff --git a/Tests/MatchingEngineTests/ASCIITests.swift b/Tests/MatchingEngineTests/ASCIITests.swift new file mode 100644 index 000000000..4af6bf28f --- /dev/null +++ b/Tests/MatchingEngineTests/ASCIITests.swift @@ -0,0 +1,359 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest + +@testable import _StringProcessing + +final class QuickASCIICharacterTests: XCTestCase { + func testHappyPath() throws { + // Given + let sut = "foo" + + // When + let result = sut._quickASCIICharacter(at: sut.startIndex, limitedBy: sut.endIndex) + + // Then + let (char, nextIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[sut.startIndex]) + XCTAssertEqual(nextIdx, sut.index(after: sut.startIndex)) + XCTAssertFalse(isCRLF) + } + + func testAtEnd() throws { + // Given + let sut = "foo" + + // When + let result = sut._quickASCIICharacter(at: sut.endIndex, limitedBy: sut.endIndex) + + // Then + XCTAssertNil(result) + } + + func testNonASCIIChar() throws { + // Given + let sut = "é" + + // When + let result = sut._quickASCIICharacter(at: sut.startIndex, limitedBy: sut.endIndex) + + // Then + XCTAssertNil(result) + } + + func testNextIsEnd() throws { + // Given + let sut = "foo" + let index = sut.index(before: sut.endIndex) + + // When + let result = sut._quickASCIICharacter(at: index, limitedBy: sut.endIndex) + + // Then + let (char, nextIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(nextIdx, sut.endIndex) + XCTAssertFalse(isCRLF) + } + + // TODO: JH - Figure out how to test sub 300 starting bytes + func testIsCRLF() throws { + // Given + let sut = "\r\n" + + // When + let result = sut._quickASCIICharacter(at: sut.utf8.startIndex, limitedBy: sut.endIndex) + + // Then + let (char, nextIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[sut.startIndex]) + XCTAssertEqual(nextIdx, sut.endIndex) + XCTAssertTrue(isCRLF) + } +} + +final class QuickReverseASCIICharacterTests: XCTestCase { + func testHappyPath() throws { + // 
Given + let sut = "foo" + let index = sut.index(after: sut.startIndex) + + // When + let result = sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) + + // Then + let (char, previousIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(previousIdx, sut.startIndex) + XCTAssertFalse(isCRLF) + } + + func testAtStart() throws { + // Given + let sut = "foo" + + // When + let result = sut._quickReverseASCIICharacter(at: sut.startIndex, limitedBy: sut.startIndex) + + // Then + XCTAssertNil(result) + } + + func testNonASCIIChar() throws { + // Given + let sut = "é" + + // When + let result = sut._quickReverseASCIICharacter(at: sut.startIndex, limitedBy: sut.startIndex) + + // Then + XCTAssertNil(result) + } + + func testPreviousIsStart() throws { + // Given + let sut = "foo" + let index = sut.index(after: sut.startIndex) + + // When + let result = sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) + + // Then + let (char, previousIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(previousIdx, sut.startIndex) + XCTAssertFalse(isCRLF) + } + + // TODO: JH - Figure out how to test sub 300 starting bytes + func testIsCRLF() throws { + // Given + let sut = "foo\r\n" + // Start at '\n' + let index = sut.utf8.index(before: sut.endIndex) + + // When + let result = sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) + + // Then + let (char, previousIndex, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(previousIndex, sut.index(sut.startIndex, offsetBy: 2)) + XCTAssertTrue(isCRLF) + } +} + +final class ASCIIQuickMatchTests: XCTestCase { + func testAny() throws { + try _test(matching: .any, against: "!") + try _test(matching: .anyGrapheme, against: "!") + } + + func testDigit() throws { + try _test(matching: .digit, against: "1") + try _test(matching: .digit, against: "a", shouldMatch: false) + } + + func 
testHorizontalWhitespace() throws { + try _test(matching: .horizontalWhitespace, against: " ") + try _test(matching: .horizontalWhitespace, against: "\t") + try _test(matching: .horizontalWhitespace, against: "\n", shouldMatch: false) + } + + func testVerticalWhitespace() throws { + try _test(matching: .verticalWhitespace, against: "\n") + try _test(matching: .verticalWhitespace, against: "\t", shouldMatch: false) + try _test(matching: .newlineSequence, against: "\n") + try _test(matching: .newlineSequence, against: "\t", shouldMatch: false) + } + + func testVerticalWhitespaceMatchesCRLF() throws { + let crlf = "\r\n" + + // When using scalar semantics: + // The next index should be the index of the "\n" character + try _test( + matching: .verticalWhitespace, + against: crlf, + expectedNext: crlf.utf8.firstIndex(of: ._lineFeed) + ) + + // When not using scalar semantics: + // The next index should be the index after the whole \r\n sequence (the end index) + try _test( + matching: .verticalWhitespace, + against: crlf, + isScalarSemantics: false + ) + } + + func testWhitespace() throws { + try _test(matching: .whitespace, against: " ") + try _test(matching: .whitespace, against: "\t") + try _test(matching: .whitespace, against: "\n") + try _test(matching: .whitespace, against: "a", shouldMatch: false) + } + + func testWhitespaceCRLF() throws { + // Given + let crlf = "\r\n" + + // When using scalar semantics: + // The next index should be the index of the "\n" character + try _test( + matching: .whitespace, + against: crlf, + expectedNext: crlf.utf8.firstIndex(of: ._lineFeed) + ) + + // When not using scalar semantics: + // The next index should be the index after the whole \r\n sequence (the end index) + try _test( + matching: .whitespace, + against: crlf, + isScalarSemantics: false + ) + } + + func testWord() throws { + // Given + try _test(matching: .word, against: "a") + try _test(matching: .word, against: "1") + try _test(matching: .word, against: "_") + try 
_test(matching: .word, against: "-", shouldMatch: false) + } + + private func _test( + matching cc: _CharacterClassModel.Representation, + against sut: String, + isScalarSemantics: Bool = true, + shouldMatch: Bool = true, + expectedNext: String.Index? = nil + ) throws { + // When + let result = sut._quickMatch( + cc, + at: sut.startIndex, + limitedBy: sut.endIndex, + isScalarSemantics: isScalarSemantics + ) + + // Then + let (next, matched) = try XCTUnwrap(result) + XCTAssertEqual(matched, shouldMatch) + XCTAssertEqual(next, expectedNext ?? sut.endIndex) + } +} + +final class ASCIIQuickReverseMatchTests: XCTestCase { + func testAny() throws { + try _test(matching: .any, against: "1!") + try _test(matching: .anyGrapheme, against: "1!") + } + + func testDigit() throws { + try _test(matching: .digit, against: "a1") + try _test(matching: .digit, against: "1a", shouldMatch: false) + } + + func testHorizontalWhitespace() throws { + try _test(matching: .horizontalWhitespace, against: "a ") + try _test(matching: .horizontalWhitespace, against: "a\t") + try _test(matching: .horizontalWhitespace, against: "a\n", shouldMatch: false) + } + + func testVerticalWhitespace() throws { + try _test(matching: .verticalWhitespace, against: "a\n") + try _test(matching: .verticalWhitespace, against: "a\t", shouldMatch: false) + } + + func testVerticalWhitespaceMatchesCRLF() throws { + let sut = "a\r\n" + + // When using scalar semantics: + // The next index should be the index of the "\n" character + try _test( + matching: .verticalWhitespace, + against: sut, + at: sut.utf8.index(before: sut.utf8.endIndex), + expectedPrevious: sut.utf8.firstIndex(of: ._carriageReturn) + ) + + // When not using scalar semantics: + // The next index should be the index after the whole \r\n sequence (the end index) + try _test( + matching: .verticalWhitespace, + against: sut, + isScalarSemantics: false + ) + } + + func testWhitespace() throws { + try _test(matching: .whitespace, against: "a ") + try 
_test(matching: .whitespace, against: "a\t") + try _test(matching: .whitespace, against: "a\n") + try _test(matching: .whitespace, against: " a", shouldMatch: false) + } + + func testWhitespaceCRLF() throws { + // Given + let sut = "a\r\n" + + // When using scalar semantics: + // The previous index should be the index of the "\r" character + try _test( + matching: .whitespace, + against: sut, + at: sut.utf8.index(before: sut.utf8.endIndex), + expectedPrevious: sut.utf8.firstIndex(of: ._carriageReturn) + ) + + // When not using scalar semantics: + // The previous index should be the index before the whole \r\n sequence + // (the start index) + try _test( + matching: .whitespace, + against: sut, + isScalarSemantics: false + ) + } + + func testWord() throws { + // Given + try _test(matching: .word, against: "!a") + try _test(matching: .word, against: "!1") + try _test(matching: .word, against: "!_") + try _test(matching: .word, against: "a-", shouldMatch: false) + } + + private func _test( + matching cc: _CharacterClassModel.Representation, + against sut: String, + at index: String.Index? = nil, + isScalarSemantics: Bool = true, + shouldMatch: Bool = true, + expectedPrevious: String.Index? = nil + ) throws { + // When + let result = sut._quickReverseMatch( + cc, + at: index ?? sut.index(before: sut.endIndex), + limitedBy: sut.startIndex, + isScalarSemantics: isScalarSemantics + ) + + // Then + let (previous, matched) = try XCTUnwrap(result) + XCTAssertEqual(matched, shouldMatch) + XCTAssertEqual(previous, expectedPrevious ?? sut.startIndex) + } +} diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift index ccfe85ec7..8785b14e8 100644 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -13,5 +13,695 @@ import XCTest @testable import _StringProcessing -// TODO: Unit tests for the engine itself. 
Functional testing -// is handled by regex tests. +final class StringMatchingTests: XCTestCase { + // MARK: characterAndEnd tests + func testCharacterAndEnd_HappyPath() throws { + // Given + let sut = "foo" + + // When + let result = sut.characterAndEnd(at: sut.startIndex, limitedBy: sut.endIndex) + + // Then + let (char, nextIndex) = try XCTUnwrap(result) + XCTAssertEqual(char, "f") + XCTAssertEqual(nextIndex, sut.index(after: sut.startIndex)) + } + + func testCharacterAndEnd_SubcharacterMatch() throws { + // Given a string with 2 subcharacter positions in its utf8 view + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + + let pos = sut.startIndex + let end = sut.utf8.index(after: sut.utf8.startIndex) + + // When + let result = sut.characterAndEnd(at: pos, limitedBy: end) + + // Then + let (char, nextIndex) = try XCTUnwrap(result) + XCTAssertEqual(char, "b") + XCTAssertEqual(nextIndex, end) + } + + func testCharacterAndEnd_SubcharacterMatchEmptyRounded() throws { + // Given a string with 3 sub-character positions in its utf8 view + // \u{62}\u{300}\u{316}\u{335}\u{65}\u{73}\u{74} + let sut = "b̵̖̀est" + + // And a range that doesn't touch a grapheme cluster boundary + // 1[utf8] (aka \u{300}) + let pos = sut.utf8.index(after: sut.startIndex) + // 2[utf8] (aka \u{316}) + let end = sut.utf8.index(sut.startIndex, offsetBy: 2) + + // When we try to get a character from a sub-character range + // of unicode scalars + let result = sut.characterAndEnd(at: pos, limitedBy: end) + + // Then `characterAndEnd` should return nil rather than an empty string + XCTAssertNil(result) + } + + func testCharacterAndEnd_atEnd() { + // Given + let sut = "foo" + + // When + let result = sut.characterAndEnd(at: sut.endIndex, limitedBy: sut.endIndex) + + // Then + XCTAssertNil(result) + } + + // MARK: characterAndStart tests + func testCharacterAndStart_HappyPath() throws { + // Given + let sut = "foo" + let pos = sut.index(before: sut.endIndex) + + // When + let result = 
sut.characterAndStart(at: pos, limitedBy: sut.startIndex) + + // Then + let (char, previousIndex) = try XCTUnwrap(result) + XCTAssertEqual(char, "o") + XCTAssertEqual(previousIndex, sut.index(before: pos)) + } + + // FIXME: JH - Two diacritical marks are considered a character. + // TODO: JH - Learn more about Substring rounding(?) +// func testCharacterAndStart_SubcharacterMatch() throws { +// // Given a string with 2 subcharacter positions in its utf8 view +// // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} +// let sut = "ab̖̀cd" +// +// // 3[utf8] (aka \u{316}) +// let pos = sut.utf8.index(sut.startIndex, offsetBy: 3) +// let start = sut.startIndex//utf8.index(before: pos) +// +// // When +// let result = sut.characterAndStart(at: pos, limitedBy: start) +// +// // Then +// XCTAssertNil(result) +// let (char, nextIndex) = try XCTUnwrap(result) +// XCTAssertEqual(char, "t") +// XCTAssertEqual(nextIndex, end) +// } +// +// func testCharacterAndStart_SubcharacterMatchEmptyRounded() throws { +// // Given a string with 3 sub-character positions in its utf8 view +// // \u{61}\u{62}\u{335}\u{300}\u{316}\u{63}\u{64} +// let sut = "ab̵̖̀cd" +// +// // And a range that doesn't touch a grapheme cluster boundary +// // 4[utf8] (aka \u{335}) +// let pos = sut.utf8.index(sut.startIndex, offsetBy: 4) +// // 3[utf8] (aka \u{300}) +// let start = sut.utf8.index(sut.startIndex, offsetBy: 3) +// +// // When we try to get a character from a sub-character range +// // of unicode scalars +// let result = sut.characterAndStart(at: pos, limitedBy: start) +// +// // Then `characterAndStart` should return nil rather than an empty string +// XCTAssertNil(result) +// } + + func testCharacterAndStart_atStart() { + // Given + let sut = "foo" + + // When + let result = sut.characterAndStart(at: sut.startIndex, limitedBy: sut.startIndex) + + // Then + XCTAssertNil(result) + } + + // MARK: matchAnyNonNewline tests + func testMatchAnyNonNewline() throws { + // Given + // A string without any newline 
characters + let sut = "bar" + // and any index other than `endIndex` + let pos = sut.index(before: sut.endIndex) + + // When we run the match: + let result = sut.matchAnyNonNewline( + at: pos, + limitedBy: sut.endIndex, + isScalarSemantics: true + ) + + // Then the next index should be `sut.endIndex` + let nextIndex = try XCTUnwrap(result) + XCTAssertEqual(nextIndex, sut.endIndex) + } + + func testMatchAnyNonNewline_Newline() throws { + // Given + // A string that has a newline character + let sut = "ba\nr" + // and the index of that newline character + let pos = try XCTUnwrap(sut.firstIndex(of: "\n")) + + // When we run the reverse match: + let result = sut.matchAnyNonNewline( + at: pos, + limitedBy: sut.endIndex, + isScalarSemantics: true + ) + + // Then we should get nil because the character at `pos` is a newline + XCTAssertNil(result) + } + + func testMatchAnyNonNewline_atEnd() throws { + // Given + // A string without any newline characters + let sut = "bar" + + // When we try to reverse match starting at `startIndex`: + let result = sut.matchAnyNonNewline( + at: sut.endIndex, + limitedBy: sut.endIndex, + isScalarSemantics: true + ) + + // Then we should get nil because there isn't an index before `startIndex` + XCTAssertNil(result) + } + + func testReverseMatchAnyNonNewline() throws { + // Given + // A string without any newline characters + let sut = "bar" + // and an index other than `startIndex` or `endIndex` + let pos = sut.index(before: sut.endIndex) + + // When we run the reverse match: + let result = sut.reverseMatchAnyNonNewline( + at: pos, + limitedBy: sut.startIndex, + isScalarSemantics: true + ) + + // Then we should get a previous index + let previousIndex = try XCTUnwrap(result) + // The character at the previous index should be "a" + XCTAssertEqual(sut[previousIndex], "a") + } + + func testReverseMatchAnyNonNewline_Newline() throws { + // Given + // A string that has a newline character, + let sut = "ba\nr" + // and the index of that newline 
character + let pos = try XCTUnwrap(sut.firstIndex(of: "\n")) + + // When we run the reverse match: + let result = sut.reverseMatchAnyNonNewline( + at: pos, + limitedBy: sut.startIndex, + isScalarSemantics: true + ) + + // Then we should get nil because the character at `pos` is a newline + XCTAssertNil(result) + } + + func testReverseMatchAnyNonNewline_atStart() throws { + // Given + // A string without any newline characters + let sut = "bar" + + // When we try to reverse match starting at `startIndex`: + let result = sut.reverseMatchAnyNonNewline( + at: sut.startIndex, + limitedBy: sut.startIndex, + isScalarSemantics: true + ) + + // Then we should get nil because there isn't an index before `startIndex` + XCTAssertNil(result) + } + + func testMatchBuiltinCCAtEnd() { + // Given + let sut = "" + + // When + let next = sut.matchBuiltinCC( + .any, + at: sut.endIndex, + limitedBy: sut.endIndex, + isInverted: false, + isStrictASCII: false, + isScalarSemantics: true + ) + + // Then + XCTAssertNil(next) + } +} + +// MARK: matchScalar tests +extension StringMatchingTests { + func testMatchScalar() { + // Given + let sut = "bar" + + // When + let next = sut.matchScalar( + "b", + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertEqual(next, sut.index(after: sut.startIndex)) + } + + func testMatchScalarNoMatch() { + // Given + let sut = "bar" + + // When + let next = sut.matchScalar( + "a", + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(next) + } + + func testMatchScalarCaseInsensitive() { + // Given + let sut = "BAR" + + // When + let next = sut.matchScalar( + "b", + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: true + ) + + // Then + XCTAssertEqual(next, sut.index(after: sut.startIndex)) + } + + func testMatchScalarCaseInsensitiveNoMatch() { + // Given + let sut = "BAR" 
+ + // When + let next = sut.matchScalar( + "a", + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: true + ) + + // Then + XCTAssertNil(next) + } + + func testMatchScalarAtEnd() { + // Given + let sut = "" + + // When + let next = sut.matchScalar( + "a", + at: sut.endIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(next) + } + + // TODO: JH - Write test for when the boundary check passes/check if that's already covered + func testMatchScalarFailsBoundaryCheck() { + // Given + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + + // When + let next = sut.matchScalar( + "\u{300}", + at: sut.unicodeScalars.index(after: sut.unicodeScalars.startIndex), + limitedBy: sut.endIndex, + boundaryCheck: true, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(next) + } + + func testMatchScalarNoBoundaryCheck() { + // Given + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + let startPos = sut.unicodeScalars.index(after: sut.unicodeScalars.startIndex) + + // When + let next = sut.matchScalar( + "\u{300}", + at: startPos, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertEqual(next, sut.unicodeScalars.index(after: startPos)) + } +} + +// MARK: reverseMatchScalar tests +extension StringMatchingTests { + func testReverseMatchScalar() { + // Given + let sut = "bar" + + // When + let previous = sut.reverseMatchScalar( + "a", + at: sut.index(after: sut.startIndex), + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertEqual(previous, sut.startIndex) + } + + func testReverseMatchScalarNoMatch() { + // Given + let sut = "bar" + + // When + let previous = sut.reverseMatchScalar( + "b", + at: sut.index(after: sut.startIndex), + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + 
XCTAssertNil(previous) + } + + func testReverseMatchScalarCaseInsensitive() { + // Given + let sut = "BAR" + + // When + let previous = sut.reverseMatchScalar( + "a", + at: sut.index(after: sut.startIndex), + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: true + ) + + // Then + XCTAssertEqual(previous, sut.startIndex) + } + + func testReverseMatchScalarCaseInsensitiveNoMatch() { + // Given + let sut = "BAR" + + // When + let previous = sut.reverseMatchScalar( + "b", + at: sut.index(after: sut.startIndex), + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: true + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchScalarAtStart() { + // Given + let sut = "a" + + // When + let previous = sut.reverseMatchScalar( + "a", + at: sut.startIndex, + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(previous) + } + + // TODO: JH - Write test for when the boundary check passes/check if that's already covered + func testReverseMatchScalarFailsBoundaryCheck() { + // Given + // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} + let sut = "ab̖̀cd" + + // When + let previous = sut.reverseMatchScalar( + "\u{316}", + at: sut.unicodeScalars.index(sut.unicodeScalars.startIndex, offsetBy: 3), + limitedBy: sut.startIndex, + boundaryCheck: true, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchScalarNoBoundaryCheck() { + // Given + // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} + let sut = "ab̖̀cd" + let startPos = sut.unicodeScalars.index(sut.unicodeScalars.startIndex, offsetBy: 3) + + // When + let previous = sut.reverseMatchScalar( + "\u{316}", + at: startPos, + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertEqual(previous, sut.unicodeScalars.index(before: startPos)) + } +} + +// MARK: matchUTF8 tests +extension StringMatchingTests { + func testMatchUTF8() { + // Given + let 
sut = "quotedliteral" + let needle = Array(sut.prefix(3).utf8) + + // When + let next = sut.matchUTF8( + needle, + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false + ) + + // Then + XCTAssertEqual(next, sut.index(sut.startIndex, offsetBy: 3)) + } + + func testMatchUTF8NoMatch() { + // Given + let haystack = "quotedliteral" + let needle = Array("\(haystack.prefix(2))a".utf8) + + // When + let next = haystack.matchUTF8( + needle, + at: haystack.startIndex, + limitedBy: haystack.endIndex, + boundaryCheck: false + ) + + // Then + XCTAssertNil(next) + } + + func testMatchUTF8MatchPastEnd() { + // Given + let haystack = "quotedliteral" + let needle = Array(haystack.prefix(3).utf8) + + // When + let next = haystack.matchUTF8( + needle, + at: haystack.startIndex, + limitedBy: haystack.index(haystack.startIndex, offsetBy: 2), + boundaryCheck: false + ) + + // Then + XCTAssertNil(next) + } + + // TODO: JH - Write test for when the boundary check passes/check if that's already covered + func testMatchUTF8FailsBoundaryCheck() { + // Given + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + + // When + let next = sut.matchUTF8( + Array("\u{62}".utf8), + at: sut.unicodeScalars.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: true + ) + + // Then + XCTAssertNil(next) + } + + func testMatchUTF8NoBoundaryCheck() { + // Given + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + + // When + let next = sut.matchUTF8( + Array("\u{62}".utf8), + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false + ) + + // Then + XCTAssertEqual(next, sut.unicodeScalars.index(after: sut.startIndex)) + } +} + +// MARK: reverseMatchUTF8 tests +extension StringMatchingTests { + func testReverseMatchUTF8() { + // Given + let sut = "quotedliteral" + let needle = Array(sut.suffix(3).utf8) + + // When + let previous = sut.reverseMatchUTF8( + needle, + at: sut.index(before: sut.endIndex), + limitedBy: sut.startIndex, + boundaryCheck: 
false + ) + + // Then + XCTAssertEqual(previous, sut.index(sut.endIndex, offsetBy: -4)) + } + + func testReverseMatchUTF8NoMatch() { + // Given + let haystack = "quotedliteral" + let needle = Array("\(haystack.suffix(2))a".utf8) + + // When + let previous = haystack.reverseMatchUTF8( + needle, + at: haystack.index(before: haystack.endIndex), + limitedBy: haystack.startIndex, + boundaryCheck: false + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchUTF8MatchPastStart() { + // Given + let haystack = "quotedliteral" + let needle = Array(haystack.suffix(3).utf8) + + // When + let previous = haystack.reverseMatchUTF8( + needle, + at: haystack.index(haystack.endIndex, offsetBy: -1), + limitedBy: haystack.index(haystack.unicodeScalars.endIndex, offsetBy: -2), + boundaryCheck: false + ) + + // Then + XCTAssertNil(previous) + } + + // TODO: JH - Write test for when the boundary check passes/check if that's already covered + func testReverseMatchUTF8FailsBoundaryCheck() { + // Given + // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} + let sut = "ab̖̀cd" + let needle = Array("\u{316}".utf8) + + // When + let previous = sut.reverseMatchUTF8( + needle, + at: sut.utf8.index(sut.utf8.endIndex, offsetBy: -3), + limitedBy: sut.startIndex, + boundaryCheck: true + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchUTF8NoBoundaryCheck() throws { + // Given + // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} + // utf8 = [97, 98, 204, 128, 204, 150, 99, 100] + let sut = "ab̖̀cd" + // utf8 = [204, 150] + let needle = Array("\u{316}".utf8) + // Position of \u{316} = 5[utf8] + let startPos = sut.utf8.index(sut.utf8.endIndex, offsetBy: -3) + + // When + let previous = sut.reverseMatchUTF8( + needle, + at: startPos, + limitedBy: sut.startIndex, + boundaryCheck: false + ) + + // Then + // TODO: JH - Is there a better way to write this assertion? 
+ // Previous should be the second byte of \u{300} + XCTAssertEqual(sut.utf8[previous!], 128) + } +} diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 7ea38490a..437c7e669 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -51,6 +51,14 @@ enum DecodedInstr { case transformCapture case captureValue case quantify + case reverse + case reverseMatch + case reverseMatchAnyNonNewline + case reverseMatchBitset + case reverseMatchBuiltin + case reverseMatchScalar + case reverseMatchUTF8 + case reverseQuantify } extension DecodedInstr { @@ -142,8 +150,24 @@ extension DecodedInstr { return .captureValue case .matchBuiltin: return .matchBuiltin + case .reverse: + return .reverse + case .reverseMatch: + return .reverseMatch + case .reverseMatchScalar: + return .reverseMatchScalar + case .reverseMatchBitset: + return .reverseMatchBitset + case .reverseMatchBuiltin: + return .reverseMatchBuiltin + case .reverseMatchAnyNonNewline: + return .reverseMatchAnyNonNewline + case .reverseQuantify: + return .reverseQuantify case .matchUTF8: return .matchUTF8 + case .reverseMatchUTF8: + return .reverseMatchUTF8 } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c52560d66..ae739dd7f 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -289,8 +289,8 @@ func firstMatchTest( input: String, match: String?, syntax: SyntaxOptions = .traditional, - enableTracing: Bool = false, - dumpAST: Bool = false, + enableTracing: Bool = true, + dumpAST: Bool = true, xfail: Bool = false, validateOptimizations: Bool = true, semanticLevel: RegexSemanticLevel = .graphemeCluster, @@ -325,6 +325,7 @@ func firstMatchTests( enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, + validateOptimizations: Bool = true, semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line @@ -338,6 +339,7 
@@ func firstMatchTests( enableTracing: enableTracing, dumpAST: dumpAST, xfail: xfail, + validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, file: file, line: line) @@ -1601,28 +1603,61 @@ extension RegexTests { (input: "hzello", match: "e"), (input: "hezllo", match: nil), (input: "helloz", match: nil)) + } + func testLookbehinds() { firstMatchTest( - #"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true) + #"(?<=USD)\d+"#, input: "Price: USD100", match: "100") firstMatchTest( - #"(*plb:USD)\d+"#, input: "Price: USD100", match: "100", xfail: true) + #"(*plb:USD)\d+"#, input: "Price: USD100", match: "100") firstMatchTest( #"(*positive_lookbehind:USD)\d+"#, - input: "Price: USD100", match: "100", xfail: true) - // engines generally enforce that lookbehinds are fixed width + input: "Price: USD100", match: "100") + firstMatchTest( - #"\d{3}(?<=USD\d{3})"#, input: "Price: USD100", match: "100", xfail: true) + #"\d{3}(?<=USD\d{3})"#, input: "Price: USD100", match: "100") firstMatchTest( - #"(?