@@ -65,10 +65,14 @@ fileprivate extension Compiler.ByteCodeGen {
6565 emitDot ( )
6666
6767 case let . char( c) :
68- try emitCharacter ( c)
68+ emitCharacter ( c)
6969
7070 case let . scalar( s) :
71- try emitScalar ( s)
71+ if options. semanticLevel == . graphemeCluster {
72+ emitCharacter ( Character ( s) )
73+ } else {
74+ emitMatchScalar ( s)
75+ }
7276
7377 case let . assertion( kind) :
7478 try emitAssertion ( kind)
@@ -94,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen {
9498 }
9599 }
96100
/// Emits the instructions that match the quoted (verbatim) literal `s`.
///
/// - In Unicode-scalar mode, every scalar of `s` is emitted individually
///   through `emitMatchScalar`.
/// - In grapheme-cluster mode with optimizations enabled, a non-empty
///   all-ASCII literal is emitted as a run of scalar matches where only the
///   final scalar is built with `boundaryCheck: true`.
/// - Otherwise each `Character` of `s` is emitted through `emitCharacter`.
///
/// An empty literal emits no instructions.
mutating func emitQuotedLiteral(_ s: String) {
  guard options.semanticLevel == .graphemeCluster else {
    for char in s {
      for scalar in char.unicodeScalars {
        emitMatchScalar(scalar)
      }
    }
    return
  }

  // Fast path for eliding boundary checks for an all-ASCII quoted literal.
  // `allSatisfy` is vacuously true for the empty string, so the last index is
  // bound optionally rather than force-unwrapped: an empty literal skips the
  // fast path and falls through to the (empty) per-character loop below.
  if optimizationsEnabled,
     s.allSatisfy(\.isASCII),
     let lastIdx = s.unicodeScalars.indices.last {
    for idx in s.unicodeScalars.indices {
      let boundaryCheck = idx == lastIdx
      let scalar = s.unicodeScalars[idx]
      if options.isCaseInsensitive && scalar.properties.isCased {
        builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
      } else {
        builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
      }
    }
    return
  }

  for c in s { emitCharacter(c) }
}
128+
97129 mutating func emitBackreference(
98130 _ ref: AST . Reference
99131 ) throws {
@@ -257,41 +289,47 @@ fileprivate extension Compiler.ByteCodeGen {
257289 }
258290 }
259291
/// Emits a single-scalar match instruction for `s`.
///
/// Must only be called in Unicode-scalar semantic mode (asserted). The
/// instruction is always built with `boundaryCheck: false`.
mutating func emitMatchScalar(_ s: UnicodeScalar) {
  assert(options.semanticLevel == .unicodeScalar)

  // The case-insensitive instruction is only used for scalars that are cased.
  let needsCaseFolding = options.isCaseInsensitive && s.properties.isCased
  guard needsCaseFolding else {
    builder.buildMatchScalar(s, boundaryCheck: false)
    return
  }
  builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
}
273300
/// Emits the instructions that match the character `c` under the current
/// matching options.
///
/// In Unicode-scalar mode the character is matched scalar by scalar.
/// In grapheme-cluster mode one of four forms is emitted, depending on
/// whether the match is case-insensitive for a cased character and whether
/// the ASCII scalar path applies (optimizations enabled and `c` is ASCII).
mutating func emitCharacter(_ c: Character) {
  // Unicode scalar mode matches the specific scalars that comprise a character
  guard options.semanticLevel != .unicodeScalar else {
    for scalar in c.unicodeScalars {
      emitMatchScalar(scalar)
    }
    return
  }

  let ignoresCase = options.isCaseInsensitive && c.isCased
  let usesASCIIScalarPath = optimizationsEnabled && c.isASCII

  switch (ignoresCase, usesASCIIScalarPath) {
  case (true, true):
    // c.isCased ensures that c is not CR-LF,
    // so we know that c is a single scalar
    assert(c.unicodeScalars.count == 1)
    builder.buildMatchScalarCaseInsensitive(
      c.unicodeScalars.last!,
      boundaryCheck: true)

  case (true, false):
    builder.buildMatch(c, isCaseInsensitive: true)

  case (false, true):
    // Match every scalar of the character; only the final scalar is built
    // with boundaryCheck set.
    let scalars = c.unicodeScalars
    let finalIdx = scalars.indices.last!
    for idx in scalars.indices {
      builder.buildMatchScalar(scalars[idx], boundaryCheck: idx == finalIdx)
    }

  case (false, false):
    builder.buildMatch(c, isCaseInsensitive: false)
  }
}
296334
297335 mutating func emitAny( ) {
@@ -567,7 +605,12 @@ fileprivate extension Compiler.ByteCodeGen {
567605 decrement %minTrips and fallthrough
568606
569607 loop-body:
608+ <if can't guarantee forward progress && extraTrips = nil>:
609+ mov currentPosition %pos
570610 evaluate the subexpression
611+ <if can't guarantee forward progress && extraTrips = nil>:
612+ if %pos is currentPosition:
613+ goto exit
571614 goto min-trip-count control block
572615
573616 exit-policy control block:
@@ -670,7 +713,28 @@ fileprivate extension Compiler.ByteCodeGen {
670713 // <subexpression>
671714 // branch min-trip-count
672715 builder. label ( loopBody)
716+
717+ // if we aren't sure if the child node will have forward progress and
718+ // we have an unbounded quantification
719+ let startPosition : PositionRegister ?
720+ let emitPositionChecking =
721+ ( !optimizationsEnabled || !child. guaranteesForwardProgress) &&
722+ extraTrips == nil
723+
724+ if emitPositionChecking {
725+ startPosition = builder. makePositionRegister ( )
726+ builder. buildMoveCurrentPosition ( into: startPosition!)
727+ } else {
728+ startPosition = nil
729+ }
673730 try emitNode ( child)
731+ if emitPositionChecking {
732+ // in all quantifier cases, no matter what minTrips or extraTrips is,
733+ // if we have a successful non-advancing match, branch to exit because it
734+ // can match an arbitrary number of times
735+ builder. buildCondBranch ( to: exit, ifSamePositionAs: startPosition!)
736+ }
737+
674738 if minTrips <= 1 {
675739 // fallthrough
676740 } else {
@@ -715,11 +779,12 @@ fileprivate extension Compiler.ByteCodeGen {
715779 _ ccc: DSLTree . CustomCharacterClass
716780 ) throws {
717781 if let asciiBitset = ccc. asAsciiBitset ( options) ,
718- options. semanticLevel == . graphemeCluster,
719782 optimizationsEnabled {
720- // future work: add a bit to .matchBitset to consume either a character
721- // or a scalar so we can have this optimization in scalar mode
722- builder. buildMatchAsciiBitset ( asciiBitset)
783+ if options. semanticLevel == . unicodeScalar {
784+ builder. buildScalarMatchAsciiBitset ( asciiBitset)
785+ } else {
786+ builder. buildMatchAsciiBitset ( asciiBitset)
787+ }
723788 } else {
724789 let consumer = try ccc. generateConsumer ( options)
725790 builder. buildConsume ( by: consumer)
@@ -796,45 +861,7 @@ fileprivate extension Compiler.ByteCodeGen {
796861 try emitAtom ( a)
797862
798863 case let . quotedLiteral( s) :
799- if options. semanticLevel == . graphemeCluster {
800- if options. isCaseInsensitive {
801- // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
802- builder. buildConsume { input, bounds in
803- var iterator = s. makeIterator ( )
804- var currentIndex = bounds. lowerBound
805- while let ch = iterator. next ( ) {
806- guard currentIndex < bounds. upperBound,
807- ch. lowercased ( ) == input [ currentIndex] . lowercased ( )
808- else { return nil }
809- input. formIndex ( after: & currentIndex)
810- }
811- return currentIndex
812- }
813- } else {
814- builder. buildMatchSequence ( s)
815- }
816- } else {
817- builder. buildConsume {
818- [ caseInsensitive = options. isCaseInsensitive] input, bounds in
819- // TODO: Case folding
820- var iterator = s. unicodeScalars. makeIterator ( )
821- var currentIndex = bounds. lowerBound
822- while let scalar = iterator. next ( ) {
823- guard currentIndex < bounds. upperBound else { return nil }
824- if caseInsensitive {
825- if scalar. properties. lowercaseMapping != input. unicodeScalars [ currentIndex] . properties. lowercaseMapping {
826- return nil
827- }
828- } else {
829- if scalar != input. unicodeScalars [ currentIndex] {
830- return nil
831- }
832- }
833- input. unicodeScalars. formIndex ( after: & currentIndex)
834- }
835- return currentIndex
836- }
837- }
864+ emitQuotedLiteral ( s)
838865
839866 case let . convertedRegexLiteral( n, _) :
840867 return try emitNode ( n)
@@ -856,3 +883,42 @@ fileprivate extension Compiler.ByteCodeGen {
856883 return nil
857884 }
858885}
886+
extension DSLTree.Node {
  /// Whether a successful match of this node is known to advance the match
  /// position.
  ///
  /// The answer is conservative: `false` means "cannot be guaranteed", not
  /// "never advances". The quantification emitter uses this to decide whether
  /// an unbounded loop needs an explicit same-position check to avoid
  /// iterating forever on zero-width matches.
  var guaranteesForwardProgress: Bool {
    switch self {
    case .orderedChoice(let children):
      // Any branch may be the one taken, so every branch must advance.
      return children.allSatisfy { $0.guaranteesForwardProgress }
    case .concatenation(let children):
      // All children match in sequence, so one advancing child suffices.
      return children.contains(where: { $0.guaranteesForwardProgress })
    case .capture(_, _, let node, _):
      return node.guaranteesForwardProgress
    case .nonCapturingGroup(let kind, let child):
      switch kind.ast {
      case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind:
        // Lookarounds are zero-width.
        return false
      default: return child.guaranteesForwardProgress
      }
    case .atom(let atom):
      switch atom {
      case .changeMatchingOptions, .assertion: return false
      // A backreference can succeed without consuming input (the referenced
      // capture may be empty or may not have participated in the match).
      case .backreference: return false
      default: return true
      }
    case .trivia, .empty:
      return false
    case .quotedLiteral(let string):
      return !string.isEmpty
    case .convertedRegexLiteral(let node, _):
      return node.guaranteesForwardProgress
    case .consumer, .matcher:
      // Allow zero width consumers and matchers
      return false
    case .customCharacterClass:
      return true
    case .quantification(let amount, _, let child):
      // Only a quantification that requires at least one trip of an
      // advancing child is guaranteed to advance.
      let (atLeast, _) = amount.ast.bounds
      return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
    default: return false
    }
  }
}
0 commit comments