@@ -59,10 +59,14 @@ fileprivate extension Compiler.ByteCodeGen {
5959 emitAny ( )
6060
6161 case let . char( c) :
62- try emitCharacter ( c)
62+ emitCharacter ( c)
6363
6464 case let . scalar( s) :
65- try emitScalar ( s)
65+ if options. semanticLevel == . graphemeCluster {
66+ emitCharacter ( Character ( s) )
67+ } else {
68+ emitMatchScalar ( s)
69+ }
6670
6771 case let . assertion( kind) :
6872 try emitAssertion ( kind. ast)
@@ -88,6 +92,34 @@ fileprivate extension Compiler.ByteCodeGen {
8892 }
8993 }
9094
/// Emits matching instructions for the quoted literal `s`.
///
/// - Under Unicode scalar semantics, every scalar of `s` is matched
///   individually via `emitMatchScalar`.
/// - Under grapheme-cluster semantics, each character is emitted via
///   `emitCharacter`, except that an all-ASCII literal (with optimizations
///   enabled) is emitted scalar by scalar with `boundaryCheck` set only on
///   the final scalar.
///
/// An empty literal emits no instructions.
///
/// - Parameter s: The literal text to match.
mutating func emitQuotedLiteral(_ s: String) {
  guard options.semanticLevel == .graphemeCluster else {
    // Walking the scalar view visits the same scalars, in the same order,
    // as walking each character's scalars in turn.
    for scalar in s.unicodeScalars {
      emitMatchScalar(scalar)
    }
    return
  }

  // Fast path for eliding boundary checks for an all ascii quoted literal
  if optimizationsEnabled && s.allSatisfy(\.isASCII) {
    let scalars = s.unicodeScalars
    // `allSatisfy` is vacuously true for "", so the literal may be empty
    // here. Bind the last index safely instead of force-unwrapping it;
    // an empty literal has nothing to emit.
    guard let lastIdx = scalars.indices.last else { return }
    for idx in scalars.indices {
      // Only the final scalar of the literal carries the boundary check.
      let boundaryCheck = idx == lastIdx
      let scalar = scalars[idx]
      if options.isCaseInsensitive && scalar.properties.isCased {
        builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
      } else {
        builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
      }
    }
    return
  }

  // General path: one character-level match per character.
  for c in s { emitCharacter(c) }
}
122+
91123 mutating func emitBackreference(
92124 _ ref: AST . Reference
93125 ) throws {
@@ -245,41 +277,47 @@ fileprivate extension Compiler.ByteCodeGen {
245277 }
246278 }
247279
/// Emits a match instruction for the single Unicode scalar `s`, with
/// `boundaryCheck` disabled.
///
/// Expected to be used only under Unicode scalar semantics; this is
/// verified by a debug-only assertion. The case-insensitive instruction is
/// chosen only when case-insensitive matching is active and `s` is cased.
///
/// - Parameter s: The scalar to match.
mutating func emitMatchScalar(_ s: UnicodeScalar) {
  assert(options.semanticLevel == .unicodeScalar)

  // Anything that is not "case-insensitive and cased" is an exact match.
  guard options.isCaseInsensitive, s.properties.isCased else {
    builder.buildMatchScalar(s, boundaryCheck: false)
    return
  }
  builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
}
261288
/// Emits matching instructions for the character `c`.
///
/// Under Unicode scalar semantics the character is matched as the exact
/// sequence of scalars it is made of. Otherwise one of four instruction
/// shapes is chosen from two flags: whether the match is case-insensitive
/// (option enabled and `c` is cased) and whether the ASCII scalar fast path
/// applies (optimizations enabled and `c` is ASCII).
///
/// - Parameter c: The character to match.
mutating func emitCharacter(_ c: Character) {
  // Unicode scalar mode matches the specific scalars that comprise a character
  guard options.semanticLevel != .unicodeScalar else {
    for scalar in c.unicodeScalars {
      emitMatchScalar(scalar)
    }
    return
  }

  let ignoresCase = options.isCaseInsensitive && c.isCased
  let usesASCIIFastPath = optimizationsEnabled && c.isASCII

  switch (ignoresCase, usesASCIIFastPath) {
  case (true, true):
    // c.isCased ensures that c is not CR-LF,
    // so we know that c is a single scalar
    assert(c.unicodeScalars.count == 1)
    builder.buildMatchScalarCaseInsensitive(
      c.unicodeScalars.last!,
      boundaryCheck: true)

  case (true, false):
    builder.buildMatch(c, isCaseInsensitive: true)

  case (false, true):
    // An ASCII character may still span two scalars (CR-LF), so emit each
    // scalar and put the boundary check on the final one only.
    let scalars = c.unicodeScalars
    let finalIndex = scalars.indices.last!
    for index in scalars.indices {
      builder.buildMatchScalar(scalars[index], boundaryCheck: index == finalIndex)
    }

  case (false, false):
    builder.buildMatch(c, isCaseInsensitive: false)
  }
}
284322
285323 mutating func emitAny( ) {
@@ -717,11 +755,12 @@ fileprivate extension Compiler.ByteCodeGen {
717755 _ ccc: DSLTree . CustomCharacterClass
718756 ) throws {
719757 if let asciiBitset = ccc. asAsciiBitset ( options) ,
720- options. semanticLevel == . graphemeCluster,
721758 optimizationsEnabled {
722- // future work: add a bit to .matchBitset to consume either a character
723- // or a scalar so we can have this optimization in scalar mode
724- builder. buildMatchAsciiBitset ( asciiBitset)
759+ if options. semanticLevel == . unicodeScalar {
760+ builder. buildScalarMatchAsciiBitset ( asciiBitset)
761+ } else {
762+ builder. buildMatchAsciiBitset ( asciiBitset)
763+ }
725764 } else {
726765 let consumer = try ccc. generateConsumer ( options)
727766 builder. buildConsume ( by: consumer)
@@ -798,45 +837,7 @@ fileprivate extension Compiler.ByteCodeGen {
798837 try emitAtom ( a)
799838
800839 case let . quotedLiteral( s) :
801- if options. semanticLevel == . graphemeCluster {
802- if options. isCaseInsensitive {
803- // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
804- builder. buildConsume { input, bounds in
805- var iterator = s. makeIterator ( )
806- var currentIndex = bounds. lowerBound
807- while let ch = iterator. next ( ) {
808- guard currentIndex < bounds. upperBound,
809- ch. lowercased ( ) == input [ currentIndex] . lowercased ( )
810- else { return nil }
811- input. formIndex ( after: & currentIndex)
812- }
813- return currentIndex
814- }
815- } else {
816- builder. buildMatchSequence ( s)
817- }
818- } else {
819- builder. buildConsume {
820- [ caseInsensitive = options. isCaseInsensitive] input, bounds in
821- // TODO: Case folding
822- var iterator = s. unicodeScalars. makeIterator ( )
823- var currentIndex = bounds. lowerBound
824- while let scalar = iterator. next ( ) {
825- guard currentIndex < bounds. upperBound else { return nil }
826- if caseInsensitive {
827- if scalar. properties. lowercaseMapping != input. unicodeScalars [ currentIndex] . properties. lowercaseMapping {
828- return nil
829- }
830- } else {
831- if scalar != input. unicodeScalars [ currentIndex] {
832- return nil
833- }
834- }
835- input. unicodeScalars. formIndex ( after: & currentIndex)
836- }
837- return currentIndex
838- }
839- }
840+ emitQuotedLiteral ( s)
840841
841842 case let . convertedRegexLiteral( n, _) :
842843 return try emitNode ( n)
0 commit comments