Skip to content

Optimize matching to match on scalar values when possible #525

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Jul 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
e518bdf
Add unicode and dna benchmarks
rctcwyvrn Jun 23, 2022
eb64036
Fixup naming, turn off firstmatch, add benchmark filtering by regex
rctcwyvrn Jun 23, 2022
1237f9a
Make --exclude-ns actually work correctly
rctcwyvrn Jun 23, 2022
fdf2c23
Add usage comment for generateFasta
rctcwyvrn Jun 23, 2022
eeb38e9
First ver
rctcwyvrn Jun 28, 2022
6f76f36
Merge branch 'main' into match-scalar
rctcwyvrn Jun 29, 2022
3a2b324
Remove matchseq entirely
rctcwyvrn Jun 29, 2022
df8919e
Merge branch 'just-one-more-benchmark-suite' into temp
rctcwyvrn Jun 29, 2022
c2ee8cc
Finish up matchScalar
rctcwyvrn Jun 29, 2022
809b085
Factor out nextScalarIndex for matchBitsetScalar
rctcwyvrn Jun 29, 2022
06c77c7
Add scalar mode support for matching bitsets + fix bug
rctcwyvrn Jun 29, 2022
e3d7ad7
Emit matchScalar in quotedLiteral when in unicode scalar mode
rctcwyvrn Jun 29, 2022
3e1e088
Add tests
rctcwyvrn Jun 29, 2022
b4c7c8c
Cleanup
rctcwyvrn Jun 30, 2022
5359e31
Revert "Merge branch 'just-one-more-benchmark-suite' into temp"
rctcwyvrn Jun 30, 2022
6e4c2bd
Cleanup
rctcwyvrn Jul 4, 2022
1a359b4
Add case-insensitive match instructions
rctcwyvrn Jul 4, 2022
097ffeb
Remove extra instructions and use payload bits instead
rctcwyvrn Jul 4, 2022
76667a2
Comment out compiletests for now
rctcwyvrn Jul 4, 2022
6a1f6e9
Fix compile tests
rctcwyvrn Jul 5, 2022
fce8f9a
Merge branch 'main' into scalar-optimizations-clean
rctcwyvrn Jul 7, 2022
0860368
Fix scalar matching in grapheme semantic mode
hamishknight Jul 8, 2022
c6bc811
Preserve scalar syntax in DSL conversion
hamishknight Jul 8, 2022
bca1d2b
Change scalar semantics to match #565
rctcwyvrn Jul 11, 2022
79e60ac
Merge branch 'not-to-scale' into scalar-optimizations-clean
rctcwyvrn Jul 11, 2022
22cc9d5
Add edge case test
rctcwyvrn Jul 11, 2022
7a9923d
Always match .scalar under grapheme semantics
rctcwyvrn Jul 11, 2022
47f8e66
Merge branch 'main' into scalar-optimizations-clean
rctcwyvrn Jul 11, 2022
7aa98d1
Add new instructions to compile tests
rctcwyvrn Jul 11, 2022
113cfe3
Add an XFAIL test for scalar coalescing
rctcwyvrn Jul 12, 2022
9949b8e
Fix XCTExpectFailure for linux
rctcwyvrn Jul 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 68 additions & 67 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,14 @@ fileprivate extension Compiler.ByteCodeGen {
emitAny()

case let .char(c):
try emitCharacter(c)
emitCharacter(c)

case let .scalar(s):
try emitScalar(s)
if options.semanticLevel == .graphemeCluster {
emitCharacter(Character(s))
} else {
emitMatchScalar(s)
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should fail tests, and if it's not we may need to write the tests and XFAIL them. E.g. /a\u{301}/, IIUC, as written would try to make a proper Character from the combining scalar.


case let .assertion(kind):
try emitAssertion(kind.ast)
Expand All @@ -88,6 +92,34 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

mutating func emitQuotedLiteral(_ s: String) {
guard options.semanticLevel == .graphemeCluster else {
for char in s {
for scalar in char.unicodeScalars {
emitMatchScalar(scalar)
}
}
return
}

// Fast path for eliding boundary checks for an all ascii quoted literal
if optimizationsEnabled && s.allSatisfy(\.isASCII) {
let lastIdx = s.unicodeScalars.indices.last!
for idx in s.unicodeScalars.indices {
let boundaryCheck = idx == lastIdx
let scalar = s.unicodeScalars[idx]
if options.isCaseInsensitive && scalar.properties.isCased {
builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
} else {
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
}
}
return
}

for c in s { emitCharacter(c) }
}

mutating func emitBackreference(
_ ref: AST.Reference
) throws {
Expand Down Expand Up @@ -245,41 +277,47 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

mutating func emitScalar(_ s: UnicodeScalar) throws {
// TODO: Native instruction buildMatchScalar(s)
if options.isCaseInsensitive {
// TODO: e.g. buildCaseInsensitiveMatchScalar(s)
builder.buildConsume(by: consumeScalar {
$0.properties.lowercaseMapping == s.properties.lowercaseMapping
})
mutating func emitMatchScalar(_ s: UnicodeScalar) {
assert(options.semanticLevel == .unicodeScalar)
if options.isCaseInsensitive && s.properties.isCased {
builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
} else {
builder.buildConsume(by: consumeScalar {
$0 == s
})
builder.buildMatchScalar(s, boundaryCheck: false)
}
}

mutating func emitCharacter(_ c: Character) throws {
// Unicode scalar matches the specific scalars that comprise a character
mutating func emitCharacter(_ c: Character) {
// Unicode scalar mode matches the specific scalars that comprise a character
if options.semanticLevel == .unicodeScalar {
for scalar in c.unicodeScalars {
try emitScalar(scalar)
emitMatchScalar(scalar)
}
return
}

if options.isCaseInsensitive && c.isCased {
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
builder.buildConsume { input, bounds in
let inputChar = input[bounds.lowerBound].lowercased()
let matchChar = c.lowercased()
return inputChar == matchChar
? input.index(after: bounds.lowerBound)
: nil
if optimizationsEnabled && c.isASCII {
// c.isCased ensures that c is not CR-LF,
// so we know that c is a single scalar
assert(c.unicodeScalars.count == 1)
builder.buildMatchScalarCaseInsensitive(
c.unicodeScalars.last!,
boundaryCheck: true)
} else {
builder.buildMatch(c, isCaseInsensitive: true)
}
} else {
builder.buildMatch(c)
return
}

if optimizationsEnabled && c.isASCII {
let lastIdx = c.unicodeScalars.indices.last!
for idx in c.unicodeScalars.indices {
builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx)
}
return
}

builder.buildMatch(c, isCaseInsensitive: false)
}

mutating func emitAny() {
Expand Down Expand Up @@ -717,11 +755,12 @@ fileprivate extension Compiler.ByteCodeGen {
_ ccc: DSLTree.CustomCharacterClass
) throws {
if let asciiBitset = ccc.asAsciiBitset(options),
options.semanticLevel == .graphemeCluster,
optimizationsEnabled {
// future work: add a bit to .matchBitset to consume either a character
// or a scalar so we can have this optimization in scalar mode
builder.buildMatchAsciiBitset(asciiBitset)
if options.semanticLevel == .unicodeScalar {
builder.buildScalarMatchAsciiBitset(asciiBitset)
} else {
builder.buildMatchAsciiBitset(asciiBitset)
}
} else {
let consumer = try ccc.generateConsumer(options)
builder.buildConsume(by: consumer)
Expand Down Expand Up @@ -798,45 +837,7 @@ fileprivate extension Compiler.ByteCodeGen {
try emitAtom(a)

case let .quotedLiteral(s):
if options.semanticLevel == .graphemeCluster {
if options.isCaseInsensitive {
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
builder.buildConsume { input, bounds in
var iterator = s.makeIterator()
var currentIndex = bounds.lowerBound
while let ch = iterator.next() {
guard currentIndex < bounds.upperBound,
ch.lowercased() == input[currentIndex].lowercased()
else { return nil }
input.formIndex(after: &currentIndex)
}
return currentIndex
}
} else {
builder.buildMatchSequence(s)
}
} else {
builder.buildConsume {
[caseInsensitive = options.isCaseInsensitive] input, bounds in
// TODO: Case folding
var iterator = s.unicodeScalars.makeIterator()
var currentIndex = bounds.lowerBound
while let scalar = iterator.next() {
guard currentIndex < bounds.upperBound else { return nil }
if caseInsensitive {
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
return nil
}
} else {
if scalar != input.unicodeScalars[currentIndex] {
return nil
}
}
input.unicodeScalars.formIndex(after: &currentIndex)
}
return currentIndex
}
}
emitQuotedLiteral(s)

case let .convertedRegexLiteral(n, _):
return try emitNode(n)
Expand Down
103 changes: 62 additions & 41 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@

@_implementationOnly import _RegexParser

extension Character {
var _singleScalarAsciiValue: UInt8? {
guard self != "\r\n" else { return nil }
return asciiValue
}
}

extension DSLTree.Node {
/// Attempt to generate a consumer from this AST node
///
Expand Down Expand Up @@ -53,11 +60,50 @@ extension DSLTree._AST.Atom {
}
}

extension Character {
func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction? {
let isCaseInsensitive = opts.isCaseInsensitive
switch opts.semanticLevel {
case .graphemeCluster:
return { input, bounds in
let low = bounds.lowerBound
if isCaseInsensitive && isCased {
return input[low].lowercased() == lowercased()
? input.index(after: low)
: nil
} else {
return input[low] == self
? input.index(after: low)
: nil
}
}
case .unicodeScalar:
// TODO: This should only be reachable from character class emission, can
// we guarantee that? Otherwise we'd want a different matching behavior.
let consumers = unicodeScalars.map { s in consumeScalar {
isCaseInsensitive
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
: $0 == s
}}
return { input, bounds in
for fn in consumers {
if let idx = fn(input, bounds) {
return idx
}
}
return nil
}
}
}
}

extension DSLTree.Atom {
var singleScalarASCIIValue: UInt8? {
switch self {
case let .char(c) where c != "\r\n":
return c.asciiValue
case let .char(c):
return c._singleScalarAsciiValue
case let .scalar(s) where s.isASCII:
return UInt8(ascii: s)
case let .unconverted(atom):
Expand All @@ -72,44 +118,15 @@ extension DSLTree.Atom {
func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction? {
let isCaseInsensitive = opts.isCaseInsensitive

switch self {
case let .char(c):
if opts.semanticLevel == .graphemeCluster {
return { input, bounds in
let low = bounds.lowerBound
if isCaseInsensitive && c.isCased {
return input[low].lowercased() == c.lowercased()
? input.index(after: low)
: nil
} else {
return input[low] == c
? input.index(after: low)
: nil
}
}
} else {
let consumers = c.unicodeScalars.map { s in consumeScalar {
isCaseInsensitive
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
: $0 == s
}}
return { input, bounds in
for fn in consumers {
if let idx = fn(input, bounds) {
return idx
}
}
return nil
}
}
return try c.generateConsumer(opts)

case let .scalar(s):
return consumeScalar {
isCaseInsensitive
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
: $0 == s
}
// A scalar always matches the same as a single scalar character. This
// means it must match a whole grapheme in grapheme semantic mode, but
// can match a single scalar in scalar semantic mode.
return try Character(s).generateConsumer(opts)

case .any:
// FIXME: Should this be a total ordering?
Expand Down Expand Up @@ -211,16 +228,20 @@ extension AST.Atom {
var singleScalar: UnicodeScalar? {
switch kind {
case .scalar(let s): return s.value
case .escaped(let e):
guard let s = e.scalarValue else { return nil }
return s
default: return nil
}
}

var singleScalarASCIIValue: UInt8? {
if let s = singleScalar, s.isASCII {
return UInt8(ascii: s)
}
switch kind {
case let .char(c) where c != "\r\n":
return c.asciiValue
case let .scalar(s) where s.value.isASCII:
return UInt8(ascii: s.value)
case let .char(c):
return c._singleScalarAsciiValue
default:
return nil
}
Expand Down
38 changes: 30 additions & 8 deletions Sources/_StringProcessing/Engine/InstPayload.swift
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,26 @@ extension Instruction.Payload {
var string: StringRegister {
interpret()
}

init(scalar: Unicode.Scalar) {
self.init(UInt64(scalar.value))
}
var scalar: Unicode.Scalar {
return Unicode.Scalar(_value: UInt32(self.rawValue))
}

init(scalar: Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) {
let raw = UInt64(scalar.value)
+ (caseInsensitive ? 1 << 55: 0)
+ (boundaryCheck ? 1 << 54 : 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Future: we'll be redoing instruction encoding and can clean this up alongside other changes.

self.init(raw)
}
var scalarPayload: (Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) {
let caseInsensitive = (self.rawValue >> 55) & 1 == 1
let boundaryCheck = (self.rawValue >> 54) & 1 == 1
let scalar = Unicode.Scalar(_value: UInt32(self.rawValue & 0xFFFF_FFFF))
return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck)
}

init(sequence: SequenceRegister) {
self.init(sequence)
Expand Down Expand Up @@ -190,18 +210,20 @@ extension Instruction.Payload {
interpret()
}

init(element: ElementRegister) {
self.init(element)
init(element: ElementRegister, isCaseInsensitive: Bool) {
self.init(isCaseInsensitive ? 1 : 0, element)
}
var element: ElementRegister {
interpret()
var elementPayload: (isCaseInsensitive: Bool, ElementRegister) {
let pair: (UInt64, ElementRegister) = interpretPair()
return (isCaseInsensitive: pair.0 == 1, pair.1)
}

init(bitset: AsciiBitsetRegister) {
self.init(bitset)
init(bitset: AsciiBitsetRegister, isScalar: Bool) {
self.init(isScalar ? 1 : 0, bitset)
}
var bitset: AsciiBitsetRegister {
interpret()
var bitsetPayload: (isScalar: Bool, AsciiBitsetRegister) {
let pair: (UInt64, AsciiBitsetRegister) = interpretPair()
return (isScalar: pair.0 == 1, pair.1)
}

init(consumer: ConsumeFunctionRegister) {
Expand Down
Loading