-
Notifications
You must be signed in to change notification settings - Fork 85
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PERF-3056 Add workloads for fragmented inserts in presence of columns…
…tore index (#787)
- Loading branch information
1 parent
7a29faf
commit f449258
Showing
2 changed files
with
349 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
SchemaVersion: 2018-07-01 | ||
Owner: "@mongodb/query-execution" | ||
Description: | | ||
This workload compares performance of inserts into a collection with only the default _id index, | ||
and in presence of the columnstore index. It uses an artificial data set with a wide overall | ||
schema and narrow individual objects to model fragmented access to CSI, which clusters entries by | ||
path. The data size is relatively small (1e6 documents yield ~175MB data size and ~105MB storage | ||
size). | ||
We would like to be able to correlate the results of this workload with the similar one that uses | ||
nested data (CsiFragmentedInsertsNested.yml). Please make sure to update both when making changes. | ||
Keywords: | ||
- columnstore | ||
- insert | ||
|
||
AutoRun: | ||
- When: | ||
mongodb_setup: | ||
$eq: | ||
- standalone-all-feature-flags | ||
branch_name: | ||
$neq: | ||
- v4.0 | ||
- v4.2 | ||
- v4.4 | ||
- v5.0 | ||
- v6.0 | ||
|
||
Clients: | ||
Default: | ||
QueryOptions: | ||
# Allow for longer duration since index builds may take a while. | ||
socketTimeoutMS: 600_000 # = 10 min | ||
connectTimeoutMS: 600_000 | ||
|
||
GlobalDefaults: | ||
MaxPhases: &maxPhases 5 | ||
Database: &db csiFragmentedInsertsFlat | ||
|
||
# The Loader actor creates collections named "Collection<N>" where N corresponds to the thread's | ||
# number. We'll use a single collection, created by a single thread, so it becomes 'Collection0'. | ||
Collection: &coll Collection0 | ||
|
||
# If modifying any of these parameters, please review 'CsiFragmentedInsertsNested.yml' to | ||
# ensure that the results of these two workloads can still be correlated. | ||
DocumentCount: &docCount 1e6 | ||
SchemaWidth: &schemaWidth 10000 | ||
ObjectWidth: &objectWidth 10 | ||
SampleSize: &sampleSize 10 | ||
|
||
Document: &document | ||
# { | ||
# _id: ObjectId(...), | ||
# root: { | ||
# x271: NumberInt(9917), | ||
# x6305: NumberInt(11), | ||
# x8: NumberInt(1022), | ||
# <7 more fields like this> | ||
# } | ||
# } | ||
root: {^Object: { | ||
withNEntries: *objectWidth, | ||
|
||
# We are using uniform distribution of fields to make the sampling more stable. | ||
havingKeys: {^FormatString: { | ||
"format": "x%d", | ||
"withArgs": [{^RandomInt: {min: 0, max: *schemaWidth}}] | ||
}}, | ||
|
||
# We don't expect the actual values to matter. | ||
andValues: {^RandomInt: {min: 0, max: *schemaWidth}}, | ||
|
||
# Occasionally, the key generator might produce the same key name. For this workload, | ||
# it's OK to have some of the objects with fewer than 'objectWidth' fields (and we don't | ||
# want to test parsing of the duplicated keys for the index as it's not a common user | ||
# scenario). | ||
duplicatedKeys: skip | ||
}} | ||
|
||
ActorTemplates: | ||
# We want to be able to compare results from the same test between the runs in presence of different | ||
# indexes. For this we'll have to instantiate the same actor multiple times with a unique name. | ||
- TemplateName: InsertFromSample | ||
Config: | ||
Name: {^Parameter: {Name: "Name", Default: "Insert"}} | ||
Type: SamplingLoader | ||
Threads: {^Parameter: {Name: "Threads", Default: 1}} | ||
Phases: | ||
OnlyActiveInPhases: | ||
Active: [{^Parameter: {Name: "OnlyActiveInPhase", Default: 1024}}] | ||
NopInPhasesUpTo: *maxPhases | ||
PhaseConfig: | ||
Database: *db | ||
Collection: *coll | ||
|
||
# No more than sampleSize*objectWidth paths can be "touched" by a sample. | ||
SampleSize: *sampleSize | ||
|
||
# When the batch size is the same as sample size, each sampled document will be used in | ||
# the batch exactly once, and this will be repeated for the number of batches. We expect | ||
# that the first batch might be affected by "cold" caches but the subsequent batches | ||
# would be fully warmed up. | ||
InsertBatchSize: *sampleSize | ||
|
||
# The 'SamplingLoader' actor re-samples on repeat, meaning that it would get a new set of | ||
# documents likely with different paths and values. This makes each repeat hit different | ||
# parts of the indexes, causing a long warm up tail. To avoid this we test with a single | ||
# repeat but multiple batches. | ||
Repeat: {^Parameter: {Name: "Repeats", Default: 1}} | ||
|
||
# The instances of the template must specify the number of batches to make it clear what | ||
# they are testing with respect to cold/warm state. The stats per batch will be available in the | ||
# 'IndividualBulkInsert' measurement. | ||
Batches: {^Parameter: {Name: "Batches", Default: 500}} | ||
|
||
Actors: | ||
- Name: Loader | ||
Type: Loader | ||
Threads: 1 | ||
Phases: | ||
OnlyActiveInPhases: | ||
Active: [0] | ||
NopInPhasesUpTo: *maxPhases | ||
PhaseConfig: | ||
# Cannot have more threads than the actor itself. | ||
Threads: 1 | ||
Repeat: 1 | ||
Database: *db | ||
CollectionCount: 1 | ||
DocumentCount: *docCount | ||
BatchSize: 1000 | ||
Document: *document | ||
|
||
# Active in phases 1 and 4: right after the data load (phase 0) and after the columnstore index | ||
# build (phase 3), i.e. immediately before each of the two insert phases (2 and 5). | ||
- Name: Quiese | ||
Type: QuiesceActor | ||
Threads: 1 | ||
Database: *db | ||
Phases: | ||
OnlyActiveInPhases: | ||
Active: [1, 4] | ||
NopInPhasesUpTo: *maxPhases | ||
PhaseConfig: | ||
Repeat: 1 | ||
|
||
# Do multiple inserts of documents with the same schema. The performance of the first insert might | ||
# be affected by the inserts in the previous stages, but it should stabilize after that so, by using | ||
# a large number of batches, it should amortize sufficiently to have P90 similar to P50 for the | ||
# latency. | ||
# Target measurements: Latency50thPercentile | ||
- ActorFromTemplate: | ||
TemplateName: InsertFromSample | ||
TemplateParameters: | ||
Name: NoIndexes | ||
OnlyActiveInPhase: 2 | ||
|
||
- Name: BuildColumnStoreIndex | ||
Type: RunCommand | ||
Threads: 1 | ||
Phases: | ||
OnlyActiveInPhases: | ||
Active: [3] | ||
NopInPhasesUpTo: *maxPhases | ||
PhaseConfig: | ||
Repeat: 1 | ||
Database: *db | ||
Operations: | ||
- OperationMetricsName: BulkBuildColumnStoreIndex | ||
OperationName: RunCommand | ||
OperationCommand: | ||
createIndexes: *coll | ||
indexes: | ||
- key: {"$**": "columnstore"} | ||
name: csi | ||
|
||
# Repeat the same tests as with no index in presence of CSI. | ||
- ActorFromTemplate: | ||
TemplateName: InsertFromSample | ||
TemplateParameters: | ||
Name: Csi | ||
OnlyActiveInPhase: 5 | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
SchemaVersion: 2018-07-01 | ||
Owner: "@mongodb/query-execution" | ||
Description: | | ||
This workload compares performance of inserts into a collection with only the default _id index | ||
and in presence of a full columnstore index. We are not comparing to wildcard index because the | ||
nested data makes creation of a wildcard index too slow. Before changing any of the parameters in | ||
this workload please make sure the results can be correlated with 'CsiFragmentedInsertsFlat.yml'. | ||
As the approach in this workload is the same as in 'CsiFragmentedInsertsFlat.yml' with the exception | ||
of data used by the loader (and not comparing to the wildcard index), comments are intentionally | ||
omitted, please refer to the "flat" workload for the details. | ||
Keywords: | ||
- columnstore | ||
- insert | ||
|
||
AutoRun: | ||
- When: | ||
mongodb_setup: | ||
$eq: | ||
- standalone-all-feature-flags | ||
branch_name: | ||
$neq: | ||
- v4.0 | ||
- v4.2 | ||
- v4.4 | ||
- v5.0 | ||
- v6.0 | ||
|
||
Clients: | ||
Default: | ||
QueryOptions: | ||
socketTimeoutMS: 600_000 # = 10 min | ||
connectTimeoutMS: 600_000 | ||
|
||
GlobalDefaults: | ||
MaxPhases: &maxPhases 5 | ||
Database: &db csiFragmentedInsertsNested | ||
Collection: &coll Collection0 | ||
DocumentCount: &docCount 1e6 | ||
SchemaWidth: &schemaWidth 10000 | ||
ObjectWidth: &objectWidth 5 # with two nested paths, get 10 paths_per_object | ||
SampleSize: &sampleSize 10 | ||
|
||
Document: &document | ||
# Generate documents that would produce non-trivial array info strings. When these docs | ||
# are inserted, we want them to affect about 10 separated locations in the index, which is | ||
# achieved by uniform distribution of xN fields and because for each xN both "a" and "b" | ||
# subpaths are likely to be generated. | ||
# The documents will look like: | ||
# { | ||
# _id: ObjectId("63890d0df7b608a2d303b941"), | ||
# root: [ | ||
# { | ||
# x8372 : [{a: [42, *]}, {b: {obj: *}}, {a: [42, *], b: [42, *]}, {b: *}, {b: *}], | ||
# <4 more xN fields> | ||
# } | ||
# ] | ||
# } | ||
root: {^Array: { | ||
of: {^Object: { | ||
withNEntries: *objectWidth, | ||
havingKeys: {^FormatString: { | ||
"format": "x%d", "withArgs": [{^RandomInt: {min: 0, max: *schemaWidth}}] | ||
}}, | ||
andValues: {^Array: { | ||
of: | ||
# Create an object with either "a" or "b" key, or both. This is achieved by using | ||
# 'duplicatedKeys: skip' parameter, which would ignore the second generated key if | ||
# it has the same name as the first, so with probability 1/2 we'll get both fields | ||
# and with probability 1/4 each of {a: } and {b: }. | ||
{^Object: { | ||
withNEntries: 2, | ||
havingKeys: {^RandomString: {length: 1, alphabet: ab}}, | ||
andValues: {^Choose: {from: [ | ||
{^RandomInt: {min: 0, max: *schemaWidth}}, | ||
[42, {^RandomInt: {min: 0, max: *schemaWidth}}], | ||
{obj: {^RandomInt: {min: 0, max: *schemaWidth}}} | ||
]}}, | ||
duplicatedKeys: skip | ||
}}, | ||
# 5 elements make it almost certain that both "a" and "b" are used in the array to | ||
# yield ~10 paths per object. | ||
number: 5 | ||
}}, | ||
duplicatedKeys: skip | ||
}}, | ||
number: 1 | ||
}} | ||
|
||
ActorTemplates: | ||
- TemplateName: InsertFromSample | ||
Config: | ||
Name: {^Parameter: {Name: "Name", Default: "Insert"}} | ||
Type: SamplingLoader | ||
Threads: {^Parameter: {Name: "Threads", Default: 1}} | ||
Phases: | ||
OnlyActiveInPhases: | ||
Active: [{^Parameter: {Name: "OnlyActiveInPhase", Default: 1024}}] | ||
NopInPhasesUpTo: *maxPhases | ||
PhaseConfig: | ||
Database: *db | ||
Collection: *coll | ||
SampleSize: *sampleSize | ||
InsertBatchSize: *sampleSize | ||
Repeat: {^Parameter: {Name: "Repeats", Default: 1}} | ||
Batches: {^Parameter: {Name: "Batches", Default: 500}} | ||
|
||
Actors: | ||
- Name: Loader | ||
Type: Loader | ||
Threads: 1 | ||
Phases: | ||
OnlyActiveInPhases: | ||
Active: [0] | ||
NopInPhasesUpTo: *maxPhases | ||
PhaseConfig: | ||
Threads: 1 | ||
Repeat: 1 | ||
Database: *db | ||
CollectionCount: 1 | ||
DocumentCount: *docCount | ||
BatchSize: 1000 | ||
Document: *document | ||
|
||
- Name: Quiese | ||
Type: QuiesceActor | ||
Threads: 1 | ||
Database: *db | ||
Phases: | ||
OnlyActiveInPhases: | ||
Active: [1, 4] | ||
NopInPhasesUpTo: *maxPhases | ||
PhaseConfig: | ||
Repeat: 1 | ||
|
||
- ActorFromTemplate: | ||
TemplateName: InsertFromSample | ||
TemplateParameters: | ||
Name: NoIndexes | ||
OnlyActiveInPhase: 2 | ||
|
||
- Name: BuildColumnStoreIndex | ||
Type: RunCommand | ||
Threads: 1 | ||
Phases: | ||
OnlyActiveInPhases: | ||
Active: [3] | ||
NopInPhasesUpTo: *maxPhases | ||
PhaseConfig: | ||
Repeat: 1 | ||
Database: *db | ||
Operations: | ||
- OperationMetricsName: BulkBuildColumnStoreIndex | ||
OperationName: RunCommand | ||
OperationCommand: | ||
createIndexes: *coll | ||
indexes: | ||
- key: {"$**": "columnstore"} | ||
name: csi | ||
|
||
- ActorFromTemplate: | ||
TemplateName: InsertFromSample | ||
TemplateParameters: | ||
Name: Csi | ||
OnlyActiveInPhase: 5 | ||
|
||
|