Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions documentation/query_documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,52 @@ See `NucleotideEquals`.

See `HasNucleotideMutation`.

#### `NucleotideMutationProfile`

```
{
"distance": number,
"sequenceName": string, // optional; uses the default sequence if omitted
// exactly one of these
"querySequence": string
"sequenceId": string
"mutations": {"position": number, "symbol": string}[]
}
```

This filter is true if a sequence has at most `distance` **differences** from a profile sequence.

A difference at a position is considered when the database sequence's symbol is **not** ambiguity-compatible with the profile's symbol at that position. Ambiguity-compatible means the database symbol appears in the set of symbols compatible with the profile symbol (e.g. `R` is compatible with `A` because `R` represents A or G; `N` is compatible with any definitive base). Positions where the profile symbol is `N` (missing) are always skipped and never counted as differences.

**Profile input — exactly one of:**

- `querySequence`: a full sequence string of the same length as the reference. Each character must be a valid symbol for the sequence type.
- `sequenceId`: the primary key of a sequence already in the database. That sequence is used as the profile.
- `mutations`: an array of mutations relative to the reference. Positions are 1-based. Positions not listed retain the reference symbol. An empty array means the profile equals the reference.
- Each entry: `{"position": number, "symbol": string}` where `symbol` is a single valid character.

**Example** — find all sequences within 5 mutations of the reference:
```json
{
"type": "NucleotideMutationProfile",
"distance": 5,
"mutations": []
}
```

**Example** — find sequences within 2 mutations of a specific stored sequence:
```json
{
"type": "NucleotideMutationProfile",
"distance": 2,
"sequenceId": "EPI_ISL_123456"
}
```

#### `AminoAcidMutationProfile`

See `NucleotideMutationProfile`. Applies to amino acid sequences; `symbol` values must be valid amino acid symbols.

#### `Lineage`: `{"column": string, "value": string | null, "includeSublineages": boolean, ["recombinantFollowingMode": string]}`

This filter is true if the lineage in column `column` is equal to or an alias of `value`.
Expand Down
2 changes: 2 additions & 0 deletions performance/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
function(add_benchmark BENCH_NAME)
add_executable(${BENCH_NAME} ${BENCH_NAME}.cpp)
target_link_libraries(${BENCH_NAME} silolib)
target_include_directories(${BENCH_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/performance)
endfunction(add_benchmark)

add_benchmark(mutation_benchmark)
add_benchmark(many_string_equals)
add_benchmark(many_short_read_filters)
add_benchmark(nof_sequence_filter)
266 changes: 8 additions & 258 deletions performance/many_short_read_filters.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include <algorithm>
#include <exception>
#include <filesystem>
#include <fstream>
#include <random>
#include <sstream>
Expand All @@ -9,6 +8,7 @@
#include <arrow/compute/initialize.h>
#include <spdlog/spdlog.h>

#include "sequence_generator.h"
#include "silo/append/database_inserter.h"
#include "silo/append/ndjson_line_reader.h"
#include "silo/initialize/initializer.h"
Expand All @@ -25,253 +25,8 @@ using silo::Database;

namespace {

constexpr size_t DEFAULT_READ_COUNT = 5'000'000;
constexpr size_t DEFAULT_READ_LENGTH = 200;
constexpr double DEFAULT_MUTATION_RATE = 0.001;
constexpr double DEFAULT_DEATH_RATE = 0.1;
constexpr size_t DEFAULT_GENERATIONS = 5;
constexpr size_t DEFAULT_CHILDREN_PER_NODE = 3;
constexpr size_t DEFAULT_QUERY_COUNT = 10'000;

void changeCwdToTestFolder() {
// Look for the test data directory (`testBaseData`) in the current directory and up to
// <search_depth> directories above the current directory. If found, change the current working
// directory to the directory containing the test data directory
size_t search_depth = 4;
std::filesystem::path candidate_directory = std::filesystem::current_path().string();
for (size_t i = 0; i < search_depth; i++, candidate_directory = candidate_directory / "..") {
if (std::filesystem::exists(candidate_directory / "testBaseData/exampleDataset")) {
std::filesystem::current_path(candidate_directory);
return;
}
}
throw std::runtime_error(fmt::format(
"Should be run in root of repository, got {} and could not find root by heuristics",
std::filesystem::current_path().string()
));
}

std::string readReferenceFromFile() {
auto reference_genomes =
silo::ReferenceGenomes::readFromFile("testBaseData/exampleDataset/reference_genomes.json");
if (reference_genomes.raw_nucleotide_sequences.empty()) {
throw std::runtime_error("No nucleotide sequences found in reference genomes file");
}
return reference_genomes.raw_nucleotide_sequences.at(0);
}

using silo::Nucleotide;

// Simple tree-based sequence evolution model
class SequenceTreeGenerator {
std::mt19937 rng;
const std::string& reference;
double mutation_rate;
double death_rate;
size_t generations;
size_t children_per_node;

char mutateBase(char base) {
std::uniform_int_distribution<size_t> dist(0, 3);
char new_base;
do {
Nucleotide::Symbol new_symbol = Nucleotide::SYMBOLS.at(dist(rng));
new_base = Nucleotide::symbolToChar(new_symbol);
} while (new_base == base);
return new_base;
}

std::string mutateSequence(std::string_view sequence) {
std::string mutated{sequence};
const size_t seq_length = sequence.size();

// Sample the number of mutations from a binomial distribution
std::binomial_distribution<size_t> num_mutations_dist(seq_length, mutation_rate);
size_t num_mutations = num_mutations_dist(rng);

// Randomly choose positions to mutate
std::uniform_int_distribution<size_t> pos_dist(0, seq_length - 1);
for (size_t i = 0; i < num_mutations; ++i) {
size_t pos = pos_dist(rng);
mutated[pos] = mutateBase(mutated[pos]);
}
return mutated;
}

public:
SequenceTreeGenerator(
const std::string& ref,
uint64_t seed = 42,
double mut_rate = DEFAULT_MUTATION_RATE,
double death = DEFAULT_DEATH_RATE,
size_t gens = DEFAULT_GENERATIONS,
size_t children = DEFAULT_CHILDREN_PER_NODE
)
: rng(seed),
reference(ref),
mutation_rate(mut_rate),
death_rate(death),
generations(gens),
children_per_node(children) {}

// Generate evolved sequences using a tree model
std::vector<std::string> generateEvolvedSequences() {
std::vector<std::string> all_generated_sequences = {reference};
std::vector<std::string_view> current_generation = {reference};
std::vector<std::string_view> next_generation;
std::bernoulli_distribution survives(1.0 - death_rate);

for (size_t gen = 0; gen < generations; ++gen) {
next_generation.clear();
for (const auto& seq : current_generation) {
for (size_t child = 0; child < children_per_node; ++child) {
if (survives(rng)) {
all_generated_sequences.push_back(mutateSequence(seq));
next_generation.push_back(all_generated_sequences.back());
}
}
}
if (next_generation.empty()) {
// If all died, keep at least one survivor
next_generation.push_back(all_generated_sequences.back());
}
current_generation = std::move(next_generation);
}
return all_generated_sequences;
}
};

struct ShortRead {
size_t id;
size_t offset;
std::string sequence;
};

// Lazy generator for short reads - generates on-demand without materializing all reads
class ShortReadGenerator {
std::vector<std::string> evolved_sequences;
std::mt19937 rng;
std::uniform_int_distribution<size_t> seq_dist;
size_t count;
size_t read_length;
size_t num_positions;

public:
class iterator {
ShortReadGenerator* generator;
size_t current_id;

public:
using iterator_category = std::input_iterator_tag;
using value_type = ShortRead;
using difference_type = std::ptrdiff_t;
using pointer = const ShortRead*;
using reference = ShortRead;

iterator(ShortReadGenerator* gen, size_t id)
: generator(gen),
current_id(id) {}

ShortRead operator*() { return generator->generateAt(current_id); }

iterator& operator++() {
++current_id;
return *this;
}

iterator operator++(int) {
iterator tmp = *this;
++current_id;
return tmp;
}

bool operator==(const iterator& other) const { return current_id == other.current_id; }
bool operator!=(const iterator& other) const { return current_id != other.current_id; }
};

ShortReadGenerator(
const std::string& reference,
size_t count,
size_t read_length,
uint64_t seed = 42
)
: count(count),
read_length(read_length) {
SequenceTreeGenerator tree_gen(reference, seed);
evolved_sequences = tree_gen.generateEvolvedSequences();

SPDLOG_INFO("Generated {} evolved sequences from tree model", evolved_sequences.size());

const size_t seq_length = reference.size();
SILO_ASSERT(read_length < seq_length);
num_positions = seq_length - read_length + 1;

rng.seed(seed + 1000);
seq_dist = std::uniform_int_distribution<size_t>(0, evolved_sequences.size() - 1);
}

ShortRead generateAt(size_t read_id) {
const size_t offset = (read_id * num_positions) / count;
const auto& source_seq = evolved_sequences[seq_dist(rng)];
return {read_id, offset, source_seq.substr(offset, read_length)};
}

iterator begin() { return iterator(this, 0); }
iterator end() { return iterator(this, count); }

[[nodiscard]] size_t size() const { return count; }
};

std::stringstream generateShortReadNdjson(
const std::string& reference,
size_t count = DEFAULT_READ_COUNT,
size_t read_length = DEFAULT_READ_LENGTH
) {
ShortReadGenerator generator(reference, count, read_length);
std::stringstream buffer;

for (const auto& read : generator) {
buffer << fmt::format(
R"({{"readId":"read_{}","samplingDate":"2024-01-01","locationName":"generated","main":{{"insertions":[],"offset":{},"sequence":"{}"}}}})",
read.id,
read.offset,
read.sequence
) << "\n";
}

return buffer;
}

std::shared_ptr<Database> initializeDatabaseWithSingleReference(std::string reference) {
auto database_config = silo::config::DatabaseConfig::getValidatedConfig(R"(
schema:
instanceName: test
metadata:
- name: readId
type: string
- name: samplingDate
type: date
- name: locationName
type: string
primaryKey: readId
)");

silo::ReferenceGenomes reference_genomes{{{"main", reference}}, {}};

auto database = std::make_shared<silo::Database>();
database->createTable(
silo::schema::TableName::getDefault(),
silo::initialize::Initializer::createSchemaFromConfigFiles(
std::move(database_config),
std::move(reference_genomes),
{},
silo::common::PhyloTree{},
/*without_unaligned_sequences=*/true
)
);
return database;
}

struct TestDatabaseResult {
std::shared_ptr<Database> database;
size_t reference_length;
Expand All @@ -280,14 +35,12 @@ struct TestDatabaseResult {
TestDatabaseResult setupTestDatabase() {
std::string reference = readReferenceFromFile();
SPDLOG_INFO("Read reference sequence of length {}", reference.size());
size_t ref_length = reference.size();
const size_t ref_length = reference.size();

auto input_buffer = generateShortReadNdjson(reference);
SPDLOG_INFO("Generated short read NDJSON data");

auto database = initializeDatabaseWithSingleReference(reference);

auto input_data_stream = silo::append::NdjsonLineReader{input_buffer};
auto database = initializeDatabaseWithShortReadSchema(reference);
database->appendData(silo::schema::TableName::getDefault(), input_buffer);

return {database, ref_length};
Expand All @@ -306,20 +59,18 @@ class QueryGenerator {

std::string generateQuery() {
std::uniform_int_distribution<size_t> pos_dist(1, reference_length - 1);
size_t position = pos_dist(rng);
const size_t position = pos_dist(rng);

bool use_all_symbols = (query_counter++ % 2 == 1);
const bool use_all_symbols = (query_counter++ % 2 == 1);

if (use_all_symbols) {
// Query all 5 symbols (A, C, G, T, -) at the same position in an OR
return fmt::format(
R"({{"action":{{"type":"Aggregated"}},"filterExpression":{{"children":[{{"children":[{{"children":[{{"column":"locationName","value":"generated","type":"StringEquals"}}],"type":"Or"}},{{"column":"samplingDate","from":"2024-01-01","to":"2024-01-07","type":"DateBetween"}}],"type":"And"}},{{"children":[{{"position":{0},"symbol":"A","type":"NucleotideEquals"}},{{"position":{0},"symbol":"C","type":"NucleotideEquals"}},{{"position":{0},"symbol":"G","type":"NucleotideEquals"}},{{"position":{0},"symbol":"T","type":"NucleotideEquals"}},{{"position":{0},"symbol":"-","type":"NucleotideEquals"}}],"type":"Or"}},{{"column":"samplingDate","from":"2024-01-01","to":"2024-01-07","type":"DateBetween"}}],"type":"And"}}}})",
position
);
}
// Query a single random symbol at the position
std::uniform_int_distribution<size_t> sym_dist(0, SYMBOLS.size() - 1);
char symbol = SYMBOLS[sym_dist(rng)];
const char symbol = SYMBOLS[sym_dist(rng)];
return fmt::format(
R"({{"action":{{"type":"Aggregated"}},"filterExpression":{{"children":[{{"children":[{{"children":[{{"column":"locationName","value":"generated","type":"StringEquals"}}],"type":"Or"}},{{"column":"samplingDate","from":"2024-01-01","to":"2024-01-07","type":"DateBetween"}}],"type":"And"}},{{"position":{},"symbol":"{}","type":"NucleotideEquals"}},{{"column":"samplingDate","from":"2024-01-01","to":"2024-01-07","type":"DateBetween"}}],"type":"And"}}}})",
position,
Expand All @@ -334,7 +85,6 @@ void executeAllQueries(
size_t query_count = DEFAULT_QUERY_COUNT
) {
QueryGenerator query_gen(reference_length);

for (size_t query_num = 1; query_num <= query_count; ++query_num) {
if (query_num % 1000 == 0) {
SPDLOG_INFO("Executing query number {}", query_num);
Expand All @@ -358,7 +108,7 @@ void run() {

auto [database, reference_length] = setupTestDatabase();

while(true){
while (true) {
SPDLOG_INFO("Starting full query set benchmark ({} queries):", DEFAULT_QUERY_COUNT);
auto start = std::chrono::high_resolution_clock::now();
executeAllQueries(database, reference_length);
Expand All @@ -370,7 +120,7 @@ void run() {

} // namespace

int main(){
int main() {
try {
run();
} catch (std::exception& e) {
Expand Down
Loading