diff --git a/documentation/api.md b/documentation/api.md index 46d4ff5ef..60052f90c 100644 --- a/documentation/api.md +++ b/documentation/api.md @@ -57,8 +57,7 @@ Returns metadata about the loaded database. "version": "0.1.0", "sequenceCount": 100, "horizontalBitmapsSize": 5594, - "verticalBitmapsSize": 28102, - "numberOfPartitions": 1 + "verticalBitmapsSize": 28102 } ``` @@ -68,7 +67,6 @@ Returns metadata about the loaded database. | `sequenceCount` | Total number of sequences in the database | | `horizontalBitmapsSize` | Size of horizontal bitmap indexes (bytes) | | `verticalBitmapsSize` | Size of vertical bitmap indexes (bytes) | -| `numberOfPartitions` | Number of table partitions | --- diff --git a/documentation/data_directories.md b/documentation/data_directories.md index 1ad44b668..92b974277 100644 --- a/documentation/data_directories.md +++ b/documentation/data_directories.md @@ -11,13 +11,11 @@ output/ 1700000000/ data_version.silo database_schema.silo - default/ - ... (partition data) + default.silo 1700100000/ data_version.silo database_schema.silo - default/ - ... + default.silo ``` ### data_version.silo format diff --git a/endToEndTests/test/info.test.js b/endToEndTests/test/info.test.js index 5e1875a1c..dca2dbd9d 100644 --- a/endToEndTests/test/info.test.js +++ b/endToEndTests/test/info.test.js @@ -19,7 +19,6 @@ describe('The /info endpoint', () => { sequenceCount: 100, horizontalBitmapsSize: 5595, verticalBitmapsSize: 28102, - numberOfPartitions: 1, }); }); }); diff --git a/src/silo/api/lineage_definition_handler.cpp b/src/silo/api/lineage_definition_handler.cpp index ffd1c701f..d49edb7cc 100644 --- a/src/silo/api/lineage_definition_handler.cpp +++ b/src/silo/api/lineage_definition_handler.cpp @@ -44,8 +44,7 @@ void LineageDefinitionHandler::get( throw BadRequest("The column {} is not of type indexed-string.", column_name); } auto* metadata = - table->second->schema - ->getColumnMetadata(column_name) + table->second->schema->getColumnMetadata(column_name) .value(); if (!metadata->lineage_tree.has_value()) { throw BadRequest("The column {} does not have a lineageIndex defined.", column_name); diff --git a/src/silo/append/database_inserter.cpp b/src/silo/append/database_inserter.cpp index 9a816664e..85ccbb66c 100644 --- a/src/silo/append/database_inserter.cpp +++ b/src/silo/append/database_inserter.cpp @@ -52,7 +52,7 @@ std::expected findFieldManual( std::expected findFieldWithFallbacks( simdjson::ondemand::object& object, - const TablePartitionInserter::SniffedField& sniffed_field + const TableInserter::SniffedField& sniffed_field ) { simdjson::ondemand::value column_value; auto error = object.find_field(sniffed_field.escaped_key).get(column_value); @@ -108,10 +108,11 @@ std::expected iterateToObject( } // namespace -std::expected, std::string> -TablePartitionInserter::sniffFieldOrder(simdjson::ondemand::document_reference ndjson_line) const { +std::expected, std::string> TableInserter::sniffFieldOrder( + simdjson::ondemand::document_reference ndjson_line +) const { std::vector order_in_json_line; - auto columns_in_table = table_partition->columns.metadata; + auto columns_in_table = table->columns.metadata; ASSIGN_OR_RAISE(auto object, iterateToObject(ndjson_line)); for (auto maybe_field : object) { ASSIGN_OR_RAISE_SIMDJSON( @@ -161,47 +162,27 @@ TablePartitionInserter::sniffFieldOrder(simdjson::ondemand::document_reference n return order_in_json_line; } -std::expected TablePartitionInserter::insert( +std::expected TableInserter::insert( simdjson::ondemand::document_reference ndjson_line, - const std::vector& field_order_hint + const std::vector& field_order_hint ) const { - EVOBENCH_SCOPE_EVERY(20, "TablePartitionInserter", "insert"); + EVOBENCH_SCOPE_EVERY(20, "TableInserter", "insert"); ASSIGN_OR_RAISE(auto object, iterateToObject(ndjson_line)); for (const auto& sniffed_field : field_order_hint) { ASSIGN_OR_RAISE(auto column_value, findFieldWithFallbacks(object, sniffed_field)); - auto success_or_error = table_partition->columns.addJsonValueToColumn( - sniffed_field.column_identifier, column_value - ); + auto success_or_error = + table->columns.addJsonValueToColumn(sniffed_field.column_identifier, column_value); if (!success_or_error.has_value()) { return success_or_error; } } - table_partition->sequence_count++; + table->sequence_count++; return {}; } -TablePartitionInserter::Commit TablePartitionInserter::commit() const { - table_partition->finalize(); - table_partition->validate(); - return Commit{}; -} - -TablePartitionInserter TableInserter::openNewPartition() const { - if (table->getNumberOfPartitions() == 0) { - return TablePartitionInserter{table->addPartition()}; - } - return TablePartitionInserter{table->getPartition(0)}; -} - -TablePartitionInserter TableInserter::openLastPartition() const { - if (table->getNumberOfPartitions() == 0) { - return openNewPartition(); - } - return TablePartitionInserter{table->getPartition(table->getNumberOfPartitions() - 1)}; -} - TableInserter::Commit TableInserter::commit() const { try { + table->finalize(); table->validate(); return Commit{}; } catch (const silo::schema::DuplicatePrimaryKeyException& exception) { @@ -209,16 +190,18 @@ TableInserter::Commit TableInserter::commit() const { } } -TablePartitionInserter::Commit appendDataToTablePartition( - const TablePartitionInserter& partition_inserter, +TableInserter::Commit appendDataToTable( + std::shared_ptr table, NdjsonLineReader& input_data ) { - EVOBENCH_SCOPE("TablePartitionInserter", "appendDataToTablePartition"); + EVOBENCH_SCOPE("TableInserter", "appendDataToTable"); + const TableInserter table_inserter(std::move(table)); + size_t line_count = 0; bool first_line = true; - std::vector sniffed_field_order; + std::vector sniffed_field_order; for (auto [json_obj_or_error, raw_line] : input_data) { simdjson::ondemand::document_reference ndjson_line; auto error = json_obj_or_error.get(ndjson_line); @@ -231,7 +214,7 @@ TablePartitionInserter::Commit appendDataToTablePartition( } if (first_line) { - auto sniffed_field_order_or_error = partition_inserter.sniffFieldOrder(ndjson_line); + auto sniffed_field_order_or_error = table_inserter.sniffFieldOrder(ndjson_line); if (!sniffed_field_order_or_error.has_value()) { throw AppendException{ "{} - current line: {}", sniffed_field_order_or_error.error(), raw_line @@ -241,7 +224,7 @@ TablePartitionInserter::Commit appendDataToTablePartition( first_line = false; } - auto maybe_error = partition_inserter.insert(ndjson_line, sniffed_field_order); + auto maybe_error = table_inserter.insert(ndjson_line, sniffed_field_order); if (!maybe_error.has_value()) { throw AppendException{"{} - current line: {}", maybe_error.error(), raw_line}; } @@ -252,20 +235,6 @@ TablePartitionInserter::Commit appendDataToTablePartition( } } - return partition_inserter.commit(); -} - -TableInserter::Commit appendDataToTable( - std::shared_ptr table, - NdjsonLineReader& input_data -) { - const TableInserter table_inserter(std::move(table)); - - // TODO(#738) make partition configurable - auto table_partition = table_inserter.openLastPartition(); - - appendDataToTablePartition(table_partition, input_data); - return table_inserter.commit(); } diff --git a/src/silo/append/database_inserter.h b/src/silo/append/database_inserter.h index a09e3b958..c26ce77d2 100644 --- a/src/silo/append/database_inserter.h +++ b/src/silo/append/database_inserter.h @@ -8,22 +8,21 @@ #include "silo/append/ndjson_line_reader.h" #include "silo/storage/table.h" -#include "silo/storage/table_partition.h" namespace silo::append { -class TablePartitionInserter { - std::shared_ptr table_partition; +class TableInserter { + std::shared_ptr table; public: class Commit { - friend class TablePartitionInserter; + friend class TableInserter; Commit() = default; }; - explicit TablePartitionInserter(std::shared_ptr table_partition) - : table_partition(std::move(table_partition)) {} + explicit TableInserter(std::shared_ptr table) + : table(std::move(table)) {} struct SniffedField { silo::schema::ColumnIdentifier column_identifier; @@ -47,34 +46,9 @@ class TablePartitionInserter { [[nodiscard]] Commit commit() const; }; -class TableInserter { - std::shared_ptr table; - - public: - class Commit { - friend class TableInserter; - - Commit() = default; - }; - - explicit TableInserter(std::shared_ptr table) - : table(std::move(table)) {} - - [[nodiscard]] TablePartitionInserter openNewPartition() const; - - [[nodiscard]] TablePartitionInserter openLastPartition() const; - - [[nodiscard]] Commit commit() const; -}; - -TablePartitionInserter::Commit appendDataToTablePartition( - const TablePartitionInserter& partition_inserter, - NdjsonLineReader& input_data -); - TableInserter::Commit appendDataToTable( std::shared_ptr table, NdjsonLineReader& input_data ); -} // namespace silo::append \ No newline at end of file +} // namespace silo::append diff --git a/src/silo/common/aa_symbols.h b/src/silo/common/aa_symbols.h index e0a007b8c..6e3b74981 100644 --- a/src/silo/common/aa_symbols.h +++ b/src/silo/common/aa_symbols.h @@ -14,7 +14,7 @@ namespace silo { namespace storage::column { template -class SequenceColumnPartition; +class SequenceColumn; } class AminoAcid { @@ -52,7 +52,7 @@ class AminoAcid { }; static constexpr schema::ColumnType COLUMN_TYPE = schema::ColumnType::AMINO_ACID_SEQUENCE; - using Column = storage::column::SequenceColumnPartition; + using Column = storage::column::SequenceColumn; static constexpr uint32_t COUNT = 28; static_assert(COUNT == static_cast(Symbol::X) + 1); diff --git a/src/silo/common/nucleotide_symbols.h b/src/silo/common/nucleotide_symbols.h index b8f7aa782..b6ec4cc0a 100644 --- a/src/silo/common/nucleotide_symbols.h +++ b/src/silo/common/nucleotide_symbols.h @@ -14,7 +14,7 @@ namespace silo { namespace storage::column { template -class SequenceColumnPartition; +class SequenceColumn; } class Nucleotide { @@ -40,7 +40,7 @@ class Nucleotide { }; static constexpr schema::ColumnType COLUMN_TYPE = schema::ColumnType::NUCLEOTIDE_SEQUENCE; - using Column = storage::column::SequenceColumnPartition; + using Column = storage::column::SequenceColumn; static constexpr uint32_t COUNT = 16; diff --git a/src/silo/common/serialization_version.txt b/src/silo/common/serialization_version.txt index 15d643c29..4076d0809 100644 --- a/src/silo/common/serialization_version.txt +++ b/src/silo/common/serialization_version.txt @@ -1 +1 @@ -1774509839 +1774967790 diff --git a/src/silo/database.cpp b/src/silo/database.cpp index 95382e28a..31b8c3102 100644 --- a/src/silo/database.cpp +++ b/src/silo/database.cpp @@ -221,9 +221,7 @@ std::string Database::getNucleotideReferenceSequence( const auto& table_schema = maybe_table_schema->second; auto maybe_sequence_column_metadata = - table_schema->getColumnMetadata>( - sequence_name - ); + table_schema->getColumnMetadata>(sequence_name); if (maybe_sequence_column_metadata == std::nullopt) { SPDLOG_ERROR( "The database table {} does not contain the nucleotide sequence column {}", @@ -249,9 +247,7 @@ std::string Database::getAminoAcidReferenceSequence( const auto& table_schema = maybe_table_schema->second; auto maybe_sequence_column_metadata = - table_schema->getColumnMetadata>( - sequence_name - ); + table_schema->getColumnMetadata>(sequence_name); if (maybe_sequence_column_metadata == std::nullopt) { SPDLOG_ERROR( "The database table {} does not contain the nucleotide sequence column {}", @@ -278,20 +274,9 @@ roaring::Roaring Database::getFilteredBitmap( return {}; } auto table = maybe_table->second; - if (table->getNumberOfPartitions() == 0) { - SPDLOG_WARN("The table is empty"); - return {}; - } - if (table->getNumberOfPartitions() > 1) { - SPDLOG_ERROR( - "The table should not contain more than one partition (actual: {}), internal error.", - table->getNumberOfPartitions() - ); - return {}; - } auto rewritten_filter_expression = - filter_expression->rewrite(*table, *table->getPartition(0), Expression::AmbiguityMode::NONE); - auto filter_operator = rewritten_filter_expression->compile(*table, *table->getPartition(0)); + filter_expression->rewrite(*table, Expression::AmbiguityMode::NONE); + auto filter_operator = rewritten_filter_expression->compile(*table); roaring::Roaring bitmap = filter_operator->evaluate().getConstReference(); return bitmap; } @@ -363,22 +348,18 @@ std::vector> Database::getPrevalentAminoAcidMut namespace { void addTableStatisticsToDatabaseInfo(DatabaseInfo& database_info, const storage::Table& table) { - for (size_t partition_idx = 0; partition_idx < table.getNumberOfPartitions(); ++partition_idx) { - auto table_partition = table.getPartition(partition_idx); - // TODO(#743) try to analyze size accuracy relative to RSS - for (const auto& [_, seq_column] : table_partition->columns.nuc_columns) { - auto info = seq_column.getInfo(); - database_info.vertical_bitmaps_size += info.vertical_bitmaps_size; - database_info.horizontal_bitmaps_size += info.horizontal_bitmaps_size; - } - for (const auto& [_, seq_column] : table_partition->columns.aa_columns) { - auto info = seq_column.getInfo(); - database_info.vertical_bitmaps_size += info.vertical_bitmaps_size; - database_info.horizontal_bitmaps_size += info.horizontal_bitmaps_size; - } - database_info.sequence_count += table_partition->sequence_count; + // TODO(#743) try to analyze size accuracy relative to RSS + for (const auto& [_, seq_column] : table.columns.nuc_columns) { + auto info = seq_column.getInfo(); + database_info.vertical_bitmaps_size += info.vertical_bitmaps_size; + database_info.horizontal_bitmaps_size += info.horizontal_bitmaps_size; + } + for (const auto& [_, seq_column] : table.columns.aa_columns) { + auto info = seq_column.getInfo(); + database_info.vertical_bitmaps_size += info.vertical_bitmaps_size; + database_info.horizontal_bitmaps_size += info.horizontal_bitmaps_size; } - database_info.number_of_partitions += table.getNumberOfPartitions(); + database_info.sequence_count += table.sequence_count; } } // namespace @@ -388,8 +369,7 @@ DatabaseInfo Database::getDatabaseInfo() const { .version = silo::RELEASE_VERSION, .sequence_count = 0, .vertical_bitmaps_size = 0, - .horizontal_bitmaps_size = 0, - .number_of_partitions = 0 + .horizontal_bitmaps_size = 0 }; for (const auto& [_, table] : tables) { addTableStatisticsToDatabaseInfo(database_info, *table); @@ -441,8 +421,9 @@ void Database::saveDatabaseState(const std::filesystem::path& save_directory) { for (const auto& [table_name, table] : tables) { SPDLOG_DEBUG("Saving table data for table {}", table_name.getName()); - std::filesystem::create_directory(versioned_save_directory / table_name.getName()); - table->saveData(versioned_save_directory / table_name.getName()); + const std::filesystem::path table_file = + versioned_save_directory / (table_name.getName() + ".silo"); + table->saveData(table_file); } data_version_.saveToFile(versioned_save_directory / DATA_VERSION_FILENAME); @@ -488,7 +469,7 @@ Database Database::loadDatabaseState(const silo::SiloDataSource& silo_data_sourc for (const auto& [table_name, _] : schema.tables) { SPDLOG_DEBUG("Loading data for table {}", table_name.getName()); - database.tables.at(table_name)->loadData(save_directory / table_name.getName()); + database.tables.at(table_name)->loadData(save_directory / (table_name.getName() + ".silo")); } database.data_version_ = loadDataVersion(save_directory / DATA_VERSION_FILENAME); diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index 65efbc982..5f8ba87f4 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -89,7 +89,6 @@ TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) { EXPECT_EQ(database_info.sequence_count, 5); EXPECT_GT(database_info.vertical_bitmaps_size, 0); EXPECT_GT(database_info.horizontal_bitmaps_size, 0); - EXPECT_EQ(database_info.number_of_partitions, 1); // When bumping the serialization version, run `make bump-serialization-version` which sets // SILO_KEEP_SERIALIZED_STATE=1 to preserve the produced directory for committing to Git. diff --git a/src/silo/database_info.cpp b/src/silo/database_info.cpp index 606ebd94d..c575b0fcc 100644 --- a/src/silo/database_info.cpp +++ b/src/silo/database_info.cpp @@ -8,7 +8,6 @@ void silo::to_json(nlohmann::json& json, const DatabaseInfo& databaseInfo) { {"version", databaseInfo.version}, {"sequenceCount", databaseInfo.sequence_count}, {"verticalBitmapsSize", databaseInfo.vertical_bitmaps_size}, - {"horizontalBitmapsSize", databaseInfo.horizontal_bitmaps_size}, - {"numberOfPartitions", databaseInfo.number_of_partitions} + {"horizontalBitmapsSize", databaseInfo.horizontal_bitmaps_size} }; } \ No newline at end of file diff --git a/src/silo/database_info.h b/src/silo/database_info.h index c230fad50..95aafd931 100644 --- a/src/silo/database_info.h +++ b/src/silo/database_info.h @@ -10,7 +10,6 @@ struct DatabaseInfo { uint32_t sequence_count; uint64_t vertical_bitmaps_size; uint64_t horizontal_bitmaps_size; - uint64_t number_of_partitions; }; // NOLINTNEXTLINE(readability-identifier-naming,misc-use-internal-linkage) diff --git a/src/silo/initialize/initializer.cpp b/src/silo/initialize/initializer.cpp index 5e7319368..1b79465e9 100644 --- a/src/silo/initialize/initializer.cpp +++ b/src/silo/initialize/initializer.cpp @@ -61,7 +61,7 @@ struct ColumnMetadataInitializer { }; template <> -void ColumnMetadataInitializer::operator()( +void ColumnMetadataInitializer::operator()( std::shared_ptr& metadata, const config::DatabaseMetadata& config_metadata, const ReferenceGenomes& /*reference_genomes*/, @@ -83,18 +83,17 @@ void ColumnMetadataInitializer::operator()( + metadata = std::make_shared( config_metadata.name, lineage_tree.value() ); } else { - metadata = std::make_shared( - config_metadata.name - ); + metadata = + std::make_shared(config_metadata.name); } } template <> -void ColumnMetadataInitializer::operator()( +void ColumnMetadataInitializer::operator()( std::shared_ptr& metadata, const config::DatabaseMetadata& config_metadata, const ReferenceGenomes& /*reference_genomes*/, @@ -102,17 +101,16 @@ void ColumnMetadataInitializer::operator()( + metadata = std::make_shared( config_metadata.name, phylo_tree_file ); } else { - metadata = - std::make_shared(config_metadata.name); + metadata = std::make_shared(config_metadata.name); } } template <> -void ColumnMetadataInitializer::operator()( +void ColumnMetadataInitializer::operator()( std::shared_ptr& /*metadata*/, const config::DatabaseMetadata& /*config_metadata*/, const ReferenceGenomes& /*reference_genomes*/, @@ -123,7 +121,7 @@ void ColumnMetadataInitializer::operator() -void ColumnMetadataInitializer::operator()>( +void ColumnMetadataInitializer::operator()>( std::shared_ptr& /*metadata*/, const config::DatabaseMetadata& /*config_metadata*/, const ReferenceGenomes& /*reference_genomes*/, @@ -134,7 +132,7 @@ void ColumnMetadataInitializer::operator() -void ColumnMetadataInitializer::operator()>( +void ColumnMetadataInitializer::operator()>( std::shared_ptr& /*metadata*/, const config::DatabaseMetadata& /*config_metadata*/, const ReferenceGenomes& /*reference_genomes*/, @@ -230,7 +228,7 @@ void assertPrimaryKeyOfTypeString(const silo::config::DatabaseConfig& database_c } // TODO(#741) we prepend the unalignedSequence columns (which are using the type -// ZstdCompressedStringColumnPartition) with 'unaligned_'. This should be cleaned up with a +// ZstdCompressedStringColumn) with 'unaligned_'. This should be cleaned up with a // refactor and breaking change of the current input format. const std::string UNALIGNED_NUCLEOTIDE_SEQUENCE_PREFIX = "unaligned_"; diff --git a/src/silo/initialize/initializer.test.cpp b/src/silo/initialize/initializer.test.cpp index 602ed30d3..06c207d6a 100644 --- a/src/silo/initialize/initializer.test.cpp +++ b/src/silo/initialize/initializer.test.cpp @@ -55,14 +55,11 @@ A.11: using silo::schema::ColumnType; ASSERT_TRUE(table_schema->getColumn("M").has_value()); ASSERT_EQ(table_schema->getColumn("M").value().type, ColumnType::AMINO_ACID_SEQUENCE); - ASSERT_TRUE( - table_schema - ->getColumnMetadata>("M") - .has_value() - ); + ASSERT_TRUE(table_schema + ->getColumnMetadata>("M") + .has_value()); ASSERT_EQ( - table_schema - ->getColumnMetadata>("M") + table_schema->getColumnMetadata>("M") .value() ->reference_sequence, reference_genomes.stringToVector("MADS*") @@ -74,8 +71,7 @@ A.11: ASSERT_TRUE(table_schema->getColumn("country").has_value()); ASSERT_EQ(table_schema->getColumn("country").value().type, ColumnType::INDEXED_STRING); ASSERT_TRUE(table_schema - ->getColumnMetadata("country" - ) + ->getColumnMetadata("country") .has_value()); ASSERT_TRUE(table_schema->getColumn("date").has_value()); @@ -83,26 +79,20 @@ A.11: ASSERT_TRUE(table_schema->getColumn("division").has_value()); ASSERT_EQ(table_schema->getColumn("division").value().type, ColumnType::INDEXED_STRING); - ASSERT_TRUE( - table_schema - ->getColumnMetadata("division") - .has_value() - ); + ASSERT_TRUE(table_schema + ->getColumnMetadata("division") + .has_value()); ASSERT_TRUE(table_schema->getColumn("main").has_value()); ASSERT_EQ(table_schema->getColumn("main").value().type, ColumnType::NUCLEOTIDE_SEQUENCE); ASSERT_TRUE( table_schema - ->getColumnMetadata>( - "main" - ) + ->getColumnMetadata>("main") .has_value() ); ASSERT_EQ( table_schema - ->getColumnMetadata>( - "main" - ) + ->getColumnMetadata>("main") .value() ->reference_sequence, reference_genomes.stringToVector("ACGTACGT") @@ -110,14 +100,11 @@ A.11: ASSERT_TRUE(table_schema->getColumn("pango_lineage").has_value()); ASSERT_EQ(table_schema->getColumn("pango_lineage").value().type, ColumnType::INDEXED_STRING); - ASSERT_TRUE( - table_schema - ->getColumnMetadata("pango_lineage") - .has_value() - ); + ASSERT_TRUE(table_schema + ->getColumnMetadata("pango_lineage") + .has_value()); auto* pango_metadata = - table_schema - ->getColumnMetadata("pango_lineage") + table_schema->getColumnMetadata("pango_lineage") .value(); ASSERT_EQ(pango_metadata->dictionary.getValue(0), "A"); ASSERT_EQ(pango_metadata->dictionary.getValue(1), "A.1"); @@ -128,33 +115,30 @@ A.11: ASSERT_TRUE(table_schema->getColumn("primaryKey").has_value()); ASSERT_EQ(table_schema->getColumn("primaryKey").value().type, ColumnType::STRING); - ASSERT_TRUE(table_schema - ->getColumnMetadata("primaryKey") - .has_value()); + ASSERT_TRUE( + table_schema->getColumnMetadata("primaryKey").has_value() + ); ASSERT_TRUE(table_schema->getColumn("qc_value").has_value()); ASSERT_EQ(table_schema->getColumn("qc_value").value().type, ColumnType::FLOAT); ASSERT_TRUE(table_schema->getColumn("region").has_value()); ASSERT_EQ(table_schema->getColumn("region").value().type, ColumnType::INDEXED_STRING); - ASSERT_TRUE(table_schema - ->getColumnMetadata("region") + ASSERT_TRUE(table_schema->getColumnMetadata("region") .has_value()); ASSERT_TRUE(table_schema->getColumn("testSecondSequence").has_value()); ASSERT_EQ( table_schema->getColumn("testSecondSequence").value().type, ColumnType::NUCLEOTIDE_SEQUENCE ); - ASSERT_TRUE( - table_schema - ->getColumnMetadata>( - "testSecondSequence" - ) - .has_value() - ); + ASSERT_TRUE(table_schema + ->getColumnMetadata>( + "testSecondSequence" + ) + .has_value()); ASSERT_EQ( table_schema - ->getColumnMetadata>( + ->getColumnMetadata>( "testSecondSequence" ) .value() @@ -169,16 +153,14 @@ A.11: ASSERT_EQ( table_schema->getColumn("unaligned_main").value().type, ColumnType::ZSTD_COMPRESSED_STRING ); - ASSERT_TRUE(table_schema - ->getColumnMetadata( - "unaligned_main" - ) - .has_value()); + ASSERT_TRUE( + table_schema + ->getColumnMetadata("unaligned_main") + .has_value() + ); ASSERT_EQ( table_schema - ->getColumnMetadata( - "unaligned_main" - ) + ->getColumnMetadata("unaligned_main") .value() ->dictionary_string, "ACGTACGT" @@ -190,13 +172,13 @@ A.11: ColumnType::ZSTD_COMPRESSED_STRING ); ASSERT_TRUE(table_schema - ->getColumnMetadata( + ->getColumnMetadata( "unaligned_testSecondSequence" ) .has_value()); ASSERT_EQ( table_schema - ->getColumnMetadata( + ->getColumnMetadata( "unaligned_testSecondSequence" ) .value() diff --git a/src/silo/query_engine/actions/insertions.cpp b/src/silo/query_engine/actions/insertions.cpp index ee40910a3..38bf399f6 100644 --- a/src/silo/query_engine/actions/insertions.cpp +++ b/src/silo/query_engine/actions/insertions.cpp @@ -22,7 +22,6 @@ #include "silo/query_engine/illegal_query_exception.h" #include "silo/query_engine/operators/query_node.h" #include "silo/storage/column/insertion_index.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::actions { diff --git a/src/silo/query_engine/actions/mutations.cpp b/src/silo/query_engine/actions/mutations.cpp index 5af8e3178..6edd555a5 100644 --- a/src/silo/query_engine/actions/mutations.cpp +++ b/src/silo/query_engine/actions/mutations.cpp @@ -26,7 +26,6 @@ #include "silo/query_engine/illegal_query_exception.h" #include "silo/query_engine/operators/query_node.h" #include "silo/storage/column/sequence_column.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::actions { diff --git a/src/silo/query_engine/exec_node/arrow_util.h b/src/silo/query_engine/exec_node/arrow_util.h index 66117451a..82672f375 100644 --- a/src/silo/query_engine/exec_node/arrow_util.h +++ b/src/silo/query_engine/exec_node/arrow_util.h @@ -28,55 +28,55 @@ template struct ArrowBuilderSelector; template <> -struct ArrowBuilderSelector { +struct ArrowBuilderSelector { using builder_type = arrow::StringBuilder; using value_type = std::string; }; template <> -struct ArrowBuilderSelector { +struct ArrowBuilderSelector { using builder_type = arrow::StringBuilder; using value_type = std::string; }; template <> -struct ArrowBuilderSelector> { +struct ArrowBuilderSelector> { using builder_type = arrow::BinaryBuilder; using value_type = std::string; }; template <> -struct ArrowBuilderSelector> { +struct ArrowBuilderSelector> { using builder_type = arrow::BinaryBuilder; using value_type = std::string; }; template <> -struct ArrowBuilderSelector { +struct ArrowBuilderSelector { using builder_type = arrow::BinaryBuilder; using value_type = std::string; }; template <> -struct ArrowBuilderSelector { +struct ArrowBuilderSelector { using builder_type = arrow::DoubleBuilder; using value_type = double; }; template <> -struct ArrowBuilderSelector { +struct ArrowBuilderSelector { using builder_type = arrow::BooleanBuilder; using value_type = bool; }; template <> -struct ArrowBuilderSelector { +struct ArrowBuilderSelector { using builder_type = arrow::Int32Builder; using value_type = int32_t; }; template <> -struct ArrowBuilderSelector { +struct ArrowBuilderSelector { using builder_type = arrow::StringBuilder; using value_type = std::string; }; diff --git a/src/silo/query_engine/exec_node/table_scan.cpp b/src/silo/query_engine/exec_node/table_scan.cpp index 06d61d585..ab55b6ee0 100644 --- a/src/silo/query_engine/exec_node/table_scan.cpp +++ b/src/silo/query_engine/exec_node/table_scan.cpp @@ -18,46 +18,45 @@ namespace { template std::vector reconstructNonNullSequences( - const storage::column::SequenceColumnPartition& sequence_column_partition, + const storage::column::SequenceColumn& sequence_column, const roaring::Roaring& non_null_row_ids ) { const size_t cardinality = non_null_row_ids.cardinality(); - const std::string partition_reference = - sequence_column_partition.local_reference_sequence_string; + const std::string reference = sequence_column.local_reference_sequence_string; std::vector reconstructed_sequences; - reconstructed_sequences.resize(cardinality, partition_reference); + reconstructed_sequences.resize(cardinality, reference); - sequence_column_partition.vertical_sequence_index.overwriteSymbolsInSequences( + sequence_column.vertical_sequence_index.overwriteSymbolsInSequences( reconstructed_sequences, non_null_row_ids ); - sequence_column_partition.horizontal_coverage_index - .template overwriteCoverageInSequence(reconstructed_sequences, non_null_row_ids); + sequence_column.horizontal_coverage_index.template overwriteCoverageInSequence( + reconstructed_sequences, non_null_row_ids + ); return reconstructed_sequences; } template // NOLINTNEXTLINE(readability-function-cognitive-complexity) arrow::Status appendSequences( - const storage::column::SequenceColumnPartition& sequence_column_partition, + const storage::column::SequenceColumn& sequence_column, const roaring::Roaring& row_ids, arrow::BinaryBuilder& output_array ) { - auto reconstructed_non_null_sequences = reconstructNonNullSequences( - sequence_column_partition, row_ids - sequence_column_partition.null_bitmap - ); + auto reconstructed_non_null_sequences = + reconstructNonNullSequences(sequence_column, row_ids - sequence_column.null_bitmap); ARROW_RETURN_NOT_OK(output_array.Reserve(row_ids.cardinality())); auto reference_sequence = - SymbolType::sequenceToString(sequence_column_partition.metadata->reference_sequence); + SymbolType::sequenceToString(sequence_column.metadata->reference_sequence); auto dictionary = std::make_shared(reference_sequence, 3); silo::ZstdCompressor compressor{dictionary}; auto reconstructed_sequence_iterator = reconstructed_non_null_sequences.begin(); for (auto row_id : row_ids) { - if (sequence_column_partition.isNull(row_id)) { + if (sequence_column.isNull(row_id)) { ARROW_RETURN_NOT_OK(output_array.AppendNull()); } else { auto& reconstructed_sequence = *reconstructed_sequence_iterator; @@ -76,71 +75,62 @@ class ColumnEntryAppender { arrow::Status operator()( ExecBatchBuilder& table_scan_node, const std::string& column_name, - const storage::TablePartition& table_partition, + const storage::Table& table, const roaring::Roaring& row_ids ); }; template <> -arrow::Status ColumnEntryAppender::operator()>( +arrow::Status ColumnEntryAppender::operator()>( ExecBatchBuilder& table_scan_node, const std::string& column_name, - const storage::TablePartition& table_partition, + const storage::Table& table, const roaring::Roaring& row_ids ) { EVOBENCH_SCOPE( - "ColumnEntryAppender", - columnTypeToString(storage::column::SequenceColumnPartition::TYPE) + "ColumnEntryAppender", columnTypeToString(storage::column::SequenceColumn::TYPE) ); auto* array = - table_scan_node - .getColumnTypeArrayBuilders>() - .at(column_name); - return appendSequences( - table_partition.columns.nuc_columns.at(column_name), row_ids, *array - ); + table_scan_node.getColumnTypeArrayBuilders>().at( + column_name + ); + return appendSequences(table.columns.nuc_columns.at(column_name), row_ids, *array); } template <> -arrow::Status ColumnEntryAppender::operator()>( +arrow::Status ColumnEntryAppender::operator()>( ExecBatchBuilder& table_scan_node, const std::string& column_name, - const storage::TablePartition& table_partition, + const storage::Table& table, const roaring::Roaring& row_ids ) { EVOBENCH_SCOPE( - "ColumnEntryAppender", - columnTypeToString(storage::column::SequenceColumnPartition::TYPE) + "ColumnEntryAppender", columnTypeToString(storage::column::SequenceColumn::TYPE) ); auto* array = - table_scan_node - .getColumnTypeArrayBuilders>() - .at(column_name); - return appendSequences( - table_partition.columns.aa_columns.at(column_name), row_ids, *array - ); + table_scan_node.getColumnTypeArrayBuilders>().at( + column_name + ); + return appendSequences(table.columns.aa_columns.at(column_name), row_ids, *array); } template <> // NOLINTNEXTLINE(readability-function-cognitive-complexity) -arrow::Status ColumnEntryAppender::operator()( +arrow::Status ColumnEntryAppender::operator()( ExecBatchBuilder& table_scan_node, const std::string& column_name, - const storage::TablePartition& table_partition, + const storage::Table& table, const roaring::Roaring& row_ids ) { EVOBENCH_SCOPE( - "ColumnEntryAppender", - columnTypeToString(storage::column::ZstdCompressedStringColumnPartition::TYPE) + "ColumnEntryAppender", columnTypeToString(storage::column::ZstdCompressedStringColumn::TYPE) ); auto* array = - table_scan_node - .getColumnTypeArrayBuilders() - .at(column_name); - const auto& column = - table_partition.columns.getColumns().at( + table_scan_node.getColumnTypeArrayBuilders().at( column_name ); + const auto& column = + table.columns.getColumns().at(column_name); for (auto row_id : row_ids) { auto value = column.getCompressed(row_id); if (value.has_value()) { @@ -157,24 +147,23 @@ template arrow::Status ColumnEntryAppender::operator()( ExecBatchBuilder& table_scan_node, const std::string& column_name, - const storage::TablePartition& table_partition, + const storage::Table& table, const roaring::Roaring& row_ids ) { EVOBENCH_SCOPE("ColumnEntryAppender", columnTypeToString(Column::TYPE)); + auto& column = table.columns.getColumns().at(column_name); auto array = table_scan_node.getColumnTypeArrayBuilders().at(column_name); for (auto row_id : row_ids) { - auto& column = table_partition.columns.getColumns().at(column_name); if (column.isNull(row_id)) { ARROW_RETURN_NOT_OK(array->AppendNull()); } else { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { auto value = column.getValueString(row_id); ARROW_RETURN_NOT_OK(array->Append(value)); - } else if constexpr (std:: - is_same_v) { + } else if constexpr (std::is_same_v) { auto value = column.getValueString(row_id); ARROW_RETURN_NOT_OK(array->Append(value)); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { auto value = common::date32ToString(column.getValue(row_id)); ARROW_RETURN_NOT_OK(array->Append(value)); } else { @@ -198,13 +187,13 @@ ExecBatchBuilder::ExecBatchBuilder(std::vector o } arrow::Status ExecBatchBuilder::appendEntries( - const storage::TablePartition& table_partition, + const storage::Table& table, const roaring::Roaring& row_ids ) { EVOBENCH_SCOPE("ExecBatchBuilder", "appendEntries"); for (const auto& field : output_fields) { ARROW_RETURN_NOT_OK(storage::column::visit( - field.type, ColumnEntryAppender{}, *this, field.name, table_partition, row_ids + field.type, ColumnEntryAppender{}, *this, field.name, table, row_ids )); } return arrow::Status::OK(); @@ -231,21 +220,12 @@ arrow::Result> TableScanGenerator::produceNextBa while (current_bitmap_reader.has_value()) { auto row_ids = current_bitmap_reader.value().nextBatch(); if (row_ids.has_value()) { - ARROW_RETURN_NOT_OK(exec_batch_builder.appendEntries( - *table->getPartition(current_partition_idx), row_ids.value() - )); + ARROW_RETURN_NOT_OK(exec_batch_builder.appendEntries(*table, row_ids.value())); ARROW_ASSIGN_OR_RAISE(auto batch, exec_batch_builder.finishBatch()); SPDLOG_DEBUG("Finished arrow::ExecBatch with length: {}", batch.length); return batch; } - current_partition_idx++; - if (current_partition_idx < partition_filters.size()) { - current_bitmap_reader = BatchedBitmapReader{ - partition_filters.at(current_partition_idx).getConstReference(), batch_size_cutoff - }; - } else { - current_bitmap_reader = std::nullopt; - } + current_bitmap_reader = std::nullopt; } return std::nullopt; } @@ -253,12 +233,12 @@ arrow::Result> TableScanGenerator::produceNextBa arrow::Result makeTableScan( arrow::acero::ExecPlan* plan, const std::vector& columns, - std::vector partition_filters_, + CopyOnWriteBitmap bitmap_filter_, std::shared_ptr table, size_t batch_size_cutoff ) { const exec_node::TableScanGenerator generator( - columns, std::move(partition_filters_), std::move(table), batch_size_cutoff + columns, std::move(bitmap_filter_), std::move(table), batch_size_cutoff ); const arrow::acero::SourceNodeOptions source_node_options{ exec_node::columnsToArrowSchema(columns), generator, arrow::Ordering::Implicit() diff --git a/src/silo/query_engine/exec_node/table_scan.h b/src/silo/query_engine/exec_node/table_scan.h index e021bf2d5..15de73614 100644 --- a/src/silo/query_engine/exec_node/table_scan.h +++ b/src/silo/query_engine/exec_node/table_scan.h @@ -34,10 +34,7 @@ class ExecBatchBuilder { return result; } - arrow::Status appendEntries( - const storage::TablePartition& table_partition, - const roaring::Roaring& row_ids - ); + arrow::Status appendEntries(const storage::Table& table, const roaring::Roaring& row_ids); arrow::Result finishBatch(); }; @@ -45,30 +42,24 @@ class ExecBatchBuilder { class TableScanGenerator { ExecBatchBuilder exec_batch_builder; - std::vector partition_filters; + CopyOnWriteBitmap bitmap_filter; std::optional current_bitmap_reader; - size_t current_partition_idx; const std::shared_ptr table; - size_t batch_size_cutoff; public: TableScanGenerator( const std::vector& columns, - std::vector partition_filters_, + CopyOnWriteBitmap bitmap_filter_, std::shared_ptr table, size_t batch_size_cutoff ) : exec_batch_builder(columns), - partition_filters(std::move(partition_filters_)), - table(std::move(table)), - batch_size_cutoff(batch_size_cutoff) { - current_partition_idx = 0; - if (!partition_filters.empty()) { - current_bitmap_reader = - BatchedBitmapReader{partition_filters.front().getConstReference(), batch_size_cutoff}; - } + bitmap_filter(std::move(bitmap_filter_)), + table(std::move(table)) { + current_bitmap_reader = + BatchedBitmapReader{bitmap_filter.getConstReference(), batch_size_cutoff}; } arrow::Future> operator()() { @@ -94,9 +85,9 @@ class TableScanGenerator { arrow::Result makeTableScan( arrow::acero::ExecPlan* plan, const std::vector& columns, - std::vector partition_filters_, + CopyOnWriteBitmap bitmap_filter_, std::shared_ptr table, size_t batch_size_cutoff ); -} // namespace silo::query_engine::exec_node \ No newline at end of file +} // namespace silo::query_engine::exec_node diff --git a/src/silo/query_engine/filter/expressions/and.cpp b/src/silo/query_engine/filter/expressions/and.cpp index 7e49def9d..e13abfa92 100644 --- a/src/silo/query_engine/filter/expressions/and.cpp +++ b/src/silo/query_engine/filter/expressions/and.cpp @@ -20,7 +20,6 @@ #include "silo/query_engine/filter/operators/selection.h" #include "silo/query_engine/filter/operators/union.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -79,16 +78,13 @@ void logCompiledChildren( } // namespace std::tuple And::compileChildren( - const storage::Table& table, - const storage::TablePartition& table_partition + const storage::Table& table ) const { OperatorVector unprocessed_child_operators; std::ranges::transform( children, std::back_inserter(unprocessed_child_operators), - [&](const std::unique_ptr& expression) { - return expression->compile(table, table_partition); - } + [&](const std::unique_ptr& expression) { return expression->compile(table); } ); OperatorVector non_negated_child_operators; OperatorVector negated_child_operators; @@ -103,7 +99,7 @@ std::tuple And::comp if (child->type() == operators::EMPTY) { SPDLOG_TRACE("Shortcutting because found empty child"); OperatorVector empty; - empty.emplace_back(std::make_unique(table_partition.sequence_count)); + empty.emplace_back(std::make_unique(table.sequence_count)); return {std::move(empty), OperatorVector(), operators::PredicateVector{}}; } if (child->type() == operators::INTERSECTION) { @@ -139,36 +135,27 @@ std::tuple And::comp }; } -std::unique_ptr And::rewrite( - const storage::Table& table, - const storage::TablePartition& table_partition, - AmbiguityMode mode -) const { +std::unique_ptr And::rewrite(const storage::Table& table, AmbiguityMode mode) const { ExpressionVector rewritten_children; rewritten_children.reserve(children.size()); for (const auto& child : children) { - rewritten_children.emplace_back(child->rewrite(table, table_partition, mode)); + rewritten_children.emplace_back(child->rewrite(table, mode)); } return std::make_unique(std::move(rewritten_children)); } -std::unique_ptr And::compile( - const storage::Table& table, - const storage::TablePartition& table_partition -) const { - auto [non_negated_child_operators, negated_child_operators, predicates] = - compileChildren(table, table_partition); +std::unique_ptr And::compile(const storage::Table& table) const { + auto [non_negated_child_operators, negated_child_operators, predicates] = compileChildren(table); if (non_negated_child_operators.empty() && negated_child_operators.empty()) { if (predicates.empty()) { SPDLOG_TRACE( "Compiled And filter expression to Full, since no predicates and no child operators" ); - return std::make_unique(table_partition.sequence_count); + return std::make_unique(table.sequence_count); } - auto result = std::make_unique( - std::move(predicates), table_partition.sequence_count - ); + auto result = + std::make_unique(std::move(predicates), table.sequence_count); SPDLOG_TRACE( "Compiled And filter expression to {} - found only predicates", result->toString() ); @@ -181,20 +168,19 @@ std::unique_ptr And::compile( index_arithmetic_operator = std::move(non_negated_child_operators[0]); } else if (negated_child_operators.size() == 1 && non_negated_child_operators.empty()) { index_arithmetic_operator = std::make_unique( - std::move(negated_child_operators[0]), table_partition.sequence_count + std::move(negated_child_operators[0]), table.sequence_count ); } else if (non_negated_child_operators.empty()) { std::unique_ptr union_ret = std::make_unique( - std::move(negated_child_operators), table_partition.sequence_count - ); - index_arithmetic_operator = std::make_unique( - std::move(union_ret), table_partition.sequence_count + std::move(negated_child_operators), table.sequence_count ); + index_arithmetic_operator = + std::make_unique(std::move(union_ret), table.sequence_count); } else { index_arithmetic_operator = std::make_unique( std::move(non_negated_child_operators), std::move(negated_child_operators), - table_partition.sequence_count + table.sequence_count ); } if (predicates.empty()) { @@ -206,7 +192,7 @@ std::unique_ptr And::compile( return index_arithmetic_operator; } auto result = std::make_unique( - std::move(index_arithmetic_operator), std::move(predicates), table_partition.sequence_count + std::move(index_arithmetic_operator), std::move(predicates), table.sequence_count ); SPDLOG_TRACE("Compiled And filter expression to {}", result->toString()); diff --git a/src/silo/query_engine/filter/expressions/and.h b/src/silo/query_engine/filter/expressions/and.h index 4288cead7..1470871ca 100644 --- a/src/silo/query_engine/filter/expressions/and.h +++ b/src/silo/query_engine/filter/expressions/and.h @@ -9,7 +9,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/filter/operators/selection.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -19,8 +18,7 @@ class And : public Expression { [[nodiscard]] std:: tuple - compileChildren(const storage::Table& table, const storage::TablePartition& table_partition) - const; + compileChildren(const storage::Table& table) const; public: explicit And(ExpressionVector&& children); @@ -29,13 +27,10 @@ class And : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/bool_equals.cpp b/src/silo/query_engine/filter/expressions/bool_equals.cpp index 366699900..1c29ebc00 100644 --- a/src/silo/query_engine/filter/expressions/bool_equals.cpp +++ b/src/silo/query_engine/filter/expressions/bool_equals.cpp @@ -9,7 +9,6 @@ #include "silo/query_engine/filter/operators/index_scan.h" #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -26,36 +25,32 @@ std::string BoolEquals::toString() const { std::unique_ptr BoolEquals::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, Expression::AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, value); } -std::unique_ptr BoolEquals::compile( - const storage::Table& /*table*/, - const silo::storage::TablePartition& table_partition -) const { +std::unique_ptr BoolEquals::compile(const storage::Table& table) const { CHECK_SILO_QUERY( - table_partition.columns.bool_columns.contains(column_name), + table.columns.bool_columns.contains(column_name), "The database does not contain the column '{}'", column_name ); - const auto& bool_column = table_partition.columns.bool_columns.at(column_name); + const auto& bool_column = table.columns.bool_columns.at(column_name); if (value == std::nullopt) { return std::make_unique( - CopyOnWriteBitmap{&bool_column.null_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&bool_column.null_bitmap}, table.sequence_count ); } if (value.value()) { return std::make_unique( - CopyOnWriteBitmap{&bool_column.true_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&bool_column.true_bitmap}, table.sequence_count ); } return std::make_unique( - CopyOnWriteBitmap{&bool_column.false_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&bool_column.false_bitmap}, table.sequence_count ); SILO_UNREACHABLE(); diff --git a/src/silo/query_engine/filter/expressions/bool_equals.h b/src/silo/query_engine/filter/expressions/bool_equals.h index 45b108ad5..0e253f85f 100644 --- a/src/silo/query_engine/filter/expressions/bool_equals.h +++ b/src/silo/query_engine/filter/expressions/bool_equals.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -23,13 +22,10 @@ struct BoolEquals : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/date_between.cpp b/src/silo/query_engine/filter/expressions/date_between.cpp index 0289f70e3..73979bba4 100644 --- a/src/silo/query_engine/filter/expressions/date_between.cpp +++ b/src/silo/query_engine/filter/expressions/date_between.cpp @@ -13,9 +13,8 @@ #include "silo/query_engine/filter/operators/selection.h" #include "silo/query_engine/illegal_query_exception.h" #include "silo/storage/column/date32_column.h" -#include "silo/storage/table_partition.h" -using silo::storage::column::Date32ColumnPartition; +using silo::storage::column::Date32Column; namespace silo::query_engine::filter::expressions { @@ -40,58 +39,47 @@ std::string DateBetween::toString() const { std::unique_ptr DateBetween::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, date_from, date_to); } -std::unique_ptr DateBetween::compile( - const storage::Table& table, - const storage::TablePartition& table_partition -) const { +std::unique_ptr DateBetween::compile(const storage::Table& table) const { CHECK_SILO_QUERY( table.schema->getColumn(column_name).has_value(), "The database does not contain the column '{}'", column_name ); CHECK_SILO_QUERY( - table_partition.columns.date32_columns.contains(column_name), + table.columns.date32_columns.contains(column_name), "The column '{}' is not of type date", column_name ); - const auto& date_column = table_partition.columns.date32_columns.at(column_name); + const auto& date_column = table.columns.date32_columns.at(column_name); if (date_column.isSorted()) { return std::make_unique( - computeRangesOfSortedColumn(date_column, {table_partition.sequence_count}), - table_partition.sequence_count + computeRangesOfSortedColumn(date_column, {table.sequence_count}), table.sequence_count ); } operators::PredicateVector predicates; - predicates.emplace_back( - std::make_unique>( - date_column, - operators::Comparator::HIGHER_OR_EQUALS, - date_from.value_or(std::numeric_limits::min()) - ) - ); - predicates.emplace_back( - std::make_unique>( - date_column, - operators::Comparator::LESS_OR_EQUALS, - date_to.value_or(std::numeric_limits::max()) - ) - ); - return std::make_unique( - std::move(predicates), table_partition.sequence_count - ); + predicates.emplace_back(std::make_unique>( + date_column, + operators::Comparator::HIGHER_OR_EQUALS, + date_from.value_or(std::numeric_limits::min()) + )); + predicates.emplace_back(std::make_unique>( + date_column, + operators::Comparator::LESS_OR_EQUALS, + date_to.value_or(std::numeric_limits::max()) + )); + return std::make_unique(std::move(predicates), table.sequence_count); } std::vector DateBetween:: computeRangesOfSortedColumn( - const silo::storage::column::Date32ColumnPartition& date_column, + const silo::storage::column::Date32Column& date_column, const std::vector& chunk_sizes ) const { std::vector ranges; diff --git a/src/silo/query_engine/filter/expressions/date_between.h b/src/silo/query_engine/filter/expressions/date_between.h index 7fc6d5858..255256138 100644 --- a/src/silo/query_engine/filter/expressions/date_between.h +++ b/src/silo/query_engine/filter/expressions/date_between.h @@ -12,7 +12,6 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/filter/operators/range_selection.h" #include "silo/storage/column/date32_column.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -24,7 +23,7 @@ class DateBetween : public Expression { [[nodiscard]] std::vector computeRangesOfSortedColumn( - const silo::storage::column::Date32ColumnPartition& date_column, + const silo::storage::column::Date32Column& date_column, const std::vector& chunk_sizes ) const; @@ -39,13 +38,10 @@ class DateBetween : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/date_equals.cpp b/src/silo/query_engine/filter/expressions/date_equals.cpp index d0ffaf2c5..6513da703 100644 --- a/src/silo/query_engine/filter/expressions/date_equals.cpp +++ b/src/silo/query_engine/filter/expressions/date_equals.cpp @@ -12,9 +12,8 @@ #include "silo/query_engine/filter/operators/selection.h" #include "silo/query_engine/illegal_query_exception.h" #include "silo/storage/column/date32_column.h" -#include "silo/storage/table_partition.h" -using silo::storage::column::Date32ColumnPartition; +using silo::storage::column::Date32Column; namespace silo::query_engine::filter::expressions { @@ -32,39 +31,35 @@ std::string DateEquals::toString() const { std::unique_ptr DateEquals::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, value); } -std::unique_ptr DateEquals::compile( - const storage::Table& table, - const storage::TablePartition& table_partition -) const { +std::unique_ptr DateEquals::compile(const storage::Table& table) const { CHECK_SILO_QUERY( table.schema->getColumn(column_name).has_value(), "The database does not contain the column '{}'", column_name ); CHECK_SILO_QUERY( - table_partition.columns.date32_columns.contains(column_name), + table.columns.date32_columns.contains(column_name), "The column '{}' is not of type date", column_name ); - const auto& date_column = table_partition.columns.date32_columns.at(column_name); + const auto& date_column = table.columns.date32_columns.at(column_name); if (value.has_value()) { return std::make_unique( - std::make_unique>( + std::make_unique>( date_column, operators::Comparator::EQUALS, value.value() ), - table_partition.sequence_count + table.sequence_count ); } return std::make_unique( - CopyOnWriteBitmap{&date_column.null_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&date_column.null_bitmap}, table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/date_equals.h b/src/silo/query_engine/filter/expressions/date_equals.h index 7799dc6e7..11b3cf992 100644 --- a/src/silo/query_engine/filter/expressions/date_equals.h +++ b/src/silo/query_engine/filter/expressions/date_equals.h @@ -9,7 +9,6 @@ #include "silo/common/date32.h" #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -25,13 +24,10 @@ class DateEquals : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/exact.cpp b/src/silo/query_engine/filter/expressions/exact.cpp index 82a91f91c..87ad5b9d7 100644 --- a/src/silo/query_engine/filter/expressions/exact.cpp +++ b/src/silo/query_engine/filter/expressions/exact.cpp @@ -11,7 +11,6 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/illegal_query_exception.h" #include "silo/query_engine/query_compilation_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -24,15 +23,12 @@ std::string Exact::toString() const { std::unique_ptr Exact::rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode /*mode*/ ) const { - return child->rewrite(table, table_partition, AmbiguityMode::LOWER_BOUND); + return child->rewrite(table, AmbiguityMode::LOWER_BOUND); } -std::unique_ptr Exact::compile( - const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/ +std::unique_ptr Exact::compile(const storage::Table& /*table*/ ) const { throw QueryCompilationException{"Exact expression must be elimitated in query rewrite phase"}; } diff --git a/src/silo/query_engine/filter/expressions/exact.h b/src/silo/query_engine/filter/expressions/exact.h index d9dca07a4..804a09243 100644 --- a/src/silo/query_engine/filter/expressions/exact.h +++ b/src/silo/query_engine/filter/expressions/exact.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -21,13 +20,10 @@ class Exact : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/expression.h b/src/silo/query_engine/filter/expressions/expression.h index a21f11a4b..c84d72ce8 100644 --- a/src/silo/query_engine/filter/expressions/expression.h +++ b/src/silo/query_engine/filter/expressions/expression.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/storage/table.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -28,13 +27,10 @@ class Expression { [[nodiscard]] virtual std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const = 0; - [[nodiscard]] virtual std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] virtual std::unique_ptr compile(const storage::Table& table ) const = 0; }; diff --git a/src/silo/query_engine/filter/expressions/false.cpp b/src/silo/query_engine/filter/expressions/false.cpp index 47eb2e0e7..441a2d116 100644 --- a/src/silo/query_engine/filter/expressions/false.cpp +++ b/src/silo/query_engine/filter/expressions/false.cpp @@ -5,7 +5,6 @@ #include "silo/query_engine/filter/operators/empty.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -17,17 +16,13 @@ std::string False::toString() const { std::unique_ptr False::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(); } -std::unique_ptr False::compile( - const storage::Table& /*table*/, - const silo::storage::TablePartition& table_partition -) const { - return std::make_unique(table_partition.sequence_count); +std::unique_ptr False::compile(const storage::Table& table) const { + return std::make_unique(table.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter/expressions/false.h b/src/silo/query_engine/filter/expressions/false.h index 4a55f6b04..1dcac288c 100644 --- a/src/silo/query_engine/filter/expressions/false.h +++ b/src/silo/query_engine/filter/expressions/false.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -19,13 +18,10 @@ class False : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/float_between.cpp b/src/silo/query_engine/filter/expressions/float_between.cpp index 9bf3a69da..8af82d6cb 100644 --- a/src/silo/query_engine/filter/expressions/float_between.cpp +++ b/src/silo/query_engine/filter/expressions/float_between.cpp @@ -11,9 +11,8 @@ #include "silo/query_engine/filter/operators/index_scan.h" #include "silo/query_engine/filter/operators/selection.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" -using silo::storage::column::FloatColumnPartition; +using silo::storage::column::FloatColumn; namespace silo::query_engine::filter::expressions { @@ -37,52 +36,42 @@ std::string FloatBetween::toString() const { std::unique_ptr FloatBetween::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, from, to); } -std::unique_ptr FloatBetween::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { +std::unique_ptr FloatBetween::compile(const storage::Table& table) const { CHECK_SILO_QUERY( - table_partition.columns.float_columns.contains(column_name), + table.columns.float_columns.contains(column_name), "The database does not contain the float column '{}'", column_name ); - const auto& float_column = table_partition.columns.float_columns.at(column_name); + const auto& float_column = table.columns.float_columns.at(column_name); operators::PredicateVector predicates; if (from.has_value()) { - predicates.emplace_back( - std::make_unique>( - float_column, operators::Comparator::HIGHER_OR_EQUALS, from.value() - ) - ); + predicates.emplace_back(std::make_unique>( + float_column, operators::Comparator::HIGHER_OR_EQUALS, from.value() + )); } if (to.has_value()) { - predicates.emplace_back( - std::make_unique>( - float_column, operators::Comparator::LESS, to.value() - ) - ); + predicates.emplace_back(std::make_unique>( + float_column, operators::Comparator::LESS, to.value() + )); } if (predicates.empty()) { return std::make_unique( std::make_unique( - CopyOnWriteBitmap{&float_column.null_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&float_column.null_bitmap}, table.sequence_count ), - table_partition.sequence_count + table.sequence_count ); } - return std::make_unique( - std::move(predicates), table_partition.sequence_count - ); + return std::make_unique(std::move(predicates), table.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter/expressions/float_between.h b/src/silo/query_engine/filter/expressions/float_between.h index e1c1189c1..462aa7d50 100644 --- a/src/silo/query_engine/filter/expressions/float_between.h +++ b/src/silo/query_engine/filter/expressions/float_between.h @@ -8,7 +8,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -29,13 +28,10 @@ class FloatBetween : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/float_equals.cpp b/src/silo/query_engine/filter/expressions/float_equals.cpp index a81b41bb9..5cfdd1e44 100644 --- a/src/silo/query_engine/filter/expressions/float_equals.cpp +++ b/src/silo/query_engine/filter/expressions/float_equals.cpp @@ -12,9 +12,8 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/filter/operators/selection.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" -using silo::storage::column::FloatColumnPartition; +using silo::storage::column::FloatColumn; namespace silo::query_engine::filter::expressions { @@ -31,34 +30,30 @@ std::string FloatEquals::toString() const { std::unique_ptr FloatEquals::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, value); } -std::unique_ptr FloatEquals::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { +std::unique_ptr FloatEquals::compile(const storage::Table& table) const { CHECK_SILO_QUERY( - table_partition.columns.float_columns.contains(column_name), + table.columns.float_columns.contains(column_name), "The database does not contain the column '{}'", column_name ); - const auto& float_column = table_partition.columns.float_columns.at(column_name); + const auto& float_column = table.columns.float_columns.at(column_name); if (value.has_value()) { return std::make_unique( - std::make_unique>( + std::make_unique>( float_column, operators::Comparator::EQUALS, value.value() ), - table_partition.sequence_count + table.sequence_count ); } return std::make_unique( - CopyOnWriteBitmap{&float_column.null_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&float_column.null_bitmap}, table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/float_equals.h b/src/silo/query_engine/filter/expressions/float_equals.h index abd76bf46..6deb1ba45 100644 --- a/src/silo/query_engine/filter/expressions/float_equals.h +++ b/src/silo/query_engine/filter/expressions/float_equals.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -23,13 +22,10 @@ class FloatEquals : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/has_mutation.cpp b/src/silo/query_engine/filter/expressions/has_mutation.cpp index 5f24718b1..3376afa43 100644 --- a/src/silo/query_engine/filter/expressions/has_mutation.cpp +++ b/src/silo/query_engine/filter/expressions/has_mutation.cpp @@ -14,7 +14,6 @@ #include "silo/query_engine/query_compilation_exception.h" #include "silo/query_engine/query_parse_sequence_name.h" #include "silo/storage/table.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -35,7 +34,6 @@ std::string HasMutation::toString() const { template std::unique_ptr HasMutation::rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const { CHECK_SILO_QUERY( @@ -49,8 +47,8 @@ std::unique_ptr HasMutation::rewrite( const auto valid_sequence_name = validateSequenceNameOrGetDefault(sequence_name, *table.schema); - const auto& seq_store_partition = - table_partition.columns.getColumns().at(valid_sequence_name); + const auto& sequence_column = + table.columns.getColumns().at(valid_sequence_name); auto column_metadata = table.schema->getColumnMetadata(valid_sequence_name).value(); @@ -59,10 +57,10 @@ std::unique_ptr HasMutation::rewrite( "Has{}Mutation position is out of bounds {} > {}", SymbolType::SYMBOL_NAME, position_idx + 1, - seq_store_partition.metadata->reference_sequence.size() + sequence_column.metadata->reference_sequence.size() ) - auto ref_symbol = seq_store_partition.metadata->reference_sequence.at(position_idx); + auto ref_symbol = sequence_column.metadata->reference_sequence.at(position_idx); std::vector symbols = std::vector(SymbolType::SYMBOLS.begin(), SymbolType::SYMBOLS.end()); @@ -82,8 +80,7 @@ std::unique_ptr HasMutation::rewrite( template std::unique_ptr HasMutation::compile( - const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/ + const storage::Table& /*table*/ ) const { throw QueryCompilationException{ "Has{}Mutation expression must be eliminated in query rewrite phase", SymbolType::SYMBOL_NAME diff --git a/src/silo/query_engine/filter/expressions/has_mutation.h b/src/silo/query_engine/filter/expressions/has_mutation.h index 4907b4b7f..f0dade5fc 100644 --- a/src/silo/query_engine/filter/expressions/has_mutation.h +++ b/src/silo/query_engine/filter/expressions/has_mutation.h @@ -9,7 +9,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -26,13 +25,10 @@ class HasMutation : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/insertion_contains.cpp b/src/silo/query_engine/filter/expressions/insertion_contains.cpp index d6508ec8f..ef7948877 100644 --- a/src/silo/query_engine/filter/expressions/insertion_contains.cpp +++ b/src/silo/query_engine/filter/expressions/insertion_contains.cpp @@ -17,7 +17,6 @@ #include "silo/storage/column/insertion_index.h" #include "silo/storage/column/sequence_column.h" #include "silo/storage/insertion_format_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -44,7 +43,6 @@ std::string InsertionContains::toString() const { template std::unique_ptr InsertionContains::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique>(sequence_name, position_idx, value); @@ -52,16 +50,15 @@ std::unique_ptr InsertionContains::rewrite( template std::unique_ptr InsertionContains::compile( - const storage::Table& table, - const storage::TablePartition& table_partition + const storage::Table& table ) const { const auto valid_sequence_name = validateSequenceNameOrGetDefault(sequence_name, *table.schema); - const std::map>& - sequence_stores = table_partition.columns.getColumns(); + const std::map>& sequence_stores = + table.columns.getColumns(); - const storage::column::SequenceColumnPartition& sequence_store = + const storage::column::SequenceColumn& sequence_store = sequence_stores.at(valid_sequence_name); const size_t reference_sequence_size = sequence_store.metadata->reference_sequence.size(); CHECK_SILO_QUERY( @@ -89,7 +86,7 @@ std::unique_ptr InsertionContains::compile( ); } }, - table_partition.sequence_count + table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/insertion_contains.h b/src/silo/query_engine/filter/expressions/insertion_contains.h index 1102ffda4..8e2f02cf1 100644 --- a/src/silo/query_engine/filter/expressions/insertion_contains.h +++ b/src/silo/query_engine/filter/expressions/insertion_contains.h @@ -9,7 +9,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -31,13 +30,10 @@ class InsertionContains : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/int_between.cpp b/src/silo/query_engine/filter/expressions/int_between.cpp index 32f7b2f03..0a71d82bd 100644 --- a/src/silo/query_engine/filter/expressions/int_between.cpp +++ b/src/silo/query_engine/filter/expressions/int_between.cpp @@ -13,9 +13,8 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/filter/operators/selection.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" -using silo::storage::column::IntColumnPartition; +using silo::storage::column::IntColumn; namespace silo::query_engine::filter::expressions { @@ -39,51 +38,43 @@ std::string IntBetween::toString() const { std::unique_ptr IntBetween::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, from, to); } -std::unique_ptr IntBetween::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { +std::unique_ptr IntBetween::compile(const storage::Table& table) const { CHECK_SILO_QUERY( - table_partition.columns.int_columns.contains(column_name), + table.columns.int_columns.contains(column_name), "The database does not contain the column '{}'", column_name ); - const auto& int_column = table_partition.columns.int_columns.at(column_name); + const auto& int_column = table.columns.int_columns.at(column_name); operators::PredicateVector predicates; if (from.has_value()) { - predicates.emplace_back( - std::make_unique>( - int_column, operators::Comparator::HIGHER_OR_EQUALS, from.value() - ) - ); + predicates.emplace_back(std::make_unique>( + int_column, operators::Comparator::HIGHER_OR_EQUALS, from.value() + )); } if (to.has_value()) { - predicates.emplace_back( - std::make_unique>( - int_column, operators::Comparator::LESS_OR_EQUALS, to.value() - ) - ); + predicates.emplace_back(std::make_unique>( + int_column, operators::Comparator::LESS_OR_EQUALS, to.value() + )); } if (predicates.empty()) { return std::make_unique( std::make_unique( - CopyOnWriteBitmap{&int_column.null_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&int_column.null_bitmap}, table.sequence_count ), - table_partition.sequence_count + table.sequence_count ); } auto result = - std::make_unique(std::move(predicates), table_partition.sequence_count); + std::make_unique(std::move(predicates), table.sequence_count); SPDLOG_TRACE("Compiled IntBetween filter expression to {}", result->toString()); diff --git a/src/silo/query_engine/filter/expressions/int_between.h b/src/silo/query_engine/filter/expressions/int_between.h index c2eef6e7e..1051b812c 100644 --- a/src/silo/query_engine/filter/expressions/int_between.h +++ b/src/silo/query_engine/filter/expressions/int_between.h @@ -9,7 +9,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -30,13 +29,10 @@ class IntBetween : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/int_equals.cpp b/src/silo/query_engine/filter/expressions/int_equals.cpp index 866e438a1..4badb538d 100644 --- a/src/silo/query_engine/filter/expressions/int_equals.cpp +++ b/src/silo/query_engine/filter/expressions/int_equals.cpp @@ -10,9 +10,8 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/filter/operators/selection.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" -using silo::storage::column::IntColumnPartition; +using silo::storage::column::IntColumn; namespace silo::query_engine::filter::expressions { @@ -29,34 +28,30 @@ std::string IntEquals::toString() const { std::unique_ptr IntEquals::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, value); } -std::unique_ptr IntEquals::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { +std::unique_ptr IntEquals::compile(const storage::Table& table) const { CHECK_SILO_QUERY( - table_partition.columns.int_columns.contains(column_name), + table.columns.int_columns.contains(column_name), "The database does not contain the column '{}'", column_name ); - const auto& int_column = table_partition.columns.int_columns.at(column_name); + const auto& int_column = table.columns.int_columns.at(column_name); if (value.has_value()) { return std::make_unique( - std::make_unique>( + std::make_unique>( int_column, operators::Comparator::EQUALS, value.value() ), - table_partition.sequence_count + table.sequence_count ); } return std::make_unique( - CopyOnWriteBitmap{&int_column.null_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&int_column.null_bitmap}, table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/int_equals.h b/src/silo/query_engine/filter/expressions/int_equals.h index 0860893eb..319daa6ff 100644 --- a/src/silo/query_engine/filter/expressions/int_equals.h +++ b/src/silo/query_engine/filter/expressions/int_equals.h @@ -8,7 +8,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -24,13 +23,10 @@ class IntEquals : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/is_null.cpp b/src/silo/query_engine/filter/expressions/is_null.cpp index d34995f71..993d913bf 100644 --- a/src/silo/query_engine/filter/expressions/is_null.cpp +++ b/src/silo/query_engine/filter/expressions/is_null.cpp @@ -11,7 +11,6 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/illegal_query_exception.h" #include "silo/storage/column/column_type_visitor.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -24,16 +23,12 @@ std::string IsNull::toString() const { std::unique_ptr IsNull::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, Expression::AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name); } -std::unique_ptr IsNull::compile( - const storage::Table& table, - const silo::storage::TablePartition& table_partition -) const { +std::unique_ptr IsNull::compile(const storage::Table& table) const { const auto& maybe_target_column = table.schema->getColumn(column_name); CHECK_SILO_QUERY( maybe_target_column.has_value(), @@ -43,9 +38,9 @@ std::unique_ptr IsNull::compile( auto target_column = maybe_target_column.value(); return silo::storage::column::visit(target_column.type, [&]() { - const auto& column = table_partition.columns.getColumns().at(column_name); + const auto& column = table.columns.getColumns().at(column_name); return std::make_unique( - CopyOnWriteBitmap{&column.null_bitmap}, table_partition.sequence_count + CopyOnWriteBitmap{&column.null_bitmap}, table.sequence_count ); }); } diff --git a/src/silo/query_engine/filter/expressions/is_null.h b/src/silo/query_engine/filter/expressions/is_null.h index 9cce4252c..dbfc8180c 100644 --- a/src/silo/query_engine/filter/expressions/is_null.h +++ b/src/silo/query_engine/filter/expressions/is_null.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -22,13 +21,10 @@ class IsNull : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/lineage_filter.cpp b/src/silo/query_engine/filter/expressions/lineage_filter.cpp index 6ff54eb05..3c45db9fc 100644 --- a/src/silo/query_engine/filter/expressions/lineage_filter.cpp +++ b/src/silo/query_engine/filter/expressions/lineage_filter.cpp @@ -13,12 +13,11 @@ #include "silo/query_engine/filter/operators/index_scan.h" #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { using silo::common::RecombinantEdgeFollowingMode; -using silo::storage::column::IndexedStringColumnPartition; +using silo::storage::column::IndexedStringColumn; LineageFilter::LineageFilter( std::string column_name, @@ -40,7 +39,7 @@ std::string LineageFilter::toString() const { } std::optional LineageFilter::getBitmapForValue( - const IndexedStringColumnPartition& lineage_column + const IndexedStringColumn& lineage_column ) const { if (lineage == std::nullopt) { return lineage_column.filter(std::nullopt); @@ -67,35 +66,31 @@ std::optional LineageFilter::getBitmapForValue( std::unique_ptr LineageFilter::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, lineage, sublineage_mode); } -std::unique_ptr LineageFilter::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { +std::unique_ptr LineageFilter::compile(const storage::Table& table) const { CHECK_SILO_QUERY( - table_partition.columns.indexed_string_columns.contains(column_name), + table.columns.indexed_string_columns.contains(column_name), "The database does not contain the column '{}'", column_name ); CHECK_SILO_QUERY( - table_partition.columns.indexed_string_columns.at(column_name).getLineageIndex().has_value(), + table.columns.indexed_string_columns.at(column_name).getLineageIndex().has_value(), "The database does not contain a lineage index for the column '{}'", column_name ); - const auto& lineage_column = table_partition.columns.indexed_string_columns.at(column_name); + const auto& lineage_column = table.columns.indexed_string_columns.at(column_name); std::optional bitmap = getBitmapForValue(lineage_column); if (bitmap == std::nullopt) { - return std::make_unique(table_partition.sequence_count); + return std::make_unique(table.sequence_count); } return std::make_unique( - CopyOnWriteBitmap{bitmap.value()}, table_partition.sequence_count + CopyOnWriteBitmap{bitmap.value()}, table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/lineage_filter.h b/src/silo/query_engine/filter/expressions/lineage_filter.h index f9ae119e3..b6c2519a6 100644 --- a/src/silo/query_engine/filter/expressions/lineage_filter.h +++ b/src/silo/query_engine/filter/expressions/lineage_filter.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -27,18 +26,15 @@ class LineageFilter : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; private: [[nodiscard]] std::optional getBitmapForValue( - const silo::storage::column::IndexedStringColumnPartition& lineage_column + const silo::storage::column::IndexedStringColumn& lineage_column ) const; }; diff --git a/src/silo/query_engine/filter/expressions/maybe.cpp b/src/silo/query_engine/filter/expressions/maybe.cpp index 459645c37..27103e8fa 100644 --- a/src/silo/query_engine/filter/expressions/maybe.cpp +++ b/src/silo/query_engine/filter/expressions/maybe.cpp @@ -10,7 +10,6 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/illegal_query_exception.h" #include "silo/query_engine/query_compilation_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -23,15 +22,12 @@ std::string Maybe::toString() const { std::unique_ptr Maybe::rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode /*mode*/ ) const { - return child->rewrite(table, table_partition, AmbiguityMode::UPPER_BOUND); + return child->rewrite(table, AmbiguityMode::UPPER_BOUND); } -std::unique_ptr Maybe::compile( - const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/ +std::unique_ptr Maybe::compile(const storage::Table& /*table*/ ) const { throw QueryCompilationException{"Maybe expression must be elimitated in query rewrite phase"}; } diff --git a/src/silo/query_engine/filter/expressions/maybe.h b/src/silo/query_engine/filter/expressions/maybe.h index 1c94f8025..93f8ed3e4 100644 --- a/src/silo/query_engine/filter/expressions/maybe.h +++ b/src/silo/query_engine/filter/expressions/maybe.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -21,13 +20,10 @@ class Maybe : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/negation.cpp b/src/silo/query_engine/filter/expressions/negation.cpp index 287f7eb4b..01585a276 100644 --- a/src/silo/query_engine/filter/expressions/negation.cpp +++ b/src/silo/query_engine/filter/expressions/negation.cpp @@ -9,7 +9,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -20,19 +19,13 @@ std::string Negation::toString() const { return "!(" + child->toString() + ")"; } -std::unique_ptr Negation::rewrite( - const storage::Table& table, - const storage::TablePartition& table_partition, - AmbiguityMode mode -) const { - return std::make_unique(child->rewrite(table, table_partition, invertMode(mode))); +std::unique_ptr Negation::rewrite(const storage::Table& table, AmbiguityMode mode) + const { + return std::make_unique(child->rewrite(table, invertMode(mode))); } -std::unique_ptr Negation::compile( - const storage::Table& table, - const storage::TablePartition& table_partition -) const { - return operators::Operator::negate(child->compile(table, table_partition)); +std::unique_ptr Negation::compile(const storage::Table& table) const { + return operators::Operator::negate(child->compile(table)); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter/expressions/negation.h b/src/silo/query_engine/filter/expressions/negation.h index dd39f1393..50d2816ba 100644 --- a/src/silo/query_engine/filter/expressions/negation.h +++ b/src/silo/query_engine/filter/expressions/negation.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -24,13 +23,10 @@ class Negation : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/nof.cpp b/src/silo/query_engine/filter/expressions/nof.cpp index 6836e4961..0a0abe1c1 100644 --- a/src/silo/query_engine/filter/expressions/nof.cpp +++ b/src/silo/query_engine/filter/expressions/nof.cpp @@ -17,7 +17,6 @@ #include "silo/query_engine/filter/operators/threshold.h" #include "silo/query_engine/filter/operators/union.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace { @@ -176,13 +175,12 @@ std::string NOf::toString() const { } std::tuple NOf::mapChildExpressions( - const storage::Table& table, - const storage::TablePartition& table_partition + const storage::Table& table ) const { operators::OperatorVector child_operators; child_operators.reserve(children.size()); for (const auto& child_expression : children) { - child_operators.push_back(child_expression->compile(table, table_partition)); + child_operators.push_back(child_expression->compile(table)); } operators::OperatorVector non_negated_child_operators; @@ -211,31 +209,27 @@ std::tuple NOf::mapCh }; } -ExpressionVector NOf::rewriteChildren( - const storage::Table& table, - const storage::TablePartition& table_partition, - Expression::AmbiguityMode mode -) const { +ExpressionVector NOf::rewriteChildren(const storage::Table& table, Expression::AmbiguityMode mode) + const { ExpressionVector rewritten_children; rewritten_children.reserve(children.size()); for (const auto& child : children) { - rewritten_children.push_back(child->rewrite(table, table_partition, mode)); + rewritten_children.push_back(child->rewrite(table, mode)); } return rewritten_children; } std::unique_ptr NOf::rewriteToNonExact( const storage::Table& table, - const storage::TablePartition& table_partition, Expression::AmbiguityMode mode ) const { auto at_least_k = std::make_unique( - rewriteChildren(table, table_partition, mode), + rewriteChildren(table, mode), this->number_of_matchers, /*match_exactly=*/false ); auto at_least_k_plus_one = std::make_unique( - rewriteChildren(table, table_partition, mode), + rewriteChildren(table, mode), this->number_of_matchers + 1, /*match_exactly=*/false ); @@ -246,34 +240,25 @@ std::unique_ptr NOf::rewriteToNonExact( return std::make_unique(std::move(and_children)); } -std::unique_ptr NOf::rewrite( - const storage::Table& table, - const storage::TablePartition& table_partition, - AmbiguityMode mode -) const { +std::unique_ptr NOf::rewrite(const storage::Table& table, AmbiguityMode mode) const { // We cannot easily map ambiguity modes through an exact NOf expression -> rewrite without exact if (mode != NONE && match_exactly && std::cmp_less(number_of_matchers, children.size())) { - return rewriteToNonExact(table, table_partition, mode); + return rewriteToNonExact(table, mode); } - return std::make_unique( - rewriteChildren(table, table_partition, mode), number_of_matchers, match_exactly - ); + return std::make_unique(rewriteChildren(table, mode), number_of_matchers, match_exactly); } -std::unique_ptr NOf::compile( - const storage::Table& table, - const storage::TablePartition& table_partition -) const { +std::unique_ptr NOf::compile(const storage::Table& table) const { auto [non_negated_child_operators, negated_child_operators, updated_number_of_matchers] = - mapChildExpressions(table, table_partition); + mapChildExpressions(table); return toOperator( updated_number_of_matchers, std::move(non_negated_child_operators), std::move(negated_child_operators), match_exactly, - table_partition.sequence_count + table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/nof.h b/src/silo/query_engine/filter/expressions/nof.h index 801ed26f8..84734ff0f 100644 --- a/src/silo/query_engine/filter/expressions/nof.h +++ b/src/silo/query_engine/filter/expressions/nof.h @@ -8,7 +8,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -20,19 +19,16 @@ class NOf : public Expression { [[nodiscard]] ExpressionVector rewriteChildren( const storage::Table& table, - const storage::TablePartition& table_partition, Expression::AmbiguityMode mode ) const; [[nodiscard]] std::unique_ptr rewriteToNonExact( const storage::Table& table, - const storage::TablePartition& table_partition, Expression::AmbiguityMode mode ) const; [[nodiscard]] std::tuple - mapChildExpressions(const storage::Table& table, const storage::TablePartition& table_partition) - const; + mapChildExpressions(const storage::Table& table) const; public: explicit NOf(ExpressionVector&& children, int number_of_matchers, bool match_exactly); @@ -41,13 +37,10 @@ class NOf : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/or.cpp b/src/silo/query_engine/filter/expressions/or.cpp index c996fb7ba..e79cbfb0e 100644 --- a/src/silo/query_engine/filter/expressions/or.cpp +++ b/src/silo/query_engine/filter/expressions/or.cpp @@ -18,7 +18,6 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/filter/operators/union.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -164,17 +163,14 @@ ExpressionVector Or::mergeStringInSetExpressions(ExpressionVector children) { return new_children; } -std::unique_ptr Or::rewrite( - const storage::Table& table, - const storage::TablePartition& table_partition, - Expression::AmbiguityMode mode -) const { +std::unique_ptr Or::rewrite(const storage::Table& table, Expression::AmbiguityMode mode) + const { std::vector collected_children = collectChildren(children); ExpressionVector rewritten_children; std::ranges::transform( collected_children, std::back_inserter(rewritten_children), - [&](const Expression* child) { return child->rewrite(table, table_partition, mode); } + [&](const Expression* child) { return child->rewrite(table, mode); } ); rewritten_children = algebraicSimplification(std::move(rewritten_children)); rewritten_children = rewriteSymbolInSetExpressions(std::move(rewritten_children)); @@ -186,17 +182,12 @@ std::unique_ptr Or::rewrite( return std::make_unique(std::move(rewritten_children)); } -std::unique_ptr Or::compile( - const storage::Table& table, - const storage::TablePartition& table_partition -) const { +std::unique_ptr Or::compile(const storage::Table& table) const { OperatorVector all_child_operators; std::ranges::transform( children, std::back_inserter(all_child_operators), - [&](const std::unique_ptr& expression) { - return expression->compile(table, table_partition); - } + [&](const std::unique_ptr& expression) { return expression->compile(table); } ); OperatorVector filtered_child_operators; for (auto& child : all_child_operators) { @@ -204,7 +195,7 @@ std::unique_ptr Or::compile( continue; } if (child->type() == operators::FULL) { - return std::make_unique(table_partition.sequence_count); + return std::make_unique(table.sequence_count); } if (child->type() == operators::UNION) { auto* or_child = dynamic_cast(child.get()); @@ -218,7 +209,7 @@ std::unique_ptr Or::compile( } } if (filtered_child_operators.empty()) { - return std::make_unique(table_partition.sequence_count); + return std::make_unique(table.sequence_count); } if (filtered_child_operators.size() == 1) { return std::move(filtered_child_operators[0]); @@ -228,11 +219,11 @@ std::unique_ptr Or::compile( return child->type() == operators::COMPLEMENT; })) { return operators::Complement::fromDeMorgan( - std::move(filtered_child_operators), table_partition.sequence_count + std::move(filtered_child_operators), table.sequence_count ); } return std::make_unique( - std::move(filtered_child_operators), table_partition.sequence_count + std::move(filtered_child_operators), table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/or.h b/src/silo/query_engine/filter/expressions/or.h index a2797780f..ac9440e8f 100644 --- a/src/silo/query_engine/filter/expressions/or.h +++ b/src/silo/query_engine/filter/expressions/or.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -21,7 +20,6 @@ class Or : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; @@ -38,9 +36,7 @@ class Or : public Expression { [[nodiscard]] static ExpressionVector mergeStringInSetExpressions(ExpressionVector children); - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/or.test.cpp b/src/silo/query_engine/filter/expressions/or.test.cpp index eabdbd615..411c366dc 100644 --- a/src/silo/query_engine/filter/expressions/or.test.cpp +++ b/src/silo/query_engine/filter/expressions/or.test.cpp @@ -292,8 +292,7 @@ TEST(OrToString, shouldFormatChildrenCorrectly) { } TEST(OrToString, shouldHandleNestedOr) { - silo::storage::Table table(std::make_shared()); - auto table_partition = table.addPartition(); + const silo::storage::Table table(std::make_shared()); ExpressionVector inner_children; inner_children.emplace_back(std::make_unique()); @@ -307,7 +306,7 @@ TEST(OrToString, shouldHandleNestedOr) { EXPECT_EQ(outer_or.toString(), "Or(Or(True | True) | True)"); - auto rewritten_or = outer_or.rewrite(table, *table_partition, Or::AmbiguityMode::NONE); + auto rewritten_or = outer_or.rewrite(table, Or::AmbiguityMode::NONE); EXPECT_EQ(rewritten_or->toString(), "True"); } @@ -323,8 +322,9 @@ TEST(OrToString, shouldHandleNestedStringEquals) { const std::map> column_metadata{ {primary_key, std::make_shared(primary_key.name)} }; - silo::storage::Table table(std::make_shared(column_metadata, primary_key)); - auto table_partition = table.addPartition(); + const silo::storage::Table table( + std::make_shared(column_metadata, primary_key) + ); ExpressionVector inner_children; inner_children.emplace_back(std::make_unique("key", "value_1")); @@ -338,7 +338,7 @@ TEST(OrToString, shouldHandleNestedStringEquals) { EXPECT_EQ(outer_or.toString(), "Or(Or(key = 'value_1' | key = 'value_2') | key = 'value_3')"); - auto rewritten_or = outer_or.rewrite(table, *table_partition, Or::AmbiguityMode::NONE); + auto rewritten_or = outer_or.rewrite(table, Or::AmbiguityMode::NONE); EXPECT_EQ(rewritten_or->toString(), "key IN [value_1,value_2,value_3]"); } @@ -348,8 +348,9 @@ TEST(OrToString, shouldHandleObufscatedNestedStringEquals) { const std::map> column_metadata{ {primary_key, std::make_shared(primary_key.name)} }; - silo::storage::Table table(std::make_shared(column_metadata, primary_key)); - auto table_partition = table.addPartition(); + const silo::storage::Table table( + std::make_shared(column_metadata, primary_key) + ); ExpressionVector innermost_children; innermost_children.emplace_back(std::make_unique()); @@ -369,7 +370,7 @@ TEST(OrToString, shouldHandleObufscatedNestedStringEquals) { outer_or.toString(), "Or(Or(Or(False | key = 'value_1') | key = 'value_2') | key = 'value_3')" ); - auto rewritten_or = outer_or.rewrite(table, *table_partition, Or::AmbiguityMode::NONE); + auto rewritten_or = outer_or.rewrite(table, Or::AmbiguityMode::NONE); EXPECT_EQ(rewritten_or->toString(), "key IN [value_1,value_2,value_3]"); } diff --git a/src/silo/query_engine/filter/expressions/phylo_child_filter.cpp b/src/silo/query_engine/filter/expressions/phylo_child_filter.cpp index b459ac873..5900e01ce 100644 --- a/src/silo/query_engine/filter/expressions/phylo_child_filter.cpp +++ b/src/silo/query_engine/filter/expressions/phylo_child_filter.cpp @@ -11,7 +11,6 @@ #include "silo/query_engine/filter/operators/bitmap_producer.h" #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -25,7 +24,7 @@ std::string PhyloChildFilter::toString() const { namespace { std::unique_ptr createMatchingBitmap( - const storage::column::StringColumnPartition& string_column, + const storage::column::StringColumn& string_column, const std::string& internal_node, size_t row_count ) { @@ -55,25 +54,21 @@ std::unique_ptr createMatchingBitmap( std::unique_ptr PhyloChildFilter::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique(column_name, internal_node); } -std::unique_ptr PhyloChildFilter::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { +std::unique_ptr PhyloChildFilter::compile(const storage::Table& table) const { CHECK_SILO_QUERY( - table_partition.columns.string_columns.contains(column_name), + table.columns.string_columns.contains(column_name), "The database does not contain the column '{}'", column_name ); - SILO_ASSERT(table_partition.columns.string_columns.contains(column_name)); - const auto& string_column = table_partition.columns.string_columns.at(column_name); - return createMatchingBitmap(string_column, internal_node, table_partition.sequence_count); + SILO_ASSERT(table.columns.string_columns.contains(column_name)); + const auto& string_column = table.columns.string_columns.at(column_name); + return createMatchingBitmap(string_column, internal_node, table.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter/expressions/phylo_child_filter.h b/src/silo/query_engine/filter/expressions/phylo_child_filter.h index 7d72169ea..c4cb4371b 100644 --- a/src/silo/query_engine/filter/expressions/phylo_child_filter.h +++ b/src/silo/query_engine/filter/expressions/phylo_child_filter.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -22,18 +21,15 @@ class PhyloChildFilter : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; private: [[nodiscard]] std::optional getBitmapForValue( - const silo::storage::column::StringColumnPartition& phylo_tree_index_column + const silo::storage::column::StringColumn& phylo_tree_index_column ) const; }; diff --git a/src/silo/query_engine/filter/expressions/string_equals.cpp b/src/silo/query_engine/filter/expressions/string_equals.cpp index 1ec71758a..7ffa593a6 100644 --- a/src/silo/query_engine/filter/expressions/string_equals.cpp +++ b/src/silo/query_engine/filter/expressions/string_equals.cpp @@ -15,7 +15,6 @@ #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/filter/operators/selection.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -31,13 +30,12 @@ std::string StringEquals::toString() const { } std::unique_ptr StringEquals::rewrite( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition, + const storage::Table& table, AmbiguityMode /*mode*/ ) const { CHECK_SILO_QUERY( - table_partition.columns.string_columns.contains(column_name) || - table_partition.columns.indexed_string_columns.contains(column_name), + table.columns.string_columns.contains(column_name) || + table.columns.indexed_string_columns.contains(column_name), "The database does not contain the column '{}'", column_name ); @@ -47,31 +45,28 @@ std::unique_ptr StringEquals::rewrite( } // We do not change expressions for IndexedStringColumn - if (table_partition.columns.indexed_string_columns.contains(column_name)) { + if (table.columns.indexed_string_columns.contains(column_name)) { return std::make_unique(column_name, value); } - SILO_ASSERT(table_partition.columns.string_columns.contains(column_name)); + SILO_ASSERT(table.columns.string_columns.contains(column_name)); return std::make_unique( column_name, std::unordered_set{value.value()} ); } -std::unique_ptr StringEquals::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { +std::unique_ptr StringEquals::compile(const storage::Table& table) const { // If it was a StringColumn it should have been rewritten - SILO_ASSERT(table_partition.columns.indexed_string_columns.contains(column_name)); - const auto& string_column = table_partition.columns.indexed_string_columns.at(column_name); + SILO_ASSERT(table.columns.indexed_string_columns.contains(column_name)); + const auto& string_column = table.columns.indexed_string_columns.at(column_name); const auto bitmap = string_column.filter(value); if (bitmap == std::nullopt || bitmap.value()->isEmpty()) { - return std::make_unique(table_partition.sequence_count); + return std::make_unique(table.sequence_count); } return std::make_unique( - CopyOnWriteBitmap{bitmap.value()}, table_partition.sequence_count + CopyOnWriteBitmap{bitmap.value()}, table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/string_equals.h b/src/silo/query_engine/filter/expressions/string_equals.h index 496967150..4a35233ab 100644 --- a/src/silo/query_engine/filter/expressions/string_equals.h +++ b/src/silo/query_engine/filter/expressions/string_equals.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -23,13 +22,10 @@ class StringEquals : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/string_in_set.cpp b/src/silo/query_engine/filter/expressions/string_in_set.cpp index a567446ff..6afff2edd 100644 --- a/src/silo/query_engine/filter/expressions/string_in_set.cpp +++ b/src/silo/query_engine/filter/expressions/string_in_set.cpp @@ -16,11 +16,10 @@ #include "silo/query_engine/illegal_query_exception.h" #include "silo/storage/column/indexed_string_column.h" #include "silo/storage/column/string_column.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { -using storage::column::StringColumnPartition; +using storage::column::StringColumn; StringInSet::StringInSet(std::string column_name, std::unordered_set values) : column_name(std::move(column_name)), @@ -34,19 +33,18 @@ std::string StringInSet::toString() const { } std::unique_ptr StringInSet::rewrite( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition, + const storage::Table& table, AmbiguityMode /*mode*/ ) const { CHECK_SILO_QUERY( - table_partition.columns.string_columns.contains(column_name) || - table_partition.columns.indexed_string_columns.contains(column_name), + table.columns.string_columns.contains(column_name) || + table.columns.indexed_string_columns.contains(column_name), "The database does not contain the string column '{}'", column_name ); // We do not change expressions for StringColumn - if (table_partition.columns.string_columns.contains(column_name)) { + if (table.columns.string_columns.contains(column_name)) { return std::make_unique(column_name, values); } @@ -59,17 +57,14 @@ std::unique_ptr StringInSet::rewrite( return std::make_unique(std::move(string_equal_expressions)); } -std::unique_ptr StringInSet::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { - SILO_ASSERT(table_partition.columns.string_columns.contains(column_name)); - const auto& string_column = table_partition.columns.string_columns.at(column_name); +std::unique_ptr StringInSet::compile(const storage::Table& table) const { + SILO_ASSERT(table.columns.string_columns.contains(column_name)); + const auto& string_column = table.columns.string_columns.at(column_name); return std::make_unique( - std::make_unique>( - &string_column, operators::StringInSet::Comparator::IN, values + std::make_unique>( + &string_column, operators::StringInSet::Comparator::IN, values ), - table_partition.sequence_count + table.sequence_count ); } diff --git a/src/silo/query_engine/filter/expressions/string_in_set.h b/src/silo/query_engine/filter/expressions/string_in_set.h index 8e0abc865..60642b6ed 100644 --- a/src/silo/query_engine/filter/expressions/string_in_set.h +++ b/src/silo/query_engine/filter/expressions/string_in_set.h @@ -8,7 +8,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -28,13 +27,10 @@ class StringInSet : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/string_search.cpp b/src/silo/query_engine/filter/expressions/string_search.cpp index d277998e7..d9b07a9f8 100644 --- a/src/silo/query_engine/filter/expressions/string_search.cpp +++ b/src/silo/query_engine/filter/expressions/string_search.cpp @@ -10,7 +10,6 @@ #include "silo/query_engine/filter/operators/bitmap_producer.h" #include "silo/query_engine/filter/operators/operator.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -48,7 +47,6 @@ std::unique_ptr createMatchingBitmap( std::unique_ptr StringSearch::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { return std::make_unique( @@ -56,26 +54,21 @@ std::unique_ptr StringSearch::rewrite( ); } -std::unique_ptr StringSearch::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { +std::unique_ptr StringSearch::compile(const storage::Table& table) const { CHECK_SILO_QUERY( - table_partition.columns.string_columns.contains(column_name) || - table_partition.columns.indexed_string_columns.contains(column_name), + table.columns.string_columns.contains(column_name) || + table.columns.indexed_string_columns.contains(column_name), "The database does not contain the string column '{}'", column_name ) - if (table_partition.columns.indexed_string_columns.contains(column_name)) { - const auto& string_column = table_partition.columns.indexed_string_columns.at(column_name); - return createMatchingBitmap( - string_column, *search_expression, table_partition.sequence_count - ); + if (table.columns.indexed_string_columns.contains(column_name)) { + const auto& string_column = table.columns.indexed_string_columns.at(column_name); + return createMatchingBitmap(string_column, *search_expression, table.sequence_count); } - SILO_ASSERT(table_partition.columns.string_columns.contains(column_name)); - const auto& string_column = table_partition.columns.string_columns.at(column_name); - return createMatchingBitmap(string_column, *search_expression, table_partition.sequence_count); + SILO_ASSERT(table.columns.string_columns.contains(column_name)); + const auto& string_column = table.columns.string_columns.at(column_name); + return createMatchingBitmap(string_column, *search_expression, table.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter/expressions/string_search.h b/src/silo/query_engine/filter/expressions/string_search.h index 0018b965b..8c218f2ee 100644 --- a/src/silo/query_engine/filter/expressions/string_search.h +++ b/src/silo/query_engine/filter/expressions/string_search.h @@ -8,7 +8,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -24,13 +23,10 @@ class StringSearch : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/expressions/symbol_equals.cpp b/src/silo/query_engine/filter/expressions/symbol_equals.cpp index 8edad35a5..c9328ac3d 100644 --- a/src/silo/query_engine/filter/expressions/symbol_equals.cpp +++ b/src/silo/query_engine/filter/expressions/symbol_equals.cpp @@ -15,7 +15,6 @@ #include "silo/query_engine/illegal_query_exception.h" #include "silo/query_engine/query_compilation_exception.h" #include "silo/query_engine/query_parse_sequence_name.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -63,7 +62,6 @@ std::string SymbolEquals::toString() const { template std::unique_ptr SymbolEquals::rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const { CHECK_SILO_QUERY( @@ -77,20 +75,20 @@ std::unique_ptr SymbolEquals::rewrite( const auto valid_sequence_name = validateSequenceNameOrGetDefault(sequence_name, *table.schema); - const auto& sequence_column_partition = - table_partition.columns.getColumns().at(valid_sequence_name); + const auto& sequence_column = + table.columns.getColumns().at(valid_sequence_name); CHECK_SILO_QUERY( - position_idx < sequence_column_partition.metadata->reference_sequence.size(), + position_idx < sequence_column.metadata->reference_sequence.size(), "{} position is out of bounds {} > {}", getFilterName(), position_idx + 1, - sequence_column_partition.metadata->reference_sequence.size() + sequence_column.metadata->reference_sequence.size() ); - auto symbol = value.getSymbolOrReplaceDotWith( - sequence_column_partition.metadata->reference_sequence.at(position_idx) - ); + auto symbol = + value.getSymbolOrReplaceDotWith(sequence_column.metadata->reference_sequence.at(position_idx) + ); if (mode == UPPER_BOUND) { auto symbols_to_match = SymbolType::AMBIGUITY_SYMBOLS.at(symbol); return std::make_unique>( @@ -105,8 +103,7 @@ std::unique_ptr SymbolEquals::rewrite( template std::unique_ptr SymbolEquals::compile( - const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/ + const storage::Table& /*table*/ ) const { throw QueryCompilationException("SymbolEquals should have been rewritten before compilation"); } diff --git a/src/silo/query_engine/filter/expressions/symbol_equals.h b/src/silo/query_engine/filter/expressions/symbol_equals.h index b97bb1c6d..82fbaa054 100644 --- a/src/silo/query_engine/filter/expressions/symbol_equals.h +++ b/src/silo/query_engine/filter/expressions/symbol_equals.h @@ -9,7 +9,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -48,13 +47,10 @@ class SymbolEquals : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; private: diff --git a/src/silo/query_engine/filter/expressions/symbol_in_set.cpp b/src/silo/query_engine/filter/expressions/symbol_in_set.cpp index 540e37780..189962741 100644 --- a/src/silo/query_engine/filter/expressions/symbol_in_set.cpp +++ b/src/silo/query_engine/filter/expressions/symbol_in_set.cpp @@ -19,7 +19,7 @@ #include "silo/query_engine/query_compilation_exception.h" #include "silo/query_engine/query_parse_sequence_name.h" -using silo::storage::column::SequenceColumnPartition; +using silo::storage::column::SequenceColumn; namespace silo::query_engine::filter::expressions { @@ -102,94 +102,92 @@ std::unique_ptr makeDifference( template std::unique_ptr compileWithMissingSymbolAndReference( - const SequenceColumnPartition& sequence_column_partition, + const SequenceColumn& sequence_column, uint32_t position_idx, const std::vector& symbols ) { // as the missing symbol and the reference symbol are included, we can just negate the other // symbols auto negated_symbols = negateSymbols(symbols); - auto bitmap = sequence_column_partition.vertical_sequence_index.getMatchingContainersAsBitmap( + auto bitmap = sequence_column.vertical_sequence_index.getMatchingContainersAsBitmap( position_idx, negated_symbols ); return std::make_unique( std::make_unique( - CopyOnWriteBitmap{std::move(bitmap)}, sequence_column_partition.sequence_count + CopyOnWriteBitmap{std::move(bitmap)}, sequence_column.sequence_count ), - sequence_column_partition.sequence_count + sequence_column.sequence_count ); } template std::unique_ptr compileWithMissingSymbol( - const SequenceColumnPartition& sequence_column_partition, + const SequenceColumn& sequence_column, uint32_t position_idx, const std::vector& symbols ) { // The missing symbol is included, so we start with the sequences with no coverage at this // position and then add the sequences with the mutation symbols - auto bitmap = sequence_column_partition.vertical_sequence_index.getMatchingContainersAsBitmap( - position_idx, symbols - ); + auto bitmap = + sequence_column.vertical_sequence_index.getMatchingContainersAsBitmap(position_idx, symbols); operators::OperatorVector operators_for_union; operators_for_union.push_back(std::make_unique( std::make_unique( - &sequence_column_partition.horizontal_coverage_index, + &sequence_column.horizontal_coverage_index, position_idx, operators::IsInCoveredRegion::Comparator::IS_NOT_COVERED ), - sequence_column_partition.sequence_count + sequence_column.sequence_count )); operators_for_union.push_back(std::make_unique( - CopyOnWriteBitmap{std::move(bitmap)}, sequence_column_partition.sequence_count + CopyOnWriteBitmap{std::move(bitmap)}, sequence_column.sequence_count )); return std::make_unique( - std::move(operators_for_union), sequence_column_partition.sequence_count + std::move(operators_for_union), sequence_column.sequence_count ); } template std::unique_ptr compileWithReference( - const SequenceColumnPartition& sequence_column_partition, + const SequenceColumn& sequence_column, uint32_t position_idx, const std::vector& symbols ) { // The reference symbol is included, so we start with the sequences with coverage at this // position and then remove the sequences with the negated mutation symbols auto negated_symbols = negateSymbolsExcluding(symbols, SymbolType::SYMBOL_MISSING); - auto bitmap = sequence_column_partition.vertical_sequence_index.getMatchingContainersAsBitmap( + auto bitmap = sequence_column.vertical_sequence_index.getMatchingContainersAsBitmap( position_idx, negated_symbols ); return makeDifference( std::make_unique( std::make_unique( - &sequence_column_partition.horizontal_coverage_index, + &sequence_column.horizontal_coverage_index, position_idx, operators::IsInCoveredRegion::Comparator::IS_COVERED ), - sequence_column_partition.sequence_count + sequence_column.sequence_count ), std::make_unique( - CopyOnWriteBitmap{std::move(bitmap)}, sequence_column_partition.sequence_count + CopyOnWriteBitmap{std::move(bitmap)}, sequence_column.sequence_count ), - sequence_column_partition.sequence_count + sequence_column.sequence_count ); } template std::unique_ptr compileOnlyMutations( - const SequenceColumnPartition& sequence_column_partition, + const SequenceColumn& sequence_column, uint32_t position_idx, const std::vector& symbols ) { // All our results are fully included in the vertical sequence index - auto bitmap = sequence_column_partition.vertical_sequence_index.getMatchingContainersAsBitmap( - position_idx, symbols - ); + auto bitmap = + sequence_column.vertical_sequence_index.getMatchingContainersAsBitmap(position_idx, symbols); return std::make_unique( - CopyOnWriteBitmap{std::move(bitmap)}, sequence_column_partition.sequence_count + CopyOnWriteBitmap{std::move(bitmap)}, sequence_column.sequence_count ); } @@ -198,7 +196,6 @@ std::unique_ptr compileOnlyMutations( template std::unique_ptr SymbolInSet::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, AmbiguityMode /*mode*/ ) const { throw QueryCompilationException( @@ -208,9 +205,7 @@ std::unique_ptr SymbolInSet::rewrite( } template -std::unique_ptr SymbolInSet::compile( - const storage::Table& table, - const storage::TablePartition& table_partition +std::unique_ptr SymbolInSet::compile(const storage::Table& table ) const { CHECK_SILO_QUERY( sequence_name.has_value() || table.schema->getDefaultSequenceName(), @@ -223,18 +218,18 @@ std::unique_ptr SymbolInSet::compile( const auto valid_sequence_name = validateSequenceNameOrGetDefault(sequence_name, *table.schema); - const auto& sequence_column_partition = - table_partition.columns.getColumns().at(valid_sequence_name); + const auto& sequence_column = + table.columns.getColumns().at(valid_sequence_name); CHECK_SILO_QUERY( - position_idx < sequence_column_partition.metadata->reference_sequence.size(), + position_idx < sequence_column.metadata->reference_sequence.size(), "{} position is out of bounds {} > {}", getFilterName(), position_idx + 1, - sequence_column_partition.metadata->reference_sequence.size() + sequence_column.metadata->reference_sequence.size() ); - auto local_reference_symbol = sequence_column_partition.getLocalReferencePosition(position_idx); + auto local_reference_symbol = sequence_column.getLocalReferencePosition(position_idx); const bool includes_reference = std::find(symbols.begin(), symbols.end(), local_reference_symbol) != symbols.end(); @@ -242,15 +237,15 @@ std::unique_ptr SymbolInSet::compile( std::find(symbols.begin(), symbols.end(), SymbolType::SYMBOL_MISSING) != symbols.end(); if (includes_reference && includes_missing_symbol) { - return compileWithMissingSymbolAndReference(sequence_column_partition, position_idx, symbols); + return compileWithMissingSymbolAndReference(sequence_column, position_idx, symbols); } if (includes_missing_symbol) { - return compileWithMissingSymbol(sequence_column_partition, position_idx, symbols); + return compileWithMissingSymbol(sequence_column, position_idx, symbols); } if (includes_reference) { - return compileWithReference(sequence_column_partition, position_idx, symbols); + return compileWithReference(sequence_column, position_idx, symbols); } - return compileOnlyMutations(sequence_column_partition, position_idx, symbols); + return compileOnlyMutations(sequence_column, position_idx, symbols); } template class SymbolInSet; diff --git a/src/silo/query_engine/filter/expressions/symbol_in_set.h b/src/silo/query_engine/filter/expressions/symbol_in_set.h index 9de7ab667..a9ad311ca 100644 --- a/src/silo/query_engine/filter/expressions/symbol_in_set.h +++ b/src/silo/query_engine/filter/expressions/symbol_in_set.h @@ -9,7 +9,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -34,13 +33,10 @@ class SymbolInSet : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; private: diff --git a/src/silo/query_engine/filter/expressions/true.cpp b/src/silo/query_engine/filter/expressions/true.cpp index 3af1eef44..69d55d754 100644 --- a/src/silo/query_engine/filter/expressions/true.cpp +++ b/src/silo/query_engine/filter/expressions/true.cpp @@ -5,7 +5,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/full.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -17,17 +16,13 @@ std::string True::toString() const { std::unique_ptr True::rewrite( const storage::Table& /*table*/, - const storage::TablePartition& /*table_partition*/, Expression::AmbiguityMode /*mode*/ ) const { return std::make_unique(); } -std::unique_ptr True::compile( - const storage::Table& /*table*/, - const storage::TablePartition& table_partition -) const { - return std::make_unique(table_partition.sequence_count); +std::unique_ptr True::compile(const storage::Table& table) const { + return std::make_unique(table.sequence_count); } // NOLINTNEXTLINE(readability-identifier-naming) diff --git a/src/silo/query_engine/filter/expressions/true.h b/src/silo/query_engine/filter/expressions/true.h index 569a0049b..0cd525387 100644 --- a/src/silo/query_engine/filter/expressions/true.h +++ b/src/silo/query_engine/filter/expressions/true.h @@ -7,7 +7,6 @@ #include "silo/query_engine/filter/expressions/expression.h" #include "silo/query_engine/filter/operators/operator.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::filter::expressions { @@ -19,13 +18,10 @@ class True : public Expression { [[nodiscard]] std::unique_ptr rewrite( const storage::Table& table, - const storage::TablePartition& table_partition, AmbiguityMode mode ) const override; - [[nodiscard]] std::unique_ptr compile( - const storage::Table& table, - const storage::TablePartition& table_partition + [[nodiscard]] std::unique_ptr compile(const storage::Table& table ) const override; }; diff --git a/src/silo/query_engine/filter/operators/selection.cpp b/src/silo/query_engine/filter/operators/selection.cpp index f673a34de..a11b85f30 100644 --- a/src/silo/query_engine/filter/operators/selection.cpp +++ b/src/silo/query_engine/filter/operators/selection.cpp @@ -20,7 +20,7 @@ namespace silo::query_engine::filter::operators { -using storage::column::StringColumnPartition; +using storage::column::StringColumn; Selection::Selection( std::optional> child_operator, @@ -166,7 +166,7 @@ bool strongOrderingMatchesComparator(std::strong_ordering strong_ordering, Compa } // namespace template <> -bool CompareToValueSelection::match(uint32_t row_id) const { +bool CompareToValueSelection::match(uint32_t row_id) const { if (column.isNull(row_id)) { return with_nulls; } diff --git a/src/silo/query_engine/filter/operators/selection.h b/src/silo/query_engine/filter/operators/selection.h index 484890607..b0820d36a 100644 --- a/src/silo/query_engine/filter/operators/selection.h +++ b/src/silo/query_engine/filter/operators/selection.h @@ -162,8 +162,7 @@ class CompareToValueSelection : public Predicate { }; template <> -bool CompareToValueSelection::match(uint32_t row_id -) const; +bool CompareToValueSelection::match(uint32_t row_id) const; class Selection : public Operator { friend class filter::expressions::And; diff --git a/src/silo/query_engine/filter/operators/selection.test.cpp b/src/silo/query_engine/filter/operators/selection.test.cpp index bf49ec81f..477ad0756 100644 --- a/src/silo/query_engine/filter/operators/selection.test.cpp +++ b/src/silo/query_engine/filter/operators/selection.test.cpp @@ -9,15 +9,15 @@ using silo::query_engine::filter::operators::Comparator; using silo::query_engine::filter::operators::CompareToValueSelection; using silo::query_engine::filter::operators::Selection; using silo::storage::column::ColumnMetadata; -using silo::storage::column::IntColumnPartition; +using silo::storage::column::IntColumn; namespace { -std::pair, IntColumnPartition> makeTestColumn( +std::pair, IntColumn> makeTestColumn( const std::vector& values ) { auto metadata = std::make_shared("test"); - IntColumnPartition test_column{metadata.get()}; + IntColumn test_column{metadata.get()}; for (auto value : values) { SILO_ASSERT(test_column.insert(value).has_value()); } @@ -32,9 +32,7 @@ TEST(OperatorSelection, equalsShouldReturnCorrectValues) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( - test_column, Comparator::EQUALS, 1 - ), + std::make_unique>(test_column, Comparator::EQUALS, 1), row_count ); @@ -49,9 +47,7 @@ TEST(OperatorSelection, notEqualsShouldReturnCorrectValues) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( - test_column, Comparator::NOT_EQUALS, 1 - ), + std::make_unique>(test_column, Comparator::NOT_EQUALS, 1), row_count ); @@ -66,9 +62,7 @@ TEST(OperatorSelection, lessShouldReturnCorrectValues) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( - test_column, Comparator::LESS, 1 - ), + std::make_unique>(test_column, Comparator::LESS, 1), row_count ); @@ -85,7 +79,7 @@ TEST(OperatorSelection, lessOrEqualsShouldReturnCorrectValues) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( + std::make_unique>( test_column, Comparator::LESS_OR_EQUALS, 1 ), row_count @@ -102,7 +96,7 @@ TEST(OperatorSelection, higherOrEqualsShouldReturnCorrectValues) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( + std::make_unique>( test_column, Comparator::HIGHER_OR_EQUALS, 1 ), row_count @@ -121,9 +115,7 @@ TEST(OperatorSelection, higherShouldReturnCorrectValues) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( - test_column, Comparator::HIGHER, 1 - ), + std::make_unique>(test_column, Comparator::HIGHER, 1), row_count ); @@ -138,9 +130,7 @@ TEST(OperatorSelection, correctWithNegativeNumbers) { const uint32_t row_count = values.size(); const Selection under_test( - std::make_unique>( - test_column, Comparator::EQUALS, -1 - ), + std::make_unique>(test_column, Comparator::EQUALS, -1), row_count ); @@ -153,9 +143,7 @@ TEST(OperatorSelection, returnsCorrectTypeInfo) { const uint32_t row_count = values.size(); const Selection under_test( - std::make_unique>( - test_column, Comparator::EQUALS, -1 - ), + std::make_unique>(test_column, Comparator::EQUALS, -1), row_count ); diff --git a/src/silo/query_engine/filter/operators/string_in_set.cpp b/src/silo/query_engine/filter/operators/string_in_set.cpp index 3877b0a6d..5d4865a87 100644 --- a/src/silo/query_engine/filter/operators/string_in_set.cpp +++ b/src/silo/query_engine/filter/operators/string_in_set.cpp @@ -57,7 +57,7 @@ std::unique_ptr StringInSet::negate() const { return std::make_unique>(column, negated_comparator, values); } -template class StringInSet; -template class StringInSet; +template class StringInSet; +template class StringInSet; } // namespace silo::query_engine::filter::operators diff --git a/src/silo/query_engine/filter/operators/string_in_set.test.cpp b/src/silo/query_engine/filter/operators/string_in_set.test.cpp index 4f73cbd1d..8baca1c93 100644 --- a/src/silo/query_engine/filter/operators/string_in_set.test.cpp +++ b/src/silo/query_engine/filter/operators/string_in_set.test.cpp @@ -11,28 +11,28 @@ using silo::query_engine::filter::operators::Selection; using silo::query_engine::filter::operators::StringInSet; +using silo::storage::column::IndexedStringColumn; using silo::storage::column::IndexedStringColumnMetadata; -using silo::storage::column::IndexedStringColumnPartition; +using silo::storage::column::StringColumn; using silo::storage::column::StringColumnMetadata; -using silo::storage::column::StringColumnPartition; namespace { -std::pair, StringColumnPartition> makeTestStringColumn( +std::pair, StringColumn> makeTestStringColumn( const std::vector& values ) { auto metadata = std::make_shared("test"); - StringColumnPartition test_column{metadata.get()}; + StringColumn test_column{metadata.get()}; for (const auto& value : values) { SILO_ASSERT(test_column.insert(value).has_value()); } return {metadata, std::move(test_column)}; } -std::pair, IndexedStringColumnPartition> +std::pair, IndexedStringColumn> makeTestIndexedStringColumn(const std::vector& values) { auto metadata = std::make_shared("test_indexed"); - IndexedStringColumnPartition test_column{metadata.get()}; + IndexedStringColumn test_column{metadata.get()}; for (const auto& value : values) { SILO_ASSERT(test_column.insert(value).has_value()); } @@ -49,9 +49,9 @@ TEST(OperatorStringInSet, matchReturnsCorrectValuesForStringColumn) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( + std::make_unique>( &test_column, - StringInSet::Comparator::IN, + StringInSet::Comparator::IN, std::unordered_set{"Switzerland", "Germany"} ), row_count @@ -68,9 +68,9 @@ TEST(OperatorStringInSet, matchReturnsCorrectValuesForIndexedStringColumn) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( + std::make_unique>( &test_column, - StringInSet::Comparator::IN, + StringInSet::Comparator::IN, std::unordered_set{"Switzerland", "Germany"} ), row_count @@ -87,9 +87,9 @@ TEST(OperatorStringInSet, matchReturnsEmptyForNoMatches) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( + std::make_unique>( &test_column, - StringInSet::Comparator::IN, + StringInSet::Comparator::IN, std::unordered_set{"Japan", "China"} ), row_count @@ -106,10 +106,8 @@ TEST(OperatorStringInSet, matchReturnsEmptyForEmptySet) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( - &test_column, - StringInSet::Comparator::IN, - std::unordered_set{} + std::make_unique>( + &test_column, StringInSet::Comparator::IN, std::unordered_set{} ), row_count ); @@ -125,9 +123,9 @@ TEST(OperatorStringInSet, negationWorksCorrectly) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( + std::make_unique>( &test_column, - StringInSet::Comparator::IN, + StringInSet::Comparator::IN, std::unordered_set{"Switzerland", "Germany"} ), row_count @@ -147,9 +145,9 @@ TEST(OperatorStringInSet, notInComparatorWorksCorrectly) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( + std::make_unique>( &test_column, - StringInSet::Comparator::NOT_IN, + StringInSet::Comparator::NOT_IN, std::unordered_set{"Switzerland", "Germany"} ), row_count @@ -162,17 +160,17 @@ TEST(OperatorStringInSet, toStringReturnsCorrectFormat) { const std::vector values{"Switzerland", "Germany"}; auto [metadata, test_column] = makeTestStringColumn(values); - const StringInSet in_predicate( + const StringInSet in_predicate( &test_column, - StringInSet::Comparator::IN, + StringInSet::Comparator::IN, std::unordered_set{"Value"} ); ASSERT_EQ(in_predicate.toString(), "test IN [Value]"); - const StringInSet not_in_predicate( + const StringInSet not_in_predicate( &test_column, - StringInSet::Comparator::NOT_IN, + StringInSet::Comparator::NOT_IN, std::unordered_set{"Value"} ); @@ -183,9 +181,9 @@ TEST(OperatorStringInSet, copyCreatesIndependentCopy) { const std::vector values{"Switzerland", "Germany"}; auto [metadata, test_column] = makeTestStringColumn(values); - auto original = std::make_unique>( + auto original = std::make_unique>( &test_column, - StringInSet::Comparator::IN, + StringInSet::Comparator::IN, std::unordered_set{"Switzerland"} ); @@ -202,9 +200,9 @@ TEST(OperatorStringInSet, matchSingleValue) { const uint32_t row_count = values.size(); auto under_test = std::make_unique( - std::make_unique>( + std::make_unique>( &test_column, - StringInSet::Comparator::IN, + StringInSet::Comparator::IN, std::unordered_set{"Apple"} ), row_count @@ -219,9 +217,9 @@ TEST(OperatorStringInSet, returnsCorrectTypeInfo) { const uint32_t row_count = values.size(); const Selection under_test( - std::make_unique>( + std::make_unique>( &test_column, - StringInSet::Comparator::IN, + StringInSet::Comparator::IN, std::unordered_set{"Switzerland"} ), row_count diff --git a/src/silo/query_engine/filter/operators/threshold.cpp b/src/silo/query_engine/filter/operators/threshold.cpp index 12c68f235..4ea694ef7 100644 --- a/src/silo/query_engine/filter/operators/threshold.cpp +++ b/src/silo/query_engine/filter/operators/threshold.cpp @@ -71,16 +71,16 @@ CopyOnWriteBitmap Threshold::evaluate() const { } else { dp_table_size = number_of_matchers; } - std::vector partition_bitmaps(dp_table_size); + std::vector bitmaps(dp_table_size); // Copy bitmap of first child if immutable, otherwise use it directly if (non_negated_children.empty()) { - partition_bitmaps[0] = negated_children[0]->evaluate().getConstReference(); + bitmaps[0] = negated_children[0]->evaluate().getConstReference(); } else { - partition_bitmaps[0] = non_negated_children[0]->evaluate().getConstReference(); + bitmaps[0] = non_negated_children[0]->evaluate().getConstReference(); } if (non_negated_children.empty()) { - partition_bitmaps[0].flip(0, row_count); + bitmaps[0].flip(0, row_count); } // NOLINTBEGIN(readability-identifier-length) @@ -99,10 +99,10 @@ CopyOnWriteBitmap Threshold::evaluate() const { // positions lower than n - k + i - 1 are unable to affect the result, because only (k - i) // iterations are left for (int j = std::min(max_table_index, i); j > std::max(0, n - k + i - 1); --j) { - partition_bitmaps[j] |= partition_bitmaps[j - 1] & bitmap.getConstReference(); + bitmaps[j] |= bitmaps[j - 1] & bitmap.getConstReference(); } if (k - i > n - 1) { - partition_bitmaps[0] |= bitmap.getConstReference(); + bitmaps[0] |= bitmap.getConstReference(); } } @@ -121,22 +121,22 @@ CopyOnWriteBitmap Threshold::evaluate() const { // positions lower than n - k + i - 1 are unable to affect the result, because only (k - i) // iterations are left for (int j = std::min(max_table_index, i); j > std::max(0, n - k + i - 1); --j) { - partition_bitmaps[j] |= partition_bitmaps[j - 1] - bitmap.getConstReference(); + bitmaps[j] |= bitmaps[j - 1] - bitmap.getConstReference(); } if (k - i > n - 1) { bitmap.getMutable().flip(0, row_count); - partition_bitmaps[0] |= bitmap.getConstReference(); + bitmaps[0] |= bitmap.getConstReference(); } } // NOLINTEND(readability-identifier-length) if (this->match_exactly) { // Because exact, we remove all that have too many - partition_bitmaps[number_of_matchers - 1] -= partition_bitmaps[number_of_matchers]; + bitmaps[number_of_matchers - 1] -= bitmaps[number_of_matchers]; - return CopyOnWriteBitmap(std::move(partition_bitmaps[number_of_matchers - 1])); + return CopyOnWriteBitmap(std::move(bitmaps[number_of_matchers - 1])); } - return CopyOnWriteBitmap(std::move(partition_bitmaps.back())); + return CopyOnWriteBitmap(std::move(bitmaps.back())); } std::unique_ptr Threshold::negate(std::unique_ptr&& threshold) { diff --git a/src/silo/query_engine/operators/compute_filter.cpp b/src/silo/query_engine/operators/compute_filter.cpp new file mode 100644 index 000000000..2992fd9aa --- /dev/null +++ b/src/silo/query_engine/operators/compute_filter.cpp @@ -0,0 +1,22 @@ +#include "silo/query_engine/operators/compute_filter.h" + +#include +#include + +#include "silo/query_engine/copy_on_write_bitmap.h" +#include "silo/query_engine/filter/expressions/expression.h" +#include "silo/storage/table.h" + +namespace silo::query_engine::operators { + +CopyOnWriteBitmap computeFilter( + const std::unique_ptr& filter, + const storage::Table& table +) { + using Expression = filter::expressions::Expression; + auto rewritten = filter->rewrite(table, Expression::AmbiguityMode::NONE); + auto compiled = rewritten->compile(table); + return compiled->evaluate(); +} + +} // namespace silo::query_engine::operators diff --git a/src/silo/query_engine/operators/compute_partition_filters.h b/src/silo/query_engine/operators/compute_filter.h similarity index 87% rename from src/silo/query_engine/operators/compute_partition_filters.h rename to src/silo/query_engine/operators/compute_filter.h index fa1c2be7e..1b79228e3 100644 --- a/src/silo/query_engine/operators/compute_partition_filters.h +++ b/src/silo/query_engine/operators/compute_filter.h @@ -9,7 +9,7 @@ namespace silo::query_engine::operators { -std::vector computePartitionFilters( +CopyOnWriteBitmap computeFilter( const std::unique_ptr& filter, const storage::Table& table ); diff --git a/src/silo/query_engine/operators/compute_partition_filters.cpp b/src/silo/query_engine/operators/compute_partition_filters.cpp deleted file mode 100644 index 3b5f0d065..000000000 --- a/src/silo/query_engine/operators/compute_partition_filters.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "silo/query_engine/operators/compute_partition_filters.h" - -#include -#include - -#include "silo/query_engine/copy_on_write_bitmap.h" -#include "silo/query_engine/filter/expressions/expression.h" -#include "silo/storage/table.h" - -namespace silo::query_engine::operators { - -std::vector computePartitionFilters( - const std::unique_ptr& filter, - const storage::Table& table -) { - using Expression = filter::expressions::Expression; - std::vector partition_filters; - partition_filters.reserve(table.getNumberOfPartitions()); - for (size_t i = 0; i < table.getNumberOfPartitions(); ++i) { - auto rewritten = - filter->rewrite(table, *table.getPartition(i), Expression::AmbiguityMode::NONE); - auto compiled = rewritten->compile(table, *table.getPartition(i)); - partition_filters.emplace_back(compiled->evaluate()); - } - return partition_filters; -} - -} // namespace silo::query_engine::operators diff --git a/src/silo/query_engine/operators/count_filter_node.cpp b/src/silo/query_engine/operators/count_filter_node.cpp index b1581a7bb..a5c1d118a 100644 --- a/src/silo/query_engine/operators/count_filter_node.cpp +++ b/src/silo/query_engine/operators/count_filter_node.cpp @@ -11,7 +11,7 @@ #include #include "silo/query_engine/exec_node/arrow_util.h" -#include "silo/query_engine/operators/compute_partition_filters.h" +#include "silo/query_engine/operators/compute_filter.h" #include "silo/schema/database_schema.h" #include "silo/storage/table.h" @@ -35,10 +35,10 @@ arrow::Result CountFilterNode::toQueryPlan( const std::map>& /*tables*/, const config::QueryOptions& /*query_options*/ ) const { - auto partition_filters = computePartitionFilters(filter, *table); + auto filter_bitmap = computeFilter(filter, *table); std::function>()> producer = - [partition_filters = std::move(partition_filters), + [filter_bitmap = std::move(filter_bitmap), already_produced = false]() mutable -> arrow::Future> { if (already_produced) { const std::optional result = std::nullopt; @@ -46,10 +46,7 @@ arrow::Result CountFilterNode::toQueryPlan( } already_produced = true; - int64_t result_count = 0; - for (const auto& partition_filter : partition_filters) { - result_count += static_cast(partition_filter.getConstReference().cardinality()); - } + auto result_count = static_cast(filter_bitmap.getConstReference().cardinality()); arrow::Int64Builder result_builder{}; ARROW_RETURN_NOT_OK(result_builder.Append(result_count)); diff --git a/src/silo/query_engine/operators/insertions_node.cpp b/src/silo/query_engine/operators/insertions_node.cpp index 390b7d375..d9fddc818 100644 --- a/src/silo/query_engine/operators/insertions_node.cpp +++ b/src/silo/query_engine/operators/insertions_node.cpp @@ -20,11 +20,10 @@ #include "silo/query_engine/copy_on_write_bitmap.h" #include "silo/query_engine/exec_node/arrow_util.h" #include "silo/query_engine/exec_node/schema_output_builder.h" -#include "silo/query_engine/operators/compute_partition_filters.h" +#include "silo/query_engine/operators/compute_filter.h" #include "silo/schema/database_schema.h" #include "silo/storage/column/insertion_index.h" #include "silo/storage/table.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::operators { struct PositionAndInsertionKey { @@ -52,75 +51,33 @@ namespace silo::query_engine::operators { namespace { -template -struct InsertionsPrefilteredBitmaps { - std::vector&>> - bitmaps; - std::vector&>> - full_bitmaps; -}; - -template -std::unordered_map> -insertionsPreFilterBitmaps( - const storage::Table& table, - const std::vector& sequence_columns, - std::vector& bitmap_filter -) { - std::unordered_map> pre_filtered_bitmaps; - for (size_t i = 0; i < table.getNumberOfPartitions(); ++i) { - auto table_partition = table.getPartition(i); - - for (const auto& column_identifier : sequence_columns) { - const auto& sequence_column = - table_partition->columns.getColumns().at( - column_identifier.name - ); - CopyOnWriteBitmap& filter = bitmap_filter[i]; - const size_t cardinality = filter.getConstReference().cardinality(); - if (cardinality == 0) { - continue; - } - if (cardinality == table_partition->sequence_count) { - pre_filtered_bitmaps[column_identifier.name].full_bitmaps.emplace_back( - cardinality, sequence_column.insertion_index - ); - } else { - if (filter.isMutable()) { - filter.getMutable().runOptimize(); - } - pre_filtered_bitmaps[column_identifier.name].bitmaps.emplace_back( - filter, sequence_column.insertion_index - ); - } - } - } - return pre_filtered_bitmaps; -} - template // NOLINTNEXTLINE(readability-function-cognitive-complexity) arrow::Status addAggregatedInsertionsToInsertionCounts( const std::string& sequence_name, bool show_sequence_in_response, - const InsertionsPrefilteredBitmaps& prefiltered_bitmaps, + const CopyOnWriteBitmap& bitmap_filter, + const storage::Table& table, exec_node::SchemaOutputBuilder& output_builder ) { + const auto& sequence_column = + table.columns.getColumns>().at(sequence_name); std::unordered_map all_insertions; - for (const auto& [_, insertion_index] : prefiltered_bitmaps.full_bitmaps) { + auto bitmap_cardinality = bitmap_filter.getConstReference().cardinality(); + if (bitmap_cardinality == 0) { + return arrow::Status::OK(); + } + if (bitmap_cardinality == table.sequence_count) { for (const auto& [position, insertions_at_position] : - insertion_index.getInsertionPositions()) { + sequence_column.insertion_index.getInsertionPositions()) { for (const auto& insertion : insertions_at_position.insertions) { all_insertions[PositionAndInsertionKey{position, insertion.value}] += insertion.row_ids.cardinality(); } } - } - for (const auto& [bitmap_filter, insertion_index] : prefiltered_bitmaps.bitmaps) { + } else { for (const auto& [position, insertions_at_position] : - insertion_index.getInsertionPositions()) { + sequence_column.insertion_index.getInsertionPositions()) { for (const auto& insertion : insertions_at_position.insertions) { const uint32_t count = insertion.row_ids.and_cardinality(bitmap_filter.getConstReference()); @@ -172,7 +129,7 @@ arrow::Result InsertionsNode::toQueryPlan( const std::map>& /*tables*/, const config::QueryOptions& /*query_options*/ ) const { - auto partition_filters = computePartitionFilters(filter, *table); + auto bitmap_filter = computeFilter(filter, *table); auto table_handle = table; auto sequence_columns_handle = sequence_columns; @@ -181,7 +138,7 @@ arrow::Result InsertionsNode::toQueryPlan( // NOLINTNEXTLINE(readability-function-cognitive-complexity) [table_handle, output_fields, - partition_filters, + bitmap_filter, sequence_columns_handle, already_produced = false]() mutable -> arrow::Future> { if (already_produced) { @@ -192,17 +149,14 @@ arrow::Result InsertionsNode::toQueryPlan( exec_node::SchemaOutputBuilder output_builder{output_fields}; - const auto bitmaps_to_evaluate = insertionsPreFilterBitmaps( - *table_handle, sequence_columns_handle, partition_filters - ); - for (const auto& [sequence_name, prefiltered_bitmaps] : bitmaps_to_evaluate) { + for (const auto& [sequence_name, _] : sequence_columns_handle) { const auto default_sequence_name = table_handle->schema->template getDefaultSequenceName(); const bool omit_sequence_in_response = default_sequence_name.has_value() && (default_sequence_name.value().name == sequence_name); ARROW_RETURN_NOT_OK(addAggregatedInsertionsToInsertionCounts( - sequence_name, !omit_sequence_in_response, prefiltered_bitmaps, output_builder + sequence_name, !omit_sequence_in_response, bitmap_filter, *table_handle, output_builder )); } diff --git a/src/silo/query_engine/operators/most_recent_common_ancestor_node.cpp b/src/silo/query_engine/operators/most_recent_common_ancestor_node.cpp index ad68a2c31..bf8a6f5f8 100644 --- a/src/silo/query_engine/operators/most_recent_common_ancestor_node.cpp +++ b/src/silo/query_engine/operators/most_recent_common_ancestor_node.cpp @@ -18,11 +18,10 @@ #include "silo/query_engine/exec_node/arrow_util.h" #include "silo/query_engine/exec_node/schema_output_builder.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/query_engine/operators/compute_partition_filters.h" +#include "silo/query_engine/operators/compute_filter.h" #include "silo/schema/database_schema.h" #include "silo/storage/column/string_column.h" #include "silo/storage/table.h" -#include "silo/storage/table_partition.h" namespace { @@ -34,31 +33,21 @@ struct NodeValuesResult { NodeValuesResult getNodeValuesFromTable( const silo::storage::Table& table, const std::string& column_name, - std::vector& bitmap_filter + silo::query_engine::CopyOnWriteBitmap& bitmap_filter ) { - size_t num_rows = 0; - for (const auto& filter : bitmap_filter) { - num_rows += filter.getConstReference().cardinality(); - } + const size_t num_rows = bitmap_filter.getConstReference().cardinality(); std::unordered_set all_tree_node_ids; uint32_t num_empty = 0; all_tree_node_ids.reserve(num_rows); - for (size_t i = 0; i < table.getNumberOfPartitions(); ++i) { - auto table_partition = table.getPartition(i); - const auto& string_column = table_partition->columns.string_columns.at(column_name); - - const silo::query_engine::CopyOnWriteBitmap& filter = bitmap_filter[i]; - const size_t cardinality = filter.getConstReference().cardinality(); - if (cardinality == 0) { - continue; - } - for (const uint32_t row_in_table_partition : filter.getConstReference()) { - if (!string_column.isNull(row_in_table_partition)) { - auto value = string_column.getValueString(row_in_table_partition); - all_tree_node_ids.insert(value); - } else { - ++num_empty; - } + + const auto& string_column = table.columns.string_columns.at(column_name); + + for (const uint32_t row_in_table : bitmap_filter.getConstReference()) { + if (!string_column.isNull(row_in_table)) { + auto value = string_column.getValueString(row_in_table); + all_tree_node_ids.insert(value); + } else { + ++num_empty; } } return NodeValuesResult{ @@ -98,7 +87,7 @@ arrow::Result MostRecentCommonAncestorNode::toQueryPlan( const std::map>& /*tables*/, const config::QueryOptions& /*query_options*/ ) const { - auto partition_filters = computePartitionFilters(filter, *table); + auto bitmap_filter = computeFilter(filter, *table); CHECK_SILO_QUERY( table->schema->getColumn(column_name).has_value(), @@ -112,7 +101,7 @@ arrow::Result MostRecentCommonAncestorNode::toQueryPlan( column_name ); const auto& optional_table_metadata = - table->schema->getColumnMetadata(column_name); + table->schema->getColumnMetadata(column_name); CHECK_SILO_QUERY( optional_table_metadata.has_value() && optional_table_metadata.value()->phylo_tree.has_value(), @@ -132,7 +121,7 @@ arrow::Result MostRecentCommonAncestorNode::toQueryPlan( [table_handle, column_name_copy, output_fields, - partition_filters = std::move(partition_filters), + bitmap_filter = std::move(bitmap_filter), &phylo_tree, already_produced = false]() mutable -> arrow::Future> { if (already_produced) { @@ -143,7 +132,7 @@ arrow::Result MostRecentCommonAncestorNode::toQueryPlan( exec_node::SchemaOutputBuilder output_builder{output_fields}; - auto node_vals = getNodeValuesFromTable(*table_handle, column_name_copy, partition_filters); + auto node_vals = getNodeValuesFromTable(*table_handle, column_name_copy, bitmap_filter); common::MRCAResponse mrca_resp = phylo_tree.getMRCA(node_vals.node_values); std::optional mrca_node = diff --git a/src/silo/query_engine/operators/mutations_node.cpp b/src/silo/query_engine/operators/mutations_node.cpp index 9787e4b6b..b3759bd40 100644 --- a/src/silo/query_engine/operators/mutations_node.cpp +++ b/src/silo/query_engine/operators/mutations_node.cpp @@ -20,59 +20,15 @@ #include "silo/query_engine/copy_on_write_bitmap.h" #include "silo/query_engine/exec_node/arrow_util.h" #include "silo/query_engine/exec_node/schema_output_builder.h" -#include "silo/query_engine/operators/compute_partition_filters.h" +#include "silo/query_engine/operators/compute_filter.h" #include "silo/schema/database_schema.h" #include "silo/storage/column/sequence_column.h" #include "silo/storage/table.h" -#include "silo/storage/table_partition.h" namespace silo::query_engine::operators { namespace { -template -struct MutationsPrefilteredBitmaps { - std::vector&>> - bitmaps; - std::vector&>> - full_bitmaps; -}; - -template -std::unordered_map> mutationsPreFilterBitmaps( - const silo::storage::Table& table, - std::vector& bitmap_filter -) { - std::unordered_map> bitmaps_to_evaluate; - for (size_t i = 0; i < table.getNumberOfPartitions(); ++i) { - auto table_partition = table.getPartition(i); - CopyOnWriteBitmap& filter = bitmap_filter[i]; - const size_t cardinality = filter.getConstReference().cardinality(); - if (cardinality == 0) { - continue; - } - if (cardinality == table_partition->sequence_count) { - for (const auto& [sequence_name, sequence_store] : - table_partition->columns.getColumns()) { - bitmaps_to_evaluate[sequence_name].full_bitmaps.emplace_back( - cardinality, sequence_store - ); - } - } else { - if (filter.isMutable()) { - filter.getMutable().runOptimize(); - } - for (const auto& [sequence_name, sequence_store] : - table_partition->columns.getColumns()) { - bitmaps_to_evaluate[sequence_name].bitmaps.emplace_back(filter, sequence_store); - } - } - } - return bitmaps_to_evaluate; -} - using silo::storage::column::VerticalSequenceIndex; template @@ -246,81 +202,84 @@ __attribute__((noinline)) void accumulateFinalCounts( template void addMutationCountsForMixedBitmaps( - const MutationsPrefilteredBitmaps& bitmaps_to_evaluate, + const storage::column::SequenceColumn& sequence_column, + const CopyOnWriteBitmap& bitmap_filter, SymbolMap>& count_of_mutations_per_position ) { - for (const auto& [filter, sequence_column_partition] : bitmaps_to_evaluate.bitmaps) { - auto local_reference = sequence_column_partition.getLocalReference(); - const size_t sequence_length = local_reference.size(); - std::vector count_per_local_reference_position(sequence_length); + auto local_reference = sequence_column.getLocalReference(); + const size_t sequence_length = local_reference.size(); + std::vector count_per_local_reference_position(sequence_length); - initializeCountsWithSequenceCount( - count_per_local_reference_position, filter.getConstReference().cardinality() - ); - subtractFilteredNCounts( - count_per_local_reference_position, - filter, - sequence_length, - sequence_column_partition.horizontal_coverage_index.horizontal_bitmaps, - sequence_column_partition.horizontal_coverage_index.start_end - ); - countActualFilteredMutations( - count_of_mutations_per_position, - count_per_local_reference_position, - filter, - sequence_column_partition.vertical_sequence_index.vertical_bitmaps - ); - accumulateFinalCounts( - count_per_local_reference_position, local_reference, count_of_mutations_per_position - ); - } + initializeCountsWithSequenceCount( + count_per_local_reference_position, bitmap_filter.getConstReference().cardinality() + ); + subtractFilteredNCounts( + count_per_local_reference_position, + bitmap_filter, + sequence_length, + sequence_column.horizontal_coverage_index.horizontal_bitmaps, + sequence_column.horizontal_coverage_index.start_end + ); + countActualFilteredMutations( + count_of_mutations_per_position, + count_per_local_reference_position, + bitmap_filter, + sequence_column.vertical_sequence_index.vertical_bitmaps + ); + accumulateFinalCounts( + count_per_local_reference_position, local_reference, count_of_mutations_per_position + ); } template void addMutationCountsForFullBitmaps( - const MutationsPrefilteredBitmaps& bitmaps_to_evaluate, + const storage::column::SequenceColumn& sequence_column, SymbolMap>& count_of_mutations_per_position ) { - for (const auto& [_, sequence_column_partition] : bitmaps_to_evaluate.full_bitmaps) { - auto local_reference = sequence_column_partition.getLocalReference(); - const size_t sequence_length = local_reference.size(); - std::vector count_per_local_reference_position(sequence_length); + auto local_reference = sequence_column.getLocalReference(); + const size_t sequence_length = local_reference.size(); + std::vector count_per_local_reference_position(sequence_length); - initializeCountsWithSequenceCount( - count_per_local_reference_position, sequence_column_partition.sequence_count - ); - subtractHorizontalBitmapCounts( - count_per_local_reference_position, - sequence_column_partition.horizontal_coverage_index.horizontal_bitmaps - ); - subtractStartAndEndNCounts( - count_per_local_reference_position, - sequence_column_partition.horizontal_coverage_index.start_end, - sequence_length - ); - countActualMutations( - count_of_mutations_per_position, - count_per_local_reference_position, - sequence_column_partition.vertical_sequence_index.vertical_bitmaps - ); - accumulateFinalCounts( - count_per_local_reference_position, local_reference, count_of_mutations_per_position - ); - } + initializeCountsWithSequenceCount( + count_per_local_reference_position, sequence_column.sequence_count + ); + subtractHorizontalBitmapCounts( + count_per_local_reference_position, + sequence_column.horizontal_coverage_index.horizontal_bitmaps + ); + subtractStartAndEndNCounts( + count_per_local_reference_position, + sequence_column.horizontal_coverage_index.start_end, + sequence_length + ); + countActualMutations( + count_of_mutations_per_position, + count_per_local_reference_position, + sequence_column.vertical_sequence_index.vertical_bitmaps + ); + accumulateFinalCounts( + count_per_local_reference_position, local_reference, count_of_mutations_per_position + ); } template SymbolMap> calculateMutationsPerPosition( - const storage::column::SequenceColumnMetadata& column_metadata, - const MutationsPrefilteredBitmaps& bitmap_filter + const storage::column::SequenceColumn& sequence_column, + const CopyOnWriteBitmap& bitmap_filter, + uint64_t sequence_count_in_column ) { - const size_t sequence_length = column_metadata.reference_sequence.size(); + const size_t sequence_length = sequence_column.metadata->reference_sequence.size(); SymbolMap> count_of_mutations_per_position; for (const auto symbol : SymbolType::SYMBOLS) { count_of_mutations_per_position[symbol] = std::vector(sequence_length, 0); } - addMutationCountsForMixedBitmaps(bitmap_filter, count_of_mutations_per_position); - addMutationCountsForFullBitmaps(bitmap_filter, count_of_mutations_per_position); + if (bitmap_filter.getConstReference().cardinality() == sequence_count_in_column) { + addMutationCountsForFullBitmaps(sequence_column, count_of_mutations_per_position); + } else if (bitmap_filter.getConstReference().cardinality() > 0) { + addMutationCountsForMixedBitmaps( + sequence_column, bitmap_filter, count_of_mutations_per_position + ); + } return count_of_mutations_per_position; } @@ -328,15 +287,18 @@ template // NOLINTNEXTLINE(readability-function-cognitive-complexity) arrow::Status addMutationsToOutput( const std::string& sequence_name, - const storage::column::SequenceColumnMetadata& column_metadata, + const storage::column::SequenceColumn& sequence_column, double min_proportion, - const MutationsPrefilteredBitmaps& bitmap_filter, + const CopyOnWriteBitmap& bitmap_filter, + uint64_t sequence_count_in_column, exec_node::SchemaOutputBuilder& output_builder ) { - const uint32_t sequence_length = column_metadata.reference_sequence.size(); + const uint32_t sequence_length = sequence_column.metadata->reference_sequence.size(); const SymbolMap> count_of_mutations_per_position = - calculateMutationsPerPosition(column_metadata, bitmap_filter); + calculateMutationsPerPosition( + sequence_column, bitmap_filter, sequence_count_in_column + ); for (uint32_t pos = 0; pos < sequence_length; ++pos) { uint32_t total = 0; @@ -352,7 +314,7 @@ arrow::Status addMutationsToOutput( : static_cast(std::ceil(static_cast(total) * min_proportion) - 1); const typename SymbolType::Symbol symbol_in_reference_genome = - column_metadata.reference_sequence.at(pos); + sequence_column.metadata->reference_sequence.at(pos); for (const auto symbol : SymbolType::VALID_MUTATION_SYMBOLS) { if (symbol_in_reference_genome != symbol) { @@ -418,19 +380,19 @@ arrow::Result MutationsNode::toQueryPlan( const std::map>& /*tables*/, const config::QueryOptions& /*query_options*/ ) const { - auto partition_filters = computePartitionFilters(filter, *table); + auto bitmap_filter = computeFilter(filter, *table); auto table_handle = table; auto output_fields = getOutputSchema(); - auto sequence_columns_handle = sequence_columns; + auto sequence_column_identifiers = sequence_columns; const double given_min_proportion = min_proportion; std::function>()> producer = // NOLINTNEXTLINE(readability-function-cognitive-complexity) [table_handle, given_min_proportion, output_fields, - partition_filters, - sequence_columns_handle, + bitmap_filter = std::move(bitmap_filter), + sequence_column_identifiers, already_produced = false]() mutable -> arrow::Future> { if (already_produced) { const std::optional result = std::nullopt; @@ -438,26 +400,21 @@ arrow::Result MutationsNode::toQueryPlan( } already_produced = true; - auto bitmaps_to_evaluate = - mutationsPreFilterBitmaps(*table_handle, partition_filters); - exec_node::SchemaOutputBuilder output_builder(output_fields); - for (const auto& sequence_column : sequence_columns_handle) { - const storage::column::SequenceColumnMetadata* sequence_column_metadata = - table_handle->schema - ->template getColumnMetadata(sequence_column.name) - .value(); - - if (bitmaps_to_evaluate.contains(sequence_column.name)) { - ARROW_RETURN_NOT_OK(addMutationsToOutput( - sequence_column.name, - *sequence_column_metadata, - given_min_proportion, - bitmaps_to_evaluate.at(sequence_column.name), - output_builder - )); - } + for (const auto& sequence_column_identifier : sequence_column_identifiers) { + const storage::column::SequenceColumn& sequence_column = + table_handle->columns.template getColumns>() + .at(sequence_column_identifier.name); + + ARROW_RETURN_NOT_OK(addMutationsToOutput( + sequence_column_identifier.name, + sequence_column, + given_min_proportion, + bitmap_filter, + table_handle->sequence_count, + output_builder + )); } ARROW_ASSIGN_OR_RAISE( const std::vector result_columns, output_builder.finish() diff --git a/src/silo/query_engine/operators/phylo_subtree_node.cpp b/src/silo/query_engine/operators/phylo_subtree_node.cpp index 8848b1235..712c5e5c7 100644 --- a/src/silo/query_engine/operators/phylo_subtree_node.cpp +++ b/src/silo/query_engine/operators/phylo_subtree_node.cpp @@ -18,11 +18,10 @@ #include "silo/query_engine/exec_node/arrow_util.h" #include "silo/query_engine/exec_node/schema_output_builder.h" #include "silo/query_engine/illegal_query_exception.h" -#include "silo/query_engine/operators/compute_partition_filters.h" +#include "silo/query_engine/operators/compute_filter.h" #include "silo/schema/database_schema.h" #include "silo/storage/column/string_column.h" #include "silo/storage/table.h" -#include "silo/storage/table_partition.h" namespace { @@ -34,31 +33,21 @@ struct NodeValuesResult { NodeValuesResult getNodeValuesFromTable( const silo::storage::Table& table, const std::string& column_name, - std::vector& bitmap_filter + silo::query_engine::CopyOnWriteBitmap& bitmap_filter ) { - size_t num_rows = 0; - for (const auto& filter : bitmap_filter) { - num_rows += filter.getConstReference().cardinality(); - } + const size_t num_rows = bitmap_filter.getConstReference().cardinality(); std::unordered_set all_tree_node_ids; uint32_t num_empty = 0; all_tree_node_ids.reserve(num_rows); - for (size_t i = 0; i < table.getNumberOfPartitions(); ++i) { - auto table_partition = table.getPartition(i); - const auto& string_column = table_partition->columns.string_columns.at(column_name); - - const silo::query_engine::CopyOnWriteBitmap& filter = bitmap_filter[i]; - const size_t cardinality = filter.getConstReference().cardinality(); - if (cardinality == 0) { - continue; - } - for (const uint32_t row_in_table_partition : filter.getConstReference()) { - if (!string_column.isNull(row_in_table_partition)) { - auto value = string_column.getValueString(row_in_table_partition); - all_tree_node_ids.insert(value); - } else { - ++num_empty; - } + + const auto& string_column = table.columns.string_columns.at(column_name); + + for (const uint32_t row_in_table : bitmap_filter.getConstReference()) { + if (!string_column.isNull(row_in_table)) { + auto value = string_column.getValueString(row_in_table); + all_tree_node_ids.insert(value); + } else { + ++num_empty; } } return NodeValuesResult{ @@ -98,7 +87,7 @@ arrow::Result PhyloSubtreeNode::toQueryPlan( const std::map>& /*tables*/, const config::QueryOptions& /*query_options*/ ) const { - auto partition_filters = computePartitionFilters(filter, *table); + auto bitmap_filter = computeFilter(filter, *table); CHECK_SILO_QUERY( table->schema->getColumn(column_name).has_value(), @@ -111,7 +100,7 @@ arrow::Result PhyloSubtreeNode::toQueryPlan( column_name ); const auto& optional_table_metadata = - table->schema->getColumnMetadata(column_name); + table->schema->getColumnMetadata(column_name); CHECK_SILO_QUERY( optional_table_metadata.has_value() && optional_table_metadata.value()->phylo_tree.has_value(), @@ -131,7 +120,7 @@ arrow::Result PhyloSubtreeNode::toQueryPlan( [table_handle, column_name_copy, output_fields, - partition_filters = std::move(partition_filters), + bitmap_filter = std::move(bitmap_filter), &phylo_tree, contract, already_produced = false]() mutable -> arrow::Future> { @@ -143,7 +132,7 @@ arrow::Result PhyloSubtreeNode::toQueryPlan( exec_node::SchemaOutputBuilder output_builder{output_fields}; - auto node_vals = getNodeValuesFromTable(*table_handle, column_name_copy, partition_filters); + auto node_vals = getNodeValuesFromTable(*table_handle, column_name_copy, bitmap_filter); common::NewickResponse newick_resp = phylo_tree.toNewickString(node_vals.node_values, contract); diff --git a/src/silo/query_engine/operators/table_scan_node.cpp b/src/silo/query_engine/operators/table_scan_node.cpp index 5149bb388..caa048b79 100644 --- a/src/silo/query_engine/operators/table_scan_node.cpp +++ b/src/silo/query_engine/operators/table_scan_node.cpp @@ -1,7 +1,7 @@ #include "silo/query_engine/operators/table_scan_node.h" #include "silo/query_engine/exec_node/table_scan.h" -#include "silo/query_engine/operators/compute_partition_filters.h" +#include "silo/query_engine/operators/compute_filter.h" namespace silo::query_engine::operators { @@ -22,7 +22,7 @@ arrow::Result TableScanNode::toQueryPlan( const std::map>& /*tables*/, const config::QueryOptions& query_options ) const { - auto partition_filters = computePartitionFilters(filter, *table); + auto bitmap_filter = computeFilter(filter, *table); ARROW_ASSIGN_OR_RAISE(auto arrow_plan, arrow::acero::ExecPlan::Make()); @@ -31,7 +31,7 @@ arrow::Result TableScanNode::toQueryPlan( exec_node::makeTableScan( arrow_plan.get(), fields, - std::move(partition_filters), + std::move(bitmap_filter), table, query_options.materialization_cutoff ) diff --git a/src/silo/query_engine/operators/zstd_decompress_node.cpp b/src/silo/query_engine/operators/zstd_decompress_node.cpp index 7f96a72db..fb59307aa 100644 --- a/src/silo/query_engine/operators/zstd_decompress_node.cpp +++ b/src/silo/query_engine/operators/zstd_decompress_node.cpp @@ -21,7 +21,7 @@ #include "silo/query_engine/exec_node/table_scan.h" #include "silo/query_engine/exec_node/throttled_batch_reslicer.h" #include "silo/query_engine/exec_node/zstd_decompress_expression.h" -#include "silo/query_engine/operators/compute_partition_filters.h" +#include "silo/query_engine/operators/compute_filter.h" #include "silo/schema/database_schema.h" #include "silo/storage/column/column_type_visitor.h" #include "silo/storage/column/sequence_column.h" @@ -34,8 +34,8 @@ namespace { using silo::schema::ColumnIdentifier; using silo::schema::TableSchema; -using silo::storage::column::SequenceColumnPartition; -using silo::storage::column::ZstdCompressedStringColumnPartition; +using silo::storage::column::SequenceColumn; +using silo::storage::column::ZstdCompressedStringColumn; class ColumnToReferenceSequenceVisitor { public: @@ -50,13 +50,12 @@ class ColumnToReferenceSequenceVisitor { template <> std::optional ColumnToReferenceSequenceVisitor::operator( -)>( +)>( const TableSchema& table_schema, const ColumnIdentifier& column_identifier ) { auto* metadata = - table_schema - .getColumnMetadata>(column_identifier.name) + table_schema.getColumnMetadata>(column_identifier.name) .value(); std::string reference; std::ranges::transform( @@ -67,13 +66,12 @@ std::optional ColumnToReferenceSequenceVisitor::operator( template <> std::optional ColumnToReferenceSequenceVisitor::operator( -)>( +)>( const TableSchema& table_schema, const ColumnIdentifier& column_identifier ) { auto* metadata = - table_schema - .getColumnMetadata>(column_identifier.name) + table_schema.getColumnMetadata>(column_identifier.name) .value(); std::string reference; std::ranges::transform( @@ -83,14 +81,12 @@ std::optional ColumnToReferenceSequenceVisitor::operator( } template <> -std::optional ColumnToReferenceSequenceVisitor::operator( -)( +std::optional ColumnToReferenceSequenceVisitor::operator()( const TableSchema& table_schema, const ColumnIdentifier& column_identifier ) { auto* metadata = - table_schema.getColumnMetadata(column_identifier.name) - .value(); + table_schema.getColumnMetadata(column_identifier.name).value(); return metadata->dictionary_string; } diff --git a/src/silo/storage/column/bool_column.cpp b/src/silo/storage/column/bool_column.cpp index ee3281665..41940e802 100644 --- a/src/silo/storage/column/bool_column.cpp +++ b/src/silo/storage/column/bool_column.cpp @@ -6,10 +6,10 @@ namespace silo::storage::column { -BoolColumnPartition::BoolColumnPartition(ColumnMetadata* metadata) +BoolColumn::BoolColumn(ColumnMetadata* metadata) : metadata(metadata) {} -std::expected BoolColumnPartition::insert(bool value) { +std::expected BoolColumn::insert(bool value) { if (value) { true_bitmap.add(num_values++); } else { @@ -18,7 +18,7 @@ std::expected BoolColumnPartition::insert(bool value) { return {}; } -void BoolColumnPartition::insertNull() { +void BoolColumn::insertNull() { null_bitmap.add(num_values++); } } // namespace silo::storage::column diff --git a/src/silo/storage/column/bool_column.h b/src/silo/storage/column/bool_column.h index 4e96f73be..e6b643634 100644 --- a/src/silo/storage/column/bool_column.h +++ b/src/silo/storage/column/bool_column.h @@ -10,7 +10,7 @@ namespace silo::storage::column { -class BoolColumnPartition { +class BoolColumn { public: using Metadata = ColumnMetadata; @@ -27,7 +27,7 @@ class BoolColumnPartition { size_t num_values = 0; public: - explicit BoolColumnPartition(Metadata* metadata); + explicit BoolColumn(Metadata* metadata); [[nodiscard]] size_t numValues() const { return num_values; } diff --git a/src/silo/storage/column/column_type_visitor.h b/src/silo/storage/column/column_type_visitor.h index 4fd6e9754..dcc0538cc 100644 --- a/src/silo/storage/column/column_type_visitor.h +++ b/src/silo/storage/column/column_type_visitor.h @@ -18,28 +18,25 @@ template static decltype(auto) visit(schema::ColumnType type, VisitorFunction&& func, Args&&... args) { switch (type) { case schema::ColumnType::STRING: - return func.template operator()(std::forward(args)...); + return func.template operator()(std::forward(args)...); case schema::ColumnType::INDEXED_STRING: - return func.template operator()(std::forward(args)...); + return func.template operator()(std::forward(args)...); case schema::ColumnType::DATE32: - return func.template operator()(std::forward(args)...); + return func.template operator()(std::forward(args)...); case schema::ColumnType::BOOL: - return func.template operator()(std::forward(args)...); + return func.template operator()(std::forward(args)...); case schema::ColumnType::INT32: - return func.template operator()(std::forward(args)...); + return func.template operator()(std::forward(args)...); case schema::ColumnType::INT64: SILO_UNIMPLEMENTED(); case schema::ColumnType::FLOAT: - return func.template operator()(std::forward(args)...); + return func.template operator()(std::forward(args)...); case schema::ColumnType::NUCLEOTIDE_SEQUENCE: - return func.template operator( - )>(std::forward(args)...); + return func.template operator()>(std::forward(args)...); case schema::ColumnType::AMINO_ACID_SEQUENCE: - return func.template operator( - )>(std::forward(args)...); + return func.template operator()>(std::forward(args)...); case schema::ColumnType::ZSTD_COMPRESSED_STRING: - return func.template operator( - )(std::forward(args)...); + return func.template operator()(std::forward(args)...); } SILO_UNREACHABLE(); } diff --git a/src/silo/storage/column/date32_column.cpp b/src/silo/storage/column/date32_column.cpp index c89ccd606..a76c3585d 100644 --- a/src/silo/storage/column/date32_column.cpp +++ b/src/silo/storage/column/date32_column.cpp @@ -6,14 +6,14 @@ namespace silo::storage::column { -Date32ColumnPartition::Date32ColumnPartition(ColumnMetadata* metadata) +Date32Column::Date32Column(ColumnMetadata* metadata) : metadata(metadata) {} -bool Date32ColumnPartition::isSorted() const { +bool Date32Column::isSorted() const { return is_sorted; } -std::expected Date32ColumnPartition::insert(std::string_view value) { +std::expected Date32Column::insert(std::string_view value) { auto date_result = silo::common::stringToDate32(value); if (!date_result.has_value()) { return std::unexpected{date_result.error()}; @@ -26,7 +26,7 @@ std::expected Date32ColumnPartition::insert(std::string_view return {}; } -void Date32ColumnPartition::insertNull() { +void Date32Column::insertNull() { const size_t row_id = values.size(); null_bitmap.add(row_id); // We need to insert _some_ value to keep vector size correct. However, it will never be read @@ -34,11 +34,11 @@ void Date32ColumnPartition::insertNull() { is_sorted = false; } -void Date32ColumnPartition::reserve(size_t row_count) { +void Date32Column::reserve(size_t row_count) { values.reserve(values.size() + row_count); } -const std::vector& Date32ColumnPartition::getValues() const { +const std::vector& Date32Column::getValues() const { return values; } diff --git a/src/silo/storage/column/date32_column.h b/src/silo/storage/column/date32_column.h index 79a22c258..86989751c 100644 --- a/src/silo/storage/column/date32_column.h +++ b/src/silo/storage/column/date32_column.h @@ -14,7 +14,7 @@ namespace silo::storage::column { -class Date32ColumnPartition { +class Date32Column { public: using Metadata = ColumnMetadata; @@ -29,7 +29,7 @@ class Date32ColumnPartition { bool is_sorted = true; public: - explicit Date32ColumnPartition(Metadata* metadata); + explicit Date32Column(Metadata* metadata); [[nodiscard]] bool isSorted() const; diff --git a/src/silo/storage/column/date32_column.test.cpp b/src/silo/storage/column/date32_column.test.cpp index 8770af749..30ca46644 100644 --- a/src/silo/storage/column/date32_column.test.cpp +++ b/src/silo/storage/column/date32_column.test.cpp @@ -2,9 +2,9 @@ #include -TEST(Date32ColumnPartition, insertValues) { +TEST(Date32Column, insertValues) { silo::storage::column::ColumnMetadata column_metadata{"test_column"}; - silo::storage::column::Date32ColumnPartition under_test(&column_metadata); + silo::storage::column::Date32Column under_test(&column_metadata); std::vector values_to_add{ "2020-01-01", "2023-01-05", "2021-12-03", "2025-01-01", "2021-03-21" @@ -22,9 +22,9 @@ TEST(Date32ColumnPartition, insertValues) { } } -TEST(Date32ColumnPartition, insertNull) { +TEST(Date32Column, insertNull) { silo::storage::column::ColumnMetadata column_metadata{"test_column"}; - silo::storage::column::Date32ColumnPartition under_test(&column_metadata); + silo::storage::column::Date32Column under_test(&column_metadata); under_test.insertNull(); @@ -32,9 +32,9 @@ TEST(Date32ColumnPartition, insertNull) { ASSERT_TRUE(under_test.isNull(0)); } -TEST(Date32ColumnPartition, insertInvalidDateReturnsError) { +TEST(Date32Column, insertInvalidDateReturnsError) { silo::storage::column::ColumnMetadata column_metadata{"test_column"}; - silo::storage::column::Date32ColumnPartition under_test(&column_metadata); + silo::storage::column::Date32Column under_test(&column_metadata); auto result = under_test.insert("not-a-date"); ASSERT_FALSE(result.has_value()); diff --git a/src/silo/storage/column/float_column.cpp b/src/silo/storage/column/float_column.cpp index 455fb7328..c9d3650b3 100644 --- a/src/silo/storage/column/float_column.cpp +++ b/src/silo/storage/column/float_column.cpp @@ -4,20 +4,20 @@ namespace silo::storage::column { -FloatColumnPartition::FloatColumnPartition(ColumnMetadata* metadata) +FloatColumn::FloatColumn(ColumnMetadata* metadata) : metadata(metadata) {} -std::expected FloatColumnPartition::insert(double value) { +std::expected FloatColumn::insert(double value) { values.push_back(value); return {}; } -void FloatColumnPartition::insertNull() { +void FloatColumn::insertNull() { null_bitmap.add(values.size()); values.push_back(std::nan("")); } -void FloatColumnPartition::reserve(size_t row_count) { +void FloatColumn::reserve(size_t row_count) { values.reserve(values.size() + row_count); } diff --git a/src/silo/storage/column/float_column.h b/src/silo/storage/column/float_column.h index 3736c8dbe..3ca9abfa6 100644 --- a/src/silo/storage/column/float_column.h +++ b/src/silo/storage/column/float_column.h @@ -13,7 +13,7 @@ namespace silo::storage::column { -class FloatColumnPartition { +class FloatColumn { public: using Metadata = ColumnMetadata; @@ -28,7 +28,7 @@ class FloatColumnPartition { [[maybe_unused]] Metadata* metadata; - explicit FloatColumnPartition(ColumnMetadata* metadata); + explicit FloatColumn(ColumnMetadata* metadata); [[nodiscard]] size_t numValues() const { return values.size(); } diff --git a/src/silo/storage/column/float_column.test.cpp b/src/silo/storage/column/float_column.test.cpp index fd08d57bb..8b879c031 100644 --- a/src/silo/storage/column/float_column.test.cpp +++ b/src/silo/storage/column/float_column.test.cpp @@ -4,14 +4,14 @@ #include using silo::storage::column::ColumnMetadata; -using silo::storage::column::FloatColumnPartition; +using silo::storage::column::FloatColumn; TEST(FloatColumn, doesNotErrorOnValidInputs) { - ColumnMetadata column("float_column1"); - FloatColumnPartition column_partition{&column}; - SILO_ASSERT(column_partition.insert(0.1).has_value()); - column_partition.insertNull(); - ASSERT_EQ(column_partition.numValues(), 2); - ASSERT_EQ(column_partition.getValue(0), 0.1); - ASSERT_TRUE(std::isnan(column_partition.getValue(1))); + ColumnMetadata column_metadata("float_column1"); + FloatColumn column{&column_metadata}; + SILO_ASSERT(column.insert(0.1).has_value()); + column.insertNull(); + ASSERT_EQ(column.numValues(), 2); + ASSERT_EQ(column.getValue(0), 0.1); + ASSERT_TRUE(std::isnan(column.getValue(1))); } diff --git a/src/silo/storage/column/indexed_string_column.cpp b/src/silo/storage/column/indexed_string_column.cpp index 983c2a9ea..b8e51f489 100644 --- a/src/silo/storage/column/indexed_string_column.cpp +++ b/src/silo/storage/column/indexed_string_column.cpp @@ -25,21 +25,21 @@ IndexedStringColumnMetadata::IndexedStringColumnMetadata( dictionary(std::move(dictionary)), lineage_tree(std::move(lineage_tree_and_id_map)) {} -IndexedStringColumnPartition::IndexedStringColumnPartition(IndexedStringColumnMetadata* metadata) +IndexedStringColumn::IndexedStringColumn(IndexedStringColumnMetadata* metadata) : metadata(metadata) { if (metadata->lineage_tree.has_value()) { lineage_index = LineageIndex{&metadata->lineage_tree->lineage_tree}; } } -std::optional IndexedStringColumnPartition::filter(Idx value_id) const { +std::optional IndexedStringColumn::filter(Idx value_id) const { if (indexed_values.contains(value_id)) { return &indexed_values.at(value_id); } return std::nullopt; } -std::optional IndexedStringColumnPartition::filter( +std::optional IndexedStringColumn::filter( const std::optional& value ) const { if (value == std::nullopt) { @@ -52,7 +52,7 @@ std::optional IndexedStringColumnPartition::filter( return filter(value_id.value()); } -std::expected IndexedStringColumnPartition::insert(std::string_view value) { +std::expected IndexedStringColumn::insert(std::string_view value) { const size_t row_id = value_ids.size(); if (lineage_index.has_value()) { @@ -75,7 +75,7 @@ std::expected IndexedStringColumnPartition::insert(std::strin return {}; } -void IndexedStringColumnPartition::insertNull() { +void IndexedStringColumn::insertNull() { null_bitmap.add(value_ids.size()); // We need to add something to the vector, so that the size of the vector remains equal to row_id // but we do not add our row_id to indexed_values[value_id] @@ -84,19 +84,19 @@ void IndexedStringColumnPartition::insertNull() { value_ids.push_back(value_id); } -bool IndexedStringColumnPartition::isNull(size_t row_id) const { +bool IndexedStringColumn::isNull(size_t row_id) const { return null_bitmap.contains(row_id); } -void IndexedStringColumnPartition::reserve(size_t row_count) { +void IndexedStringColumn::reserve(size_t row_count) { value_ids.reserve(value_ids.size() + row_count); } -std::optional IndexedStringColumnPartition::getValueId(const std::string& value) const { +std::optional IndexedStringColumn::getValueId(const std::string& value) const { return metadata->dictionary.getId(value); } -const std::optional& IndexedStringColumnPartition::getLineageIndex() const { +const std::optional& IndexedStringColumn::getLineageIndex() const { return lineage_index; } diff --git a/src/silo/storage/column/indexed_string_column.h b/src/silo/storage/column/indexed_string_column.h index 12965a056..f3d54300f 100644 --- a/src/silo/storage/column/indexed_string_column.h +++ b/src/silo/storage/column/indexed_string_column.h @@ -56,7 +56,7 @@ class IndexedStringColumnMetadata : public ColumnMetadata { IndexedStringColumnMetadata& operator=(IndexedStringColumnMetadata&& other) = delete; }; -class IndexedStringColumnPartition { +class IndexedStringColumn { public: using Metadata = IndexedStringColumnMetadata; @@ -72,7 +72,7 @@ class IndexedStringColumnPartition { std::optional lineage_index; public: - explicit IndexedStringColumnPartition(Metadata* metadata); + explicit IndexedStringColumn(Metadata* metadata); [[nodiscard]] std::optional filter(silo::Idx value_id) const; diff --git a/src/silo/storage/column/indexed_string_column.test.cpp b/src/silo/storage/column/indexed_string_column.test.cpp index 81cff459a..d7c2c9410 100644 --- a/src/silo/storage/column/indexed_string_column.test.cpp +++ b/src/silo/storage/column/indexed_string_column.test.cpp @@ -6,14 +6,14 @@ using silo::common::LineageTreeAndIdMap; using silo::common::RecombinantEdgeFollowingMode; using silo::preprocessing::LineageDefinitionFile; +using silo::storage::column::IndexedStringColumn; using silo::storage::column::IndexedStringColumnMetadata; -using silo::storage::column::IndexedStringColumnPartition; // NOLINTBEGIN(bugprone-unchecked-optional-access) -TEST(IndexedStringColumnPartition, shouldReturnTheCorrectFilteredValues) { +TEST(IndexedStringColumn, shouldReturnTheCorrectFilteredValues) { IndexedStringColumnMetadata column_metadata("some_column"); - IndexedStringColumnPartition under_test{&column_metadata}; + IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"value 1"})); ASSERT_TRUE(under_test.insert({"value 2"})); @@ -31,9 +31,9 @@ TEST(IndexedStringColumnPartition, shouldReturnTheCorrectFilteredValues) { ASSERT_EQ(result3, std::nullopt); } -TEST(IndexedStringColumnPartition, insertValuesToPartition) { +TEST(IndexedStringColumn, insertValuesToPartition) { IndexedStringColumnMetadata column_metadata("some_column"); - IndexedStringColumnPartition under_test{&column_metadata}; + IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"value 1"})); ASSERT_TRUE(under_test.insert({"value 2"})); @@ -52,12 +52,12 @@ TEST(IndexedStringColumnPartition, insertValuesToPartition) { EXPECT_EQ(under_test.lookupValue(2U), "value 3"); } -TEST(IndexedStringColumnPartition, addingLineageAndThenSublineageFiltersCorrectly) { +TEST(IndexedStringColumn, addingLineageAndThenSublineageFiltersCorrectly) { auto lineage_definition = LineageTreeAndIdMap::fromLineageDefinitionFilePath( "testBaseData/exampleDataset/lineage_definition.yaml" ); IndexedStringColumnMetadata column_metadata("some_column", lineage_definition); - IndexedStringColumnPartition under_test{&column_metadata}; + IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"BA.1.1"})); ASSERT_TRUE(under_test.insert({"BA.1.1"})); @@ -86,12 +86,12 @@ TEST(IndexedStringColumnPartition, addingLineageAndThenSublineageFiltersCorrectl ); } -TEST(IndexedStringColumnPartition, addingSublineageAndThenLineageFiltersCorrectly) { +TEST(IndexedStringColumn, addingSublineageAndThenLineageFiltersCorrectly) { auto lineage_definition = LineageTreeAndIdMap::fromLineageDefinitionFilePath( "testBaseData/exampleDataset/lineage_definition.yaml" ); IndexedStringColumnMetadata column_metadata("some_column", lineage_definition); - IndexedStringColumnPartition under_test{&column_metadata}; + IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"BA.1.1.1"})); ASSERT_TRUE(under_test.insert({"BA.1.1.1"})); @@ -134,12 +134,12 @@ TEST(IndexedStringColumnPartition, addingSublineageAndThenLineageFiltersCorrectl ); } -TEST(IndexedStringColumnPartition, queryParentLineageThatWasNeverInserted) { +TEST(IndexedStringColumn, queryParentLineageThatWasNeverInserted) { auto lineage_definition = LineageTreeAndIdMap::fromLineageDefinitionFilePath( "testBaseData/exampleDataset/lineage_definition.yaml" ); IndexedStringColumnMetadata column_metadata("some_column", lineage_definition); - IndexedStringColumnPartition under_test{&column_metadata}; + IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"BA.1.1.1"})); ASSERT_TRUE(under_test.insert({"BA.1.1.1"})); @@ -161,7 +161,7 @@ TEST(IndexedStringColumnPartition, queryParentLineageThatWasNeverInserted) { ); } -TEST(IndexedStringColumnPartition, errorWhenInsertingIncorrectLineages) { +TEST(IndexedStringColumn, errorWhenInsertingIncorrectLineages) { auto lineage_definition = LineageTreeAndIdMap::fromLineageDefinitionFile(LineageDefinitionFile::fromYAMLString(R"( A: {} @@ -169,7 +169,7 @@ A.1: parents: ["A"] )")); IndexedStringColumnMetadata column_metadata("some_column", lineage_definition); - IndexedStringColumnPartition under_test{&column_metadata}; + IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"A"})); auto success = under_test.insert({"A.2"}); ASSERT_FALSE(success); diff --git a/src/silo/storage/column/int_column.cpp b/src/silo/storage/column/int_column.cpp index d17b60807..b7f6040d4 100644 --- a/src/silo/storage/column/int_column.cpp +++ b/src/silo/storage/column/int_column.cpp @@ -4,15 +4,15 @@ namespace silo::storage::column { -IntColumnPartition::IntColumnPartition(ColumnMetadata* metadata) +IntColumn::IntColumn(ColumnMetadata* metadata) : metadata(metadata) {} -std::expected IntColumnPartition::insert(int32_t value) { +std::expected IntColumn::insert(int32_t value) { values.push_back(value); return {}; } -void IntColumnPartition::insertNull() { +void IntColumn::insertNull() { null_bitmap.add(values.size()); values.push_back(0); } diff --git a/src/silo/storage/column/int_column.h b/src/silo/storage/column/int_column.h index 6d3eb5ab0..b55826efe 100644 --- a/src/silo/storage/column/int_column.h +++ b/src/silo/storage/column/int_column.h @@ -12,7 +12,7 @@ namespace silo::storage::column { -class IntColumnPartition { +class IntColumn { public: using Metadata = ColumnMetadata; @@ -27,7 +27,7 @@ class IntColumnPartition { Metadata* metadata; - explicit IntColumnPartition(Metadata* metadata); + explicit IntColumn(Metadata* metadata); [[nodiscard]] bool isNull(size_t row_id) const { return null_bitmap.contains(row_id); } diff --git a/src/silo/storage/column/int_column.test.cpp b/src/silo/storage/column/int_column.test.cpp index 8ec436c62..1ed3e2e77 100644 --- a/src/silo/storage/column/int_column.test.cpp +++ b/src/silo/storage/column/int_column.test.cpp @@ -4,15 +4,15 @@ #include using silo::storage::column::ColumnMetadata; -using silo::storage::column::IntColumnPartition; +using silo::storage::column::IntColumn; TEST(IntColumn, doesNotErrorOnValidInputs) { - ColumnMetadata column("int_column1"); - IntColumnPartition column_partition{&column}; - SILO_ASSERT(column_partition.insert(123).has_value()); - column_partition.insertNull(); - ASSERT_EQ(column_partition.numValues(), 2); - ASSERT_FALSE(column_partition.isNull(0)); - ASSERT_EQ(column_partition.getValue(0), 123); - ASSERT_TRUE(column_partition.isNull(1)); + ColumnMetadata column_metadata("int_column1"); + IntColumn column{&column_metadata}; + SILO_ASSERT(column.insert(123).has_value()); + column.insertNull(); + ASSERT_EQ(column.numValues(), 2); + ASSERT_FALSE(column.isNull(0)); + ASSERT_EQ(column.getValue(0), 123); + ASSERT_TRUE(column.isNull(1)); } diff --git a/src/silo/storage/column/sequence_column.cpp b/src/silo/storage/column/sequence_column.cpp index 2da357f87..399e3af1c 100644 --- a/src/silo/storage/column/sequence_column.cpp +++ b/src/silo/storage/column/sequence_column.cpp @@ -85,9 +85,7 @@ InsertionEntry parseInsertion(const std::string& value) { } // namespace template -SequenceColumnPartition::SequenceColumnPartition( - SequenceColumnMetadata* metadata -) +SequenceColumn::SequenceColumn(SequenceColumnMetadata* metadata) : metadata(metadata), genome_length(metadata->reference_sequence.size()), local_reference_sequence_string(SymbolType::sequenceToString(metadata->reference_sequence)), @@ -100,7 +98,7 @@ SequenceColumnPartition::SequenceColumnPartition( } template -void SequenceColumnPartition::append( +void SequenceColumn::append( std::string_view sequence, uint32_t offset, const std::vector& insertions @@ -161,7 +159,7 @@ void SequenceColumnPartition::append( } template -void SequenceColumnPartition::appendNull() { +void SequenceColumn::appendNull() { const size_t row_id = sequence_count; null_bitmap.add(row_id); sequence_count += 1; @@ -169,12 +167,12 @@ void SequenceColumnPartition::appendNull() { } template -bool SequenceColumnPartition::isNull(size_t row_id) const { +bool SequenceColumn::isNull(size_t row_id) const { return null_bitmap.contains(row_id); } template -void SequenceColumnPartition::finalize() { +void SequenceColumn::finalize() { flushBuffer(); SPDLOG_DEBUG("Building insertion index"); @@ -221,7 +219,7 @@ void SequenceColumnPartition::finalize() { const SequenceColumnInfo info_after_optimisation = calculateInfo(); SPDLOG_DEBUG( - "Sequence store partition info after filling it: {}, after local reference adaption: {}, and " + "Sequence store info after filling it: {}, after local reference adaption: {}, and " "after optimising: {}", info_after_filling, info_after_adaption, @@ -247,7 +245,7 @@ void SequenceColumnPartition::finalize() { namespace silo::storage::column { template -SequenceColumnInfo SequenceColumnPartition::calculateInfo() { +SequenceColumnInfo SequenceColumn::calculateInfo() { sequence_column_info = { .sequence_count = sequence_count, .vertical_bitmaps_size = computeVerticalBitmapsSize(), @@ -257,19 +255,19 @@ SequenceColumnInfo SequenceColumnPartition::calculateInfo() { } template -SequenceColumnInfo SequenceColumnPartition::getInfo() const { +SequenceColumnInfo SequenceColumn::getInfo() const { return sequence_column_info; } template -void SequenceColumnPartition::fillIndexes() { +void SequenceColumn::fillIndexes() { for (size_t position_idx = 0; position_idx != genome_length; ++position_idx) { vertical_sequence_index.addSymbolsToPositions(position_idx, mutation_buffer.at(position_idx)); } } template -void SequenceColumnPartition::optimizeBitmaps() { +void SequenceColumn::optimizeBitmaps() { for (auto& [sequence_diff_key, sequence_diff] : vertical_sequence_index.vertical_bitmaps) { uint8_t new_container_type; auto new_container = roaring::internal::convert_run_optimize( @@ -284,7 +282,7 @@ void SequenceColumnPartition::optimizeBitmaps() { } template -void SequenceColumnPartition::flushBuffer() { +void SequenceColumn::flushBuffer() { fillIndexes(); for (auto& position : mutation_buffer) { for (auto symbol : SymbolType::SYMBOLS) { @@ -294,7 +292,7 @@ void SequenceColumnPartition::flushBuffer() { } template -size_t SequenceColumnPartition::computeVerticalBitmapsSize() const { +size_t SequenceColumn::computeVerticalBitmapsSize() const { size_t result = 0; for (const auto& [_pos, sequence_diff] : vertical_sequence_index.vertical_bitmaps) { result += roaring::internal::container_size_in_bytes( @@ -305,7 +303,7 @@ size_t SequenceColumnPartition::computeVerticalBitmapsSize() const { } template -size_t SequenceColumnPartition::computeHorizontalBitmapsSize() const { +size_t SequenceColumn::computeHorizontalBitmapsSize() const { size_t result = 0; for (const auto& [_pos, bitmap] : horizontal_coverage_index.horizontal_bitmaps) { result += bitmap.getSizeInBytes(false); @@ -321,8 +319,8 @@ SequenceColumnMetadata::SequenceColumnMetadata( : ColumnMetadata(std::move(column_name)), reference_sequence(std::move(reference_sequence)) {} -template class SequenceColumnPartition; -template class SequenceColumnPartition; +template class SequenceColumn; +template class SequenceColumn; template class SequenceColumnMetadata; template class SequenceColumnMetadata; } // namespace silo::storage::column diff --git a/src/silo/storage/column/sequence_column.h b/src/silo/storage/column/sequence_column.h index d03e92f7a..acf1aceb2 100644 --- a/src/silo/storage/column/sequence_column.h +++ b/src/silo/storage/column/sequence_column.h @@ -51,7 +51,7 @@ class SequenceColumnMetadata : public ColumnMetadata { }; template -class SequenceColumnPartition { +class SequenceColumn { public: using Metadata = SequenceColumnMetadata; @@ -94,7 +94,7 @@ class SequenceColumnPartition { ZstdDecompressor compressed_input_decompressor; - explicit SequenceColumnPartition(Metadata* metadata); + explicit SequenceColumn(Metadata* metadata); [[nodiscard]] size_t numValues() const { return sequence_count; } diff --git a/src/silo/storage/column/sequence_column.test.cpp b/src/silo/storage/column/sequence_column.test.cpp index 604de1925..23514154b 100644 --- a/src/silo/storage/column/sequence_column.test.cpp +++ b/src/silo/storage/column/sequence_column.test.cpp @@ -9,12 +9,12 @@ using silo::Nucleotide; using silo::append::AppendException; using silo::storage::InsertionFormatException; +using silo::storage::column::SequenceColumn; using silo::storage::column::SequenceColumnMetadata; -using silo::storage::column::SequenceColumnPartition; TEST(SequenceColumn, validErrorOnBadInsertionFormat_noTwoParts) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_THAT( // NOLINTNEXTLINE(clang-diagnostic-error) @@ -28,7 +28,7 @@ TEST(SequenceColumn, validErrorOnBadInsertionFormat_noTwoParts) { TEST(SequenceColumn, validErrorOnBadInsertionFormat_firstPartNotANumber) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_THAT( // NOLINTNEXTLINE(clang-diagnostic-error) [&]() { under_test.append("A", 0, {"A:G"}); }, @@ -41,7 +41,7 @@ TEST(SequenceColumn, validErrorOnBadInsertionFormat_firstPartNotANumber) { TEST(SequenceColumn, validErrorOnBadInsertionFormat_secondPartIllegalSymbol) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_THAT( // NOLINTNEXTLINE(clang-diagnostic-error) [&]() { @@ -56,7 +56,7 @@ TEST(SequenceColumn, validErrorOnBadInsertionFormat_secondPartIllegalSymbol) { TEST(SequenceColumn, validErrorOnBadInsertionFormat_secondPartIsANumber) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_THAT( // NOLINTNEXTLINE(clang-diagnostic-error) [&]() { under_test.append("A", 0, {"0:0"}); }, @@ -68,7 +68,7 @@ TEST(SequenceColumn, validErrorOnBadInsertionFormat_secondPartIsANumber) { TEST(SequenceColumn, validErrorOnBadInsertionFormat_secondPartEmpty) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_THAT( // NOLINTNEXTLINE(clang-diagnostic-error) [&]() { under_test.append("A", 0, {"0:"}); }, @@ -81,7 +81,7 @@ TEST(SequenceColumn, validErrorOnBadInsertionFormat_secondPartEmpty) { TEST(SequenceColumn, validErrorOnBadInsertionFormat_firstPartEmpty) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_THAT( // NOLINTNEXTLINE(clang-diagnostic-error) [&]() { under_test.append("A", 0, {":A"}); }, @@ -94,7 +94,7 @@ TEST(SequenceColumn, validErrorOnBadInsertionFormat_firstPartEmpty) { TEST(SequenceColumn, validErrorOnNegativeInsertionPosition) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_THAT( // NOLINTNEXTLINE(clang-diagnostic-error) @@ -105,7 +105,7 @@ TEST(SequenceColumn, validErrorOnNegativeInsertionPosition) { TEST(SequenceColumn, validErrorOnInsertionPositionOutOfRange) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_THAT( // NOLINTNEXTLINE(clang-diagnostic-error) @@ -118,14 +118,14 @@ TEST(SequenceColumn, validErrorOnInsertionPositionOutOfRange) { TEST(SequenceColumn, validInsertionAtPositionZero) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_NO_THROW(under_test.append("A", 0, {"0:G"})); } TEST(SequenceColumn, validInsertionAtPositionEqualToGenomeLength) { SequenceColumnMetadata column_metadata{"test_column", {Nucleotide::Symbol::A}}; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); EXPECT_NO_THROW(under_test.append("A", 0, {"1:G"})); } @@ -135,7 +135,7 @@ TEST(SequenceColumn, canFinalizeTwice) { "test_column", {Nucleotide::Symbol::A, Nucleotide::Symbol::C, Nucleotide::Symbol::G, Nucleotide::Symbol::T} }; - SequenceColumnPartition under_test(&column_metadata); + SequenceColumn under_test(&column_metadata); under_test.append("AAGT", 0, std::vector{}); under_test.append("AAGT", 0, std::vector{}); diff --git a/src/silo/storage/column/string_column.cpp b/src/silo/storage/column/string_column.cpp index 98dc7d33b..af7f70994 100644 --- a/src/silo/storage/column/string_column.cpp +++ b/src/silo/storage/column/string_column.cpp @@ -11,10 +11,10 @@ using silo::common::TreeNodeId; namespace silo::storage::column { -StringColumnPartition::StringColumnPartition(StringColumnMetadata* metadata) +StringColumn::StringColumn(StringColumnMetadata* metadata) : metadata(metadata) {} -std::expected StringColumnPartition::insert(std::string_view value) { +std::expected StringColumn::insert(std::string_view value) { size_t row_id; if (value.size() <= SiloString::SHORT_STRING_SIZE) { row_id = fixed_string_data.insert(SiloString{value}); @@ -42,12 +42,12 @@ std::expected StringColumnPartition::insert(std::string_view return {}; } -void StringColumnPartition::insertNull() { +void StringColumn::insertNull() { null_bitmap.add(fixed_string_data.numValues()); fixed_string_data.insert(SiloString("")); } -bool StringColumnPartition::isNull(size_t row_id) const { +bool StringColumn::isNull(size_t row_id) const { return null_bitmap.contains(row_id); } diff --git a/src/silo/storage/column/string_column.h b/src/silo/storage/column/string_column.h index b14d913f1..9df1172a7 100644 --- a/src/silo/storage/column/string_column.h +++ b/src/silo/storage/column/string_column.h @@ -54,7 +54,7 @@ class StringColumnMetadata : public ColumnMetadata { StringColumnMetadata& operator=(StringColumnMetadata&& other) = delete; }; -class StringColumnPartition { +class StringColumn { public: using Metadata = StringColumnMetadata; @@ -74,7 +74,7 @@ class StringColumnPartition { vector::VariableDataRegistry variable_string_data; public: - explicit StringColumnPartition(Metadata* metadata); + explicit StringColumn(Metadata* metadata); [[nodiscard]] std::expected insert(std::string_view value); diff --git a/src/silo/storage/column/string_column.test.cpp b/src/silo/storage/column/string_column.test.cpp index a8b43b18c..ceb4020c8 100644 --- a/src/silo/storage/column/string_column.test.cpp +++ b/src/silo/storage/column/string_column.test.cpp @@ -10,13 +10,13 @@ #include "silo/common/phylo_tree.h" #include "silo/common/tree_node_id.h" +using silo::storage::column::StringColumn; using silo::storage::column::StringColumnMetadata; -using silo::storage::column::StringColumnPartition; // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST(StringColumnPartition, rawInsertedValuesRequeried) { +TEST(StringColumn, rawInsertedValuesRequeried) { StringColumnMetadata metadata{"string_column"}; - StringColumnPartition under_test(&metadata); + StringColumn under_test(&metadata); SILO_ASSERT(under_test.insert("value 1").has_value()); SILO_ASSERT(under_test.insert("value 2").has_value()); @@ -34,16 +34,16 @@ TEST(StringColumnPartition, rawInsertedValuesRequeried) { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST(StringColumnPartition, serializationOfMetadataWorks) { +TEST(StringColumn, serializationOfMetadataWorks) { auto phylo_tree = silo::common::PhyloTree::fromNewickString( "((CHILD2:0.5, CHILD3:1)CHILD:0.1, NOT_IN_DATASET:1.5)ROOT;" ); StringColumnMetadata metadata{"string_column", std::move(phylo_tree)}; - StringColumnPartition partition(&metadata); + StringColumn column(&metadata); - SILO_ASSERT(partition.insert("CHILD2").has_value()); - SILO_ASSERT(partition.insert("CHILD3").has_value()); - SILO_ASSERT(partition.insert("NOT_IN_TREE").has_value()); + SILO_ASSERT(column.insert("CHILD2").has_value()); + SILO_ASSERT(column.insert("CHILD3").has_value()); + SILO_ASSERT(column.insert("NOT_IN_TREE").has_value()); std::ostringstream oss; boost::archive::binary_oarchive oarchive(oss); @@ -77,12 +77,12 @@ TEST(StringColumnPartition, serializationOfMetadataWorks) { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST(StringColumnPartition, rawInsertedValuesWithPhyloTreeRequeried) { +TEST(StringColumn, rawInsertedValuesWithPhyloTreeRequeried) { auto phylo_tree = silo::common::PhyloTree::fromNewickString( "((CHILD2:0.5, CHILD3:1)CHILD:0.1, NOT_IN_DATASET:1.5)ROOT;" ); StringColumnMetadata metadata{"string_column", std::move(phylo_tree)}; - StringColumnPartition under_test(&metadata); + StringColumn under_test(&metadata); SILO_ASSERT(under_test.insert("CHILD2").has_value()); SILO_ASSERT(under_test.insert("CHILD3").has_value()); @@ -104,9 +104,9 @@ TEST(StringColumnPartition, rawInsertedValuesWithPhyloTreeRequeried) { EXPECT_EQ(tree_node_id_not_in_tree, std::nullopt); } -TEST(StringColumn, rawInsertedValuesRequeried) { +TEST(StringColumn, rawInsertedValuesRequeryLongValue) { StringColumnMetadata column("string_column"); - StringColumnPartition under_test{&column}; + StringColumn under_test{&column}; SILO_ASSERT(under_test.insert("value 1").has_value()); SILO_ASSERT(under_test.insert("value 2").has_value()); @@ -124,27 +124,27 @@ TEST(StringColumn, rawInsertedValuesRequeried) { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST(StringColumn, compareAcrossPartitions) { +TEST(StringColumn, compareAcrossColumns) { StringColumnMetadata under_test("string_column"); - StringColumnPartition partition_1{&under_test}; - SILO_ASSERT(partition_1.insert("value 1").has_value()); - SILO_ASSERT(partition_1.insert("value 2").has_value()); - SILO_ASSERT(partition_1.insert("value 2").has_value()); - SILO_ASSERT(partition_1.insert("value 3").has_value()); - SILO_ASSERT(partition_1.insert("some string that is a little longer 1").has_value()); - SILO_ASSERT(partition_1.insert("value 1").has_value()); - - StringColumnPartition partition_2{&under_test}; - SILO_ASSERT(partition_2.insert("other value 2").has_value()); - SILO_ASSERT(partition_2.insert("other values 3").has_value()); - SILO_ASSERT(partition_2.insert("value 1").has_value()); - SILO_ASSERT(partition_2.insert("other value 3").has_value()); - SILO_ASSERT(partition_2.insert("some string that is a little longer 1").has_value()); - SILO_ASSERT(partition_2.insert("other value 1").has_value()); - - EXPECT_EQ(partition_1.getValueString(0), partition_1.getValueString(5)); - EXPECT_EQ(partition_1.getValueString(5), partition_2.getValueString(2)); - EXPECT_EQ(partition_1.getValueString(4), partition_2.getValueString(4)); + StringColumn column_1{&under_test}; + SILO_ASSERT(column_1.insert("value 1").has_value()); + SILO_ASSERT(column_1.insert("value 2").has_value()); + SILO_ASSERT(column_1.insert("value 2").has_value()); + SILO_ASSERT(column_1.insert("value 3").has_value()); + SILO_ASSERT(column_1.insert("some string that is a little longer 1").has_value()); + SILO_ASSERT(column_1.insert("value 1").has_value()); + + StringColumn column_2{&under_test}; + SILO_ASSERT(column_2.insert("other value 2").has_value()); + SILO_ASSERT(column_2.insert("other values 3").has_value()); + SILO_ASSERT(column_2.insert("value 1").has_value()); + SILO_ASSERT(column_2.insert("other value 3").has_value()); + SILO_ASSERT(column_2.insert("some string that is a little longer 1").has_value()); + SILO_ASSERT(column_2.insert("other value 1").has_value()); + + EXPECT_EQ(column_1.getValueString(0), column_1.getValueString(5)); + EXPECT_EQ(column_1.getValueString(5), column_2.getValueString(2)); + EXPECT_EQ(column_1.getValueString(4), column_2.getValueString(4)); } TEST(StringColumn, manyLongValues) { @@ -155,15 +155,15 @@ TEST(StringColumn, manyLongValues) { } StringColumnMetadata under_test("string_column"); - StringColumnPartition partition{&under_test}; + StringColumn column{&under_test}; for (auto& value : test_values) { - SILO_ASSERT(partition.insert(value).has_value()); + SILO_ASSERT(column.insert(value).has_value()); } for (size_t i = 0; i < 50000; ++i) { - ASSERT_EQ(partition.getValue(i).fastCompare(test_values.at(i)), std::nullopt); - ASSERT_EQ(partition.getValueString(i), test_values.at(i)); + ASSERT_EQ(column.getValue(i).fastCompare(test_values.at(i)), std::nullopt); + ASSERT_EQ(column.getValueString(i), test_values.at(i)); } } @@ -182,23 +182,22 @@ TEST(StringColumn, manyMixedValues) { } StringColumnMetadata under_test("string_column"); - StringColumnPartition partition{&under_test}; + StringColumn column{&under_test}; for (auto& value : test_values) { - SILO_ASSERT(partition.insert(value).has_value()); + SILO_ASSERT(column.insert(value).has_value()); } for (size_t i = 0; i < 50001; ++i) { if (i % 2 == 1) { - ASSERT_TRUE(partition.getValue(i).fastCompare(test_values.at(i)).has_value()); + ASSERT_TRUE(column.getValue(i).fastCompare(test_values.at(i)).has_value()); ASSERT_EQ( - partition.getValue(i).fastCompare(test_values.at(i)).value(), - std::strong_ordering::equal + column.getValue(i).fastCompare(test_values.at(i)).value(), std::strong_ordering::equal ); - ASSERT_EQ(partition.getValueString(i), test_values.at(i)); + ASSERT_EQ(column.getValueString(i), test_values.at(i)); } else { - ASSERT_EQ(partition.getValue(i).fastCompare(test_values.at(i)), std::nullopt); - ASSERT_EQ(partition.getValueString(i), test_values.at(i)); + ASSERT_EQ(column.getValue(i).fastCompare(test_values.at(i)), std::nullopt); + ASSERT_EQ(column.getValueString(i), test_values.at(i)); } } } diff --git a/src/silo/storage/column/zstd_compressed_string_column.cpp b/src/silo/storage/column/zstd_compressed_string_column.cpp index 6951017fc..b8515c94e 100644 --- a/src/silo/storage/column/zstd_compressed_string_column.cpp +++ b/src/silo/storage/column/zstd_compressed_string_column.cpp @@ -11,33 +11,31 @@ ZstdCompressedStringColumnMetadata::ZstdCompressedStringColumnMetadata( decompressor(std::make_shared(dictionary_string)), dictionary_string(std::move(dictionary_string)) {} -ZstdCompressedStringColumnPartition::ZstdCompressedStringColumnPartition( - silo::storage::column::ZstdCompressedStringColumnPartition::Metadata* metadata +ZstdCompressedStringColumn::ZstdCompressedStringColumn( + silo::storage::column::ZstdCompressedStringColumn::Metadata* metadata ) : metadata(metadata) {} -void ZstdCompressedStringColumnPartition::reserve(size_t row_count) { +void ZstdCompressedStringColumn::reserve(size_t row_count) { values.reserve(row_count); } -void ZstdCompressedStringColumnPartition::insertNull() { +void ZstdCompressedStringColumn::insertNull() { null_bitmap.add(values.size()); values.emplace_back(); } -std::expected ZstdCompressedStringColumnPartition::insert(std::string_view value -) { +std::expected ZstdCompressedStringColumn::insert(std::string_view value) { auto compressed = metadata->compressor.compress(value.data(), value.size()); values.emplace_back(compressed); return {}; } -bool ZstdCompressedStringColumnPartition::isNull(size_t row_id) const { +bool ZstdCompressedStringColumn::isNull(size_t row_id) const { return null_bitmap.contains(row_id); } -std::optional ZstdCompressedStringColumnPartition::getDecompressed(size_t row_id -) const { +std::optional ZstdCompressedStringColumn::getDecompressed(size_t row_id) const { const auto value = values.at(row_id); if (value.empty()) { return std::nullopt; @@ -47,7 +45,7 @@ std::optional ZstdCompressedStringColumnPartition::getDecompressed( return result_buffer; } -std::optional ZstdCompressedStringColumnPartition::getCompressed(size_t row_id) const { +std::optional ZstdCompressedStringColumn::getCompressed(size_t row_id) const { auto value = values.at(row_id); if (value.empty()) { return std::nullopt; diff --git a/src/silo/storage/column/zstd_compressed_string_column.h b/src/silo/storage/column/zstd_compressed_string_column.h index 52ea47496..336b3c1c6 100644 --- a/src/silo/storage/column/zstd_compressed_string_column.h +++ b/src/silo/storage/column/zstd_compressed_string_column.h @@ -28,9 +28,7 @@ class ZstdCompressedStringColumnMetadata : public ColumnMetadata { ); }; -/// Holds information where to read unaligned sequences for a -/// segment (= the sequence of a particular name) in one partition. -class ZstdCompressedStringColumnPartition { +class ZstdCompressedStringColumn { public: using Metadata = ZstdCompressedStringColumnMetadata; @@ -44,7 +42,7 @@ class ZstdCompressedStringColumnPartition { roaring::Roaring null_bitmap; Metadata* metadata; - explicit ZstdCompressedStringColumnPartition(Metadata* metadata); + explicit ZstdCompressedStringColumn(Metadata* metadata); void reserve(size_t row_count); void insertNull(); diff --git a/src/silo/storage/column/zstd_compressed_string_column.test.cpp b/src/silo/storage/column/zstd_compressed_string_column.test.cpp index 574d9f4ff..9e4a7b613 100644 --- a/src/silo/storage/column/zstd_compressed_string_column.test.cpp +++ b/src/silo/storage/column/zstd_compressed_string_column.test.cpp @@ -2,9 +2,9 @@ #include -TEST(ZstdCompressedStringColumnPartition, insertValuesAndGetThemBack) { +TEST(ZstdCompressedStringColumn, insertValuesAndGetThemBack) { silo::storage::column::ZstdCompressedStringColumnMetadata column_metadata{"test_column", "ACGT"}; - silo::storage::column::ZstdCompressedStringColumnPartition under_test(&column_metadata); + silo::storage::column::ZstdCompressedStringColumn under_test(&column_metadata); std::vector> values_to_add{ "2020-01-01", "2023-01-05", "2021-12-03", "2025-01-01", std::nullopt, "2021-03-21", "asd" diff --git a/src/silo/storage/column_group.cpp b/src/silo/storage/column_group.cpp index 694792409..94ccd9435 100644 --- a/src/silo/storage/column_group.cpp +++ b/src/silo/storage/column_group.cpp @@ -21,110 +21,104 @@ namespace silo::storage { template <> -std::map& ColumnPartitionGroup::getColumns< - column::IndexedStringColumnPartition>() { +std::map& ColumnGroup::getColumns< + column::IndexedStringColumn>() { return indexed_string_columns; } template <> -std::map& ColumnPartitionGroup::getColumns< - column::StringColumnPartition>() { +std::map& ColumnGroup::getColumns() { return string_columns; } template <> -std::map& ColumnPartitionGroup::getColumns< - column::IntColumnPartition>() { +std::map& ColumnGroup::getColumns() { return int_columns; } template <> -std::map& ColumnPartitionGroup::getColumns< - column::BoolColumnPartition>() { +std::map& ColumnGroup::getColumns() { return bool_columns; } template <> -std::map& ColumnPartitionGroup::getColumns< - column::FloatColumnPartition>() { +std::map& ColumnGroup::getColumns() { return float_columns; } template <> -std::map& ColumnPartitionGroup::getColumns< - column::Date32ColumnPartition>() { +std::map& ColumnGroup::getColumns() { return date32_columns; } template <> -std::map>& ColumnPartitionGroup:: - getColumns>() { +std::map>& ColumnGroup::getColumns< + column::SequenceColumn>() { return nuc_columns; } template <> -std::map>& ColumnPartitionGroup::getColumns< - column::SequenceColumnPartition>() { +std::map>& ColumnGroup::getColumns< + column::SequenceColumn>() { return aa_columns; } template <> -std::map& ColumnPartitionGroup:: - getColumns() { +std::map& ColumnGroup::getColumns< + column::ZstdCompressedStringColumn>() { return zstd_compressed_string_columns; } template <> -const std::map& ColumnPartitionGroup::getColumns< - column::IndexedStringColumnPartition>() const { +const std::map& ColumnGroup::getColumns< + column::IndexedStringColumn>() const { return indexed_string_columns; } template <> -const std::map& ColumnPartitionGroup::getColumns< - column::StringColumnPartition>() const { +const std::map& ColumnGroup::getColumns( +) const { return string_columns; } template <> -const std::map& ColumnPartitionGroup::getColumns< - column::IntColumnPartition>() const { +const std::map& ColumnGroup::getColumns() const { return int_columns; } template <> -const std::map& ColumnPartitionGroup::getColumns< - column::BoolColumnPartition>() const { +const std::map& ColumnGroup::getColumns( +) const { return bool_columns; } template <> -const std::map& ColumnPartitionGroup::getColumns< - column::FloatColumnPartition>() const { +const std::map& ColumnGroup::getColumns( +) const { return float_columns; } template <> -const std::map& ColumnPartitionGroup::getColumns< - column::Date32ColumnPartition>() const { +const std::map& ColumnGroup::getColumns( +) const { return date32_columns; } template <> -const std::map>& ColumnPartitionGroup:: - getColumns>() const { +const std::map>& ColumnGroup::getColumns< + column::SequenceColumn>() const { return nuc_columns; } template <> -const std::map>& ColumnPartitionGroup:: - getColumns>() const { +const std::map>& ColumnGroup::getColumns< + column::SequenceColumn>() const { return aa_columns; } template <> -const std::map& ColumnPartitionGroup:: - getColumns() const { +const std::map& ColumnGroup::getColumns< + column::ZstdCompressedStringColumn>() const { return zstd_compressed_string_columns; } @@ -159,7 +153,7 @@ template std::expected getSequenceFromJsonLine( simdjson::ondemand::value& value, std::string_view column_name, - column::SequenceColumnPartition& sequence_column + column::SequenceColumn& sequence_column ) { // Determine sequence: try 'sequenceCompressed' (base64-encoded zstd-compressed) first, // then fall back to plain 'sequence'. @@ -201,12 +195,11 @@ std::expected getSequenceFromJsonLine( template std::expected insertToSequenceColumn( - ColumnPartitionGroup& columns, + ColumnGroup& columns, const schema::ColumnIdentifier& column, simdjson::ondemand::value& value ) { - auto& sequence_column = - columns.getColumns>().at(column.name); + auto& sequence_column = columns.getColumns>().at(column.name); bool is_null; auto error = value.is_null().get(is_null); RAISE_STRING_ERROR_WITH_CONTEXT(error, value, "error checking value for null: {}"); @@ -252,7 +245,7 @@ class ColumnValueInserter { public: template std::expected operator()( - ColumnPartitionGroup& columns, + ColumnGroup& columns, const schema::ColumnIdentifier& column, simdjson::ondemand::value& value ) { @@ -278,8 +271,8 @@ class ColumnValueInserter { }; template <> -std::expected ColumnValueInserter::operator()( - ColumnPartitionGroup& columns, +std::expected ColumnValueInserter::operator()( + ColumnGroup& columns, const schema::ColumnIdentifier& column, simdjson::ondemand::value& value ) { @@ -287,24 +280,21 @@ std::expected ColumnValueInserter::operator()().at(column.name).insertNull(); + columns.getColumns().at(column.name).insertNull(); } else { std::string_view column_value; error = value.get(column_value); RAISE_STRING_ERROR_WITH_CONTEXT( error, value, "error getting value as string: {}. {}", value.raw_json_token() ); - return columns.getColumns() - .at(column.name) - .insert(column_value); + return columns.getColumns().at(column.name).insert(column_value); } return {}; } template <> -std::expected ColumnValueInserter::operator( -)>( - ColumnPartitionGroup& columns, +std::expected ColumnValueInserter::operator()>( + ColumnGroup& columns, const schema::ColumnIdentifier& column, simdjson::ondemand::value& value ) { @@ -313,8 +303,8 @@ std::expected ColumnValueInserter::operator( template <> std::expected ColumnValueInserter::operator( -)>( - ColumnPartitionGroup& columns, +)>( + ColumnGroup& columns, const schema::ColumnIdentifier& column, simdjson::ondemand::value& value ) { @@ -323,11 +313,11 @@ std::expected ColumnValueInserter::operator( } // namespace -std::expected ColumnPartitionGroup::addJsonValueToColumn( +std::expected ColumnGroup::addJsonValueToColumn( const schema::ColumnIdentifier& column, simdjson::ondemand::value& value ) { - EVOBENCH_SCOPE_EVERY(1000, "ColumnPartitionGroup", "addJsonValueToColumn"); + EVOBENCH_SCOPE_EVERY(1000, "ColumnGroup", "addJsonValueToColumn"); auto success = column::visit(column.type, ColumnValueInserter{}, *this, column, value); if (!success.has_value()) { return std::unexpected( diff --git a/src/silo/storage/column_group.h b/src/silo/storage/column_group.h index 44fe59dd9..02a3566c9 100644 --- a/src/silo/storage/column_group.h +++ b/src/silo/storage/column_group.h @@ -23,7 +23,7 @@ namespace silo::storage { -class ColumnPartitionGroup { +class ColumnGroup { friend class boost::serialization::access; template @@ -62,16 +62,15 @@ class ColumnPartitionGroup { public: std::vector metadata; - std::map string_columns; - std::map indexed_string_columns; - std::map bool_columns; - std::map int_columns; - std::map float_columns; - std::map date32_columns; - std::map> nuc_columns; - std::map> aa_columns; - std::map - zstd_compressed_string_columns; + std::map string_columns; + std::map indexed_string_columns; + std::map bool_columns; + std::map int_columns; + std::map float_columns; + std::map date32_columns; + std::map> nuc_columns; + std::map> aa_columns; + std::map zstd_compressed_string_columns; std::expected addJsonValueToColumn( const schema::ColumnIdentifier& column_identifier, diff --git a/src/silo/storage/column_group.test.cpp b/src/silo/storage/column_group.test.cpp index 3fea588a8..bf0ea4cb0 100644 --- a/src/silo/storage/column_group.test.cpp +++ b/src/silo/storage/column_group.test.cpp @@ -23,17 +23,17 @@ using silo::Nucleotide; using silo::schema::ColumnIdentifier; -using silo::storage::ColumnPartitionGroup; -using silo::storage::column::BoolColumnPartition; +using silo::storage::ColumnGroup; +using silo::storage::column::BoolColumn; using silo::storage::column::Column; using silo::storage::column::ColumnMetadata; -using silo::storage::column::Date32ColumnPartition; -using silo::storage::column::FloatColumnPartition; -using silo::storage::column::IntColumnPartition; +using silo::storage::column::Date32Column; +using silo::storage::column::FloatColumn; +using silo::storage::column::IntColumn; +using silo::storage::column::SequenceColumn; using silo::storage::column::SequenceColumnMetadata; -using silo::storage::column::SequenceColumnPartition; +using silo::storage::column::StringColumn; using silo::storage::column::StringColumnMetadata; -using silo::storage::column::StringColumnPartition; namespace { @@ -43,24 +43,22 @@ std::expected setupColumnAndInsertJson( const std::string& json_string ) { std::unique_ptr meta; - if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v>) { meta = std::make_unique>( column_name, std::vector{Nucleotide::Symbol::A} ); } else { meta = std::make_unique(column_name); } - ColumnPartitionGroup partition_group; - partition_group.getColumns().emplace(column_name, ColumnType{meta.get()}); + ColumnGroup column_group; + column_group.getColumns().emplace(column_name, ColumnType{meta.get()}); simdjson::ondemand::parser parser; const simdjson::padded_string json(json_string); auto doc = parser.iterate(json).value_unsafe(); simdjson::ondemand::value val = doc[column_name].value_unsafe(); - return partition_group.addJsonValueToColumn( - ColumnIdentifier{column_name, ColumnType::TYPE}, val - ); + return column_group.addJsonValueToColumn(ColumnIdentifier{column_name, ColumnType::TYPE}, val); } std::expected setupNucleotideColumnAndInsertJson( @@ -71,9 +69,9 @@ std::expected setupNucleotideColumnAndInsertJson( auto meta = std::make_unique>( column_name, std::vector{reference} ); - ColumnPartitionGroup partition_group; - partition_group.getColumns>().emplace( - column_name, SequenceColumnPartition{meta.get()} + ColumnGroup column_group; + column_group.getColumns>().emplace( + column_name, SequenceColumn{meta.get()} ); simdjson::ondemand::parser parser; @@ -81,8 +79,8 @@ std::expected setupNucleotideColumnAndInsertJson( auto doc = parser.iterate(json).value_unsafe(); simdjson::ondemand::value val = doc[column_name].value_unsafe(); - return partition_group.addJsonValueToColumn( - ColumnIdentifier{.name = column_name, .type = SequenceColumnPartition::TYPE}, val + return column_group.addJsonValueToColumn( + ColumnIdentifier{.name = column_name, .type = SequenceColumn::TYPE}, val ); } @@ -99,9 +97,8 @@ std::string compressAndBase64Encode(std::string_view sequence, const std::string } // namespace -TEST(ColumnPartitionGroup, givenIntegerValueForBoolColumn_returnsColumnInsertError) { - const auto result = - setupColumnAndInsertJson("bool_col", R"({"bool_col": 42})"); +TEST(ColumnGroup, givenIntegerValueForBoolColumn_returnsColumnInsertError) { + const auto result = setupColumnAndInsertJson("bool_col", R"({"bool_col": 42})"); ASSERT_FALSE(result.has_value()); EXPECT_THAT( @@ -112,9 +109,8 @@ TEST(ColumnPartitionGroup, givenIntegerValueForBoolColumn_returnsColumnInsertErr ); } -TEST(ColumnPartitionGroup, givenStringValueForIntColumn_returnsColumnInsertError) { - const auto result = - setupColumnAndInsertJson("int_col", R"({"int_col": "hello"})"); +TEST(ColumnGroup, givenStringValueForIntColumn_returnsColumnInsertError) { + const auto result = setupColumnAndInsertJson("int_col", R"({"int_col": "hello"})"); ASSERT_FALSE(result.has_value()); EXPECT_THAT( @@ -125,9 +121,9 @@ TEST(ColumnPartitionGroup, givenStringValueForIntColumn_returnsColumnInsertError ); } -TEST(ColumnPartitionGroup, givenStringValueForFloatColumn_returnsColumnInsertError) { +TEST(ColumnGroup, givenStringValueForFloatColumn_returnsColumnInsertError) { const auto result = - setupColumnAndInsertJson("float_col", R"({"float_col": "hello"})"); + setupColumnAndInsertJson("float_col", R"({"float_col": "hello"})"); ASSERT_FALSE(result.has_value()); EXPECT_THAT( @@ -138,9 +134,9 @@ TEST(ColumnPartitionGroup, givenStringValueForFloatColumn_returnsColumnInsertErr ); } -TEST(ColumnPartitionGroup, givenIntegerValueForStringColumn_returnsColumnInsertError) { +TEST(ColumnGroup, givenIntegerValueForStringColumn_returnsColumnInsertError) { const auto result = - setupColumnAndInsertJson("string_col", R"({"string_col": 42})"); + setupColumnAndInsertJson("string_col", R"({"string_col": 42})"); ASSERT_FALSE(result.has_value()); EXPECT_THAT( @@ -151,9 +147,8 @@ TEST(ColumnPartitionGroup, givenIntegerValueForStringColumn_returnsColumnInsertE ); } -TEST(ColumnPartitionGroup, givenIntegerValueForDate32Column_returnsColumnInsertError) { - const auto result = - setupColumnAndInsertJson("date_col", R"({"date_col": 42})"); +TEST(ColumnGroup, givenIntegerValueForDate32Column_returnsColumnInsertError) { + const auto result = setupColumnAndInsertJson("date_col", R"({"date_col": 42})"); ASSERT_FALSE(result.has_value()); EXPECT_THAT( @@ -164,10 +159,9 @@ TEST(ColumnPartitionGroup, givenIntegerValueForDate32Column_returnsColumnInsertE ); } -TEST(ColumnPartitionGroup, givenObjectMissingSequenceField_returnsColumnInsertError) { - const auto result = setupColumnAndInsertJson>( - "nuc_col", R"({"nuc_col": {}})" - ); +TEST(ColumnGroup, givenObjectMissingSequenceField_returnsColumnInsertError) { + const auto result = + setupColumnAndInsertJson>("nuc_col", R"({"nuc_col": {}})"); ASSERT_FALSE(result.has_value()); EXPECT_THAT( @@ -178,8 +172,8 @@ TEST(ColumnPartitionGroup, givenObjectMissingSequenceField_returnsColumnInsertEr ); } -TEST(ColumnPartitionGroup, givenObjectMissingInsertionsField_returnsColumnInsertError) { - const auto result = setupColumnAndInsertJson>( +TEST(ColumnGroup, givenObjectMissingInsertionsField_returnsColumnInsertError) { + const auto result = setupColumnAndInsertJson>( "nuc_col", R"({"nuc_col": {"sequence": "A"}})" ); @@ -192,7 +186,7 @@ TEST(ColumnPartitionGroup, givenObjectMissingInsertionsField_returnsColumnInsert ); } -TEST(ColumnPartitionGroup, givenValidSequenceCompressed_succeeds) { +TEST(ColumnGroup, givenValidSequenceCompressed_succeeds) { const std::vector reference = { Nucleotide::Symbol::A, Nucleotide::Symbol::C, Nucleotide::Symbol::G, Nucleotide::Symbol::T }; @@ -208,7 +202,7 @@ TEST(ColumnPartitionGroup, givenValidSequenceCompressed_succeeds) { ASSERT_TRUE(result.has_value()); } -TEST(ColumnPartitionGroup, givenSequenceCompressedWithMutation_succeeds) { +TEST(ColumnGroup, givenSequenceCompressedWithMutation_succeeds) { const std::vector reference = { Nucleotide::Symbol::A, Nucleotide::Symbol::C, Nucleotide::Symbol::G, Nucleotide::Symbol::T }; @@ -225,7 +219,7 @@ TEST(ColumnPartitionGroup, givenSequenceCompressedWithMutation_succeeds) { ASSERT_TRUE(result.has_value()); } -TEST(ColumnPartitionGroup, givenSequenceCompressedMultipleRows_succeeds) { +TEST(ColumnGroup, givenSequenceCompressedMultipleRows_succeeds) { std::vector reference = { Nucleotide::Symbol::A, Nucleotide::Symbol::C, Nucleotide::Symbol::G, Nucleotide::Symbol::T }; @@ -233,9 +227,9 @@ TEST(ColumnPartitionGroup, givenSequenceCompressedMultipleRows_succeeds) { auto meta = std::make_unique>("nuc_col", std::move(reference)); - ColumnPartitionGroup partition_group; - partition_group.getColumns>().emplace( - "nuc_col", SequenceColumnPartition{meta.get()} + ColumnGroup column_group; + column_group.getColumns>().emplace( + "nuc_col", SequenceColumn{meta.get()} ); for (const std::string_view sequence : {"ACGT", "ATGT", "ACGT"}) { @@ -248,14 +242,14 @@ TEST(ColumnPartitionGroup, givenSequenceCompressedMultipleRows_succeeds) { auto doc = parser.iterate(padded).value_unsafe(); simdjson::ondemand::value val = doc["nuc_col"].value_unsafe(); - const auto result = partition_group.addJsonValueToColumn( - ColumnIdentifier{.name = "nuc_col", .type = SequenceColumnPartition::TYPE}, val + const auto result = column_group.addJsonValueToColumn( + ColumnIdentifier{.name = "nuc_col", .type = SequenceColumn::TYPE}, val ); ASSERT_TRUE(result.has_value()); } } -TEST(ColumnPartitionGroup, givenSequenceCompressedWithInvalidBase64_returnsError) { +TEST(ColumnGroup, givenSequenceCompressedWithInvalidBase64_returnsError) { const std::vector reference = {Nucleotide::Symbol::A}; const auto result = setupNucleotideColumnAndInsertJson( @@ -268,7 +262,7 @@ TEST(ColumnPartitionGroup, givenSequenceCompressedWithInvalidBase64_returnsError EXPECT_THAT(result.error(), testing::HasSubstr("invalid base64")); } -TEST(ColumnPartitionGroup, givenSequenceCompressedWithInvalidZstdData_returnsError) { +TEST(ColumnGroup, givenSequenceCompressedWithInvalidZstdData_returnsError) { const std::vector reference = { Nucleotide::Symbol::A, Nucleotide::Symbol::C, Nucleotide::Symbol::G, Nucleotide::Symbol::T }; diff --git a/src/silo/storage/table.cpp b/src/silo/storage/table.cpp index 1dccfc09b..530a9dcb1 100644 --- a/src/silo/storage/table.cpp +++ b/src/silo/storage/table.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -17,17 +18,43 @@ #include "evobench/evobench.hpp" #include "silo/persistence/exception.h" +#include "silo/preprocessing/preprocessing_exception.h" #include "silo/roaring_util/roaring_serialize.h" #include "silo/schema/duplicate_primary_key_exception.h" +#include "silo/storage/column/column_type_visitor.h" namespace silo::storage { -size_t Table::getNumberOfPartitions() const { - return partitions.size(); +Table::Table(std::shared_ptr schema) + : schema(std::move(schema)) { + auto column_initializer = []( + ColumnGroup& column_group, + const silo::schema::ColumnIdentifier& column_identifier, + silo::schema::TableSchema& table_schema + ) { + ColumnType column(table_schema.getColumnMetadata(column_identifier.name).value()); + column_group.metadata.emplace_back(column_identifier); + column_group.getColumns().emplace(column_identifier.name, std::move(column)); + }; + for (const auto& col : this->schema->getColumnIdentifiers()) { + column::visit(col.type, column_initializer, columns, col, *this->schema); + } } void Table::validate() const { validatePrimaryKeyUnique(); + validateNucleotideSequences(); + validateAminoAcidSequences(); + validateMetadataColumns(); +} + +void Table::finalize() { + for (auto& [_, sequence_column] : columns.nuc_columns) { + sequence_column.finalize(); + } + for (auto& [_, sequence_column] : columns.aa_columns) { + sequence_column.finalize(); + } } void Table::validatePrimaryKeyUnique() const { @@ -35,37 +62,75 @@ void Table::validatePrimaryKeyUnique() const { const auto primary_key = schema->primary_key; SILO_ASSERT(primary_key.type == schema::ColumnType::STRING); - size_t total_rows = 0; - for (const auto& partition : partitions) { - total_rows += partition->sequence_count; - } + const auto& primary_key_column = columns.string_columns.at(primary_key.name); + auto num_values = primary_key_column.numValues(); std::unordered_set unique_keys; - unique_keys.reserve(total_rows); - for (const auto& partition : partitions) { - auto& primary_key_column = partition->columns.string_columns.at(primary_key.name); - auto num_values = primary_key_column.numValues(); - for (size_t i = 0; i < num_values; ++i) { - std::string value = primary_key_column.getValueString(i); - if (unique_keys.contains(value)) { - throw schema::DuplicatePrimaryKeyException("Found duplicate primary key {}", value); - } - unique_keys.insert(value); + unique_keys.reserve(num_values); + for (size_t i = 0; i < num_values; ++i) { + std::string value = primary_key_column.getValueString(i); + if (unique_keys.contains(value)) { + throw schema::DuplicatePrimaryKeyException("Found duplicate primary key {}", value); } + unique_keys.insert(value); } SPDLOG_DEBUG("Found {} distinct primary keys.", unique_keys.size()); } -std::shared_ptr Table::getPartition(size_t partition_idx) const { - return partitions.at(partition_idx); +void Table::validateNucleotideSequences() const { + for (const auto& [name, nuc_column] : columns.nuc_columns) { + if (nuc_column.sequence_count > sequence_count) { + SILO_PANIC( + "nuc_store {} ({}) has invalid size (expected {}).", + name, + nuc_column.sequence_count, + sequence_count + ); + } + if (nuc_column.metadata->reference_sequence.empty()) { + SILO_PANIC("reference_sequence {} is empty.", name); + } + } +} + +void Table::validateAminoAcidSequences() const { + for (const auto& [name, aa_column] : columns.aa_columns) { + if (aa_column.sequence_count > sequence_count) { + SILO_PANIC( + "aa_store {} ({}) has invalid size (expected {}).", + name, + aa_column.sequence_count, + sequence_count + ); + } + if (aa_column.metadata->reference_sequence.empty()) { + SILO_PANIC("reference_sequence {} is empty.", name); + } + } } -std::shared_ptr Table::getPartition(size_t partition_idx) { - return partitions.at(partition_idx); +template +void Table::validateColumnsHaveSize( + const std::map& columnsOfTheType, + const std::string& columnType +) const { + for (const auto& col : columnsOfTheType) { + if (col.second.numValues() != sequence_count) { + throw preprocessing::PreprocessingException( + columnType + " " + col.first + " has invalid size " + + std::to_string(col.second.numValues()) + ); + } + } } -std::shared_ptr Table::addPartition() { - return partitions.emplace_back(std::make_shared(*schema)); +void Table::validateMetadataColumns() const { + validateColumnsHaveSize(columns.date32_columns, "date32_columns"); + validateColumnsHaveSize(columns.bool_columns, "bool_columns"); + validateColumnsHaveSize(columns.int_columns, "int_columns"); + validateColumnsHaveSize(columns.indexed_string_columns, "indexed_string_columns"); + validateColumnsHaveSize(columns.string_columns, "string_columns"); + validateColumnsHaveSize(columns.float_columns, "float_columns"); } namespace { @@ -90,48 +155,28 @@ std::ofstream openOutputFileOrThrow(const std::string& path) { } // namespace -void Table::saveData(const std::filesystem::path& save_directory) { +void Table::saveData(const std::filesystem::path& path) { EVOBENCH_SCOPE("Table", "saveData"); - std::vector partition_archives; - for (uint32_t i = 0; i < getNumberOfPartitions(); ++i) { - const auto& partition_archive = save_directory / ("P" + std::to_string(i) + ".silo"); - partition_archives.emplace_back(openOutputFileOrThrow(partition_archive)); - - if (!partition_archives.back()) { - throw persistence::SaveDatabaseException( - "Cannot open partition output file " + partition_archive.string() + " for saving" - ); - } + auto output_file = openOutputFileOrThrow(path); + if (!output_file) { + throw persistence::SaveDatabaseException( + "Cannot open output file " + path.string() + " for saving" + ); } - SPDLOG_INFO("Saving {} partitions...", getNumberOfPartitions()); - for (size_t partition_idx = 0; partition_idx < getNumberOfPartitions(); ++partition_idx) { - ::boost::archive::binary_oarchive output_archive(partition_archives[partition_idx]); - partitions[partition_idx]->serializeData(output_archive, 0); - } - SPDLOG_INFO("Finished saving partitions"); + SPDLOG_INFO("Saving table data..."); + ::boost::archive::binary_oarchive output_archive(output_file); + serializeData(output_archive, 0); + SPDLOG_INFO("Finished saving table data"); } -void Table::loadData(const std::filesystem::path& save_directory) { +void Table::loadData(const std::filesystem::path& path) { EVOBENCH_SCOPE("Table", "loadData"); - std::vector file_vec; - for (const std::filesystem::path& file : std::filesystem::directory_iterator(save_directory)) { - if (file.extension() == ".silo") { - file_vec.emplace_back(openInputFileOrThrow(file)); - addPartition(); - - if (!file_vec.back()) { - throw persistence::SaveDatabaseException( - fmt::format("Cannot open partition input file {} for loading", file.string()) - ); - } - } - } - for (size_t partition_index = 0; partition_index != getNumberOfPartitions(); ++partition_index) { - ::boost::archive::binary_iarchive input_archive(file_vec[partition_index]); - partitions[partition_index]->serializeData(input_archive, 0); - } - SPDLOG_INFO("Finished loading partition data"); + + auto input_file = openInputFileOrThrow(path); + ::boost::archive::binary_iarchive input_archive(input_file); + serializeData(input_archive, 0); + SPDLOG_INFO("Finished loading table data"); } } // namespace silo::storage diff --git a/src/silo/storage/table.h b/src/silo/storage/table.h index 449ec82e5..e8be978b0 100644 --- a/src/silo/storage/table.h +++ b/src/silo/storage/table.h @@ -1,32 +1,57 @@ #pragma once +#include +#include +#include +#include + +#include + #include "silo/schema/database_schema.h" -#include "silo/storage/table_partition.h" +#include "silo/storage/column_group.h" namespace silo::storage { class Table { - std::vector> partitions; - public: std::shared_ptr schema; + storage::ColumnGroup columns; + uint32_t sequence_count = 0; - explicit Table(std::shared_ptr schema) - : schema(std::move(schema)) {} - - [[nodiscard]] size_t getNumberOfPartitions() const; + explicit Table(std::shared_ptr schema); - [[nodiscard]] std::shared_ptr getPartition(size_t partition_idx) const; + Table(Table&& other) = default; + Table& operator=(Table&& other) = default; - std::shared_ptr getPartition(size_t partition_idx); + Table(const Table& other) = delete; + Table& operator=(const Table& other) = delete; - std::shared_ptr addPartition(); + template + void serializeData(Archive& archive, [[maybe_unused]] const uint32_t version) { + // clang-format off + archive & columns; + archive & sequence_count; + // clang-format on + } void validate() const; - void loadData(const std::filesystem::path& save_directory); - void saveData(const std::filesystem::path& save_directory); + void finalize(); + + void loadData(const std::filesystem::path& path); + void saveData(const std::filesystem::path& path); void validatePrimaryKeyUnique() const; + + private: + void validateNucleotideSequences() const; + void validateAminoAcidSequences() const; + void validateMetadataColumns() const; + + template + void validateColumnsHaveSize( + const std::map& columnsOfTheType, + const std::string& columnType + ) const; }; } // namespace silo::storage diff --git a/src/silo/storage/table_partition.cpp b/src/silo/storage/table_partition.cpp deleted file mode 100644 index d478b86ab..000000000 --- a/src/silo/storage/table_partition.cpp +++ /dev/null @@ -1,100 +0,0 @@ -#include "silo/storage/table_partition.h" - -#include - -#include "silo/preprocessing/preprocessing_exception.h" -#include "silo/storage/column/column_type_visitor.h" - -namespace silo::storage { - -void TablePartition::validate() const { - validateNucleotideSequences(); - validateAminoAcidSequences(); - validateMetadataColumns(); -} - -void TablePartition::finalize() { - for (auto& [_, sequence_column] : columns.nuc_columns) { - sequence_column.finalize(); - } - for (auto& [_, sequence_column] : columns.aa_columns) { - sequence_column.finalize(); - } -} - -TablePartition::TablePartition(schema::TableSchema& schema) { - auto column_initializer = []( - ColumnPartitionGroup& column_group, - const schema::ColumnIdentifier& column_identifier, - schema::TableSchema& schema - ) { - ColumnType column(schema.getColumnMetadata(column_identifier.name).value()); - column_group.metadata.emplace_back(column_identifier); - column_group.getColumns().emplace(column_identifier.name, std::move(column)); - }; - for (const auto& column : schema.getColumnIdentifiers()) { - column::visit(column.type, column_initializer, columns, column, schema); - } -} - -void TablePartition::validateNucleotideSequences() const { - const size_t partition_size = sequence_count; - - for (const auto& [name, nuc_column] : columns.nuc_columns) { - if (nuc_column.sequence_count > partition_size) { - SILO_PANIC( - "nuc_store {} ({}) has invalid size (expected {}).", - name, - nuc_column.sequence_count, - partition_size - ); - } - if (nuc_column.metadata->reference_sequence.empty()) { - SILO_PANIC("reference_sequence {} is empty.", name); - } - } -} - -void TablePartition::validateAminoAcidSequences() const { - const size_t partition_size = sequence_count; - - for (const auto& [name, aa_column] : columns.aa_columns) { - if (aa_column.sequence_count > partition_size) { - SILO_PANIC( - "aa_store {} ({}) has invalid size (expected {}).", - name, - aa_column.sequence_count, - partition_size - ); - } - if (aa_column.metadata->reference_sequence.empty()) { - SILO_PANIC("reference_sequence {} is empty.", name); - } - } -} - -template -void TablePartition::validateColumnsHaveSize( - const std::map& columnsOfTheType, - const std::string& columnType -) const { - for (const auto& column : columnsOfTheType) { - if (column.second.numValues() != sequence_count) { - throw preprocessing::PreprocessingException( - columnType + " " + column.first + " has invalid size " + - std::to_string(column.second.numValues()) - ); - } - } -} - -void TablePartition::validateMetadataColumns() const { - validateColumnsHaveSize(columns.date32_columns, "date32_columns"); - validateColumnsHaveSize(columns.bool_columns, "bool_columns"); - validateColumnsHaveSize(columns.int_columns, "int_columns"); - validateColumnsHaveSize(columns.indexed_string_columns, "indexed_string_columns"); - validateColumnsHaveSize(columns.string_columns, "string_columns"); - validateColumnsHaveSize(columns.float_columns, "float_columns"); -} - -} // namespace silo::storage diff --git a/src/silo/storage/table_partition.h b/src/silo/storage/table_partition.h deleted file mode 100644 index 76de8838c..000000000 --- a/src/silo/storage/table_partition.h +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -#include "silo/schema/database_schema.h" -#include "silo/storage/column_group.h" - -namespace silo::storage { - -class TablePartition { - public: - template - /// The data of partitions is serialized in parallel. Therefore it is not part of the default - /// serialization method - void serializeData(Archive& archive, [[maybe_unused]] const uint32_t version) { - // clang-format off - archive & columns; - archive & sequence_count; - // clang-format on - } - - storage::ColumnPartitionGroup columns; - uint32_t sequence_count = 0; - - explicit TablePartition(schema::TableSchema& schema); - - TablePartition(TablePartition&& other) = default; - TablePartition& operator=(TablePartition&& other) = default; - - TablePartition(const TablePartition& other) = delete; - TablePartition& operator=(const TablePartition& other) = delete; - - void validate() const; - - void finalize(); - - private: - void validateNucleotideSequences() const; - - void validateAminoAcidSequences() const; - - void validateMetadataColumns() const; - - template - void validateColumnsHaveSize( - const std::map& columnsOfTheType, - const std::string& columnType - ) const; -}; - -} // namespace silo::storage diff --git a/testBaseData/siloSerializedState/1774509931/data_version.silo b/testBaseData/siloSerializedState/1774509931/data_version.silo deleted file mode 100644 index 8259d478b..000000000 --- a/testBaseData/siloSerializedState/1774509931/data_version.silo +++ /dev/null @@ -1,2 +0,0 @@ -timestamp: 1774509931 -serializationVersion: 1774509839 \ No newline at end of file diff --git a/testBaseData/siloSerializedState/1774967818/data_version.silo b/testBaseData/siloSerializedState/1774967818/data_version.silo new file mode 100644 index 000000000..23e7ea991 --- /dev/null +++ b/testBaseData/siloSerializedState/1774967818/data_version.silo @@ -0,0 +1,2 @@ +timestamp: 1774967818 +serializationVersion: 1774967790 \ No newline at end of file diff --git a/testBaseData/siloSerializedState/1774509931/database_schema.silo b/testBaseData/siloSerializedState/1774967818/database_schema.silo similarity index 100% rename from testBaseData/siloSerializedState/1774509931/database_schema.silo rename to testBaseData/siloSerializedState/1774967818/database_schema.silo diff --git a/testBaseData/siloSerializedState/1774509931/default/P0.silo b/testBaseData/siloSerializedState/1774967818/default.silo similarity index 99% rename from testBaseData/siloSerializedState/1774509931/default/P0.silo rename to testBaseData/siloSerializedState/1774967818/default.silo index fb7159d2f..d325e4310 100644 Binary files a/testBaseData/siloSerializedState/1774509931/default/P0.silo and b/testBaseData/siloSerializedState/1774967818/default.silo differ