diff --git a/Makefile b/Makefile index 853d4ab70..8ceeb2fcd 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,6 @@ CMAKE_BUILD_PARALLEL_LEVEL ?= 16 .PHONY: ci ci: format all-tests -.PHONY: conanprofile conanprofile: buildScripts/create-conanprofile diff --git a/documentation/input_format.md b/documentation/input_format.md index b83060ebe..fa6f8037e 100644 --- a/documentation/input_format.md +++ b/documentation/input_format.md @@ -58,6 +58,8 @@ defaultAminoAcidSequence: E - `type`: One of `string`, `int`, `float`, `date`, `boolean` - `generateIndex`: Set to `true` to create a bitmap index for fast equality lookups. This is only valid for `string` columns - `generateLineageIndex`: Path to lineage definition file for hierarchical queries. This is only possible if `generateIndex` is also set +- `treatUnknownLineagesAsNull`: Set to `true` to treat values that are missing from the lineage definition file as null in the lineage index instead of rejecting them with an error. This only has an effect if `generateLineageIndex` is also set +- `isPhyloTreeField`: Set to `true` to mark this column as a phyloTreeField, which enables phylogenetic queries. See [phylogenetic_queries.md](phylogenetic_queries.md) `defaultNucleotideSequence` and `defaultAminoAcidSequence` are optional and set the default sequence to be searched for, so that the sequence name can be omitted in queries. 
diff --git a/src/silo/common/serialization_version.txt b/src/silo/common/serialization_version.txt index 4076d0809..5a509d115 100644 --- a/src/silo/common/serialization_version.txt +++ b/src/silo/common/serialization_version.txt @@ -1 +1 @@ -1774967790 +1776329324 diff --git a/src/silo/config/database_config.cpp b/src/silo/config/database_config.cpp index 41a95d418..cbb0c71a7 100644 --- a/src/silo/config/database_config.cpp +++ b/src/silo/config/database_config.cpp @@ -149,6 +149,11 @@ bool YAML::convert::decode( } else { metadata.phylo_tree_node_identifier = false; } + if (node["treatUnknownLineagesAsNull"].IsDefined()) { + metadata.treat_unknown_lineages_as_null = node["treatUnknownLineagesAsNull"].as(); + } else { + metadata.treat_unknown_lineages_as_null = false; + } return true; } YAML::Node YAML::convert::encode( @@ -164,6 +169,9 @@ YAML::Node YAML::convert::encode( if (metadata.phylo_tree_node_identifier) { node["isPhyloTreeField"] = true; } + if (metadata.treat_unknown_lineages_as_null) { + node["treatUnknownLineagesAsNull"] = true; + } return node; } diff --git a/src/silo/config/database_config.h b/src/silo/config/database_config.h index c3d4bb11d..43b254173 100644 --- a/src/silo/config/database_config.h +++ b/src/silo/config/database_config.h @@ -24,6 +24,7 @@ class DatabaseMetadata { bool generate_index; std::optional generate_lineage_index; bool phylo_tree_node_identifier; + bool treat_unknown_lineages_as_null; [[nodiscard]] schema::ColumnType getColumnType() const; }; diff --git a/src/silo/initialize/initializer.cpp b/src/silo/initialize/initializer.cpp index 1b79465e9..1d7c9c2f1 100644 --- a/src/silo/initialize/initializer.cpp +++ b/src/silo/initialize/initializer.cpp @@ -84,7 +84,7 @@ void ColumnMetadataInitializer::operator() ); } metadata = std::make_shared( - config_metadata.name, lineage_tree.value() + config_metadata.name, lineage_tree.value(), config_metadata.treat_unknown_lineages_as_null ); } else { metadata = diff --git 
a/src/silo/storage/column/indexed_string_column.cpp b/src/silo/storage/column/indexed_string_column.cpp index b8e51f489..e616ac0d0 100644 --- a/src/silo/storage/column/indexed_string_column.cpp +++ b/src/silo/storage/column/indexed_string_column.cpp @@ -10,20 +10,24 @@ namespace silo::storage::column { IndexedStringColumnMetadata::IndexedStringColumnMetadata( std::string column_name, - common::LineageTreeAndIdMap lineage_tree_and_id_map + common::LineageTreeAndIdMap lineage_tree_and_id_map, + bool treat_unknown_lineages_as_null ) : ColumnMetadata(std::move(column_name)), dictionary(lineage_tree_and_id_map.lineage_id_lookup_map.copy()), - lineage_tree(std::move(lineage_tree_and_id_map)) {} + lineage_tree(std::move(lineage_tree_and_id_map)), + treat_unknown_lineages_as_null(treat_unknown_lineages_as_null) {} IndexedStringColumnMetadata::IndexedStringColumnMetadata( std::string column_name, common::BidirectionalStringMap dictionary, - common::LineageTreeAndIdMap lineage_tree_and_id_map + common::LineageTreeAndIdMap lineage_tree_and_id_map, + bool treat_unknown_lineages_as_null ) : ColumnMetadata(std::move(column_name)), dictionary(std::move(dictionary)), - lineage_tree(std::move(lineage_tree_and_id_map)) {} + lineage_tree(std::move(lineage_tree_and_id_map)), + treat_unknown_lineages_as_null(treat_unknown_lineages_as_null) {} IndexedStringColumn::IndexedStringColumn(IndexedStringColumnMetadata* metadata) : metadata(metadata) { @@ -57,7 +61,9 @@ std::expected IndexedStringColumn::insert(std::string_view va if (lineage_index.has_value()) { const auto value_id = metadata->dictionary.getId(value); - if (!value_id.has_value()) { + if (value_id.has_value()) { + lineage_index.value().insert(row_id, value_id.value()); + } else if (!metadata->treat_unknown_lineages_as_null) { return std::unexpected(fmt::format( "The value '{}' is not a valid lineage value for column '{}'. 
" "Is your lineage definition file outdated?", @@ -65,7 +71,6 @@ std::expected IndexedStringColumn::insert(std::string_view va metadata->column_name )); } - lineage_index->insert(row_id, value_id.value()); } const Idx value_id = metadata->dictionary.getOrCreateId(value); diff --git a/src/silo/storage/column/indexed_string_column.h b/src/silo/storage/column/indexed_string_column.h index f3d54300f..b8622bd1e 100644 --- a/src/silo/storage/column/indexed_string_column.h +++ b/src/silo/storage/column/indexed_string_column.h @@ -27,6 +27,7 @@ class IndexedStringColumnMetadata : public ColumnMetadata { public: common::BidirectionalStringMap dictionary; std::optional lineage_tree; + bool treat_unknown_lineages_as_null = false; explicit IndexedStringColumnMetadata(std::string column_name) : ColumnMetadata(std::move(column_name)) {} @@ -40,13 +41,15 @@ class IndexedStringColumnMetadata : public ColumnMetadata { IndexedStringColumnMetadata( std::string column_name, - common::LineageTreeAndIdMap lineage_tree_and_id_map + common::LineageTreeAndIdMap lineage_tree_and_id_map, + bool treat_unknown_lineages_as_null ); IndexedStringColumnMetadata( std::string column_name, silo::common::BidirectionalStringMap dictionary, - common::LineageTreeAndIdMap lineage_tree_and_id_map + common::LineageTreeAndIdMap lineage_tree_and_id_map, + bool treat_unknown_lineages_as_null ); IndexedStringColumnMetadata() = delete; @@ -132,6 +135,7 @@ template archive & object.column_name; archive & object.dictionary; archive & object.lineage_tree; + archive & object.treat_unknown_lineages_as_null; } } // namespace boost::serialization @@ -146,12 +150,17 @@ template std::string column_name; silo::common::BidirectionalStringMap dictionary; std::optional lineage_tree; + bool treat_unknown_lineages_as_null; archive & column_name; archive & dictionary; archive & lineage_tree; + archive & treat_unknown_lineages_as_null; if (lineage_tree.has_value()) { object = std::make_shared( - std::move(column_name), 
std::move(dictionary), std::move(lineage_tree.value()) + std::move(column_name), + std::move(dictionary), + std::move(lineage_tree.value()), + treat_unknown_lineages_as_null ); } else { object = std::make_shared( diff --git a/src/silo/storage/column/indexed_string_column.test.cpp b/src/silo/storage/column/indexed_string_column.test.cpp index d7c2c9410..af9919a11 100644 --- a/src/silo/storage/column/indexed_string_column.test.cpp +++ b/src/silo/storage/column/indexed_string_column.test.cpp @@ -56,7 +56,7 @@ TEST(IndexedStringColumn, addingLineageAndThenSublineageFiltersCorrectly) { auto lineage_definition = LineageTreeAndIdMap::fromLineageDefinitionFilePath( "testBaseData/exampleDataset/lineage_definition.yaml" ); - IndexedStringColumnMetadata column_metadata("some_column", lineage_definition); + IndexedStringColumnMetadata column_metadata("some_column", lineage_definition, false); IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"BA.1.1"})); @@ -90,7 +90,7 @@ TEST(IndexedStringColumn, addingSublineageAndThenLineageFiltersCorrectly) { auto lineage_definition = LineageTreeAndIdMap::fromLineageDefinitionFilePath( "testBaseData/exampleDataset/lineage_definition.yaml" ); - IndexedStringColumnMetadata column_metadata("some_column", lineage_definition); + IndexedStringColumnMetadata column_metadata("some_column", lineage_definition, false); IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"BA.1.1.1"})); @@ -138,7 +138,7 @@ TEST(IndexedStringColumn, queryParentLineageThatWasNeverInserted) { auto lineage_definition = LineageTreeAndIdMap::fromLineageDefinitionFilePath( "testBaseData/exampleDataset/lineage_definition.yaml" ); - IndexedStringColumnMetadata column_metadata("some_column", lineage_definition); + IndexedStringColumnMetadata column_metadata("some_column", lineage_definition, false); IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"BA.1.1.1"})); @@ -168,7 +168,7 @@ A: {} 
A.1: parents: ["A"] )")); - IndexedStringColumnMetadata column_metadata("some_column", lineage_definition); + IndexedStringColumnMetadata column_metadata("some_column", lineage_definition, false); IndexedStringColumn under_test{&column_metadata}; ASSERT_TRUE(under_test.insert({"A"})); auto success = under_test.insert({"A.2"}); @@ -180,4 +180,25 @@ A.1: ); } +TEST(IndexedStringColumn, ignoringErrorWhenInsertingIncorrectLineagesIfSpecified) { + auto lineage_definition = + LineageTreeAndIdMap::fromLineageDefinitionFile(LineageDefinitionFile::fromYAMLString(R"( +A: {} +A.1: + parents: ["A"] +)")); + IndexedStringColumnMetadata column_metadata("some_column", lineage_definition, true); + IndexedStringColumn under_test{&column_metadata}; + ASSERT_TRUE(under_test.insert({"A"})); + ASSERT_TRUE(under_test.insert({"not in the lineage hierarchy"})); + EXPECT_EQ( + *under_test.getLineageIndex() + ->filterIncludingSublineages( + under_test.getValueId("A").value(), RecombinantEdgeFollowingMode::DO_NOT_FOLLOW + ) + .value(), + roaring::Roaring({0}) + ); +} + // NOLINTEND(bugprone-unchecked-optional-access) diff --git a/testBaseData/siloSerializedState/1774967818/data_version.silo b/testBaseData/siloSerializedState/1774967818/data_version.silo deleted file mode 100644 index 23e7ea991..000000000 --- a/testBaseData/siloSerializedState/1774967818/data_version.silo +++ /dev/null @@ -1,2 +0,0 @@ -timestamp: 1774967818 -serializationVersion: 1774967790 \ No newline at end of file diff --git a/testBaseData/siloSerializedState/1776329347/data_version.silo b/testBaseData/siloSerializedState/1776329347/data_version.silo new file mode 100644 index 000000000..20bf1dfd1 --- /dev/null +++ b/testBaseData/siloSerializedState/1776329347/data_version.silo @@ -0,0 +1,2 @@ +timestamp: 1776329347 +serializationVersion: 1776329324 \ No newline at end of file diff --git a/testBaseData/siloSerializedState/1774967818/database_schema.silo b/testBaseData/siloSerializedState/1776329347/database_schema.silo 
similarity index 99% rename from testBaseData/siloSerializedState/1774967818/database_schema.silo rename to testBaseData/siloSerializedState/1776329347/database_schema.silo index f7bae59ea..bc159d9c4 100644 Binary files a/testBaseData/siloSerializedState/1774967818/database_schema.silo and b/testBaseData/siloSerializedState/1776329347/database_schema.silo differ diff --git a/testBaseData/siloSerializedState/1774967818/default.silo b/testBaseData/siloSerializedState/1776329347/default.silo similarity index 99% rename from testBaseData/siloSerializedState/1774967818/default.silo rename to testBaseData/siloSerializedState/1776329347/default.silo index d325e4310..b3c891e31 100644 Binary files a/testBaseData/siloSerializedState/1774967818/default.silo and b/testBaseData/siloSerializedState/1776329347/default.silo differ