From 1a29ef037dd8ef60a405dbbb1b4d194b94466439 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Fri, 6 Dec 2024 16:29:51 +0100 Subject: [PATCH 01/12] optimize querying --- .../graph/representation/hash/dbg_sshash.cpp | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.cpp b/metagraph/src/graph/representation/hash/dbg_sshash.cpp index 07120a28e2..8782b5a235 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.cpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.cpp @@ -2,6 +2,8 @@ #include +#include + #include "common/seq_tools/reverse_complement.hpp" #include "common/threads/threading.hpp" #include "common/logger.hpp" @@ -119,20 +121,27 @@ void map_to_nodes_with_rc_impl(size_t k, using kmer_t = get_kmer_t; - std::vector invalid_char(n); - for (size_t i = 0; i < n; ++i) { - invalid_char[i] = !kmer_t::is_valid(sequence[i]); - } + if (with_rc) { + sshash::streaming_query_regular_parsing parser(&dict); + for (size_t i = 0; i + k <= sequence.size(); ++i) { + callback(parser.lookup_advanced(sequence.c_str() + i)); + } + } else { + std::vector invalid_char(n); + for (size_t i = 0; i < n; ++i) { + invalid_char[i] = !kmer_t::is_valid(sequence[i]); + } - auto invalid_kmer = utils::drag_and_mark_segments(invalid_char, true, k); + auto invalid_kmer = utils::drag_and_mark_segments(invalid_char, true, k); - kmer_t uint_kmer = sshash::util::string_to_uint_kmer(sequence.data(), k - 1); - uint_kmer.pad_char(); - for (size_t i = k - 1; i < n && !terminate(); ++i) { - uint_kmer.drop_char(); - uint_kmer.kth_char_or(k - 1, kmer_t::char_to_uint(sequence[i])); - callback(invalid_kmer[i] ? sshash::lookup_result() - : dict.lookup_advanced_uint(uint_kmer, with_rc)); + kmer_t uint_kmer = sshash::util::string_to_uint_kmer(sequence.data(), k - 1); + uint_kmer.pad_char(); + for (size_t i = k - 1; i < n && !terminate(); ++i) { + uint_kmer.drop_char(); + uint_kmer.kth_char_or(k - 1, kmer_t::char_to_uint(sequence[i])); + callback(invalid_kmer[i] ? sshash::lookup_result() + : dict.lookup_advanced_uint(uint_kmer, with_rc)); + } } } From d21fa579313e8557d488d689f64411d900c2f80d Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Fri, 6 Dec 2024 16:35:57 +0100 Subject: [PATCH 02/12] fix --- metagraph/src/graph/representation/hash/dbg_sshash.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.cpp b/metagraph/src/graph/representation/hash/dbg_sshash.cpp index 8782b5a235..98c12b0f5d 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.cpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.cpp @@ -122,9 +122,9 @@ void map_to_nodes_with_rc_impl(size_t k, using kmer_t = get_kmer_t; if (with_rc) { - sshash::streaming_query_regular_parsing parser(&dict); + sshash::streaming_query_canonical_parsing parser(&dict); for (size_t i = 0; i + k <= sequence.size(); ++i) { - callback(parser.lookup_advanced(sequence.c_str() + i)); + callback(parser.lookup_advanced(sequence.data() + i)); } } else { std::vector invalid_char(n); From 4b0270ba7168bf7102511d7a39a103f7b2caae1a Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Fri, 6 Dec 2024 19:08:00 +0100 Subject: [PATCH 03/12] fix incorrect parsing flag on load --- .../graph/representation/hash/dbg_sshash.cpp | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.cpp b/metagraph/src/graph/representation/hash/dbg_sshash.cpp index 98c12b0f5d..fd880f3ec5 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.cpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.cpp @@ -80,6 +80,7 @@ DBGSSHash::DBGSSHash(const std::string &input_filename, size_t k, Mode mode, siz build_config.verbose = common::get_verbose(); build_config.num_threads = get_num_threads(); + build_config.canonical_parsing = (mode != BASIC); // silence sshash construction messages when not verbose std::ios orig_state(nullptr); @@ -406,8 +407,34 @@ bool DBGSSHash::load(std::istream &in) { *this = DBGSSHash(k, mode); num_nodes_ = num_nodes; - if (num_nodes_) - std::visit([&](auto &d) { d.visit(loader); }, dict_); + if (num_nodes_) { + std::visit([&](auto &d) { + d.visit(loader); + + if (mode_ != BASIC) { + using kmer_t = get_kmer_t; + + // TODO: HACK! this is for backwards compatibility + class dict_access { + public: + uint64_t m_size; + uint64_t m_seed; + uint16_t m_k; + uint16_t m_m; + uint16_t m_canonical_parsing; + sshash::minimizers m_minimizers; + sshash::buckets m_buckets; + sshash::skew_index m_skew_index; + sshash::weights m_weights; + }; + + static_assert(sizeof(decltype(d)) == sizeof(dict_access)); + + // overwrite canonical parsing variable; + reinterpret_cast(d).m_canonical_parsing = true; + } + }, dict_); + } return true; } From 465d27ca07f0e88ffd15ae00310527a9a6f60d20 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Fri, 6 Dec 2024 19:43:36 +0100 Subject: [PATCH 04/12] new test --- metagraph/tests/graph/test_canonical_dbg.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metagraph/tests/graph/test_canonical_dbg.cpp b/metagraph/tests/graph/test_canonical_dbg.cpp index e0b3b0b8d0..c28a06476f 100644 --- a/metagraph/tests/graph/test_canonical_dbg.cpp +++ b/metagraph/tests/graph/test_canonical_dbg.cpp @@ -6,6 +6,7 @@ #include "common/seq_tools/reverse_complement.hpp" #include "graph/representation/canonical_dbg.hpp" #include "graph/graph_extensions/node_first_cache.hpp" +#include "graph/alignment/alignment.hpp" namespace { @@ -498,12 +499,14 @@ TYPED_TEST(CanonicalDBGTest, CallPathsCheckHalfSingleKmerForm) { std::atomic num_kmers_both = 0; graph->call_sequences([&](const auto &sequence, const auto &path) { + EXPECT_EQ(sequence, align::spell_path(*graph, map_to_nodes_sequentially(*graph, sequence))); ASSERT_EQ(path, map_to_nodes_sequentially(*graph, sequence)); num_kmers_both += path.size(); }, num_threads); std::atomic num_kmers = 0; graph->call_sequences([&](const auto &sequence, const auto &path) { + EXPECT_EQ(sequence, align::spell_path(*graph, map_to_nodes_sequentially(*graph, sequence))); ASSERT_EQ(path, map_to_nodes_sequentially(*graph, sequence)); num_kmers += path.size(); }, num_threads, true); From 262874ec4e8de16856a6878071378326ca10bc7b Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Fri, 6 Dec 2024 20:21:43 +0100 Subject: [PATCH 05/12] switch to regular parsing --- .../graph/representation/hash/dbg_sshash.cpp | 35 +++---------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.cpp b/metagraph/src/graph/representation/hash/dbg_sshash.cpp index fd880f3ec5..e9700e92c4 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.cpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.cpp @@ -2,7 +2,7 @@ #include -#include +#include #include "common/seq_tools/reverse_complement.hpp" #include "common/threads/threading.hpp" @@ -80,7 +80,6 @@ DBGSSHash::DBGSSHash(const std::string &input_filename, size_t k, Mode mode, siz build_config.verbose = common::get_verbose(); build_config.num_threads = get_num_threads(); - build_config.canonical_parsing = (mode != BASIC); // silence sshash construction messages when not verbose std::ios orig_state(nullptr); @@ -123,7 +122,7 @@ void map_to_nodes_with_rc_impl(size_t k, using kmer_t = get_kmer_t; if (with_rc) { - sshash::streaming_query_canonical_parsing parser(&dict); + sshash::streaming_query_regular_parsing parser(&dict); for (size_t i = 0; i + k <= sequence.size(); ++i) { callback(parser.lookup_advanced(sequence.data() + i)); } @@ -407,34 +406,8 @@ bool DBGSSHash::load(std::istream &in) { *this = DBGSSHash(k, mode); num_nodes_ = num_nodes; - if (num_nodes_) { - std::visit([&](auto &d) { - d.visit(loader); - - if (mode_ != BASIC) { - using kmer_t = get_kmer_t; - - // TODO: HACK! this is for backwards compatibility - class dict_access { - public: - uint64_t m_size; - uint64_t m_seed; - uint16_t m_k; - uint16_t m_m; - uint16_t m_canonical_parsing; - sshash::minimizers m_minimizers; - sshash::buckets m_buckets; - sshash::skew_index m_skew_index; - sshash::weights m_weights; - }; - - static_assert(sizeof(decltype(d)) == sizeof(dict_access)); - - // overwrite canonical parsing variable; - reinterpret_cast(d).m_canonical_parsing = true; - } - }, dict_); - } + if (num_nodes_) + std::visit([&](auto &d) { d.visit(loader); }, dict_); return true; } From 684a3706eff3ba4b13f73f4c4f146978049dcf01 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 9 Dec 2024 12:21:01 +0100 Subject: [PATCH 06/12] update sshash --- metagraph/external-libraries/sshash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagraph/external-libraries/sshash b/metagraph/external-libraries/sshash index 72a92b62da..b7821f6863 160000 --- a/metagraph/external-libraries/sshash +++ b/metagraph/external-libraries/sshash @@ -1 +1 @@ -Subproject commit 72a92b62da0538e18177ce0494832b315d103844 +Subproject commit b7821f6863356924d1707785eec8564c48ea2f1a From 9078a65f7a71e5cedc3ca1cf73ec4d6843450a0c Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 9 Dec 2024 12:55:12 +0100 Subject: [PATCH 07/12] update sshash --- metagraph/external-libraries/sshash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagraph/external-libraries/sshash b/metagraph/external-libraries/sshash index b7821f6863..c15b1c92ac 160000 --- a/metagraph/external-libraries/sshash +++ b/metagraph/external-libraries/sshash @@ -1 +1 @@ -Subproject commit b7821f6863356924d1707785eec8564c48ea2f1a +Subproject commit c15b1c92ac3ea1b7a626a4eefffe188722fe9015 From d5d6221576ee9a5f38584456332dc35c943eb19f Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 9 Dec 2024 14:02:33 +0100 Subject: [PATCH 08/12] make parser global --- .../graph/representation/hash/dbg_sshash.cpp | 23 +++++++++++++------ .../graph/representation/hash/dbg_sshash.hpp | 8 +++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.cpp b/metagraph/src/graph/representation/hash/dbg_sshash.cpp index e9700e92c4..36d41ad5f1 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.cpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.cpp @@ -2,8 +2,6 @@ #include -#include - #include "common/seq_tools/reverse_complement.hpp" #include "common/threads/threading.hpp" #include "common/logger.hpp" @@ -86,7 +84,11 @@ DBGSSHash::DBGSSHash(const std::string &input_filename, size_t k, Mode mode, siz orig_state.copyfmt(std::cout); if (!common::get_verbose()) std::cout.setstate(std::ios_base::failbit); - std::visit([&](auto &d) { d.build(input_filename, build_config); }, dict_); + std::visit([&](auto &d) { + d.build(input_filename, build_config); + using kmer_t = get_kmer_t; + parser_ = sshash::streaming_query_regular_parsing(&d); + }, dict_); if (!common::get_verbose()) std::cout.copyfmt(orig_state); @@ -122,9 +124,10 @@ void map_to_nodes_with_rc_impl(size_t k, using kmer_t = get_kmer_t; if (with_rc) { - sshash::streaming_query_regular_parsing parser(&dict); for (size_t i = 0; i + k <= sequence.size(); ++i) { - callback(parser.lookup_advanced(sequence.data() + i)); + std::visit([&](const auto &p) { + callback(p.lookup_advanced(sequence.data() + i)); + }, parser_); } } else { std::vector invalid_char(n); @@ -406,8 +409,14 @@ bool DBGSSHash::load(std::istream &in) { *this = DBGSSHash(k, mode); num_nodes_ = num_nodes; - if (num_nodes_) - std::visit([&](auto &d) { d.visit(loader); }, dict_); + if (num_nodes_) { + std::visit([&](auto &d) { + d.visit(loader); + + using kmer_t = get_kmer_t; + parser_ = sshash::streaming_query_regular_parsing(&d); + }, dict_); + } return true; } diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.hpp b/metagraph/src/graph/representation/hash/dbg_sshash.hpp index 12c1a407bd..a63863b528 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.hpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.hpp @@ -7,6 +7,8 @@ #include #include +#include + #include "graph/representation/base/sequence_graph.hpp" namespace mtg::graph { @@ -117,6 +119,12 @@ class DBGSSHash : public DeBruijnGraph { size_t num_nodes_; Mode mode_; + using parser_t = std::variant< + sshash::streaming_query_regular_parsing>, + sshash::streaming_query_regular_parsing>, + sshash::streaming_query_regular_parsing>>; + static parser_t parser_; + size_t dict_size() const; }; From 0fa11032b3418a2407e3f878fd4eb8d8ee351956 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 9 Dec 2024 14:09:34 +0100 Subject: [PATCH 09/12] fix --- .../src/graph/representation/hash/dbg_sshash.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.cpp b/metagraph/src/graph/representation/hash/dbg_sshash.cpp index 36d41ad5f1..7f5fb24c2b 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.cpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.cpp @@ -104,9 +104,10 @@ void DBGSSHash::add_sequence(std::string_view sequence, throw std::logic_error("adding sequences not supported"); } -template +template void map_to_nodes_with_rc_impl(size_t k, const Dict &dict, + const Parser &parser, std::string_view sequence, const std::function& callback, const std::function& terminate) { @@ -125,9 +126,7 @@ void map_to_nodes_with_rc_impl(size_t k, if (with_rc) { for (size_t i = 0; i + k <= sequence.size(); ++i) { - std::visit([&](const auto &p) { - callback(p.lookup_advanced(sequence.data() + i)); - }, parser_); + callback(parser.lookup_advanced(sequence.data() + i)); } } else { std::vector invalid_char(n); @@ -153,9 +152,11 @@ void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence, const std::function& callback, const std::function& terminate) const { std::visit([&](const auto &dict) { - map_to_nodes_with_rc_impl(k_, dict, sequence, [&](sshash::lookup_result res) { - callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation); - }, terminate); + std::visit([&](const auto &parser) { + map_to_nodes_with_rc_impl(k_, dict, parser, sequence, [&](sshash::lookup_result res) { + callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation); + }, terminate); + }, parser_); }, dict_); } From 4a7e7700c54ad77e9d3cd1a2070d2e438803bc9b Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 9 Dec 2024 14:13:16 +0100 Subject: [PATCH 10/12] fix --- metagraph/src/graph/representation/hash/dbg_sshash.cpp | 4 ++-- metagraph/src/graph/representation/hash/dbg_sshash.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.cpp b/metagraph/src/graph/representation/hash/dbg_sshash.cpp index 7f5fb24c2b..ae2ed30f71 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.cpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.cpp @@ -107,7 +107,7 @@ void DBGSSHash::add_sequence(std::string_view sequence, template void map_to_nodes_with_rc_impl(size_t k, const Dict &dict, - const Parser &parser, + Parser &parser, std::string_view sequence, const std::function& callback, const std::function& terminate) { @@ -152,7 +152,7 @@ void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence, const std::function& callback, const std::function& terminate) const { std::visit([&](const auto &dict) { - std::visit([&](const auto &parser) { + std::visit([&](auto &parser) { map_to_nodes_with_rc_impl(k_, dict, parser, sequence, [&](sshash::lookup_result res) { callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation); }, terminate); diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.hpp b/metagraph/src/graph/representation/hash/dbg_sshash.hpp index a63863b528..48ede93114 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.hpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.hpp @@ -123,7 +123,7 @@ class DBGSSHash : public DeBruijnGraph { sshash::streaming_query_regular_parsing>, sshash::streaming_query_regular_parsing>, sshash::streaming_query_regular_parsing>>; - static parser_t parser_; + static mutable parser_t parser_; size_t dict_size() const; }; From 3dd6bd7361b5f554afdd36e48d4a458c2bc9df00 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 9 Dec 2024 14:19:10 +0100 Subject: [PATCH 11/12] fix --- metagraph/src/graph/representation/hash/dbg_sshash.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagraph/src/graph/representation/hash/dbg_sshash.hpp b/metagraph/src/graph/representation/hash/dbg_sshash.hpp index 48ede93114..a63863b528 100644 --- a/metagraph/src/graph/representation/hash/dbg_sshash.hpp +++ b/metagraph/src/graph/representation/hash/dbg_sshash.hpp @@ -123,7 +123,7 @@ class DBGSSHash : public DeBruijnGraph { sshash::streaming_query_regular_parsing>, sshash::streaming_query_regular_parsing>, sshash::streaming_query_regular_parsing>>; - static mutable parser_t parser_; + static parser_t parser_; size_t dict_size() const; }; From 5e1788252fbc8724b761e4f4cfd2189dddb9cfed Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 9 Dec 2024 14:43:27 +0100 Subject: [PATCH 12/12] update sshash --- metagraph/external-libraries/sshash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagraph/external-libraries/sshash b/metagraph/external-libraries/sshash index c15b1c92ac..c62d1ae0d2 160000 --- a/metagraph/external-libraries/sshash +++ b/metagraph/external-libraries/sshash @@ -1 +1 @@ -Subproject commit c15b1c92ac3ea1b7a626a4eefffe188722fe9015 +Subproject commit c62d1ae0d2c194d36638b565d2dc488532d28613