diff --git a/.gitmodules b/.gitmodules index bf13091e..533f40dd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,9 +11,6 @@ [submodule "lib/raptor_data_simulation"] path = lib/raptor_data_simulation url = git@github.com:eaasna/raptor_data_simulation.git -[submodule "lib/stellar3"] - path = lib/stellar3 - url = git@github.com:seqan/stellar3.git [submodule "lib/seqan"] path = lib/seqan url = git@github.com:seqan/seqan.git diff --git a/CMakeLists.txt b/CMakeLists.txt index c3934893..79f95e7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,12 @@ set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set (CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") +# For debugging only +#set (CMAKE_CXX_FLAGS "-ftemplate-backtrace-limit=0") +#set (CMAKE_CXX_FLAGS "-fsanitize=address -g -O0") +set (CMAKE_CXX_FLAGS "-g -O0 -Wno-unused-parameter -Wno-unused-value -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-local-typedefs") + + # Messages string (ASCII 27 Esc) set (FontBold "${Esc}[1m") diff --git a/include/dream_stellar/LICENSE.md b/include/dream_stellar/LICENSE.md new file mode 100644 index 00000000..e8023ac7 --- /dev/null +++ b/include/dream_stellar/LICENSE.md @@ -0,0 +1,22 @@ +// ========================================================================== +// STELLAR - SwifT Exact LocaL AligneR +// http://www.seqan.de/projects/stellar/ +// ========================================================================== +// Copyright (C) 2010-2012 by Birte Kehr +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your options) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// +// ========================================================================== +// Author: Birte Kehr +// ========================================================================== \ No newline at end of file diff --git a/include/dream_stellar/diagnostics/print.hpp b/include/dream_stellar/diagnostics/print.hpp new file mode 100644 index 00000000..055a5ffb --- /dev/null +++ b/include/dream_stellar/diagnostics/print.hpp @@ -0,0 +1,60 @@ +#pragma once + +#include +#include + +namespace dream_stellar +{ + +/////////////////////////////////////////////////////////////////////////////// +// Calculates parameters from parameters in options object and writes them to outStr +// Sets options.qGram if not set by user input +template +void _writeCalculatedParams(StellarOptions & options, TStream & outStr); + +/////////////////////////////////////////////////////////////////////////////// +// Writes user specified parameters from options object to outStr +template +void _writeSpecifiedParams(StellarOptions const & options, TStream & outStr); + +/////////////////////////////////////////////////////////////////////////////// +// Writes file name from options object to outStr +template +void _writeFileNames(StellarOptions const & options, TStream & outStr); + +/////////////////////////////////////////////////////////////////////////////// +// Calculates parameters from parameters in options object and from sequences and writes them to outStr +template +void _writeMoreCalculatedParams(StellarOptions const & options, + uint64_t const & refLen, + std::vector const & queries, + TStream & outStr); + +void _writeOutputStatistics(StellarOutputStatistics const & statistics, bool const verbose, bool const writeDisabledQueriesFile); + +template +void _printStellarKernelStatistics(StellarComputeStatistics const & statistics, TStream & outStr); + +template +void _printDatabaseIdAndStellarKernelStatistics( + bool const verbose, + bool const databaseStrand, + CharString const & databaseID, + StellarComputeStatistics const & statistics, + TStream & outStr); + +template +void _printStellarStatistics( + bool const verbose, + bool const databaseStrand, + StringSet const & databaseIDs, + StellarComputeStatisticsCollection const & computeStatistics, + TStream & outStr); + +template +void _writeOutputStatistics(StellarOutputStatistics const & statistics, + bool const verbose, + bool const writeDisabledQueriesFile, + TStream & outStr); + +} // namespace dream_stellar diff --git a/include/dream_stellar/diagnostics/print.tpp b/include/dream_stellar/diagnostics/print.tpp new file mode 100644 index 00000000..f1a23b5b --- /dev/null +++ b/include/dream_stellar/diagnostics/print.tpp @@ -0,0 +1,182 @@ +#pragma once + +#include + +namespace dream_stellar +{ + +using namespace seqan2; + +/////////////////////////////////////////////////////////////////////////////// +// Writes user specified parameters from options object to outStr +template +void _writeSpecifiedParams(StellarOptions const & options, TStream & outStr) +{ +//IOREV _notio_ + // Output user specified parameters + outStr << "User specified parameters:" << std::endl; + outStr << " minimal match length : " << options.minLength << std::endl; + outStr << " maximal error rate (epsilon) : " << options.epsilon << std::endl; + outStr << " maximal x-drop : " << options.xDrop << std::endl; + if (options.qGram != std::numeric_limits::max()) + outStr << " k-mer (q-gram) length : " << options.qGram << std::endl; + outStr << " search forward strand : " << ((options.forward) ? "yes" : "no") << std::endl; + outStr << " search reverse complement : " << ((options.reverse) ? "yes" : "no") << std::endl; + outStr << std::endl; + + outStr << " verification strategy : " << to_string(options.verificationMethod) << std::endl; + if (options.disableThresh != std::numeric_limits::max()) + { + outStr << " disable queries with more than : " << options.disableThresh << " matches" << std::endl; + } + outStr << " maximal number of matches : " << options.numMatches << std::endl; + outStr << " duplicate removal every : " << options.compactThresh << std::endl; + if (options.maxRepeatPeriod != 1 || options.minRepeatLength != 1000) + { + outStr << " max low complexity repeat period : " << options.maxRepeatPeriod << std::endl; + outStr << " min low complexity repeat length : " << options.minRepeatLength << std::endl; + } + if (options.qgramAbundanceCut != 1) + { + outStr << " q-gram abundance cut ratio : " << options.qgramAbundanceCut << std::endl; + } + + outStr << std::endl; +} + +/////////////////////////////////////////////////////////////////////////////// +// Calculates parameters from parameters in options object and writes them to outStr +// Sets options.qGram if not set by user input +template +void _writeCalculatedParams(StellarOptions & options, TStream & outStr) +{ +//IOREV _notio_ + StellarStatistics statistics{options}; + + outStr << "Calculated parameters:" << std::endl; + if (statistics.kMerComputed) + { + options.qGram = (unsigned)statistics.kMerLength; + outStr << " k-mer length : " << statistics.kMerLength << std::endl; + } + + outStr << " s^min : " << statistics.smin << std::endl; + outStr << " threshold : " << statistics.threshold << std::endl; + outStr << " distance cut : " << statistics.distanceCut << std::endl; + outStr << " delta : " << statistics.delta << std::endl; + outStr << " overlap : " << statistics.overlap << std::endl; + outStr << std::endl; +} + +/////////////////////////////////////////////////////////////////////////////// +// Writes file name from options object to outStr +template +void _writeFileNames(StellarOptions const & options, TStream & outStr) +{ +//IOREV _notio_ + outStr << "I/O options:" << std::endl; + outStr << " database file : " << options.databaseFile << std::endl; + outStr << " query file : " << options.queryFile << std::endl; + outStr << " alphabet : " << options.alphabet << std::endl; + outStr << " output file : " << options.outputFile << std::endl; + outStr << " output format : " << options.outputFormat << std::endl; + if (options.disableThresh != std::numeric_limits::max()) + { + outStr << " disabled queries: " << options.disabledQueriesFile << std::endl; + } + outStr << std::endl; +} + +/////////////////////////////////////////////////////////////////////////////// +// Calculates parameters from parameters in options object and from sequences and writes them to outStr +template +void _writeMoreCalculatedParams(StellarOptions const & options, + uint64_t const & refLen, + std::vector const & queries, + TStream & outStr) +{ + if (options.qgramAbundanceCut != 1) + { + outStr << "Calculated parameters:" << std::endl; + } + + uint64_t queryLength{0}; + for (auto & query : queries) + queryLength += query.size(); + + if (options.qgramAbundanceCut != 1) + { + outStr << " q-gram expected abundance : "; + outStr << queryLength / (double)((long)1 << (options.qGram << 1)) << std::endl; + outStr << " q-gram abundance threshold: "; + outStr << _max(100, (int)(queryLength * options.qgramAbundanceCut)) << std::endl; + outStr << std::endl; + } +} + +template +void _printStellarKernelStatistics(StellarComputeStatistics const & statistics, TStream & outStr) +{ + if (statistics.numSwiftHits == 0) + return; + + outStr << std::endl << " # SWIFT hits : " << statistics.numSwiftHits; + outStr << std::endl << " Longest hit : " << statistics.maxLength; + outStr << std::endl << " Avg hit length : " << statistics.totalLength/statistics.numSwiftHits; +} + +template +void _printDatabaseIdAndStellarKernelStatistics( + bool const verbose, + bool const databaseStrand, + CharString const & databaseID, + StellarComputeStatistics const & statistics, + TStream & outStr) +{ + outStr << " " << databaseID; + if (!databaseStrand) + outStr << ", complement"; + outStr << std::flush; + + if (verbose) + { + _printStellarKernelStatistics(statistics, outStr); + } + outStr << std::endl; +} + +template +void _printStellarStatistics( + bool const verbose, + bool const databaseStrand, + StringSet const & databaseIDs, + StellarComputeStatisticsCollection const & computeStatistics, + TStream & outStr) +{ + std::cerr << std::endl; // swift filter output is on same line + for (size_t i = 0; i < length(databaseIDs); ++i) + { + CharString const & databaseID = databaseIDs[i]; + StellarComputeStatistics const & statistics = computeStatistics[i]; + _printDatabaseIdAndStellarKernelStatistics(verbose, databaseStrand, databaseID, statistics, outStr); + } +} + +template +void _writeOutputStatistics(StellarOutputStatistics const & statistics, + bool const verbose, + bool const writeDisabledQueriesFile, + TStream & outStr) +{ + outStr << "# Eps-matches : " << statistics.numMatches << std::endl; + if (verbose) { + if (statistics.numMatches > 0) { + outStr << "Longest eps-match : " << statistics.maxLength << std::endl; + outStr << "Avg match length : " << statistics.totalLength / statistics.numMatches << std::endl; + } + if (writeDisabledQueriesFile) + outStr << "# Disabled queries: " << statistics.numDisabled << std::endl; + } +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/extension/align_banded_nw_best_ends.hpp b/include/dream_stellar/extension/align_banded_nw_best_ends.hpp new file mode 100644 index 00000000..54cbb9c9 --- /dev/null +++ b/include/dream_stellar/extension/align_banded_nw_best_ends.hpp @@ -0,0 +1,172 @@ +#pragma once + +#include + +namespace dream_stellar +{ +using namespace seqan2; + +/////////////////////////////////////////////////////////////////////////////// +// Computes the banded alignment matrix and additionally a string with the best +// alignment end point for each alignment length. +template +inline void +_align_banded_nw_best_ends(TTrace& trace, + std::vector & bestEnds, + TSegmentVector const & str, + TScore const & sc, + TDiagonal const diagL, + TDiagonal const diagU) +{ + typedef typename Value::Type TTraceValue; + typedef typename Value::Type TScoreValue; + typedef typename Value::Type TSegment; // was Segment> now Segment> + typedef typename Size::Type TSize; + using TAlphabet = typename Value::Type; + //!TODO: TAlphabet should NOT be of container type + + SEQAN_ASSERT_GEQ(diagU, diagL); + + // Initialization + TTraceValue const Diagonal = 0; + TTraceValue const Horizontal = 1; + TTraceValue const Vertical = 2; + TSegment const& str1 = str[0]; + TSegment const& str2 = str[1]; + TSize const len1 = length(str1) + 1; + TSize const len2 = length(str2) + 1; + TSize const diagonalWidth = (TSize) (diagU - diagL + 1); + TSize hi_diag = diagonalWidth; + TSize lo_diag = 0; + if (diagL > 0) lo_diag = 0; + else lo_diag = (diagU < 0) ? hi_diag : (TSize) (1 - diagL); + TSize const lo_row = (diagU <= 0) ? -diagU : 0; + TSize const hi_row = [&]() + { + TSize const max_hi_row = len2; + // Note: diagL might be negative + assert((TDiagonal) len1 >= diagL); + if (len1 - diagL < max_hi_row) + return len1 - diagL; + else + return max_hi_row; + }(); + TSize const height = hi_row - lo_row; + + typedef String TRow; + TRow mat, len; + resize(mat, diagonalWidth); + resize(len, diagonalWidth); + resize(trace, height * diagonalWidth); + + // Classical DP with affine gap costs + typedef typename Iterator::Type TRowIter; + typedef typename Iterator::Type TTraceIter; + + TSize errors; + + assert(scoreMatch(sc) == 1u); + assert(scoreGap(sc) == scoreGapExtendHorizontal(sc, TAlphabet{}, TAlphabet{})); + assert(scoreGap(sc) == scoreGapExtendVertical(sc, TAlphabet{}, TAlphabet{})); + assert(scoreMismatch(sc) == scoreGap(sc)); + + TScoreValue const matchScore = scoreMatch(sc); + TScoreValue const gapScore = scoreGap(sc); + + for(TSize row = 0; row < height; ++row) { + TSize actualRow = row + lo_row; + if (lo_diag > 0) --lo_diag; + if ((TDiagonal)actualRow >= (TDiagonal)len1 - diagU) --hi_diag; + TTraceIter traceIt = begin(trace, Standard()) + row * diagonalWidth + lo_diag; + TRowIter current_score_rowise_it = begin(mat, Standard()) + lo_diag; + TRowIter alignment_length_it = begin(len, Standard()) + lo_diag; + + TScoreValue score_left = std::numeric_limits::min(); + TScoreValue alignment_length_left = len1+len2+1; + + for(TSize col = lo_diag; col= len1) break; + + if ((actualRow != 0) && (actualCol != 0)) { + TAlphabet const str1entry = sequenceEntryForScore(sc, str1, ((int) actualCol - 1)); + TAlphabet const str2entry = sequenceEntryForScore(sc, str2, ((int) actualRow - 1)); + + // Get the new maximum for mat + *current_score_rowise_it += score(sc, str1entry, str2entry); + *traceIt = Diagonal; + ++(*alignment_length_it); + + TScoreValue score_up = + (col < diagonalWidth - 1) ? + *(current_score_rowise_it+1) + gapScore : + std::numeric_limits::min(); + + if (score_up > *current_score_rowise_it) + { + *current_score_rowise_it = score_up; + *traceIt = Vertical; + *alignment_length_it = *(alignment_length_it+1) + 1; + } + + score_left = + (col > 0) ? + score_left + gapScore : + std::numeric_limits::min(); + if (score_left > *current_score_rowise_it) + { + *current_score_rowise_it = score_left; + *traceIt = Horizontal; + *alignment_length_it = alignment_length_left + 1; + } + score_left = *current_score_rowise_it; + alignment_length_left = *alignment_length_it; + } else { + // Usual initialization for first row and column + if (actualRow == 0) { + *current_score_rowise_it = actualCol * gapScore; + *alignment_length_it = actualCol; + } + else { + assert(actualCol == 0); + *current_score_rowise_it = actualRow * gapScore; + *alignment_length_it = actualRow; + score_left = *current_score_rowise_it; + alignment_length_left = actualRow; + } + } + + // *current_score_rowise_it: the alignment_score + // *alignment_length_it: the alignment_length (basically length of best trace path of the current cell) + // alignment_length = |matches| + |mismatches| + |gaps| + // + // alignment_score + // = |matches| * match_score + |mismatches| * gap_score + |gaps| * gap_score + // + // alignment_length * match_score + // = |matches| * match_score + |mismatches| * match_score + |gaps| * match_score + // + // alignment_score - alignment_length * match_score + // = 0 + (|mismatches| + |gaps|)(gap_score - match_score) + // + // Thus + // (alignment_score - alignment_length * match_score) / (gapScore - matchScore) + // = |mismatches| + |gaps| = errors + errors = (*current_score_rowise_it - (*alignment_length_it * matchScore)) / (gapScore - matchScore); + SEQAN_ASSERT_GEQ(errors, 0); + SEQAN_ASSERT_LEQ(errors, bestEnds.size()); + if (errors == bestEnds.size()) { + bestEnds.emplace_back(TEnd(*alignment_length_it, row, col)); + } else if (*alignment_length_it > static_cast(bestEnds[errors].length)) + bestEnds[errors] = TEnd(*alignment_length_it, row, col); + //std::cerr << row << ',' << col << ':' << *current_score_rowise_it << std::endl; + } + } + TSize newLength = bestEnds.size() - 1; + while (newLength > 0 && bestEnds[newLength].length <= bestEnds[newLength-1].length) { + --newLength; + } + bestEnds.reserve(newLength + 1); +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/extension/extension_banded_trace_matrix.hpp b/include/dream_stellar/extension/extension_banded_trace_matrix.hpp new file mode 100644 index 00000000..135ddb4b --- /dev/null +++ b/include/dream_stellar/extension/extension_banded_trace_matrix.hpp @@ -0,0 +1,156 @@ +#pragma once + +#include + +#include +#include // needs seqan/sequence.h + +namespace dream_stellar +{ + +struct extension_banded_trace_matrix +{ + using diagonal_t = std::make_signed_t; + + extension_banded_trace_matrix( + size_t const rowCount, + size_t const columnCount, + diagonal_t const lowerDiagonal, + diagonal_t const upperDiagonal + ) : + _rowCount{rowCount}, + _columnCount{columnCount}, + _lowerDiagonal{lowerDiagonal}, + _upperDiagonal{upperDiagonal}, + _traceMatrix{} + { + assert(lowerDiagonal <= upperDiagonal); + + resize(_traceMatrix, dataSize()); + } + + size_t rows() const + { + return _rowCount; + } + + size_t columns() const + { + return _columnCount; + } + + diagonal_t lowerDiagonal() const + { + return _lowerDiagonal; + } + + diagonal_t upperDiagonal() const + { + return _upperDiagonal; + } + + size_t diagonalWidth() const + { + return _upperDiagonal - _lowerDiagonal + 1; + } + + // memory region for active row + std::span rowSpan(size_t const row) + { + auto [beginRow, endRow] = rowInterval(); + if (!(beginRow <= row && row <= endRow)) + return {}; + + size_t rowOffset = row - beginRow; + auto diagonalInterval = diagonalIntervalInRow(row); + size_t columnBegin = std::min(diagonalInterval.first + _lowerDiagonal + row, _columnCount); + size_t columnEnd = std::min(diagonalInterval.second + _lowerDiagonal + row, _columnCount); + + return data().subspan( + rowOffset * diagonalWidth() + diagonalInterval.first, + columnEnd - columnBegin); + } + + // complete underlying data + std::span data() + { + seqan2::TraceBack & firstValue = *begin(_traceMatrix); + return {&firstValue, dataSize()}; + } + + seqan2::String & underlyingTraceMatrix() + { + return _traceMatrix; + } + + size_t dataSize() const + { + std::pair pair = rowInterval(); + size_t height = pair.second - pair.first; + return height * diagonalWidth(); + } + + std::pair rowInterval() const + { + size_t const rowBegin = (_upperDiagonal <= 0) ? -_upperDiagonal : 0; + size_t const rowEnd = [&]() + { + size_t const maxRowEnd = _rowCount; + assert ((diagonal_t)_columnCount >= _lowerDiagonal); + if (_columnCount - _lowerDiagonal < maxRowEnd) + return _columnCount - _lowerDiagonal; + else + return maxRowEnd; + }(); + + return {rowBegin, rowEnd}; + } + + std::pair diagonalIntervalInRow(size_t const row) const + { + size_t const rowOffset = row - rowInterval().first; + + size_t diagonalEnd = diagonalWidth(); + size_t diagonalBegin = 0; + if (_lowerDiagonal <= 0) + { + diagonalBegin = (_upperDiagonal < 0) ? diagonalEnd : (size_t) (1 - _lowerDiagonal); + } + + // subtract in each row iteration + // if (diagonalBegin > 0) --diagonalBegin; + + size_t const diagonalBeginOffset = std::min(rowOffset + 1, diagonalBegin); + diagonalBegin -= diagonalBeginOffset; // lo_diag + + // subtract in each row iteration + // if ((diagonal_t)actualRow >= (diagonal_t)len1 - diagU) --hi_diag; + size_t const diagonalEndOffset = + ((diagonal_t)row >= (diagonal_t)_columnCount - _upperDiagonal) ? + (row - _columnCount + _upperDiagonal) : + 0; + diagonalEnd -= diagonalEndOffset; // hi_diag + + return {diagonalBegin, diagonalEnd}; + } + + std::pair columnIntervalInRow(size_t const row) const + { + auto diagonal = diagonalIntervalInRow(row); + + size_t columnBegin = std::min(diagonal.first + _lowerDiagonal + row, _columnCount); + size_t columnEnd = std::min(diagonal.second + _lowerDiagonal + row, _columnCount); + + return {columnBegin, columnEnd}; + } + +private: + size_t _rowCount; + size_t _columnCount; + diagonal_t _lowerDiagonal; + diagonal_t _upperDiagonal; + + seqan2::String _traceMatrix; +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/extension/extension_end_position.hpp b/include/dream_stellar/extension/extension_end_position.hpp new file mode 100644 index 00000000..de7ff2c0 --- /dev/null +++ b/include/dream_stellar/extension/extension_end_position.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +namespace dream_stellar +{ +using namespace seqan2; + +/////////////////////////////////////////////////////////////////////////////// +// Container for storing possible end positions in extension of eps-core +template +struct ExtensionEndPosition { + using TPosition = TPos_; + using TCoordinate = Pair; + + TPosition length; + TCoordinate coord; + + ExtensionEndPosition(): + length(0), coord(TCoordinate(0,0)) {} + + ExtensionEndPosition(TPosition len, TPosition row, TPosition col): + length(len), coord(TCoordinate(row, col)) {} +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/extension/longest_eps_match.hpp b/include/dream_stellar/extension/longest_eps_match.hpp new file mode 100644 index 00000000..e5b4d50e --- /dev/null +++ b/include/dream_stellar/extension/longest_eps_match.hpp @@ -0,0 +1,74 @@ +#pragma once + +#include + +#include + +#include + +namespace dream_stellar +{ +using namespace seqan2; + +/////////////////////////////////////////////////////////////////////////////// +// Identifies the longest epsilon match in align from possEndsLeft and possEndsRight and sets the view positions of +// align to start and end position of the longest epsilon match +template, typename TIterator = std::vector::const_iterator> +std::pair longestEpsMatch(std::vector > const & possEndsLeft, + std::vector > const & possEndsRight, + TLength const alignLen, + TLength const alignErr, + TSize const matchMinLength, + TEps const epsilon) { + + // Identify longest eps match by iterating over combinations of left and right positions + TIterator rightIt = possEndsRight.end() - 1; + TIterator leftIt = possEndsLeft.end() - 1; + TIterator right = possEndsRight.begin(); + TIterator left = possEndsLeft.begin(); + + /*for (int i = 0; i < length(possEndsRight); ++i) { + std::cout << possEndsRight[i].length << " " << possEndsRight[i].coord.i1 << "," << possEndsRight[i].coord.i2 << std::endl; + } + for (int i = 0; i < length(possEndsLeft); ++i) { + std::cout << possEndsLeft[i].length << " " << possEndsLeft[i].coord.i1 << "," << possEndsLeft[i].coord.i2 << std::endl; + }*/ + + TSize leftErr = possEndsLeft.size() - 1; + + TSize minLength = matchMinLength; + bool found = false; + // DELTA is used below against floating point rounding errors. + double const DELTA = 0.000001; + + while (leftIt >= possEndsLeft.begin()) { + TSize totalLen = (*leftIt).length + alignLen + (*rightIt).length; + if (totalLen < minLength) break; + TSize totalErr = leftErr + alignErr + possEndsRight.size() - 1; + while (rightIt >= possEndsRight.begin()) { + totalLen = (*leftIt).length + alignLen + (*rightIt).length; + if (totalLen < minLength) break; + if ((TEps)totalErr/(TEps)totalLen < epsilon + DELTA) { + right = rightIt; + left = leftIt; + //std::cout << totalLen << std::endl; + minLength = totalLen; + found = true; + break; + } + --rightIt; + --totalErr; + } + rightIt = possEndsRight.end() - 1; + --leftIt; + --leftErr; + } + + if (found) + return std::pair(left, right); + else + return std::pair(possEndsLeft.end(),possEndsRight.end()); +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/io/import_sequence.hpp b/include/dream_stellar/io/import_sequence.hpp new file mode 100644 index 00000000..eeac7ca5 --- /dev/null +++ b/include/dream_stellar/io/import_sequence.hpp @@ -0,0 +1,81 @@ +#pragma once + +#include +#include + +namespace dream_stellar +{ + +using namespace seqan2; + +template +inline bool +_checkUniqueId(std::set & uniqueIds, TId const & id) +{ + TId shortId; + + // (cut at first whitespace) + for (auto it = begin(id); it != end(id) && *it > ' '; ++it) + { + appendValue(shortId, *it); + } + + auto [it, added] = uniqueIds.insert(shortId); + + return added; +} + +template +inline bool _import_database_sequences(input_t const & file_input, + collection_t & seqs, + std::vector & ids, + uint64_t & seq_len, + stream_t & str_out, + stream_t & str_err) +{ + bool ids_unique{true}; + size_t seq_count{0}; + + auto record_intake_lambda = [&](auto const & record) + { + ids_unique &= (std::find(ids.begin(), ids.end(), record.id()) != ids.end()); + seq_len += record.sequence().size(); + seqs.emplace_back(record.sequence()); + ids.emplace_back(record.id()); + seq_count++; + }; + + if constexpr (std::is_same::value) + { + seqan3::sequence_file_input fin{file_input}; + for (auto & record : fin) + { + record_intake_lambda(record); + } + } + else if constexpr (std::is_same>>::value) + { + for (auto & bin : file_input) + { + for (auto & bin_file : bin) + { + seqan3::sequence_file_input fin{bin_file}; + for (auto & record : bin_file) + { + record_intake_lambda(record); + } + } + } + } + else + { + str_err << "WARNING: Unknown database file input\n"; + } + + str_out << "Loaded " << seq_count << " adapted database sequence" << ((seq_count > 1) ? "s." : ".") << std::endl; + if (!ids_unique) + str_err << "WARNING: Non-unique adapted database ids. Output can be ambiguous.\n"; + return true; +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/options/dream_options.hpp b/include/dream_stellar/options/dream_options.hpp new file mode 100644 index 00000000..a79982f2 --- /dev/null +++ b/include/dream_stellar/options/dream_options.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace dream_stellar +{ + +struct DREAMOptions +{ + bool prefilteredSearch{false}; // search a subset of all reference sequences (e.g chr 1, chr 2) + bool searchSegment{false}; // search a segment of a single reference sequence (e.g chr1:1000-3000) + uint64_t referenceLength{0}; + + // Specify the segment (and sequence) of interest. + std::vector binSequences; + uint32_t segmentBegin; + uint32_t segmentEnd; +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/options/eps_match_options.hpp b/include/dream_stellar/options/eps_match_options.hpp new file mode 100644 index 00000000..1c1699d7 --- /dev/null +++ b/include/dream_stellar/options/eps_match_options.hpp @@ -0,0 +1,16 @@ + +#pragma once + +#include + +namespace dream_stellar +{ + +struct EPSMatchOptions +{ + dream_stellar::utils::fraction epsilon{5, 100}; // maximal error rate + double numEpsilon{0.05}; + unsigned minLength{100}; // minimal length of an epsilon-match +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/options/index_options.hpp b/include/dream_stellar/options/index_options.hpp new file mode 100644 index 00000000..0a27a791 --- /dev/null +++ b/include/dream_stellar/options/index_options.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include + +namespace dream_stellar +{ + +struct IndexOptions +{ + size_t qGram{std::numeric_limits::max()}; // length of the q-grams + double qgramAbundanceCut{1}; +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/options/verifier_options.hpp b/include/dream_stellar/options/verifier_options.hpp new file mode 100644 index 00000000..a3539d22 --- /dev/null +++ b/include/dream_stellar/options/verifier_options.hpp @@ -0,0 +1,54 @@ + +#pragma once + +#include +#include + +namespace dream_stellar +{ + +struct VerifyAllLocal_; +using AllLocal = seqan2::Tag const; + +struct VerifyBestLocal_; +using BestLocal = seqan2::Tag const; + +// basically a std::variant +struct StellarVerificationMethod +{ + StellarVerificationMethod(AllLocal) : _index{0} {} + StellarVerificationMethod(BestLocal) : _index{1} {} + + constexpr std::size_t index() const noexcept + { + return _index; + } + + friend constexpr bool operator==( + StellarVerificationMethod const & m1, + StellarVerificationMethod const & m2) + { + return m1.index() == m2.index(); + } + + friend inline std::string to_string(StellarVerificationMethod method) + { + using cstring_t = char const * const; + cstring_t method_names[] = {"exact", "bestLocal"}; + return method_names[method.index()]; + } + +private: + std::size_t _index{~std::size_t{0u}}; +}; + +struct VerifierOptions +{ + double xDrop{5}; // maximal x-drop + + // verification strategy: exact, bestLocal + std::string strVerificationMethod{"exact"}; + StellarVerificationMethod verificationMethod{AllLocal{}}; +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/query_id_map.hpp b/include/dream_stellar/query_id_map.hpp new file mode 100644 index 00000000..2fd0ecd4 --- /dev/null +++ b/include/dream_stellar/query_id_map.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include + +#include + +namespace dream_stellar +{ + +/** + * @brief Associate a query ID with the corresponding segment sequence. + */ +template +struct query_id_map +{ + using rec_t = valik::shared_query_record; + std::vector & records; + + dream_stellar::StellarQuerySegment segment_from_id(unsigned const & query_id) const + { + if (query_id >= records.size()) + throw std::runtime_error("Query index " + std::to_string(query_id) + " is out of range [0, " + + std::to_string(records.size() - 1) + "]"); + rec_t & shared_rec = records[query_id]; + return shared_rec.asStellarSegment(); + } +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/shared.hpp b/include/dream_stellar/shared.hpp new file mode 100644 index 00000000..a815ee10 --- /dev/null +++ b/include/dream_stellar/shared.hpp @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace dream_stellar +{ + +/////////////////////////////////////////////////////////////////////////////// +// Options for Stellar +struct StellarOptions : public EPSMatchOptions, public IndexOptions, public VerifierOptions, public DREAMOptions { + // i/o options + std::string databaseFile; // name of database file + std::string queryFile; // name of query file + std::string outputFile{"stellar.gff"}; // name of result file + std::string disabledQueriesFile{"stellar.disabled.fasta"}; // name of result file containing disabled queries + std::string outputFormat{"gff"}; // Possible formats: gff, text + std::string alphabet{"dna5"}; // Possible values: dna, rna, protein, char + bool write_time; // write running time to standard output + + // more options + bool forward{true}; // compute matches to forward strand of database + bool onlyForward{false}; + bool reverse{true}; // compute matches to reverse complemented database + bool onlyReverse{false}; + + size_t disableThresh = std::numeric_limits::max(); // maximal number of matches allowed per query before disabling verification of hits for that query + size_t compactThresh = 500; // number of matches after which removal of overlaps and duplicates is started + size_t numMatches = 50; // maximal number of matches per query and database + size_t maxRepeatPeriod = 1; // maximal period of low complexity repeats to be filtered + size_t minRepeatLength = 1000; // minimal length of low complexity repeats to be filtered + bool verbose{false}; // verbose mode + + static constexpr size_t kmerCount(size_t sequenceLength, size_t kmerSize) + { + assert(kmerSize > 0u); + assert(sequenceLength >= kmerSize - 1u); + // number of kmers + return sequenceLength + 1u - kmerSize; + } + + static constexpr size_t kmerLemma(size_t sequenceLength, size_t kmerSize, size_t errors) + { + size_t maxAffectedKMers = kmerSize * errors; + size_t count = kmerCount(sequenceLength, kmerSize); + return std::max(count, maxAffectedKMers) - maxAffectedKMers; + } + + static constexpr size_t pigeonholeLemma(size_t sequenceLength, size_t errors) + { + assert(sequenceLength >= errors); + // how many consecutive chars must be error free + using difference_t = utils::fraction::difference_t; + return ceil(utils::fraction{static_cast(sequenceLength - errors), errors + 1}); + } + + static constexpr size_t minLengthWithExactError(size_t absoluteError, utils::fraction epsilon) + { + if (epsilon.numerator() == 0) + return std::numeric_limits::max(); + + using difference_t = utils::fraction::difference_t; + return ceil(utils::fraction{static_cast(absoluteError), 1} / epsilon); + } + + static constexpr size_t absoluteErrors(utils::fraction epsilon, size_t sequenceLength) + { + using difference_t = utils::fraction::difference_t; + return floor(utils::fraction{static_cast(sequenceLength), 1} * epsilon); + } + +}; + +} diff --git a/include/dream_stellar/stellar.hpp b/include/dream_stellar/stellar.hpp new file mode 100644 index 00000000..1ebbab48 --- /dev/null +++ b/include/dream_stellar/stellar.hpp @@ -0,0 +1,247 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace dream_stellar +{ + +using namespace seqan2; + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Checks whether two matches overlap *in seq2* and +// whether the non-overlaping parts are shorter than minLength. +template +bool +checkOverlap(TMatch const & matchA, TMatch const & matchB, TSize const minLength) { + // check id and orienation + if (matchA.id != matchB.id || matchA.orientation != matchB.orientation) return false; + if (matchA.id == TMatch::INVALID_ID || matchB.id == TMatch::INVALID_ID) return false; + + // check overlap in seq2 + if (matchA.begin2 >= matchB.begin2) { + if (matchA.end2 >= matchB.end2) { + // check length of non-overlapping parts of both matches + if ((TSize)matchA.begin2 - (TSize)matchB.begin2 >= minLength && + (TSize)matchA.end2 - (TSize)matchB.end2 >= minLength) { + return false; + } + } + // check whether offset is the same in both sequences + if (toViewPosition(matchA.row2, matchA.begin2) - toViewPosition(matchB.row2, matchB.begin2) != + toViewPosition(matchA.row1, matchA.begin1) - toViewPosition(matchB.row1, matchB.begin1)) { + return false; + } + } else { + if (matchA.end2 < matchB.end2) { + // check length of non-overlapping parts of both matches + if ((TSize)matchB.begin2 - (TSize)matchA.begin2 >= minLength && + (TSize)matchB.end2 - (TSize)matchA.end2 >= minLength) { + return false; + } + } + // check whether offset is the same in both sequences + if (toViewPosition(matchB.row2, matchB.begin2) - toViewPosition(matchA.row2, matchA.begin2) != + toViewPosition(matchB.row1, matchB.begin1) - toViewPosition(matchA.row1, matchA.begin1)) { + return false; + } + } + return true; +} + +template +TPosition +projectedPosition(TRow const & rowA, TRow const & rowB, TPosition pos) +{ + return toSourcePosition(rowB, toViewPosition(rowA, pos)); +} + +/////////////////////////////////////////////////////////////////////////////// +// Checks all alignment columns of two overlapping matches. +// It is assumed that matchA.begin1 < matchB.begin1. +template +bool +_checkAlignColOverlap(TMatch const & matchA, TMatch const & matchB, TSize const minLength) +{ + TSize equalCols = 0; + TSize diffCols = 0; + + for (typename TMatch::TPos pos = matchB.begin1; pos < _min(matchA.end1, matchB.end1); ++pos) + { + if (projectedPosition(matchA.row1, matchA.row2, pos) == projectedPosition(matchB.row1, matchB.row2, pos)) + ++equalCols; + else + ++diffCols; + } + + if (diffCols >= minLength) return false; + return true; +} + +/////////////////////////////////////////////////////////////////////////////// +// Marks matches that overlap in both sequences with a longer match as invalid. +template +void maskOverlaps(String > & matches, TSize const minLength) +{ + typedef StellarMatch TMatch; + typedef typename TMatch::TPos TPos; + typedef typename Iterator, Rooted>::Type TIter; + typedef typename Iterator, Rooted>::Type TOverlapIter; + + // sort matches by begin position in row0 + sortMatches(matches, LessPos()); + + // iterate all matches + TIter it = begin(matches); + + // list of matches that potentially overlap with the current match in row0 and + // start earlier (including matches that overlap but have a unique part of at + // least minLength) sorted by descending end positions + String overlaps; + + for (; it != end(matches); ++it) + { + if ((*it).id == TMatch::INVALID_ID) continue; + + TPos insertPos = 0; + + // iterate potentially overlapping matches + TOverlapIter overlapIt = begin(overlaps); + for (; overlapIt != end(overlaps); ++overlapIt) + { + TMatch & o = matches[*overlapIt]; + + // determine position for inserting *it into overlaps after checking + if ((*it).end1 < o.end1) insertPos++; + + // check if matches overlap in row0 - if not, then break + if (o.end1 <= (*it).begin1) break; + + // check if unique parts of the two matches in row0 are longer than minLength - if yes, then continue + if ((*it).begin1 - o.begin1 >= (TPos)minLength && + (*it).end1 > o.end1 && (*it).end1 - o.end1 >= (TPos)minLength) continue; + + // check if matches overlap in row1 - if not, then continue + if (!checkOverlap(*it, o, minLength)) continue; + + // check exact alignment columns for overlap + if (!_checkAlignColOverlap(o, *it, minLength)) continue; + + // set shorter match invalid + if (length(*it) > length(o)) + o.id = TMatch::INVALID_ID; + else + (*it).id = TMatch::INVALID_ID; + } + + // remove all matches from overlaps that end earlier than current match begins + resize(overlaps, position(overlapIt)); + + if ((*it).id != TMatch::INVALID_ID) + insertValue(overlaps, insertPos, position(it)); + } + +} + +/////////////////////////////////////////////////////////////////////////////// +// Removes matches that are marked as invalid, and then keeps only the numMatches best matches. +template +void +compactMatches(String > & matches, TSize const numMatches) { + typedef StellarMatch TMatch; + typedef typename Iterator, Standard>::Type TIterator; + + // sort matches by length (and validity) + sortMatches(matches, LessLength()); + + // count valid matches + TSize num = 0; + + TIterator it = begin(matches, Standard()); + TIterator itEnd = end(matches, Standard()); + + for(; it != itEnd; ++it) { + if ((*it).id != TMatch::INVALID_ID) + ++num; + } + + // keep only valid and longest matches + resize(matches, _min(num, numMatches)); +} + +/////////////////////////////////////////////////////////////////////////////// +// Calls swift filter and verifies swift hits. = Computes eps-matches. +// A basic block for stellar +template +StellarComputeStatistics +_stellarKernel(jst::contrib::stellar_matcher> & matcher, + StellarDatabaseSegment & database_segment, + query_id_map const & query_dict, + StellarOptions & localOptions, + SwiftHitVerifier & swiftVerifier, + TIsPatternDisabledFn && isPatternDisabled, + TOnAlignmentResultFn && onAlignmentResult, + stellar_kernel_runtime & stellar_kernel_runtime) +{ + StellarComputeStatistics statistics{}; + + auto finder_callback = [&](auto & finder) + { + ++statistics.numSwiftHits; + statistics.totalLength += database_segment.size(); + statistics.maxLength = std::max(statistics.maxLength, database_segment.size()); + + if (!isPatternDisabled(matcher)) + { + auto queryID = matcher.curSeqNo(); + StellarQuerySegment query_segment = query_dict.segment_from_id(queryID); + seqan3::debug_stream << "FOUND MATCH for query\t" << std::to_string(queryID) << '\n'; + + /* + ////Debug stuff: + //std::cout << beginPosition(infix(finder)) << ","; + //std::cout << endPosition(infix(finder)) << " "; + //std::cout << beginPosition(pattern) << ","; + //std::cout << endPosition(pattern) << std::endl; + */ + + // verification + stellar_kernel_runtime.verification_time.measure_time([&]() + { + swiftVerifier.verify( + database_segment, + query_segment, + matcher.delta(), + onAlignmentResult, + stellar_kernel_runtime.verification_time); + }); // measure_time + } + }; + + // call operator() from seqan_pattern_base + stellar_kernel_runtime.swift_filter_time.measure_time([&]() + { + matcher(database_segment.as_span(), localOptions.minRepeatLength, localOptions.maxRepeatPeriod, finder_callback); + }); + + return statistics; +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/stellar_database_segment.hpp b/include/dream_stellar/stellar_database_segment.hpp new file mode 100644 index 00000000..2bba0702 --- /dev/null +++ b/include/dream_stellar/stellar_database_segment.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include + +#include +#include + +namespace dream_stellar +{ + +using namespace seqan2; + +//!TODO: remove obsolete functions +template +struct StellarDatabaseSegment : public StellarSequenceSegment +{ + using TBase = StellarSequenceSegment; + + using typename TBase::TInfixSegment; + + //!TODO: why is it nested? + using TNestedFinderSegment = seqan2::Segment; + + using TBase::TBase; // import constructor + + static StellarDatabaseSegment fromFinderMatch(TInfixSegment const & finderMatch) + { + std::span const & underlyingDatabase = host(finderMatch); + return {underlyingDatabase, seqan2::beginPosition(finderMatch), seqan2::endPosition(finderMatch)}; + } + + std::span const & underlyingDatabase() const + { + return this->underlyingSequence(); + } + + TNestedFinderSegment asFinderSegment() const + { + std::span const & _database = underlyingDatabase(); + auto finderInfix = this->asInfixSegment(); + + TInfixSegment const finderInfixSeq = infix(_database, 0, length(_database)); + TNestedFinderSegment finderSegment(finderInfixSeq, + seqan2::beginPosition(finderInfix) - seqan2::beginPosition(_database), + seqan2::endPosition(finderInfix) - seqan2::beginPosition(_database)); + return finderSegment; + } +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/stellar_extension.hpp b/include/dream_stellar/stellar_extension.hpp new file mode 100644 index 00000000..f1beee5c --- /dev/null +++ b/include/dream_stellar/stellar_extension.hpp @@ -0,0 +1,715 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace dream_stellar +{ +using namespace seqan2; + +/////////////////////////////////////////////////////////////////////////////// +// returns true if align has a match at pos, otherwise false +template +inline bool +isMatch(Align const & align, TSize const pos) { + + if(isGap(row(align, 0), pos)) { + return false; + } else if(isGap(row(align, 1), pos)) { + return false; + } else if(row(align, 0)[pos] != row(align, 1)[pos]) { + return false; + } else { + return true; + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Computes possible end positions of an eps-match in a given alignment. +template +void +_fillGapsString(Align const & align, + String > & gaps) { + typedef Triple TGapInfo; + TPos totalErrors = 0; + typename Row >::Type row0 = row(align, 0); + TPos i = 0; + TPos endPos = length(row0); + TPos gapBegin = i; + + // append gap starting at beginPosition (also if its length is 0!) + while(i < endPos && !isMatch(align, i)) { + ++i; + ++totalErrors; + } + appendValue(gaps, TGapInfo(gapBegin, i, totalErrors)); + + // iterate over alignment and append gaps + while (i < endPos) { + // skip matches + while(i < endPos && isMatch(align, i)) { + ++i; + } + gapBegin = i; + // skip and count mismatches/indels + while(i < endPos && !isMatch(align, i)) { + ++i; + ++totalErrors; + } + appendValue(gaps, TGapInfo(gapBegin, i, totalErrors)); + } + /*for(unsigned l = 0; l < length(gaps); ++l) { + std::cout << gaps[l].i1 << " " << gaps[l].i2 << " " << gaps[l].i3 << std::endl; + }*/ +} + +/////////////////////////////////////////////////////////////////////////////// +// Checks the error rate of the fragment between end of left and start of right. +template +inline bool +_isEpsMatch(Triple const & left, + Triple const & right, + TFloat const eps) { + // compute mismatches/indels and length + TPos errors = right.i3 - left.i3 - (right.i2 - right.i1); + TPos len = right.i1 - left.i2; + + // check error rate + double const DELTA = 0.000001; // Small delta against floating point precision problems. + return errors/(TFloat)(len) <= eps + DELTA; +} + +/////////////////////////////////////////////////////////////////////////////// +// Identifies the longest epsilon match in align and sets the view positions of +// align to start and end position of the longest epsilon match +template +bool +longestEpsMatch(Align & align, + TSize const matchMinLength, + TFloat const epsilon) { + // Preprocessing: compute and store gaps and lengths + // A gap is a triple of gap begin position, gap end position, and total number of errors in sequence from begin + // to end position of this gap. + typedef typename Position >::Type TPosition; + typedef String > TGapsString; + TGapsString gaps; + _fillGapsString(align, gaps); + + // Identify longest eps match by iterating over combinations of left and right positions + typename Iterator::Type rightIt = end(gaps) - 1; + typename Iterator::Type leftIt = begin(gaps); + + TPosition beginPos = 0; + TPosition endPos = 0; + TSize minLength = matchMinLength - 1; + + while ((*leftIt).i2 + minLength < (*rightIt).i1) { + while ((*leftIt).i2 + minLength < (*rightIt).i1) { + if(_isEpsMatch(*leftIt, *rightIt, epsilon)) { + beginPos = (*leftIt).i2; + endPos = (*rightIt).i1; + minLength = endPos - beginPos; + break; + } + --rightIt; + } + rightIt = end(gaps) - 1; + ++leftIt; + } + + // Set view positions to the eps-match + TPosition viewBeginRow0 = toSourcePosition(row(align, 0), 0); + TPosition viewBeginRow1 = toSourcePosition(row(align, 1), 0); + setClippedEndPosition(row(align, 0), viewBeginRow0 + endPos); + setClippedEndPosition(row(align, 1), viewBeginRow1 + endPos); + setClippedBeginPosition(row(align, 0), viewBeginRow0 + beginPos); + setClippedBeginPosition(row(align, 1), viewBeginRow1 + beginPos); + // setClippedBeginPosition(row(align, 0), toSourcePosition(row(align, 0), beginPos)); + // setClippedBeginPosition(row(align, 1), toSourcePosition(row(align, 1), beginPos)); + // setBeginPosition(row(align, 0), beginPos); + // setBeginPosition(row(align, 1), beginPos); + // setClippedEndPosition(row(align, 0), toSourcePosition(row(align, 0), endPos)); + // setClippedEndPosition(row(align, 1), toSourcePosition(row(align, 1), endPos)); + SEQAN_ASSERT_EQ(length(row(align, 0)), length(row(align, 1))); + + if (endPos == 0 && beginPos == 0) return 1; + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +// Computes the banded alignment matrix for the left extension and +// returns a string with possible start positions of an eps-match. +template +void +_fillMatrixBestEndsLeft(TMatrix & matrixLeft, + std::vector & possibleEndsLeft, + //!TODO: should sequencesLeft be owning? + std::vector const, InfixSegment>> const & sequencesLeft, + TDiagonal const diagLower, + TDiagonal const diagUpper, + TScore const & scoreMatrix) { + // _align_banded_nw_best_ends(matrixLeft, possibleEndsLeft, str, scoreMatrix, + // upperDiagonal(seedOld) - upperDiagonal(seed), + // upperDiagonal(seedOld) - lowerDiagonal(seed)); + + // std::cerr << "FILL MATRIX LEFT SEQS\n" + // << "0: " << infixH << "\n" + // << "1: " << infixV << "\n"; + + // _align_banded_nw_best_ends(matrixLeft, possibleEndsLeft, str, scoreMatrix, + // diagBegin - upperDiagonal(seed), + // diagBegin - lowerDiagonal(seed)); + // upperDiagonal(seedOld) - upperDiagonal(seed), + // upperDiagonal(seedOld) - lowerDiagonal(seed)); + + // Use legacy adapted NW computation with infixH/first alignment row being in the vertical direction. + // // TODO(holtgrew): When switching to DP from new alignment module, make sure to mirror diagonals. + _align_banded_nw_best_ends(matrixLeft, possibleEndsLeft, sequencesLeft, scoreMatrix, -diagUpper, -diagLower); +} + +/////////////////////////////////////////////////////////////////////////////// +// Computes the banded alignment matrix for the right extension and +// returns a string with possible end positions of an eps-match. +template +void +_fillMatrixBestEndsRight(TMatrix & matrixRight, + std::vector & possibleEndsRight, + std::vector const, InfixSegment>> const & sequencesRight, + TDiagonal const diagLower, + TDiagonal const diagUpper, + TScore const & scoreMatrix) { + // std::cerr << "FILL MATRIX RIGHT SEQS\n" + // << "0: " << infixH << "\n" + // << "1: " << infixV << "\n"; + + // _align_banded_nw_best_ends(matrixRight, possibleEndsRight, str, scoreMatrix, + // lowerDiagonal(seedOld) - upperDiagonal(seed), + // lowerDiagonal(seedOld) - lowerDiagonal(seed)); + + // _align_banded_nw_best_ends(matrixRight, possibleEndsRight, str, scoreMatrix, + // diagEnd - upperDiagonal(seed), + // diagEnd - lowerDiagonal(seed)); + // lowerDiagonal(seedOld) - upperDiagonal(seed), + // lowerDiagonal(seedOld) - lowerDiagonal(seed)); + + // Use legacy adapted NW computation with infixH/first alignment row being in the vertical direction. + // TODO(holtgrew): When switching to DP from new alignment module, make sure to mirror diagonals. + _align_banded_nw_best_ends(matrixRight, possibleEndsRight, sequencesRight, scoreMatrix, -diagUpper, -diagLower); +} + +/////////////////////////////////////////////////////////////////////////////// +// Traceback from an arbitrary point (coordinate) in the banded alignment trace matrix (trace). +template +inline void +_alignBandedNeedlemanWunschTrace(TAlign & align, + TSegVec const & str, + TTrace const & trace, + TCoord const & coordinate, + TDiagonal const diagL, + TDiagonal const diagU) +{ + //typedef typename Value::Type TString; + typedef typename Id::Type TId; + typedef typename Size::Type TSize; + typedef typename Value::Type TTraceValue; + + // Traceback values + TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2; + + // Initialization + TId id1{0}; // for a owning StringSet<> Id is the same as the index + TId id2{1}; + TSize lo_row = (diagU <= 0) ? -1 * diagU : 0; + TSize diagonalWidth = (TSize) (diagU - diagL + 1); + + // Start the trace from the cell with the max value + TSize row = coordinate.i1; + TSize col = coordinate.i2; + + // Handle the skipped sequence parts + TSize actualRow = row + lo_row; + TSize actualCol = col + diagL + actualRow; + + if ((actualRow != 0) && (actualCol != 0)) { + // Find initial direction + TTraceValue tv = trace[row * diagonalWidth + col]; + if (tv == Horizontal) --col; + else if (tv == Vertical) {--row; ++col;} + else --row; + + // Walk until we hit a border + TSize seqLen = 1; + TTraceValue newTv = tv; + while(true) { + actualRow = row + lo_row; + actualCol = col + diagL + actualRow; + newTv = trace[row * diagonalWidth + col]; + + // Check if we hit a border + if ((actualRow == 0) || (actualCol == 0)) break; + else { + //std::cout << row << ',' << col << ':' << value(originalMat, actualRow * len1 + actualCol) << std::endl; + if (tv == Diagonal) { + if (newTv == Horizontal) { + _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv); + --col; seqLen = 1; + } else if (newTv == Vertical) { + _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv); + --row; ++col; seqLen = 1; + } else { + --row; ++seqLen; + } + } else { + if (tv == Horizontal) { + if (newTv == Diagonal) { + _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv); + --row; seqLen = 1; + } else if (newTv == Vertical) { + _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv); + --row; ++col; seqLen = 1; + } else { + --col; ++seqLen; + } + } else { + if (newTv == Diagonal) { + _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv); + --row; seqLen = 1; + } else if (newTv == Horizontal) { + _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv); + --col; seqLen = 1; + } else { + --row; ++col; ++seqLen; + } + } + } + tv = newTv; + } + } + + // Align left overs + if (seqLen) _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv); + } + + // Handle the remaining sequence + if (actualCol != 0) _alignTracePrint(align, str[0], str[1], (TId) id1, (TSize) 0, (TId) 0, (TSize) 0, (TSize) actualCol, Horizontal); + else if (actualRow != 0) _alignTracePrint(align, str[0], str[1], (TId) 0, (TSize) 0, (TId) id2, (TSize) 0, (TSize) actualRow, Vertical); + +} + +template +void _copyInfixAlignmentIntoAlignment(TAlign & align, + TInfixAlign const & infixAlign, + TPos const infixAlignHBeginPosition, + TPos const infixAlignVBeginPosition) +{ + using TAlignPos = typename Position::Type>::Type; + String viewPos; + + // NOTE: we can't use `integrateAlign(align, infixAlign);` directly + // as `infixAlign` is a sequence alignment on a local copy of a segment from + // `align`. That means we need to calculate the positions where to copy + // the alignment `infixAlign` into `align`. + + TAlignPos pos0 = infixAlignHBeginPosition // beginPosition(infixAlignH) // correct for infixes + - beginPosition(source(row(align, 0))) // ... + + beginPosition(row(infixAlign, 0)); // respect source clipping + + appendValue(viewPos, toViewPosition(row(align, 0), pos0)); + + TAlignPos pos1 = infixAlignVBeginPosition // beginPosition(infixAlignV) // correct for infixes + - beginPosition(source(row(align, 1))) // ... + + beginPosition(row(infixAlign, 1)); // respect source clipping + + appendValue(viewPos, toViewPosition(row(align, 1), pos1)); + + integrateAlign(align, infixAlign, viewPos); +} + +/////////////////////////////////////////////////////////////////////////////// +// Conducts the traceback on the extension to the left from best start position +// and writes the result into align. +template +void +_tracebackLeft(TMatrix const & matrixLeft, + TCoord const & coordinate, + std::vector const, InfixSegment>> const & sequencesLeft, + TBeginPosition const infixAlignHBeginPosition, + TBeginPosition const infixAlignVBeginPosition, + TDiagonal const diagLower, + TDiagonal const diagUpper, + TPos const endLeftH, + TPos const endLeftV, + TAlign & align) { + AlignTraceback traceBack; + _alignBandedNeedlemanWunschTrace(traceBack, sequencesLeft, matrixLeft, coordinate, + -diagUpper, -diagLower); + // upperDiagonal(seedOld) - upperDiagonal(seed), upperDiagonal(seedOld) - lowerDiagonal(seed)); + //std::cerr << "TRACEBACK\n"; + //for (unsigned i = 0; i < length(traceBack.tvs); ++i) + // std::cerr << (int)traceBack.tvs[i] << "\t" << traceBack.sizes[i] << "\n"; + //std::cerr << "---------\n"; + + reverse(traceBack.sizes); + reverse(traceBack.tvs); + + Align const, InfixSegment>> infixAlign; + resize(rows(infixAlign), 2); + assignSource(row(infixAlign, 0), infix(sequencesLeft[0], length(sequencesLeft[0]) - endLeftH, length(sequencesLeft[0]))); + assignSource(row(infixAlign, 1), infix(sequencesLeft[1], length(sequencesLeft[1]) - endLeftV, length(sequencesLeft[1]))); + + // std::cerr << "\nLEFT SEQS\n" << row(infixAlign, 0) << "\n" << row(infixAlign, 1) << "\n"; + _pumpTraceToGaps(row(infixAlign, 0), row(infixAlign, 1), traceBack); + // std::cerr << "INFIX ALIGN AFTER LEFT TRACEBACK\n\n" << infixAlign << "\n"; + // std::cerr << "ALIGN BEFORE INTEGRATION WITH INFIX ALIGN\n\n" << align << "\n"; + _copyInfixAlignmentIntoAlignment(align, infixAlign, infixAlignHBeginPosition, infixAlignVBeginPosition); + // std::cerr << "ALIGN AFTER INTEGRATION WITH INFIX ALIGN\n\n" << align << "\n"; +} + + +/////////////////////////////////////////////////////////////////////////////// +// Conducts the traceback on the extension to the right from best end position +// and writes the result into align. +template +void +_tracebackRight(TMatrix const & matrixRight, + TCoord const & coordinate, + std::vector const, InfixSegment>> const & sequencesRight, + TBeginPosition const infixAlignHBeginPosition, + TBeginPosition const infixAlignVBeginPosition, + TDiagonal const diagLower, + TDiagonal const diagUpper, + TPos const endRightH, + TPos const endRightV, + TAlign & align) { + AlignTraceback traceBack; + _alignBandedNeedlemanWunschTrace(traceBack, sequencesRight, matrixRight, coordinate, + -diagUpper, -diagLower); + // lowerDiagonal(seedOld) - upperDiagonal(seed), lowerDiagonal(seedOld) - lowerDiagonal(seed)); + //std::cerr << "TRACEBACK\n"; + //for (unsigned i = 0; i < length(traceBack.tvs); ++i) + // std::cerr << (int)traceBack.tvs[i] << "\t" << traceBack.sizes[i] << "\n"; + //std::cerr << "---------\n"; + + Align const, InfixSegment>> infixAlign; + resize(rows(infixAlign), 2); + assignSource(row(infixAlign, 0), infix(sequencesRight[0], 0, endRightH)); + assignSource(row(infixAlign, 1), infix(sequencesRight[1], 0, endRightV)); + + // std::cerr << "\nRIGHT SEQS\n" << row(infixAlign, 0) << "\n" << row(infixAlign, 1) << "\n"; + _pumpTraceToGaps(row(infixAlign, 0), row(infixAlign, 1), traceBack); + // std::cerr << "INFIX ALIGN AFTER RIGHT TRACEBACK\n\n" << infixAlign << "\n"; + // std::cerr << "ALIGN BEFORE INTEGRATION WITH INFIX ALIGN\n\n" << align << "\n"; + _copyInfixAlignmentIntoAlignment(align, infixAlign, infixAlignHBeginPosition, infixAlignVBeginPosition); + // std::cerr << "ALIGN AFTER INTEGRATION WITH INFIX ALIGN\n\n" << align << "\n"; +} + +/////////////////////////////////////////////////////////////////////////////// +// Computes the banded alignment matrix and fills a string with possible start +// and end positions of an eps-match. Determines the optimal start and end +// position for the longest eps-match and writes the trace into align. +template +bool +_bestExtension(Segment const & infH, // database + Segment const & infV, // query + TSeed const & seed, + TSeed const & seedOld, + TPos const alignLen, + TPos const alignErr, + TScore const & scoreMatrix, + TDir const direction, + TSize const minLength, + TEps const eps, + TAlign & align, + stellar_best_extension_time & best_extension_runtime) +{ + using TAlphabet = std::remove_cv::type; + using TOwningContainer = std::vector; + + typedef std::vector TAlignmentMatrix; + typedef ExtensionEndPosition TEndInfo; + typedef typename std::vector::const_iterator TEndIterator; + typedef typename Diagonal::Type TDiagonal; + + // variables for banded alignment and possible ends of match + TAlignmentMatrix matrixRight, matrixLeft; + std::vector possibleEndsLeft, possibleEndsRight; + + // new extension to the left of the old seed + assert(beginPositionH(seed) <= beginPositionH(seedOld)); // infixLeftH + assert(beginPositionV(seed) <= beginPositionV(seedOld)); // infixLeftV + + // old seed covers at least one character + assert(beginPositionH(seedOld) < endPositionH(seedOld)); + assert(beginPositionV(seedOld) < endPositionV(seedOld)); + + // new extension to the right of the old seed + assert(endPositionH(seedOld) <= endPositionH(seed)); // infixRightH + assert(endPositionV(seedOld) <= endPositionV(seed)); // infixRightV + + std::vector> sequencesLeft; + std::vector> sequencesRight; + + // Compute diagonals for updated seeds module with infixH/first alignment row being in the horizontal direction. + TDiagonal const diagLowerLeft = lowerDiagonal(seedOld) - upperDiagonal(seed); + TDiagonal const diagUpperLeft = lowerDiagonal(seedOld) - lowerDiagonal(seed); + + // Compute diagonals for updated seeds module with infixH/first alignment row being in the horizontal direction. + TDiagonal const diagLowerRight = upperDiagonal(seedOld) - upperDiagonal(seed); + TDiagonal const diagUpperRight = upperDiagonal(seedOld) - lowerDiagonal(seed); + + best_extension_runtime.banded_needleman_wunsch_time.measure_time([&]() + { + // fill banded matrix and gaps string for ... + if (direction == EXTEND_BOTH || direction == EXTEND_LEFT) { // ... extension to the left + // prepare copy segment... + //!TODO: can these be references instead of copies? + TOwningContainer segmentCopyLeftH; + TOwningContainer segmentCopyLeftV; + segmentCopyLeftH.reserve(beginPositionH(seedOld) - beginPositionH(seed)); + segmentCopyLeftV.reserve(beginPositionV(seedOld) - beginPositionV(seed)); + + best_extension_runtime.banded_needleman_wunsch_left_time.measure_time([&]() + { + // ...copy reverse complement segment + //!TODO: are seqan2 and seqan3 ranks compatible? + for (auto n : host(infH).subspan(beginPositionH(seed), beginPositionH(seedOld)) | std::views::reverse | seqan3::views::complement) + { + segmentCopyLeftH.emplace_back(n.to_rank()); + } + + for (auto n : host(infV).subspan(beginPositionV(seed), beginPositionV(seedOld)) | std::views::reverse | seqan3::views::complement) + { + segmentCopyLeftV.emplace_back(n.to_rank()); + } + + // put infix segments + sequencesLeft.emplace_back(infix(segmentCopyLeftH, 0, segmentCopyLeftH.size())); + sequencesLeft.emplace_back(infix(segmentCopyLeftV, 0, segmentCopyLeftV.size())); + + _fillMatrixBestEndsLeft(matrixLeft, possibleEndsLeft, sequencesLeft, diagLowerLeft, diagUpperLeft, scoreMatrix); + SEQAN_ASSERT_NOT(possibleEndsLeft.empty()); + }); // measure_time + } + else + possibleEndsLeft.emplace_back(TEndInfo()); + if (direction == EXTEND_BOTH || direction == EXTEND_RIGHT) { // ... extension to the right + best_extension_runtime.banded_needleman_wunsch_right_time.measure_time([&]() + { + // prepare copy segment... + //!TODO: can these be references instead of copies? + auto segmentCopyRightH = TOwningContainer(host(infH).begin() + endPositionH(seedOld), host(infH).begin() + endPositionH(seed)); + auto segmentCopyRightV = TOwningContainer(host(infV).begin() + endPositionV(seedOld), host(infV).begin() + endPositionV(seed)); + + sequencesRight.emplace_back(infix(segmentCopyRightH, 0, segmentCopyRightH.size())); + sequencesRight.emplace_back(infix(segmentCopyRightV, 0, segmentCopyRightV.size())); + _fillMatrixBestEndsRight(matrixRight, possibleEndsRight, sequencesRight, diagLowerRight, diagUpperRight, scoreMatrix); + SEQAN_ASSERT_NOT(possibleEndsRight.empty()); + }); // measure_time + } else + { + possibleEndsRight.emplace_back(TEndInfo()); + } + }); // measure_time + + // longest eps match on poss ends string + std::pair endPair = best_extension_runtime.longest_eps_match_time.measure_time([&]() + { + return longestEpsMatch(possibleEndsLeft, possibleEndsRight, alignLen, alignErr, minLength, eps); + }); + + if (endPair == std::pair(possibleEndsLeft.end(), possibleEndsRight.end())) { // no eps-match found + return false; + } + + // determine end positions of maximal eps-match in ... + TPos endLeftH = 0, endLeftV = 0; + TPos endRightH = 0, endRightV = 0; + if((*endPair.first).length != 0) { // ... extension to the left + endLeftV = (*endPair.first).coord.i1; + // correction for banded coordinates to unbanded: + if (diagLowerLeft >= 0) + endLeftV += (TPos)(diagLowerLeft); + endLeftH = endLeftV + (TPos)((*endPair.first).coord.i2 - diagUpperLeft); + } + if((*endPair.second).length != 0) { // ... extension to the right + endRightV = (*endPair.second).coord.i1; + // correction for banded coordinates to unbanded: + if (diagLowerRight >= 0) + endRightV += (TPos)(diagLowerRight); + endRightH = endRightV + (TPos)((*endPair.second).coord.i2 - diagUpperRight); + } + + // set begin and end positions of align + setBeginPosition(row(align, 0), beginPositionH(seedOld) - endLeftH); + setBeginPosition(row(align, 1), beginPositionV(seedOld) - endLeftV); + setEndPosition(row(align, 0), endPositionH(seedOld) + endRightH); + setEndPosition(row(align, 1), endPositionV(seedOld) + endRightV); + // setClippedBeginPosition(row(align, 0), beginPositionH(seedOld) - endLeftH); + // setClippedBeginPosition(row(align, 1), beginPositionV(seedOld) - endLeftV); + // setBeginPosition(row(align, 0), 0); + // setBeginPosition(row(align, 1), 0); + // setClippedEndPosition(row(align, 0), endPositionH(seedOld) + endRightH); + // setClippedEndPosition(row(align, 1), endPositionV(seedOld) + endRightV); + + best_extension_runtime.construct_seed_alignment_time.measure_time([&]() + { + // traceback through matrix from begin/end pos on ... + if((*endPair.first).length != 0) { // ... extension to the left + assert(direction == EXTEND_BOTH || direction == EXTEND_LEFT); + auto const infixAlignHBeginPosition = beginPositionH(seed) + length(sequencesLeft[0]) - endLeftH; + auto const infixAlignVBeginPosition = beginPositionV(seed) + length(sequencesLeft[1]) - endLeftV; + _tracebackLeft(matrixLeft, + (*endPair.first).coord, + sequencesLeft, + infixAlignHBeginPosition, + infixAlignVBeginPosition, + diagLowerLeft, + diagUpperLeft, + endLeftH, + endLeftV, + align); + } + if((*endPair.second).length != 0) { // ... extension to the right + assert(direction == EXTEND_BOTH || direction == EXTEND_RIGHT); + auto const infixAlignHBeginPosition = endPositionH(seedOld); + auto const infixAlignVBeginPosition = endPositionV(seedOld); + _tracebackRight(matrixRight, + (*endPair.second).coord, + sequencesRight, + infixAlignHBeginPosition, + infixAlignVBeginPosition, + diagLowerRight, + diagUpperRight, + endRightH, + endRightV, + align); + } + SEQAN_ASSERT_EQ(length(row(align, 0)), length(row(align, 1))); + }); // measure_time + + return true; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +void +integrateAlign(Align & align, + Align, InfixSegment>, TSpec2> const & infixAlign) { + typedef typename Size::Type TSize; + typedef typename Position >::Type>::Type TPos; + + String viewPos; + TPos pos; + for (TSize i = 0; i < length(rows(infixAlign)); ++i) { + pos = beginPosition(source(row(infixAlign, i))) + beginPosition(row(infixAlign, i)); + pos += beginPosition(host(source(row(infixAlign, i)))); + appendValue(viewPos, toViewPosition(row(align, i), pos)); + } + + // std::cerr << "HAHA infixAlign == \n" << row(infixAlign, 0) << "\n" << row(infixAlign, 1) << "\n"; + integrateAlign(align, infixAlign, viewPos); + // std::cerr << "HAHA infixAlign == \n" << row(infixAlign, 0) << "\n" << row(infixAlign, 1) << "\n"; +} + +/////////////////////////////////////////////////////////////////////////////// +// Conducts best X-drop extension and calls _bestExtension. +// After the call align contains the longest eps-Match that spans the eps-core (localAlign). +template +bool +_extendAndExtract(Align, InfixSegment> > const & localAlign, + TScoreValue const scoreDropOff, + TScore const & scoreMatrix, + Segment, InfixSegment> const & infH, + Segment, InfixSegment> const & infV, + ExtensionDirection const direction, + TSize const minLength, + TEps const eps, + TAlign & align, + stellar_extension_time & extension_runtime) { + typedef typename Position::Type TPos; + typedef Seed TSeed; + + //!NOTE: TSequence = std::span + + // std::cerr << "LOCAL ALIGN\n" << row(localAlign, 0) << "\n" << row(localAlign, 1) << "\n"; + // std::cerr << "ALIGN\n" << row(align, 0) << "\n" << row(align, 1) << "\n"; + integrateAlign(align, localAlign); + //std::cerr << __LINE__ << "\tLOCAL ALIGN\n" << row(localAlign, 0) << "\n" << row(localAlign, 1) << "\n"; + //std::cerr << __LINE__ << "\tALIGN\n" << row(align, 0) << "\n" << row(align, 1) << "\n"; + + // Get begin and end position of local alignment (seed) as source positions + // in underlying sequences. + TPos seedBeginH = beginPosition(row(localAlign, 0)) + beginPosition(infH); + TPos seedBeginV = beginPosition(row(localAlign, 1)) + beginPosition(infV); + TPos seedEndH = endPosition(row(localAlign, 0)) + beginPosition(infH); + TPos seedEndV = endPosition(row(localAlign, 1)) + beginPosition(infV); + + if (direction == EXTEND_NONE) { + // set begin and end positions of align + setBeginPosition(row(align, 0), seedBeginH); + setBeginPosition(row(align, 1), seedBeginV); + setEndPosition(row(align, 0), seedEndH); + setEndPosition(row(align, 1), seedEndV); + + if ((TSize)length(row(align, 0)) < minLength) + return false; + + longestEpsMatch(align, minLength, eps); + } else { + // gapped X-drop extension of local alignment (seed) + TSeed seed(seedBeginH, seedBeginV, seedEndH, seedEndV); + TSeed seedOld(seed); + + static_assert(std::is_same>::value, + "infH is a nested InfixSegment: Segment, InfixSegment>"); + Segment infixSequenceH = host(infH); // inner nested Segment + Segment infixSequenceV = host(infV); // inner nested Segment + extension_runtime.extend_seed_time.measure_time([&]() + { + extendSeed(seed, infixSequenceH, infixSequenceV, direction, scoreMatrix, scoreDropOff, GappedXDrop()); + }); + if (static_cast(seedSize(seed)) < minLength - (int)floor(minLength*eps)) + return false; + + // determine length and number of error columns of local alignment (seed) + TPos alignLen = _max(length(row(localAlign, 0)), length(row(localAlign, 1))); + TPos alignErr = 0; + for (TPos i = 0; i < alignLen; ++i) { + if (!isMatch(localAlign, i)) ++alignErr; + } + + // convert seeds from positions in host(seq) to positions in host(host(seq)) + setBeginPositionH(seedOld, beginPositionH(seedOld) + beginPosition(host(infH))); + setEndPositionH(seedOld, endPositionH(seedOld) + beginPosition(host(infH))); + setBeginPositionV(seedOld, beginPositionV(seedOld) + beginPosition(host(infV))); + setEndPositionV(seedOld, endPositionV(seedOld) + beginPosition(host(infV))); + setBeginPositionH(seed, beginPositionH(seed) + beginPosition(host(infH))); + setEndPositionH(seed, endPositionH(seed) + beginPosition(host(infH))); + setBeginPositionV(seed, beginPositionV(seed) + beginPosition(host(infV))); + setEndPositionV(seed, endPositionV(seed) + beginPosition(host(infV))); + + // determine best extension lengths and write the trace into align + Segment infixH = infix(infixSequenceH, beginPosition(infH), endPosition(infH)); + Segment infixV = infix(infixSequenceV, beginPosition(infV), endPosition(infV)); + + bool const found_extension = extension_runtime.best_extension_time.measure_time([&]() + { + return _bestExtension(infixH, infixV, seed, seedOld, alignLen, alignErr, scoreMatrix, direction, minLength, eps, align, extension_runtime.best_extension_time); + }); + if (!found_extension) + return false; + SEQAN_ASSERT_EQ(length(row(align, 0)), length(row(align, 1))); + } + SEQAN_ASSERT_EQ(length(row(align, 0)), length(row(align, 1))); + //std::cerr << "extracted alignment\n-------------\n" << align << "----------------\n"; + return true; +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/stellar_index.hpp b/include/dream_stellar/stellar_index.hpp new file mode 100644 index 00000000..236b7ecc --- /dev/null +++ b/include/dream_stellar/stellar_index.hpp @@ -0,0 +1,115 @@ +#pragma once + +#include + +#include + +#include +#include + +namespace dream_stellar +{ +using namespace seqan2; + +template , typename TInfixSegment = seqan2::Segment> +using StellarQGramStringSet = StringSet >; + +template +using StellarQGramIndex = Index const, IndexQGram >; + +template +using StellarSwiftPattern = Pattern, Swift >; + +template +using StellarSwiftFinder = Finder const, InfixSegment> const, Swift >; + +template +struct StellarIndex +{ + using TSequence = seqan2::String; + using TInfixSegment = seqan2::Segment const, seqan2::InfixSegment>; + using TQGramStringSet = StellarQGramStringSet; + + template + StellarIndex(StringSet const & queries, IndexOptions const & options) + : StellarIndex{convertImplStringSet(queries), options} + {} + + StellarIndex(StringSet const & queries, IndexOptions const & options) + : StellarIndex{convertSegmentStringSet(queries), options} + {} + + StellarIndex(std::span const & queries, IndexOptions const & options) + : StellarIndex{convertImplSpan(queries), options} + {} + + StellarIndex() = delete; + StellarIndex(StellarIndex &&) = delete; + StellarIndex(StellarIndex const &) = delete; + StellarIndex & operator=(StellarIndex &&) = delete; + StellarIndex & operator=(StellarIndex const &) = delete; + + void construct() + { + indexRequire(qgramIndex, QGramSADir()); + } + + StellarSwiftPattern createSwiftPattern() + { + return {qgramIndex}; + } + + static StellarQGramIndex & qgramIndexFromPattern(StellarSwiftPattern & pattern) + { + return host(pattern); + } + + static TQGramStringSet const & sequencesFromPattern(StellarSwiftPattern & pattern) + { + return sequencesFromQGramIndex(qgramIndexFromPattern(pattern)); + } + + static TQGramStringSet const & sequencesFromQGramIndex(StellarQGramIndex & index) + { + return indexText(index); + } + + TQGramStringSet const dependentQueries; + StellarQGramIndex qgramIndex; + +private: + template >> + StellarIndex(TOtherQGramStringSet && queries, IndexOptions const & options) + : dependentQueries{std::forward(queries)}, qgramIndex{dependentQueries} + { + resize(indexShape(qgramIndex), options.qGram); + cargo(qgramIndex).abundanceCut = options.qgramAbundanceCut; + } + + template + static TQGramStringSet convertImplStringSet(StringSet const & queries) + { + TQGramStringSet dependentQueries; + for (TSequence const & query: queries) + seqan2::appendValue(dependentQueries, seqan2::infix(query, 0, seqan2::length(query))); + + return dependentQueries; + } + + static TQGramStringSet convertSegmentStringSet(StringSet const & queries) + { + TQGramStringSet dependentQueries = queries; + return dependentQueries; + } + + static TQGramStringSet convertImplSpan(std::span const & queries) + { + TQGramStringSet dependentQueries; + for (TInfixSegment const & query: queries) + seqan2::appendValue(dependentQueries, query); + + return dependentQueries; + } +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/stellar_launcher.hpp b/include/dream_stellar/stellar_launcher.hpp new file mode 100644 index 00000000..6c28a371 --- /dev/null +++ b/include/dream_stellar/stellar_launcher.hpp @@ -0,0 +1,108 @@ +#pragma once + +#include +#include +#include + +namespace dream_stellar +{ + +template +void _postprocessQueryMatches(bool const databaseStrand, uint64_t const & refLen, + StellarOptions const & options, + std::vector const, TId> > > & matches, + std::vector & disabledQueryIDs) +{ + using TSequence = std::span; + + for (size_t queryID = 0; queryID < matches.size(); ++queryID) + { + if (matches.size() <= queryID) + throw std::runtime_error("Query ID=" + std::to_string(queryID) + " out of range [0;" + std::to_string(matches.size() - 1) + "]"); + + QueryMatches> & queryMatches = matches[queryID]; + + queryMatches.removeOverlapsAndCompactMatches(options.disableThresh, + /*compactThresh*/ 0, + options.minLength, + options.numMatches); + + if (queryMatches.disabled) + disabledQueryIDs.push_back(queryID); + } +} + +template , typename id_t = std::string> +struct StellarLauncher +{ + template + static constexpr StellarComputeStatistics _verificationMethodVisit( + StellarVerificationMethod verificationMethod, + visitor_fn_t && visitor_fn + ) + { + if (verificationMethod == StellarVerificationMethod{AllLocal{}}) + return visitor_fn(AllLocal()); + else if (verificationMethod == StellarVerificationMethod{BestLocal{}}) + return visitor_fn(BestLocal()); + return StellarComputeStatistics{}; + } + + static StellarComputeStatistics + search_and_verify( + jst::contrib::stellar_matcher & matcher, + StellarDatabaseSegment database_segment, + id_t const & databaseID, + query_id_map const & query_dict, + bool const databaseStrand, + StellarOptions & localOptions, // localOptions.compactThresh is out-param + stellar_kernel_runtime & strand_runtime, + std::vector > > & local_matches + ) + { + auto getQueryMatches = [&](jst::contrib::stellar_matcher const & stellar_matcher) + -> QueryMatches > & + { + if (local_matches.size() < stellar_matcher.curSeqNo()) + throw std::runtime_error{"Pattern defined incorrectly. Out of sequence range."}; + return local_matches[stellar_matcher.curSeqNo()]; + }; + + auto isPatternDisabled = [&](jst::contrib::stellar_matcher const & matcher) -> bool + { + QueryMatches > & queryMatches = getQueryMatches(matcher); + return queryMatches.disabled; + }; + + auto onAlignmentResult = [&](auto & alignment) -> bool { + QueryMatches > & queryMatches = getQueryMatches(matcher); + StellarMatch> const, id_t> match(alignment, databaseID, databaseStrand); + length(match); // DEBUG: Contains assertion on clipping. + + // success + return queryMatches.insertMatch( + match, + localOptions.minLength, + localOptions.disableThresh, + localOptions.compactThresh, /* out-parameter */ + localOptions.numMatches + ); + }; + + StellarComputeStatistics statistics = _verificationMethodVisit( + localOptions.verificationMethod, + [&](TTag tag) -> StellarComputeStatistics + { + SwiftHitVerifier swiftVerifier + { + STELLAR_DESIGNATED_INITIALIZER(.eps_match_options = , localOptions), + STELLAR_DESIGNATED_INITIALIZER(.verifier_options = , localOptions), + }; + + return _stellarKernel(matcher, database_segment, query_dict, localOptions, swiftVerifier, isPatternDisabled, onAlignmentResult, strand_runtime); + }); + return statistics; + } +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/stellar_output.hpp b/include/dream_stellar/stellar_output.hpp new file mode 100755 index 00000000..45b6df8d --- /dev/null +++ b/include/dream_stellar/stellar_output.hpp @@ -0,0 +1,396 @@ +#pragma once + +#include +#include + +#include // QueryMatches + +namespace dream_stellar +{ + +using namespace seqan2; + +/////////////////////////////////////////////////////////////////////////////// +// Computes the length adjustment for E-value computation +// Based on the NCBI BLAST code by Tom Madden. +template +TSize +_computeLengthAdjustment(TSize const dbLength, TSize const queryLength) { + + const double K = 0.34; + const double logK = log(K); + const double alphaByLambda = 1.8/1.19; + const double beta = -3; + const TSize maxIterations = 20; + + double n = (double)dbLength; + double m = (double)queryLength; + double totalLen; + + double val = 0, val_min = 0, val_max; + bool converged = false; + + /* Choose val_max to be the largest nonnegative value that satisfies + * K * (m - val) * (n - N * val) > max(m,n) + * Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */ + + { // scope of mb, and c, the coefficients in the quadratic formula (the variable mb is -b, a=1 ommited) + double mb = m + n; + double c = n * m - _max(m, n) / K; + + if(c < 0) { + return 0; + } else { + val_max = 2 * c / (mb + sqrt(mb * mb - 4 * c)); + } + } // end scope of mb and c + + for(TSize i = 1; i <= maxIterations; i++) { + totalLen = (m - val) * (n - val); + double val_new = alphaByLambda * (logK + log(totalLen)) + beta; // proposed next value of val + if(val_new >= val) { // val is no bigger than the true fixed point + val_min = val; + if(val_new - val_min <= 1.0) { + converged = true; + break; + } + if(val_min == val_max) { // There are no more points to check + break; + } + } else { // val is greater than the true fixed point + val_max = val; + } + if(val_min <= val_new && val_new <= val_max) { // ell_new is in range. Accept it. + val = val_new; + } else { // else val_new is not in range. Reject it. + val = (i == 1) ? val_max : (val_min + val_max) / 2; + } + } + + if(converged) { // the iteration converged + // If val_fixed is the (unknown) true fixed point, then we wish to set lengthAdjustment to floor(val_fixed). + // We assume that floor(val_min) = floor(val_fixed) + return (TSize) val_min; + + // But verify that ceil(val_min) != floor(val_fixed) + val = ceil(val_min); + if( val <= val_max ) { + totalLen = (m - val) * (n - val); + if(alphaByLambda * (logK + log(totalLen)) + beta >= val) { + // ceil(val_min) == floor(val_fixed) + return (TSize) val; + } + } + } else { // the iteration did not converge + // Use the best value seen so far. + return (TSize) val_min; + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Computes a CIGAR string and mutations from rows of StellarMatch. +template +void +_getCigarLine(TRow const & row0, TRow const & row1, TString & cigar, TString & mutations) { + typedef typename Size::Type TSize; + + TSize pos = 0; + + SEQAN_ASSERT_EQ(length(row0), length(row1)); + TSize dbEndPos = length(row0); + TSize queryEndPos = length(row1); + + bool first = true; + TSize readBasePos = pos + clippedBeginPosition(row1); + TSize readPos = 0; + while (pos < dbEndPos || pos < queryEndPos) { + int matched = 0; + int inserted = 0; + int deleted = 0; + while (pos != dbEndPos && pos != queryEndPos && + !isGap(row0, pos) && !isGap(row1, pos)) { + ++readPos; + if (value(row0, pos) != value(row1, pos)) { + if (first) first = false; + else mutations << ","; + mutations << readPos << seqan3::to_char(source(row1)[readBasePos]); + } + ++readBasePos; + ++pos; + ++matched; + } + if (matched > 0) cigar << matched << "M" ; + while (pos < dbEndPos && isGap(row1, pos)) { + ++pos; + ++deleted; + } + if (deleted > 0) cigar << deleted << "D"; + while (pos < queryEndPos && isGap(row0, pos)) { + ++pos; + ++readPos; + if (first) first = false; + else mutations << ","; + mutations << readPos << seqan3::to_char(source(row1)[readBasePos]); + ++readBasePos; + ++inserted; + } + if (inserted > 0) cigar << inserted << "I"; + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Determines the length and the number of matches of two alignment rows +template +inline void +_analyzeAlignment(TRow const & row0, TRow const & row1, TSize & aliLen, TSize & matches) { + TSize pos = 0; + SEQAN_ASSERT_EQ(length(row0), length(row1)); + TSize end0 = length(row0); + TSize end1 = length(row1); + + matches = 0; + while ((pos < end0) && (pos < end1)) { + if (!isGap(row0, pos) && !isGap(row1, pos)) { + if (value(row0, pos) == value(row1, pos)) { + ++matches; + } + } + ++pos; + } + + aliLen = _max(end0, end1); +} + +/////////////////////////////////////////////////////////////////////////////// +// Calculates the identity of two alignment rows (percentage of matching positions). +template +double +_computeIdentity(TRow const & row0, TRow const & row1) { + typedef typename Size::Type TSize; + TSize matches, aliLen; + _analyzeAlignment(row0, row1, aliLen, matches); + + return floor(1000000.0 * matches / aliLen) / 10000.0; +} + +/////////////////////////////////////////////////////////////////////////////// +// Calculates the E-value from two alignment rows and a specified length adjustment +template +double +_computeEValue(TRow const & row0, TRow const & row1, TSize const lengthAdjustment) { + TSize m = length(source(row0)) - lengthAdjustment; + TSize n = length(source(row1)) - lengthAdjustment; + double minusLambda = -1.19; // -lambda + double K = 0.34; + + TSize matches, aliLen; + _analyzeAlignment(row0, row1, aliLen, matches); + // score = 1 * matches - 2 * errors (mismatches or gaps) + // = matches - 2 * (aliLen - matches) + TSize score = matches - 2 * (aliLen - matches); + + return K * (double)m * (double)n * exp(minusLambda * (double)score); +} + +/////////////////////////////////////////////////////////////////////////////// +// Calculates the E-value for an alignment with the specified score and number +// of matches, and the specified length of query and database sequence +template +double +_computeEValue(TSize const score, TSize const len0, TSize const len1) { + double minusLambda = -1.19; // -lambda + double K = 0.34; + + TSize lengthAdjustment = _computeLengthAdjustment(len0, len1); + TSize m = len0 - lengthAdjustment; + TSize n = len1 - lengthAdjustment; + + return K * (double)m * (double)n * exp(minusLambda * (double)score); +} + +/////////////////////////////////////////////////////////////////////////////// +// Writes rows of a StellarMatch in gff format to a file. +template +void +_writeMatchGff(std::string const & databaseID, + TId const & patternID, + bool const databaseStrand, + TRow const & row0, + TRow const & row1, + TFile & file) { +//IOREV _recordreading_ unclear how this is related to GFF support from store_io + typedef typename Value::Type>::Type TAlphabet; + + auto id_first_token = databaseID.substr(0, databaseID.find(' ')); + file << id_first_token; + + file << "\tStellar"; + file << "\teps-matches"; + + if (databaseStrand) { + file << "\t" << beginPosition(row0) + beginPosition(source(row0)) + 1; + file << "\t" << endPosition(row0) + beginPosition(source(row0)); + } else { + file << "\t" << length(source(row0)) - (endPosition(row0) + beginPosition(source(row0))) + 1; + file << "\t" << length(source(row0)) - (beginPosition(row0) + beginPosition(source(row0))); + } + + file << "\t" << _computeIdentity(row0, row1); + + file << "\t" << (databaseStrand ? '+' : '-'); + + file << "\t.\t"; + for (typename Position::Type i = 0; i < length(patternID) && value(patternID, i) > 32; ++i) { + file << value(patternID, i); + } + + //file << ";seq2Length=" << length(source(row1)); + + file << ";seq2Range=" << beginPosition(row1) + beginPosition(source(row1)) + 1; + file << "," << endPosition(row1) + beginPosition(source(row1)); + + std::stringstream cigar, mutations; + _getCigarLine(row0, row1, cigar, mutations); + file << ";cigar=" << cigar.str(); + file << ";mutations=" << mutations.str(); + file << "\n"; +} + +/////////////////////////////////////////////////////////////////////////////// +// Writes rows of a StellarMatch in human readable format to file. +template +void +_writeMatch(std::string const & databaseID, + TId const & patternID, + bool const databaseStrand, + TRow const & row0, + TRow const & row1, + TFile & file) { +//IOREV _recordreading_ _stub_ + typedef typename Value::Type>::Type TAlphabet; + + // write database ID + file << "Database sequence: " << databaseID; + if (!databaseStrand) file << " (complement)" << std::endl; + else file << std::endl; + + // write database positions + file << "Database positions: "; + if (databaseStrand) { + file << beginPosition(row0) + beginPosition(source(row0)); + file << ".." << endPosition(row0) + beginPosition(source(row0)); + } else { + file << length(source(row0)) - beginPosition(row0) + beginPosition(source(row0)); + file << ".." << length(source(row0)) - endPosition(row0) + beginPosition(source(row0)); + } + file << std::endl; + + // write query ID + file << "Query sequence: " << patternID << std::endl; + + // write query positions + file << "Query positions: "; + file << beginPosition(row1) + beginPosition(source(row1)); + file << ".." << endPosition(row1) + beginPosition(source(row1)); + file << std::endl; + + file << std::endl; + + // write match + Align::Type> align; + appendValue(align.data_rows, row0); + appendValue(align.data_rows, row1); + file << align; + file << "----------------------------------------------------------------------\n" << std::endl; +} + +template +void _writeMatchesToGffFile(QueryMatches > const & queryMatches, + CharString const & id, bool const orientation, std::ofstream & outputFile) +{ + for (StellarMatch const & match : queryMatches.matches) { + if (match.orientation != orientation) + continue; + + _writeMatchGff(match.id, id, match.orientation, match.row1, match.row2, outputFile); + } +} + +template +void _writeMatchesToTxtFile(QueryMatches > const &queryMatches, + CharString const & id, bool const orientation, std::ofstream & outputFile) +{ + for (StellarMatch const & match : queryMatches.matches) { + if (match.orientation != orientation) + continue; + + _writeMatch(match.id, id, match.orientation, match.row1, match.row2, outputFile); + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Calls _writeMatchGff for each match in String of matches. +// = Writes matches in gff format to a file. +template +void _writeQueryMatchesToFile(QueryMatches > const & queryMatches, + CharString const & id, bool const orientation, CharString const & outputFormat, std::ofstream & outputFile) +{ + if (outputFormat == "gff") + _writeMatchesToGffFile(queryMatches, id, orientation, outputFile); + else + _writeMatchesToTxtFile(queryMatches, id, orientation, outputFile); +} + +/////////////////////////////////////////////////////////////////////////////// +// Calls _writeMatchGff for each match in StringSet of String of matches. +// = Writes matches in gff format to a file. +template +void _writeAllQueryMatchesToFile(std::vector > > const & matches, + TQueryIDs const & queryIDs, bool const orientation, + CharString const & outputFormat, std::ofstream & outputFile) +{ + for (size_t i = 0; i < matches.size(); i++) { + QueryMatches> const & queryMatches = matches[i]; + + _writeQueryMatchesToFile(queryMatches, queryIDs[i], orientation, outputFormat, outputFile); + } +} + +template +StellarOutputStatistics _computeOutputStatistics(std::vector > > const & matches) +{ + StellarOutputStatistics statistics{}; + + for (QueryMatches> const & queryMatches : matches) { + statistics.numMatches += queryMatches.matches.size(); + + if (queryMatches.disabled) + ++statistics.numDisabled; + + for (StellarMatch const & match : queryMatches.matches) { + size_t len = std::max(length(match.row1), length(match.row2)); + statistics.totalLength += len; + statistics.maxLength = std::max(statistics.maxLength, len); + } + } + + return statistics; +} + +template +void _appendDisabledQueryToFastaFile(CharString const & id, TQuery const & query, std::ofstream & disabledQueriesFile) +{ + disabledQueriesFile << ">" << id << "\n"; + disabledQueriesFile << query << "\n\n"; +} + +template +void _writeDisabledQueriesToFastaFile(std::vector const & disabledQueryIDs, TIds const & ids, TQueries const & queries, std::ofstream & disabledQueriesFile) +{ + assert(disabledQueriesFile.is_open()); + + for (size_t queryID : disabledQueryIDs) + _appendDisabledQueryToFastaFile(ids[queryID], queries[queryID], disabledQueriesFile); +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/stellar_query_segment.hpp b/include/dream_stellar/stellar_query_segment.hpp new file mode 100644 index 00000000..fc1744d4 --- /dev/null +++ b/include/dream_stellar/stellar_query_segment.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include + +namespace dream_stellar +{ + +template +struct StellarQuerySegment : public StellarSequenceSegment +{ + using TBase = StellarSequenceSegment; + + using typename TBase::TInfixSegment; + using TNestedPatternSegment = seqan2::Segment; + + using TBase::TBase; // import constructor + + std::span const & underlyingQuery() const & + { + return this->underlyingSequence(); + } + + TNestedPatternSegment asPatternSegment() const + { + std::span const & _query = underlyingQuery(); + auto patternInfix = this->asInfixSegment(); + + TInfixSegment const patternInfixSeq = infix(_query, 0, length(_query)); + return { + patternInfixSeq, + seqan2::beginPosition(patternInfix) - seqan2::beginPosition(patternInfixSeq), + seqan2::endPosition(patternInfix) - seqan2::beginPosition(patternInfixSeq) + }; + } +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/stellar_sequence_segment.hpp b/include/dream_stellar/stellar_sequence_segment.hpp new file mode 100644 index 00000000..3f350733 --- /dev/null +++ b/include/dream_stellar/stellar_sequence_segment.hpp @@ -0,0 +1,96 @@ +#pragma once + +#include + +namespace dream_stellar +{ + +template +struct StellarSequenceSegment +{ + using sequence_reference_t = std::span; + using TInfixSegment = seqan2::Segment; + + StellarSequenceSegment() = default; + + StellarSequenceSegment( + sequence_reference_t const & sequence, + size_t const beginPosition, + size_t const endPosition) + : _sequenceSegment{sequence, beginPosition, endPosition} + {} + + // do not allow temporaries + StellarSequenceSegment( + sequence_reference_t && sequence, + size_t const beginPosition, + size_t const endPosition) = delete; + + sequence_reference_t const & underlyingSequence() const + { + assert(_sequenceSegment.data_host); + return *(_sequenceSegment.data_host); + } + + size_t beginPosition() const + { + return seqan2::beginPosition(_sequenceSegment); + } + + size_t endPosition() const + { + return seqan2::endPosition(_sequenceSegment); + } + + std::pair interval() const + { + return {beginPosition(), endPosition()}; + } + + size_t size() const + { + assert(endPosition() >= beginPosition()); + return endPosition() - beginPosition(); + } + + TInfixSegment const & asInfixSegment() const + { + return _sequenceSegment; + } + + std::span as_span() const + { + return this->underlyingSequence().subspan(this->beginPosition() /* offset */, + this->endPosition() - this->beginPosition() /* count */); + } + + friend bool operator<(StellarSequenceSegment const & s1, StellarSequenceSegment const & s2) { return s1.compare_three_way(s2) < 0; } + friend bool operator<=(StellarSequenceSegment const & s1, StellarSequenceSegment const & s2) { return s1.compare_three_way(s2) <= 0; } + friend bool operator==(StellarSequenceSegment const & s1, StellarSequenceSegment const & s2) { return s1.compare_three_way(s2) == 0; } + friend bool operator!=(StellarSequenceSegment const & s1, StellarSequenceSegment const & s2) { return s1.compare_three_way(s2) != 0; } + friend bool operator>(StellarSequenceSegment const & s1, StellarSequenceSegment const & s2) { return s1.compare_three_way(s2) > 0; } + friend bool operator>=(StellarSequenceSegment const & s1, StellarSequenceSegment const & s2) { return s1.compare_three_way(s2) >= 0; } + +private: + + std::ptrdiff_t compare_three_way(StellarSequenceSegment const & otherSegment) const + { + std::ptrdiff_t diff = std::addressof(this->underlyingSequence()) + - std::addressof(otherSegment.underlyingSequence()); + + if (diff != 0) + return diff; + + diff = static_cast(this->beginPosition()) - static_cast(otherSegment.beginPosition()); + + if (diff != 0) + return diff; + + diff = static_cast(this->endPosition()) - static_cast(otherSegment.endPosition()); + return diff; + } + + TInfixSegment _sequenceSegment; +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/stellar_types.hpp b/include/dream_stellar/stellar_types.hpp new file mode 100644 index 00000000..d2f5e796 --- /dev/null +++ b/include/dream_stellar/stellar_types.hpp @@ -0,0 +1,402 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include + +#if __cpp_designated_initializers || __GNUC__ >= 8 +# define STELLAR_DESIGNATED_INITIALIZER(designator, value) designator value +#else +# define STELLAR_DESIGNATED_INITIALIZER(designator, value) value +#endif // __cpp_designated_initializers || __GNUC__ >= 8 + +namespace dream_stellar +{ + +using namespace seqan2; + +struct StellarStatistics +{ + bool kMerComputed{}; + unsigned kMerLength{}; + unsigned smin{}; + int threshold{}; + int distanceCut{}; + int delta{}; + int overlap{}; + + StellarStatistics(StellarOptions const & options) + { + if (!options.epsilon.is_proper()) + throw std::domain_error{"Epsilon must be between >= 0.0 and < 1.0."}; + + size_t n0 = options.minLength; // min length + size_t e0 = StellarOptions::absoluteErrors(options.epsilon, n0); + // nearest length (after min length) that has exactly e0 + 1 many absolute errors + size_t n1 = StellarOptions::minLengthWithExactError(e0 + 1, options.epsilon); + size_t e1 = e0 + 1; + + unsigned smin0 = StellarOptions::pigeonholeLemma(n0, e0); + unsigned smin1 = StellarOptions::pigeonholeLemma(n1, e1); + smin = (unsigned) std::min(smin0, smin1); + + assert(n1 >= n0); + assert(options.epsilon.numerator() == 0 || StellarOptions::absoluteErrors(options.epsilon, n1) == e1); + + kMerLength = options.qGram; + kMerComputed = options.qGram == (size_t)-1; + + if (kMerComputed) + kMerLength = std::min(std::max(1u, smin), 32u); + + if (kMerLength > (unsigned)options.minLength) + throw std::domain_error{"qGram must be smaller than minLength."}; + + size_t threshold0 = StellarOptions::kmerLemma(n0, kMerLength, e0); + size_t threshold1 = StellarOptions::kmerLemma(n1, kMerLength, e1); + threshold = std::max(size_t{1u}, std::min(threshold0, threshold1)); + + overlap = (int) floor((2 * threshold + kMerLength - 3) / (1 / (double)options.epsilon - kMerLength)); + distanceCut = (threshold - 1) + kMerLength * overlap + kMerLength; + int logDelta = _max(4, (int) ceil(log((double)overlap + 1) / log(2.0))); + delta = 1 << logDelta; + } +}; + +struct StellarComputeStatistics +{ + size_t numSwiftHits = 0; + + size_t maxLength = 0; + size_t totalLength = 0; + + void mergeIn(StellarComputeStatistics const & statistics) + { + this->numSwiftHits += statistics.numSwiftHits; + this->totalLength += statistics.totalLength; + this->maxLength = std::max(this->maxLength, statistics.maxLength); + } +}; + +struct StellarOutputStatistics +{ + size_t maxLength{0u}; + size_t totalLength{0u}; + size_t numMatches{0u}; + size_t numDisabled{0u}; + + void mergeIn(StellarOutputStatistics const & statistics) + { + maxLength = std::max(maxLength, statistics.maxLength); + totalLength = totalLength + statistics.totalLength; + numMatches = numMatches + statistics.numMatches; + numDisabled = numDisabled + statistics.numDisabled; + } +}; + +struct StellarComputeStatisticsCollection +{ + StellarComputeStatistics const & operator[](size_t const databaseRecordID) const + { + return _statistics[databaseRecordID]; + } + + void addStatistics(StellarComputeStatistics const & computeStatistics) + { + _statistics.push_back(computeStatistics); + } + + size_t size() const + { + return _statistics.size(); + } +private: + std::vector _statistics; // one per database +}; + + +/////////////////////////////////////////////////////////////////////////////// +// Container for storing a local alignment match +template const>, typename TId_ = std::string> +struct StellarMatch { + static_assert(std::is_const::value, "Sequence must be const qualified! I.e. StellarMatch<... const, ...>"); + typedef TSequence_ TSequence; + typedef TId_ TId; + + //!TODO: can std::span be aligned + /* + if (std::same_as const>>::value) + typedef typename TSequence::size_type TPos; + else + typedef typename Position::Type TPos; + */ + typedef size_t TPos; + + typedef Align>, ArrayGaps> TAlign; + typedef typename Row::Type TRow; + static const TId INVALID_ID; + + TId id; // database ID + bool orientation; + TPos begin1; + TPos end1; + TRow row1; + + TPos begin2; + TPos end2; + TRow row2; + + StellarMatch() : id(), orientation(false), begin1(0), end1(0), begin2(0), end2(0) + {} + + template + StellarMatch(TAlign & _align, TId _id, bool _orientation) + { + id = _id; + orientation = _orientation; + row1 = row(_align, 0); + row2 = row(_align, 1); + + begin1 = beginPosition(row1); + end1 = endPosition(row1); + + begin2 = beginPosition(row2); + end2 = endPosition(row2); + } +}; + +template +const TId +StellarMatch::INVALID_ID = "###########"; + +/////////////////////////////////////////////////////////////////////////////// +// Container for storing local alignment matches of one query sequence +template +struct QueryMatches { + typedef typename TMatch_::TSequence::size_type TSize; + + std::vector matches; + bool disabled; + + QueryMatches() : disabled(false) + {} + + inline bool removeOverlapsAndCompactMatches(size_t const minLength, + size_t const disableThresh, + size_t const compactThresh, + size_t const numMatches) + { + if (this->disabled) + return false; + + size_t const matchesCount = (this->matches).size(); + + if (matchesCount > disableThresh) { + this->disabled = true; + (this->matches).clear(); + return false; + } + + if (matchesCount <= compactThresh) + return false; + + //!TODO: reimplement for std::vector + //maskOverlaps(this->matches, minLength); // remove overlaps and duplicates + //compactMatches(this->matches, numMatches); // keep only the longest matches + return true; + } + + /////////////////////////////////////////////////////////////////////////////// + // Appends a match to matches container and removes overlapping matches if threshold is reached. + template + inline bool insertMatch(StellarMatch const & match, + TSize const minLength, + TSize1 const disableThresh, + TSize1 & compactThresh, + TSize1 const numMatches) { + + matches.emplace_back(match); + + // std::cerr << "Inserting match \n-------------\n" << match.row1 <<"\n" << match.row2 << "----------------\n"; + + if (removeOverlapsAndCompactMatches(minLength, disableThresh, compactThresh, numMatches)) + { + // raise compact threshold if many matches are kept + if ((matches.size() << 1) > compactThresh) + compactThresh += (compactThresh >> 1); + } + return true; + } + + /////////////////////////////////////////////////////////////////////////////// + // Appends a match to matches container and removes overlapping matches if threshold is reached. + template + inline bool insertMatch(StellarMatch> const, TId> const & match, + TSize const minLength, + TSize1 const disableThresh, + TSize1 & compactThresh, + TSize1 const numMatches) { + + StellarMatch const> const, std::string> backconverted_match{}; + backconverted_match.id = match.id; + backconverted_match.orientation = match.orientation; + backconverted_match.begin1 = match.begin1; + backconverted_match.end1 = match.end1; + backconverted_match.row1 = match.row1; + + backconverted_match.begin2 = match.begin2; + backconverted_match.end2 = match.end2; + backconverted_match.row2 = match.row2; + + matches.emplace_back(backconverted_match); + + // std::cerr << "Inserting match \n-------------\n" << match.row1 <<"\n" << match.row2 << "----------------\n"; + + if (removeOverlapsAndCompactMatches(minLength, disableThresh, compactThresh, numMatches)) + { + // raise compact threshold if many matches are kept + if ((matches.size() << 1) > compactThresh) + compactThresh += (compactThresh >> 1); + } + return true; + } +}; + +/////////////////////////////////////////////////////////////////////////////// + + +/////////////////////////////////////////////////////////////////////////////// +// to sort matches by position and remove overlapping matches +template +struct LessPos : public ::std::function { + LessPos() {} + + inline int compare(TMatch const & a, TMatch const & b) const { + // query number + if ((a.id) < (b.id)) return -1; + if ((a.id) > (b.id)) return 1; + + // database begin position + typename TMatch::TPos aBegin1 = _min(a.begin1, a.end1); + typename TMatch::TPos bBegin1 = _min(b.begin1, b.end1); + if (aBegin1 < bBegin1) return -1; + if (aBegin1 > bBegin1) return 1; + + // database end position + typename TMatch::TPos aEnd1 = _max(a.begin1, a.end1); + typename TMatch::TPos bEnd1 = _max(b.begin1, b.end1); + if (aEnd1 < bEnd1) return -1; + if (aEnd1 > bEnd1) return 1; + + // query begin position + typename TMatch::TPos aBegin2 = _min(a.begin2, a.end2); + typename TMatch::TPos bBegin2 = _min(b.begin2, b.end2); + if (aBegin2 < bBegin2) return -1; + if (aBegin2 > bBegin2) return 1; + + // query end position + typename TMatch::TPos aEnd2 = _max(a.begin2, a.end2); + typename TMatch::TPos bEnd2 = _max(b.begin2, b.end2); + if (aEnd2 < bEnd2) return -1; + if (aEnd2 > bEnd2) return 1; + + // orientation + if (a.orientation > b.orientation) return -1; + if (a.orientation < b.orientation) return 1; + + //// orientation + //bool oa = a.begin1 < a.end1; + //bool ob = b.begin1 < b.end1; + //if (oa != ob) return oa; + + return 0; + } + + inline bool operator() (TMatch const & a, TMatch const & b) const { + return compare(a, b) == -1; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// to sort matches by length +template +struct LessLength : public ::std::function { + LessLength() {} + + inline int compare(TMatch const & a, TMatch const & b) const { + if (a.id == TMatch::INVALID_ID) return 1; + if (b.id == TMatch::INVALID_ID) return -1; + + typename TMatch::TPos aLength = abs((int)a.end1 - (int)a.begin1); + typename TMatch::TPos bLength = abs((int)b.end1 - (int)b.begin1); + + if (aLength < bLength) return 1; + if (aLength > bLength) return -1; + + return 0; + } + + inline bool operator() (TMatch const & a, TMatch const & b) const { + return compare(a, b) == -1; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Determines whether match1 is upstream of match2 in specified row. +// If matches overlap, the non-overlapping parts have to be longer than minLenght. +template +inline bool +_isUpstream(StellarMatch & match1, StellarMatch & match2, TRowNo row, TSize minLength) { + typedef typename StellarMatch::TPos TPos; + + TPos e1, b2; + if (row == 0) { + e1 = match1.end1; + b2 = match2.begin1; + } else { + e1 = match1.end2; + b2 = match2.begin2; + } + + if (e1 <= b2) return true; + + TPos b1, e2; + if (row == 0) { + e2 = match2.end1; + b1 = match1.begin1; + } else { + e2 = match2.end2; + b1 = match1.begin2; + } + + if (b1 < b2 && (b2 - b1 >= minLength)) { + if ((e1 < e2) && (e2 - e1 >= minLength)) return true; + } + + return false; +} + +/////////////////////////////////////////////////////////////////////////////// +// sorts StellarMatchees by specified functor +template +inline void +sortMatches(String > & stellarMatches, TFunctorLess const & less) { + std::stable_sort( + begin(stellarMatches, Standard()), + end(stellarMatches, Standard()), + less); +} + +/////////////////////////////////////////////////////////////////////////////// +// returns the length of the longer row from StellarMatch +template +inline typename Size::Type +length(StellarMatch & match) { + return _max(length(match.row1), length(match.row2)); +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/utils/fraction.hpp b/include/dream_stellar/utils/fraction.hpp new file mode 100644 index 00000000..80aebd74 --- /dev/null +++ b/include/dream_stellar/utils/fraction.hpp @@ -0,0 +1,288 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace dream_stellar::utils +{ + +struct fraction +{ + using difference_t = std::ptrdiff_t; + + fraction() = default; + + constexpr explicit fraction(uint32_t const precision_limit) : + _numerator{1} + { + _limiter = std::max(_limiter, precision_limit); + _denominator = limiter(); + if (_denominator == 0u) + throw std::domain_error{"denominator shouldn't be zero."}; + } + + template + constexpr explicit fraction(difference_t const numerator, TDenominator const denominator, std::enable_if_t, int> = 0) : + _numerator{numerator}, + _denominator{denominator} + { + if (_denominator == 0u) + throw std::domain_error{"denominator can't be zero."}; + } + + template + constexpr fraction(difference_t const numerator, TDenominator const denominator, std::enable_if_t, int> = 0) : + fraction{_abs(numerator), static_cast(_abs(denominator))} + { + if (numerator < 0) + _numerator = -_numerator; + + if (denominator < 0) + _numerator = -_numerator; + } + + static fraction from_string(std::string_view const str) + { + using double_t = long double; + char * sentinel = nullptr; + + std::cmatch simple_float_match; + std::regex simple_float_regex{"^([+-]|)([0-9]+)(.([0-9]+)|)$"}; + + static constexpr size_t float_sign_id = 1; + static constexpr size_t float_integer_id = 2; + static constexpr size_t float_fraction_id = 4; + + if (std::regex_search(str.begin(), str.end(), simple_float_match, simple_float_regex)) + { + std::string float_fraction_str = simple_float_match[float_fraction_id]; + + // NOTE: all digits without dot + std::string float_significand_str = std::string{simple_float_match[float_integer_id]} + float_fraction_str; + + auto parse_int = [](std::string_view const str) -> size_t + { + char * str_end; + return std::strtoll(str.data(), &str_end, 10); + }; + + difference_t numerator = parse_int(float_significand_str); + std::size_t denominator = 1u; + for (std::size_t exponent = float_fraction_str.size(); exponent > 0u; --exponent) + denominator *= 10; + + if (simple_float_match[float_sign_id].compare("-") == 0) + numerator = -numerator; + + return {numerator, static_cast(denominator)}; + } + + double_t value = std::strtold(str.data(), &sentinel); + return from_double(value); + } + + static fraction from_double_with_limit(long double value, uint32_t const precision_limit) + { + // Applying a lower bound to the error rate because values close to 0 increase runtime dramatically + if (abs(value) < (1.0 / (precision_limit * 1.05))) + { + return fraction(precision_limit); + } + else + return from_double(value); + } + + static fraction from_double(long double value) + { + constexpr size_t max_iterations = 400; + + int exponent{0u}; + double_t normalized_value = std::frexp(value, &exponent); + + for (size_t i = 0; i < max_iterations && normalized_value != std::floor(normalized_value); ++i) + { + // otherwise over/underflows + if (exponent <= -62 || exponent >= 62) + break; + + normalized_value *= 2.0; + exponent--; + } + + difference_t numerator = std::llround(normalized_value); + difference_t denominator = 1; + + if (exponent > 0) // normalized_value * 2^exponent + { + numerator <<= exponent; + } else + { + denominator <<= -exponent; + } + return fraction(numerator, denominator); + } + + constexpr operator double() const + { + return static_cast(_numerator) / _denominator; + } + + constexpr difference_t numerator() const + { + return _numerator; + } + + constexpr size_t denominator() const + { + return _denominator; + } + + constexpr uint32_t limiter() const + { + return _limiter; + } + + constexpr bool is_proper() const + { + // i.e. proper: 1/3, 2/3, -1/3, inproper: 3/2, 2/2, -14/3 + return _abs(_numerator) < static_cast(_denominator); + } + + constexpr friend difference_t floor(fraction a) + { + difference_t i = a.numerator() / static_cast(a.denominator()); + difference_t m = a.numerator() % static_cast(a.denominator()); + return i + (m < 0 ? -1 : 0); + } + + constexpr friend difference_t ceil(fraction a) + { + difference_t value = a.numerator() + a.denominator() - 1; + difference_t i = value / static_cast(a.denominator()); + difference_t m = value % static_cast(a.denominator()); + return i + (m < 0 ? -1 : 0); + } + + constexpr friend fraction abs(fraction a) + { + return fraction{_abs(a.numerator()), a.denominator()}; + } + + constexpr friend fraction inverse(fraction a) + { + return fraction{static_cast(a.denominator()), a.numerator()}; + } + + constexpr friend fraction operator/(fraction a, fraction b) + { + return a * inverse(b); + } + + constexpr friend fraction operator*(fraction a, fraction b) + { + return fraction{a.numerator() * b.numerator(), a.denominator() * b.denominator()}; + } + + constexpr friend fraction operator-(fraction a, fraction b) + { + return fraction{ + _numerator_times_denominator(a, b) - _numerator_times_denominator(b, a), + a.denominator() * b.denominator() + }; + } + + constexpr fraction limit_denominator(size_t max_denominator=1000000) const + { + // https://stackoverflow.com/questions/17537613/does-python-have-a-function-to-reduce-fractions + // https://hg.python.org/cpython/file/822c7c0d27d1/Lib/fractions.py#l211 + if (_denominator <= max_denominator) + return *this; + + bool is_negative = _numerator < 0; + + size_t p0 = 0; + size_t q0 = 1; + size_t p1 = 1; + size_t q1 = 0; + size_t n = _abs(_numerator); + size_t d = _denominator; + + while (true) + { + if (d == 0) + break; + + size_t a = n / d; + size_t q2 = q0 + a * q1; + size_t p2 = p0 + a * p1; + + if (q2 > max_denominator) + break; + + p0 = p1; + q0 = q1; + p1 = p2; + q1 = q2; + + assert(n >= a * d); + + size_t new_d = n - a * d; + n = d; + d = new_d; + } + + assert(max_denominator >= q0); + size_t k = (max_denominator - q0) / q1; + + size_t p2 = p0 + k * p1; + size_t q2 = q0 + k * q1; + + assert(q2 != 0); + assert(q1 != 0); + fraction bound1{static_cast(p2) * (is_negative ? -1 : 1), q2}; + fraction bound2{static_cast(p1) * (is_negative ? -1 : 1), q1}; + + // abs(bound1 - *this) + difference_t numerator1 = _abs(_numerator_times_denominator(bound1, *this) - _numerator_times_denominator(*this, bound1)); + // abs(bound2 - *this) + difference_t numerator2 = _abs(_numerator_times_denominator(bound2, *this) - _numerator_times_denominator(*this, bound2)); + if (numerator2 <= numerator1) + return bound2; + else + return bound1; + } + + template< class CharT, class Traits> + friend std::basic_ostream & + operator<<(std::basic_ostream & os, fraction const & fraction) + { + if (fraction.denominator() != fraction.limiter()) + os << ((double)fraction.numerator() / fraction.denominator()) << " = (" << fraction.numerator() << "/" << fraction.denominator() << ")"; + else + os << "0 = (0/1)"; // if floating point value <10e6 treat it as 0 + return os; + } + +private: + constexpr static difference_t _numerator_times_denominator(fraction a, fraction b) + { + return a.numerator() * static_cast(b.denominator()); + } + + // std::abs is not constexpr + constexpr static auto _abs = [](auto a) { return a < 0 ? -a : a; }; + + /* + The longest allowed min local match length is 1000 which means that the smallest meaningful error rate is 1 / 1000. + Adding a 5% allowance to account for floating point inexactness the _limiter gives a lower bound for the error rate. + It is crucial to have a lower bound for the error rate because values close to 0 have a detrimental effect on the runtime. + */ + uint32_t _limiter{1050u}; + difference_t _numerator{0u}; + size_t _denominator{1u}; +}; + +} // namespace dream_stellar::utils diff --git a/include/dream_stellar/utils/stellar_app_runtime.hpp b/include/dream_stellar/utils/stellar_app_runtime.hpp new file mode 100644 index 00000000..f57edd4c --- /dev/null +++ b/include/dream_stellar/utils/stellar_app_runtime.hpp @@ -0,0 +1,105 @@ +#pragma once + +#include +#include + +namespace dream_stellar +{ + +struct stellar_strand_time : public stellar_runtime +{ + stellar_kernel_runtime prefiltered_stellar_time{}; + stellar_runtime post_process_eps_matches_time{}; + stellar_runtime output_eps_matches_time{}; + + stellar_runtime total_time() const + { + stellar_runtime total{}; + total.manual_timing( + prefiltered_stellar_time._runtime + + post_process_eps_matches_time._runtime + + output_eps_matches_time._runtime); + + return total; + } +}; + +template +void _print_stellar_strand_time(stellar_strand_time const & strand_runtime, std::string strandDirection, TStream & outStr) +{ + stellar_kernel_runtime const & prefiltered_stellar_time + = strand_runtime.prefiltered_stellar_time; + stellar_verification_time const & verification_time + = strand_runtime.prefiltered_stellar_time.verification_time; + stellar_extension_time const & extension_time + = verification_time.extension_time; + stellar_best_extension_time const & best_extension_time + = extension_time.best_extension_time; + + outStr << " + Prefiltered Stellar Time (" << strandDirection << "): " << prefiltered_stellar_time.milliseconds() << "ms" << std::endl; + outStr << " + Swift Filter Time (" << strandDirection << "): " << prefiltered_stellar_time.swift_filter_time.milliseconds() << "ms" << std::endl; + outStr << " + Seed Verification Time (" << strandDirection << "): " << verification_time.milliseconds() << "ms" << std::endl; + outStr << " + Find Next Local Alignment Time (" << strandDirection << "): " << verification_time.next_local_alignment_time.milliseconds() << "ms" << std::endl; + outStr << " + Split At X-Drops Time (" << strandDirection << "): " << verification_time.split_at_x_drops_time.milliseconds() << "ms" << std::endl; + outStr << " + Extension Time (" << strandDirection << "): " << extension_time.milliseconds() << "ms" << std::endl; + outStr << " + Extend Seed Time (" << strandDirection << "): " << extension_time.extend_seed_time.milliseconds() << "ms" << std::endl; + outStr << " + Best Extension Time (" << strandDirection << "): " << best_extension_time.milliseconds() << "ms" << std::endl; + outStr << " + Banded Needleman-Wunsch Time (" << strandDirection << "): " << best_extension_time.banded_needleman_wunsch_time.milliseconds() << "ms" << std::endl; + outStr << " + Banded Needleman-Wunsch (Left Extension) Time (" << strandDirection << "): " << best_extension_time.banded_needleman_wunsch_left_time.milliseconds() << "ms" << std::endl; + outStr << " + Banded Needleman-Wunsch (Right Extension) Time (" << strandDirection << "): " << best_extension_time.banded_needleman_wunsch_right_time.milliseconds() << "ms" << std::endl; + outStr << " + Longest EPS Match Time (" << strandDirection << "): " << best_extension_time.longest_eps_match_time.milliseconds() << "ms" << std::endl; + outStr << " + Construct Alignment Time (" << strandDirection << "): " << best_extension_time.construct_seed_alignment_time.milliseconds() << "ms" << std::endl; + outStr << " = total time: " << best_extension_time.total_time().milliseconds() << "ms" << std::endl; + outStr << " = total time: " << extension_time.total_time().milliseconds() << "ms" << std::endl; + outStr << " = total time: " << verification_time.total_time().milliseconds() << "ms" << std::endl; + outStr << " = total time: " << prefiltered_stellar_time.total_time().milliseconds() << "ms" << std::endl; + outStr << " + Post-Process Eps-Matches Time (" << strandDirection << "): " << strand_runtime.post_process_eps_matches_time.milliseconds() << "ms" << std::endl; + outStr << " + File Output Eps-Matches Time (" << strandDirection << "): " << strand_runtime.output_eps_matches_time.milliseconds() << "ms" << std::endl; + outStr << " = total time: " << strand_runtime.total_time().milliseconds() << "ms" << std::endl; +}; + + +struct stellar_app_runtime : public stellar_runtime +{ + stellar_runtime input_queries_time{}; + stellar_runtime input_databases_time{}; + stellar_runtime swift_index_construction_time{}; + stellar_strand_time forward_strand_stellar_time{}; + stellar_runtime reverse_complement_database_time{}; + stellar_strand_time reverse_strand_stellar_time{}; + stellar_runtime output_disabled_queries_time{}; + + stellar_runtime total_time() const + { + stellar_runtime total{}; + total.manual_timing( + input_queries_time._runtime + + input_databases_time._runtime + + swift_index_construction_time._runtime + + forward_strand_stellar_time._runtime + + reverse_complement_database_time._runtime + + reverse_strand_stellar_time._runtime + + output_disabled_queries_time._runtime); + + return total; + } +}; + +template +void _print_stellar_app_time(stellar_app_runtime const & stellar_time, TStream & strOut) +{ + strOut << "Running time: " << stellar_time.milliseconds() << "ms" << std::endl; + strOut << " * Stellar Application Time: " << stellar_time.milliseconds() << "ms" << std::endl; + strOut << " + File Input Queries Time: " << stellar_time.input_queries_time.milliseconds() << "ms" << std::endl; + strOut << " + File Input Databases Time: " << stellar_time.input_databases_time.milliseconds() << "ms" << std::endl; + strOut << " + SwiftFilter Construction Time: " << stellar_time.swift_index_construction_time.milliseconds() << "ms" << std::endl; + strOut << " + Stellar Forward Strand Time: " << stellar_time.forward_strand_stellar_time.milliseconds() << "ms" << std::endl; + _print_stellar_strand_time(stellar_time.forward_strand_stellar_time, "Forward", strOut); + strOut << " + Database Reverse Complement Time: " << stellar_time.reverse_complement_database_time.milliseconds() << "ms" << std::endl; + strOut << " + Stellar Reverse Strand Time: " << stellar_time.reverse_strand_stellar_time.milliseconds() << "ms" << std::endl; + _print_stellar_strand_time(stellar_time.reverse_strand_stellar_time, "Reverse", strOut); + strOut << " + File Output Disabled Queries Time: " << stellar_time.output_disabled_queries_time.milliseconds() << "ms" << std::endl; + strOut << " = total time: " << stellar_time.total_time().milliseconds() << "ms" << std::endl; +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/utils/stellar_kernel_runtime.hpp b/include/dream_stellar/utils/stellar_kernel_runtime.hpp new file mode 100644 index 00000000..069aa0ba --- /dev/null +++ b/include/dream_stellar/utils/stellar_kernel_runtime.hpp @@ -0,0 +1,78 @@ + #pragma once + +#include + +namespace dream_stellar +{ + +struct stellar_best_extension_time : public stellar_runtime +{ + stellar_runtime banded_needleman_wunsch_time; + stellar_runtime banded_needleman_wunsch_left_time; + stellar_runtime banded_needleman_wunsch_right_time; + stellar_runtime longest_eps_match_time; + stellar_runtime construct_seed_alignment_time; + + stellar_runtime total_time() const + { + stellar_runtime total{}; + total.manual_timing( + banded_needleman_wunsch_time._runtime + + longest_eps_match_time._runtime + + construct_seed_alignment_time._runtime); + + return total; + } +}; + +struct stellar_extension_time : public stellar_runtime +{ + stellar_runtime extend_seed_time; + stellar_best_extension_time best_extension_time; + + stellar_runtime total_time() const + { + stellar_runtime total{}; + total.manual_timing( + extend_seed_time._runtime + + best_extension_time._runtime); + + return total; + } +}; + +struct stellar_verification_time : public stellar_runtime +{ + stellar_runtime next_local_alignment_time; + stellar_runtime split_at_x_drops_time; + stellar_extension_time extension_time; + + stellar_runtime total_time() const + { + stellar_runtime total{}; + total.manual_timing( + next_local_alignment_time._runtime + + split_at_x_drops_time._runtime + + extension_time._runtime); + + return total; + } +}; + +struct stellar_kernel_runtime : public stellar_runtime +{ + stellar_runtime swift_filter_time{}; + stellar_verification_time verification_time{}; + + stellar_runtime total_time() const + { + stellar_runtime total{}; + total.manual_timing( + swift_filter_time._runtime + + verification_time._runtime); + + return total; + } +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/utils/stellar_runtime.hpp b/include/dream_stellar/utils/stellar_runtime.hpp new file mode 100644 index 00000000..3c891700 --- /dev/null +++ b/include/dream_stellar/utils/stellar_runtime.hpp @@ -0,0 +1,56 @@ + +#pragma once + +#include + +namespace dream_stellar +{ + +struct stellar_runtime +{ + using clock_t = std::chrono::steady_clock; + using time_point_t = typename clock_t::time_point; + using duration_t = typename clock_t::duration; + + template + auto measure_time(execute_fn && execute) + { + constexpr bool has_return = !std::is_same_v; + auto start = now(); + + if constexpr(has_return) + { + auto value = execute(); + _runtime += now() - start; + return value; + } else + { + execute(); + _runtime += now() - start; + } + } + + static time_point_t now() + { + return clock_t::now(); + } + + void manual_timing(time_point_t time_point) + { + manual_timing(now() - time_point); + } + + void manual_timing(duration_t duration) + { + _runtime += duration; + } + + size_t milliseconds() const + { + return std::chrono::duration_cast(_runtime).count(); + } + + duration_t _runtime{}; +}; + +} // namespace dream_stellar diff --git a/include/dream_stellar/verification/all_local.hpp b/include/dream_stellar/verification/all_local.hpp new file mode 100644 index 00000000..b2a68e97 --- /dev/null +++ b/include/dream_stellar/verification/all_local.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include + +namespace dream_stellar +{ + +/////////////////////////////////////////////////////////////////////////////// +// Conducts banded local alignment on swift hit (= computes eps-cores), +// splits eps-cores at X-drops, and calls _extendAndExtract for extension of eps-cores +template +void +verifySwiftHit(Segment, InfixSegment> const & infH, + Segment, InfixSegment> const & infV, + TEpsilon const eps, + TSize const minLength, + TDrop const xDrop, + TDelta const delta, + TOnAlignmentResultFn && onAlignmentResult, + stellar_verification_time & verification_runtime, + AllLocal) { + + // false == all local matches + allOrBestLocal(infH, infV, eps, minLength, xDrop, delta, onAlignmentResult, verification_runtime, std::false_type{}); +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/verification/best_local.hpp b/include/dream_stellar/verification/best_local.hpp new file mode 100644 index 00000000..f24cb90d --- /dev/null +++ b/include/dream_stellar/verification/best_local.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include + +namespace dream_stellar +{ + +/////////////////////////////////////////////////////////////////////////////// +// Conducts banded local alignment on swift hit (= computes eps-cores), +// splits eps-cores at X-drops, and calls _extendAndExtract for extension of eps-cores +template +void +verifySwiftHit(Segment, InfixSegment> const & infH, + Segment, InfixSegment> const & infV, + /*double*/ TEpsilon const eps, + /*int*/ TSize const minLength, + /*double*/ TDrop const xDrop, + /*unsigned_integral*/ TDelta const delta, + TOnAlignmentResultFn && onAlignmentResult, + stellar_verification_time & verification_runtime, + BestLocal) { + + // true == best local match + allOrBestLocal(infH, infV, eps, minLength, xDrop, delta, onAlignmentResult, verification_runtime, std::true_type{}); +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/verification/detail/all_or_best_local.hpp b/include/dream_stellar/verification/detail/all_or_best_local.hpp new file mode 100644 index 00000000..a6fadc41 --- /dev/null +++ b/include/dream_stellar/verification/detail/all_or_best_local.hpp @@ -0,0 +1,341 @@ +#pragma once + +#include + +#include +#include +#include + +#include + +namespace dream_stellar +{ + +/////////////////////////////////////////////////////////////////////////////// +// Appends a segment of only error positions from align to queue. +template +inline void +_appendNegativeSegment(TAlign const & align, + TPos & pos, TPos const len, + Score const & scoreMatrix, + String > & queue) { + typedef Triple TMerger; + TPos beginPos = pos; + + TScoreValue score = 0; + while (pos < len) { + if (isGap(row(align, 0), pos) || isGap(row(align, 1), pos)) { + score += scoreGap(scoreMatrix); + } else if (value(scoreMatrix, source(row(align, 0)), pos) != value(scoreMatrix, source(row(align, 1)), pos)) { + score += scoreMismatch(scoreMatrix); + } else { + break; + } + /* + if ((seqan2::length(row(align, 0)) <= pos) || (seqan2::length(row(align, 1)) <= pos)){ + break; + } else if (isGap(row(align, 0), pos) || isGap(row(align, 1), pos)) { + score += scoreGap(scoreMatrix); + } else if (value(scoreMatrix, source(row(align, 0)), pos) != value(scoreMatrix, source(row(align, 1)), pos)) { + score += scoreMismatch(scoreMatrix); + } else { + break; + } + */ + ++pos; + } + if (pos == len) appendValue(queue, TMerger(beginPos, pos, MinValue::VALUE + 1)); + else appendValue(queue, TMerger(beginPos, pos, score)); +} + +template +class empty; + +/////////////////////////////////////////////////////////////////////////////// +// Appends a segment of only matching positions from align to queue. +template +inline void +_appendPositiveSegment(TAlign const & align, + TPos & pos, TPos const len, + Score const & scoreMatrix, + String > & queue) { + if (pos == len) return; + typedef Triple TMerger; + TPos beginPos = pos; + + TScoreValue score = 0; + /* + while ((pos < len) && + ((seqan2::length(row(align, 0)) > pos) && (seqan2::length(row(align, 1)) > pos)) && + (!isGap(row(align, 0), pos) && + !isGap(row(align, 1), pos) && + (value(scoreMatrix, source(row(align, 0)), pos) == value(scoreMatrix, source(row(align, 1)), pos)))) { + score += scoreMatch(scoreMatrix); + ++pos; + } + */ + while ((pos < len) && + (!isGap(row(align, 0), pos) && + !isGap(row(align, 1), pos) && + (value(scoreMatrix, source(row(align, 0)), pos) == value(scoreMatrix, source(row(align, 1)), pos)))) { + score += scoreMatch(scoreMatrix); + ++pos; + } + appendValue(queue, TMerger(beginPos, pos, score)); +} + +/////////////////////////////////////////////////////////////////////////////// +// See Lemma 5 in Zhang et al., 1999. +template +inline bool +_negativeMerge(String & queue) { + typedef typename Value::Type TPos; + TPos len = length(queue); + if (len < 3) return false; + + TMerger cd = value(queue, len-1); + TMerger bc = value(queue, len-2); + TMerger ab = value(queue, len-3); + + if ((bc.i3 < 0) || (bc.i3 >= abs(_max(ab.i3, cd.i3)))) { + return false; + } else { + String newMerger; + appendValue(newMerger, TMerger(ab.i1, cd.i2, ab.i3 + bc.i3 + cd.i3)); + replace(queue, len-3, len, newMerger); + + return true; + } +} + +/////////////////////////////////////////////////////////////////////////////// +// See Lemma 6 in Zhang et al., 1999. +template +inline bool +_positiveMerge(String & queue) { + typedef typename Value::Type TPos; + TPos len = length(queue); + if (len < 5) return false; + + TMerger ef = value(queue, len-1); + TMerger de = value(queue, len-2); + TMerger cd = value(queue, len-3); + TMerger bc = value(queue, len-4); + TMerger ab = value(queue, len-5); + + if ((cd.i3 >= 0) || (cd.i3 < _max(ab.i3, ef.i3))) { + return false; + } else { + String newMerger; + appendValue(newMerger, TMerger(bc.i1, de.i2, bc.i3 + cd.i3 + de.i3)); + replace(queue, len-4, len-1, newMerger); + + return true; + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Implements the algorithm from Zhang et al. in Bioinformatics, 1999: "Post-processing long pairwise alignments". +// Splits an alignment into sub-alignments that contain no x-Drop. +template +void +_splitAtXDrops(TAlign const & align, + Score const & scoreMatrix, + TScoreValue1 const scoreDropOff, + TScoreValue2 const minScore, + String & alignmentString) { + typedef typename Position >::Type TPos; + typedef Triple TMerger; + + // initialization + String queue; + TPos pos = _min(toViewPosition(row(align, 0), beginPosition(row(align, 0))), + toViewPosition(row(align, 1), beginPosition(row(align, 1)))); + appendValue(queue, TMerger(pos, pos, MinValue::VALUE + 1)); + + //!TODO: aliLength should not go out of host sequence bounds when appending segments + TPos aliLength = _max(toViewPosition(row(align, 0), endPosition(row(align, 0))), + toViewPosition(row(align, 1), endPosition(row(align, 1)))); + TPos len; + while ((pos < aliLength) || (length(queue) > 1)) { + // construct useful tree + if (!_negativeMerge(queue)) { + if (!_positiveMerge(queue)) { + _appendPositiveSegment(align, pos, aliLength, scoreMatrix, queue); + _appendNegativeSegment(align, pos, aliLength, scoreMatrix, queue); + } + } + + // check for x-Drop + len = length(queue); + if ((len == 3) && (value(queue, 2).i3 < scoreDropOff * (-1))) { + if (value(queue, 1).i3 >= minScore) { + // create new sub-alignment + TAlign ali(align); + // std::cerr << "SEQ0 " << source(row(ali, 0)) << "\n"; + // std::cerr << "SEQ1 " << source(row(ali, 1)) << "\n"; + // std::cerr << "ROW0\n" << row(ali, 0) << "\nqueue[1].i1 == " << queue[1].i1 << ", queue[1].i2 == " << queue[1].i2 << "\n"; + // std::cerr << "ROW1\n" << row(ali, 1) << "\nqueue[1].i1 == " << queue[1].i1 << ", queue[1].i2 == " << queue[1].i2 << + // "\n"; + setClippedBeginPosition(row(ali, 0), queue[1].i1 + clippedBeginPosition(row(align, 0))); + setClippedBeginPosition(row(ali, 1), queue[1].i1 + clippedBeginPosition(row(align, 1))); + setClippedEndPosition(row(ali, 0), queue[1].i2 + clippedBeginPosition(row(align, 0))); + setClippedEndPosition(row(ali, 1), queue[1].i2 + clippedBeginPosition(row(align, 1))); + + // TPos begin0 = toSourcePosition(row(ali, 0), value(queue, 1).i1); + // TPos begin1 = toSourcePosition(row(ali, 1), value(queue, 1).i1); + // TPos end0 = toSourcePosition(row(ali, 0), value(queue, 1).i2); + // TPos end1 = toSourcePosition(row(ali, 1), value(queue, 1).i2); + // setClippedBeginPosition(row(ali, 0), begin0); + // setClippedBeginPosition(row(ali, 1), begin1); + // setBeginPosition(row(ali, 0), 0); + // setBeginPosition(row(ali, 1), 0); + // setClippedEndPosition(row(ali, 0), end0); + // setClippedEndPosition(row(ali, 1), end1); + + // append sub-alignment + appendValue(alignmentString, ali); + } + replace(queue, 0, 2, String()); + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Conducts banded local alignment on swift hit (= computes eps-cores), +// splits eps-cores at X-drops, and calls _extendAndExtract for extension of eps-cores +template +void +allOrBestLocal(Segment, InfixSegment> const & infH, + Segment, InfixSegment> const & infV, + TEpsilon const eps, + TSize const minLength, + TDrop const xDrop, + TDelta const delta, + TOnAlignmentResultFn && onAlignmentResult, + stellar_verification_time & verification_runtime, + std::integral_constant) { + using TInfix = Segment; + typedef Segment TSegment; + using TSequenceCopy = String>; + typedef typename StellarMatch::TAlign TAlign; + + TSize maxLength = 1000000000; + if ((TSize)length(infH) > maxLength) { + std::cerr << "Warning: SWIFT hit <" << beginPosition(infH) << "," << endPosition(infH); + std::cerr << "> , <" << beginPosition(infV) << "," << endPosition(infV); + std::cerr << "> too long. Verification skipped.\n" << std::flush; + return; + } + + // define a scoring scheme + typedef int TScore; + TScore match = 1; + // large negative scoring scheme values lead to excessive seed extension + TScore scoringSchemeLowerBound = -(TScore)1000; + TScore mismatchIndel = scoringSchemeLowerBound; + if (eps > -1/scoringSchemeLowerBound) // avoid division by 0 + mismatchIndel = (TScore)_max((TScore) std::ceil(-1/eps) + 1, -(TScore)length(host(infH))); + Score scoreMatrix(match, mismatchIndel, mismatchIndel); + TScore scoreDropOff = (TScore) _max((TScore) xDrop * (-mismatchIndel), MinValue::VALUE + 1); + + // calculate minimal score for local alignments + TEpsilon e = floor(eps*minLength); + TSize minLength1 = _max(0, (TSize)ceil((e+1) / eps)); + TEpsilon e1 = floor(eps*minLength1); + TSize minScore = _min((TSize)ceil((minLength-e) / (e+1)), (TSize)ceil((minLength1-e1) / (e1+1))); + + // diagonals for banded local alignment + int64_t upperDiag = 0; + int64_t lowerDiag = endPosition(infH) - (int64_t)endPosition(infV) - beginPosition(infH) + beginPosition(infV); + if (beginPosition(infV) == 0) { + if (endPosition(infV) == endPosition(host(infV))) { + // TODO: is it possible to get a smaller band in this case? + upperDiag = delta; + lowerDiag = -(int64_t)delta; + } else + upperDiag = lowerDiag + delta; + } else if (endPosition(infV) == endPosition(host(infV))) + lowerDiag = -(int64_t)delta; + + // banded local alignment + LocalAlignmentEnumerator, Banded> enumerator(scoreMatrix, lowerDiag, upperDiag, minScore); + Align localAlign; + resize(rows(localAlign), 2); + assignSource(row(localAlign, 0), infH); + assignSource(row(localAlign, 1), infV); + + while (true) { + bool const has_next = verification_runtime.next_local_alignment_time.measure_time([&]() + { + return nextLocalAlignment(localAlign, enumerator); + }); + + if (!has_next) + break; + + + // split local alignments containing an X-drop + String > seedAlignments; + verification_runtime.split_at_x_drops_time.measure_time([&]() + { + _splitAtXDrops(localAlign, scoreMatrix, scoreDropOff, minScore, seedAlignments); + }); + + typename Iterator > >::Type aliIt = begin(seedAlignments); + + while (aliIt != end(seedAlignments)) { + // std::cerr << "aliIt == \n" << row(*aliIt, 0) << "\n" << row(*aliIt, 1) << "\n"; + // create alignment object for the complete sequences + TAlign align; + resize(rows(align), 2); + + //!DEBUG: make copies of underlying sequence + String> infHStr; + resize(infHStr, host(host(infH)).size()); + for (size_t i{0}; i < host(host(infH)).size(); i++) + { + insert(infHStr, i, host(host(infH))[i]); + } + String> infVStr; + resize(infHStr, host(host(infV)).size()); + for (size_t i{0}; i < host(host(infV)).size(); i++) + { + insert(infVStr, i, host(host(infV))[i]); + } + + setSource(row(align, 0), infHStr); + setSource(row(align, 1), infVStr); + + // determine extension direction + ExtensionDirection direction; + if (length(seedAlignments) == 1) direction = EXTEND_BOTH; + else if (aliIt == begin(seedAlignments)) direction = EXTEND_RIGHT; + else if (aliIt == end(seedAlignments)-1) direction = EXTEND_LEFT; + else direction = EXTEND_NONE; + + bool const extension_succeeded = verification_runtime.extension_time.measure_time([&]() + { + return _extendAndExtract(*aliIt, scoreDropOff, scoreMatrix, infH, infV, direction, minLength, eps, align, verification_runtime.extension_time); + }); + + // extend alignment and obtain longest contained eps-match + if (!extension_succeeded) { + aliIt++; + continue; + } + + // insert eps-match in matches string + bool success = onAlignmentResult(align); + if (!success) + return; + + ++aliIt; + } + + if (bestLocalMethod) break; + } +} + +} // namespace dream_stellar diff --git a/include/dream_stellar/verification/swift_hit_verifier.hpp b/include/dream_stellar/verification/swift_hit_verifier.hpp new file mode 100644 index 00000000..b92a3c92 --- /dev/null +++ b/include/dream_stellar/verification/swift_hit_verifier.hpp @@ -0,0 +1,71 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace dream_stellar { + +template +struct SwiftHitVerifier +{ + EPSMatchOptions eps_match_options; + VerifierOptions verifier_options; + + template + void verify(StellarDatabaseSegment const & databaseSegment, + StellarQuerySegment const & querySegment, + TDelta const delta, + TOnAlignmentResultFn && onAlignmentResult, + stellar_verification_time & verification_runtime) + { + static_assert(std::is_unsigned::value, "TDelta must be unsigned integral."); + + /* try to convert std::span back to String + using TSegment = Segment, InfixSegment>, InfixSegment>; + TSegment finderSegment = databaseSegment.asFinderSegment(); + TSegment patternSegment = querySegment.asPatternSegment(); + + using TString = String; + TString host_infH; + for (auto n : host(host(finderSegment))) + appendValue(host_infH, n); + Segment infH_seg{host_infH, 0, length(host_infH)}; + Segment, InfixSegment> infH(infH_seg, seqan2::beginPosition(finderSegment), seqan2::endPosition(finderSegment)); + + TString host_infV; + for (auto n : host(host(patternSegment))) + appendValue(host_infV, n); + Segment infV_seg{host_infV, 0, length(host_infV)}; + Segment, InfixSegment> infV(infV_seg, seqan2::beginPosition(patternSegment), seqan2::endPosition(patternSegment)); + */ + + /* + std::cout << "Swift hit verifier\n"; + for (auto n : databaseSegment.as_span()) + std::cout << seqan3::to_char(n._symbol); + std::cout << '\n'; + + for (auto n : querySegment.as_span()) + std::cout << seqan3::to_char(n._symbol); + std::cout << '\n'; + */ + + verifySwiftHit( + databaseSegment.asFinderSegment(), + querySegment.asPatternSegment(), + (double)eps_match_options.epsilon, + eps_match_options.minLength, + verifier_options.xDrop, + delta, + onAlignmentResult, + verification_runtime, + TVerifierTag{}); + } +}; + +} // namespace dream_stellar diff --git a/include/utilities/alphabet_wrapper/matcher/concept.hpp b/include/utilities/alphabet_wrapper/matcher/concept.hpp new file mode 100644 index 00000000..fa02c92d --- /dev/null +++ b/include/utilities/alphabet_wrapper/matcher/concept.hpp @@ -0,0 +1,162 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides matcher concepts. + * \author Rene Rahn + */ + +#pragma once + +#include +#include + +#include + +namespace jst::contrib +{ + // ---------------------------------------------------------------------------- + // Operation CPOs for matcher + // ---------------------------------------------------------------------------- + + // window_size + namespace _window_size { + inline constexpr struct _cpo { + template + requires std::tag_invocable<_cpo, matcher_t> + constexpr auto operator()(matcher_t && matcher) const + noexcept(std::is_nothrow_tag_invocable_v<_cpo, matcher_t>) + -> std::tag_invoke_result_t<_cpo, matcher_t> + { + return std::tag_invoke(_cpo{}, (matcher_t &&)matcher); + } + + private: + + template + requires requires (matcher_t && matcher) { { ((matcher_t &&)matcher).window_size() } -> std::integral; } + constexpr friend auto tag_invoke(_cpo, matcher_t && matcher) + noexcept(noexcept(std::declval().window_size())) + -> decltype(std::declval().window_size()) + { + return ((matcher_t &&)matcher).window_size(); + } + } window_size; + } // namespace _window_size + using _window_size::window_size; + + template + using window_size_t = std::invoke_result_t<_window_size::_cpo, matcher_t>; + + // capture + namespace _capture { + inline constexpr struct _cpo { + template + requires std::tag_invocable<_cpo, matcher_t const &> + constexpr auto operator()(matcher_t const & matcher) const + noexcept(std::is_nothrow_tag_invocable_v<_cpo, matcher_t const &>) + -> std::tag_invoke_result_t<_cpo, matcher_t const &> + { + return std::tag_invoke(_cpo{}, matcher); + } + private: + + template + requires requires (matcher_t && matcher) { { ((matcher_t &&)matcher).capture() }; } + constexpr friend auto tag_invoke(_cpo, matcher_t && matcher) + noexcept(noexcept(std::declval().capture())) + -> decltype(std::declval().capture()) + { + return ((matcher_t &&)matcher).capture(); + } + } capture; + } // namespace _capture + using _capture::capture; + + // restore + namespace _restore { + inline constexpr struct _cpo { + template + requires std::tag_invocable<_cpo, matcher_t &, state_t> + constexpr auto operator()(matcher_t & matcher, state_t && state) const + noexcept(std::is_nothrow_tag_invocable_v<_cpo, matcher_t &, state_t>) + -> std::tag_invoke_result_t<_cpo, matcher_t &, state_t> + { + return std::tag_invoke(_cpo{}, matcher, (state_t &&) state); + } + private: + + template + requires requires (matcher_t && m, state_t && s) { { ((matcher_t &&)m).restore((state_t &&) s) }; } + constexpr friend void tag_invoke(_cpo, matcher_t && matcher, state_t && state) + noexcept(noexcept(std::declval().restore(std::declval()))) + { + ((matcher_t &&)matcher).restore((state_t &&)state); + } + } restore; + } // namespace _restore + using _restore::restore; + + template + using matcher_state_t = std::remove_cvref_t>; + + // aggregate + namespace _aggregate { + inline constexpr struct _cpo { + template + requires std::tag_invocable<_cpo, state1_t, state2_t> + constexpr auto operator()(state1_t && state1, state2_t && state2) const + noexcept(std::is_nothrow_tag_invocable_v<_cpo, state1_t, state2_t>) + -> std::tag_invoke_result_t<_cpo, state1_t, state2_t> + { + return std::tag_invoke(_cpo{}, (state1_t &&) state1, (state2_t &&) state2); + } + } aggregate; + } // namespace _aggregate + using _aggregate::aggregate; + + // ---------------------------------------------------------------------------- + // Concept defintions for matcher + // ---------------------------------------------------------------------------- + + template + concept window_matcher = std::copyable> && requires (matcher_t && matcher) + { + { jst::contrib::window_size((matcher_t &&) matcher)} -> std::integral; + }; + + namespace detail { + template + concept stateful_matcher = requires + { + typename jst::contrib::matcher_state_t; + requires std::semiregular>; + }; + } // namespace detail + + template + concept restorable_matcher = window_matcher && + detail::stateful_matcher && + requires (matcher_t && matcher) + { + { jst::contrib::capture((matcher_t &&) matcher) } -> std::convertible_to>; + { jst::contrib::restore(matcher, std::declval>()) }; + }; + + template + concept reducable_with = std::common_with, std::remove_cvref_t> && + requires (std::remove_cvref_t const & lhs, std::remove_cvref_t const & rhs) + { + { jst::contrib::aggregate(lhs, rhs) } -> std::convertible_to, std::remove_cvref_t>>; + }; + + template + concept reducable_state = reducable_with; + + template + concept online_matcher_for = window_matcher && std::invocable; +} // namespace jst::contrib diff --git a/include/utilities/alphabet_wrapper/matcher/seqan_pattern_base.hpp b/include/utilities/alphabet_wrapper/matcher/seqan_pattern_base.hpp new file mode 100644 index 00000000..eab6a820 --- /dev/null +++ b/include/utilities/alphabet_wrapper/matcher/seqan_pattern_base.hpp @@ -0,0 +1,102 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the basic adapter for the seqan2::Pattern objects. + * \author Rene Rahn + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include + +namespace jst::contrib +{ + template + class seqan_pattern_base + { + private: + + friend derived_t; + + seqan_pattern_base() = default; + + public: + + // Note const is disabled since seqan use non-const pattern ;( + template + constexpr void operator()(haystack_t && haystack, size_t const minRepeatLength, + size_t const maxRepeatPeriod, callback_t && callback) /*const*/ noexcept { + using compatible_haystack_t = jst::contrib::seqan_container_t>; + + compatible_haystack_t seqan_haystack = + jst::contrib::make_seqan_container(std::views::all((haystack_t &&)haystack)); + + auto finder = to_derived(this)->make_finder(seqan_haystack, minRepeatLength, maxRepeatPeriod); + + while (find_impl(finder, to_derived(this)->get_pattern())) { + callback(finder); + } + } + + constexpr bool empty() const noexcept { + return seqan2::empty(get_pattern().data_host); + } + + private: + + template + constexpr bool find_impl(seqan_finder_t & finder, seqan_pattern_t && pattern) const noexcept { + return std::apply([&] (auto && ...custom_args) { + return find(finder, pattern, (decltype(custom_args)) custom_args...); + }, to_derived(this)->custom_find_arguments()); + } + + template + constexpr auto make_finder(haystack_t & haystack) const noexcept + { + return seqan2::Finder{haystack}; + } + + constexpr auto & get_pattern() noexcept + { + return to_derived(this)->_pattern; + } + + constexpr auto const & get_pattern() const noexcept + { + return to_derived(this)->_pattern; + } + + constexpr auto custom_find_arguments() const noexcept { + return std::tuple{}; + } + + static constexpr derived_t * to_derived(seqan_pattern_base * me) noexcept + { + return static_cast(me); + } + + static constexpr derived_t const * to_derived(seqan_pattern_base const * me) noexcept + { + return static_cast(me); + } + + constexpr friend std::size_t tag_invoke(std::tag_t, seqan_pattern_base const & me) noexcept { + return (me.empty()) ? 0 : seqan2::length(seqan2::needle(me.get_pattern())); + } + }; + +} // namespace jst::contrib diff --git a/include/utilities/alphabet_wrapper/matcher/stellar_matcher.hpp b/include/utilities/alphabet_wrapper/matcher/stellar_matcher.hpp new file mode 100644 index 00000000..ee8fe047 --- /dev/null +++ b/include/utilities/alphabet_wrapper/matcher/stellar_matcher.hpp @@ -0,0 +1,235 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides an adapter to make the SWIFT algorithm work with std views. + * \author Rene Rahn + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace jst::contrib +{ + using namespace seqan2; + + template + using compatible_needle_type = jst::contrib::seqan_container_t>; + + template + using multi_needle_type = seqan2::StringSet>; + + template + using qgram_shape_type = seqan2::Shape>, seqan2::SimpleShape>; + + template + using index_type = seqan2::Index, seqan2::IndexQGram, seqan2::OpenAddressing>>; + + template + class stellar_matcher : public seqan_pattern_base> + { + private: + + using base_t = seqan_pattern_base>; + + friend base_t; + + using finder_spec_type = seqan2::Swift; + using pattern_type = seqan2::Pattern, finder_spec_type>; + + multi_needle_type _multi_needle{}; + index_type _needle_index{_multi_needle}; + pattern_type _pattern{_needle_index}; + size_t _kmer_size{}; + double _kmer_abundance_cut{}; + double _error_rate{}; + unsigned _min_length{}; + + public: + + stellar_matcher() = delete; + template + requires (!std::same_as<_needle_t, stellar_matcher> && + std::constructible_from, _needle_t>) + explicit stellar_matcher(_needle_t && needle, valik::search_arguments const & arguments) : + _kmer_size{arguments.shape_size}, _kmer_abundance_cut{arguments.qgramAbundanceCut}, + _error_rate{arguments.error_rate}, _min_length{arguments.minLength} + { + appendValue(getFibre(_needle_index, seqan2::QGramText{}), + jst::contrib::make_seqan_container(std::views::all((_needle_t &&) needle))); + // like line 108 in stellar_index.hpp + resize(indexShape(_needle_index), _kmer_size); + cargo(_needle_index).abundanceCut = _kmer_abundance_cut; + _patternInit(_pattern, _error_rate, _min_length); + } + + template + requires (!std::same_as<_multi_needle_t, stellar_matcher> && + std::constructible_from, + std::views::all_t>>) + explicit stellar_matcher(_multi_needle_t && multi_needle, valik::search_arguments const & arguments) : + _kmer_size{arguments.shape_size}, _kmer_abundance_cut{arguments.qgramAbundanceCut}, + _error_rate{arguments.error_rate}, _min_length{arguments.minLength} + { + for (auto && needle : multi_needle) + { + appendValue(getFibre(_needle_index, seqan2::QGramText{}), + jst::contrib::make_seqan_container(std::views::all((decltype(needle) &&) needle))); + + // debug + // seqan3::debug_stream << "Appending fibre \n" << needle << '\n'; + } + + // like line 108 in stellar_index.hpp + resize(indexShape(_needle_index), _kmer_size); + cargo(_needle_index).abundanceCut = _kmer_abundance_cut; + _patternInit(_pattern, _error_rate, _min_length); + } + + constexpr auto position() const noexcept { + return seqan2::position(_pattern); + } + + constexpr auto curSeqNo() const noexcept { + return _pattern.curSeqNo; + } + + constexpr auto delta() const noexcept { + return (_pattern.bucketParams[0].delta + _pattern.bucketParams[0].overlap); + } + + private: + template + constexpr auto make_finder(haystack_t & haystack, size_t const minRepeatLength, size_t const maxRepeatPeriod) const noexcept + { + return seqan2::Finder{haystack, minRepeatLength, maxRepeatPeriod}; + } + + constexpr stellar_matcher & get_pattern() noexcept { + return *this; + } + + constexpr auto custom_find_arguments() const noexcept { + return std::tuple{_error_rate, _min_length}; + } + + constexpr friend std::size_t tag_invoke(std::tag_t, stellar_matcher const & me) noexcept { + return length(getFibre(needle(me._pattern), seqan2::QGramShape{})); + } + + ///////////////////////////////////////////////////////////////////////////////// + template + constexpr bool initialise(seqan2::Finder & finder, + pattern_type & pattern) + { + pattern.finderLength = std::ranges::size(haystack(finder)); + seqan2::_patternInit(pattern, _error_rate, _min_length); + seqan2::_finderSetNonEmpty(finder); + finder.dotPos = 100000; + finder.dotPos2 = 10 * finder.dotPos; + + if (!seqan2::_firstNonRepeatRange(finder, pattern)) + return false; + + if (seqan2::_swiftMultiProcessQGram(finder, pattern, seqan2::hash(pattern.shape, hostIterator(hostIterator(finder))))) + seqan2::_copySwiftHit(finder, pattern); + + return true; + } + + // declare and define a non-member friend function + template + friend bool find(seqan2::Finder & finder, + stellar_matcher & matcher, + args_t && ...args) + { + pattern_type & pattern = matcher._pattern; + if (empty(finder)) { + return matcher.initialise(finder, pattern); + } + + // all previous matches reported -> search new ones + clear(finder.hits); + + // are we at the end of the text? + if (seqan2::atEnd(finder) && finder.curRepeat == finder.endRepeat) + { + finder.curHit = finder.endHit; + return false; + } + + do + { + if (pattern.params.printDots) seqan2::_printDots(finder); + if (seqan2::atEnd(++finder)) + { + if (!seqan2::_nextNonRepeatRange(finder, pattern)) + { + if(seqan2::_swiftMultiFlushBuckets(finder, pattern)) + { + seqan2::_copySwiftHit(finder, pattern); + return true; + } + else + return false; + } + seqan2::hash(pattern.shape, seqan2::hostIterator(seqan2::hostIterator(finder))); + } + else + { + ++finder.curPos; + seqan2::hashNext(pattern.shape, seqan2::hostIterator(seqan2::hostIterator(finder))); + } + + if (seqan2::_swiftMultiProcessQGram(finder, pattern, seqan2::value(pattern.shape))) + { + seqan2::_copySwiftHit(finder, pattern); + return true; + } + + } while (true); + } + }; + + ///////////////////////////////////////////////////////////////////////////////// + + template + stellar_matcher(needle_t &&) -> stellar_matcher>; + + template + stellar_matcher(needle_t &&, double) -> stellar_matcher>; + + template + requires std::ranges::random_access_range> + stellar_matcher(multi_needle_t &&) -> stellar_matcher>>; + + template + requires std::ranges::random_access_range> + stellar_matcher(multi_needle_t &&, double) -> stellar_matcher>>; + +} // namespace jst::contrib + +namespace seqan2 { + +using needle_t = std::span>; + +template <> +struct Cargo<::jst::contrib::index_type> +{ + typedef struct + { + double abundanceCut; + } Type; +}; + +} // namespace seqan2 diff --git a/include/utilities/alphabet_wrapper/seqan/alphabet.hpp b/include/utilities/alphabet_wrapper/seqan/alphabet.hpp new file mode 100644 index 00000000..e09055aa --- /dev/null +++ b/include/utilities/alphabet_wrapper/seqan/alphabet.hpp @@ -0,0 +1,342 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides adaptions for the seqan2 alphabet interface to map seqan3 alphabets. + * \author Rene Rahn + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +namespace seqan2 +{ + template + struct alphabet_adaptor + { + alphabet_t _symbol{}; + // make this applicable to simple type adapter of seqan? + constexpr alphabet_adaptor() = default; + + template + requires std::same_as, char_t> && + requires { seqan3::assign_char_to(char_t{}, alphabet_t{}); } + constexpr explicit alphabet_adaptor(char_t c) : _symbol{seqan3::assign_char_to(c, alphabet_t{})} + {} + + template + requires (!std::same_as, rank_t>) && + std::convertible_to> && + requires { seqan3::assign_rank_to(static_cast>(rank_t{}), alphabet_t{}); } + constexpr explicit alphabet_adaptor(rank_t r) : _symbol{seqan3::assign_rank_to(static_cast>(r), alphabet_t{})} + {} + + template + requires (!std::same_as) && + seqan3::explicitly_convertible_to + constexpr explicit alphabet_adaptor(alphabet_adaptor other) noexcept : + _symbol{static_cast(other._symbol)} + {} + + constexpr bool operator==(alphabet_adaptor const &) const noexcept = default; + constexpr std::strong_ordering operator<=>(alphabet_adaptor const & other) const noexcept + { + return seqan3::to_rank(_symbol) <=> seqan3::to_rank(other._symbol); + } + + // impicit conversion enough for assignment in seqan2 simple types? + template + constexpr operator int_t() const noexcept + { + return seqan3::to_rank(_symbol); + } + + constexpr operator char() const noexcept + { + return seqan3::to_char(_symbol); + } + + constexpr operator alphabet_t() const noexcept + { + return _symbol; + } + }; + + template + seqan3::alphabet_rank_t> + CEREAL_SAVE_MINIMAL_FUNCTION_NAME(archive_t const &, alphabet_adaptor const & l) + { + return seqan3::to_rank(l); + } + + template + void CEREAL_LOAD_MINIMAL_FUNCTION_NAME(archive_t const &, + alphabet_adaptor & l, + seqan3::alphabet_rank_t> const & r) + { + seqan3::assign_rank_to(r, l); + } + + template + struct ValueSize> + { + using Type = decltype(seqan3::alphabet_size); + static constexpr Type VALUE = seqan3::alphabet_size; + }; + + template + struct BitsPerValue> + { + using Type = decltype(seqan3::alphabet_size); + static constexpr Type VALUE = seqan3::detail::ceil_log2(seqan3::alphabet_size); + }; + +} //namespace seqan2 + +namespace jst::contrib +{ + using dna4 = seqan2::alphabet_adaptor; //seqan::Dna; + using dna5 = seqan2::alphabet_adaptor; //seqan::Dna5; + using dna15 = seqan2::alphabet_adaptor; //seqan::Iupac; + + inline dna4 operator""_dna4(char const c) noexcept + { + return dna4{c}; + } + + inline std::vector operator""_dna4(char const * s, std::size_t n) + { + std::vector r; + r.reserve(n); + std::ranges::copy(std::string_view{s, n} | std::views::transform([] (char c) { + return dna4{c}; + }), std::back_inserter(r)); + return r; + } + + inline dna5 operator""_dna5(char const c) noexcept + { + return dna5{c}; + } + + inline std::vector operator""_dna5(char const * s, std::size_t n) + { + std::vector r; + r.reserve(n); + std::ranges::copy(std::string_view{s, n} | std::views::transform([] (char c) { + return dna5{c}; + }), std::back_inserter(r)); + return r; + } +} // namespace jst::contrib + +namespace seqan2 +{ + template + inline seqan3::debug_stream_type & operator<<(seqan3::debug_stream_type & stream, + SimpleType const & symbol) + { + return stream << seqan3::to_char(symbol); + } +} // namespace seqan2 +// // use a simple adaptor! +// // Idea: define a wrapper for seqan3::dna5 as seqan::SimpleType +// // then provide +// template +// struct adapted; + +// template +// using alphabet_adapter_t = SimpleType, adapted>; + +// template +// inline void assign(SimpleType> & target, alphabet2_t & source) +// { +// target.value = seqan3::to_rank((alphabet2_t &&) source); +// } + +// template +// inline void assign(SimpleType> & target, alphabet2_t const & source) +// { +// target.value = seqan3::to_rank((alphabet2_t &&) source); +// } + +// template +// inline void assign(char & c_target, SimpleType> const & source) +// { +// c_target = seqan3::to_char(source); +// } + +// // need a wrapper that implements seqan::FiniteOrderedAlphabetConcept + +// template +// struct ValueSize>> +// { +// using Type = seqan3::detail::min_viable_uint_t>; +// static const Type VALUE = seqan3::alphabet_size; +// }; + +// template +// struct BitsPerValue>> +// { +// static constexpr size_t bits_per_letter = seqan3::detail::ceil_log2(seqan3::alphabet_size); + +// using Type = seqan3::detail::min_viable_uint_t; +// static const Type VALUE = bits_per_letter; +// }; + +// // sameType(ordValue(val), size); +// // sameType(valueSize(), size); +// // sameType(ValueSize::VALUE, size); + +// // convert integer to alphabet value +// // val = 0; +// // val = size; +// // TValue val2(0); +// // TValue val3(size); +// // assign(val, val2); + + + +// template +// requires std::constructible_from>, seqan3::alphabet_rank_t> +// struct Convert +// { +// using Type = seqan3::alphabet_rank_t; +// }; +// // template +// // inline typename RemoveConst_::Type +// // convertImpl(Convert const, +// // SimpleType const & source_) +// // { +// // typename RemoveConst_::Type target_; +// // assign(target_, source_); +// // return target_; +// // } +// // we do not have implicit conversion +// // so what are we doing now? +// template +// requires (std::constructible_from>, seqan3::alphabet_rank_t>) +// inline typename Convert::Type +// convertImpl(Convert const, source_t const & source) +// { +// return seqan3::to_rank(seqan3::assign_rank_to(seqan3::to_rank(source), target_t{})); +// } +// } // namespace seqan + +namespace seqan3::custom +{ + template + struct alphabet> + { + using alphabet_t = seqan2::SimpleType; + using rank_t = typename seqan2::ValueSize::Type; + + static constexpr rank_t alphabet_size = seqan2::ValueSize::VALUE; + + static rank_t to_rank(alphabet_t const a) noexcept + { + return static_cast(a); + } + + static alphabet_t & assign_rank_to(rank_t const r, alphabet_t & a) noexcept + { + a = r; // simple assignment? or convert? or something else? + return a; + } + + static char to_char(alphabet_t const a) noexcept + { + return static_cast(a); + } + + static alphabet_t & assign_char_to(char const c, alphabet_t & a) noexcept + { + a = c; + return a; + } + }; + + template + struct alphabet> + { + using alphabet_t = seqan2::alphabet_adaptor; + using rank_t = seqan3::alphabet_rank_t; + using char_t = seqan3::alphabet_char_t; + + static constexpr rank_t alphabet_size = seqan3::alphabet_size; + + constexpr static rank_t to_rank(alphabet_t const a) noexcept + { + return seqan3::to_rank(a._symbol); + } + + constexpr static alphabet_t & assign_rank_to(rank_t const r, alphabet_t & a) noexcept + { + seqan3::assign_rank_to(r, a._symbol); + return a; + } + + constexpr static char to_char(alphabet_t const a) noexcept + { + return seqan3::to_char(a._symbol); + } + + constexpr static alphabet_t & assign_char_to(char const c, alphabet_t & a) noexcept + { + seqan3::assign_char_to(c, a._symbol); + return a; + } + + template + requires std::same_as<_wrapped_t, wrapped_t> + constexpr static auto complement(alphabet_t const a) + noexcept(std::is_nothrow_invocable_v, wrapped_t const &>) + -> std::invoke_result_t, wrapped_t const &> + { + return seqan3::complement(a._symbol); + } + + constexpr static auto char_is_valid(char_t const c) noexcept + { + return seqan3::char_is_valid_for(c); + } + }; +} // namespace seqan3::custom + +namespace seqan2 +{ + template + seqan3::alphabet_rank_t> + CEREAL_SAVE_MINIMAL_FUNCTION_NAME(archive_t const &, seqan2::SimpleType const & l) + { + return seqan3::to_rank(l); + } + + template + void CEREAL_LOAD_MINIMAL_FUNCTION_NAME(archive_t const &, + seqan2::SimpleType & l, + seqan3::alphabet_rank_t> const & r) + { + seqan3::assign_rank_to(r, l); + } +} // namespace seqan2 diff --git a/include/utilities/alphabet_wrapper/seqan/concept.hpp b/include/utilities/alphabet_wrapper/seqan/concept.hpp new file mode 100644 index 00000000..a3713c26 --- /dev/null +++ b/include/utilities/alphabet_wrapper/seqan/concept.hpp @@ -0,0 +1,52 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides concepts and CPOs for the pattern adaption. + * \author Rene Rahn + */ + +#pragma once + +#include + +namespace jst::contrib +{ + + namespace _set_up + { + inline constexpr struct _cpo + { + template + requires std::tag_invocable<_cpo, operation_t &, finder_t &> + constexpr auto operator()(operation_t & operation, finder_t & finder) const + noexcept(std::is_nothrow_tag_invocable_v<_cpo, operation_t &, finder_t &>) + -> std::tag_invoke_result_t<_cpo, operation_t &, finder_t &> + { + return std::tag_invoke(_cpo{}, operation, finder); + } + } set_up{}; + } // namespace _set_up + using _set_up::set_up; + + namespace _tear_down + { + inline constexpr struct _cpo + { + template + requires std::tag_invocable<_cpo, operation_t &, finder_t &> + constexpr auto operator()(operation_t & operation, finder_t & finder) const + noexcept(std::is_nothrow_tag_invocable_v<_cpo, operation_t &, finder_t &>) + -> std::tag_invoke_result_t<_cpo, operation_t &, finder_t &> + { + return std::tag_invoke(_cpo{}, operation, finder); + } + } tear_down{}; + } // namespace _tear_down + using _tear_down::tear_down; + +} // namespace jst::contrib diff --git a/include/utilities/alphabet_wrapper/seqan/container_adapter.hpp b/include/utilities/alphabet_wrapper/seqan/container_adapter.hpp new file mode 100644 index 00000000..d8aeab53 --- /dev/null +++ b/include/utilities/alphabet_wrapper/seqan/container_adapter.hpp @@ -0,0 +1,211 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides container adapter for the seqan container interface. + * \author Rene Rahn + */ + +#pragma once + +#include +#include +#include + +#include + +namespace jst::contrib +{ + template + class seqan_container_adapter + { + private: + std::optional _range{std::nullopt}; + + public: + using value_type = std::ranges::range_value_t; + using reference = std::ranges::range_reference_t; + using const_reference = std::ranges::range_reference_t const>; + using iterator = std::ranges::iterator_t; + using const_iterator = std::ranges::iterator_t const>; + using size_type = std::make_unsigned_t>; + using difference_type = std::iter_difference_t; + + seqan_container_adapter() = default; + explicit seqan_container_adapter(range_t range) noexcept(std::is_nothrow_move_constructible_v) : + _range{std::move(range)} + { + } + + seqan_container_adapter(seqan_container_adapter const &) = default; + seqan_container_adapter(seqan_container_adapter &&) = default; + seqan_container_adapter & operator=(seqan_container_adapter const &) = default; + seqan_container_adapter & operator=(seqan_container_adapter &&) = default; + + constexpr reference operator[](difference_type const index) noexcept + requires std::ranges::random_access_range + { + return (*_range)[index]; + } + + constexpr const_reference operator[](difference_type const index) const noexcept + requires std::ranges::random_access_range + { + return (*_range)[index]; + } + + constexpr iterator begin() noexcept(noexcept(std::ranges::begin(*_range))) + { + return std::ranges::begin(*_range); + } + + constexpr const_iterator begin() const noexcept(noexcept(std::ranges::begin(*_range))) + { + return std::ranges::begin(*_range); + } + + constexpr iterator end() noexcept(noexcept(std::ranges::end(*_range))) + { + return std::ranges::end(*_range); + } + + constexpr const_iterator end() const noexcept(noexcept(std::ranges::end(*_range))) + { + return std::ranges::end(*_range); + } + + constexpr size_type size() const noexcept(noexcept(std::ranges::distance(*_range))) + { + return std::ranges::distance(*_range); + } + + constexpr bool empty() const noexcept(noexcept(std::ranges::empty(*_range))) + { + return std::ranges::empty(*_range); + } + }; + + template + requires (std::ranges::common_range && std::ranges::random_access_range) + constexpr auto make_seqan_container(range_t range) noexcept + { + return seqan_container_adapter>{std::move(range)}; + } + + template + using seqan_container_t = decltype(make_seqan_container(std::declval())); + + template + inline void + assign(seqan_container_adapter & me, seqan_container_adapter const & source) + { + seqan2::assign(me, source); + } + +} // namespace jst::contrib + +namespace seqan2 +{ + namespace detail + { + template + using container_t = jst::contrib::seqan_container_adapter; + } // namespace detail + + template + struct Value> + { + using Type = typename detail::container_t::value_type; + }; + template + struct Value const> + { + using Type = typename detail::container_t::value_type; + }; + template + struct Reference> + { + using Type = typename detail::container_t::reference; + }; + template + struct Reference const> + { + using Type = typename detail::container_t::const_reference; + }; + template + struct GetValue> + { + using Type = typename detail::container_t::const_reference; + }; + template + struct GetValue const> + { + using Type = typename detail::container_t::const_reference; + }; + template + struct Position> + { + using Type = typename detail::container_t::difference_type; + }; + template + struct Position const> + { + using Type = typename detail::container_t::difference_type; + }; + template + struct Size> + { + using Type = typename detail::container_t::size_type; + }; + template + struct Size const> + { + using Type = typename detail::container_t::size_type; + }; + template + struct Iterator, Standard> + { + typedef Iter, StdIteratorAdaptor> Type; + }; + template + struct Iterator const, Standard> + { + typedef Iter const, StdIteratorAdaptor> Type; + }; + template + struct Iterator, Rooted> + { + + typedef Iter, + AdaptorIterator< + Iter< + detail::container_t, + StdIteratorAdaptor + > + > + > Type; + }; + template + struct Iterator const, Rooted> + { + typedef Iter const, + AdaptorIterator< + Iter< + detail::container_t const, + StdIteratorAdaptor + > + > + > Type; + }; + + template + SEQAN_CONCEPT_IMPL((jst::contrib::seqan_container_adapter), (StlContainerConcept)); + + template + SEQAN_CONCEPT_IMPL((jst::contrib::seqan_container_adapter const), (StlContainerConcept)); + +} // namespace seqan2 diff --git a/include/utilities/alphabet_wrapper/seqan/segment.hpp b/include/utilities/alphabet_wrapper/seqan/segment.hpp new file mode 100644 index 00000000..647c1907 --- /dev/null +++ b/include/utilities/alphabet_wrapper/seqan/segment.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include + +namespace seqan2 +{ + +template +TAlphabet value(seqan2::Segment const, InfixSegment>, InfixSegment> const & me, + TPos pos) +{ + assert(pos >= 0); + auto const & host_span = *me.data_host.data_host; + assert(me.data_host.data_host); + assert(pos + seqan2::beginPosition(me) < host_span.size()); + return host_span[seqan2::beginPosition(me) + pos]; +} + +template +TAlphabet value(TScore const & /* Score matrix to avoid ambiguity in overloading */, + seqan2::Segment const, InfixSegment>, InfixSegment> const & me, + TPos pos) +{ + assert(pos >= 0); + auto const & host_span = *me.data_host.data_host; + assert(me.data_host.data_host); + assert(pos + seqan2::beginPosition(me) < host_span.size()); + return host_span[seqan2::beginPosition(me) + pos]; +} + +template +inline TAlphabet sequenceEntryForScore(TScore const & /*scoringScheme*/, + Segment const, InfixSegment>, InfixSegment> const & seq, + TPosition pos) +{ + return value(seq, pos); +} + +} // namespace seqan2 diff --git a/include/utilities/alphabet_wrapper/std/span.hpp b/include/utilities/alphabet_wrapper/std/span.hpp new file mode 100644 index 00000000..755fb56a --- /dev/null +++ b/include/utilities/alphabet_wrapper/std/span.hpp @@ -0,0 +1,183 @@ +#pragma once +/* +MIT License + +Copyright (c) 2020 Barry Revzin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include +#include +#include + +#if __has_include() && __has_include() +#include +#include + +namespace span_ext { + using std::same_as; + using std::convertible_to; + using std::invocable; + using std::equality_comparable; + using std::three_way_comparable; + + using std::ranges::begin; + using std::ranges::end; + using std::ranges::range_value_t; + using std::contiguous_iterator; + using std::ranges::contiguous_range; +} + +#else +// rely on range-v3 to provide the implementations of everything +#include +#include +#include +#include +#include + +namespace span_ext { + using concepts::same_as; + using concepts::convertible_to; + using ranges::invocable; + using concepts::equality_comparable; + + template + concept compares_as = same_as, Cat>; + + template + concept three_way_comparable = equality_comparable && + requires (std::remove_reference_t const& t) { + { t < t } -> convertible_to; + { t > t } -> convertible_to; + { t <= t } -> convertible_to; + { t < t } -> convertible_to; + { t <=> t } -> compares_as; + }; + + using ranges::begin; + using ranges::end; + using ranges::iter_value_t; + using ranges::range_value_t; + using ranges::contiguous_iterator; + using ranges::contiguous_range; +} +#endif + +namespace span_ext { + inline constexpr auto synth_three_way = + [](const T& t, const T& u) + requires requires { + { t < u } -> convertible_to; + { u < t } -> convertible_to; + } + { + if constexpr (three_way_comparable) { + return t <=> u; + } else { + if (t < u) return std::weak_ordering::less; + if (u < t) return std::weak_ordering::greater; + return std::weak_ordering::equivalent; + } + }; + + template + concept synth_comparable = invocable; + + template + concept sameish = same_as, std::remove_cvref_t>; + + template + concept contiguous_range_of = contiguous_range + && sameish>; + + template + concept memcmp_friendly = sameish || sameish; + + template + auto lexicographical_compare_three_way( + InputIter1 first1, InputIter1 last1, + InputIter2 first2, InputIter2 last2) + -> decltype(synth_three_way(*first1, *first2)) + { +#if __cpp_lib_three_way_comparison >= 201711L + return std::lexicographical_compare_three_way( + first1, last1, first2, last2, synth_three_way); +#else + if (!std::is_constant_evaluated()) { + // if both iterators are contiguous and refer to memcmp-friendly types + if constexpr (contiguous_iterator && + contiguous_iterator && + sameish, + iter_value_t> && + memcmp_friendly>) + { + auto const len1 = last1 - first1; + auto const len2 = last2 - first2; + + auto const lencmp = len1 <=> len2; + auto const len = lencmp < 0 ? len1 : len2; + + if (len != 0) { + auto const c = __builtin_memcmp(&*first1, &*first2, len) <=> 0; + if (c != 0) { + return c; + } + } + return lencmp; + } + } + + // just loop + while (first1 != last1) { + if (first2 == last2) { + return std::strong_ordering::greater; + } + if (auto const c = synth_three_way(*first1, *first2); c != 0) { + return c; + } + ++first1; + ++first2; + } + return (first2 == last2) <=> true; +#endif + } +} + +namespace std { + template R> + constexpr bool operator==(span lhs, R const& rhs) + { + return ::std::equal( + lhs.begin(), lhs.end(), + span_ext::begin(rhs), span_ext::end(rhs)); + } + + template R> + constexpr auto operator<=>(span lhs, R const& rhs) + { + return span_ext::lexicographical_compare_three_way( + lhs.begin(), lhs.end(), + span_ext::begin(rhs), span_ext::end(rhs)); + } + +} // namespace std diff --git a/include/utilities/alphabet_wrapper/std/tag_invoke.hpp b/include/utilities/alphabet_wrapper/std/tag_invoke.hpp new file mode 100644 index 00000000..80207ffd --- /dev/null +++ b/include/utilities/alphabet_wrapper/std/tag_invoke.hpp @@ -0,0 +1,108 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#pragma once + +#include +#include + +namespace std +{ + // ---------------------------------------------------------------------------- + // Implementation of tag_invoke + // ---------------------------------------------------------------------------- + + namespace _tag_invoke + { + // poison pill + void tag_invoke(); + + struct _fn + { + template + constexpr auto operator()(cpo_t cpo, args_t &&...args) const + noexcept(noexcept(tag_invoke(std::forward(cpo), std::forward(args)...))) + -> decltype(tag_invoke(std::forward(cpo), std::forward(args)...)) + { + return tag_invoke((cpo_t &&)cpo, (args_t &&)args...); + } + }; + + template + using tag_invoke_result_t = decltype(tag_invoke(std::declval(), std::declval()...)); + + using yes_type = char; + using no_type = char (&)[2]; + + template + auto try_tag_invoke(int) // + noexcept(noexcept(tag_invoke(std::declval(), std::declval()...))) + -> decltype(static_cast(tag_invoke(std::declval(), std::declval()...)), yes_type{}); + + template + no_type try_tag_invoke(...) noexcept(false); + + template