diff --git a/.gitmodules b/.gitmodules
index bf13091e..533f40dd 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -11,9 +11,6 @@
[submodule "lib/raptor_data_simulation"]
path = lib/raptor_data_simulation
url = git@github.com:eaasna/raptor_data_simulation.git
-[submodule "lib/stellar3"]
- path = lib/stellar3
- url = git@github.com:seqan/stellar3.git
[submodule "lib/seqan"]
path = lib/seqan
url = git@github.com:seqan/seqan.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3934893..79f95e7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,12 @@ set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set (CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+# Silence unused-entity warnings. Append instead of overwriting so that a
+# user-supplied CMAKE_CXX_FLAGS survives. Debug-only flags (-g -O0, sanitizers)
+# are not hard-coded: use -DCMAKE_BUILD_TYPE=Debug or -DCMAKE_CXX_FLAGS=...
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-unused-value -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-local-typedefs")
+
+
# Messages
string (ASCII 27 Esc)
set (FontBold "${Esc}[1m")
diff --git a/include/dream_stellar/LICENSE.md b/include/dream_stellar/LICENSE.md
new file mode 100644
index 00000000..e8023ac7
--- /dev/null
+++ b/include/dream_stellar/LICENSE.md
@@ -0,0 +1,22 @@
+// ==========================================================================
+// STELLAR - SwifT Exact LocaL AligneR
+// http://www.seqan.de/projects/stellar/
+// ==========================================================================
+// Copyright (C) 2010-2012 by Birte Kehr
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+// ==========================================================================
+// Author: Birte Kehr
+// ==========================================================================
\ No newline at end of file
diff --git a/include/dream_stellar/diagnostics/print.hpp b/include/dream_stellar/diagnostics/print.hpp
new file mode 100644
index 00000000..055a5ffb
--- /dev/null
+++ b/include/dream_stellar/diagnostics/print.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+#include
+#include
+
+namespace dream_stellar
+{
+
+///////////////////////////////////////////////////////////////////////////////
+// Calculates parameters from parameters in options object and writes them to outStr
+// Sets options.qGram if not set by user input
+template
+void _writeCalculatedParams(StellarOptions & options, TStream & outStr);
+
+///////////////////////////////////////////////////////////////////////////////
+// Writes user specified parameters from options object to outStr
+template
+void _writeSpecifiedParams(StellarOptions const & options, TStream & outStr);
+
+///////////////////////////////////////////////////////////////////////////////
+// Writes file name from options object to outStr
+template
+void _writeFileNames(StellarOptions const & options, TStream & outStr);
+
+///////////////////////////////////////////////////////////////////////////////
+// Calculates parameters from parameters in options object and from sequences and writes them to outStr
+template
+void _writeMoreCalculatedParams(StellarOptions const & options,
+ uint64_t const & refLen,
+ std::vector const & queries,
+ TStream & outStr);
+
+///////////////////////////////////////////////////////////////////////////////
+// Overload of _writeOutputStatistics without an output stream parameter.
+// NOTE(review): no definition of this overload is visible next to the template
+// definitions in print.tpp -- confirm where it is defined or whether it is unused.
+void _writeOutputStatistics(StellarOutputStatistics const & statistics, bool const verbose, bool const writeDisabledQueriesFile);
+
+///////////////////////////////////////////////////////////////////////////////
+// Writes the number of SWIFT hits, the longest hit and the average hit length to outStr.
+// Writes nothing when statistics.numSwiftHits is 0.
+template
+void _printStellarKernelStatistics(StellarComputeStatistics const & statistics, TStream & outStr);
+
+///////////////////////////////////////////////////////////////////////////////
+// Writes databaseID to outStr, followed by ", complement" when databaseStrand is false.
+// In verbose mode the kernel statistics are appended before the line is ended.
+template
+void _printDatabaseIdAndStellarKernelStatistics(
+ bool const verbose,
+ bool const databaseStrand,
+ CharString const & databaseID,
+ StellarComputeStatistics const & statistics,
+ TStream & outStr);
+
+///////////////////////////////////////////////////////////////////////////////
+// Calls _printDatabaseIdAndStellarKernelStatistics for every database id, pairing
+// databaseIDs[i] with computeStatistics[i].
+template
+void _printStellarStatistics(
+ bool const verbose,
+ bool const databaseStrand,
+ StringSet const & databaseIDs,
+ StellarComputeStatisticsCollection const & computeStatistics,
+ TStream & outStr);
+
+///////////////////////////////////////////////////////////////////////////////
+// Writes the number of eps-matches to outStr. In verbose mode additionally writes the
+// longest and average match length (only if there is at least one match) and, when
+// writeDisabledQueriesFile is set, the number of disabled queries.
+template
+void _writeOutputStatistics(StellarOutputStatistics const & statistics,
+ bool const verbose,
+ bool const writeDisabledQueriesFile,
+ TStream & outStr);
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/diagnostics/print.tpp b/include/dream_stellar/diagnostics/print.tpp
new file mode 100644
index 00000000..f1a23b5b
--- /dev/null
+++ b/include/dream_stellar/diagnostics/print.tpp
@@ -0,0 +1,182 @@
+#pragma once
+
+#include
+
+namespace dream_stellar
+{
+
+using namespace seqan2;
+
+///////////////////////////////////////////////////////////////////////////////
+// Writes user specified parameters from options object to outStr
+//
+// Always written: minimal match length, epsilon, x-drop, the forward/reverse
+// strand switches, the verification strategy, the maximal number of matches and
+// the duplicate removal interval. All other entries are written only when the
+// option differs from the value it is compared against below.
+template
+void _writeSpecifiedParams(StellarOptions const & options, TStream & outStr)
+{
+//IOREV _notio_
+ // Output user specified parameters
+ outStr << "User specified parameters:" << std::endl;
+ outStr << " minimal match length : " << options.minLength << std::endl;
+ outStr << " maximal error rate (epsilon) : " << options.epsilon << std::endl;
+ outStr << " maximal x-drop : " << options.xDrop << std::endl;
+ // qGram still holding the numeric_limits maximum is treated as "not specified"
+ if (options.qGram != std::numeric_limits::max())
+ outStr << " k-mer (q-gram) length : " << options.qGram << std::endl;
+ outStr << " search forward strand : " << ((options.forward) ? "yes" : "no") << std::endl;
+ outStr << " search reverse complement : " << ((options.reverse) ? "yes" : "no") << std::endl;
+ outStr << std::endl;
+
+ outStr << " verification strategy : " << to_string(options.verificationMethod) << std::endl;
+ // disableThresh at the numeric_limits maximum means no threshold was given
+ if (options.disableThresh != std::numeric_limits::max())
+ {
+ outStr << " disable queries with more than : " << options.disableThresh << " matches" << std::endl;
+ }
+ outStr << " maximal number of matches : " << options.numMatches << std::endl;
+ outStr << " duplicate removal every : " << options.compactThresh << std::endl;
+ // Repeat filter settings are reported as a pair as soon as one of them is not 1 / 1000
+ if (options.maxRepeatPeriod != 1 || options.minRepeatLength != 1000)
+ {
+ outStr << " max low complexity repeat period : " << options.maxRepeatPeriod << std::endl;
+ outStr << " min low complexity repeat length : " << options.minRepeatLength << std::endl;
+ }
+ // An abundance cut of exactly 1 is not reported
+ if (options.qgramAbundanceCut != 1)
+ {
+ outStr << " q-gram abundance cut ratio : " << options.qgramAbundanceCut << std::endl;
+ }
+
+ outStr << std::endl;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Reports the parameters that StellarStatistics derives from the options on outStr.
+// If the k-mer length was computed there, it is written back to options.qGram
+// and printed as well.
+template
+void _writeCalculatedParams(StellarOptions & options, TStream & outStr)
+{
+//IOREV _notio_
+    StellarStatistics derived{options};
+
+    outStr << "Calculated parameters:" << std::endl;
+    if (derived.kMerComputed)
+    {
+        options.qGram = static_cast<unsigned>(derived.kMerLength);
+        outStr << " k-mer length : " << derived.kMerLength << std::endl;
+    }
+
+    outStr << " s^min : " << derived.smin << std::endl
+           << " threshold : " << derived.threshold << std::endl
+           << " distance cut : " << derived.distanceCut << std::endl
+           << " delta : " << derived.delta << std::endl
+           << " overlap : " << derived.overlap << std::endl
+           << std::endl;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Writes file name from options object to outStr
+//
+// Written: database file, query file, alphabet, output file and output format.
+// The disabled-queries file name is written only when options.disableThresh
+// differs from the numeric_limits maximum it is compared against.
+template
+void _writeFileNames(StellarOptions const & options, TStream & outStr)
+{
+//IOREV _notio_
+ outStr << "I/O options:" << std::endl;
+ outStr << " database file : " << options.databaseFile << std::endl;
+ outStr << " query file : " << options.queryFile << std::endl;
+ outStr << " alphabet : " << options.alphabet << std::endl;
+ outStr << " output file : " << options.outputFile << std::endl;
+ outStr << " output format : " << options.outputFormat << std::endl;
+ if (options.disableThresh != std::numeric_limits::max())
+ {
+ outStr << " disabled queries: " << options.disabledQueriesFile << std::endl;
+ }
+ outStr << std::endl;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Reports the q-gram abundance figures that depend on the total query length.
+// Nothing is written while options.qgramAbundanceCut equals 1. refLen is not
+// used by this function.
+template
+void _writeMoreCalculatedParams(StellarOptions const & options,
+                                uint64_t const & refLen,
+                                std::vector const & queries,
+                                TStream & outStr)
+{
+    bool const reportAbundance = (options.qgramAbundanceCut != 1);
+    if (reportAbundance)
+        outStr << "Calculated parameters:" << std::endl;
+
+    // Sum of the sizes of all query sequences.
+    uint64_t totalQueryLength{0};
+    for (auto & singleQuery : queries)
+        totalQueryLength += singleQuery.size();
+
+    if (!reportAbundance)
+        return;
+
+    outStr << " q-gram expected abundance : ";
+    // Divisor is 2^(2 * qGram), computed as an integer shift.
+    outStr << totalQueryLength / (double)((long)1 << (options.qGram << 1)) << std::endl;
+    outStr << " q-gram abundance threshold: ";
+    outStr << _max(100, (int)(totalQueryLength * options.qgramAbundanceCut)) << std::endl;
+    outStr << std::endl;
+}
+
+// Writes the SWIFT hit count, the longest hit and the average hit length to
+// outStr, each preceded by a line break. Writes nothing when there were no hits.
+template
+void _printStellarKernelStatistics(StellarComputeStatistics const & statistics, TStream & outStr)
+{
+    if (statistics.numSwiftHits != 0)
+    {
+        outStr << std::endl << " # SWIFT hits : " << statistics.numSwiftHits;
+        outStr << std::endl << " Longest hit : " << statistics.maxLength;
+
+        auto const averageHitLength = statistics.totalLength / statistics.numSwiftHits;
+        outStr << std::endl << " Avg hit length : " << averageHitLength;
+    }
+}
+
+// Writes one database id to outStr (suffixed with ", complement" when
+// databaseStrand is false), flushes, appends the kernel statistics in verbose
+// mode and finally ends the line.
+template
+void _printDatabaseIdAndStellarKernelStatistics(
+    bool const verbose,
+    bool const databaseStrand,
+    CharString const & databaseID,
+    StellarComputeStatistics const & statistics,
+    TStream & outStr)
+{
+    bool const isComplement = !databaseStrand;
+
+    outStr << " " << databaseID;
+    if (isComplement)
+    {
+        outStr << ", complement";
+    }
+    outStr << std::flush;
+
+    if (verbose)
+        _printStellarKernelStatistics(statistics, outStr);
+
+    outStr << std::endl;
+}
+
+// Prints one line per database sequence, pairing databaseIDs[i] with
+// computeStatistics[i]. A line break is first written to std::cerr.
+template
+void _printStellarStatistics(
+    bool const verbose,
+    bool const databaseStrand,
+    StringSet const & databaseIDs,
+    StellarComputeStatisticsCollection const & computeStatistics,
+    TStream & outStr)
+{
+    std::cerr << std::endl; // swift filter output is on same line
+
+    size_t const databaseCount = length(databaseIDs);
+    for (size_t idx = 0; idx != databaseCount; ++idx)
+    {
+        _printDatabaseIdAndStellarKernelStatistics(verbose,
+                                                   databaseStrand,
+                                                   databaseIDs[idx],
+                                                   computeStatistics[idx],
+                                                   outStr);
+    }
+}
+
+// Writes the number of eps-matches to outStr. Only in verbose mode: the longest
+// and average match length (if at least one match exists) and, when
+// writeDisabledQueriesFile is set, the number of disabled queries.
+template
+void _writeOutputStatistics(StellarOutputStatistics const & statistics,
+                            bool const verbose,
+                            bool const writeDisabledQueriesFile,
+                            TStream & outStr)
+{
+    outStr << "# Eps-matches : " << statistics.numMatches << std::endl;
+    if (!verbose)
+        return;
+
+    bool const hasMatches = statistics.numMatches > 0;
+    if (hasMatches)
+    {
+        outStr << "Longest eps-match : " << statistics.maxLength << std::endl;
+        outStr << "Avg match length : " << statistics.totalLength / statistics.numMatches << std::endl;
+    }
+
+    if (writeDisabledQueriesFile)
+    {
+        outStr << "# Disabled queries: " << statistics.numDisabled << std::endl;
+    }
+}
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/extension/align_banded_nw_best_ends.hpp b/include/dream_stellar/extension/align_banded_nw_best_ends.hpp
new file mode 100644
index 00000000..54cbb9c9
--- /dev/null
+++ b/include/dream_stellar/extension/align_banded_nw_best_ends.hpp
@@ -0,0 +1,172 @@
+#pragma once
+
+#include
+
+namespace dream_stellar
+{
+using namespace seqan2;
+
+///////////////////////////////////////////////////////////////////////////////
+// Computes the banded alignment matrix and additionally a string with the best
+// alignment end point for each alignment length.
+//
+// trace    - resized to height * diagonalWidth and filled with one trace value
+//            (Diagonal / Horizontal / Vertical) per visited band cell.
+// bestEnds - indexed by the number of errors; entry e holds the longest alignment
+//            seen with e errors together with the band cell (row, col) it ends in.
+//            Trailing entries that are not longer than their predecessor are
+//            removed before returning.
+// str      - str[0] is aligned along the columns, str[1] along the rows.
+// sc       - scoring scheme; the asserts below require match score 1 and identical
+//            mismatch, gap and gap-extension scores.
+// diagL / diagU - lower and upper diagonal of the band (diagU >= diagL).
+template
+inline void
+_align_banded_nw_best_ends(TTrace& trace,
+ std::vector & bestEnds,
+ TSegmentVector const & str,
+ TScore const & sc,
+ TDiagonal const diagL,
+ TDiagonal const diagU)
+{
+ typedef typename Value::Type TTraceValue;
+ typedef typename Value::Type TScoreValue;
+ typedef typename Value::Type TSegment; // was Segment> now Segment>
+ typedef typename Size::Type TSize;
+ using TAlphabet = typename Value::Type;
+ //!TODO: TAlphabet should NOT be of container type
+
+ SEQAN_ASSERT_GEQ(diagU, diagL);
+
+ // Initialization
+ TTraceValue const Diagonal = 0;
+ TTraceValue const Horizontal = 1;
+ TTraceValue const Vertical = 2;
+ TSegment const& str1 = str[0];
+ TSegment const& str2 = str[1];
+ TSize const len1 = length(str1) + 1;
+ TSize const len2 = length(str2) + 1;
+ TSize const diagonalWidth = (TSize) (diagU - diagL + 1);
+ TSize hi_diag = diagonalWidth;
+ TSize lo_diag = 0;
+ if (diagL > 0) lo_diag = 0;
+ else lo_diag = (diagU < 0) ? hi_diag : (TSize) (1 - diagL);
+ TSize const lo_row = (diagU <= 0) ? -diagU : 0;
+ TSize const hi_row = [&]()
+ {
+ TSize const max_hi_row = len2;
+ // Note: diagL might be negative
+ assert((TDiagonal) len1 >= diagL);
+ if (len1 - diagL < max_hi_row)
+ return len1 - diagL;
+ else
+ return max_hi_row;
+ }();
+ TSize const height = hi_row - lo_row;
+
+ // mat holds the scores and len the alignment lengths of the current row only
+ typedef String TRow;
+ TRow mat, len;
+ resize(mat, diagonalWidth);
+ resize(len, diagonalWidth);
+ resize(trace, height * diagonalWidth);
+
+ // Classical DP with affine gap costs
+ typedef typename Iterator::Type TRowIter;
+ typedef typename Iterator::Type TTraceIter;
+
+ TSize errors;
+
+ assert(scoreMatch(sc) == 1u);
+ assert(scoreGap(sc) == scoreGapExtendHorizontal(sc, TAlphabet{}, TAlphabet{}));
+ assert(scoreGap(sc) == scoreGapExtendVertical(sc, TAlphabet{}, TAlphabet{}));
+ assert(scoreMismatch(sc) == scoreGap(sc));
+
+ TScoreValue const matchScore = scoreMatch(sc);
+ TScoreValue const gapScore = scoreGap(sc);
+
+ for(TSize row = 0; row < height; ++row) {
+ TSize actualRow = row + lo_row;
+ // The active diagonal interval [lo_diag, hi_diag) shrinks by at most one per row
+ if (lo_diag > 0) --lo_diag;
+ if ((TDiagonal)actualRow >= (TDiagonal)len1 - diagU) --hi_diag;
+ TTraceIter traceIt = begin(trace, Standard()) + row * diagonalWidth + lo_diag;
+ TRowIter current_score_rowise_it = begin(mat, Standard()) + lo_diag;
+ TRowIter alignment_length_it = begin(len, Standard()) + lo_diag;
+
+ TScoreValue score_left = std::numeric_limits::min();
+ TScoreValue alignment_length_left = len1+len2+1;
+
+ for(TSize col = lo_diag; col= len1) break;
+
+ if ((actualRow != 0) && (actualCol != 0)) {
+ TAlphabet const str1entry = sequenceEntryForScore(sc, str1, ((int) actualCol - 1));
+ TAlphabet const str2entry = sequenceEntryForScore(sc, str2, ((int) actualRow - 1));
+
+ // Get the new maximum for mat
+ *current_score_rowise_it += score(sc, str1entry, str2entry);
+ *traceIt = Diagonal;
+ ++(*alignment_length_it);
+
+ // In band coordinates the cell above is the next entry of the previous row
+ TScoreValue score_up =
+ (col < diagonalWidth - 1) ?
+ *(current_score_rowise_it+1) + gapScore :
+ std::numeric_limits::min();
+
+ if (score_up > *current_score_rowise_it)
+ {
+ *current_score_rowise_it = score_up;
+ *traceIt = Vertical;
+ *alignment_length_it = *(alignment_length_it+1) + 1;
+ }
+
+ score_left =
+ (col > 0) ?
+ score_left + gapScore :
+ std::numeric_limits::min();
+ if (score_left > *current_score_rowise_it)
+ {
+ *current_score_rowise_it = score_left;
+ *traceIt = Horizontal;
+ *alignment_length_it = alignment_length_left + 1;
+ }
+ score_left = *current_score_rowise_it;
+ alignment_length_left = *alignment_length_it;
+ } else {
+ // Usual initialization for first row and column
+ if (actualRow == 0) {
+ *current_score_rowise_it = actualCol * gapScore;
+ *alignment_length_it = actualCol;
+ }
+ else {
+ assert(actualCol == 0);
+ *current_score_rowise_it = actualRow * gapScore;
+ *alignment_length_it = actualRow;
+ score_left = *current_score_rowise_it;
+ alignment_length_left = actualRow;
+ }
+ }
+
+ // *current_score_rowise_it: the alignment_score
+ // *alignment_length_it: the alignment_length (basically length of best trace path of the current cell)
+ // alignment_length = |matches| + |mismatches| + |gaps|
+ //
+ // alignment_score
+ // = |matches| * match_score + |mismatches| * gap_score + |gaps| * gap_score
+ //
+ // alignment_length * match_score
+ // = |matches| * match_score + |mismatches| * match_score + |gaps| * match_score
+ //
+ // alignment_score - alignment_length * match_score
+ // = 0 + (|mismatches| + |gaps|)(gap_score - match_score)
+ //
+ // Thus
+ // (alignment_score - alignment_length * match_score) / (gapScore - matchScore)
+ // = |mismatches| + |gaps| = errors
+ errors = (*current_score_rowise_it - (*alignment_length_it * matchScore)) / (gapScore - matchScore);
+ SEQAN_ASSERT_GEQ(errors, 0);
+ SEQAN_ASSERT_LEQ(errors, bestEnds.size());
+ // bestEnds[e] keeps the longest alignment found so far with exactly e errors
+ if (errors == bestEnds.size()) {
+ bestEnds.emplace_back(TEnd(*alignment_length_it, row, col));
+ } else if (*alignment_length_it > static_cast(bestEnds[errors].length))
+ bestEnds[errors] = TEnd(*alignment_length_it, row, col);
+ //std::cerr << row << ',' << col << ':' << *current_score_rowise_it << std::endl;
+ }
+ }
+
+ // No cell was visited (e.g. height == 0): nothing to trim, and size() - 1 would wrap around
+ if (bestEnds.empty())
+ return;
+
+ // Drop trailing entries that allow more errors without yielding a longer alignment.
+ // resize() is required here: reserve() never reduces the number of elements, so the
+ // trimming computed by the loop would otherwise have no effect.
+ TSize newLength = bestEnds.size() - 1;
+ while (newLength > 0 && bestEnds[newLength].length <= bestEnds[newLength-1].length) {
+ --newLength;
+ }
+ bestEnds.resize(newLength + 1);
+}
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/extension/extension_banded_trace_matrix.hpp b/include/dream_stellar/extension/extension_banded_trace_matrix.hpp
new file mode 100644
index 00000000..135ddb4b
--- /dev/null
+++ b/include/dream_stellar/extension/extension_banded_trace_matrix.hpp
@@ -0,0 +1,156 @@
+#pragma once
+
+#include
+
+#include
+#include // needs seqan/sequence.h
+
+namespace dream_stellar
+{
+
+// Storage for the trace values of a banded alignment. Only the cells between
+// lowerDiagonal and upperDiagonal are stored: each stored row occupies
+// diagonalWidth() consecutive entries of the underlying seqan2 string.
+struct extension_banded_trace_matrix
+{
+ using diagonal_t = std::make_signed_t;
+
+ // Stores the dimensions and the band, then resizes the trace storage to dataSize().
+ // Requires lowerDiagonal <= upperDiagonal (asserted).
+ extension_banded_trace_matrix(
+ size_t const rowCount,
+ size_t const columnCount,
+ diagonal_t const lowerDiagonal,
+ diagonal_t const upperDiagonal
+ ) :
+ _rowCount{rowCount},
+ _columnCount{columnCount},
+ _lowerDiagonal{lowerDiagonal},
+ _upperDiagonal{upperDiagonal},
+ _traceMatrix{}
+ {
+ assert(lowerDiagonal <= upperDiagonal);
+
+ resize(_traceMatrix, dataSize());
+ }
+
+ // Row count as passed to the constructor.
+ size_t rows() const
+ {
+ return _rowCount;
+ }
+
+ // Column count as passed to the constructor.
+ size_t columns() const
+ {
+ return _columnCount;
+ }
+
+ diagonal_t lowerDiagonal() const
+ {
+ return _lowerDiagonal;
+ }
+
+ diagonal_t upperDiagonal() const
+ {
+ return _upperDiagonal;
+ }
+
+ // Number of diagonals in the band, both bounds included.
+ size_t diagonalWidth() const
+ {
+ return _upperDiagonal - _lowerDiagonal + 1;
+ }
+
+ // memory region for active row
+ // Returns an empty span when row lies outside rowInterval().
+ // NOTE(review): the check accepts row == endRow, while dataSize() treats the
+ // interval as half-open (height = end - begin) -- confirm whether `row < endRow`
+ // was intended.
+ std::span rowSpan(size_t const row)
+ {
+ auto [beginRow, endRow] = rowInterval();
+ if (!(beginRow <= row && row <= endRow))
+ return {};
+
+ size_t rowOffset = row - beginRow;
+ auto diagonalInterval = diagonalIntervalInRow(row);
+ // Same computation as columnIntervalInRow(): clamp the columns to _columnCount
+ size_t columnBegin = std::min(diagonalInterval.first + _lowerDiagonal + row, _columnCount);
+ size_t columnEnd = std::min(diagonalInterval.second + _lowerDiagonal + row, _columnCount);
+
+ return data().subspan(
+ rowOffset * diagonalWidth() + diagonalInterval.first,
+ columnEnd - columnBegin);
+ }
+
+ // complete underlying data
+ // NOTE(review): dereferences begin(_traceMatrix); presumably only valid when
+ // dataSize() > 0 -- confirm.
+ std::span data()
+ {
+ seqan2::TraceBack & firstValue = *begin(_traceMatrix);
+ return {&firstValue, dataSize()};
+ }
+
+ // Direct access to the seqan2 string that backs the matrix.
+ seqan2::String & underlyingTraceMatrix()
+ {
+ return _traceMatrix;
+ }
+
+ // Number of stored entries: (rows in rowInterval()) * diagonalWidth().
+ size_t dataSize() const
+ {
+ std::pair pair = rowInterval();
+ size_t height = pair.second - pair.first;
+ return height * diagonalWidth();
+ }
+
+ // First row and end row covered by the band.
+ // rowBegin is -_upperDiagonal when the upper diagonal is not positive, else 0;
+ // rowEnd is the smaller of _rowCount and _columnCount - _lowerDiagonal.
+ std::pair rowInterval() const
+ {
+ size_t const rowBegin = (_upperDiagonal <= 0) ? -_upperDiagonal : 0;
+ size_t const rowEnd = [&]()
+ {
+ size_t const maxRowEnd = _rowCount;
+ assert ((diagonal_t)_columnCount >= _lowerDiagonal);
+ if (_columnCount - _lowerDiagonal < maxRowEnd)
+ return _columnCount - _lowerDiagonal;
+ else
+ return maxRowEnd;
+ }();
+
+ return {rowBegin, rowEnd};
+ }
+
+ // Interval of band positions (offsets within one stored row) used in the given row.
+ // Closed-form version of the per-row decrements quoted in the comments below.
+ std::pair diagonalIntervalInRow(size_t const row) const
+ {
+ size_t const rowOffset = row - rowInterval().first;
+
+ size_t diagonalEnd = diagonalWidth();
+ size_t diagonalBegin = 0;
+ if (_lowerDiagonal <= 0)
+ {
+ diagonalBegin = (_upperDiagonal < 0) ? diagonalEnd : (size_t) (1 - _lowerDiagonal);
+ }
+
+ // subtract in each row iteration
+ // if (diagonalBegin > 0) --diagonalBegin;
+
+ size_t const diagonalBeginOffset = std::min(rowOffset + 1, diagonalBegin);
+ diagonalBegin -= diagonalBeginOffset; // lo_diag
+
+ // subtract in each row iteration
+ // if ((diagonal_t)actualRow >= (diagonal_t)len1 - diagU) --hi_diag;
+ size_t const diagonalEndOffset =
+ ((diagonal_t)row >= (diagonal_t)_columnCount - _upperDiagonal) ?
+ (row - _columnCount + _upperDiagonal) :
+ 0;
+ diagonalEnd -= diagonalEndOffset; // hi_diag
+
+ return {diagonalBegin, diagonalEnd};
+ }
+
+ // Column interval covered in the given row: the band interval shifted by
+ // _lowerDiagonal + row and clamped to _columnCount.
+ std::pair columnIntervalInRow(size_t const row) const
+ {
+ auto diagonal = diagonalIntervalInRow(row);
+
+ size_t columnBegin = std::min(diagonal.first + _lowerDiagonal + row, _columnCount);
+ size_t columnEnd = std::min(diagonal.second + _lowerDiagonal + row, _columnCount);
+
+ return {columnBegin, columnEnd};
+ }
+
+private:
+ size_t _rowCount;
+ size_t _columnCount;
+ diagonal_t _lowerDiagonal;
+ diagonal_t _upperDiagonal;
+
+ // Band cells of all stored rows, diagonalWidth() entries per row
+ seqan2::String _traceMatrix;
+};
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/extension/extension_end_position.hpp b/include/dream_stellar/extension/extension_end_position.hpp
new file mode 100644
index 00000000..de7ff2c0
--- /dev/null
+++ b/include/dream_stellar/extension/extension_end_position.hpp
@@ -0,0 +1,26 @@
+#pragma once
+
+#include
+
+namespace dream_stellar
+{
+using namespace seqan2;
+
+///////////////////////////////////////////////////////////////////////////////
+// One candidate end point of an eps-core extension: the alignment length
+// reached there plus the (row, col) pair identifying where it ends.
+template
+struct ExtensionEndPosition {
+    using TPosition = TPos_;
+    using TCoordinate = Pair;
+
+    TPosition length;
+    TCoordinate coord;
+
+    // Zero-length end point at coordinate (0, 0).
+    ExtensionEndPosition() :
+        ExtensionEndPosition(0, 0, 0)
+    {}
+
+    // End point of an alignment of length len ending at (row, col).
+    ExtensionEndPosition(TPosition len, TPosition row, TPosition col) :
+        length(len),
+        coord(row, col)
+    {}
+};
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/extension/longest_eps_match.hpp b/include/dream_stellar/extension/longest_eps_match.hpp
new file mode 100644
index 00000000..e5b4d50e
--- /dev/null
+++ b/include/dream_stellar/extension/longest_eps_match.hpp
@@ -0,0 +1,74 @@
+#pragma once
+
+#include
+
+#include
+
+#include
+
+namespace dream_stellar
+{
+using namespace seqan2;
+
+///////////////////////////////////////////////////////////////////////////////
+// Identifies the longest epsilon match in align from possEndsLeft and possEndsRight and sets the view positions of
+// align to start and end position of the longest epsilon match
+//
+// possEndsLeft / possEndsRight - candidate end points of the left and right extension;
+//                                the position of an entry is used as its error count.
+// alignLen / alignErr          - length and error count of the part between both extensions.
+// matchMinLength               - combinations shorter than this are not considered.
+// epsilon                      - maximal error rate (errors / length) of an accepted match.
+//
+// Returns iterators to the chosen left and right entry, or the two end() iterators
+// when no combination satisfies the length and error rate conditions.
+// NOTE(review): both vectors are presumably expected to be non-empty, since
+// end() - 1 is dereferenced and size() - 1 is computed unconditionally -- confirm.
+template, typename TIterator = std::vector::const_iterator>
+std::pair longestEpsMatch(std::vector > const & possEndsLeft,
+ std::vector > const & possEndsRight,
+ TLength const alignLen,
+ TLength const alignErr,
+ TSize const matchMinLength,
+ TEps const epsilon) {
+
+ // Identify longest eps match by iterating over combinations of left and right positions
+ // rightIt / leftIt walk from the last entry towards the first; right / left hold the best pair
+ TIterator rightIt = possEndsRight.end() - 1;
+ TIterator leftIt = possEndsLeft.end() - 1;
+ TIterator right = possEndsRight.begin();
+ TIterator left = possEndsLeft.begin();
+
+ /*for (int i = 0; i < length(possEndsRight); ++i) {
+ std::cout << possEndsRight[i].length << " " << possEndsRight[i].coord.i1 << "," << possEndsRight[i].coord.i2 << std::endl;
+ }
+ for (int i = 0; i < length(possEndsLeft); ++i) {
+ std::cout << possEndsLeft[i].length << " " << possEndsLeft[i].coord.i1 << "," << possEndsLeft[i].coord.i2 << std::endl;
+ }*/
+
+ // Error count that belongs to the entry leftIt points to
+ TSize leftErr = possEndsLeft.size() - 1;
+
+ // minLength grows to the length of the best match found so far
+ TSize minLength = matchMinLength;
+ bool found = false;
+ // DELTA is used below against floating point rounding errors.
+ double const DELTA = 0.000001;
+
+ // NOTE(review): the loop conditions rely on decrementing an iterator that may
+ // already equal begin() -- confirm this is acceptable for the iterator type in use.
+ while (leftIt >= possEndsLeft.begin()) {
+ TSize totalLen = (*leftIt).length + alignLen + (*rightIt).length;
+ // Even the last right entry cannot reach the required length with this left entry
+ if (totalLen < minLength) break;
+ TSize totalErr = leftErr + alignErr + possEndsRight.size() - 1;
+ while (rightIt >= possEndsRight.begin()) {
+ totalLen = (*leftIt).length + alignLen + (*rightIt).length;
+ if (totalLen < minLength) break;
+ // Accept the first combination whose error rate does not exceed epsilon
+ if ((TEps)totalErr/(TEps)totalLen < epsilon + DELTA) {
+ right = rightIt;
+ left = leftIt;
+ //std::cout << totalLen << std::endl;
+ minLength = totalLen;
+ found = true;
+ break;
+ }
+ --rightIt;
+ --totalErr;
+ }
+ // Restart the right side for the next (lower-error) left entry
+ rightIt = possEndsRight.end() - 1;
+ --leftIt;
+ --leftErr;
+ }
+
+ if (found)
+ return std::pair(left, right);
+ else
+ return std::pair(possEndsLeft.end(),possEndsRight.end());
+}
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/io/import_sequence.hpp b/include/dream_stellar/io/import_sequence.hpp
new file mode 100644
index 00000000..eeac7ca5
--- /dev/null
+++ b/include/dream_stellar/io/import_sequence.hpp
@@ -0,0 +1,81 @@
+#pragma once
+
+#include
+#include
+
+namespace dream_stellar
+{
+
+using namespace seqan2;
+
+// Inserts the leading part of id into uniqueIds and reports whether it was new.
+// The leading part ends before the first character that is not greater than ' '.
+// Returns true if the shortened id was inserted, false if it was already present.
+template
+inline bool
+_checkUniqueId(std::set & uniqueIds, TId const & id)
+{
+    TId shortId;
+
+    // (cut at first whitespace)
+    auto cursor = begin(id);
+    auto const stop = end(id);
+    while (cursor != stop && *cursor > ' ')
+    {
+        appendValue(shortId, *cursor);
+        ++cursor;
+    }
+
+    return uniqueIds.insert(shortId).second;
+}
+
+// Reads all database records from file_input and appends their sequences to seqs
+// and their ids to ids. seq_len is increased by the size of every sequence read.
+//
+// Two kinds of input are handled: a single input from which one
+// seqan3::sequence_file_input is opened, and a nested collection (bins of files)
+// where one sequence_file_input is opened per inner element. Any other input type
+// only produces a warning on str_err.
+//
+// A summary line is written to str_out; a warning is written to str_err when an
+// id occurs more than once. Always returns true.
+template
+inline bool _import_database_sequences(input_t const & file_input,
+                                       collection_t & seqs,
+                                       std::vector & ids,
+                                       uint64_t & seq_len,
+                                       stream_t & str_out,
+                                       stream_t & str_err)
+{
+    bool ids_unique{true};
+    size_t seq_count{0};
+
+    auto record_intake_lambda = [&](auto const & record)
+    {
+        // An id is unique only if it has NOT been stored before; the lookup has to
+        // happen before the id is appended below.
+        ids_unique &= (std::find(ids.begin(), ids.end(), record.id()) == ids.end());
+        seq_len += record.sequence().size();
+        seqs.emplace_back(record.sequence());
+        ids.emplace_back(record.id());
+        seq_count++;
+    };
+
+    if constexpr (std::is_same::value)
+    {
+        seqan3::sequence_file_input fin{file_input};
+        for (auto & record : fin)
+        {
+            record_intake_lambda(record);
+        }
+    }
+    else if constexpr (std::is_same>>::value)
+    {
+        for (auto & bin : file_input)
+        {
+            for (auto & bin_file : bin)
+            {
+                seqan3::sequence_file_input fin{bin_file};
+                // Iterate over the records of the opened file, not over bin_file itself.
+                for (auto & record : fin)
+                {
+                    record_intake_lambda(record);
+                }
+            }
+        }
+    }
+    else
+    {
+        str_err << "WARNING: Unknown database file input\n";
+    }
+
+    str_out << "Loaded " << seq_count << " adapted database sequence" << ((seq_count > 1) ? "s." : ".") << std::endl;
+    if (!ids_unique)
+        str_err << "WARNING: Non-unique adapted database ids. Output can be ambiguous.\n";
+    return true;
+}
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/options/dream_options.hpp b/include/dream_stellar/options/dream_options.hpp
new file mode 100644
index 00000000..a79982f2
--- /dev/null
+++ b/include/dream_stellar/options/dream_options.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include
+
+namespace dream_stellar
+{
+
+// Options that restrict a search to a subset or a segment of the reference.
+// Every member has a defined default so a default-constructed object can be
+// read safely.
+struct DREAMOptions
+{
+    bool prefilteredSearch{false}; // search a subset of all reference sequences (e.g chr 1, chr 2)
+    bool searchSegment{false}; // search a segment of a single reference sequence (e.g chr1:1000-3000)
+    uint64_t referenceLength{0};
+
+    // Specify the segment (and sequence) of interest.
+    std::vector binSequences;
+    uint32_t segmentBegin{0}; // zero until a segment is specified
+    uint32_t segmentEnd{0};   // zero until a segment is specified
+};
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/options/eps_match_options.hpp b/include/dream_stellar/options/eps_match_options.hpp
new file mode 100644
index 00000000..1c1699d7
--- /dev/null
+++ b/include/dream_stellar/options/eps_match_options.hpp
@@ -0,0 +1,16 @@
+
+#pragma once
+
+#include
+
+namespace dream_stellar
+{
+
+// Criteria an epsilon-match has to fulfil.
+struct EPSMatchOptions
+{
+    // Maximal error rate, stored as the fraction 5/100 by default.
+    dream_stellar::utils::fraction epsilon{5, 100};
+    // The same default error rate as a floating point number.
+    double numEpsilon = 0.05;
+    // Minimal length of an epsilon-match.
+    unsigned minLength = 100;
+};
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/options/index_options.hpp b/include/dream_stellar/options/index_options.hpp
new file mode 100644
index 00000000..0a27a791
--- /dev/null
+++ b/include/dream_stellar/options/index_options.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include
+
+namespace dream_stellar
+{
+
+// Options of the q-gram index.
+struct IndexOptions
+{
+ // Defaults to the numeric_limits maximum; the parameter printing code compares
+ // against that value to decide whether a q-gram length is reported.
+ size_t qGram{std::numeric_limits::max()}; // length of the q-grams
+ // A value of exactly 1 suppresses the abundance output in the parameter printing code.
+ double qgramAbundanceCut{1};
+};
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/options/verifier_options.hpp b/include/dream_stellar/options/verifier_options.hpp
new file mode 100644
index 00000000..a3539d22
--- /dev/null
+++ b/include/dream_stellar/options/verifier_options.hpp
@@ -0,0 +1,54 @@
+
+#pragma once
+
+#include
+#include
+
+namespace dream_stellar
+{
+
+// Tag for the verification strategy that StellarVerificationMethod stores as
+// index 0 and prints as "exact".
+struct VerifyAllLocal_;
+using AllLocal = seqan2::Tag const;
+
+// Tag for the verification strategy that StellarVerificationMethod stores as
+// index 1 and prints as "bestLocal".
+struct VerifyBestLocal_;
+using BestLocal = seqan2::Tag const;
+
+// Tagged choice between the verification strategies AllLocal and BestLocal;
+// behaves like a minimal std::variant over these two tags.
+struct StellarVerificationMethod
+{
+    // AllLocal selects alternative 0, BestLocal selects alternative 1.
+    StellarVerificationMethod(AllLocal) : _index{0} {}
+    StellarVerificationMethod(BestLocal) : _index{1} {}
+
+    // Zero-based index of the selected alternative.
+    constexpr std::size_t index() const noexcept
+    {
+        return _index;
+    }
+
+    // Two methods are equal when they select the same alternative.
+    friend constexpr bool operator==(
+        StellarVerificationMethod const & lhs,
+        StellarVerificationMethod const & rhs)
+    {
+        return lhs.index() == rhs.index();
+    }
+
+    // Name of the selected alternative: "exact" for index 0, "bestLocal" for index 1.
+    friend inline std::string to_string(StellarVerificationMethod method)
+    {
+        char const * const namesByIndex[] = {"exact", "bestLocal"};
+        std::size_t const selected = method.index();
+        return namesByIndex[selected];
+    }
+
+private:
+    std::size_t _index{~std::size_t{0u}};
+};
+
+// Options of the verification step.
+struct VerifierOptions
+{
+    // Maximal x-drop.
+    double xDrop = 5;
+
+    // Verification strategy (exact, bestLocal), once as text and once as the
+    // corresponding StellarVerificationMethod.
+    std::string strVerificationMethod = "exact";
+    StellarVerificationMethod verificationMethod{AllLocal{}};
+};
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/query_id_map.hpp b/include/dream_stellar/query_id_map.hpp
new file mode 100644
index 00000000..2fd0ecd4
--- /dev/null
+++ b/include/dream_stellar/query_id_map.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include
+
+#include
+
+namespace dream_stellar
+{
+
+/**
+ * @brief Associate a query ID with the corresponding segment sequence.
+ *
+ * Holds a reference to the query records; a query ID is the position of a
+ * record in that vector.
+ */
+template
+struct query_id_map
+{
+    using rec_t = valik::shared_query_record;
+    std::vector & records;
+
+    /**
+     * @brief Return the Stellar segment of the record stored at position query_id.
+     *
+     * @param query_id Position of the record in records.
+     * @throws std::runtime_error if query_id is not a valid position in records.
+     */
+    dream_stellar::StellarQuerySegment segment_from_id(unsigned const & query_id) const
+    {
+        // Report the empty case separately: records.size() - 1 would wrap around
+        // and produce a meaningless upper bound in the message.
+        if (records.empty())
+            throw std::runtime_error("Query index " + std::to_string(query_id) + " is out of range: no query records available");
+
+        if (query_id >= records.size())
+            throw std::runtime_error("Query index " + std::to_string(query_id) + " is out of range [0, "
+                                     + std::to_string(records.size() - 1) + "]");
+        rec_t & shared_rec = records[query_id];
+        return shared_rec.asStellarSegment();
+    }
+};
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/shared.hpp b/include/dream_stellar/shared.hpp
new file mode 100644
index 00000000..a815ee10
--- /dev/null
+++ b/include/dream_stellar/shared.hpp
@@ -0,0 +1,83 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace dream_stellar
+{
+
+///////////////////////////////////////////////////////////////////////////////
+// Options for Stellar (i/o, strand selection, match limits, repeat filter) plus k-mer / error-rate helper formulas
+struct StellarOptions : public EPSMatchOptions, public IndexOptions, public VerifierOptions, public DREAMOptions {
+ // i/o options
+ std::string databaseFile; // name of database file
+ std::string queryFile; // name of query file
+ std::string outputFile{"stellar.gff"}; // name of result file
+ std::string disabledQueriesFile{"stellar.disabled.fasta"}; // name of result file containing disabled queries
+ std::string outputFormat{"gff"}; // Possible formats: gff, text
+ std::string alphabet{"dna5"}; // Possible values: dna, rna, protein, char
+ bool write_time{false}; // write running time to standard output; initialized so a default-initialized object never holds an indeterminate value
+
+ // more options
+ bool forward{true}; // compute matches to forward strand of database
+ bool onlyForward{false};
+ bool reverse{true}; // compute matches to reverse complemented database
+ bool onlyReverse{false};
+
+ size_t disableThresh = std::numeric_limits<size_t>::max(); // maximal number of matches allowed per query before disabling verification of hits for that query
+ size_t compactThresh = 500; // number of matches after which removal of overlaps and duplicates is started
+ size_t numMatches = 50; // maximal number of matches per query and database
+ size_t maxRepeatPeriod = 1; // maximal period of low complexity repeats to be filtered
+ size_t minRepeatLength = 1000; // minimal length of low complexity repeats to be filtered
+ bool verbose{false}; // verbose mode
+
+ static constexpr size_t kmerCount(size_t sequenceLength, size_t kmerSize) // number of k-mers in a sequence of the given length
+ {
+ assert(kmerSize > 0u);
+ assert(sequenceLength >= kmerSize - 1u); // guarantees the subtraction below cannot wrap
+ // number of kmers
+ return sequenceLength + 1u - kmerSize;
+ }
+
+ static constexpr size_t kmerLemma(size_t sequenceLength, size_t kmerSize, size_t errors) // k-mer count minus kmerSize * errors, clamped at 0
+ {
+ size_t maxAffectedKMers = kmerSize * errors; // each error can touch at most kmerSize k-mers
+ size_t count = kmerCount(sequenceLength, kmerSize);
+ return std::max(count, maxAffectedKMers) - maxAffectedKMers; // saturating subtraction: never wraps below zero
+ }
+
+ static constexpr size_t pigeonholeLemma(size_t sequenceLength, size_t errors) // ceil((sequenceLength - errors) / (errors + 1))
+ {
+ assert(sequenceLength >= errors);
+ // how many consecutive chars must be error free
+ using difference_t = utils::fraction::difference_t;
+ return ceil(utils::fraction{static_cast<difference_t>(sequenceLength - errors), errors + 1});
+ }
+
+ static constexpr size_t minLengthWithExactError(size_t absoluteError, utils::fraction epsilon) // ceil(absoluteError / epsilon)
+ {
+ if (epsilon.numerator() == 0) // a zero error rate would mean dividing by zero below
+ return std::numeric_limits<size_t>::max();
+
+ using difference_t = utils::fraction::difference_t;
+ return ceil(utils::fraction{static_cast<difference_t>(absoluteError), 1} / epsilon);
+ }
+
+ static constexpr size_t absoluteErrors(utils::fraction epsilon, size_t sequenceLength) // floor(sequenceLength * epsilon)
+ {
+ using difference_t = utils::fraction::difference_t;
+ return floor(utils::fraction{static_cast<difference_t>(sequenceLength), 1} * epsilon);
+ }
+
+};
+
+}
diff --git a/include/dream_stellar/stellar.hpp b/include/dream_stellar/stellar.hpp
new file mode 100644
index 00000000..1ebbab48
--- /dev/null
+++ b/include/dream_stellar/stellar.hpp
@@ -0,0 +1,247 @@
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace dream_stellar
+{
+
+using namespace seqan2;
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks whether two matches overlap *in seq2* and
+// whether the non-overlapping parts are shorter than minLength.
+template
+bool
+checkOverlap(TMatch const & matchA, TMatch const & matchB, TSize const minLength) {
+ // check id and orientation
+ if (matchA.id != matchB.id || matchA.orientation != matchB.orientation) return false; // different id or strand: reported as not overlapping
+ if (matchA.id == TMatch::INVALID_ID || matchB.id == TMatch::INVALID_ID) return false; // invalidated matches are never reported as overlapping
+
+ // check overlap in seq2
+ if (matchA.begin2 >= matchB.begin2) { // A does not start before B
+ if (matchA.end2 >= matchB.end2) {
+ // check length of non-overlapping parts of both matches
+ if ((TSize)matchA.begin2 - (TSize)matchB.begin2 >= minLength &&
+ (TSize)matchA.end2 - (TSize)matchB.end2 >= minLength) {
+ return false; // both matches keep a unique part of at least minLength
+ }
+ }
+ // check whether offset is the same in both sequences
+ if (toViewPosition(matchA.row2, matchA.begin2) - toViewPosition(matchB.row2, matchB.begin2) !=
+ toViewPosition(matchA.row1, matchA.begin1) - toViewPosition(matchB.row1, matchB.begin1)) {
+ return false;
+ }
+ } else { // mirror case: A starts before B
+ if (matchA.end2 < matchB.end2) {
+ // check length of non-overlapping parts of both matches
+ if ((TSize)matchB.begin2 - (TSize)matchA.begin2 >= minLength &&
+ (TSize)matchB.end2 - (TSize)matchA.end2 >= minLength) {
+ return false; // both matches keep a unique part of at least minLength
+ }
+ }
+ // check whether offset is the same in both sequences
+ if (toViewPosition(matchB.row2, matchB.begin2) - toViewPosition(matchA.row2, matchA.begin2) !=
+ toViewPosition(matchB.row1, matchB.begin1) - toViewPosition(matchA.row1, matchA.begin1)) {
+ return false;
+ }
+ }
+ return true; // none of the exclusion tests fired
+}
+
+template
+TPosition projectedPosition(TRow const & rowA, TRow const & rowB, TPosition pos)
+{
+ auto const viewPos = toViewPosition(rowA, pos); // view position of the source position pos in rowA
+ return toSourcePosition(rowB, viewPos); // source position of rowB at that same view position
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks all alignment columns of two overlapping matches.
+// It is assumed that matchA.begin1 < matchB.begin1.
+template
+bool
+_checkAlignColOverlap(TMatch const & matchA, TMatch const & matchB, TSize const minLength)
+{
+ TSize equalCols = 0; // counted below but not used for the result
+ TSize diffCols = 0; // positions whose projection differs between the two matches
+
+ for (typename TMatch::TPos pos = matchB.begin1; pos < _min(matchA.end1, matchB.end1); ++pos) // positions from matchB.begin1 up to the smaller end1
+ {
+ if (projectedPosition(matchA.row1, matchA.row2, pos) == projectedPosition(matchB.row1, matchB.row2, pos))
+ ++equalCols;
+ else
+ ++diffCols;
+ }
+
+ if (diffCols >= minLength) return false; // at least minLength differing positions: not treated as overlapping
+ return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Marks matches that overlap in both sequences with a longer match as invalid.
+template
+void maskOverlaps(String > & matches, TSize const minLength)
+{
+ typedef StellarMatch TMatch;
+ typedef typename TMatch::TPos TPos;
+ typedef typename Iterator, Rooted>::Type TIter;
+ typedef typename Iterator, Rooted>::Type TOverlapIter;
+
+ // sort matches by begin position in row0
+ sortMatches(matches, LessPos());
+
+ // iterate all matches
+ TIter it = begin(matches);
+
+ // list of matches that potentially overlap with the current match in row0 and
+ // start earlier (including matches that overlap but have a unique part of at
+ // least minLength) sorted by descending end positions
+ String overlaps; // holds positions into `matches`, not the matches themselves
+
+ for (; it != end(matches); ++it)
+ {
+ if ((*it).id == TMatch::INVALID_ID) continue; // skip matches that are already invalid
+
+ TPos insertPos = 0; // slot in `overlaps` where the current match gets inserted afterwards
+
+ // iterate potentially overlapping matches
+ TOverlapIter overlapIt = begin(overlaps);
+ for (; overlapIt != end(overlaps); ++overlapIt)
+ {
+ TMatch & o = matches[*overlapIt];
+
+ // determine position for inserting *it into overlaps after checking
+ if ((*it).end1 < o.end1) insertPos++;
+
+ // check if matches overlap in row0 - if not, then break
+ if (o.end1 <= (*it).begin1) break;
+
+ // check if unique parts of the two matches in row0 are longer than minLength - if yes, then continue
+ if ((*it).begin1 - o.begin1 >= (TPos)minLength &&
+ (*it).end1 > o.end1 && (*it).end1 - o.end1 >= (TPos)minLength) continue;
+
+ // check if matches overlap in row1 - if not, then continue
+ if (!checkOverlap(*it, o, minLength)) continue;
+
+ // check exact alignment columns for overlap
+ if (!_checkAlignColOverlap(o, *it, minLength)) continue;
+
+ // set shorter match invalid
+ if (length(*it) > length(o))
+ o.id = TMatch::INVALID_ID;
+ else
+ (*it).id = TMatch::INVALID_ID; // on equal length the current match is the one invalidated
+ }
+
+ // remove all matches from overlaps that end earlier than current match begins
+ resize(overlaps, position(overlapIt)); // keeps only the entries visited before the loop stopped
+
+ if ((*it).id != TMatch::INVALID_ID)
+ insertValue(overlaps, insertPos, position(it)); // only a still-valid current match is recorded
+ }
+
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Removes matches that are marked as invalid, and then keeps only the numMatches best matches.
+template
+void
+compactMatches(String > & matches, TSize const numMatches) {
+ typedef StellarMatch TMatch;
+ typedef typename Iterator, Standard>::Type TIterator;
+
+ // sort matches by length (and validity)
+ sortMatches(matches, LessLength()); // NOTE(review): the truncation below presumably relies on this order putting valid, longest matches first - LessLength is defined elsewhere
+
+ // count valid matches
+ TSize num = 0;
+
+ TIterator it = begin(matches, Standard());
+ TIterator itEnd = end(matches, Standard());
+
+ for(; it != itEnd; ++it) {
+ if ((*it).id != TMatch::INVALID_ID)
+ ++num;
+ }
+
+ // keep only valid and longest matches
+ resize(matches, _min(num, numMatches)); // new size is the smaller of the valid count and numMatches
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Calls swift filter and verifies swift hits. = Computes eps-matches.
+// A basic block for stellar; returns the hit statistics gathered during the run.
+template
+StellarComputeStatistics
+_stellarKernel(jst::contrib::stellar_matcher> & matcher,
+ StellarDatabaseSegment & database_segment,
+ query_id_map const & query_dict,
+ StellarOptions & localOptions,
+ SwiftHitVerifier & swiftVerifier,
+ TIsPatternDisabledFn && isPatternDisabled,
+ TOnAlignmentResultFn && onAlignmentResult,
+ stellar_kernel_runtime & stellar_kernel_runtime)
+{
+ StellarComputeStatistics statistics{};
+
+ auto finder_callback = [&](auto & finder) // passed to the matcher below as its callback
+ {
+ ++statistics.numSwiftHits;
+ statistics.totalLength += database_segment.size(); // adds the size of the whole database segment on every call
+ statistics.maxLength = std::max(statistics.maxLength, database_segment.size());
+
+ if (!isPatternDisabled(matcher)) // hits of disabled patterns are counted above but not verified
+ {
+ auto queryID = matcher.curSeqNo();
+ StellarQuerySegment query_segment = query_dict.segment_from_id(queryID);
+ seqan3::debug_stream << "FOUND MATCH for query\t" << std::to_string(queryID) << '\n'; // NOTE(review): unconditional debug output for every verified hit
+
+ /*
+ ////Debug stuff:
+ //std::cout << beginPosition(infix(finder)) << ",";
+ //std::cout << endPosition(infix(finder)) << " ";
+ //std::cout << beginPosition(pattern) << ",";
+ //std::cout << endPosition(pattern) << std::endl;
+ */
+
+ // verification
+ stellar_kernel_runtime.verification_time.measure_time([&]()
+ {
+ swiftVerifier.verify(
+ database_segment,
+ query_segment,
+ matcher.delta(),
+ onAlignmentResult,
+ stellar_kernel_runtime.verification_time);
+ }); // measure_time
+ }
+ };
+
+ // call operator() from seqan_pattern_base
+ stellar_kernel_runtime.swift_filter_time.measure_time([&]()
+ {
+ matcher(database_segment.as_span(), localOptions.minRepeatLength, localOptions.maxRepeatPeriod, finder_callback);
+ });
+
+ return statistics;
+}
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/stellar_database_segment.hpp b/include/dream_stellar/stellar_database_segment.hpp
new file mode 100644
index 00000000..2bba0702
--- /dev/null
+++ b/include/dream_stellar/stellar_database_segment.hpp
@@ -0,0 +1,50 @@
+#pragma once
+
+#include
+
+#include
+#include
+
+namespace dream_stellar
+{
+
+using namespace seqan2;
+
+//!TODO: remove obsolete functions
+template
+struct StellarDatabaseSegment : public StellarSequenceSegment
+{
+ using TBase = StellarSequenceSegment;
+
+ using typename TBase::TInfixSegment;
+
+ //!TODO: why is it nested?
+ using TNestedFinderSegment = seqan2::Segment;
+
+ using TBase::TBase; // import constructor
+
+ static StellarDatabaseSegment fromFinderMatch(TInfixSegment const & finderMatch) // builds a segment from the host, begin and end position of finderMatch
+ {
+ std::span const & underlyingDatabase = host(finderMatch);
+ return {underlyingDatabase, seqan2::beginPosition(finderMatch), seqan2::endPosition(finderMatch)};
+ }
+
+ std::span const & underlyingDatabase() const // forwards to underlyingSequence() of the base class
+ {
+ return this->underlyingSequence();
+ }
+
+ TNestedFinderSegment asFinderSegment() const // wraps this segment as a segment of an infix over the whole database
+ {
+ std::span const & _database = underlyingDatabase();
+ auto finderInfix = this->asInfixSegment();
+
+ TInfixSegment const finderInfixSeq = infix(_database, 0, length(_database)); // infix spanning [0, length(_database))
+ TNestedFinderSegment finderSegment(finderInfixSeq,
+ seqan2::beginPosition(finderInfix) - seqan2::beginPosition(_database),
+ seqan2::endPosition(finderInfix) - seqan2::beginPosition(_database));
+ return finderSegment;
+ }
+};
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/stellar_extension.hpp b/include/dream_stellar/stellar_extension.hpp
new file mode 100644
index 00000000..f1beee5c
--- /dev/null
+++ b/include/dream_stellar/stellar_extension.hpp
@@ -0,0 +1,715 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace dream_stellar
+{
+using namespace seqan2;
+
+///////////////////////////////////////////////////////////////////////////////
+// true iff neither row has a gap at pos and both rows compare equal there
+template
+inline bool
+isMatch(Align const & align, TSize const pos) {
+
+ // a gap in the first row rules out a match
+ if (isGap(row(align, 0), pos))
+ return false;
+ // a gap in the second row rules out a match
+ if (isGap(row(align, 1), pos))
+ return false;
+
+ // both rows hold a value here: match unless they differ
+ return !(row(align, 0)[pos] != row(align, 1)[pos]);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Computes possible end positions of an eps-match in a given alignment.
+template
+void
+_fillGapsString(Align const & align,
+ String > & gaps) {
+ typedef Triple TGapInfo; // constructed below as (gap begin, gap end, errors counted so far)
+ TPos totalErrors = 0; // running count of non-match columns
+ typename Row >::Type row0 = row(align, 0);
+ TPos i = 0;
+ TPos endPos = length(row0);
+ TPos gapBegin = i;
+
+ // append gap starting at beginPosition (also if its length is 0!)
+ while(i < endPos && !isMatch(align, i)) {
+ ++i;
+ ++totalErrors;
+ }
+ appendValue(gaps, TGapInfo(gapBegin, i, totalErrors)); // appended unconditionally, so gaps is never left empty
+
+ // iterate over alignment and append gaps
+ while (i < endPos) {
+ // skip matches
+ while(i < endPos && isMatch(align, i)) {
+ ++i;
+ }
+ gapBegin = i;
+ // skip and count mismatches/indels
+ while(i < endPos && !isMatch(align, i)) {
+ ++i;
+ ++totalErrors;
+ }
+ appendValue(gaps, TGapInfo(gapBegin, i, totalErrors)); // may have length 0 when the alignment ends on a match
+ }
+ /*for(unsigned l = 0; l < length(gaps); ++l) {
+ std::cout << gaps[l].i1 << " " << gaps[l].i2 << " " << gaps[l].i3 << std::endl;
+ }*/
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Checks the error rate of the fragment between end of left and start of right.
+template
+inline bool
+_isEpsMatch(Triple const & left,
+ Triple const & right,
+ TFloat const eps) {
+ // compute mismatches/indels and length
+ TPos errors = right.i3 - left.i3 - (right.i2 - right.i1); // difference of the i3 counters minus the extent (i2 - i1) of the right gap
+ TPos len = right.i1 - left.i2; // from the end of left to the start of right
+
+ // check error rate
+ double const DELTA = 0.000001; // Small delta against floating point precision problems.
+ return errors/(TFloat)(len) <= eps + DELTA;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Identifies the longest epsilon match in align and sets the view positions of
+// align to start and end position of the longest epsilon match.
+template
+bool
+longestEpsMatch(Align & align,
+ TSize const matchMinLength,
+ TFloat const epsilon) {
+ // Preprocessing: compute and store gaps and lengths
+ // A gap is a triple of gap begin position, gap end position, and total number of errors in sequence from begin
+ // to end position of this gap.
+ typedef typename Position >::Type TPosition;
+ typedef String > TGapsString;
+ TGapsString gaps;
+ _fillGapsString(align, gaps);
+
+ // Identify longest eps match by iterating over combinations of left and right positions
+ typename Iterator::Type rightIt = end(gaps) - 1; // starts at the last gap
+ typename Iterator::Type leftIt = begin(gaps);
+
+ TPosition beginPos = 0;
+ TPosition endPos = 0;
+ TSize minLength = matchMinLength - 1; // candidates must be strictly longer than this; raised whenever a match is accepted
+
+ while ((*leftIt).i2 + minLength < (*rightIt).i1) {
+ while ((*leftIt).i2 + minLength < (*rightIt).i1) {
+ if(_isEpsMatch(*leftIt, *rightIt, epsilon)) {
+ beginPos = (*leftIt).i2;
+ endPos = (*rightIt).i1;
+ minLength = endPos - beginPos; // later candidates have to be longer than this one
+ break;
+ }
+ --rightIt;
+ }
+ rightIt = end(gaps) - 1; // restart from the last gap for the next left boundary
+ ++leftIt;
+ }
+
+ // Set view positions to the eps-match
+ TPosition viewBeginRow0 = toSourcePosition(row(align, 0), 0);
+ TPosition viewBeginRow1 = toSourcePosition(row(align, 1), 0);
+ setClippedEndPosition(row(align, 0), viewBeginRow0 + endPos);
+ setClippedEndPosition(row(align, 1), viewBeginRow1 + endPos);
+ setClippedBeginPosition(row(align, 0), viewBeginRow0 + beginPos);
+ setClippedBeginPosition(row(align, 1), viewBeginRow1 + beginPos);
+ // setClippedBeginPosition(row(align, 0), toSourcePosition(row(align, 0), beginPos));
+ // setClippedBeginPosition(row(align, 1), toSourcePosition(row(align, 1), beginPos));
+ // setBeginPosition(row(align, 0), beginPos);
+ // setBeginPosition(row(align, 1), beginPos);
+ // setClippedEndPosition(row(align, 0), toSourcePosition(row(align, 0), endPos));
+ // setClippedEndPosition(row(align, 1), toSourcePosition(row(align, 1), endPos));
+ SEQAN_ASSERT_EQ(length(row(align, 0)), length(row(align, 1)));
+
+ if (endPos == 0 && beginPos == 0) return 1; // both still 0: no eps-match was accepted, and that case returns true
+ return 0; // an eps-match was accepted and the rows were clipped to it
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Computes the banded alignment matrix for the left extension and
+// returns a string with possible start positions of an eps-match.
+template
+void
+_fillMatrixBestEndsLeft(TMatrix & matrixLeft,
+ std::vector & possibleEndsLeft,
+ //!TODO: should sequencesLeft be owning?
+ std::vector const, InfixSegment>> const & sequencesLeft,
+ TDiagonal const diagLower,
+ TDiagonal const diagUpper,
+ TScore const & scoreMatrix) {
+ // _align_banded_nw_best_ends(matrixLeft, possibleEndsLeft, str, scoreMatrix,
+ // upperDiagonal(seedOld) - upperDiagonal(seed),
+ // upperDiagonal(seedOld) - lowerDiagonal(seed));
+
+ // std::cerr << "FILL MATRIX LEFT SEQS\n"
+ // << "0: " << infixH << "\n"
+ // << "1: " << infixV << "\n";
+
+ // _align_banded_nw_best_ends(matrixLeft, possibleEndsLeft, str, scoreMatrix,
+ // diagBegin - upperDiagonal(seed),
+ // diagBegin - lowerDiagonal(seed));
+ // upperDiagonal(seedOld) - upperDiagonal(seed),
+ // upperDiagonal(seedOld) - lowerDiagonal(seed));
+
+ // Use legacy adapted NW computation with infixH/first alignment row being in the vertical direction.
+ // // TODO(holtgrew): When switching to DP from new alignment module, make sure to mirror diagonals.
+ _align_banded_nw_best_ends(matrixLeft, possibleEndsLeft, sequencesLeft, scoreMatrix, -diagUpper, -diagLower); // diagonals are passed negated and in swapped order
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Computes the banded alignment matrix for the right extension and
+// returns a string with possible end positions of an eps-match.
+template
+void
+_fillMatrixBestEndsRight(TMatrix & matrixRight,
+ std::vector & possibleEndsRight,
+ std::vector const, InfixSegment>> const & sequencesRight,
+ TDiagonal const diagLower,
+ TDiagonal const diagUpper,
+ TScore const & scoreMatrix) {
+ // std::cerr << "FILL MATRIX RIGHT SEQS\n"
+ // << "0: " << infixH << "\n"
+ // << "1: " << infixV << "\n";
+
+ // _align_banded_nw_best_ends(matrixRight, possibleEndsRight, str, scoreMatrix,
+ // lowerDiagonal(seedOld) - upperDiagonal(seed),
+ // lowerDiagonal(seedOld) - lowerDiagonal(seed));
+
+ // _align_banded_nw_best_ends(matrixRight, possibleEndsRight, str, scoreMatrix,
+ // diagEnd - upperDiagonal(seed),
+ // diagEnd - lowerDiagonal(seed));
+ // lowerDiagonal(seedOld) - upperDiagonal(seed),
+ // lowerDiagonal(seedOld) - lowerDiagonal(seed));
+
+ // Use legacy adapted NW computation with infixH/first alignment row being in the vertical direction.
+ // TODO(holtgrew): When switching to DP from new alignment module, make sure to mirror diagonals.
+ _align_banded_nw_best_ends(matrixRight, possibleEndsRight, sequencesRight, scoreMatrix, -diagUpper, -diagLower); // diagonals are passed negated and in swapped order
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Traceback from an arbitrary point (coordinate) in the banded alignment trace matrix (trace).
+template
+inline void
+_alignBandedNeedlemanWunschTrace(TAlign & align,
+ TSegVec const & str,
+ TTrace const & trace,
+ TCoord const & coordinate,
+ TDiagonal const diagL,
+ TDiagonal const diagU)
+{
+ //typedef typename Value::Type TString;
+ typedef typename Id::Type TId;
+ typedef typename Size::Type TSize;
+ typedef typename Value::Type TTraceValue;
+
+ // Traceback values
+ TTraceValue Diagonal = 0; TTraceValue Horizontal = 1; TTraceValue Vertical = 2;
+
+ // Initialization
+ TId id1{0}; // for a owning StringSet<> Id is the same as the index
+ TId id2{1};
+ TSize lo_row = (diagU <= 0) ? -1 * diagU : 0; // row offset added when converting band rows to actual rows
+ TSize diagonalWidth = (TSize) (diagU - diagL + 1); // number of trace cells stored per row
+
+ // Start the trace from the cell with the max value
+ TSize row = coordinate.i1;
+ TSize col = coordinate.i2;
+
+ // Handle the skipped sequence parts
+ TSize actualRow = row + lo_row; // band coordinates converted to actual (unbanded) coordinates
+ TSize actualCol = col + diagL + actualRow;
+
+ if ((actualRow != 0) && (actualCol != 0)) { // nothing to trace when the start cell is already on a border
+ // Find initial direction
+ TTraceValue tv = trace[row * diagonalWidth + col];
+ if (tv == Horizontal) --col; // band moves: Horizontal -> --col, Vertical -> --row and ++col, Diagonal -> --row
+ else if (tv == Vertical) {--row; ++col;}
+ else --row;
+
+ // Walk until we hit a border
+ TSize seqLen = 1; // length of the current run of steps in direction tv
+ TTraceValue newTv = tv;
+ while(true) {
+ actualRow = row + lo_row;
+ actualCol = col + diagL + actualRow;
+ newTv = trace[row * diagonalWidth + col];
+
+ // Check if we hit a border
+ if ((actualRow == 0) || (actualCol == 0)) break;
+ else {
+ //std::cout << row << ',' << col << ':' << value(originalMat, actualRow * len1 + actualCol) << std::endl;
+ if (tv == Diagonal) { // direction change: hand the finished run to _alignTracePrint, then step in the new direction
+ if (newTv == Horizontal) {
+ _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
+ --col; seqLen = 1;
+ } else if (newTv == Vertical) {
+ _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
+ --row; ++col; seqLen = 1;
+ } else {
+ --row; ++seqLen; // same direction: extend the run
+ }
+ } else {
+ if (tv == Horizontal) {
+ if (newTv == Diagonal) {
+ _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
+ --row; seqLen = 1;
+ } else if (newTv == Vertical) {
+ _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
+ --row; ++col; seqLen = 1;
+ } else {
+ --col; ++seqLen; // same direction: extend the run
+ }
+ } else { // tv == Vertical
+ if (newTv == Diagonal) {
+ _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
+ --row; seqLen = 1;
+ } else if (newTv == Horizontal) {
+ _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv);
+ --col; seqLen = 1;
+ } else {
+ --row; ++col; ++seqLen; // same direction: extend the run
+ }
+ }
+ }
+ tv = newTv;
+ }
+ }
+
+ // Align left overs
+ if (seqLen) _alignTracePrint(align, str[0], str[1], id1, actualCol, id2, actualRow, seqLen, tv); // flush the run that was open when the border was hit
+ }
+
+ // Handle the remaining sequence
+ if (actualCol != 0) _alignTracePrint(align, str[0], str[1], (TId) id1, (TSize) 0, (TId) 0, (TSize) 0, (TSize) actualCol, Horizontal); // remaining actualCol columns as one horizontal run
+ else if (actualRow != 0) _alignTracePrint(align, str[0], str[1], (TId) 0, (TSize) 0, (TId) id2, (TSize) 0, (TSize) actualRow, Vertical); // remaining actualRow rows as one vertical run
+
+}
+
+template
+void _copyInfixAlignmentIntoAlignment(TAlign & align,
+ TInfixAlign const & infixAlign,
+ TPos const infixAlignHBeginPosition,
+ TPos const infixAlignVBeginPosition)
+{
+ using TAlignPos = typename Position::Type>::Type;
+ String viewPos; // one view position per row of align, passed to integrateAlign below
+
+ // NOTE: we can't use `integrateAlign(align, infixAlign);` directly
+ // as `infixAlign` is a sequence alignment on a local copy of a segment from
+ // `align`. That means we need to calculate the positions where to copy
+ // the alignment `infixAlign` into `align`.
+
+ TAlignPos pos0 = infixAlignHBeginPosition // beginPosition(infixAlignH) // correct for infixes
+ - beginPosition(source(row(align, 0))) // ...
+ + beginPosition(row(infixAlign, 0)); // respect source clipping
+
+ appendValue(viewPos, toViewPosition(row(align, 0), pos0)); // view position for row 0
+
+ TAlignPos pos1 = infixAlignVBeginPosition // beginPosition(infixAlignV) // correct for infixes
+ - beginPosition(source(row(align, 1))) // ...
+ + beginPosition(row(infixAlign, 1)); // respect source clipping
+
+ appendValue(viewPos, toViewPosition(row(align, 1), pos1)); // view position for row 1
+
+ integrateAlign(align, infixAlign, viewPos);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Conducts the traceback on the extension to the left from best start position
+// and writes the result into align.
+template
+void
+_tracebackLeft(TMatrix const & matrixLeft,
+ TCoord const & coordinate,
+ std::vector const, InfixSegment>> const & sequencesLeft,
+ TBeginPosition const infixAlignHBeginPosition,
+ TBeginPosition const infixAlignVBeginPosition,
+ TDiagonal const diagLower,
+ TDiagonal const diagUpper,
+ TPos const endLeftH,
+ TPos const endLeftV,
+ TAlign & align) {
+ AlignTraceback traceBack;
+ _alignBandedNeedlemanWunschTrace(traceBack, sequencesLeft, matrixLeft, coordinate,
+ -diagUpper, -diagLower); // same negated, swapped diagonals as in _fillMatrixBestEndsLeft
+ // upperDiagonal(seedOld) - upperDiagonal(seed), upperDiagonal(seedOld) - lowerDiagonal(seed));
+ //std::cerr << "TRACEBACK\n";
+ //for (unsigned i = 0; i < length(traceBack.tvs); ++i)
+ // std::cerr << (int)traceBack.tvs[i] << "\t" << traceBack.sizes[i] << "\n";
+ //std::cerr << "---------\n";
+
+ reverse(traceBack.sizes); // the right-hand traceback does not do this; presumably needed because the left sequences are stored reversed - confirm against the caller
+ reverse(traceBack.tvs);
+
+ Align const, InfixSegment>> infixAlign;
+ resize(rows(infixAlign), 2);
+ assignSource(row(infixAlign, 0), infix(sequencesLeft[0], length(sequencesLeft[0]) - endLeftH, length(sequencesLeft[0]))); // last endLeftH characters of sequence 0
+ assignSource(row(infixAlign, 1), infix(sequencesLeft[1], length(sequencesLeft[1]) - endLeftV, length(sequencesLeft[1]))); // last endLeftV characters of sequence 1
+
+ // std::cerr << "\nLEFT SEQS\n" << row(infixAlign, 0) << "\n" << row(infixAlign, 1) << "\n";
+ _pumpTraceToGaps(row(infixAlign, 0), row(infixAlign, 1), traceBack);
+ // std::cerr << "INFIX ALIGN AFTER LEFT TRACEBACK\n\n" << infixAlign << "\n";
+ // std::cerr << "ALIGN BEFORE INTEGRATION WITH INFIX ALIGN\n\n" << align << "\n";
+ _copyInfixAlignmentIntoAlignment(align, infixAlign, infixAlignHBeginPosition, infixAlignVBeginPosition);
+ // std::cerr << "ALIGN AFTER INTEGRATION WITH INFIX ALIGN\n\n" << align << "\n";
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Conducts the traceback on the extension to the right from best end position
+// and writes the result into align.
+template
+void
+_tracebackRight(TMatrix const & matrixRight,
+ TCoord const & coordinate,
+ std::vector const, InfixSegment>> const & sequencesRight,
+ TBeginPosition const infixAlignHBeginPosition,
+ TBeginPosition const infixAlignVBeginPosition,
+ TDiagonal const diagLower,
+ TDiagonal const diagUpper,
+ TPos const endRightH,
+ TPos const endRightV,
+ TAlign & align) {
+ AlignTraceback traceBack;
+ _alignBandedNeedlemanWunschTrace(traceBack, sequencesRight, matrixRight, coordinate,
+ -diagUpper, -diagLower); // same negated, swapped diagonals as in _fillMatrixBestEndsRight
+ // lowerDiagonal(seedOld) - upperDiagonal(seed), lowerDiagonal(seedOld) - lowerDiagonal(seed));
+ //std::cerr << "TRACEBACK\n";
+ //for (unsigned i = 0; i < length(traceBack.tvs); ++i)
+ // std::cerr << (int)traceBack.tvs[i] << "\t" << traceBack.sizes[i] << "\n";
+ //std::cerr << "---------\n";
+
+ Align const, InfixSegment>> infixAlign;
+ resize(rows(infixAlign), 2);
+ assignSource(row(infixAlign, 0), infix(sequencesRight[0], 0, endRightH)); // first endRightH characters of sequence 0
+ assignSource(row(infixAlign, 1), infix(sequencesRight[1], 0, endRightV)); // first endRightV characters of sequence 1
+
+ // std::cerr << "\nRIGHT SEQS\n" << row(infixAlign, 0) << "\n" << row(infixAlign, 1) << "\n";
+ _pumpTraceToGaps(row(infixAlign, 0), row(infixAlign, 1), traceBack); // trace is used as produced; no reversal here
+ // std::cerr << "INFIX ALIGN AFTER RIGHT TRACEBACK\n\n" << infixAlign << "\n";
+ // std::cerr << "ALIGN BEFORE INTEGRATION WITH INFIX ALIGN\n\n" << align << "\n";
+ _copyInfixAlignmentIntoAlignment(align, infixAlign, infixAlignHBeginPosition, infixAlignVBeginPosition);
+ // std::cerr << "ALIGN AFTER INTEGRATION WITH INFIX ALIGN\n\n" << align << "\n";
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Computes the banded alignment matrix and fills a string with possible start
+// and end positions of an eps-match. Determines the optimal start and end
+// position for the longest eps-match and writes the trace into align.
+template
+bool
+_bestExtension(Segment const & infH, // database
+ Segment const & infV, // query
+ TSeed const & seed,
+ TSeed const & seedOld,
+ TPos const alignLen,
+ TPos const alignErr,
+ TScore const & scoreMatrix,
+ TDir const direction,
+ TSize const minLength,
+ TEps const eps,
+ TAlign & align,
+ stellar_best_extension_time & best_extension_runtime)
+{
+ using TAlphabet = std::remove_cv::type;
+ using TOwningContainer = std::vector;
+
+ typedef std::vector TAlignmentMatrix;
+ typedef ExtensionEndPosition TEndInfo;
+ typedef typename std::vector::const_iterator TEndIterator;
+ typedef typename Diagonal::Type TDiagonal;
+
+ // variables for banded alignment and possible ends of match
+ TAlignmentMatrix matrixRight, matrixLeft;
+ std::vector possibleEndsLeft, possibleEndsRight;
+
+ // new extension to the left of the old seed
+ assert(beginPositionH(seed) <= beginPositionH(seedOld)); // infixLeftH
+ assert(beginPositionV(seed) <= beginPositionV(seedOld)); // infixLeftV
+
+ // old seed covers at least one character
+ assert(beginPositionH(seedOld) < endPositionH(seedOld));
+ assert(beginPositionV(seedOld) < endPositionV(seedOld));
+
+ // new extension to the right of the old seed
+ assert(endPositionH(seedOld) <= endPositionH(seed)); // infixRightH
+ assert(endPositionV(seedOld) <= endPositionV(seed)); // infixRightV
+
+ std::vector> sequencesLeft;
+ std::vector> sequencesRight;
+
+ // Compute diagonals for updated seeds module with infixH/first alignment row being in the horizontal direction.
+ TDiagonal const diagLowerLeft = lowerDiagonal(seedOld) - upperDiagonal(seed);
+ TDiagonal const diagUpperLeft = lowerDiagonal(seedOld) - lowerDiagonal(seed);
+
+ // Compute diagonals for updated seeds module with infixH/first alignment row being in the horizontal direction.
+ TDiagonal const diagLowerRight = upperDiagonal(seedOld) - upperDiagonal(seed);
+ TDiagonal const diagUpperRight = upperDiagonal(seedOld) - lowerDiagonal(seed);
+
+ best_extension_runtime.banded_needleman_wunsch_time.measure_time([&]()
+ {
+ // fill banded matrix and gaps string for ...
+ if (direction == EXTEND_BOTH || direction == EXTEND_LEFT) { // ... extension to the left
+ // prepare copy segment...
+ //!TODO: can these be references instead of copies?
+ TOwningContainer segmentCopyLeftH;
+ TOwningContainer segmentCopyLeftV;
+ segmentCopyLeftH.reserve(beginPositionH(seedOld) - beginPositionH(seed));
+ segmentCopyLeftV.reserve(beginPositionV(seedOld) - beginPositionV(seed));
+
+ best_extension_runtime.banded_needleman_wunsch_left_time.measure_time([&]()
+ {
+ // ...copy reverse complement segment
+ //!TODO: are seqan2 and seqan3 ranks compatible?
+ for (auto n : host(infH).subspan(beginPositionH(seed), beginPositionH(seedOld)) | std::views::reverse | seqan3::views::complement)
+ {
+ segmentCopyLeftH.emplace_back(n.to_rank());
+ }
+
+ for (auto n : host(infV).subspan(beginPositionV(seed), beginPositionV(seedOld)) | std::views::reverse | seqan3::views::complement)
+ {
+ segmentCopyLeftV.emplace_back(n.to_rank());
+ }
+
+ // put infix segments
+ sequencesLeft.emplace_back(infix(segmentCopyLeftH, 0, segmentCopyLeftH.size()));
+ sequencesLeft.emplace_back(infix(segmentCopyLeftV, 0, segmentCopyLeftV.size()));
+
+ _fillMatrixBestEndsLeft(matrixLeft, possibleEndsLeft, sequencesLeft, diagLowerLeft, diagUpperLeft, scoreMatrix);
+ SEQAN_ASSERT_NOT(possibleEndsLeft.empty());
+ }); // measure_time
+ }
+ else
+ possibleEndsLeft.emplace_back(TEndInfo());
+ if (direction == EXTEND_BOTH || direction == EXTEND_RIGHT) { // ... extension to the right
+ best_extension_runtime.banded_needleman_wunsch_right_time.measure_time([&]()
+ {
+ // prepare copy segment...
+ //!TODO: can these be references instead of copies?
+ auto segmentCopyRightH = TOwningContainer(host(infH).begin() + endPositionH(seedOld), host(infH).begin() + endPositionH(seed));
+ auto segmentCopyRightV = TOwningContainer(host(infV).begin() + endPositionV(seedOld), host(infV).begin() + endPositionV(seed));
+
+ sequencesRight.emplace_back(infix(segmentCopyRightH, 0, segmentCopyRightH.size()));
+ sequencesRight.emplace_back(infix(segmentCopyRightV, 0, segmentCopyRightV.size()));
+ _fillMatrixBestEndsRight(matrixRight, possibleEndsRight, sequencesRight, diagLowerRight, diagUpperRight, scoreMatrix);
+ SEQAN_ASSERT_NOT(possibleEndsRight.empty());
+ }); // measure_time
+ } else
+ {
+ possibleEndsRight.emplace_back(TEndInfo());
+ }
+ }); // measure_time
+
+ // longest eps match on poss ends string
+ std::pair endPair = best_extension_runtime.longest_eps_match_time.measure_time([&]()
+ {
+ return longestEpsMatch(possibleEndsLeft, possibleEndsRight, alignLen, alignErr, minLength, eps);
+ });
+
+ if (endPair == std::pair(possibleEndsLeft.end(), possibleEndsRight.end())) { // no eps-match found
+ return false;
+ }
+
+ // determine end positions of maximal eps-match in ...
+ TPos endLeftH = 0, endLeftV = 0;
+ TPos endRightH = 0, endRightV = 0;
+ if((*endPair.first).length != 0) { // ... extension to the left
+ endLeftV = (*endPair.first).coord.i1;
+ // correction for banded coordinates to unbanded:
+ if (diagLowerLeft >= 0)
+ endLeftV += (TPos)(diagLowerLeft);
+ endLeftH = endLeftV + (TPos)((*endPair.first).coord.i2 - diagUpperLeft);
+ }
+ if((*endPair.second).length != 0) { // ... extension to the right
+ endRightV = (*endPair.second).coord.i1;
+ // correction for banded coordinates to unbanded:
+ if (diagLowerRight >= 0)
+ endRightV += (TPos)(diagLowerRight);
+ endRightH = endRightV + (TPos)((*endPair.second).coord.i2 - diagUpperRight);
+ }
+
+ // set begin and end positions of align
+ setBeginPosition(row(align, 0), beginPositionH(seedOld) - endLeftH);
+ setBeginPosition(row(align, 1), beginPositionV(seedOld) - endLeftV);
+ setEndPosition(row(align, 0), endPositionH(seedOld) + endRightH);
+ setEndPosition(row(align, 1), endPositionV(seedOld) + endRightV);
+ // setClippedBeginPosition(row(align, 0), beginPositionH(seedOld) - endLeftH);
+ // setClippedBeginPosition(row(align, 1), beginPositionV(seedOld) - endLeftV);
+ // setBeginPosition(row(align, 0), 0);
+ // setBeginPosition(row(align, 1), 0);
+ // setClippedEndPosition(row(align, 0), endPositionH(seedOld) + endRightH);
+ // setClippedEndPosition(row(align, 1), endPositionV(seedOld) + endRightV);
+
+ best_extension_runtime.construct_seed_alignment_time.measure_time([&]()
+ {
+ // traceback through matrix from begin/end pos on ...
+ if((*endPair.first).length != 0) { // ... extension to the left
+ assert(direction == EXTEND_BOTH || direction == EXTEND_LEFT);
+ auto const infixAlignHBeginPosition = beginPositionH(seed) + length(sequencesLeft[0]) - endLeftH;
+ auto const infixAlignVBeginPosition = beginPositionV(seed) + length(sequencesLeft[1]) - endLeftV;
+ _tracebackLeft(matrixLeft,
+ (*endPair.first).coord,
+ sequencesLeft,
+ infixAlignHBeginPosition,
+ infixAlignVBeginPosition,
+ diagLowerLeft,
+ diagUpperLeft,
+ endLeftH,
+ endLeftV,
+ align);
+ }
+ if((*endPair.second).length != 0) { // ... extension to the right
+ assert(direction == EXTEND_BOTH || direction == EXTEND_RIGHT);
+ auto const infixAlignHBeginPosition = endPositionH(seedOld);
+ auto const infixAlignVBeginPosition = endPositionV(seedOld);
+ _tracebackRight(matrixRight,
+ (*endPair.second).coord,
+ sequencesRight,
+ infixAlignHBeginPosition,
+ infixAlignVBeginPosition,
+ diagLowerRight,
+ diagUpperRight,
+ endRightH,
+ endRightV,
+ align);
+ }
+ SEQAN_ASSERT_EQ(length(row(align, 0)), length(row(align, 1)));
+ }); // measure_time
+
+ return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Integrates infixAlign into align: for each row of infixAlign it sums the begin offsets of the row, of the row's source segment and of that segment's host, converts the sum to a view position in the matching row of align, and forwards the collected positions to the three-argument integrateAlign overload.
+template
+void
+integrateAlign(Align & align,
+ Align, InfixSegment>, TSpec2> const & infixAlign) {
+ typedef typename Size::Type TSize;
+ typedef typename Position >::Type>::Type TPos;
+
+ String viewPos; // receives one view position per row of infixAlign (filled by the loop below)
+ TPos pos;
+ for (TSize i = 0; i < length(rows(infixAlign)); ++i) {
+ pos = beginPosition(source(row(infixAlign, i))) + beginPosition(row(infixAlign, i)); // begin of the source segment plus begin of the row itself
+ pos += beginPosition(host(source(row(infixAlign, i)))); // plus the begin of the segment's host (the source is a nested segment)
+ appendValue(viewPos, toViewPosition(row(align, i), pos)); // translate the summed source position into a view position of row i of align
+ }
+
+ // Hand the per-row view positions to the three-argument integrateAlign overload (not defined in this part of the file; presumably seqan2's - confirm).
+ integrateAlign(align, infixAlign, viewPos);
+ // Of the two parameters only align is non-const, so it is the one the call above can modify.
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Integrates the eps-core localAlign into align, then either clips align to the core (direction == EXTEND_NONE) or runs a gapped X-drop seed extension and lets _bestExtension pick the extension that is written into align.
+// Returns false if the clipped alignment / extended seed is below the minLength-based bound or _bestExtension reports no match; otherwise returns true and, per the original author's note, align contains the longest eps-match that spans the eps-core (localAlign).
+template
+bool
+_extendAndExtract(Align, InfixSegment> > const & localAlign,
+ TScoreValue const scoreDropOff, // forwarded to extendSeed together with GappedXDrop()
+ TScore const & scoreMatrix, // scoring scheme forwarded to extendSeed and _bestExtension
+ Segment, InfixSegment> const & infH, // nested infix segment (see static_assert below); presumably the horizontal sequence - confirm
+ Segment, InfixSegment> const & infV, // nested infix segment; presumably the vertical sequence - confirm
+ ExtensionDirection const direction, // EXTEND_NONE skips the extension; any other value is forwarded to extendSeed / _bestExtension
+ TSize const minLength, // length bound used in both branches and forwarded to longestEpsMatch / _bestExtension
+ TEps const eps, // error rate; floor(minLength*eps) relaxes the seed-size bound below
+ TAlign & align, // in/out: receives localAlign and the final begin/end positions
+ stellar_extension_time & extension_runtime) { // timers: extend_seed_time and best_extension_time are measured here
+ typedef typename Position::Type TPos;
+ typedef Seed TSeed;
+
+ //!NOTE: TSequence = std::span
+
+ // Merge the local alignment into align first; the two-argument integrateAlign overload maps the begin of each localAlign row to a view position in align.
+ // NOTE(review): presumably this copies localAlign's gaps into align - confirm against the three-argument integrateAlign it delegates to.
+ integrateAlign(align, localAlign);
+ // The seed coordinates computed below are the row begin/end positions of localAlign
+ // shifted by beginPosition(infH) for row 0 and beginPosition(infV) for row 1.
+
+ // Get begin and end position of local alignment (seed) as source positions
+ // in underlying sequences.
+ TPos seedBeginH = beginPosition(row(localAlign, 0)) + beginPosition(infH);
+ TPos seedBeginV = beginPosition(row(localAlign, 1)) + beginPosition(infV);
+ TPos seedEndH = endPosition(row(localAlign, 0)) + beginPosition(infH);
+ TPos seedEndV = endPosition(row(localAlign, 1)) + beginPosition(infV);
+
+ if (direction == EXTEND_NONE) {
+ // set begin and end positions of align
+ setBeginPosition(row(align, 0), seedBeginH);
+ setBeginPosition(row(align, 1), seedBeginV);
+ setEndPosition(row(align, 0), seedEndH);
+ setEndPosition(row(align, 1), seedEndV);
+
+ if ((TSize)length(row(align, 0)) < minLength)
+ return false; // the clipped alignment is shorter than minLength
+
+ longestEpsMatch(align, minLength, eps); // called as a statement; any return value is not used here
+ } else {
+ // gapped X-drop extension of local alignment (seed)
+ TSeed seed(seedBeginH, seedBeginV, seedEndH, seedEndV);
+ TSeed seedOld(seed); // copy taken before extendSeed so the unextended coordinates stay available
+
+ static_assert(std::is_same>::value,
+ "infH is a nested InfixSegment: Segment, InfixSegment>");
+ Segment infixSequenceH = host(infH); // inner nested Segment
+ Segment infixSequenceV = host(infV); // inner nested Segment
+ extension_runtime.extend_seed_time.measure_time([&]()
+ {
+ extendSeed(seed, infixSequenceH, infixSequenceV, direction, scoreMatrix, scoreDropOff, GappedXDrop());
+ });
+ if (static_cast(seedSize(seed)) < minLength - (int)floor(minLength*eps))
+ return false; // extended seed size is below minLength - floor(minLength*eps)
+
+ // determine length and number of error columns of local alignment (seed)
+ TPos alignLen = _max(length(row(localAlign, 0)), length(row(localAlign, 1)));
+ TPos alignErr = 0;
+ for (TPos i = 0; i < alignLen; ++i) {
+ if (!isMatch(localAlign, i)) ++alignErr;
+ }
+
+ // convert seeds from positions in host(seq) to positions in host(host(seq))
+ setBeginPositionH(seedOld, beginPositionH(seedOld) + beginPosition(host(infH)));
+ setEndPositionH(seedOld, endPositionH(seedOld) + beginPosition(host(infH)));
+ setBeginPositionV(seedOld, beginPositionV(seedOld) + beginPosition(host(infV)));
+ setEndPositionV(seedOld, endPositionV(seedOld) + beginPosition(host(infV)));
+ setBeginPositionH(seed, beginPositionH(seed) + beginPosition(host(infH)));
+ setEndPositionH(seed, endPositionH(seed) + beginPosition(host(infH)));
+ setBeginPositionV(seed, beginPositionV(seed) + beginPosition(host(infV)));
+ setEndPositionV(seed, endPositionV(seed) + beginPosition(host(infV)));
+
+ // determine best extension lengths and write the trace into align
+ Segment infixH = infix(infixSequenceH, beginPosition(infH), endPosition(infH));
+ Segment infixV = infix(infixSequenceV, beginPosition(infV), endPosition(infV));
+
+ bool const found_extension = extension_runtime.best_extension_time.measure_time([&]()
+ {
+ return _bestExtension(infixH, infixV, seed, seedOld, alignLen, alignErr, scoreMatrix, direction, minLength, eps, align, extension_runtime.best_extension_time);
+ });
+ if (!found_extension)
+ return false;
+ SEQAN_ASSERT_EQ(length(row(align, 0)), length(row(align, 1)));
+ }
+ SEQAN_ASSERT_EQ(length(row(align, 0)), length(row(align, 1)));
+ // Reaching this point means neither branch returned early; both rows of align have equal length (asserted above).
+ return true;
+}
+
+} // namespace dream_stellar
diff --git a/include/dream_stellar/stellar_index.hpp b/include/dream_stellar/stellar_index.hpp
new file mode 100644
index 00000000..236b7ecc
--- /dev/null
+++ b/include/dream_stellar/stellar_index.hpp
@@ -0,0 +1,115 @@
+#pragma once
+
+#include
+
+#include
+
+#include
+#include
+
+namespace dream_stellar
+{
+using namespace seqan2;
+// Alias for a StringSet (seqan2, via the using-directive above); TInfixSegment defaults to a seqan2::Segment type.
+template , typename TInfixSegment = seqan2::Segment>
+using StellarQGramStringSet = StringSet >;
+// Alias for an Index specialised with IndexQGram whose text type is const.
+template
+using StellarQGramIndex = Index const, IndexQGram >;
+// Alias for a Pattern specialised with Swift.
+template
+using StellarSwiftPattern = Pattern, Swift >;
+// Alias for a Finder specialised with Swift whose haystack is a const InfixSegment Segment.
+template
+using StellarSwiftFinder = Finder const, InfixSegment> const, Swift >;
+
+template
+struct StellarIndex
+{
+ using TSequence = seqan2::String;
+ using TInfixSegment = seqan2::Segment const, seqan2::InfixSegment>;
+ using TQGramStringSet = StellarQGramStringSet