Skip to content

Commit 5e4fadb

Browse files
committed
adding functionality to extract consensus sequence for blocks
1 parent 833c177 commit 5e4fadb

4 files changed

Lines changed: 87 additions & 5 deletions

File tree

src/consensus.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#include "panmanUtils.hpp"
2+
3+
void panmanUtils::Tree::printConsensus(std::ostream& fout){
4+
std::mutex printMutex;
5+
6+
tbb::parallel_for_each(blocks.begin(), blocks.end(), [&](const auto& eachBlock) {
7+
// for(size_t i = 0; i < blocks.size(); i++) {
8+
9+
std::string consensus;
10+
11+
int32_t primaryBlockId = ((int32_t)eachBlock.primaryBlockId);
12+
int32_t secondaryBlockId = ((int32_t)eachBlock.secondaryBlockId);
13+
14+
for(size_t j = 0; j < eachBlock.consensusSeq.size(); j++) {
15+
bool endFlag = false;
16+
for(size_t k = 0; k < 8; k++) {
17+
const int nucCode = (((eachBlock.consensusSeq[j]) >> (4*(7 - k))) & 15);
18+
19+
if(nucCode == 0) {
20+
endFlag = true;
21+
break;
22+
}
23+
const char nucleotide = panmanUtils::getNucleotideFromCode(nucCode);
24+
consensus.push_back(nucleotide);
25+
}
26+
if(endFlag) {
27+
break;
28+
}
29+
}
30+
std::lock_guard<std::mutex> guard(printMutex);
31+
fout << ">" << primaryBlockId << "\n" << consensus << "\n";
32+
});
33+
34+
}

src/panman.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
#include <set>
2121
#include <boost/iostreams/filter/lzma.hpp>
2222
#include <typeinfo>
23-
23+
#include <capnp/serialize.h>
24+
#include <kj/io.h>
25+
#include <limits>
2426

2527
#include "chaining.cpp"
2628
#include "rotation.cpp"
@@ -35,6 +37,7 @@
3537
#include "reroot.cpp"
3638
#include "aaTrans.cpp"
3739
#include "panman2usher.cpp"
40+
#include "consensus.cpp"
3841
#include "panmanUtils.hpp"
3942

4043
char panmanUtils::getNucleotideFromCode(int code) {
@@ -6950,9 +6953,9 @@ panmanUtils::TreeGroup::TreeGroup(std::istream& fin, bool isOld) {
69506953
capnp::ReaderOptions readerOptions;
69516954
readerOptions.traversalLimitInWords = std::numeric_limits<uint64_t>::max();
69526955
readerOptions.nestingLimit = 1024;
6953-
capnp::InputStreamMessageReader messageReader(kjInputStream, readerOptions);
6956+
capnp::InputStreamMessageReader messageReader(kjInputStream, readerOptions);
69546957

6955-
panman::TreeGroup::Reader TG = messageReader.getRoot<panman::TreeGroup>();
6958+
panman::TreeGroup::Reader TG = messageReader.getRoot<panman::TreeGroup>();
69566959

69576960
int count=0;
69586961
for (auto treeFromTG: TG.getTrees()){

src/panman.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ class Tree {
545545
void printVCFParallel(std::string reference, std::ostream& fout);
546546
void printVCFParallel(panmanUtils::Node* node, std::string& fileName);
547547
void extractAminoAcidTranslations(std::ostream& fout, int64_t start, int64_t end);
548-
548+
void printConsensus(std::ostream& fout);
549549
// Extract PanMAT representing a segment of the genome. The start and end coordinates
550550
// are with respect to the root sequence. The strands of the terminal blocks in all
551551
// sequences are assumed to be the same as their strands in the root sequence for the

src/panmanUtils.cpp

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ po::positional_options_description printPathsArgumentDesc;
122122
po::options_description indexDesc("Indexing Command Line Arguments");
123123
po::positional_options_description indexArgumentDesc;
124124
po::options_description printRootDesc("Root Printer Command Line Arguments");
125+
po::options_description printConsensusDesc("Consensus Printer Command Line Arguments");
125126
po::positional_options_description printRootPositionArgumentDesc;
126127

127128

@@ -158,7 +159,7 @@ void setupOptionDescriptions() {
158159
("acr,q", "ACR method [fitch(default), mppa]")
159160
("index",po::value< bool >(0), "Generating indexes and print sequence (passed as reference) between x:y")
160161
("refFile",po::value< std::string >() ,"reference sequence file")
161-
// ("printRoot", "Print root sequence")
162+
("printConsensus", "Print consensus sequences of each block in FASTA format")
162163
// ("printNodePaths", "Print mutations from root to each node")
163164
("toUsher", "Convert a PanMAT in PanMAN to Usher-MAT")
164165
// ("samples",po::value< bool >() ,"Samples in the PanMAN")
@@ -260,6 +261,9 @@ void setupOptionDescriptions() {
260261

261262
printRootDesc.add_options()
262263
("output-file,o", po::value< std::string >(), "Output file name");
264+
265+
printConsensusDesc.add_options()
266+
("output-file,o", po::value< std::string >(), "Output file name");
263267
}
264268

265269
void writePanMAN(po::variables_map &globalVm, panmanUtils::TreeGroup *TG) {
@@ -1208,6 +1212,39 @@ void printRoot(panmanUtils::TreeGroup *TG, po::variables_map &globalVm, std::ofs
12081212

12091213
}
12101214

1215+
void printConsensus(panmanUtils::TreeGroup *TG, po::variables_map &globalVm, std::ofstream &outputFile, std::streambuf * buf) {
1216+
// Print raw sequences to output file
1217+
if(TG == nullptr) {
1218+
std::cout << "No PanMAN selected" << std::endl;
1219+
return;
1220+
}
1221+
1222+
panmanUtils::TreeGroup tg = *TG;
1223+
1224+
auto fastaStart = std::chrono::high_resolution_clock::now();
1225+
for(int i = 0; i < tg.trees.size(); i++) {
1226+
panmanUtils::Tree * T = &tg.trees[i];
1227+
if(globalVm.count("output-file")) {
1228+
std::string fileName = globalVm["output-file"].as< std::string >();
1229+
outputFile.open("./info/" + fileName + "_" + std::to_string(i) + ".fasta");
1230+
buf = outputFile.rdbuf();
1231+
} else {
1232+
buf = std::cout.rdbuf();
1233+
}
1234+
std::ostream fout (buf);
1235+
1236+
1237+
T->printConsensus(fout);
1238+
1239+
if(globalVm.count("output-file")) outputFile.close();
1240+
}
1241+
1242+
auto fastaEnd = std::chrono::high_resolution_clock::now();
1243+
std::chrono::nanoseconds fastaTime = fastaEnd - fastaStart;
1244+
std::cout << "\nFASTA execution time: " << fastaTime.count() << " nanoseconds\n";
1245+
1246+
}
1247+
12111248
void toUsher(panmanUtils::TreeGroup *TG, po::variables_map &globalVm) {
12121249
// Print raw sequences to output file
12131250
if(TG == nullptr) {
@@ -1549,6 +1586,8 @@ void parseAndExecute(int argc, char* argv[]) {
15491586
// } else if(globalVm.count("fasta-fast")){
15501587
// fastaFast(TG, globalVm, outputFile, buf);
15511588
// return;
1589+
} else if (globalVm.count("printConsensus")) {
1590+
printConsensus(TG, globalVm, outputFile, buf);
15521591
} else {
15531592
char** splitCommandArray;
15541593

@@ -1691,6 +1730,12 @@ void parseAndExecute(int argc, char* argv[]) {
16911730
.options(printRootDesc)
16921731
.run(), printRootVm);
16931732
printRoot(TG, printRootVm, outputFile, buf);
1733+
} else if(strcmp(splitCommandArray[0], "printConsensus") == 0) {
1734+
po::variables_map printConsensusVm;
1735+
po::store(po::command_line_parser((int)splitCommand.size(), splitCommandArray)
1736+
.options(printConsensusDesc)
1737+
.run(), printConsensusVm);
1738+
printConsensus(TG, printConsensusVm, outputFile, buf);
16941739
} else if (strcmp(splitCommandArray[0], "exit") == 0 || strcmp(splitCommandArray[0], "q") == 0) {
16951740
return;
16961741
} else {

0 commit comments

Comments
 (0)