Skip to content

Commit eb94d43

Browse files
committed
handling long newick string
1 parent f6e44c9 commit eb94d43

2 files changed

Lines changed: 47 additions & 7 deletions

File tree

panman.capnp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ struct SequenceInverted
6969

7070
struct Tree
7171
{
72-
newick @0: Text;
72+
newick @0: List(Text);
7373
nodes @1: List(Node);
7474
consensusSeqMap @2: List(ConsensusSeqToBlockIds);
7575
gaps @3: List(GapList);

src/panman.cpp

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
#include "ratioTest.cpp"
4242
#include "panmanUtils.hpp"
4343

44+
// Upper bound, in bytes, on the length of each piece that splitNewick() cuts a
// Newick string into (1 * 1024 * 1024 = 1 MiB).
constexpr size_t NEWICK_CHUNK_SIZE = 1 * 1024 * 1024;
45+
4446
char panmanUtils::getNucleotideFromCode(int code) {
4547
switch(code) {
4648
case 1:
@@ -1646,9 +1648,21 @@ int doPreOrderLoop(panmanUtils::Node* node){
16461648
return c;
16471649
}
16481650

1651+
// Rebuild the complete Newick string of a serialized tree by concatenating,
// in list order, every Text entry stored in the tree's `newick` field.
//
// @param tree  Cap'n Proto reader for the serialized tree.
// @return      The concatenation of all entries; an empty string when the
//              list has no entries.
std::string reconstructNewick(const panman::Tree::Reader& tree) {
    const auto newickList = tree.getNewick();

    // First pass: add up the chunk lengths so the output buffer is allocated
    // once instead of growing repeatedly while large chunks are appended.
    size_t totalLength = 0;
    for (const auto chunk : newickList) {
        totalLength += chunk.size();
    }

    std::string result;
    result.reserve(totalLength);

    // Second pass: copy the chunks. Each reader already knows its length, so
    // hand it to append() rather than making it rescan the chunk for the
    // terminating NUL.
    for (const auto chunk : newickList) {
        result.append(chunk.cStr(), chunk.size());
    }

    return result;
}
1661+
16491662
void panmanUtils::Tree::protoMATToTree(const panman::Tree::Reader& mainTree) {
16501663
// Create tree
1651-
root = createTreeFromNewickString(mainTree.getNewick().cStr());
1664+
std::string newickString = reconstructNewick(mainTree);
1665+
root = createTreeFromNewickString(newickString);
16521666
// std::cout << "Size of nodes: " << allNodes.size() << std::endl;
16531667
// std::cout << doPreOrderLoop(root) << std::endl;
16541668

@@ -2616,6 +2630,19 @@ void panmanUtils::Tree::extractPanMATIndex(std::ostream& fout, int64_t start, in
26162630
return;
26172631
}
26182632

2633+
// Cut a Newick string into consecutive pieces of at most NEWICK_CHUNK_SIZE
// bytes each. The pieces are returned in order, so concatenating them yields
// the original string. An empty input produces an empty vector.
//
// @param newick  The string to split.
// @return        The ordered pieces; only the last one may be shorter than
//                NEWICK_CHUNK_SIZE.
std::vector<std::string> splitNewick(const std::string& newick) {
    const size_t totalLength = newick.size();

    std::vector<std::string> pieces;
    pieces.reserve(totalLength / NEWICK_CHUNK_SIZE + 1);

    size_t offset = 0;
    while (offset < totalLength) {
        // substr() clamps the count to the remaining length, which is what
        // makes the final piece shorter when the size is not a multiple.
        pieces.emplace_back(newick.substr(offset, NEWICK_CHUNK_SIZE));
        offset += NEWICK_CHUNK_SIZE;
    }

    return pieces;
}
2645+
26192646
void panmanUtils::Tree::extractPanMATSegment(kj::std::StdOutputStream& fout, int64_t start, int64_t end) {
26202647
sequence_t rootSequence;
26212648
blockExists_t rootBlockExists;
@@ -2773,8 +2800,12 @@ void panmanUtils::Tree::extractPanMATSegment(kj::std::StdOutputStream& fout, int
27732800

27742801
std::string newick = getNewickString(newRoot);
27752802
std::string newick2 = getNewickString(root);
2776-
2777-
treeToWrite.setNewick(newick);
2803+
2804+
auto chunks = splitNewick(newick);
2805+
auto newickList = treeToWrite.initNewick(chunks.size());
2806+
for (size_t i = 0; i < chunks.size(); ++i) {
2807+
newickList.set(i, chunks[i]);
2808+
}
27782809

27792810
std::map< std::vector< uint32_t >, std::vector< std::pair< int64_t, bool > > >
27802811
consensusSeqToBlockIds;
@@ -2941,8 +2972,12 @@ void panmanUtils::Tree::writeToFile(kj::std::StdOutputStream& fout, panmanUtils:
29412972
assert(nodeIndex==allNodes.size());
29422973

29432974
std::string newick = getNewickString(node);
2944-
2945-
treeToWrite.setNewick(newick);
2975+
2976+
auto chunks = splitNewick(newick);
2977+
auto newickList = treeToWrite.initNewick(chunks.size());
2978+
for (size_t i = 0; i < chunks.size(); ++i) {
2979+
newickList.set(i, chunks[i]);
2980+
}
29462981

29472982
std::map< std::vector< uint32_t >, std::vector< std::pair< int64_t, bool > > > consensusSeqToBlockIds;
29482983

@@ -6988,6 +7023,7 @@ void panmanUtils::TreeGroup::printFASTA(std::ofstream& fout, bool rootSeq ) {
69887023
}
69897024
}
69907025

7026+
69917027
void panmanUtils::TreeGroup::writeToFile(kj::std::StdOutputStream& fout) {
69927028
capnp::MallocMessageBuilder message;
69937029
panman::TreeGroup::Builder treeGroupToWrite = message.initRoot<panman::TreeGroup>();
@@ -7009,7 +7045,11 @@ void panmanUtils::TreeGroup::writeToFile(kj::std::StdOutputStream& fout) {
70097045
assert(nodeIndex == tree.allNodes.size());
70107046

70117047
std::string newick = tree.getNewickString(node);
7012-
treeToWrite.setNewick(newick);
7048+
auto chunks = splitNewick(newick);
7049+
auto newickList = treeToWrite.initNewick(chunks.size());
7050+
for (size_t i = 0; i < chunks.size(); ++i) {
7051+
newickList.set(i, chunks[i]);
7052+
}
70137053
std::map< std::vector< uint32_t >, std::vector< std::pair< int64_t, bool > > >
70147054
consensusSeqToBlockIds;
70157055

0 commit comments

Comments
 (0)