Skip to content

Commit 0397342

Browse files
authored
Merge pull request #461 from pachterlab/devel
Devel
2 parents fa01edd + 2cb5e17 commit 0397342

9 files changed

+322
-18
lines changed

ext/bifrost/src/CMakeLists.txt

+7-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,13 @@ target_link_libraries(bifrost_static ${ZLIB_LIBRARIES})
2626
target_link_libraries(bifrost_dynamic ${ZLIB_LIBRARIES})
2727

2828
if (ZLIB_FOUND)
29-
include_directories(${ZLIB_INCLUDE_DIRS})
29+
if (DEFINED ZLIB_INCLUDE_DIRS)
30+
include_directories( ${ZLIB_INCLUDE_DIRS} )
31+
elseif (DEFINED ZLIB_INCLUDE_DIR)
32+
include_directories( ${ZLIB_INCLUDE_DIR} )
33+
else()
34+
message(FATAL_ERROR "zlib found but no include directories are set.")
35+
endif()
3036
else()
3137
message(FATAL_ERROR "zlib not found. Required for to output files")
3238
endif(ZLIB_FOUND)

src/CMakeLists.txt

+7-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,13 @@ if (NOT ZLIBNG)
4949
find_package( ZLIB REQUIRED )
5050

5151
if ( ZLIB_FOUND )
52-
include_directories( ${ZLIB_INCLUDE_DIRS} )
52+
if (DEFINED ZLIB_INCLUDE_DIRS)
53+
include_directories( ${ZLIB_INCLUDE_DIRS} )
54+
elseif (DEFINED ZLIB_INCLUDE_DIR)
55+
include_directories( ${ZLIB_INCLUDE_DIR} )
56+
else()
57+
message(FATAL_ERROR "zlib found but no include directories are set.")
58+
endif()
5359
target_link_libraries(kallisto kallisto_core ${ZLIB_LIBRARIES})
5460
else()
5561
message(FATAL_ERROR "zlib not found. Required for to output files" )

src/KmerIndex.cpp

+112-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <iostream>
1111
#include <unordered_map>
1212
#include <string>
13+
#include <set>
1314
#include "ColoredCDBG.hpp"
1415

1516
// --aa option helper functions
@@ -231,6 +232,18 @@ std::pair<size_t,size_t> KmerIndex::getECInfo() const {
231232
return std::make_pair(max_ec_len, cardinality_zero_encounters);
232233
}
233234

235+
// Begin Shading
236+
std::pair<std::string,std::string> shadedTargetName(std::string& name) {
237+
if (name.find("_shade_") != std::string::npos) {
238+
std::string name_header = "_shade_";
239+
std::string tname = name.substr(0, name.find(name_header));
240+
std::string variant = name.substr(name.find(name_header)+name_header.length(), name.size());
241+
return std::make_pair(tname,variant); // Return the target name and the the associated shade
242+
}
243+
return std::make_pair("",""); // Not a shade
244+
}
245+
// End Shading
246+
234247
void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out) {
235248
// read input
236249
u_set_<std::string> unique_names;
@@ -354,6 +367,18 @@ void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out)
354367
}
355368
unique_names.insert(name);
356369
target_names_.push_back(name);
370+
371+
// Begin Shading
372+
auto shade_info = shadedTargetName(name);
373+
if (shade_info.first != "") {
374+
std::string tname = shade_info.first;
375+
std::string variant = shade_info.second;
376+
auto it = std::find(target_names_.begin(), target_names_.end(), tname);
377+
if (it != target_names_.end()) {
378+
shadeToColorTranscriptMap[target_names_.size()-1] = std::distance(target_names_.begin(), it);
379+
}
380+
}
381+
// End Shading
357382
}
358383
}
359384

@@ -399,6 +424,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
399424
size_t num_seqs = 0;
400425
int max_color = 0;
401426
u_set_<int> external_input_names;
427+
// Begin Shading
428+
std::set<std::string> variants_set; // Ordered set to keep track of variants (i.e. colors with shades)
429+
// End Shading
402430
for (int i = 0; i < opt.transfasta.size(); i++) { // Currently, this should only be one file
403431
auto fasta = opt.transfasta[i];
404432
fp = opt.transfasta.size() == 1 && opt.transfasta[0] == "-" ? gzdopen(fileno(stdin), "r") : gzopen(fasta.c_str(), "r");
@@ -414,9 +442,23 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
414442
continue;
415443
}
416444
int color = std::atoi(strname.c_str());
445+
// Begin Shading
446+
std::string variant;
447+
auto shade_info = shadedTargetName(strname);
448+
if (shade_info.first != "") {
449+
std::string tname = shade_info.first;
450+
variant = shade_info.second;
451+
color = std::atoi(tname.c_str());
452+
variants_set.insert(std::to_string(color) + "_shade_" + variant);
453+
}
454+
// End Shading
417455
external_input_names.insert(color);
418456
if (color > max_color) max_color = color;
419-
of << ">" << std::to_string(color) << "\n" << str << std::endl;
457+
of << ">" << std::to_string(color);
458+
// Begin Shading
459+
if (!variant.empty()) of << "_shade_" << variant;
460+
// End Shading
461+
of << "\n" << str << std::endl;
420462
num_seqs++;
421463
}
422464
gzclose(fp);
@@ -437,6 +479,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
437479
target_names_.push_back(std::to_string(i));
438480
target_lens_.push_back(k); // dummy length (k-mer size)
439481
}
482+
// Begin Shading
483+
for (const auto& v : variants_set) {
484+
num_trans++; // Each color-shade duo counts as an additional target
485+
target_names_.push_back(v);
486+
target_lens_.push_back(k); // dummy length (k-mer size)
487+
}
488+
if (num_trans != ncolors) {
489+
std::cerr << "[build] Detected " << std::to_string(num_trans-ncolors) << " shades" << std::endl;
490+
}
491+
// End Shading
440492

441493
std::cerr << "[build] Building graph from k-mers" << std::endl;
442494
BuildDeBruijnGraph(opt, tmp_file2, out);
@@ -449,6 +501,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
449501
std::vector<std::vector<TRInfo> > trinfos(dbg.size());
450502
std::ifstream infile_a(tmp_file2);
451503
int current_color = 0;
504+
// Begin Shading
505+
std::string current_variant;
506+
// End Shading
452507
std::string line;
453508
while (std::getline(infile_a, line)) {
454509
if (line.length() == 0) {
@@ -458,6 +513,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
458513
current_color = onlist_sequences.cardinality();
459514
} else {
460515
current_color = std::atoi(line.c_str()+1);
516+
// Begin Shading
517+
current_variant = "";
518+
std::string name = line.substr(1);
519+
auto shade_info = shadedTargetName(name);
520+
if (shade_info.first != "") {
521+
std::string tname = shade_info.first;
522+
current_variant = shade_info.second;
523+
current_color = std::atoi(tname.c_str());
524+
}
525+
// End Shading
461526
}
462527
continue;
463528
}
@@ -481,8 +546,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
481546
tr.pos = (proc-um.len) | (!um.strand ? sense : missense);
482547
tr.start = um.dist;
483548
tr.stop = um.dist + um.len;
484-
485549
trinfos[n->id].push_back(tr);
550+
551+
// Begin Shading
552+
if (!current_variant.empty()) {
553+
auto it = variants_set.find(std::to_string(current_color) + "_shade_" + current_variant);
554+
assert(it != variants_set.end());
555+
tr.trid = ncolors + std::distance(variants_set.begin(), it);
556+
trinfos[n->id].push_back(tr);
557+
}
558+
// End Shading
486559
}
487560
}
488561
infile_a.close();
@@ -995,6 +1068,13 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st
9951068
tr.stop = um.dist + um.len;
9961069

9971070
trinfos[n->id].push_back(tr);
1071+
// Begin Shading
1072+
auto it = shadeToColorTranscriptMap.find(tr.trid);
1073+
if (it != shadeToColorTranscriptMap.end()) {
1074+
tr.trid = shadeToColorTranscriptMap[tr.trid];
1075+
trinfos[n->id].push_back(tr); // Add the color of the original transcript as well
1076+
}
1077+
// End Shading
9981078
}
9991079
j++;
10001080
}
@@ -1020,6 +1100,11 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st
10201100

10211101
std::cerr << "[build] target de Bruijn graph has k-mer length " << dbg.getK() << " and minimizer length " << dbg.getG() << std::endl;
10221102
std::cerr << "[build] target de Bruijn graph has " << dbg.size() << " contigs and contains " << dbg.nbKmers() << " k-mers " << std::endl;
1103+
// Begin Shading
1104+
if (shadeToColorTranscriptMap.size() != 0) {
1105+
std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl;
1106+
}
1107+
// End Shading
10231108
}
10241109

10251110
void KmerIndex::PopulateMosaicECs(std::vector<std::vector<TRInfo> >& trinfos) {
@@ -1418,6 +1503,19 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) {
14181503
in.read(buffer, tmp_size);
14191504

14201505
target_names_.push_back(std::string(buffer));
1506+
// Begin Shading
1507+
auto shade_info = shadedTargetName(target_names_[target_names_.size()-1]);
1508+
if (shade_info.first != "") {
1509+
std::string tname = shade_info.first;
1510+
std::string variant = shade_info.second;
1511+
auto it = std::find(target_names_.begin(), target_names_.end(), tname);
1512+
if (it != target_names_.end()) {
1513+
shadeToColorTranscriptMap[i] = std::distance(target_names_.begin(), it);
1514+
}
1515+
use_shade = true;
1516+
shade_sequences.add(i);
1517+
}
1518+
// End Shading
14211519
}
14221520
delete[] buffer;
14231521

@@ -1438,6 +1536,11 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) {
14381536
if (num_trans-onlist_sequences.cardinality() > 0) {
14391537
std::cerr << "[index] number of D-list k-mers: " << pretty_num(static_cast<size_t>(num_trans-onlist_sequences.cardinality())) << std::endl;
14401538
}
1539+
// Begin Shading
1540+
if (shadeToColorTranscriptMap.size() != 0) {
1541+
std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl;
1542+
}
1543+
// End Shading
14411544

14421545
in.close();
14431546

@@ -1594,6 +1697,11 @@ int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2) const {
15941697
// post: v contains all equiv classes for the k-mers in s
15951698
void KmerIndex::match(const char *s, int l, std::vector<std::pair<const_UnitigMap<Node>, int>>& v, bool partial, bool cfc) const{
15961699
const Node* n;
1700+
1701+
// Begin Shading
1702+
if (use_shade) partial = false;
1703+
// End Shading
1704+
if (do_union) partial = false;
15971705

15981706
// TODO:
15991707
// Rework KmerIndex::match() such that it uses the following type of logic
@@ -1664,6 +1772,8 @@ void KmerIndex::match(const char *s, int l, std::vector<std::pair<const_UnitigMa
16641772
}
16651773

16661774
v.push_back({um, kit->second});
1775+
1776+
if (no_jump) continue;
16671777

16681778
// Find start and end of O.G. kallisto contig w.r.t. the bifrost-kallisto
16691779
// unitig

src/KmerIndex.h

+14
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ struct KmerIndex {
7777
//LoadTranscripts(opt.transfasta);
7878
load_positional_info = opt.bias || opt.pseudobam || opt.genomebam || !opt.single_overhang;
7979
dfk_onlist = opt.dfk_onlist;
80+
do_union = opt.do_union;
81+
no_jump = opt.no_jump;
82+
// Begin Shading
83+
use_shade = false;
84+
// End Shading
8085
}
8186

8287
~KmerIndex() {}
@@ -131,6 +136,8 @@ struct KmerIndex {
131136
std::vector<std::string> target_names_;
132137
std::vector<std::string> target_seqs_; // populated on demand
133138
bool dfk_onlist; // If we want to not use D-list in intersecting ECs
139+
bool do_union; // If we want to do "pseudoalignment" via a "union" rather than an "intersection"
140+
bool no_jump; // If we want to skip the jumping logic during pseudoalignment
134141
bool target_seqs_loaded;
135142
bool load_positional_info; // when should we load positional info in addition to strandedness
136143

@@ -141,6 +148,13 @@ struct KmerIndex {
141148
u_set_<Kmer, KmerHash> d_list;
142149
Kmer dummy_dfk;
143150
const_UnitigMap<Node> um_dummy;
151+
152+
// Begin Shading
153+
// Here, we use the concepts of "shades" as proposed by in Ornaments by Adduri & Kim, 2024 for bias-corrected allele-specific expression estimation
154+
std::unordered_map<int, int> shadeToColorTranscriptMap;
155+
Roaring shade_sequences;
156+
bool use_shade;
157+
// End Shading
144158
};
145159

146160
#endif // KALLISTO_KMERINDEX_H

0 commit comments

Comments
 (0)