Skip to content

Commit

Permalink
code reformat
Browse files Browse the repository at this point in the history
Signed-off-by: Radu Muntean <[email protected]>
  • Loading branch information
heracle committed Aug 14, 2021
1 parent 18c0fe0 commit 249129e
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 42 deletions.
63 changes: 34 additions & 29 deletions metagraph/src/annotation/taxonomy/tax_classifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,19 @@ void TaxonomyBase::read_accversion_to_taxid_map(const std::string &filepath,
exit(1);
}
if (input_accessions.size() == 0 || input_accessions.count(parts[1])) {
// e.g. of nucl.accession2taxid file:
//
// A00001 A00001.1 10641 58418
//
// Thus, parts[1] represents the accession version and parts[2] the corresponding taxid.
accversion_to_taxid_map_[parts[1]] = std::stoul(parts[2]);
}
}
}

TaxonomyClsAnno::TaxonomyClsAnno(const graph::AnnotatedDBG &anno,
const double lca_coverage_rate,
const double kmers_discovery_rate,
double lca_coverage_rate,
double kmers_discovery_rate,
const std::string &tax_tree_filepath,
const std::string &label_taxid_map_filepath)
: TaxonomyBase(lca_coverage_rate, kmers_discovery_rate),
Expand All @@ -101,25 +106,20 @@ TaxonomyClsAnno::TaxonomyClsAnno(const graph::AnnotatedDBG &anno,
// Take one sample label and find the label type.
std::string sample_label = anno_matrix_->get_annotation().get_all_labels()[0];

// If true, require_accversion_to_taxid_map means that the taxid is not mentioned as part of the label. Thus, the program
// needs to parse an additional accession_version to taxid lookup file (optional argument: label_taxid_map_filepath).
bool require_accversion_to_taxid_map;
if (utils::starts_with(sample_label, "gi|")) {
// e.g. >gi|1070643132|ref|NC_031224.1| Arthrobacter phage Mudcat, complete genome
label_type_ = GEN_BANK;
require_accversion_to_taxid_map = true;
} else if (utils::starts_with(utils::split_string(sample_label, ":")[1], "taxid|")) {
// e.g. >kraken:taxid|2016032|NC_047834.1 Alteromonas virus vB_AspP-H4/4, complete genome
label_type_ = TAXID;
require_accversion_to_taxid_map = false;
} else {
logger->error("Error: Can't determine the type of the given label {}. "
"Make sure the labels are in a recognized format.", sample_label);
exit(1);
}

Timer timer;
if (require_accversion_to_taxid_map) {
if (label_type_ == GEN_BANK) {
logger->trace("Parsing label_taxid_map file...");
read_accversion_to_taxid_map(label_taxid_map_filepath, anno_matrix_);
logger->trace("Finished label_taxid_map file in {} sec", timer.elapsed());
Expand Down Expand Up @@ -151,7 +151,6 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenLi
}

std::string line;
tsl::hopscotch_map<TaxId, TaxId> full_parents_list;
while (getline(f, line)) {
if (line == "") {
logger->error("Error: The Taxonomic Tree file contains empty lines. "
Expand All @@ -166,10 +165,12 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenLi
tax_tree_filepath);
exit(1);
}
uint32_t act = std::stoul(parts[0]);
uint32_t parent = std::stoul(parts[2]);
full_parents_list[act] = parent;
node_parent_[act] = parent;
// e.g. of nodes.dmp file:
//
// 2 | 131567 | superkingdom | | 0 | 0
//
// Thus, parts[0] represents the child taxid and parts[2] the parent taxid.
node_parent_[std::stoul(parts[0])] = std::stoul(parts[2]);
}

std::vector<TaxId> relevant_taxids;
Expand All @@ -178,52 +179,56 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenLi

if (accversion_to_taxid_map_.size()) {
// Store only the taxonomic nodes that exist in the annotation matrix.
for (const std::pair<std::string, TaxId> &pair : accversion_to_taxid_map_) {
relevant_taxids.push_back(pair.second);
considered_relevant_taxids.insert(pair.second);
for (const auto &[_, taxid] : accversion_to_taxid_map_) {
relevant_taxids.push_back(taxid);
considered_relevant_taxids.insert(taxid);
}
} else {
// If 'this->accversion_to_taxid_map_' is empty, store the entire taxonomic tree.
for (auto it : full_parents_list) {
relevant_taxids.push_back(it.first);
considered_relevant_taxids.insert(it.first);
for (const auto &[child, _] : node_parent_) {
relevant_taxids.push_back(child);
considered_relevant_taxids.insert(child);
}
}
assert(relevant_taxids.size());

uint64_t num_taxid_failed = 0; // num_taxid_failed is used for logging only.
for (uint32_t i = 0; i < relevant_taxids.size(); ++i) {
const TaxId taxid = relevant_taxids[i];
if (!full_parents_list.count(taxid)) {
TaxId taxid = relevant_taxids[i];
auto it_taxid_parent = node_parent_.find(taxid);
if (it_taxid_parent == node_parent_.end()) {
num_taxid_failed += 1;
continue;
}
TaxId taxid_parent = it_taxid_parent->second;

if (not considered_relevant_taxids.count(full_parents_list[taxid])) {
relevant_taxids.push_back(full_parents_list[taxid]);
considered_relevant_taxids.insert(full_parents_list[taxid]);
if (not considered_relevant_taxids.count(taxid_parent)) {
relevant_taxids.push_back(taxid_parent);
considered_relevant_taxids.insert(taxid_parent);
}

// Check if the current taxid is the root.
if (taxid == full_parents_list[taxid]) {
if (taxid == taxid_parent) {
root_node_ = taxid;
}
}
if (num_taxid_failed) {
logger->warn("During the tax_tree_filepath {} parsing, {} taxids were not found out of {} total evaluations.",
logger->warn("During the tax_tree_filepath {} parsing, {} taxids were not found out of {} total evaluations",
tax_tree_filepath, num_taxid_failed, relevant_taxids.size());
}

// Construct the output tree.
for (const TaxId &taxid : relevant_taxids) {
if (taxid == root_node_) {
if (taxid == root_node_)
continue;
auto it_taxid_parent = node_parent_.find(taxid);
if (it_taxid_parent != node_parent_.end()) {
(*tree)[it_taxid_parent->second].push_back(taxid);
}
(*tree)[full_parents_list[taxid]].push_back(taxid);
}
}

void TaxonomyClsAnno::dfs_statistics(const TaxId node,
void TaxonomyClsAnno::dfs_statistics(TaxId node,
const ChildrenList &tree,
std::vector<TaxId> *tree_linearization) {
node_to_linearization_idx_[node] = tree_linearization->size();
Expand Down
27 changes: 14 additions & 13 deletions metagraph/src/annotation/taxonomy/tax_classifier.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <tsl/hopscotch_set.h>
#include <tsl/hopscotch_map.h>

#include "graph/annotated_dbg.hpp"

namespace mtg {
Expand All @@ -22,10 +23,11 @@ class TaxonomyBase {
TAXID, // e.g. ">kraken:taxid|2016032|NC_047834.1 Alteromonas virus vB_AspP-H4/4, complete genome"
};

TaxonomyBase() {};
TaxonomyBase(const double lca_coverage_rate, const double kmers_discovery_rate)
TaxonomyBase() {}
TaxonomyBase(double lca_coverage_rate, double kmers_discovery_rate)
: lca_coverage_rate_(lca_coverage_rate),
kmers_discovery_rate_(kmers_discovery_rate) {};
kmers_discovery_rate_(kmers_discovery_rate) {}
virtual ~TaxonomyBase() {}

TaxId assign_class(const std::string &sequence) const;

Expand Down Expand Up @@ -59,9 +61,9 @@ class TaxonomyBase {
* `desired_number_kmers` threshold and is placed as close as possible to the leaves).
* @param [modified] 'best_lca_dist_to_root' -> the distance to the root for the current classification prediction.
*/
void update_scores_and_lca(const TaxId start_node,
void update_scores_and_lca(TaxId start_node,
const tsl::hopscotch_map<TaxId, uint64_t> &num_kmers_per_node,
const uint64_t desired_number_kmers,
uint64_t desired_number_kmers,
tsl::hopscotch_map<TaxId, uint64_t> *node_scores,
tsl::hopscotch_set<TaxId> *nodes_already_propagated,
TaxId *best_lca,
Expand Down Expand Up @@ -98,8 +100,8 @@ class TaxonomyClsImportDB : public TaxonomyBase {
public:
// todo implement
TaxonomyClsImportDB(const std::string &taxdb_filepath,
const double lca_coverage_rate,
const double kmers_discovery_rate);
double lca_coverage_rate,
double kmers_discovery_rate);

private:
std::vector<TaxId> get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const;
Expand All @@ -119,12 +121,11 @@ class TaxonomyClsAnno : public TaxonomyBase {
* Mandatory if the taxid is not mentioned in the label string.
*/
TaxonomyClsAnno(const graph::AnnotatedDBG &anno,
const double lca_coverage_rate,
const double kmers_discovery_rate,
double lca_coverage_rate,
double kmers_discovery_rate,
const std::string &tax_tree_filepath,
const std::string &label_taxid_map_filepath = "");
TaxonomyClsAnno() {};
virtual ~TaxonomyClsAnno() {};
TaxonomyClsAnno() {}

// todo implement
void export_taxdb(const std::string &filepath) const;
Expand All @@ -140,7 +141,7 @@ class TaxonomyClsAnno : public TaxonomyBase {
* @param [input] label_fraction -> threshold used for taxonomic classification.
* @return -> the classification result: a taxid node
*/
TaxId assign_class_toplabels(const std::string &sequence, const double label_fraction) const;
TaxId assign_class_toplabels(const std::string &sequence, double label_fraction) const;

private:
/**
Expand Down Expand Up @@ -169,7 +170,7 @@ class TaxonomyClsAnno : public TaxonomyBase {
* @param [input] tree -> the taxonomic tree stored as a list of children.
* @param [output] tree_linearization -> the linearization of the received tree.
*/
void dfs_statistics(const TaxId node,
void dfs_statistics(TaxId node,
const ChildrenList &tree,
std::vector<TaxId> *tree_linearization);

Expand Down

0 comments on commit 249129e

Please sign in to comment.