Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 134 additions & 36 deletions phylofisher/forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,53 +15,119 @@
from matplotlib.backends.backend_pdf import PdfPages

from phylofisher import help_formatter
from phylofisher.db_map import database, Taxonomies, Metadata

plt.style.use('ggplot')


def configure_colors():
    '''
    Configure colors for taxonomic groups.

    Prefers a local ``color_conf`` TSV file (tree_colors.tsv) when one is
    available (local-run mode); otherwise reads colors from the Taxonomies
    table of the PhyloFisher database. Database entries with no color set
    default to 'white'.

    :return: dictionary with taxonomic groups as keys and colors as values
    :rtype: dict
    '''
    color_dict = dict()

    # If we're in local run mode and have a color_conf file, use it.
    # color_conf is a module-level global set only on the local-run path,
    # hence the 'in globals()' guard.
    if 'color_conf' in globals() and os.path.exists(color_conf):
        with open(color_conf, 'r') as infile:
            infile.readline()  # skip the header line
            for line in infile:
                tax, color = line.strip().split('\t')
                color_dict[tax] = color
    else:
        # Use database colors; fall back to white when no color is stored
        db_query = Taxonomies.select(Taxonomies.taxonomy, Taxonomies.color)
        for q in db_query:
            color_dict[q.taxonomy] = q.color if q.color else 'white'

    return color_dict


def parse_metadata(metadata, input_metadata=None):
def parse_metadata(database_path, input_metadata=None):
    '''
    Parse metadata from database and input_metadata (if provided)

    :param database_path: path to the database file
    :type database_path: str
    :param input_metadata: path to input metadata file, defaults to None
    :type input_metadata: str, optional
    :return: tuple of metadata dictionary and color dictionary
    :rtype: tuple
    '''
    # Parse database metadata
    metadata_comb = {}

    database.init(database_path)
    database.connect()

    color_dict = configure_colors()

    # Cache taxonomy id -> name once, instead of issuing two
    # Taxonomies.get() queries per metadata row (N+1 query pattern)
    tax_names = {t.id: t.taxonomy
                 for t in Taxonomies.select(Taxonomies.id, Taxonomies.taxonomy)}

    db_query = Metadata.select(Metadata.short_name, Metadata.long_name,
                               Metadata.higher_taxonomy, Metadata.lower_taxonomy)
    for q in db_query:
        tax = q.short_name
        full = q.long_name
        group = tax_names[q.higher_taxonomy]
        sub_tax = tax_names[q.lower_taxonomy]

        # Placeholder colors ('x'/'xx') and unknown groups default to white
        if group not in color_dict or color_dict[group].lower() in ['x', 'xx']:
            color_dict[group] = 'white'
        metadata_comb[tax] = {'Higher Taxonomy': group, 'col': color_dict[group],
                              'full': full, 'Lower Taxonomy': sub_tax}

    database.close()

    # Parse input metadata if provided; input taxa are always colored white
    if input_metadata and os.path.exists(input_metadata):
        with open(input_metadata) as infile:
            for line in infile:
                if "FILE_NAME" not in line:
                    metadata_input = line.split('\t')
                    tax = metadata_input[2].strip().split('_')[0]
                    group = metadata_input[3].strip()
                    full = metadata_input[6].strip()
                    sub_tax = metadata_input[4]
                    metadata_comb[tax] = {'Higher Taxonomy': group, 'col': "white",
                                          'full': full, 'Lower Taxonomy': sub_tax}

    return metadata_comb, color_dict


def parse_metadata_tsv(metadata_file, input_metadata=None):
'''
Parse metadata from TSV files (for backward compatibility with local runs)

:param metadata_file: path to the metadata TSV file
:type metadata_file: str
:param input_metadata: path to input metadata file, defaults to None
:type input_metadata: str, optional
:return: tuple of metadata dictionary and color dictionary
:rtype: tuple
'''
color_dict = configure_colors()
metadata_comb = {}
for line_ in open(metadata):
if 'Full Name' not in line_:
sline = line_.split('\t')
tax = sline[0].strip()
group = sline[2].strip()
sub_tax = sline[3]
full = sline[1].strip()
if group not in color_dict or color_dict[group].lower() in ['x', 'xx']:
color_dict[group] = 'white'
metadata_comb[tax] = {'Higher Taxonomy': group, 'col': color_dict[group], 'full': full,
'Lower Taxonomy': sub_tax}
if input_metadata:

# Parse database metadata from TSV
if os.path.exists(metadata_file):
for line_ in open(metadata_file):
if 'Full Name' not in line_:
sline = line_.split('\t')
tax = sline[0].strip()
group = sline[2].strip()
sub_tax = sline[3]
full = sline[1].strip()
if group not in color_dict or color_dict[group].lower() in ['x', 'xx']:
color_dict[group] = 'white'
metadata_comb[tax] = {'Higher Taxonomy': group, 'col': color_dict[group], 'full': full,
'Lower Taxonomy': sub_tax}

# Parse input metadata if provided
if input_metadata and os.path.exists(input_metadata):
for line in open(input_metadata):
if "FILE_NAME" not in line:
metadata_input = line.split('\t')
Expand All @@ -70,6 +136,7 @@ def parse_metadata(metadata, input_metadata=None):
full = metadata_input[6].strip()
sub_tax = metadata_input[4]
metadata_comb[tax] = {'Higher Taxonomy': group, 'col': "white", 'full': full, 'Lower Taxonomy': sub_tax}

return metadata_comb, color_dict


Expand All @@ -82,7 +149,7 @@ def suspicious_clades(tree):
:return: tuple of tree name and list of suspicious clades
:rtype: tuple
'''
t = Tree(tree)
t = Tree(tree, format=1)
# midpoint rooted tree
R = t.get_midpoint_outgroup()
t.set_outgroup(R)
Expand All @@ -91,7 +158,17 @@ def suspicious_clades(tree):
for node in t.traverse('preorder'):
if (node.is_root() is False) and (node.is_leaf() is False):
            # report only clades which encompass less than a half of all organisms
if node.support >= 70 and (len(node) < (len(t) - len(node))):
try:
ufboot = float(node.name.split('/')[0])
sh_alrt = float(node.name.split('/')[1])
except IndexError:
ufboot = 0
sh_alrt = 0
except ValueError:
ufboot = 0
sh_alrt = 0

if ufboot >= 95 and sh_alrt >= 80 and (len(node) < (len(t) - len(node))):
clade = node.get_leaf_names()
if len(clade) > 1: # do we need this statement?
supported_clades.append(clade)
Expand Down Expand Up @@ -120,7 +197,7 @@ def get_best_candidates(tree_file):
:return: set of best candidate sequences
:rtype: set
'''
t = Tree(tree_file)
t = Tree(tree_file, format=1)
top_rank = defaultdict(dict)
for node in t.traverse('preorder'):
if node.is_leaf():
Expand Down Expand Up @@ -210,7 +287,7 @@ def collect_contaminants(tree_file, cont_dict):
:return: set of proven contaminants, set of proven contamination (same names as in csv result tables)
:rtype: tuple(set, set)
'''
t = Tree(tree_file)
t = Tree(tree_file, format=1)
R = t.get_midpoint_outgroup()
t.set_outgroup(R)
cont_table_names = set()
Expand Down Expand Up @@ -300,7 +377,7 @@ def tree_to_tsvg(tree_file, contaminants=None, backpropagation=None):
table = open(f"{output_folder}/{name_.split('_')[0]}.tsv", 'r')

top_ranked = get_best_candidates(tree_file)
t = Tree(tree_file)
t = Tree(tree_file, format=1)
ts = TreeStyle()
R = t.get_midpoint_outgroup()
t.set_outgroup(R)
Expand Down Expand Up @@ -482,8 +559,18 @@ def format_nodes(node, node_style, sus_clades, t):
:return: tuple of TextFace object and updated number of suspicious clades
:rtype: tuple
'''
supp = TextFace(f'{int(node.support)}', fsize=8)
if node.support >= 70:
try:
ufboot = float(node.name.split('/')[0])
sh_alrt = float(node.name.split('/')[1])
except IndexError:
ufboot = 0
sh_alrt = 0
except ValueError:
ufboot = 0
sh_alrt = 0

supp = TextFace(f'{int(ufboot)}/{int(sh_alrt)}', fsize=8)
if ufboot >= 80 and sh_alrt >= 95:
supp.bold = True
taxons = set()
orgs = node.get_leaf_names()
Expand Down Expand Up @@ -699,6 +786,10 @@ def backpropagate_contamination(tree_file, cont_names):
args.metadata = f'{args.input}/metadata.tsv'
args.input_metadata = f'{args.input}/input_metadata.tsv'
color_conf = f'{args.input}/tree_colors.tsv'

# For local runs, we might need to parse TSV files if database is not available
# This maintains backward compatibility
args.database_path = None # Indicates to use TSV files

args.input = f'{args.input}/trees'

Expand All @@ -707,17 +798,24 @@ def backpropagate_contamination(tree_file, cont_names):
config = configparser.ConfigParser()
config.read('config.ini')
dfo = str(Path(config['PATHS']['database_folder']).resolve())
args.metadata = str(os.path.join(dfo, 'metadata.tsv'))
color_conf = str(Path(config['PATHS']['color_conf']).resolve())
args.database_path = str(os.path.join(dfo, 'phylofisher.db'))
args.input_metadata = str(os.path.abspath(config['PATHS']['input_file']))

if not args.backpropagate:
os.mkdir(output_folder)

trees = glob.glob(f"{trees_folder}/*.raxml.support")
trees = glob.glob(f"{trees_folder}/*.treefile")

number_of_genes = len(trees)
metadata, tax_col = parse_metadata(args.metadata, args.input_metadata)

# Parse metadata based on whether we're using database or local TSV files
if args.local_run and hasattr(args, 'metadata') and os.path.exists(args.metadata):
# For local runs with TSV files, use a modified parse function
metadata, tax_col = parse_metadata_tsv(args.metadata, args.input_metadata)
else:
# Use database
metadata, tax_col = parse_metadata(args.database_path, args.input_metadata)

threads = args.threads

if not args.backpropagate:
Expand Down
40 changes: 31 additions & 9 deletions phylofisher/sgt_constructor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
import configparser
import os
import shutil
import sys
import subprocess
import textwrap
from pathlib import Path
Expand All @@ -23,15 +23,15 @@ def get_genes(length_filter):

else:
ret = []
len_filt_bmge_dir = f'{args.output}/length_filtration/bmge'
bmge_out_files = [file for file in os.listdir(len_filt_bmge_dir) if file.endswith('.bmge')]
len_filt_dir = f'{args.output}/length_filtered'
bmge_out_files = [file for file in os.listdir(len_filt_dir) if file.endswith('.length_filtered')]
for bmge_out_file in bmge_out_files:
with open(f'{len_filt_bmge_dir}/{bmge_out_file}', 'r') as infile:
with open(f'{len_filt_dir}/{bmge_out_file}', 'r') as infile:
line = infile.readline()
if line == '':
pass
else:
ret.append(bmge_out_file.split('.bmge')[0])
ret.append(bmge_out_file.split('.length_filtered')[0])

return ret

Expand All @@ -50,7 +50,8 @@ def make_config(length_filter):
f'trees_only={args.trees_only}',
f'no_trees={args.no_trees}',
f'database={args.database}',
f'input_metadata={args.input_metadata}'
f'input_metadata={args.input_metadata}',
f'threads_per_job={args.threads_per_job}'
]

return ' '.join(ret)
Expand All @@ -62,7 +63,7 @@ def get_output_files(length_filter):

if length_filter:
for gene in get_genes(length_filter):
ret.append(f'{args.output}/length_filtration/bmge/{gene}.bmge')
ret.append(f'{args.output}/length_filtered/{gene}.length_filtered')

else:
if args.no_trees:
Expand All @@ -82,13 +83,20 @@ def run_snakemake(length_filter=False):
f'snakemake',
f'-s {SNAKEFILE_PATH}',
f'--config {make_config(length_filter)}',
f'--cores {args.threads}',
f'--rerun-incomplete',
f'--keep-going',
f'--nolock',
f'--use-conda',
]

if args.profile is not None:
smk_frags.append(f'--profile {args.profile}')
else:
smk_frags.append(f'--cores {args.threads}')

if args.dry_run:
smk_frags.append(f'-n')

smk_frags.append(get_output_files(length_filter))

smk_cmd = ' '.join(smk_frags)
Expand All @@ -110,7 +118,12 @@ def run_snakemake(length_filter=False):
# Optional Arguments
optional.add_argument('-t', '--threads', metavar='N', type=int, default=1,
help=textwrap.dedent("""\
Desired number of threads to be utilized.
Total number of cores/threads for Snakemake to use for parallel job execution.
Default: 1
"""))
optional.add_argument('--threads-per-job', metavar='N', type=int, default=1,
help=textwrap.dedent("""\
Number of threads each individual tool (mafft, iqtree) should use.
Default: 1
"""))
optional.add_argument('--no_trees', action='store_true',
Expand All @@ -130,6 +143,15 @@ def run_snakemake(length_filter=False):
phylip-relaxed (names are not truncated), or nexus.
Default: fasta
"""))
optional.add_argument('--profile', metavar='<profile>', type=str, default=None,
help=textwrap.dedent("""\
Snakemake cluster profile to use for running jobs.
Default: None
"""))
optional.add_argument('-n', '--dry-run', action='store_true',
help=textwrap.dedent("""\
Perform a dry-run of the Snakemake workflow without executing any jobs.
"""))

args = help_formatter.get_args(parser, optional, required, pre_suf=False, inp_dir=True)

Expand Down
Loading