Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 31 additions & 6 deletions CAT_pack/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ def process_nr(output_dir, log_file, quiet, prefix, cleanup):
"--db_fasta {} \\\n"
"--names {} \\\n"
"--nodes {} \\\n"
"--acc2taxid {} \\\n"
"-o path/to/prepare_output\n".format(
"--acc2tax {} \\\n"
"--db_dir path/to/prepare_output\n".format(
nr_gz.resolve(),
names_dmp.resolve(),
nodes_dmp.resolve(),
Expand Down Expand Up @@ -206,6 +206,30 @@ def load_gtdb_md5sums(md5sums_file):
fields = [f.strip() for f in line.split()]
fname = pathlib.Path(fields[1]).name
md5_dict[fname] = fields[0]

# As of GTDB release r207 (and in the earlier releases that were checked),
# the files served from the "latest" directory
# (https://data.gtdb.ecogenomic.org/releases/latest/) have unversioned
# names such as "bac120_taxonomy.tsv.gz", whereas the MD5SUM file in that
# same directory lists them with a version suffix, e.g.
# "bac120_taxonomy_r207.tsv.gz".
# As an ad hoc workaround we look up the latest release version, check
# whether "_r<version>" occurs in each file name listed in MD5SUM, and,
# if so, strip that string from the corresponding dict key.

# Look up the latest GTDB release version and build the "_r<version>" suffix
gtdb_version = get_gtdb_latest_version()
version_string = f"_r{gtdb_version}"

for key in md5_dict:

if version_string in key:

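# Strip the version suffix so the key matches the downloaded file name.
# NOTE(review): this re-keys md5_dict while the enclosing loop is iterating
# over it, which can raise RuntimeError or skip entries; consider looping
# over list(md5_dict) instead.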
new_name = key.replace(version_string, "")

md5_dict[new_name] = md5_dict.pop(key)

return md5_dict


Expand Down Expand Up @@ -516,11 +540,11 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
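# Use the `latest` release directory as the entry point.
# NOTE: file names under `latest` can change between GTDB releases
# (e.g. ar122 -> ar53), so this list must be re-checked for each new release.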
gtdb_urls = [
"https://data.gtdb.ecogenomic.org/releases/latest/ar122_taxonomy.tsv.gz",
"https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz",
"https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz",
"https://data.gtdb.ecogenomic.org/releases/latest/MD5SUM",
"https://data.gtdb.ecogenomic.org/releases/latest/bac120.tree",
"https://data.gtdb.ecogenomic.org/releases/latest/ar122.tree",
"https://data.gtdb.ecogenomic.org/releases/latest/ar53.tree",
"https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz",
]

Expand All @@ -530,6 +554,7 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
# Check files
md5sums_file = output_dir / pathlib.Path("MD5SUM")
md5sums_dict = load_gtdb_md5sums(md5sums_file)

check_gtdb_md5s(output_dir, md5sums_dict, log_file, quiet)

# Concatenate taxonomies
Expand Down Expand Up @@ -647,8 +672,8 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
"--db_fasta {} \\\n"
"--names {} \\\n"
"--nodes {} \\\n"
"--acc2taxid {} \\\n"
"-o path/to/prepare_output\n".format(
"--acc2tax {} \\\n"
"--db_dir path/to/prepare_output\n".format(
all_seqs_fp.resolve(),
names_dmp.resolve(),
nodes_dmp.resolve(),
Expand Down