diff --git a/CAT_pack/download.py b/CAT_pack/download.py index 2f5ea93..04e9234 100644 --- a/CAT_pack/download.py +++ b/CAT_pack/download.py @@ -156,8 +156,8 @@ def process_nr(output_dir, log_file, quiet, prefix, cleanup): "--db_fasta {} \\\n" "--names {} \\\n" "--nodes {} \\\n" - "--acc2taxid {} \\\n" - "-o path/to/prepare_output\n".format( + "--acc2tax {} \\\n" + "--db_dir path/to/prepare_output\n".format( nr_gz.resolve(), names_dmp.resolve(), nodes_dmp.resolve(), @@ -206,6 +206,30 @@ def load_gtdb_md5sums(md5sums_file): fields = [f.strip() for f in line.split()] fname = pathlib.Path(fields[1]).name md5_dict[fname] = fields[0] + + # at the time of GTDB release v207, and looking at earlier versions, + # the filenames in the "latest" section (https://data.gtdb.ecogenomic.org/releases/latest/) + # look like this: "bac120_taxonomy.tsv.gz", but the names + # in the MD5SUM file from the same location look like + # this: "bac120_taxonomy_r207.tsv.gz" + # So, here doing an ad hoc check: + # getting the listed version, + # seeing if that's in the file name as "_r<version>", + # and removing that string from our dict keys if so + + # getting version + gtdb_version = get_gtdb_latest_version() + version_string = f"_r{gtdb_version}" + # looping over a snapshot of the keys, as the dict is modified inside the loop + for key in list(md5_dict): + + if version_string in key: + + # removing version string + new_name = key.replace(version_string, "") + + md5_dict[new_name] = md5_dict.pop(key) + return md5_dict @@ -516,11 +540,11 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False): # Using `latest` as an entry point # This needs to be checked for future versions gtdb_urls = [ - "https://data.gtdb.ecogenomic.org/releases/latest/ar122_taxonomy.tsv.gz", + "https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz", "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz", "https://data.gtdb.ecogenomic.org/releases/latest/MD5SUM", "https://data.gtdb.ecogenomic.org/releases/latest/bac120.tree", - 
"https://data.gtdb.ecogenomic.org/releases/latest/ar122.tree", + "https://data.gtdb.ecogenomic.org/releases/latest/ar53.tree", "https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz", ] @@ -530,6 +554,7 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False): # Check files md5sums_file = output_dir / pathlib.Path("MD5SUM") md5sums_dict = load_gtdb_md5sums(md5sums_file) + check_gtdb_md5s(output_dir, md5sums_dict, log_file, quiet) # Concatenate taxonomies @@ -647,8 +672,8 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False): "--db_fasta {} \\\n" "--names {} \\\n" "--nodes {} \\\n" - "--acc2taxid {} \\\n" - "-o path/to/prepare_output\n".format( + "--acc2tax {} \\\n" + "--db_dir path/to/prepare_output\n".format( all_seqs_fp.resolve(), names_dmp.resolve(), nodes_dmp.resolve(),