From 4916d079a8da712c2c6ec589d88a318ebcab7804 Mon Sep 17 00:00:00 2001 From: Mike Lee Date: Thu, 3 Nov 2022 14:44:27 -0700 Subject: [PATCH 1/3] Update download.py GTDB changed from 122 archaeal genes to 53 relatively recently. these numbers don't regularly change, so it's not like this will happen with every new GTDB release --- CAT_pack/download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CAT_pack/download.py b/CAT_pack/download.py index 2f5ea93..9cdf9cf 100644 --- a/CAT_pack/download.py +++ b/CAT_pack/download.py @@ -516,11 +516,11 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False): # Using `latest` as an entry point # This needs to be checked for future versions gtdb_urls = [ - "https://data.gtdb.ecogenomic.org/releases/latest/ar122_taxonomy.tsv.gz", + "https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz", "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz", "https://data.gtdb.ecogenomic.org/releases/latest/MD5SUM", "https://data.gtdb.ecogenomic.org/releases/latest/bac120.tree", - "https://data.gtdb.ecogenomic.org/releases/latest/ar122.tree", + "https://data.gtdb.ecogenomic.org/releases/latest/ar53.tree", "https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz", ] From 92882846a0eda76ad3b7bc956601ad51314a90e3 Mon Sep 17 00:00:00 2001 From: AstrobioMike Date: Thu, 3 Nov 2022 20:52:55 -0700 Subject: [PATCH 2/3] workaround dealing with MD5 filename disparity for GTDB downloading --- CAT_pack/download.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/CAT_pack/download.py b/CAT_pack/download.py index 9cdf9cf..9f5dc0b 100644 --- a/CAT_pack/download.py +++ b/CAT_pack/download.py @@ -206,6 +206,30 @@ def load_gtdb_md5sums(md5sums_file): fields = [f.strip() for f in line.split()] fname = pathlib.Path(fields[1]).name md5_dict[fname] = fields[0] + + # at the time of GTDB release v207, and looking at earlier versions, + # 
the filenames in the "latest" section (https://data.gtdb.ecogenomic.org/releases/latest/) + # look like this: "bac120_taxonomy.tsv.gz", but the names + # in the MD5SUM file from the same location look like + # this: "bac120_taxonomy_r207.tsv.gz" + # So, as an ad hoc workaround, we get the latest listed + # GTDB version, check whether "_r" + version (e.g. "_r207") + # appears in each file name, and, if so, strip that + # string from the corresponding dict key + + # getting version + gtdb_version = get_gtdb_latest_version() + version_string = f"_r{gtdb_version}" + + for key in md5_dict: + + if version_string in key: + + # removing version string; NOTE(review): this re-keys md5_dict while iterating over it -- consider iterating over list(md5_dict) + new_name = key.replace(version_string, "") + + md5_dict[new_name] = md5_dict.pop(key) + return md5_dict @@ -530,6 +554,7 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False): # Check files md5sums_file = output_dir / pathlib.Path("MD5SUM") md5sums_dict = load_gtdb_md5sums(md5sums_file) + check_gtdb_md5s(output_dir, md5sums_dict, log_file, quiet) # Concatenate taxonomies From 38da76e19e6b60f9b15b3f2f1bcc59d51660e262 Mon Sep 17 00:00:00 2001 From: AstrobioMike Date: Fri, 4 Nov 2022 11:53:47 -0700 Subject: [PATCH 3/3] some input arguments for `CAT prepare` seem to have changed, this modifies the message output of this to match those changes --- CAT_pack/download.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CAT_pack/download.py b/CAT_pack/download.py index 9f5dc0b..04e9234 100644 --- a/CAT_pack/download.py +++ b/CAT_pack/download.py @@ -156,8 +156,8 @@ def process_nr(output_dir, log_file, quiet, prefix, cleanup): "--db_fasta {} \\\n" "--names {} \\\n" "--nodes {} \\\n" - "--acc2taxid {} \\\n" - "-o path/to/prepare_output\n".format( + "--acc2tax {} \\\n" + "--db_dir path/to/prepare_output\n".format( nr_gz.resolve(), names_dmp.resolve(), nodes_dmp.resolve(), @@ -672,8 +672,8 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False): "--db_fasta {} \\\n" "--names {} \\\n" "--nodes {} \\\n" - "--acc2taxid {} \\\n" - "-o 
path/to/prepare_output\n".format( + "--acc2tax {} \\\n" + "--db_dir path/to/prepare_output\n".format( all_seqs_fp.resolve(), names_dmp.resolve(), nodes_dmp.resolve(),