From 4916d079a8da712c2c6ec589d88a318ebcab7804 Mon Sep 17 00:00:00 2001
From: Mike Lee <michael.lee0517@gmail.com>
Date: Thu, 3 Nov 2022 14:44:27 -0700
Subject: [PATCH 1/3] Update download.py

GTDB recently changed its archaeal marker set from 122 genes to 53. These numbers don't change regularly, so this update should not be needed with every new GTDB release.
---
 CAT_pack/download.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/CAT_pack/download.py b/CAT_pack/download.py
index 2f5ea93..9cdf9cf 100644
--- a/CAT_pack/download.py
+++ b/CAT_pack/download.py
@@ -516,11 +516,11 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
     # Using `latest` as an entry point
     # This needs to be checked for future versions
     gtdb_urls = [
-        "https://data.gtdb.ecogenomic.org/releases/latest/ar122_taxonomy.tsv.gz",
+        "https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz",
         "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz",
         "https://data.gtdb.ecogenomic.org/releases/latest/MD5SUM",
         "https://data.gtdb.ecogenomic.org/releases/latest/bac120.tree",
-        "https://data.gtdb.ecogenomic.org/releases/latest/ar122.tree",
+        "https://data.gtdb.ecogenomic.org/releases/latest/ar53.tree",
         "https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz",
     ]
 

From 92882846a0eda76ad3b7bc956601ad51314a90e3 Mon Sep 17 00:00:00 2001
From: AstrobioMike <michael.lee0517@gmail.com>
Date: Thu, 3 Nov 2022 20:52:55 -0700
Subject: [PATCH 2/3] workaround dealing with MD5 filename disparity for GTDB
 downloading

---
 CAT_pack/download.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/CAT_pack/download.py b/CAT_pack/download.py
index 9cdf9cf..9f5dc0b 100644
--- a/CAT_pack/download.py
+++ b/CAT_pack/download.py
@@ -206,6 +206,30 @@ def load_gtdb_md5sums(md5sums_file):
             fields = [f.strip() for f in line.split()]
             fname = pathlib.Path(fields[1]).name
             md5_dict[fname] = fields[0]
+
+        # As of GTDB release v207 (and in the earlier releases checked), the
+        # files in the "latest" directory (https://data.gtdb.ecogenomic.org/releases/latest/)
+        # are named like "bac120_taxonomy.tsv.gz", whereas the MD5SUM file
+        # in the same directory lists them with a version suffix, like
+        # "bac120_taxonomy_r207.tsv.gz".
+        # So, as an ad hoc workaround, we:
+        #   - get the listed version,
+        #   - check whether it appears in a file name as _r<version>,
+        #   - and, if so, remove that string from our dict keys.
+
+        # getting version
+        gtdb_version = get_gtdb_latest_version()
+        version_string = f"_r{gtdb_version}"
+
+        # iterating over a snapshot of the keys, because entries are renamed
+        # (popped and re-inserted) inside the loop
+        for key in list(md5_dict):
+            if version_string in key:
+
+                # removing version string
+                new_name = key.replace(version_string, "")
+                md5_dict[new_name] = md5_dict.pop(key)
+
     return md5_dict
 
 
@@ -530,6 +554,7 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
     # Check files
     md5sums_file = output_dir / pathlib.Path("MD5SUM")
     md5sums_dict = load_gtdb_md5sums(md5sums_file)
+
     check_gtdb_md5s(output_dir, md5sums_dict, log_file, quiet)
 
     # Concatenate taxonomies

From 38da76e19e6b60f9b15b3f2f1bcc59d51660e262 Mon Sep 17 00:00:00 2001
From: AstrobioMike <michael.lee0517@gmail.com>
Date: Fri, 4 Nov 2022 11:53:47 -0700
Subject: [PATCH 3/3] some input arguments for `CAT prepare` seem to have
 changed; this updates the printed example command to match those changes

---
 CAT_pack/download.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CAT_pack/download.py b/CAT_pack/download.py
index 9f5dc0b..04e9234 100644
--- a/CAT_pack/download.py
+++ b/CAT_pack/download.py
@@ -156,8 +156,8 @@ def process_nr(output_dir, log_file, quiet, prefix, cleanup):
         "--db_fasta {} \\\n"
         "--names {} \\\n"
         "--nodes {} \\\n"
-        "--acc2taxid {} \\\n"
-        "-o path/to/prepare_output\n".format(
+        "--acc2tax {} \\\n"
+        "--db_dir path/to/prepare_output\n".format(
             nr_gz.resolve(),
             names_dmp.resolve(),
             nodes_dmp.resolve(),
@@ -672,8 +672,8 @@ def process_gtdb(output_dir, log_file, quiet, cleanup=False):
         "--db_fasta {} \\\n"
         "--names {} \\\n"
         "--nodes {} \\\n"
-        "--acc2taxid {} \\\n"
-        "-o path/to/prepare_output\n".format(
+        "--acc2tax {} \\\n"
+        "--db_dir path/to/prepare_output\n".format(
             all_seqs_fp.resolve(),
             names_dmp.resolve(),
             nodes_dmp.resolve(),