Merge pull request #13 from AI-sandbox/fixes

Some minor fixes and improvements mostly related to SNP readers
AI-sandbox · Nov 22, 2024 · ffea32c · ffea32c
2 parents 5dd876b + f35e862
commit ffea32c
Show file tree

Hide file tree

Showing 13 changed files with 146 additions and 89 deletions.
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ pip install snputils
 
 Optionally, for GPU-accelerated functionalities, install the package with the `[gpu]` extra:
 ```bash
-pip install snputils[gpu]
+pip install 'snputils[gpu]'
 ```
 
 ## Key Features
@@ -111,6 +111,6 @@ We would like to thank the open-source Python packages that make **snputils** po
 
 If you use **snputils** in your research, please cite:
 
-> Bonet, D.\*, Comajoan Cara, M.\*, Barrabés, M.\*, Smeriglio, R., Agrawal, D., Dominguez Mantes, A., López, C., Thomassin, C., Calafell, A., Luis, A., Saurina, J., Franquesa, M., Perera, M., Geleta, M., Jaras, A., Sabat, B. O., Abante, J., Moreno-Grau, S., Mas Montserrat, D., Ioannidis, A. G., snputils: A Python library for processing diverse genomes. Annual Meeting of The American Society of Human Genetics, November 2024, Denver, Colorado, USA. \* Equal contribution.
+> Bonet, D.\*, Comajoan Cara, M.\*, Barrabés, M.\*, Smeriglio, R., Agrawal, D., Dominguez Mantes, A., López, C., Thomassin, C., Calafell, A., Luis, A., Saurina, J., Franquesa, M., Perera, M., Geleta, M., Jaras, A., Sabat, B. O., Abante, J., Moreno-Grau, S., Mas Montserrat, D., Ioannidis, A. G., snputils: A Python library for processing diverse genomes. Annual Meeting of The American Society of Human Genetics, November 2024, Denver, Colorado, USA. \*Equal contribution.
 
 Journal paper coming soon!
diff --git a/benchmark/run.sh b/benchmark/run.sh
@@ -63,12 +63,26 @@ echo "Will run benchmark for chromosomes: ${CHROMS[*]}"
 # Create directory structure
 mkdir -p ${DATA_DIR}/vcf ${DATA_DIR}/bed ${DATA_DIR}/pgen benchmark/sbatch benchmark/results
 
+# Determine zip file based on OS
+case "$(uname -s)" in
+    Darwin*)
+        zip_file="plink2_mac_arm64_20241114.zip"
+        ;;
+    Linux*)
+        zip_file="plink2_linux_x86_64_20241114.zip"
+        ;;
+    *)
+        echo "Error: You will need to manually download and setup plink2 for your OS"
+        exit 1
+        ;;
+esac
+
 # Download and setup plink2 if not already present
 if [ ! -f ${DATA_DIR}/plink2 ]; then
-    wget -P ${DATA_DIR} https://s3.amazonaws.com/plink2-assets/plink2_linux_x86_64_20241020.zip
-    unzip ${DATA_DIR}/plink2_linux_x86_64_20241020.zip -d ${DATA_DIR}
+    wget -P ${DATA_DIR} https://s3.amazonaws.com/plink2-assets/alpha6/${zip_file}
+    unzip ${DATA_DIR}/${zip_file} -d ${DATA_DIR}
     chmod +x ${DATA_DIR}/plink2
-    rm ${DATA_DIR}/plink2_linux_x86_64_20241020.zip
+    rm ${DATA_DIR}/${zip_file}
 fi
 
 # Base URL for 1000 genomes data

diff --git a/demos/SNPObj.ipynb b/demos/SNPObj.ipynb
@@ -40,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "2b25818b",
    "metadata": {},
    "outputs": [
@@ -58,7 +58,7 @@
    ],
    "source": [
     "# Define the path to the VCF file\n",
-    "query_path = '../data/subset.vcf'\n",
+    "query_path = '../data/vcf/subset.vcf'\n",
     "\n",
     "# Read VCF into SNPObject with the standard reader\n",
     "reader = VCFReader(query_path)\n",
@@ -796,7 +796,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "3ebf53e5",
    "metadata": {},
    "outputs": [
@@ -811,7 +811,7 @@
    ],
    "source": [
     "# Define the path to the VCF file\n",
-    "query_path = '../data/subset.vcf'\n",
+    "query_path = '../data/vcf/subset.vcf'\n",
     "\n",
     "# Read VCF into SNPObject with the standard reader\n",
     "reader = VCFReader(query_path)\n",

diff --git a/demos/SNP_PCA.ipynb b/demos/SNP_PCA.ipynb
diff --git a/demos/TorchPCA.ipynb b/demos/TorchPCA.ipynb
@@ -58,15 +58,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Genotype data shape: (976599, 4, 2)\n",
+      "Genotype data shape: (993881, 4, 2)\n",
       "Genotype data type: int8\n",
       "Samples: ['HG00096' 'HG00097' 'HG00099' 'HG00100']\n"
      ]
     }
    ],
    "source": [
     "# Load genotype data using VCFReader\n",
-    "reader = VCFReader('../data/subset.vcf')\n",
+    "reader = VCFReader('../data/vcf/subset.vcf')\n",
     "snpobj = reader.read(sum_strands=False)\n",
     "\n",
     "# Extract genotype call data (GT) and display shape and data type\n",
@@ -293,7 +293,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "galaxybio",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -307,7 +307,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,

diff --git a/pyproject.toml b/pyproject.toml
@@ -44,8 +44,8 @@ dependencies = [
 
 [project.optional-dependencies]
 gpu = ["torch"]
-jupyter = ["jupyterlab"]
-testing = ["tox", "pytest", "pytest-cov"]
+demos = ["jupyterlab", "seaborn"]
+tests = ["tox", "pytest", "pytest-cov"]
 docs = ["pdoc", "torch"]  # pdoc requires to import all the libraries that are imported in the code
 benchmark = ["pytest", "pytest-benchmark", "memory_profiler",
              "pandas-plink", "pysam", "scikit-allel", "sgkit[plink]", "hail", "pysnptools", "Pgenlib", "cyvcf2", "plinkio", "PyVCF3"]
@@ -83,7 +83,7 @@ passenv =
     CI
     GITHUB_ACTIONS
 extras =
-    testing
+    tests
     gpu
     docs
 commands = python -m pytest -vv --color=yes --cov=snputils --cov-report=xml

diff --git a/snputils/snp/io/read/__test__/conftest.py b/snputils/snp/io/read/__test__/conftest.py
@@ -4,6 +4,7 @@
 import subprocess
 import urllib.request
 import zipfile
+import platform
 
 import pytest
 
@@ -17,14 +18,31 @@ def data_path():
     data_path = module_path / "data"
     os.makedirs(data_path, exist_ok=True)
 
+    system = platform.system()
+    is_arm = platform.machine().lower().startswith(('arm', 'aarch'))
+
+    plink_urls = {
+        ("Darwin", True): "plink2_mac_arm64_20241114.zip",  # macOS ARM
+        ("Linux", False): "plink2_linux_x86_64_20241114.zip",  # Linux x86_64
+    }
+
+    try:
+        plink_filename = plink_urls[(system, is_arm)]
+    except KeyError:
+        raise RuntimeError(f"Unsupported platform: {system} {platform.machine()}")
+
     files_urls = {
-        "plink2_linux_x86_64_20241020.zip": "https://s3.amazonaws.com/plink2-assets/alpha6/plink2_linux_x86_64_20241114.zip",
-        "ALL.chr21.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz": "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr21.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz",
+        plink_filename: f"https://s3.amazonaws.com/plink2-assets/alpha6/{plink_filename}",
+        "vcf/ALL.chr22.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz":
+        "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr22.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz",
     }
 
     # Check and download each file if it does not exist
     for file_name, url in files_urls.items():
         file_path = data_path / file_name
+        dir_path = file_path.parent if file_path.is_file() else file_path
+        if not dir_path.exists():
+            os.makedirs(file_path.parent, exist_ok=True)
         if not file_path.exists():
             print(f"Downloading {file_name} to {data_path}. This may take a while...")
             urllib.request.urlretrieve(url, file_path)
@@ -44,8 +62,9 @@ def data_path():
                 with open(subset_file, "w") as file:
                     file.write("\n".join(four_sample_ids))
 
-            subset_vcf = data_path / "subset.vcf"
-            if not subset_vcf.exists():
+            plink_vcf_out = "vcf/subset"
+            subset_vcf_file = data_path / (plink_vcf_out + ".vcf")
+            if not subset_vcf_file.exists():
                 print("Generating subset VCF...")
                 subprocess.run(
                     [
@@ -57,7 +76,7 @@ def data_path():
                         "--recode",
                         "vcf",
                         "--out",
-                        "subset",
+                        plink_vcf_out,
                     ],
                     cwd=str(data_path),
                 )
@@ -69,15 +88,15 @@ def data_path():
         fmt_file = fmt_path / "subset"
         if not fmt_file.exists():
             print(f"Generating {fmt} format...")
-            make_fmt = "--make-pgen vzs" if fmt == "pgen_zst" else f"--make-{fmt.split('_')[0]}"
+            make_fmt = "--make-pgen vzs" if fmt == "pgen_zst" else f"--make-{fmt}"
             subprocess.run(
                 [
                     "./plink2",
                     "--vcf",
-                    subset_vcf,
+                    subset_vcf_file,
                     *make_fmt.split(),
                     "--out",
-                    f"{fmt}/subset",
+                    fmt_file,
                 ],
                 cwd=str(data_path),
             )
@@ -87,7 +106,7 @@ def data_path():
 
 @pytest.fixture(scope="module")
 def snpobj_vcf(data_path):
-    return VCFReader(data_path + "/subset.vcf").read(sum_strands=False)
+    return VCFReader(data_path + "/vcf/subset.vcf").read(sum_strands=False)
 
 
 @pytest.fixture(scope="module")
@@ -98,18 +117,3 @@ def snpobj_bed(data_path):
 @pytest.fixture(scope="module")
 def snpobj_pgen(data_path):
     return PGENReader(data_path + "/pgen/subset").read(sum_strands=False)
-
-
-@pytest.fixture(scope="module")
-def snpobj_vcf_summed_strands(data_path):
-    return VCFReader(data_path + "/subset.vcf").read(sum_strands=True)
-
-
-@pytest.fixture(scope="module")
-def snpojb_bed_summed_strands(data_path):
-    return BEDReader(data_path + "/bed/subset").read(sum_strands=True)
-
-
-@pytest.fixture(scope="module")
-def snpobj_pgen_summed_strands(data_path):
-    return PGENReader(data_path + "/pgen/subset").read(sum_strands=True)
diff --git a/snputils/snp/io/read/__test__/test_fields.py b/snputils/snp/io/read/__test__/test_fields.py
@@ -0,0 +1,51 @@
+import numpy as np
+
+from snputils import VCFReader, BEDReader, PGENReader
+
+
+# TODO: Fails with KeyError: 'calldata/GT' (genotypes = vcf_dict["calldata/GT"].astype(np.int8))
+# def test_vcf_only_samples(data_path, snpobj_vcf):
+#     snpobj_vcf_only_samples = VCFReader(data_path + "/subset.vcf").read(fields=["IID"])
+#     assert np.array_equal(snpobj_vcf_only_samples.samples, snpobj_vcf.samples)
+
+
+def test_bed_only_samples(data_path, snpobj_bed):
+    snpobj_bed_only_samples = BEDReader(data_path + "/bed/subset").read(fields=["IID"])
+    assert np.array_equal(snpobj_bed_only_samples.samples, snpobj_bed.samples)
+
+
+def test_pgen_only_samples(data_path, snpobj_pgen):
+    snpobj_pgen_only_samples = PGENReader(data_path + "/pgen/subset").read(fields=["IID"])
+    assert np.array_equal(snpobj_pgen_only_samples.samples, snpobj_pgen.samples)
+
+
+# TODO: Fails with KeyError: 'calldata/GT' (genotypes = vcf_dict["calldata/GT"].astype(np.int8))
+# def test_vcf_only_variants(data_path, snpobj_vcf):
+#     snpobj_vcf_only_variants = VCFReader(data_path + "/subset.vcf").read(fields=["ID"])
+#     assert np.array_equal(snpobj_vcf_only_variants.variants_id, snpobj_vcf.variants_id)
+
+
+def test_bed_only_variants(data_path, snpobj_bed):
+    snpobj_bed_only_variants = BEDReader(data_path + "/bed/subset").read(fields=["ID"])
+    assert np.array_equal(snpobj_bed_only_variants.variants_id, snpobj_bed.variants_id)
+
+
+def test_pgen_only_variants(data_path, snpobj_pgen):
+    snpobj_pgen_only_variants = PGENReader(data_path + "/pgen/subset").read(fields=["ID"])
+    assert np.array_equal(snpobj_pgen_only_variants.variants_id, snpobj_pgen.variants_id)
+
+
+# TODO: Fails with KeyError: 'samples' (samples=vcf_dict["samples"],)
+# def test_vcf_only_gt(data_path, snpobj_vcf):
+#     snpobj_vcf_only_gt = VCFReader(data_path + "/subset.vcf").read(fields=["GT"])
+#     assert np.array_equal(snpobj_vcf_only_gt.calldata_gt, snpobj_vcf.calldata_gt)
+
+
+def test_bed_only_gt(data_path, snpobj_bed):
+    snpobj_bed_only_gt = BEDReader(data_path + "/bed/subset").read(fields=["GT"])
+    assert np.array_equal(snpobj_bed_only_gt.calldata_gt, snpobj_bed.calldata_gt)
+
+
+def test_pgen_only_gt(data_path, snpobj_pgen):
+    snpobj_pgen_only_gt = PGENReader(data_path + "/pgen/subset").read(fields=["GT"])
+    assert np.array_equal(snpobj_pgen_only_gt.calldata_gt, snpobj_pgen.calldata_gt)
diff --git a/snputils/snp/io/read/__test__/test_formats.py b/snputils/snp/io/read/__test__/test_formats.py
@@ -1,6 +1,7 @@
 import numpy as np
 from snputils import PGENReader
 
+
 # VCF - BED
 
 def test_vcf_bed_samples(snpobj_vcf, snpobj_bed):

diff --git a/snputils/snp/io/read/__test__/test_phased.py b/snputils/snp/io/read/__test__/test_phased.py
diff --git a/snputils/snp/io/read/__test__/test_summed_strands.py b/snputils/snp/io/read/__test__/test_summed_strands.py
@@ -0,0 +1,18 @@
+import numpy as np
+
+from snputils import VCFReader, BEDReader, PGENReader
+
+
+def test_vcf_summed_strands(snpobj_vcf, data_path):
+    snpobj_vcf_summed_strands = VCFReader(data_path + "/vcf/subset.vcf").read(sum_strands=True)
+    assert np.array_equal(snpobj_vcf.calldata_gt.sum(axis=2, dtype=np.int8), snpobj_vcf_summed_strands.calldata_gt)
+
+
+def test_bed_summed_strands(snpobj_bed, data_path):
+    snpojb_bed_summed_strands = BEDReader(data_path + "/bed/subset").read(sum_strands=True)
+    assert np.array_equal(snpobj_bed.calldata_gt.sum(axis=2, dtype=np.int8), snpojb_bed_summed_strands.calldata_gt)
+
+
+def test_pgen_summed_strands(snpobj_pgen, data_path):
+    snpobj_pgen_summed_strands = PGENReader(data_path + "/pgen/subset").read(sum_strands=True)
+    assert np.array_equal(snpobj_pgen.calldata_gt.sum(axis=2, dtype=np.int8), snpobj_pgen_summed_strands.calldata_gt)
diff --git a/snputils/snp/io/read/bed.py b/snputils/snp/io/read/bed.py
@@ -22,7 +22,6 @@ def read(
         variant_ids: Optional[np.ndarray] = None,
         variant_idxs: Optional[np.ndarray] = None,
         sum_strands: bool = False,
-        n_jobs: int = 1
     ) -> SNPObject:
         """
         Read a bed fileset (bed, bim, fam) into a SNPObject.
@@ -174,12 +173,9 @@ def read(
 
         snpobj = SNPObject(
             calldata_gt=genotypes if "GT" in fields else None,
-            samples=fam.get_column("IID").to_numpy() if "IID" in fields else None,
-            variants_ref=bim.get_column("REF").to_numpy() if "REF" in fields else None,
-            variants_alt=bim.get_column("ALT").to_numpy() if "ALT" in fields else None,
-            variants_chrom=bim.get_column("#CHROM").to_numpy() if "#CHROM" in fields else None,
-            variants_id=bim.get_column("ID").to_numpy() if "ID" in fields else None,
-            variants_pos=bim.get_column("POS").to_numpy() if "POS" in fields else None,
+            samples=fam.get_column("IID").to_numpy() if "IID" in fields and "IID" in fam.columns else None,
+            **{f'variants_{k.lower()}': bim.get_column(v).to_numpy() if v in fields and v in bim.columns else None
+               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS'}.items()}
         )
 
         log.info("Finished constructing SNPObject")

diff --git a/snputils/snp/io/read/pgen.py b/snputils/snp/io/read/pgen.py
@@ -23,7 +23,6 @@ def read(
         variant_ids: Optional[np.ndarray] = None,
         variant_idxs: Optional[np.ndarray] = None,
         sum_strands: bool = False,
-        n_jobs: int = 1
     ) -> SNPObject:
         """
         Read a pgen fileset (pgen, psam, pvar) into a SNPObject.
@@ -248,14 +247,9 @@ def lazy_read(filename: str, **kwargs) -> pl.LazyFrame:
 
         snpobj = SNPObject(
             calldata_gt=genotypes if "GT" in fields else None,
-            samples=psam.get_column("IID").to_numpy() if "IID" in psam.columns and fields else None,
-            variants_ref=pvar.get_column("REF").to_numpy() if "REF" in pvar.columns and fields else None,
-            variants_alt=pvar.get_column("ALT").to_numpy() if "ALT" in pvar.columns and fields else None,
-            variants_chrom=pvar.get_column("#CHROM").to_numpy() if "#CHROM" in pvar.columns and fields else None,
-            variants_id=pvar.get_column("ID").to_numpy() if "ID" in pvar.columns and fields else None,
-            variants_pos=pvar.get_column("POS").to_numpy() if "POS" in pvar.columns and fields else None,
-            variants_filter_pass=pvar.get_column("FILTER").to_numpy() if "FILTER" in pvar.columns and fields else None,
-            variants_qual=pvar.get_column("QUAL").to_numpy() if "QUAL" in pvar.columns and fields else None,
+            samples=psam.get_column("IID").to_numpy() if "IID" in fields and "IID" in psam.columns else None,
+            **{f'variants_{k.lower()}': pvar.get_column(v).to_numpy() if v in fields and v in pvar.columns else None
+               for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS', 'filter_pass': 'FILTER', 'qual': 'QUAL'}.items()}
         )
 
         log.info("Finished constructing SNPObject")