Skip to content

Commit

Permalink
Merge pull request #13 from AI-sandbox/fixes
Browse files Browse the repository at this point in the history
Some minor fixes and improvements mostly related to SNP readers
  • Loading branch information
davidbonet authored Nov 22, 2024
2 parents 5dd876b + f35e862 commit ffea32c
Show file tree
Hide file tree
Showing 13 changed files with 146 additions and 89 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pip install snputils

Optionally, for GPU-accelerated functionalities, install the package with the `[gpu]` extra:
```bash
pip install snputils[gpu]
pip install 'snputils[gpu]'
```

## Key Features
Expand Down Expand Up @@ -111,6 +111,6 @@ We would like to thank the open-source Python packages that make **snputils** po

If you use **snputils** in your research, please cite:

> Bonet, D.\*, Comajoan Cara, M.\*, Barrabés, M.\*, Smeriglio, R., Agrawal, D., Dominguez Mantes, A., López, C., Thomassin, C., Calafell, A., Luis, A., Saurina, J., Franquesa, M., Perera, M., Geleta, M., Jaras, A., Sabat, B. O., Abante, J., Moreno-Grau, S., Mas Montserrat, D., Ioannidis, A. G., snputils: A Python library for processing diverse genomes. Annual Meeting of The American Society of Human Genetics, November 2024, Denver, Colorado, USA. \* Equal contribution.
> Bonet, D.\*, Comajoan Cara, M.\*, Barrabés, M.\*, Smeriglio, R., Agrawal, D., Dominguez Mantes, A., López, C., Thomassin, C., Calafell, A., Luis, A., Saurina, J., Franquesa, M., Perera, M., Geleta, M., Jaras, A., Sabat, B. O., Abante, J., Moreno-Grau, S., Mas Montserrat, D., Ioannidis, A. G., snputils: A Python library for processing diverse genomes. Annual Meeting of The American Society of Human Genetics, November 2024, Denver, Colorado, USA. \*Equal contribution.
Journal paper coming soon!
20 changes: 17 additions & 3 deletions benchmark/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,26 @@ echo "Will run benchmark for chromosomes: ${CHROMS[*]}"
# Create directory structure
mkdir -p ${DATA_DIR}/vcf ${DATA_DIR}/bed ${DATA_DIR}/pgen benchmark/sbatch benchmark/results

# Determine zip file based on OS
case "$(uname -s)" in
Darwin*)
zip_file="plink2_mac_arm64_20241114.zip"
;;
Linux*)
zip_file="plink2_linux_x86_64_20241114.zip"
;;
*)
echo "Error: You will need to manually download and setup plink2 for your OS"
exit 1
;;
esac

# Download and setup plink2 if not already present
if [ ! -f ${DATA_DIR}/plink2 ]; then
wget -P ${DATA_DIR} https://s3.amazonaws.com/plink2-assets/plink2_linux_x86_64_20241020.zip
unzip ${DATA_DIR}/plink2_linux_x86_64_20241020.zip -d ${DATA_DIR}
wget -P ${DATA_DIR} https://s3.amazonaws.com/plink2-assets/alpha6/${zip_file}
unzip ${DATA_DIR}/${zip_file} -d ${DATA_DIR}
chmod +x ${DATA_DIR}/plink2
rm ${DATA_DIR}/plink2_linux_x86_64_20241020.zip
rm ${DATA_DIR}/${zip_file}
fi

# Base URL for 1000 genomes data
Expand Down
8 changes: 4 additions & 4 deletions demos/SNPObj.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "2b25818b",
"metadata": {},
"outputs": [
Expand All @@ -58,7 +58,7 @@
],
"source": [
"# Define the path to the VCF file\n",
"query_path = '../data/subset.vcf'\n",
"query_path = '../data/vcf/subset.vcf'\n",
"\n",
"# Read VCF into SNPObject with the standard reader\n",
"reader = VCFReader(query_path)\n",
Expand Down Expand Up @@ -796,7 +796,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "3ebf53e5",
"metadata": {},
"outputs": [
Expand All @@ -811,7 +811,7 @@
],
"source": [
"# Define the path to the VCF file\n",
"query_path = '../data/subset.vcf'\n",
"query_path = '../data/vcf/subset.vcf'\n",
"\n",
"# Read VCF into SNPObject with the standard reader\n",
"reader = VCFReader(query_path)\n",
Expand Down
32 changes: 12 additions & 20 deletions demos/SNP_PCA.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions demos/TorchPCA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Genotype data shape: (976599, 4, 2)\n",
"Genotype data shape: (993881, 4, 2)\n",
"Genotype data type: int8\n",
"Samples: ['HG00096' 'HG00097' 'HG00099' 'HG00100']\n"
]
}
],
"source": [
"# Load genotype data using VCFReader\n",
"reader = VCFReader('../data/subset.vcf')\n",
"reader = VCFReader('../data/vcf/subset.vcf')\n",
"snpobj = reader.read(sum_strands=False)\n",
"\n",
"# Extract genotype call data (GT) and display shape and data type\n",
Expand Down Expand Up @@ -293,7 +293,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "galaxybio",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -307,7 +307,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ dependencies = [

[project.optional-dependencies]
gpu = ["torch"]
jupyter = ["jupyterlab"]
testing = ["tox", "pytest", "pytest-cov"]
demos = ["jupyterlab", "seaborn"]
tests = ["tox", "pytest", "pytest-cov"]
docs = ["pdoc", "torch"] # pdoc requires to import all the libraries that are imported in the code
benchmark = ["pytest", "pytest-benchmark", "memory_profiler",
"pandas-plink", "pysam", "scikit-allel", "sgkit[plink]", "hail", "pysnptools", "Pgenlib", "cyvcf2", "plinkio", "PyVCF3"]
Expand Down Expand Up @@ -83,7 +83,7 @@ passenv =
CI
GITHUB_ACTIONS
extras =
testing
tests
gpu
docs
commands = python -m pytest -vv --color=yes --cov=snputils --cov-report=xml
Expand Down
52 changes: 28 additions & 24 deletions snputils/snp/io/read/__test__/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import subprocess
import urllib.request
import zipfile
import platform

import pytest

Expand All @@ -17,14 +18,31 @@ def data_path():
data_path = module_path / "data"
os.makedirs(data_path, exist_ok=True)

system = platform.system()
is_arm = platform.machine().lower().startswith(('arm', 'aarch'))

plink_urls = {
("Darwin", True): "plink2_mac_arm64_20241114.zip", # macOS ARM
("Linux", False): "plink2_linux_x86_64_20241114.zip", # Linux x86_64
}

try:
plink_filename = plink_urls[(system, is_arm)]
except KeyError:
raise RuntimeError(f"Unsupported platform: {system} {platform.machine()}")

files_urls = {
"plink2_linux_x86_64_20241020.zip": "https://s3.amazonaws.com/plink2-assets/alpha6/plink2_linux_x86_64_20241114.zip",
"ALL.chr21.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz": "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr21.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz",
plink_filename: f"https://s3.amazonaws.com/plink2-assets/alpha6/{plink_filename}",
"vcf/ALL.chr22.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz":
"https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr22.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz",
}

# Check and download each file if it does not exist
for file_name, url in files_urls.items():
file_path = data_path / file_name
dir_path = file_path.parent if file_path.is_file() else file_path
if not dir_path.exists():
os.makedirs(file_path.parent, exist_ok=True)
if not file_path.exists():
print(f"Downloading {file_name} to {data_path}. This may take a while...")
urllib.request.urlretrieve(url, file_path)
Expand All @@ -44,8 +62,9 @@ def data_path():
with open(subset_file, "w") as file:
file.write("\n".join(four_sample_ids))

subset_vcf = data_path / "subset.vcf"
if not subset_vcf.exists():
plink_vcf_out = "vcf/subset"
subset_vcf_file = data_path / (plink_vcf_out + ".vcf")
if not subset_vcf_file.exists():
print("Generating subset VCF...")
subprocess.run(
[
Expand All @@ -57,7 +76,7 @@ def data_path():
"--recode",
"vcf",
"--out",
"subset",
plink_vcf_out,
],
cwd=str(data_path),
)
Expand All @@ -69,15 +88,15 @@ def data_path():
fmt_file = fmt_path / "subset"
if not fmt_file.exists():
print(f"Generating {fmt} format...")
make_fmt = "--make-pgen vzs" if fmt == "pgen_zst" else f"--make-{fmt.split('_')[0]}"
make_fmt = "--make-pgen vzs" if fmt == "pgen_zst" else f"--make-{fmt}"
subprocess.run(
[
"./plink2",
"--vcf",
subset_vcf,
subset_vcf_file,
*make_fmt.split(),
"--out",
f"{fmt}/subset",
fmt_file,
],
cwd=str(data_path),
)
Expand All @@ -87,7 +106,7 @@ def data_path():

@pytest.fixture(scope="module")
def snpobj_vcf(data_path):
return VCFReader(data_path + "/subset.vcf").read(sum_strands=False)
return VCFReader(data_path + "/vcf/subset.vcf").read(sum_strands=False)


@pytest.fixture(scope="module")
Expand All @@ -98,18 +117,3 @@ def snpobj_bed(data_path):
@pytest.fixture(scope="module")
def snpobj_pgen(data_path):
return PGENReader(data_path + "/pgen/subset").read(sum_strands=False)


@pytest.fixture(scope="module")
def snpobj_vcf_summed_strands(data_path):
return VCFReader(data_path + "/subset.vcf").read(sum_strands=True)


@pytest.fixture(scope="module")
def snpojb_bed_summed_strands(data_path):
return BEDReader(data_path + "/bed/subset").read(sum_strands=True)


@pytest.fixture(scope="module")
def snpobj_pgen_summed_strands(data_path):
return PGENReader(data_path + "/pgen/subset").read(sum_strands=True)
51 changes: 51 additions & 0 deletions snputils/snp/io/read/__test__/test_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import numpy as np

from snputils import VCFReader, BEDReader, PGENReader


# TODO: Fails with KeyError: 'calldata/GT' (genotypes = vcf_dict["calldata/GT"].astype(np.int8))
# def test_vcf_only_samples(data_path, snpobj_vcf):
# snpobj_vcf_only_samples = VCFReader(data_path + "/subset.vcf").read(fields=["IID"])
# assert np.array_equal(snpobj_vcf_only_samples.samples, snpobj_vcf.samples)


def test_bed_only_samples(data_path, snpobj_bed):
snpobj_bed_only_samples = BEDReader(data_path + "/bed/subset").read(fields=["IID"])
assert np.array_equal(snpobj_bed_only_samples.samples, snpobj_bed.samples)


def test_pgen_only_samples(data_path, snpobj_pgen):
snpobj_pgen_only_samples = PGENReader(data_path + "/pgen/subset").read(fields=["IID"])
assert np.array_equal(snpobj_pgen_only_samples.samples, snpobj_pgen.samples)


# TODO: Fails with KeyError: 'calldata/GT' (genotypes = vcf_dict["calldata/GT"].astype(np.int8))
# def test_vcf_only_variants(data_path, snpobj_vcf):
# snpobj_vcf_only_variants = VCFReader(data_path + "/subset.vcf").read(fields=["ID"])
# assert np.array_equal(snpobj_vcf_only_variants.variants_id, snpobj_vcf.variants_id)


def test_bed_only_variants(data_path, snpobj_bed):
snpobj_bed_only_variants = BEDReader(data_path + "/bed/subset").read(fields=["ID"])
assert np.array_equal(snpobj_bed_only_variants.variants_id, snpobj_bed.variants_id)


def test_pgen_only_variants(data_path, snpobj_pgen):
snpobj_pgen_only_variants = PGENReader(data_path + "/pgen/subset").read(fields=["ID"])
assert np.array_equal(snpobj_pgen_only_variants.variants_id, snpobj_pgen.variants_id)


# TODO: Fails with KeyError: 'samples' (samples=vcf_dict["samples"],)
# def test_vcf_only_gt(data_path, snpobj_vcf):
# snpobj_vcf_only_gt = VCFReader(data_path + "/subset.vcf").read(fields=["GT"])
# assert np.array_equal(snpobj_vcf_only_gt.calldata_gt, snpobj_vcf.calldata_gt)


def test_bed_only_gt(data_path, snpobj_bed):
snpobj_bed_only_gt = BEDReader(data_path + "/bed/subset").read(fields=["GT"])
assert np.array_equal(snpobj_bed_only_gt.calldata_gt, snpobj_bed.calldata_gt)


def test_pgen_only_gt(data_path, snpobj_pgen):
snpobj_pgen_only_gt = PGENReader(data_path + "/pgen/subset").read(fields=["GT"])
assert np.array_equal(snpobj_pgen_only_gt.calldata_gt, snpobj_pgen.calldata_gt)
1 change: 1 addition & 0 deletions snputils/snp/io/read/__test__/test_formats.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
from snputils import PGENReader


# VCF - BED

def test_vcf_bed_samples(snpobj_vcf, snpobj_bed):
Expand Down
13 changes: 0 additions & 13 deletions snputils/snp/io/read/__test__/test_phased.py

This file was deleted.

18 changes: 18 additions & 0 deletions snputils/snp/io/read/__test__/test_summed_strands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import numpy as np

from snputils import VCFReader, BEDReader, PGENReader


def test_vcf_summed_strands(snpobj_vcf, data_path):
snpobj_vcf_summed_strands = VCFReader(data_path + "/vcf/subset.vcf").read(sum_strands=True)
assert np.array_equal(snpobj_vcf.calldata_gt.sum(axis=2, dtype=np.int8), snpobj_vcf_summed_strands.calldata_gt)


def test_bed_summed_strands(snpobj_bed, data_path):
snpojb_bed_summed_strands = BEDReader(data_path + "/bed/subset").read(sum_strands=True)
assert np.array_equal(snpobj_bed.calldata_gt.sum(axis=2, dtype=np.int8), snpojb_bed_summed_strands.calldata_gt)


def test_pgen_summed_strands(snpobj_pgen, data_path):
snpobj_pgen_summed_strands = PGENReader(data_path + "/pgen/subset").read(sum_strands=True)
assert np.array_equal(snpobj_pgen.calldata_gt.sum(axis=2, dtype=np.int8), snpobj_pgen_summed_strands.calldata_gt)
10 changes: 3 additions & 7 deletions snputils/snp/io/read/bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def read(
variant_ids: Optional[np.ndarray] = None,
variant_idxs: Optional[np.ndarray] = None,
sum_strands: bool = False,
n_jobs: int = 1
) -> SNPObject:
"""
Read a bed fileset (bed, bim, fam) into a SNPObject.
Expand Down Expand Up @@ -174,12 +173,9 @@ def read(

snpobj = SNPObject(
calldata_gt=genotypes if "GT" in fields else None,
samples=fam.get_column("IID").to_numpy() if "IID" in fields else None,
variants_ref=bim.get_column("REF").to_numpy() if "REF" in fields else None,
variants_alt=bim.get_column("ALT").to_numpy() if "ALT" in fields else None,
variants_chrom=bim.get_column("#CHROM").to_numpy() if "#CHROM" in fields else None,
variants_id=bim.get_column("ID").to_numpy() if "ID" in fields else None,
variants_pos=bim.get_column("POS").to_numpy() if "POS" in fields else None,
samples=fam.get_column("IID").to_numpy() if "IID" in fields and "IID" in fam.columns else None,
**{f'variants_{k.lower()}': bim.get_column(v).to_numpy() if v in fields and v in bim.columns else None
for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS'}.items()}
)

log.info("Finished constructing SNPObject")
Expand Down
12 changes: 3 additions & 9 deletions snputils/snp/io/read/pgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def read(
variant_ids: Optional[np.ndarray] = None,
variant_idxs: Optional[np.ndarray] = None,
sum_strands: bool = False,
n_jobs: int = 1
) -> SNPObject:
"""
Read a pgen fileset (pgen, psam, pvar) into a SNPObject.
Expand Down Expand Up @@ -248,14 +247,9 @@ def lazy_read(filename: str, **kwargs) -> pl.LazyFrame:

snpobj = SNPObject(
calldata_gt=genotypes if "GT" in fields else None,
samples=psam.get_column("IID").to_numpy() if "IID" in psam.columns and fields else None,
variants_ref=pvar.get_column("REF").to_numpy() if "REF" in pvar.columns and fields else None,
variants_alt=pvar.get_column("ALT").to_numpy() if "ALT" in pvar.columns and fields else None,
variants_chrom=pvar.get_column("#CHROM").to_numpy() if "#CHROM" in pvar.columns and fields else None,
variants_id=pvar.get_column("ID").to_numpy() if "ID" in pvar.columns and fields else None,
variants_pos=pvar.get_column("POS").to_numpy() if "POS" in pvar.columns and fields else None,
variants_filter_pass=pvar.get_column("FILTER").to_numpy() if "FILTER" in pvar.columns and fields else None,
variants_qual=pvar.get_column("QUAL").to_numpy() if "QUAL" in pvar.columns and fields else None,
samples=psam.get_column("IID").to_numpy() if "IID" in fields and "IID" in psam.columns else None,
**{f'variants_{k.lower()}': pvar.get_column(v).to_numpy() if v in fields and v in pvar.columns else None
for k, v in {'ref': 'REF', 'alt': 'ALT', 'chrom': '#CHROM', 'id': 'ID', 'pos': 'POS', 'filter_pass': 'FILTER', 'qual': 'QUAL'}.items()}
)

log.info("Finished constructing SNPObject")
Expand Down

0 comments on commit ffea32c

Please sign in to comment.