From e9e9d6540c4e9feaf75db3f326e5ebf25356ee20 Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 19 Sep 2024 09:49:13 +0100 Subject: [PATCH 1/6] Install dask[dataframe] explicitly to fix upstream error --- .github/scripts/upstream_install.py | 2 +- requirements.txt | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/upstream_install.py b/.github/scripts/upstream_install.py index 334795790..87f989d7d 100644 --- a/.github/scripts/upstream_install.py +++ b/.github/scripts/upstream_install.py @@ -14,7 +14,7 @@ def install_deps() -> None: "--upgrade", ) upstream_deps = ( - "git+https://github.com/dask/dask.git#egg=dask[array]", + "git+https://github.com/dask/dask.git#egg=dask[array,dataframe]", "git+https://github.com/dask/distributed.git#egg=distributed", "git+https://github.com/dask/dask-ml.git#egg=dask-ml", "git+https://github.com/pandas-dev/pandas#egg=pandas", diff --git a/requirements.txt b/requirements.txt index dcc24d89b..db170a79f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy < 2 xarray -dask[array] >= 2023.01.0, <= 2024.8.0 +dask[array,dataframe] >= 2023.01.0, <= 2024.8.0 distributed >= 2023.01.0, <= 2024.8.0 dask-ml scipy diff --git a/setup.cfg b/setup.cfg index 8bacbf5f7..42aa1afac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ python_requires = >=3.9 install_requires = numpy < 2 xarray - dask[array] >= 2022.01.0, <= 2024.8.0 + dask[array,dataframe] >= 2022.01.0, <= 2024.8.0 distributed >= 2022.01.0, <= 2024.8.0 dask-ml scipy From 34ac4ee74087b37b44a0a8b4705fb20ecfbc366c Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 19 Sep 2024 09:51:54 +0100 Subject: [PATCH 2/6] Temp change to run on PR --- .github/workflows/upstream.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index c4a663137..48fd60173 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -1,6 +1,7 @@ name: 
Upstream on: + pull_request: push: schedule: - cron: "0 1 * * *" From bc9d699c7b200ada1ba21087105acbdb34ff2f02 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 28 Oct 2024 09:31:39 +0000 Subject: [PATCH 3/6] Change minimum Python version of upstream CI to 3.11 to match Zarr --- .github/workflows/upstream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index 48fd60173..ef3ccfa2d 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11"] + python-version: ["3.11", "3.12"] steps: - uses: actions/checkout@v2 From 54986078bc779ceacd72678fa0c8f23db1a3b851 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 28 Oct 2024 09:32:56 +0000 Subject: [PATCH 4/6] Unpin Dask since slowdown issue was addressed in 2024.10.0 --- requirements-numpy2.txt | 4 ++-- requirements.txt | 4 ++-- setup.cfg | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements-numpy2.txt b/requirements-numpy2.txt index 491e63fb8..16d16f990 100644 --- a/requirements-numpy2.txt +++ b/requirements-numpy2.txt @@ -1,7 +1,7 @@ numpy < 2.1 xarray -dask[array] >= 2023.01.0, <= 2024.8.0 -distributed >= 2023.01.0, <= 2024.8.0 +dask[array] >= 2023.01.0, != 2024.8.1, != 2024.9.* +distributed >= 2023.01.0, != 2024.8.1, != 2024.9.* dask-ml scipy typing-extensions diff --git a/requirements.txt b/requirements.txt index db170a79f..eb117e179 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy < 2 xarray -dask[array,dataframe] >= 2023.01.0, <= 2024.8.0 -distributed >= 2023.01.0, <= 2024.8.0 +dask[array,dataframe] >= 2023.01.0, != 2024.8.1, != 2024.9.* +distributed >= 2023.01.0, != 2024.8.1, != 2024.9.* dask-ml scipy typing-extensions diff --git a/setup.cfg b/setup.cfg index 42aa1afac..fa506621d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,8 +30,8 @@ python_requires = 
>=3.9 install_requires = numpy < 2 xarray - dask[array,dataframe] >= 2022.01.0, <= 2024.8.0 - distributed >= 2022.01.0, <= 2024.8.0 + dask[array,dataframe] >= 2022.01.0, != 2024.8.1, != 2024.9.* + distributed >= 2022.01.0, != 2024.8.1, != 2024.9.* dask-ml scipy zarr >= 2.10.0, != 2.11.0, != 2.11.1, != 2.11.2, < 3 From 5baef495b44ab0417dfe786e125bd6079e5a7323 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 28 Oct 2024 09:41:37 +0000 Subject: [PATCH 5/6] Don't run upstream CI on Python 3.12 due to cbgen incompatibility --- .github/workflows/upstream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index ef3ccfa2d..b3e2a5b4d 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.11", "3.12"] + python-version: ["3.11"] steps: - uses: actions/checkout@v2 From e83b52cdf1ef1b305eefdd8bcaca55b437cc4e4b Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 28 Oct 2024 11:16:53 +0000 Subject: [PATCH 6/6] Fix problem where lambdas wrapping numba functions get recompiled --- sgkit/stats/aggregation.py | 8 ++++---- sgkit/stats/aggregation_numba_fns.py | 6 ++++++ sgkit/stats/popgen.py | 4 +--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index 9360e318c..9f3862985 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -680,7 +680,7 @@ def variant_stats( -------- :func:`count_variant_genotypes` """ - from .aggregation_numba_fns import count_hom + from .aggregation_numba_fns import count_hom_new_axis variables.validate(ds, {call_genotype: variables.call_genotype_spec}) mixed_ploidy = ds[call_genotype].attrs.get("mixed_ploidy", False) @@ -697,7 +697,7 @@ def variant_stats( G = da.asarray(ds[call_genotype].data) H = xr.DataArray( da.map_blocks( - lambda *args: count_hom(*args)[:, np.newaxis, 
:], + count_hom_new_axis, G, np.zeros(3, np.uint64), drop_axis=2, @@ -796,7 +796,7 @@ def sample_stats( ValueError If the dataset contains mixed-ploidy genotype calls. """ - from .aggregation_numba_fns import count_hom + from .aggregation_numba_fns import count_hom_new_axis variables.validate(ds, {call_genotype: variables.call_genotype_spec}) mixed_ploidy = ds[call_genotype].attrs.get("mixed_ploidy", False) @@ -805,7 +805,7 @@ def sample_stats( GT = da.asarray(ds[call_genotype].transpose("samples", "variants", "ploidy").data) H = xr.DataArray( da.map_blocks( - lambda *args: count_hom(*args)[:, np.newaxis, :], + count_hom_new_axis, GT, np.zeros(3, np.uint64), drop_axis=2, diff --git a/sgkit/stats/aggregation_numba_fns.py b/sgkit/stats/aggregation_numba_fns.py index 3335f5457..b84b92a09 100644 --- a/sgkit/stats/aggregation_numba_fns.py +++ b/sgkit/stats/aggregation_numba_fns.py @@ -2,6 +2,8 @@ # in a separate file here, and imported dynamically to avoid # initial compilation overhead. +import numpy as np + from sgkit.accelerate import numba_guvectorize, numba_jit from sgkit.typing import ArrayLike @@ -102,3 +104,7 @@ def count_hom( index = _classify_hom(genotypes[i]) if index >= 0: out[index] += 1 + + +def count_hom_new_axis(genotypes: ArrayLike, _: ArrayLike) -> ArrayLike: + return count_hom(genotypes, _)[:, np.newaxis, :] diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index d000bdbee..e201dfc98 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -595,9 +595,7 @@ def pbs( cohorts = cohorts or list(itertools.combinations(range(n_cohorts), 3)) # type: ignore ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts_0", None)) - p = da.map_blocks( - lambda t: _pbs_cohorts(t, ct), t, chunks=shape, new_axis=3, dtype=np.float64 - ) + p = da.map_blocks(_pbs_cohorts, t, ct, chunks=shape, new_axis=3, dtype=np.float64) assert_array_shape(p, n_windows, n_cohorts, n_cohorts, n_cohorts) new_ds = create_dataset(