earth-chris · PC-FSU · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,7 @@ dmypy.json
 # conda smithy
 build_artifacts
 package/
+
+
+#poetry lock file
+poetry.lock
diff --git a/docs/examples/WorkingWithGeospatialData.ipynb b/docs/examples/WorkingWithGeospatialData.ipynb
diff --git a/elapid/__init__.py b/elapid/__init__.py
@@ -1,5 +1,6 @@
 """User entrypoint to elapid"""
 
+from elapid.evaluate import boyce_index, continuous_boyce_index, get_intervals, plot_PE_curve
 from elapid.features import (
     CategoricalTransformer,
     HingeTransformer,

diff --git a/elapid/evaluate.py b/elapid/evaluate.py
@@ -0,0 +1,189 @@
+import math
+import warnings
+from typing import Dict, List, Tuple, Union
+
+import geopandas as gpd
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from scipy.stats import spearmanr
+
+# implement continuous Boyce index as describe in https://www.whoi.edu/cms/files/hirzel_etal_2006_53457.pdf (Eq.4)
+
+
+def plot_PE_curve(x: Union[np.ndarray, List[float]], y: Union[np.ndarray, List[float]]) -> None:
+    """
+    Plots the Predicted/Expected (P/E) curve for the Boyce Index.
+
+    Args:
+        x (array-like): Habitat suitability values (e.g., interval midpoints).
+        y (array-like): Predicted/Expected ratios corresponding to the habitat suitability intervals.
+
+    Returns:
+        None
+    """
+    plt.figure()
+    plt.plot(x, y, marker="o")
+    plt.xlabel("Habitat suitability")
+    plt.ylabel("Predicted/Expected ratio")
+    plt.title("Boyce Index")
+    plt.show()
+
+
+def get_intervals(
+    nbins: Union[int, List[float], np.ndarray] = 0,
+    bin_size: Union[float, str] = "default",
+    range: Tuple[float, float] = (0, 1),
+) -> np.ndarray:
+    """
+    Generates habitat suitability intervals for the Boyce Index calculation.
+
+    Calculates intervals based on the provided range and either the number of bins or bin size.
+
+    Args:
+        nbins (int or list or np.ndarray, optional): Number of classes or a list of class thresholds. Defaults to 0.
+        bin_size (float or str, optional): Width of the bins. Defaults to 'default', which sets the width as 1/10th of the range.
+        range (tuple or list): Two elements representing the minimum and maximum values of habitat suitability. Default : [0, 1]
+
+    Returns:
+        np.ndarray: An array of intervals, each represented by a pair of lower and upper bounds.
+
+    Raises:
+        ValueError: If invalid values are provided for nbins or bin_size.
+    """
+    mini, maxi = range
+
+    if isinstance(bin_size, float):
+        nbins = (maxi - mini) / bin_size
+        if not nbins.is_integer():
+            warnings.warn(
+                f"bin_size has been adjusted to nearest appropriate size using ceil, as range/bin_size : {(maxi - mini)} / {bin_size} is not an integer.",
+                UserWarning,
+            )
+        nbins = math.ceil(nbins)
+    elif nbins == 0 and bin_size == "default":
+        nbins = 10
+
+    if isinstance(nbins, (list, np.ndarray)):
+        if len(nbins) == 1:
+            raise ValueError("Invalid nbins value. len(nbins) must be > 1")
+        nbins.sort()
+        intervals = np.column_stack((nbins[:-1], nbins[1:]))
+    elif nbins > 1:
+        boundary = np.linspace(mini, maxi, num=nbins + 1)
+        intervals = np.column_stack((boundary[:-1], boundary[1:]))
+    else:
+        raise ValueError("Invalid nbins value. nbins > 1")
+
+    return intervals
+
+
+def boyce_index(yobs: np.ndarray, ypred: np.ndarray, interval: Union[Tuple[float, float], List[float]]) -> float:
+    """
+    Calculates the Boyce index for a given interval.
+
+    Uses the convention as defined in Hirzel et al. 2006 to compute the ratio of observed to expected frequencies.
+
+    Args:
+        yobs (np.ndarray): Suitability values at observed locations (e.g., predictions at presence points).
+        ypred (np.ndarray): Suitability values at random locations (e.g., predictions at background points).
+        interval (tuple or list): Two elements representing the lower and upper bounds of the interval (i.e., habitat suitability).
+
+    Returns:
+        float: The ratio of observed to expected frequencies for the given interval.
+
+    """
+    yobs_bin = (yobs >= interval[0]) & (yobs <= interval[1])
+    ypred_bin = (ypred >= interval[0]) & (ypred <= interval[1])
+
+    pi = np.sum(yobs_bin) / len(yobs_bin)
+    ei = np.sum(ypred_bin) / len(ypred_bin)
+
+    if ei == 0:
+        fi = np.nan  # Avoid division by zero
+    else:
+        fi = pi / ei
+
+    return fi
+
+
+def continuous_boyce_index(
+    yobs: Union[np.ndarray, pd.Series, gpd.GeoSeries],
+    ypred: Union[np.ndarray, pd.Series, gpd.GeoSeries],
+    nbins: Union[int, List[float], np.ndarray] = 0,
+    bin_size: Union[float, str] = "default",
+    to_plot: bool = False,
+) -> Dict[str, Union[np.ndarray, float]]:
+    """
+    Compute the continuous Boyce index to evaluate habitat suitability models.
+
+    Uses the convention as defined in Hirzel et al. 2006 to compute the ratio of observed to expected frequencies.
+
+    Args:
+        yobs (numpy.ndarray | pd.Series | gpd.GeoSeries): Suitability values at observed location (i.e., predictions at presence points).
+        ypred (numpy.ndarray | pd.Series | gpd.GeoSeries): Suitability values at random location (i.e., predictions at background points).
+        nbins (int | list, optional): Number of classes or a list of class thresholds. Defaults to 0.
+        bin_size (float | str, optional): Width of the the bin. Defaults to 'default' which sets width as 1/10th of the fit range.
+        to_plot (bool, optional): Whether to plot the predicted-to-expected (P/E) curve. Defaults to False.
+
+    Returns:
+        dict: A dictionary with the following keys:
+            - 'F.ratio' (numpy.ndarray): The P/E ratio for each bin.
+            - 'Spearman.cor' (float): The Spearman's rank correlation coefficient between interval midpoints and F ratios.
+            - 'HS' (numpy.ndarray): The habitat suitability intervals.
+    """
+    if not isinstance(ypred, (np.ndarray, pd.Series, gpd.GeoSeries)):
+        raise TypeError("The 'ypred' parameter must be a NumPy array, Pandas Series, or GeoPandas GeoSeries.")
+    if not isinstance(yobs, (np.ndarray, pd.Series, gpd.GeoSeries)):
+        raise TypeError("The 'yobs' parameter must be a NumPy array, Pandas Series, or GeoPandas GeoSeries.")
+    if not isinstance(nbins, (int, list, np.ndarray)):
+        raise TypeError("The 'nbins' parameter must be a int, list, or 1-d NumPy array.")
+    if not isinstance(bin_size, (float, str)):
+        raise TypeError("The 'bin_size' parameter must be a float, or str ('default').")
+
+    if isinstance(bin_size, float) and (isinstance(nbins, (list, np.ndarray)) or nbins > 0):
+        raise ValueError(
+            f"Ambiguous value provided. Provide either nbins or bin_size. Cannot provide both. Provided values for nbins, bin_size are: ({nbins, bin_size})"
+        )
+
+    # Check for NaN values and issue warnings
+    if np.isnan(ypred).any():
+        warnings.warn("'ypred' contains NaN values, which will be ignored.", UserWarning)
+        ypred = ypred[~np.isnan(ypred)]
+    if np.isnan(yobs).any():
+        warnings.warn("'yobs' contains NaN values, which will be ignored.", UserWarning)
+        yobs = yobs[~np.isnan(yobs)]
+
+    ypred = np.asarray(ypred)
+    yobs = np.asarray(yobs)
+
+    if ypred.ndim != 1 or yobs.ndim != 1:
+        raise ValueError("Both 'ypred' and 'yobs' must be one-dimensional arrays.")
+
+    if len(ypred) == 0 or len(yobs) == 0:
+        raise ValueError("'ypred' or 'yobs' arrays cannot be empty.")
+
+    mini, maxi = np.min(ypred), np.max(ypred)
+
+    intervals = get_intervals(nbins, bin_size, range=[mini, maxi])
+    f_scores = np.array([boyce_index(yobs, ypred, interval) for interval in intervals])
+
+    valid = ~np.isnan(f_scores)
+    f_valid = f_scores[valid]
+
+    intervals_mid = np.mean(intervals[valid], axis=1)
+    if np.sum(valid) <= 2:
+        corr = np.nan
+    else:
+        corr, _ = spearmanr(f_valid, intervals_mid)
+
+    if to_plot:
+        plot_PE_curve(x=intervals_mid, y=f_valid)
+
+    results = {
+        "F.ratio": f_scores,
+        "Spearman.cor": corr,
+        "HS": intervals,
+    }
+
+    return results
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
@@ -0,0 +1,186 @@
+import geopandas as gpd
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import pytest
+from shapely.geometry import Point
+
+from elapid.evaluate import boyce_index, continuous_boyce_index
+
+
+# Test Case 1: Normal case with random data
+def test_normal_case():
+    np.random.seed(0)
+    ypred = np.random.rand(1000)
+    yobs = np.random.choice(ypred, size=100, replace=False)
+    results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    assert "Spearman.cor" in results
+    assert "F.ratio" in results
+    spearman_cor = results["Spearman.cor"]
+    f_ratio = results["F.ratio"]
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+    assert len(f_ratio) == 10
+    assert not np.any(np.isnan(f_ratio))
+    assert np.all(f_ratio >= 0)
+
+
+# Test Case 2: Edge case with empty 'ypred' array
+def test_empty_ypred():
+    ypred = np.array([])
+    yobs = np.array([0.5, 0.6, 0.7])
+    with pytest.raises(ValueError) as exc_info:
+        continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    assert "'ypred' or 'yobs' arrays cannot be empty." in str(exc_info.value)
+
+
+# Test Case 3: Edge case with empty 'yobs' array
+def test_empty_yobs():
+    ypred = np.random.rand(1000)
+    yobs = np.array([])
+    with pytest.raises(ValueError) as exc_info:
+        continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    assert "'ypred' or 'yobs' arrays cannot be empty." in str(exc_info.value)
+
+
+# Test Case 4: 'yobs' containing NaNs
+def test_yobs_with_nans(recwarn):
+    ypred = np.random.rand(1000)
+    yobs = np.random.choice(ypred, size=100, replace=False)
+    yobs[::10] = np.nan  # Introduce NaNs into 'yobs'
+    results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    # Check for warnings
+    w = recwarn.pop(UserWarning)
+    assert "'yobs' contains NaN values, which will be ignored." in str(w.message)
+    # Ensure function outputs are as expected
+    assert "Spearman.cor" in results
+    spearman_cor = results["Spearman.cor"]
+    if not np.isnan(spearman_cor):
+        assert -1 <= spearman_cor <= 1
+    f_ratio = results["F.ratio"]
+    assert len(f_ratio) == 10
+
+
+# Test Case 5: Invalid 'nbins' value (negative number)
+def test_invalid_nbins():
+    ypred = np.random.rand(1000)
+    yobs = np.random.choice(ypred, size=100, replace=False)
+    with pytest.raises(ValueError):
+        continuous_boyce_index(yobs, ypred, nbins=-5, to_plot=False)
+
+
+# Test Case 6: Custom 'bin_size' value
+def test_custom_bin_size():
+    ypred = np.random.rand(1000)
+    yobs = np.random.choice(ypred, size=100, replace=False)
+    results = continuous_boyce_index(yobs, ypred, bin_size=0.1, to_plot=False)
+    assert "Spearman.cor" in results
+    spearman_cor = results["Spearman.cor"]
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+    f_ratio = results["F.ratio"]
+    assert len(f_ratio) > 0
+
+
+# Test Case 7: 'ypred' containing NaNs
+def test_ypred_with_nans(recwarn):
+    ypred = np.random.rand(1000)
+    ypred[::50] = np.nan  # Introduce NaNs into 'ypred'
+    yobs = np.random.choice(ypred[~np.isnan(ypred)], size=100, replace=False)
+    results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    # Check for warnings
+    w = recwarn.pop(UserWarning)
+    assert "'ypred' contains NaN values, which will be ignored." in str(w.message)
+    # Ensure function outputs are as expected
+    assert "Spearman.cor" in results
+    spearman_cor = results["Spearman.cor"]
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+    f_ratio = results["F.ratio"]
+    assert len(f_ratio) == 10
+
+
+# Test Case 8: 'yobs' values outside the range of 'ypred'
+def test_yobs_outside_ypred_range():
+    ypred = np.random.rand(1000)
+    yobs = np.array([1.5, 2.0, 2.5])  # Values outside the range [0, 1]
+    results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    assert "Spearman.cor" in results
+    spearman_cor = results["Spearman.cor"]
+    # Spearman correlation may be NaN due to insufficient valid data
+    assert np.isnan(spearman_cor) or -1 <= spearman_cor <= 1
+    f_ratio = results["F.ratio"]
+    assert len(f_ratio) == 10
+
+
+# Test Case 9: Large dataset
+def test_large_dataset():
+    ypred = np.random.rand(1000000)
+    yobs = np.random.choice(ypred, size=10000, replace=False)
+    results = continuous_boyce_index(yobs, ypred, nbins=20, to_plot=False)
+    assert "Spearman.cor" in results
+    spearman_cor = results["Spearman.cor"]
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+    f_ratio = results["F.ratio"]
+    assert len(f_ratio) == 20
+
+
+# Test Case 10: Using Pandas Series
+def test_with_pandas_series():
+    np.random.seed(0)
+    ypred = pd.Series(np.random.rand(1000))
+    yobs = ypred.sample(n=100, replace=False)
+    results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    assert "Spearman.cor" in results
+    spearman_cor = results["Spearman.cor"]
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+
+
+# Test Case 11: Using GeoPandas GeoSeries
+def test_with_geopandas_geoseries():
+    np.random.seed(0)
+    num_points = 1000
+    x = np.random.uniform(-180, 180, num_points)
+    y = np.random.uniform(-90, 90, num_points)
+    suitability = np.random.rand(num_points)
+    geometry = [Point(xy) for xy in zip(x, y)]
+    gdf = gpd.GeoDataFrame({"suitability": suitability}, geometry=geometry)
+
+    ypred = gdf["suitability"]  # This is a Pandas Series
+    yobs = ypred.sample(n=100, replace=False)
+    results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    assert "Spearman.cor" in results
+    spearman_cor = results["Spearman.cor"]
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+
+
+# Test Case 12: Both 'ypred' and 'yobs' containing NaNs
+def test_both_ypred_yobs_with_nans(recwarn):
+    ypred = np.random.rand(1000)
+    ypred[::50] = np.nan  # Introduce NaNs into 'ypred'
+    yobs = np.random.choice(ypred, size=100, replace=False)
+    yobs[::10] = np.nan  # Introduce NaNs into 'yobs'
+    results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    # Check for multiple warnings
+    warnings_list = [str(w.message) for w in recwarn.list]
+    assert "'ypred' contains NaN values, which will be ignored." in warnings_list
+    assert "'yobs' contains NaN values, which will be ignored." in warnings_list
+    # Ensure function outputs are as expected
+    assert "Spearman.cor" in results
+    spearman_cor = results["Spearman.cor"]
+    if not np.isnan(spearman_cor):
+        assert -1 <= spearman_cor <= 1
+    f_ratio = results["F.ratio"]
+    assert len(f_ratio) == 10
+
+
+# Test Case 13: Empty arrays after removing NaNs
+def test_empty_arrays_after_nan_removal():
+    ypred = np.array([np.nan, np.nan])
+    yobs = np.array([np.nan, np.nan])
+    with pytest.raises(ValueError) as exc_info:
+        continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
+    assert "'ypred' or 'yobs' arrays cannot be empty." in str(exc_info.value)