earth-chris · PC-FSU · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,7 @@ dmypy.json
 # conda smithy
 build_artifacts
 package/
+
+
+#poetry lock file
+poetry.lock
diff --git a/docs/examples/WorkingWithGeospatialData.ipynb b/docs/examples/WorkingWithGeospatialData.ipynb
diff --git a/elapid/__init__.py b/elapid/__init__.py
@@ -26,4 +26,5 @@
 from elapid.stats import normalize_sample_probabilities
 from elapid.train_test_split import BufferedLeaveOneOut, GeographicKFold, checkerboard_split
 from elapid.utils import download_sample_data, load_object, load_sample_data, save_object
+from elapid.evaluate import boycei, boyce_index
 from elapid.version import __version__
diff --git a/elapid/evaluate.py b/elapid/evaluate.py
@@ -0,0 +1,180 @@
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from scipy.stats import spearmanr
+
+
+# Implements the Boyce index as described in https://www.whoi.edu/cms/files/hirzel_etal_2006_53457.pdf (Eq. 4)
+
+
+def boycei(interval, obs, fit):
+    """
+    Compute the predicted-to-expected (P/E) ratio of the Boyce index for one interval.
+
+    Args:
+        interval (tuple or list): Lower and upper bounds of the interval; both bounds are inclusive.
+        obs (numpy.ndarray): Suitability values predicted at presence points.
+        fit (numpy.ndarray): Suitability values predicted at presence + background points (e.g., from a raster).
+
+    Returns:
+        float: Share of `obs` inside the interval divided by the share of `fit` inside it,
+            or NaN when no `fit` value falls inside the interval.
+    """
+    lower = interval[0]
+    upper = interval[1]
+    # Membership masks: True where a value lies inside the closed interval
+    inside_fit = (fit >= lower) & (fit <= upper)
+    inside_obs = (obs >= lower) & (obs <= upper)
+
+    predicted = np.sum(inside_obs) / len(inside_obs)
+    expected = np.sum(inside_fit) / len(inside_fit)
+
+    # An interval holding no fit value has no defined ratio; NaN avoids dividing by zero
+    if expected == 0:
+        return np.nan
+    return predicted / expected
+
+
+def boyce_index(fit, obs, nclass=0, window="default", res=100, PEplot=False):
+    """
+    Compute the continuous Boyce index to evaluate habitat suitability models.
+
+    The Boyce index measures how well a presence-only model predicts species presence by
+    comparing the suitability predicted at presence points with the suitability predicted
+    at presence + background points.
+
+    The suitability range is split into intervals and, for each interval, the
+    predicted-to-expected (P/E, or F) ratio is computed with `boycei`. The index is the
+    Spearman rank correlation between the interval midpoints and their F ratios.
+
+    Index ranges from -1 to +1:
+    - Positive values: Model predictions align with actual species presence distribution.
+    - Values near zero: Model performs similarly to random predictions.
+    - Negative values: Model predicts low suitability where species are more frequently found.
+
+    This calculation is based on the continuous Boyce index (Eq. 4) as defined in Hirzel et al. 2006.
+
+    Args:
+        fit (numpy.ndarray | pd.Series | gpd.GeoSeries): Suitability values (e.g., predictions at presence + background points).
+        obs (numpy.ndarray | pd.Series | gpd.GeoSeries): Observed suitability values, i.e., predictions at presence points.
+        nclass (int | list | tuple | numpy.ndarray, optional): Number of equal-width classes, or a sequence of
+            at least two class thresholds lying within the range of `fit`. The sequence is not modified.
+            Defaults to 0, which selects the moving-window mode.
+        window (float | str, optional): Width of the moving window, used only if nclass=0.
+            Defaults to 'default', which sets the window to 1/10th of the fit range.
+        res (int, optional): Resolution, i.e., number of steps if nclass=0. Defaults to 100.
+        PEplot (bool, optional): Whether to plot the predicted-to-expected (P/E) curve. Defaults to False.
+
+    Returns:
+        dict: A dictionary with the following keys:
+            - 'F.ratio' (numpy.ndarray): The P/E ratio for each interval (NaN where no fit value falls in it).
+            - 'Spearman.cor' (float): Spearman's rank correlation between interval midpoints and F ratios,
+              rounded to 3 decimals, or NaN when fewer than three intervals have a valid F ratio.
+            - 'HS' (numpy.ndarray): The habitat suitability intervals, one (lower, upper) row per interval.
+
+    Raises:
+        TypeError: If `fit` or `obs` is not a NumPy array, Pandas Series, or GeoPandas GeoSeries.
+        ValueError: If `fit` or `obs` is not one-dimensional or is empty after removing NaNs, or if
+            `nclass` is negative, has fewer than two thresholds, or has thresholds outside the fit range.
+
+    Example:
+        # Predicted suitability scores (e.g., predictions at presence + background points)
+        predicted = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
+
+        # Observed presence suitability scores (e.g., predictions at presence points)
+        observed = np.array([0.3, 0.7, 0.8, 0.9])
+
+        # Call the boyce_index function to calculate the Boyce index and Spearman correlation
+        results = boyce_index(fit=predicted, obs=observed, nclass=3)
+        print(results)
+
+        # Output:
+        # {'F.ratio': array([0.625, 0.625, 1.875]),
+        #  'Spearman.cor': 0.866,
+        #  'HS': array([[0.1 , 0.4 ],
+        #               [0.4 , 0.7 ],
+        #               [0.7 , 1.  ]])}
+    """
+
+    # Check types of fit and obs
+    acceptable_types = (np.ndarray, pd.Series, gpd.GeoSeries)
+    if not isinstance(fit, acceptable_types):
+        raise TypeError("The 'fit' parameter must be a NumPy array, Pandas Series, or GeoPandas GeoSeries.")
+    if not isinstance(obs, acceptable_types):
+        raise TypeError("The 'obs' parameter must be a NumPy array, Pandas Series, or GeoPandas GeoSeries.")
+
+    # Convert inputs to NumPy arrays
+    fit = np.asarray(fit)
+    obs = np.asarray(obs)
+
+    # Ensure fit and obs are one-dimensional arrays
+    if fit.ndim != 1 or obs.ndim != 1:
+        raise ValueError("Both 'fit' and 'obs' must be one-dimensional arrays.")
+
+    # Remove NaNs from fit and obs (done once; every later step works on the cleaned arrays)
+    fit = fit[~np.isnan(fit)]
+    obs = obs[~np.isnan(obs)]
+
+    if len(fit) == 0 or len(obs) == 0:
+        raise ValueError("After removing NaNs, 'fit' or 'obs' arrays cannot be empty.")
+
+    mini = np.min(fit)
+    maxi = np.max(fit)
+
+    # Thresholds are detected first: comparing an array to 0 with `==` is element-wise and
+    # cannot be used as an `if` condition.
+    if isinstance(nclass, (list, tuple, np.ndarray)):
+        # Sort a copy so the caller's sequence is left untouched
+        thresholds = np.sort(np.asarray(nclass, dtype=float).ravel())
+        if len(thresholds) < 2:
+            raise ValueError("Invalid nclass value: a sequence of thresholds needs at least two elements.")
+        if mini > thresholds[0] or maxi < thresholds[-1]:
+            raise ValueError(
+                f"The range provided via nclass is ({thresholds[0]}, {thresholds[-1]}), "
+                f"but it must lie within the range of fit, ({mini}, {maxi})."
+            )
+        vec_mov = np.concatenate(([mini], thresholds))
+        intervals = np.column_stack((vec_mov[:-1], vec_mov[1:]))
+    elif nclass == 0:
+        # Moving window of fixed width sliding from the minimum to the maximum of fit
+        if isinstance(window, str) and window == "default":
+            window = (maxi - mini) / 10.0
+        vec_mov = np.linspace(mini, maxi - window, num=res + 1)
+        intervals = np.column_stack((vec_mov, vec_mov + window))
+    elif nclass > 0:
+        # Equal-width, non-overlapping classes
+        vec_mov = np.linspace(mini, maxi, num=nclass + 1)
+        intervals = np.column_stack((vec_mov[:-1], vec_mov[1:]))
+    else:
+        raise ValueError("Invalid nclass value.")
+
+    # Apply boycei function to each interval
+    f = np.array([boycei(inter, obs, fit) for inter in intervals])
+
+    # Intervals without any fit value have an undefined (NaN) ratio and are excluded
+    valid = ~np.isnan(f)
+    f_valid = f[valid]
+
+    # use interval midpoints to calculate the spearmanr coeff.
+    intervals_mid = np.mean(intervals[valid], axis=1)
+    if np.sum(valid) <= 2:
+        corr = np.nan
+    else:
+        corr, _ = spearmanr(f_valid, intervals_mid)
+
+    if PEplot:
+        plt.figure()
+        plt.plot(intervals_mid, f_valid, marker='o')
+        plt.xlabel('Habitat suitability')
+        plt.ylabel('Predicted/Expected ratio')
+        plt.title('Boyce Index')
+        plt.show()
+
+    results = {
+        'F.ratio': f,
+        'Spearman.cor': round(corr, 3) if not np.isnan(corr) else np.nan,
+        'HS': intervals,
+    }
+    return results
+
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
@@ -0,0 +1,150 @@
+import numpy as np
+import pytest
+import matplotlib.pyplot as plt
+from elapid.evaluate import boycei, boyce_index
+import pandas as pd
+import geopandas as gpd
+from shapely.geometry import Point
+
+
+
+# Test Case 1: Normal case with random data
+def test_normal_case():
+    """Equal-width classes on seeded uniform data yield a finite index and one ratio per class."""
+    np.random.seed(0)
+    suitability = np.random.rand(1000)
+    presences = np.random.choice(suitability, size=100, replace=False)
+    results = boyce_index(suitability, presences, nclass=10, PEplot=False)
+    for key in ('Spearman.cor', 'F.ratio'):
+        assert key in results
+    ratios = results['F.ratio']
+    index = results['Spearman.cor']
+    assert not np.isnan(index)
+    assert -1 <= index <= 1
+    assert len(ratios) == 10
+    assert not np.any(np.isnan(ratios)) and np.all(ratios >= 0)
+
+# Test Case 2: Edge case with empty 'fit' array
+def test_empty_fit():
+    """An empty 'fit' array is rejected with a ValueError."""
+    presences = np.array([0.5, 0.6, 0.7])
+    with pytest.raises(ValueError):
+        boyce_index(np.array([]), presences, nclass=10, PEplot=False)
+
+# Test Case 3: Edge case with empty 'obs' array
+def test_empty_obs():
+    """An empty 'obs' array is rejected with the dedicated error message."""
+    expected_message = "After removing NaNs, 'fit' or 'obs' arrays cannot be empty."
+    with pytest.raises(ValueError) as exc_info:
+        boyce_index(np.random.rand(1000), np.array([]), nclass=10, PEplot=False)
+    assert expected_message in str(exc_info.value)
+
+# Test Case 4: 'obs' containing NaNs
+def test_obs_with_nans():
+    """NaNs in 'obs' are dropped before binning, so every class still gets a ratio."""
+    np.random.seed(0)  # fixed seed keeps the test reproducible
+    fit = np.random.rand(1000)
+    obs = np.random.choice(fit, size=100, replace=False)
+    obs[::10] = np.nan  # Introduce NaNs into 'obs'
+    results = boyce_index(fit, obs, nclass=10, PEplot=False)
+    assert 'Spearman.cor' in results  # check the key before reading it
+    spearman_cor = results['Spearman.cor']
+    assert np.isnan(spearman_cor) or -1 <= spearman_cor <= 1
+    assert len(results['F.ratio']) == 10
+
+# Test Case 5: Invalid 'nclass' value (negative number)
+def test_invalid_nclass():
+    """A negative 'nclass' is rejected with a ValueError."""
+    suitability = np.random.rand(1000)
+    with pytest.raises(ValueError):
+        boyce_index(suitability, np.random.choice(suitability, size=100, replace=False), nclass=-5, PEplot=False)
+
+# Test Case 6: Custom 'window' value
+def test_custom_window():
+    """A user-supplied window width drives the moving-window mode (nclass left at 0)."""
+    np.random.seed(0)  # fixed seed: the assertions below must not depend on the draw
+    fit = np.random.rand(1000)
+    obs = np.random.choice(fit, size=100, replace=False)
+    results = boyce_index(fit, obs, window=0.1, PEplot=False)
+    spearman_cor = results['Spearman.cor']
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+    assert len(results['F.ratio']) == 101  # default res=100 gives res + 1 window positions
+
+# Test Case 7: 'PEplot' set to True
+def test_peplot_true(monkeypatch):
+    """PEplot=True draws the P/E curve; plt.show is stubbed so no window opens or blocks."""
+    monkeypatch.setattr(plt, "show", lambda *args, **kwargs: None)
+    np.random.seed(0)
+    fit = np.random.rand(1000)
+    obs = np.random.choice(fit, size=100, replace=False)
+    results = boyce_index(fit, obs, nclass=10, PEplot=True)
+    plt.close('all')  # release the figure before asserting so a failure cannot leak it
+    assert not np.isnan(results['Spearman.cor']) and -1 <= results['Spearman.cor'] <= 1
+
+# Test Case 8: 'fit' containing NaNs
+def test_fit_with_nans():
+    """NaNs in 'fit' are dropped before binning, so every class still gets a ratio."""
+    np.random.seed(0)  # fixed seed keeps the test reproducible
+    fit = np.random.rand(1000)
+    fit[::50] = np.nan  # Introduce NaNs into 'fit'
+    obs = np.random.choice(fit[~np.isnan(fit)], size=100, replace=False)
+    results = boyce_index(fit, obs, nclass=10, PEplot=False)
+    assert 'Spearman.cor' in results
+    spearman_cor = results['Spearman.cor']
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+    assert len(results['F.ratio']) == 10
+
+# Test Case 9: 'obs' values outside the range of 'fit'
+def test_obs_outside_fit_range():
+    """Presences outside the range of 'fit' fall in no class, so every ratio is zero."""
+    np.random.seed(0)
+    fit = np.random.rand(1000)
+    obs = np.array([1.5, 2.0, 2.5])  # Values outside the range [0, 1]
+    results = boyce_index(fit, obs, nclass=10, PEplot=False)
+    assert 'Spearman.cor' in results
+    assert np.isnan(results['Spearman.cor']) or -1 <= results['Spearman.cor'] <= 1
+    assert len(results['F.ratio']) == 10 and np.all(results['F.ratio'] == 0)
+
+# Test Case 10: Large dataset
+def test_large_dataset():
+    """A million fit values and ten thousand presences are handled with 20 classes."""
+    np.random.seed(0)  # fixed seed keeps the test reproducible
+    fit = np.random.rand(1000000)
+    obs = np.random.choice(fit, size=10000, replace=False)
+    results = boyce_index(fit, obs, nclass=20, PEplot=False)
+    spearman_cor = results['Spearman.cor']
+    assert not np.isnan(spearman_cor)
+    assert -1 <= spearman_cor <= 1
+    assert len(results['F.ratio']) == 20
+
+
+# Test Case 11: Using Pandas Series
+def test_with_pandas_series():
+    """Pandas Series inputs are accepted for both 'fit' and 'obs'."""
+    np.random.seed(0)
+    suitability = pd.Series(np.random.rand(1000))
+    presences = suitability.sample(n=100, replace=False)
+    results = boyce_index(suitability, presences, nclass=10, PEplot=False)
+    assert 'Spearman.cor' in results
+    index = results['Spearman.cor']
+    assert not np.isnan(index) and -1 <= index <= 1
+
+# Test Case 12: Using GeoPandas GeoSeries
+def test_with_geopandas_geoseries():
+    """A numeric column of a GeoDataFrame (a plain Pandas Series) is accepted as input."""
+    np.random.seed(0)
+    num_points = 1000
+    lon = np.random.uniform(-180, 180, num_points)
+    lat = np.random.uniform(-90, 90, num_points)
+    suitability = np.random.rand(num_points)
+    points = [Point(xy) for xy in zip(lon, lat)]
+    gdf = gpd.GeoDataFrame({'suitability': suitability}, geometry=points)
+
+    fit = gdf['suitability']  # a plain Pandas Series, not a GeoSeries
+    obs = fit.sample(n=100, replace=False)
+    results = boyce_index(fit, obs, nclass=10, PEplot=False)
+    assert 'Spearman.cor' in results
+    index = results['Spearman.cor']
+    assert not np.isnan(index) and -1 <= index <= 1