Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Boyce index code #100

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,7 @@ dmypy.json
# conda smithy
build_artifacts
package/


#poetry lock file
poetry.lock
58 changes: 57 additions & 1 deletion docs/examples/WorkingWithGeospatialData.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions elapid/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""User entrypoint to elapid"""

from elapid.evaluate import boyce_index, continuous_boyce_index, get_intervals, plot_PE_curve
from elapid.features import (
CategoricalTransformer,
HingeTransformer,
Expand Down
189 changes: 189 additions & 0 deletions elapid/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import math
import warnings
from typing import Dict, List, Tuple, Union

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

# implement continuous Boyce index as describe in https://www.whoi.edu/cms/files/hirzel_etal_2006_53457.pdf (Eq.4)


def plot_PE_curve(x: Union[np.ndarray, List[float]], y: Union[np.ndarray, List[float]]) -> None:
"""
Plots the Predicted/Expected (P/E) curve for the Boyce Index.

Args:
x (array-like): Habitat suitability values (e.g., interval midpoints).
y (array-like): Predicted/Expected ratios corresponding to the habitat suitability intervals.

Returns:
None
"""
plt.figure()
plt.plot(x, y, marker="o")
plt.xlabel("Habitat suitability")
plt.ylabel("Predicted/Expected ratio")
plt.title("Boyce Index")
plt.show()


def get_intervals(
nbins: Union[int, List[float], np.ndarray] = 0,
bin_size: Union[float, str] = "default",
range: Tuple[float, float] = (0, 1),
) -> np.ndarray:
"""
Generates habitat suitability intervals for the Boyce Index calculation.

Calculates intervals based on the provided range and either the number of bins or bin size.

Args:
nbins (int or list or np.ndarray, optional): Number of classes or a list of class thresholds. Defaults to 0.
bin_size (float or str, optional): Width of the bins. Defaults to 'default', which sets the width as 1/10th of the range.
range (tuple or list): Two elements representing the minimum and maximum values of habitat suitability. Default : [0, 1]

Returns:
np.ndarray: An array of intervals, each represented by a pair of lower and upper bounds.

Raises:
ValueError: If invalid values are provided for nbins or bin_size.
"""
mini, maxi = range

if isinstance(bin_size, float):
nbins = (maxi - mini) / bin_size
if not nbins.is_integer():
warnings.warn(
f"bin_size has been adjusted to nearest appropriate size using ceil, as range/bin_size : {(maxi - mini)} / {bin_size} is not an integer.",
UserWarning,
)
nbins = math.ceil(nbins)
elif nbins == 0 and bin_size == "default":
nbins = 10

if isinstance(nbins, (list, np.ndarray)):
if len(nbins) == 1:
raise ValueError("Invalid nbins value. len(nbins) must be > 1")
nbins.sort()
intervals = np.column_stack((nbins[:-1], nbins[1:]))
elif nbins > 1:
boundary = np.linspace(mini, maxi, num=nbins + 1)
intervals = np.column_stack((boundary[:-1], boundary[1:]))
else:
raise ValueError("Invalid nbins value. nbins > 1")

return intervals


def boyce_index(yobs: np.ndarray, ypred: np.ndarray, interval: Union[Tuple[float, float], List[float]]) -> float:
"""
Calculates the Boyce index for a given interval.

Uses the convention as defined in Hirzel et al. 2006 to compute the ratio of observed to expected frequencies.

Args:
yobs (np.ndarray): Suitability values at observed locations (e.g., predictions at presence points).
ypred (np.ndarray): Suitability values at random locations (e.g., predictions at background points).
interval (tuple or list): Two elements representing the lower and upper bounds of the interval (i.e., habitat suitability).

Returns:
float: The ratio of observed to expected frequencies for the given interval.

"""
yobs_bin = (yobs >= interval[0]) & (yobs <= interval[1])
ypred_bin = (ypred >= interval[0]) & (ypred <= interval[1])

pi = np.sum(yobs_bin) / len(yobs_bin)
ei = np.sum(ypred_bin) / len(ypred_bin)

if ei == 0:
fi = np.nan # Avoid division by zero
else:
fi = pi / ei

return fi


def continuous_boyce_index(
yobs: Union[np.ndarray, pd.Series, gpd.GeoSeries],
ypred: Union[np.ndarray, pd.Series, gpd.GeoSeries],
nbins: Union[int, List[float], np.ndarray] = 0,
bin_size: Union[float, str] = "default",
to_plot: bool = False,
) -> Dict[str, Union[np.ndarray, float]]:
"""
Compute the continuous Boyce index to evaluate habitat suitability models.

Uses the convention as defined in Hirzel et al. 2006 to compute the ratio of observed to expected frequencies.

Args:
yobs (numpy.ndarray | pd.Series | gpd.GeoSeries): Suitability values at observed location (i.e., predictions at presence points).
ypred (numpy.ndarray | pd.Series | gpd.GeoSeries): Suitability values at random location (i.e., predictions at background points).
nbins (int | list, optional): Number of classes or a list of class thresholds. Defaults to 0.
bin_size (float | str, optional): Width of the the bin. Defaults to 'default' which sets width as 1/10th of the fit range.
to_plot (bool, optional): Whether to plot the predicted-to-expected (P/E) curve. Defaults to False.

Returns:
dict: A dictionary with the following keys:
- 'F.ratio' (numpy.ndarray): The P/E ratio for each bin.
- 'Spearman.cor' (float): The Spearman's rank correlation coefficient between interval midpoints and F ratios.
- 'HS' (numpy.ndarray): The habitat suitability intervals.
"""
if not isinstance(ypred, (np.ndarray, pd.Series, gpd.GeoSeries)):
raise TypeError("The 'ypred' parameter must be a NumPy array, Pandas Series, or GeoPandas GeoSeries.")
if not isinstance(yobs, (np.ndarray, pd.Series, gpd.GeoSeries)):
raise TypeError("The 'yobs' parameter must be a NumPy array, Pandas Series, or GeoPandas GeoSeries.")
if not isinstance(nbins, (int, list, np.ndarray)):
raise TypeError("The 'nbins' parameter must be a int, list, or 1-d NumPy array.")
if not isinstance(bin_size, (float, str)):
raise TypeError("The 'bin_size' parameter must be a float, or str ('default').")

if isinstance(bin_size, float) and (isinstance(nbins, (list, np.ndarray)) or nbins > 0):
raise ValueError(
f"Ambiguous value provided. Provide either nbins or bin_size. Cannot provide both. Provided values for nbins, bin_size are: ({nbins, bin_size})"
)

# Check for NaN values and issue warnings
if np.isnan(ypred).any():
warnings.warn("'ypred' contains NaN values, which will be ignored.", UserWarning)
ypred = ypred[~np.isnan(ypred)]
if np.isnan(yobs).any():
warnings.warn("'yobs' contains NaN values, which will be ignored.", UserWarning)
yobs = yobs[~np.isnan(yobs)]

ypred = np.asarray(ypred)
yobs = np.asarray(yobs)

if ypred.ndim != 1 or yobs.ndim != 1:
raise ValueError("Both 'ypred' and 'yobs' must be one-dimensional arrays.")

if len(ypred) == 0 or len(yobs) == 0:
raise ValueError("'ypred' or 'yobs' arrays cannot be empty.")

mini, maxi = np.min(ypred), np.max(ypred)

intervals = get_intervals(nbins, bin_size, range=[mini, maxi])
f_scores = np.array([boyce_index(yobs, ypred, interval) for interval in intervals])

valid = ~np.isnan(f_scores)
f_valid = f_scores[valid]

intervals_mid = np.mean(intervals[valid], axis=1)
if np.sum(valid) <= 2:
corr = np.nan
else:
corr, _ = spearmanr(f_valid, intervals_mid)

if to_plot:
plot_PE_curve(x=intervals_mid, y=f_valid)

results = {
"F.ratio": f_scores,
"Spearman.cor": corr,
"HS": intervals,
}

return results
186 changes: 186 additions & 0 deletions tests/test_evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytest
from shapely.geometry import Point

from elapid.evaluate import boyce_index, continuous_boyce_index


# Test Case 1: Normal case with random data
def test_normal_case():
np.random.seed(0)
ypred = np.random.rand(1000)
yobs = np.random.choice(ypred, size=100, replace=False)
results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
assert "Spearman.cor" in results
assert "F.ratio" in results
spearman_cor = results["Spearman.cor"]
f_ratio = results["F.ratio"]
assert not np.isnan(spearman_cor)
assert -1 <= spearman_cor <= 1
assert len(f_ratio) == 10
assert not np.any(np.isnan(f_ratio))
assert np.all(f_ratio >= 0)


# Test Case 2: Edge case with empty 'ypred' array
def test_empty_ypred():
ypred = np.array([])
yobs = np.array([0.5, 0.6, 0.7])
with pytest.raises(ValueError) as exc_info:
continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
assert "'ypred' or 'yobs' arrays cannot be empty." in str(exc_info.value)


# Test Case 3: Edge case with empty 'yobs' array
def test_empty_yobs():
ypred = np.random.rand(1000)
yobs = np.array([])
with pytest.raises(ValueError) as exc_info:
continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
assert "'ypred' or 'yobs' arrays cannot be empty." in str(exc_info.value)


# Test Case 4: 'yobs' containing NaNs
def test_yobs_with_nans(recwarn):
ypred = np.random.rand(1000)
yobs = np.random.choice(ypred, size=100, replace=False)
yobs[::10] = np.nan # Introduce NaNs into 'yobs'
results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
# Check for warnings
w = recwarn.pop(UserWarning)
assert "'yobs' contains NaN values, which will be ignored." in str(w.message)
# Ensure function outputs are as expected
assert "Spearman.cor" in results
spearman_cor = results["Spearman.cor"]
if not np.isnan(spearman_cor):
assert -1 <= spearman_cor <= 1
f_ratio = results["F.ratio"]
assert len(f_ratio) == 10


# Test Case 5: Invalid 'nbins' value (negative number)
def test_invalid_nbins():
ypred = np.random.rand(1000)
yobs = np.random.choice(ypred, size=100, replace=False)
with pytest.raises(ValueError):
continuous_boyce_index(yobs, ypred, nbins=-5, to_plot=False)


# Test Case 6: Custom 'bin_size' value
def test_custom_bin_size():
ypred = np.random.rand(1000)
yobs = np.random.choice(ypred, size=100, replace=False)
results = continuous_boyce_index(yobs, ypred, bin_size=0.1, to_plot=False)
assert "Spearman.cor" in results
spearman_cor = results["Spearman.cor"]
assert not np.isnan(spearman_cor)
assert -1 <= spearman_cor <= 1
f_ratio = results["F.ratio"]
assert len(f_ratio) > 0


# Test Case 7: 'ypred' containing NaNs
def test_ypred_with_nans(recwarn):
ypred = np.random.rand(1000)
ypred[::50] = np.nan # Introduce NaNs into 'ypred'
yobs = np.random.choice(ypred[~np.isnan(ypred)], size=100, replace=False)
results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
# Check for warnings
w = recwarn.pop(UserWarning)
assert "'ypred' contains NaN values, which will be ignored." in str(w.message)
# Ensure function outputs are as expected
assert "Spearman.cor" in results
spearman_cor = results["Spearman.cor"]
assert not np.isnan(spearman_cor)
assert -1 <= spearman_cor <= 1
f_ratio = results["F.ratio"]
assert len(f_ratio) == 10


# Test Case 8: 'yobs' values outside the range of 'ypred'
def test_yobs_outside_ypred_range():
ypred = np.random.rand(1000)
yobs = np.array([1.5, 2.0, 2.5]) # Values outside the range [0, 1]
results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
assert "Spearman.cor" in results
spearman_cor = results["Spearman.cor"]
# Spearman correlation may be NaN due to insufficient valid data
assert np.isnan(spearman_cor) or -1 <= spearman_cor <= 1
f_ratio = results["F.ratio"]
assert len(f_ratio) == 10


# Test Case 9: Large dataset
def test_large_dataset():
ypred = np.random.rand(1000000)
yobs = np.random.choice(ypred, size=10000, replace=False)
results = continuous_boyce_index(yobs, ypred, nbins=20, to_plot=False)
assert "Spearman.cor" in results
spearman_cor = results["Spearman.cor"]
assert not np.isnan(spearman_cor)
assert -1 <= spearman_cor <= 1
f_ratio = results["F.ratio"]
assert len(f_ratio) == 20


# Test Case 10: Using Pandas Series
def test_with_pandas_series():
np.random.seed(0)
ypred = pd.Series(np.random.rand(1000))
yobs = ypred.sample(n=100, replace=False)
results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
assert "Spearman.cor" in results
spearman_cor = results["Spearman.cor"]
assert not np.isnan(spearman_cor)
assert -1 <= spearman_cor <= 1


# Test Case 11: Using GeoPandas GeoSeries
def test_with_geopandas_geoseries():
np.random.seed(0)
num_points = 1000
x = np.random.uniform(-180, 180, num_points)
y = np.random.uniform(-90, 90, num_points)
suitability = np.random.rand(num_points)
geometry = [Point(xy) for xy in zip(x, y)]
gdf = gpd.GeoDataFrame({"suitability": suitability}, geometry=geometry)

ypred = gdf["suitability"] # This is a Pandas Series
yobs = ypred.sample(n=100, replace=False)
results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
assert "Spearman.cor" in results
spearman_cor = results["Spearman.cor"]
assert not np.isnan(spearman_cor)
assert -1 <= spearman_cor <= 1


# Test Case 12: Both 'ypred' and 'yobs' containing NaNs
def test_both_ypred_yobs_with_nans(recwarn):
ypred = np.random.rand(1000)
ypred[::50] = np.nan # Introduce NaNs into 'ypred'
yobs = np.random.choice(ypred, size=100, replace=False)
yobs[::10] = np.nan # Introduce NaNs into 'yobs'
results = continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
# Check for multiple warnings
warnings_list = [str(w.message) for w in recwarn.list]
assert "'ypred' contains NaN values, which will be ignored." in warnings_list
assert "'yobs' contains NaN values, which will be ignored." in warnings_list
# Ensure function outputs are as expected
assert "Spearman.cor" in results
spearman_cor = results["Spearman.cor"]
if not np.isnan(spearman_cor):
assert -1 <= spearman_cor <= 1
f_ratio = results["F.ratio"]
assert len(f_ratio) == 10


# Test Case 13: Empty arrays after removing NaNs
def test_empty_arrays_after_nan_removal():
ypred = np.array([np.nan, np.nan])
yobs = np.array([np.nan, np.nan])
with pytest.raises(ValueError) as exc_info:
continuous_boyce_index(yobs, ypred, nbins=10, to_plot=False)
assert "'ypred' or 'yobs' arrays cannot be empty." in str(exc_info.value)