From 71bbab09e6ef52735dbff5967434313f915fbe9e Mon Sep 17 00:00:00 2001
From: Giacomo Caria
Date: Fri, 17 Oct 2025 17:34:45 -0300
Subject: [PATCH 1/2] apply changes

---
 xarray/core/common.py        | 56 ++++++++++++++++++++++++++++++++++--
 xarray/core/dataarray.py     |  7 +++++
 xarray/core/dataset.py       |  7 +++++
 xarray/groupers.py           | 23 +++++++++++++++
 xarray/tests/test_groupby.py | 18 ++++++++++++
 5 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/xarray/core/common.py b/xarray/core/common.py
index 753c9537135..00b352dd876 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -6,7 +6,16 @@
 from contextlib import suppress
 from html import escape
 from textwrap import dedent
-from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar, Union, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Concatenate,
+    Literal,
+    ParamSpec,
+    TypeVar,
+    Union,
+    overload,
+)
 
 import numpy as np
 import pandas as pd
@@ -925,6 +934,7 @@ def _resample(
         offset: pd.Timedelta | datetime.timedelta | str | None,
         origin: str | DatetimeLike,
         restore_coord_dims: bool | None,
+        boundaries: Literal["exact", "trim"] | None = None,
         **indexer_kwargs: ResampleCompatible | Resampler,
     ) -> T_Resample:
         """Returns a Resample object for performing resampling operations.
@@ -960,6 +970,11 @@ def _resample(
         restore_coord_dims : bool, optional
             If True, also restore the dimension order of multi-dimensional
             coordinates.
+        boundaries : {"exact", "trim"}, optional
+            How to handle boundaries when the data doesn't evenly fit the resampling
+            frequency. If 'exact', a ValueError will be raised if the data doesn't
+            evenly fit. If 'trim', incomplete periods are dropped. If None (default),
+            uses the current behavior (includes incomplete periods).
         **indexer_kwargs : {dim: freq}
             The keyword arguments form of ``indexer``.
             One of indexer or indexer_kwargs must be provided.
@@ -1107,8 +1122,45 @@ def _resample(
         grouper: Resampler
         if isinstance(freq, ResampleCompatible):
             grouper = TimeResampler(
-                freq=freq, closed=closed, label=label, origin=origin, offset=offset
+                freq=freq,
+                closed=closed,
+                label=label,
+                origin=origin,
+                offset=offset,
+                boundaries=boundaries,
             )
+
+            # Apply trim logic at the resample level if needed
+            if boundaries == "trim":
+                # First, get the resampling periods to identify incomplete ones
+                from xarray.core.groupby import ResolvedGrouper
+
+                temp_grouper = ResolvedGrouper(grouper, group, self)
+                temp_encoded = temp_grouper.encoded
+
+                # Count data points in each period
+                codes = temp_encoded.codes
+                counts = np.bincount(codes.values)
+
+                if len(counts) > 0:
+                    # Find the most common count (expected points per period)
+                    unique_counts, count_frequencies = np.unique(
+                        counts, return_counts=True
+                    )
+                    most_common_count = unique_counts[np.argmax(count_frequencies)]
+
+                    # Identify incomplete periods
+                    incomplete_periods = counts < most_common_count
+
+                    if np.any(incomplete_periods):
+                        # Find which data points belong to incomplete periods
+                        incomplete_codes = np.where(incomplete_periods)[0]
+                        valid_mask = ~np.isin(codes.values, incomplete_codes)
+
+                        # Filter the data to exclude incomplete periods
+                        group = group.isel({group.dims[0]: valid_mask})
+                        # Also update the object to match the filtered group
+                        self = self.isel({group.dims[0]: valid_mask})
         elif isinstance(freq, Resampler):
             grouper = freq
         else:
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 6c8d0617038..eb33d26e6ee 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -7433,6 +7433,7 @@ def resample(
         offset: pd.Timedelta | datetime.timedelta | str | None = None,
         origin: str | DatetimeLike = "start_day",
         restore_coord_dims: bool | None = None,
+        boundaries: Literal["exact", "trim"] | None = None,
         **indexer_kwargs: ResampleCompatible | Resampler,
     ) -> DataArrayResample:
         """Returns a Resample object for performing resampling operations.
@@ -7468,6 +7469,11 @@ def resample(
         restore_coord_dims : bool, optional
             If True, also restore the dimension order of multi-dimensional
             coordinates.
+        boundaries : {"exact", "trim"}, optional
+            How to handle boundaries when the data doesn't evenly fit the resampling
+            frequency. If 'exact', a ValueError will be raised if the data doesn't
+            evenly fit. If 'trim', incomplete periods are dropped. If None (default),
+            uses the current behavior (includes incomplete periods).
         **indexer_kwargs : str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler
             The keyword arguments form of ``indexer``.
             One of indexer or indexer_kwargs must be provided.
@@ -7572,6 +7578,7 @@ def resample(
             offset=offset,
             origin=origin,
             restore_coord_dims=restore_coord_dims,
+            boundaries=boundaries,
             **indexer_kwargs,
         )
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index a5a958ddcbe..a5227fea151 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -10371,6 +10371,7 @@ def resample(
         offset: pd.Timedelta | datetime.timedelta | str | None = None,
         origin: str | DatetimeLike = "start_day",
         restore_coord_dims: bool | None = None,
+        boundaries: Literal["exact", "trim"] | None = None,
         **indexer_kwargs: ResampleCompatible | Resampler,
     ) -> DatasetResample:
         """Returns a Resample object for performing resampling operations.
@@ -10406,6 +10407,11 @@ def resample(
         restore_coord_dims : bool, optional
             If True, also restore the dimension order of multi-dimensional
             coordinates.
+ boundaries : {"exact", "trim"}, optional + How to handle boundaries when the data doesn't evenly fit the resampling + frequency. If 'exact', a ValueError will be raised if the data doesn't + evenly fit. If 'trim', incomplete periods are dropped. If None (default), + uses the current behavior (includes incomplete periods). **indexer_kwargs : str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler The keyword arguments form of ``indexer``. One of indexer or indexer_kwargs must be provided. @@ -10438,6 +10444,7 @@ def resample( offset=offset, origin=origin, restore_coord_dims=restore_coord_dims, + boundaries=boundaries, **indexer_kwargs, ) diff --git a/xarray/groupers.py b/xarray/groupers.py index a16933e690f..2174fc9308a 100644 --- a/xarray/groupers.py +++ b/xarray/groupers.py @@ -484,6 +484,11 @@ class TimeResampler(Resampler): - 'end_day': `origin` is the ceiling midnight of the last day offset : pd.Timedelta, datetime.timedelta, or str, default is None An offset timedelta added to the origin. + boundaries : {"exact", "trim"}, optional + How to handle boundaries when the data doesn't evenly fit the resampling + frequency. If 'exact', a ValueError will be raised if the data doesn't + evenly fit. If 'trim', incomplete periods are dropped. If None (default), + uses the current behavior (includes incomplete periods). """ freq: ResampleCompatible @@ -491,6 +496,7 @@ class TimeResampler(Resampler): label: SideOptions | None = field(default=None) origin: str | DatetimeLike = field(default="start_day") offset: pd.Timedelta | datetime.timedelta | str | None = field(default=None) + boundaries: Literal["exact", "trim"] | None = field(default=None, kw_only=True) index_grouper: CFTimeGrouper | pd.Grouper = field(init=False, repr=False) group_as_index: pd.Index = field(init=False, repr=False) @@ -502,6 +508,7 @@ def reset(self) -> Self: label=self.label, origin=self.origin, offset=self.offset, + boundaries=self.boundaries, ) def _init_properties(self, group: T_Group) -> None: @@ -566,6 +573,22 @@ def factorize(self, group: T_Group) -> EncodedGroups: self._init_properties(group) full_index, first_items, codes_ = self._get_index_and_items() sbins = first_items.values.astype(np.int64) + + # Handle boundaries parameter for exact checking + if self.boundaries == "exact": + # Check if data evenly fits the resampling frequency + counts = np.bincount(codes_) + expected_points = len(group) // len(first_items) + incomplete_periods = counts < expected_points + + if np.any(incomplete_periods): + raise ValueError( + f"Data does not evenly fit the resampling frequency. " + f"Expected {expected_points} points per period, but found periods with " + f"{counts[incomplete_periods]} points. Use boundaries='trim' " + f"to handle incomplete periods." 
+                )
+
         group_indices: GroupIndices = tuple(
             list(itertools.starmap(slice, pairwise(sbins))) + [slice(sbins[-1], None)]
         )
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
index e8f9ea4e732..5fd0217cead 100644
--- a/xarray/tests/test_groupby.py
+++ b/xarray/tests/test_groupby.py
@@ -2081,6 +2081,24 @@ def test_resample_skipna(self) -> None:
         expected = DataArray([np.nan, 1, 1], [("time", times[::4])])
         assert_identical(result, expected)
 
+    def test_resample_boundaries(self) -> None:
+        """Test the boundaries parameter for resample."""
+        # Create 31-day data with predictable values (0-30)
+        times = pd.date_range("2000-01-01", periods=31, freq="D")
+        array = DataArray(np.arange(31), [("time", times)])
+
+        # Test boundaries="trim" - drops incomplete periods
+        result_trim = array.resample(time="7D", boundaries="trim").mean()
+        assert len(result_trim.time) == 4
+        expected_trim = np.array([3.0, 10.0, 17.0, 24.0])
+        np.testing.assert_array_equal(result_trim.values, expected_trim)
+
+        # Test boundaries="exact" - raises error for uneven data
+        with pytest.raises(
+            ValueError, match="Data does not evenly fit the resampling frequency"
+        ):
+            array.resample(time="7D", boundaries="exact").mean()
+
     def test_upsample(self) -> None:
         times = pd.date_range("2000-01-01", freq="6h", periods=5)
         array = DataArray(np.arange(5), [("time", times)])

From 33a4d41b1e896d1a5537486fb38c677b48eaa6f0 Mon Sep 17 00:00:00 2001
From: Giacomo Caria
Date: Sat, 18 Oct 2025 18:57:36 -0300
Subject: [PATCH 2/2] move trim logic to TimeResampler.factorize

---
 xarray/core/common.py | 32 --------------------------------
 xarray/groupers.py    | 19 ++++++++++++++++++-
 2 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/xarray/core/common.py b/xarray/core/common.py
index 00b352dd876..10a2659c377 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -1129,38 +1129,6 @@ def _resample(
                 offset=offset,
                 boundaries=boundaries,
             )
-
-            # Apply trim logic at the resample level if needed
-            if boundaries == "trim":
-                # First, get the resampling periods to identify incomplete ones
-                from xarray.core.groupby import ResolvedGrouper
-
-                temp_grouper = ResolvedGrouper(grouper, group, self)
-                temp_encoded = temp_grouper.encoded
-
-                # Count data points in each period
-                codes = temp_encoded.codes
-                counts = np.bincount(codes.values)
-
-                if len(counts) > 0:
-                    # Find the most common count (expected points per period)
-                    unique_counts, count_frequencies = np.unique(
-                        counts, return_counts=True
-                    )
-                    most_common_count = unique_counts[np.argmax(count_frequencies)]
-
-                    # Identify incomplete periods
-                    incomplete_periods = counts < most_common_count
-
-                    if np.any(incomplete_periods):
-                        # Find which data points belong to incomplete periods
-                        incomplete_codes = np.where(incomplete_periods)[0]
-                        valid_mask = ~np.isin(codes.values, incomplete_codes)
-
-                        # Filter the data to exclude incomplete periods
-                        group = group.isel({group.dims[0]: valid_mask})
-                        # Also update the object to match the filtered group
-                        self = self.isel({group.dims[0]: valid_mask})
         elif isinstance(freq, Resampler):
             grouper = freq
         else:
diff --git a/xarray/groupers.py b/xarray/groupers.py
index 2174fc9308a..23060d0f8e1 100644
--- a/xarray/groupers.py
+++ b/xarray/groupers.py
@@ -574,7 +574,7 @@ def factorize(self, group: T_Group) -> EncodedGroups:
         full_index, first_items, codes_ = self._get_index_and_items()
         sbins = first_items.values.astype(np.int64)
 
-        # Handle boundaries parameter for exact checking
+        # Handle boundaries parameter for exact checking and trim logic
         if self.boundaries == "exact":
             # Check if data evenly fits the resampling frequency
             counts = np.bincount(codes_)
             expected_points = len(group) // len(first_items)
             incomplete_periods = counts < expected_points
 
             if np.any(incomplete_periods):
                 raise ValueError(
                     f"Data does not evenly fit the resampling frequency. "
                     f"Expected {expected_points} points per period, but found periods with "
                     f"{counts[incomplete_periods]} points. Use boundaries='trim' "
                     f"to handle incomplete periods."
                 )
+        elif self.boundaries == "trim":
+            # Apply trim logic: set codes to -1 for incomplete periods
+            counts = np.bincount(codes_)
+
+            if len(counts) > 0:
+                # Find the most common count (expected points per period)
+                unique_counts, count_frequencies = np.unique(counts, return_counts=True)
+                most_common_count = unique_counts[np.argmax(count_frequencies)]
+
+                # Identify incomplete periods
+                incomplete_periods = counts < most_common_count
+
+                if np.any(incomplete_periods):
+                    # Find which data points belong to incomplete periods
+                    incomplete_codes = np.where(incomplete_periods)[0]
+                    # Set codes to -1 for points in incomplete periods
+                    codes_[np.isin(codes_, incomplete_codes)] = -1
 
         group_indices: GroupIndices = tuple(
             list(itertools.starmap(slice, pairwise(sbins))) + [slice(sbins[-1], None)]
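
A minimal usage sketch of the behavior this two-patch series proposes. The `boundaries` keyword exists only with these patches applied (it is not part of released xarray), and the expected results mirror test_resample_boundaries above:

    import numpy as np
    import pandas as pd
    import xarray as xr

    # 31 daily values do not divide evenly into 7-day bins: 4 full weeks plus 3 leftover days.
    times = pd.date_range("2000-01-01", periods=31, freq="D")
    array = xr.DataArray(np.arange(31), [("time", times)])

    array.resample(time="7D").mean()                      # default: 5 periods, last one incomplete
    array.resample(time="7D", boundaries="trim").mean()   # 4 periods: [3.0, 10.0, 17.0, 24.0]
    array.resample(time="7D", boundaries="exact").mean()  # raises ValueError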