What happened?
I'm trying to concatenate two xarray Datasets whose variables are backed by ordered categorical pandas extension arrays. When the categories differ, pandas falls back to string (object) arrays during concatenation, but xr.concat raises a TypeError instead.
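For comparison, concatenating the same DataFrames directly with pandas does not raise; the column falls back to object dtype (a minimal sketch of that pandas call):
import pandas as pd

cat1 = pd.DataFrame({"test": pd.Categorical(["a", "b", "c"], ordered=True)})
cat2 = pd.DataFrame({"test": pd.Categorical(["a", "b", "d"], ordered=True)})

# pandas cannot union ordered categoricals with different categories,
# so the concatenated column is converted to object dtype instead of raising
print(pd.concat([cat1, cat2])["test"].dtype)  # object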
What did you expect to happen?
Concatenation succeeds.
Minimal Complete Verifiable Example
import xarray as xr
import pandas as pd
cat1 = pd.DataFrame({"test": pd.Categorical(["a", "b", "c"], ordered=True)})
cat2 = pd.DataFrame({"test": pd.Categorical(["a", "b", "d"], ordered=True)})
ds1 = xr.Dataset.from_dataframe(cat1)
ds2 = xr.Dataset.from_dataframe(cat2)
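# raises TypeError: to union ordered Categoricals, all categories must be the same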
xr.concat([ds1, ds2], dim="index")
MVCE confirmation
- Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray.
- Complete example — the example is self-contained, including all data and the text of any traceback.
- Verifiable example — the example copy & pastes into an IPython prompt or Binder notebook, returning the result.
- New issue — a search of GitHub Issues suggests this is not a duplicate.
- Recent environment — the issue occurs with the latest version of xarray and its dependencies.
Relevant log output
TypeError Traceback (most recent call last)
Cell In[19], line 9
6 ds1 = xr.Dataset.from_dataframe(cat1)
7 ds2 = xr.Dataset.from_dataframe(cat2)
----> 9 xr.concat([ds1, ds2], dim="index")
File /data/ilia/envs/famo/lib/python3.11/site-packages/xarray/core/concat.py:277, in concat(objs, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim)
264 return _dataarray_concat(
265 objs,
266 dim=dim,
(...) 274 create_index_for_new_dim=create_index_for_new_dim,
275 )
276 elif isinstance(first_obj, Dataset):
--> 277 return _dataset_concat(
278 objs,
279 dim=dim,
280 data_vars=data_vars,
281 coords=coords,
282 compat=compat,
283 positions=positions,
284 fill_value=fill_value,
285 join=join,
286 combine_attrs=combine_attrs,
287 create_index_for_new_dim=create_index_for_new_dim,
288 )
289 else:
290 raise TypeError(
291 "can only concatenate xarray Dataset and DataArray "
292 f"objects, got {type(first_obj)}"
293 )
File /data/ilia/envs/famo/lib/python3.11/site-packages/xarray/core/concat.py:669, in _dataset_concat(datasets, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim)
667 result_vars[k] = v
668 else:
--> 669 combined_var = concat_vars(
670 vars, dim_name, positions, combine_attrs=combine_attrs
671 )
672 # reindex if variable is not present in all datasets
673 if len(variable_index) < concat_index_size:
File /data/ilia/envs/famo/lib/python3.11/site-packages/xarray/core/variable.py:3004, in concat(variables, dim, positions, shortcut, combine_attrs)
3002 return IndexVariable.concat(variables, dim, positions, shortcut, combine_attrs)
3003 else:
-> 3004 return Variable.concat(variables, dim, positions, shortcut, combine_attrs)
File /data/ilia/envs/famo/lib/python3.11/site-packages/xarray/core/variable.py:1752, in Variable.concat(cls, variables, dim, positions, shortcut, combine_attrs)
1750 axis = first_var.get_axis_num(dim)
1751 dims = first_var_dims
-> 1752 data = duck_array_ops.concatenate(arrays, axis=axis)
1753 if positions is not None:
1754 # TODO: deprecate this option -- we don't need it for groupby
1755 # any more.
1756 indices = nputils.inverse_permutation(np.concatenate(positions))
File /data/ilia/envs/famo/lib/python3.11/site-packages/xarray/core/duck_array_ops.py:378, in concatenate(arrays, axis)
376 xp = get_array_namespace(arrays[0])
377 return xp.concat(as_shared_dtype(arrays, xp=xp), axis=axis)
--> 378 return _concatenate(as_shared_dtype(arrays), axis=axis)
File /data/ilia/envs/famo/lib/python3.11/site-packages/xarray/core/extension_array.py:100, in PandasExtensionArray.__array_function__(self, func, types, args, kwargs)
98 if func not in HANDLED_EXTENSION_ARRAY_FUNCTIONS:
99 return func(*args, **kwargs)
--> 100 res = HANDLED_EXTENSION_ARRAY_FUNCTIONS[func](*args, **kwargs)
101 if is_extension_array_dtype(res):
102 return type(self)[type(res)](res)
File /data/ilia/envs/famo/lib/python3.11/site-packages/xarray/core/extension_array.py:48, in __extension_duck_array__concatenate(arrays, axis, out)
44 @implements(np.concatenate)
45 def __extension_duck_array__concatenate(
46 arrays: Sequence[T_ExtensionArray], axis: int = 0, out=None
47 ) -> T_ExtensionArray:
---> 48 return type(arrays[0])._concat_same_type(arrays)
File /data/ilia/envs/famo/lib/python3.11/site-packages/pandas/core/arrays/categorical.py:2527, in Categorical._concat_same_type(cls, to_concat, axis)
2524 result = res_flat.reshape(len(first), -1, order="F")
2525 return result
-> 2527 result = union_categoricals(to_concat)
2528 return result
File /data/ilia/envs/famo/lib/python3.11/site-packages/pandas/core/dtypes/concat.py:341, in union_categoricals(to_union, sort_categories, ignore_order)
339 if all(c.ordered for c in to_union):
340 msg = "to union ordered Categoricals, all categories must be the same"
--> 341 raise TypeError(msg)
342 raise TypeError("Categorical.ordered must be the same")
344 if ignore_order:
TypeError: to union ordered Categoricals, all categories must be the same
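The last frame shows where this originates: xarray's __extension_duck_array__concatenate delegates to Categorical._concat_same_type, which calls union_categoricals, and that refuses to union ordered categoricals whose categories differ. The pandas-level failure can be reproduced on its own (a minimal sketch, independent of xarray):
import pandas as pd
from pandas.api.types import union_categoricals

c1 = pd.Categorical(["a", "b", "c"], ordered=True)
c2 = pd.Categorical(["a", "b", "d"], ordered=True)

# raises the same TypeError because ordered categoricals with
# different categories cannot be unioned
union_categoricals([c1, c2])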
Anything else we need to know?
No response
Environment
commit: None
python: 3.11.2 (main, Nov 30 2024, 21:22:50) [GCC 12.2.0]
python-bits: 64
OS: Linux
OS-release: 6.12.12+bpo-amd64
machine: x86_64
processor:
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: ('en_US', 'UTF-8')
libhdf5: 1.14.4
libnetcdf: None
xarray: 2024.10.0
pandas: 2.2.3
numpy: 2.2.5
scipy: 1.15.2
netCDF4: None
pydap: None
h5netcdf: None
h5py: 3.12.1
zarr: 3.0.6
cftime: None
nc_time_axis: None
iris: None
bottleneck: None
dask: 2025.3.0
distributed: 2025.2.0
matplotlib: 3.9.2
cartopy: None
seaborn: 0.13.2
numbagg: None
fsspec: 2024.10.0
cupy: None
pint: None
sparse: 0.16.0
flox: None
numpy_groupies: None
setuptools: 66.1.1
pip: 23.0.1
conda: None
pytest: 8.3.3
mypy: None
IPython: 9.1.0
sphinx: 8.1.3