Skip to content

Commit c954aa2

Browse files
stanmart and jtilly authored
Handle string columns of type pandas.StringDtype properly (#478)
* Treat new-style pandas string columns like objects
* Make tests work with pandas 2.3
* Formulaic can't do it yet
* Changelog
* Not yet
* Release today
* Use a non-retired OS
* Maybe this?
* Move include path to D:
* Revert "Maybe this?" This reverts commit c149990.

--------- Co-authored-by: Jan Tilly <[email protected]>
1 parent aaa353c commit c954aa2

File tree

7 files changed

+22
-8
lines changed

7 files changed

+22
-8
lines changed

.github/workflows/build-wheels.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
runs-on: ${{ matrix.os }}
1818
strategy:
1919
matrix:
20-
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2019, macos-13, macos-14]
20+
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2022, macos-13, macos-14]
2121

2222
steps:
2323
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

CHANGELOG.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@
77
Changelog
88
=========
99

10-
4.1.2 - UNRELEASED
10+
4.1.2 - 2025-07-17
1111
------------------
1212

1313
**Bug fix:**
1414

1515
- Fixed a bug which caused issues when constructing tabmat matrices from existing ``ModelSpec``\s when they contained categorical columns with all levels dropped.
16+
- We can now treat dedicated pandas string series - which are the defaults for strings since pandas 2.3 - as categoricals.
1617

1718

1819
4.1.1 - 2025-01-30

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ before-all = [
7474
]
7575

7676
[tool.cibuildwheel.windows.environment]
77-
INCLUDE="C:\\\\a\\\\tabmat\\\\tabmat\\\\envs\\\\build\\\\Library\\\\include"
77+
INCLUDE="D:\\\\a\\\\tabmat\\\\tabmat\\\\envs\\\\build\\\\Library\\\\include"
7878

7979
[tool.cibuildwheel.linux]
8080
before-all = [

src/tabmat/constructor.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,13 @@ def from_df(
9494
for dfcolidx, colname in enumerate(df.columns):
9595
coldata = df[:, dfcolidx]
9696
if object_as_cat:
97-
if isinstance(coldata.dtype, (nw.String, nw.Object)):
97+
if isinstance(coldata.dtype, (nw.String, nw.Object)) or (
98+
pd is not None # until Narwhals handles it natively
99+
and isinstance(
100+
nw.to_native(coldata).dtype,
101+
pd.StringDtype,
102+
)
103+
):
98104
coldata = coldata.cast(nw.Categorical)
99105

100106
# deal with Pandas sparse dtype (not supported by narwhals)

src/tabmat/formula.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def _init(self):
6060
def _is_categorical(self, values):
6161
if isinstance(values, (pd.Series, pd.Categorical)):
6262
return values.dtype == object or isinstance(
63-
values.dtype, pd.CategoricalDtype
63+
values.dtype, (pd.CategoricalDtype, pd.StringDtype)
6464
)
6565
return super()._is_categorical(values)
6666

tests/test_constructor.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def construct_data(backend):
4343

4444
def test_pandas_to_matrix():
4545
df = construct_data("pandas")
46+
original_dtypes = df.dtypes.copy()
4647

4748
mat = tm.from_df(
4849
df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4, object_as_cat=True
@@ -63,8 +64,8 @@ def test_pandas_to_matrix():
6364

6465
# Prevent a regression where the column type of sparsified dense columns
6566
# was being changed in place.
66-
assert df["cl"].dtype == object
67-
assert df["ds"].dtype == np.float64
67+
assert df["cl"].dtype == original_dtypes["cl"]
68+
assert df["ds"].dtype == original_dtypes["ds"]
6869

6970

7071
@pytest.mark.parametrize("categorical_dtype", [pl.Categorical, pl.Enum(["a", "b"])])

tests/test_formula.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,13 @@ def test_matrix_against_expectation_qcl(df, formula, expected):
305305
"C(cat_1, spans_intercept=False) * cat_2 * cat_3",
306306
id="custom_contrasts",
307307
),
308-
pytest.param("str_1", id="string_as_categorical"),
308+
pytest.param(
309+
"str_1",
310+
id="string_as_categorical",
311+
marks=pytest.mark.xfail(
312+
reason="Formulaic does not treat new-style strings as categorical yet"
313+
),
314+
),
309315
],
310316
)
311317
def test_matrix_against_pandas(df, formula, ensure_full_rank):

0 commit comments

Comments
 (0)