Skip to content

Commit adb6689

Browse files
String dtype: coerce missing values in indexers for string dtype Index (#60454)
* String dtype: coerce missing values in indexers for string dtype Index
* cleanup
1 parent b74ee4a commit adb6689

File tree

4 files changed

+22
-36
lines changed

4 files changed

+22
-36
lines changed

pandas/_libs/index.pyx

+1 -9
Original file line number | Diff line number | Diff line change
@@ -561,23 +561,15 @@ cdef class StringObjectEngine(ObjectEngine):
561561

562562
cdef:
563563
object na_value
564-
bint uses_na
565564

566565
def __init__(self, ndarray values, na_value):
567566
super().__init__(values)
568567
self.na_value = na_value
569-
self.uses_na = na_value is C_NA
570-
571-
cdef bint _checknull(self, object val):
572-
if self.uses_na:
573-
return val is C_NA
574-
else:
575-
return util.is_nan(val)
576568

577569
cdef _check_type(self, object val):
578570
if isinstance(val, str):
579571
return val
580-
elif self._checknull(val):
572+
elif checknull(val):
581573
return self.na_value
582574
else:
583575
raise KeyError(val)

pandas/tests/frame/indexing/test_indexing.py

-3
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@
99
import numpy as np
1010
import pytest
1111

12-
from pandas._config import using_string_dtype
13-
1412
from pandas._libs import iNaT
1513
from pandas.errors import InvalidIndexError
1614

@@ -503,7 +501,6 @@ def test_setitem_ambig(self, using_infer_string):
503501
else:
504502
assert dm[2].dtype == np.object_
505503

506-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
507504
def test_setitem_None(self, float_frame):
508505
# GH #766
509506
float_frame[None] = float_frame["A"]

pandas/tests/indexes/string/test_indexing.py

+16 -17
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,15 @@ def _isnan(val):
1313
return False
1414

1515

16+
def _equivalent_na(dtype, null):
17+
if dtype.na_value is pd.NA and null is pd.NA:
18+
return True
19+
elif _isnan(dtype.na_value) and _isnan(null):
20+
return True
21+
else:
22+
return False
23+
24+
1625
class TestGetLoc:
1726
def test_get_loc(self, any_string_dtype):
1827
index = Index(["a", "b", "c"], dtype=any_string_dtype)
@@ -41,14 +50,7 @@ def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture):
4150

4251
def test_get_loc_missing(self, any_string_dtype, nulls_fixture):
4352
index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype)
44-
if any_string_dtype == "string" and (
45-
(any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA)
46-
or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture))
47-
):
48-
with pytest.raises(KeyError):
49-
index.get_loc(nulls_fixture)
50-
else:
51-
assert index.get_loc(nulls_fixture) == 2
53+
assert index.get_loc(nulls_fixture) == 2
5254

5355

5456
class TestGetIndexer:
@@ -93,9 +95,8 @@ def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string):
9395
result = index.get_indexer(["a", null, "c"])
9496
if using_infer_string:
9597
expected = np.array([0, 2, -1], dtype=np.intp)
96-
elif any_string_dtype == "string" and (
97-
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
98-
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
98+
elif any_string_dtype == "string" and not _equivalent_na(
99+
any_string_dtype, null
99100
):
100101
expected = np.array([0, -1, -1], dtype=np.intp)
101102
else:
@@ -115,9 +116,8 @@ def test_get_indexer_non_unique_nas(
115116
if using_infer_string:
116117
expected_indexer = np.array([0, 2], dtype=np.intp)
117118
expected_missing = np.array([], dtype=np.intp)
118-
elif any_string_dtype == "string" and (
119-
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
120-
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
119+
elif any_string_dtype == "string" and not _equivalent_na(
120+
any_string_dtype, null
121121
):
122122
expected_indexer = np.array([0, -1], dtype=np.intp)
123123
expected_missing = np.array([1], dtype=np.intp)
@@ -133,9 +133,8 @@ def test_get_indexer_non_unique_nas(
133133

134134
if using_infer_string:
135135
expected_indexer = np.array([0, 1, 3], dtype=np.intp)
136-
elif any_string_dtype == "string" and (
137-
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
138-
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
136+
elif any_string_dtype == "string" and not _equivalent_na(
137+
any_string_dtype, null
139138
):
140139
pass
141140
else:

pandas/tests/reshape/test_pivot.py

+5 -7
Original file line number | Diff line number | Diff line change
@@ -2668,6 +2668,8 @@ def test_pivot_columns_not_given(self):
26682668
with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
26692669
df.pivot()
26702670

2671+
# this still fails because columns=None gets passed down to unstack as level=None
2672+
# while at that point None was converted to NaN
26712673
@pytest.mark.xfail(
26722674
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
26732675
)
@@ -2686,10 +2688,7 @@ def test_pivot_columns_is_none(self):
26862688
expected = DataFrame({1: 3}, index=Index([2], name="b"))
26872689
tm.assert_frame_equal(result, expected)
26882690

2689-
@pytest.mark.xfail(
2690-
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
2691-
)
2692-
def test_pivot_index_is_none(self):
2691+
def test_pivot_index_is_none(self, using_infer_string):
26932692
# GH#48293
26942693
df = DataFrame({None: [1], "b": 2, "c": 3})
26952694

@@ -2700,11 +2699,10 @@ def test_pivot_index_is_none(self):
27002699

27012700
result = df.pivot(columns="b", index=None, values="c")
27022701
expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
2702+
if using_infer_string:
2703+
expected.index.name = np.nan
27032704
tm.assert_frame_equal(result, expected)
27042705

2705-
@pytest.mark.xfail(
2706-
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
2707-
)
27082706
def test_pivot_values_is_none(self):
27092707
# GH#48293
27102708
df = DataFrame({None: [1], "b": 2, "c": 3})

0 commit comments

Comments
 (0)