Skip to content

Commit 7eb5668

Browse files
committed
Merge pull request #10014 from evanpw/issue_9603
BUG: null group spills into final group when grouping on a categorical
2 parents e3c7862 + d194c99 commit 7eb5668

File tree

3 files changed

+21
-9
lines changed

3 files changed

+21
-9
lines changed

doc/source/whatsnew/v0.16.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ Bug Fixes
218218
- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)
219219

220220

221-
221+
- Bug causing elements with a null group to spill into the final group when grouping by a ``Categorical`` (:issue:`9603`)
222222

223223

224224
- Bug where invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`)

pandas/lib.pyx

+12-8
Original file line numberDiff line numberDiff line change
@@ -1306,9 +1306,10 @@ def duplicated(ndarray[object] values, take_last=False):
13061306

13071307
def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
13081308
cdef:
1309-
Py_ssize_t i, group_size, n, lab, start
1309+
Py_ssize_t i, group_size, n, start
1310+
int64_t lab
13101311
object slobj
1311-
ndarray[int64_t] starts
1312+
ndarray[int64_t] starts, ends
13121313

13131314
n = len(labels)
13141315

@@ -1318,13 +1319,16 @@ def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
13181319
start = 0
13191320
group_size = 0
13201321
for i in range(n):
1321-
group_size += 1
13221322
lab = labels[i]
1323-
if i == n - 1 or lab != labels[i + 1]:
1324-
starts[lab] = start
1325-
ends[lab] = start + group_size
1326-
start += group_size
1327-
group_size = 0
1323+
if lab < 0:
1324+
start += 1
1325+
else:
1326+
group_size += 1
1327+
if i == n - 1 or lab != labels[i + 1]:
1328+
starts[lab] = start
1329+
ends[lab] = start + group_size
1330+
start += group_size
1331+
group_size = 0
13281332

13291333
return starts, ends
13301334

pandas/tests/test_categorical.py

100644100755
+8
Original file line numberDiff line numberDiff line change
@@ -1841,6 +1841,14 @@ def f(x):
18411841
tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
18421842
tm.assert_frame_equal(df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']])
18431843

1844+
# GH 9603
1845+
df = pd.DataFrame({'a': [1, 0, 0, 0]})
1846+
c = pd.cut(df.a, [0, 1, 2, 3, 4])
1847+
result = df.groupby(c).apply(len)
1848+
expected = pd.Series([1, 0, 0, 0], index=c.values.categories)
1849+
expected.index.name = 'a'
1850+
tm.assert_series_equal(result, expected)
1851+
18441852
def test_pivot_table(self):
18451853

18461854
raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"], ordered=True)

0 commit comments

Comments
 (0)