@@ -46,9 +46,8 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool) -> Categorica
4646 # In cases with c.ordered, this is equivalent to
4747 # return c.remove_unused_categories(), c
4848
49- unique_codes = unique1d (c .codes ) # type: ignore[no-untyped-call]
49+ take_codes = unique1d (c .codes [ c . codes != - 1 ] ) # type: ignore[no-untyped-call]
5050
51- take_codes = unique_codes [unique_codes != - 1 ]
5251 if sort :
5352 take_codes = np .sort (take_codes )
5453
@@ -67,17 +66,18 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool) -> Categorica
6766
6867 # sort=False should order groups in as-encountered order (GH-8868)
6968
70- # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
71- all_codes = np .arange (c .categories .nunique ())
69+ # GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
7270 # GH 38140: exclude nan from indexer for categories
7371 unique_notnan_codes = unique1d (c .codes [c .codes != - 1 ]) # type: ignore[no-untyped-call]
7472 if sort :
7573 unique_notnan_codes = np .sort (unique_notnan_codes )
76- if len (all_codes ) > len (unique_notnan_codes ):
74+ if ( num_cat := len (c . categories ) ) > len (unique_notnan_codes ):
7775 # GH 13179: All categories need to be present, even if missing from the data
78- missing_codes = np .setdiff1d (all_codes , unique_notnan_codes , assume_unique = True )
76+ missing_codes = np .setdiff1d (
77+ np .arange (num_cat ), unique_notnan_codes , assume_unique = True
78+ )
7979 take_codes = np .concatenate ((unique_notnan_codes , missing_codes ))
8080 else :
8181 take_codes = unique_notnan_codes
8282
83- return Categorical (c , c .unique (). categories .take (take_codes ))
83+ return Categorical (c , c .categories .take (take_codes ))
0 commit comments