From 437a32250d3ab877d48b0291d6d44b3c4f091ab3 Mon Sep 17 00:00:00 2001 From: adrien pacifico Date: Thu, 17 Jul 2025 20:36:21 +0200 Subject: [PATCH 01/10] add tests --- pandas/tests/dtypes/test_concat.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 571e12d0c3303..a1affee6efbba 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -3,7 +3,7 @@ import pandas.core.dtypes.concat as _concat import pandas as pd -from pandas import Series +from pandas import Series, DataFrame import pandas._testing as tm @@ -64,3 +64,21 @@ def test_concat_series_between_empty_and_tzaware_series(using_infer_string): dtype=float, ) tm.assert_frame_equal(result, expected) + + +def test_concat_categorical_dataframes(): + df = DataFrame({"a": [0, 1]}, dtype="category") + df2 = DataFrame({"a": [2, 3]}, dtype="category") + + result = pd.concat([df, df2], axis=0) + + assert result["a"].dtype.name == "category" + + +def test_concat_categorical_series(): + ser = Series([0, 1], dtype="category") + ser2 = Series([2, 3], dtype="category") + + result = pd.concat([ser, ser2], axis=0) + + assert result.dtype.name == "category" From 4d6e5e759602feadeed79ebea060b3483f36499d Mon Sep 17 00:00:00 2001 From: adrien pacifico Date: Thu, 17 Jul 2025 20:43:47 +0200 Subject: [PATCH 02/10] BUG: Fix #51362 special handling for categorical arrays in concat_compat --- pandas/core/dtypes/concat.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index dcf8cb5c78536..75a6c2434c201 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -69,6 +69,11 @@ def concat_compat( ------- a single array, preserving the combined dtypes """ + # Special handling for categorical arrays solves #51362 + if (len(to_concat) and + all(isinstance(arr.dtype, CategoricalDtype) for arr in to_concat) and + axis == 0): + return union_categoricals(to_concat) if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]): # fastpath! obj = to_concat[0] From 7c5f3a38fc0c6cf9f70bb8f1598072c066bf6cf1 Mon Sep 17 00:00:00 2001 From: adrien pacifico Date: Thu, 17 Jul 2025 21:08:22 +0200 Subject: [PATCH 03/10] run pre-commit and solves issues --- pandas/core/dtypes/concat.py | 8 +++++--- pandas/tests/dtypes/test_concat.py | 9 ++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 75a6c2434c201..dffc208c4e8eb 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -70,9 +70,11 @@ def concat_compat( a single array, preserving the combined dtypes """ # Special handling for categorical arrays solves #51362 - if (len(to_concat) and - all(isinstance(arr.dtype, CategoricalDtype) for arr in to_concat) and - axis == 0): + if ( + len(to_concat) + and all(isinstance(arr.dtype, CategoricalDtype) for arr in to_concat) + and axis == 0 + ): return union_categoricals(to_concat) if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]): # fastpath! diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index a1affee6efbba..2c3bce033ac4f 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -3,7 +3,10 @@ import pandas.core.dtypes.concat as _concat import pandas as pd -from pandas import Series, DataFrame +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm @@ -19,7 +22,7 @@ def test_concat_mismatched_categoricals_with_empty(): def test_concat_single_dataframe_tz_aware(): # https://github.com/pandas-dev/pandas/issues/25257 - df = pd.DataFrame( + df = DataFrame( {"timestamp": [pd.Timestamp("2020-04-08 09:00:00.709949+0000", tz="UTC")]} ) expected = df.copy() @@ -53,7 +56,7 @@ def test_concat_series_between_empty_and_tzaware_series(using_infer_string): ser2 = Series(dtype=float) result = pd.concat([ser1, ser2], axis=1) - expected = pd.DataFrame( + expected = DataFrame( data=[ (0.0, None), ], From 458543ff068613fe9c8795d103ad7bad57704a83 Mon Sep 17 00:00:00 2001 From: adrien pacifico Date: Thu, 17 Jul 2025 21:43:17 +0200 Subject: [PATCH 04/10] - correct test --- pandas/tests/reshape/concat/test_append_common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index d0ff950e7985f..66e64f540cf43 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -743,7 +743,13 @@ def test_categorical_concat_append(self): df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) res = pd.concat([df, df_different_categories], ignore_index=True) - exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) + exp = DataFrame( + { + "cats": Categorical(list("abab"), categories=["a", "b", "c"]), + "vals": [1, 2, 1, 2], + } + ) # I do not agree with the test made in #37243 + tm.assert_frame_equal(res, exp) res = df._append(df_different_categories, ignore_index=True) From 21c85e3bb984ccf5e3c4027974005292a332aade Mon Sep 17 00:00:00 2001 From: adrien pacifico Date: Thu, 17 Jul 2025 22:11:23 +0200 Subject: [PATCH 05/10] BUG: Improve categorical handling in concat_compat to respect tests, correct wrong tests --- pandas/core/dtypes/concat.py | 5 ++++- pandas/tests/reshape/concat/test_append_common.py | 10 ++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index dffc208c4e8eb..3907e59d55751 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -75,7 +75,10 @@ def concat_compat( and all(isinstance(arr.dtype, CategoricalDtype) for arr in to_concat) and axis == 0 ): - return union_categoricals(to_concat) + return union_categoricals( + to_concat, sort_categories=True + ) # Performance cost, but necessary to keep tests passing. + # see pandas/tests/reshape/concat/test_append_common.py:498 if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]): # fastpath! obj = to_concat[0] diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 66e64f540cf43..edcbb6fae7be7 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -486,15 +486,17 @@ def test_concat_categorical(self): s1 = Series([3, 2], dtype="category") s2 = Series([2, 1], dtype="category") - exp = Series([3, 2, 2, 1]) + exp = Series([3, 2, 2, 1], dtype="category") # should remain category tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) # completely different categories (same dtype) => not-category - s1 = Series([10, 11, np.nan], dtype="category") - s2 = Series([np.nan, 1, 3, 2], dtype="category") + s1 = Series([10.0, 11.0, np.nan], dtype="category") + s2 = Series([np.nan, 1.0, 3.0, 2.0], dtype="category") - exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64) + exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64).astype( + "category" + ) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) From ce7b5185bfe43ef28b85b437b5477e357a433d4e Mon Sep 17 00:00:00 2001 From: adrien pacifico Date: Thu, 17 Jul 2025 23:25:10 +0200 Subject: [PATCH 06/10] - change test from numpy to categorical_equal assertion - change concat function to handle empty elements --- pandas/core/dtypes/concat.py | 32 ++++++++++++++++++++---------- pandas/tests/dtypes/test_concat.py | 2 +- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 3907e59d55751..28d7444f0c7dd 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -69,16 +69,7 @@ def concat_compat( ------- a single array, preserving the combined dtypes """ - # Special handling for categorical arrays solves #51362 - if ( - len(to_concat) - and all(isinstance(arr.dtype, CategoricalDtype) for arr in to_concat) - and axis == 0 - ): - return union_categoricals( - to_concat, sort_categories=True - ) # Performance cost, but necessary to keep tests passing. - # see pandas/tests/reshape/concat/test_append_common.py:498 + if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]): # fastpath! obj = to_concat[0] @@ -102,6 +93,27 @@ def concat_compat( to_concat_eas, axis=axis, # type: ignore[call-arg] ) + # Special handling for categorical arrays solves #51362 + if ( + len(to_concat) + and all(isinstance(arr.dtype, CategoricalDtype) for arr in to_concat) + and axis == 0 + ): + # Filter out empty arrays before union, similar to non_empties logic + non_empty_categoricals = [x for x in to_concat if _is_nonempty(x, axis)] + + if len(non_empty_categoricals) == 0: + # All arrays are empty, return the first one (they're all categorical) + return to_concat[0] + elif len(non_empty_categoricals) == 1: + # Only one non-empty array, return it directly + return non_empty_categoricals[0] + else: + # Multiple non-empty arrays, use union_categoricals + return union_categoricals( + non_empty_categoricals, sort_categories=True + ) # Performance cost, but necessary to keep tests passing. + # see pandas/tests/reshape/concat/test_append_common.py:498 # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 2c3bce033ac4f..672c536cd9845 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -17,7 +17,7 @@ def test_concat_mismatched_categoricals_with_empty(): result = _concat.concat_compat([ser1._values, ser2._values]) expected = pd.concat([ser1, ser2])._values - tm.assert_numpy_array_equal(result, expected) + tm.assert_categorical_equal(result, expected) def test_concat_single_dataframe_tz_aware(): From 1e3037dcae86bb6ee5a30375b69b6e94da7784c8 Mon Sep 17 00:00:00 2001 From: adrien pacifico Date: Thu, 17 Jul 2025 23:53:13 +0200 Subject: [PATCH 07/10] modify test to respect : An empty categorical, and a categorical should concat as a categorical. --- pandas/tests/reshape/concat/test_append_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index edcbb6fae7be7..55d1189be90e9 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -696,7 +696,7 @@ def test_concat_categorical_empty(self): s1 = Series([], dtype="category") s2 = Series([1, 2], dtype="category") - exp = s2.astype(object) + exp = s2 tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) From 325591017fe12568547fe45694d544d36b4eea26 Mon Sep 17 00:00:00 2001 From: adrien pacifico Date: Fri, 18 Jul 2025 00:32:02 +0200 Subject: [PATCH 08/10] - update tests for categorical values --- .../tests/reshape/concat/test_categorical.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 8e6a14e6bfb8f..acebbf6951109 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -7,6 +7,7 @@ import pandas as pd from pandas import ( Categorical, + CategoricalIndex, DataFrame, Series, ) @@ -75,13 +76,13 @@ def test_concat_categoricalindex(self): # GH 16111, categories that aren't lexsorted categories = [9, 0, 1, 2, 3] - a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) - b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) - c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) + a = Series(1, index=CategoricalIndex([9, 0], categories=categories)) + b = Series(2, index=CategoricalIndex([0, 1], categories=categories)) + c = Series(3, index=CategoricalIndex([1, 2], categories=categories)) result = pd.concat([a, b, c], axis=1) - exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) + exp_idx = CategoricalIndex([9, 0, 1, 2], categories=categories) exp = DataFrame( { 0: [1, 1, np.nan, np.nan], @@ -99,7 +100,7 @@ def test_categorical_concat_preserve(self): s = Series(list("abc"), dtype="category") s2 = Series(list("abd"), dtype="category") - exp = Series(list("abcabd")) + exp = Series(list("abcabd"), dtype="category") res = pd.concat([s, s2], ignore_index=True) tm.assert_series_equal(res, exp) @@ -147,8 +148,8 @@ def test_categorical_index_preserver(self): result = pd.concat([df2, df3]) expected = pd.concat( [ - df2.set_axis(df2.index.astype(object), axis=0), - df3.set_axis(df3.index.astype(object), axis=0), + df2.set_axis(df2.index.astype("category"), axis=0), + df3.set_axis(df3.index.astype("category"), axis=0), ] ) tm.assert_frame_equal(result, expected) @@ -179,7 +180,8 @@ def test_concat_categorical_datetime(self): result = pd.concat([df1, df2]) expected = DataFrame( - {"x": Series([datetime(2021, 1, 1), datetime(2021, 1, 2)])} + {"x": Series([datetime(2021, 1, 1), datetime(2021, 1, 2)])}, + dtype="category", ) tm.assert_equal(result, expected) @@ -227,7 +229,9 @@ def test_categorical_index_upcast(self): b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"])) res = pd.concat([a, b]) - exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"]) + exp = DataFrame( + {"foo": [1, 2, 4, 3]}, index=Categorical(["foo", "bar", "baz", "bar"]) + ) tm.assert_equal(res, exp) @@ -235,7 +239,7 @@ def test_categorical_index_upcast(self): b = Series([4, 3], index=Categorical(["baz", "bar"])) res = pd.concat([a, b]) - exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"]) + exp = Series([1, 2, 4, 3], index=Categorical(["foo", "bar", "baz", "bar"])) tm.assert_equal(res, exp) @@ -257,9 +261,9 @@ def test_categorical_missing_from_one_frame(self): def test_concat_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/24845 - c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False) - c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False) - c3 = pd.CategoricalIndex( + c1 = CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False) + c2 = CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False) + c3 = CategoricalIndex( ["a", "a", "b", "b"], categories=["a", "b"], ordered=False ) From 109e3d08ea40c4871274c306da55a98cac093107 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 22 Jul 2025 13:14:33 +0200 Subject: [PATCH 09/10] Restore test_categorical.py to commit 6537afe3 --- .../tests/reshape/concat/test_categorical.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index acebbf6951109..8e6a14e6bfb8f 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -7,7 +7,6 @@ import pandas as pd from pandas import ( Categorical, - CategoricalIndex, DataFrame, Series, ) @@ -76,13 +75,13 @@ def test_concat_categoricalindex(self): # GH 16111, categories that aren't lexsorted categories = [9, 0, 1, 2, 3] - a = Series(1, index=CategoricalIndex([9, 0], categories=categories)) - b = Series(2, index=CategoricalIndex([0, 1], categories=categories)) - c = Series(3, index=CategoricalIndex([1, 2], categories=categories)) + a = Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) + b = Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) + c = Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) result = pd.concat([a, b, c], axis=1) - exp_idx = CategoricalIndex([9, 0, 1, 2], categories=categories) + exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) exp = DataFrame( { 0: [1, 1, np.nan, np.nan], @@ -100,7 +99,7 @@ def test_categorical_concat_preserve(self): s = Series(list("abc"), dtype="category") s2 = Series(list("abd"), dtype="category") - exp = Series(list("abcabd"), dtype="category") + exp = Series(list("abcabd")) res = pd.concat([s, s2], ignore_index=True) tm.assert_series_equal(res, exp) @@ -148,8 +147,8 @@ def test_categorical_index_preserver(self): result = pd.concat([df2, df3]) expected = pd.concat( [ - df2.set_axis(df2.index.astype("category"), axis=0), - df3.set_axis(df3.index.astype("category"), axis=0), + df2.set_axis(df2.index.astype(object), axis=0), + df3.set_axis(df3.index.astype(object), axis=0), ] ) tm.assert_frame_equal(result, expected) @@ -180,8 +179,7 @@ def test_concat_categorical_datetime(self): result = pd.concat([df1, df2]) expected = DataFrame( - {"x": Series([datetime(2021, 1, 1), datetime(2021, 1, 2)])}, - dtype="category", + {"x": Series([datetime(2021, 1, 1), datetime(2021, 1, 2)])} ) tm.assert_equal(result, expected) @@ -229,9 +227,7 @@ def test_categorical_index_upcast(self): b = DataFrame({"foo": [4, 3]}, index=Categorical(["baz", "bar"])) res = pd.concat([a, b]) - exp = DataFrame( - {"foo": [1, 2, 4, 3]}, index=Categorical(["foo", "bar", "baz", "bar"]) - ) + exp = DataFrame({"foo": [1, 2, 4, 3]}, index=["foo", "bar", "baz", "bar"]) tm.assert_equal(res, exp) @@ -239,7 +235,7 @@ def test_categorical_index_upcast(self): b = Series([4, 3], index=Categorical(["baz", "bar"])) res = pd.concat([a, b]) - exp = Series([1, 2, 4, 3], index=Categorical(["foo", "bar", "baz", "bar"])) + exp = Series([1, 2, 4, 3], index=["foo", "bar", "baz", "bar"]) tm.assert_equal(res, exp) @@ -261,9 +257,9 @@ def test_categorical_missing_from_one_frame(self): def test_concat_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/24845 - c1 = CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False) - c2 = CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False) - c3 = CategoricalIndex( + c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False) + c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False) + c3 = pd.CategoricalIndex( ["a", "a", "b", "b"], categories=["a", "b"], ordered=False ) From db99fb5e8a5924095abebb747a26e5031b667b79 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 22 Jul 2025 14:23:42 +0200 Subject: [PATCH 10/10] Restore test_append_common.py to commit 6537afe3 --- .../reshape/concat/test_append_common.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 55d1189be90e9..d0ff950e7985f 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -486,17 +486,15 @@ def test_concat_categorical(self): s1 = Series([3, 2], dtype="category") s2 = Series([2, 1], dtype="category") - exp = Series([3, 2, 2, 1], dtype="category") # should remain category + exp = Series([3, 2, 2, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) # completely different categories (same dtype) => not-category - s1 = Series([10.0, 11.0, np.nan], dtype="category") - s2 = Series([np.nan, 1.0, 3.0, 2.0], dtype="category") + s1 = Series([10, 11, np.nan], dtype="category") + s2 = Series([np.nan, 1, 3, 2], dtype="category") - exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64).astype( - "category" - ) + exp = Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype=np.float64) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) @@ -696,7 +694,7 @@ def test_concat_categorical_empty(self): s1 = Series([], dtype="category") s2 = Series([1, 2], dtype="category") - exp = s2 + exp = s2.astype(object) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) @@ -745,13 +743,7 @@ def test_categorical_concat_append(self): df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) res = pd.concat([df, df_different_categories], ignore_index=True) - exp = DataFrame( - { - "cats": Categorical(list("abab"), categories=["a", "b", "c"]), - "vals": [1, 2, 1, 2], - } - ) # I do not agree with the test made in #37243 - + exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) tm.assert_frame_equal(res, exp) res = df._append(df_different_categories, ignore_index=True)