From 200c336ac5a434f989c25a015e5650770fefea3d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 17 Jul 2025 10:16:33 +0200 Subject: [PATCH 1/4] Output formatting: preserve quoting for string categories --- pandas/core/arrays/categorical.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3d2ad109a55ba..6d055f9ddfadd 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2233,8 +2233,14 @@ def _repr_categories(self) -> list[str]: ) from pandas.io.formats import format as fmt + formatter = None + if self.categories.dtype == "str": + # the extension array formatter defaults to boxed=True in format_array + # override here to boxed=False to be consistent with QUOTE_NONNUMERIC + formatter = self.categories._values._formatter(boxed=False) + format_array = partial( - fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC + fmt.format_array, formatter=formatter, quoting=QUOTE_NONNUMERIC ) if len(self.categories) > max_categories: num = max_categories // 2 From 3e264d672c25c6f4ab7270ff74e6cc770fa67e97 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 23 Jul 2025 23:12:09 +0200 Subject: [PATCH 2/4] update tests --- pandas/tests/arrays/categorical/test_repr.py | 32 ++----- .../indexes/categorical/test_category.py | 4 - .../tests/indexes/categorical/test_formats.py | 90 +++---------------- pandas/tests/series/test_formats.py | 39 +++----- pandas/tests/util/test_assert_series_equal.py | 17 +--- 5 files changed, 39 insertions(+), 143 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index a82ba24a2c732..60af3bafb62b2 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -16,16 +16,11 @@ class TestCategoricalReprWithFactor: def test_print(self, using_infer_string): factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) - if using_infer_string: - expected = [ - "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, str): [a < b < c]", - ] - else: - expected = [ - "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, object): ['a' < 'b' < 'c']", - ] + dtype = "str" if using_infer_string else "object" + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + f"Categories (3, {dtype}): ['a' < 'b' < 'c']", + ] expected = "\n".join(expected) actual = repr(factor) assert actual == expected @@ -82,10 +77,7 @@ def test_unicode_print(self, using_infer_string): Categories (3, object): ['aaaaa', 'bb', 'cccc']""" if using_infer_string: - expected = expected.replace( - "(3, object): ['aaaaa', 'bb', 'cccc']", - "(3, str): [aaaaa, bb, cccc]", - ) + expected = expected.replace("object", "str") assert repr(c) == expected @@ -96,10 +88,7 @@ def test_unicode_print(self, using_infer_string): Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 if using_infer_string: - expected = expected.replace( - "(3, object): ['ああああ', 'いいいいい', 'ううううううう']", - "(3, str): [ああああ, いいいいい, ううううううう]", - ) + expected = expected.replace("object", "str") assert repr(c) == expected @@ -112,12 +101,9 @@ def test_unicode_print(self, using_infer_string): Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 if using_infer_string: - expected = expected.replace( - "(3, object): ['ああああ', 'いいいいい', 'ううううううう']", - "(3, str): [ああああ, いいいいい, ううううううう]", - ) + expected = expected.replace("object", "str") - assert repr(c) == expected + assert repr(c) == expected def test_categorical_repr(self): c = Categorical([1, 2, 3]) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 262b043adaf58..58e4649cb331b 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -199,8 +197,6 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) - # TODO(3.0): remove this test once using_string_dtype() is always True - @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 2308a62bc44a4..aed1082de88cc 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -8,125 +8,78 @@ class TestCategoricalIndexReprStringCategories: - def test_string_categorical_index_repr(self, using_infer_string): + def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['a', 'bb', 'ccc']", - "categories=[a, bb, ccc]", - ) assert repr(idx) == expected - def test_categorical_index_repr_multiline(self, using_infer_string): + def test_categorical_index_repr_multiline(self): # multiple lines idx = CategoricalIndex(["a", "bb", "ccc"] * 10) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['a', 'bb', 'ccc']", - "categories=[a, bb, ccc]", - ) assert repr(idx) == expected - def test_categorical_index_repr_truncated(self, using_infer_string): + def test_categorical_index_repr_truncated(self): # truncated idx = CategoricalIndex(["a", "bb", "ccc"] * 100) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['a', 'bb', 'ccc']", - "categories=[a, bb, ccc]", - ) assert repr(idx) == expected - def test_categorical_index_repr_many_categories(self, using_infer_string): + def test_categorical_index_repr_many_categories(self): # larger categories idx = CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o']", - "categories=[a, b, c, d, ..., k, l, m, o]", - ) assert repr(idx) == expected - def test_categorical_index_repr_unicode(self, using_infer_string): + def test_categorical_index_repr_unicode(self): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['あ', 'いい', 'ううう']", - "categories=[あ, いい, ううう]", - ) assert repr(idx) == expected - def test_categorical_index_repr_unicode_multiline(self, using_infer_string): + def test_categorical_index_repr_unicode_multiline(self): # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['あ', 'いい', 'ううう']", - "categories=[あ, いい, ううう]", - ) assert repr(idx) == expected - def test_categorical_index_repr_unicode_truncated(self, using_infer_string): + def test_categorical_index_repr_unicode_truncated(self): # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['あ', 'いい', 'ううう']", - "categories=[あ, いい, ううう]", - ) assert repr(idx) == expected - def test_categorical_index_repr_unicode_many_categories(self, using_infer_string): + def test_categorical_index_repr_unicode_many_categories(self): # larger categories idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ']", - "categories=[あ, い, う, え, ..., し, す, せ, そ]", - ) assert repr(idx) == expected - def test_categorical_index_repr_east_asian_width(self, using_infer_string): + def test_categorical_index_repr_east_asian_width(self): with cf.option_context("display.unicode.east_asian_width", True): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['あ', 'いい', 'ううう']", - "categories=[あ, いい, ううう]", - ) assert repr(idx) == expected - def test_categorical_index_repr_east_asian_width_multiline( - self, using_infer_string - ): + def test_categorical_index_repr_east_asian_width_multiline(self): with cf.option_context("display.unicode.east_asian_width", True): # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) @@ -136,16 +89,9 @@ def test_categorical_index_repr_east_asian_width_multiline( 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['あ', 'いい', 'ううう']", - "categories=[あ, いい, ううう]", - ) assert repr(idx) == expected - def test_categorical_index_repr_east_asian_width_truncated( - self, using_infer_string - ): + def test_categorical_index_repr_east_asian_width_truncated(self): with cf.option_context("display.unicode.east_asian_width", True): # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) @@ -156,25 +102,13 @@ def test_categorical_index_repr_east_asian_width_truncated( 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['あ', 'いい', 'ううう']", - "categories=[あ, いい, ううう]", - ) assert repr(idx) == expected - def test_categorical_index_repr_east_asian_width_many_categories( - self, using_infer_string - ): + def test_categorical_index_repr_east_asian_width_many_categories(self): with cf.option_context("display.unicode.east_asian_width", True): idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 - if using_infer_string: - expected = expected.replace( - "categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ']", - "categories=[あ, い, う, え, ..., し, す, せ, そ]", - ) assert repr(idx) == expected diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index eb81840f6f8f9..4242c57e05083 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -309,38 +309,27 @@ def test_categorical_repr(self, using_infer_string): assert exp == a.__str__() a = Series(Categorical(["a", "b"] * 25)) + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" + ) if using_infer_string: - exp = ( - "0 a\n1 b\n" - " ..\n" - "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, str): [a, b]" - ) - else: - exp = ( - "0 a\n1 b\n" - " ..\n" - "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" - ) + exp = exp.replace("object", "str") with option_context("display.max_rows", 5): assert exp == repr(a) levs = list("abcdefghijklmnopqrstuvwxyz") a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... " + "'w' < 'x' < 'y' < 'z']" + ) if using_infer_string: - exp = ( - "0 a\n1 b\n" - "dtype: category\n" - "Categories (26, str): [a < b < c < d ... w < x < y < z]" - ) - else: - exp = ( - "0 a\n1 b\n" - "dtype: category\n" - "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... " - "'w' < 'x' < 'y' < 'z']" - ) + exp = exp.replace("object", "str") assert exp == a.__str__() def test_categorical_series_repr(self): diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index c3cd90f2edfb3..8c9fff8e6ae2d 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -214,24 +214,15 @@ def test_series_equal_numeric_values_mismatch(rtol): def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): - if using_infer_string: - msg = """Series are different - -Series values are different \\(66\\.66667 %\\) -\\[index\\]: \\[0, 1, 2\\] -\\[left\\]: \\['a', 'b', 'c'\\] -Categories \\(3, str\\): \\[a, b, c\\] -\\[right\\]: \\['a', 'c', 'b'\\] -Categories \\(3, str\\): \\[a, b, c\\]""" - else: - msg = """Series are different + dtype = "str" if using_infer_string else "object" + msg = f"""Series are different Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\['a', 'b', 'c'\\] -Categories \\(3, object\\): \\['a', 'b', 'c'\\] +Categories \\(3, {dtype}\\): \\['a', 'b', 'c'\\] \\[right\\]: \\['a', 'c', 'b'\\] -Categories \\(3, object\\): \\['a', 'b', 'c'\\]""" +Categories \\(3, {dtype}\\): \\['a', 'b', 'c'\\]""" s1 = Series(Categorical(["a", "b", "c"])) s2 = Series(Categorical(["a", "c", "b"])) From 571aee8ce91d02d101c789166b3c9a71c27a70b1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 23 Jul 2025 23:41:39 +0200 Subject: [PATCH 3/4] typing --- pandas/core/arrays/categorical.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 44d176257fbd3..414f9cce01887 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2237,7 +2237,8 @@ def _repr_categories(self) -> list[str]: if self.categories.dtype == "str": # the extension array formatter defaults to boxed=True in format_array # override here to boxed=False to be consistent with QUOTE_NONNUMERIC - formatter = self.categories._values._formatter(boxed=False) + arr: ExtensionArray = self.categories._values + formatter = arr._formatter(boxed=False) format_array = partial( fmt.format_array, formatter=formatter, quoting=QUOTE_NONNUMERIC From 309ba5f93194f8df62b88866f6444f5403ad061a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 24 Jul 2025 00:06:24 +0200 Subject: [PATCH 4/4] typing --- pandas/core/arrays/categorical.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 414f9cce01887..768477d55e883 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2237,8 +2237,9 @@ def _repr_categories(self) -> list[str]: if self.categories.dtype == "str": # the extension array formatter defaults to boxed=True in format_array # override here to boxed=False to be consistent with QUOTE_NONNUMERIC - arr: ExtensionArray = self.categories._values - formatter = arr._formatter(boxed=False) + formatter = cast(ExtensionArray, self.categories._values)._formatter( + boxed=False + ) format_array = partial( fmt.format_array, formatter=formatter, quoting=QUOTE_NONNUMERIC