From f55c96879c3441b0aa9409da28b96b87ce839a57 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Fri, 15 Oct 2021 00:23:35 -0700 Subject: [PATCH 01/23] BUG: sort_index did not respect ignore_index when not sorting --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/generic.py | 9 ++++++++- pandas/tests/test_sorting.py | 9 +++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 243bcf6900d2e..b10ebd350459a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -430,6 +430,7 @@ Indexing - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`) - Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`) - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) +- Bug in :meth:`DataFrame.sort_index` where `ignore_index=True` was not being respected when the passed dataframe was already sorted (:issue:`43591`) Missing diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 26c0b7426727c..5d5765977c49b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4590,10 +4590,17 @@ def sort_index( ) if indexer is None: + if inplace: + result = self + else: + result = self.copy() + + if ignore_index: + result.index = default_index(len(self)) if inplace: return else: - return self.copy() + return result baxis = self._get_block_manager_axis(axis) new_data = self._mgr.take(indexer, axis=baxis, verify=False) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index a49b7c2b7f86e..17c1703831e3f 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -478,3 +478,12 @@ def test_mixed_str_nan(): result = safe_sort(values) expected = np.array([np.nan, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) + +def test_respect_ignore_index(): + # GH 43591 + df = DataFrame({'a': [1, 2, 3]}) + df.index = [4, 2, 0] + result = df.sort_index(ascending=False, ignore_index=True) + expected = DataFrame({'a': [1, 2, 3]}) + expected.index = [0, 1, 2] + tm.assert_frame_equal(result, expected) \ No newline at end of file From e56f8fb6b8d1a8db1f18f5a52a1337824a2277e6 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Fri, 15 Oct 2021 00:26:24 -0700 Subject: [PATCH 02/23] BUG: sort_index did not respect ignore_index when not sorting --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index b10ebd350459a..780e6dddb47f0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -430,7 +430,7 @@ Indexing - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`) - Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`) - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) -- Bug in :meth:`DataFrame.sort_index` where `ignore_index=True` was not being respected when the passed dataframe was already sorted (:issue:`43591`) +- Bug in :meth:`DataFrame.sort_index` where `ignore_index=True` was not being respected when passed dataframe was already sorted (:issue:`43591`) Missing From 282ef5148fa2242d667be777f117e2273908e4c6 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Fri, 15 Oct 2021 00:37:07 -0700 Subject: [PATCH 03/23] BUG: sort_index did not respect ignore_index when not sorting --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 780e6dddb47f0..b10ebd350459a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -430,7 +430,7 @@ Indexing - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`) - Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`) - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) -- Bug in :meth:`DataFrame.sort_index` where `ignore_index=True` was not being respected when passed dataframe was already sorted (:issue:`43591`) +- Bug in :meth:`DataFrame.sort_index` where `ignore_index=True` was not being respected when the passed dataframe was already sorted (:issue:`43591`) Missing From 2c5402ea64729473367e8c19cee1949a9cf9ce5d Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Fri, 15 Oct 2021 00:38:56 -0700 Subject: [PATCH 04/23] BUG: sort_index did not respect ignore_index when not sorting --- pandas/tests/test_sorting.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 17c1703831e3f..c75815856c9ff 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -479,11 +479,12 @@ def test_mixed_str_nan(): expected = np.array([np.nan, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) + def test_respect_ignore_index(): # GH 43591 - df = DataFrame({'a': [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) df.index = [4, 2, 0] result = df.sort_index(ascending=False, ignore_index=True) - expected = DataFrame({'a': [1, 2, 3]}) + expected = DataFrame({"a": [1, 2, 3]}) expected.index = [0, 1, 2] - tm.assert_frame_equal(result, expected) \ No newline at end of file + tm.assert_frame_equal(result, expected) From 837427bd96fee168ff77e76e8e730a50adbc6c10 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Sat, 16 Oct 2021 19:56:06 -0700 Subject: [PATCH 05/23] moved test to frame test directory --- pandas/tests/frame/methods/test_sort_index.py | 8 ++++++++ pandas/tests/test_sorting.py | 10 ---------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index c1141f705acbc..c78a6d015cc16 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -9,6 +9,7 @@ Index, IntervalIndex, MultiIndex, + RangeIndex, Series, Timestamp, ) @@ -418,6 +419,13 @@ def test_sort_index_ignore_index( tm.assert_frame_equal(result_df, expected_df) tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) + def test_respect_ignore_index(self): + # GH 43591 + df = DataFrame({"a": [1, 2, 3]}, index=RangeIndex(4, -1, -2)) + result = df.sort_index(ascending=False, ignore_index=True) + expected = DataFrame({"a": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize( "original_dict, sorted_dict, ascending, ignore_index, output_index", diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index c75815856c9ff..a49b7c2b7f86e 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -478,13 +478,3 @@ def test_mixed_str_nan(): result = safe_sort(values) expected = np.array([np.nan, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) - - -def test_respect_ignore_index(): - # GH 43591 - df = DataFrame({"a": [1, 2, 3]}) - df.index = [4, 2, 0] - result = df.sort_index(ascending=False, ignore_index=True) - expected = DataFrame({"a": [1, 2, 3]}) - expected.index = [0, 1, 2] - tm.assert_frame_equal(result, expected) From 7523b1b84cc30898a8d8b3acff34033682e54897 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 18 Oct 2021 21:39:59 -0700 Subject: [PATCH 06/23] parameterized over inplace and ignore_index --- pandas/tests/frame/methods/test_sort_index.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index c78a6d015cc16..71822628473f4 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -419,11 +419,22 @@ def test_sort_index_ignore_index( tm.assert_frame_equal(result_df, expected_df) tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) - def test_respect_ignore_index(self): + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("ignore_index", [True, False]) + def test_respect_ignore_index(self, inplace, ignore_index): # GH 43591 df = DataFrame({"a": [1, 2, 3]}, index=RangeIndex(4, -1, -2)) - result = df.sort_index(ascending=False, ignore_index=True) - expected = DataFrame({"a": [1, 2, 3]}) + result = df.sort_index( + ascending=False, ignore_index=ignore_index, inplace=inplace + ) + + if inplace: + result = df + if ignore_index: + expected = DataFrame({"a": [1, 2, 3]}) + else: + expected = DataFrame({"a": [1, 2, 3]}, index=RangeIndex(4, -1, -2)) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("inplace", [True, False]) From e1a0aa73efa638d2f71623be1dd011808309fc4e Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Sun, 24 Oct 2021 20:58:45 -0700 Subject: [PATCH 07/23] BUG fix split --- pandas/core/strings/accessor.py | 12 +++++++---- pandas/core/strings/object_array.py | 21 +++++++++++++------- pandas/tests/strings/test_split_partition.py | 18 +++++++++++++++++ 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 4ea29edb7d41b..bd41a67439b8b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -657,7 +657,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): Parameters ---------- - pat : str, optional + pat : str, or compiled regex optional String or regular expression to split on. If not specified, split on whitespace. n : int, default -1 (all) @@ -668,7 +668,11 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): * If ``True``, return DataFrame/MultiIndex expanding dimensionality. * If ``False``, return Series/Index, containing lists of strings. - + regex : bool, default None + * If ``True``, assumes the passed-in pattern is a regular expression + * If ``False``, treats the pattern as a literal string + * If ``None`` and the pattern length is 1, treats the pattern as a literal string + * If ``None`` and the pattern length is not 1, treats the pattern as a regular expression Returns ------- Series, Index, DataFrame or MultiIndex @@ -784,8 +788,8 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): - result = self._data.array._str_split(pat, n, expand) + def split(self, pat: str | re.Pattern = None, n=-1, expand=False, regex: bool | None = None): + result = self._data.array._str_split(pat, n, expand, regex) return self._wrap_result(result, returns_string=expand, expand=expand) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 76ee55ef5f9ad..6299ce52628cd 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -308,21 +308,28 @@ def f(x): return self._str_map(f) - def _str_split(self, pat=None, n=-1, expand=False): + def _str_split(self, pat: str | re.Pattern = None, n=-1, expand=False, regex: bool = None): if pat is None: if n is None or n == 0: n = -1 f = lambda x: x.split(pat, n) else: - if len(pat) == 1: + if regex is not None: + new_pat = pat + if regex is None: + if len(pat) == 1: + new_pat = pat + else: + new_pat = re.compile(pat) + + if isinstance(new_pat, re.Pattern): + if n is None or n == -1: + n = 0 + f = lambda x: new_pat.split(x, maxsplit=n) + else: if n is None or n == 0: n = -1 f = lambda x: x.split(pat, n) - else: - if n is None or n == -1: - n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) return self._str_map(f, dtype=object) def _str_rsplit(self, pat=None, n=-1): diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index f3f5acd0d2f1c..56f3144004137 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -34,6 +34,24 @@ def test_split(any_string_dtype): exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) + # explicit regex = True split + values = Series('qweqwejpgqweqwe.jpg', dtype=any_string_dtype) + result = values.str.split('\.jpg', regex=True) + exp = Series([['qweqwejpgqweqwe.jpg']]) + tm.assert_series_equal(result, exp) + # explicit regex = False split + result = values.str.split('\.jpg', regex=False) + exp = Series([['qweqwejpgqweqwe.jpg']]) + tm.assert_series_equal(result, exp) + # non explicit regex split, pattern length == 1 + result = values.str.split('.') + exp = Series([['qweqwejpgqweqwe','jpg']]) + tm.assert_series_equal(result, exp) + # non explicit regex split, pattern length != 1 + result = values.str.split('.jpg') + exp = Series([['qweqw','qweqwe', '']]) + tm.assert_series_equal(result, exp) + def test_split_object_mixed(): mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) From d7b3d8e61512f151aba22c36b2341337afde7e5c Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Sun, 24 Oct 2021 22:51:23 -0700 Subject: [PATCH 08/23] ENH: added regex argument to Series.str.split --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/strings/accessor.py | 35 ++++++++++++++------ pandas/core/strings/object_array.py | 6 ++-- pandas/tests/strings/test_split_partition.py | 10 +++--- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 96daf1d825753..f2a5b06352f0a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -177,7 +177,7 @@ Other enhancements - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) -- +- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is `None` (:issue:`43563`, , :issue:`37963`, :issue: `32835`, :issue: `25549`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index bd41a67439b8b..b361955b1b628 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -774,16 +774,31 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): 1 https://docs.python.org/3/tutorial index.html 2 NaN NaN - Remember to escape special characters when explicitly using regular - expressions. - - >>> s = pd.Series(["1+1=2"]) - >>> s - 0 1+1=2 - dtype: object - >>> s.str.split(r"\+|=", expand=True) - 0 1 2 - 0 1 1 2 + When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled as a + regex only if ``len(pat) != 1``. + + >>> s = pd.Series(['foojpgbar.jpg']) + >>> s = s.str.split(".", expand=True) + 0 1 + 0 foojpgbar jpg + >>> s.str.split("\.jpg", expand=True) + 0 1 + 0 foojpgbar + >>> s.str.split(".jpg", expand=True) + 0 1 2 + 0 fo bar + + When ``regex=True``, `pat` is interpreted as a regex + + >>> s.str.split("\.jpg", regex=True, expand=True) + 0 1 + 0 foojpgbar + + When ``regex=False``, `pat` is interpreted as the string itself + + >>> s.str.split("\.jpg", regex=False, expand=True) + 0 + 0 foojpgbar.jpg """ @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 6299ce52628cd..13c7d74462e93 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -314,9 +314,11 @@ def _str_split(self, pat: str | re.Pattern = None, n=-1, expand=False, regex: bo n = -1 f = lambda x: x.split(pat, n) else: - if regex is not None: + if regex is True: + new_pat = re.compile(pat) + elif regex is False: new_pat = pat - if regex is None: + else: if len(pat) == 1: new_pat = pat else: diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 56f3144004137..3a3c69cd39eac 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -36,19 +36,19 @@ def test_split(any_string_dtype): # explicit regex = True split values = Series('qweqwejpgqweqwe.jpg', dtype=any_string_dtype) - result = values.str.split('\.jpg', regex=True) - exp = Series([['qweqwejpgqweqwe.jpg']]) + result = values.str.split(r'\.jpg', regex=True) + exp = Series([['qweqwejpgqweqwe', '']]) tm.assert_series_equal(result, exp) # explicit regex = False split - result = values.str.split('\.jpg', regex=False) + result = values.str.split(r'\.jpg', regex=False) exp = Series([['qweqwejpgqweqwe.jpg']]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length == 1 - result = values.str.split('.') + result = values.str.split(r'.') exp = Series([['qweqwejpgqweqwe','jpg']]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length != 1 - result = values.str.split('.jpg') + result = values.str.split(r'.jpg') exp = Series([['qweqw','qweqwe', '']]) tm.assert_series_equal(result, exp) From 20dc2a6edbe4c5fa4d4c5b014b6ff7a54596fd4b Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 25 Oct 2021 15:43:29 -0700 Subject: [PATCH 09/23] format change --- pandas/core/strings/accessor.py | 11 ++++++----- pandas/tests/strings/test_split_partition.py | 3 +++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index b361955b1b628..842cf21c0a5ee 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -673,6 +673,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): * If ``False``, treats the pattern as a literal string * If ``None`` and the pattern length is 1, treats the pattern as a literal string * If ``None`` and the pattern length is not 1, treats the pattern as a regular expression + Returns ------- Series, Index, DataFrame or MultiIndex @@ -778,25 +779,25 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): regex only if ``len(pat) != 1``. >>> s = pd.Series(['foojpgbar.jpg']) - >>> s = s.str.split(".", expand=True) + >>> s.str.split(r".", expand=True) 0 1 0 foojpgbar jpg - >>> s.str.split("\.jpg", expand=True) + >>> s.str.split(r"\.jpg", expand=True) 0 1 0 foojpgbar - >>> s.str.split(".jpg", expand=True) + >>> s.str.split(r".jpg", expand=True) 0 1 2 0 fo bar When ``regex=True``, `pat` is interpreted as a regex - >>> s.str.split("\.jpg", regex=True, expand=True) + >>> s.str.split(r"\.jpg", regex=True, expand=True) 0 1 0 foojpgbar When ``regex=False``, `pat` is interpreted as the string itself - >>> s.str.split("\.jpg", regex=False, expand=True) + >>> s.str.split(r"\.jpg", regex=False, expand=True) 0 0 foojpgbar.jpg """ diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 3a3c69cd39eac..353bef259c4ca 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -39,14 +39,17 @@ def test_split(any_string_dtype): result = values.str.split(r'\.jpg', regex=True) exp = Series([['qweqwejpgqweqwe', '']]) tm.assert_series_equal(result, exp) + # explicit regex = False split result = values.str.split(r'\.jpg', regex=False) exp = Series([['qweqwejpgqweqwe.jpg']]) tm.assert_series_equal(result, exp) + # non explicit regex split, pattern length == 1 result = values.str.split(r'.') exp = Series([['qweqwejpgqweqwe','jpg']]) tm.assert_series_equal(result, exp) + # non explicit regex split, pattern length != 1 result = values.str.split(r'.jpg') exp = Series([['qweqw','qweqwe', '']]) From 0b139f30f1058f7db594210a90c19f79b84a72d9 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 25 Oct 2021 15:48:53 -0700 Subject: [PATCH 10/23] resolve conflict --- doc/source/whatsnew/v1.4.0.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 467880efdf0f2..4c79f2f0a32f8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -530,11 +530,8 @@ Indexing - Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`) - Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`) - Bug in :meth:`DataFrame.sort_index` where ``ignore_index=True`` was not being respected when the index was already sorted (:issue:`43591`) -<<<<<<< HEAD -======= - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`) - ->>>>>>> upstream/master Missing ^^^^^^^ From 16049157e10e40af4a3913b3c481b66b00effe82 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 25 Oct 2021 19:10:02 -0700 Subject: [PATCH 11/23] ENH: added regex argument to Series.str.split --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/strings/accessor.py | 37 ++++++++++++-------- pandas/core/strings/object_array.py | 8 +++-- pandas/tests/strings/test_split_partition.py | 18 +++++----- 4 files changed, 39 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 4c79f2f0a32f8..2e8d0aee041ae 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -179,7 +179,7 @@ Other enhancements - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) -- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is `None` (:issue:`43563`, :issue:`37963`, :issue:`32835`) +- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`37963`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 842cf21c0a5ee..9755e8a9e62f1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -669,11 +669,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): * If ``True``, return DataFrame/MultiIndex expanding dimensionality. * If ``False``, return Series/Index, containing lists of strings. regex : bool, default None - * If ``True``, assumes the passed-in pattern is a regular expression + * If ``True``, assumes the passed-in pattern is a regular expression * If ``False``, treats the pattern as a literal string - * If ``None`` and the pattern length is 1, treats the pattern as a literal string - * If ``None`` and the pattern length is not 1, treats the pattern as a regular expression - + * If ``None`` and the pattern length is 1, treats the pattern as a + literal string + * If ``None`` and the pattern length is not 1, treats the pattern as + a regular expression + Returns ------- Series, Index, DataFrame or MultiIndex @@ -774,9 +776,10 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): 0 this is a regular sentence None 1 https://docs.python.org/3/tutorial index.html 2 NaN NaN - - When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled as a - regex only if ``len(pat) != 1``. + + Remember to escape special characters when explicitly using regular expressions. + When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled + as a regex only if ``len(pat) != 1``. >>> s = pd.Series(['foojpgbar.jpg']) >>> s.str.split(r".", expand=True) @@ -784,19 +787,19 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): 0 foojpgbar jpg >>> s.str.split(r"\.jpg", expand=True) 0 1 - 0 foojpgbar + 0 foojpgbar >>> s.str.split(r".jpg", expand=True) 0 1 2 - 0 fo bar + 0 fo bar When ``regex=True``, `pat` is interpreted as a regex - + >>> s.str.split(r"\.jpg", regex=True, expand=True) 0 1 - 0 foojpgbar - + 0 foojpgbar + When ``regex=False``, `pat` is interpreted as the string itself - + >>> s.str.split(r"\.jpg", regex=False, expand=True) 0 0 foojpgbar.jpg @@ -804,7 +807,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) @forbid_nonstring_types(["bytes"]) - def split(self, pat: str | re.Pattern = None, n=-1, expand=False, regex: bool | None = None): + def split( + self, + pat: str | re.Pattern | None = None, + n=-1, + expand=False, + regex: bool | None = None, + ): result = self._data.array._str_split(pat, n, expand, regex) return self._wrap_result(result, returns_string=expand, expand=expand) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 13c7d74462e93..5e71ae2eb73bf 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -308,16 +308,20 @@ def f(x): return self._str_map(f) - def _str_split(self, pat: str | re.Pattern = None, n=-1, expand=False, regex: bool = None): + def _str_split( + self, pat: str | re.Pattern | None = None, n=-1, expand=False, regex: bool | None = None + ): if pat is None: if n is None or n == 0: n = -1 f = lambda x: x.split(pat, n) else: - if regex is True: + new_pat: str | re.Pattern + if regex is True or isinstance(pat, re.Pattern): new_pat = re.compile(pat) elif regex is False: new_pat = pat + # regex is None so link to old behavior #43563 else: if len(pat) == 1: new_pat = pat diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 353bef259c4ca..7a4fa85840385 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -35,24 +35,24 @@ def test_split(any_string_dtype): tm.assert_series_equal(result, exp) # explicit regex = True split - values = Series('qweqwejpgqweqwe.jpg', dtype=any_string_dtype) - result = values.str.split(r'\.jpg', regex=True) - exp = Series([['qweqwejpgqweqwe', '']]) + values = Series("qweqwejpgqweqwe.jpg", dtype=any_string_dtype) + result = values.str.split(r"\.jpg", regex=True) + exp = Series([["qweqwejpgqweqwe", ""]]) tm.assert_series_equal(result, exp) # explicit regex = False split - result = values.str.split(r'\.jpg', regex=False) - exp = Series([['qweqwejpgqweqwe.jpg']]) + result = values.str.split(r"\.jpg", regex=False) + exp = Series([["qweqwejpgqweqwe.jpg"]]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length == 1 - result = values.str.split(r'.') - exp = Series([['qweqwejpgqweqwe','jpg']]) + result = values.str.split(r".") + exp = Series([["qweqwejpgqweqwe", "jpg"]]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length != 1 - result = values.str.split(r'.jpg') - exp = Series([['qweqw','qweqwe', '']]) + result = values.str.split(r".jpg") + exp = Series([["qweqw", "qweqwe", ""]]) tm.assert_series_equal(result, exp) From 8312d799858fc650d7bc4b2c1f44b1285850a7bb Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 25 Oct 2021 19:15:07 -0700 Subject: [PATCH 12/23] changed whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2e8d0aee041ae..2fc5f7506e8c8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -179,7 +179,7 @@ Other enhancements - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) -- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`37963`, :issue:`32835`, :issue:`25549`) +- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - From a82639cb1a9d1a04835ee85a68aafab8800da8dc Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 25 Oct 2021 19:23:33 -0700 Subject: [PATCH 13/23] fixed mypy error --- pandas/core/strings/accessor.py | 30 ++++++++++++++--------------- pandas/core/strings/object_array.py | 6 +++++- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9755e8a9e62f1..f091997e58922 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -670,11 +670,11 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): * If ``False``, return Series/Index, containing lists of strings. regex : bool, default None * If ``True``, assumes the passed-in pattern is a regular expression - * If ``False``, treats the pattern as a literal string + * If ``False``, treats the pattern as a literal string. * If ``None`` and the pattern length is 1, treats the pattern as a - literal string + literal string. * If ``None`` and the pattern length is not 1, treats the pattern as - a regular expression + a regular expression. Returns ------- @@ -776,33 +776,33 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): 0 this is a regular sentence None 1 https://docs.python.org/3/tutorial index.html 2 NaN NaN - + Remember to escape special characters when explicitly using regular expressions. When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled as a regex only if ``len(pat) != 1``. - >>> s = pd.Series(['foojpgbar.jpg']) + >>> s = pd.Series(['fooojpgbar.jpg']) >>> s.str.split(r".", expand=True) - 0 1 - 0 foojpgbar jpg + 0 1 + 0 fooojpgbar jpg >>> s.str.split(r"\.jpg", expand=True) - 0 1 - 0 foojpgbar + 0 1 + 0 fooojpgbar >>> s.str.split(r".jpg", expand=True) - 0 1 2 - 0 fo bar + 0 1 2 + 0 foo bar When ``regex=True``, `pat` is interpreted as a regex >>> s.str.split(r"\.jpg", regex=True, expand=True) - 0 1 - 0 foojpgbar + 0 1 + 0 fooojpgbar When ``regex=False``, `pat` is interpreted as the string itself >>> s.str.split(r"\.jpg", regex=False, expand=True) - 0 - 0 foojpgbar.jpg + 0 + 0 fooojpgbar.jpg """ @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 5e71ae2eb73bf..3081575f50700 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -309,7 +309,11 @@ def f(x): return self._str_map(f) def _str_split( - self, pat: str | re.Pattern | None = None, n=-1, expand=False, regex: bool | None = None + self, + pat: str | re.Pattern | None = None, + n=-1, + expand=False, + regex: bool | None = None, ): if pat is None: if n is None or n == 0: From 76e6001f4d241015f96dd64f5d97479c4f88f266 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 25 Oct 2021 21:29:08 -0700 Subject: [PATCH 14/23] more specific docs --- pandas/core/strings/accessor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index f091997e58922..c685b444450e1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -668,7 +668,12 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): * If ``True``, return DataFrame/MultiIndex expanding dimensionality. * If ``False``, return Series/Index, containing lists of strings. + regex : bool, default None + Determines whether to handle the pattern as a regular expression. + If ``pat`` is a compiled regular expression, it is interpreted as a + regular expression regardless of ``regex`` + * If ``True``, assumes the passed-in pattern is a regular expression * If ``False``, treats the pattern as a literal string. * If ``None`` and the pattern length is 1, treats the pattern as a From 2c43fb50433379cbcf1b52072d426edd9499aaea Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Wed, 27 Oct 2021 10:45:24 -0700 Subject: [PATCH 15/23] added example --- pandas/core/strings/accessor.py | 37 ++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c685b444450e1..a71989d4dffff 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -657,7 +657,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): Parameters ---------- - pat : str, or compiled regex optional + pat : str, or compiled regex, optional String or regular expression to split on. If not specified, split on whitespace. n : int, default -1 (all) @@ -676,10 +676,8 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): * If ``True``, assumes the passed-in pattern is a regular expression * If ``False``, treats the pattern as a literal string. - * If ``None`` and the pattern length is 1, treats the pattern as a - literal string. - * If ``None`` and the pattern length is not 1, treats the pattern as - a regular expression. + * If ``None`` and `pat` length is 1, treats `pat` as a literal string. + * If ``None`` and `pat` length is not 1, treats `pat` as a regular expression. Returns ------- @@ -783,31 +781,36 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): 2 NaN NaN Remember to escape special characters when explicitly using regular expressions. + + >>> s = pd.Series(["foo and bar plus baz"]) + >>> s.str.split(r"and|plus", expand=True) + 0 1 2 + 0 foo bar baz + + Regular expressions can be used to handle urls or file names. When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled as a regex only if ``len(pat) != 1``. - >>> s = pd.Series(['fooojpgbar.jpg']) + >>> s = pd.Series(['foojpgbar.jpg']) >>> s.str.split(r".", expand=True) - 0 1 - 0 fooojpgbar jpg + 0 1 + 0 foojpgbar jpg + >>> s.str.split(r"\.jpg", expand=True) - 0 1 - 0 fooojpgbar - >>> s.str.split(r".jpg", expand=True) - 0 1 2 - 0 foo bar + 0 1 + 0 foojpgbar When ``regex=True``, `pat` is interpreted as a regex >>> s.str.split(r"\.jpg", regex=True, expand=True) - 0 1 - 0 fooojpgbar + 0 1 + 0 foojpgbar When ``regex=False``, `pat` is interpreted as the string itself >>> s.str.split(r"\.jpg", regex=False, expand=True) - 0 - 0 fooojpgbar.jpg + 0 + 0 foojpgbar.jpg """ @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) From 2ed798076e173f23b96dc0311b671b95d8a8f084 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Wed, 27 Oct 2021 22:13:14 -0700 Subject: [PATCH 16/23] changed doc to match str_replace, moved tests to a new test func --- pandas/core/strings/accessor.py | 34 ++++++++++++++------ pandas/tests/strings/test_split_partition.py | 3 ++ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index a71989d4dffff..63a5d04a42702 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -653,7 +653,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): Split strings around given separator/delimiter. Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. + at the specified delimiter string. Parameters ---------- @@ -666,24 +666,30 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): expand : bool, default False Expand the split strings into separate columns. - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. + - If ``True``, return DataFrame/MultiIndex expanding dimensionality. + - If ``False``, return Series/Index, containing lists of strings. regex : bool, default None - Determines whether to handle the pattern as a regular expression. - If ``pat`` is a compiled regular expression, it is interpreted as a - regular expression regardless of ``regex`` + Determines if the passed-in pattern is a regular expression: - * If ``True``, assumes the passed-in pattern is a regular expression - * If ``False``, treats the pattern as a literal string. - * If ``None`` and `pat` length is 1, treats `pat` as a literal string. - * If ``None`` and `pat` length is not 1, treats `pat` as a regular expression. + - If ``True``, assumes the passed-in pattern is a regular expression + - If ``False``, treats the pattern as a literal string. + - If ``None`` and `pat` length is 1, treats `pat` as a literal string. + - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression. + - Cannot be set to False if `pat` is a compiled regex + + .. versionadded:: 1.4.0 Returns ------- Series, Index, DataFrame or MultiIndex Type matches caller unless ``expand=True`` (see Notes). + Raises + ------ + ValueError + * if `regex` is False and `pat` is a compiled regex + See Also -------- Series.str.split : Split strings around given separator/delimiter. @@ -706,6 +712,9 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): If using ``expand=True``, Series and Index callers return DataFrame and MultiIndex objects, respectively. + Use of `regex=False` with a `pat` as a compiled regex will raise + an error. + Examples -------- >>> s = pd.Series( @@ -822,6 +831,11 @@ def split( expand=False, regex: bool | None = None, ): + if not regex and is_re(pat): + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) + result = self._data.array._str_split(pat, n, expand, regex) return self._wrap_result(result, returns_string=expand, expand=expand) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 7a4fa85840385..958fc1f54438f 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -34,6 +34,9 @@ def test_split(any_string_dtype): exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) + +def test_split_regex(any_string_dtype): + # GH 43563 # explicit regex = True split values = Series("qweqwejpgqweqwe.jpg", dtype=any_string_dtype) result = values.str.split(r"\.jpg", regex=True) From 5f0d8dfc5d95a05b26aba56a6b643cfdf72d06b5 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Thu, 28 Oct 2021 14:45:55 -0700 Subject: [PATCH 17/23] changed test string to be readable --- pandas/tests/strings/test_split_partition.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 958fc1f54438f..c39b812ff4bdc 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -38,24 +38,24 @@ def test_split(any_string_dtype): def test_split_regex(any_string_dtype): # GH 43563 # explicit regex = True split - values = Series("qweqwejpgqweqwe.jpg", dtype=any_string_dtype) + values = Series("foo-jpg-bar.jpg", dtype=any_string_dtype) result = values.str.split(r"\.jpg", regex=True) - exp = Series([["qweqwejpgqweqwe", ""]]) + exp = Series([["foo-jpg-bar", ""]]) tm.assert_series_equal(result, exp) # explicit regex = False split result = values.str.split(r"\.jpg", regex=False) - exp = Series([["qweqwejpgqweqwe.jpg"]]) + exp = Series([["foo-jpg-bar.jpg"]]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length == 1 result = values.str.split(r".") - exp = Series([["qweqwejpgqweqwe", "jpg"]]) + exp = Series([["foo-jpg-bar", "jpg"]]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length != 1 result = values.str.split(r".jpg") - exp = Series([["qweqw", "qweqwe", ""]]) + exp = Series([["foo", "-bar", ""]]) tm.assert_series_equal(result, exp) From ba812a12fbac6f85948711b5f61833b78eb1b6f8 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Thu, 28 Oct 2021 14:48:27 -0700 Subject: [PATCH 18/23] changed test string to be readable --- pandas/tests/strings/test_split_partition.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index c39b812ff4bdc..4cc759383c1a3 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -38,24 +38,24 @@ def test_split(any_string_dtype): def test_split_regex(any_string_dtype): # GH 43563 # explicit regex = True split - values = Series("foo-jpg-bar.jpg", dtype=any_string_dtype) + values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) result = values.str.split(r"\.jpg", regex=True) - exp = Series([["foo-jpg-bar", ""]]) + exp = Series([["xxxjpgzzz", ""]]) tm.assert_series_equal(result, exp) # explicit regex = False split result = values.str.split(r"\.jpg", regex=False) - exp = Series([["foo-jpg-bar.jpg"]]) + exp = Series([["xxxjpgzzz.jpg"]]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length == 1 result = values.str.split(r".") - exp = Series([["foo-jpg-bar", "jpg"]]) + exp = Series([["xxxjpgzzz", "jpg"]]) tm.assert_series_equal(result, exp) # non explicit regex split, pattern length != 1 result = values.str.split(r".jpg") - exp = Series([["foo", "-bar", ""]]) + exp = Series([["xx", "zzz", ""]]) tm.assert_series_equal(result, exp) From e2da86122f37e3a66b33ee435429763119459df9 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Sat, 30 Oct 2021 19:51:41 -0700 Subject: [PATCH 19/23] added test for raises error when regex=False and pat is regex --- pandas/tests/strings/test_split_partition.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 4cc759383c1a3..23c187699e7d4 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -1,4 +1,5 @@ from datetime import datetime +import re import numpy as np import pytest @@ -58,6 +59,14 @@ def test_split_regex(any_string_dtype): exp = Series([["xx", "zzz", ""]]) tm.assert_series_equal(result, exp) + # regex=False with pattern compiled regex raises error + with pytest.raises( + ValueError, + match="Cannot use a compiled regex as replacement pattern with regex=False", + ): + pat = re.compile("xxx") + values.str.split(pat, regex=False) + def test_split_object_mixed(): mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) From 057dcfbb104382dc3f70c3848e2ed2a8662c47b8 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 1 Nov 2021 21:57:45 -0700 Subject: [PATCH 20/23] added test for explicit regex=True with compiled regex --- pandas/tests/strings/test_split_partition.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 23c187699e7d4..7af5c01367e01 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -44,6 +44,13 @@ def test_split_regex(any_string_dtype): exp = Series([["xxxjpgzzz", ""]]) tm.assert_series_equal(result, exp) + # explicit regex = True split with compiled regex + regex_pat = re.compile(r".jpg") + values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) + result = values.str.split(regex_pat, regex=True) + exp = Series([["xx", "zzz", ""]]) + tm.assert_series_equal(result, exp) + # explicit regex = False split result = values.str.split(r"\.jpg", regex=False) exp = Series([["xxxjpgzzz.jpg"]]) @@ -64,8 +71,7 @@ def test_split_regex(any_string_dtype): ValueError, match="Cannot use a compiled regex as replacement pattern with regex=False", ): - pat = re.compile("xxx") - values.str.split(pat, regex=False) + values.str.split(regex_pat, regex=False) def test_split_object_mixed(): From ece00f1b7d7afe6e2395b5992b5f2bed283f1ab1 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Mon, 1 Nov 2021 22:01:15 -0700 Subject: [PATCH 21/23] got rid of unnecessary comma in doc string --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d7f22baf3fc67..d477be9d50496 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -663,7 +663,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): Parameters ---------- - pat : str, or compiled regex, optional + pat : str or compiled regex, optional String or regular expression to split on. If not specified, split on whitespace. n : int, default -1 (all) From b6bbf3e3921101541807f678a24162c592b389c8 Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Tue, 2 Nov 2021 10:39:00 -0700 Subject: [PATCH 22/23] added compiled regex example, changed logic so that becomes true when passed in compiled regex --- pandas/core/strings/accessor.py | 12 ++++++++++-- pandas/tests/strings/test_split_partition.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d477be9d50496..6bdd799c5b0f5 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -821,6 +821,12 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): 0 1 0 foojpgbar + A compiled regex can be passed as `pat` + + >>> s.str.split(re.compile(r"\.jpg")) + 0 1 + 0 foojpgbar + When ``regex=False``, `pat` is interpreted as the string itself >>> s.str.split(r"\.jpg", regex=False, expand=True) @@ -835,13 +841,15 @@ def split( pat: str | re.Pattern | None = None, n=-1, expand=False, + *, regex: bool | None = None, ): - if not regex and is_re(pat): + if regex is False and is_re(pat): raise ValueError( "Cannot use a compiled regex as replacement pattern with regex=False" ) - + if is_re(pat): + regex = True result = self._data.array._str_split(pat, n, expand, regex) return self._wrap_result(result, returns_string=expand, expand=expand) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 7af5c01367e01..01a397938db52 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -47,7 +47,7 @@ def test_split_regex(any_string_dtype): # explicit regex = True split with compiled regex regex_pat = re.compile(r".jpg") values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) - result = values.str.split(regex_pat, regex=True) + result = values.str.split(regex_pat) exp = Series([["xx", "zzz", ""]]) tm.assert_series_equal(result, exp) From 27ffee767b58b5b73ded417a5cfb9e1eba3a8eef Mon Sep 17 00:00:00 2001 From: Saehui Hwang Date: Tue, 2 Nov 2021 10:41:36 -0700 Subject: [PATCH 23/23] corrected docs --- pandas/core/strings/accessor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 6bdd799c5b0f5..9f163f77a2ae8 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -823,7 +823,8 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): A compiled regex can be passed as `pat` - >>> s.str.split(re.compile(r"\.jpg")) + >>> import re + >>> s.str.split(re.compile(r"\.jpg"), expand=True) 0 1 0 foojpgbar