From aaf9be311552412f14b233295442b02bf28bff10 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Wed, 12 Jun 2019 21:31:52 +0200 Subject: [PATCH] merge master --- doc/source/whatsnew/v0.25.0.rst | 3 +- pandas/core/reshape/melt.py | 3 +- pandas/core/strings.py | 16 ++++- pandas/tests/test_strings.py | 104 +++++++++++++++++++++++++------- 4 files changed, 99 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 76ee21b4c9a50..bac5b95741dd0 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -495,7 +495,7 @@ Other Deprecations Use the public attributes :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop` and :attr:`~RangeIndex.step` instead (:issue:`26581`). - The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version. Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`). - +- :func:`Series.str.replace`, when ``pat`` is a single special regex character (such as ``.|\`` etc.) and ``regex`` is not specified, ``regex`` defaults to ``False`` for now, but this behavior might be deprecated in the future. (:issue:`24804`) ..
_whatsnew_0250.prior_deprecations: @@ -605,6 +605,7 @@ Conversion Strings ^^^^^^^ +- Bug in :func:`Series.str.replace` not applying regex in patterns of length 1 (:issue:`24804`) - Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`) - diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index d655a8be13de7..332ad04ff674e 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -413,7 +413,8 @@ def melt_stub(df, stub, i, j, value_vars, sep): newdf = melt(df, id_vars=i, value_vars=value_vars, value_name=stub.rstrip(sep), var_name=j) newdf[j] = Categorical(newdf[j]) - newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", + regex=True) # GH17627 Cast numerics suffixes to int/float newdf[j] = to_numeric(newdf[j], errors='ignore') diff --git a/pandas/core/strings.py b/pandas/core/strings.py index bd756491abd2f..812e8c70580fa 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -421,7 +421,7 @@ def str_endswith(arr, pat, na=np.nan): return _na_map(f, arr, na, dtype=bool) -def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): +def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=None): r""" Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to :meth:`str.replace` or @@ -452,9 +452,13 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE - Cannot be set if `pat` is a compiled regex - regex : bool, default True + regex : boolean, default None - If True, assumes the passed-in pattern is a regular expression. - If False, treats the pattern as a literal string + - If `pat` is a single character and `regex` is not specified, `pat` + is interpreted as a string literal. 
If `pat` is also a regular + expression symbol, a warning is issued that in the future `pat` + will be interpreted as a regex, rather than a literal. - Cannot be set to False if `pat` is a compiled regex or `repl` is a callable. @@ -561,7 +565,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): # add case flag, if provided if case is False: flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): + if is_compiled_re or pat or flags or callable(repl): n = n if n >= 0 else 0 compiled = re.compile(pat, flags=flags) f = lambda x: compiled.sub(repl=repl, string=x, count=n) @@ -574,6 +578,12 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): if callable(repl): raise ValueError("Cannot use a callable replacement when " "regex=False") + # if regex is default None, and a single special character is given + # in pat, still take it as a literal, and raise the Future warning + if regex is None and len(pat) == 1 and pat in list(r"[\^$.|?*+()]"): + warnings.warn("'{}' is interpreted as a literal in ".format(pat) + + "default, not regex. 
It will change in the future.", + FutureWarning) f = lambda x: x.replace(pat, repl, n) return _na_map(f, arr) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index a1d522930e9aa..983e064e514d2 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -6,6 +6,7 @@ from numpy.random import randint import pytest + from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna import pandas.core.strings as strings import pandas.util.testing as tm @@ -892,11 +893,11 @@ def test_casemethods(self): def test_replace(self): values = Series(['fooBAD__barBAD', NA]) - result = values.str.replace('BAD[_]*', '') + result = values.str.replace('BAD[_]*', '', regex=True) exp = Series(['foobar', NA]) tm.assert_series_equal(result, exp) - result = values.str.replace('BAD[_]*', '', n=1) + result = values.str.replace('BAD[_]*', '', regex=True, n=1) exp = Series(['foobarBAD', NA]) tm.assert_series_equal(result, exp) @@ -904,15 +905,27 @@ def test_replace(self): mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', None, 1, 2.]) - rs = Series(mixed).str.replace('BAD[_]*', '') + rs = Series(mixed).str.replace('BAD[_]*', '', regex=True) xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) + # unicode + values = Series([u'fooBAD__barBAD', NA]) + + result = values.str.replace('BAD[_]*', '', regex=True) + exp = Series([u'foobar', NA]) + tm.assert_series_equal(result, exp) + + result = values.str.replace('BAD[_]*', '', n=1, regex=True) + exp = Series([u'foobarBAD', NA]) + tm.assert_series_equal(result, exp) + # flags + unicode values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) - result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE) + result = values.str.replace(r"(?<=\w),(?=\w)", ", ", regex=True, + flags=re.UNICODE) tm.assert_series_equal(result, exp) # GH 13438 @@ -930,7 +943,7 @@ def 
test_replace_callable(self): # test with callable repl = lambda m: m.group(0).swapcase() - result = values.str.replace('[a-z][A-Z]{2}', repl, n=2) + result = values.str.replace('[a-z][A-Z]{2}', repl, n=2, regex=True) exp = Series(['foObaD__baRbaD', NA]) tm.assert_series_equal(result, exp) @@ -940,21 +953,21 @@ def test_replace_callable(self): repl = lambda: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace('a', repl, regex=True) repl = lambda m, x: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace('a', repl, regex=True) repl = lambda m, x, y=None: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace('a', repl, regex=True) # test regex named groups values = Series(['Foo Bar Baz', NA]) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" repl = lambda m: m.group('middle').swapcase() - result = values.str.replace(pat, repl) + result = values.str.replace(pat, repl, regex=True) exp = Series(['bAR', NA]) tm.assert_series_equal(result, exp) @@ -964,11 +977,11 @@ def test_replace_compiled_regex(self): # test with compiled regex pat = re.compile(r'BAD[_]*') - result = values.str.replace(pat, '') + result = values.str.replace(pat, '', regex=True) exp = Series(['foobar', NA]) tm.assert_series_equal(result, exp) - result = values.str.replace(pat, '', n=1) + result = values.str.replace(pat, '', n=1, regex=True) exp = Series(['foobarBAD', NA]) tm.assert_series_equal(result, exp) @@ -976,16 +989,27 @@ def test_replace_compiled_regex(self): mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', None, 1, 2.]) - rs = Series(mixed).str.replace(pat, '') + rs = Series(mixed).str.replace(pat, '', regex=True) xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) + # unicode + values = Series([u'fooBAD__barBAD', NA]) + + result = values.str.replace(pat, '', regex=True) + exp = 
Series([u'foobar', NA]) + tm.assert_series_equal(result, exp) + + result = values.str.replace(pat, '', n=1, regex=True) + exp = Series([u'foobarBAD', NA]) + tm.assert_series_equal(result, exp) + # flags + unicode values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - result = values.str.replace(pat, ", ") + result = values.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, exp) # case and flags provided to str.replace will have no effect @@ -995,21 +1019,22 @@ def test_replace_compiled_regex(self): with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, '', flags=re.IGNORECASE) + result = values.str.replace(pat, '', flags=re.IGNORECASE, + regex=True) with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, '', case=False) + result = values.str.replace(pat, '', case=False, regex=True) with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, '', case=True) + result = values.str.replace(pat, '', case=True, regex=True) # test with callable values = Series(['fooBAD__barBAD', NA]) repl = lambda m: m.group(0).swapcase() pat = re.compile('[a-z][A-Z]{2}') - result = values.str.replace(pat, repl, n=2) + result = values.str.replace(pat, repl, n=2, regex=True) exp = Series(['foObaD__baRbaD', NA]) tm.assert_series_equal(result, exp) @@ -1017,7 +1042,7 @@ def test_replace_literal(self): # GH16808 literal replace (regex=False vs regex=True) values = Series(['f.o', 'foo', NA]) exp = Series(['bao', 'bao', NA]) - result = values.str.replace('f.', 'ba') + result = values.str.replace('f.', 'ba', regex=True) tm.assert_series_equal(result, exp) exp = Series(['bao', 'foo', NA]) @@ -2710,6 +2735,7 @@ def test_partition_deprecation(self): result = values.str.rpartition(pat='_') tm.assert_frame_equal(result, expected) + 
@pytest.mark.filterwarnings("ignore: '|' is interpreted as a literal") def test_pipe_failures(self): # #2119 s = Series(['A|B|C']) @@ -2719,7 +2745,7 @@ def test_pipe_failures(self): tm.assert_series_equal(result, exp) - result = s.str.replace('|', ' ') + result = s.str.replace('|', ' ', regex=None) exp = Series(['A B C']) tm.assert_series_equal(result, exp) @@ -2980,17 +3006,17 @@ def test_replace_moar(self): s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', 'dog', 'cat']) - result = s.str.replace('A', 'YYY') + result = s.str.replace('A', 'YYY', regex=True) expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA, 'CYYYBYYY', 'dog', 'cat']) assert_series_equal(result, expected) - result = s.str.replace('A', 'YYY', case=False) + result = s.str.replace('A', 'YYY', case=False, regex=True) expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA, 'CYYYBYYY', 'dog', 'cYYYt']) assert_series_equal(result, expected) - result = s.str.replace('^.a|dog', 'XX-XX ', case=False) + result = s.str.replace('^.a|dog', 'XX-XX ', case=False, regex=True) expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA, 'XX-XX BA', 'XX-XX ', 'XX-XX t']) assert_series_equal(result, expected) @@ -3162,6 +3188,40 @@ def test_method_on_bytes(self): match="Cannot use .str.cat with values of.*"): lhs.str.cat(rhs) + @pytest.mark.filterwarnings("ignore: '.' 
is interpreted as a literal") + @pytest.mark.parametrize("regex, expected_array", [ + (True, ['foofoofoo', 'foofoofoo']), + (False, ['abc', '123']), + (None, ['abc', '123']) + ]) + def test_replace_single_pattern(self, regex, expected_array): + values = Series(['abc', '123']) + # GH: 24804 + result = values.str.replace('.', 'foo', regex=regex) + expected = Series(expected_array) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("input_array, single_char, replace_char, " + "expect_array, warn", + [("a.c", ".", "b", "abc", True), + ("a@c", "@", "at", "aatc", False)] + ) + def test_replace_warning_single_character(self, input_array, + single_char, replace_char, + expect_array, warn): + # GH: 24804 + values = Series([input_array]) + if warn: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = values.str.replace(single_char, replace_char, + regex=None) + else: + result = values.str.replace(single_char, replace_char) + + expected = Series([expect_array]) + tm.assert_series_equal(result, expected) + def test_casefold(self): # GH25405 expected = Series(['ss', NA, 'case', 'ssd'])