From 12a1cc779a05aa980de164189409830ec2b1db0d Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Tue, 18 Feb 2020 15:46:42 +0300 Subject: [PATCH 1/5] wip --- sdc/datatypes/hpat_pandas_series_functions.py | 28 ++++++++++++------- sdc/functions/numpy_like.py | 13 +++++++++ sdc/tests/test_series.py | 26 +++++++++++++++++ 3 files changed, 57 insertions(+), 10 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 658d6460c..564b1f355 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -2194,24 +2194,32 @@ def hpat_pandas_series_corr(self, other, method='pearson', min_periods=None): if not isinstance(min_periods, (int, types.Integer, types.Omitted, types.NoneType)) and min_periods is not None: ty_checker.raise_exc(min_periods, 'int64', 'min_periods') - + dtype = self.data.dtype def hpat_pandas_series_corr_impl(self, other, method='pearson', min_periods=None): + len_self = len(self._data) + len_other = len(other._data) if method not in ('pearson', ''): raise ValueError("Method corr(). Unsupported parameter. Given method != 'pearson'") if min_periods is None: min_periods = 1 - if len(self._data) == 0 or len(other._data) == 0: + if len_self == 0 or len_other == 0: return numpy.nan - - self_arr = self._data[:min(len(self._data), len(other._data))] - other_arr = other._data[:min(len(self._data), len(other._data))] - - invalid = numpy.isnan(self_arr) | numpy.isnan(other_arr) - if invalid.any(): - self_arr = self_arr[~invalid] - other_arr = other_arr[~invalid] + # print(numpy_like.corr(0)) + # if len_self != len_other: + min_len = min(len_self, len_other) + # self_arr = numpy.empty(min_len, dtype=dtype) + # other_arr = numpy.empty(min_len, dtype=dtype) + for i in prange(min_len): + # self_arr[i] = self._data[i] + # other_arr[i] = other._data[i] + # Как в dropna + # И потом переписать в новый arr + # invalid = numpy_like.isnan(self_arr) | numpy_like.isnan(other_arr) + # if invalid.any(): + # self_arr = self_arr[~invalid] + # other_arr = other_arr[~invalid] if len(self_arr) < min_periods: return numpy.nan diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index cbdb43904..387c38641 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -73,6 +73,10 @@ def nansum(self): pass +def corr(self): + pass + + @sdc_overload(astype) def sdc_astype_overload(self, dtype): """ @@ -134,6 +138,15 @@ def sdc_astype_number_impl(self, dtype): ty_checker.raise_exc(self.dtype, 'str or type', 'self.dtype') +@sdc_overload(corr) +def sdc_corr_overload(self): + def sdc_corr_impl(self): + print('QQQQ') + return 0 + + return sdc_corr_impl + + @sdc_overload(copy) def sdc_copy_overload(self): """ diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 1b35cbd48..40f3591b1 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -585,6 +585,32 @@ def test_series_corr_impl(S1, S2, min_periods=None): result = hpat_func(S1, S2, min_periods=period) np.testing.assert_allclose(result, result_ref) + def test_series_corr_1(self): + def test_series_corr_impl(S1, S2, min_periods=None): + return S1.corr(S2, min_periods=min_periods) + + hpat_func = self.jit(test_series_corr_impl) + test_input_data1 = [[.2, .0, .6, .2]]#, + # [.2, .0, .6, .2, .5, .6, .7, .8], + # [], + # [2, 0, 6, 2], + # [.2, .1, np.nan, .5, .3], + # [-1, np.nan, 1, np.inf]] + test_input_data2 = [[.3, .6, .0, .1]]#, + # [.3, .6, .0, .1, .8], + # [], + # [3, 6, 0, 1], + # [.3, .2, .9, .6, np.nan], + # [np.nan, np.nan, np.inf, np.nan]] + for input_data1 in test_input_data1: + for input_data2 in test_input_data2: + S1 = pd.Series(input_data1) + S2 = pd.Series(input_data2) + # for period in [None, 2, 1, 8, -4]: + result_ref = test_series_corr_impl(S1, S2) + result = hpat_func(S1, S2) + np.testing.assert_allclose(result, result_ref) + @skip_sdc_jit('Series.corr() parameter "min_periods" unsupported') def test_series_corr_unsupported_dtype(self): def test_series_corr_impl(S1, S2, min_periods=None): From 2ad1e5acd1e5f738091bd5d8f3616750172d807b Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Wed, 19 Feb 2020 14:59:11 +0300 Subject: [PATCH 2/5] Scale corr --- sdc/datatypes/hpat_pandas_series_functions.py | 45 +---------- sdc/functions/numpy_like.py | 79 ++++++++++++++++--- sdc/utilities/prange_utils.py | 48 +++-------- 3 files changed, 84 insertions(+), 88 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 564b1f355..f7c86e299 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -2194,50 +2194,9 @@ def hpat_pandas_series_corr(self, other, method='pearson', min_periods=None): if not isinstance(min_periods, (int, types.Integer, types.Omitted, types.NoneType)) and min_periods is not None: ty_checker.raise_exc(min_periods, 'int64', 'min_periods') - dtype = self.data.dtype - def hpat_pandas_series_corr_impl(self, other, method='pearson', min_periods=None): - len_self = len(self._data) - len_other = len(other._data) - if method not in ('pearson', ''): - raise ValueError("Method corr(). Unsupported parameter. Given method != 'pearson'") - - if min_periods is None: - min_periods = 1 - - if len_self == 0 or len_other == 0: - return numpy.nan - # print(numpy_like.corr(0)) - # if len_self != len_other: - min_len = min(len_self, len_other) - # self_arr = numpy.empty(min_len, dtype=dtype) - # other_arr = numpy.empty(min_len, dtype=dtype) - for i in prange(min_len): - # self_arr[i] = self._data[i] - # other_arr[i] = other._data[i] - # Как в dropna - # И потом переписать в новый arr - # invalid = numpy_like.isnan(self_arr) | numpy_like.isnan(other_arr) - # if invalid.any(): - # self_arr = self_arr[~invalid] - # other_arr = other_arr[~invalid] - - if len(self_arr) < min_periods: - return numpy.nan - new_self = pandas.Series(self_arr) - new_other = pandas.Series(other_arr) - - n = new_self.count() - ma = new_self.sum() - mb = new_other.sum() - a = n * (self_arr * other_arr).sum() - ma * mb - b1 = n * (self_arr * self_arr).sum() - ma * ma - b2 = n * (other_arr * other_arr).sum() - mb * mb - - if b1 == 0 or b2 == 0: - return numpy.nan - - return a / numpy.sqrt(b1 * b2) + def hpat_pandas_series_corr_impl(self, other, method='pearson', min_periods=None): + return numpy_like.corr(self._data, other._data, method, min_periods) return hpat_pandas_series_corr_impl diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index 387c38641..856e581e3 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -43,6 +43,7 @@ from sdc.utilities.sdc_typing_utils import TypeChecker from sdc.str_arr_ext import (StringArrayType, pre_alloc_string_array, get_utf8_size, str_arr_is_na) from sdc.utilities.utils import sdc_overload, sdc_register_jitable +from sdc.utilities.prange_utils import parallel_chunks def astype(self, dtype): @@ -138,15 +139,6 @@ def sdc_astype_number_impl(self, dtype): ty_checker.raise_exc(self.dtype, 'str or type', 'self.dtype') -@sdc_overload(corr) -def sdc_corr_overload(self): - def sdc_corr_impl(self): - print('QQQQ') - return 0 - - return sdc_corr_impl - - @sdc_overload(copy) def sdc_copy_overload(self): """ @@ -510,3 +502,72 @@ def nanmean_impl(a): return np.divide(c, count) return nanmean_impl + + +def corr(self, other, method='pearson', min_periods=None): + pass + + +@sdc_overload(corr) +def corr_overload(self, other, method='pearson', min_periods=None): + dtype_self = self.dtype + dtype_other = other.dtype + isnan_self = get_isnan(dtype_self) + isnan_other = get_isnan(dtype_other) + + def corr_impl(self, other, method='pearson', min_periods=None): + len_self = len(self) + len_other = len(other) + if method not in ('pearson', ''): + raise ValueError("Method corr(). Unsupported parameter. Given method != 'pearson'") + + if min_periods is None: + min_periods = 1 + + if len_self == 0 or len_other == 0: + return numpy.nan + + min_len = min(len_self, len_other) + chunks = parallel_chunks(min_len) + arr_len = numpy.empty(len(chunks), dtype=numpy.int64) + length = 0 + + for i in prange(len(chunks)): + chunk = chunks[i] + res = 0 + for j in range(chunk.start, chunk.stop): + if not isnan_self(self[j]) or not isnan_other(other[j]): + res += 1 + length += res + arr_len[i] = res + + result_self = numpy.empty(shape=length, dtype=dtype_self) + result_other = numpy.empty(shape=length, dtype=dtype_other) + for i in prange(len(chunks)): + chunk = chunks[i] + new_start = int(sum(arr_len[0:i])) + new_stop = new_start + arr_len[i] + current_pos = new_start + + for j in range(chunk.start, chunk.stop): + if not isnan_self(self[j]) or not isnan_other(other[j]): + result_self[current_pos] = self[j] + result_other[current_pos] = other[j] + current_pos += 1 + + if len(result_self) < min_periods: + return numpy.nan + + n = length + ma = sum(result_self) + mb = sum(result_other) + a = n * (result_self * result_other).sum() - ma * mb + b1 = n * (result_self * result_self).sum() - ma * ma + b2 = n * (result_other * result_other).sum() - mb * mb + + if b1 == 0 or b2 == 0: + return numpy.nan + + return a / numpy.sqrt(b1 * b2) + + return corr_impl diff --git a/sdc/utilities/prange_utils.py b/sdc/utilities/prange_utils.py index 2380bb513..0f24d34ea 100644 --- a/sdc/utilities/prange_utils.py +++ b/sdc/utilities/prange_utils.py @@ -29,7 +29,7 @@ import sdc from typing import NamedTuple -from sdc.utilities.utils import sdc_overload +from sdc.utilities.utils import sdc_overload, sdc_register_jitable class Chunk(NamedTuple): @@ -37,6 +37,7 @@ class Chunk(NamedTuple): stop: int +@sdc_register_jitable def get_pool_size(): if sdc.config.config_use_parallel_overloads: return numba.config.NUMBA_NUM_THREADS @@ -44,46 +45,21 @@ def get_pool_size(): return 1 -@sdc_overload(get_pool_size) -def get_pool_size_overload(): - pool_size = get_pool_size() - - def get_pool_size_impl(): - return pool_size - - return get_pool_size_impl - - -def get_chunks(size, pool_size=0): - if pool_size == 0: - pool_size = get_pool_size() - - chunk_size = (size - 1) // pool_size + 1 +@sdc_register_jitable +def get_chunks(size, pool_size): + pool_size = min(pool_size, size) + chunk_size = size // pool_size + overload_size = size % pool_size chunks = [] for i in range(pool_size): - start = min(i * chunk_size, size) - stop = min((i + 1) * chunk_size, size) + start = i * chunk_size + min(i, overload_size) + stop = (i + 1) * chunk_size + min(i + 1, overload_size) chunks.append(Chunk(start, stop)) return chunks -@sdc_overload(get_chunks) -def get_chunks_overload(size, pool_size=0): - def get_chunks_impl(size, pool_size=0): - if pool_size == 0: - pool_size = get_pool_size() - - chunk_size = (size - 1) // pool_size + 1 - - chunks = [] - for i in range(pool_size): - start = min(i * chunk_size, size) - stop = min((i + 1) * chunk_size, size) - chunk = Chunk(start, stop) - chunks.append(chunk) - - return chunks - - return get_chunks_impl +@sdc_register_jitable +def parallel_chunks(size): + return get_chunks(size, get_pool_size()) From 3cda0fcb64b3c89138fad722de08a5166b4d3680 Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Wed, 19 Feb 2020 15:02:23 +0300 Subject: [PATCH 3/5] Delete debug test --- sdc/tests/test_series.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 40f3591b1..1b35cbd48 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -585,32 +585,6 @@ def test_series_corr_impl(S1, S2, min_periods=None): result = hpat_func(S1, S2, min_periods=period) np.testing.assert_allclose(result, result_ref) - def test_series_corr_1(self): - def test_series_corr_impl(S1, S2, min_periods=None): - return S1.corr(S2, min_periods=min_periods) - - hpat_func = self.jit(test_series_corr_impl) - test_input_data1 = [[.2, .0, .6, .2]]#, - # [.2, .0, .6, .2, .5, .6, .7, .8], - # [], - # [2, 0, 6, 2], - # [.2, .1, np.nan, .5, .3], - # [-1, np.nan, 1, np.inf]] - test_input_data2 = [[.3, .6, .0, .1]]#, - # [.3, .6, .0, .1, .8], - # [], - # [3, 6, 0, 1], - # [.3, .2, .9, .6, np.nan], - # [np.nan, np.nan, np.inf, np.nan]] - for input_data1 in test_input_data1: - for input_data2 in test_input_data2: - S1 = pd.Series(input_data1) - S2 = pd.Series(input_data2) - # for period in [None, 2, 1, 8, -4]: - result_ref = test_series_corr_impl(S1, S2) - result = hpat_func(S1, S2) - np.testing.assert_allclose(result, result_ref) - @skip_sdc_jit('Series.corr() parameter "min_periods" unsupported') def test_series_corr_unsupported_dtype(self): def test_series_corr_impl(S1, S2, min_periods=None): From 5fbc1c1deed5782828406fdf79c7f6c2fd7b688d Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Thu, 20 Feb 2020 14:12:11 +0300 Subject: [PATCH 4/5] new_algo --- sdc/datatypes/hpat_pandas_series_functions.py | 2 +- sdc/functions/numpy_like.py | 79 +++++++------------ 2 files changed, 28 insertions(+), 53 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 9c00ad75e..1fe4d0133 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -2196,7 +2196,7 @@ def hpat_pandas_series_corr(self, other, method='pearson', min_periods=None): ty_checker.raise_exc(min_periods, 'int64', 'min_periods') def hpat_pandas_series_corr_impl(self, other, method='pearson', min_periods=None): - return numpy_like.corr(self._data, other._data, method, min_periods) + return numpy_like.corr(self, other, method, min_periods) return hpat_pandas_series_corr_impl diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index d76b26a80..04a9cc4e0 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -74,10 +74,6 @@ def nansum(self): pass -def corr(self): - pass - - @sdc_overload(astype) def sdc_astype_overload(self, dtype): """ @@ -510,65 +506,44 @@ def corr(self, other, method='pearson', min_periods=None): @sdc_overload(corr) def corr_overload(self, other, method='pearson', min_periods=None): - dtype_self = self.dtype - dtype_other = other.dtype - isnan_self = get_isnan(dtype_self) - isnan_other = get_isnan(dtype_other) - def corr_impl(self, other, method='pearson', min_periods=None): - len_self = len(self) - len_other = len(other) if method not in ('pearson', ''): raise ValueError("Method corr(). Unsupported parameter. Given method != 'pearson'") - if min_periods is None: + if min_periods is None or min_periods < 1: min_periods = 1 - if len_self == 0 or len_other == 0: - return numpy.nan + min_len = min(len(self._data), len(other._data)) - min_len = min(len_self, len_other) - chunks = parallel_chunks(min_len) - arr_len = numpy.empty(len(chunks), dtype=numpy.int64) - length = 0 - - for i in prange(len(chunks)): - chunk = chunks[i] - res = 0 - for j in range(chunk.start, chunk.stop): - if not isnan_self(self[j]) or not isnan_other(other[j]): - res += 1 - length += res - arr_len[i] = res - - result_self = numpy.empty(shape=length, dtype=dtype_self) - result_other = numpy.empty(shape=length, dtype=dtype_other) - for i in prange(len(chunks)): - chunk = chunks[i] - new_start = int(sum(arr_len[0:i])) - new_stop = new_start + arr_len[i] - current_pos = new_start - - for j in range(chunk.start, chunk.stop): - if not isnan_self(self[j]) or not isnan_other(other[j]): - result_self[current_pos] = self[j] - result_other[current_pos] = other[j] - current_pos += 1 - - if len(result_self) < min_periods: + if min_len == 0: return numpy.nan - n = length - ma = sum(result_self) - mb = sum(result_other) - a = n * (result_self * result_other).sum() - ma * mb - b1 = n * (result_self * result_self).sum() - ma * ma - b2 = n * (result_other * result_other).sum() - mb * mb - - if b1 == 0 or b2 == 0: + sum_y = 0. + sum_x = 0. + sum_xy = 0. + sum_xx = 0. + sum_yy = 0. + total_count = 0 + for i in prange(min_len): + x = self._data[i] + y = other._data[i] + if not (numpy.isnan(x) or numpy.isnan(y)): + sum_x += x + sum_y += y + sum_xy += x*y + sum_xx += x*x + sum_yy += y*y + total_count += 1 + + if total_count < min_periods: return numpy.nan - return a / numpy.sqrt(b1 * b2) + cov_xy = (sum_xy - sum_x*sum_y/total_count) + var_x = (sum_xx - sum_x*sum_x/total_count) + var_y = (sum_yy - sum_y*sum_y/total_count) + corr_xy = cov_xy/numpy.sqrt(var_x*var_y) + + return corr_xy return corr_impl From 9ea44a51201c48c09a2fd0237319222005276283 Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Thu, 20 Feb 2020 14:13:50 +0300 Subject: [PATCH 5/5] codestyle --- sdc/functions/numpy_like.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index 04a9cc4e0..006b816e1 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -530,18 +530,18 @@ def corr_impl(self, other, method='pearson', min_periods=None): if not (numpy.isnan(x) or numpy.isnan(y)): sum_x += x sum_y += y - sum_xy += x*y - sum_xx += x*x - sum_yy += y*y + sum_xy += x * y + sum_xx += x * x + sum_yy += y * y total_count += 1 if total_count < min_periods: return numpy.nan - cov_xy = (sum_xy - sum_x*sum_y/total_count) - var_x = (sum_xx - sum_x*sum_x/total_count) - var_y = (sum_yy - sum_y*sum_y/total_count) - corr_xy = cov_xy/numpy.sqrt(var_x*var_y) + cov_xy = (sum_xy - sum_x * sum_y / total_count) + var_x = (sum_xx - sum_x * sum_x / total_count) + var_y = (sum_yy - sum_y * sum_y / total_count) + corr_xy = cov_xy / numpy.sqrt(var_x * var_y) return corr_xy