From 9a0effbfb1c1752ef81655a6abda9f564c51d0fe Mon Sep 17 00:00:00 2001 From: FernandoVN98 <38290736+FernandoVN98@users.noreply.github.com> Date: Fri, 10 Nov 2023 12:40:23 +0100 Subject: [PATCH] Random Forest Nested, tests and docs (#449) * Added nested version of Random Forest, tests and corrected documentation --- Dockerfile | 2 +- Jenkinsfile | 2 +- dislib/decomposition/tsqr/base.py | 2 +- dislib/trees/decision_tree.py | 151 +- dislib/trees/distributed/decision_tree.py | 9 + dislib/trees/forest.py | 307 ++-- dislib/trees/nested/__init__.py | 13 + dislib/trees/nested/decision_tree.py | 1505 ++++++++++++++++++++ dislib/trees/nested/forest.py | 755 ++++++++++ dislib/trees/nested/tasks.py | 78 + dislib/trees/nested/terasort.py | 95 ++ docs/source/dislib.trees.distributed.rst | 22 + docs/source/dislib.trees.mmap.rst | 22 + docs/source/dislib.trees.nested.rst | 22 + run_ci_checks.sh | 4 + run_coverage.sh | 3 + run_test_nesting.sh | 146 ++ tests/test_array.py | 3 +- tests/test_tsqr.py | 15 +- tests_nesting/__init__.py | 14 + tests_nesting/__main__.py | 9 + tests_nesting/test_decision_tree_nested.py | 908 ++++++++++++ tests_nesting/test_rf_classifier_nested.py | 757 ++++++++++ tests_nesting/test_rf_regressor_nested.py | 434 ++++++ 24 files changed, 5155 insertions(+), 123 deletions(-) create mode 100644 dislib/trees/nested/__init__.py create mode 100644 dislib/trees/nested/decision_tree.py create mode 100644 dislib/trees/nested/forest.py create mode 100644 dislib/trees/nested/tasks.py create mode 100644 dislib/trees/nested/terasort.py create mode 100644 docs/source/dislib.trees.distributed.rst create mode 100644 docs/source/dislib.trees.mmap.rst create mode 100644 docs/source/dislib.trees.nested.rst create mode 100755 run_test_nesting.sh create mode 100644 tests_nesting/__init__.py create mode 100644 tests_nesting/__main__.py create mode 100644 tests_nesting/test_decision_tree_nested.py create mode 100644 tests_nesting/test_rf_classifier_nested.py create mode 100644 tests_nesting/test_rf_regressor_nested.py diff --git a/Dockerfile b/Dockerfile index 127efbc0..a5200370 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM compss/compss-tutorial:3.1 +FROM compss/compss-tutorial:3.3 MAINTAINER COMPSs Support COPY . 
dislib/
diff --git a/Jenkinsfile b/Jenkinsfile
index f1cd7161..c2592fa0 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -11,7 +11,7 @@ def setGithubCommitStatus(state, description) {
 pipeline {
     options {
-        timeout(time: 5, unit: 'HOURS')
+        timeout(time: 6, unit: 'HOURS')
     }
     agent {
         node {
diff --git a/dislib/decomposition/tsqr/base.py b/dislib/decomposition/tsqr/base.py
index bd7c5e67..ed48f02a 100644
--- a/dislib/decomposition/tsqr/base.py
+++ b/dislib/decomposition/tsqr/base.py
@@ -231,7 +231,7 @@ def tsqr(a: Array, mode="complete", indexes=None):
         shape_to_use = a._reg_shape[0]
     q_blocks = [[object() for _ in range(number_blocks)]
                 for _ in range(auxiliar_rs)]
-    q_blocks_2 = [[]]
+    q_blocks_2 = [[object()]]
     r_blocks = [[object() for _ in range(len(block))]
                 for _ in range(number_blocks)]
     if (irregular_shape_to_use != 0
diff --git a/dislib/trees/decision_tree.py b/dislib/trees/decision_tree.py
index 84a35e0c..dd26ec40 100644
--- a/dislib/trees/decision_tree.py
+++ b/dislib/trees/decision_tree.py
@@ -6,10 +6,15 @@
                                      DecisionTreeClassifierDistributed)
 from dislib.trees.distributed import (DecisionTreeRegressor as
                                       DecisionTreeRegressorDistributed)
+from dislib.trees.nested import (DecisionTreeClassifier as
+                                 DecisionTreeClassifierNested)
+from dislib.trees.nested import (DecisionTreeRegressor as
+                                 DecisionTreeRegressorNested)
 from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
 from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor
 from dislib.trees.distributed.decision_tree import (_RegressionNode,
                                                     _ClassificationNode)
+from pycompss.api.api import compss_wait_on


 class BaseDecisionTree:
@@ -36,6 +41,7 @@ def __init__(
         split_computation="raw",
         sync_after_fit=True,
         mmap=True,
+        nested=False,
     ):
         self.try_features = try_features
         self.max_depth = max_depth
@@ -60,13 +66,17 @@ def __init__(
         self.sync_after_fit = sync_after_fit
         self.mmap = mmap
+        self.nested = nested

     def fit(self, dataset):
         """Fits the DecisionTree.

         Parameters
         ----------
-        dataset : dislib.classification.rf._data.RfDataset
+        dataset : dislib.trees.mmap.RfDataset / ds-array
+            It must be a dislib.trees.mmap.RfDataset when the mmap decision
+            tree is used. When using the distributed or nested decision
+            tree, the input to this function should be a ds-array.
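+
+        Examples
+        --------
+        A minimal sketch of driving the nested backend directly, with
+        synthetic data; the argument values are illustrative only, and
+        the positional arguments follow the nested class constructor
+        introduced in this patch:
+
+        >>> import numpy as np
+        >>> import dislib as ds
+        >>> from dislib.trees.nested import DecisionTreeClassifier
+        >>> x = ds.array(np.random.rand(100, 4), block_size=(50, 4))
+        >>> y = ds.array(np.random.randint(0, 2, (100, 1)),
+        ...              block_size=(50, 1))
+        >>> tree = DecisionTreeClassifier(
+        ...     2, 2, np.inf, 1, 1e8, True, np.random.RandomState(0))
+        >>> tree.fit(x, y)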
""" if self.mmap: if SklearnDTRegressor == self.base_tree: @@ -82,26 +92,46 @@ def fit(self, dataset): self.bootstrap, self.random_state ) else: - if SklearnDTRegressor == self.base_tree: - self.tree = DecisionTreeRegressorDistributed( - self.try_features, self.max_depth, - self.distr_depth, self.sklearn_max, - self.bootstrap, self.random_state, - self.range_max, self.range_min, - self.n_split_points, self.split_computation, - self.sync_after_fit - ) + if self.nested: + if SklearnDTRegressor == self.base_tree: + self.tree = DecisionTreeRegressorNested( + self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.bootstrap, self.random_state, + self.range_max, self.range_min, + self.n_split_points, self.split_computation, + self.sync_after_fit) + else: + self.tree = DecisionTreeClassifierNested( + self.n_classes, + self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.bootstrap, self.random_state, + self.range_max, self.range_min, + self.n_split_points, self.split_computation, + self.sync_after_fit) else: - self.tree = DecisionTreeClassifierDistributed( - self.try_features, self.max_depth, - self.distr_depth, self.sklearn_max, - self.bootstrap, self.random_state, - self.n_classes, self.range_max, self.range_min, - self.n_split_points, self.split_computation, - self.sync_after_fit - ) - - def predict(self, x_row): + if SklearnDTRegressor == self.base_tree: + self.tree = DecisionTreeRegressorDistributed( + self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.bootstrap, self.random_state, + self.range_max, self.range_min, + self.n_split_points, self.split_computation, + self.sync_after_fit + ) + else: + self.tree = DecisionTreeClassifierDistributed( + self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.bootstrap, self.random_state, + self.n_classes, self.range_max, self.range_min, + self.n_split_points, self.split_computation, + self.sync_after_fit + ) + self.tree.fit(dataset) + + def predict(self, x_row, collect=False): """Predicts target values or classes for the given samples using a fitted tree. @@ -110,6 +140,12 @@ def predict(self, x_row): x_row : ds-array A row block of samples. + collect : boolean + Only affects nested and distributed versions of the algorithm. + When True, the results are synchronized before the returning, + when False, no synchronization is done, but the user should do it + manually when he/she wants the results. + Returns ------- predicted : ndarray @@ -123,7 +159,13 @@ def predict(self, x_row): if self.mmap: return self.tree.predict(x_row) else: - return self.tree.predict(x_row, collect=False) + if self.nested: + prediction = self.tree.predict(x_row) + if collect: + prediction = compss_wait_on(prediction) + return prediction + else: + return self.tree.predict(x_row, collect=collect) class DecisionTreeClassifier(BaseDecisionTree): @@ -148,6 +190,30 @@ class DecisionTreeClassifier(BaseDecisionTree): forests). random_state : RandomState instance The random number generator. + n_classes : int + Number of classes that appear on the dataset. Only needed on + distributed random forest. 
+    range_min : ds-array or np.array
+        Contains the minimum values of the different attributes of the dataset.
+        Only used on distributed random forest (it is an optional parameter).
+    range_max : ds-array or np.array
+        Contains the maximum values of the different attributes of the dataset.
+        Only used on distributed random forest (it is an optional parameter).
+    n_split_points : str or int
+        Number of split points to evaluate:
+        "auto", "sqrt" or an integer value.
+        Used on distributed random forest (non memory map version).
+    split_computation : str
+        "raw", "gaussian_approximation" or "uniform_approximation":
+        the distribution assumed for the attribute values when selecting
+        the split points. Used on distributed random forest (non memory map version).
+    sync_after_fit : bool
+        Whether to synchronize after the training.
+        Used on distributed random forest (non memory map version).
+    mmap : bool
+        Whether to use the memory map version.
+    nested : bool
+        Whether to use the nested version.

     Attributes
     ----------
@@ -175,7 +241,7 @@ class DecisionTreeClassifier(BaseDecisionTree):
     predict(x_row)
         Predicts classes for the given samples using a fitted tree.
     predict_proba(x_row)
-        Predicts class probabilities for the given smaples using a fitted tree.
+        Predicts class probabilities for the given samples using a fitted tree.

     """

@@ -194,6 +260,7 @@ def __init__(
         split_computation="raw",
         sync_after_fit=True,
         mmap=True,
+        nested=False,
     ):
         super().__init__(
             try_features,
@@ -211,9 +278,10 @@ def __init__(
             split_computation=split_computation,
             sync_after_fit=sync_after_fit,
             mmap=mmap,
+            nested=nested,
         )

-    def predict_proba(self, x_row):
+    def predict_proba(self, x_row, collect=False):
         """Predicts class probabilities for a row block using a fitted tree.

         Parameters
@@ -221,6 +289,12 @@ def predict_proba(self, x_row):
         x_row : ds-array
             A row block of samples.

+        collect : boolean
+            Only affects the nested and distributed versions of the
+            algorithm. When True, the results are synchronized before
+            returning; when False, no synchronization is done and the
+            user must synchronize manually when the results are needed.
+
         Returns
         -------
         predicted_proba : ndarray
@@ -235,7 +309,13 @@ def predict_proba(self, x_row):
         if self.mmap:
             return self.tree.predict_proba(x_row)
         else:
-            return self.tree.predict_proba(x_row, collect=False)
+            if self.nested:
+                prediction = self.tree.predict_proba(x_row)
+                if collect:
+                    prediction = compss_wait_on(prediction)
+                return prediction
+            else:
+                return self.tree.predict_proba(x_row, collect=collect)


 class DecisionTreeRegressor(BaseDecisionTree):
@@ -260,6 +340,27 @@ class DecisionTreeRegressor(BaseDecisionTree):
         forests).
     random_state : RandomState instance
         The random number generator.
+    range_min : ds-array or np.array
+        Contains the minimum values of the different attributes of the dataset.
+        Only used on distributed random forest (it is an optional parameter).
+    range_max : ds-array or np.array
+        Contains the maximum values of the different attributes of the dataset.
+        Only used on distributed random forest (it is an optional parameter).
+    n_split_points : str or int
+        Number of split points to evaluate:
+        "auto", "sqrt" or an integer value.
+        Used on distributed random forest (non memory map version).
+    split_computation : str
+        "raw", "gaussian_approximation" or "uniform_approximation":
+        the distribution assumed for the attribute values when selecting
+        the split points. Used on distributed random forest (non memory map version).
+    sync_after_fit : bool
+        Whether to synchronize after the training.
+ Used on distributed random forest (non memory map version) + mmap : bool + Use the memory map version or not + nested : bool + Use the nested version or not Attributes ---------- @@ -299,6 +400,7 @@ def __init__( split_computation="raw", sync_after_fit=True, mmap=True, + nested=False, ): super().__init__( try_features, @@ -315,4 +417,5 @@ def __init__( split_computation=split_computation, sync_after_fit=sync_after_fit, mmap=mmap, + nested=nested, ) diff --git a/dislib/trees/distributed/decision_tree.py b/dislib/trees/distributed/decision_tree.py index 28478496..911da813 100644 --- a/dislib/trees/distributed/decision_tree.py +++ b/dislib/trees/distributed/decision_tree.py @@ -380,6 +380,15 @@ def __init__( ) def fit(self, x, y): + """Fits the DecisionTreeRegressor. + + Parameters + ---------- + x : ds-array + Samples of the dataset. + y: ds-array + Labels of the dataset. + """ if self.range_max is None: self.range_max = x.max() if self.range_min is None: diff --git a/dislib/trees/forest.py b/dislib/trees/forest.py index 80fbe50e..00d62981 100644 --- a/dislib/trees/forest.py +++ b/dislib/trees/forest.py @@ -5,19 +5,27 @@ DecisionTreeClassifierMMap, DecisionTreeRegressor as DecisionTreeRegressorMMap) +from dislib.trees.mmap import (RandomForestClassifier as + RandomForestClassifierMMap, + RfClassifierDataset, RfRegressorDataset, + RandomForestRegressor as + RandomForestRegressorMMap) from dislib.trees.distributed import (DecisionTreeClassifier as DecisionTreeClassifierDistributed, DecisionTreeRegressor as DecisionTreeRegressorDistributed) -from dislib.trees.mmap import (RandomForestClassifier as - RandomForestClassifierMMap, - RfClassifierDataset, RfRegressorDataset) -from dislib.trees.mmap import (RandomForestRegressor as - RandomForestRegressorMMap) from dislib.trees.distributed import (RandomForestClassifier as RandomForestClassifierDistributed, RandomForestRegressor as RandomForestRegressorDistributed) +from dislib.trees.nested import (DecisionTreeClassifier as + DecisionTreeClassifierNested, + DecisionTreeRegressor as + DecisionTreeRegressorNested) +from dislib.trees.nested import (RandomForestClassifier as + RandomForestClassifierNested, + RandomForestRegressor as + RandomForestRegressorNested) class BaseRandomForest(BaseEstimator): @@ -46,6 +54,7 @@ def __init__( split_computation="raw", sync_after_fit=True, mmap=True, + nested=False, ): self.n_estimators = n_estimators self.try_features = try_features @@ -64,6 +73,7 @@ def __init__( self.split_computation = split_computation self.sync_after_fit = sync_after_fit self.mmap = mmap + self.nested = nested self.rf = None def fit(self, x, y): @@ -93,27 +103,51 @@ def fit(self, x, y): self.distr_depth, self.sklearn_max, self.hard_vote, self.random_state) else: - if DecisionTreeRegressorDistributed == self.base_tree: - self.rf = RandomForestRegressorDistributed( - self.n_estimators, self.try_features, self.max_depth, - self.distr_depth, self.sklearn_max, - self.random_state, - range_max=self.range_max, range_min=self.range_min, - bootstrap=self.bootstrap, - n_split_points=self.n_split_points, - split_computation=self.split_computation, - sync_after_fit=self.sync_after_fit) + if self.nested: + if DecisionTreeRegressorNested == self.base_tree: + self.rf = RandomForestRegressorNested( + self.n_estimators, self.try_features, + self.max_depth, self.distr_depth, + self.sklearn_max, self.random_state, + range_max=self.range_max, + range_min=self.range_min, + bootstrap=self.bootstrap, + n_split_points=self.n_split_points, + 
split_computation=self.split_computation, + sync_after_fit=self.sync_after_fit) + else: + self.rf = RandomForestClassifierNested( + self.n_classes, self.n_estimators, + self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.hard_vote, self.random_state, + range_max=self.range_max, range_min=self.range_min, + bootstrap=self.bootstrap, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=self.sync_after_fit) else: - self.rf = RandomForestClassifierDistributed( - self.n_classes, self.n_estimators, - self.try_features, self.max_depth, - self.distr_depth, self.sklearn_max, - self.hard_vote, self.random_state, - range_max=self.range_max, range_min=self.range_min, - bootstrap=self.bootstrap, - n_split_points=self.n_split_points, - split_computation=self.split_computation, - sync_after_fit=self.sync_after_fit) + if DecisionTreeRegressorDistributed == self.base_tree: + self.rf = RandomForestRegressorDistributed( + self.n_estimators, self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.random_state, + range_max=self.range_max, range_min=self.range_min, + bootstrap=self.bootstrap, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=self.sync_after_fit) + else: + self.rf = RandomForestClassifierDistributed( + self.n_classes, self.n_estimators, + self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.hard_vote, self.random_state, + range_max=self.range_max, range_min=self.range_min, + bootstrap=self.bootstrap, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=self.sync_after_fit) self.rf.fit(x, y) if self.mmap and DecisionTreeClassifierMMap == self.base_tree: @@ -206,27 +240,51 @@ def load_model(self, filepath, load_format="json"): self.distr_depth, self.sklearn_max, self.hard_vote, self.random_state) else: - if DecisionTreeRegressorDistributed == self.base_tree: - self.rf = RandomForestRegressorDistributed( - self.n_estimators, self.try_features, self.max_depth, - self.distr_depth, self.sklearn_max, - self.random_state, - range_max=self.range_max, range_min=self.range_min, - bootstrap=self.bootstrap, - n_split_points=self.n_split_points, - split_computation=self.split_computation, - sync_after_fit=self.sync_after_fit) + if self.nested: + if DecisionTreeRegressorNested == self.base_tree: + self.rf = RandomForestRegressorNested( + self.n_estimators, self.try_features, + self.max_depth, self.distr_depth, + self.sklearn_max, self.random_state, + range_max=self.range_max, + range_min=self.range_min, + bootstrap=self.bootstrap, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=self.sync_after_fit) + else: + self.rf = RandomForestClassifierNested( + self.n_classes, self.n_estimators, + self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.hard_vote, self.random_state, + range_max=self.range_max, range_min=self.range_min, + bootstrap=self.bootstrap, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=self.sync_after_fit) else: - self.rf = RandomForestClassifierDistributed( - self.n_classes, self.n_estimators, - self.try_features, self.max_depth, - self.distr_depth, self.sklearn_max, - self.hard_vote, self.random_state, - range_max=self.range_max, range_min=self.range_min, - bootstrap=self.bootstrap, - n_split_points=self.n_split_points, - 
split_computation=self.split_computation, - sync_after_fit=self.sync_after_fit) + if DecisionTreeRegressorDistributed == self.base_tree: + self.rf = RandomForestRegressorDistributed( + self.n_estimators, self.try_features, + self.max_depth, self.distr_depth, + self.sklearn_max, self.random_state, + range_max=self.range_max, range_min=self.range_min, + bootstrap=self.bootstrap, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=self.sync_after_fit) + else: + self.rf = RandomForestClassifierDistributed( + self.n_classes, self.n_estimators, + self.try_features, self.max_depth, + self.distr_depth, self.sklearn_max, + self.hard_vote, self.random_state, + range_max=self.range_max, range_min=self.range_min, + bootstrap=self.bootstrap, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=self.sync_after_fit) self.rf.load_model(filepath, load_format=load_format) @@ -291,13 +349,18 @@ class RandomForestClassifier(BaseRandomForest): sync_after_fit : bool Synchronize or not after the training. Used on distributed random forest (non memory map version) + mmap : bool + Use the memory map version or not. + nested : bool + Use the nested version or not. Attributes ---------- classes : None or ndarray Array of distinct classes, set at fit(). - trees : list of DecisionTreeClassifier - List of the tree classifiers of this forest, populated at fit(). + rf : RandomForestClassifier selected + Instance of mmap, distributed or nested + RandomForestClassifier selected. """ def __init__( @@ -317,6 +380,7 @@ def __init__( split_computation="raw", sync_after_fit=True, mmap=True, + nested=False, ): if mmap: super().__init__( @@ -331,25 +395,48 @@ def __init__( base_dataset=RfClassifierDataset, ) else: - super().__init__( - n_estimators, - try_features, - max_depth, - distr_depth, - sklearn_max, - hard_vote, - random_state, - base_tree=DecisionTreeClassifierDistributed, - base_dataset=None, - n_classes=n_classes, - range_max=range_max, - range_min=range_min, - bootstrap=bootstrap, - n_split_points=n_split_points, - split_computation=split_computation, - sync_after_fit=sync_after_fit, - mmap=mmap, - ) + if nested: + super().__init__( + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + base_tree=DecisionTreeClassifierNested, + base_dataset=None, + n_classes=n_classes, + range_max=range_max, + range_min=range_min, + bootstrap=bootstrap, + n_split_points=n_split_points, + split_computation=split_computation, + sync_after_fit=sync_after_fit, + mmap=mmap, + nested=nested, + ) + else: + super().__init__( + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + base_tree=DecisionTreeClassifierDistributed, + base_dataset=None, + n_classes=n_classes, + range_max=range_max, + range_min=range_min, + bootstrap=bootstrap, + n_split_points=n_split_points, + split_computation=split_computation, + sync_after_fit=sync_after_fit, + mmap=mmap, + nested=nested + ) def predict(self, x): """Predicts target classes using a fitted forest. @@ -520,11 +607,36 @@ class RandomForestRegressor(BaseRandomForest): If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. + n_classes : int + Number of classes that appear on the dataset. Only needed on + distributed random forest. 
+ range_min : ds-array or np.array + Contains the minimum values of the different attributes of the dataset + Only used on distributed random forest (it is an optional parameter) + range_max : ds-array or np.array + Contains the maximum values of the different attributes of the dataset + Only used on distributed random forest (it is an optional parameter) + n_split_points : String or int + Number of split points to evaluate. + "auto", "sqrt" or integer value. + Used on distributed random forest (non memory map version) + split_computation : String + "raw", "gaussian_approximation" or "uniform_approximation" + distribution of the values followed by the split points selected. + Used on distributed random forest (non memory map version) + sync_after_fit : bool + Synchronize or not after the training. + Used on distributed random forest (non memory map version) + mmap : bool + Use the memory map version of the algorithm or not + nested : bool + Use the nested version of the algorithm or not Attributes ---------- - trees : list of DecisionTreeRegressor - List of the tree regressors of this forest, populated at fit(). + rf : RandomForestRegressor selected + Instance of mmap, distributed or nested + RandomForestRegressor selected. """ def __init__( @@ -542,6 +654,7 @@ def __init__( split_computation="raw", sync_after_fit=True, mmap=True, + nested=False, ): hard_vote = None if mmap: @@ -557,24 +670,46 @@ def __init__( base_dataset=RfRegressorDataset, ) else: - super().__init__( - n_estimators, - try_features, - max_depth, - distr_depth, - sklearn_max, - hard_vote, - random_state, - base_tree=DecisionTreeRegressorDistributed, - base_dataset=None, - range_max=range_max, - range_min=range_min, - bootstrap=bootstrap, - n_split_points=n_split_points, - split_computation=split_computation, - sync_after_fit=sync_after_fit, - mmap=mmap, - ) + if nested: + super().__init__( + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + base_tree=DecisionTreeRegressorNested, + base_dataset=None, + range_max=range_max, + range_min=range_min, + bootstrap=bootstrap, + n_split_points=n_split_points, + split_computation=split_computation, + sync_after_fit=sync_after_fit, + mmap=mmap, + nested=nested, + ) + else: + super().__init__( + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + base_tree=DecisionTreeRegressorDistributed, + base_dataset=None, + range_max=range_max, + range_min=range_min, + bootstrap=bootstrap, + n_split_points=n_split_points, + split_computation=split_computation, + sync_after_fit=sync_after_fit, + mmap=mmap, + nested=nested, + ) def predict(self, x): """Predicts target values using a fitted forest. 
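Note (reviewer sketch): end-to-end usage of the new nested backend through
the public forest API. A minimal sketch under assumed keyword names taken
from the docstrings above (``n_classes``, ``mmap``, ``nested``); ``x_train``
and ``y_train`` are illustrative synthetic ds-arrays, not part of the patch.

    import numpy as np
    import dislib as ds
    from dislib.trees.forest import RandomForestClassifier

    x_train = ds.array(np.random.rand(1000, 8), block_size=(250, 8))
    y_train = ds.array(np.random.randint(0, 3, (1000, 1)),
                       block_size=(250, 1))

    # mmap=False selects the non memory-map backends; nested=True then
    # chooses the nested implementation instead of the distributed one.
    forest = RandomForestClassifier(n_classes=3, mmap=False, nested=True)
    forest.fit(x_train, y_train)
    # Synchronization behavior depends on the backend (see sync_after_fit
    # and the collect flag documented above).
    y_pred = forest.predict(x_train)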
diff --git a/dislib/trees/nested/__init__.py b/dislib/trees/nested/__init__.py new file mode 100644 index 00000000..30b0bdb9 --- /dev/null +++ b/dislib/trees/nested/__init__.py @@ -0,0 +1,13 @@ +from dislib.trees.nested.forest import (RandomForestClassifier, + RandomForestRegressor) +from dislib.trees.nested.decision_tree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, +) + +__all__ = [ + "RandomForestClassifier", + "RandomForestRegressor", + "DecisionTreeClassifier", + "DecisionTreeRegressor", +] diff --git a/dislib/trees/nested/decision_tree.py b/dislib/trees/nested/decision_tree.py new file mode 100644 index 00000000..73c89bef --- /dev/null +++ b/dislib/trees/nested/decision_tree.py @@ -0,0 +1,1505 @@ +import math +import numpy as np +from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor +from pycompss.api.parameter import COLLECTION_IN, IN +from sklearn.utils import check_random_state +from pycompss.api.api import compss_delete_object, compss_wait_on +from dislib.data.array import Array +from pycompss.api.task import task +from pycompss.api.constraint import constraint +import scipy +from dislib.trees.nested.terasort import terasort + + +class BaseDecisionTree: + """Base class for distributed decision trees. + + Warning: This class should not be used directly. + Use derived classes instead. + """ + + def __init__( + self, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + base_node, + base_tree, + n_classes=None, + range_min=None, + range_max=None, + n_split_points="auto", + split_computation="raw", + sync_after_fit=True, + ): + self.n_classes = n_classes + self.try_features = try_features + self.max_depth = max_depth + self.sklearn_max = sklearn_max + self.distr_depth = distr_depth + self.bootstrap = bootstrap + self.random_state = random_state + self.base_node = base_node + self.base_tree = base_tree + + self.n_features = None + + self.tree = None + self.nodes_info = None + self.range_min = range_min + self.range_max = range_max + self.n_split_points = n_split_points + self.split_computation = split_computation + self.sync_after_fit = sync_after_fit + + @constraint(computing_units="${ComputingUnits}") + @task() + def fit(self, x, y): + """Fits the DecisionTree. + + Parameters + ---------- + x : ds-array + Samples of the dataset. + y: ds-array + Labels of the dataset. 
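+
+        Notes
+        -----
+        ``n_split_points`` is resolved here from the number of samples
+        ``n``: ``"auto"`` gives ``int(log(n))`` split points, ``"sqrt"``
+        gives ``int(sqrt(n))``, and a float in (0, 1) gives
+        ``int(fraction * n)``. As a worked example (values derived from
+        this code, not measured): with n = 10000 samples, "auto" yields
+        int(9.21) = 9 split points, "sqrt" yields 100, and 0.01 yields
+        100.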
+ """ + if self.range_max is None: + self.range_max = x.max() + if self.range_min is None: + self.range_min = x.min() + self.range_max._blocks = compss_wait_on(self.range_max._blocks) + self.range_min._blocks = compss_wait_on(self.range_min._blocks) + if self.n_split_points == "auto": + self.n_split_points = int(math.log(x.shape[0])) + elif self.n_split_points == "sqrt": + self.n_split_points = int(math.sqrt(x.shape[0])) + elif self.n_split_points < 1 and self.n_split_points > 0: + self.n_split_points = int(self.n_split_points * x.shape[0]) + elif isinstance(self.n_split_points, int): + pass + self.total_length = x.shape[0] + self.number_attributes = x.shape[1] + self.tree = self.base_node() + branches = [[x, y, self.tree]] + nodes_info = [] + selection = _sample_selection(x, + random_state=self.random_state, + bootstrap=self.bootstrap) + num_buckets = x._n_blocks[0] * x._n_blocks[1] + for i in range(self.distr_depth): + branches_pair = [] + for idx, branch_data in enumerate(branches): + x, y, actual_node = branch_data + node_info, results_l, results_l_2, results_r, results_r_2 = ( + _compute_split( + x, y, n_classes=self.n_classes, + range_min=self.range_min, + range_max=self.range_max, + num_buckets=int(num_buckets/(i+1)), + m_try=self.try_features, + number_attributes=self.number_attributes, + indexes_selected=selection, + number_split_points=int(self.n_split_points*(i+1)), + split_computation=self.split_computation, + random_state=self.random_state)) + actual_node.content = int(math.pow(2, int(i)) - 1 + idx) + actual_node.left = self.base_node() + actual_node.right = self.base_node() + splits_computed = [] + splits_computed.append(results_l) + splits_computed.append(results_l_2) + splits_computed.append(actual_node.left) + branches_pair.append(splits_computed) + splits_computed = [] + splits_computed.append(results_r) + splits_computed.append(results_r_2) + splits_computed.append(actual_node.right) + branches_pair.append(splits_computed) + nodes_info.append(node_info) + branches = branches_pair + for branch in branches: + x, y, actual_node = branch + actual_node = construct_subtree(x, y, actual_node, + self.try_features, + self.distr_depth, + max_depth=self.max_depth, + random_state=self.random_state) + nodes_info.append(actual_node) + nodes_info = compss_wait_on(nodes_info) + self.nodes_info = nodes_info + + @constraint(computing_units="${ComputingUnits}") + @task(returns=list) + def predict(self, x): + """Predicts target values or classes for the given samples using + a fitted tree. + + Parameters + ---------- + x_row : ds-array + A row block of samples. + + Returns + ------- + predicted : ndarray + An array with the predicted classes or values for the given + samples. For classification, the values are codes of the fitted + dislib.classification.rf.data.RfDataset. The returned object can + be a pycompss.runtime.Future object. + """ + assert self.tree is not None, "The decision tree is not fitted." + + block_predictions = [] + for x_block in x._blocks: + block_predictions.append(_predict_tree_class(x_block, + self.nodes_info, + 0, self.n_classes)) + return block_predictions + + +class DecisionTreeClassifier(BaseDecisionTree): + """A distributed decision tree classifier. + + Parameters + ---------- + try_features : int + The number of features to consider when looking for the best split. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires + to effectively inspect more than ``try_features`` features. 
max_depth : int
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+    distr_depth : int
+        Number of levels of the tree in which the nodes are split in a
+        distributed way.
+    bootstrap : bool
+        Randomly select n_instances samples with repetition (used in random
+        forests).
+    random_state : RandomState instance
+        The random number generator.
+
+    Attributes
+    ----------
+    n_features : int
+        The number of features of the dataset. It can be a
+        pycompss.runtime.Future object.
+    n_classes : int
+        The number of classes of the dataset. It can be a
+        pycompss.runtime.Future object.
+    tree : None or _Node
+        The root node of the tree after the tree is fitted.
+    nodes_info : None or list of _InnerNodeInfo and _LeafInfo
+        List of the node information for the nodes of the tree in the same
+        order as obtained in the fit() method, up to ``distr_depth`` depth.
+        After fit(), it is a pycompss.runtime.Future object.
+    subtrees : None or list of _Node
+        List of subtrees of the tree at ``distr_depth`` depth obtained in the
+        fit() method. After fit(), it is a list of pycompss.runtime.Future
+        objects.
+
+    Methods
+    -------
+    fit(x, y)
+        Fits the DecisionTreeClassifier.
+    predict(x_row)
+        Predicts classes for the given samples using a fitted tree.
+    predict_proba(x_row)
+        Predicts class probabilities for the given samples using a fitted
+        tree.
+
+    """
+
+    def __init__(
+        self,
+        n_classes,
+        try_features,
+        max_depth,
+        distr_depth,
+        sklearn_max,
+        bootstrap,
+        random_state,
+        range_min=None,
+        range_max=None,
+        n_split_points="auto",
+        split_computation="raw",
+        sync_after_fit=True,
+    ):
+        super().__init__(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+            _ClassificationNode,
+            SklearnDTClassifier,
+            n_classes=n_classes,
+            range_min=range_min,
+            range_max=range_max,
+            n_split_points=n_split_points,
+            split_computation=split_computation,
+            sync_after_fit=sync_after_fit,
+        )
+
+    @constraint(computing_units="${ComputingUnits}")
+    @task(returns=1)
+    def predict_proba(self, x):
+        """Predicts class probabilities for a row block using a fitted tree.
+
+        Parameters
+        ----------
+        x : ds-array
+            A row block of samples.
+
+        Returns
+        -------
+        predicted_proba : list
+            A list with the predicted probabilities for the given samples,
+            one entry per block of the input ds-array, so its length equals
+            the number of blocks the ds-array contains. Each entry has
+            shape (x._reg_shape[0], self.n_classes). The returned object
+            can be a pycompss.runtime.Future object.
+        """
+
+        assert self.tree is not None, "The decision tree is not fitted."
+
+        block_predictions = []
+        for x_block in x._blocks:
+            block_predictions.append(_predict_proba_tree(x_block,
+                                                         self.nodes_info,
+                                                         0, self.n_classes))
+        block_predictions = compss_wait_on(block_predictions)
+        return block_predictions
+
+
+class DecisionTreeRegressor(BaseDecisionTree):
+    """A distributed decision tree regressor.
+
+    Parameters
+    ----------
+    try_features : int
+        The number of features to consider when looking for the best split.
+
+        Note: the search for a split does not stop until at least one
+        valid partition of the node samples is found, even if it requires
+        to effectively inspect more than ``try_features`` features.
+    max_depth : int
+        The maximum depth of the tree. If np.inf, then nodes are expanded
+        until all leaves are pure.
+ distr_depth : int + Number of levels of the tree in which the nodes are split in a + distributed way. + bootstrap : bool + Randomly select n_instances samples with repetition (used in random + forests). + random_state : RandomState instance + The random number generator. + + Attributes + ---------- + n_features : int + The number of features of the dataset. It can be a + pycompss.runtime.Future object. + tree : None or _Node + The root node of the tree after the tree is fitted. + nodes_info : None or list of _InnerNodeInfo and _LeafInfo + List of the node information for the nodes of the tree in the same + order as obtained in the fit() method, up to ``distr_depth`` depth. + After fit(), it is a pycompss.runtime.Future object. + subtrees : None or list of _Node + List of subtrees of the tree at ``distr_depth`` depth obtained in the + fit() method. After fit(), it is a list of pycompss.runtime.Future + objects. + + Methods + ------- + fit(dataset) + Fits the DecisionTreeRegressor. + predict(x_row) + Predicts target values for the given samples using a fitted tree. + """ + + def __init__( + self, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + range_min=None, + range_max=None, + n_split_points="auto", + split_computation="raw", + sync_after_fit=True + ): + super().__init__( + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + _RegressionNode, + SklearnDTRegressor, + n_classes=None, + range_min=range_min, + range_max=range_max, + n_split_points=n_split_points, + split_computation=split_computation, + sync_after_fit=sync_after_fit, + ) + + @constraint(computing_units="${ComputingUnits}") + @task() + def fit(self, x, y): + """Fits the DecisionTreeRegressor. + + Parameters + ---------- + x : ds-array + Samples of the dataset. + y: ds-array + Labels of the dataset. 
+ """ + if self.range_max is None: + self.range_max = x.max() + if self.range_min is None: + self.range_min = x.min() + self.range_max._blocks = compss_wait_on(self.range_max._blocks) + self.range_min._blocks = compss_wait_on(self.range_min._blocks) + if self.n_split_points == "auto": + self.n_split_points = int(math.log(x.shape[0])) + elif self.n_split_points == "sqrt": + self.n_split_points = int(math.sqrt(x.shape[0])) + elif self.n_split_points < 1 and self.n_split_points > 0: + self.n_split_points = int(self.n_split_points*x.shape[0]) + elif isinstance(self.n_split_points, int): + pass + self.total_length = x.shape[0] + self.number_attributes = x.shape[1] + self.tree = self.base_node() + branches = [[x, y, self.tree]] + nodes_info = [] + selection = _sample_selection(x, random_state=self.random_state, + bootstrap=self.bootstrap) + num_buckets = x._n_blocks[0] * x._n_blocks[1] + for i in range(self.distr_depth): + branches_pair = [] + for idx, branch_data in enumerate(branches): + x, y, actual_node = branch_data + node_info, results_l, results_l_2, results_r, results_r_2 = ( + _compute_split_regressor( + x, y, range_min=self.range_min, + range_max=self.range_max, + num_buckets=int( + num_buckets/(i+1)), + m_try=self.try_features, + number_attributes=self.number_attributes, + indexes_selected=selection, + number_split_points=int(self.n_split_points*(i+1)), + split_computation=self.split_computation, + random_state=self.random_state)) + actual_node.content = int(math.pow(2, int(i)) - 1 + idx) + actual_node.left = self.base_node() + actual_node.right = self.base_node() + splits_computed = [results_l, results_l_2, actual_node.left] + branches_pair.append(splits_computed) + splits_computed = [results_r, results_r_2, actual_node.right] + branches_pair.append(splits_computed) + nodes_info.append(node_info) + branches = branches_pair + for branch in branches: + x, y, actual_node = branch + actual_node = construct_subtree(x, y, actual_node, + self.try_features, + self.distr_depth, + max_depth=self.max_depth, + random_state=self.random_state) + nodes_info.append(actual_node) + nodes_info = compss_wait_on(nodes_info) + self.nodes_info = nodes_info + + +@constraint(computing_units="${ComputingUnits}") +@task(returns=5, priority=True) +def _compute_split_regressor(x, y, num_buckets=4, + range_min=0, range_max=1, indexes_selected=None, + number_attributes=2, m_try=2, + number_split_points=100, split_computation="raw", + random_state=1): + if x[0] is None: + return None, [None], [None], [None], [None] + indexes_to_try = [] + random_state = check_random_state(random_state) + untried_indices = np.setdiff1d(np.arange(number_attributes), + indexes_to_try) + index_selection = _feature_selection( + untried_indices, m_try, random_state + ) + indexes_to_try.append(index_selection) + node_info = _NodeInfo() + final_rights_x = [object()] + final_rights_y = [object()] + final_lefts_x = [object()] + final_lefts_y = [object()] + if num_buckets < 1: + num_buckets = 1 + tried_indices = [] + for _ in range(number_attributes): + untried_indices = np.setdiff1d(np.arange(number_attributes), + tried_indices) + index_selection = _feature_selection( + untried_indices, m_try, random_state + ) + results = terasort(x, index_selection, range_min=range_min, + range_max=range_max, + indexes_selected=indexes_selected, + num_buckets=num_buckets) + split_points_per_attribute = [] + for i in range(len(results[0])): + split_points_per_attribute.append( + get_split_point_various_attributes_bucket( + results[:, i], 
number_split_points=number_split_points, + split_computation=split_computation)) + [compss_delete_object(b) for results_2 in results for b in results_2] + del results + split_points_per_attribute = compss_wait_on(split_points_per_attribute) + partial_results_left = [] + partial_results_right = [] + for idx, split_values in enumerate(split_points_per_attribute): + partial_results_left.append([]) + partial_results_right.append([]) + if isinstance(x, Array): + for index_blocks, block_s in enumerate(zip( + x._blocks, y._blocks)): + idx_selected = indexes_selected[ + indexes_selected < (index_blocks + 1) * + x._reg_shape[0]] + block_x, block_y = block_s + left_class, right_class = classes_per_split( + block_x, block_y, split_values, index_selection, + idx_selected[idx_selected >= (index_blocks) * + x._reg_shape[0]] % x._reg_shape[0], + regression=True) + partial_results_left[idx].append(left_class) + partial_results_right[idx].append(right_class) + del idx_selected + else: + for block_x, block_y in zip(x, y): + left_class, right_class = classes_per_split( + block_x, block_y, split_values, index_selection, + np.array([0]), regression=True) + partial_results_left[idx].append(left_class) + partial_results_right[idx].append(right_class) + partial_results_right_array = np.array(compss_wait_on( + partial_results_right)) + partial_results_left_array = np.array(compss_wait_on( + partial_results_left)) + store_mse_values = [] + evaluation_of_splits = [] + for idx in range(partial_results_right_array.shape[0]): + for j in range(partial_results_right_array.shape[2]): + global_gini_values, produces_split = ( + merge_partial_results_compute_mse_both_sides( + partial_results_left_array[idx, :, j], + partial_results_right_array[idx, :, j])) + store_mse_values.append(global_gini_values) + evaluation_of_splits.append(produces_split) + + store_mse_values = compss_wait_on(store_mse_values) + evaluation_of_splits = compss_wait_on(evaluation_of_splits) + del partial_results_right_array + del partial_results_left_array + [compss_delete_object(result) for results in + partial_results_right for result in results] + [compss_delete_object(result) for results in + partial_results_left for result in results] + best_attribute, position_m_g, bucket_minimum_gini, minimum_mse = ( + get_minimum_measure(store_mse_values, m_try, gini=False)) + optimal_split_point = select_optimal_split_point( + best_attribute, position_m_g, split_points_per_attribute, + bucket_minimum_gini) + compss_delete_object(position_m_g) + compss_delete_object(bucket_minimum_gini) + compss_delete_object(*evaluation_of_splits) + compss_delete_object(*store_mse_values) + compss_delete_object(*split_points_per_attribute) + rights_x = [] + rights_y = [] + lefts_x = [] + lefts_y = [] + right_sums = [] + right_lengths = [] + left_sums = [] + left_lengths = [] + if isinstance(x, Array): + for block_x, block_y in zip(x._blocks, y._blocks): + (right_x, right_y, left_x, left_y, compress_r, + len_compress_r, compress_l, len_compress_l) = ( + apply_split_points_to_blocks_regression( + block_x, block_y, best_attribute, + optimal_split_point, index_selection)) + rights_x.append([right_x]) + rights_y.append([right_y]) + lefts_x.append([left_x]) + lefts_y.append([left_y]) + right_sums.append(compress_r) + right_lengths.append(len_compress_r) + left_sums.append(compress_l) + left_lengths.append(len_compress_l) + else: + for block_x, block_y in zip(x, y): + (right_x, right_y, left_x, left_y, compress_r, + len_compress_r, compress_l, len_compress_l) = ( + 
apply_split_points_to_blocks_regression( + block_x, block_y, best_attribute, + optimal_split_point, index_selection)) + rights_x.append([right_x]) + rights_y.append([right_y]) + lefts_x.append([left_x]) + lefts_y.append([left_y]) + right_sums.append(compress_r) + right_lengths.append(len_compress_r) + left_sums.append(compress_l) + left_lengths.append(len_compress_l) + [compss_delete_object(x_data[0]) for x_data in x] + [compss_delete_object(y_data[0]) for y_data in y] + final_rights_x[0] = rights_x + final_rights_y[0] = rights_y + final_lefts_x[0] = lefts_x + final_lefts_y[0] = lefts_y + if (np.sum(left_lengths) + np.sum(right_lengths)) <= 4: + node_info.set(_compute_leaf_info((np.sum(left_sums) + + np.sum(right_sums)) / + (np.sum(left_lengths) + + np.sum(right_lengths)), None, + occurrences=np.sum(left_lengths) + + + np.sum(right_lengths) + )) + elif np.sum(right_lengths) == 0: + node_info.set(_compute_leaf_info( + (np.sum(left_sums) + np.sum(right_sums)) / + (np.sum(left_lengths) + np.sum(right_lengths)), None, + occurrences=np.sum(left_lengths) + np.sum(right_lengths))) + elif np.sum(left_lengths) == 0: + node_info.set(_compute_leaf_info( + (np.sum(left_sums) + np.sum(right_sums)) / + (np.sum(left_lengths) + np.sum(right_lengths)), None, + occurrences=np.sum(left_lengths) + np.sum(right_lengths))) + elif best_attribute is None: + node_info.set(_compute_leaf_info( + (np.sum(left_sums) + np.sum(right_sums)) / + (np.sum(left_lengths) + np.sum(right_lengths)), None, + occurrences=np.sum(left_lengths) + np.sum(right_lengths))) + else: + node_info.set(_InnerNodeInfo(index_selection[ + best_attribute], + optimal_split_point)) + del right_sums + del right_lengths + del left_lengths + del left_sums + del minimum_mse + del optimal_split_point + del best_attribute + return (node_info, final_lefts_x[0], final_lefts_y[0], + final_rights_x[0], final_rights_y[0]) + del right_sums + del right_lengths + del left_lengths + del left_sums + del minimum_mse + del optimal_split_point + del best_attribute + tried_indices.extend(index_selection) + if len(tried_indices) == number_attributes: + break + return node_info, [None], [None], [None], [None] + + +@constraint(computing_units="${ComputingUnits}") +@task(returns=5, priority=True) +def _compute_split(x, y, n_classes=None, num_buckets=4, + range_min=0, range_max=1, + indexes_selected=None, number_attributes=2, m_try=2, + number_split_points=100, + split_computation="raw", random_state=None): + if x[0] is None: + return None, [None], [None], [None], [None] + indexes_to_try = [] + random_state = check_random_state(random_state) + untried_indices = np.setdiff1d(np.arange(number_attributes), + indexes_to_try) + index_selection = _feature_selection( + untried_indices, m_try, random_state + ) + indexes_to_try.append(index_selection) + node_info = _NodeInfo() + final_rights_x = [object()] + final_rights_y = [object()] + final_lefts_x = [object()] + final_lefts_y = [object()] + tried_indices = [] + if num_buckets < 1: + num_buckets = 2 + for _ in range(number_attributes): + untried_indices = np.setdiff1d(np.arange( + number_attributes), tried_indices) + index_selection = _feature_selection( + untried_indices, m_try, random_state + ) + results = terasort(x, index_selection, range_min=range_min, + range_max=range_max, + indexes_selected=indexes_selected, + num_buckets=num_buckets) + split_points_per_attribute = [] + for i in range(len( + results[0])): + split_points_per_attribute.append( + get_split_point_various_attributes_bucket( + results[:, i], 
number_split_points=number_split_points, + split_computation=split_computation)) + [compss_delete_object(b) for results_2 in results for b in results_2] + del results + split_points_per_attribute = compss_wait_on( + split_points_per_attribute) + partial_results_left = [] + partial_results_right = [] + for idx, split_values in enumerate(split_points_per_attribute): + partial_results_left.append([]) + partial_results_right.append([]) + if isinstance(x, Array): + for index_blocks, block_s in enumerate( + zip(x._blocks, y._blocks)): + idx_selected = indexes_selected[ + indexes_selected < (index_blocks + 1) * + x._reg_shape[0]] + block_x, block_y = block_s + left_class, right_class = classes_per_split( + block_x, block_y, split_values, index_selection, + idx_selected[idx_selected >= (index_blocks) * + x._reg_shape[0]] % x._reg_shape[0]) + partial_results_left[idx].append(left_class) + partial_results_right[idx].append(right_class) + else: + for block_x, block_y in zip(x, y): + left_class, right_class = classes_per_split( + block_x, block_y, split_values, + index_selection, np.array([0])) + partial_results_left[idx].append(left_class) + partial_results_right[idx].append(right_class) + partial_results_right_array = np.array(compss_wait_on( + partial_results_right)) + partial_results_left_array = np.array(compss_wait_on( + partial_results_left)) + store_gini_values = [] + evaluation_of_splits = [] + for idx in range(partial_results_right_array.shape[0]): + for j in range(partial_results_right_array.shape[2]): + global_gini_values, produces_split = ( + merge_partial_results_compute_gini_both_sides( + partial_results_left_array[idx, :, j], + partial_results_right_array[idx, :, j], + n_classes)) + store_gini_values.append(global_gini_values) + evaluation_of_splits.append(produces_split) + store_gini_values = compss_wait_on(store_gini_values) + evaluation_of_splits = compss_wait_on(evaluation_of_splits) + del partial_results_right_array + del partial_results_left_array + [compss_delete_object(result) for results in + partial_results_right for result in results] + [compss_delete_object(result) for results in + partial_results_left for result in results] + best_attribute, position_m_g, bucket_minimum_gini, minimum_ginis = ( + get_minimum_measure(store_gini_values, + len(index_selection), + gini=True)) + optimal_split_point = select_optimal_split_point( + best_attribute, position_m_g, split_points_per_attribute, + bucket_minimum_gini) + compss_delete_object(position_m_g) + compss_delete_object(bucket_minimum_gini) + compss_delete_object(minimum_ginis) + compss_delete_object(*evaluation_of_splits) + compss_delete_object(*store_gini_values) + compss_delete_object(*split_points_per_attribute) + rights_x = [] + rights_y = [] + lefts_x = [] + lefts_y = [] + aggregate = np.zeros(n_classes, dtype=np.int64) + aggregate_r = np.zeros(n_classes, dtype=np.int64) + if isinstance(x, Array): + for block_x, block_y in zip(x._blocks, y._blocks): + right_x, right_y, left_x, left_y, aggregate_r, aggregate = ( + apply_split_points_to_blocks( + block_x, block_y, best_attribute, + optimal_split_point, index_selection, n_classes, + aggregate, aggregate_r)) + rights_x.append([right_x]) + rights_y.append([right_y]) + lefts_x.append([left_x]) + lefts_y.append([left_y]) + else: + for block_x, block_y in zip(x, y): + right_x, right_y, left_x, left_y, aggregate_r, aggregate = ( + apply_split_points_to_blocks( + block_x, block_y, best_attribute, optimal_split_point, + index_selection, n_classes, aggregate, aggregate_r)) + 
rights_x.append([right_x])
+            rights_y.append([right_y])
+            lefts_x.append([left_x])
+            lefts_y.append([left_y])
+    [compss_delete_object(x_data[0]) for x_data in x]
+    [compss_delete_object(y_data[0]) for y_data in y]
+    final_rights_x[0] = rights_x
+    final_rights_y[0] = rights_y
+    final_lefts_x[0] = lefts_x
+    final_lefts_y[0] = lefts_y
+
+    if (np.sum(aggregate) + np.sum(aggregate_r)) <= 4:
+        node_info.set(_compute_leaf_info(aggregate +
+                                         aggregate_r, n_classes))
+    elif np.sum(aggregate_r) == 0:
+        node_info.set(_compute_leaf_info(aggregate + aggregate_r,
+                                         n_classes))
+    elif np.sum(aggregate) == 0:
+        node_info.set(_compute_leaf_info(aggregate + aggregate_r,
+                                         n_classes))
+    elif best_attribute is None:
+        node_info.set(_compute_leaf_info(aggregate + aggregate_r,
+                                         n_classes))
+    else:
+        node_info.set(_InnerNodeInfo(index_selection[best_attribute],
+                                     optimal_split_point))
+    del best_attribute
+    del evaluation_of_splits
+    del optimal_split_point
+    del aggregate
+    del aggregate_r
+    del minimum_ginis
+    return (node_info, final_lefts_x[0], final_lefts_y[0],
+            final_rights_x[0], final_rights_y[0])
+    del best_attribute
+    del evaluation_of_splits
+    del optimal_split_point
+    del aggregate
+    del aggregate_r
+    del minimum_ginis
+    tried_indices.extend(index_selection)
+    if len(tried_indices) == number_attributes:
+        break
+    return node_info, [None], [None], [None], [None]
+
+
+def _feature_selection(untried_indices, m_try, random_state):
+    selection_len = min(m_try, len(untried_indices))
+    return random_state.choice(
+        untried_indices, size=selection_len, replace=False
+    )
+
+
+def _compute_leaf_info(y_s, n_classes, occurrences=None):
+    if n_classes is not None:
+        y_s = y_s.squeeze()
+        mode = np.argmax(y_s)
+        return _LeafInfo(np.sum(y_s), y_s, mode)
+    else:
+        return _LeafInfo(occurrences, None, y_s)
+
+
+def _predict_tree_class(x, node, node_content_num, n_classes=None,
+                        rights=0, depth=0):
+    if node_content_num == 0:
+        node_content_num = node_content_num + 1
+    else:
+        node_content_num = node_content_num * 2 + rights
+    x = np.block(x)
+    node_content = node[node_content_num - 1]
+    if len(x) == 0:
+        if n_classes is not None:
+            return np.empty((0, n_classes), dtype=np.float64)
+        else:
+            return np.empty((0,), dtype=np.float64)
+    if isinstance(node_content, _NodeInfo):
+        if isinstance(node_content.get(), _LeafInfo):
+            if n_classes is not None:
+                return np.full((len(x), n_classes), node_content.get().target)
+            return np.full((len(x),), node_content.get().target)
+        elif isinstance(node_content.get(), _InnerNodeInfo):
+            if n_classes is not None:
+                pred = np.empty((x.shape[0], n_classes), dtype=np.float64)
+                l_msk = (x[:, node_content.get().index:
+                           (node_content.get().index + 1)] <=
+                         node_content.get().value)
+                pred[l_msk.flatten(), :] = _predict_tree_class(
+                    x[l_msk.flatten(), :], node, node_content_num,
+                    n_classes=n_classes,
+                    rights=0, depth=depth + 1)
+                pred[~l_msk.flatten(), :] = _predict_tree_class(
+                    x[~l_msk.flatten(), :], node, node_content_num,
+                    n_classes=n_classes,
+                    rights=1, depth=depth + 1)
+                return pred
+            else:
+                pred = np.empty((x.shape[0],), dtype=np.float64)
+                l_msk = (x[:, node_content.get().index:
+                           (node_content.get().index + 1)] <=
+                         node_content.get().value)
+                pred[l_msk.flatten()] = _predict_tree_class(
+                    x[l_msk.flatten()], node, node_content_num,
+                    n_classes=n_classes,
+                    rights=0, depth=depth + 1)
+                pred[~l_msk.flatten()] = _predict_tree_class(
+                    x[~l_msk.flatten()], node, node_content_num,
+                    n_classes=n_classes,
+                    rights=1, depth=depth + 1)
+                return pred
+    elif isinstance(node_content,
_ClassificationNode): + if len(x) > 0: + sk_tree_pred = node_content.content.sk_tree.predict(x) + b = np.zeros((sk_tree_pred.size, n_classes)) + b[np.arange(sk_tree_pred.size), sk_tree_pred] = 1 + sk_tree_pred = b + pred = np.zeros((len(x), n_classes), dtype=np.float64) + pred[:, np.arange(n_classes)] = sk_tree_pred + return pred + elif isinstance(node_content, _RegressionNode): + if len(x) > 0: + sk_tree_pred = node_content.content.sk_tree.predict(x) + return sk_tree_pred + + +def _predict_proba_tree(x, node, node_content_num, + n_classes=None, rights=0, depth=0): + if node_content_num == 0: + node_content_num = node_content_num + 1 + else: + node_content_num = node_content_num * 2 + rights + x = np.block(x) + node_content = node[node_content_num - 1] + if len(x) == 0: + return np.empty((0, n_classes), dtype=np.float64) + if isinstance(node_content, _NodeInfo): + if isinstance(node_content.get(), _LeafInfo): + single_pred = (node_content.get().frequencies / + node_content.get().size) + return np.tile(single_pred, (len(x), 1)) + elif isinstance(node_content.get(), _InnerNodeInfo): + pred = np.empty((x.shape[0], n_classes), dtype=np.float64) + l_msk = (x[:, node_content.get().index: + (node_content.get().index + 1)] <= + node_content.get().value) + pred[l_msk.flatten(), :] = compss_wait_on( + _predict_proba_tree(x[l_msk.flatten(), :], + node, node_content_num, + n_classes=n_classes, + rights=0, depth=depth + 1)) + pred[~l_msk.flatten(), :] = compss_wait_on( + _predict_proba_tree(x[~l_msk.flatten(), :], + node, node_content_num, + n_classes=n_classes, + rights=1, depth=depth + 1)) + return pred + elif isinstance(node_content, _ClassificationNode): + if len(x) > 0: + sk_tree_pred = node_content.content.sk_tree.predict_proba(x) + pred = np.zeros((len(x), n_classes), dtype=np.float64) + pred[:, node_content.content.sk_tree.classes_] = sk_tree_pred + return pred + + +def apply_split_points_to_blocks_regression(x_block, y_block, + best_attribute, + optimal_value, indexes_to_try): + if optimal_value is None: + data_to_compress = np.block(y_block) + len_compress_l = np.array([0]) + compress_l = np.array([0]) + if len(data_to_compress) > 0: + compress_l = np.sum(data_to_compress) + len_compress_l = len(data_to_compress) + return (None, None, np.block(x_block), np.block(y_block), + np.array([0]), np.array([0]), compress_l, len_compress_l) + if x_block is None: + return (None, None, None, None, np.array([np.nan]), + np.array([np.nan]), np.array([np.nan]), np.array([np.nan])) + else: + x_block = np.block(x_block) + y_block = np.block(y_block) + left_x = x_block[x_block[:, indexes_to_try[best_attribute]] < + optimal_value] + right_x = x_block[x_block[:, indexes_to_try[best_attribute]] >= + optimal_value] + right_y = y_block[x_block[:, indexes_to_try[best_attribute]] >= + optimal_value] + left_y = y_block[x_block[:, indexes_to_try[best_attribute]] < + optimal_value] + data_to_compress = np.block(right_y) + data_to_compress_2 = np.block(left_y) + if len(data_to_compress) > 0: + compress_r = np.sum(data_to_compress) + len_compress_r = len(data_to_compress) + else: + compress_r = np.array([0]) + len_compress_r = np.array([0]) + if len(data_to_compress_2) > 0: + compress_l = np.sum(data_to_compress_2) + len_compress_l = len(data_to_compress_2) + else: + compress_l = np.array([0]) + len_compress_l = np.array([0]) + del x_block + del y_block + return (right_x, right_y, left_x, left_y, compress_r, + len_compress_r, compress_l, len_compress_l) + + +def apply_split_points_to_blocks(x_block, y_block, 
best_attribute, + optimal_value, indexes_to_try, + n_classes, aggregate_r, aggregate): + if optimal_value is None: + y_block = np.block(y_block) + if y_block is not None: + if len(y_block) > 0: + data_bincount = np.bincount(y_block.astype(int).flatten()) + if len(data_bincount) < n_classes: + aggregate[:len(data_bincount)] += data_bincount + else: + aggregate += data_bincount + return (None, None, np.block(x_block), np.block(y_block), + aggregate_r, aggregate) + if x_block is None: + return None, None, None, None, aggregate_r, aggregate + else: + x_block = np.block(x_block) + y_block = np.block(y_block) + left_x = x_block[x_block[:, indexes_to_try[best_attribute]] < + optimal_value] + right_x = x_block[x_block[:, indexes_to_try[best_attribute]] >= + optimal_value] + right_y = y_block[x_block[:, indexes_to_try[best_attribute]] >= + optimal_value] + left_y = y_block[x_block[:, indexes_to_try[best_attribute]] < + optimal_value] + del x_block + del y_block + if right_y is not None: + if len(right_y) > 0: + data_bincount = np.bincount(right_y.astype(int).flatten()) + if len(data_bincount) < n_classes: + aggregate_r[:len(data_bincount)] += data_bincount + else: + aggregate_r += data_bincount + if left_y is not None: + if len(left_y) > 0: + data_bincount = np.bincount(left_y.astype(int).flatten()) + if len(data_bincount) < n_classes: + aggregate[:len(data_bincount)] += data_bincount + else: + aggregate += data_bincount + return right_x, right_y, left_x, left_y, aggregate_r, aggregate + + +def select_optimal_split_point(best_attribute, position_m_g, + split_points, bucket_minimum_gini): + if best_attribute is None: + return None + return split_points[bucket_minimum_gini][best_attribute][position_m_g] + + +def get_minimum_measure(ginis_list, number_attributes, gini=True): + if gini: + minimum_measure = 1 + else: + minimum_measure = np.inf + for idx, ginis in enumerate(ginis_list): + if ginis[np.argmin(ginis)] < minimum_measure: + position_m_g = np.argmin(ginis) + minimum_measure = ginis[position_m_g] + best_attribute = idx % number_attributes + actual_bucket = int(math.floor(idx / number_attributes)) + if minimum_measure == 1: + return None, None, None, 1 + if minimum_measure == np.inf: + return None, None, None, np.inf + return best_attribute, position_m_g, actual_bucket, minimum_measure + + +@constraint(computing_units="${ComputingUnits}") +@task(returns=2) +def merge_partial_results_compute_mse_both_sides(partial_results_l, + partial_results_r): + if partial_results_l[0] is None or len(partial_results_l[0]) < 1: + return np.array([np.inf]), False + if partial_results_l[0][0] is None: + return np.array([np.inf]), False + concatted_values_l = [] + value_to_compute_mse = [] + for k in range(len(partial_results_l[0])): + value_to_concat = [] + value_to_mse = [] + for j in range(len(partial_results_l)): + value_to_concat.append(partial_results_l[j][k][1:]) + if not np.isnan(partial_results_l[j][k][0]): + value_to_mse.extend([partial_results_l[j][k][0]]) + else: + value_to_mse.extend([0]) + concatted_values_l.append(np.sum(value_to_concat, axis=0)) + value_to_compute_mse.append(value_to_mse) + number_occurrences = [occurrences[1] for + occurrences in concatted_values_l] + mse_values = [] + for individual_values, value in zip(value_to_compute_mse, + concatted_values_l): + mse_values.append(np.sum(np.square(np.subtract(individual_values, + value[0] / value[1])))) + del value + del concatted_values_l + if partial_results_r[0] is None or len(partial_results_r[0]) < 1: + return np.array([np.inf]), False + 
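The merge above pools the per-block (sum, count) pairs of the left side into a global mean and scores the split by the summed squared deviation of each block mean from that pooled mean; the same pass is repeated for the right-hand partials just below. A minimal numpy sketch of the rule for a single split point, using hypothetical per-block (mean, sum, count) triples:

    import numpy as np

    # One (mean, sum, count) triple per data block for one candidate
    # split point; the values are illustrative only.
    partials = [(2.0, 4.0, 2), (5.0, 15.0, 3)]

    total_sum = sum(p[1] for p in partials)    # pooled like np.sum(..., axis=0)
    total_count = sum(p[2] for p in partials)
    pooled_mean = total_sum / total_count      # 3.8

    # Squared deviation of each block mean from the pooled mean.
    score = np.sum(np.square([p[0] - pooled_mean for p in partials]))
    print(score)                               # ~4.68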
if partial_results_r[0][0] is None: + return np.array([np.inf]), False + concatted_values_r = [] + value_to_compute_mse = [] + for k in range(len(partial_results_r[0])): + value_to_concat = [] + value_to_mse = [] + for j in range(len(partial_results_r)): + value_to_concat.append(partial_results_r[j][k][1:]) + if not np.isnan(partial_results_r[j][k][0]): + value_to_mse.extend([partial_results_r[j][k][0]]) + else: + value_to_mse.extend([0]) + concatted_values_r.append(np.sum(value_to_concat, axis=0)) + value_to_compute_mse.append(value_to_mse) + number_occurrences_r = [occurrences[1] for + occurrences in concatted_values_r] + mse_values_r = [] + for individual_values, value in zip(value_to_compute_mse, + concatted_values_r): + mse_values_r.append(np.sum(np.square(np.subtract( + individual_values, value[0] / value[1])))) + del value + del concatted_values_r + if mse_values is None: + return np.array([np.inf]), False + return np.add(mse_values, mse_values_r), np.array( + [number_occurrences_r[i] != 0 and number_occurrences[i] != 0 + for i in range(len(mse_values))]) + + +def gini_function_compressed(y, classes): + if not len(y) != 0: + return 0 + probs = [] + total_y = np.sum(y) + for idx in range(len(classes)): + if len(y) > idx: + probs.append(y[idx]/total_y) + p = np.array(probs) + return 1 - ((p * p).sum()) + + +@constraint(computing_units="${ComputingUnits}") +@task(returns=2) +def merge_partial_results_compute_gini_both_sides(partial_results_l, + partial_results_r, + n_classes): + if partial_results_l[0] is None or len(partial_results_l[0]) < 1: + return np.array([5]), False + if partial_results_l[0][0] is None: + return np.array([5]), False + concatted_values_l = [] + for k in range(len(partial_results_l[0])): + value_to_concat = np.zeros(n_classes) + for j in range(len(partial_results_l)): + if len(partial_results_l[j][k]) > 0: + value_to_concat[:len(partial_results_l[j][k])] = ( + value_to_concat[:len(partial_results_l[j][k])] + + partial_results_l[j][k]) + concatted_values_l.append(value_to_concat) + number_occurrences = [np.sum(occurrences).astype(int) for + occurrences in concatted_values_l] + gini_values = [] + for value in concatted_values_l: + gini_values.append(gini_function_compressed(value, + np.arange(n_classes))) + if partial_results_r[0] is None or len(partial_results_r[0]) < 1: + return np.array([5]), False + if partial_results_r[0][0] is None: + return np.array([5]), False + concatted_values_r = [] + for k in range(len(partial_results_r[0])): + value_to_concat = np.zeros(n_classes) + for j in range(len(partial_results_r)): + value_to_concat[:len(partial_results_r[j][k])] = ( + value_to_concat[:len(partial_results_r[j][k])] + + partial_results_r[j][k]) + concatted_values_r.append(value_to_concat) + gini_values_r = [] + for value in concatted_values_r: + gini_values_r.append(gini_function_compressed( + value, np.arange(n_classes))) + number_occurrences_r = [np.sum(occurrences).astype(int) for + occurrences in concatted_values_r] + del concatted_values_r + return np.array( + [(number_occurrences_r[i] / (number_occurrences_r[i] + + number_occurrences[i]) * + gini_values_r[i]) + (number_occurrences[i] / ( + number_occurrences_r[i] + number_occurrences[i]) * + gini_values[i]) + if number_occurrences[i] >= 4 and number_occurrences_r[i] >= 4 else + 5 for i in range(len(gini_values))]), \ + np.array([number_occurrences_r[i] != 0 and number_occurrences[i] != 0 + for i in range(len(gini_values))]) + + +@constraint(computing_units="${ComputingUnits}") +@task(x_block=COLLECTION_IN, 
y_block=COLLECTION_IN, returns=2) +def classes_per_split(x_block, y_block, split_points, indexes_to_compare, + indexes_to_select=np.array([0]), regression=False): + number_classes_l = [np.array([]) for _ in range(len(indexes_to_compare))] + number_classes_r = [np.array([]) for _ in range(len(indexes_to_compare))] + number_none_split_points = 0 + for inner_split in split_points: + if inner_split is None: + number_none_split_points = number_none_split_points + 1 + if x_block is None or len(x_block) == 0 or \ + number_none_split_points == len(split_points): + for idx in range(len(indexes_to_compare)): + number_classes_l[idx] = np.array([]) + number_classes_r[idx] = np.array([]) + return number_classes_l, number_classes_r + x_block = np.block(x_block) + y_block = np.block(y_block) + if indexes_to_select is not None: + if len(indexes_to_select) == 1: + if indexes_to_select[0] == 0: + x_block = x_block[:, indexes_to_compare] + else: + y_block = y_block[indexes_to_select] + x_block = x_block[indexes_to_select] + x_block = x_block[:, indexes_to_compare] + else: + y_block = y_block[indexes_to_select] + x_block = x_block[indexes_to_select] + x_block = x_block[:, indexes_to_compare] + else: + x_block = x_block[:, indexes_to_compare] + if regression: + for idx, attribute_split_points in enumerate(split_points): + attribute_splittings_l = [] + attribute_splittings_r = [] + for value in attribute_split_points: + attribute_splittings_l.append(np.array( + [np.mean(y_block[x_block[:, idx] < value, 0]), + np.sum(y_block[x_block[:, idx] < value, 0]), + len(y_block[x_block[:, idx] < value, 0])])) + attribute_splittings_r.append(np.array( + [np.mean(y_block[x_block[:, idx] >= value, 0]), + np.sum(y_block[x_block[:, idx] >= value, 0]), + len(y_block[x_block[:, idx] >= value, 0])])) + if len(attribute_splittings_r) == 0: + attribute_splittings_r = np.array([]) + if len(attribute_splittings_l) == 0: + attribute_splittings_l = np.array([]) + number_classes_l[idx] = attribute_splittings_l + number_classes_r[idx] = attribute_splittings_r + else: + for idx, attribute_split_points in enumerate(split_points): + attribute_splittings_l = [] + attribute_splittings_r = [] + for value in attribute_split_points: + attribute_splittings_l.append(np.bincount( + y_block[x_block[:, idx] < value, 0].astype(int))) + attribute_splittings_r.append(np.bincount( + y_block[x_block[:, idx] >= value, 0].astype(int))) + if len(attribute_splittings_r) == 0: + attribute_splittings_r = np.array([]) + if len(attribute_splittings_l) == 0: + attribute_splittings_l = np.array([]) + number_classes_l[idx] = attribute_splittings_l + number_classes_r[idx] = attribute_splittings_r + del x_block + del y_block + return number_classes_l, number_classes_r + + +@constraint(computing_units="${ComputingUnits}") +@task(returns=1) +def get_split_point_various_attributes_bucket(unique_values, + number_split_points=100, + split_computation="raw"): + sample_blocks_list = [] + for idx, bucket in enumerate(unique_values): + if bucket is None: + sample_blocks_list.append([]) + return sample_blocks_list + sample_blocks = np.copy(bucket) + if len(sample_blocks) == 0: + sample_blocks_list.append([]) + return sample_blocks_list + number_split_points_actual = number_split_points + if split_computation == "raw": + sample_blocks[:-1] += sample_blocks[1:] + sample_blocks[-1] = sample_blocks[-1] * 2 + sample_blocks = sample_blocks / 2 + if number_split_points_actual == 0: + number_split_points_actual = 1 + distance_between_split_points = int(len( + sample_blocks) /
number_split_points_actual) + if distance_between_split_points == 0: + sample_blocks_list.append(sample_blocks) + else: + sample_blocks_list.append( + sample_blocks[0::distance_between_split_points]) + elif split_computation == "gaussian_approximation": + std = np.std(sample_blocks) + mean = np.mean(sample_blocks) + sample_blocks = np.array([mean + std * scipy.stats.norm.ppf( + (i + 1) / (number_split_points_actual + 1)) for i in + range(number_split_points_actual - 1)]) + sample_blocks_list.append(sample_blocks) + elif split_computation == "uniform_approximation": + maximum = np.max(sample_blocks) + minimum = np.min(sample_blocks) + sample_blocks = np.array([minimum + i * ((maximum - minimum) / ( + number_split_points_actual + 1)) for i in + range(number_split_points_actual)]) + sample_blocks_list.append(sample_blocks) + return sample_blocks_list + + +@constraint(computing_units="${ComputingUnits}") +@task(x=COLLECTION_IN, y=COLLECTION_IN, actual_node=IN, returns=1) +def construct_subtree(x, y, actual_node, m_try, depth, max_depth=25, + random_state=0): + if x is None or x[0] is None: + actual_node.content = None + return actual_node + else: + if max_depth == np.inf: + sklearn_max_depth = None + else: + sklearn_max_depth = max_depth - depth + if isinstance(actual_node, _ClassificationNode): + dt = SklearnDTClassifier( + max_features=m_try, + max_depth=sklearn_max_depth, + random_state=random_state, + ) + elif isinstance(actual_node, _RegressionNode): + dt = SklearnDTRegressor( + max_features=m_try, + max_depth=sklearn_max_depth, + random_state=random_state, + ) + x = np.block(x) + y = np.block(y) + if len(y) == 0 or np.any(y) is None: + actual_node.content = None + else: + dt.fit(x, y.astype(int), check_input=False) + actual_node.content = _SkTreeWrapper(dt) + return actual_node + + +def _sample_selection(x, random_state, bootstrap=True): + if bootstrap: # bootstrap: + selection = random_state.choice( + x.shape[0], size=x.shape[0], replace=True + ) + selection.sort() + else: + selection = np.arange(x.shape[0]) + return selection + + +class _SkTreeWrapper: + def __init__(self, tree): + self.sk_tree = tree + + def toJson(self): + return { + "class_name": self.__class__.__name__, + "module_name": self.__module__, + "items": self.__dict__, + } + + +class _LeafInfo: + def __init__(self, size=None, frequencies=None, target=None): + self.size = size + self.frequencies = frequencies + self.target = target + + def toJson(self): + return { + "class_name": self.__class__.__name__, + "module_name": self.__module__, + "items": self.__dict__, + } + + +class _InnerNodeInfo: + def __init__(self, index=None, value=None): + self.index = index + self.value = value + + def toJson(self): + return { + "class_name": self.__class__.__name__, + "module_name": self.__module__, + "items": self.__dict__, + } + + +class _Node: + """Base class for tree nodes""" + + def __init__(self, is_classifier): + self.content = None + self.left = None + self.right = None + self.is_classifier = is_classifier + self.predict_dtype = np.int64 if is_classifier else np.float64 + + '''def predict(self, sample): + node_content = self.content + if isinstance(node_content, _LeafInfo): + return np.full((len(sample),), node_content.target) + if isinstance(node_content, _SkTreeWrapper): + if len(sample) > 0: + return node_content.sk_tree.predict(sample) + if isinstance(node_content, _InnerNodeInfo): + pred = np.empty((len(sample),), dtype=self.predict_dtype) + left_mask = sample[:, node_content.index] <= node_content.value + pred[left_mask] = 
self.left.predict(sample[left_mask]) + pred[~left_mask] = self.right.predict(sample[~left_mask]) + return pred + assert len(sample) == 0, "Type not supported" + return np.empty((0,), dtype=self.predict_dtype)''' + + +class _ClassificationNode(_Node): + def __init__(self): + super().__init__(is_classifier=True) + + '''def predict_proba(self, sample, n_classes): + node_content = self.content + if isinstance(node_content, _LeafInfo): + single_pred = node_content.frequencies / node_content.size + return np.tile(single_pred, (len(sample), 1)) + if isinstance(node_content, _SkTreeWrapper): + if len(sample) > 0: + sk_tree_pred = node_content.sk_tree.predict_proba(sample) + pred = np.zeros((len(sample), n_classes), dtype=np.float64) + pred[:, node_content.sk_tree.classes_] = sk_tree_pred + return pred + if isinstance(node_content, _InnerNodeInfo): + pred = np.empty((len(sample), n_classes), dtype=np.float64) + l_msk = sample[:, node_content.index] <= node_content.value + pred[l_msk] = self.left.predict_proba(sample[l_msk], n_classes) + pred[~l_msk] = self.right.predict_proba(sample[~l_msk], n_classes) + return pred + assert len(sample) == 0, "Type not supported" + return np.empty((0, n_classes), dtype=np.float64)''' + + def toJson(self): + return { + "class_name": self.__class__.__name__, + "module_name": self.__module__, + "items": self.__dict__, + } + + +class _RegressionNode(_Node): + def __init__(self): + super().__init__(is_classifier=False) + + def toJson(self): + return { + "class_name": self.__class__.__name__, + "module_name": self.__module__, + "items": self.__dict__, + } + + +class _NodeInfo: + def __init__(self): + self.node_info = None + + def set(self, node_info): + self.node_info = node_info + + def get(self): + return self.node_info + + def toJson(self): + return { + "class_name": self.__class__.__name__, + "module_name": self.__module__, + "items": self.__dict__, + } + + +def encode_forest_helper(obj): + if isinstance(obj, (DecisionTreeClassifier, DecisionTreeRegressor, _Node, + _NodeInfo, + _ClassificationNode, _RegressionNode, _InnerNodeInfo, + _LeafInfo, _SkTreeWrapper)): + return obj.toJson() + + +def decode_forest_helper(class_name, obj): + if class_name == 'DecisionTreeClassifier': + model = eval(class_name)( + n_classes=obj.pop("n_classes"), + try_features=obj.pop("try_features"), + max_depth=obj.pop("max_depth"), + distr_depth=obj.pop("distr_depth"), + sklearn_max=obj.pop("sklearn_max"), + bootstrap=obj.pop("bootstrap"), + random_state=obj.pop("random_state"), + range_min=obj.pop("range_min"), + range_max=obj.pop("range_max"), + n_split_points=obj.pop("n_split_points"), + sync_after_fit=obj.pop("sync_after_fit"), + ) + elif class_name == 'DecisionTreeRegressor': + model = eval(class_name)( + try_features=obj.pop("try_features"), + max_depth=obj.pop("max_depth"), + distr_depth=obj.pop("distr_depth"), + sklearn_max=obj.pop("sklearn_max"), + bootstrap=obj.pop("bootstrap"), + random_state=obj.pop("random_state"), + range_min=obj.pop("range_min"), + range_max=obj.pop("range_max"), + n_split_points=obj.pop("n_split_points"), + sync_after_fit=obj.pop("sync_after_fit"), + ) + elif class_name == '_SkTreeWrapper': + sk_tree = obj.pop("sk_tree") + model = _SkTreeWrapper(sk_tree) + else: + model = eval(class_name)() + model.__dict__.update(obj) + return model diff --git a/dislib/trees/nested/forest.py b/dislib/trees/nested/forest.py new file mode 100644 index 00000000..2305beec --- /dev/null +++ b/dislib/trees/nested/forest.py @@ -0,0 +1,755 @@ +from sklearn.base import BaseEstimator 
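All the helper classes above expose the same toJson() convention (class_name, module_name and the instance __dict__), and decode_forest_helper() rebuilds objects by updating a fresh instance's __dict__. A minimal round-trip sketch with a hypothetical stand-in class rather than the real node types:

    import json

    class _Demo:  # stand-in for the node/info classes above
        def __init__(self, index=None, value=None):
            self.index = index
            self.value = value

        def toJson(self):
            return {"class_name": self.__class__.__name__,
                    "module_name": self.__module__,
                    "items": self.__dict__}

    def encode(obj):  # plays the role of encode_forest_helper
        return obj.toJson()

    payload = json.dumps(_Demo(index=3, value=0.5), default=encode)
    meta = json.loads(payload)
    restored = _Demo()
    restored.__dict__.update(meta["items"])  # as decode_forest_helper does
    assert restored.index == 3 and restored.value == 0.5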
+from sklearn.utils import check_random_state +import math +import numpy as np +from pycompss.api.parameter import COLLECTION_IN, Type, Depth + +from dislib.trees.nested.decision_tree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, encode_forest_helper, decode_forest_helper, +) +from pycompss.api.api import compss_wait_on +from pycompss.api.constraint import constraint +from pycompss.api.task import task +from dislib.data.array import Array +from dislib.utils.base import _paired_partition +from dislib.data.util import decoder_helper, encoder_helper, sync_obj +import json +import numbers +import os +import pickle +import dislib.data.util.model as utilmodel +from sklearn.svm import SVC as SklearnSVC +from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier +from sklearn.tree import DecisionTreeRegressor as SklearnDTRegressor +from sklearn.tree._tree import Tree as SklearnTree +SKLEARN_CLASSES = { + "SVC": SklearnSVC, + "DecisionTreeClassifier": SklearnDTClassifier, + "DecisionTreeRegressor": SklearnDTRegressor, +} + + +class BaseRandomForest(BaseEstimator): + """Base class for distributed random forests. + + Warning: This class should not be used directly. + Use derived classes instead. + """ + + def __init__( + self, + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + base_tree, + n_classes=None, + range_max=None, + range_min=None, + bootstrap=True, + n_split_points="auto", + split_computation="raw", + sync_after_fit=True, + ): + self.n_classes = n_classes + self.n_estimators = n_estimators + self.try_features = try_features + self.max_depth = max_depth + self.distr_depth = distr_depth + self.sklearn_max = sklearn_max + self.hard_vote = hard_vote + self.random_state = random_state + self.base_tree = base_tree + self.range_max = range_max + self.range_min = range_min + self.bootstrap = bootstrap + self.n_split_points = n_split_points + self.split_computation = split_computation + self.sync_after_fit = sync_after_fit + + def fit(self, x, y): + """Fits a RandomForest. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The training input samples. Internally, its dtype will be converted + to ``dtype=np.float32``. + y : ds-array, shape=(n_samples, 1) + The target values. 
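When distr_depth='auto', the fit() body below derives the distributed depth from the sample count and clamps it to the range [1, max_depth]. A worked sketch of that arithmetic (the helper name is illustrative):

    import math

    def auto_distr_depth(n_samples, max_depth):
        # max(0, floor(log10(n_samples)) - 4), clamped to [1, max_depth],
        # mirroring the 'auto' branch of BaseRandomForest.fit().
        depth = min(max(0, int(math.log10(n_samples)) - 4), max_depth)
        return 1 if depth < 1 else depth

    print(auto_distr_depth(10 ** 4, 25))  # 1 (small data: one distributed level)
    print(auto_distr_depth(10 ** 7, 25))  # 3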
+ + Returns + ------- + self : RandomForest + """ + + try_features = _resolve_try_features(self.try_features, x.shape[1]) + + if self.range_max is None: + self.range_max = x.max() + if self.range_min is None: + self.range_min = x.min() + self.range_max._blocks = compss_wait_on(self.range_max._blocks) + self.range_min._blocks = compss_wait_on(self.range_min._blocks) + + if self.distr_depth == "auto": + distr_depth = max(0, int(math.log10(x.shape[0])) - 4) + distr_depth = min(distr_depth, self.max_depth) + if distr_depth < 1: + self.distr_depth = 1 + else: + self.distr_depth = distr_depth + + self.trees = [] + + for _ in range(self.n_estimators): + random_state = check_random_state(self.random_state) + if isinstance(self.random_state, numbers.Integral): + self.random_state = self.random_state+np.random.randint(100) + if self.n_classes is not None: + tree = self.base_tree( + try_features=try_features, + max_depth=self.max_depth, + distr_depth=self.distr_depth, + sklearn_max=self.sklearn_max, + bootstrap=self.bootstrap, + random_state=random_state, + n_classes=self.n_classes, + range_min=self.range_min, + range_max=self.range_max, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=False, + ) + else: + tree = self.base_tree( + try_features=try_features, + max_depth=self.max_depth, + distr_depth=self.distr_depth, + sklearn_max=self.sklearn_max, + bootstrap=self.bootstrap, + random_state=random_state, + range_min=self.range_min, + range_max=self.range_max, + n_split_points=self.n_split_points, + split_computation=self.split_computation, + sync_after_fit=False, + ) + self.trees.append(tree) + + for tree in self.trees: + tree.fit(x, y) + self.trees = compss_wait_on(self.trees) + + return self + + def save_model(self, filepath, overwrite=True, save_format="json"): + """Saves a model to a file. + The model is synchronized before saving and can be reinstantiated in + the exact same state, without any of the code used for model + definition or fitting. + Parameters + ---------- + filepath : str + Path where to save the model + overwrite : bool, optional (default=True) + Whether any existing model at the target + location should be overwritten. + save_format : str, optional (default='json) + Format used to save the models. 
+ Examples + -------- + >>> from dislib.trees.nested import RandomForestClassifier + >>> import numpy as np + >>> import dislib as ds + >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]]) + >>> y = np.array([0, 0, 0, 1, 1, 1]) + >>> x_train = ds.array(x, (2, 2)) + >>> y_train = ds.array(y[:, np.newaxis], (2, 1)) + >>> model = RandomForestClassifier(n_classes=2, random_state=0) + >>> model.fit(x_train, y_train) + >>> model.save_model('/tmp/model') + >>> loaded_model = RandomForestClassifier(n_classes=2) + >>> loaded_model.load_model('/tmp/model') + >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2)) + >>> model_pred = model.predict(x_test) + >>> loaded_model_pred = loaded_model.predict(x_test) + >>> assert np.allclose(model_pred.collect(), + loaded_model_pred.collect()) + """ + + # Check overwrite + if not overwrite and os.path.isfile(filepath): + return + + _sync_rf(self) + + sync_obj(self.__dict__) + + model_metadata = self.__dict__ + model_metadata["model_name"] = self.__class__.__name__ + + # Save model + if save_format == "json": + with open(filepath, "w") as f: + json.dump(model_metadata, f, default=_encode_helper) + elif save_format == "cbor": + if utilmodel.cbor2 is None: + raise ModuleNotFoundError("No module named 'cbor2'") + with open(filepath, "wb") as f: + utilmodel.cbor2.dump(model_metadata, f, + default=_encode_helper_cbor) + elif save_format == "pickle": + with open(filepath, "wb") as f: + pickle.dump(model_metadata, f) + else: + raise ValueError("Wrong save format.") + + def load_model(self, filepath, load_format="json"): + """Loads a model from a file. + The model is reinstantiated in the exact same state in which it + was saved, without any of the code used for model definition or + fitting. + Parameters + ---------- + filepath : str + Path of the saved model + load_format : str, optional (default='json') + Format used to load the model. + Examples + -------- + >>> from dislib.trees.nested import RandomForestClassifier + >>> import numpy as np + >>> import dislib as ds + >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]]) + >>> y = np.array([0, 0, 0, 1, 1, 1]) + >>> x_train = ds.array(x, (2, 2)) + >>> y_train = ds.array(y[:, np.newaxis], (2, 1)) + >>> model = RandomForestClassifier(n_classes=2, random_state=0) + >>> model.fit(x_train, y_train) + >>> model.save_model('/tmp/model') + >>> loaded_model = RandomForestClassifier(n_classes=2) + >>> loaded_model.load_model('/tmp/model') + >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2)) + >>> model_pred = model.predict(x_test) + >>> loaded_model_pred = loaded_model.predict(x_test) + >>> assert np.allclose(model_pred.collect(), + loaded_model_pred.collect()) + """ + # Load model + if load_format == "json": + with open(filepath, "r") as f: + model_metadata = json.load(f, object_hook=_decode_helper) + elif load_format == "cbor": + if utilmodel.cbor2 is None: + raise ModuleNotFoundError("No module named 'cbor2'") + with open(filepath, "rb") as f: + model_metadata = utilmodel.cbor2.\ + load(f, object_hook=_decode_helper_cbor) + elif load_format == "pickle": + with open(filepath, "rb") as f: + model_metadata = pickle.load(f) + else: + raise ValueError("Wrong load format.") + + for key, val in model_metadata.items(): + setattr(self, key, val) + + +class RandomForestClassifier(BaseRandomForest): + """A distributed random forest classifier. + + Parameters + ---------- + n_estimators : int, optional (default=10) + Number of trees to fit. + try_features : int, str or None, optional (default='sqrt') + The number of features to consider when looking for the best split: + + - If "sqrt", then `try_features=sqrt(n_features)`. + - If "third", then `try_features=n_features // 3`. + - If None, then `try_features=n_features`.
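These options are resolved into concrete feature counts by _resolve_try_features(), defined later in this module; a short self-contained sketch of the same rules (the function name here is a local stand-in):

    import math

    def resolve_try_features(try_features, n_features):
        if try_features is None:
            return n_features
        if try_features == "sqrt":
            return int(math.sqrt(n_features))
        if try_features == "third":
            return max(1, n_features // 3)
        if try_features >= 1:
            return int(try_features)
        return int(try_features * n_features)  # a float < 1 acts as a fraction

    assert resolve_try_features("sqrt", 100) == 10
    assert resolve_try_features("third", 100) == 33
    assert resolve_try_features(0.25, 100) == 25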
+ + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires + to effectively inspect more than ``try_features`` features. + max_depth : int or np.inf, optional (default=np.inf) + The maximum depth of the tree. If np.inf, then nodes are expanded + until all leaves are pure. + distr_depth : int or str, optional (default='auto') + Number of levels of the tree in which the nodes are split in a + distributed way. + sklearn_max: int or float, optional (default=1e8) + Maximum size (len(subsample)*n_features) of the arrays passed to + sklearn's DecisionTreeClassifier.fit(), which is called to fit subtrees + (subsamples) of our DecisionTreeClassifier. sklearn fit() is used + because it's faster, but requires loading the data to memory, which can + cause memory problems for large datasets. This parameter can be + adjusted to fit the hardware capabilities. + hard_vote : bool, optional (default=False) + If True, it uses majority voting over the predict() result of the + decision tree predictions. If False, it takes the class with the higher + probability given by predict_proba(), which is an average of the + probabilities given by the decision trees. + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + + Attributes + ---------- + classes : None or ndarray + Array of distinct classes, set at fit(). + trees : list of DecisionTreeClassifier + List of the tree classifiers of this forest, populated at fit(). + """ + + def __init__( + self, + n_classes, + n_estimators=10, + try_features="sqrt", + max_depth=np.inf, + distr_depth="auto", + sklearn_max=1e8, + hard_vote=False, + random_state=None, + range_max=None, + range_min=None, + bootstrap=True, + n_split_points="auto", + split_computation="raw", + sync_after_fit=True, + ): + super().__init__( + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + base_tree=DecisionTreeClassifier, + n_classes=n_classes, + range_max=range_max, + range_min=range_min, + bootstrap=bootstrap, + n_split_points=n_split_points, + split_computation=split_computation, + sync_after_fit=sync_after_fit, + ) + + def predict(self, x): + """Predicts target values using a fitted forest. + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + y_pred : ds-array, shape=(n_samples, 1) + Predicted values for x. + """ + assert self.trees is not None, "The random forest is not fitted." 
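Both branches of predict() below reduce to summing per-tree arrays and taking an argmax over classes: with hard_vote the per-tree outputs are one-hot blocks, so the sum is a vote count; otherwise they are class probabilities, so the sum is proportional to their average. A toy numpy sketch (two classes, three samples, illustrative values):

    import numpy as np

    classes = np.array([0, 1])
    # Per-tree one-hot votes (hard) or probabilities (soft), per sample.
    tree_a = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])
    tree_b = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
    tree_c = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0]])

    aggregate = tree_a + tree_b + tree_c  # as _base_hard_vote/_base_soft_vote do
    print(classes[np.argmax(aggregate, axis=1)])  # [0 1 1]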
+ + pred_blocks = [] + + if self.hard_vote: + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + pred_blocks.append([_hard_vote(np.arange(self.n_classes), + compss_wait_on( + tree_predictions))]) + else: + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + pred_blocks.append([_soft_vote(np.arange(self.n_classes), + compss_wait_on( + tree_predictions))]) + pred_blocks = compss_wait_on(pred_blocks) + y_pred = Array( + blocks=pred_blocks, + top_left_shape=(x._top_left_shape[0], 1), + reg_shape=(x._reg_shape[0], 1), + shape=(x.shape[0], 1), + sparse=False, + ) + + return y_pred + + def predict_proba(self, x): + """Predicts class probabilities using a fitted forest. + + The probabilities are obtained as an average of the probabilities of + each decision tree. + + + Parameters + ---------- + x : ds-array, shape=(n_samples, n_features) + The input samples. + + Returns + ------- + probabilities : ds-array, shape=(n_samples, n_classes) + Predicted probabilities for the samples to belong to each class. + The columns of the array correspond to the classes given at + self.classes. + """ + assert self.trees is not None, "The random forest is not fitted." + + prob_blocks = [] + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + prob_blocks.append([_join_predictions(tree_predictions)]) + + probabilities = Array( + blocks=prob_blocks, + top_left_shape=(x._top_left_shape[0], self.n_classes), + reg_shape=(x._reg_shape[0], self.n_classes), + shape=(x.shape[0], self.n_classes), + sparse=False, + ) + return probabilities + + def score(self, x, y, collect=False): + assert self.trees is not None, "The random forest is not fitted." + partial_scores = [] + if self.hard_vote: + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + subset_score = _hard_vote_score( + y_row._blocks, np.arange(self.n_classes), tree_predictions + ) + partial_scores.append(subset_score) + else: + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict_proba(x_row)) + subset_score = _soft_vote_score( + y_row._blocks, np.arange(self.n_classes), tree_predictions + ) + partial_scores.append(subset_score) + score = _merge_classification_scores(partial_scores) + + return compss_wait_on(score) if collect else score + + +class RandomForestRegressor(BaseRandomForest): + """A distributed random forest regressor. + + Parameters + ---------- + n_estimators : int, optional (default=10) + Number of trees to fit. + try_features : int, str or None, optional (default='sqrt') + The number of features to consider when looking for the best split: + + - If "sqrt", then `try_features=sqrt(n_features)`. + - If "third", then `try_features=n_features // 3`. + - If None, then `try_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires + to effectively inspect more than ``try_features`` features. + max_depth : int or np.inf, optional (default=np.inf) + The maximum depth of the tree. If np.inf, then nodes are expanded + until all leaves are pure. 
+ distr_depth : int or str, optional (default='auto') + Number of levels of the tree in which the nodes are split in a + distributed way. + sklearn_max: int or float, optional (default=1e8) + Maximum size (len(subsample)*n_features) of the arrays passed to + sklearn's DecisionTreeRegressor.fit(), which is + called to fit subtrees (subsamples) of our DecisionTreeRegressor. + sklearn fit() is used because it's faster, but requires loading + the data to memory, which can cause memory problems + for large datasets. + This parameter can be adjusted to fit the hardware capabilities. + random_state : int, RandomState instance or None, optional + (default=None) + If int, random_state is the seed used by the random number + generator; + If RandomState instance, random_state is the random number + generator; + If None, the random number generator is the RandomState + instance used + by `np.random`. + + Attributes + ---------- + trees : list of DecisionTreeRegressor + List of the tree regressors of this forest, populated at fit(). + """ + + def __init__( + self, + n_estimators=10, + try_features="sqrt", + max_depth=np.inf, + distr_depth="auto", + sklearn_max=1e8, + random_state=None, + range_max=None, + range_min=None, + bootstrap=True, + n_split_points="auto", + split_computation="raw", + sync_after_fit=True, + ): + hard_vote = None + super().__init__( + n_estimators, + try_features, + max_depth, + distr_depth, + sklearn_max, + hard_vote, + random_state, + base_tree=DecisionTreeRegressor, + n_classes=None, + range_max=range_max, + range_min=range_min, + bootstrap=bootstrap, + n_split_points=n_split_points, + split_computation=split_computation, + sync_after_fit=sync_after_fit, + ) + + def predict(self, x): + pred_blocks = [] + for x_row in x._iterator(axis=0): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + pred_blocks.append(tree_predictions) + final_blocks = [] + for tree_predictions in pred_blocks: + final_blocks.append([_join_predictions( + compss_wait_on(tree_predictions))]) + + y_pred = Array( + blocks=final_blocks, + top_left_shape=(x._top_left_shape[0], 1), + reg_shape=(x._reg_shape[0], 1), + shape=(x.shape[0], 1), + sparse=False, + ) + + return y_pred + + def score(self, x, y, collect=False): + assert self.trees is not None, "The random forest is not fitted." 
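score() below emits one (u, v, mean, n) tuple per partition, and _merge_regression_scores() pools them into a single R² = 1 - u / v without revisiting the data, using a pairwise variance merge. A small self-contained check that the pooled v matches a direct computation on toy data:

    import numpy as np

    y1 = np.array([1.0, 2.0, 3.0])
    y2 = np.array([10.0, 11.0])

    # Per-partition pieces: v_p = sum((y - mean)^2), mean, n.
    parts = [(np.sum((y - y.mean()) ** 2), y.mean(), len(y)) for y in (y1, y2)]

    v = avg = n = 0.0
    for v_p, avg_p, n_p in parts:  # same update as _merge_regression_scores
        delta = avg_p - avg
        avg += delta * n_p / (n + n_p)
        v += v_p + delta ** 2 * n * n_p / (n + n_p)
        n += n_p

    y_all = np.concatenate([y1, y2])
    assert np.isclose(v, np.sum((y_all - y_all.mean()) ** 2))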
+ + partial_scores = [] + for x_row, y_row in _paired_partition(x, y): + tree_predictions = [] + for tree in self.trees: + tree_predictions.append(tree.predict(x_row)) + subset_score = _regression_score(y_row._blocks, tree_predictions) + partial_scores.append(subset_score) + + score = _merge_regression_scores(partial_scores) + + return compss_wait_on(score) if collect else score + + +def _base_soft_vote(classes, predictions): + aggregate = predictions[0][0] + for p in predictions[1:]: + aggregate += p[0] + predicted_labels = classes[np.argmax(aggregate, axis=1)] + return np.expand_dims(predicted_labels, axis=1) + + +def _base_hard_vote(classes, predictions): + mode = predictions[0][0] + for p in predictions[1:]: + mode += p[0] + predicted_labels = classes[np.argmax(mode, axis=1)] + return np.expand_dims(predicted_labels, axis=1) + + +def _soft_vote(classes, predictions): + predicted_labels = _base_soft_vote(classes, predictions) + return predicted_labels + + +@constraint(computing_units="${ComputingUnits}") +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, + predictions=COLLECTION_IN, returns=1) +def _soft_vote_score(y_blocks, classes, predictions): + predicted_labels = _base_soft_vote(classes, predictions) + real_labels = Array._merge_blocks(y_blocks).flatten() + correct = np.count_nonzero(predicted_labels.squeeze() == real_labels) + return correct, len(real_labels) + + +def _hard_vote(classes, predictions): + predicted_labels = _base_hard_vote(classes, predictions) + return predicted_labels + + +@constraint(computing_units="${ComputingUnits}") +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, + predictions=COLLECTION_IN, returns=1) +def _hard_vote_score(y_blocks, classes, predictions): + predicted_labels = _base_hard_vote(classes, predictions) + real_labels = Array._merge_blocks(y_blocks).flatten() + correct = np.count_nonzero(predicted_labels.squeeze() == real_labels) + return correct, len(real_labels) + + +def _resolve_try_features(try_features, n_features): + if try_features is None: + return n_features + elif try_features == "sqrt": + return int(math.sqrt(n_features)) + elif try_features == "third": + return max(1, n_features // 3) + elif try_features >= 1: + return int(try_features) + else: + return int(try_features*n_features) + + +@constraint(computing_units="${ComputingUnits}") +@task(predictions=COLLECTION_IN, returns=1) +def _join_predictions(predictions): + aggregate = np.block(predictions[0]) + for p in predictions[1:]: + aggregate += np.block(p) + labels = aggregate / len(predictions) + if len(labels.shape) == 1: + labels = labels.reshape(-1, 1) + return labels + + +@constraint(computing_units="${ComputingUnits}") +@task(y_blocks={Type: COLLECTION_IN, Depth: 2}, + predictions=COLLECTION_IN, returns=1) +def _regression_score(y_blocks, predictions): + y_true = Array._merge_blocks(y_blocks).flatten() + y_pred = np.mean(np.squeeze(predictions), axis=0) + n_samples = y_true.shape[0] + y_avg = np.mean(y_true) + u_partial = np.sum(np.square(y_true - y_pred), axis=0) + v_partial = np.sum(np.square(y_true - y_avg), axis=0) + return u_partial, v_partial, y_avg, n_samples + + +@constraint(computing_units="${ComputingUnits}") +@task(partial_scores=COLLECTION_IN, returns=1) +def _merge_classification_scores(partial_scores): + correct = sum(subset_score[0] for subset_score in partial_scores) + total = sum(subset_score[1] for subset_score in partial_scores) + return correct / total + + +@constraint(computing_units="${ComputingUnits}") +@task(partial_scores=COLLECTION_IN, returns=1) +def 
_merge_regression_scores(partial_scores): + u = v = avg = n = 0 + for u_p, v_p, avg_p, n_p in partial_scores: + u += u_p + + delta = avg_p - avg + avg += delta * n_p / (n + n_p) + v += v_p + delta ** 2 * n * n_p / (n + n_p) + n += n_p + + return 1 - u / v + + +def _encode_helper_cbor(encoder, obj): + encoder.encode(_encode_helper(obj)) + + +def _encode_helper(obj): + encoded = encoder_helper(obj) + if encoded is not None: + return encoded + elif callable(obj): + return { + "class_name": "callable", + "module": obj.__module__, + "name": obj.__name__, + } + elif isinstance(obj, SklearnTree): + return { + "class_name": obj.__class__.__name__, + "n_features": obj.n_features, + "n_classes": obj.n_classes, + "n_outputs": obj.n_outputs, + "items": obj.__getstate__(), + } + elif isinstance(obj, (RandomForestClassifier, RandomForestRegressor, + DecisionTreeClassifier, DecisionTreeRegressor, + SklearnDTClassifier, SklearnDTRegressor)): + return { + "class_name": obj.__class__.__name__, + "module_name": obj.__module__, + "items": obj.__dict__, + } + else: + return encode_forest_helper(obj) + + +def _decode_helper_cbor(decoder, obj): + """Special decoder wrapper for dislib using cbor2.""" + return _decode_helper(obj) + + +def _decode_helper(obj): + if isinstance(obj, dict) and "class_name" in obj: + class_name = obj["class_name"] + decoded = decoder_helper(class_name, obj) + if decoded is not None: + return decoded + elif class_name == "RandomState": + random_state = np.random.RandomState() + random_state.set_state(_decode_helper(obj["items"])) + return random_state + elif class_name == "Tree": + dict_ = _decode_helper(obj["items"]) + model = SklearnTree( + obj["n_features"], obj["n_classes"], obj["n_outputs"] + ) + model.__setstate__(dict_) + return model + elif class_name == "callable": + if obj["module"] == "numpy": + return getattr(np, obj["name"]) + return None + elif ( + class_name in SKLEARN_CLASSES.keys() + and "sklearn" in obj["module_name"] + ): + dict_ = _decode_helper(obj["items"]) + model = SKLEARN_CLASSES[obj["class_name"]]() + model.__dict__.update(dict_) + return model + else: + dict_ = _decode_helper(obj["items"]) + return decode_forest_helper(class_name, dict_) + return obj + + +def _sync_rf(rf): + """Sync the `try_features` and `n_classes` attribute of the different trees + since they cannot be synced recursively. + """ + try_features = compss_wait_on(rf.trees[0].try_features) + n_classes = compss_wait_on(rf.trees[0].n_classes) + for tree in rf.trees: + tree.try_features = try_features + tree.n_classes = n_classes diff --git a/dislib/trees/nested/tasks.py b/dislib/trees/nested/tasks.py new file mode 100644 index 00000000..5489ae80 --- /dev/null +++ b/dislib/trees/nested/tasks.py @@ -0,0 +1,78 @@ +from pycompss.api.task import task +from pycompss.api.parameter import COLLECTION_IN +from pycompss.api.parameter import COLLECTION_OUT +from pycompss.api.constraint import constraint +import numpy as np + + +@constraint(computing_units="${ComputingUnits}") +@task(fragment=COLLECTION_IN, fragment_buckets=COLLECTION_OUT, + range_min=COLLECTION_IN, range_max=COLLECTION_IN) +def filter_fragment(fragment, fragment_buckets, indexes_to_try, + num_buckets, range_min=0, range_max=1, + indexes_selected=np.array([0])): + """ + Task that filters a fragment entries for the given ranges. + * Ranges is a list of tuples where each tuple corresponds to + a range. + * Each tuple (range) is composed by two elements, the minimum + and maximum of each range. 
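The bucketing described in this docstring builds num_buckets + 1 edges with np.linspace over [range_min, range_max + 1] and assigns values to half-open intervals; a self-contained sketch with illustrative numbers:

    import numpy as np

    values = np.array([1.0, 4.0, 6.0, 9.0])  # one selected feature column
    range_min, range_max, num_buckets = 0.0, 9.0, 2

    edges = np.linspace(range_min, range_max + 1, num_buckets + 1)  # [0, 5, 10]
    buckets = [values[(values >= lo) & (values < hi)]
               for lo, hi in zip(edges[:-1], edges[1:])]
    print(buckets)  # [array([1., 4.]), array([6., 9.])]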
+ * The filtering is performed by checking which fragment entries' + keys belong to each range. + The entries that belong to each range are considered a bucket. + * The variable buckets is a list of lists, where the inner lists + correspond to the bucket of each range. + + :param fragment: The fragment to be sorted and filtered. + :param ranges: The ranges to apply when filtering. + :return: Multireturn of the buckets. + """ + if len(fragment) == 0: + for idx in range(len(fragment_buckets)): + for idx_2 in range(len(fragment_buckets[idx])): + fragment_buckets[idx][idx_2] = [] + return + fragment = np.block(fragment) + range_min = np.block(range_min) + range_max = np.block(range_max) + for index, value in enumerate(indexes_to_try): + if len(indexes_selected) > 1: + if indexes_selected[0] != 0: + actual_fragment = fragment[indexes_selected, value] + else: + actual_fragment = fragment[:, value] + else: + actual_fragment = fragment[:, value] + split_indexes = np.linspace(range_min[0, value], + range_max[0, value] + 1, num_buckets + 1) + ranges = [] + for ind in range(split_indexes.size - 1): + ranges.append((split_indexes[ind], split_indexes[ind + 1])) + i = 0 + for _range in ranges: + if actual_fragment is not None: + fragment_buckets[index][i] = [k_s_v for k_s_v in + actual_fragment if + _range[0] <= k_s_v < _range[1]] + else: + fragment_buckets[index][i] = [] + i += 1 + + +def combine_and_sort_bucket_elements(args): + """ + Task that combines the buckets received as args parameter and final + sorting. + + args structure = ([],[], ..., []) + + :param args: args that contains the buckets of a single range + :return: A list of tuples with the same format as provided initially + sorted by key. + """ + combined = [] + for e in args: + for kv in e: + combined.append(kv) + sorted_by_key = np.sort(combined) + return np.unique(sorted_by_key) diff --git a/dislib/trees/nested/terasort.py b/dislib/trees/nested/terasort.py new file mode 100644 index 00000000..82340626 --- /dev/null +++ b/dislib/trees/nested/terasort.py @@ -0,0 +1,95 @@ +import numpy as np +from dislib.trees.nested.tasks import filter_fragment, \ + combine_and_sort_bucket_elements +from dislib.data.array import Array +from pycompss.api.api import compss_delete_object, compss_wait_on + + +def terasorting(dataset, indexes_to_try, num_buckets, + range_min=0, range_max=1, + indexes_selected=None, reg_shape=None): + # Init buckets dictionary + list_of_buckets = [] + total_fragments = [] + if indexes_selected is not None: + for idx, d in enumerate(dataset): + fragment_buckets = [[object() for _ in range(num_buckets)] + for _ in range(len(indexes_to_try))] + idx_selected = indexes_selected[indexes_selected < + (idx + 1) * reg_shape] + filter_fragment(d, fragment_buckets, indexes_to_try, num_buckets, + range_min=range_min._blocks, + range_max=range_max._blocks, + indexes_selected=idx_selected[ + idx_selected >= + (idx) * reg_shape] % + reg_shape) + total_fragments.append(fragment_buckets) + total_fragments = np.array(compss_wait_on(total_fragments)) + for index in range(len(indexes_to_try)): + buckets = {} + for i in range(num_buckets): + buckets[i] = [] + for i in range(num_buckets): + buckets[i].append(total_fragments[:, index, i]) + list_of_buckets.append(buckets) + else: + buckets = {} + for d in dataset: + fragment_buckets = [[object() for _ in range(num_buckets)] + for _ in range(len(indexes_to_try))] + filter_fragment(d, fragment_buckets, indexes_to_try, num_buckets, + range_min=range_min._blocks, + range_max=range_max._blocks, + 
indexes_selected=np.array([0])) + total_fragments.append(fragment_buckets) + total_fragments = np.array(compss_wait_on(total_fragments)) + for index in range(len(indexes_to_try)): + buckets = {} + for i in range(num_buckets): + buckets[i] = [] + for i in range(num_buckets): + buckets[i].append(total_fragments[:, index, i]) + list_of_buckets.append(buckets) + result = dict() + real_key = 0 + for index in range(len(indexes_to_try)): + for key, value in list(list_of_buckets[index].items()): + result[real_key] = combine_and_sort_bucket_elements(value[0]) + real_key += 1 + [compss_delete_object(future_objects) for + value in buckets.items() for future_objects in value[1]] + return_list = [] + for idx, value in enumerate(result.values()): + if idx % num_buckets == 0: + return_list.append([]) + return_list[-1].append(value) + return return_list + + +def terasort(dataset, indexes_to_try, range_min=0, range_max=1, + indexes_selected=None, num_buckets=4): + """ + ---------------------- + Terasort main program + ---------------------- + This application generates a set of fragments that contain randomly + generated key, value tuples and sorts them all considering the key of + each tuple. + + :param num_fragments: Number of fragments to generate + :param num_entries: Number of entries (k,v tuples) within each fragment + :param num_buckets: Number of buckets to consider. + :param seed: Initial seed for the random number generator. + """ + if isinstance(dataset, Array): + result = terasorting(dataset._blocks, indexes_to_try, num_buckets, + range_min=range_min, + range_max=range_max, + indexes_selected=indexes_selected, + reg_shape=dataset._reg_shape[0]) + return np.array(result) + else: + result = terasorting(dataset, indexes_to_try, num_buckets, + range_min=range_min, range_max=range_max) + return np.array(result) diff --git a/docs/source/dislib.trees.distributed.rst b/docs/source/dislib.trees.distributed.rst new file mode 100644 index 00000000..5b958e14 --- /dev/null +++ b/docs/source/dislib.trees.distributed.rst @@ -0,0 +1,22 @@ +dislib.trees.distributed +============================================ + +.. autoclass:: dislib.trees.distributed.DecisionTreeClassifier + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: dislib.trees.distributed.DecisionTreeRegressor + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: dislib.trees.distributed.RandomForestClassifier + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: dislib.trees.distributed.RandomForestRegressor + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/dislib.trees.mmap.rst b/docs/source/dislib.trees.mmap.rst new file mode 100644 index 00000000..c3b759dc --- /dev/null +++ b/docs/source/dislib.trees.mmap.rst @@ -0,0 +1,22 @@ +dislib.trees.mmap +============================================ + +.. autoclass:: dislib.trees.mmap.DecisionTreeClassifier + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: dislib.trees.mmap.DecisionTreeRegressor + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: dislib.trees.mmap.RandomForestClassifier + :members: + :undoc-members: + :show-inheritance: + +.. 
autoclass:: dislib.trees.mmap.RandomForestRegressor + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/dislib.trees.nested.rst b/docs/source/dislib.trees.nested.rst new file mode 100644 index 00000000..7a63ba42 --- /dev/null +++ b/docs/source/dislib.trees.nested.rst @@ -0,0 +1,22 @@ +dislib.trees.nested +============================================ + +.. autoclass:: dislib.trees.nested.DecisionTreeClassifier + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: dislib.trees.nested.DecisionTreeRegressor + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: dislib.trees.nested.RandomForestClassifier + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: dislib.trees.nested.RandomForestRegressor + :members: + :undoc-members: + :show-inheritance: diff --git a/run_ci_checks.sh b/run_ci_checks.sh index 48680b1b..7e068d51 100755 --- a/run_ci_checks.sh +++ b/run_ci_checks.sh @@ -14,5 +14,9 @@ echo "Running tests" # Run the tests in ./tests with PyCOMPSs ./run_tests.sh +echo "Running nesting tests" +# Run the tests in ./tests with PyCOMPSs +./run_test_nesting.sh + echo "Running code coverage" ./run_coverage.sh diff --git a/run_coverage.sh b/run_coverage.sh index f95a029f..62432b7b 100755 --- a/run_coverage.sh +++ b/run_coverage.sh @@ -2,6 +2,9 @@ # Run the coverage of the dislib using the tests in ./tests (sequential) coverage3 run --source dislib tests +coverage3 run -a --source dislib tests_nesting +# Create the report +coverage3 report # Report coverage results to the CLI. coverage3 report -m # Upload coverage report to codecov.io diff --git a/run_test_nesting.sh b/run_test_nesting.sh new file mode 100755 index 00000000..979e9d33 --- /dev/null +++ b/run_test_nesting.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +base_app_dir="$(pwd)/tests_nesting/" +COMPSs_log_folder="/tmp/COMPSsWorker01" +target_log_folder="$(pwd)" +retry_num=1 + +echo $base_app_dir +echo $COMPSs_log_folder +echo $target_log_folder + +AGENT_PIDS="" +exit_value=0 +expected_time="60" +NUM_RETRIES="50" +app_name="Nesting_Tests" + + # Traps and Handlers +function kill_agents() { + for pid in ${AGENT_PIDS}; do + kill -SIGINT ${pid} 2>/dev/null + done +} +trap kill_agents EXIT + +#sed -i '//c'"${COMPSS_HOME}"'<\/InstallDir>' "${base_app_dir}"/project.xml + +mkdir -p /tmp/COMPSsWorker01/ + +echo "" +echo "*** RUNNING AGENTS TESTS ON DISLIB ***" +log_dir="${COMPSs_log_folder}/${app_name}_0${retry_num}/" +mkdir -p "${log_dir}" +output_log="${log_dir}test.outputlog" +error_log="${log_dir}test.errorlog" +touch "${output_log}" +touch "${error_log}" + +port_offset=100 + +for file in "${base_app_dir}"test_*; do + + corresponding_file=$(echo "${file}" | cut -d '/' -f4) + corresponding_file=$(echo "${corresponding_file}" | cut -d '.' 
-f1) + + log_dir="${COMPSs_log_folder}/${app_name}_0${retry_num}/" + mkdir -p "${log_dir}" + output_log="${log_dir}test.outputlog" + error_log="${log_dir}test.errorlog" + touch "${output_log}" + touch "${error_log}" + + # Starting agent + agent1_log_dir="${log_dir}/agent1/" + mkdir -p "${agent1_log_dir}" + agent1_output_log="${log_dir}agent1.outputlog" + agent1_error_log="${log_dir}agent1.errorlog" + + rest_port=$(( 46000 + port_offset + 1)) + comm_port=$(( 46000 + port_offset + 2)) + which compss_agent_start + compss_agent_start \ + --hostname="COMPSsWorker01" \ + --classpath="${base_app_dir}" \ + --log_dir="${agent1_log_dir}" \ + --rest_port="${rest_port}" \ + --comm_port="${comm_port}" \ + --pythonpath="${base_app_dir}" \ + --python_interpreter="python3"\ + 1>"${agent1_output_log}" 2>"${agent1_error_log}" & + + agent_pid="$!" + + AGENT_PIDS="${AGENT_PIDS} ${agent_pid}" + retries="${NUM_RETRIES}" + echo "testing first agent" + curl -XGET http://127.0.0.1:${rest_port}/COMPSs/test 1>/dev/null 2>/dev/null + ev=$? + + while [ "$ev" != "0" ] && [ "${retries}" -gt "0" ]; do + echo "testing agent on port ${rest_port}" + sleep 2s + retries=$((retries - 1 )) + curl -XGET http://127.0.0.1:${rest_port}/COMPSs/test 1>/dev/null 2>/dev/null + ev=$? + done + echo "TEST invoked" + RESULT=$(grep "test invoked" "${agent1_output_log}") + if [ -z "${RESULT}" ]; then + echo "Agent failed to start" > >(tee -a "${error_log}") + exit 1 + fi + echo "Agent started" > >(tee -a "${output_log}") + sleep 2s + + # Invoking DemoFunction method + "${COMPSS_HOME}/Runtime/scripts/user/compss_agent_call_operation" \ + --lang="PYTHON" \ + --master_node="127.0.0.1" \ + --master_port="${rest_port}" \ + --method_name="main" \ + --stop \ + "${corresponding_file}" > >(tee -a "${output_log}") 2> >(tee -a "${error_log}") + ev=$? + if [ "$ev" != "0" ]; then + echo "Could not invoke main method." > >(tee -a "${error_log}") + exit $ev + fi + echo "main function invoked" > >(tee -a "${output_log}") + + retries="3" + while [ ! -f "${agent1_log_dir}/jobs/job1_NEW.out" ] && [ "${retries}" -gt "0" ]; do + sleep 2s + retries=$((retries - 1 )) + done + if [ ! -f "${agent1_log_dir}/jobs/job1_NEW.out" ]; then + echo "Could not invoke main method." > >(tee -a "${error_log}") + exit 1 + fi + + wait ${AGENT_PIDS} + + if [ ! -f "${agent1_log_dir}/jobs/job2_NEW.out" ]; then + echo "Could not invoke nested method." > >(tee -a "${error_log}") + exit 1 + fi + + job1_end=$(grep "Result tests" "${agent1_log_dir}/jobs/job1_NEW.out") + job1_end_value=$(echo "${job1_end}" | cut -d ' ' -f3) + + if [ ! "${job1_end_value}" == "Passed" ]; then + echo "Unexpected integer value obtained from the test. Expecting Passed and ${job1_end_value} observed!" 
> >(tee -a "${error_log}") + exit 1 + fi + + + + kill_agents + rm -rf /tmp/COMPSsWorker01/* + AGENT_PIDS="" + + # Copy LOG files + # cp -rf "${log_dir}" "${target_log_folder}" + port_offset=$((port_offset + 100 )); +done +exit 0 diff --git a/tests/test_array.py b/tests/test_array.py index 566cf2ab..6947529b 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1351,8 +1351,7 @@ def test_median(self): class MathTest(BaseTimedTestCase): - @parameterized.expand([((21, 33), (10, 15), False), - ((5, 10), (8, 1), False), + @parameterized.expand([((5, 10), (8, 1), False), ((17, 13), (1, 9), False), ((6, 1), (12, 23), False), ((1, 22), (25, 16), False), diff --git a/tests/test_tsqr.py b/tests/test_tsqr.py index 947b9706..63ccb2be 100644 --- a/tests/test_tsqr.py +++ b/tests/test_tsqr.py @@ -11,7 +11,7 @@ class QRTest(BaseTimedTestCase): @parameterized.expand([ - (2, 1, 64, 36), (3, 1, 64, 36), (4, 1, 32, 36), (16, 1, 20, 10), + (2, 1, 64, 36), (3, 1, 64, 36), (4, 1, 32, 36), ]) def test_tsqr(self, m_size, n_size, b_size_r, b_size_c): """Tests tsqr""" @@ -60,7 +60,7 @@ def test_tsqr_irregular(self, m_size, n_size, b_size_r, b_size_c): self.assertTrue(np.allclose(q.dot(r), m2b)) @parameterized.expand([ - (2, 1, 64, 36), (4, 1, 32, 36), (16, 1, 20, 10), + (2, 1, 64, 36), (4, 1, 32, 36), ]) def test_tsqr_inverse(self, m_size, n_size, b_size_r, b_size_c): """Tests tsqr""" @@ -109,7 +109,7 @@ def test_tsqr_complete_inverse_irregular(self, m_size, n_size, self.assertTrue(np.allclose(q.dot(r), m2b)) @parameterized.expand([ - (2, 1, 64, 36), (4, 1, 32, 36), (16, 1, 20, 10), + (2, 1, 64, 36), (4, 1, 32, 36), ]) def test_tsqr_inverse_indexes(self, m_size, n_size, b_size_r, b_size_c): """Tests tsqr""" @@ -130,7 +130,6 @@ def test_tsqr_inverse_indexes(self, m_size, n_size, b_size_r, b_size_c): @parameterized.expand([ (2, 1, 64, 36), (3, 1, 64, 36), (4, 1, 36, 32), - (16, 1, 20, 10), (16, 2, 20, 10), ]) def test_tsqr_reduced(self, m_size, n_size, b_size_r, b_size_c): """Tests tsqr""" @@ -175,7 +174,7 @@ def test_tsqr_reduced_irregular(self, m_size, n_size, b_size_r, b_size_c): self.assertTrue(np.allclose(q.dot(r), m2b)) @parameterized.expand([ - (2, 1, 64, 36), (4, 1, 36, 32), (16, 1, 20, 10), + (2, 1, 64, 36), (4, 1, 36, 32), ]) def test_tsqr_reduced_inverse(self, m_size, n_size, b_size_r, b_size_c): """Tests tsqr""" @@ -219,7 +218,7 @@ def test_tsqr_reduced_inverse_irregular(self, m_size, n_size, self.assertTrue(np.allclose(q.dot(r), m2b)) @parameterized.expand([ - (2, 1, 64, 36), (4, 1, 36, 32), (16, 1, 20, 10), + (2, 1, 64, 36), (4, 1, 36, 32), ]) def test_tsqr_reduced_inverse_indexes(self, m_size, n_size, b_size_r, b_size_c): @@ -242,7 +241,7 @@ def test_tsqr_reduced_inverse_indexes(self, m_size, n_size, self.assertTrue(q.shape == (q.shape[0], 3)) @parameterized.expand([ - (2, 1, 64, 36), (3, 1, 64, 36), (4, 1, 36, 32), (16, 1, 20, 10), + (2, 1, 64, 36), (3, 1, 64, 36), (4, 1, 36, 32), ]) def test_tsqr_compute_r(self, m_size, n_size, b_size_r, b_size_c): """Tests tsqr""" @@ -258,7 +257,7 @@ def test_tsqr_compute_r(self, m_size, n_size, b_size_r, b_size_c): self.assertTrue(np.allclose(np.triu(r), r)) @parameterized.expand([ - (2, 1, 64, 36), (3, 1, 64, 36), (4, 1, 36, 32), (16, 1, 20, 10), + (2, 1, 64, 36), (3, 1, 64, 36), (4, 1, 36, 32), ]) def test_tsqr_compute_r_reduced(self, m_size, n_size, b_size_r, b_size_c): """Tests tsqr""" diff --git a/tests_nesting/__init__.py b/tests_nesting/__init__.py new file mode 100644 index 00000000..8a9b2dab --- /dev/null +++ b/tests_nesting/__init__.py @@ -0,0 
+1,14 @@ +from time import time +import unittest +import numpy as np + + +class BaseTimedTestCase(unittest.TestCase): + def setUp(self): + np.random.seed() + self.start_time = time() + + def tearDown(self): + self.end_time = time() + print("Test %s took: %.3f seconds" % + (self.id(), self.end_time - self.start_time)) diff --git a/tests_nesting/__main__.py b/tests_nesting/__main__.py new file mode 100644 index 00000000..b28e8c19 --- /dev/null +++ b/tests_nesting/__main__.py @@ -0,0 +1,9 @@ +import unittest + + +def load_tests(loader, tests, pattern): + return loader.discover('./tests_nesting/') + + +if __name__ == '__main__': + unittest.main(verbosity=2) diff --git a/tests_nesting/test_decision_tree_nested.py b/tests_nesting/test_decision_tree_nested.py new file mode 100644 index 00000000..1e4243ed --- /dev/null +++ b/tests_nesting/test_decision_tree_nested.py @@ -0,0 +1,908 @@ +from pycompss.api.task import task +from tests import BaseTimedTestCase +import numpy as np +import dislib as ds +import dislib.trees.nested.decision_tree as dt_nested +from dislib.trees.nested.tasks import filter_fragment +from pycompss.api.api import compss_wait_on +from sklearn.metrics import r2_score, accuracy_score +from sklearn.datasets import make_classification, make_regression + + +def test_decision_tree_classifier(): + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + [0.3, 0.3], + [0.4, 0.5], + [0.5, 0.4], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4], + ] + ) + x2 = np.array([[0.4, -0.3], [0.4, 0.3], [-0.4, -0.3]]) + y1 = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + y2 = np.array([0, 1, 2]) + + x1_ds = ds.array(x1, (3, 2)) + x2_ds = ds.array(x2, (3, 2)) + y1_ds = ds.array(y1[:, np.newaxis], (3, 1)) + + # Model + try_features = 2 + max_depth = np.inf + distr_depth = 1 + sklearn_max = 1e8 + bootstrap = True + seed = 0 + random_state = np.random.RandomState(seed) + n_classes = np.bincount(y1).shape[0] + # Test bootstrap + sample1 = dt_nested._sample_selection(x1, random_state, + bootstrap=True) + sample2 = dt_nested._sample_selection(x1, random_state, + bootstrap=False) + condition = np.array_equal(sample1, np.array([0, 2, 3, 3, 3, 4, 5, 5, 7])) + condition = condition and np.array_equal(sample2, np.array([0, 1, 2, 3, + 4, 5, 6, 7, + 8])) + + # Assert split wrapper + sample = sample2 + rang_min = x1_ds.min() + rang_max = x1_ds.max() + rang_max._blocks = compss_wait_on(rang_max._blocks) + rang_min._blocks = compss_wait_on(rang_min._blocks) + + split = dt_nested._compute_split( + x1_ds, + y1_ds, + n_classes, + indexes_selected=sample, + num_buckets=1, + range_min=rang_min, + range_max=rang_max, + number_split_points=2, + random_state=0, + ) + node_info, results_l, results_l_2, results_r, results_r_2 = split + node_info = compss_wait_on(node_info) + left_group = compss_wait_on(results_l) + y_l = compss_wait_on(results_l_2) + right_group = compss_wait_on(results_r) + y_r = compss_wait_on(results_r_2) + left_group_compare = np.block(left_group) + y_l_compare = np.block(y_l) + right_group_compare = np.block(right_group) + y_r_compare = np.block(y_r) + + condition = condition and node_info.node_info.index in (0, 1) + + condition = condition and np.array_equal(left_group_compare, + np.array([[0.3, -0.3], + [0.3, 0.3], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4]] + )) + + condition = condition and np.array_equal(y_l_compare, + np.array([[0], [1], [2], + [2], [2]])) + + condition = condition and np.array_equal(right_group_compare, + np.array([[0.4, -0.5], + [0.5, -0.4], + [0.4, 0.5], + [0.5, 0.4]])) + 
+ condition = condition and np.array_equal(y_r_compare, + np.array([[0], [0], + [1], [1]])) + + condition = condition and np.isclose(node_info.node_info.value, 0.35) + + rang_min = x1_ds.min() + rang_max = x1_ds.max() + rang_max._blocks = compss_wait_on(rang_max._blocks) + rang_min._blocks = compss_wait_on(rang_min._blocks) + # Test tree + tree = dt_nested.DecisionTreeClassifier( + 3, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + range_max=rang_max, + range_min=rang_min, + n_split_points=2, + split_computation="raw", + sync_after_fit=True, + ) + tree.fit(x1_ds, y1_ds) + + y_pred = compss_wait_on(tree.predict(x2_ds)) + condition = condition and np.array_equal(np.argmax(y_pred, axis=1)[0], y2) + y_pred_proba = compss_wait_on(tree.predict_proba(x2_ds)) + condition = condition and np.array_equal( + np.argmax(y_pred_proba, axis=1)[0], y2) + + random_state = np.random.RandomState(seed) + + tree = dt_nested.DecisionTreeClassifier( + 3, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + n_split_points="auto", + split_computation="raw", + sync_after_fit=True, + ) + + tree.fit(x1_ds, y1_ds) + + y_pred = compss_wait_on(tree.predict(x2_ds)) + condition = condition and np.array_equal(np.argmax(y_pred, axis=1)[0], y2) + y_pred_proba = compss_wait_on(tree.predict_proba(x2_ds)) + condition = condition and np.array_equal( + np.argmax(y_pred_proba, axis=1)[0], y2) + + random_state = np.random.RandomState(seed) + + x, y = make_classification( + n_samples=300, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (50, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (50, 1)) + + tree = dt_nested.DecisionTreeClassifier( + 3, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + n_split_points="sqrt", + split_computation="uniform_approximation", + sync_after_fit=True, + ) + + tree.fit(x1_ds, y1_ds) + + y_pred = compss_wait_on(tree.predict(x2_ds)) + condition = condition and np.array_equal(np.argmax(y_pred, axis=1)[0], y2) + y_pred_proba = compss_wait_on(tree.predict_proba(x2_ds)) + condition = condition and np.array_equal( + np.argmax(y_pred_proba, axis=1)[0], y2) + + random_state = np.random.RandomState(seed) + + tree = dt_nested.DecisionTreeClassifier( + 3, + try_features, + max_depth, + 2, + sklearn_max, + bootstrap, + random_state, + n_split_points=0.444, + split_computation="gaussian_approximation", + sync_after_fit=True, + ) + tree.fit(x_train, y_train) + y_pred = compss_wait_on(tree.predict(x_train)) + y_pred = np.argmax(np.vstack(y_pred), axis=1) + y_train = y_train.collect() + condition = condition and accuracy_score(y_train, y_pred) > 0.6 + y_pred_proba = compss_wait_on(tree.predict_proba(x_train)) + y_pred_proba = np.argmax(np.vstack(y_pred_proba), axis=1) + condition = condition and accuracy_score(y_train, + y_pred_proba) > 0.6 + return condition + + +def test_decision_tree_regressor(): + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + [0.3, 0.3], + [0.4, 0.5], + [0.5, 0.4], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4], + ] + ) + + # Model + try_features = 2 + max_depth = np.inf + distr_depth = 1 + sklearn_max = 1e8 + bootstrap = True + seed = 0 + random_state = np.random.RandomState(seed) + # Test bootstrap + sample1 = dt_nested._sample_selection(x1, random_state, + bootstrap=True) + sample2 = dt_nested._sample_selection(x1, random_state, + 
bootstrap=False) + condition = np.array_equal(sample1, np.array([0, 2, 3, 3, 3, 4, 5, 5, 7])) + condition = condition and np.array_equal(sample2, + np.array([0, 1, 2, 3, 4, + 5, 6, 7, 8])) + + x1, y1 = make_regression( + n_samples=1000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + + x2 = x1[800:] + x1 = x1[:800] + y2 = y1[800:] + y1 = y1[:800] + + x1_ds = ds.array(x1, (400, 10)) + x2_ds = ds.array(x2, (100, 10)) + + y1_ds = ds.array(y1, (400, 1)) + rang_min = x1_ds.min() + rang_max = x1_ds.max() + rang_max._blocks = compss_wait_on(rang_max._blocks) + rang_min._blocks = compss_wait_on(rang_min._blocks) + + # Test tree + tree = dt_nested.DecisionTreeRegressor( + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + range_max=rang_max, + range_min=rang_min, + n_split_points=2, + split_computation="raw", + sync_after_fit=True, + ) + tree.fit(x1_ds, y1_ds) + y_pred = compss_wait_on(tree.predict(x2_ds)) + y_pred = np.block(y_pred) + condition = condition and r2_score(y_pred.flatten(), y2) > 0.1 + + tree = dt_nested.DecisionTreeRegressor( + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + n_split_points="auto", + split_computation="uniform_approximation", + sync_after_fit=True, + ) + tree.fit(x1_ds, y1_ds) + y_pred = compss_wait_on(tree.predict(x2_ds)) + y_pred = np.block(y_pred) + condition = condition and r2_score(y_pred.flatten(), y2) > 0.15 + + tree = dt_nested.DecisionTreeRegressor( + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + n_split_points="sqrt", + split_computation="gaussian_approximation", + sync_after_fit=True, + ) + tree.fit(x1_ds, y1_ds) + y_pred = compss_wait_on(tree.predict(x2_ds)) + y_pred = np.block(y_pred) + condition = condition and r2_score(y_pred.flatten(), y2) > 0.15 + + tree = dt_nested.DecisionTreeRegressor( + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + n_split_points=0.1, + split_computation="gaussian_approximation", + sync_after_fit=True, + ) + tree.fit(x1_ds, y1_ds) + y_pred = compss_wait_on(tree.predict(x2_ds)) + y_pred = np.block(y_pred) + condition = condition and r2_score(y_pred.flatten(), y2) > 0.15 + return condition + + +def test_auxiliar_functions(): + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + [0.3, 0.3], + [0.4, 0.5], + [0.5, 0.4], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4], + ] + ) + y1 = np.array([0, 0, 0, 1, 1, 0, 1, 0, 1]) + right_x, right_y, x, y, aggregate_r, aggregate = \ + dt_nested.apply_split_points_to_blocks(x1, y1, 1, + None, [2], + 2, np.array([]), + np.array([0, 0])) + + condition = right_x is None + condition = condition and right_y is None + condition = condition and np.all(x == x1) + condition = condition and np.all(y == y1) + condition = condition and np.all(aggregate_r == np.array([])) + condition = condition and np.all(aggregate == np.array([5, 4])) + + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + ] + ) + y1 = np.array([0, 0, 0]) + right_x, right_y, x, y, aggregate_r, aggregate = \ + dt_nested.apply_split_points_to_blocks(x1, y1, 1, + None, [2], + 2, np.array([]), + np.array([0, 0])) + condition = condition and right_x is None + condition = condition and right_y is None + condition = condition and np.all(x == x1) + condition = condition and np.all(y == y1) + condition = condition and np.all(aggregate_r == np.array([])) + condition = condition and np.all(aggregate == np.array([3, 0])) + + right_x, 
right_y, x, y, aggregate_r, aggregate = \ + dt_nested.apply_split_points_to_blocks(None, None, 1, + 1, [2], + 2, np.array([]), + np.array([0, 0])) + condition = condition and right_x is None + condition = condition and right_y is None + condition = condition and x is None + condition = condition and y is None + condition = condition and np.all(aggregate_r == np.array([])) + condition = condition and np.all(aggregate == np.array([0, 0])) + + right_x, right_y, x, y, aggregate_r, \ + len_aggregate_r, aggregate_l, len_aggregate_l = \ + dt_nested.apply_split_points_to_blocks_regression(x1, y1, 1, + None, [2]) + condition = condition and right_x is None + condition = condition and right_y is None + condition = condition and np.all(x == x1) + condition = condition and np.all(y == y1) + condition = condition and np.all(aggregate_r == np.array([0])) + condition = condition and np.all(len_aggregate_r == np.array([0])) + condition = condition and np.all(aggregate_l == np.array([0])) + condition = condition and np.all(len_aggregate_l == np.array([3])) + + optimal_split_point = dt_nested.select_optimal_split_point(None, 3, + 4, 5) + condition = condition and optimal_split_point is None + + gini_value_when_empty_list = dt_nested.get_minimum_measure([], 3) + condition = condition and gini_value_when_empty_list[-1] == 1 + + mse_value_when_empty_list = dt_nested.get_minimum_measure([], + 3, + gini=False) + condition = condition and mse_value_when_empty_list[-1] == np.inf + + mse_value, produces_split = dt_nested. \ + merge_partial_results_compute_mse_both_sides([[None], [None]], + np.array([])) + mse_value = compss_wait_on(mse_value) + produces_split = compss_wait_on(produces_split) + condition = condition and np.all(mse_value == np.array([np.inf])) + condition = condition and produces_split is False + l_par_results = \ + [[[-4.93362945e+01, -2.91577501e+04, 5.91000000e+02], + [-4.64000975e+01, -3.03920638e+04, 6.55000000e+02], + [-3.81689727e+01, -2.71381396e+04, 7.11000000e+02]], + [[-4.90482439e+01, -1.46654249e+04, 2.99000000e+02], + [-4.67085998e+01, -1.50868777e+04, 3.23000000e+02], + [-3.98015317e+01, -1.38111315e+04, 3.47000000e+02]]] + mse_value, produces_split = dt_nested. \ + merge_partial_results_compute_mse_both_sides(l_par_results, + [[None], [None]]) + mse_value = compss_wait_on(mse_value) + produces_split = compss_wait_on(produces_split) + condition = condition and np.all(mse_value == np.array([np.inf])) + condition = condition and produces_split is False + + mse_value, produces_split = dt_nested. 
\ + merge_partial_results_compute_mse_both_sides(l_par_results, + [None]) + mse_value = compss_wait_on(mse_value) + produces_split = compss_wait_on(produces_split) + condition = condition and np.all(mse_value == np.array([np.inf])) + condition = condition and produces_split is False + + return condition + + +class RandomForestRegressorTest(BaseTimedTestCase): + def test_decision_tree_classifier(self): + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + [0.3, 0.3], + [0.4, 0.5], + [0.5, 0.4], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4], + ] + ) + x2 = np.array([[0.4, -0.3], [0.4, 0.3], [-0.4, -0.3]]) + y1 = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + y2 = np.array([0, 1, 2]) + + x1_ds = ds.array(x1, (3, 2)) + x2_ds = ds.array(x2, (3, 2)) + y1_ds = ds.array(y1[:, np.newaxis], (3, 1)) + + # Model + try_features = 2 + max_depth = np.inf + distr_depth = 1 + sklearn_max = 1e8 + bootstrap = True + seed = 0 + random_state = np.random.RandomState(seed) + n_classes = np.bincount(y1).shape[0] + # Test bootstrap + sample1 = dt_nested._sample_selection(x1, random_state, + bootstrap=True) + sample2 = dt_nested._sample_selection(x1, random_state, + bootstrap=False) + self.assertTrue( + np.array_equal(sample1, np.array([0, 2, 3, 3, 3, 4, 5, 5, 7])) + ) + self.assertTrue( + np.array_equal(sample2, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])) + ) + + # Assert split wrapper + sample = sample2 + rang_min = x1_ds.min() + rang_max = x1_ds.max() + + split = dt_nested._compute_split( + x1_ds, + y1_ds, + n_classes, + indexes_selected=sample, + num_buckets=1, + range_min=rang_min, + range_max=rang_max, + number_split_points=2, + random_state=0, + ) + node_info, results_l, results_l_2, results_r, results_r_2 = split + node_info = compss_wait_on(node_info) + left_group = results_l + y_l = results_l_2 + right_group = results_r + y_r = results_r_2 + left_group_compare = np.block(left_group) + y_l_compare = np.block(y_l) + right_group_compare = np.block(right_group) + y_r_compare = np.block(y_r) + + self.assertTrue(node_info.node_info.index in (0, 1)) + + self.assertTrue(np.array_equal(left_group_compare, + np.array([[0.3, -0.3], + [0.3, 0.3], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4]] + ))) + self.assertTrue(np.array_equal(y_l_compare, + np.array([[0], [1], [2], + [2], [2]]))) + self.assertTrue( + np.array_equal(right_group_compare, np.array([[0.4, -0.5], + [0.5, -0.4], + [0.4, 0.5], + [0.5, 0.4]])) + ) + self.assertTrue(np.array_equal(y_r_compare, np.array([[0], [0], + [1], [1]]))) + self.assertAlmostEqual(node_info.node_info.value, 0.35) + + # Test tree + tree = dt_nested.DecisionTreeClassifier( + 3, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + range_max=rang_max, + range_min=rang_min, + n_split_points=2, + split_computation="raw", + sync_after_fit=True, + ) + tree.fit(x1_ds, y1_ds) + y_pred = compss_wait_on(tree.predict(x2_ds)) + self.assertTrue(np.array_equal(np.argmax(y_pred, axis=1)[0], y2)) + y_pred_proba = tree.predict_proba(x2_ds) + self.assertTrue(np.array_equal(np.argmax(y_pred_proba, axis=1)[0], y2)) + + random_state = np.random.RandomState(seed) + + tree = dt_nested.DecisionTreeClassifier( + 3, + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + n_split_points="auto", + split_computation="raw", + sync_after_fit=True, + ) + + tree.fit(x1_ds, y1_ds) + y_pred = compss_wait_on(tree.predict(x2_ds)) + self.assertTrue(np.array_equal(np.argmax(y_pred, axis=1)[0], y2)) + y_pred_proba = tree.predict_proba(x2_ds) + 
self.assertTrue(np.array_equal(np.argmax(y_pred_proba, axis=1)[0], y2))
+
+        random_state = np.random.RandomState(seed)
+
+        x, y = make_classification(
+            n_samples=3000,
+            n_features=10,
+            n_classes=3,
+            n_informative=4,
+            n_redundant=2,
+            n_repeated=1,
+            n_clusters_per_class=2,
+            shuffle=True,
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (500, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (500, 1))
+
+        tree = dt_nested.DecisionTreeClassifier(
+            3,
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+            n_split_points="sqrt",
+            split_computation="uniform_approximation",
+            sync_after_fit=True,
+        )
+
+        tree.fit(x1_ds, y1_ds)
+        y_pred = compss_wait_on(tree.predict(x2_ds))
+        self.assertTrue(np.array_equal(np.argmax(y_pred, axis=1)[0], y2))
+        y_pred_proba = tree.predict_proba(x2_ds)
+        self.assertTrue(np.array_equal(np.argmax(y_pred_proba, axis=1)[0], y2))
+
+        random_state = np.random.RandomState(seed)
+
+        tree = dt_nested.DecisionTreeClassifier(
+            3,
+            try_features,
+            max_depth,
+            2,
+            sklearn_max,
+            bootstrap,
+            random_state,
+            n_split_points=0.444,
+            split_computation="gaussian_approximation",
+            sync_after_fit=True,
+        )
+        tree.fit(x_train[:100], y_train[:100])
+        y_pred = tree.predict(x_train)
+        y_pred = np.argmax(np.vstack(y_pred), axis=1)
+        y_train = y_train.collect()
+        self.assertGreater(accuracy_score(y_train,
+                                          y_pred), 0.6)
+        y_pred_proba = tree.predict_proba(x_train)
+        y_pred_proba = np.argmax(np.vstack(y_pred_proba), axis=1)
+        self.assertGreater(accuracy_score(y_train,
+                                          y_pred_proba), 0.6)
+
+    def test_decision_tree_regressor(self):
+        x1 = np.array(
+            [
+                [0.3, -0.3],
+                [0.4, -0.5],
+                [0.5, -0.4],
+                [0.3, 0.3],
+                [0.4, 0.5],
+                [0.5, 0.4],
+                [-0.3, -0.3],
+                [-0.4, -0.5],
+                [-0.5, -0.4],
+            ]
+        )
+
+        # Model
+        try_features = 2
+        max_depth = np.inf
+        distr_depth = 1
+        sklearn_max = 1e8
+        bootstrap = True
+        seed = 0
+        random_state = np.random.RandomState(seed)
+        # Test bootstrap
+        sample1 = dt_nested._sample_selection(x1, random_state,
+                                              bootstrap=True)
+        sample2 = dt_nested._sample_selection(x1, random_state,
+                                              bootstrap=False)
+        self.assertTrue(
+            np.array_equal(sample1, np.array([0, 2, 3, 3, 3, 4, 5, 5, 7]))
+        )
+        self.assertTrue(
+            np.array_equal(sample2, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8]))
+        )
+
+        x1, y1 = make_regression(
+            n_samples=1000,
+            n_features=10,
+            n_informative=4,
+            shuffle=True,
+            random_state=0,
+        )
+
+        x2 = x1[800:]
+        x1 = x1[:800]
+        y2 = y1[800:]
+        y1 = y1[:800]
+
+        x1_ds = ds.array(x1, (400, 10))
+        x2_ds = ds.array(x2, (100, 10))
+
+        y1_ds = ds.array(y1, (400, 1))
+        rang_min = x1_ds.min()
+        rang_max = x1_ds.max()
+
+        # Test tree
+        tree = dt_nested.DecisionTreeRegressor(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+            range_max=rang_max,
+            range_min=rang_min,
+            n_split_points=2,
+            split_computation="raw",
+            sync_after_fit=True,
+        )
+        tree.fit(x1_ds, y1_ds)
+        y_pred = compss_wait_on(tree.predict(x2_ds))
+        y_pred = np.block(y_pred)
+        self.assertGreater(r2_score(y_pred.flatten(), y2), 0.15)
+
+        tree = dt_nested.DecisionTreeRegressor(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            random_state,
+            n_split_points="auto",
+            split_computation="uniform_approximation",
+            sync_after_fit=True,
+        )
+        tree.fit(x1_ds, y1_ds)
+        y_pred = compss_wait_on(tree.predict(x2_ds))
+        y_pred = np.block(y_pred)
+        self.assertGreater(r2_score(y_pred.flatten(), y2), 0.15)
+
+        tree = dt_nested.DecisionTreeRegressor(
+            try_features,
+            max_depth,
+            distr_depth,
+            sklearn_max,
+            bootstrap,
+            
random_state, + n_split_points="sqrt", + split_computation="gaussian_approximation", + sync_after_fit=True, + ) + tree.fit(x1_ds, y1_ds) + y_pred = compss_wait_on(tree.predict(x2_ds)) + y_pred = np.block(y_pred) + self.assertGreater(r2_score(y_pred.flatten(), y2), 0.15) + + tree = dt_nested.DecisionTreeRegressor( + try_features, + max_depth, + distr_depth, + sklearn_max, + bootstrap, + random_state, + n_split_points=0.1, + split_computation="gaussian_approximation", + sync_after_fit=True, + ) + tree.fit(x1_ds, y1_ds) + y_pred = compss_wait_on(tree.predict(x2_ds)) + y_pred = np.block(y_pred) + self.assertGreater(r2_score(y_pred.flatten(), y2), 0.15) + + def test_auxiliar_functions(self): + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + [0.3, 0.3], + [0.4, 0.5], + [0.5, 0.4], + [-0.3, -0.3], + [-0.4, -0.5], + [-0.5, -0.4], + ] + ) + y1 = np.array([0, 0, 0, 1, 1, 0, 1, 0, 1]) + right_x, right_y, x, y, aggregate_r, aggregate = \ + dt_nested.apply_split_points_to_blocks(x1, y1, 1, + None, [2], + 2, np.array([]), + np.array([0, 0])) + self.assertTrue(right_x is None) + self.assertTrue(right_y is None) + self.assertTrue(np.all(x == x1)) + self.assertTrue(np.all(y == y1)) + self.assertTrue(np.all(aggregate_r == np.array([]))) + self.assertTrue(np.all(aggregate == np.array([5, 4]))) + + x1 = np.array( + [ + [0.3, -0.3], + [0.4, -0.5], + [0.5, -0.4], + ] + ) + y1 = np.array([0, 0, 0]) + right_x, right_y, x, y, aggregate_r, aggregate = \ + dt_nested.apply_split_points_to_blocks(x1, y1, 1, + None, [2], + 2, np.array([]), + np.array([0, 0])) + self.assertTrue(right_x is None) + self.assertTrue(right_y is None) + self.assertTrue(np.all(x == x1)) + self.assertTrue(np.all(y == y1)) + self.assertTrue(np.all(aggregate_r == np.array([]))) + self.assertTrue(np.all(aggregate == np.array([3, 0]))) + + right_x, right_y, x, y, aggregate_r, aggregate = \ + dt_nested.apply_split_points_to_blocks(None, None, 1, + 1, [2], + 2, np.array([]), + np.array([0, 0])) + self.assertTrue(right_x is None) + self.assertTrue(right_y is None) + self.assertTrue(x is None) + self.assertTrue(y is None) + self.assertTrue(np.all(aggregate_r == np.array([]))) + self.assertTrue(np.all(aggregate == np.array([0, 0]))) + + right_x, right_y, x, y, aggregate_r, \ + len_aggregate_r, aggregate_l, len_aggregate_l = \ + dt_nested.apply_split_points_to_blocks_regression(x1, y1, 1, + None, [2]) + self.assertTrue(right_x is None) + self.assertTrue(right_y is None) + self.assertTrue(np.all(x == x1)) + self.assertTrue(np.all(y == y1)) + self.assertTrue(np.all(aggregate_r == np.array([0]))) + self.assertTrue(np.all(len_aggregate_r == np.array([0]))) + self.assertTrue(np.all(aggregate_l == np.array([0]))) + self.assertTrue(np.all(len_aggregate_l == np.array([3]))) + + optimal_split_point = dt_nested.select_optimal_split_point(None, 3, + 4, 5) + self.assertTrue(optimal_split_point is None) + + gini_value_when_empty_list = dt_nested.get_minimum_measure([], 3) + self.assertTrue(gini_value_when_empty_list[-1] == 1) + + mse_value_when_empty_list = dt_nested.get_minimum_measure([], + 3, + gini=False) + self.assertTrue(mse_value_when_empty_list[-1] == np.inf) + + mse_value, produces_split = dt_nested.\ + merge_partial_results_compute_mse_both_sides([[None], [None]], + np.array([])) + self.assertTrue(np.all(mse_value == np.array([np.inf]))) + self.assertTrue(produces_split is False) + l_par_results = \ + [[[-4.93362945e+01, -2.91577501e+04, 5.91000000e+02], + [-4.64000975e+01, -3.03920638e+04, 6.55000000e+02], + [-3.81689727e+01, 
-2.71381396e+04, 7.11000000e+02]], + [[-4.90482439e+01, -1.46654249e+04, 2.99000000e+02], + [-4.67085998e+01, -1.50868777e+04, 3.23000000e+02], + [-3.98015317e+01, -1.38111315e+04, 3.47000000e+02]]] + mse_value, produces_split = dt_nested. \ + merge_partial_results_compute_mse_both_sides(l_par_results, + [[None], [None]]) + self.assertTrue(np.all(mse_value == np.array([np.inf]))) + self.assertTrue(produces_split is False) + + mse_value, produces_split = dt_nested.\ + merge_partial_results_compute_mse_both_sides(l_par_results, + [None]) + self.assertTrue(np.all(mse_value == np.array([np.inf]))) + self.assertTrue(produces_split is False) + + fragment_buckets = [[object()]] + filter_fragment([], fragment_buckets, np.array([2, 3]), + 3, range_min=[0], range_max=[1], + indexes_selected=np.array([0])) + self.assertTrue(fragment_buckets == [[[]]]) + + +@task() +def main(): + test = test_decision_tree_classifier() + test2 = test_decision_tree_regressor() + test3 = test_auxiliar_functions() + test = test and test2 and test3 + if test: + print("Result tests: Passed", flush=True) + else: + print("Result tests: Failed", flush=True) + + +if __name__ == "__main__": + main() diff --git a/tests_nesting/test_rf_classifier_nested.py b/tests_nesting/test_rf_classifier_nested.py new file mode 100644 index 00000000..60552976 --- /dev/null +++ b/tests_nesting/test_rf_classifier_nested.py @@ -0,0 +1,757 @@ +import numpy as np +from parameterized import parameterized +from pycompss.api.api import compss_wait_on +from sklearn import datasets +from sklearn.datasets import make_classification + +import dislib as ds +from dislib.classification import RandomForestClassifier +import dislib.data.util.model as utilmodel +from dislib.trees.nested.forest import _resolve_try_features +from tests import BaseTimedTestCase +from pycompss.api.task import task + + +def test_make_classification_score(): + """Tests RandomForestClassifier fit and score with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestClassifier(n_classes=3, + random_state=0, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + accuracy = rf.score(x_test, y_test, collect=True) + return accuracy > 0.7 + + +def test_make_classification_predict_and_distr_depth(): + """Tests RandomForestClassifier fit and predict with a distr_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = y[1::2] + + rf = RandomForestClassifier(n_estimators=4, n_classes=3, + distr_depth=2, + n_split_points="auto", + random_state=0, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + y_pred = rf.predict(x_test).collect() + accuracy1 = np.count_nonzero(y_pred == y_test) / len(y_test) + + rf = RandomForestClassifier(n_estimators=4, n_classes=3, + distr_depth=2, + n_split_points="sqrt", + random_state=0, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + y_pred = rf.predict(x_test).collect() + accuracy2 = 
np.count_nonzero(y_pred == y_test) / len(y_test)
+
+    rf = RandomForestClassifier(n_estimators=4, n_classes=3,
+                                distr_depth=2,
+                                n_split_points=0.2,
+                                random_state=0, mmap=False,
+                                nested=True)
+
+    rf.fit(x_train, y_train)
+    y_pred = rf.predict(x_test).collect()
+    accuracy3 = np.count_nonzero(y_pred == y_test) / len(y_test)
+
+    rf = RandomForestClassifier(n_estimators=4, n_classes=3,
+                                distr_depth=2,
+                                n_split_points=0.2,
+                                random_state=0, mmap=False,
+                                nested=True, hard_vote=True)
+
+    rf.fit(x_train, y_train)
+    y_pred = rf.predict(x_test).collect()
+    accuracy4 = np.count_nonzero(y_pred == y_test) / len(y_test)
+
+    return accuracy1 > 0.7 and accuracy2 > 0.7 and accuracy3 > 0.7 \
+        and accuracy4 > 0.7
+
+
+def test_make_classification_fit_predict():
+    """Tests RandomForestClassifier fit_predict with default params."""
+    x, y = make_classification(
+        n_samples=3000,
+        n_features=10,
+        n_classes=3,
+        n_informative=4,
+        n_redundant=2,
+        n_repeated=1,
+        n_clusters_per_class=2,
+        shuffle=True,
+        random_state=0,
+    )
+    x_train = ds.array(x[::2], (1000, 10))
+    y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+
+    rf = RandomForestClassifier(n_classes=3, distr_depth=1,
+                                random_state=0, mmap=False,
+                                nested=True)
+
+    y_pred = rf.fit(x_train, y_train).predict(x_train).collect()
+    y_train = y_train.collect()
+    accuracy = np.count_nonzero(y_pred == y_train) / len(y_train)
+    return accuracy > 0.7
+
+
+def test_make_classification_sklearn_max_predict():
+    """Tests RandomForestClassifier predict with sklearn_max."""
+    x, y = make_classification(
+        n_samples=3000,
+        n_features=10,
+        n_classes=3,
+        n_informative=4,
+        n_redundant=2,
+        n_repeated=1,
+        n_clusters_per_class=2,
+        shuffle=True,
+        random_state=0,
+    )
+    x_train = ds.array(x[::2], (1000, 10))
+    y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+    x_test = ds.array(x[1::2], (1000, 10))
+    y_test = y[1::2]
+
+    rf = RandomForestClassifier(n_classes=3, distr_depth=1,
+                                random_state=0, sklearn_max=10,
+                                mmap=False,
+                                nested=True)
+
+    rf.fit(x_train, y_train)
+    y_pred = rf.predict(x_test).collect()
+    accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
+    return accuracy > 0.7
+
+
+def test_make_classification_sklearn_max_predict_proba():
+    """Tests RandomForestClassifier predict_proba with sklearn_max."""
+    x, y = make_classification(
+        n_samples=3000,
+        n_features=10,
+        n_classes=3,
+        n_informative=4,
+        n_redundant=2,
+        n_repeated=1,
+        n_clusters_per_class=2,
+        shuffle=True,
+        random_state=0,
+    )
+    x_train = ds.array(x[::2], (1000, 10))
+    y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+    x_test = ds.array(x[1::2], (1000, 10))
+    y_test = y[1::2]
+
+    rf = RandomForestClassifier(n_classes=3, distr_depth=1, random_state=0,
+                                sklearn_max=10, mmap=False,
+                                nested=True)
+
+    rf.fit(x_train, y_train)
+    probabilities = rf.predict_proba(x_test).collect()
+    rf.classes = np.arange(rf.n_classes)
+    y_pred = rf.classes[np.argmax(probabilities, axis=1)]
+    accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
+    return accuracy > 0.7
+
+
+def test_make_classification_hard_vote_predict():
+    """Tests RandomForestClassifier predict with hard_vote."""
+    x, y = make_classification(
+        n_samples=3000,
+        n_features=10,
+        n_classes=3,
+        n_informative=4,
+        n_redundant=2,
+        n_repeated=1,
+        n_clusters_per_class=2,
+        shuffle=True,
+        random_state=0,
+    )
+    x_train = ds.array(x[::2], (1000, 10))
+    y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+    x_test = ds.array(x[1::2], (1000, 10))
+    y_test = y[1::2]
+
+    rf = RandomForestClassifier(
+        n_classes=3, distr_depth=1, random_state=0,
+        sklearn_max=10, hard_vote=True, mmap=False,
+        nested=True
+    )
+
+    rf.fit(x_train, y_train)
+    y_pred = rf.predict(x_test).collect()
+    accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
+    return accuracy > 0.7
+
+
+def test_make_classification_hard_vote_score_mix():
+    """Tests RandomForestClassifier score with hard_vote, sklearn_max,
+    distr_depth and max_depth."""
+    x, y = make_classification(
+        n_samples=3000,
+        n_features=10,
+        n_classes=3,
+        n_informative=4,
+        n_redundant=2,
+        n_repeated=1,
+        n_clusters_per_class=2,
+        shuffle=True,
+        random_state=0,
+    )
+    x_train = ds.array(x[::2], (1000, 10))
+    y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+    x_test = ds.array(x[1::2], (1000, 10))
+    y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1))
+
+    rf = RandomForestClassifier(
+        n_classes=3,
+        random_state=0,
+        sklearn_max=100,
+        distr_depth=1,
+        max_depth=12,
+        hard_vote=True,
+        mmap=False,
+        nested=True,
+    )
+
+    rf.fit(x_train, y_train)
+    accuracy = compss_wait_on(rf.score(x_test, y_test))
+    return accuracy > 0.7
+
+
+def test_score_on_iris():
+    """Tests RandomForestClassifier with a minimal example."""
+    x, y = datasets.load_iris(return_X_y=True)
+    ds_fit = ds.array(x[::2], block_size=(30, 2))
+    fit_y = ds.array(y[::2].reshape(-1, 1), block_size=(30, 1))
+    ds_validate = ds.array(x[1::2], block_size=(30, 2))
+    validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1))
+
+    rf = RandomForestClassifier(
+        n_classes=3, distr_depth=1,
+        n_estimators=1, max_depth=2, random_state=0,
+        mmap=False, nested=True
+    )
+    rf.fit(ds_fit, fit_y)
+    accuracy1 = rf.score(ds_validate, validate_y, True)
+    accuracy2 = rf.score(ds_validate, validate_y, False)
+    accuracy2 = compss_wait_on(accuracy2)
+
+    # Accuracy should be above 2/3 for any seed with this configuration.
+    return accuracy1 > (2 / 3) and accuracy2 > (2 / 3)
+
+
+def test_save_load():
+    """
+    Tests that the save and load methods work properly with the three
+    expected formats and that an exception is raised when a non-supported
+    format is provided.
+ """ + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestClassifier(n_classes=3, distr_depth=1, random_state=0, + n_estimators=5, mmap=False, nested=True) + rf.fit(x_train, y_train) + rf.save_model("./saved_model") + + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, nested=True) + rf2.load_model("./saved_model") + y_pred = rf2.predict(x_train).collect() + y_train = y_train.collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + condition = accuracy > 0.7 + + rf.save_model("./saved_model", save_format="cbor") + + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, nested=True) + rf2.load_model("./saved_model", load_format="cbor") + + y_pred = rf2.predict(x_train).collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + condition = condition and accuracy > 0.7 + + rf.save_model("./saved_model", save_format="pickle") + + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, nested=True) + rf2.load_model("./saved_model", load_format="pickle") + y_pred = rf2.predict(x_train).collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + condition = condition and accuracy > 0.7 + condition_error = False + try: + rf.save_model("./saved_model", save_format="txt") + except ValueError: + condition_error = True + condition = condition and condition_error + + condition_error = False + try: + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, + nested=True) + rf2.load_model("./saved_model", load_format="txt") + except ValueError: + condition_error = True + condition = condition and condition_error + + rf = RandomForestClassifier(n_classes=3, distr_depth=1, random_state=0, + n_estimators=1, mmap=False, nested=True) + x_train2 = ds.array(x[::2], (1000, 10)) + y_train2 = ds.array(y[::2][:, np.newaxis], (1000, 1)) + rf.fit(x_train2, y_train2) + rf.save_model("./saved_model", overwrite=False) + + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, nested=True) + rf2.load_model("./saved_model", load_format="pickle") + y_pred = rf2.predict(x_train).collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + condition = condition and accuracy > 0.7 + + cbor2_module = utilmodel.cbor2 + utilmodel.cbor2 = None + condition_error = False + try: + rf.save_model("./saved_model_error", save_format="cbor") + except ModuleNotFoundError: + condition_error = True + condition = condition and condition_error + condition_error = False + try: + rf2.load_model("./saved_model_error", load_format="cbor") + except ModuleNotFoundError: + condition_error = True + condition = condition and condition_error + utilmodel.cbor2 = cbor2_module + return condition + + +class RFTest(BaseTimedTestCase): + def test_make_classification_score(self): + """Tests RandomForestClassifier fit and score with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + 
y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestClassifier(n_classes=3, + random_state=0, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + accuracy = compss_wait_on(rf.score(x_test, y_test)) + self.assertGreater(accuracy, 0.7) + + def test_make_classification_predict_and_distr_depth(self): + """Tests RandomForestClassifier fit and predict with a distr_depth.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = y[1::2] + + rf = RandomForestClassifier(n_estimators=2, n_classes=3, + distr_depth=2, + n_split_points="auto", + random_state=0, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + y_pred = rf.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + + rf = RandomForestClassifier(n_estimators=2, n_classes=3, + distr_depth=2, + n_split_points="sqrt", + random_state=0, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + y_pred = rf.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + + rf = RandomForestClassifier(n_estimators=2, n_classes=3, + distr_depth=2, + n_split_points=0.2, + random_state=0, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + y_pred = rf.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + + def test_make_classification_fit_predict(self): + """Tests RandomForestClassifier fit_predict with default params.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, mmap=False, + nested=True) + + y_pred = rf.fit(x_train, y_train).predict(x_train).collect() + y_train = y_train.collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + + def test_make_classification_sklearn_max_predict(self): + """Tests RandomForestClassifier predict with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = y[1::2] + + rf = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, sklearn_max=10, + mmap=False, + nested=True) + + rf.fit(x_train, y_train) + y_pred = rf.predict(x_test).collect() + accuracy = np.count_nonzero(y_pred == y_test) / len(y_test) + self.assertGreater(accuracy, 0.7) + + def test_make_classification_sklearn_max_predict_proba(self): + """Tests RandomForestClassifier predict_proba with sklearn_max.""" + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + 
n_clusters_per_class=2,
+            shuffle=True,
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (1000, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+        x_test = ds.array(x[1::2], (1000, 10))
+        y_test = y[1::2]
+
+        rf = RandomForestClassifier(n_classes=3, distr_depth=1, random_state=0,
+                                    sklearn_max=10, mmap=False,
+                                    nested=True)
+
+        rf.fit(x_train, y_train)
+        probabilities = rf.predict_proba(x_test).collect()
+        rf.classes = np.arange(rf.n_classes)
+        y_pred = rf.classes[np.argmax(probabilities, axis=1)]
+        accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
+        self.assertGreater(accuracy, 0.7)
+
+    def test_make_classification_hard_vote_predict(self):
+        """Tests RandomForestClassifier predict with hard_vote."""
+        x, y = make_classification(
+            n_samples=3000,
+            n_features=10,
+            n_classes=3,
+            n_informative=4,
+            n_redundant=2,
+            n_repeated=1,
+            n_clusters_per_class=2,
+            shuffle=True,
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (1000, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+        x_test = ds.array(x[1::2], (1000, 10))
+        y_test = y[1::2]
+
+        rf = RandomForestClassifier(
+            n_classes=3, distr_depth=1, random_state=0,
+            sklearn_max=10, hard_vote=True, mmap=False,
+            nested=True
+        )
+
+        rf.fit(x_train, y_train)
+        y_pred = rf.predict(x_test).collect()
+        accuracy = np.count_nonzero(y_pred == y_test) / len(y_test)
+        self.assertGreater(accuracy, 0.7)
+
+    def test_make_classification_hard_vote_score_mix(self):
+        """Tests RandomForestClassifier score with hard_vote, sklearn_max,
+        distr_depth and max_depth."""
+        x, y = make_classification(
+            n_samples=3000,
+            n_features=10,
+            n_classes=3,
+            n_informative=4,
+            n_redundant=2,
+            n_repeated=1,
+            n_clusters_per_class=2,
+            shuffle=True,
+            random_state=0,
+        )
+        x_train = ds.array(x[::2], (1000, 10))
+        y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+        x_test = ds.array(x[1::2], (1000, 10))
+        y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1))
+
+        rf = RandomForestClassifier(
+            n_classes=3,
+            random_state=0,
+            sklearn_max=100,
+            distr_depth=1,
+            max_depth=12,
+            hard_vote=True,
+            mmap=False,
+            nested=True,
+        )
+
+        rf.fit(x_train, y_train)
+        accuracy = compss_wait_on(rf.score(x_test, y_test))
+        self.assertGreater(accuracy, 0.7)
+
+    @parameterized.expand([(True,), (False,)])
+    def test_score_on_iris(self, collect):
+        """Tests RandomForestClassifier with a minimal example."""
+        x, y = datasets.load_iris(return_X_y=True)
+        ds_fit = ds.array(x[::2], block_size=(30, 2))
+        fit_y = ds.array(y[::2].reshape(-1, 1), block_size=(30, 1))
+        ds_validate = ds.array(x[1::2], block_size=(30, 2))
+        validate_y = ds.array(y[1::2].reshape(-1, 1), block_size=(30, 1))
+
+        rf = RandomForestClassifier(
+            n_classes=3, distr_depth=1,
+            n_estimators=1, max_depth=2, random_state=0,
+            mmap=False, nested=True
+        )
+        rf.fit(ds_fit, fit_y)
+        accuracy = rf.score(ds_validate, validate_y, collect)
+        if not collect:
+            accuracy = compss_wait_on(accuracy)
+
+        # Accuracy should be above 2/3 for any seed with this configuration.
+        self.assertGreater(accuracy, 2 / 3)
+
+    def test_save_load(self):
+        """
+        Tests that the save and load methods work properly with the three
+        expected formats and that an exception is raised when a non-supported
+        format is provided.
+ """ + x, y = make_classification( + n_samples=3000, + n_features=10, + n_classes=3, + n_informative=4, + n_redundant=2, + n_repeated=1, + n_clusters_per_class=2, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestClassifier(n_classes=3, distr_depth=1, random_state=0, + n_estimators=5, mmap=False, nested=True) + rf.fit(x_train, y_train) + rf.save_model("./saved_model") + + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, nested=True) + rf2.load_model("./saved_model") + y_pred = rf2.predict(x_train).collect() + y_train = y_train.collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + + rf.save_model("./saved_model", save_format="cbor") + + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, nested=True) + rf2.load_model("./saved_model", load_format="cbor") + + y_pred = rf2.predict(x_train).collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + + rf.save_model("./saved_model", save_format="pickle") + + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, nested=True) + rf2.load_model("./saved_model", load_format="pickle") + y_pred = rf2.predict(x_train).collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + + with self.assertRaises(ValueError): + rf.save_model("./saved_model", save_format="txt") + + with self.assertRaises(ValueError): + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, + nested=True) + rf2.load_model("./saved_model", load_format="txt") + + rf = RandomForestClassifier(n_classes=3, distr_depth=1, random_state=0, + n_estimators=1, mmap=False, nested=True) + x_train2 = ds.array(x[::2], (1000, 10)) + y_train2 = ds.array(y[::2][:, np.newaxis], (1000, 1)) + rf.fit(x_train2, y_train2) + rf.save_model("./saved_model", overwrite=False) + + rf2 = RandomForestClassifier(n_classes=3, distr_depth=1, + random_state=0, + n_estimators=5, mmap=False, nested=True) + rf2.load_model("./saved_model", load_format="pickle") + y_pred = rf2.predict(x_train).collect() + accuracy = np.count_nonzero(y_pred == y_train) / len(y_train) + self.assertGreater(accuracy, 0.7) + + cbor2_module = utilmodel.cbor2 + utilmodel.cbor2 = None + with self.assertRaises(ModuleNotFoundError): + rf.save_model("./saved_model_error", save_format="cbor") + with self.assertRaises(ModuleNotFoundError): + rf2.load_model("./saved_model_error", load_format="cbor") + utilmodel.cbor2 = cbor2_module + + def test_other_functions(self): + number_features = _resolve_try_features("sqrt", 9) + self.assertTrue(number_features == 3) + number_features = _resolve_try_features("third", 12) + self.assertTrue(number_features == 4) + number_features = _resolve_try_features(None, 12) + self.assertTrue(number_features == 12) + number_features = _resolve_try_features(2, 12) + self.assertTrue(number_features == 2) + number_features = _resolve_try_features(0.5, 12) + self.assertTrue(number_features == 6) + + +@task() +def main(): + test = test_make_classification_score() + test2 = test_make_classification_predict_and_distr_depth() + test3 = test_make_classification_fit_predict() + test4 = test_make_classification_sklearn_max_predict() + test5 = 
test_make_classification_sklearn_max_predict_proba() + test = test and test2 and test3 and test4 and test5 + test6 = test_make_classification_hard_vote_predict() + test7 = test_make_classification_hard_vote_score_mix() + test8 = test_score_on_iris() + test9 = test_save_load() + test = test and test6 and test7 and test8 and test9 + if test: + print("Result tests: Passed", flush=True) + else: + print("Result tests: Failed", flush=True) + + +if __name__ == "__main__": + main() diff --git a/tests_nesting/test_rf_regressor_nested.py b/tests_nesting/test_rf_regressor_nested.py new file mode 100644 index 00000000..c7eba5ff --- /dev/null +++ b/tests_nesting/test_rf_regressor_nested.py @@ -0,0 +1,434 @@ +import numpy as np +from pycompss.api.api import compss_wait_on +from sklearn.datasets import make_regression + +import dislib as ds +from dislib.regression import RandomForestRegressor +import dislib.data.util.model as utilmodel + +from tests import BaseTimedTestCase +from pycompss.api.task import task +from math import isclose + + +def _determination_coefficient(y_true, y_pred): + u = np.sum(np.square(y_true - y_pred)) + v = np.sum(np.square(y_true - np.mean(y_true))) + return 1 - u / v + + +def test_make_regression(): + """Tests RandomForestRegressor fit and score with default params.""" + x, y = make_regression( + n_samples=12000, + n_features=40, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (4000, 20)) + y_train = ds.array(y[::2][:, np.newaxis], (4000, 1)) + x_test = ds.array(x[1::2], (4000, 20)) + y_test = ds.array(y[1::2][:, np.newaxis], (4000, 1)) + + rf = RandomForestRegressor(distr_depth=1, random_state=0, + n_estimators=2, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[1::2] + accuracy2 = _determination_coefficient(y_true, y_pred) + + return accuracy1 > 0.5 and accuracy2 > 0.5 and \ + isclose(accuracy1, accuracy2) + + +def test_make_regression_predict_and_distr_depth(): + """Tests RandomForestRegressor fit and predict with a distr_depth.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestRegressor(distr_depth=1, random_state=0, + n_estimators=2, + mmap=False, nested=True) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[1::2] + accuracy2 = _determination_coefficient(y_true, y_pred) + + return accuracy1 > 0.75 and accuracy2 > 0.75 and \ + isclose(accuracy1, accuracy2) + + +def test_make_regression_sklearn_max_predict(): + """Tests RandomForestRegressor predict with sklearn_max.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestRegressor(distr_depth=2, n_estimators=2, + random_state=0, sklearn_max=10, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[1::2] + accuracy2 = 
_determination_coefficient(y_true, y_pred)
+
+    return accuracy1 > 0.75 and accuracy2 > 0.75 and \
+        isclose(accuracy1, accuracy2)
+
+
+def test_save_load():
+    """Tests the save and the load methods of the RandomForestRegressor
+    class"""
+    x, y = make_regression(
+        n_samples=3000,
+        n_features=10,
+        n_informative=4,
+        shuffle=True,
+        random_state=0,
+    )
+    x_train = ds.array(x[::2], (1000, 10))
+    y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
+    x_test = ds.array(x[1::2], (1000, 10))
+    y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1))
+
+    rf = RandomForestRegressor(distr_depth=1,
+                               random_state=0, n_estimators=2, mmap=False,
+                               nested=True)
+    rf.fit(x_train, y_train)
+    rf.save_model("./rf_regressor")
+
+    rf2 = RandomForestRegressor(distr_depth=1,
+                                random_state=0, n_estimators=2, mmap=False,
+                                nested=True)
+    rf2.load_model("./rf_regressor")
+
+    accuracy1 = compss_wait_on(rf.score(x_test, y_test))
+    accuracy_loaded1 = compss_wait_on(rf2.score(x_test, y_test))
+
+    y_pred = rf.predict(x_test).collect()
+    y_pred_loaded = rf2.predict(x_test).collect()
+    y_true = y[1::2]
+    accuracy2 = _determination_coefficient(y_true, y_pred)
+    accuracy_loaded2 = _determination_coefficient(y_true, y_pred_loaded)
+
+    condition = accuracy1 == accuracy_loaded1
+    condition = condition and accuracy2 == accuracy_loaded2
+
+    rf.save_model("./rf_regressor", save_format="cbor")
+
+    rf2 = RandomForestRegressor(distr_depth=1,
+                                random_state=0, n_estimators=2, mmap=False,
+                                nested=True)
+    rf2.load_model("./rf_regressor", load_format="cbor")
+
+    accuracy_loaded1 = compss_wait_on(rf2.score(x_test, y_test))
+
+    y_pred_loaded = rf2.predict(x_test).collect()
+    accuracy_loaded2 = _determination_coefficient(y_true, y_pred_loaded)
+
+    condition = condition and accuracy1 == accuracy_loaded1
+    condition = condition and accuracy2 == accuracy_loaded2
+
+    rf.save_model("./rf_regressor", save_format="pickle")
+
+    rf2 = RandomForestRegressor(distr_depth=1,
+                                random_state=0, n_estimators=2, mmap=False,
+                                nested=True)
+    rf2.load_model("./rf_regressor", load_format="pickle")
+
+    accuracy_loaded1 = compss_wait_on(rf2.score(x_test, y_test))
+
+    y_pred_loaded = rf2.predict(x_test).collect()
+    accuracy_loaded2 = _determination_coefficient(y_true, y_pred_loaded)
+
+    condition = condition and accuracy1 == accuracy_loaded1
+    condition = condition and accuracy2 == accuracy_loaded2
+
+    condition_error = False
+    try:
+        rf.save_model("./rf_regressor", save_format="txt")
+    except ValueError:
+        condition_error = True
+    condition = condition and condition_error
+
+    condition_error = False
+    try:
+        rf2 = RandomForestRegressor(distr_depth=1,
+                                    random_state=0, n_estimators=2,
+                                    mmap=False, nested=True)
+        rf2.load_model("./rf_regressor", load_format="txt")
+    except ValueError:
+        condition_error = True
+    condition = condition and condition_error
+
+    rf1 = RandomForestRegressor(distr_depth=1,
+                                random_state=0, n_estimators=1,
+                                mmap=False, nested=True)
+    rf1.save_model("./rf_regressor", overwrite=False)
+
+    rf2 = RandomForestRegressor(distr_depth=1,
+                                random_state=0, n_estimators=2,
+                                mmap=False, nested=True)
+    rf2.load_model("./rf_regressor", load_format="pickle")
+
+    accuracy_loaded1 = compss_wait_on(rf2.score(x_test, y_test))
+
+    y_pred_loaded = rf2.predict(x_test).collect()
+    accuracy_loaded2 = _determination_coefficient(y_true, y_pred_loaded)
+
+    condition = condition and accuracy1 == accuracy_loaded1
+    condition = condition and accuracy2 == accuracy_loaded2
+
+    cbor2_module = utilmodel.cbor2
+    utilmodel.cbor2 = None
+    condition_error = False
+    try:
+        rf.save_model("./rf_regressor", save_format="cbor")
+    except 
ModuleNotFoundError: + condition_error = True + condition = condition and condition_error + try: + rf2.load_model("./rf_regressor", load_format="cbor") + except ModuleNotFoundError: + condition_error = True + condition = condition and condition_error + utilmodel.cbor2 = cbor2_module + return condition + + +class RandomForestRegressorTest(BaseTimedTestCase): + def test_make_regression(self): + """Tests RandomForestRegressor fit and score with default params.""" + x, y = make_regression( + n_samples=12000, + n_features=40, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (4000, 20)) + y_train = ds.array(y[::2][:, np.newaxis], (4000, 1)) + x_test = ds.array(x[1::2], (4000, 20)) + y_test = ds.array(y[1::2][:, np.newaxis], (4000, 1)) + + rf = RandomForestRegressor(distr_depth=2, random_state=0, + n_estimators=2, mmap=False, nested=True) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[1::2] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.50) + self.assertGreater(accuracy2, 0.50) + self.assertAlmostEqual(accuracy1, accuracy2) + + def test_make_regression_predict_and_distr_depth(self): + """Tests RandomForestRegressor fit and predict with a distr_depth.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestRegressor(distr_depth=1, random_state=0, + n_estimators=2, + mmap=False, nested=True) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[1::2] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.6) + self.assertGreater(accuracy2, 0.6) + self.assertAlmostEqual(accuracy1, accuracy2) + + def test_make_regression_sklearn_max_predict(self): + """Tests RandomForestRegressor predict with sklearn_max.""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestRegressor(distr_depth=1, n_estimators=2, + random_state=0, sklearn_max=10, mmap=False, + nested=True) + + rf.fit(x_train, y_train) + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_true = y[1::2] + accuracy2 = _determination_coefficient(y_true, y_pred) + + self.assertGreater(accuracy1, 0.75) + self.assertGreater(accuracy2, 0.75) + self.assertAlmostEqual(accuracy1, accuracy2) + + def test_save_load(self): + """Tests the save and the load methods of the RandomForestRegressor + class""" + x, y = make_regression( + n_samples=3000, + n_features=10, + n_informative=4, + shuffle=True, + random_state=0, + ) + x_train = ds.array(x[::2], (1000, 10)) + y_train = ds.array(y[::2][:, np.newaxis], (1000, 1)) + x_test = ds.array(x[1::2], (1000, 10)) + y_test = ds.array(y[1::2][:, np.newaxis], (1000, 1)) + + rf = RandomForestRegressor(distr_depth=1, + random_state=0, n_estimators=2, mmap=False, + nested=True) + rf.fit(x_train, y_train) + rf.save_model("./rf_regressor") + + 
rf2 = RandomForestRegressor(distr_depth=1, + random_state=0, n_estimators=2, mmap=False, + nested=True) + rf2.load_model("./rf_regressor") + + accuracy1 = compss_wait_on(rf.score(x_test, y_test)) + accuracy_loaded1 = compss_wait_on(rf2.score(x_test, y_test)) + + y_pred = rf.predict(x_test).collect() + y_pred_loaded = rf2.predict(x_test).collect() + y_true = y[1::2] + accuracy2 = _determination_coefficient(y_true, y_pred) + accuracy_loaded2 = _determination_coefficient(y_true, y_pred_loaded) + + self.assertEqual(accuracy1, accuracy_loaded1) + self.assertEqual(accuracy2, accuracy_loaded2) + + rf.save_model("./rf_regressor", save_format="cbor") + + rf2 = RandomForestRegressor(distr_depth=1, + random_state=0, n_estimators=2, mmap=False, + nested=True) + rf2.load_model("./rf_regressor", load_format="cbor") + + accuracy_loaded1 = compss_wait_on(rf2.score(x_test, y_test)) + + y_pred_loaded = rf2.predict(x_test).collect() + accuracy_loaded2 = _determination_coefficient(y_true, y_pred_loaded) + + self.assertEqual(accuracy1, accuracy_loaded1) + self.assertEqual(accuracy2, accuracy_loaded2) + + rf.save_model("./rf_regressor", save_format="pickle") + + rf2 = RandomForestRegressor(distr_depth=1, + random_state=0, n_estimators=2, mmap=False, + nested=True) + rf2.load_model("./rf_regressor", load_format="pickle") + + accuracy_loaded1 = compss_wait_on(rf2.score(x_test, y_test)) + + y_pred_loaded = rf2.predict(x_test).collect() + accuracy_loaded2 = _determination_coefficient(y_true, y_pred_loaded) + + self.assertEqual(accuracy1, accuracy_loaded1) + self.assertEqual(accuracy2, accuracy_loaded2) + + with self.assertRaises(ValueError): + rf.save_model("./rf_regressor", save_format="txt") + + with self.assertRaises(ValueError): + rf2 = RandomForestRegressor(distr_depth=1, + random_state=0, n_estimators=2, + mmap=False, nested=True) + rf2.load_model("./rf_regressor", load_format="txt") + + rf1 = RandomForestRegressor(distr_depth=1, + random_state=0, n_estimators=1, + mmap=False, nested=True) + rf1.save_model("./rf_regressor", overwrite=False) + + rf2 = RandomForestRegressor(distr_depth=1, + random_state=0, n_estimators=2, + mmap=False, nested=True) + rf2.load_model("./rf_regressor", load_format="pickle") + + accuracy_loaded1 = compss_wait_on(rf2.score(x_test, y_test)) + + y_pred_loaded = rf2.predict(x_test).collect() + accuracy_loaded2 = _determination_coefficient(y_true, y_pred_loaded) + + self.assertEqual(accuracy1, accuracy_loaded1) + self.assertEqual(accuracy2, accuracy_loaded2) + + cbor2_module = utilmodel.cbor2 + utilmodel.cbor2 = None + with self.assertRaises(ModuleNotFoundError): + rf.save_model("./rf_regressor", save_format="cbor") + with self.assertRaises(ModuleNotFoundError): + rf2.load_model("./rf_regressor", load_format="cbor") + utilmodel.cbor2 = cbor2_module + + +@task() +def main(): + test = test_make_regression() + test2 = test_make_regression_predict_and_distr_depth() + test3 = test_make_regression_sklearn_max_predict() + test4 = test_save_load() + print("TEST", flush=True) + print(test) + print(test2) + print(test3) + print(test4, flush=True) + test = test and test2 and test3 and test4 + if test: + print("Result tests: Passed", flush=True) + else: + print("Result tests: Failed", flush=True) + + +if __name__ == "__main__": + main()
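
For reference, a minimal usage sketch of the nested mode exercised by the tests above. It relies only on the public API shown in this patch (RandomForestClassifier with mmap=False and nested=True, fit on dislib ds-arrays); the dataset and parameter values here are illustrative, not prescriptive:

    import numpy as np
    from sklearn.datasets import make_classification

    import dislib as ds
    from dislib.classification import RandomForestClassifier

    # The nested backend is fed blocked ds-arrays directly, as in the tests.
    x, y = make_classification(n_samples=3000, n_features=10, n_classes=3,
                               n_informative=4, random_state=0)
    x_train = ds.array(x[::2], (1000, 10))
    y_train = ds.array(y[::2][:, np.newaxis], (1000, 1))
    x_test = ds.array(x[1::2], (1000, 10))

    # nested=True selects the nested-task implementation added by this patch;
    # mmap=False matches how every nested test in this patch is configured.
    rf = RandomForestClassifier(n_classes=3, n_estimators=4, distr_depth=2,
                                random_state=0, mmap=False, nested=True)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test).collect()

Note that, as run_test_nesting.sh above shows, the nested tests are launched through COMPSs agents (compss_agent_start followed by compss_agent_call_operation) rather than a plain runcompss invocation.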