
Merge pull request #1017 from jaredc07/rf_mixer
Implement a RandomForest mixer
paxcema authored Oct 14, 2022
2 parents c4805a3 + 944b20c commit f2dcdee
Showing 6 changed files with 280 additions and 5 deletions.
13 changes: 13 additions & 0 deletions lightwood/api/json_ai.py
@@ -278,6 +278,13 @@ def generate_json_ai(
"stop_after": "$problem_definition.seconds_per_mixer",
},
},
{
"module": "RandomForest",
"args": {
"stop_after": "$problem_definition.seconds_per_mixer",
"fit_on_dev": True,
},
},
]
)
elif tss.is_timeseries and tss.horizon > 1:
@@ -590,6 +597,12 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
"target_encoder", "$encoders[self.target]"
)

elif mixers[i]["module"] == "RandomForest":
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)
mixers[i]["args"]["use_optuna"] = True

elif mixers[i]["module"] == "LightGBMArray":
mixers[i]["args"]["input_cols"] = mixers[i]["args"].get(
"input_cols", "$input_cols"
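For reference: once `_add_implicit_values` fills in the defaults above, the RandomForest entry of a generated JSON-AI should end up with roughly the following arguments (a sketch inferred from the defaults above and the explicit args used in the unit tests below, not copied verbatim from generated output):

    {
        "module": "RandomForest",
        "args": {
            "stop_after": "$problem_definition.seconds_per_mixer",
            "fit_on_dev": True,
            "target": "$target",
            "dtype_dict": "$dtype_dict",
            "target_encoder": "$encoders[self.target]",
            "use_optuna": True,
        },
    }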
3 changes: 2 additions & 1 deletion lightwood/mixer/__init__.py
@@ -3,6 +3,7 @@
 from lightwood.mixer.neural import Neural
 from lightwood.mixer.neural_ts import NeuralTs
 from lightwood.mixer.lightgbm import LightGBM
+from lightwood.mixer.random_forest import RandomForest
 from lightwood.mixer.lightgbm_array import LightGBMArray
 from lightwood.mixer.sktime import SkTime
 from lightwood.mixer.arima import ARIMAMixer
@@ -16,5 +17,5 @@
 except Exception:
     QClassic = None

-__all__ = ['BaseMixer', 'Neural', 'NeuralTs', 'LightGBM', 'LightGBMArray', 'Unit', 'Regression',
+__all__ = ['BaseMixer', 'Neural', 'NeuralTs', 'LightGBM', 'RandomForest', 'LightGBMArray', 'Unit', 'Regression',
            'SkTime', 'QClassic', 'ProphetMixer', 'ETSMixer', 'ARIMAMixer', 'NHitsMixer']
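With both the import and the `__all__` entry in place, the new mixer becomes importable at the package level:

    from lightwood.mixer import RandomForest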
197 changes: 197 additions & 0 deletions lightwood/mixer/random_forest.py
@@ -0,0 +1,197 @@
import time
import torch
import numpy as np
import pandas as pd
import optuna
from optuna import trial as trial_module
from sklearn.model_selection import cross_val_score
from typing import Dict, Union
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from lightwood.api import dtype
from lightwood.helpers.log import log
from lightwood.encoder.base import BaseEncoder
from lightwood.data.encoded_ds import ConcatedEncodedDs, EncodedDs
from lightwood.mixer.base import BaseMixer
from lightwood.api.types import PredictionArguments


class RandomForest(BaseMixer):
    model: Union[RandomForestClassifier, RandomForestRegressor]
    dtype_dict: dict
    target: str
    fit_on_dev: bool
    use_optuna: bool
    supports_proba: bool

    def __init__(
            self,
            stop_after: float,
            target: str,
            dtype_dict: Dict[str, str],
            fit_on_dev: bool,
            use_optuna: bool,
            target_encoder: BaseEncoder
    ):
        """
        The `RandomForest` mixer supports both regression and classification tasks.
        It wraps sklearn.ensemble.RandomForestRegressor and sklearn.ensemble.RandomForestClassifier.
        (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
        (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

        :param stop_after: time budget in seconds.
        :param target: name of the target column that the mixer will learn to predict.
        :param dtype_dict: dictionary with dtypes of all columns in the data.
        :param fit_on_dev: whether to also use the `dev` data split when fitting (it is concatenated to the train split).
        :param use_optuna: whether to activate the automated hyperparameter search (Optuna-based). Note that setting this flag to `True` does not guarantee the search will run; the time budget is checked first, and if a single fit is too slow relative to it, the search is skipped.
        """  # noqa
        super().__init__(stop_after)
        self.target = target
        self.dtype_dict = dtype_dict
        self.fit_on_dev = fit_on_dev
        self.use_optuna = use_optuna
        self.target_encoder = target_encoder

        self.model = None
        self.positive_domain = False
        self.num_trials = 20

        self.cls_dtypes = [dtype.categorical, dtype.binary, dtype.cat_tsarray]
        self.float_dtypes = [dtype.float, dtype.quantity, dtype.num_tsarray]
        self.num_dtypes = [dtype.integer] + self.float_dtypes
        self.supports_proba = dtype_dict[target] in self.cls_dtypes

        self.stable = True

    def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
        """
        Fits the RandomForest model.

        :param train_data: encoded features for the training dataset.
        :param dev_data: encoded features for the dev dataset.
        """
        started = time.time()
        log.info('Started fitting RandomForest model')

        output_dtype = self.dtype_dict[self.target]

        if output_dtype not in self.cls_dtypes + self.num_dtypes:
            log.error(f'RandomForest mixer not supported for type: {output_dtype}')
            raise Exception(f'RandomForest mixer not supported for type: {output_dtype}')

        if self.fit_on_dev:
            train_data = ConcatedEncodedDs([train_data, dev_data])

        if output_dtype in self.num_dtypes:
            X = train_data.get_encoded_data(include_target=False)
            try:
                Y = train_data.get_encoded_column_data(self.target)
            except Exception as e:
                log.warning(e)
                Y = train_data.get_column_original_data(self.target)  # ts: to be fixed

            self.model = RandomForestRegressor(
                n_estimators=50,
                max_depth=5,
                max_features=1.,
                bootstrap=True,
                n_jobs=-1,
                random_state=0
            )

            self.model.fit(X, Y)  # sample_weight

        elif output_dtype in self.cls_dtypes:
            X = train_data.get_encoded_data(include_target=False)
            Y = train_data.get_column_original_data(self.target)

            self.model = RandomForestClassifier(
                n_estimators=50,
                max_depth=5,
                max_features=1.,
                bootstrap=True,
                n_jobs=-1,
                random_state=0
            )

            self.model.fit(X, Y)  # sample_weight

        # need to be improved
        elapsed = time.time() - started
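        # Trial budget: roughly one `elapsed`-sized slot per trial within `stop_after`,
        # capped at `self.num_trials`. E.g. stop_after=60s with a 10s first fit gives
        # int(60 / 10) - 1 = 5 trials; a first fit slower than the budget gives 0 trials.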
        num_trials = max(min(int(self.stop_after / elapsed) - 1, self.num_trials), 0)
        if self.use_optuna:
            log.info(f'The number of trials (Optuna) is {num_trials}.')

        direction, metric = ('maximize', 'r2') if output_dtype in self.num_dtypes else ('maximize', 'neg_log_loss')

        def objective(trial: trial_module.Trial):
            criterion = 'squared_error' if output_dtype in self.num_dtypes \
                else trial.suggest_categorical("criterion", ["gini", "entropy"])

            params = {
                'n_estimators': trial.suggest_int('n_estimators', 2, 512),
                'max_depth': trial.suggest_int('max_depth', 2, 15),
                'min_samples_split': trial.suggest_int("min_samples_split", 2, 100),
                'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 100),
                'max_features': trial.suggest_float("max_features", 0.01, 1),
                'criterion': criterion,
            }

            self.model.set_params(**params)

            return cross_val_score(self.model, X, Y, cv=3, n_jobs=-1, scoring=metric).mean()

        if self.use_optuna and num_trials > 0:
            study = optuna.create_study(direction=direction)
            study.optimize(objective, n_trials=num_trials)
            # Refit with the best trial's hyperparameters; without this, the model would
            # keep whatever parameters the last trial happened to set.
            self.model.set_params(**study.best_params)
            self.model.fit(X, Y)
            log.info(f'RandomForest parameters of the best trial: {study.best_params}')
            log.info(f'RandomForest n_estimators: {self.model.n_estimators}, max_depth: {self.model.max_depth}')

        log.info(f'RandomForest training score: {self.model.score(X, Y)}')

    def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
        """
        The RandomForest mixer does not support updates. If the model does not exist, a new one will be created and fitted.

        :param train_data: encoded features for the (new) training dataset.
        :param dev_data: encoded features for the (new) dev dataset.
        """  # noqa
        if self.model is None:
            self.fit(train_data, dev_data)

    def __call__(self, ds: EncodedDs,
                 args: PredictionArguments = PredictionArguments()) -> pd.DataFrame:
        """
        Call a trained RandomForest mixer to output predictions for the target column.

        :param ds: input data with values for all non-target columns.
        :param args: inference-time arguments (e.g. whether to output predicted labels or probabilities).
        :return: dataframe with predictions.
        """
        data = ds.get_encoded_data(include_target=False).tolist()

        if self.dtype_dict[self.target] in self.num_dtypes:
            predictions = self.model.predict(data)
            if predictions.ndim == 1:
                decoded_predictions = predictions
            else:
                decoded_predictions = self.target_encoder.decode(torch.Tensor(predictions))
        else:
            predictions = self.model.predict_proba(data)
            decoded_predictions = self.model.classes_.take(np.argmax(predictions, axis=1), axis=0)

        if self.positive_domain:
            decoded_predictions = [max(0, p) for p in decoded_predictions]

        ydf = pd.DataFrame({'prediction': decoded_predictions})

        if args.predict_proba and hasattr(self.model, 'classes_'):
            for idx, label in enumerate(self.model.classes_):
                ydf[f'__mdb_proba_{label}'] = predictions[:, idx]

        return ydf
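Outside of Lightwood, the tuning pattern used in `fit()` above reduces to a plain sklearn + Optuna loop. A minimal self-contained sketch (the synthetic data and trial count are illustrative assumptions, not part of the commit):

    import optuna
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    X, y = make_classification(n_samples=500, n_features=10, random_state=0)
    model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=0)

    def objective(trial):
        # Same style of search space as the mixer: tree count, depth, split criterion.
        model.set_params(
            n_estimators=trial.suggest_int('n_estimators', 2, 512),
            max_depth=trial.suggest_int('max_depth', 2, 15),
            criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
        )
        return cross_val_score(model, X, y, cv=3, scoring='neg_log_loss').mean()

    study = optuna.create_study(direction='maximize')  # neg_log_loss: higher is better
    study.optimize(objective, n_trials=5)
    model.set_params(**study.best_params)  # refit with the best trial's parameters
    model.fit(X, y)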

8 changes: 4 additions & 4 deletions tests/integration/basic/test_model_selection.py
@@ -16,14 +16,14 @@ def get_mixers(self, df: pd.DataFrame, target: str, prob_kwargs: dict = None):
     def test_0_regression_task(self):
         df = pd.read_csv('tests/data/concrete_strength.csv')
         target = 'concrete_strength'
-        expected_mixers = ['Neural', 'LightGBM', 'Regression']
+        expected_mixers = ['Neural', 'LightGBM', 'Regression', 'RandomForest']
         mixers = self.get_mixers(df, target)
         self.assertEqual(set(mixers), set(expected_mixers))

     def test_1_multiclass_task(self):
         df = pd.read_csv('tests/data/hdi.csv')
         target = 'Development Index'
-        expected_mixers = ['Neural', 'LightGBM', 'Regression']
+        expected_mixers = ['Neural', 'LightGBM', 'Regression', 'RandomForest']
         mixers = self.get_mixers(df, target)
         self.assertEqual(set(mixers), set(expected_mixers))

@@ -37,7 +37,7 @@ def test_2_unit_text_task(self):
     def test_3_complex_text_task(self):
         df = pd.read_csv('tests/data/wine_reviews_binary_sample.csv')
         target = 'label'
-        expected_mixers = ['Neural', 'LightGBM', 'Regression']
+        expected_mixers = ['Neural', 'LightGBM', 'Regression', 'RandomForest']
         mixers = self.get_mixers(df, target)
         self.assertEqual(set(mixers), set(expected_mixers))

@@ -53,7 +53,7 @@ def test_4_timeseries_t_plus_1(self):
                 'window': 5
             }
         }
-        expected_mixers = ['NeuralTs', 'LightGBM', 'Regression']
+        expected_mixers = ['NeuralTs', 'LightGBM', 'Regression', 'RandomForest']
         mixers = self.get_mixers(df, target, prob_kwargs=prob_kwargs)
         self.assertEqual(set(mixers), set(expected_mixers))

Empty file.
64 changes: 64 additions & 0 deletions tests/unit_tests/mixer/test_random_forest.py
@@ -0,0 +1,64 @@
import unittest
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from lightwood.api.types import ProblemDefinition
from lightwood.api.high_level import json_ai_from_problem, predictor_from_json_ai, JsonAI, code_from_json_ai, predictor_from_code # noqa


np.random.seed(42)


class TestBasic(unittest.TestCase):

    def get_submodels(self):
        submodels = [
            {
                'module': 'RandomForest',
                'args': {
                    'stop_after': '$problem_definition.seconds_per_mixer',
                    'fit_on_dev': True,
                    'target': '$target',
                    'dtype_dict': '$dtype_dict',
                    'target_encoder': '$encoders[self.target]',
                    'use_optuna': True
                }
            },
        ]
        return submodels

    def test_0_regression(self):
        df = pd.read_csv('tests/data/concrete_strength.csv')[:500]
        target = 'concrete_strength'

        pdef = ProblemDefinition.from_dict({'target': target, 'time_aim': 80})
        jai = json_ai_from_problem(df, pdef)

        jai.model['args']['submodels'] = self.get_submodels()
        code = code_from_json_ai(jai)
        predictor = predictor_from_code(code)

        predictor.learn(df)
        predictor.predict(df)

    def test_1_binary(self):
        df = pd.read_csv('tests/data/ionosphere.csv')[:100]
        target = 'target'

        pdef = ProblemDefinition.from_dict({'target': target, 'time_aim': 20, 'unbias_target': False})
        jai = json_ai_from_problem(df, pdef)

        jai.model['args']['submodels'] = [
            {
                'module': 'RandomForest',
                'args': {'stop_after': '$problem_definition.seconds_per_mixer', 'fit_on_dev': True}}
        ]
        code = code_from_json_ai(jai)
        predictor = predictor_from_code(code)

        predictor.learn(df)
        predictions = predictor.predict(df)

        acc = balanced_accuracy_score(df[target], predictions['prediction'])
        self.assertTrue(acc > 0.5)
        self.assertTrue(all([0 <= p <= 1 for p in predictions['confidence']]))
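Assuming the repository's test data files are present, the new tests should be runnable from the repo root with something like:

    python -m unittest tests.unit_tests.mixer.test_random_forest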
