Adding changes for Flaml Sklearn integration (#1361)
FLAML provides support for scikit-learn models such as Random Forest, KNN, Extra Trees Regressor, and Logistic Regression with regularization. This commit integrates these models into EvaDB.
FLAML documentation:
https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML
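
For illustration, here is a minimal, self-contained sketch of the FLAML pattern this change adopts: the AutoML search is restricted to a single scikit-learn-backed estimator via `estimator_list` (FLAML names them "rf", "extra_tree", "kneighbor", and so on). The toy data, column names, and time budget below are placeholders, not part of the commit.

import pandas as pd
from flaml import AutoML

# Toy training frame; in EvaDB the real data comes from the SELECT clause.
df = pd.DataFrame(
    {
        "number_of_rooms": [1, 2, 2, 3, 3, 4, 4, 5, 5, 6],
        "number_of_bathrooms": [1, 1, 2, 2, 2, 3, 3, 3, 4, 4],
        "rental_price": [900, 1200, 1400, 1800, 1900, 2400, 2500, 2900, 3100, 3600],
    }
)

automl = AutoML()
# Restrict the search to one sklearn estimator, mirroring the `settings`
# dict built in handle_sklearn_function in the diff below.
automl.fit(
    dataframe=df,
    label="rental_price",
    task="regression",
    metric="rmse",
    estimator_list=["extra_tree"],
    time_budget=30,  # seconds; illustrative only
)
print(automl.best_loss)            # the executor records this as the training score
print(automl.predict(df.head(2)))  # predictions for the first two rows

The fitted AutoML object pickles cleanly, which is how the executor persists the trained model for later inference in GenericSklearnModel.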

---------

Co-authored-by: Jineet Desai <[email protected]>
jineetd and Jineet Desai authored Nov 17, 2023
1 parent 69b39b8 commit 0c25a44
Showing 7 changed files with 39 additions and 42 deletions.
2 changes: 2 additions & 0 deletions evadb/configuration/constants.py
@@ -36,3 +36,5 @@
 DEFAULT_DOCUMENT_CHUNK_OVERLAP = 200
 DEFAULT_TRAIN_REGRESSION_METRIC = "rmse"
 DEFAULT_XGBOOST_TASK = "regression"
+DEFAULT_SKLEARN_TRAIN_MODEL = "rf"
+SKLEARN_SUPPORTED_MODELS = ["rf", "extra_tree", "kneighbor"]
33 changes: 23 additions & 10 deletions evadb/executor/create_function_executor.py
@@ -30,9 +30,11 @@
 from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry
 from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry
 from evadb.configuration.constants import (
+    DEFAULT_SKLEARN_TRAIN_MODEL,
     DEFAULT_TRAIN_REGRESSION_METRIC,
     DEFAULT_TRAIN_TIME_LIMIT,
     DEFAULT_XGBOOST_TASK,
+    SKLEARN_SUPPORTED_MODELS,
     EvaDB_INSTALLATION_DIR,
 )
 from evadb.database import EvaDBDatabase
@@ -45,13 +47,12 @@
 from evadb.utils.generic_utils import (
     load_function_class_from_file,
     string_comparison_case_insensitive,
+    try_to_import_flaml_automl,
     try_to_import_ludwig,
     try_to_import_neuralforecast,
-    try_to_import_sklearn,
     try_to_import_statsforecast,
     try_to_import_torch,
     try_to_import_ultralytics,
-    try_to_import_xgboost,
 )
 from evadb.utils.logging_manager import logger

@@ -169,8 +170,7 @@ def handle_sklearn_function(self):
         Use Sklearn's regression to train models.
         """
-        try_to_import_sklearn()
-        from sklearn.linear_model import LinearRegression
+        try_to_import_flaml_automl()

         assert (
             len(self.children) == 1
@@ -186,13 +186,26 @@ def handle_sklearn_function(self):
         aggregated_batch.drop_column_alias()

         arg_map = {arg.key: arg.value for arg in self.node.metadata}
-        model = LinearRegression()
-        Y = aggregated_batch.frames[arg_map["predict"]]
-        aggregated_batch.frames.drop([arg_map["predict"]], axis=1, inplace=True)
+        from flaml import AutoML
+
+        model = AutoML()
+        sklearn_model = arg_map.get("model", DEFAULT_SKLEARN_TRAIN_MODEL)
+        if sklearn_model not in SKLEARN_SUPPORTED_MODELS:
+            raise ValueError(
+                f"Sklearn Model {sklearn_model} provided as input is not supported."
+            )
+        settings = {
+            "time_budget": arg_map.get("time_limit", DEFAULT_TRAIN_TIME_LIMIT),
+            "metric": arg_map.get("metric", DEFAULT_TRAIN_REGRESSION_METRIC),
+            "estimator_list": [sklearn_model],
+            "task": arg_map.get("task", DEFAULT_XGBOOST_TASK),
+        }
         start_time = int(time.time())
-        model.fit(X=aggregated_batch.frames, y=Y)
+        model.fit(
+            dataframe=aggregated_batch.frames, label=arg_map["predict"], **settings
+        )
         train_time = int(time.time()) - start_time
-        score = model.score(X=aggregated_batch.frames, y=Y)
+        score = model.best_loss
         model_path = os.path.join(
             self.db.catalog().get_configuration_catalog_value("model_dir"),
             self.node.name,
@@ -232,7 +245,7 @@ def handle_xgboost_function(self):
         We use the Flaml AutoML model for training xgboost models.
         """
-        try_to_import_xgboost()
+        try_to_import_flaml_automl()

         assert (
             len(self.children) == 1
4 changes: 2 additions & 2 deletions evadb/functions/sklearn.py
@@ -17,7 +17,7 @@
 import pandas as pd

 from evadb.functions.abstract.abstract_function import AbstractFunction
-from evadb.utils.generic_utils import try_to_import_sklearn
+from evadb.utils.generic_utils import try_to_import_flaml_automl


 class GenericSklearnModel(AbstractFunction):
@@ -26,7 +26,7 @@ def name(self) -> str:
         return "GenericSklearnModel"

     def setup(self, model_path: str, predict_col: str, **kwargs):
-        try_to_import_sklearn()
+        try_to_import_flaml_automl()

         self.model = pickle.load(open(model_path, "rb"))
         self.predict_col = predict_col
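For reference, a rough sketch of how the pickled model saved by the executor could be reloaded and queried outside EvaDB, assuming (as the setup above suggests) that the persisted object is a flaml.AutoML instance; the path and column names are hypothetical.

import pickle

import pandas as pd

# Hypothetical location; the executor writes the model under the catalog's model_dir.
model_path = "evadb_data/models/PredictHouseRentSklearn/model.pkl"
with open(model_path, "rb") as f:
    model = pickle.load(f)  # a flaml.AutoML instance

# Feature frame with the training columns, minus the predict column.
frames = pd.DataFrame(
    {"number_of_rooms": [3], "number_of_bathrooms": [2], "days_on_market": [10]}
)
print(model.predict(frames))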
4 changes: 2 additions & 2 deletions evadb/functions/xgboost.py
@@ -17,7 +17,7 @@
 import pandas as pd

 from evadb.functions.abstract.abstract_function import AbstractFunction
-from evadb.utils.generic_utils import try_to_import_xgboost
+from evadb.utils.generic_utils import try_to_import_flaml_automl


 class GenericXGBoostModel(AbstractFunction):
@@ -26,7 +26,7 @@ def name(self) -> str:
         return "GenericXGBoostModel"

     def setup(self, model_path: str, predict_col: str, **kwargs):
-        try_to_import_xgboost()
+        try_to_import_flaml_automl()

         self.model = pickle.load(open(model_path, "rb"))
         self.predict_col = predict_col
27 changes: 4 additions & 23 deletions evadb/utils/generic_utils.py
@@ -369,39 +369,20 @@ def is_forecast_available() -> bool:
         return False


-def try_to_import_sklearn():
-    try:
-        import sklearn  # noqa: F401
-        from sklearn.linear_model import LinearRegression  # noqa: F401
-    except ImportError:
-        raise ValueError(
-            """Could not import sklearn.
-                Please install it with `pip install scikit-learn`."""
-        )
-
-
-def is_sklearn_available() -> bool:
-    try:
-        try_to_import_sklearn()
-        return True
-    except ValueError:  # noqa: E722
-        return False
-
-
-def try_to_import_xgboost():
+def try_to_import_flaml_automl():
     try:
         import flaml  # noqa: F401
         from flaml import AutoML  # noqa: F401
     except ImportError:
         raise ValueError(
-            """Could not import Flaml AutoML.
+            """Could not import Flaml AutML.
                 Please install it with `pip install "flaml[automl]"`."""
         )


-def is_xgboost_available() -> bool:
+def is_flaml_automl_available() -> bool:
     try:
-        try_to_import_xgboost()
+        try_to_import_flaml_automl()
         return True
     except ValueError:  # noqa: E722
         return False
4 changes: 3 additions & 1 deletion test/integration_tests/long/test_model_train.py
@@ -116,7 +116,9 @@ def test_sklearn_regression(self):
             CREATE OR REPLACE FUNCTION PredictHouseRentSklearn FROM
             ( SELECT number_of_rooms, number_of_bathrooms, days_on_market, rental_price FROM HomeRentals )
             TYPE Sklearn
-            PREDICT 'rental_price';
+            PREDICT 'rental_price'
+            MODEL 'extra_tree'
+            METRIC 'r2';
         """

         execute_query_fetch_all(self.evadb, create_predict_function)
7 changes: 3 additions & 4 deletions test/markers.py
@@ -20,15 +20,14 @@

 from evadb.utils.generic_utils import (
     is_chromadb_available,
+    is_flaml_automl_available,
     is_forecast_available,
     is_gpu_available,
     is_ludwig_available,
     is_milvus_available,
     is_pinecone_available,
     is_qdrant_available,
     is_replicate_available,
-    is_sklearn_available,
-    is_xgboost_available,
 )

 asyncio_skip_marker = pytest.mark.skipif(
@@ -93,11 +92,11 @@
 )

 sklearn_skip_marker = pytest.mark.skipif(
-    is_sklearn_available() is False, reason="Run only if sklearn is available"
+    is_flaml_automl_available() is False, reason="Run only if Flaml AutoML is available"
 )

 xgboost_skip_marker = pytest.mark.skipif(
-    is_xgboost_available() is False, reason="Run only if xgboost is available"
+    is_flaml_automl_available() is False, reason="Run only if Flaml AutoML is available"
 )

 chatgpt_skip_marker = pytest.mark.skip(
