Skip to content

Example sklearn #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
3 changes: 2 additions & 1 deletion .github/workflows/test-type-lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ jobs:
run: |
source activate ./ci_env
pip install -e .[dev]
pip install scikit-learn lightning # for docs

- name: Print installed packages
run: |
Expand Down Expand Up @@ -76,7 +77,7 @@ jobs:
sed -i 's/\"auto\"/None/g' README.md
# on Mac: sed -i '' 's/cluster: slurm/cluster: null/g' infra/*.md
# check readmes
pytest --markdown-docs -m markdown-docs `**/*.md`
pytest --markdown-docs -m markdown-docs .

- name: Run basic pylint
run: |
Expand Down
74 changes: 74 additions & 0 deletions docs/infra/example_sklearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
A minimalist example with sklearn to show how to develop and explore a model with exca.
"""
import typing as tp
import numpy as np
import pydantic
import sys
import exca
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


class Dataset(pydantic.BaseModel):
    """Configuration of a synthetic regression dataset and its train/test split.

    The fields are forwarded as-is to ``make_regression`` and
    ``train_test_split``; ``random_state`` is used as the seed of both calls.
    """

    # forwarded to make_regression(n_samples=...)
    n_samples: int = 100
    # forwarded to make_regression(noise=...)
    noise: float = 0.1
    # seed shared by the data generation and the split
    random_state: int = 42
    # forwarded to train_test_split(test_size=...)
    test_size: float = 0.2
    # unknown fields are rejected at validation time
    model_config = pydantic.ConfigDict(extra="forbid")

    def get(self) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Build the dataset and return ``(X_train, X_test, y_train, y_test)``."""
        # Synthetic regression problem
        features, targets = make_regression(
            n_samples=self.n_samples,
            noise=self.noise,
            random_state=self.random_state,
        )
        # train_test_split yields the train/test pieces of each input in turn
        x_train, x_test, y_train, y_test = train_test_split(
            features,
            targets,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        return x_train, x_test, y_train, y_test


class Model(pydantic.BaseModel):
    """Ridge regression experiment: a dataset config plus Ridge hyperparameters.

    ``score`` is wrapped by ``infra.apply``, so calling it goes through the
    exca ``TaskInfra`` configured with ``folder='.cache/'``.
    NOTE(review): presumably this caches the returned score under that
    folder; confirm against the exca documentation.
    """

    # dataset configuration used to produce the train/test split
    data: Dataset = Dataset()
    # forwarded to Ridge(alpha=...)
    alpha: float = 1.0
    # forwarded to Ridge(max_iter=...)
    max_iter: int = 1000
    infra: exca.TaskInfra = exca.TaskInfra(folder='.cache/')

    @infra.apply
    def score(self):
        """Fit a Ridge model on the training split and return the test MSE."""
        x_train, x_test, y_train, y_test = self.data.get()

        # Training (Ridge.fit returns the fitted estimator itself)
        print('Fit...')
        ridge = Ridge(alpha=self.alpha, max_iter=self.max_iter).fit(x_train, y_train)

        # Evaluation on the held-out split
        print('Score...')
        return mean_squared_error(y_test, ridge.predict(x_test))


def main(argv: list[str]) -> None:
    """Build a ``Model`` from command-line arguments, then print its score.

    ``argv`` holds the arguments after the program name; they are parsed by
    ``exca.ConfDict.from_args`` and passed to ``Model`` as keyword arguments,
    so pydantic validates them on construction.
    """
    overrides = exca.ConfDict.from_args(argv)
    experiment = Model(**overrides)
    # Show the infra configuration before running anything
    print(experiment.infra.config)

    # Run the scoring and report the mean squared error
    print(experiment.score())


if __name__ == "__main__":
    main(sys.argv[1:])
Loading