facebookresearch · kingjr · Dec 11, 2024 · Dec 24, 2024 · Dec 24, 2024 · Dec 24, 2024
diff --git a/.github/workflows/test-type-lint.yaml b/.github/workflows/test-type-lint.yaml
@@ -48,6 +48,7 @@ jobs:
       run: |
         source activate ./ci_env
         pip install -e .[dev]
+        pip install scikit-learn lightning  # for docs
 
     - name: Print installed packages
       run: |
@@ -76,7 +77,7 @@ jobs:
         sed -i 's/\"auto\"/None/g' README.md
         # on Mac: sed -i '' 's/cluster: slurm/cluster: null/g' infra/*.md
         # check readmes
-        pytest --markdown-docs -m markdown-docs `**/*.md`
+        pytest --markdown-docs -m markdown-docs .
 
     - name: Run basic pylint
       run: |

diff --git a/docs/infra/example_sklearn.py b/docs/infra/example_sklearn.py
@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+A minimalist example with sklearn to show how to develop and explore a model with exca.
+"""
+import typing as tp
+import numpy as np
+import pydantic
+import sys
+import exca
+from sklearn.datasets import make_regression
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import Ridge
+from sklearn.metrics import mean_squared_error
+
+
+class Dataset(pydantic.BaseModel):
+    """Synthetic regression dataset configuration (unknown fields are rejected)."""
+
+    n_samples: int = 100
+    noise: float = 0.1
+    random_state: int = 42
+    test_size: float = 0.2
+    model_config = pydantic.ConfigDict(extra="forbid")
+
+    def get(self) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Return ``(X_train, X_test, y_train, y_test)`` built from the config fields."""
+        seed = self.random_state  # one seed drives both generation and splitting
+        features, targets = make_regression(
+            n_samples=self.n_samples, noise=self.noise, random_state=seed
+        )
+        # Hold out a ``test_size`` share of the samples for evaluation
+        parts = train_test_split(
+            features, targets, test_size=self.test_size, random_state=seed
+        )
+        X_train, X_test, y_train, y_test = parts
+        return X_train, X_test, y_train, y_test
+
+
+class Model(pydantic.BaseModel):
+    """Ridge regression experiment whose ``score`` goes through ``infra.apply``."""
+
+    data: Dataset = Dataset()
+    alpha: float = 1.0
+    max_iter: int = 1000
+    infra: exca.TaskInfra = exca.TaskInfra(folder='.cache/')
+    # Reject unknown keys (same policy as Dataset) so a mistyped CLI
+    # argument fails validation instead of being silently dropped.
+    model_config = pydantic.ConfigDict(extra="forbid")
+
+    @infra.apply
+    def score(self) -> float:
+        """Fit Ridge on the train split; return the test mean squared error."""
+        X_train, X_test, y_train, y_test = self.data.get()
+        print('Fit...')
+        model = Ridge(alpha=self.alpha, max_iter=self.max_iter)
+        model.fit(X_train, y_train)
+        print('Score...')
+        y_pred = model.predict(X_test)
+        return mean_squared_error(y_test, y_pred)
+
+
+if __name__ == "__main__":
+    # Build the experiment from the command-line arguments; pydantic
+    # validates them when Model is instantiated
+    cli_args = sys.argv[1:]
+    model = Model(**exca.ConfDict.from_args(cli_args))
+    print(model.infra.config)
+
+    # Compute the score through the infra and display it
+    print(model.score())