Merged
2 changes: 1 addition & 1 deletion .github/dependabot.yml
@@ -65,7 +65,7 @@ updates:

# Topics.

- directory: "/topic/machine-learning/classification-automl"
- directory: "/topic/machine-learning/automl"
package-ecosystem: "pip"
schedule:
interval: "weekly"
12 changes: 6 additions & 6 deletions .github/workflows/test-automl.yml
@@ -5,13 +5,13 @@ on:
branches: ~
paths:
- '.github/workflows/test-automl.yml'
- - 'topic/machine-learning/classification-automl/**'
+ - 'topic/machine-learning/automl/**'
- 'requirements.txt'
push:
branches: [ main ]
paths:
- '.github/workflows/test-automl.yml'
- - 'topic/machine-learning/classification-automl/**'
+ - 'topic/machine-learning/automl/**'
- 'requirements.txt'

# Allow job to be triggered manually.
@@ -63,13 +63,13 @@ jobs:
cache: 'pip'
cache-dependency-path: |
requirements.txt
- topic/machine-learning/classification-automl/requirements.txt
- topic/machine-learning/classification-automl/requirements-dev.txt
+ topic/machine-learning/automl/requirements.txt
+ topic/machine-learning/automl/requirements-dev.txt

- name: Install utilities
run: |
pip install -r requirements.txt

- - name: Validate topic/machine-learning/classification-automl
+ - name: Validate topic/machine-learning/automl
run: |
- ngr test --accept-no-venv topic/machine-learning/classification-automl
+ ngr test --accept-no-venv topic/machine-learning/automl
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@
__pycache__
.coverage
coverage.xml
+ mlruns/
+ archive/
+ logs.log
@@ -53,7 +53,7 @@ and [CrateDB].
- [requirements.txt](requirements.txt): Pulls the required dependencies to
run the example programs.

- - `automl_classification_with_pycaret.ipynb` [![Open on GitHub](https://img.shields.io/badge/Open%20on-GitHub-lightgray?logo=GitHub)](automl_classification_with_pycaret.ipynb) [![Open in Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crate/cratedb-examples/blob/main/topic/machine-learning/classification-automl/automl_classification_with_pycaret.ipynb)
+ - `automl_classification_with_pycaret.ipynb` [![Open on GitHub](https://img.shields.io/badge/Open%20on-GitHub-lightgray?logo=GitHub)](automl_classification_with_pycaret.ipynb) [![Open in Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crate/cratedb-examples/blob/main/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb)

This notebook explores the PyCaret framework and shows how to use it to
train different classification models - using a user churn dataset as an
@@ -62,8 +62,18 @@ and [CrateDB].
model. The notebook also shows how to use CrateDB as storage for both the raw
data and the experiment tracking and model registry data.

- `automl_timeseries_forecasting_with_pycaret.ipynb` [![Open on GitHub](https://img.shields.io/badge/Open%20on-GitHub-lightgray?logo=GitHub)](automl_timeseries_forecasting_with_pycaret.ipynb) [![Open in Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crate/cratedb-examples/blob/main/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb)

This notebook explores the PyCaret framework and shows how to use it to
train various timeseries forecasting models - using a real-world sales dataset
as an example. The notebook demonstrates the usage of PyCaret to automatically
train and benchmark a multitude of models and at the end select the best
performing model. The notebook also shows how to use CrateDB as storage for
both the raw data and the experiment tracking and model registry data.

- Accompanying the Jupyter Notebook files, there are also basic variants of
the above examples,
[automl_timeseries_forecasting_with_pycaret.py](automl_timeseries_forecasting_with_pycaret.py),
[automl_classification_with_pycaret.py](automl_classification_with_pycaret.py).

[PyCaret]: https://github.com/pycaret/pycaret
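The README above highlights that CrateDB serves as storage both for the raw data and for the MLflow experiment tracking and model registry data. A minimal sketch of that wiring, with placeholder connection details (the example programs read `CRATE_USER`, `CRATE_PASSWORD`, `CRATE_HOST`, and `CRATE_SSL` from a `.env` file instead):

```python
import os

import mlflow_cratedb  # Activates the CrateDB MLflow adapter on import.
import pandas as pd
import sqlalchemy as sa

# Placeholder connection details.
dburi = "crate://admin:password@localhost:4200?ssl=false"
engine = sa.create_engine(dburi)

# Raw data lands in a regular CrateDB table via pandas/SQLAlchemy ...
df = pd.DataFrame({"month": ["2023-01-01"], "total_sales": [123.45]})
df.to_sql("demo_sales", engine, index=False, if_exists="replace")

# ... while MLflow keeps experiment tracking and model registry records
# in a dedicated "mlflow" schema of the same database.
os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow"
```

With the tracking URI in place, PyCaret's `setup(..., log_experiment=True)` and MLflow's `log_model()` record their metadata in CrateDB.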
@@ -1001,7 +1001,7 @@
"the winning models, to further improve their performance.\n",
"\n",
"By setting `n_select=3` in the above benchmarking call, you told PyCaret to\n",
"select the 8 best performing models from the benchmarking run. You can now use\n",
"select the 3 best performing models from the benchmarking run. You can now use\n",
"all 3 of them to tune their hyperparameters. It is quite common that the model\n",
"ranking changes after hyperparameter tuning.\n",
"\n",
@@ -2636,7 +2636,7 @@
"\n",
"The example below uses tuned models as base models. An even better approach\n",
"would be to hand-pick the best performing models from all the previous experiments,\n",
"but for the sake of brevity, let's stick with the first 5 of the tuned models."
"but for the sake of brevity, let's stick with the tuned models."
]
},
{
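The notebook excerpt above describes selecting the three best-ranked models with `n_select=3`, tuning each of them, and blending them into an ensemble. For reference, a minimal sketch of that pattern with PyCaret's classification API; the `churn.csv` file and the `churn` target column are placeholders, not part of this changeset:

```python
import pandas as pd
from pycaret.classification import (
    setup,
    compare_models,
    tune_model,
    blend_models,
    automl,
    finalize_model,
)

# Placeholder dataset; the notebook loads a user churn dataset from CrateDB instead.
data = pd.read_csv("churn.csv")
setup(data=data, target="churn")

# Benchmark many estimators and keep the three best-ranked ones.
best_models = compare_models(n_select=3)

# Tune each winner; the ranking frequently changes after hyperparameter tuning.
tuned_models = [tune_model(model) for model in best_models]

# Combine the tuned models into a voting ensemble.
blend_models(estimator_list=tuned_models)

# Pick the best model seen anywhere in this experiment, judged by AUC,
# and refit it on the full dataset before saving or registering it.
best_model = automl(optimize="AUC")
final_model = finalize_model(best_model)
```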
@@ -64,7 +64,6 @@ def try_ensemble_model(model):
blend_models(estimator_list=tuned_models)
best_model = automl(optimize="AUC")

evaluate_model(best_model)
final_model = finalize_model(best_model)
Member:
Is it correct that this line has been removed within the other notebook?

Contributor (author):
Yes, this is an ipywidget, which looks and behaves extremely badly in non-notebook environments.
I hope you don't mind that I forward-fixed this in an actually unrelated file.


# Save the model to disk.
2,204 changes: 2,204 additions & 0 deletions topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb

Large diffs are not rendered by default.

@@ -0,0 +1,108 @@
import pandas as pd
import sqlalchemy as sa
import os
import mlflow_cratedb # Required to enable the CrateDB MLflow adapter.
from dotenv import load_dotenv
from mlflow.sklearn import log_model

from pycaret.time_series import (
setup,
compare_models,
tune_model,
blend_models,
finalize_model,
save_model,
)

if os.path.isfile(".env"):
load_dotenv(".env", override=True)

# Configure the database connection string.
dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}"
engine = sa.create_engine(dburi, echo=os.environ.get("DEBUG"))

# Use CrateDB as the MLflow backend store, keeping tracking and registry data in the "mlflow" schema.
os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow"


def prepare_data():
target_data = pd.read_csv(
"https://data.4tu.nl/file/539debdb-a325-412d-b024-593f70cba15b/a801f5d4-5dfe-412a-ace2-a64f93ad0010"
)
related_data = pd.read_csv(
"https://data.4tu.nl/file/539debdb-a325-412d-b024-593f70cba15b/f2bd27bd-deeb-4933-bed7-29325ee05c2e",
header=None,
)
related_data.columns = ["item", "org", "date", "unit_price"]
data = target_data.merge(related_data, on=["item", "org", "date"])
data["total_sales"] = data["unit_price"] * data["quantity"]
data["date"] = pd.to_datetime(data["date"])

# Insert the data into CrateDB
engine = sa.create_engine(dburi, echo=os.environ.get("DEBUG"))

with engine.connect() as conn:
data.to_sql(
"sales_data_for_forecast",
conn,
index=False,
chunksize=1000,
if_exists="replace",
)

        # Refresh the table to make sure the freshly inserted data is visible to
        # queries right away, since CrateDB is eventually consistent.
        conn.execute(sa.text("REFRESH TABLE sales_data_for_forecast;"))


def fetch_data():
query = """
SELECT
DATE_TRUNC('month', DATE) AS MONTH,
SUM(total_sales) AS total_sales
from sales_data_for_forecast
group by month
order by month
"""

with engine.connect() as conn:
with conn.execute(sa.text(query)) as cursor:
data = pd.DataFrame(cursor.fetchall(), columns=cursor.keys())

data["month"] = pd.to_datetime(data["month"], unit="ms")
return data


def run_experiment(data):
    setup(data=data, fh=15, target="total_sales", index="month", log_experiment=True)
if "PYTEST_CURRENT_TEST" in os.environ:
best_models = compare_models(sort="MASE",
include=["arima", "ets", "exp_smooth"],
n_select=3)
else:
best_models = compare_models(sort="MASE", n_select=3)

tuned_models = [tune_model(model) for model in best_models]
blend = blend_models(estimator_list=tuned_models)
best_model = blend
final_model = finalize_model(best_model)
os.makedirs("model", exist_ok=True)

save_model(final_model, "model/timeseriesforecast_model")

log_model(
sk_model=final_model,
artifact_path="model/timeseriesforecast_model",
registered_model_name="timeseriesforecast_model",
)


def main():
"""
Provision dataset, and run experiment.
"""
prepare_data()
df = fetch_data()
run_experiment(df)


if __name__ == "__main__":
main()
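A possible way to consume the artifact produced by this script, not part of the changeset: load the pipeline written by `save_model()` and generate a forecast. This assumes the usual PyCaret load-and-predict pattern applies to the time-series module as well:

```python
from pycaret.time_series import load_model, predict_model

# Load the pipeline saved as "model/timeseriesforecast_model.pkl" by save_model().
model = load_model("model/timeseriesforecast_model")

# Forecast the next 15 periods, matching the forecast horizon used in setup(fh=15).
predictions = predict_model(model, fh=15)
print(predictions)
```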
@@ -44,6 +44,7 @@ nb_diff_ignore = [
"/metadata/widgets",
"/cells/*/execution_count",
"/cells/*/outputs/*/execution_count",
"/cells/*/outputs/*/metadata/nbreg",
# Ignore images.
"/cells/*/outputs/*/data/image/png",
# FIXME: Those pacifiers should be revisited.
@@ -61,6 +62,9 @@ nb_diff_ignore = [
"/cells/34/outputs",
"/cells/36/outputs",
"/cells/40/outputs",
+ # automl_timeseries_forecasting_with_pycaret.ipynb
+ "/cells/19/outputs",
+ "/cells/33/outputs",
]

[tool.coverage.run]
@@ -3,6 +3,7 @@ crate[sqlalchemy]
# mlflow-cratedb==2.7.1
plotly<5.19
pycaret[analysis,models,tuner,parallel,test]==3.1.0
pydantic<2
python-dotenv<2
tqdm<5
werkzeug==2.2.3
Binary file not shown.
Binary file not shown.
Binary file not shown.