Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ updates:

# Topics.

- directory: "/topic/machine-learning/classification-automl"
package-ecosystem: "pip"
schedule:
interval: "weekly"

- directory: "/topic/machine-learning/llm-langchain"
package-ecosystem: "pip"
schedule:
Expand Down
75 changes: 75 additions & 0 deletions .github/workflows/test-automl.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
name: AutoML

on:
pull_request:
branches: ~
paths:
- '.github/workflows/test-automl.yml'
- 'topic/machine-learning/classification-automl/**'
- 'requirements.txt'
push:
branches: [ main ]
paths:
- '.github/workflows/test-automl.yml'
- 'topic/machine-learning/classification-automl/**'
- 'requirements.txt'

# Allow job to be triggered manually.
workflow_dispatch:

# Run job each night after CrateDB nightly has been published.
schedule:
- cron: '0 3 * * *'

# Cancel in-progress jobs when pushing to the same branch.
concurrency:
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.ref }}

jobs:
test:
name: "
Python: ${{ matrix.python-version }}
CrateDB: ${{ matrix.cratedb-version }}
on ${{ matrix.os }}"
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ 'ubuntu-latest' ]
python-version: [ '3.10' ]
cratedb-version: [ 'nightly' ]

services:
cratedb:
image: crate/crate:nightly
ports:
- 4200:4200
- 5432:5432

env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

steps:

- name: Acquire sources
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: |
requirements.txt
topic/machine-learning/classification-automl/requirements.txt
topic/machine-learning/classification-automl/requirements-dev.txt

- name: Install utilities
run: |
pip install -r requirements.txt

- name: Validate topic/machine-learning/classification-automl
run: |
ngr test --accept-no-venv topic/machine-learning/classification-automl
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,12 @@
" dotenv.load_dotenv(\".env\", override=True)\n",
"\n",
"dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n",
"engine = sa.create_engine(dburi, echo=True)\n",
"engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n",
"df = pd.read_csv(\"https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv\")\n",
"\n",
"with engine.connect() as conn:\n",
" df.to_sql(\"pycaret_churn\", conn, index=False, chunksize=1000, if_exists=\"replace\")\n"
" df.to_sql(\"pycaret_churn\", conn, index=False, chunksize=1000, if_exists=\"replace\")\n",
" conn.execute(sa.text(\"REFRESH TABLE pycaret_churn;\"))"
]
},
{
Expand All @@ -206,13 +207,13 @@
"import plotly\n",
"import plotly.express as plotly_express\n",
"import plotly.graph_objects as go\n",
"import mlflow_cratedb # We need this import to use the CrateDB MLflow store\n",
"import mlflow_cratedb # Required to enable the CrateDB MLflow adapter.\n",
"\n",
"if os.path.exists(\".env\"):\n",
" dotenv.load_dotenv(\".env\", override=True)\n",
"\n",
"dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n",
"engine = sa.create_engine(dburi, echo=True)\n",
"engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n",
"\n",
"with engine.connect() as conn:\n",
" with conn.execute(sa.text(\"SELECT * FROM pycaret_churn\")) as cursor:\n",
Expand Down Expand Up @@ -3373,7 +3374,7 @@
"source": [
"os.environ[\n",
" \"MLFLOW_TRACKING_URI\"\n",
"] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl=true&schema=mlflow\""
"] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}&schema=mlflow\""
]
},
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,28 +1,46 @@
"""
Example program for exercising the "AutoML with Pycaret and CrateDB" article.
This is a standalone variant. A corresponding .ipynb Jupyter Notebook can usually
be found alongside this file.
"""
import os
import dotenv
import sqlalchemy as sa
import pandas as pd
import mlflow_cratedb # We need this import to use the CrateDB MLflow store
from pycaret.classification import *
import mlflow_cratedb # Required to enable the CrateDB MLflow adapter.
from pycaret.classification import setup, compare_models, tune_model, ensemble_model, blend_models, automl, \
evaluate_model, finalize_model, save_model, predict_model
from mlflow.sklearn import log_model


if os.path.exists(".env"):
dotenv.load_dotenv(".env", override=True)


# Configure database connection string.
dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}"
os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow"


def fetch_data():
dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}"
"""
Fetch data from CrateDB, using SQL and SQLAlchemy, and wrap result into pandas data frame.
"""
engine = sa.create_engine(dburi, echo=True)

with engine.connect() as conn:
with conn.execute(sa.text("SELECT * FROM pycaret_churn")) as cursor:
data = pd.DataFrame(cursor.fetchall(), columns=cursor.keys())
os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow"
return data


def run_experiment(data):
"""
Run an AutoML experiment using PyCaret, MLflow, and CrateDB.
"""
s = setup(
data,
data=data,
data_func=None,
target="Churn",
ignore_features=["customerID"],
log_experiment=True,
Expand All @@ -31,32 +49,30 @@ def run_experiment(data):

best_models = compare_models(sort="AUC", exclude=["lightgbm"], n_select=3)
tuned_models = [tune_model(model) for model in best_models]
_ = [ensemble_model(i, method="Bagging") for i in tuned_models]
[ensemble_model(i, method="Bagging") for i in tuned_models]

def try_ensemble_model(model):
try:
print(type(model))
# Attempt to ensemble the model with Boosting method
# Attempt to ensemble the model with Boosting method.
return ensemble_model(model, method="Boosting")
except Exception as e:
print("Can't apply boosting.")
return None

_ = [try_ensemble_model(i) for i in tuned_models]
_ = blend_models(estimator_list=tuned_models)
[try_ensemble_model(i) for i in tuned_models]
blend_models(estimator_list=tuned_models)
best_model = automl(optimize="AUC")

evaluate_model(best_model)
final_model = finalize_model(best_model)

if not os.path.exists("model"):
os.makedirs("model")

# Save the model to disk
_ = save_model(final_model, "model/classification_model")
# Save the model to disk.
os.makedirs("model", exist_ok=True)
save_model(final_model, "model/classification_model")
predict_model(final_model, s.X_test)

_ = log_model(
log_model(
sk_model=best_model,
artifact_path="model/classification_model",
registered_model_name=f"classification-model",
Expand Down
3 changes: 3 additions & 0 deletions topic/machine-learning/classification-automl/backlog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Backlog

- Describe / program how to import `churn-dataset.csv`.
74 changes: 74 additions & 0 deletions topic/machine-learning/classification-automl/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
[tool.pytest.ini_options]
minversion = "2.0"
addopts = """
-rfEX -p pytester --strict-markers --verbosity=3 --capture=no
"""
# --cov=. --cov-report=term-missing --cov-report=xml
env = [
"CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive",
"CRATE_USER=crate",
"CRATE_PASSWORD=",
"CRATE_HOST=localhost",
"CRATE_SSL=false",
"PYDEVD_DISABLE_FILE_VALIDATION=1",
]

#log_level = "DEBUG"
#log_cli_level = "DEBUG"

testpaths = [
"*.py",
]
xfail_strict = true
markers = [
]

# pytest-notebook settings
nb_test_files = true
nb_coverage = false
# 120 seconds is not enough on CI/GHA.
nb_exec_timeout = 240
nb_diff_replace = [
# Compensate output of `crash`.
'"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
# Compensate other outputs.
'"/cells/*/outputs/*/data/text/html" "T_....." "T_na"',
'"/cells/*/outputs/*/data/text/plain" "IPython.core.display.HTML object" "pandas.io.formats.style.Styler"',
'"/cells/*/outputs/*/data/text/plain" "pandas.io.formats.style.Styler at 0x.+" "pandas.io.formats.style.Styler"',
'"/cells/*/outputs/*/data/application/vnd.jupyter.widget-view+json" "model_id: .+" "model_id: na"',
'"/cells/*/outputs/*/data/text/html" "\>\d+\.\d+\<\/td\>" "0.3333"',
]
# NOTE(review): This comment mentions `vector_search.py`, which is not part of
# this topic folder — looks like a copy-paste leftover; confirm and update.
nb_diff_ignore = [
"/metadata/language_info",
"/metadata/widgets",
"/cells/*/execution_count",
"/cells/*/outputs/*/execution_count",
# Ignore images.
"/cells/*/outputs/*/data/image/png",
# FIXME: Those pacifiers should be revisited.
# Some are warnings, some are semantic ambiguities.
# Maybe they can be improved in one way or another,
# for improved QA.
"/cells/5/outputs",
"/cells/14/outputs",
    "/cells/16/outputs",
"/cells/18/outputs",
"/cells/24/outputs",
"/cells/30/outputs/0/data/application/vnd.jupyter.widget-view+json",
"/cells/34/outputs",
"/cells/36/outputs",
"/cells/40/outputs",
]

[tool.coverage.run]
branch = false

[tool.coverage.report]
fail_under = 0
show_missing = true
omit = [
"conftest.py",
"test*.py",
]
11 changes: 11 additions & 0 deletions topic/machine-learning/classification-automl/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Real.
# cratedb-toolkit
# pueblo[testing]

# Development.
cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main
pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main

# Workstation.
# --editable=/Users/amo/dev/crate/ecosystem/cratedb-retentions[io]
# --editable=/Users/amo/dev/pyveci/sources/pueblo[testing]
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# Real.
crate[sqlalchemy]
mlflow-cratedb==2.7.1
# mlflow-cratedb==2.7.1
plotly<5.19
pycaret[analysis,models,tuner,parallel,test]==3.1.0
python-dotenv<2
tqdm<5
werkzeug==2.2.3

# Development.
mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@testing
74 changes: 74 additions & 0 deletions topic/machine-learning/classification-automl/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
## About

Test cases for classification model examples with CrateDB, PyCaret and MLflow.


## Synopsis

Run all test cases.
```
pytest
```

Run individual test cases.
```
pytest -k file
pytest -k notebook
```
"""
from pathlib import Path

import pytest
from cratedb_toolkit.util import DatabaseAdapter
from pueblo.testing.folder import str_list, list_notebooks, list_python_files
from pueblo.testing.snippet import pytest_notebook, pytest_module_function

HERE = Path(__file__).parent


@pytest.fixture()
def cratedb() -> DatabaseAdapter:
    """
    Hand test cases a connection to CrateDB, wrapped with additional tooling.
    """
    adapter = DatabaseAdapter(dburi="crate://crate@localhost:4200")
    return adapter


@pytest.fixture(scope="function", autouse=True)
def db_reset(cratedb):
    """
    Reset database state before each test case.

    Currently this only drops the `pycaret_churn` table, so each test case
    starts from a blank slate with respect to the churn dataset.
    """
    cratedb.run_sql("DROP TABLE IF EXISTS pycaret_churn;")


@pytest.fixture()
def churn_dataset(cratedb):
    """
    Provide the test case with a provisioned churn dataset.

    Imports the CSV file from the `cratedb-datasets` repository into the
    `pycaret_churn` table, then runs `REFRESH TABLE` so the freshly written
    rows are visible to subsequent queries.
    """
    cratedb.import_csv_pandas(
        filepath="https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv",
        tablename="pycaret_churn",
    )
    cratedb.run_sql("REFRESH TABLE pycaret_churn;")


@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
def test_notebook(request, notebook: str):
    """
    From an individual Jupyter Notebook file, collect its cells as pytest
    test cases, and run them.

    Not using `NBRegressionFixture`, because it would need to be
    configured manually.
    """
    pytest_notebook(request=request, filepath=notebook)


@pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
def test_file(request, churn_dataset, pyfile: str):
    """
    From an individual Python file, collect and wrap the `main` function
    into a pytest test case, and run it.

    The `churn_dataset` fixture provisions the `pycaret_churn` table that
    the example program reads from.
    """
    # `str_list()` yields strings, not `Path` objects, so the parameter is
    # annotated `str`, consistent with `test_notebook`.
    pytest_module_function(request, pyfile)
Loading