Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ updates:

# Topics.

- directory: "/topic/machine-learning/classification-automl"
package-ecosystem: "pip"
schedule:
interval: "weekly"

- directory: "/topic/machine-learning/llm-langchain"
package-ecosystem: "pip"
schedule:
Expand Down
75 changes: 75 additions & 0 deletions .github/workflows/test-automl.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
name: AutoML

on:
pull_request:
branches: ~
paths:
- '.github/workflows/test-automl.yml'
- 'topic/machine-learning/classification-automl/**'
- 'requirements.txt'
push:
branches: [ main ]
paths:
- '.github/workflows/test-automl.yml'
- 'topic/machine-learning/classification-automl/**'
- 'requirements.txt'

# Allow job to be triggered manually.
workflow_dispatch:

# Run job each night after CrateDB nightly has been published.
schedule:
- cron: '0 3 * * *'

# Cancel in-progress jobs when pushing to the same branch.
concurrency:
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.ref }}

jobs:
test:
name: "
Python: ${{ matrix.python-version }}
CrateDB: ${{ matrix.cratedb-version }}
on ${{ matrix.os }}"
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ 'ubuntu-latest' ]
python-version: [ '3.10' ]
cratedb-version: [ 'nightly' ]

services:
cratedb:
image: crate/crate:nightly
ports:
- 4200:4200
- 5432:5432

env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

steps:

- name: Acquire sources
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: |
requirements.txt
topic/machine-learning/classification-automl/requirements.txt
topic/machine-learning/classification-automl/requirements-dev.txt

- name: Install utilities
run: |
pip install -r requirements.txt

- name: Validate topic/machine-learning/classification-automl
run: |
ngr test --accept-no-venv topic/machine-learning/classification-automl
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,12 @@
" dotenv.load_dotenv(\".env\", override=True)\n",
"\n",
"dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n",
"engine = sa.create_engine(dburi, echo=True)\n",
"engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n",
"df = pd.read_csv(\"https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv\")\n",
"\n",
"with engine.connect() as conn:\n",
" df.to_sql(\"pycaret_churn\", conn, index=False, chunksize=1000, if_exists=\"replace\")\n"
" df.to_sql(\"pycaret_churn\", conn, index=False, chunksize=1000, if_exists=\"replace\")\n",
" conn.execute(sa.text(\"REFRESH TABLE pycaret_churn;\"))"
]
},
{
Expand All @@ -206,13 +207,13 @@
"import plotly\n",
"import plotly.express as plotly_express\n",
"import plotly.graph_objects as go\n",
"import mlflow_cratedb # We need this import to use the CrateDB MLflow store\n",
"import mlflow_cratedb # Required to enable the CrateDB MLflow adapter.\n",
"\n",
"if os.path.exists(\".env\"):\n",
" dotenv.load_dotenv(\".env\", override=True)\n",
"\n",
"dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n",
"engine = sa.create_engine(dburi, echo=True)\n",
"engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n",
"\n",
"with engine.connect() as conn:\n",
" with conn.execute(sa.text(\"SELECT * FROM pycaret_churn\")) as cursor:\n",
Expand Down Expand Up @@ -3373,7 +3374,7 @@
"source": [
"os.environ[\n",
" \"MLFLOW_TRACKING_URI\"\n",
"] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl=true&schema=mlflow\""
"] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}&schema=mlflow\""
]
},
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,28 +1,46 @@
"""
Example program for exercising the "AutoML with Pycaret and CrateDB" article.
This is a standalone variant. A corresponding .ipynb Jupyter Notebook can usually
be found alongside this file.
"""
import os
import dotenv
import sqlalchemy as sa
import pandas as pd
import mlflow_cratedb # We need this import to use the CrateDB MLflow store
from pycaret.classification import *
import mlflow_cratedb # Required to enable the CrateDB MLflow adapter.
from pycaret.classification import setup, compare_models, tune_model, ensemble_model, blend_models, automl, \
evaluate_model, finalize_model, save_model, predict_model
from mlflow.sklearn import log_model


if os.path.exists(".env"):
dotenv.load_dotenv(".env", override=True)


# Configure database connection string.
dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}"
os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow"


def fetch_data():
dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}"
"""
Fetch data from CrateDB, using SQL and SQLAlchemy, and wrap result into pandas data frame.
"""
engine = sa.create_engine(dburi, echo=True)

with engine.connect() as conn:
with conn.execute(sa.text("SELECT * FROM pycaret_churn")) as cursor:
data = pd.DataFrame(cursor.fetchall(), columns=cursor.keys())
os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow"
return data


def run_experiment(data):
"""
Run an AutoML experiment using PyCaret, MLflow, and CrateDB.
"""
s = setup(
data,
data=data,
data_func=None,
target="Churn",
ignore_features=["customerID"],
log_experiment=True,
Expand All @@ -31,32 +49,30 @@ def run_experiment(data):

best_models = compare_models(sort="AUC", exclude=["lightgbm"], n_select=3)
tuned_models = [tune_model(model) for model in best_models]
_ = [ensemble_model(i, method="Bagging") for i in tuned_models]
[ensemble_model(i, method="Bagging") for i in tuned_models]

def try_ensemble_model(model):
try:
print(type(model))
# Attempt to ensemble the model with Boosting method
# Attempt to ensemble the model with Boosting method.
return ensemble_model(model, method="Boosting")
except Exception as e:
print("Can't apply boosting.")
return None

_ = [try_ensemble_model(i) for i in tuned_models]
_ = blend_models(estimator_list=tuned_models)
[try_ensemble_model(i) for i in tuned_models]
blend_models(estimator_list=tuned_models)
best_model = automl(optimize="AUC")

evaluate_model(best_model)
final_model = finalize_model(best_model)

if not os.path.exists("model"):
os.makedirs("model")

# Save the model to disk
_ = save_model(final_model, "model/classification_model")
# Save the model to disk.
os.makedirs("model", exist_ok=True)
save_model(final_model, "model/classification_model")
predict_model(final_model, s.X_test)

_ = log_model(
log_model(
sk_model=best_model,
artifact_path="model/classification_model",
registered_model_name=f"classification-model",
Expand Down
3 changes: 3 additions & 0 deletions topic/machine-learning/classification-automl/backlog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Backlog

- Describe / program how to import `churn-dataset.csv`.
74 changes: 74 additions & 0 deletions topic/machine-learning/classification-automl/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
[tool.pytest.ini_options]
minversion = "2.0"
addopts = """
-rfEX -p pytester --strict-markers --verbosity=3 --capture=no
"""
# --cov=. --cov-report=term-missing --cov-report=xml
env = [
"CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive",
"CRATE_USER=crate",
"CRATE_PASSWORD=",
"CRATE_HOST=localhost",
"CRATE_SSL=false",
"PYDEVD_DISABLE_FILE_VALIDATION=1",
]

#log_level = "DEBUG"
#log_cli_level = "DEBUG"

testpaths = [
"*.py",
]
xfail_strict = true
markers = [
]

# pytest-notebook settings
nb_test_files = true
nb_coverage = false
# 120 seconds is not enough on CI/GHA.
nb_exec_timeout = 240
nb_diff_replace = [
# Compensate output of `crash`.
'"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
# Compensate other outputs.
'"/cells/*/outputs/*/data/text/html" "T_....." "T_na"',
'"/cells/*/outputs/*/data/text/plain" "IPython.core.display.HTML object" "pandas.io.formats.style.Styler"',
'"/cells/*/outputs/*/data/text/plain" "pandas.io.formats.style.Styler at 0x.+" "pandas.io.formats.style.Styler"',
'"/cells/*/outputs/*/data/application/vnd.jupyter.widget-view+json" "model_id: .+" "model_id: na"',
'"/cells/*/outputs/*/data/text/html" "\>\d+\.\d+\<\/td\>" "0.3333"',
]
# NOTE(review): This comment mentions `vector_search.py`, which is not part of
# this topic folder — looks like a copy-paste leftover; confirm and update.
nb_diff_ignore = [
"/metadata/language_info",
"/metadata/widgets",
"/cells/*/execution_count",
"/cells/*/outputs/*/execution_count",
# Ignore images.
"/cells/*/outputs/*/data/image/png",
# FIXME: Those pacifiers should be revisited.
# Some are warnings, some are semantic ambiguities.
# Maybe they can be improved in one way or another,
# for improved QA.
"/cells/5/outputs",
"/cells/14/outputs",
    "/cells/16/outputs",
"/cells/18/outputs",
"/cells/24/outputs",
"/cells/30/outputs/0/data/application/vnd.jupyter.widget-view+json",
"/cells/34/outputs",
"/cells/36/outputs",
"/cells/40/outputs",
]

[tool.coverage.run]
branch = false

[tool.coverage.report]
fail_under = 0
show_missing = true
omit = [
"conftest.py",
"test*.py",
]
11 changes: 11 additions & 0 deletions topic/machine-learning/classification-automl/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Real.
# cratedb-toolkit
# pueblo[testing]

# Development.
cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main
pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main

# Workstation.
# --editable=/Users/amo/dev/crate/ecosystem/cratedb-retentions[io]
# --editable=/Users/amo/dev/pyveci/sources/pueblo[testing]
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# Real.
crate[sqlalchemy]
mlflow-cratedb==2.7.1
# mlflow-cratedb==2.7.1
plotly<5.19
pycaret[analysis,models,tuner,parallel,test]==3.1.0
python-dotenv<2
tqdm<5
werkzeug==2.2.3

# Development.
mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@testing
74 changes: 74 additions & 0 deletions topic/machine-learning/classification-automl/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
## About

Test cases for classification model examples with CrateDB, PyCaret and MLflow.


## Synopsis

Run all test cases.
```
pytest
```

Run individual test cases.
```
pytest -k file
pytest -k notebook
```
"""
from pathlib import Path

import pytest
from cratedb_toolkit.util import DatabaseAdapter
from pueblo.testing.folder import str_list, list_notebooks, list_python_files
from pueblo.testing.snippet import pytest_notebook, pytest_module_function

HERE = Path(__file__).parent


@pytest.fixture()
def cratedb() -> DatabaseAdapter:
    """
    Hand test cases a connection to CrateDB, wrapped with additional tooling.
    """
    adapter = DatabaseAdapter(dburi="crate://crate@localhost:4200")
    return adapter


@pytest.fixture(scope="function", autouse=True)
def db_reset(cratedb):
    """
    Reset database state before each test case.

    Currently this only drops the `pycaret_churn` table, so each test case
    starts from a blank slate with respect to the churn dataset.
    """
    cratedb.run_sql("DROP TABLE IF EXISTS pycaret_churn;")


@pytest.fixture()
def churn_dataset(cratedb):
    """
    Provide the test case with a provisioned churn dataset.

    Imports the CSV file from the `cratedb-datasets` repository into the
    `pycaret_churn` table, then runs `REFRESH TABLE` so the freshly written
    rows are visible to subsequent queries.
    """
    cratedb.import_csv_pandas(
        filepath="https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv",
        tablename="pycaret_churn",
    )
    cratedb.run_sql("REFRESH TABLE pycaret_churn;")


@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
def test_notebook(request, notebook: str):
    """
    From an individual Jupyter Notebook file, collect its cells as pytest
    test cases, and run them.

    Not using `NBRegressionFixture`, because it would need to be
    configured manually.
    """
    pytest_notebook(request=request, filepath=notebook)


@pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
def test_file(request, churn_dataset, pyfile: str):
    """
    From an individual Python file, collect and wrap the `main` function
    into a pytest test case, and run it.

    The `churn_dataset` fixture provisions the `pycaret_churn` table that
    the example program reads from.
    """
    # `str_list()` yields strings, not `Path` objects, so the parameter is
    # annotated `str`, consistent with `test_notebook`.
    pytest_module_function(request, pyfile)
Loading