diff --git a/.github/dependabot.yml b/.github/dependabot.yml index dfc6f6a8..8b3ea780 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -65,6 +65,11 @@ updates: # Topics. + - directory: "/topic/machine-learning/classification-automl" + package-ecosystem: "pip" + schedule: + interval: "weekly" + - directory: "/topic/machine-learning/llm-langchain" package-ecosystem: "pip" schedule: diff --git a/.github/workflows/test-automl.yml b/.github/workflows/test-automl.yml new file mode 100644 index 00000000..468ccc1e --- /dev/null +++ b/.github/workflows/test-automl.yml @@ -0,0 +1,75 @@ +name: AutoML + +on: + pull_request: + branches: ~ + paths: + - '.github/workflows/test-automl.yml' + - 'topic/machine-learning/classification-automl/**' + - 'requirements.txt' + push: + branches: [ main ] + paths: + - '.github/workflows/test-automl.yml' + - 'topic/machine-learning/classification-automl/**' + - 'requirements.txt' + + # Allow job to be triggered manually. + workflow_dispatch: + + # Run job each night after CrateDB nightly has been published. + schedule: + - cron: '0 3 * * *' + +# Cancel in-progress jobs when pushing to the same branch. 
+concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +jobs: + test: + name: " + Python: ${{ matrix.python-version }} + CrateDB: ${{ matrix.cratedb-version }} + on ${{ matrix.os }}" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ 'ubuntu-latest' ] + python-version: [ '3.10' ] + cratedb-version: [ 'nightly' ] + + services: + cratedb: + image: crate/crate:nightly + ports: + - 4200:4200 + - 5432:5432 + + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + steps: + + - name: Acquire sources + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: | + requirements.txt + topic/machine-learning/classification-automl/requirements.txt + topic/machine-learning/classification-automl/requirements-dev.txt + + - name: Install utilities + run: | + pip install -r requirements.txt + + - name: Validate topic/machine-learning/classification-automl + run: | + ngr test --accept-no-venv topic/machine-learning/classification-automl diff --git a/topic/machine-learning/classification-automl/automl_classification_with_pycaret.ipynb b/topic/machine-learning/classification-automl/automl_classification_with_pycaret.ipynb index 2838dc04..7cee4698 100644 --- a/topic/machine-learning/classification-automl/automl_classification_with_pycaret.ipynb +++ b/topic/machine-learning/classification-automl/automl_classification_with_pycaret.ipynb @@ -175,11 +175,12 @@ " dotenv.load_dotenv(\".env\", override=True)\n", "\n", "dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n", - "engine = sa.create_engine(dburi, echo=True)\n", + "engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n", "df = 
pd.read_csv(\"https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv\")\n", "\n", "with engine.connect() as conn:\n", - " df.to_sql(\"pycaret_churn\", conn, index=False, chunksize=1000, if_exists=\"replace\")\n" + " df.to_sql(\"pycaret_churn\", conn, index=False, chunksize=1000, if_exists=\"replace\")\n", + " conn.execute(sa.text(\"REFRESH TABLE pycaret_churn;\"))" ] }, { @@ -206,13 +207,13 @@ "import plotly\n", "import plotly.express as plotly_express\n", "import plotly.graph_objects as go\n", - "import mlflow_cratedb # We need this import to use the CrateDB MLflow store\n", + "import mlflow_cratedb # Required to enable the CrateDB MLflow adapter.\n", "\n", "if os.path.exists(\".env\"):\n", " dotenv.load_dotenv(\".env\", override=True)\n", "\n", "dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n", - "engine = sa.create_engine(dburi, echo=True)\n", + "engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n", "\n", "with engine.connect() as conn:\n", " with conn.execute(sa.text(\"SELECT * FROM pycaret_churn\")) as cursor:\n", @@ -3373,7 +3374,7 @@ "source": [ "os.environ[\n", " \"MLFLOW_TRACKING_URI\"\n", - "] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl=true&schema=mlflow\"" + "] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}&schema=mlflow\"" ] }, { diff --git a/topic/machine-learning/classification-automl/automl_classification_with_pycaret.py b/topic/machine-learning/classification-automl/automl_classification_with_pycaret.py index a2ecb96c..1da8533b 100644 --- a/topic/machine-learning/classification-automl/automl_classification_with_pycaret.py +++ b/topic/machine-learning/classification-automl/automl_classification_with_pycaret.py @@ -1,28 +1,46 @@ +""" +Example program for 
exercising the "AutoML with PyCaret and CrateDB" article. +This is a standalone variant. A corresponding .ipynb Jupyter Notebook can usually +be found side-by-side to this file. +""" import os import dotenv import sqlalchemy as sa import pandas as pd -import mlflow_cratedb # We need this import to use the CrateDB MLflow store -from pycaret.classification import * +import mlflow_cratedb # Required to enable the CrateDB MLflow adapter. +from pycaret.classification import setup, compare_models, tune_model, ensemble_model, blend_models, automl, \ + evaluate_model, finalize_model, save_model, predict_model from mlflow.sklearn import log_model + if os.path.exists(".env"): dotenv.load_dotenv(".env", override=True) +# Configure database connection string. +dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}" +os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow" + + def fetch_data(): - dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}" + """ + Fetch data from CrateDB, using SQL and SQLAlchemy, and wrap the result into a pandas DataFrame. + """ engine = sa.create_engine(dburi, echo=True) with engine.connect() as conn: with conn.execute(sa.text("SELECT * FROM pycaret_churn")) as cursor: data = pd.DataFrame(cursor.fetchall(), columns=cursor.keys()) - os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow" + return data def run_experiment(data): + """ + Run an AutoML experiment using PyCaret, MLflow, and CrateDB. 
+ """ s = setup( - data, + data=data, + data_func=None, target="Churn", ignore_features=["customerID"], log_experiment=True, @@ -31,32 +49,30 @@ def run_experiment(data): best_models = compare_models(sort="AUC", exclude=["lightgbm"], n_select=3) tuned_models = [tune_model(model) for model in best_models] - _ = [ensemble_model(i, method="Bagging") for i in tuned_models] + [ensemble_model(i, method="Bagging") for i in tuned_models] def try_ensemble_model(model): try: print(type(model)) - # Attempt to ensemble the model with Boosting method + # Attempt to ensemble the model with Boosting method. return ensemble_model(model, method="Boosting") except Exception as e: print("Can't apply boosting.") return None - _ = [try_ensemble_model(i) for i in tuned_models] - _ = blend_models(estimator_list=tuned_models) + [try_ensemble_model(i) for i in tuned_models] + blend_models(estimator_list=tuned_models) best_model = automl(optimize="AUC") evaluate_model(best_model) final_model = finalize_model(best_model) - if not os.path.exists("model"): - os.makedirs("model") - - # Save the model to disk - _ = save_model(final_model, "model/classification_model") + # Save the model to disk. + os.makedirs("model", exist_ok=True) + save_model(final_model, "model/classification_model") predict_model(final_model, s.X_test) - _ = log_model( + log_model( sk_model=best_model, artifact_path="model/classification_model", registered_model_name=f"classification-model", diff --git a/topic/machine-learning/classification-automl/backlog.md b/topic/machine-learning/classification-automl/backlog.md new file mode 100644 index 00000000..4c86c490 --- /dev/null +++ b/topic/machine-learning/classification-automl/backlog.md @@ -0,0 +1,3 @@ +# Backlog + +- Describe / program how to import `churn-dataset.csv`. 
diff --git a/topic/machine-learning/classification-automl/pyproject.toml b/topic/machine-learning/classification-automl/pyproject.toml new file mode 100644 index 00000000..1e6332ed --- /dev/null +++ b/topic/machine-learning/classification-automl/pyproject.toml @@ -0,0 +1,74 @@ +[tool.pytest.ini_options] +minversion = "2.0" +addopts = """ + -rfEX -p pytester --strict-markers --verbosity=3 --capture=no + """ +# --cov=. --cov-report=term-missing --cov-report=xml +env = [ + "CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive", + "CRATE_USER=crate", + "CRATE_PASSWORD=", + "CRATE_HOST=localhost", + "CRATE_SSL=false", + "PYDEVD_DISABLE_FILE_VALIDATION=1", +] + +#log_level = "DEBUG" +#log_cli_level = "DEBUG" + +testpaths = [ + "*.py", +] +xfail_strict = true +markers = [ +] + +# pytest-notebook settings +nb_test_files = true +nb_coverage = false +# 120 seconds is too short on CI/GHA. +nb_exec_timeout = 240 +nb_diff_replace = [ + # Compensate output of `crash`. + '"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"', + # Compensate other outputs. + '"/cells/*/outputs/*/data/text/html" "T_....." "T_na"', + '"/cells/*/outputs/*/data/text/plain" "IPython.core.display.HTML object" "pandas.io.formats.style.Styler"', + '"/cells/*/outputs/*/data/text/plain" "pandas.io.formats.style.Styler at 0x.+" "pandas.io.formats.style.Styler"', + '"/cells/*/outputs/*/data/application/vnd.jupyter.widget-view+json" "model_id: .+" "model_id: na"', + '"/cells/*/outputs/*/data/text/html" "\>\d+\.\d+\<\/td\>" "0.3333"', +] +# `vector_search.py` does not include any output(s). NOTE(review): this file name looks carried over from another topic; confirm. +nb_diff_ignore = [ + "/metadata/language_info", + "/metadata/widgets", + "/cells/*/execution_count", + "/cells/*/outputs/*/execution_count", + # Ignore images. + "/cells/*/outputs/*/data/image/png", + # FIXME: Those pacifiers should be revisited. + # Some are warnings, some are semantic ambiguities. + # Maybe they can be improved in one way or another, + # for improved QA. 
+ "/cells/5/outputs", + "/cells/14/outputs", + "/cells/16/outputs", + "/cells/16/outputs", + "/cells/18/outputs", + "/cells/24/outputs", + "/cells/30/outputs/0/data/application/vnd.jupyter.widget-view+json", + "/cells/34/outputs", + "/cells/36/outputs", + "/cells/40/outputs", +] + +[tool.coverage.run] +branch = false + +[tool.coverage.report] +fail_under = 0 +show_missing = true +omit = [ + "conftest.py", + "test*.py", +] diff --git a/topic/machine-learning/classification-automl/requirements-dev.txt b/topic/machine-learning/classification-automl/requirements-dev.txt new file mode 100644 index 00000000..fb7a1855 --- /dev/null +++ b/topic/machine-learning/classification-automl/requirements-dev.txt @@ -0,0 +1,11 @@ +# Real. +# cratedb-toolkit +# pueblo[testing] + +# Development. +cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main +pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main + +# Workstation. +# --editable=/Users/amo/dev/crate/ecosystem/cratedb-retentions[io] +# --editable=/Users/amo/dev/pyveci/sources/pueblo[testing] diff --git a/topic/machine-learning/classification-automl/requirements.txt b/topic/machine-learning/classification-automl/requirements.txt index 02b07112..885e0404 100644 --- a/topic/machine-learning/classification-automl/requirements.txt +++ b/topic/machine-learning/classification-automl/requirements.txt @@ -1,7 +1,11 @@ +# Real. crate[sqlalchemy] -mlflow-cratedb==2.7.1 +# mlflow-cratedb==2.7.1 plotly<5.19 pycaret[analysis,models,tuner,parallel,test]==3.1.0 python-dotenv<2 tqdm<5 werkzeug==2.2.3 + +# Development. 
+mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@testing diff --git a/topic/machine-learning/classification-automl/test.py b/topic/machine-learning/classification-automl/test.py new file mode 100644 index 00000000..c9df264e --- /dev/null +++ b/topic/machine-learning/classification-automl/test.py @@ -0,0 +1,74 @@ +""" +## About + +Test cases for classification model examples with CrateDB, PyCaret and MLflow. + + +## Synopsis + +Run all test cases. +``` +pytest +``` + +Run individual test cases. +``` +pytest -k file +pytest -k notebook +``` +""" +from pathlib import Path + +import pytest +from cratedb_toolkit.util import DatabaseAdapter +from pueblo.testing.folder import str_list, list_notebooks, list_python_files +from pueblo.testing.snippet import pytest_notebook, pytest_module_function + +HERE = Path(__file__).parent + + +@pytest.fixture() +def cratedb() -> DatabaseAdapter: + """ + Provide test cases with a connection to CrateDB, with additional tooling. + """ + return DatabaseAdapter(dburi="crate://crate@localhost:4200") + + +@pytest.fixture(scope="function", autouse=True) +def db_reset(cratedb): + """ + Reset database before each test case. + """ + cratedb.run_sql("DROP TABLE IF EXISTS pycaret_churn;") + + +@pytest.fixture() +def churn_dataset(cratedb): + """ + Provide test case with a provisioned dataset. + """ + cratedb.import_csv_pandas( + filepath="https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv", + tablename="pycaret_churn", + ) + cratedb.run_sql("REFRESH TABLE pycaret_churn;") + + +@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE))) +def test_notebook(request, notebook: str): + """ + From individual Jupyter Notebook file, collect cells as pytest + test cases, and run them. + + Not using `NBRegressionFixture`, because it would manually need to be configured. 
+ """ + pytest_notebook(request=request, filepath=notebook) + + +@pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE))) +def test_file(request, churn_dataset, pyfile: Path): + """ + From individual Python file, collect and wrap the `main` function into a test case, and run it. + """ + pytest_module_function(request, pyfile) diff --git a/topic/machine-learning/llm-langchain/requirements-dev.txt b/topic/machine-learning/llm-langchain/requirements-dev.txt index ed7a3fd0..d100aa9e 100644 --- a/topic/machine-learning/llm-langchain/requirements-dev.txt +++ b/topic/machine-learning/llm-langchain/requirements-dev.txt @@ -3,7 +3,7 @@ # pueblo[testing] # Development. -cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@amo/add-import-csv +cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main # Workstation. diff --git a/topic/machine-learning/mlops-mlflow/requirements-dev.txt b/topic/machine-learning/mlops-mlflow/requirements-dev.txt index ed7a3fd0..d100aa9e 100644 --- a/topic/machine-learning/mlops-mlflow/requirements-dev.txt +++ b/topic/machine-learning/mlops-mlflow/requirements-dev.txt @@ -3,7 +3,7 @@ # pueblo[testing] # Development. -cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@amo/add-import-csv +cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main # Workstation.