From 088d12267d141930c1f0f9a9deadd771d5877245 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Mon, 6 Nov 2023 17:23:49 +0100 Subject: [PATCH 1/2] CI: Update Dependabot configuration to point to new `/topic` folder --- .github/dependabot.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 10829f8e..cf9f9aa6 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -63,14 +63,19 @@ updates: schedule: interval: "weekly" - # Frameworks. + # Topics. - - directory: "/framework/langchain" + - directory: "/topic/machine-learning/classification-automl" package-ecosystem: "pip" schedule: interval: "weekly" - - directory: "/framework/mlflow" + - directory: "/topic/machine-learning/llm-langchain" + package-ecosystem: "pip" + schedule: + interval: "weekly" + + - directory: "/topic/machine-learning/timeseries-basics" package-ecosystem: "pip" schedule: interval: "weekly" From 627773d7aaac57587171287a69ddb60e55012599 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Mon, 6 Nov 2023 16:53:48 +0100 Subject: [PATCH 2/2] AutoML: Add software tests and CI recipe --- .github/workflows/test-automl.yml | 75 +++++++++++ requirements.txt | 2 +- .../classification-automl/backlog.md | 3 + .../classification-automl/pyproject.toml | 45 +++++++ .../requirements-dev.txt | 6 + .../classification-automl/test.py | 117 ++++++++++++++++++ 6 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test-automl.yml create mode 100644 topic/machine-learning/classification-automl/backlog.md create mode 100644 topic/machine-learning/classification-automl/pyproject.toml create mode 100644 topic/machine-learning/classification-automl/requirements-dev.txt create mode 100644 topic/machine-learning/classification-automl/test.py diff --git a/.github/workflows/test-automl.yml b/.github/workflows/test-automl.yml new file mode 100644 index 00000000..468ccc1e --- /dev/null +++ 
b/.github/workflows/test-automl.yml @@ -0,0 +1,75 @@ +name: AutoML + +on: + pull_request: + branches: ~ + paths: + - '.github/workflows/test-automl.yml' + - 'topic/machine-learning/classification-automl/**' + - 'requirements.txt' + push: + branches: [ main ] + paths: + - '.github/workflows/test-automl.yml' + - 'topic/machine-learning/classification-automl/**' + - 'requirements.txt' + + # Allow job to be triggered manually. + workflow_dispatch: + + # Run job each night after CrateDB nightly has been published. + schedule: + - cron: '0 3 * * *' + +# Cancel in-progress jobs when pushing to the same branch. +concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +jobs: + test: + name: " + Python: ${{ matrix.python-version }} + CrateDB: ${{ matrix.cratedb-version }} + on ${{ matrix.os }}" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ 'ubuntu-latest' ] + python-version: [ '3.10' ] + cratedb-version: [ 'nightly' ] + + services: + cratedb: + image: crate/crate:nightly + ports: + - 4200:4200 + - 5432:5432 + + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + steps: + + - name: Acquire sources + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: | + requirements.txt + topic/machine-learning/classification-automl/requirements.txt + topic/machine-learning/classification-automl/requirements-dev.txt + + - name: Install utilities + run: | + pip install -r requirements.txt + + - name: Validate topic/machine-learning/classification-automl + run: | + ngr test --accept-no-venv topic/machine-learning/classification-automl diff --git a/requirements.txt b/requirements.txt index 00292c44..86add737 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pueblo[ngr]==0.0.2 +pueblo[ngr]==0.0.3 # Development. 
# pueblo[ngr] @ git+https://github.com/pyveci/pueblo.git@develop diff --git a/topic/machine-learning/classification-automl/backlog.md b/topic/machine-learning/classification-automl/backlog.md new file mode 100644 index 00000000..4c86c490 --- /dev/null +++ b/topic/machine-learning/classification-automl/backlog.md @@ -0,0 +1,3 @@ +# Backlog + +- Describe / program how to import `churn-dataset.csv`. diff --git a/topic/machine-learning/classification-automl/pyproject.toml b/topic/machine-learning/classification-automl/pyproject.toml new file mode 100644 index 00000000..b9e54d6b --- /dev/null +++ b/topic/machine-learning/classification-automl/pyproject.toml @@ -0,0 +1,45 @@ +[tool.pytest.ini_options] +minversion = "2.0" +addopts = """ + -rfEX -p pytester --strict-markers --verbosity=3 --capture=no + """ +# --cov=. --cov-report=term-missing --cov-report=xml +env = [ + "CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive", + "PYDEVD_DISABLE_FILE_VALIDATION=1", +] + +#log_level = "DEBUG" +#log_cli_level = "DEBUG" + +testpaths = [ + "*.py", +] +xfail_strict = true +markers = [ +] + +# pytest-notebook settings +nb_test_files = true +nb_coverage = true +nb_diff_replace = [ + # Compensate output of `crash`. + '"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"', +] +# `vector_search.py` does not include any output(s). 
+nb_diff_ignore = [ + "/metadata/language_info", + "/cells/*/execution_count", + "/cells/*/outputs/*/execution_count", +] + +[tool.coverage.run] +branch = false + +[tool.coverage.report] +fail_under = 0 +show_missing = true +omit = [ + "conftest.py", + "test*.py", +] diff --git a/topic/machine-learning/classification-automl/requirements-dev.txt b/topic/machine-learning/classification-automl/requirements-dev.txt new file mode 100644 index 00000000..1f1a8ead --- /dev/null +++ b/topic/machine-learning/classification-automl/requirements-dev.txt @@ -0,0 +1,6 @@ +coverage~=7.3 +ipykernel +pytest<8 +pytest-cov<5 +pytest-env<2 +pytest-notebook<0.9 diff --git a/topic/machine-learning/classification-automl/test.py b/topic/machine-learning/classification-automl/test.py new file mode 100644 index 00000000..c6f2fc79 --- /dev/null +++ b/topic/machine-learning/classification-automl/test.py @@ -0,0 +1,117 @@ +import importlib +import io +import sys +from pathlib import Path +from unittest import mock + +import pytest +from _pytest.python import Function + +HERE = Path(__file__).parent + + +def list_files(path: Path, pattern: str): + """ + Enumerate all files in given directory. + """ + files = path.glob(pattern) + files = [item.relative_to(path) for item in files] + return files + + +def list_notebooks(path: Path): + """ + Enumerate all Jupyter Notebook files found in given directory. + """ + return list_files(path, "**/*.ipynb") + + +def list_pyfiles(path: Path): + """ + Enumerate all regular Python files found in given directory. + """ + pyfiles = [] + for item in list_files(path, "**/*.py"): + if item.suffix != ".py" or item.name in ["conftest.py"] or item.name.startswith("test"): + continue + pyfiles.append(item) + return pyfiles + + +def str_list(things): + """ + Convert all items to strings, returning a lazy `map` iterator. + """ + return map(str, things) + + +@pytest.fixture(scope="function", autouse=True) +def db_init(): + """ + Initialize database. 
+ """ + run_sql(statement="DROP TABLE IF EXISTS pycaret_churn;") + + +def db_provision_churn_dataset(): + """ + Provision database. + """ + # FIXME: `import_csv` is not defined. + import_csv(file=HERE / "churn-dataset.csv") + run_sql(statement="REFRESH TABLE churn_dataset;") + + +def run_sql(statement: str = None, file: str = None): + """ + Run SQL from string or file. + """ + import crate.crash.command + sys.argv = ["foo", "--schema=testdrive"] + if statement: + sys.argv += ["--command", statement] + if file: + sys.stdin = io.StringIO(Path(file).read_text()) + with \ + mock.patch("crate.crash.repl.SQLCompleter._populate_keywords"), \ + mock.patch("crate.crash.command.CrateShell.close"): + try: + crate.crash.command.main() + except SystemExit as ex: + if ex.code != 0: + raise + + +@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE))) +def test_notebook(request, notebook: str): + """ + From individual Jupyter Notebook file, collect cells as pytest + test cases, and run them. + + Not using `NBRegressionFixture`, because it would manually need to be configured. + """ + from _pytest._py.path import LocalPath + from pytest_notebook.plugin import pytest_collect_file + tests = pytest_collect_file(LocalPath(notebook), request.node) + for test in tests.collect(): + test.runtest() + + +@pytest.mark.parametrize("pyfile", str_list(list_pyfiles(HERE))) +def test_file(request, pyfile: Path): + """ + From individual Python file, collect and wrap the `main` function into a test case. + """ + + # TODO: Make configurable. + entrypoint_symbol = "main" + + # `.py` file needs provisioning. + if str(pyfile).endswith(".py"): + db_provision_churn_dataset() + + path = Path(pyfile) + mod = importlib.import_module(path.stem) + fun = getattr(mod, entrypoint_symbol) + f = Function.from_parent(request.node, name="main", callobj=fun) + f.runtest()