11 changes: 8 additions & 3 deletions .github/dependabot.yml
@@ -63,14 +63,19 @@ updates:
     schedule:
       interval: "weekly"
 
-  # Frameworks.
+  # Topics.
 
-  - directory: "/framework/langchain"
+  - directory: "/topic/machine-learning/classification-automl"
     package-ecosystem: "pip"
     schedule:
       interval: "weekly"
 
-  - directory: "/framework/mlflow"
+  - directory: "/topic/machine-learning/llm-langchain"
     package-ecosystem: "pip"
     schedule:
       interval: "weekly"
+
+  - directory: "/topic/machine-learning/timeseries-basics"
+    package-ecosystem: "pip"
+    schedule:
+      interval: "weekly"
75 changes: 75 additions & 0 deletions .github/workflows/test-automl.yml
@@ -0,0 +1,75 @@
name: AutoML

on:
  pull_request:
    branches: ~
    paths:
    - '.github/workflows/test-automl.yml'
    - 'topic/machine-learning/classification-automl/**'
    - 'requirements.txt'
  push:
    branches: [ main ]
    paths:
    - '.github/workflows/test-automl.yml'
    - 'topic/machine-learning/classification-automl/**'
    - 'requirements.txt'

  # Allow job to be triggered manually.
  workflow_dispatch:

  # Run job each night after CrateDB nightly has been published.
  schedule:
    - cron: '0 3 * * *'

# Cancel in-progress jobs when pushing to the same branch.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}

jobs:
  test:
    name: "
      Python: ${{ matrix.python-version }}
      CrateDB: ${{ matrix.cratedb-version }}
      on ${{ matrix.os }}"
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ 'ubuntu-latest' ]
        python-version: [ '3.10' ]
        cratedb-version: [ 'nightly' ]

    services:
      cratedb:
        image: crate/crate:nightly
        ports:
          - 4200:4200
          - 5432:5432

    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

    steps:

      - name: Acquire sources
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
          cache: 'pip'
          cache-dependency-path: |
            requirements.txt
            topic/machine-learning/classification-automl/requirements.txt
            topic/machine-learning/classification-automl/requirements-dev.txt

      - name: Install utilities
        run: |
          pip install -r requirements.txt

      - name: Validate topic/machine-learning/classification-automl
        run: |
          ngr test --accept-no-venv topic/machine-learning/classification-automl
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
-pueblo[ngr]==0.0.2
+pueblo[ngr]==0.0.3

# Development.
# pueblo[ngr] @ git+https://github.com/pyveci/pueblo.git@develop
3 changes: 3 additions & 0 deletions topic/machine-learning/classification-automl/backlog.md
@@ -0,0 +1,3 @@
# Backlog

- Describe / program how to import `churn-dataset.csv`.
45 changes: 45 additions & 0 deletions topic/machine-learning/classification-automl/pyproject.toml
@@ -0,0 +1,45 @@
[tool.pytest.ini_options]
minversion = "2.0"
addopts = """
-rfEX -p pytester --strict-markers --verbosity=3 --capture=no
"""
# --cov=. --cov-report=term-missing --cov-report=xml
env = [
  "CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive",
  "PYDEVD_DISABLE_FILE_VALIDATION=1",
]

#log_level = "DEBUG"
#log_cli_level = "DEBUG"

testpaths = [
  "*.py",
]
xfail_strict = true
markers = [
]

# pytest-notebook settings
nb_test_files = true
nb_coverage = true
nb_diff_replace = [
  # Compensate output of `crash`.
  '"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
]
# `vector_search.py` does not include any output(s).
nb_diff_ignore = [
  "/metadata/language_info",
  "/cells/*/execution_count",
  "/cells/*/outputs/*/execution_count",
]

[tool.coverage.run]
branch = false

[tool.coverage.report]
fail_under = 0
show_missing = true
omit = [
  "conftest.py",
  "test*.py",
]
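
The `nb_diff_replace` rule above keeps notebook regression diffs stable: `crash` prints wall-clock timings such as `(0.034 sec)` into cell outputs, and those would differ on every run. A minimal Python sketch of the same normalization, shown only to illustrate what the regex matches (the sample output string here is made up):

```python
import re

# Hypothetical sample of a `crash` cell output; the timing varies per run.
sample = "INSERT OK, 1 row affected (0.034 sec)"

# Same pattern and replacement as the `nb_diff_replace` entry above:
# any "(N.NNN sec)" timing is rewritten to the constant "(0.000 sec)".
normalized = re.sub(r"\(\d.\d+ sec\)", "(0.000 sec)", sample)
print(normalized)  # INSERT OK, 1 row affected (0.000 sec)
```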
6 changes: 6 additions & 0 deletions topic/machine-learning/classification-automl/requirements-dev.txt
@@ -0,0 +1,6 @@
coverage~=7.3
ipykernel
pytest<8
pytest-cov<5
pytest-env<2
pytest-notebook<0.9
117 changes: 117 additions & 0 deletions topic/machine-learning/classification-automl/test.py
@@ -0,0 +1,117 @@
import importlib
import io
import sys
from pathlib import Path
from unittest import mock

import pytest
from _pytest.python import Function

HERE = Path(__file__).parent


def list_files(path: Path, pattern: str):
    """
    Enumerate all files in the given directory.
    """
    files = path.glob(pattern)
    files = [item.relative_to(path) for item in files]
    return files


def list_notebooks(path: Path):
    """
    Enumerate all Jupyter Notebook files found in the given directory.
    """
    return list_files(path, "**/*.ipynb")


def list_pyfiles(path: Path):
    """
    Enumerate all regular Python files found in the given directory.
    """
    pyfiles = []
    for item in list_files(path, "**/*.py"):
        if item.suffix != ".py" or item.name in ["conftest.py"] or item.name.startswith("test"):
            continue
        pyfiles.append(item)
    return pyfiles


def str_list(things):
    """
    Convert a list of things into a list of strings.
    """
    return map(str, things)


@pytest.fixture(scope="function", autouse=True)
def db_init():
    """
    Initialize the database.
    """
    run_sql(statement="DROP TABLE IF EXISTS pycaret_churn;")


def db_provision_churn_dataset():
    """
    Provision the database.
    """
    # FIXME: `import_csv` is not defined.
    import_csv(file=HERE / "churn-dataset.csv")
    run_sql(statement="REFRESH TABLE churn_dataset;")


def run_sql(statement: str = None, file: str = None):
    """
    Run SQL from a string or a file.
    """
    import crate.crash.command
    sys.argv = ["foo", "--schema=testdrive"]  # "foo" is only a placeholder program name for crash.
    if statement:
        sys.argv += ["--command", statement]
    if file:
        sys.stdin = io.StringIO(Path(file).read_text())
    with \
            mock.patch("crate.crash.repl.SQLCompleter._populate_keywords"), \
            mock.patch("crate.crash.command.CrateShell.close"):
        try:
            crate.crash.command.main()
        except SystemExit as ex:
            if ex.code != 0:
                raise


@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
def test_notebook(request, notebook: str):
    """
    From an individual Jupyter Notebook file, collect cells as pytest
    test cases, and run them.

    Not using `NBRegressionFixture`, because it would need to be configured manually.
    """
    from _pytest._py.path import LocalPath
    from pytest_notebook.plugin import pytest_collect_file
    tests = pytest_collect_file(LocalPath(notebook), request.node)
    for test in tests.collect():
        test.runtest()


@pytest.mark.parametrize("pyfile", str_list(list_pyfiles(HERE)))
def test_file(request, pyfile: Path):
    """
    From an individual Python file, collect and wrap the `main` function into a test case.
    """

    # TODO: Make configurable.
    entrypoint_symbol = "main"

    # The `.py` file needs the database to be provisioned.
    if str(pyfile).endswith(".py"):
        db_provision_churn_dataset()

    path = Path(pyfile)
    mod = importlib.import_module(path.stem)
    fun = getattr(mod, entrypoint_symbol)
    f = Function.from_parent(request.node, name="main", callobj=fun)
    f.runtest()
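
The FIXME in `db_provision_churn_dataset()` and the backlog item above both point at the missing `import_csv` helper. A possible shape for it, purely as a hedged sketch and not part of this changeset: it assumes pandas plus the CrateDB SQLAlchemy dialect are available, and that the target table is `churn_dataset`, matching the `REFRESH TABLE` statement; note that the `db_init` fixture currently drops `pycaret_churn`, so the two table names would still need to be reconciled.

```python
# Hypothetical sketch only -- not part of this changeset.
# Assumes `pandas` and the CrateDB SQLAlchemy dialect are installed,
# and that the target table is `churn_dataset`, matching the
# `REFRESH TABLE churn_dataset;` statement above.
from pathlib import Path

import pandas as pd
import sqlalchemy as sa


def import_csv(file: Path, table: str = "churn_dataset"):
    """
    Load a CSV file and insert its records into CrateDB.
    """
    engine = sa.create_engine("crate://crate@localhost/?schema=testdrive")
    df = pd.read_csv(file)
    df.to_sql(table, engine, index=False, if_exists="append", chunksize=1000)
```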