crate · amotl · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -63,14 +63,14 @@ updates:
     schedule:
       interval: "weekly"
 
-  # Frameworks.
+  # Topics.
 
-  - directory: "/framework/langchain"
+  - directory: "/topic/machine-learning/llm-langchain"
     package-ecosystem: "pip"
     schedule:
       interval: "weekly"
 
-  - directory: "/framework/mlflow"
+  - directory: "/topic/machine-learning/mlops-mlflow"
     package-ecosystem: "pip"
     schedule:
       interval: "weekly"

diff --git a/.github/workflows/test-mlflow.yml b/.github/workflows/test-mlflow.yml
@@ -0,0 +1,73 @@
+name: MLflow
+
+on:
+  pull_request:
+    branches: ~
+    paths:
+    - '.github/workflows/test-mlflow.yml'
+    - 'topic/machine-learning/mlops-mlflow/**'
+    - 'requirements.txt'
+  push:
+    branches: [ main ]
+    paths:
+    - '.github/workflows/test-mlflow.yml'
+    - 'topic/machine-learning/mlops-mlflow/**'
+    - 'requirements.txt'
+
+  # Allow job to be triggered manually.
+  workflow_dispatch:
+
+  # Run job each night after CrateDB nightly has been published.
+  schedule:
+    - cron: '0 3 * * *'
+
+# Cancel in-progress jobs when pushing to the same branch.
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.ref }}
+
+jobs:
+  test:
+    name: "
+     Python: ${{ matrix.python-version }}
+     CrateDB: ${{ matrix.cratedb-version }}
+     on ${{ matrix.os }}"
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ 'ubuntu-latest' ]
+        python-version: [ '3.10' ]
+        cratedb-version: [ 'nightly' ]
+
+    services:
+      cratedb:
+        # Take the image tag from the matrix, so the version in the job name is the one that runs.
+        image: crate/crate:${{ matrix.cratedb-version }}
+        ports:
+          - 4200:4200
+          - 5432:5432
+
+    steps:
+
+      - name: Acquire sources
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+          cache: 'pip'
+          cache-dependency-path: |
+            requirements.txt
+            topic/machine-learning/mlops-mlflow/requirements.txt
+            topic/machine-learning/mlops-mlflow/requirements-dev.txt
+
+      - name: Install utilities
+        run: |
+          pip install -r requirements.txt
+
+      - name: Validate topic/machine-learning/mlops-mlflow
+        run: |
+          ngr test --accept-no-venv topic/machine-learning/mlops-mlflow
diff --git a/topic/machine-learning/llm-langchain/conftest.py b/topic/machine-learning/llm-langchain/conftest.py
@@ -1,4 +1,6 @@
-from pueblo.testing.notebook import monkeypatch_pytest_notebook_treat_cell_exit_as_notebook_skip
+# Initialize nltk upfront, so that it does not run stray output into Jupyter Notebooks.
+from pueblo.testing.nlp import nltk_init
 
 # Make `pytest.exit()` called in notebook cells gracefully skip testing the whole notebook.
+from pueblo.testing.notebook import monkeypatch_pytest_notebook_treat_cell_exit_as_notebook_skip
 monkeypatch_pytest_notebook_treat_cell_exit_as_notebook_skip()
diff --git a/topic/machine-learning/llm-langchain/document_loader.py b/topic/machine-learning/llm-langchain/document_loader.py
@@ -37,7 +37,7 @@
 
 def main():
     loader = CrateDBLoader(
-        query="SELECT * FROM mlb_teams_2012 LIMIT 3;",
+        query="SELECT * FROM doc.mlb_teams_2012 LIMIT 3;",
         url=CONNECTION_STRING,
         include_rownum_into_metadata=True,
     )

diff --git a/topic/machine-learning/llm-langchain/requirements-dev.txt b/topic/machine-learning/llm-langchain/requirements-dev.txt
@@ -1,6 +1,11 @@
-coverage~=7.3
-ipykernel
-pytest<8
-pytest-cov<5
-pytest-env<2
-pytest-notebook<0.9
+# Real.
+# cratedb-toolkit[io]
+# pueblo[testing]
+
+# Development.
+cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@amo/add-import-csv
+pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main
+
+# Workstation.
+#--editable=/Users/amo/dev/crate/ecosystem/cratedb-retentions[io]
+#--editable=/Users/amo/dev/pyveci/sources/pueblo[testing]
diff --git a/topic/machine-learning/llm-langchain/requirements.txt b/topic/machine-learning/llm-langchain/requirements.txt
@@ -1,11 +1,13 @@
+# Real.
 crash
 crate[sqlalchemy]
 # langchain[cratedb,openai]
-pueblo[env,nlp]==0.0.2
+# pueblo[env,nlp]==0.0.3
+pydantic>=1,<2
 python-dotenv<2
 requests-cache<2
 unstructured<0.11
 
 # Development.
-langchain[cratedb,openai] @ git+https://github.com/crate-workbench/langchain.git@cratedb#subdirectory=libs/langchain
-# pueblo[env,nlp] @ git+https://github.com/pyveci/pueblo.git@develop
+langchain[cratedb,openai] @ git+https://github.com/crate-workbench/langchain.git@testing#subdirectory=libs/langchain
+pueblo[nlp] @ git+https://github.com/pyveci/pueblo.git@main
diff --git a/topic/machine-learning/llm-langchain/test.py b/topic/machine-learning/llm-langchain/test.py
@@ -1,96 +1,38 @@
-import importlib
-import io
 import os
-import sys
+import time
 from pathlib import Path
-from unittest import mock
 
 import pytest
-from _pytest.python import Function
-
-HERE = Path(__file__).parent
-
-
-def list_files(path: Path, pattern: str):
-    """
-    Enumerate all files in given directory.
-    """
-    files = path.glob(pattern)
-    files = [item.relative_to(path) for item in files]
-    return files
-
-
-def list_notebooks(path: Path):
-    """
-    Enumerate all Jupyter Notebook files found in given directory.
-    """
-    return list_files(path, "**/*.ipynb")
-
-
-def list_pyfiles(path: Path):
-    """
-    Enumerate all regular Python files found in given directory.
-    """
-    pyfiles = []
-    for item in list_files(path, "**/*.py"):
-        if item.suffix != ".py" or item.name in ["conftest.py"] or item.name.startswith("test"):
-            continue
-        pyfiles.append(item)
-    return pyfiles
 
+from cratedb_toolkit.io.sql import DatabaseAdapter
+from pueblo.testing.folder import str_list, list_notebooks, list_python_files
+from pueblo.testing.snippet import pytest_module_function, pytest_notebook
 
-def str_list(things):
-    """
-    Converge list to list of strings.
-    """
-    return map(str, things)
+HERE = Path(__file__).parent
 
 
-@pytest.fixture(scope="session", autouse=True)
-def nltk_init():
-    """
-    Initialize nltk upfront, so that it does not run stray output into Jupyter Notebooks.
-    """
-    download_items = ["averaged_perceptron_tagger", "punkt"]
-    import nltk
-    for item in download_items:
-        nltk.download(item)
+@pytest.fixture()
+def cratedb() -> DatabaseAdapter:
+    return DatabaseAdapter(dburi="crate://crate@localhost:4200")
 
 
 @pytest.fixture(scope="function", autouse=True)
-def db_init():
+def db_init(cratedb):
     """
     Initialize database.
     """
-    run_sql(statement="DROP TABLE IF EXISTS mlb_teams_2012;")
+    cratedb.run_sql("DROP TABLE IF EXISTS mlb_teams_2012;")
+    time.sleep(0.01)
 
 
-def db_provision_mlb_teams_2012():
+def db_provision_mlb_teams_2012(cratedb):
     """
     Provision database.
     """
-    run_sql(file="mlb_teams_2012.sql")
-    run_sql(statement="REFRESH TABLE mlb_teams_2012;")
-
-
-def run_sql(statement: str = None, file: str = None):
-    """
-    Run SQL from string or file.
-    """
-    import crate.crash.command
-    sys.argv = ["foo", "--schema=testdrive"]
-    if statement:
-        sys.argv += ["--command", statement]
-    if file:
-        sys.stdin = io.StringIO(Path(file).read_text())
-    with \
-            mock.patch("crate.crash.repl.SQLCompleter._populate_keywords"), \
-            mock.patch("crate.crash.command.CrateShell.close"):
-        try:
-            crate.crash.command.main()
-        except SystemExit as ex:
-            if ex.code != 0:
-                raise
+    cratedb.run_sql(Path("mlb_teams_2012.sql"))
+    time.sleep(0.01)
+    cratedb.run_sql("REFRESH TABLE mlb_teams_2012;")
+    time.sleep(0.01)
 
 
 @pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
@@ -101,33 +43,22 @@ def test_notebook(request, notebook: str):
 
     Not using `NBRegressionFixture`, because it would manually need to be configured.
     """
-    from _pytest._py.path import LocalPath
-    from pytest_notebook.plugin import pytest_collect_file
-    tests = pytest_collect_file(LocalPath(notebook), request.node)
-    for test in tests.collect():
-        test.runtest()
+    pytest_notebook(request=request, filepath=notebook)
 
 
-@pytest.mark.parametrize("pyfile", str_list(list_pyfiles(HERE)))
-def test_file(request, pyfile: Path):
+@pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
+def test_file(request, cratedb, pyfile: Path):
     """
     From individual Python file, collect and wrap the `main` function into a test case.
     """
 
-    # TODO: Make configurable.
-    entrypoint_symbol = "main"
-
     # Skip `vector_search.py` example, when no `OPENAI_API_KEY` is supplied.
     if str(pyfile).endswith("vector_search.py"):
         if "OPENAI_API_KEY" not in os.environ:
             raise pytest.skip("OPENAI_API_KEY not given")
 
     # `document_loader.py` needs provisioning.
     if str(pyfile).endswith("document_loader.py"):
-        db_provision_mlb_teams_2012()
+        db_provision_mlb_teams_2012(cratedb)
 
-    path = Path(pyfile)
-    mod = importlib.import_module(path.stem)
-    fun = getattr(mod, entrypoint_symbol)
-    f = Function.from_parent(request.node, name="main", callobj=fun)
-    f.runtest()
+    pytest_module_function(request, pyfile)
diff --git a/...ine-learning/timeseries-basics/.gitignore → .../machine-learning/mlops-mlflow/.gitignore b/...ine-learning/timeseries-basics/.gitignore → .../machine-learning/mlops-mlflow/.gitignore
@@ -1,3 +1,6 @@
+.idea
+.ruff_cache
+.venv*
 /mlartifacts
 /mlruns
 /model
diff --git a/topic/machine-learning/mlops-mlflow/pyproject.toml b/topic/machine-learning/mlops-mlflow/pyproject.toml
@@ -0,0 +1,46 @@
+[tool.pytest.ini_options]
+minversion = "2.0"
+addopts = """
+  -rfEX -p pytester --strict-markers --verbosity=3 --capture=no
+  """
+env = [
+    "CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive",
+    "PYDEVD_DISABLE_FILE_VALIDATION=1",
+]
+
+#log_level = "DEBUG"
+#log_cli_level = "DEBUG"
+
+testpaths = [
+    "*.py",
+]
+xfail_strict = true
+markers = [
+]
+
+# pytest-notebook settings
+nb_test_files = true
+nb_coverage = true
+nb_diff_replace = [
+    # Normalize the timing printed by `crash`, e.g. `(0.123 sec)`, to a constant `(0.000 sec)`.
+    '"/cells/*/outputs/*/text" "\(\d+\.\d+ sec\)" "(0.000 sec)"',
+]
+# `vector_search.py` does not include any output(s).
+nb_diff_ignore = [
+    "/metadata/language_info",
+    "/cells/*/execution_count",
+    "/cells/*/outputs/*/execution_count",
+    # Ignore images.
+    "/cells/*/outputs/*/data/image/png",
+]
+
+[tool.coverage.run]
+branch = false
+
+[tool.coverage.report]
+fail_under = 0
+show_missing = true
+omit = [
+    "conftest.py",
+    "test*.py",
+]
diff --git a/...hine-learning/timeseries-basics/readme.md → ...c/machine-learning/mlops-mlflow/readme.md b/...hine-learning/timeseries-basics/readme.md → ...c/machine-learning/mlops-mlflow/readme.md
diff --git a/topic/machine-learning/mlops-mlflow/requirements-dev.txt b/topic/machine-learning/mlops-mlflow/requirements-dev.txt
@@ -0,0 +1,11 @@
+# Real. Package names without a Git reference; disabled in favor of the entries below.
+# cratedb-toolkit[io]
+# pueblo[testing]
+
+# Development. Installed directly from Git branches.
+cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@amo/add-import-csv
+pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main
+
+# Workstation. Editable installs from local checkouts; disabled, paths are machine-specific.
+#--editable=/Users/amo/dev/crate/ecosystem/cratedb-retentions[io]
+#--editable=/Users/amo/dev/pyveci/sources/pueblo[testing]
diff --git a/topic/machine-learning/mlops-mlflow/requirements.txt b/topic/machine-learning/mlops-mlflow/requirements.txt
@@ -0,0 +1,7 @@
+# Real. The pinned `mlflow-cratedb` release is disabled in favor of the Git reference below.
+# mlflow-cratedb==2.7.1
+pydantic>=1,<2
+salesforce-merlion>=2,<3
+
+# Development. Installed directly from a Git branch.
+mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@testing