Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ updates:
schedule:
interval: "weekly"

# Frameworks.
# Topics.

- directory: "/framework/langchain"
- directory: "/topic/machine-learning/llm-langchain"
package-ecosystem: "pip"
schedule:
interval: "weekly"

- directory: "/framework/mlflow"
- directory: "/topic/machine-learning/mlops-mlflow"
package-ecosystem: "pip"
schedule:
interval: "weekly"
Expand Down
72 changes: 72 additions & 0 deletions .github/workflows/test-mlflow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
name: MLflow

on:
pull_request:
branches: ~
paths:
- '.github/workflows/test-mlflow.yml'
- 'topic/machine-learning/mlops-mlflow/**'
- 'requirements.txt'
push:
branches: [ main ]
paths:
- '.github/workflows/test-mlflow.yml'
- 'topic/machine-learning/mlops-mlflow/**'
- 'requirements.txt'

# Allow job to be triggered manually.
workflow_dispatch:

# Run job each night after CrateDB nightly has been published.
schedule:
- cron: '0 3 * * *'

# Cancel in-progress jobs when pushing to the same branch.
concurrency:
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.ref }}

jobs:
test:
name: "
Python: ${{ matrix.python-version }}
CrateDB: ${{ matrix.cratedb-version }}
on ${{ matrix.os }}"
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ 'ubuntu-latest' ]
python-version: [ '3.10' ]
cratedb-version: [ 'nightly' ]

services:
cratedb:
image: crate/crate:nightly
ports:
- 4200:4200
- 5432:5432

steps:

- name: Acquire sources
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: |
requirements.txt
topic/machine-learning/mlops-mlflow/requirements.txt
topic/machine-learning/mlops-mlflow/requirements-dev.txt

- name: Install utilities
run: |
pip install -r requirements.txt

- name: Validate topic/machine-learning/mlops-mlflow
run: |
ngr test --accept-no-venv topic/machine-learning/mlops-mlflow
4 changes: 3 additions & 1 deletion topic/machine-learning/llm-langchain/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from pueblo.testing.notebook import monkeypatch_pytest_notebook_treat_cell_exit_as_notebook_skip
# Initialize nltk upfront, so that it does not emit stray output into Jupyter Notebooks.
from pueblo.testing.nlp import nltk_init

# Make `pytest.exit()` called in notebook cells gracefully skip testing the whole notebook.
from pueblo.testing.notebook import monkeypatch_pytest_notebook_treat_cell_exit_as_notebook_skip
monkeypatch_pytest_notebook_treat_cell_exit_as_notebook_skip()
2 changes: 1 addition & 1 deletion topic/machine-learning/llm-langchain/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

def main():
loader = CrateDBLoader(
query="SELECT * FROM mlb_teams_2012 LIMIT 3;",
query="SELECT * FROM doc.mlb_teams_2012 LIMIT 3;",
url=CONNECTION_STRING,
include_rownum_into_metadata=True,
)
Expand Down
17 changes: 11 additions & 6 deletions topic/machine-learning/llm-langchain/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
coverage~=7.3
ipykernel
pytest<8
pytest-cov<5
pytest-env<2
pytest-notebook<0.9
# Real.
# cratedb-toolkit[io]
# pueblo[testing]

# Development.
cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@amo/add-import-csv
pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main

# Workstation.
#--editable=/Users/amo/dev/crate/ecosystem/cratedb-retentions[io]
#--editable=/Users/amo/dev/pyveci/sources/pueblo[testing]
8 changes: 5 additions & 3 deletions topic/machine-learning/llm-langchain/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# Real.
crash
crate[sqlalchemy]
# langchain[cratedb,openai]
pueblo[env,nlp]==0.0.2
# pueblo[env,nlp]==0.0.3
pydantic>=1,<2
python-dotenv<2
requests-cache<2
unstructured<0.11

# Development.
langchain[cratedb,openai] @ git+https://github.com/crate-workbench/langchain.git@cratedb#subdirectory=libs/langchain
# pueblo[env,nlp] @ git+https://github.com/pyveci/pueblo.git@develop
langchain[cratedb,openai] @ git+https://github.com/crate-workbench/langchain.git@testing#subdirectory=libs/langchain
pueblo[nlp] @ git+https://github.com/pyveci/pueblo.git@main
111 changes: 21 additions & 90 deletions topic/machine-learning/llm-langchain/test.py
Original file line number Diff line number Diff line change
@@ -1,96 +1,38 @@
import importlib
import io
import os
import sys
import time
from pathlib import Path
from unittest import mock

import pytest
from _pytest.python import Function

HERE = Path(__file__).parent


def list_files(path: Path, pattern: str):
"""
Enumerate all files in given directory.
"""
files = path.glob(pattern)
files = [item.relative_to(path) for item in files]
return files


def list_notebooks(path: Path):
"""
Enumerate all Jupyter Notebook files found in given directory.
"""
return list_files(path, "**/*.ipynb")


def list_pyfiles(path: Path):
"""
Enumerate all regular Python files found in given directory.
"""
pyfiles = []
for item in list_files(path, "**/*.py"):
if item.suffix != ".py" or item.name in ["conftest.py"] or item.name.startswith("test"):
continue
pyfiles.append(item)
return pyfiles

from cratedb_toolkit.io.sql import DatabaseAdapter
from pueblo.testing.folder import str_list, list_notebooks, list_python_files
from pueblo.testing.snippet import pytest_module_function, pytest_notebook

def str_list(things):
"""
Convert each item of the given list to a string.
"""
return map(str, things)
HERE = Path(__file__).parent


@pytest.fixture(scope="session", autouse=True)
def nltk_init():
"""
Initialize nltk upfront, so that it does not emit stray output into Jupyter Notebooks.
"""
download_items = ["averaged_perceptron_tagger", "punkt"]
import nltk
for item in download_items:
nltk.download(item)
@pytest.fixture()
def cratedb() -> DatabaseAdapter:
return DatabaseAdapter(dburi="crate://crate@localhost:4200")


@pytest.fixture(scope="function", autouse=True)
def db_init():
def db_init(cratedb):
"""
Initialize database.
"""
run_sql(statement="DROP TABLE IF EXISTS mlb_teams_2012;")
cratedb.run_sql("DROP TABLE IF EXISTS mlb_teams_2012;")
time.sleep(0.01)


def db_provision_mlb_teams_2012():
def db_provision_mlb_teams_2012(cratedb):
"""
Provision database.
"""
run_sql(file="mlb_teams_2012.sql")
run_sql(statement="REFRESH TABLE mlb_teams_2012;")


def run_sql(statement: str = None, file: str = None):
"""
Run SQL from string or file.
"""
import crate.crash.command
sys.argv = ["foo", "--schema=testdrive"]
if statement:
sys.argv += ["--command", statement]
if file:
sys.stdin = io.StringIO(Path(file).read_text())
with \
mock.patch("crate.crash.repl.SQLCompleter._populate_keywords"), \
mock.patch("crate.crash.command.CrateShell.close"):
try:
crate.crash.command.main()
except SystemExit as ex:
if ex.code != 0:
raise
cratedb.run_sql(Path("mlb_teams_2012.sql"))
time.sleep(0.01)
cratedb.run_sql("REFRESH TABLE mlb_teams_2012;")
time.sleep(0.01)


@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
Expand All @@ -101,33 +43,22 @@ def test_notebook(request, notebook: str):

Not using `NBRegressionFixture`, because it would need to be configured manually.
"""
from _pytest._py.path import LocalPath
from pytest_notebook.plugin import pytest_collect_file
tests = pytest_collect_file(LocalPath(notebook), request.node)
for test in tests.collect():
test.runtest()
pytest_notebook(request=request, filepath=notebook)


@pytest.mark.parametrize("pyfile", str_list(list_pyfiles(HERE)))
def test_file(request, pyfile: Path):
@pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
def test_file(request, cratedb, pyfile: Path):
"""
From individual Python file, collect and wrap the `main` function into a test case.
"""

# TODO: Make configurable.
entrypoint_symbol = "main"

# Skip `vector_search.py` example, when no `OPENAI_API_KEY` is supplied.
if str(pyfile).endswith("vector_search.py"):
if "OPENAI_API_KEY" not in os.environ:
raise pytest.skip("OPENAI_API_KEY not given")

# `document_loader.py` needs provisioning.
if str(pyfile).endswith("document_loader.py"):
db_provision_mlb_teams_2012()
db_provision_mlb_teams_2012(cratedb)

path = Path(pyfile)
mod = importlib.import_module(path.stem)
fun = getattr(mod, entrypoint_symbol)
f = Function.from_parent(request.node, name="main", callobj=fun)
f.runtest()
pytest_module_function(request, pyfile)
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
.idea
.ruff_cache
.venv*
/mlartifacts
/mlruns
/model
46 changes: 46 additions & 0 deletions topic/machine-learning/mlops-mlflow/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
[tool.pytest.ini_options]
minversion = "2.0"
addopts = """
-rfEX -p pytester --strict-markers --verbosity=3 --capture=no
"""
env = [
"CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive",
"PYDEVD_DISABLE_FILE_VALIDATION=1",
]

#log_level = "DEBUG"
#log_cli_level = "DEBUG"

testpaths = [
"*.py",
]
xfail_strict = true
markers = [
]

# pytest-notebook settings
nb_test_files = true
nb_coverage = true
nb_diff_replace = [
# Compensate output of `crash`.
'"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
]
# `vector_search.py` does not include any output(s).
nb_diff_ignore = [
"/metadata/language_info",
"/cells/*/execution_count",
"/cells/*/outputs/*/execution_count",
# Ignore images.
"/cells/*/outputs/*/data/image/png",
]

[tool.coverage.run]
branch = false

[tool.coverage.report]
fail_under = 0
show_missing = true
omit = [
"conftest.py",
"test*.py",
]
11 changes: 11 additions & 0 deletions topic/machine-learning/mlops-mlflow/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Real.
# cratedb-toolkit[io]
# pueblo[testing]

# Development.
cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@amo/add-import-csv
pueblo[testing] @ git+https://github.com/pyveci/pueblo.git@main

# Workstation.
#--editable=/Users/amo/dev/crate/ecosystem/cratedb-retentions[io]
#--editable=/Users/amo/dev/pyveci/sources/pueblo[testing]
7 changes: 7 additions & 0 deletions topic/machine-learning/mlops-mlflow/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Real.
# mlflow-cratedb==2.7.1
pydantic<1
salesforce-merlion>=2,<3

# Development.
mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@testing
Loading