Skip to content
46 changes: 42 additions & 4 deletions docs/providers/local.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,44 @@ curl http://localhost:11434/v1/chat/completions \
-d '{"model": "qwen2.5-coder:14b", "messages": [{"role": "user", "content": "Hello"}]}'
```

## Ollama Launch Provider (Recommended)

If you have Ollama v0.15.0+, the `ollama-launch` provider is the simplest
way to run Koan with local models. Ollama handles server lifecycle and
environment setup automatically:

```yaml
# config.yaml
cli_provider: "ollama-launch"

ollama_launch:
model: "qwen3-coder"
```

Or via environment variables:

```bash
KOAN_CLI_PROVIDER=ollama-launch
KOAN_OLLAMA_LAUNCH_MODEL=qwen3-coder
```

Then start Koan normally with `make start` — no need for `make ollama`.

**Advantages over `local` provider:**
- No manual `ollama serve` — Ollama auto-starts the server
- No manual env-var setup (`ANTHROPIC_BASE_URL`, etc.)
- `OLLAMA_NO_CLOUD=1` is set by default for privacy
- Version validation on startup

**Model management via Telegram:**

```
/ollama list — List locally available models
/ollama pull NAME — Download a new model
/ollama remove NAME — Delete a local model
/ollama status — Server health check
```

## Per-Project Configuration

Use local LLM for specific projects (e.g., small libraries) while
Expand Down Expand Up @@ -160,11 +198,11 @@ that work best with Koan's agentic loop:

| Model | Size | Tool Use | Notes |
|-------|------|----------|-------|
| `qwen2.5-coder:14b` | 14B | Good | Best balance of size and capability |
| `qwen2.5-coder:7b` | 7B | Fair | Lighter, faster, less reliable tool use |
| `qwen3-coder` | 14B+ | Excellent | Best choice for agentic coding workflows |
| `qwen2.5-coder:14b` | 14B | Good | Solid balance of size and capability |
| `glm-4.7` | 30B | Good | Lightweight deployment, strong reasoning |
| `deepseek-coder-v2:16b` | 16B | Good | Strong coding, good function calling |
| `codellama:34b` | 34B | Fair | Needs more RAM, variable tool use |
| `mistral:7b` | 7B | Basic | Fast but limited tool use |
| `qwen2.5-coder:7b` | 7B | Fair | Lighter, faster, less reliable tool use |

**Hardware requirements vary by model size:**

Expand Down
23 changes: 23 additions & 0 deletions koan/app/cli_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,27 @@ def _cleanup_prompt_file(path: Optional[str]) -> None:
os.unlink(path)


def _merge_provider_env(kwargs: dict) -> None:
    """Merge provider-specific env vars into subprocess kwargs.

    Providers declare extra env vars via ``get_env()`` (e.g.
    ``OLLAMA_NO_CLOUD=1``). These are layered on top of a copy of
    ``os.environ`` and stored under ``kwargs["env"]`` so the protection
    actually applies at runtime. ``kwargs`` is mutated in place.

    The merge is skipped only when the caller supplied a real
    environment mapping. An explicit ``env=None`` is treated like a
    missing key: ``subprocess`` reads ``None`` as "inherit the parent
    environment", so skipping the merge there would silently drop the
    provider's variables.

    This is best-effort: if the provider cannot be resolved or its env
    cannot be merged, ``kwargs`` is left untouched and the failure is
    logged at debug level instead of being raised.
    """
    if kwargs.get("env") is not None:
        return
    try:
        from app.provider import get_provider
        extra = get_provider().get_env()
        if extra:
            env = os.environ.copy()
            env.update(extra)
            kwargs["env"] = env
    except Exception:
        # Deliberately broad: a provider problem must never prevent the
        # CLI from running, but leave a trace for debugging.
        import logging
        logging.getLogger(__name__).debug(
            "Could not merge provider env vars into subprocess kwargs",
            exc_info=True,
        )


def run_cli(cmd, **kwargs) -> subprocess.CompletedProcess:
"""Run a CLI command with the prompt passed via temp-file stdin.

Expand All @@ -107,6 +128,7 @@ def run_cli(cmd, **kwargs) -> subprocess.CompletedProcess:
the caller does not provide one, preventing indefinite hangs.
"""
kwargs.setdefault("timeout", DEFAULT_TIMEOUT)
_merge_provider_env(kwargs)
cmd, prompt_path = prepare_prompt_file(cmd)
if prompt_path:
try:
Expand All @@ -129,6 +151,7 @@ def popen_cli(
Returns ``(proc, cleanup)`` where *cleanup()* **must** be called after
the process exits to close the file handle and delete the temp file.
"""
_merge_provider_env(kwargs)
cmd, prompt_path = prepare_prompt_file(cmd)
if prompt_path:
stdin_file = open(prompt_path) # noqa: SIM115
Expand Down
163 changes: 163 additions & 0 deletions koan/app/ollama_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""Ollama REST API client.

Lightweight wrapper around Ollama's HTTP API for health checks, model
listing, pulling, and removal. Uses only ``urllib`` (no third-party
deps) to keep Kōan's dependency footprint minimal.

Ollama API docs: https://github.com/ollama/ollama/blob/main/docs/api.md
"""

import json
import socket
import urllib.request
import urllib.error
from typing import Any, Optional, Tuple

DEFAULT_HOST = "http://localhost:11434"


def _api_request(
    path: str,
    method: str = "GET",
    body: Optional[dict] = None,
    host: str = DEFAULT_HOST,
    timeout: int = 10,
) -> Tuple[bool, Any]:
    """Unified HTTP request to the Ollama API.

    Returns (success, data) where data is parsed JSON on success (an
    empty dict when the response body is empty) or an error message
    string on failure. Expected failure modes — malformed URL, HTTP
    error status, connection failure, timeout, undecodable or non-JSON
    response — are reported through the return value, not raised.
    """
    url = f"{host.rstrip('/')}{path}"
    data = json.dumps(body).encode() if body else None
    headers = {"Content-Type": "application/json"} if body else {}

    # Request() rejects URLs without a recognisable scheme with a
    # ValueError; report that like any other failure.
    try:
        req = urllib.request.Request(url, data=data, headers=headers, method=method)
    except ValueError as e:
        return False, f"Invalid URL: {e}"

    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode()
        if not raw:
            return True, {}
        return True, json.loads(raw)
    except urllib.error.HTTPError as e:
        detail = str(e)
        try:
            payload = json.loads(e.read().decode())
        except (ValueError, UnicodeDecodeError, OSError):
            payload = None
        # Only a JSON object can carry an "error" field; any other JSON
        # value (string, list, ...) falls back to the HTTP status text.
        if isinstance(payload, dict):
            detail = payload.get("error", detail)
        return False, detail
    except urllib.error.URLError as e:
        return False, f"Connection failed: {e.reason}"
    except socket.timeout:
        return False, "Request timed out"
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        return False, f"Invalid response: {e}"
    except OSError as e:
        return False, str(e)


def is_server_running(host: str = DEFAULT_HOST, timeout: int = 3) -> bool:
    """Check if the Ollama server is responding.

    Probes ``/api/version`` and not the root path: ``_api_request``
    parses every non-empty response body as JSON, and Ollama's root
    endpoint answers with plain text, which would be reported as a
    failure even when the server is healthy.

    Returns True when the request succeeds, False otherwise.
    """
    ok, _ = _api_request("/api/version", host=host, timeout=timeout)
    return ok


def get_version(host: str = DEFAULT_HOST, timeout: int = 3) -> Optional[str]:
    """Return the version reported by the Ollama server.

    Queries ``/api/version``. Yields None when the request fails or the
    response is not a JSON object.
    """
    succeeded, payload = _api_request("/api/version", host=host, timeout=timeout)
    if not succeeded or not isinstance(payload, dict):
        return None
    return payload.get("version")


def list_models(host: str = DEFAULT_HOST, timeout: int = 10) -> Tuple[bool, Any]:
    """Fetch the models available locally via ``/api/tags``.

    Returns (success, data). When the request succeeds with a JSON
    object, data is its ``models`` entry (an empty list if absent);
    otherwise the raw result of the request is passed through.
    """
    succeeded, payload = _api_request("/api/tags", host=host, timeout=timeout)
    if not (succeeded and isinstance(payload, dict)):
        return succeeded, payload
    return True, payload.get("models", [])


def show_model(name: str, host: str = DEFAULT_HOST, timeout: int = 10) -> Tuple[bool, Any]:
    """Look up details for a single model via ``POST /api/show``.

    Returns the (success, data) pair from the API request unchanged.
    Raises ValueError if the model name fails validation.
    """
    _validate_model_name(name)
    request_body = {"name": name}
    return _api_request(
        "/api/show",
        method="POST",
        body=request_body,
        host=host,
        timeout=timeout,
    )


def pull_model(name: str, host: str = DEFAULT_HOST, timeout: int = 1800) -> Tuple[bool, str]:
    """Download a model through ``POST /api/pull``.

    The request is sent with ``stream`` set to False, so the call
    blocks until the server answers or ``timeout`` seconds elapse
    (default 1800, i.e. 30 minutes). No progress is reported while
    waiting, so callers should warn the user that large models can
    take a long time.

    Returns (success, status_message). On success the message is the
    ``status`` field of the response, or "success" when it is missing
    or the response is not a JSON object. Raises ValueError if the
    model name fails validation.
    """
    _validate_model_name(name)
    succeeded, payload = _api_request(
        "/api/pull",
        method="POST",
        body={"name": name, "stream": False},
        host=host,
        timeout=timeout,
    )
    if not succeeded:
        return False, str(payload)
    if isinstance(payload, dict):
        return True, payload.get("status", "success")
    return True, "success"


def delete_model(name: str, host: str = DEFAULT_HOST, timeout: int = 30) -> Tuple[bool, str]:
    """Remove a locally stored model through ``DELETE /api/delete``.

    Returns (True, "deleted") on success, or (False, error_text) when
    the request fails. Raises ValueError if the model name fails
    validation.
    """
    _validate_model_name(name)
    succeeded, payload = _api_request(
        "/api/delete",
        method="DELETE",
        body={"name": name},
        host=host,
        timeout=timeout,
    )
    return (True, "deleted") if succeeded else (False, str(payload))


def list_running(host: str = DEFAULT_HOST, timeout: int = 5) -> Tuple[bool, Any]:
    """Fetch the models currently loaded in memory via ``/api/ps``.

    Returns (success, data). When the request succeeds with a JSON
    object, data is its ``models`` entry (an empty list if absent);
    otherwise the raw result of the request is passed through.
    """
    succeeded, payload = _api_request("/api/ps", host=host, timeout=timeout)
    if not (succeeded and isinstance(payload, dict)):
        return succeeded, payload
    return True, payload.get("models", [])


def _validate_model_name(name: str) -> None:
    """Reject model names that are obviously unusable.

    Raises ValueError when the name is empty or whitespace-only, or
    when it contains a newline, carriage return, or tab character.
    """
    stripped = name.strip() if name else ""
    if not stripped:
        raise ValueError("Model name must not be empty")
    for forbidden in ("\n", "\r", "\t"):
        if forbidden in name:
            raise ValueError(f"Model name contains invalid characters: {name!r}")


def format_model_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    Uses decimal (1000-based) units: gigabytes with one decimal place
    (e.g. '4.7 GB'), megabytes rounded to a whole number, and plain
    bytes for anything below one megabyte.
    """
    # (threshold in bytes, decimal places, unit label), largest first.
    tiers = (
        (1_000_000_000, 1, "GB"),
        (1_000_000, 0, "MB"),
    )
    for threshold, places, unit in tiers:
        if size_bytes >= threshold:
            return f"{size_bytes / threshold:.{places}f} {unit}"
    return f"{size_bytes} B"
30 changes: 25 additions & 5 deletions koan/app/pid_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,14 +378,19 @@ def start_ollama(koan_root: Path, verify_timeout: float = OLLAMA_VERIFY_TIMEOUT)
# Write PID file — ollama serve is an external binary (no flock)
acquire_pid(koan_root, "ollama", proc.pid)

# Wait briefly for ollama to start listening
# Wait for ollama to start listening on HTTP
deadline = time.monotonic() + verify_timeout
while time.monotonic() < deadline:
if _is_process_alive(proc.pid):
if not _is_process_alive(proc.pid):
release_pid(koan_root, "ollama")
return False, "ollama launched but exited immediately — check ollama logs"
if _ollama_http_ready():
return True, f"ollama serve started (PID {proc.pid})"
time.sleep(0.3)

# Clean up stale PID file — process is dead, don't leave phantom PIDs
# Process is alive but HTTP not ready yet — still report success
if _is_process_alive(proc.pid):
return True, f"ollama serve started (PID {proc.pid}), still warming up"
release_pid(koan_root, "ollama")
return False, "ollama launched but exited immediately — check ollama logs"

Expand Down Expand Up @@ -571,9 +576,24 @@ def _detect_provider(koan_root: Path) -> str:
return "claude"


def _ollama_http_ready() -> bool:
    """Report whether the Ollama server answers over HTTP.

    Delegates to ``app.ollama_client.is_server_running`` with a
    2-second timeout. The import is done lazily inside the function.
    """
    from app import ollama_client
    return ollama_client.is_server_running(timeout=2)


def _needs_ollama(provider: str) -> bool:
    """Return True if the provider requires a manually started ollama serve.

    ollama-launch manages its own server lifecycle, so it returns False.
    For every other provider the answer comes from
    ``app.provider.is_ollama_provider``. If that lookup is unavailable
    (ImportError) or raises KeyError, fall back to the name check for
    "local" and "ollama".
    """
    if provider == "ollama-launch":
        return False
    try:
        from app.provider import is_ollama_provider
        return is_ollama_provider(provider)
    except (ImportError, KeyError):
        return provider in ("local", "ollama")


def _show_startup_banner(koan_root: Path, provider: str) -> None:
Expand Down
15 changes: 15 additions & 0 deletions koan/app/provider/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,21 @@ def get_provider_by_name(name: str) -> CLIProvider:
return _PROVIDERS[provider_name]()


def is_ollama_provider(name: str = "") -> bool:
    """Tell whether a provider is backed by Ollama.

    When ``name`` is empty, the configured provider name is used. For
    a registered provider the answer is the ``uses_ollama`` flag on its
    class. For names that are not registered (e.g. legacy "ollama"),
    the answer is whether the name contains "ollama",
    case-insensitively.
    """
    provider_name = name if name else get_provider_name()
    if provider_name not in _PROVIDERS:
        return "ollama" in provider_name.lower()
    return _PROVIDERS[provider_name].uses_ollama


def get_cli_binary() -> str:
"""Get the CLI binary command for the configured provider.

Expand Down
11 changes: 10 additions & 1 deletion koan/app/provider/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Base class and constants for CLI provider abstraction."""

import shutil
from typing import List, Optional, Tuple
from typing import Dict, List, Optional, Tuple


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -37,6 +37,7 @@ class CLIProvider:
"""

name: str = ""
uses_ollama: bool = False

def binary(self) -> str:
"""Return the CLI binary name or path."""
Expand Down Expand Up @@ -257,6 +258,14 @@ def build_command(
cmd.extend(self.build_effort_args(effort))
return cmd

def get_env(self) -> Dict[str, str]:
    """Supply additional environment variables for subprocess calls.

    The base implementation contributes nothing and returns a new
    empty dict. Subclasses that must inject variables (e.g.
    OLLAMA_NO_CLOUD) override this.
    """
    extra_env: Dict[str, str] = {}
    return extra_env

def check_quota_available(self, project_path: str, timeout: int = 15) -> Tuple[bool, str]:
"""Probe real API quota with a minimal CLI call.

Expand Down
1 change: 1 addition & 0 deletions koan/app/provider/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class LocalLLMProvider(CLIProvider):
"""

name = "local"
uses_ollama = True

def _get_config(self) -> dict:
"""Get local_llm config section from config.yaml."""
Expand Down
Loading
Loading