Skip to content

Commit d1288b2

Browse files
committed
Implementing ContainerBackend
Signed-off-by: Fiona Waters <[email protected]>
1 parent 60fd258 commit d1288b2

File tree

15 files changed

+1643
-881
lines changed

15 files changed

+1643
-881
lines changed

README.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,45 @@ TrainerClient().wait_for_job_status(job_id)
7171
print("\n".join(TrainerClient().get_job_logs(name=job_id)))
7272
```
7373

74+
## Local Development
75+
76+
The Kubeflow SDK provides first-class support for local development, allowing you to test and iterate on your models without needing a Kubernetes cluster.
77+
78+
### Execution Backends
79+
80+
Choose the backend that fits your development workflow:
81+
82+
| Backend | Description | Use Case |
83+
|---------|-------------|----------|
84+
| **KubernetesBackend** | Run jobs on a Kubernetes cluster | Production, multi-node distributed training |
85+
| **ContainerBackend** | Auto-detects Docker or Podman | Local development with container isolation |
86+
| **LocalProcessBackend** | Run as local Python subprocesses | Quick prototyping, debugging |
87+
88+
### Local Container Execution
89+
90+
The **ContainerBackend** automatically detects and uses either Docker or Podman:
91+
92+
```bash
93+
# Install with Docker support
94+
pip install "kubeflow[docker]"
95+
96+
# Or install with Podman support
97+
pip install "kubeflow[podman]"
98+
```
99+
100+
```python
101+
from kubeflow.trainer import TrainerClient, ContainerBackendConfig, CustomTrainer
102+
103+
# Auto-detects Docker or Podman
104+
config = ContainerBackendConfig()
105+
client = TrainerClient(backend_config=config)
106+
107+
# Your training runs in isolated containers
108+
job_id = client.train(trainer=CustomTrainer(func=train_fn))
109+
```
110+
111+
For detailed configuration options and platform-specific setup (macOS, Linux), see the [ContainerBackend documentation](kubeflow/trainer/backends/container/README.md).
112+
74113
## Supported Kubeflow Projects
75114

76115
| Project | Status | Version Support | Description |

kubeflow/trainer/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@
1515

1616
# Import the Kubeflow Trainer client.
1717
from kubeflow.trainer.api.trainer_client import TrainerClient # noqa: F401
18-
from kubeflow.trainer.backends.docker.types import LocalDockerBackendConfig
19-
from kubeflow.trainer.backends.podman.types import LocalPodmanBackendConfig
18+
from kubeflow.trainer.backends.container.types import ContainerBackendConfig
2019

2120
# import backends and its associated configs
2221
from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
@@ -60,7 +59,6 @@
6059
"TrainerClient",
6160
"TrainerType",
6261
"LocalProcessBackendConfig",
63-
"LocalDockerBackendConfig",
64-
"LocalPodmanBackendConfig",
62+
"ContainerBackendConfig",
6563
"KubernetesBackendConfig",
6664
]

kubeflow/trainer/api/trainer_client.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,8 @@
1616
import logging
1717
from typing import Optional, Union
1818

19-
from kubeflow.trainer.backends.docker.backend import LocalDockerBackend
20-
from kubeflow.trainer.backends.docker.types import LocalDockerBackendConfig
21-
from kubeflow.trainer.backends.podman.backend import LocalPodmanBackend
22-
from kubeflow.trainer.backends.podman.types import LocalPodmanBackendConfig
19+
from kubeflow.trainer.backends.container.backend import ContainerBackend
20+
from kubeflow.trainer.backends.container.types import ContainerBackendConfig
2321
from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend
2422
from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
2523
from kubeflow.trainer.backends.localprocess.backend import (
@@ -38,16 +36,16 @@ def __init__(
3836
backend_config: Union[
3937
KubernetesBackendConfig,
4038
LocalProcessBackendConfig,
41-
LocalDockerBackendConfig,
42-
LocalPodmanBackendConfig,
39+
ContainerBackendConfig,
4340
] = None,
4441
):
4542
"""Initialize a Kubeflow Trainer client.
4643
4744
Args:
48-
backend_config: Backend configuration. Either KubernetesBackendConfig or
49-
LocalProcessBackendConfig, or None to use the backend's
50-
default config class. Defaults to KubernetesBackendConfig.
45+
backend_config: Backend configuration. One of KubernetesBackendConfig,
46+
LocalProcessBackendConfig, ContainerBackendConfig,
47+
or None to use the backend's default config class.
48+
Defaults to KubernetesBackendConfig.
5149
5250
Raises:
5351
ValueError: Invalid backend configuration.
@@ -61,10 +59,8 @@ def __init__(
6159
self.backend = KubernetesBackend(backend_config)
6260
elif isinstance(backend_config, LocalProcessBackendConfig):
6361
self.backend = LocalProcessBackend(backend_config)
64-
elif isinstance(backend_config, LocalDockerBackendConfig):
65-
self.backend = LocalDockerBackend(backend_config)
66-
elif isinstance(backend_config, LocalPodmanBackendConfig):
67-
self.backend = LocalPodmanBackend(backend_config)
62+
elif isinstance(backend_config, ContainerBackendConfig):
63+
self.backend = ContainerBackend(backend_config)
6864
else:
6965
raise ValueError(f"Invalid backend config '{backend_config}'")
7066

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# ContainerBackend
2+
3+
`ContainerBackend` is the unified container backend for Kubeflow Trainer. It automatically detects and uses either Docker or Podman.
4+
5+
## Overview
6+
7+
This backend provides a single, unified interface for container-based training execution, automatically detecting which container runtime is available on your system.
8+
9+
The implementation uses the **adapter pattern** to abstract away differences between Docker and Podman APIs, providing clean separation between runtime detection logic and container operations.
10+
11+
## Usage
12+
13+
### Basic usage (auto-detection)
14+
15+
```python
16+
from kubeflow.trainer import TrainerClient, ContainerBackendConfig
17+
18+
# Auto-detects Docker or Podman
19+
config = ContainerBackendConfig()
20+
client = TrainerClient(backend_config=config)
21+
```
22+
23+
### Force specific runtime
24+
25+
```python
26+
# Force Docker
27+
config = ContainerBackendConfig(runtime="docker")
28+
client = TrainerClient(backend_config=config)
29+
30+
# Force Podman
31+
config = ContainerBackendConfig(runtime="podman")
32+
client = TrainerClient(backend_config=config)
33+
```
34+
35+
### Configuration options
36+
37+
```python
38+
config = ContainerBackendConfig(
39+
# Optional: force specific runtime ("docker" or "podman")
40+
runtime=None,
41+
42+
# Optional: explicit image override
43+
image="my-custom-image:latest",
44+
45+
# Image pull policy: "IfNotPresent", "Always", or "Never"
46+
pull_policy="IfNotPresent",
47+
48+
# Auto-remove containers and networks on job deletion
49+
auto_remove=True,
50+
51+
# GPU support (varies by runtime)
52+
gpus=None,
53+
54+
# Environment variables for all containers
55+
env={"MY_VAR": "value"},
56+
57+
# Container daemon URL override (may be needed for Colima/Podman Machine on macOS)
58+
container_host=None,
59+
60+
# Base directory for job workspaces
61+
workdir_base=None,
62+
)
63+
```
64+
65+
### macOS-specific configuration
66+
67+
On macOS, you may need to specify `container_host` depending on your container runtime:
68+
69+
**Docker with Colima:**
70+
```python
71+
import os
72+
config = ContainerBackendConfig(
73+
container_host=f"unix://{os.path.expanduser('~')}/.colima/default/docker.sock"
74+
)
75+
```
76+
77+
**Podman Machine:**
78+
```python
79+
import os
80+
config = ContainerBackendConfig(
81+
container_host=f"unix://{os.path.expanduser('~')}/.local/share/containers/podman/machine/podman.sock"
82+
)
83+
```
84+
85+
**Docker Desktop:**
86+
```python
87+
# Usually works without specifying container_host
88+
config = ContainerBackendConfig()
89+
```
90+
91+
Alternatively, set environment variables before running:
92+
```bash
93+
# For Colima
94+
export DOCKER_HOST="unix://$HOME/.colima/default/docker.sock"
95+
96+
# For Podman Machine
97+
export CONTAINER_HOST="unix://$HOME/.local/share/containers/podman/machine/podman.sock"
98+
```
99+
100+
### How it works
101+
102+
The backend initialization follows this logic:
103+
104+
1. If `runtime` is specified in config, use that runtime exclusively
105+
2. Otherwise, try to initialize Docker client adapter
106+
3. If Docker fails, try to initialize Podman client adapter
107+
4. If both fail, raise a RuntimeError
108+
109+
If you don't have Docker or Podman installed, use `LocalProcessBackendConfig` instead, which runs training as local subprocesses.
110+
111+
All container operations are delegated to the adapter, eliminating code duplication.
112+
113+
## Installation
114+
115+
Install with Docker support:
116+
```bash
117+
pip install "kubeflow[docker]"
118+
```
119+
120+
Install with Podman support:
121+
```bash
122+
pip install "kubeflow[podman]"
123+
```
124+
125+
Install with both:
126+
```bash
127+
pip install "kubeflow[docker,podman]"
128+
```
129+
130+
## Example: Training Job
131+
132+
```python
133+
from kubeflow.trainer import TrainerClient, ContainerBackendConfig, CustomTrainer
134+
135+
# Define your training function
136+
def train():
137+
import torch
138+
print(f"Training with PyTorch {torch.__version__}")
139+
# Your training code here
140+
141+
# Create trainer
142+
trainer = CustomTrainer(
143+
func=train,
144+
packages_to_install=["torch"],
145+
)
146+
147+
# Initialize client (auto-detects runtime)
148+
config = ContainerBackendConfig()
149+
client = TrainerClient(backend_config=config)
150+
151+
# Run training
152+
job_name = client.train(trainer=trainer)
153+
print(f"Training job started: {job_name}")
154+
155+
# Get logs
156+
for log in client.get_job_logs(job_name, follow=True):
157+
print(log, end='')
158+
```
159+
160+
## See also
161+
162+
- Example notebook (TBA): a complete working example will be added.

kubeflow/trainer/backends/podman/__init__.py renamed to kubeflow/trainer/backends/container/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from kubeflow.trainer.backends.podman.backend import LocalPodmanBackend
16-
from kubeflow.trainer.backends.podman.types import LocalPodmanBackendConfig
15+
from kubeflow.trainer.backends.container.backend import ContainerBackend
16+
from kubeflow.trainer.backends.container.types import ContainerBackendConfig
1717

18-
__all__ = ["LocalPodmanBackend", "LocalPodmanBackendConfig"]
18+
__all__ = ["ContainerBackend", "ContainerBackendConfig"]

0 commit comments

Comments
 (0)