Skip to content

Commit 51cf1ed

Browse files
committed
Checking common sockets before failing
Signed-off-by: Fiona Waters <[email protected]>
1 parent f05220d commit 51cf1ed

File tree

2 files changed

+6
-11
lines changed

2 files changed

+6
-11
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,6 @@ client = TrainerClient(backend_config=ContainerBackendConfig())
9494
job_id = client.train(trainer=CustomTrainer(func=train_fn))
9595
```
9696

97-
For detailed configuration options and platform-specific setup (macOS, Linux), see the [ContainerBackend documentation](kubeflow/trainer/backends/container/README.md).
98-
9997
## Supported Kubeflow Projects
10098

10199
| Project | Status | Version Support | Description |

kubeflow/trainer/backends/container/backend.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,8 @@ def _create_adapter(self) -> BaseContainerClientAdapter:
130130
Create the appropriate container client adapter.
131131
132132
Tries Docker first, then Podman if Docker fails, unless a specific
133-
runtime is requested in the config.
133+
runtime is requested in the config. Automatically tries common socket
134+
locations (e.g., Colima for Docker on macOS, user socket for Podman).
134135
135136
Raises RuntimeError if neither Docker nor Podman is available.
136137
"""
@@ -144,19 +145,15 @@ def _create_adapter(self) -> BaseContainerClientAdapter:
144145
[self.cfg.container_runtime] if self.cfg.container_runtime else ["docker", "podman"]
145146
)
146147

148+
attempted_connections = []
147149
last_error = None
150+
148151
for runtime_name in runtimes_to_try:
149152
if runtime_name not in runtime_map:
150153
continue
151154

152-
try:
153-
adapter = runtime_map[runtime_name](self.cfg.container_host)
154-
adapter.ping()
155-
logger.debug(f"Using {runtime_name} as container runtime")
156-
return adapter
157-
except Exception as e:
158-
logger.debug(f"{runtime_name} initialization failed: {e}")
159-
last_error = e
155+
# Try common socket locations for this runtime
156+
socket_locations = self._get_common_socket_locations(runtime_name)
160157

161158
for host in socket_locations:
162159
try:

0 commit comments

Comments
 (0)