Skip to content

Commit 37ee051

Browse files
authored
feat(bootstrap): add Docker preflight check before gateway startup (#321)
1 parent 34ca9ea commit 37ee051

File tree

6 files changed

+633
-27
lines changed

6 files changed

+633
-27
lines changed

crates/openshell-bootstrap/src/docker.rs

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,153 @@ pub fn normalize_arch(arch: &str) -> String {
9191
}
9292
}
9393

94+
/// Result of a successful Docker preflight check.
95+
///
96+
/// Contains the validated Docker client and metadata about the daemon so
97+
/// callers can reuse the connection without re-checking.
98+
#[derive(Debug)]
99+
pub struct DockerPreflight {
100+
/// A Docker client that has been verified as connected and responsive.
101+
pub docker: Docker,
102+
/// Docker daemon version string (e.g., "28.1.1").
103+
pub version: Option<String>,
104+
}
105+
106+
/// Absolute Docker socket paths that are probed exactly as written.
///
/// Only paths that need no expansion are listed here:
/// - `/var/run/docker.sock` — default for Docker Desktop, `OrbStack`, Colima
///
/// Home-relative locations are not part of this list; they are expanded at
/// runtime from the home directory:
/// - `$HOME/.colima/docker.sock` — Colima (older installs)
/// - `$HOME/.orbstack/run/docker.sock` — `OrbStack` (if symlink is missing)
const WELL_KNOWN_SOCKET_PATHS: &[&str] = &["/var/run/docker.sock"];
118+
119+
/// Check that a Docker-compatible runtime is installed, running, and reachable.
120+
///
121+
/// This is the primary preflight gate. It must be called before any gateway
122+
/// deploy work begins. On failure it produces a user-friendly error with
123+
/// actionable recovery steps instead of a raw bollard connection error.
124+
pub async fn check_docker_available() -> Result<DockerPreflight> {
125+
// Step 1: Try to connect using bollard's default resolution
126+
// (respects DOCKER_HOST, then falls back to /var/run/docker.sock).
127+
let docker = match Docker::connect_with_local_defaults() {
128+
Ok(d) => d,
129+
Err(err) => {
130+
return Err(docker_not_reachable_error(
131+
&format!("{err}"),
132+
"Failed to create Docker client",
133+
));
134+
}
135+
};
136+
137+
// Step 2: Ping the daemon to confirm it's responsive.
138+
if let Err(err) = docker.ping().await {
139+
return Err(docker_not_reachable_error(
140+
&format!("{err}"),
141+
"Docker socket exists but the daemon is not responding",
142+
));
143+
}
144+
145+
// Step 3: Query version info (best-effort — don't fail on this).
146+
let version = match docker.version().await {
147+
Ok(v) => v.version,
148+
Err(_) => None,
149+
};
150+
151+
Ok(DockerPreflight { docker, version })
152+
}
153+
154+
/// Build a rich, user-friendly error when Docker is not reachable.
155+
fn docker_not_reachable_error(raw_err: &str, summary: &str) -> miette::Report {
156+
let docker_host = std::env::var("DOCKER_HOST").ok();
157+
let socket_exists = std::path::Path::new("/var/run/docker.sock").exists();
158+
159+
let mut hints: Vec<String> = Vec::new();
160+
161+
if !socket_exists && docker_host.is_none() {
162+
// No socket and no DOCKER_HOST — likely nothing is installed or started
163+
hints.push(
164+
"No Docker socket found at /var/run/docker.sock and DOCKER_HOST is not set."
165+
.to_string(),
166+
);
167+
hints.push(
168+
"Install and start a Docker-compatible runtime. See the support matrix \
169+
in the OpenShell docs for tested configurations."
170+
.to_string(),
171+
);
172+
173+
// Check for alternative sockets that might exist
174+
let alt_sockets = find_alternative_sockets();
175+
if !alt_sockets.is_empty() {
176+
hints.push(format!(
177+
"Found Docker-compatible socket(s) at alternative path(s):\n {}\n\n \
178+
Set DOCKER_HOST to use one, e.g.:\n\n \
179+
export DOCKER_HOST=unix://{}",
180+
alt_sockets.join("\n "),
181+
alt_sockets[0],
182+
));
183+
}
184+
} else if docker_host.is_some() {
185+
// DOCKER_HOST is set but daemon didn't respond
186+
let host_val = docker_host.unwrap();
187+
hints.push(format!(
188+
"DOCKER_HOST is set to '{host_val}' but the Docker daemon is not responding."
189+
));
190+
hints.push(
191+
"Verify your Docker runtime is started and the DOCKER_HOST value is correct."
192+
.to_string(),
193+
);
194+
} else {
195+
// Socket exists but daemon isn't responding
196+
hints.push(
197+
"Docker socket found at /var/run/docker.sock but the daemon is not responding."
198+
.to_string(),
199+
);
200+
hints.push("Start your Docker runtime and try again.".to_string());
201+
}
202+
203+
hints.push("Verify Docker is working with: docker info".to_string());
204+
205+
let help_text = hints.join("\n\n");
206+
207+
miette::miette!(help = help_text, "{summary}.\n\n {raw_err}")
208+
}
209+
210+
/// Probe for Docker-compatible sockets at non-default locations.
211+
fn find_alternative_sockets() -> Vec<String> {
212+
let mut found = Vec::new();
213+
214+
// Check well-known static paths
215+
for path in WELL_KNOWN_SOCKET_PATHS {
216+
if std::path::Path::new(path).exists() {
217+
found.push(path.to_string());
218+
}
219+
}
220+
221+
// Check home-relative paths
222+
if let Some(home) = home_dir() {
223+
let home_sockets = [
224+
format!("{home}/.colima/docker.sock"),
225+
format!("{home}/.orbstack/run/docker.sock"),
226+
];
227+
for path in &home_sockets {
228+
if std::path::Path::new(path).exists() && !found.contains(path) {
229+
found.push(path.clone());
230+
}
231+
}
232+
}
233+
234+
found
235+
}
236+
237+
/// Return the value of the `HOME` environment variable.
///
/// Yields `None` when the variable cannot be read as a string (unset, or not
/// valid Unicode).
fn home_dir() -> Option<String> {
    let home = std::env::var("HOME");
    home.ok()
}
240+
94241
/// Create an SSH Docker client from remote options.
95242
pub async fn create_ssh_docker_client(remote: &RemoteOptions) -> Result<Docker> {
96243
// Ensure destination has ssh:// prefix
@@ -981,4 +1128,74 @@ mod tests {
9811128
};
9821129
assert_eq!(platform.platform_string(), "linux/arm64");
9831130
}
1131+
1132+
#[test]
fn docker_not_reachable_error_no_socket_no_docker_host() {
    // The state of /var/run/docker.sock and DOCKER_HOST is whatever the host
    // (or CI) provides, so the branch taken is not controlled here. Only the
    // parts every branch must contain are checked: summary, raw error, and
    // the final verification hint.
    let report =
        docker_not_reachable_error("connection refused", "Failed to create Docker client");
    let rendered = format!("{report:?}");

    let expectations = [
        ("Failed to create Docker client", "should include the summary"),
        ("connection refused", "should include the raw error"),
        ("docker info", "should suggest 'docker info' verification"),
    ];
    for (needle, why) in expectations {
        assert!(rendered.contains(needle), "{why}");
    }
}
1154+
1155+
#[test]
fn docker_not_reachable_error_with_docker_host() {
    // Simulate: DOCKER_HOST is set but daemon unresponsive.
    let saved_docker_host = std::env::var("DOCKER_HOST").ok();

    // SAFETY: mutating the process environment is only sound while no other
    // thread reads or writes it concurrently.
    // NOTE(review): the default `cargo test` harness runs tests on parallel
    // threads, and other tests in this module read DOCKER_HOST — confirm this
    // test runs isolated (e.g. a process-per-test runner or a shared lock).
    unsafe {
        std::env::set_var("DOCKER_HOST", "unix:///tmp/fake-docker.sock");
    }

    let report = docker_not_reachable_error(
        "daemon not responding",
        "Docker socket exists but the daemon is not responding",
    );
    let rendered = format!("{report:?}");

    // Restore the environment before asserting so a failed assertion cannot
    // leak the fake value into later tests.
    // SAFETY: same requirement as above; this puts back the state captured at
    // the start of the test.
    unsafe {
        if let Some(previous) = saved_docker_host {
            std::env::set_var("DOCKER_HOST", previous);
        } else {
            std::env::remove_var("DOCKER_HOST");
        }
    }

    assert!(
        rendered.contains("DOCKER_HOST"),
        "should mention DOCKER_HOST when it is set"
    );
    assert!(
        rendered.contains("unix:///tmp/fake-docker.sock"),
        "should show the current DOCKER_HOST value"
    );
}
1189+
1190+
#[test]
fn find_alternative_sockets_returns_vec() {
    // Exact contents depend on the host system (any candidate socket may or
    // may not exist), so only host-independent properties are checked.
    let sockets = find_alternative_sockets();

    assert!(
        sockets.len() <= 10,
        "should return a reasonable number of sockets"
    );

    // The probe de-duplicates candidates, so every reported path is unique
    // regardless of which sockets exist on this machine.
    let unique: std::collections::HashSet<&str> = sockets.iter().map(String::as_str).collect();
    assert_eq!(
        unique.len(),
        sockets.len(),
        "should not report the same socket path twice"
    );
}
9841201
}

crates/openshell-bootstrap/src/errors.rs

Lines changed: 72 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -155,12 +155,16 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[
155155
match_mode: MatchMode::Any,
156156
diagnose: diagnose_certificate_issue,
157157
},
158-
// Docker daemon not running
158+
// Docker daemon not running or socket not found
159159
FailurePattern {
160160
matchers: &[
161161
"Cannot connect to the Docker daemon",
162162
"docker daemon is not running",
163163
"Is the docker daemon running",
164+
"Socket not found",
165+
"No such file or directory",
166+
"Failed to create Docker client",
167+
"Docker socket exists but the daemon is not responding",
164168
],
165169
match_mode: MatchMode::Any,
166170
diagnose: diagnose_docker_not_running,
@@ -203,7 +207,7 @@ fn diagnose_no_default_route(_gateway_name: &str) -> GatewayFailureDiagnosis {
203207
"Stop any container holding the gateway port (default 8080), then retry",
204208
),
205209
RecoveryStep::with_command("Prune unused Docker networks", "docker network prune -f"),
206-
RecoveryStep::new("Restart Docker Desktop (if on Mac/Windows)"),
210+
RecoveryStep::new("Restart your Docker runtime"),
207211
RecoveryStep::new("Then retry: openshell gateway start"),
208212
],
209213
retryable: true,
@@ -309,10 +313,7 @@ fn diagnose_oom_killed(_gateway_name: &str) -> GatewayFailureDiagnosis {
309313
The gateway requires at least 4GB of memory."
310314
.to_string(),
311315
recovery_steps: vec![
312-
RecoveryStep::new(
313-
"Increase Docker memory allocation to at least 4GB \
314-
(Docker Desktop → Settings → Resources)",
315-
),
316+
RecoveryStep::new("Increase Docker memory allocation to at least 4GB"),
316317
RecoveryStep::new("Close other memory-intensive applications"),
317318
RecoveryStep::new("Then retry: openshell gateway start"),
318319
],
@@ -335,10 +336,7 @@ fn diagnose_node_pressure(gateway_name: &str) -> GatewayFailureDiagnosis {
335336
"docker system prune -a --volumes",
336337
),
337338
RecoveryStep::with_command("Check available memory on the host", "free -h"),
338-
RecoveryStep::new(
339-
"Increase Docker resource allocation \
340-
(Docker Desktop → Settings → Resources), or free resources on the host",
341-
),
339+
RecoveryStep::new("Increase Docker resource allocation or free resources on the host"),
342340
RecoveryStep::with_command(
343341
"Destroy and recreate the gateway after freeing resources",
344342
format!("openshell gateway destroy {gateway_name} && openshell gateway start"),
@@ -392,10 +390,16 @@ fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis {
392390
fn diagnose_docker_not_running(_gateway_name: &str) -> GatewayFailureDiagnosis {
393391
GatewayFailureDiagnosis {
394392
summary: "Docker is not running".to_string(),
395-
explanation: "The Docker daemon is not running or not accessible.".to_string(),
393+
explanation: "The Docker daemon is not running or not accessible. OpenShell requires \
394+
a Docker-compatible container runtime to manage gateway clusters."
395+
.to_string(),
396396
recovery_steps: vec![
397-
RecoveryStep::new("Start Docker Desktop (Mac/Windows) or the Docker service (Linux)"),
398-
RecoveryStep::with_command("Verify Docker is running", "docker info"),
397+
RecoveryStep::new("Start your Docker runtime"),
398+
RecoveryStep::with_command("Verify Docker is accessible", "docker info"),
399+
RecoveryStep::new(
400+
"If using a non-default Docker socket, set DOCKER_HOST:\n \
401+
export DOCKER_HOST=unix:///var/run/docker.sock",
402+
),
399403
RecoveryStep::new("Then retry: openshell gateway start"),
400404
],
401405
retryable: true,
@@ -558,6 +562,61 @@ mod tests {
558562
);
559563
}
560564

565+
#[test]
fn test_diagnose_docker_not_running() {
    // The classic daemon-down message must yield a retryable Docker diagnosis.
    let d = diagnose_failure("test", "Cannot connect to the Docker daemon", None)
        .expect("daemon connection failure should be diagnosed");
    assert!(d.summary.contains("Docker"));
    assert!(d.retryable);
}
573+
574+
#[test]
fn test_diagnose_docker_socket_not_found() {
    // A "Socket not found" error must yield a retryable Docker diagnosis.
    let d = diagnose_failure("test", "Socket not found: /var/run/docker.sock", None)
        .expect("missing socket should be diagnosed");
    assert!(d.summary.contains("Docker"));
    assert!(d.retryable);
}
582+
583+
#[test]
fn test_diagnose_docker_no_such_file() {
    // A bare ENOENT message is diagnosed as a Docker problem.
    // NOTE(review): this input carries no Docker-specific marker, so the same
    // matcher may also claim unrelated "No such file or directory" failures —
    // confirm that breadth is intended.
    let d = diagnose_failure("test", "No such file or directory (os error 2)", None)
        .expect("ENOENT error should be diagnosed");
    assert!(d.summary.contains("Docker"));
}
590+
591+
#[test]
fn test_diagnose_docker_preflight_error() {
    // Text shaped like a Docker preflight failure (summary, blank line, raw
    // error) must yield a retryable Docker diagnosis.
    let preflight_message = "Failed to create Docker client.\n\n connection error";
    let d = diagnose_failure("test", preflight_message, None)
        .expect("preflight failure should be diagnosed");
    assert!(d.summary.contains("Docker"));
    assert!(d.retryable);
}
603+
604+
#[test]
fn test_diagnose_docker_recovery_mentions_docker_host() {
    // Users with a non-default socket need to be pointed at DOCKER_HOST, so
    // at least one recovery step has to name it.
    let d = diagnose_failure("test", "Cannot connect to the Docker daemon", None).unwrap();
    let mentions_docker_host = d
        .recovery_steps
        .iter()
        .any(|step| step.description.contains("DOCKER_HOST"));
    assert!(
        mentions_docker_host,
        "recovery steps should mention DOCKER_HOST"
    );
}
619+
561620
#[test]
562621
fn test_diagnose_dns_failure_from_namespace_timeout() {
563622
// When wait_for_namespace detects DNS failure, the error message itself

0 commit comments

Comments
 (0)