Skip to content

Commit ebdc1f6

Browse files
committed
fix(bootstrap): surface diagnostics for K8s namespace not ready failures
The 'K8s namespace not ready' error had three gaps preventing diagnostic information from reaching users:

1. The non-interactive (CI/piped) code path used bare error propagation with no diagnosis at all.
2. The interactive path's pattern matcher returned None for the common timeout case, and the generic_failure_diagnosis fallback existed but was never called.
3. Container logs were never passed to the diagnosis engine, so patterns only visible in logs (node pressure, corrupted state, etc.) could not match.

Fix all three by fetching container logs at the CLI error-handling site, passing them to diagnose_failure, and falling back to generic_failure_diagnosis when no specific pattern matches.

Also add container logs to the two wait_for_namespace error paths that were missing them (timeout and exec-error-on-final-attempt), and update the generic diagnosis to suggest 'openshell doctor' commands.
1 parent cf66d05 commit ebdc1f6

File tree

3 files changed

+73
-15
lines changed

3 files changed

+73
-15
lines changed

crates/openshell-bootstrap/src/errors.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -449,16 +449,20 @@ pub fn generic_failure_diagnosis(gateway_name: &str) -> GatewayFailureDiagnosis
449449
summary: "Gateway failed to start".to_string(),
450450
explanation: "The gateway encountered an unexpected error during startup.".to_string(),
451451
recovery_steps: vec![
452+
RecoveryStep::with_command(
453+
"Check container logs for details",
454+
format!("openshell doctor logs --name {gateway_name}"),
455+
),
456+
RecoveryStep::with_command(
457+
"Run diagnostics",
458+
format!("openshell doctor check --name {gateway_name}"),
459+
),
452460
RecoveryStep::with_command(
453461
"Try destroying and recreating the gateway",
454462
format!(
455463
"openshell gateway destroy --name {gateway_name} && openshell gateway start"
456464
),
457465
),
458-
RecoveryStep::with_command(
459-
"Check container logs for details",
460-
format!("docker logs openshell-cluster-{gateway_name}"),
461-
),
462466
RecoveryStep::new(
463467
"If the issue persists, report it at https://github.com/nvidia/openshell/issues",
464468
),

crates/openshell-bootstrap/src/lib.rs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,21 @@ pub async fn gateway_container_logs<W: std::io::Write>(
638638
Ok(())
639639
}
640640

641+
/// Fetch the last `n` lines of container logs for a local gateway as a
642+
/// `String`. This is a convenience wrapper for diagnostic call sites (e.g.
643+
/// failure diagnosis in the CLI) that do not hold a Docker client handle.
644+
///
645+
/// Returns an empty string on any Docker/connection error so callers don't
646+
/// need to worry about error handling.
647+
pub async fn fetch_gateway_logs(name: &str, n: usize) -> String {
648+
let docker = match Docker::connect_with_local_defaults() {
649+
Ok(d) => d,
650+
Err(_) => return String::new(),
651+
};
652+
let container = container_name(name);
653+
fetch_recent_logs(&docker, &container, n).await
654+
}
655+
641656
fn default_gateway_image_ref() -> String {
642657
if let Ok(image) = std::env::var("OPENSHELL_CLUSTER_IMAGE")
643658
&& !image.trim().is_empty()
@@ -984,7 +999,11 @@ async fn wait_for_namespace(
984999
}
9851000

9861001
if attempt + 1 == attempts {
987-
return Err(err).wrap_err("K8s namespace not ready");
1002+
let logs = fetch_recent_logs(docker, container_name, 40).await;
1003+
return Err(miette::miette!(
1004+
"exec failed on final attempt while waiting for namespace '{namespace}': {err}\n{logs}"
1005+
))
1006+
.wrap_err("K8s namespace not ready");
9881007
}
9891008
tokio::time::sleep(backoff).await;
9901009
backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff);
@@ -997,8 +1016,9 @@ async fn wait_for_namespace(
9971016
}
9981017

9991018
if attempt + 1 == attempts {
1019+
let logs = fetch_recent_logs(docker, container_name, 40).await;
10001020
return Err(miette::miette!(
1001-
"timed out waiting for namespace '{namespace}' to exist: {output}"
1021+
"timed out waiting for namespace '{namespace}' to exist: {output}\n{logs}"
10021022
))
10031023
.wrap_err("K8s namespace not ready");
10041024
}

crates/openshell-cli/src/run.rs

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,19 +1248,27 @@ pub(crate) async fn deploy_gateway_with_panel(
12481248
"x".red().bold(),
12491249
"Gateway failed:".red().bold(),
12501250
);
1251+
// Fetch container logs for pattern-based diagnosis
1252+
let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await;
1253+
let logs_opt = if container_logs.is_empty() {
1254+
None
1255+
} else {
1256+
Some(container_logs.as_str())
1257+
};
12511258
// Try to diagnose the failure and provide guidance
12521259
let err_str = format!("{err:?}");
1253-
if let Some(diagnosis) =
1254-
openshell_bootstrap::errors::diagnose_failure(name, &err_str, None)
1255-
{
1256-
print_failure_diagnosis(&diagnosis);
1257-
}
1260+
let diagnosis =
1261+
openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt)
1262+
.unwrap_or_else(|| {
1263+
openshell_bootstrap::errors::generic_failure_diagnosis(name)
1264+
});
1265+
print_failure_diagnosis(&diagnosis);
12581266
Err(err)
12591267
}
12601268
}
12611269
} else {
12621270
eprintln!("Deploying {location} gateway {name}...");
1263-
let handle = openshell_bootstrap::deploy_gateway_with_logs(options, |line| {
1271+
let result = openshell_bootstrap::deploy_gateway_with_logs(options, |line| {
12641272
if let Some(status) = line.strip_prefix("[status] ") {
12651273
eprintln!(" {status}");
12661274
} else if line.strip_prefix("[progress] ").is_some() {
@@ -1269,9 +1277,35 @@ pub(crate) async fn deploy_gateway_with_panel(
12691277
eprintln!(" {line}");
12701278
}
12711279
})
1272-
.await?;
1273-
eprintln!("Gateway {name} ready.");
1274-
Ok(handle)
1280+
.await;
1281+
match result {
1282+
Ok(handle) => {
1283+
eprintln!("Gateway {name} ready.");
1284+
Ok(handle)
1285+
}
1286+
Err(err) => {
1287+
eprintln!(
1288+
"{} {} {name}",
1289+
"x".red().bold(),
1290+
"Gateway failed:".red().bold(),
1291+
);
1292+
// Fetch container logs for pattern-based diagnosis
1293+
let container_logs = openshell_bootstrap::fetch_gateway_logs(name, 80).await;
1294+
let logs_opt = if container_logs.is_empty() {
1295+
None
1296+
} else {
1297+
Some(container_logs.as_str())
1298+
};
1299+
let err_str = format!("{err:?}");
1300+
let diagnosis =
1301+
openshell_bootstrap::errors::diagnose_failure(name, &err_str, logs_opt)
1302+
.unwrap_or_else(|| {
1303+
openshell_bootstrap::errors::generic_failure_diagnosis(name)
1304+
});
1305+
print_failure_diagnosis(&diagnosis);
1306+
Err(err)
1307+
}
1308+
}
12751309
}
12761310
}
12771311

0 commit comments

Comments
 (0)