Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
273 changes: 273 additions & 0 deletions src/modules/hpc/ib_diagnostics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
//! InfiniBand diagnostics module
//!
//! Run IB diagnostic tools (ibdiagnet, iblinkinfo, ibstat) and parse results.
//!
//! # Parameters
//!
//! - `check` (required): Diagnostic type - "link_health", "topology", "counters", "full"
//! - `port_filter` (optional): Filter results by port/HCA
//! - `output_dir` (optional): Directory to store diagnostic reports

use std::collections::HashMap;
use std::sync::Arc;
use tokio::runtime::Handle;

use crate::connection::{Connection, ExecuteOptions};
use crate::modules::{
Module, ModuleContext, ModuleError, ModuleOutput, ModuleParams, ModuleResult,
ParallelizationHint, ParamExt,
};

fn get_exec_options(context: &ModuleContext) -> ExecuteOptions {
let mut options = ExecuteOptions::new();
if context.r#become {
options = options.with_escalation(context.become_user.clone());
if let Some(ref method) = context.become_method {
options.escalate_method = Some(method.clone());
}
if let Some(ref password) = context.become_password {
options.escalate_password = Some(password.clone());
}
}
options
}

fn run_cmd(
connection: &Arc<dyn Connection + Send + Sync>,
cmd: &str,
context: &ModuleContext,
) -> ModuleResult<(bool, String, String)> {
let options = get_exec_options(context);
let result = Handle::current()
.block_on(async { connection.execute(cmd, Some(options)).await })
.map_err(|e| ModuleError::ExecutionFailed(format!("Connection error: {}", e)))?;
Ok((result.success, result.stdout, result.stderr))
}

fn run_cmd_ok(
connection: &Arc<dyn Connection + Send + Sync>,
cmd: &str,
context: &ModuleContext,
) -> ModuleResult<String> {
let (success, stdout, stderr) = run_cmd(connection, cmd, context)?;
if !success {
return Err(ModuleError::ExecutionFailed(format!(
"Command failed: {}",
stderr.trim()
)));
}
Ok(stdout)
}

#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
enum DiagnosticCheck {
LinkHealth,
Topology,
Counters,
Full,
}

impl DiagnosticCheck {
fn from_str(s: &str) -> Option<DiagnosticCheck> {
match s.to_lowercase().as_str() {
"link_health" => Some(DiagnosticCheck::LinkHealth),
"topology" => Some(DiagnosticCheck::Topology),
"counters" => Some(DiagnosticCheck::Counters),
"full" => Some(DiagnosticCheck::Full),
_ => None,
}
}

fn to_commands(&self) -> Vec<&'static str> {
match self {
DiagnosticCheck::LinkHealth => vec!["iblinkinfo", "ibstat"],
DiagnosticCheck::Topology => vec!["ibnetdiscover", "iblinkinfo"],
DiagnosticCheck::Counters => vec!["ibqueryerrors"],
DiagnosticCheck::Full => vec!["ibdiagnet", "iblinkinfo", "ibstat", "ibqueryerrors"],
}
}
}

pub struct IbDiagnosticsModule;

impl Module for IbDiagnosticsModule {
fn name(&self) -> &'static str {
"ib_diagnostics"
}

fn description(&self) -> &'static str {
"Run InfiniBand diagnostic tools and parse results"
}

fn parallelization_hint(&self) -> ParallelizationHint {
ParallelizationHint::FullyParallel
}

fn execute(
&self,
params: &ModuleParams,
context: &ModuleContext,
) -> ModuleResult<ModuleOutput> {
let connection = context
.connection
.as_ref()
.ok_or_else(|| ModuleError::ExecutionFailed("No connection available".to_string()))?;

let check_str = params.get_string_required("check")?;
let port_filter = params.get_string("port_filter")?;
let output_dir = params
.get_string("output_dir")?
.unwrap_or_else(|| "/tmp/ib_diagnostics".to_string());

let check = DiagnosticCheck::from_str(&check_str).ok_or_else(|| {
ModuleError::InvalidParameter(format!(
"Invalid check type '{}'. Must be 'link_health', 'topology', 'counters', or 'full'",
check_str
))
})?;

if context.check_mode {
return Ok(
ModuleOutput::ok(format!("Would run IB diagnostics: {:?}", check))
.with_data("check", serde_json::json!(check)),
);
}

// Create output directory
run_cmd_ok(connection, &format!("mkdir -p {}", output_dir), context)?;

let mut results: HashMap<String, String> = HashMap::new();
let mut errors: Vec<String> = Vec::new();

for cmd in check.to_commands() {
let mut full_cmd = cmd.to_string();
if let Some(ref filter) = port_filter {
full_cmd.push_str(&format!(" -C {}", filter));
}

let (ok, stdout, stderr) = run_cmd(connection, &full_cmd, context)?;

if ok {
results.insert(cmd.to_string(), stdout.clone());

// Save to file
let output_file = format!("{}/{}.log", output_dir, cmd);
let escaped = stdout.replace('\'', "'\\''");
let _ = run_cmd(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Propagate failures when writing diagnostic log files

The result of the log-write command is discarded with let _ = run_cmd(...), so permission issues, missing directories, or disk errors silently drop report files even though output_dir is returned as if the artifacts were saved. This makes diagnostics output misleading in environments where report persistence is required; log-write failures should be added to errors or fail the module.

Useful? React with 👍 / 👎.

connection,
&format!("echo '{}' > {}", escaped, output_file),
context,
);
} else {
errors.push(format!("{}: {}", cmd, stderr.trim()));
}
}

// Parse results for issues
let mut issues: Vec<String> = Vec::new();

// Check iblinkinfo for link issues
if let Some(linkinfo) = results.get("iblinkinfo") {
if linkinfo.contains("Down") || linkinfo.contains("Polling") {
issues.push("Link state issues detected".to_string());
}
}

// Check ibqueryerrors for error counters
if let Some(errors_output) = results.get("ibqueryerrors") {
if !errors_output.trim().is_empty() && !errors_output.contains("No errors") {
issues.push("Error counters detected".to_string());
}
}

let summary = if issues.is_empty() && errors.is_empty() {
"IB diagnostics completed successfully. No issues detected.".to_string()
} else if !issues.is_empty() {
format!(
"IB diagnostics completed. {} issue(s) detected: {}",
issues.len(),
issues.join(", ")
)
} else {
format!("IB diagnostics completed with {} error(s)", errors.len())
};

Ok(ModuleOutput::ok(summary)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Mark module failed when diagnostic commands fail

The loop records command failures in errors, but the function always returns ModuleOutput::ok(...), so runs where tools are missing or commands fail (e.g., ibdiagnet not installed or non-zero exit) are reported as successful to callers. In automation workflows this can let playbooks continue despite incomplete diagnostics, which undermines result reliability; the status should switch to failed (or return an error) when errors is non-empty.

Useful? React with 👍 / 👎.

.with_data("check", serde_json::json!(check))
.with_data("output_dir", serde_json::json!(output_dir))
.with_data("issues", serde_json::json!(issues))
.with_data("errors", serde_json::json!(errors))
.with_data("results", serde_json::json!(results)))
}

fn required_params(&self) -> &[&'static str] {
&["check"]
}

fn optional_params(&self) -> HashMap<&'static str, serde_json::Value> {
let mut m = HashMap::new();
m.insert("port_filter", serde_json::json!(null));
m.insert("output_dir", serde_json::json!("/tmp/ib_diagnostics"));
m
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_module_metadata() {
let module = IbDiagnosticsModule;
assert_eq!(module.name(), "ib_diagnostics");
assert!(!module.description().is_empty());
}

#[test]
fn test_required_params() {
let module = IbDiagnosticsModule;
let required = module.required_params();
assert!(required.contains(&"check"));
}

#[test]
fn test_optional_params() {
let module = IbDiagnosticsModule;
let optional = module.optional_params();
assert!(optional.contains_key("port_filter"));
assert!(optional.contains_key("output_dir"));
}

#[test]
fn test_diagnostic_check_from_str() {
assert_eq!(
DiagnosticCheck::from_str("link_health"),
Some(DiagnosticCheck::LinkHealth)
);
assert_eq!(
DiagnosticCheck::from_str("topology"),
Some(DiagnosticCheck::Topology)
);
assert_eq!(
DiagnosticCheck::from_str("COUNTERS"),
Some(DiagnosticCheck::Counters)
);
assert_eq!(
DiagnosticCheck::from_str("full"),
Some(DiagnosticCheck::Full)
);
assert_eq!(DiagnosticCheck::from_str("invalid"), None);
}

#[test]
fn test_diagnostic_check_to_commands() {
let link_health = DiagnosticCheck::LinkHealth;
assert!(link_health.to_commands().contains(&"iblinkinfo"));
assert!(link_health.to_commands().contains(&"ibstat"));

let full = DiagnosticCheck::Full;
assert!(full.to_commands().len() >= 4);
assert!(full.to_commands().contains(&"ibdiagnet"));
}
}
4 changes: 4 additions & 0 deletions src/modules/hpc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ pub mod hpc_job;
pub mod hpc_queue;
pub mod hpc_server;
#[cfg(feature = "ofed")]
pub mod ib_diagnostics;
#[cfg(feature = "ofed")]
pub mod ib_partition;
#[cfg(feature = "ofed")]
pub mod ib_validate;
Expand Down Expand Up @@ -140,6 +142,8 @@ pub use hpc_job::HpcJobModule;
pub use hpc_queue::HpcQueueModule;
pub use hpc_server::HpcServerModule;
#[cfg(feature = "ofed")]
pub use ib_diagnostics::IbDiagnosticsModule;
#[cfg(feature = "ofed")]
pub use ib_partition::IbPartitionModule;
#[cfg(feature = "ofed")]
pub use ib_validate::IbValidateModule;
Expand Down
1 change: 1 addition & 0 deletions src/modules/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1718,6 +1718,7 @@ impl ModuleRegistry {
hpc::IpoibModule,
hpc::OpensmConfigModule,
hpc::IbPartitionModule,
hpc::IbDiagnosticsModule,
],
);

Expand Down
Loading