diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs
index 9003f19038d..3629ffce9ba 100644
--- a/nexus/src/app/background/tasks/support_bundle_collector.rs
+++ b/nexus/src/app/background/tasks/support_bundle_collector.rs
@@ -662,6 +662,12 @@ impl BundleCollection<'_> {
                     sled_client.support_zpool_info(),
                 )
                 .boxed(),
+                save_diag_cmd_output_or_error(
+                    &sled_path,
+                    "health-check",
+                    sled_client.support_health_check(),
+                )
+                .boxed(),
             ])
             // Currently we execute up to 10 commands concurrently which
             // might be doing their own concurrent work, for example
diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json
index c3f1cbcee51..fbc523b7744 100644
--- a/openapi/sled-agent.json
+++ b/openapi/sled-agent.json
@@ -659,6 +659,33 @@
         }
       }
     },
+    "/support/health-check": {
+      "get": {
+        "operationId": "support_health_check",
+        "responses": {
+          "200": {
+            "description": "successful operation",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "title": "Array_of_SledDiagnosticsQueryOutput",
+                  "type": "array",
+                  "items": {
+                    "$ref": "#/components/schemas/SledDiagnosticsQueryOutput"
+                  }
+                }
+              }
+            }
+          },
+          "4XX": {
+            "$ref": "#/components/responses/Error"
+          },
+          "5XX": {
+            "$ref": "#/components/responses/Error"
+          }
+        }
+      }
+    },
     "/support/ipadm-info": {
       "get": {
         "operationId": "support_ipadm_info",
diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs
index 6033650fa4e..7bdba01e1bc 100644
--- a/sled-agent/api/src/lib.rs
+++ b/sled-agent/api/src/lib.rs
@@ -673,6 +673,14 @@ pub trait SledAgentApi {
         request_context: RequestContext<Self::Context>,
     ) -> Result<HttpResponseOk<SledDiagnosticsQueryOutput>, HttpError>;
 
+    #[endpoint {
+        method = GET,
+        path = "/support/health-check",
+    }]
+    async fn support_health_check(
+        request_context: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>;
+
     /// This endpoint returns a list of known zones on a sled that have service
     /// logs that can be collected into a support bundle.
     #[endpoint {
diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs
index 9abd2dfb5c2..0056056eb33 100644
--- a/sled-agent/src/http_entrypoints.rs
+++ b/sled-agent/src/http_entrypoints.rs
@@ -1036,6 +1036,20 @@ impl SledAgentApi for SledAgentImpl {
         Ok(HttpResponseOk(res.get_output()))
     }
 
+    async fn support_health_check(
+        request_context: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
+    {
+        let sa = request_context.context();
+        Ok(HttpResponseOk(
+            sa.support_health_check()
+                .await
+                .into_iter()
+                .map(|cmd| cmd.get_output())
+                .collect::<Vec<_>>(),
+        ))
+    }
+
     async fn support_logs(
         request_context: RequestContext<Self::Context>,
     ) -> Result<HttpResponseOk<Vec<String>>, HttpError> {
diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs
index 57ea9491484..59eb99a4d6e 100644
--- a/sled-agent/src/sim/http_entrypoints.rs
+++ b/sled-agent/src/sim/http_entrypoints.rs
@@ -745,6 +745,13 @@ impl SledAgentApi for SledAgentSimImpl {
         method_unimplemented()
     }
 
+    async fn support_health_check(
+        _request_context: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<Vec<SledDiagnosticsQueryOutput>>, HttpError>
+    {
+        method_unimplemented()
+    }
+
     async fn support_logs(
         _request_context: RequestContext<Self::Context>,
     ) -> Result<HttpResponseOk<Vec<String>>, HttpError> {
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 5b6d248a4ff..242c6a9e829 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -67,7 +67,8 @@ use sled_agent_types::zone_bundle::{
     BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod,
     PriorityOrder, StorageLimit, ZoneBundleMetadata,
 };
-use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput};
+use sled_diagnostics::SledDiagnosticsCmdError;
+use sled_diagnostics::SledDiagnosticsCmdOutput;
 use sled_hardware::{HardwareManager, MemoryReservations, underlay};
 use sled_hardware_types::Baseboard;
 use sled_hardware_types::underlay::BootstrapInterface;
@@ -1473,6 +1474,13 @@ impl SledAgent {
     ) -> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
         sled_diagnostics::zpool_info().await
     }
+
+    /// Delegates to [`sled_diagnostics::health_check`].
+    pub(crate) async fn support_health_check(
+        &self,
+    ) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
+        sled_diagnostics::health_check().await
+    }
 }
 
 #[derive(From, thiserror::Error, Debug)]
diff --git a/sled-diagnostics/Cargo.toml b/sled-diagnostics/Cargo.toml
index 40a56c5d705..70ca27f157d 100644
--- a/sled-diagnostics/Cargo.toml
+++ b/sled-diagnostics/Cargo.toml
@@ -29,4 +29,4 @@ zip = { workspace = true, features = ["zstd"] }
 omicron-common.workspace = true
 omicron-test-utils.workspace = true
 omicron-uuid-kinds.workspace = true
-sled-storage = { workspace = true, features = ["testing"] }
+sled-storage = { workspace = true, features = ["testing"] }
diff --git a/sled-diagnostics/src/lib.rs b/sled-diagnostics/src/lib.rs
index 8b7639a45b1..b61267af01e 100644
--- a/sled-diagnostics/src/lib.rs
+++ b/sled-diagnostics/src/lib.rs
@@ -146,3 +146,30 @@ pub async fn zpool_info()
 -> Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError> {
     execute_command_with_timeout(zpool_status(), DEFAULT_TIMEOUT).await
 }
+
+/// Runs each health-check command with `DEFAULT_TIMEOUT` and collects one
+/// result per command.
+///
+/// The commands are driven concurrently through a `FuturesUnordered`, so the
+/// returned results are in completion order, not the order listed here.
+pub async fn health_check()
+-> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
+    [
+        uptime(),
+        kstat_low_page(),
+        svcs_enabled_but_not_running(),
+        count_disks(),
+        zfs_list_unmounted(),
+        count_crucibles(),
+        identify_datasets_close_to_quota(),
+        identify_datasets_with_less_than_300_gib_avail(),
+        dimm_check(),
+    ]
+    .into_iter()
+    .map(|c| async move {
+        execute_command_with_timeout(c, DEFAULT_TIMEOUT).await
+    })
+    .collect::<FuturesUnordered<_>>()
+    .collect::<Vec<Result<_, _>>>()
+    .await
+}
diff --git a/sled-diagnostics/src/queries.rs b/sled-diagnostics/src/queries.rs
index c7da33ecefb..c1452abbc4d 100644
--- a/sled-diagnostics/src/queries.rs
+++ b/sled-diagnostics/src/queries.rs
@@ -22,11 +22,14 @@ use crate::contract_stub::ContractError;
 
 const DLADM: &str = "/usr/sbin/dladm";
 const IPADM: &str = "/usr/sbin/ipadm";
+const KSTAT: &str = "/usr/bin/kstat";
 const NVMEADM: &str = "/usr/sbin/nvmeadm";
 const PFEXEC: &str = "/usr/bin/pfexec";
 const PFILES: &str = "/usr/bin/pfiles";
 const PSTACK: &str = "/usr/bin/pstack";
 const PARGS: &str = "/usr/bin/pargs";
+const SVCS: &str = "/usr/bin/svcs";
+const UPTIME: &str = "/usr/bin/uptime";
 const ZFS: &str = "/usr/sbin/zfs";
 const ZONEADM: &str = "/usr/sbin/zoneadm";
 const ZPOOL: &str = "/usr/sbin/zpool";
@@ -263,6 +266,120 @@ pub fn pfiles_process(pid: i32) -> Command {
     cmd
 }
 
+/// Builds the `uptime` command with a cleared environment.
+pub fn uptime() -> Command {
+    let mut cmd = std::process::Command::new(UPTIME);
+    cmd.env_clear();
+    cmd
+}
+
+/// Builds a `pfexec kstat -p` query for the
+/// `unix::system_pages:low_mem_scan` statistic.
+pub fn kstat_low_page() -> Command {
+    let mut cmd = std::process::Command::new(PFEXEC);
+    cmd.env_clear().arg(KSTAT).arg("-p").arg("unix::system_pages:low_mem_scan");
+    cmd
+}
+
+/// Builds `pfexec svcs -xZ`; going by the function name, this is meant to
+/// surface services that are enabled but not running.
+pub fn svcs_enabled_but_not_running() -> Command {
+    let mut cmd = std::process::Command::new(PFEXEC);
+    cmd.env_clear().arg(SVCS).arg("-xZ");
+    cmd
+}
+
+/// Builds a `bash` one-liner that prints `OK` when `diskinfo -pH` lists
+/// exactly 12 lines and `WARN` otherwise.
+pub fn count_disks() -> Command {
+    let mut cmd = std::process::Command::new("bash");
+    cmd.env_clear().args([
+        "-c",
+        "(pfexec diskinfo -pH | tee | wc -l | xargs | grep -x '12' > /dev/null) \
+            && echo 'OK: All expected disks found' \
+            || echo 'WARN: Unexpected number of physical disks (expected 12)'",
+    ]);
+    cmd
+}
+
+/// Builds a `bash` one-liner that prints any `oxp` dataset whose `mounted`
+/// column does not end in `yes`, followed by a `WARN` line, or `OK` if none.
+pub fn zfs_list_unmounted() -> Command {
+    let mut cmd = std::process::Command::new("bash");
+    cmd.env_clear().args([
+        "-c",
+        "pfexec zfs list -r -o name,mounted | grep oxp | grep -v yes$ \
+            && echo 'WARN: Found unmounted dataset(s)' \
+            || echo 'OK: No unmounted datasets'",
+    ]);
+    cmd
+}
+
+/// Builds a `bash` one-liner that counts `zoneadm list` entries matching
+/// `crucible` but not `pantry`, printing `OK` for exactly 10 and `WARN`
+/// otherwise.
+pub fn count_crucibles() -> Command {
+    let mut cmd = std::process::Command::new("bash");
+    cmd.env_clear()
+        .args([
+            "-c",
+            "(zoneadm list | grep crucible | grep -v pantry | tee | wc -l | xargs | grep -x '10' > /dev/null) \
+                && echo 'OK: 10 Crucibles found' \
+                || echo 'WARN: Unexpected number of crucible zones (expected 10)'"
+        ]);
+    cmd
+}
+
+/// Builds a `bash` one-liner that prints `oxp`/`oxi` datasets (skipping lines
+/// containing `none` or `crucible`) whose used/quota ratio is at least 0.8,
+/// followed by a `WARN` line, or `OK` if there are none.
+pub fn identify_datasets_close_to_quota() -> Command {
+    let mut cmd = std::process::Command::new("bash");
+    cmd.env_clear()
+        .args([
+            "-c",
+            "zfs list -Hp -o used,quota,name,avail,mountpoint | \
+                egrep 'oxp|oxi' | \
+                egrep -v 'none|crucible' | \
+                awk '$2 > 0 && $1 / $2 >= 0.8 { any=1; print } END { exit !any }' \
+                && echo 'WARN: Found near-quota datasets' \
+                || echo 'OK: No near-quota datasets found'"
+        ]);
+    cmd
+}
+
+/// Builds a `bash` one-liner that prints `oxp`/`oxi` datasets (skipping lines
+/// containing `none` or `crucible`) with under 300 GiB available, followed by
+/// a `WARN` line, or `OK` if there are none.
+pub fn identify_datasets_with_less_than_300_gib_avail() -> Command {
+    let mut cmd = std::process::Command::new("bash");
+    cmd.env_clear().args([
+        "-c",
+        "zfs list -Hp -o used,quota,name,avail,mountpoint | \
+            egrep 'oxp|oxi' | \
+            egrep -v 'none|crucible' | \
+            awk '$4 < (300 * (1024^3)) { any=1; print } END { exit !any }' \
+            && echo 'WARN: Found low-space datasets' \
+            || echo 'OK: No low-space datasets found'",
+    ]);
+    cmd
+}
+
+/// Builds a `bash` one-liner that prints any `prtconf -m` output line not
+/// containing `1036271` or `2084847`, followed by a `WARN` line, or `OK` if
+/// every line matches.
+pub fn dimm_check() -> Command {
+    let mut cmd = std::process::Command::new("bash");
+    cmd.env_clear().args([
+        "-c",
+        "prtconf -m | \
+            grep -v -e 1036271 -e 2084847 \
+            && echo 'WARN: Unexpected quantity of system memory' \
+            || echo 'OK: Found expected quantity of system memory'",
+    ]);
+    cmd
+}
+
 pub fn zfs_list() -> Command {
     let mut cmd = std::process::Command::new(PFEXEC);
     cmd.env_clear()