From 38ba45026a323cded07810442d6b9e28443168e3 Mon Sep 17 00:00:00 2001 From: Sarah Witt Date: Mon, 24 Nov 2025 13:13:23 -0500 Subject: [PATCH 1/3] Add recommended monitors --- .../assets/monitors/high_pending_jobs.json | 41 +++++++++++++++++++ .../monitors/no_gpu_slots_available.json | 33 +++++++++++++++ ibm_spectrum_lsf/manifest.json | 4 ++ 3 files changed, 78 insertions(+) create mode 100644 ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json create mode 100644 ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json diff --git a/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json b/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json new file mode 100644 index 0000000000000..071c149b808f7 --- /dev/null +++ b/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json @@ -0,0 +1,41 @@ +{ + "version": 2, + "created_at": "2025-11-13T16:53:03.680-05:00", + "last_updated_at": "2025-11-13T16:53:03.680-05:00", + "title": "Number of Pending Jobs is Higher than Normal", + "description": "A high amount of pending jobs in your IBM Spectrum LSF cluster could signify a lack of resources such as CPU and GPU slots available, indicating the need to scale up your cluster.", + "definition": { + "id": 236603394, + "name": "[IBM Spectrum LSF] Number of Pending Jobs is Higher than Normal", + "type": "query alert", + "query": "avg(last_1d):anomalies(sum:ibm_spectrum_lsf.queue.pending{*} by {queue_name,lsf_cluster_name}, 'basic', 2, direction='above', interval=300, alert_window='last_1h', count_default_zero='true') >= 1", + "message": "{{#is_alert}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is higher than normal. 
\n{{/is_alert}}\n\n{{#is_warning}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is higher than normal.\n{{/is_warning}}\n\n{{#is_recovery}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is back to normal.\n{{/is_recovery}}", + "tags": [], + "options": { + "thresholds": { + "critical": 1, + "critical_recovery": 0, + "warning": 0.8 + }, + "notify_audit": false, + "require_full_window": false, + "renotify_interval": 0, + "threshold_windows": { + "trigger_window": "last_1h", + "recovery_window": "last_15m" + }, + "include_tags": true, + "on_missing_data": "default", + "new_group_delay": 60, + "avalanche_window": 20, + "silenced": {} + }, + "priority": null, + "restriction_policy": { + "bindings": [] + } + }, + "tags": [ + "integration:ibm-spectrum-lsf" + ] +} \ No newline at end of file diff --git a/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json b/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json new file mode 100644 index 0000000000000..1bd1580774976 --- /dev/null +++ b/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json @@ -0,0 +1,33 @@ +{ + "version": 2, + "created_at": "2025-11-13T16:07:02.807-05:00", + "last_updated_at": "2025-11-13T16:07:02.807-05:00", + "title": "No GPU Slots Available", + "description": "In IBM Spectrum LSF, if there are shared available GPUs, jobs that require GPU resources but don't request an exclusive GPU or multiple GPUs will be able to run.", + "definition": { + "id": 236592543, + "name": "[IBM Spectrum LSF] No GPU Slots Available", + "type": "query alert", + "query": "avg(last_5m):sum:ibm_spectrum_lsf.server.gpu.num_gpus_shared_available{*} by {lsf_cluster_name} <= 0", + "message": "{{#is_alert}}There are no GPU Slots available to be shared with other jobs on cluster {{lsf_cluster_name.name}}. This means that any IBM Spectrum LSF job submitted now will not be able to run until the current jobs that are using GPUs complete. 
{{/is_alert}}\n\n{{#is_recovery}}\nThere are now available GPU slots on cluster {{lsf_cluster_name.name}}. \n{{/is_recovery}}", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": false, + "on_missing_data": "show_no_data", + "include_tags": true, + "new_group_delay": 60, + "avalanche_window": 10, + "silenced": {} + }, + "priority": null, + "restriction_policy": { + "bindings": [] + } + }, + "tags": [ + "integration:ibm-spectrum-lsf" + ] +} \ No newline at end of file diff --git a/ibm_spectrum_lsf/manifest.json b/ibm_spectrum_lsf/manifest.json index 324af7c1e428f..dadab725e66fd 100644 --- a/ibm_spectrum_lsf/manifest.json +++ b/ibm_spectrum_lsf/manifest.json @@ -36,6 +36,10 @@ "prefix": "ibm_spectrum_lsf.", "check": "ibm_spectrum_lsf.can_connect", "metadata_path": "metadata.csv" + }, + "monitors": { + "No GPU Slots Available": "assets/monitors/no_gpu_slots_available.json", + "High Pending Jobs": "assets/monitors/high_pending_jobs.json" } } }, From 8f4ae808458b6f84a9280f8c7260696978a483ab Mon Sep 17 00:00:00 2001 From: Sarah Witt Date: Mon, 24 Nov 2025 13:21:56 -0500 Subject: [PATCH 2/3] fix manifest --- ibm_spectrum_lsf/manifest.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ibm_spectrum_lsf/manifest.json b/ibm_spectrum_lsf/manifest.json index dadab725e66fd..ee033a15c7f4c 100644 --- a/ibm_spectrum_lsf/manifest.json +++ b/ibm_spectrum_lsf/manifest.json @@ -36,12 +36,12 @@ "prefix": "ibm_spectrum_lsf.", "check": "ibm_spectrum_lsf.can_connect", "metadata_path": "metadata.csv" - }, - "monitors": { + } + }, + "monitors": { "No GPU Slots Available": "assets/monitors/no_gpu_slots_available.json", "High Pending Jobs": "assets/monitors/high_pending_jobs.json" } - } }, "author": { "support_email": "help@datadoghq.com", From 4f9c403087da06822957860f56f25af474780218 Mon Sep 17 00:00:00 2001 From: Sarah Witt Date: Mon, 24 Nov 2025 14:45:13 -0500 Subject: [PATCH 3/3] update dates --- 
ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json | 4 ++-- ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json b/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json index 071c149b808f7..5f5ec665e1a2c 100644 --- a/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json +++ b/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json @@ -1,7 +1,7 @@ { "version": 2, - "created_at": "2025-11-13T16:53:03.680-05:00", - "last_updated_at": "2025-11-13T16:53:03.680-05:00", + "created_at": "2025-11-24", + "last_updated_at": "2025-11-24", "title": "Number of Pending Jobs is Higher than Normal", "description": "A high amount of pending jobs in your IBM Spectrum LSF cluster could signify a lack of resources such as CPU and GPU slots available, indicating the need to scale up your cluster.", "definition": { diff --git a/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json b/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json index 1bd1580774976..82df777535661 100644 --- a/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json +++ b/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json @@ -1,7 +1,7 @@ { "version": 2, - "created_at": "2025-11-13T16:07:02.807-05:00", - "last_updated_at": "2025-11-13T16:07:02.807-05:00", + "created_at": "2025-11-24", + "last_updated_at": "2025-11-24", "title": "No GPU Slots Available", "description": "In IBM Spectrum LSF, if there are shared available GPUs, jobs that require GPU resources but don't request an exclusive GPU or multiple GPUs will be able to run.", "definition": {