diff --git a/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json b/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json new file mode 100644 index 0000000000000..5f5ec665e1a2c --- /dev/null +++ b/ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json @@ -0,0 +1,41 @@ +{ + "version": 2, + "created_at": "2025-11-24", + "last_updated_at": "2025-11-24", + "title": "Number of Pending Jobs is Higher than Normal", + "description": "A high number of pending jobs in your IBM Spectrum LSF cluster could signify a lack of available resources such as CPU and GPU slots, indicating the need to scale up your cluster.", + "definition": { + "id": 236603394, + "name": "[IBM Spectrum LSF] Number of Pending Jobs is Higher than Normal", + "type": "query alert", + "query": "avg(last_1d):anomalies(sum:ibm_spectrum_lsf.queue.pending{*} by {queue_name,lsf_cluster_name}, 'basic', 2, direction='above', interval=300, alert_window='last_1h', count_default_zero='true') >= 1", + "message": "{{#is_alert}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is higher than normal. 
\n{{/is_alert}}\n\n{{#is_warning}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is higher than normal.\n{{/is_warning}}\n\n{{#is_recovery}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is back to normal.\n{{/is_recovery}}", + "tags": [], + "options": { + "thresholds": { + "critical": 1, + "critical_recovery": 0, + "warning": 0.8 + }, + "notify_audit": false, + "require_full_window": false, + "renotify_interval": 0, + "threshold_windows": { + "trigger_window": "last_1h", + "recovery_window": "last_15m" + }, + "include_tags": true, + "on_missing_data": "default", + "new_group_delay": 60, + "avalanche_window": 20, + "silenced": {} + }, + "priority": null, + "restriction_policy": { + "bindings": [] + } + }, + "tags": [ + "integration:ibm-spectrum-lsf" + ] +} \ No newline at end of file diff --git a/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json b/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json new file mode 100644 index 0000000000000..82df777535661 --- /dev/null +++ b/ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json @@ -0,0 +1,33 @@ +{ + "version": 2, + "created_at": "2025-11-24", + "last_updated_at": "2025-11-24", + "title": "No GPU Slots Available", + "description": "In IBM Spectrum LSF, if there are shared available GPUs, jobs that require GPU resources but don't request an exclusive GPU or multiple GPUs will be able to run.", + "definition": { + "id": 236592543, + "name": "[IBM Spectrum LSF] No GPU Slots Available", + "type": "query alert", + "query": "avg(last_5m):sum:ibm_spectrum_lsf.server.gpu.num_gpus_shared_available{*} by {lsf_cluster_name} <= 0", + "message": "{{#is_alert}}There are no GPU slots available to be shared with other jobs on cluster {{lsf_cluster_name.name}}. This means that any IBM Spectrum LSF job that requests a shared GPU will not be able to run until the current jobs that are using GPUs complete. 
{{/is_alert}}\n\n{{#is_recovery}}\nThere are now available GPU slots on cluster {{lsf_cluster_name.name}}. \n{{/is_recovery}}", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": false, + "on_missing_data": "show_no_data", + "include_tags": true, + "new_group_delay": 60, + "avalanche_window": 10, + "silenced": {} + }, + "priority": null, + "restriction_policy": { + "bindings": [] + } + }, + "tags": [ + "integration:ibm-spectrum-lsf" + ] +} \ No newline at end of file diff --git a/ibm_spectrum_lsf/manifest.json b/ibm_spectrum_lsf/manifest.json index 324af7c1e428f..ee033a15c7f4c 100644 --- a/ibm_spectrum_lsf/manifest.json +++ b/ibm_spectrum_lsf/manifest.json @@ -37,7 +37,11 @@ "check": "ibm_spectrum_lsf.can_connect", "metadata_path": "metadata.csv" } - } + }, + "monitors": { + "No GPU Slots Available": "assets/monitors/no_gpu_slots_available.json", + "High Pending Jobs": "assets/monitors/high_pending_jobs.json" + } }, "author": { "support_email": "help@datadoghq.com",