Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions ibm_spectrum_lsf/assets/monitors/high_pending_jobs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{
"version": 2,
"created_at": "2025-11-24",
"last_updated_at": "2025-11-24",
"title": "Number of Pending Jobs is Higher than Normal",
"description": "A high number of pending jobs in your IBM Spectrum LSF cluster could signify a lack of available resources, such as CPU and GPU slots, indicating the need to scale up your cluster.",
"definition": {
"id": 236603394,
"name": "[IBM Spectrum LSF] Number of Pending Jobs is Higher than Normal",
"type": "query alert",
"query": "avg(last_1d):anomalies(sum:ibm_spectrum_lsf.queue.pending{*} by {queue_name,lsf_cluster_name}, 'basic', 2, direction='above', interval=300, alert_window='last_1h', count_default_zero='true') >= 1",
"message": "{{#is_alert}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is higher than normal. \n{{/is_alert}}\n\n{{#is_warning}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is higher than normal.\n{{/is_warning}}\n\n{{#is_recovery}}\nNumber of Pending Jobs in IBM Spectrum LSF cluster {{lsf_cluster_name.name}} is back to normal.\n{{/is_recovery}}",
"tags": [],
"options": {
"thresholds": {
"critical": 1,
"critical_recovery": 0,
"warning": 0.8
},
"notify_audit": false,
"require_full_window": false,
"renotify_interval": 0,
"threshold_windows": {
"trigger_window": "last_1h",
"recovery_window": "last_15m"
},
"include_tags": true,
"on_missing_data": "default",
"new_group_delay": 60,
"avalanche_window": 20,
"silenced": {}
},
"priority": null,
"restriction_policy": {
"bindings": []
}
},
"tags": [
"integration:ibm-spectrum-lsf"
]
}
33 changes: 33 additions & 0 deletions ibm_spectrum_lsf/assets/monitors/no_gpu_slots_available.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"version": 2,
"created_at": "2025-11-24",
"last_updated_at": "2025-11-24",
"title": "No GPU Slots Available",
"description": "In IBM Spectrum LSF, if there are shared available GPUs, jobs that require GPU resources but don't request an exclusive GPU or multiple GPUs will be able to run.",
"definition": {
"id": 236592543,
"name": "[IBM Spectrum LSF] No GPU Slots Available",
"type": "query alert",
"query": "avg(last_5m):sum:ibm_spectrum_lsf.server.gpu.num_gpus_shared_available{*} by {lsf_cluster_name} <= 0",
"message": "{{#is_alert}}There are no GPU slots available to be shared with other jobs on cluster {{lsf_cluster_name.name}}. This means that any IBM Spectrum LSF job submitted now that requires GPU resources will not be able to run until the current jobs that are using GPUs complete. {{/is_alert}}\n\n{{#is_recovery}}\nThere are now available GPU slots on cluster {{lsf_cluster_name.name}}. \n{{/is_recovery}}",
"tags": [],
"options": {
"thresholds": {
"critical": 0
},
"notify_audit": false,
"on_missing_data": "show_no_data",
"include_tags": true,
"new_group_delay": 60,
"avalanche_window": 10,
"silenced": {}
},
"priority": null,
"restriction_policy": {
"bindings": []
}
},
"tags": [
"integration:ibm-spectrum-lsf"
]
}
6 changes: 5 additions & 1 deletion ibm_spectrum_lsf/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,11 @@
"check": "ibm_spectrum_lsf.can_connect",
"metadata_path": "metadata.csv"
}
}
},
"monitors": {
"No GPU Slots Available": "assets/monitors/no_gpu_slots_available.json",
"High Pending Jobs": "assets/monitors/high_pending_jobs.json"
}
},
"author": {
"support_email": "[email protected]",
Expand Down
Loading