@@ -2,12 +2,32 @@ package core
22
33import (
44 "fmt"
5+ "strings"
56
67 "github.com/containers/kubernetes-mcp-server/pkg/api"
7- internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes"
88)
99
10- func initHealthCheckPrompts (o internalk8s.Openshift ) []api.ServerPrompt {
// Tunable values that are interpolated into the cluster health check prompt.
const (
	// defaultRestartThreshold is the container restart count above which the
	// prompt asks for a pod to be reported as a WARNING.
	defaultRestartThreshold = 5

	// eventLookbackMinutes is the size, in minutes, of the time window the
	// prompt uses when asking for recent Warning events.
	eventLookbackMinutes = 30

	// maxWarningEvents is the upper end of the "10-N most recent warnings"
	// range mentioned in the prompt.
	maxWarningEvents = 20
)
16+
// isVerboseEnabled reports whether value requests verbose output.
// It accepts "true", "1", "yes", or "y" as truthy values, ignoring letter case
// and any leading or trailing whitespace. Every other value, including the
// empty string, is treated as false.
func isVerboseEnabled(value string) bool {
	// The flag comes from a free-form string argument, so normalize it before
	// matching; otherwise inputs such as " true " or "yes\n" would be rejected.
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "true", "1", "yes", "y":
		return true
	default:
		return false
	}
}
27+
28+ // initHealthCheckPrompts creates prompts for cluster health diagnostics.
29+ // These prompts guide LLMs to systematically check cluster components using existing tools.
30+ func initHealthCheckPrompts () []api.ServerPrompt {
1131 return []api.ServerPrompt {
1232 {
1333 Name : "cluster_health_check" ,
@@ -25,7 +45,7 @@ func initHealthCheckPrompts(o internalk8s.Openshift) []api.ServerPrompt {
2545 },
2646 },
2747 GetMessages : func (arguments map [string ]string ) []api.PromptMessage {
28- verbose := arguments ["verbose" ] == "true"
48+ verbose := isVerboseEnabled ( arguments ["verbose" ])
2949 namespace := arguments ["namespace" ]
3050
3151 return buildHealthCheckPromptMessages (verbose , namespace )
@@ -34,13 +54,15 @@ func initHealthCheckPrompts(o internalk8s.Openshift) []api.ServerPrompt {
3454 }
3555}
3656
57+ // buildHealthCheckPromptMessages constructs the prompt messages for cluster health checks.
58+ // It adapts the instructions based on verbose mode and namespace filtering.
3759func buildHealthCheckPromptMessages (verbose bool , namespace string ) []api.PromptMessage {
3860 scopeMsg := "across all namespaces"
3961 podListInstruction := "- Use pods_list to get all pods"
4062
4163 if namespace != "" {
4264 scopeMsg = fmt .Sprintf ("in namespace '%s'" , namespace )
43- podListInstruction = fmt .Sprintf ("- Use pods_list_in_namespace with namespace parameter set to '%s' to get all pods in namespace '%s'" , namespace , namespace )
65+ podListInstruction = fmt .Sprintf ("- Use pods_list_in_namespace with namespace '%s'" , namespace )
4466 }
4567
4668 verboseMsg := ""
@@ -52,14 +74,17 @@ func buildHealthCheckPromptMessages(verbose bool, namespace string) []api.Prompt
5274 "- Event messages and timestamps"
5375 }
5476
77+ // Construct the event display range dynamically using maxWarningEvents
78+ eventDisplayRange := fmt .Sprintf ("10-%d" , maxWarningEvents )
79+
5580 userMessage := fmt .Sprintf (`Please perform a comprehensive health check on the Kubernetes cluster %s.
5681
5782Follow these steps systematically:
5883
5984## 1. Check Cluster-Level Components
6085
6186### For OpenShift Clusters:
62- - Use resources_list with kind=ClusterOperator to check cluster operator health
87+ - Use resources_list with apiVersion=config.openshift.io/v1 and kind=ClusterOperator to check cluster operator health
6388- Look for operators with:
6489 * Degraded=True (CRITICAL)
6590 * Available=False (CRITICAL)
@@ -70,7 +95,7 @@ Follow these steps systematically:
7095- Note the cluster type in your report
7196
7297## 2. Check Node Health
73- - Use resources_list with kind=Node to examine all nodes
98+ - Use resources_list with apiVersion=v1 and kind=Node to examine all nodes
7499- Check each node for:
75100 * Ready condition != True (CRITICAL)
76101 * Unschedulable spec field = true (WARNING)
@@ -84,30 +109,30 @@ Follow these steps systematically:
84109 * Container state waiting with reason:
85110 - CrashLoopBackOff (CRITICAL)
86111 - ImagePullBackOff or ErrImagePull (CRITICAL)
87- * RestartCount > 5 (WARNING - configurable threshold)
112+ * RestartCount > %d (WARNING - configurable threshold)
88113- Group issues by type and count occurrences
89114
90115## 4. Check Workload Controllers
91116- Use resources_list for each workload type:
92- * kind=Deployment ( apps/v1)
93- * kind=StatefulSet ( apps/v1)
94- * kind=DaemonSet ( apps/v1)
117+ * apiVersion= apps/v1, kind=Deployment
118+ * apiVersion= apps/v1, kind=StatefulSet
119+ * apiVersion= apps/v1, kind=DaemonSet
95120- For each controller, compare:
96121 * spec.replicas vs status.readyReplicas (Deployment/StatefulSet)
97122 * status.desiredNumberScheduled vs status.numberReady (DaemonSet)
98123 * Report mismatches as WARNINGs
99124
100125## 5. Check Storage
101- - Use resources_list with kind=PersistentVolumeClaim
126+ - Use resources_list with apiVersion=v1 and kind=PersistentVolumeClaim
102127- Identify PVCs not in Bound phase (WARNING)
103128- Note namespace and PVC name for each issue
104129
105130## 6. Check Recent Events (Optional)
106131- Use events_list to get cluster events
107132- Filter for:
108133 * Type = Warning
109- * Timestamp within last 30 minutes
110- - Limit to 10-20 most recent warnings
134+ * Timestamp within last %d minutes
135+ - Limit to %s most recent warnings
111136- Include event message and involved object%s
112137
113138## Output Format
@@ -139,7 +164,7 @@ Scope: [all namespaces / specific namespace]
139164[PVC status: total, bound, pending/other]
140165
141166### Recent Events
142- [Warning events from last 30 minutes]
167+ [Warning events from last %d minutes]
143168
144169================================================
145170Summary
@@ -162,7 +187,18 @@ Warnings: [count]
162187- Be efficient: don't call the same tool multiple times unnecessarily
163188- If a resource type doesn't exist (e.g., ClusterOperator on vanilla K8s), skip it gracefully
164189- Provide clear, actionable insights in your summary
165- - Use emojis for visual clarity: ✅ (healthy), ⚠️ (warning), ❌ (critical)` , scopeMsg , podListInstruction , verboseMsg )
190+ - Use emojis for visual clarity: ✅ (healthy), ⚠️ (warning), ❌ (critical)
191+
192+ ### Common apiVersion Values
193+
194+ When using resources_list, specify the correct apiVersion for each resource type:
195+ - Core resources: apiVersion=v1 (Pod, Service, Node, PersistentVolumeClaim, ConfigMap, Secret, Namespace)
196+ - Apps: apiVersion=apps/v1 (Deployment, StatefulSet, DaemonSet, ReplicaSet)
197+ - Batch: apiVersion=batch/v1 (Job, CronJob)
198+ - RBAC: apiVersion=rbac.authorization.k8s.io/v1 (Role, RoleBinding, ClusterRole, ClusterRoleBinding)
199+ - Networking: apiVersion=networking.k8s.io/v1 (Ingress, NetworkPolicy)
200+ - OpenShift Config: apiVersion=config.openshift.io/v1 (ClusterOperator, ClusterVersion)
201+ - OpenShift Routes: apiVersion=route.openshift.io/v1 (Route)` , scopeMsg , podListInstruction , defaultRestartThreshold , eventLookbackMinutes , eventDisplayRange , verboseMsg , eventLookbackMinutes )
166202
167203 assistantMessage := `I'll perform a comprehensive cluster health check following the systematic approach outlined. Let me start by gathering information about the cluster components.`
168204
0 commit comments