Skip to content

Commit 268ec6e

Browse files
Add support for explicitly enabling XIDs in health checks
This change adds support for users to explicitly set specific XIDs as fatal errors that cause a device to be marked as unhealth. When used in conjunction with the ability to ignore ALL XIDs, the set of XIDs that are fatal can be set. To do this, an environment variable DP_ENABLE_HEALTHCHECKS that mirrors the existing DP_DISABLE_HEALTHCHECKS envvar is added. Explicitly enabling an XID overrides disabling them. Signed-off-by: Robert Smith <[email protected]> Co-Authored-by: Evan Lezar <[email protected]>
1 parent dd6edf8 commit 268ec6e

File tree

2 files changed

+265
-53
lines changed

2 files changed

+265
-53
lines changed

internal/rm/health.go

Lines changed: 103 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,18 @@ const (
3232
// disabled entirely. If set, the envvar is treated as a comma-separated list of Xids to ignore. Note that
3333
// this is in addition to the Application errors that are already ignored.
3434
envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS"
35-
allHealthChecks = "xids"
35+
// envEnableHealthChecks defines the environment variable that is checked to
36+
// determine which XIDs should be explicitly enabled. XIDs specified here
37+
// override the ones specified in the `DP_DISABLE_HEALTHCHECKS`.
38+
// Note that this also allows individual XIDs to be selected when ALL XIDs
39+
// are disabled.
40+
envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS"
3641
)
3742

3843
// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
3944
func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error {
40-
disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
41-
if disableHealthChecks == "all" {
42-
disableHealthChecks = allHealthChecks
43-
}
44-
if strings.Contains(disableHealthChecks, "xids") {
45+
xids := getHealthCheckXids()
46+
if xids.IsAllDisabled() {
4547
return nil
4648
}
4749

@@ -59,26 +61,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
5961
}
6062
}()
6163

62-
// FIXME: formalize the full list and document it.
63-
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
64-
// Application errors: the GPU should still be healthy
65-
ignoredXids := []uint64{
66-
13, // Graphics Engine Exception
67-
31, // GPU memory page fault
68-
43, // GPU stopped processing
69-
45, // Preemptive cleanup, due to previous errors
70-
68, // Video processor exception
71-
109, // Context Switch Timeout Error
72-
}
73-
74-
skippedXids := make(map[uint64]bool)
75-
for _, id := range ignoredXids {
76-
skippedXids[id] = true
77-
}
78-
79-
for _, additionalXid := range getAdditionalXids(disableHealthChecks) {
80-
skippedXids[additionalXid] = true
81-
}
64+
klog.Infof("Using XIDs for health checks: %v", xids)
8265

8366
eventSet, ret := r.nvml.EventSetCreate()
8467
if ret != nvml.SUCCESS {
@@ -152,7 +135,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
152135
continue
153136
}
154137

155-
if skippedXids[e.EventData] {
138+
if xids.IsDisabled(e.EventData) {
156139
klog.Infof("Skipping event %+v", e)
157140
continue
158141
}
@@ -188,29 +171,109 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
188171
}
189172
}
190173

191-
// getAdditionalXids returns a list of additional Xids to skip from the specified string.
192-
// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values
193-
// are ignored.
194-
func getAdditionalXids(input string) []uint64 {
195-
if input == "" {
196-
return nil
174+
const allXIDs = 0
175+
176+
// disabledXIDs stores a map of explicitly disabled XIDs.
177+
// The special XID `allXIDs` indicates that all XIDs are disabled, but does
178+
// allow for specific XIDs to be enabled even if this is the case.
179+
type disabledXIDs map[uint64]bool
180+
181+
// Disabled returns whether XID-based health checks are disabled.
182+
// These are considered if all XIDs have been disabled AND no other XIDs have
183+
// been explcitly enabled.
184+
func (h disabledXIDs) IsAllDisabled() bool {
185+
if allDisabled, ok := h[allXIDs]; ok {
186+
return allDisabled
197187
}
188+
// At this point we wither have explicitly disabled XIDs or explicitly
189+
// enabled XIDs. Since ANY XID that's not specified is assumed enabled, we
190+
// return here.
191+
return false
192+
}
198193

199-
var additionalXids []uint64
200-
for _, additionalXid := range strings.Split(input, ",") {
201-
trimmed := strings.TrimSpace(additionalXid)
194+
// IsDisabled checks whether the specified XID has been explicitly disalbled.
195+
// An XID is considered disabled if it has been explicitly disabled, or all XIDs
196+
// have been disabled.
197+
func (h disabledXIDs) IsDisabled(xid uint64) bool {
198+
// Handle the case where enabled=all.
199+
if explicitAll, ok := h[allXIDs]; ok && !explicitAll {
200+
return false
201+
}
202+
// Handle the case where the XID has been specifically enabled (or disabled)
203+
if disabled, ok := h[xid]; ok {
204+
return disabled
205+
}
206+
return h.IsAllDisabled()
207+
}
208+
209+
// getHealthCheckXids returns the XIDs that are considered fatal.
210+
// Here we combine the following (in order of precedence):
211+
// * A list of explicitly disabled XIDs (including all XIDs)
212+
// * A list of hardcoded disabled XIDs
213+
// * A list of explicitly enabled XIDs (including all XIDs)
214+
//
215+
// Note that if an XID is explicitly enabled, this takes precedence over it
216+
// having been disabled either explicitly or implicitly.
217+
func getHealthCheckXids() disabledXIDs {
218+
disabled := newHealthCheckXIDs(
219+
// TODO: We should not read the envvar here directly, but instead
220+
// "upgrade" this to a top-level config option.
221+
strings.Split(strings.ToLower(os.Getenv(envDisableHealthChecks)), ",")...,
222+
)
223+
enabled := newHealthCheckXIDs(
224+
// TODO: We should not read the envvar here directly, but instead
225+
// "upgrade" this to a top-level config option.
226+
strings.Split(strings.ToLower(os.Getenv(envEnableHealthChecks)), ",")...,
227+
)
228+
229+
// Add the list of hardcoded disabled (ignored) XIDs:
230+
// FIXME: formalize the full list and document it.
231+
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
232+
// Application errors: the GPU should still be healthy
233+
ignoredXids := []uint64{
234+
13, // Graphics Engine Exception
235+
31, // GPU memory page fault
236+
43, // GPU stopped processing
237+
45, // Preemptive cleanup, due to previous errors
238+
68, // Video processor exception
239+
109, // Context Switch Timeout Error
240+
}
241+
for _, ignored := range ignoredXids {
242+
disabled[ignored] = true
243+
}
244+
245+
// Explicitly ENABLE specific XIDs,
246+
for enabled := range enabled {
247+
disabled[enabled] = false
248+
}
249+
return disabled
250+
}
251+
252+
// newHealthCheckXIDs converts a list of Xids to a healthCheckXIDs map.
253+
// Special xid values 'all' and 'xids' return a special map that matches all
254+
// xids.
255+
// For other xids, these are converted to a uint64 values with invalid values
256+
// being ignored.
257+
func newHealthCheckXIDs(xids ...string) disabledXIDs {
258+
output := make(disabledXIDs)
259+
for _, xid := range xids {
260+
trimmed := strings.TrimSpace(xid)
261+
if trimmed == "all" || trimmed == "xids" {
262+
// TODO: We should have a different type for "all" and "all-except"
263+
return disabledXIDs{allXIDs: true}
264+
}
202265
if trimmed == "" {
203266
continue
204267
}
205-
xid, err := strconv.ParseUint(trimmed, 10, 64)
268+
id, err := strconv.ParseUint(trimmed, 10, 64)
206269
if err != nil {
207270
klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err)
208271
continue
209272
}
210-
additionalXids = append(additionalXids, xid)
211-
}
212273

213-
return additionalXids
274+
output[id] = true
275+
}
276+
return output
214277
}
215278

216279
// getDevicePlacement returns the placement of the specified device.

0 commit comments

Comments
 (0)