diff --git a/internal/rm/health.go b/internal/rm/health.go index a1989cd88..5dcc2570d 100644 --- a/internal/rm/health.go +++ b/internal/rm/health.go @@ -32,16 +32,18 @@ const ( // disabled entirely. If set, the envvar is treated as a comma-separated list of Xids to ignore. Note that // this is in addition to the Application errors that are already ignored. envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS" - allHealthChecks = "xids" + // envEnableHealthChecks defines the environment variable that is checked to + // determine which XIDs should be explicitly enabled. XIDs specified here + // override the ones specified in the `DP_DISABLE_HEALTHCHECKS`. + // Note that this also allows individual XIDs to be selected when ALL XIDs + // are disabled. + envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS" ) // CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error { - disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) - if disableHealthChecks == "all" { - disableHealthChecks = allHealthChecks - } - if strings.Contains(disableHealthChecks, "xids") { + xids := getHealthCheckXids() + if xids.IsAllDisabled() { return nil } @@ -59,26 +61,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic } }() - // FIXME: formalize the full list and document it. - // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 - // Application errors: the GPU should still be healthy - ignoredXids := []uint64{ - 13, // Graphics Engine Exception - 31, // GPU memory page fault - 43, // GPU stopped processing - 45, // Preemptive cleanup, due to previous errors - 68, // Video processor exception - 109, // Context Switch Timeout Error - } - - skippedXids := make(map[uint64]bool) - for _, id := range ignoredXids { - skippedXids[id] = true - } - - for _, additionalXid := range getAdditionalXids(disableHealthChecks) { - skippedXids[additionalXid] = true - } + klog.Infof("Using XIDs for health checks: %v", xids) eventSet, ret := r.nvml.EventSetCreate() if ret != nvml.SUCCESS { @@ -152,7 +135,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic continue } - if skippedXids[e.EventData] { + if xids.IsDisabled(e.EventData) { klog.Infof("Skipping event %+v", e) continue } @@ -188,29 +171,109 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic } } -// getAdditionalXids returns a list of additional Xids to skip from the specified string. -// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values -// are ignored. -func getAdditionalXids(input string) []uint64 { - if input == "" { - return nil +const allXIDs = 0 + +// disabledXIDs stores a map of explicitly disabled XIDs. +// The special XID `allXIDs` indicates that all XIDs are disabled, but does +// allow for specific XIDs to be enabled even if this is the case. +type disabledXIDs map[uint64]bool + +// Disabled returns whether XID-based health checks are disabled. +// These are considered if all XIDs have been disabled AND no other XIDs have +// been explcitly enabled. +func (h disabledXIDs) IsAllDisabled() bool { + if allDisabled, ok := h[allXIDs]; ok { + return allDisabled } + // At this point we wither have explicitly disabled XIDs or explicitly + // enabled XIDs. Since ANY XID that's not specified is assumed enabled, we + // return here. + return false +} - var additionalXids []uint64 - for _, additionalXid := range strings.Split(input, ",") { - trimmed := strings.TrimSpace(additionalXid) +// IsDisabled checks whether the specified XID has been explicitly disalbled. +// An XID is considered disabled if it has been explicitly disabled, or all XIDs +// have been disabled. +func (h disabledXIDs) IsDisabled(xid uint64) bool { + // Handle the case where enabled=all. + if explicitAll, ok := h[allXIDs]; ok && !explicitAll { + return false + } + // Handle the case where the XID has been specifically enabled (or disabled) + if disabled, ok := h[xid]; ok { + return disabled + } + return h.IsAllDisabled() +} + +// getHealthCheckXids returns the XIDs that are considered fatal. +// Here we combine the following (in order of precedence): +// * A list of explicitly disabled XIDs (including all XIDs) +// * A list of hardcoded disabled XIDs +// * A list of explicitly enabled XIDs (including all XIDs) +// +// Note that if an XID is explicitly enabled, this takes precedence over it +// having been disabled either explicitly or implicitly. +func getHealthCheckXids() disabledXIDs { + disabled := newHealthCheckXIDs( + // TODO: We should not read the envvar here directly, but instead + // "upgrade" this to a top-level config option. + strings.Split(strings.ToLower(os.Getenv(envDisableHealthChecks)), ",")..., + ) + enabled := newHealthCheckXIDs( + // TODO: We should not read the envvar here directly, but instead + // "upgrade" this to a top-level config option. + strings.Split(strings.ToLower(os.Getenv(envEnableHealthChecks)), ",")..., + ) + + // Add the list of hardcoded disabled (ignored) XIDs: + // FIXME: formalize the full list and document it. + // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 + // Application errors: the GPU should still be healthy + ignoredXids := []uint64{ + 13, // Graphics Engine Exception + 31, // GPU memory page fault + 43, // GPU stopped processing + 45, // Preemptive cleanup, due to previous errors + 68, // Video processor exception + 109, // Context Switch Timeout Error + } + for _, ignored := range ignoredXids { + disabled[ignored] = true + } + + // Explicitly ENABLE specific XIDs, + for enabled := range enabled { + disabled[enabled] = false + } + return disabled +} + +// newHealthCheckXIDs converts a list of Xids to a healthCheckXIDs map. +// Special xid values 'all' and 'xids' return a special map that matches all +// xids. +// For other xids, these are converted to a uint64 values with invalid values +// being ignored. +func newHealthCheckXIDs(xids ...string) disabledXIDs { + output := make(disabledXIDs) + for _, xid := range xids { + trimmed := strings.TrimSpace(xid) + if trimmed == "all" || trimmed == "xids" { + // TODO: We should have a different type for "all" and "all-except" + return disabledXIDs{allXIDs: true} + } if trimmed == "" { continue } - xid, err := strconv.ParseUint(trimmed, 10, 64) + id, err := strconv.ParseUint(trimmed, 10, 64) if err != nil { klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err) continue } - additionalXids = append(additionalXids, xid) - } - return additionalXids + output[id] = true + } + return output } // getDevicePlacement returns the placement of the specified device. diff --git a/internal/rm/health_test.go b/internal/rm/health_test.go index 101aadf78..f623c550f 100644 --- a/internal/rm/health_test.go +++ b/internal/rm/health_test.go @@ -18,57 +18,206 @@ package rm import ( "fmt" + "strings" "testing" "github.com/stretchr/testify/require" ) -func TestGetAdditionalXids(t *testing.T) { +func TestNewHealthCheckXIDs(t *testing.T) { testCases := []struct { input string - expected []uint64 + expected disabledXIDs }{ - {}, { - input: ",", + expected: disabledXIDs{}, }, { - input: "not-an-int", + input: ",", + expected: disabledXIDs{}, + }, + { + input: "not-an-int", + expected: disabledXIDs{}, }, { input: "68", - expected: []uint64{68}, + expected: disabledXIDs{68: true}, }, { - input: "-68", + input: "-68", + expected: disabledXIDs{}, }, { input: "68 ", - expected: []uint64{68}, + expected: disabledXIDs{68: true}, }, { input: "68,", - expected: []uint64{68}, + expected: disabledXIDs{68: true}, }, { input: ",68", - expected: []uint64{68}, + expected: disabledXIDs{68: true}, }, { input: "68,67", - expected: []uint64{68, 67}, + expected: disabledXIDs{67: true, 68: true}, }, { input: "68,not-an-int,67", - expected: []uint64{68, 67}, + expected: disabledXIDs{67: true, 68: true}, }, } for i, tc := range testCases { t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) { - xids := getAdditionalXids(tc.input) + xids := newHealthCheckXIDs(strings.Split(tc.input, ",")...) require.EqualValues(t, tc.expected, xids) }) } } + +func TestGetHealthCheckXids(t *testing.T) { + testCases := []struct { + description string + enabled string + disabled string + expectedAllDisabled bool + expectedContents disabledXIDs + expectedDisabled map[uint64]bool + }{ + { + description: "empty envvars are default disabled", + expectedAllDisabled: false, + expectedContents: disabledXIDs{ + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + }, + { + description: "disabled is all", + disabled: "all", + expectedAllDisabled: true, + expectedContents: disabledXIDs{ + 0: true, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + 555: true, + }, + }, + { + description: "disabled is xids", + disabled: "xids", + expectedAllDisabled: true, + expectedContents: disabledXIDs{ + 0: true, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + 555: true, + }, + }, + { + description: "enabled is all", + enabled: "all", + expectedAllDisabled: false, + expectedContents: disabledXIDs{ + 0: false, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 13: false, + 31: false, + 43: false, + 45: false, + 68: false, + 109: false, + 555: false, + }, + }, + { + description: "enabled overrides disabled", + disabled: "11", + enabled: "11", + expectedAllDisabled: false, + expectedContents: disabledXIDs{ + 11: false, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + }, + expectedDisabled: map[uint64]bool{ + 11: false, + 13: true, + 31: true, + 43: true, + 45: true, + 68: true, + 109: true, + 555: false, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + t.Setenv(envDisableHealthChecks, tc.disabled) + t.Setenv(envEnableHealthChecks, tc.enabled) + + xids := getHealthCheckXids() + require.EqualValues(t, tc.expectedContents, xids) + require.Equal(t, tc.expectedAllDisabled, xids.IsAllDisabled()) + + disabled := make(map[uint64]bool) + for xid := range tc.expectedDisabled { + disabled[xid] = xids.IsDisabled(xid) + } + require.Equal(t, tc.expectedDisabled, disabled) + }) + } +}