Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 103 additions & 40 deletions internal/rm/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,18 @@ const (
// disabled entirely. If set, the envvar is treated as a comma-separated list of Xids to ignore. Note that
// this is in addition to the Application errors that are already ignored.
envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS"
allHealthChecks = "xids"
// envEnableHealthChecks defines the environment variable that is checked to
// determine which XIDs should be explicitly enabled. XIDs specified here
// override the ones specified in the `DP_DISABLE_HEALTHCHECKS`.
// Note that this also allows individual XIDs to be selected when ALL XIDs
// are disabled.
envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS"
)

// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error {
disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
if disableHealthChecks == "all" {
disableHealthChecks = allHealthChecks
}
if strings.Contains(disableHealthChecks, "xids") {
xids := getHealthCheckXids()
if xids.IsAllDisabled() {
return nil
}

Expand All @@ -59,26 +61,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
}
}()

// FIXME: formalize the full list and document it.
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
// Application errors: the GPU should still be healthy
ignoredXids := []uint64{
13, // Graphics Engine Exception
31, // GPU memory page fault
43, // GPU stopped processing
45, // Preemptive cleanup, due to previous errors
68, // Video processor exception
109, // Context Switch Timeout Error
}

skippedXids := make(map[uint64]bool)
for _, id := range ignoredXids {
skippedXids[id] = true
}

for _, additionalXid := range getAdditionalXids(disableHealthChecks) {
skippedXids[additionalXid] = true
}
klog.Infof("Using XIDs for health checks: %v", xids)

eventSet, ret := r.nvml.EventSetCreate()
if ret != nvml.SUCCESS {
Expand Down Expand Up @@ -152,7 +135,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
continue
}

if skippedXids[e.EventData] {
if xids.IsDisabled(e.EventData) {
klog.Infof("Skipping event %+v", e)
continue
}
Expand Down Expand Up @@ -188,29 +171,109 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
}
}

// getAdditionalXids returns a list of additional Xids to skip from the specified string.
// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values
// are ignored.
func getAdditionalXids(input string) []uint64 {
if input == "" {
return nil
const allXIDs = 0

// disabledXIDs stores a map of explicitly disabled XIDs.
// The special XID `allXIDs` indicates that all XIDs are disabled, but does
// allow for specific XIDs to be enabled even if this is the case.
type disabledXIDs map[uint64]bool

// Disabled returns whether XID-based health checks are disabled.
// These are considered if all XIDs have been disabled AND no other XIDs have
// been explcitly enabled.
func (h disabledXIDs) IsAllDisabled() bool {
if allDisabled, ok := h[allXIDs]; ok {
return allDisabled
}
// At this point we wither have explicitly disabled XIDs or explicitly
// enabled XIDs. Since ANY XID that's not specified is assumed enabled, we
// return here.
return false
}

var additionalXids []uint64
for _, additionalXid := range strings.Split(input, ",") {
trimmed := strings.TrimSpace(additionalXid)
// IsDisabled checks whether the specified XID has been explicitly disalbled.
// An XID is considered disabled if it has been explicitly disabled, or all XIDs
// have been disabled.
func (h disabledXIDs) IsDisabled(xid uint64) bool {
// Handle the case where enabled=all.
if explicitAll, ok := h[allXIDs]; ok && !explicitAll {
return false
}
// Handle the case where the XID has been specifically enabled (or disabled)
if disabled, ok := h[xid]; ok {
return disabled
}
return h.IsAllDisabled()
}

// getHealthCheckXids returns the XIDs that are considered fatal.
// Here we combine the following (in order of precedence):
// * A list of explicitly disabled XIDs (including all XIDs)
// * A list of hardcoded disabled XIDs
// * A list of explicitly enabled XIDs (including all XIDs)
//
// Note that if an XID is explicitly enabled, this takes precedence over it
// having been disabled either explicitly or implicitly.
func getHealthCheckXids() disabledXIDs {
disabled := newHealthCheckXIDs(
// TODO: We should not read the envvar here directly, but instead
// "upgrade" this to a top-level config option.
strings.Split(strings.ToLower(os.Getenv(envDisableHealthChecks)), ",")...,
)
enabled := newHealthCheckXIDs(
// TODO: We should not read the envvar here directly, but instead
// "upgrade" this to a top-level config option.
strings.Split(strings.ToLower(os.Getenv(envEnableHealthChecks)), ",")...,
)

// Add the list of hardcoded disabled (ignored) XIDs:
// FIXME: formalize the full list and document it.
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
// Application errors: the GPU should still be healthy
ignoredXids := []uint64{
13, // Graphics Engine Exception
31, // GPU memory page fault
43, // GPU stopped processing
45, // Preemptive cleanup, due to previous errors
68, // Video processor exception
109, // Context Switch Timeout Error
}
for _, ignored := range ignoredXids {
disabled[ignored] = true
}

// Explicitly ENABLE specific XIDs,
for enabled := range enabled {
disabled[enabled] = false
}
return disabled
}

// newHealthCheckXIDs converts a list of Xids to a healthCheckXIDs map.
// Special xid values 'all' and 'xids' return a special map that matches all
// xids.
// For other xids, these are converted to a uint64 values with invalid values
// being ignored.
func newHealthCheckXIDs(xids ...string) disabledXIDs {
output := make(disabledXIDs)
for _, xid := range xids {
trimmed := strings.TrimSpace(xid)
if trimmed == "all" || trimmed == "xids" {
// TODO: We should have a different type for "all" and "all-except"
return disabledXIDs{allXIDs: true}
}
if trimmed == "" {
continue
}
xid, err := strconv.ParseUint(trimmed, 10, 64)
id, err := strconv.ParseUint(trimmed, 10, 64)
if err != nil {
klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err)
continue
}
additionalXids = append(additionalXids, xid)
}

return additionalXids
output[id] = true
}
return output
}

// getDevicePlacement returns the placement of the specified device.
Expand Down
Loading