@@ -32,16 +32,18 @@ const (
3232 // disabled entirely. If set, the envvar is treated as a comma-separated list of Xids to ignore. Note that
3333 // this is in addition to the Application errors that are already ignored.
3434 envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS"
35- allHealthChecks = "xids"
35+ // envEnableHealthChecks defines the environment variable that is checked to
36+ // determine which XIDs should be explicitly enabled. XIDs specified here
37+ // override the ones specified in the `DP_DISABLE_HEALTHCHECKS`.
38+ // Note that this also allows individual XIDs to be selected when ALL XIDs
39+ // are disabled.
40+ envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS"
3641)
3742
3843// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
3944func (r * nvmlResourceManager ) checkHealth (stop <- chan interface {}, devices Devices , unhealthy chan <- * Device ) error {
40- disableHealthChecks := strings .ToLower (os .Getenv (envDisableHealthChecks ))
41- if disableHealthChecks == "all" {
42- disableHealthChecks = allHealthChecks
43- }
44- if strings .Contains (disableHealthChecks , "xids" ) {
45+ xids := getHealthCheckXids ()
46+ if xids .IsAllDisabled () {
4547 return nil
4648 }
4749
@@ -59,26 +61,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
5961 }
6062 }()
6163
62- // FIXME: formalize the full list and document it.
63- // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
64- // Application errors: the GPU should still be healthy
65- ignoredXids := []uint64 {
66- 13 , // Graphics Engine Exception
67- 31 , // GPU memory page fault
68- 43 , // GPU stopped processing
69- 45 , // Preemptive cleanup, due to previous errors
70- 68 , // Video processor exception
71- 109 , // Context Switch Timeout Error
72- }
73-
74- skippedXids := make (map [uint64 ]bool )
75- for _ , id := range ignoredXids {
76- skippedXids [id ] = true
77- }
78-
79- for _ , additionalXid := range getAdditionalXids (disableHealthChecks ) {
80- skippedXids [additionalXid ] = true
81- }
64+ klog .Infof ("Using XIDs for health checks: %v" , xids )
8265
8366 eventSet , ret := r .nvml .EventSetCreate ()
8467 if ret != nvml .SUCCESS {
@@ -152,7 +135,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
152135 continue
153136 }
154137
155- if skippedXids [ e .EventData ] {
138+ if xids . IsDisabled ( e .EventData ) {
156139 klog .Infof ("Skipping event %+v" , e )
157140 continue
158141 }
@@ -188,29 +171,109 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
188171 }
189172}
190173
191- // getAdditionalXids returns a list of additional Xids to skip from the specified string.
192- // The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values
193- // are ignored.
194- func getAdditionalXids (input string ) []uint64 {
195- if input == "" {
196- return nil
174+ const allXIDs = 0
175+
176+ // disabledXIDs stores a map of explicitly disabled XIDs.
177+ // The special XID `allXIDs` indicates that all XIDs are disabled, but does
178+ // allow for specific XIDs to be enabled even if this is the case.
179+ type disabledXIDs map [uint64 ]bool
180+
181+ // Disabled returns whether XID-based health checks are disabled.
182+ // These are considered if all XIDs have been disabled AND no other XIDs have
183+ // been explcitly enabled.
184+ func (h disabledXIDs ) IsAllDisabled () bool {
185+ if allDisabled , ok := h [allXIDs ]; ok {
186+ return allDisabled
197187 }
188+ // At this point we wither have explicitly disabled XIDs or explicitly
189+ // enabled XIDs. Since ANY XID that's not specified is assumed enabled, we
190+ // return here.
191+ return false
192+ }
198193
199- var additionalXids []uint64
200- for _ , additionalXid := range strings .Split (input , "," ) {
201- trimmed := strings .TrimSpace (additionalXid )
194+ // IsDisabled checks whether the specified XID has been explicitly disalbled.
195+ // An XID is considered disabled if it has been explicitly disabled, or all XIDs
196+ // have been disabled.
197+ func (h disabledXIDs ) IsDisabled (xid uint64 ) bool {
198+ // Handle the case where enabled=all.
199+ if explicitAll , ok := h [allXIDs ]; ok && ! explicitAll {
200+ return false
201+ }
202+ // Handle the case where the XID has been specifically enabled (or disabled)
203+ if disabled , ok := h [xid ]; ok {
204+ return disabled
205+ }
206+ return h .IsAllDisabled ()
207+ }
208+
209+ // getHealthCheckXids returns the XIDs that are considered fatal.
210+ // Here we combine the following (in order of precedence):
211+ // * A list of explicitly disabled XIDs (including all XIDs)
212+ // * A list of hardcoded disabled XIDs
213+ // * A list of explicitly enabled XIDs (including all XIDs)
214+ //
215+ // Note that if an XID is explicitly enabled, this takes precedence over it
216+ // having been disabled either explicitly or implicitly.
217+ func getHealthCheckXids () disabledXIDs {
218+ disabled := newHealthCheckXIDs (
219+ // TODO: We should not read the envvar here directly, but instead
220+ // "upgrade" this to a top-level config option.
221+ strings .Split (strings .ToLower (os .Getenv (envDisableHealthChecks )), "," )... ,
222+ )
223+ enabled := newHealthCheckXIDs (
224+ // TODO: We should not read the envvar here directly, but instead
225+ // "upgrade" this to a top-level config option.
226+ strings .Split (strings .ToLower (os .Getenv (envEnableHealthChecks )), "," )... ,
227+ )
228+
229+ // Add the list of hardcoded disabled (ignored) XIDs:
230+ // FIXME: formalize the full list and document it.
231+ // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
232+ // Application errors: the GPU should still be healthy
233+ ignoredXids := []uint64 {
234+ 13 , // Graphics Engine Exception
235+ 31 , // GPU memory page fault
236+ 43 , // GPU stopped processing
237+ 45 , // Preemptive cleanup, due to previous errors
238+ 68 , // Video processor exception
239+ 109 , // Context Switch Timeout Error
240+ }
241+ for _ , ignored := range ignoredXids {
242+ disabled [ignored ] = true
243+ }
244+
245+ // Explicitly ENABLE specific XIDs,
246+ for enabled := range enabled {
247+ disabled [enabled ] = false
248+ }
249+ return disabled
250+ }
251+
252+ // newHealthCheckXIDs converts a list of Xids to a healthCheckXIDs map.
253+ // Special xid values 'all' and 'xids' return a special map that matches all
254+ // xids.
255+ // For other xids, these are converted to a uint64 values with invalid values
256+ // being ignored.
257+ func newHealthCheckXIDs (xids ... string ) disabledXIDs {
258+ output := make (disabledXIDs )
259+ for _ , xid := range xids {
260+ trimmed := strings .TrimSpace (xid )
261+ if trimmed == "all" || trimmed == "xids" {
262+ // TODO: We should have a different type for "all" and "all-except"
263+ return disabledXIDs {allXIDs : true }
264+ }
202265 if trimmed == "" {
203266 continue
204267 }
205- xid , err := strconv .ParseUint (trimmed , 10 , 64 )
268+ id , err := strconv .ParseUint (trimmed , 10 , 64 )
206269 if err != nil {
207270 klog .Infof ("Ignoring malformed Xid value %v: %v" , trimmed , err )
208271 continue
209272 }
210- additionalXids = append (additionalXids , xid )
211- }
212273
213- return additionalXids
274+ output [id ] = true
275+ }
276+ return output
214277}
215278
216279// getDevicePlacement returns the placement of the specified device.
0 commit comments