28
28
tmpDirTrimmed=$( echo $tmpXmlDir | sed ' s:/*$::' )
29
29
tmpXml=$tmpDirTrimmed /$tmpXmlFileName
30
30
31
+ temperatureWarningTreshold=85
32
+ temperatureCriticalTreshold=95
33
+
31
34
encoderWarning=0
32
35
decoderWarning=0
33
36
gpuWarning=0
34
37
memoryWarning=0
38
+ temperatureWarning=0
35
39
36
40
hash xmlstarlet 2> /dev/null
37
41
checkXmlstarlet=$?
@@ -58,11 +62,17 @@ encoderUtil=$(xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_
58
62
gpuUtil=$( xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/utilization/gpu_util | sed ' s/\ \%*$//' )
59
63
memoryUtil=$( xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/utilization/memory_util | sed ' s/\ \%*$//' )
60
64
decoderUtil=$( xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/utilization/decoder_util | sed ' s/\ \%*$//' )
65
+ temperature=$( xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/temperature/gpu_temp | sed ' s/\ \%*C//' )
66
+ temperatureMax=$( xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/temperature/gpu_temp_max_threshold | sed ' s/\ \%*C//' )
67
+
68
+ temperatureTresholdPercent=$( awk " BEGIN { pc=100*${temperature} /${temperatureMax} ; i=int(pc); print (pc-i<0.5)?i:i+1 }" )
61
69
62
70
rm -f $tmpXml
63
71
64
- if [ $encoderUtil -lt $warning ] && [ $gpuUtil -lt $warning ] && [ $memoryUtil -lt $warning ] && [ $decoderUtil -lt $warning ]; then
65
- echo " OK GPU - $gpuUtil %; Memory - $memoryUtil %; Encoder - $encoderUtil %; Decoder - $decoderUtil % | gpu=$gpuUtil % memory=$memoryUtil % encoder=$encoderUtil % decoder=$decoderUtil %"
72
+ # echo $temperatureTresholdPercent $temperatureWarningTreshold
73
+
74
+ if [ $encoderUtil -lt $warning ] && [ $gpuUtil -lt $warning ] && [ $memoryUtil -lt $warning ] && [ $decoderUtil -lt $warning ] && [ $temperatureTresholdPercent -lt $temperatureWarningTreshold ]; then
75
+ echo " OK GPU - $gpuUtil %; Memory - $memoryUtil %; Encoder - $encoderUtil %; Decoder - $decoderUtil %; Temperature - $temperature | gpu=$gpuUtil % memory=$memoryUtil % encoder=$encoderUtil % decoder=$decoderUtil % temperature=$temperature "
66
76
exit 0
67
77
fi
68
78
@@ -82,12 +92,18 @@ if [ $memoryUtil -gt $warning ] && [ $memoryUtil -lt $critical ]; then
82
92
memoryWarning=1
83
93
fi
84
94
85
- if [ $encoderWarning -eq 1 ] || [ $decoderWarning -eq 1 ] || [ $gpuWarning -eq 1 ] || [ $memoryWarning -eq 1 ]; then
86
- echo " WARNING GPU - $gpuUtil %; Memory - $memoryUtil %; Encoder - $encoderUtil %; Decoder - $decoderUtil % | gpu=$gpuUtil % memory=$memoryUtil % encoder=$encoderUtil % decoder=$decoderUtil %"
95
+ if [ $temperatureTresholdPercent -gt $temperatureWarningTreshold ] && [ $temperatureTresholdPercent -lt $temperatureCriticalTreshold ]; then
96
+ temperatureWarning=1
97
+ fi
98
+
99
+ # echo "enc" $encoderWarning "dec" $decoderWarning "gpu" $gpuWarning "mem" $memoryWarning "temp" $temperatureWarning
100
+
101
+ if [ $encoderWarning -eq 1 ] || [ $decoderWarning -eq 1 ] || [ $gpuWarning -eq 1 ] || [ $memoryWarning -eq 1 ] || [ $temperatureWarning -eq 1 ]; then
102
+ echo " WARNING GPU - $gpuUtil %; Memory - $memoryUtil %; Encoder - $encoderUtil %; Decoder - $decoderUtil %; Temperature - $temperature | gpu=$gpuUtil % memory=$memoryUtil % encoder=$encoderUtil % decoder=$decoderUtil % temperature=$temperature "
87
103
exit 1
88
104
fi
89
105
90
- if [ $encoderUtil -gt $critical ] || [ $gpuUtil -gt $critical ] || [ $memoryUtil -gt $critical ] || [ $decoderUtil -gt $critical ]; then
91
- echo " CRITICAL GPU - $gpuUtil %; Memory - $memoryUtil %; Encoder - $encoderUtil %; Decoder - $decoderUtil % | gpu=$gpuUtil % memory=$memoryUtil % encoder=$encoderUtil % decoder=$decoderUtil %"
106
+ if [ $encoderUtil -gt $critical ] || [ $gpuUtil -gt $critical ] || [ $memoryUtil -gt $critical ] || [ $decoderUtil -gt $critical ] || [ $temperatureTresholdPercent -gt $temperatureCriticalTreshold ] ; then
107
+ echo " CRITICAL GPU - $gpuUtil %; Memory - $memoryUtil %; Encoder - $encoderUtil %; Decoder - $decoderUtil %; Temperature - $temperature | gpu=$gpuUtil % memory=$memoryUtil % encoder=$encoderUtil % decoder=$decoderUtil % temperature= $temperature "
92
108
exit 2
93
109
fi
0 commit comments