Skip to content

Commit 43e32ff

Browse files
authored Dec 8, 2017
add temperature
1 parent cad9664 commit 43e32ff

File tree

1 file changed

+22
-6
lines changed

1 file changed

+22
-6
lines changed
 

‎check_nvidiasmi.sh

+22-6
Original file line number | Diff line number | Diff line change
@@ -28,10 +28,14 @@ done
2828
tmpDirTrimmed=$(echo $tmpXmlDir | sed 's:/*$::')
2929
tmpXml=$tmpDirTrimmed/$tmpXmlFileName
3030

31+
temperatureWarningTreshold=85  # WARNING level: GPU temperature as a percentage of gpu_temp_max_threshold, not degrees
32+
temperatureCriticalTreshold=95  # CRITICAL level, same scale (percent of gpu_temp_max_threshold)
33+
3134
encoderWarning=0
3235
decoderWarning=0
3336
gpuWarning=0
3437
memoryWarning=0
38+
temperatureWarning=0
3539

3640
hash xmlstarlet 2>/dev/null
3741
checkXmlstarlet=$?
@@ -58,11 +62,17 @@ encoderUtil=$(xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_
5862
gpuUtil=$(xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/utilization/gpu_util | sed 's/\ \%*$//')
5963
memoryUtil=$(xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/utilization/memory_util | sed 's/\ \%*$//')
6064
decoderUtil=$(xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/utilization/decoder_util | sed 's/\ \%*$//')
65+
temperature=$(xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/temperature/gpu_temp | sed 's/\ \%*C//')
66+
temperatureMax=$(xmlstarlet fo --dropdtd $tmpXml | xmlstarlet sel -t -v nvidia_smi_log/gpu/temperature/gpu_temp_max_threshold | sed 's/\ \%*C//')
67+
68+
temperatureTresholdPercent=$(awk "BEGIN { pc=100*${temperature}/${temperatureMax}; i=int(pc); print (pc-i<0.5)?i:i+1 }")  # temperature as a percentage of temperatureMax, rounded to the nearest integer (a fraction of .5 or more rounds up)
6169

6270
rm -f $tmpXml
6371

64-
if [ $encoderUtil -lt $warning ] && [ $gpuUtil -lt $warning ] && [ $memoryUtil -lt $warning ] && [ $decoderUtil -lt $warning ]; then
65-
echo "OK GPU - $gpuUtil%; Memory - $memoryUtil%; Encoder - $encoderUtil%; Decoder - $decoderUtil% | gpu=$gpuUtil% memory=$memoryUtil% encoder=$encoderUtil% decoder=$decoderUtil%"
72+
#echo $temperatureTresholdPercent $temperatureWarningTreshold
73+
74+
if [ $encoderUtil -lt $warning ] && [ $gpuUtil -lt $warning ] && [ $memoryUtil -lt $warning ] && [ $decoderUtil -lt $warning ] && [ $temperatureTresholdPercent -lt $temperatureWarningTreshold ]; then
75+
echo "OK GPU - $gpuUtil%; Memory - $memoryUtil%; Encoder - $encoderUtil%; Decoder - $decoderUtil%; Temperature - $temperature | gpu=$gpuUtil% memory=$memoryUtil% encoder=$encoderUtil% decoder=$decoderUtil% temperature=$temperature"
6676
exit 0
6777
fi
6878

@@ -82,12 +92,18 @@ if [ $memoryUtil -gt $warning ] && [ $memoryUtil -lt $critical ]; then
8292
memoryWarning=1
8393
fi
8494

85-
if [ $encoderWarning -eq 1 ] || [ $decoderWarning -eq 1 ] || [ $gpuWarning -eq 1 ] || [ $memoryWarning -eq 1 ]; then
86-
echo "WARNING GPU - $gpuUtil%; Memory - $memoryUtil%; Encoder - $encoderUtil%; Decoder - $decoderUtil% | gpu=$gpuUtil% memory=$memoryUtil% encoder=$encoderUtil% decoder=$decoderUtil%"
95+
if [ "$temperatureTresholdPercent" -ge "$temperatureWarningTreshold" ] && [ "$temperatureTresholdPercent" -le "$temperatureCriticalTreshold" ]; then  # inclusive bounds: the OK check uses -lt and the CRITICAL check uses -gt, so a value exactly equal to either threshold must be caught here or no status is reported; operands are quoted so an empty value fails the test instead of collapsing to a one-argument test that is always true
96+
temperatureWarning=1
97+
fi
98+
99+
#echo "enc" $encoderWarning "dec" $decoderWarning "gpu" $gpuWarning "mem" $memoryWarning "temp" $temperatureWarning
100+
101+
if [ $encoderWarning -eq 1 ] || [ $decoderWarning -eq 1 ] || [ $gpuWarning -eq 1 ] || [ $memoryWarning -eq 1 ] || [ $temperatureWarning -eq 1 ]; then
102+
echo "WARNING GPU - $gpuUtil%; Memory - $memoryUtil%; Encoder - $encoderUtil%; Decoder - $decoderUtil%; Temperature - $temperature | gpu=$gpuUtil% memory=$memoryUtil% encoder=$encoderUtil% decoder=$decoderUtil% temperature=$temperature"
87103
exit 1
88104
fi
89105

90-
if [ $encoderUtil -gt $critical ] || [ $gpuUtil -gt $critical ] || [ $memoryUtil -gt $critical ] || [ $decoderUtil -gt $critical ]; then
91-
echo "CRITICAL GPU - $gpuUtil%; Memory - $memoryUtil%; Encoder - $encoderUtil%; Decoder - $decoderUtil% | gpu=$gpuUtil% memory=$memoryUtil% encoder=$encoderUtil% decoder=$decoderUtil%"
106+
if [ $encoderUtil -gt $critical ] || [ $gpuUtil -gt $critical ] || [ $memoryUtil -gt $critical ] || [ $decoderUtil -gt $critical ] || [ $temperatureTresholdPercent -gt $temperatureCriticalTreshold ]; then
107+
echo "CRITICAL GPU - $gpuUtil%; Memory - $memoryUtil%; Encoder - $encoderUtil%; Decoder - $decoderUtil%; Temperature - $temperature | gpu=$gpuUtil% memory=$memoryUtil% encoder=$encoderUtil% decoder=$decoderUtil% temperature=$temperature"
92108
exit 2
93109
fi

0 commit comments

Comments
 (0)
Please sign in to comment.