-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdsu
239 lines (198 loc) · 7.52 KB
/
pdsu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/bin/bash
# Initiate Dell firmware updates (DSU) from the admin node.
# Limits the number of concurrent processes to NP and the number of nodes
# that can have SLURM's reboot / rebooting drain reason to RN.
#
# All actions / logs are in $DSUTEMP
#
# Dependencies: cURL, shosts, pdcp, sed, sinfo
# Note: to expand a string like node[2-4,7],gpu6 into
# a list of 'node2 node3 node4 node7 gpu6 ...' we use 'shosts'
# from this same git repo
# Copyright (C) 2025 ivan.tervanto /at/ aalto.fi
# Released under the GNU General Public License
# catching signals to make sure the clean exit
trap 'killall $(jobs -p) 2>/dev/null; exit 1' 1 2 3 6
# Debugging helper: prints its arguments behind a fixed prefix, but only
# when the script runs in verbose mode (VERBOSE == 'yes', see '-v').
# Globals:   VERBOSE (read)
# Arguments: the words of the message
# Outputs:   "....VERBOSE: <message>" on stdout
verbose() {
  # quiet mode: nothing to do
  [[ $VERBOSE == 'yes' ]] || return 0
  # print only when there is a message at all
  (($#)) && echo "....VERBOSE:" "$@"
}
# Help function: prints an optional error message followed by the usage
# text and terminates the script.
# Arguments: optional words of an error message
# Outputs:   everything goes to stdout
# Returns:   never returns, always exits with status 1
usage() {
  # optional error banner, preceded by an empty line
  if (($# > 0)); then
    echo -e "\nERROR:" "$@"
  fi
  # the usage text itself, framed by empty lines
  printf '%s\n' \
    '' \
    "Usage: ${0##*/} [options] nodelist" \
    ' nodelist is a comma-separated list with a prefixNN naming convention' \
    ' in the general form: prefix[n-m,l-k,...]; example: node[2-4,7],gpu6' \
    ' -n #, number of nodes to run DSU processes concurrently (default 4)' \
    ' -s #, total number of nodes from the nodelist that can be drained or' \
    ' in the process of applying DSU at once (default 8)' \
    ' -t ##, timeout between running jobs checks (default 1m)' \
    ' "s" for seconds (the default), "m" for minutes, "h" for hours, "d" for days' \
    ' -r, add --reboot to the dsu command (default no)' \
    ' -v, verbose mode' \
    ' -h, this help message' \
    ''
  exit 1
}
VERBOSE=''   # by default, no verbose mode
DSUREBOOT='' # by default, no reboot initiated by DSU
# getopt normalizes the command line into 'options -- args'
opts=$(getopt "vhrs:n:t:" "$@") || usage
# The getopt output has to be word-split, but it must NOT undergo pathname
# expansion: a node list like node[2-4,7] is a valid glob pattern and would
# be replaced by matching file names from the current directory.
set -f
# shellcheck disable=SC2086 # word splitting is intended here
set -- $opts
set +f
# extract the options first and leave the args for later;
# the loop also ends when the arguments run out, so it can never spin
while (($#)); do
  case $1 in
    -h) usage ;;
    -r) DSUREBOOT='--reboot' ;;
    -v) VERBOSE='yes' ;;
    -s) shift; RN=$1 ;;
    -t) shift; TIMEOUT=$1 ;;
    -n) shift; NP=$1 ;;
    --) shift; break ;; # exit when -- is reached
  esac
  shift
done
# by now, only the node list should be left
if (($# == 1)); then
  NODES=$1
else
  usage "Incorrect node list $*"
fi
# define default vars
BOOTSTRAP_URL='https://linux.dell.com/repo/hardware/dsu/bootstrap.cgi'
BOOTSTRAP_FILE=${BOOTSTRAP_URL##*/} # extract the file name from the URL
: "${NP:=4}"       # number of the simultaneous remote processes/copies
: "${RN:=8}"       # maximum number of the nodes that can be drained or dsu in the progress
: "${TIMEOUT:=1m}" # timeout for the sleep between the job checks
DSUTEMP='/root/.dsu' # the action dir
RD='/root'           # a directory on the remote node
RUNDSU='rundsu'      # temporary script for the remote execution
# NP and RN are compared arithmetically against the job counters later on:
# a non-numeric or zero value would make that wait loop sleep forever.
# TIMEOUT is handed to 'sleep': a value sleep rejects would turn the wait
# loop into a busy loop.
positive_int_re='^[1-9][0-9]*$'
sleep_arg_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)[smhd]?$'
[[ $NP =~ $positive_int_re ]] || usage "-n expects a positive integer, got '$NP'"
[[ $RN =~ $positive_int_re ]] || usage "-s expects a positive integer, got '$RN'"
[[ $TIMEOUT =~ $sleep_arg_re ]] || usage "-t expects a number with an optional s/m/h/d suffix, got '$TIMEOUT'"
# all the local preparations / logs go to this folder; stop right here if it
# is not usable, otherwise the download below would land in the wrong place
if ! mkdir -p "$DSUTEMP" || ! cd "$DSUTEMP"; then
  usage "Can't use the working directory $DSUTEMP"
fi
verbose "NODES: $NODES"
verbose "NP: $NP"
verbose "RN: $RN"
verbose "TIMEOUT: $TIMEOUT"
verbose "bootstrap file: $DSUTEMP/$BOOTSTRAP_FILE"
# get the dell's bootstrap file
echo "..Downloading $BOOTSTRAP_FILE file"
if ! curl -f -s -O "$BOOTSTRAP_URL"; then
  usage "Can't download $BOOTSTRAP_URL"
fi
# fix that GPG thing; make it "non-interactive"
sed -i 's/IMPORT_GPG_CONFIRMATION="na"/IMPORT_GPG_CONFIRMATION="yes"/' "$BOOTSTRAP_FILE"
# sed succeeds even when nothing has been replaced, so verify the result
if ! grep -q 'IMPORT_GPG_CONFIRMATION="yes"' "$BOOTSTRAP_FILE"; then
  echo "..WARNING: IMPORT_GPG_CONFIRMATION was not switched to \"yes\" in $BOOTSTRAP_FILE" >&2
fi
# a string like node[2-4,7],gpu6 to a list 'node2 node3 node4 node7 gpu6 ...'
# ($NODES is quoted everywhere: its brackets would otherwise be treated as
# a glob pattern and matched against the files of the current directory)
NODELIST=$(shosts "$NODES")
# without a single expanded node there is nothing to run
[[ -n $NODELIST ]] || usage "Can't expand the node list $NODES"
verbose "NODELIST: $NODELIST"
# the copy command is kept as an array, so every argument stays one word
pdcp_cmd=(pdcp -f "$NP" -b -w "$NODES")
verbose "pdcp command: ${pdcp_cmd[*]} ..."
dsu_cmd="dsu -n --import-public-key $DSUREBOOT"
verbose "DSU command: $dsu_cmd"
## the action starts here
# copy the bootstrap file over the nodes
echo "..Copying $BOOTSTRAP_FILE to: $NODELIST"
"${pdcp_cmd[@]}" "$BOOTSTRAP_FILE" "$RD/$BOOTSTRAP_FILE"
# making a temp executable and run it on the $NODES
# note: the here-document is expanded right now, i.e. the generated script
# contains the final paths and the final dsu command line
echo "..Creating and copying 'rundsu' to: $NODELIST"
cat <<EOF > "$RUNDSU"
# $(date -R)
echo '#### RUNNING' $RD/$BOOTSTRAP_FILE
bash $RD/$BOOTSTRAP_FILE
echo '#### UPDATING dell-system-update'
yum -y install dell-system-update
echo '#### RUNNING' $dsu_cmd
$dsu_cmd
EOF
"${pdcp_cmd[@]}" "$RUNDSU" "$RD/$RUNDSU"
# The function that does the actual run: executes the prepared script on one
# node over SSH and acts on the resulting exit code.
# Globals:   RD, RUNDSU, DSUREBOOT (read)
# Arguments: $1 - name of the node
# Outputs:   progress messages on stdout
# Returns:   0 when the run is over (exit codes 0, 34, or 8/26 with the node
#            drained), 1 in any other case
rundsu() {
  local NODE=$1 SSHST
  # exit codes for the DSU
  local -r HELPURL='https://www.dell.com/support/manuals/en-us/system-update/dsu_ug_1.8_revamp/dsu-return-codes?guid=guid-a413b447-0dd2-45fb-a60c-7a472e353e30'
  echo "$NODE: Started at $(date -R)"
  ssh -o ConnectTimeout=12 "$NODE" "bash $RD/$RUNDSU" # actual run
  SSHST=$? # status of the SSH command (== the latest executed command)
  verbose "$NODE: SSH DSU exit code: $SSHST"
  case $SSHST in
    8|26)
      # 8 Reboot is required for the update to be completed successfully
      # 26 Out-of-date updates are selected. For successful updates, reboot is required
      echo "$NODE: DSU run is over, draining for the reboot"
      # a node that could not be drained needs attention: report it
      if ! scontrol update nodename="$NODE" state=drain reason=reboot; then
        echo "$NODE: Draining has failed, check the node manually"
        return 1
      fi
      ;;
    0|34)
      # 34 There are no updates found which can be applied
      echo "$NODE: DSU run is over"
      if [[ $DSUREBOOT == '--reboot' ]]; then
        echo "$NODE: DSU reboot has been initiated, check the node manually later"
      else
        echo "$NODE: Check the node manually, resume it if needed"
      fi
      #scontrol update nodename=$NODE state=resume
      ;;
    1|24)
      # 1 means something went wrong, like failure with the catalog
      # 24 Failure in applying updates
      echo "$NODE: DSU run has failed with the exit code: $SSHST"
      echo "$NODE: Check the node manually, we do nothing here"
      return 1
      ;;
    255)
      # ssh itself exits with 255 when an error occurred (connection,
      # timeout, authentication ...), so this is not a DSU return code
      echo "$NODE: SSH has failed with the exit code: $SSHST, DSU may not have been run"
      echo "$NODE: Check the node manually, we do nothing here"
      return 1
      ;;
    *)
      echo "$NODE: DSU run has ended with the non-zero exit code: $SSHST"
      echo "$NODE: Check the node manually, we do nothing here"
      echo "see $HELPURL"
      return 1
      ;;
  esac
  echo "$NODE: Finished at $(date -R)"
  return 0
}
# Counts the background jobs of this shell that are still running, i.e. the
# 'rundsu' runs that the main loop has sent to the background with '&'.
# Outputs: the number of lines printed by 'jobs', as counted by 'wc -l'
count_bg_jobs() {
# 'jobs -r' limits the listing to the running jobs, '-p' prints process IDs
# only, so 'wc -l' is fed one line per running job.
# NOTE(review): the callers use $(count_bg_jobs); this presumably depends on
# bash letting 'jobs' inside a pipeline still see the job table of the main
# shell - keep 'jobs' as the first command of the pipeline and confirm
# before restructuring this line.
jobs -r -p | wc -l
}
# Counts the SLURM drained nodes with the reboot/rebooting reason.
# note: counts only the nodes in the $NODES set
# Globals: NODES (read)
# Outputs: the number of matching nodes on stdout
count_reboot_nodes() {
  # "$NODES" must stay quoted: a list like node[2-4,7] is a valid glob
  # pattern and would otherwise be replaced by matching file names.
  # One line per node (-N) with its name and reason; 'sort -u' removes the
  # duplicated lines before counting.
  sinfo -h -n "$NODES" -t drain -o "%N %E" -N |
    grep -E 'reboot|rebooting' |
    sort -u |
    wc -l
}
# counters initialization
#   cnp - number of the 'rundsu' jobs currently running in the background
#   crn - number of the $NODES drained with the reboot|rebooting reason
cnp=0 crn=0
# going through the NODELIST (word splitting of $NODELIST is intended)
for node in $NODELIST; do
  # Throttle BEFORE starting the node, so that the limits also hold for the
  # very first node (e.g. when RN nodes are already drained for a reboot):
  #   if cnp is greater or equal to NP: sleep
  #   if crn+cnp is greater or equal to RN: sleep
  # note: we consider all cnp nodes as "potentially drained"
  cnp=$(count_bg_jobs)
  crn=$(count_reboot_nodes)
  verbose "before $node: running jobs: cnp: $cnp, limit: $NP"
  verbose "before $node: nodes in reboot: crn: $crn, limit: $RN"
  if (( cnp >= NP )); then
    verbose "NP limit reached $(date +'%H:%M %d-%m') ... sleep"
  elif (( crn + cnp >= RN )); then
    verbose "RN limit reached $(date +'%H:%M %d-%m') ... sleep"
  fi
  while (( cnp >= NP || crn + cnp >= RN )); do
    sleep "$TIMEOUT" # timeout between checking the jobs number
    cnp=$(count_bg_jobs)
    crn=$(count_reboot_nodes)
  done # while
  # DSU runs
  echo "$node: DSU run started: $(date +'%H:%M %d-%m-%Y')"
  LOG="$DSUTEMP/$node.$(date +"%d-%m-%Y").log"
  echo "$node: Log file: $LOG"
  # collect node log info to a dedicated file; send job to a background
  {
    if rundsu "$node" > "$LOG"; then
      echo "$node: DSU run finished: $(date +'%H:%M %d-%m-%Y')"
    else
      echo "$node: DSU run has failed: $(date +'%H:%M %d-%m-%Y')"
      echo "$node: Check the log file: $LOG"
      #exit 1 # make sense to stop execution of the whole script
    fi
  } &
done # for
echo '...Wating till the job(s) are done'
# to monitor the results, example:
# tail -n 0 -F /root/.dsu/csl{5..9}.*.log
wait