-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfunctions
109 lines (97 loc) · 2.61 KB
/
tfunctions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Copyright (C) 2021 aalto.fi
# Released under the GNU General Public License
# A set of bash functions for HPC cluster administration.
# Source it from your .bashrc:
#   . tfunctions
# Set state=resume on the given nodes via scontrol.
# Arguments:
#   $1 - node list in Slurm syntax, e.g. cn[01,03-05],gpu[001-004]
# Outputs:
#   usage text on stdout when called without arguments
# Returns:
#   1 when called without arguments, otherwise the exit status of scontrol
resume() {
  if [[ $# -eq 0 ]]; then
    echo "usage: resume cn[xx,xx-xx],tbxxx,gpu[xxx-xxx]"
    return 1
  fi

  # local: this file is sourced into interactive shells, so do not leak names.
  local nodelist=$1

  # Quoted so the brackets of a node range are never treated as a glob pattern.
  scontrol update "nodename=${nodelist}" state=resume
}
# Set state=drain, with a reason, on the given nodes via scontrol.
# Arguments:
#   $1   - node list in Slurm syntax, e.g. wsm[1,10-12],gpu3
#   $2.. - the reason; all remaining words are joined into a single string
# Outputs:
#   usage text on stdout when fewer than two arguments are given
# Returns:
#   1 on a usage error, otherwise the exit status of scontrol
drain() {
  if [[ $# -lt 2 ]]; then
    echo "usage: drain wsm[x,xx-xx],gpux the reason"
    return 1
  fi

  # local: this file is sourced into interactive shells, so do not leak names.
  local nodelist=$1
  local reason="${*:2}"

  # Both values are quoted: the node range must not be glob-expanded and the
  # multi-word reason must reach scontrol as one argument.
  scontrol update "nodename=${nodelist}" state=drain "reason=${reason}"
}
# Set state=down, with a reason, on the given nodes via scontrol.
# Arguments:
#   $1   - node list in Slurm syntax, e.g. wsm[1,10-12],gpu3
#   $2.. - the reason; all remaining words are joined into a single string
# Outputs:
#   usage text on stdout when fewer than two arguments are given
# Returns:
#   1 on a usage error, otherwise the exit status of scontrol
down() {
  if [[ $# -lt 2 ]]; then
    echo "usage: down wsm[x,xx-xx],gpux the reason"
    return 1
  fi

  # local: this file is sourced into interactive shells, so do not leak names.
  local nodelist=$1
  local reason="${*:2}"

  # Both values are quoted: the node range must not be glob-expanded and the
  # multi-word reason must reach scontrol as one argument.
  scontrol update "nodename=${nodelist}" state=down "reason=${reason}"
}
# Run 'nhc && healthcheck -d' on the given nodes through pdsh and pipe the
# combined output through 'dshbak -c'.
# Arguments:
#   $1 - node list passed to 'pdsh -w', e.g. wsm[10-12,3],gpu07
# Outputs:
#   usage text on stdout when called without arguments
# Returns:
#   1 when called without arguments, otherwise the exit status of the pipeline
pdsh_healthcheck() {
  if [[ $# -lt 1 ]]; then
    echo "usage: pdsh_healthcheck wsm[xx-xx,x],gpuxx"
    return 1
  fi

  # local: this file is sourced into interactive shells, so do not leak names.
  local nodes=$1

  # Quoted so the bracketed node range is handed to pdsh verbatim instead of
  # being glob-expanded or word-split by the shell.
  pdsh -w "$nodes" 'nhc && healthcheck -d' | dshbak -c
}
# Create a SLURM reservation starting immediately.
# The reservation is created with starttime=now, duration=infinite and
# flags=maint.
# Arguments:
#   $1 - node list in Slurm syntax, e.g. wsm[10-12,3],gpu07
#   $2 - reservation name
#   $3 - user owning the reservation (optional; root when missing or empty)
# Outputs:
#   usage text on stdout when fewer than two arguments are given
# Returns:
#   1 on a usage error, otherwise the exit status of scontrol
reserve() {
  if [[ $# -lt 2 ]]; then
    echo "Create a SLURM reservation starting immediately."
    echo "usage: reserve wsm[xx-xx,x],gpuxx reservation_name [user]"
    echo " (Default user=root)"
    return 1
  fi

  # local: this file is sourced into interactive shells, so do not leak names.
  local nodelist=$1
  local reservation_name=$2
  # An empty third argument falls back to root as well, instead of sending
  # an empty "user=" to scontrol.
  local res_user=${3:-root}

  # Every key=value word is quoted so node ranges are not glob-expanded and
  # no value is word-split.
  scontrol create reservation \
    "user=${res_user}" \
    starttime=now \
    duration=infinite \
    flags=maint \
    "nodes=${nodelist}" \
    "Reservation=${reservation_name}"
}
# List SLURM reservations by running 'scontrol show reservation'.
# Any arguments given to this function are ignored; the exit status is
# that of scontrol.
reservations() { scontrol show reservation; }
# Delete a SLURM reservation.
# Arguments:
#   $1 - name of the reservation to delete
# Outputs:
#   usage text on stdout when called without arguments
# Returns:
#   1 when called without arguments, otherwise the exit status of scontrol
reservation_rm() {
  if [[ $# -lt 1 ]]; then
    echo "Delete a SLURM reservation."
    echo "usage: reservation_rm reservation_name"
    return 1
  fi

  # local: this file is sourced into interactive shells, so do not leak names.
  local reservation_name=$1

  # Quoted so the name reaches scontrol as exactly one, unexpanded argument.
  scontrol delete "ReservationName=${reservation_name}"
}
# Print a short status report for each node of a host list.
# For every node the report shows: the reason column printed by sinfo, the
# number of lines squeue prints for the node, the whole hours until the
# latest job end time, the output of 'ipmi <node> power status', and whether
# a non-interactive ssh login succeeds.
# Arguments:
#   $1 - host list expanded with /bin/hostlist, e.g. node[01-04],gpu7
# Outputs:
#   the report (or the usage text) on stdout
# Returns:
#   1 when called without arguments, the exit status of /bin/hostlist when
#   the expansion fails, otherwise the status of the last echo
nst() {
  if [[ $# -eq 0 ]]; then
    echo "Reports node status shortly"
    echo "Usage: ${FUNCNAME[0]} node[xx-xx],nodexx,..."
    return 1
  fi

  # local: this file is sourced into interactive shells, so do not leak names.
  local expanded node end_raw end_epoch hours ssh_state
  local -a nodes reason power
  # Splitting below relies on whitespace; do not depend on the caller's IFS.
  local IFS=$' \t\n'

  # Stop instead of silently reporting nothing when the expansion fails.
  expanded=$(/bin/hostlist -e -s" " "$1") || return
  # read -a splits on whitespace without glob-expanding the words; it returns
  # non-zero at end of input, which is expected here.
  read -r -d '' -a nodes <<<"$expanded" || true

  for node in "${nodes[@]}"; do
    echo "Status of ${node}"

    # Collapse column padding and line breaks into single words, without
    # letting the shell glob-expand the text.
    read -r -d '' -a reason <<<"$(sinfo -h -n "$node" -O reason)" || true
    echo ' draining reason ' "${reason[@]}"

    echo ' jobs ' "$(( $(squeue -w "$node" -h 2>/dev/null | wc -l) ))"

    # Latest end time among the lines squeue prints for this node.
    end_raw=$(squeue -h -w "$node" -O endtime | sort | tail -n 1)
    # With no jobs the end time is blank; 'date -d ""' would then resolve to
    # midnight today and produce a bogus negative value, so report n/a for a
    # blank or unparsable end time.
    if [[ "$end_raw" == *[![:space:]]* ]] \
      && end_epoch=$(date -d "$end_raw" +%s 2>/dev/null); then
      hours=$(( (end_epoch - $(date +%s)) / 3600 ))
    else
      hours='n/a'
    fi
    echo ' last job ends in (hours) ' "$hours"

    read -r -d '' -a power <<<"$(ipmi "$node" power status)" || true
    echo ' power ' "${power[@]}"

    if ssh -o ConnectTimeout=4 -q "$node" exit &>/dev/null; then
      ssh_state='ok'
    else
      ssh_state='fails'
    fi
    echo " ssh ${ssh_state}"

    # Disabled nhc probe, kept for reference:
    # if [[ "$ssh_state" == ok ]]; then
    #   NHC='na'; ssh -o ConnectTimeout=10 -q "$node" nhc &>/dev/null && NHC='ok' || NHC='fails'
    #   echo " nhc $NHC"
    # fi
  done
}