-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcluster_manager.sh
456 lines (372 loc) · 16.3 KB
/
cluster_manager.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
#!/bin/bash
# ---------------------------------------------------------------------------
# User-editable settings for the cluster manager linode.
# ---------------------------------------------------------------------------

# Root user password of the cluster manager linode.
# Use at least two of these four character classes:
#   lower case letters, upper case letters, numbers, punctuation.
#
# Depending on the special characters used, parts of the password may need a
# backslash escape and the whole value may need single or double quotes:
#   - password containing spaces: enclose it in double quotes
#       ROOT_PASSWORD="a PassworD with spaces"
#   - password containing double quotes: enclose it in double quotes and put a
#     backslash \ in front of every double quote inside the password
#       ROOT_PASSWORD="pswd_WITH_\"dbl\"_quotes"
#   - password containing $: enclose it in double quotes and put a backslash \
#     in front of every $ inside the password
#       ROOT_PASSWORD="pswd_with_\$a_"
ROOT_PASSWORD=''

# Configuration plan of the cluster manager linode.
# List the available plans from a terminal with:
#   $ source ./api_env_linode.conf
#   $ ./linode_api.py plans
PLAN_ID='1'

# Linode datacenter in which the cluster manager linode is created.
# Pick the one geographically closest to you.
# List the available datacenters from a terminal with:
#   $ source ./api_env_linode.conf
#   $ ./linode_api.py datacenters
DATACENTER='newark'

# Linux distribution of the cluster manager linode.
# Currently, this script runs only on Ubuntu 14.04 and Debian 8 distributions.
# The default, 124, selects Ubuntu 14.04 64-bit LTS (Long Term Support).
# List the available distributions from a terminal with:
#   $ source ./api_env_linode.conf
#   $ ./linode_api.py distributions "Ubuntu" table
#   $ ./linode_api.py distributions "debian" table
DISTRIBUTION='124'

# Linux kernel version of the cluster manager linode.
# The default, 138, selects the latest 64-bit kernel version provided by Linode.
# List the available kernels from a terminal with:
#   $ source ./api_env_linode.conf
#   $ ./linode_api.py kernels "" table
KERNEL='138'

# ssh authentication mechanisms available for logging in to the cluster manager node.
# Password authentication is considered less secure, so it is disabled by default.
#   'yes' : disable password authentication, allow only public key authentication.
#   'no'  : allow both public key and password authentication.
DISABLE_SSH_PASSWORD_AUTHENTICATION='yes'

# Label shown for this linode in the Linode Manager UI.
# When creating several cluster managers, give each one a unique label
# before creating it.
CLUSTER_MANAGER_LINODE_LABEL='cluster-manager'

# Default download URLs of the Storm and Zookeeper packages.
STORM_URL="http://www.us.apache.org/dist/storm/apache-storm-0.9.5/apache-storm-0.9.5.tar.gz"
ZOOKEEPER_URL="http://www.us.apache.org/dist/zookeeper/zookeeper-3.4.6/zookeeper-3.4.6.tar.gz"

# Workaround for the "Agent admitted failure to sign using the key." ssh error that
# shows up even with the correct key, due to some conflict with gnome-keyring.
export SSH_AUTH_SOCK=0
# Creates the cluster manager linode (node, disks, configuration, private IP),
# boots it, copies this script to it and runs its "setup" command over ssh, and
# finally installs the public keys of the clustermgr and clustermgrguest users.
#
# Reads the settings defined at the top of this file (ROOT_PASSWORD, PLAN_ID,
# DATACENTER, DISTRIBUTION, KERNEL, CLUSTER_MANAGER_LINODE_LABEL, STORM_URL,
# ZOOKEEPER_URL) and writes three keypairs under $HOME/.ssh.
#
# $1 -> API env conf file
# $2 -> (Optional) The SHA1 or short SHA1 hash of the git revision to download. If not specified,
#       it downloads the latest release tagged version.
# Returns: 0 on success, 1 as soon as a provisioning step fails.
create_cluster_manager_linode() {
  # Quoted so that a conf file path containing spaces is loaded as one file.
  if [ ! -r "$1" ]; then
    echo "Error: API environment file '$1' does not exist or is not readable."
    return 1
  fi
  . "$1"

  if [ -z "$ROOT_PASSWORD" ]; then
    printf "Error: ROOT_PASSWORD for the cluster manager node is not set. \n Open this file in an editor, set ROOT_PASSWORD='<a strong password>' and re-run this script.\n\n"
    return 1
  fi

  # Check if the Zookeeper and Storm packages are available, because sometimes releases are taken off the download
  # servers.
  local zk_url_check
  zk_url_check=$(curl -s -o /dev/null -I -w "%{http_code}" "$ZOOKEEPER_URL")
  if [ "$zk_url_check" == "404" ]; then
    echo "Error: The Zookeeper package $ZOOKEEPER_URL is no longer available. Open this file in an editor and change ZOOKEEPER_URL to an available package URL."
    return 1
  fi

  local storm_url_check
  storm_url_check=$(curl -s -o /dev/null -I -w "%{http_code}" "$STORM_URL")
  if [ "$storm_url_check" == "404" ]; then
    echo "Error: The Storm package $STORM_URL is no longer available. Open this file in an editor and change STORM_URL to an available package URL."
    return 1
  fi

  echo "Creating keypair for cluster manager root ssh authentication"
  ssh-keygen -t rsa -b 4096 -q -f "$HOME/.ssh/clustermgrroot" -N ''

  echo "Creating keypair for cluster manager clustermgr user ssh authentication"
  ssh-keygen -t rsa -b 4096 -q -f "$HOME/.ssh/clustermgr" -N ''

  echo "Creating keypair for cluster manager clustermgrguest user ssh authentication"
  ssh-keygen -t rsa -b 4096 -q -f "$HOME/.ssh/clustermgrguest" -N ''

  # linode_api reports 0 for success and any value >0 for failure, so every
  # call below is checked with "-ne 0" rather than for the single value 1.
  local linout linerr linret

  echo "Creating linode"
  linode_api linout linerr linret "create-node" "$PLAN_ID" "$DATACENTER"
  if [ "$linret" -ne 0 ]; then
    echo "Failed to create cluster manager linode. Error:$linerr"
    return 1
  fi
  local linode_id=$linout
  echo "Created cluster manager linode $linode_id"

  linode_api linout linerr linret "update-node" "$linode_id" "$CLUSTER_MANAGER_LINODE_LABEL" "cluster-manager"
  if [ "$linret" -ne 0 ]; then
    echo "Failed to update node label $linode_id. Error:$linerr"
    return 1
  fi

  # Create a disk from distribution.
  echo "Creating disk"
  linode_api linout linerr linret "create-disk-from-distribution" "$linode_id" "$DISTRIBUTION" \
    8000 "$ROOT_PASSWORD" "$HOME/.ssh/clustermgrroot.pub"
  if [ "$linret" -ne 0 ]; then
    echo "Failed to create disk. Error:$linerr"
    return 1
  fi
  # The output is "<disk id>,<job id>".
  local disk_id create_disk_job_id
  disk_id=$(printf '%s\n' "$linout" | cut -d ',' -f1)
  create_disk_job_id=$(printf '%s\n' "$linout" | cut -d ',' -f2)

  # wait_for_job returns 0 = still pending at timeout, 1 = completed, >=2 = failed.
  local disk_result
  wait_for_job "$create_disk_job_id" "$linode_id"
  disk_result=$?
  if [ "$disk_result" -eq 0 ]; then
    echo "Create disk did not complete in time. Aborting"
    return 1
  fi
  if [ "$disk_result" -ge 2 ]; then
    echo "Create disk failed."
    return 1
  fi

  # Create a swap disk sized according to the linode's RAM.
  # The linode distributions are configured to expect /dev/sdb block device
  # as swap space. If a swap disk is not part of the configuration, everything
  # still succeeds but the dashboard shows a misconfiguration warning, and
  # performance may also suffer.
  echo "Creating swap disk"
  linode_api linout linerr linret "create-swap-disk" "$linode_id"
  if [ "$linret" -ne 0 ]; then
    echo "Failed to create swap disk. Error:$linerr"
    return 1
  fi
  local swap_disk_id create_swap_disk_job_id
  swap_disk_id=$(printf '%s\n' "$linout" | cut -d ',' -f1)
  create_swap_disk_job_id=$(printf '%s\n' "$linout" | cut -d ',' -f2)

  wait_for_job "$create_swap_disk_job_id" "$linode_id"
  disk_result=$?
  if [ "$disk_result" -eq 0 ]; then
    echo "Create swap disk did not complete in time. Aborting"
    return 1
  fi
  if [ "$disk_result" -ge 2 ]; then
    echo "Create swap disk failed."
    return 1
  fi

  # Create a configuration profile with that disk.
  echo "Creating a configuration"
  linode_api linout linerr linret "create-config" "$linode_id" "$KERNEL" \
    "$disk_id,$swap_disk_id" "clustermgr-configuration"
  if [ "$linret" -ne 0 ]; then
    echo "Failed to create configuration. Error:$linerr"
    return 1
  fi
  local config_id=$linout

  echo "Creating private IP for linode"
  linode_api linout linerr linret "add-private-ip" "$linode_id"
  if [ "$linret" -ne 0 ]; then
    echo "Failed to add private IP address. Error:$linerr"
    return 1
  fi
  local private_ip=$linout
  echo "Private IP address $private_ip created for linode $linode_id"

  # Get public IP address of node.
  echo "Getting IP address of linode"
  linode_api linout linerr linret "public-ip" "$linode_id"
  if [ "$linret" -ne 0 ]; then
    echo "Failed to get IP address. Error:$linerr"
    return 1
  fi
  local public_ip=$linout
  echo "Public IP address: $public_ip"

  # Boot the linode.
  echo "Booting the linode"
  linode_api linout linerr linret "boot" "$linode_id" "$config_id"
  if [ "$linret" -ne 0 ]; then
    echo "Failed to boot. Error:$linerr"
    return 1
  fi
  local boot_job_id=$linout

  # Wait for node to boot up.
  local boot_result
  wait_for_job "$boot_job_id" "$linode_id"
  boot_result=$?
  if [ "$boot_result" -eq 0 ]; then
    echo "Boot job did not complete in time. Aborting"
    return 1
  fi
  if [ "$boot_result" -ge 2 ]; then
    echo "Booting failed."
    return 1
  fi

  # If scp is attempted too quickly after boot, it's failing with connection refused.
  sleep 15

  echo "Cluster manager node has booted. Public IP address: $public_ip"

  echo "Copying cluster_manager.sh to cluster manager node"
  # Nothing after this point can work if the script did not reach the node.
  if ! scp -i "$HOME/.ssh/clustermgrroot" -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no 'cluster_manager.sh' \
    "root@$public_ip:cluster_manager.sh"; then
    echo "Failed to copy cluster_manager.sh to cluster manager node $public_ip."
    return 1
  fi

  echo "Running cluster_manager.sh setup on cluster manager node"
  ssh_command "$public_ip" "root" "$HOME/.ssh/clustermgrroot" "chmod ugo+x *.sh; ./cluster_manager.sh setup $2"

  echo "Copying clustermgr public key to cluster manager node"
  scp -i "$HOME/.ssh/clustermgrroot" -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no "$HOME/.ssh/clustermgr.pub" \
    "root@$public_ip:/home/clustermgr/.ssh/authorized_keys"
  ssh_command "$public_ip" "root" "$HOME/.ssh/clustermgrroot" "chmod go-w /home/clustermgr/.ssh/authorized_keys; chown clustermgr:clustermgr /home/clustermgr/.ssh/authorized_keys"

  echo "Copying clustermgrguest public key to cluster manager node"
  scp -i "$HOME/.ssh/clustermgrroot" -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no "$HOME/.ssh/clustermgrguest.pub" \
    "root@$public_ip:/home/clustermgrguest/.ssh/authorized_keys"
  ssh_command "$public_ip" "root" "$HOME/.ssh/clustermgrroot" "chmod go-w /home/clustermgrguest/.ssh/authorized_keys; chown clustermgrguest:clustermgrguests /home/clustermgrguest/.ssh/authorized_keys"

  # Cleanup:
  # We don't want to keep the cluster_manager.sh around, since it contains the root user password for the node.
  # NOTE(review): this assumes the script is being run from inside a checkout
  # directory named "storm-linode" - confirm against the install instructions.
  # The removal only runs when the directory change succeeded, so a failed cd
  # can never delete a "storm-linode" entry of the current directory.
  if cd ..; then
    rm -rf storm-linode
  else
    echo "Warning: could not change to the parent directory; storm-linode was not removed."
  fi

  echo "Cluster Manager is ready. IP address is $public_ip."
}
# Provisions the cluster manager node itself. Meant to be run as root on the
# freshly booted node: disables IPv6, installs required packages, creates the
# 'clustermgr' and 'clustermgrguest' users, clones the storm-linode scripts,
# downloads the Storm and Zookeeper packages, generates the cluster keypairs
# and configures ssh password authentication.
#
# Reads STORM_URL, ZOOKEEPER_URL and DISABLE_SSH_PASSWORD_AUTHENTICATION from
# the settings at the top of this file.
#
# $1 -> (Optional) The SHA1 or short SHA1 hash of the git revision to download. If not specified,
#       it downloads the latest release tagged version.
# Returns: 1 if the clustermgr home or the cloned repository cannot be entered.
setup_cluster_manager() {
  # Disable IPv6 on Cluster Manager because
  # apt-get seems to hang on security.ubuntu.com domains when it's enabled.
  echo "Disabling IPv6"
  echo 'net.ipv6.conf.all.disable_ipv6 = 1' >> /etc/sysctl.conf
  echo 'net.ipv6.conf.default.disable_ipv6 = 1' >> /etc/sysctl.conf
  echo 'net.ipv6.conf.lo.disable_ipv6 = 1' >> /etc/sysctl.conf
  sysctl -p

  apt-get -y update
  #apt-get -y upgrade
  apt-get -y install git python2.7 ssh wget sed tmux

  # Create the 'clustermgr' user for running scripts.
  # It should be part of sudo because script should modify /etc/hosts of cluster manager node
  # Have to logout and login after adding to sudo to be able to use 'sudo' in commands.
  addgroup clustermgr
  adduser --ingroup clustermgr --gecos "" clustermgr
  adduser clustermgr sudo
  adduser clustermgr adm

  # Setup for git cloning from github (I think email should be setup)
  #git config --global user.email "you@example.com"
  #git config --global user.name "Your Name"

  # Get the latest release version of the scripts.
  # Abort if the directories cannot be entered; otherwise the clone and the
  # downloads below would land in whatever directory the shell happens to be in.
  if ! cd /home/clustermgr; then
    echo "Error: could not change to /home/clustermgr."
    return 1
  fi
  git clone "https://github.com/pathbreak/storm-linode"
  if ! cd storm-linode; then
    echo "Error: could not change to /home/clustermgr/storm-linode."
    return 1
  fi

  if [ -z "$1" ]; then
    git fetch --tags
    # The same variable must be used for lookup, message and checkout;
    # previously the tag was stored under one name and read back under another,
    # so the checkout received an empty argument.
    local latest_release_commit latest_release_tag
    latest_release_commit=$(git rev-list --tags='release*' --max-count=1)
    if [ -n "$latest_release_commit" ]; then
      latest_release_tag=$(git describe "$latest_release_commit")
    fi
    if [ -n "$latest_release_tag" ]; then
      echo "Activating latest release version of the software : $latest_release_tag"
      git checkout "$latest_release_tag"
    else
      echo "Warning: could not determine the latest release tag; keeping the default branch."
    fi
  else
    echo "Activating specified revision of the software : $1"
    git checkout "$1"
  fi

  chmod +x *.sh *.py

  # Download the packages next to the scripts and point the example image
  # configurations at the downloaded files.
  wget "$STORM_URL"
  local storm_package_name
  storm_package_name=$(basename "$STORM_URL")
  sed -i -r "s|^INSTALL_STORM_DISTRIBUTION=.*\$|INSTALL_STORM_DISTRIBUTION=./$storm_package_name|" storm-image-example.conf

  wget "$ZOOKEEPER_URL"
  local zk_package_name
  zk_package_name=$(basename "$ZOOKEEPER_URL")
  sed -i -r "s|^INSTALL_ZOOKEEPER_DISTRIBUTION=.*\$|INSTALL_ZOOKEEPER_DISTRIBUTION=./$zk_package_name|" zk-image-example.conf

  mkdir -p /home/clustermgr/.ssh
  ssh-keygen -t rsa -b 4096 -q -f /home/clustermgr/.ssh/clusterroot -N ''
  ssh-keygen -t rsa -b 4096 -q -f /home/clustermgr/.ssh/clusteradmin -N ''

  chmod go-rwx /home/clustermgr/storm-linode/*
  chown -R clustermgr:clustermgr /home/clustermgr

  # Disable ssh password authentication.
  # When sshd_config has no PasswordAuthentication line, one is appended;
  # otherwise every matching line is replaced with the wanted setting.
  if [ "$DISABLE_SSH_PASSWORD_AUTHENTICATION" == "yes" ]; then
    echo "Disabling SSH password authentication"
    grep -q 'PasswordAuthentication yes$\|PasswordAuthentication no$' /etc/ssh/sshd_config
    if [ $? -eq 1 ]; then
      echo 'PasswordAuthentication no' >> /etc/ssh/sshd_config
    else
      sed -r -i '/PasswordAuthentication yes$|PasswordAuthentication no$/ c PasswordAuthentication no' /etc/ssh/sshd_config
    fi
  elif [ "$DISABLE_SSH_PASSWORD_AUTHENTICATION" == "no" ]; then
    echo "Enabling SSH password authentication"
    grep -q 'PasswordAuthentication yes$\|PasswordAuthentication no$' /etc/ssh/sshd_config
    if [ $? -eq 1 ]; then
      echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config
    else
      sed -r -i '/PasswordAuthentication yes$|PasswordAuthentication no$/ c PasswordAuthentication yes' /etc/ssh/sshd_config
    fi
  fi
  service ssh restart

  # Create 'clustermgrguest' non-privileged user for devs to get non-sensitive information about clusters,
  # such as client node IP addresses.
  addgroup clustermgrguests
  adduser --ingroup clustermgrguests --gecos "" clustermgrguest
  mkdir -p /home/clustermgrguest/.ssh
  mkdir -p /home/clustermgrguest/storm-linode
  # Relative paths: the current directory is still the storm-linode checkout.
  cp cluster_info.sh /home/clustermgrguest/storm-linode/
  cp textfileops.sh /home/clustermgrguest/storm-linode/
  chown -R clustermgrguest:clustermgrguests /home/clustermgrguest/
}
# Runs ./linode_api.py (relative to the current directory) and hands its
# stdout, stderr and exit code back to the caller through named variables.
#
# $1 -> name of variable which receives output of command
# $2 -> name of variable which receives stderr of command
# $3 -> name of variable which receives return code of command (0=success, >0 are failures)
# $4... -> arguments to linode_api.py
#
# The receiving variable names must differ from this function's own locals
# (__error_file, __out, __err, __ret), otherwise the local would be written
# instead of the caller's variable.
# Returns: 0 once the three variables are set; 1 (with the return code
#          variable also set to 1) if no temporary file could be created.
linode_api() {
  local __error_file
  if ! __error_file=$(mktemp); then
    printf -v "$1" '%s' ''
    printf -v "$2" '%s' 'linode_api: could not create a temporary file to capture stderr'
    printf -v "$3" '%s' 1
    return 1
  fi

  # Important: Don't combine "local out=$(command)" into one line, because local acts as another command
  # and clobbers the command's return code.
  # as explained in http://stackoverflow.com/questions/4421257/why-does-local-sweep-the-return-code-of-a-command
  # and http://mywiki.wooledge.org/BashPitfalls#local_varname.3D.24.28command.29
  local __out __ret __err
  __out=$(./linode_api.py "${@:4}" 2>"$__error_file")
  __ret=$?
  __err=$(< "$__error_file")
  rm -f -- "$__error_file"

  # printf -v stores the text verbatim. The values may contain spaces, quotes,
  # '$' or backticks; assigning them through eval would re-parse (and possibly
  # execute) them as shell code.
  printf -v "$1" '%s' "$__out"
  printf -v "$2" '%s' "$__err"
  printf -v "$3" '%s' "$__ret"
  return 0
}
# Polls the status of a linode job through linode_api ("job-status") until the
# job completes, fails, or the polling budget is used up.
#
# $1 : The Job ID
# $2 : The linode ID
# $3 : (Optional) Maximum number of status checks. Default 48.
# $4 : (Optional) Seconds to sleep between status checks. Default 10.
#      With the defaults the job is polled for up to about 8 minutes.
# Return: 0 -> job remains pending even after timeout
#         1 -> job completed
#         2 -> job failed
#         3 -> could not get job status
wait_for_job() {
  local max_attempts=${3:-48}
  local poll_interval=${4:-10}
  local attempt=1
  local job_status=0 # Pending
  local linout linerr linret

  # "-le" so that exactly max_attempts status checks are made.
  while [ "$attempt" -le "$max_attempts" ]; do
    linode_api linout linerr linret "job-status" "$2" "$1"
    # linode_api reports any value >0 as a failure, not just 1.
    if [ "$linret" -ne 0 ]; then
      echo "Failed to find job status. Error:$linerr"
      return 3
    fi

    # Any other output (including empty output) is treated as "still pending".
    case "$linout" in
      1)
        # Job completed
        job_status=1
        break
        ;;
      2)
        # Job failed
        job_status=2
        break
        ;;
    esac

    attempt=$((attempt + 1))
    # No point in sleeping after the final check.
    if [ "$attempt" -le "$max_attempts" ]; then
      sleep "$poll_interval"
    fi
  done

  if [ "$job_status" -eq 0 ]; then
    echo "Job $1 did not complete even after $max_attempts status checks ($poll_interval seconds apart). Aborting"
  elif [ "$job_status" -eq 2 ]; then
    echo "Job $1 failed. Error:$linerr"
  fi

  return "$job_status"
}
# Runs a command on a remote node over ssh, authenticating only with the given
# private key and without recording or checking the node's host key.
#
# $1 -> IP address or hostname of node
# $2 -> SSH login username for node
# $3 -> SSH private key for node
# $4... -> Command and parameters to send to remote server.
#       Either Redirection character (>) should be escaped with a backslash(\) (without
#       the backslash, the redirection is attempted on the host machine instead of guest)
#       Or instead of escaping, the command should be enclosed in a pair of '(...)'. That seems to work even with unescaped redirection.
# Returns: the exit status of ssh.
ssh_command() {
  # IMPORTANT: The -n option is very important to avoid abruptly terminating callers who are in a "while read" loop.
  # This is because ssh without -n option reads complete stdin by default,
  # as explained in http://stackoverflow.com/questions/346445/bash-while-read-loop-breaking-early
  # -n : avoid reading stdin by redirecting stdin from /dev/null
  # -x : disable X11 negotiation
  # -q : quiet
  #
  # The destination is quoted so that user and host always reach ssh as a
  # single argument, free of word splitting and pathname expansion.
  ssh -q -n -x -i "$3" -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no "$2@$1" "${@:4}"
}
# Command-line entry point: dispatch on the sub-command given as first argument.
#   create-linode <api env conf file> [git revision]
#   setup [git revision]
# Arguments are quoted so that a conf file path containing spaces is passed on
# as a single argument.
case "$1" in
  create-linode)
    create_cluster_manager_linode "$2" "$3"
    ;;
  setup)
    setup_cluster_manager "$2"
    ;;
  *)
    # Previously an unknown or missing sub-command silently did nothing.
    echo "Usage: $0 create-linode <api-env-conf-file> [git-revision]" >&2
    echo "       $0 setup [git-revision]" >&2
    ;;
esac