-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_cluster.sh
executable file
·116 lines (94 loc) · 2.73 KB
/
create_cluster.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/bin/bash
#######################################
# Print the environment variables that must be exported before running this
# script, followed by the script invocation itself.
# Arguments: none
# Outputs:   the usage text on stdout
#######################################
usage() {
  # The "\$" keeps the example command substitution literal: the help text
  # must show the aws command to the reader, not execute it. $0 is still
  # expanded so the text shows how this script was invoked.
  cat <<EOF

usage:
export AWS_ACCOUNT_ID="\$(aws sts get-caller-identity --query "Account" --output text)"
export AWS_REGION=us-east-2
export CLUSTER_NAME=trieve-gpu
export CPU_INSTANCE_TYPE=t3.medium
export GPU_INSTANCE_TYPE=g4dn.xlarge
export GPU_COUNT=1
$0

EOF
}
############
# Parameters
#
# Fixed settings chosen by this script; everything else must be exported by
# the caller (see usage).
export K8S_VERSION="1.30"
export CPU_INSTANCE_COUNT=5

#######################################
# Abort the script when a required environment variable is unset or empty.
# Arguments: $1 - name of the variable to check
#            $2 - message to print when it is missing
# Outputs:   the message and the usage text on stderr
# Returns:   does not return on failure; exits the script with status 1
#######################################
require_env() {
  # ${!1:-} reads the variable whose name is in $1, treating unset as empty.
  if [[ -z "${!1:-}" ]]; then
    echo "$2" >&2
    usage >&2
    # Explicit non-zero status: a bare "exit" would return usage's status (0)
    # and report success to the caller.
    exit 1
  fi
}

require_env AWS_REGION "error: AWS_REGION is not set"
require_env CLUSTER_NAME "CLUSTER_NAME is not set"
require_env AWS_ACCOUNT_ID "AWS_ACCOUNT_ID is not set"
require_env GPU_COUNT "GPU_COUNT is not set"
require_env GPU_INSTANCE_TYPE "GPU_INSTANCE_TYPE is not set"
require_env CPU_INSTANCE_TYPE "CPU_INSTANCE_TYPE is not set"

# Terminal formatting sequences, resolved once. They fall back to empty
# strings when tput cannot produce them, so the summary still prints cleanly.
fmt_bold="$(tput bold 2>/dev/null || true)"
fmt_reset="$(tput sgr0 2>/dev/null || true)"

# Summarize what is about to be provisioned so the operator can review it.
echo "Provision a cluster in ${fmt_bold}${AWS_REGION}${fmt_reset} named $CLUSTER_NAME for account $AWS_ACCOUNT_ID"
echo "Cluster breakdown:"
echo ""
echo "+ $GPU_COUNT * ${fmt_bold}${GPU_INSTANCE_TYPE}${fmt_reset}"
echo "+ $CPU_INSTANCE_COUNT * ${fmt_bold}${CPU_INSTANCE_TYPE}${fmt_reset}"
# Ask for explicit confirmation; only a single "y" or "Y" proceeds.
read -p "Confirm? [y/N]? " -r
if [[ $REPLY =~ ^[Yy]$ ]]; then
  # Create the cluster from an inline eksctl ClusterConfig. The heredoc
  # delimiter is unquoted on purpose so the $VARIABLES below are expanded.
  # Stop here on failure: every later step needs the cluster to exist.
  eksctl create cluster -f - <<EOF || { echo "error: eksctl create cluster failed" >&2; exit 1; }
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: $CLUSTER_NAME
  region: $AWS_REGION
  version: "$K8S_VERSION"

iam:
  withOIDC: true

managedNodeGroups:
  - name: main
    instanceType: $CPU_INSTANCE_TYPE
    desiredCapacity: $CPU_INSTANCE_COUNT
    maxSize: 8
    minSize: 4
    volumeSize: 20
    ssh:
      allow: false
    iam:
      withAddonPolicies:
        awsLoadBalancerController: true
        ebs: true
  - name: gpu
    labels:
      eks-node: gpu
    instanceType: $GPU_INSTANCE_TYPE
    desiredCapacity: $GPU_COUNT

vpc:
  cidr: 10.0.0.0/16

addonsConfig:
  autoApplyPodIdentityAssociations: true
  disableDefaultAddons: false

addons:
  - name: eks-pod-identity-agent
    version: latest
  - name: vpc-cni
    version: latest
    useDefaultPodIdentityAssociations: true
  - name: coredns
    version: latest
  - name: kube-proxy
    version: latest
  - name: aws-ebs-csi-driver
    version: latest
    useDefaultPodIdentityAssociations: true
EOF
  echo 'Deployment Done!'

  # Point kubectl at the new cluster; the kubectl/helm steps below need it.
  aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" \
    || { echo "error: failed to update kubeconfig" >&2; exit 1; }

  # Mark the gp2 StorageClass as the cluster default.
  kubectl patch sc gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' \
    || { echo "error: failed to patch StorageClass gp2" >&2; exit 1; }

  # Both manifest paths below are relative, so the script has to be run from
  # the directory that contains nvidia-device-plugin.yaml and nvdp.yaml.
  echo 'creating config map'
  kubectl apply -f ./nvidia-device-plugin.yaml \
    || { echo "error: failed to apply nvidia-device-plugin.yaml" >&2; exit 1; }

  echo 'Deploying helm chart'
  helm repo add nvdp https://nvidia.github.io/k8s-device-plugin \
    || { echo "error: failed to add helm repo nvdp" >&2; exit 1; }
  helm repo update nvdp \
    || { echo "error: failed to update helm repo nvdp" >&2; exit 1; }
  # "helm upgrade --install" installs the release when it is absent and
  # upgrades it otherwise; "helm install" has no --upgrade flag.
  helm upgrade --install nvdp nvdp/nvidia-device-plugin \
    --namespace kube-system \
    -f nvdp.yaml \
    --version 0.14.0 \
    --set config.name=nvidia-device-plugin \
    --force \
    || { echo "error: failed to deploy the nvidia-device-plugin chart" >&2; exit 1; }
else
  echo "Apply canceled"
fi