-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcluster_create.sh
213 lines (180 loc) · 7.49 KB
/
cluster_create.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
####################################################################################################
# OpenDroneMap on k8s for ASDC DronesVL
# Owen Kaluza, Monash University, February 2022
#
# - Create kubernetes cluster nodegroup on openstack
# - This is the compute cluster used to run ODM jobs and Jupyterhub pods
####################################################################################################
####################################################################################################
#Load the settings, setup openstack and kubectl
source settings.env
#KUBECONFIG must be set (an empty value still counts as set); the message
#below points at asdc_run.sh as the way to initialise it
if [[ -z "${KUBECONFIG+x}" ]]; then
  echo "KUBECONFIG is unset, run : source asdc_run.sh to init cluster" >&2
  #Exit non-zero so a caller can tell nothing was done
  exit 1
fi

#Setup node groups
#Flag set to 1 by create_cluster once a nodegroup has been created
NODEGROUP_CREATED=0
#######################################
# Create a cluster nodegroup unless one with that name already exists.
# Globals:
#   CLUSTER                  (read)    cluster the nodegroup is added to
#   CLUSTER_DOCKER_VOL_SIZE  (read)    value passed to --docker-volume-size
#   NODEGROUP_CREATED        (written) set to 1 after a successful create;
#                                      this flag triggers setting taints etc
#                                      in cluster_deploy.sh
# Arguments:
#   $1 : name
#   $2 : flavour
#   $3 : number of nodes
#   $4 : role
# Outputs:
#   The existing nodegroup status (from openstack) and a progress message
#   on stdout; a failure message on stderr.
# Returns:
#   0 if the nodegroup already exists or was created, otherwise the exit
#   status of the failed create command.
#######################################
function create_cluster()
{
  local name=$1 flavour=$2 node_count=$3 role=$4
  local rc=0

  #"show" prints the status when the nodegroup exists; a non-zero exit is
  #treated as "does not exist"
  if openstack coe nodegroup show "$CLUSTER" "$name" -f value -c status; then
    echo "Cluster nodegroup $name already exists"
    return 0
  fi

  echo "Cluster nodegroup $name doesn't exist, creating"
  # https://docs.openstack.org/magnum/latest/user/#node-groups
  #(Will error if already created so ok to run again without check)
  #Labels don't seem to work at all... use magnum.openstack.role=cluster for nodeSelector
  # --labels asdc.cloud.edu.au/type=$1,asdc.cloud.edu.au/compute=1 --merge-labels
  #NOTE: if using the --labels option, also add --merge-labels or others will be cleared
  openstack coe nodegroup create "$CLUSTER" "$name" --flavor "$flavour" \
    --min-nodes 1 --node-count "$node_count" --role "$role" \
    --docker-volume-size "${CLUSTER_DOCKER_VOL_SIZE}" || rc=$?

  #Only raise the flag when the create request actually succeeded
  if (( rc != 0 )); then
    echo "Failed to create cluster nodegroup $name (exit status $rc)" >&2
    return "$rc"
  fi

  NODEGROUP_CREATED=1
  sleep 1
}
#Create one nodegroup per GPU type whose node count in the settings is
#greater than zero (an unset or empty count is treated as 0)
if [ "${NODES_P4:-0}" -gt 0 ]; then
  create_cluster "${NODEGROUP_BASE}-P4" "${CLUSTER_P4_FLAVOUR}" "${NODES_P4}" cluster
fi
if [ "${NODES_A40:-0}" -gt 0 ]; then
  create_cluster "${NODEGROUP_BASE}-A40" "${CLUSTER_A40_FLAVOUR}" "${NODES_A40}" cluster
fi
if [ "${NODES_A100:-0}" -gt 0 ]; then
  create_cluster "${NODEGROUP_BASE}-A100" "${CLUSTER_A100_FLAVOUR}" "${NODES_A100}" cluster
fi

#kubectl get all
#kubectl get all --all-namespaces
#List every node, then only those labelled with the "cluster" role
kubectl get nodes
kubectl get nodes -l magnum.openstack.org/role=cluster
#kubectl get nodes -l asdc.cloud.edu.au/type=a40
#kubectl get nodes -l nvidia.com/gpu.product=A40
#kubectl get nodes -l nvidia.com/gpu.product=A100
#kubectl get nodes -l nvidia.com/gpu.product=Tesla-P4
#To resize nodegroups
#openstack coe cluster resize $CLUSTER --nodegroup gpu-A40 2
#openstack coe cluster resize $CLUSTER --nodegroup gpu-A100 1
#Create the compute cluster
#source cluster_create.sh
####################################################################################################
echo "--- Phase 5 : Cluster config and GPU setup, deploy nodes etc"
####################################################################################################
####################################################################################################
# OpenDroneMap on k8s for ASDC DronesVL
# Owen Kaluza, Monash University, February 2022
#
####################################################################################################
#Load the settings, setup openstack and kubectl
source settings.env
#KUBECONFIG must be set (an empty value still counts as set); the message
#below points at asdc_run.sh as the way to initialise it
if [[ -z "${KUBECONFIG+x}" ]]; then
  echo "KUBECONFIG is unset, run : source asdc_run.sh to init cluster" >&2
  #Exit non-zero so a caller can tell nothing was done
  exit 1
fi
#######################################
# Query the status of a nodegroup and test whether it contains some text.
# Globals:
#   CLUSTER (read)    cluster the nodegroup belongs to
#   NSTATUS (written) status text printed by openstack; deliberately left
#                     global because nodegroup_wait prints it
# Arguments:
#   $1 nodegroup name
#   $2 text to look for in the status (e.g. "COMPLETE")
# Returns:
#   0 if the status contains $2, 1 otherwise
#######################################
function nodegroup_check()
{
  NSTATUS=$(openstack coe nodegroup show "$CLUSTER" "$1" -f value -c status)
  #Substring match; the result of the test is the function's return status
  [[ "$NSTATUS" == *"$2"* ]]
}
#######################################
# Wait until nodegroup complete, polling every 2 seconds.
# Globals:
#   NSTATUS (read) status text set by nodegroup_check on each poll
# Arguments:
#   $1 nodegroup name
# Outputs:
#   "Nodegroup <status> " on stdout after each unsuccessful poll;
#   an error message on stderr if the nodegroup reports a failure.
# Returns:
#   0 once the status contains "COMPLETE",
#   1 if the status contains "FAILED" (it would otherwise never complete)
#######################################
function nodegroup_wait()
{
  until nodegroup_check "$1" "COMPLETE"
  do
    #A failed nodegroup will not reach COMPLETE by itself, so stop polling
    if [[ "$NSTATUS" == *"FAILED"* ]]; then
      printf 'Nodegroup %s reported status %s, giving up\n' "$1" "$NSTATUS" >&2
      return 1
    fi
    #Status is passed as an argument, never as the printf format string
    printf 'Nodegroup %s ' "$NSTATUS"
    sleep 2
  done
}
####################################################################################################
echo "--- Phase 5a : Cluster node taints"
####################################################################################################
#Until bug with nodegroup creation fixed, may have to skip this
#Wait for each requested nodegroup (node count > 0; unset counts as 0)
if [ "${NODES_P4:-0}" -gt 0 ]; then
  nodegroup_wait "${NODEGROUP_BASE}-P4"
fi
if [ "${NODES_A40:-0}" -gt 0 ]; then
  nodegroup_wait "${NODEGROUP_BASE}-A40"
fi
if [ "${NODES_A100:-0}" -gt 0 ]; then
  nodegroup_wait "${NODEGROUP_BASE}-A100"
fi

export NODEGROUP_CREATED=1 #Need to force this if interrupted
if [[ "$NODEGROUP_CREATED" == 1 ]]; then
  #Apply some labels to the compute nodes
  #The jsonpath output is a space-separated list of node names, so the
  #unquoted command substitution is split into one word per node on purpose
  for node in $(kubectl get nodes -l magnum.openstack.org/role=cluster -ojsonpath='{.items[*].metadata.name}'); do
    #Wait until cinder-csi deployed, otherwise taint will prevent it loading
    until kubectl describe node "$node" | grep "topology.cinder.csi.openstack.org/zone"
    do
      printf 'Waiting for cinder.csi deployment on %s... ' "$node"
      sleep 5
    done

    kubectl label nodes "$node" asdc.cloud.edu.au/gpu=1 --overwrite
    #https://github.com/NVIDIA/gpu-operator/issues/322
    kubectl label nodes "$node" nvidia.com/mig.config=all-disabled --overwrite
    #kubectl get pods -A -owide --field-selector spec.nodeName=$node;
    kubectl taint nodes "$node" compute=true:NoSchedule --overwrite

    #Only use the compute hardware for jupyterhub
    #Use the compute nodes for jupyterhub pods
    #https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html
    kubectl label nodes "$node" hub.jupyter.org/node-purpose=user --overwrite
    #Use PreferNoSchedule so pods other than jupyterhub will still run on these nodes if they tolerate compute=true
    #kubectl taint nodes $node hub.jupyter.org/dedicated=user:PreferNoSchedule

    #csi-cinder daemonset created by OpenStack has no toleration for all taints like other kube-system ds
    # - fix this by patching the daemonset
    #(the patch does not depend on $node; it is re-applied on every iteration)
    kubectl patch daemonset csi-cinder-nodeplugin -n kube-system --patch "$(cat templates/ds-tolerations-patch.yml)"
  done
  export NODEGROUP_CREATED=0
fi
####################################################################################################
echo "--- Phase 6 : Deployment: Metashape"
####################################################################################################
#THIS IS A LEGACY STEP - WHEN WE WANT TO RUN METASHAPE NODES AGAIN, USE FLUXCD
#The whole phase is skipped unless NODE_METASHAPE is greater than zero
#(unset or empty counts as 0)
if [ "${NODE_METASHAPE:-0}" -gt 0 ]; then
  #Apply the secrets
  #TODO: move secrets to secrets/secret.env and the rest to asdc-infra
  kubectl apply -f metashape/dronedrive_secret.yaml

  #Setup the cifs/smb volume mount - this has problems, keeps restarting
  # - Install csi plugin
  #NOTE(review): this pipes a script from the "master" branch straight into
  # bash with TLS verification disabled (-k); consider pinning a release and
  # dropping -k
  curl -skSL https://raw.githubusercontent.com/kubernetes-csi/csi-driver-smb/master/deploy/install-driver.sh | bash -s master --
  # - Create persistent volume and claim
  kubectl apply -f metashape/csi-pv.yaml -f metashape/csi-pvc.yaml

  #Launch metashape server and load balancer service
  #(NOTE: we had to launch these in a separate VM on monash-02 instead
  # as monash-01 to monash-02 network is really broken right now
  # also - license server does not handle being run in a different container each time)
  #apply_template metashape-server.yaml
  #apply_template metashape-service.yaml
  #wait_for_pod metashape-server

  #Launch metashape processing nodes - require nvidia gpu resource
  #######################################
  # Deploy Metashape pod with unique name, unless a pod whose listing line
  # contains that name already shows up in "kubectl get pods".
  # Globals:
  #   NODE_NAME (exported) set to metashape-k8s<id>
  #   NODE_TYPE (exported) set to "metashape" before the template is applied
  # Arguments:
  #   $1 = id#
  #######################################
  function deploy_metashape()
  {
    export NODE_NAME="metashape-k8s$1"
    #Fixed-string match; matching lines are printed as in the pod listing
    if ! kubectl get pods | grep -F -- "$NODE_NAME"
    then
      echo ">>> METASHAPE NODE LAUNCH... " "$NODE_NAME"
      #NOTE(review): $2 and $3 are never passed by the loop below, so they
      # print as empty - confirm what was meant to be shown here
      echo "Deploying $2 : $3 as $NODE_NAME"
      export NODE_TYPE="metashape"
      #apply_template is defined outside this section
      apply_template metashape.yaml
    fi
  }

  #Deploy Metashape nodes
  for (( n=1; n<=NODE_METASHAPE; n++ ))
  do
    deploy_metashape "$n"
  done
fi