forked from eksctl-io/eksctl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path23-kubeflow-spot-instance.yaml
212 lines (203 loc) · 8.17 KB
/
23-kubeflow-spot-instance.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# Cost-Optimized EKS cluster for Kubeflow with spot GPU instances and node scale down to zero
# Built in an effort to reduce the training costs of ML workloads.
# Supporting tutorial can be found at the following link:
# https://blog.gofynd.com/how-we-reduced-our-ml-training-costs-by-78-a33805cb00cf
# This spec creates a cluster on EKS with the following active nodes
# - 2x m5a.2xlarge - Accommodates all pods of Kubeflow
# It also creates the following nodegroups, which run 0 nodes until a pod comes along and requires a node to be spun up
# - m5a.2xlarge -- Max Allowed 10 worker nodes
# - p2.xlarge -- Max Allowed 10 worker nodes
# - p3.2xlarge -- Max Allowed 10 worker nodes
# - p3.8xlarge -- Max Allowed 04 worker nodes
# - p3dn.24xlarge -- Max Allowed 01 worker nodes
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  # Cluster name; change it to whatever you find fit.
  # When renaming, also change every nodegroup tag from
  # 'k8s.io/cluster-autoscaler/cluster-23: "owned"' to
  # 'k8s.io/cluster-autoscaler/your-new-name: "owned"'.
  name: cluster-23
  # Choose the region wisely: it significantly impacts the cost incurred.
  region: us-east-1
  # Kubernetes 1.14, since Kubeflow 1.0 officially supports that version.
  version: '1.14'
  tags:
    # Add more cloud tags here if needed for billing.
    environment: staging

# Cluster-wide AZ list (it is not specific to any nodegroup). Add every AZ
# that nodes may need to be spun up in later on.
# THIS CAN'T BE CHANGED LATER. YOU WILL HAVE TO CREATE A NEW CLUSTER TO ADD NEW AZ SUPPORT.
availabilityZones:
- us-east-1a
- us-east-1b
- us-east-1d
- us-east-1f

nodeGroups:
# On-demand m5a.2xlarge nodegroup that starts with 2 nodes (0 to 3 allowed);
# per the file header these nodes accommodate the Kubeflow pods.
- name: ng-1
  instanceType: m5a.2xlarge
  availabilityZones:
  - us-east-1a
  minSize: 0
  desiredCapacity: 2
  maxSize: 3
  # Kubeflow requirement states 1-2 nodes with a 100GB volume attached,
  # so this nodegroup gets 100GB volumes for Kubeflow to be deployed on.
  volumeSize: 100
  volumeType: gp2
  labels:
    node-class: 'worker-node'
    # Carry the same labels that the node-template tags below advertise.
    # cluster-autoscaler models an empty group from those tags, so a pod
    # selecting on a label the real nodes lack would trigger a scale-up
    # onto nodes it can never be scheduled on.
    lifecycle: OnDemand
    aws.amazon.com/spot: 'false'
    gpu-count: '0'
  tags:
    # EC2 tags required for cluster-autoscaler auto-discovery.
    # The node-template/label entries mirror the labels above one-to-one.
    k8s.io/cluster-autoscaler/node-template/label/node-class: 'worker-node'
    k8s.io/cluster-autoscaler/node-template/label/lifecycle: OnDemand
    k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: 'false'
    k8s.io/cluster-autoscaler/node-template/label/gpu-count: '0'
    k8s.io/cluster-autoscaler/enabled: 'true'
    k8s.io/cluster-autoscaler/cluster-23: 'owned'
  iam:
    withAddonPolicies:
      albIngress: true
      autoScaler: true
      cloudWatch: true
# On-demand m5a.2xlarge nodegroup that starts with 0 nodes and is grown
# on demand by cluster-autoscaler.
- name: ng-2
  instanceType: m5a.2xlarge
  availabilityZones:
  - us-east-1a
  # Explicit bounds, like every other scale-from-zero nodegroup in this file:
  # the file header allows up to 10 m5a.2xlarge worker nodes, and the
  # autoscaler can only grow the group up to maxSize.
  minSize: 0
  desiredCapacity: 0
  maxSize: 10
  volumeType: gp2
  labels:
    node-class: 'worker-node'
    # Carry the same labels that the node-template tags below advertise.
    # cluster-autoscaler models an empty group from those tags, so a pod
    # selecting on a label the real nodes lack would trigger a scale-up
    # onto nodes it can never be scheduled on.
    lifecycle: OnDemand
    aws.amazon.com/spot: 'false'
    gpu-count: '0'
  tags:
    # EC2 tags required for cluster-autoscaler auto-discovery.
    # The node-template/label entries mirror the labels above one-to-one.
    k8s.io/cluster-autoscaler/node-template/label/node-class: 'worker-node'
    k8s.io/cluster-autoscaler/node-template/label/lifecycle: OnDemand
    k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: 'false'
    k8s.io/cluster-autoscaler/node-template/label/gpu-count: '0'
    k8s.io/cluster-autoscaler/enabled: 'true'
    k8s.io/cluster-autoscaler/cluster-23: 'owned'
  iam:
    withAddonPolicies:
      albIngress: true
      autoScaler: true
      cloudWatch: true
# Spot-only p2.xlarge nodegroup (gpu-count label "1"), sized 0 to 10 nodes.
- name: 1-gpu-spot-p2-xlarge
  # All GPU nodes stick to one AZ. If an instance is terminated, this
  # prevents volumes from becoming unavailable because the replacement
  # instance got spun up in another AZ.
  availabilityZones:
  - us-east-1a
  minSize: 0
  maxSize: 10
  instancesDistribution:
    instanceTypes:
    - p2.xlarge
    onDemandBaseCapacity: 0
    onDemandPercentageAboveBaseCapacity: 0
    spotAllocationStrategy: capacity-optimized
    # Set your own max price. AWS spot instance prices no longer cross the
    # OnDemand price; comment this field out to default to the OnDemand
    # price as the max price.
    maxPrice: 1.2
  labels:
    lifecycle: Ec2Spot
    aws.amazon.com/spot: 'true'
    gpu-count: '1'
  taints:
    spotInstance: 'true:PreferNoSchedule'
  tags:
    # node-template tags repeating the labels and the taint set above,
    # followed by the cluster-autoscaler enabled/owned tags.
    k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot
    k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: 'true'
    k8s.io/cluster-autoscaler/node-template/label/gpu-count: '1'
    k8s.io/cluster-autoscaler/node-template/taint/spotInstance: 'true:PreferNoSchedule'
    k8s.io/cluster-autoscaler/enabled: 'true'
    k8s.io/cluster-autoscaler/cluster-23: 'owned'
  iam:
    withAddonPolicies:
      albIngress: true
      autoScaler: true
      cloudWatch: true
# Spot-only p3.2xlarge nodegroup (gpu-count label "1"), sized 0 to 10 nodes.
- name: 1-gpu-spot-p3-2xlarge
  # All GPU nodes stick to one AZ. If an instance is terminated, this
  # prevents volumes from becoming unavailable because the replacement
  # instance got spun up in another AZ.
  availabilityZones:
  - us-east-1a
  minSize: 0
  maxSize: 10
  instancesDistribution:
    instanceTypes:
    - p3.2xlarge
    onDemandBaseCapacity: 0
    onDemandPercentageAboveBaseCapacity: 0
    spotAllocationStrategy: capacity-optimized
    # Set your own max price. AWS spot instance prices no longer cross the
    # OnDemand price; comment this field out to default to the OnDemand
    # price as the max price.
    maxPrice: 1.2
  labels:
    lifecycle: Ec2Spot
    aws.amazon.com/spot: 'true'
    gpu-count: '1'
  taints:
    spotInstance: 'true:PreferNoSchedule'
  tags:
    # node-template tags repeating the labels and the taint set above,
    # followed by the cluster-autoscaler enabled/owned tags.
    k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot
    k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: 'true'
    k8s.io/cluster-autoscaler/node-template/label/gpu-count: '1'
    k8s.io/cluster-autoscaler/node-template/taint/spotInstance: 'true:PreferNoSchedule'
    k8s.io/cluster-autoscaler/enabled: 'true'
    k8s.io/cluster-autoscaler/cluster-23: 'owned'
  iam:
    withAddonPolicies:
      albIngress: true
      autoScaler: true
      cloudWatch: true
# Spot-only p3.8xlarge nodegroup (gpu-count label "4"), sized 0 to 4 nodes.
- name: 4-gpu-spot-p3-8xlarge
  # All GPU nodes stick to one AZ. If an instance is terminated, this
  # prevents volumes from becoming unavailable because the replacement
  # instance got spun up in another AZ.
  availabilityZones:
  - us-east-1a
  minSize: 0
  maxSize: 4
  instancesDistribution:
    instanceTypes:
    - p3.8xlarge
    onDemandBaseCapacity: 0
    onDemandPercentageAboveBaseCapacity: 0
    spotAllocationStrategy: capacity-optimized
    # Set your own max price. AWS spot instance prices no longer cross the
    # OnDemand price; with the field commented out, as here, the max price
    # defaults to the OnDemand price.
    # maxPrice: 4.4
  labels:
    lifecycle: Ec2Spot
    aws.amazon.com/spot: 'true'
    gpu-count: '4'
  taints:
    spotInstance: 'true:PreferNoSchedule'
  tags:
    # node-template tags repeating the labels and the taint set above,
    # followed by the cluster-autoscaler enabled/owned tags.
    k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot
    k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: 'true'
    k8s.io/cluster-autoscaler/node-template/label/gpu-count: '4'
    k8s.io/cluster-autoscaler/node-template/taint/spotInstance: 'true:PreferNoSchedule'
    k8s.io/cluster-autoscaler/enabled: 'true'
    k8s.io/cluster-autoscaler/cluster-23: 'owned'
  iam:
    withAddonPolicies:
      albIngress: true
      autoScaler: true
      cloudWatch: true
# Spot-only p3dn.24xlarge nodegroup (gpu-count label "8"), sized 0 to 1 node.
- name: 8-gpu-spot-p3dn-24xlarge
  # Same single AZ as the other GPU nodegroups in this file.
  availabilityZones:
  - us-east-1a
  minSize: 0
  maxSize: 1
  instancesDistribution:
    instanceTypes:
    - p3dn.24xlarge
    onDemandBaseCapacity: 0
    onDemandPercentageAboveBaseCapacity: 0
    spotAllocationStrategy: capacity-optimized
    # Set your own max price. AWS spot instance prices no longer cross the
    # OnDemand price; comment this field out to default to the OnDemand
    # price as the max price.
    maxPrice: 11
  labels:
    lifecycle: Ec2Spot
    aws.amazon.com/spot: 'true'
    gpu-count: '8'
  taints:
    spotInstance: 'true:PreferNoSchedule'
  tags:
    # node-template tags repeating the labels and the taint set above,
    # followed by the cluster-autoscaler enabled/owned tags.
    k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot
    k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: 'true'
    k8s.io/cluster-autoscaler/node-template/label/gpu-count: '8'
    k8s.io/cluster-autoscaler/node-template/taint/spotInstance: 'true:PreferNoSchedule'
    k8s.io/cluster-autoscaler/enabled: 'true'
    k8s.io/cluster-autoscaler/cluster-23: 'owned'
  iam:
    withAddonPolicies:
      albIngress: true
      autoScaler: true
      cloudWatch: true