examples/23-kubeflow-spot-instance.yaml

# Cost-Optimized EKS cluster for Kubeflow with spot GPU instances and node scale down to zero
# Built in efforts to reducing training costs of ML workloads.
# Supporting tutorial can be found at the following link: 
# https://blog.gofynd.com/how-we-reduced-our-ml-training-costs-by-78-a33805cb00cf
# This spec creates a cluster on EKS with the following active nodes 
# - 2x m5a.2xlarge - Accomodates all pods of Kubeflow
# It also creates the following nodegroups with 0 nodes running unless a pod comes along and requests for the node to get spun up
# - m5a.2xlarge   -- Max Allowed 10 worker nodes
# - p2.xlarge     -- Max Allowed 10 worker nodes
# - p3.2xlarge    -- Max Allowed 10 worker nodes
# - p3.8xlarge    -- Max Allowed 04 worker nodes
# - p3dn.24xlarge -- Max Allowed 01 worker nodes

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  # Name of your cluster, change to whatever you find fit.
  # If changed, make sure to change all nodegroup tags from 
  # 'k8s.io/cluster-autoscaler/cluster-23: "owned"' --> 'k8s.io/cluster-autoscaler/your-new-name: "owned"'
  name: cluster-23
  # choose your region wisely, this will significantly impact the cost incurred
  region: us-east-1
  # 1.14 Kubernetes version since Kubeflow 1.0 officially supports the same
  version: '1.14'
  tags:
    # Add more cloud tags if needed for billing
    environment: staging

# Add all possible AZs to ensure nodes can be spun up in any AZ later on. 
# THIS CAN'T BE CHANGED LATER. YOU WILL HAVE TO CREATE A NEW CLUSTER TO ADD NEW AZ SUPPORT.
# This list applies to the whole clustr and isn't specific to nodegroups
availabilityZones: ["us-east-1a", "us-east-1b",  "us-east-1d",  "us-east-1f"]

nodeGroups:
  - name: ng-1
    desiredCapacity: 2
    minSize: 0
    maxSize: 3
    # Set one nodegroup with 100GB volumes for Kubeflow to get deployed. 
    # Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node. 
    volumeSize: 100
    volumeType: gp2
    instanceType: m5a.2xlarge
    availabilityZones: ["us-east-1a"]
    labels:
      node-class: "worker-node"
    tags:
      # EC2 tags required for cluster-autoscaler auto-discovery
      k8s.io/cluster-autoscaler/node-template/label/lifecycle: OnDemand
      k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "false"
      k8s.io/cluster-autoscaler/node-template/label/gpu-count: "0"
      k8s.io/cluster-autoscaler/enabled: "true"
      k8s.io/cluster-autoscaler/cluster-23: "owned"
    iam:
      withAddonPolicies:
        albIngress: true
        autoScaler: true
        cloudWatch: true

  - name: ng-2
    desiredCapacity: 0
    volumeType: gp2
    instanceType: m5a.2xlarge
    availabilityZones: ["us-east-1a"]
    labels:
      node-class: "worker-node"
    tags:
      # EC2 tags required for cluster-autoscaler auto-discovery
      k8s.io/cluster-autoscaler/node-template/label/lifecycle: OnDemand
      k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "false"
      k8s.io/cluster-autoscaler/node-template/label/gpu-count: "0"
      k8s.io/cluster-autoscaler/enabled: "true"
      k8s.io/cluster-autoscaler/cluster-23: "owned"
    iam:
      withAddonPolicies:
        albIngress: true
        autoScaler: true
        cloudWatch: true

  - name: 1-gpu-spot-p2-xlarge
    minSize: 0
    maxSize: 10
    instancesDistribution:
      # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
      # Comment out the field to default to OnDemand as max price. 
      maxPrice: 1.2
      instanceTypes: ["p2.xlarge"]
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0
      spotAllocationStrategy: capacity-optimized
    labels:
      lifecycle: Ec2Spot
      aws.amazon.com/spot: "true"
      gpu-count: "1"
    # Stick to one AZ for all GPU nodes. 
    # In case of termination, this will prevent volumes from being unavailable 
    # if the new instance got spun up in another AZ.
    availabilityZones: ["us-east-1a"]
    taints:
      spotInstance: "true:PreferNoSchedule"
    tags:
      k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot
      k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "true"
      k8s.io/cluster-autoscaler/node-template/label/gpu-count: "1"
      k8s.io/cluster-autoscaler/node-template/taint/spotInstance: "true:PreferNoSchedule"
      k8s.io/cluster-autoscaler/enabled: "true"
      k8s.io/cluster-autoscaler/cluster-23: "owned"
    iam:
      withAddonPolicies:
        autoScaler: true
        cloudWatch: true
        albIngress: true

  - name: 1-gpu-spot-p3-2xlarge
    minSize: 0
    maxSize: 10
    instancesDistribution:
      # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
      # Comment out the field to default to OnDemand as max price. 
      maxPrice: 1.2
      instanceTypes: ["p3.2xlarge"]
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0
      spotAllocationStrategy: capacity-optimized
    labels:
      lifecycle: Ec2Spot
      aws.amazon.com/spot: "true"
      gpu-count: "1"
    # Stick to one AZ for all GPU nodes. 
    # In case of termination, this will prevent volumes from being unavailable 
    # if the new instance got spun up in another AZ.
    availabilityZones: ["us-east-1a"]
    taints:
      spotInstance: "true:PreferNoSchedule"
    tags:
      k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot
      k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "true"
      k8s.io/cluster-autoscaler/node-template/label/gpu-count: "1"
      k8s.io/cluster-autoscaler/node-template/taint/spotInstance: "true:PreferNoSchedule"
      k8s.io/cluster-autoscaler/enabled: "true"
      k8s.io/cluster-autoscaler/cluster-23: "owned"
    iam:
      withAddonPolicies:
        autoScaler: true
        cloudWatch: true
        albIngress: true

  - name: 4-gpu-spot-p3-8xlarge
    minSize: 0
    maxSize: 4
    instancesDistribution:
      # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
      # Comment out the field to default to OnDemand as max price. 
      # maxPrice: 4.4
      instanceTypes: ["p3.8xlarge"]
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0
      spotAllocationStrategy: capacity-optimized
    labels:
      lifecycle: Ec2Spot
      aws.amazon.com/spot: "true"
      gpu-count: "4"
    # Stick to one AZ for all GPU nodes. 
    # In case of termination, this will prevent volumes from being unavailable 
    # if the new instance got spun up in another AZ.
    availabilityZones: ["us-east-1a"]
    taints:
      spotInstance: "true:PreferNoSchedule"
    tags:
      k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot
      k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "true"
      k8s.io/cluster-autoscaler/node-template/label/gpu-count: "4"
      k8s.io/cluster-autoscaler/node-template/taint/spotInstance: "true:PreferNoSchedule"
      k8s.io/cluster-autoscaler/enabled: "true"
      k8s.io/cluster-autoscaler/cluster-23: "owned"
    iam:
      withAddonPolicies:
        autoScaler: true
        cloudWatch: true
        albIngress: true

  - name: 8-gpu-spot-p3dn-24xlarge
    minSize: 0
    maxSize: 1
    instancesDistribution:
      # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
      # Comment out the field to default to OnDemand as max price. 
      maxPrice: 11
      instanceTypes: ["p3dn.24xlarge"]
      onDemandBaseCapacity: 0
      onDemandPercentageAboveBaseCapacity: 0
      spotAllocationStrategy: capacity-optimized
    labels:
      lifecycle: Ec2Spot
      aws.amazon.com/spot: "true"
      gpu-count: "8"
    availabilityZones: ["us-east-1a"]
    taints:
      spotInstance: "true:PreferNoSchedule"
    tags:
      k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot
      k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "true"
      k8s.io/cluster-autoscaler/node-template/label/gpu-count: "8"
      k8s.io/cluster-autoscaler/node-template/taint/spotInstance: "true:PreferNoSchedule"
      k8s.io/cluster-autoscaler/enabled: "true"
      k8s.io/cluster-autoscaler/cluster-23: "owned"
    iam:
      withAddonPolicies:
        autoScaler: true
        cloudWatch: true
        albIngress: true