Skip to content
This repository has been archived by the owner on Aug 28, 2024. It is now read-only.

Commit

Permalink
adding infra via eksctl
Browse files Browse the repository at this point in the history
  • Loading branch information
brandonjbjelland committed Aug 8, 2023
1 parent 7fd3104 commit 4ef5f84
Show file tree
Hide file tree
Showing 14 changed files with 232 additions and 706 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,5 @@ gcpmanager-dependencies.yaml
skaffold-dependencies.sh

.ipynb_checkpoints
.vscode/
.vscode/
eks-cluster.yaml
97 changes: 97 additions & 0 deletions install/kubernetes/eks-cluster.yaml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# eksctl ClusterConfig template for the Substratus dev cluster.
# Rendered with envsubst (see install/scripts/aws-up.sh / aws-down.sh), which
# substitutes ${AWS_PARTITION}, ${AWS_ACCOUNT_ID}, ${CLUSTER_NAME}, ${REGION},
# ${ARTIFACTS_REPO_NAME} and ${ARTIFACTS_BUCKET_NAME}.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: substratus
  region: us-west-2
  version: "1.27"
  tags:
    createdBy: eksctl
    environment: dev
    # Karpenter discovers subnets/SGs carrying this tag.
    karpenter.sh/discovery: substratus

karpenter:
  createServiceAccount: true
  withSpotInterruptionQueue: true
  defaultInstanceProfile: "KarpenterNodeInstanceProfile-substratus"
  version: "v0.29.0"

# TODO(bjb): do we need mngs with karpenter?
# if karpenter doesn't suffice: https://github.com/eksctl-io/eksctl/blob/main/examples/23-kubeflow-spot-instance.yaml
managedNodeGroups:
  - name: builder-ng
    privateNetworking: true
    labels: { role: builders }
    instanceTypes:
      - m6a.large
    volumeSize: 100
    minSize: 0
    maxSize: 3
    desiredCapacity: 1
    iam:
      withAddonPolicies:
        ebs: true
        imageBuilder: true

addons:
  - name: vpc-cni
    attachPolicyARNs:
      - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy
  - name: kube-proxy
  - name: aws-ebs-csi-driver
    wellKnownPolicies:
      ebsCSIController: true
  - name: coredns

iamIdentityMappings:
  - arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
    username: "system:node:{{EC2PrivateDNSName}}"
    groups:
      - system:bootstrappers
      - system:nodes

iam:
  withOIDC: true
  serviceAccounts:
    # IRSA role for the Karpenter controller (role only; the Helm chart
    # creates the ServiceAccount and annotates it with this role).
    - metadata:
        name: karpenter
        namespace: karpenter
      roleName: ${CLUSTER_NAME}-karpenter
      attachPolicyARNs:
        - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
      roleOnly: true
    - metadata:
        name: ebs-csi-controller-sa
        namespace: kube-system
      wellKnownPolicies:
        ebsCSIController: true
    - metadata:
        name: substratus
        namespace: substratus
      attachPolicy:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - "ecr:*"
            Resource:
              # Fixed: ECR repository ARNs require region, account id and the
              # "repository/" prefix; the previous "arn:aws:ecr:::<name>" form
              # is malformed and matched no resources.
              - "arn:aws:ecr:${REGION}:${AWS_ACCOUNT_ID}:repository/${ARTIFACTS_REPO_NAME}"
          - Effect: Allow
            Action:
              - "s3:*"
              - "s3-object-lambda:*"
            Resource:
              # Both object-level (/*) and bucket-level ARNs are needed:
              # object actions match the former, bucket actions the latter.
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*"
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}"
    - metadata:
        name: aws-manager
        namespace: substratus
      attachPolicy:
        # Scoped to presigned-URL generation needs:
        # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - "s3:PutObject"
              - "s3:GetObject"
            Resource:
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*"
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}"
69 changes: 69 additions & 0 deletions install/kubernetes/karpenter-provisioner.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# GPU Provisioner for Karpenter (v1alpha5 API).
# https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: gpu
spec:
  provider:
    instanceProfile: eksctl-KarpenterNodeInstanceProfile-substratus
    # Subnets and security groups are discovered via the tag applied by
    # the eksctl ClusterConfig.
    subnetSelector:
      karpenter.sh/discovery: substratus
    securityGroupSelector:
      karpenter.sh/discovery: substratus
  # Fixed: ttlSecondsAfterEmpty removed. In the v1alpha5 API it is mutually
  # exclusive with consolidation.enabled, and the Karpenter admission webhook
  # rejects a Provisioner that sets both, so this manifest failed to apply.
  # Consolidation also reclaims empty nodes, so no behavior is lost.
  consolidation:
    enabled: true
  # Taint GPU nodes so only pods that explicitly tolerate GPUs land here.
  taints:
    - key: nvidia.com/gpu
      value: "true"
      effect: NoSchedule
  requirements:
    - key: karpenter.sh/capacity-type
      operator: In
      values: ["spot"]
    - key: node.kubernetes.io/instance-type
      operator: In
      # Generated with:
      # aws ec2 describe-instance-types --region us-west-2 --query "InstanceTypes[?GpuInfo!=null].InstanceType" --output json | jq -r '.[]' | sort | grep -v dl1 | grep -v inf | grep -v p5 | grep -v trn1 | awk '{print "\""$1"\","}'
      values:
        - g2.2xlarge
        - g2.8xlarge
        - g3.16xlarge
        - g3.4xlarge
        - g3.8xlarge
        - g3s.xlarge
        - g4ad.16xlarge
        - g4ad.2xlarge
        - g4ad.4xlarge
        - g4ad.8xlarge
        - g4ad.xlarge
        - g4dn.12xlarge
        - g4dn.16xlarge
        - g4dn.2xlarge
        - g4dn.4xlarge
        - g4dn.8xlarge
        - g4dn.metal
        - g4dn.xlarge
        - g5.12xlarge
        - g5.16xlarge
        - g5.24xlarge
        - g5.2xlarge
        - g5.48xlarge
        - g5.4xlarge
        - g5.8xlarge
        - g5.xlarge
        - g5g.16xlarge
        - g5g.2xlarge
        - g5g.4xlarge
        - g5g.8xlarge
        - g5g.metal
        - g5g.xlarge
        - p2.16xlarge
        - p2.8xlarge
        - p2.xlarge
        - p3.16xlarge
        - p3.2xlarge
        - p3.8xlarge
        - p3dn.24xlarge
        - p4d.24xlarge
22 changes: 22 additions & 0 deletions install/scripts/aws-down.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1,23 @@
#!/bin/bash

# Tears down the Substratus EKS infrastructure created by aws-up.sh:
# artifacts bucket, ECR repo, Karpenter CloudFormation stack, then the cluster.
set -e
set -u

# Required env variables:
# : "$TOKEN $PROJECT"

export EKSCTL_ENABLE_CREDENTIAL_CACHE=1
export CLUSTER_NAME=substratus
export REGION=us-west-2
export ARTIFACTS_REPO_NAME=substratus
# Fixed: eks-cluster.yaml.tpl references ${AWS_PARTITION}; without this export
# envsubst rendered an empty string into every ARN in the generated config.
export AWS_PARTITION="aws"
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts

# Fixed: --force empties the bucket first; plain `rb` always fails on a
# non-empty bucket and the failure was silently swallowed by `|| true`.
aws s3 rb "s3://${ARTIFACTS_BUCKET_NAME}" --region "${REGION}" --force || true
# Fixed: --force deletes the repository even when it still contains images;
# also pin the region rather than relying on the caller's default.
aws ecr delete-repository --repository-name "${ARTIFACTS_REPO_NAME}" --region "${REGION}" --force || true

aws cloudformation delete-stack \
	--stack-name "Karpenter-${CLUSTER_NAME}" --region "${REGION}" || true

# Render the same config used at creation time so eksctl can resolve the
# cluster and its nodegroups for deletion.
envsubst <../kubernetes/eks-cluster.yaml.tpl >../kubernetes/eks-cluster.yaml
eksctl delete cluster -f ../kubernetes/eks-cluster.yaml
136 changes: 37 additions & 99 deletions install/scripts/aws-up.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,68 +4,25 @@ set -e
set -u

# Required env variables:
: "$TOKEN $PROJECT"
# : "$TOKEN $PROJECT"

# Used by gcloud:
# TODO(bjb): pass AWS creds into script
export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN}
# Used by terraform:
export GOOGLE_OAUTH_ACCESS_TOKEN=${TOKEN}
# # TODO(bjb): pass AWS creds into script
# export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN}

INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}"
AUTO_APPROVE="${AUTO_APPROVE:-no}"

# Create terraform state bucket if one does not exist.
# TODO(bjb): establish a bucket

# Apply infrastructure.
cd terraform/aws

# Backend variables cannot be configured via env variables.
echo "bucket = \"${TF_BUCKET}\"" >>backend.tfvars
terraform init --backend-config=backend.tfvars

export TF_VAR_project_id=${PROJECT}
if [ "${AUTO_APPROVE}" == "yes" ]; then
terraform apply -auto-approve
else
terraform apply
fi
CLUSTER_NAME=$(terraform output --json cluster | jq -r '.name')
CLUSTER_REGION=$(terraform output --json cluster | jq -r '.region')
CLUSTER_ENDPOINT=$(terraform output --json cluster | jq -r '.endpoint')
LOAD_BALANCER_CONTROLLER_ROLE_NAME=$(terraform output --json irsas | jq -r '.load_balancer_controller_irsa_role.iam_role_name')

cd -

# Configure kubectl.
aws eks --region ${CLUSTER_REGION} update-kubeconfig --name ${CLUSTER_NAME}
# Install cluster-level components

# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler
helm repo add eks https://aws.github.io/eks-charts
helm upgrade \
--install aws-node-termination-handler \
--namespace kube-system \
--version 0.21.0 \
eks/aws-node-termination-handler

# install EBS snapshotter?: https://github.com/kubernetes-csi/external-snapshotter#usage
# INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}"
export EKSCTL_ENABLE_CREDENTIAL_CACHE=1
export CLUSTER_NAME=substratus
export REGION=us-west-2
export ARTIFACTS_REPO_NAME=substratus
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts

# TODO(bjb): may not be needed if we can resolve 401 to 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/
# install aws-ebs-csi-driver: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md
helm repo add aws-ebs-csi-driver https://kubernetes-sigs.github.io/aws-ebs-csi-driver
helm repo update
helm upgrade \
--install aws-ebs-csi-driver \
--namespace kube-system \
aws-ebs-csi-driver/aws-ebs-csi-driver
aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} || true
aws ecr create-repository --repository-name ${ARTIFACTS_REPO_NAME} || true

# TODO(bjb): is this needed? Is doing the work here preferred to doing it in terraform?
# install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/
export KARPENTER_VERSION=v0.29.2
export AWS_PARTITION="aws"
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
export TEMPOUT=$(mktemp)
curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$TEMPOUT &&
aws cloudformation deploy \
Expand All @@ -74,54 +31,35 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}
--capabilities CAPABILITY_NAMED_IAM \
--parameter-overrides "ClusterName=${CLUSTER_NAME}"

eksctl create cluster -f - <<EOF
---
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
name: ${CLUSTER_NAME}
region: ${CLUSTER_REGION}
version: "1.27"
tags:
karpenter.sh/discovery: ${CLUSTER_NAME}
iam:
withOIDC: true
serviceAccounts:
- metadata:
name: karpenter
namespace: karpenter
roleName: ${CLUSTER_NAME}-karpenter
attachPolicyARNs:
- arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
roleOnly: true
iamIdentityMappings:
- arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
username: system:node:{{EC2PrivateDNSName}}
groups:
- system:bootstrappers
- system:nodes
managedNodeGroups:
- instanceType: t3a.large
amiFamily: AmazonLinux2
name: ${CLUSTER_NAME}-ng
desiredCapacity: 1
minSize: 0
maxSize: 3
EOF
envsubst <../kubernetes/eks-cluster.yaml.tpl >../kubernetes/eks-cluster.yaml
eksctl create cluster -f ../kubernetes/eks-cluster.yaml || eksctl upgrade cluster -f ../kubernetes/eks-cluster.yaml

export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"
echo $CLUSTER_ENDPOINT $KARPENTER_IAM_ROLE_ARN
aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true
aws eks --region ${REGION} update-kubeconfig --name ${CLUSTER_NAME}
# Logout of helm registry to perform an unauthenticated pull against the public ECR
helm registry logout public.ecr.aws || true

helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version ${KARPENTER_VERSION} --namespace karpenter --create-namespace \
--set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \
--set settings.aws.clusterName=${CLUSTER_NAME} \
--set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \
--set settings.aws.interruptionQueueName=${CLUSTER_NAME} \
--set controller.resources.requests.cpu=1 \
--set controller.resources.requests.memory=1Gi \
--set controller.resources.limits.cpu=1 \
--set controller.resources.limits.memory=1Gi \
--wait

kubectl apply -f ../kubernetes/karpenter-provisioner.yaml

# install the load balancer controller: https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html
helm install aws-load-balancer-controller eks/aws-load-balancer-controller \
-n kube-system \
--set clusterName=${CLUSTER_NAME} \
--set serviceAccount.create=false \
--set serviceAccount.name=${LOAD_BALANCER_CONTROLLER_ROLE_NAME}
# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler
helm repo add eks https://aws.github.io/eks-charts
helm upgrade \
--install aws-node-termination-handler \
--namespace kube-system \
--version 0.21.0 \
eks/aws-node-termination-handler

# Install the substratus operator.
# if [ "${INSTALL_OPERATOR}" == "yes" ]; then
Expand Down
7 changes: 0 additions & 7 deletions install/terraform/aws/backend.tf

This file was deleted.

27 changes: 0 additions & 27 deletions install/terraform/aws/common.tf

This file was deleted.

Loading

0 comments on commit 4ef5f84

Please sign in to comment.