Skip to content
This repository has been archived by the owner on Aug 28, 2024. It is now read-only.

adds AWS infra for instantiating and destroying all baseline substratus components #170

Merged
merged 22 commits into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
1222b2f
adds an AWS module for instantiating all substratus components + an o…
brandonjbjelland Aug 6, 2023
7fd3104
paired back terraform install bits. aws-up started
brandonjbjelland Aug 7, 2023
4ef5f84
adding infra via eksctl
brandonjbjelland Aug 8, 2023
c10bb46
updated dockerfile to install eksctl and work with common architectures
brandonjbjelland Aug 8, 2023
03a444d
added a karpenter AWSNodeTemplate
brandonjbjelland Aug 8, 2023
83d2ef0
working with dirs relative to scripts
brandonjbjelland Aug 8, 2023
847ebe7
aws-up and aws-down working in a containerized context via makefile t…
brandonjbjelland Aug 8, 2023
f9cf844
added the nvidia device plugin to get device drivers
brandonjbjelland Aug 8, 2023
9aef30b
needing more resources per node for the daemonsets
brandonjbjelland Aug 9, 2023
daf75d4
moving all aws specific manifests into a dedicated dir
brandonjbjelland Aug 9, 2023
bc9c903
adds provisioners that add standard taint and an accelerator-specific…
brandonjbjelland Aug 9, 2023
e752152
bringing the karpenter config back down to earth
brandonjbjelland Aug 9, 2023
3741018
reverting changes to sci
brandonjbjelland Aug 9, 2023
54e3511
occasional sqs queue left over
brandonjbjelland Aug 9, 2023
e846c81
bugfix
brandonjbjelland Aug 9, 2023
f57fc8b
migrated tools install to a dedicated script that should work on work…
brandonjbjelland Aug 9, 2023
2ede193
migrated to lowercase vars
brandonjbjelland Aug 9, 2023
db6e229
consistent makefile target naming
brandonjbjelland Aug 9, 2023
0431615
dropping probably not needed aws-down steps
brandonjbjelland Aug 9, 2023
5149163
improved caching on docker build. dropping some karpenter configs
brandonjbjelland Aug 9, 2023
76ff2c9
everything is working consistently. shipping it
brandonjbjelland Aug 10, 2023
e7ca133
Merge branch 'main' into feat/add-aws-infra
brandonjbjelland Aug 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,6 @@ gcpmanager-dependencies.yaml
skaffold-dependencies.sh

.ipynb_checkpoints
.vscode/
.vscode/
eks-cluster.yaml
karpenter-provisioner.yaml
77 changes: 49 additions & 28 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ all: build

.PHONY: help
help: ## Display this help.
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)

##@ Development

Expand Down Expand Up @@ -120,40 +120,60 @@ skaffold-dev-gcpmanager: protoc skaffold protogen render-skaffold-manifests ## R
build: manifests generate fmt vet ## Build manager binary.
go build -o bin/manager cmd/controllermanager/main.go

.PHONY: dev-up
dev-up:
docker build ./install -t substratus-installer && \
.PHONY: dev-up-gcp
dev-up-gcp: build-installer
docker run -it \
-v ${HOME}/.kube:/root/.kube \
-e PROJECT=$(shell gcloud config get project) \
-e TOKEN=$(shell gcloud auth print-access-token) \
-e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \
-e INSTALL_OPERATOR=false \
substratus-installer gcp-up.sh
-v ${HOME}/.kube:/root/.kube \
-e PROJECT=$(shell gcloud config get project) \
-e TOKEN=$(shell gcloud auth print-access-token) \
-e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \
-e INSTALL_OPERATOR=false \
substratus-installer gcp-up.sh
mkdir -p secrets
gcloud iam service-accounts keys create --iam-account=substratus-gcp-manager@$(shell gcloud config get project).iam.gserviceaccount.com ./secrets/gcp-manager-key.json

.PHONY: dev-down
dev-down:
.PHONY: dev-down-gcp
dev-down-gcp: build-installer
docker run -it \
-v ${HOME}/.kube:/root/.kube \
-e PROJECT=$(shell gcloud config get project) \
-e TOKEN=$(shell gcloud auth print-access-token) \
-e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \
substratus-installer gcp-down.sh
-v ${HOME}/.kube:/root/.kube \
-e PROJECT=$(shell gcloud config get project) \
-e TOKEN=$(shell gcloud auth print-access-token) \
-e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \
substratus-installer gcp-down.sh
rm ./secrets/gcp-manager-key.json

.PHONY: dev-run
# Provision the AWS development environment by running aws-up.sh inside the
# substratus-installer image (built by the build-installer prerequisite).
#
# The AWS account id and credentials are resolved by the recipe shell
# ($$(...)) rather than by $(shell ...), and are handed to docker by name
# (`-e VAR`). That way make's command echo and docker's argv only ever contain
# the variable names, never the secret key or session token themselves.
# AWS_SESSION_TOKEN is forwarded as an empty value when no session token is
# configured, which matches what `-e AWS_SESSION_TOKEN=` did before.
.PHONY: dev-up-aws
dev-up-aws: build-installer ## Create the AWS dev environment (runs aws-up.sh in the installer image).
	AWS_ACCOUNT_ID="$$(aws sts get-caller-identity --query Account --output text)" \
	AWS_ACCESS_KEY_ID="$$(aws configure get aws_access_key_id)" \
	AWS_SECRET_ACCESS_KEY="$$(aws configure get aws_secret_access_key)" \
	AWS_SESSION_TOKEN="$$(aws configure get aws_session_token)" \
	docker run -it \
		-v ${HOME}/.kube:/root/.kube \
		-e AWS_ACCOUNT_ID \
		-e AWS_ACCESS_KEY_ID \
		-e AWS_SECRET_ACCESS_KEY \
		-e AWS_SESSION_TOKEN \
		-e INSTALL_OPERATOR=false \
		substratus-installer aws-up.sh

# Tear down the AWS development environment by running aws-down.sh inside the
# substratus-installer image (built by the build-installer prerequisite).
#
# The AWS account id and credentials are resolved by the recipe shell
# ($$(...)) rather than by $(shell ...), and are handed to docker by name
# (`-e VAR`). That way make's command echo and docker's argv only ever contain
# the variable names, never the secret key or session token themselves.
# AWS_SESSION_TOKEN is forwarded as an empty value when no session token is
# configured, which matches what `-e AWS_SESSION_TOKEN=` did before.
.PHONY: dev-down-aws
dev-down-aws: build-installer ## Destroy the AWS dev environment (runs aws-down.sh in the installer image).
	AWS_ACCOUNT_ID="$$(aws sts get-caller-identity --query Account --output text)" \
	AWS_ACCESS_KEY_ID="$$(aws configure get aws_access_key_id)" \
	AWS_SECRET_ACCESS_KEY="$$(aws configure get aws_secret_access_key)" \
	AWS_SESSION_TOKEN="$$(aws configure get aws_session_token)" \
	docker run -it \
		-v ${HOME}/.kube:/root/.kube \
		-e AWS_ACCOUNT_ID \
		-e AWS_ACCESS_KEY_ID \
		-e AWS_SECRET_ACCESS_KEY \
		-e AWS_SESSION_TOKEN \
		substratus-installer aws-down.sh

.PHONY: dev-run-gcp
# Controller manager configuration #
dev-run: export CLOUD=gcp
dev-run: export GPU_TYPE=nvidia-l4
dev-run: export PROJECT_ID=$(shell gcloud config get project)
dev-run: export CLUSTER_NAME=substratus
dev-run: export CLUSTER_LOCATION=us-central1
dev-run-gcp: export CLOUD=gcp
dev-run-gcp: export GPU_TYPE=nvidia-l4
dev-run-gcp: export PROJECT_ID=$(shell gcloud config get project)
dev-run-gcp: export CLUSTER_NAME=substratus
dev-run-gcp: export CLUSTER_LOCATION=us-central1
# Cloud manager configuration #
dev-run: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json
dev-run-gcp: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json
# Run the controller manager and the cloud manager.
dev-run: manifests kustomize install-crds
dev-run-gcp: manifests kustomize install-crds
go run ./cmd/gcpmanager & \
go run ./cmd/controllermanager/main.go \
--sci-address=localhost:10080 \
Expand All @@ -176,16 +196,17 @@ docker-push: ## Push docker image with the manager.

.PHONY: docs
docs: crd-ref-docs embedmd
$(CRD_REF_DOCS) --config=./docs/api/config.yaml \
$(CRD_REF_DOCS) \
--config=./docs/api/config.yaml \
--log-level=INFO \
--output-path=./docs/api/generated.md \
--source-path=./api \
--templates-dir=./docs/api/templates/markdown \
--templates-dir=./docs/api/templates/markdown \
--renderer=markdown
# TODO: Embed YAML examples into the generated API documentation.
# $(EMBEDMD) -w ./docs/api/generated.md

# PLATFORMS defines the target platforms for the manager image be build to provide support to multiple
# PLATFORMS defines the target platforms for the manager image be build to provide support to multiple
# architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to:
# - able to use docker buildx . More info: https://docs.docker.com/build/buildx/
# - have enable BuildKit, More info: https://docs.docker.com/develop/develop-images/build_enhancements/
Expand Down
6 changes: 3 additions & 3 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,19 @@
Create a GCP environment.

```sh
make dev-up
make dev-up-gcp
```

Run Substratus control plane locally.

```sh
make dev-run
make dev-run-gcp
```

Delete GCP infra.

```sh
make dev-down
make dev-down-gcp
```

TODO: Automate the cleanup of PVs... Don't forget to manually clean them up for now.
Expand Down
40 changes: 4 additions & 36 deletions install/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,44 +1,12 @@
FROM ubuntu:23.04

ENV PATH $PATH:/workspace/scripts:/usr/local/gcloud/google-cloud-sdk/bin
WORKDIR /workspace

# Common
RUN apt-get update && \
apt-get install -y \
gnupg \
software-properties-common \
unzip \
wget \
curl \
git
COPY scripts/get-tools.sh scripts/

# Terraform
RUN wget https://releases.hashicorp.com/terraform/1.4.5/terraform_1.4.5_linux_amd64.zip
RUN unzip terraform_1.4.5_linux_amd64.zip
RUN mv terraform /usr/local/bin/
RUN terraform --version
RUN scripts/get-tools.sh

# Google Cloud (gcloud)
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz
RUN mkdir -p /usr/local/gcloud \
&& tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \
&& /usr/local/gcloud/google-cloud-sdk/install.sh
ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin
RUN gcloud --version
RUN gcloud components install gke-gcloud-auth-plugin

# Kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
RUN chmod +x ./kubectl
RUN mv ./kubectl /usr/local/bin

# Helm
RUN curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
RUN chmod 700 /tmp/get_helm.sh
RUN /tmp/get_helm.sh

# Local files
COPY scripts scripts
COPY terraform terraform
COPY kubernetes kubernetes
COPY scripts scripts
ENV PATH $PATH:/workspace/scripts
96 changes: 96 additions & 0 deletions install/kubernetes/aws/eks-cluster.yaml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# eksctl ClusterConfig template. The ${...} placeholders (CLUSTER_NAME, REGION,
# AWS_ACCOUNT_ID, ARTIFACTS_BUCKET_NAME, ARTIFACTS_REPO_NAME) are substituted
# with envsubst to render eks-cluster.yaml before it is passed to eksctl.
# https://eksctl.io/usage/schema/
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${REGION}
  version: "1.27"
  tags:
    createdBy: eksctl
    environment: dev
    # Matched by the subnet / security-group selectors of the Karpenter
    # AWSNodeTemplate.
    karpenter.sh/discovery: ${CLUSTER_NAME}

managedNodeGroups:
  - name: builder-ng
    privateNetworking: true
    labels: { role: builders }
    amiFamily: Ubuntu2004
    instanceTypes:
      - m6a.large
    volumeSize: 100
    minSize: 0
    maxSize: 3
    desiredCapacity: 1
    iam:
      withAddonPolicies:
        ebs: true
        imageBuilder: true

addons:
  - name: vpc-cni
    attachPolicyARNs:
      - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy
  - name: kube-proxy
  - name: aws-ebs-csi-driver
    wellKnownPolicies:
      ebsCSIController: true
  - name: coredns

# Lets nodes that assume the Karpenter node role register with the cluster.
iamIdentityMappings:
  - arn: "arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
    username: system:node:{{EC2PrivateDNSName}}
    groups:
      - system:bootstrappers
      - system:nodes

iam:
  withOIDC: true
  serviceAccounts:
    - metadata:
        name: karpenter
        namespace: karpenter
      roleName: ${CLUSTER_NAME}-karpenter
      attachPolicyARNs:
        # this is used as spec.instanceProfile in the karpenter AWSNodeTemplate
        # NOTE(review): the AWSNodeTemplate references
        # KarpenterNodeInstanceProfile-${CLUSTER_NAME}, not this controller
        # policy - the comment above looks misplaced; confirm and relocate.
        - arn:aws:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
      roleOnly: true
    - metadata:
        name: ebs-csi-controller-sa
        namespace: kube-system
      wellKnownPolicies:
        ebsCSIController: true
    - metadata:
        name: aws-manager
        namespace: substratus
      attachPolicy:
        # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html
        Version: "2012-10-17"
        Statement:
          - Sid: "AllowUrlPreSigning"
            Effect: Allow
            Action:
              - "s3:PutObject"
              - "s3:GetObject"
            Resource:
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*"
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}"
          - Sid: "FullSubstratusEcrRepoAccess"
            Effect: Allow
            Action:
              - "ecr:*"
            Resource:
              # ECR repository ARNs carry region, account id and the
              # "repository/" prefix; without them the statement matches no
              # repository at all.
              - "arn:aws:ecr:${REGION}:${AWS_ACCOUNT_ID}:repository/${ARTIFACTS_REPO_NAME}"
          - Sid: "S3AdminSubstratusBucketAccess"
            Effect: Allow
            Action:
              - "s3:*"
              - "s3-object-lambda:*"
            Resource:
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*"
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}"
          - Sid: "ModifyOwnTrustPolicy"
            Effect: Allow
            Action:
              - "iam:UpdateAssumeRolePolicy"
            Resource:
              # NOTE(review): envsubst leaves "$${aws:userid}" untouched, so the
              # doubled "$" reaches IAM as-is. Confirm this is the intended
              # policy-variable spelling and that aws:userid resolves to
              # something usable as a role name here.
              - "arn:aws:iam::${AWS_ACCOUNT_ID}:role/$${aws:userid}"
47 changes: 47 additions & 0 deletions install/kubernetes/aws/karpenter-provisioner.yaml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Karpenter node template + GPU provisioner. ${CLUSTER_NAME} is a placeholder
# to be substituted before the manifest is applied.
apiVersion: karpenter.k8s.aws/v1alpha1
kind: AWSNodeTemplate
metadata:
  name: default
spec:
  instanceProfile: KarpenterNodeInstanceProfile-${CLUSTER_NAME}
  # Subnets and security groups are discovered through the
  # karpenter.sh/discovery tag.
  subnetSelector:
    karpenter.sh/discovery: ${CLUSTER_NAME}
  securityGroupSelector:
    karpenter.sh/discovery: ${CLUSTER_NAME}
---
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: nvidia-gpu
spec:
  providerRef:
    name: default
  consolidation:
    enabled: true
  # Nodes from this provisioner are tainted so that only workloads tolerating
  # nvidia.com/gpu are scheduled onto them.
  taints:
    - key: nvidia.com/gpu
      value: "true"
      effect: "NoSchedule"
  # These well-known labels (here karpenter.k8s.aws/instance-family) guide
  # karpenter in accelerator and instance type selection:
  # https://karpenter.sh/v0.29/concepts/scheduling/#labels
  requirements:
    - key: karpenter.sh/capacity-type
      operator: In
      values: ["spot"]
    - key: karpenter.k8s.aws/instance-family
      operator: In
      # Values must be exact EC2 family names (the part of the instance type
      # before the dot). "g4" is not a family; the NVIDIA G4 generation is
      # "g4dn".
      values:
        - "p4de"
        - "p4d"
        - "p3dn"
        - "p3"
        - "p2"
        - "g2"
        - "g3"
        - "g4dn"
        - "g5"
    - key: "kubernetes.io/arch"
      operator: In
      values: ["amd64"]
8 changes: 8 additions & 0 deletions install/kubernetes/aws/nvidia-eks-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Scheduling constraint for the NVIDIA device plugin: only run on nodes whose
# karpenter.k8s.aws/instance-gpu-manufacturer label is "nvidia".
# NOTE(review): presumably consumed as Helm values for the device plugin
# chart - confirm against the install script.
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: karpenter.k8s.aws/instance-gpu-manufacturer
              operator: In
              values:
                - "nvidia"
37 changes: 37 additions & 0 deletions install/scripts/aws-down.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
#
# Tears down the AWS resources used by substratus: the EKS cluster, the
# Karpenter CloudFormation stack, the ECR repository and the artifacts bucket.
# Every teardown step is best-effort (`|| true`) so that a resource which is
# already gone does not stop the remaining steps from running.

set -e
set -u

# Required env variables (with `set -u`, expanding an unset one aborts here):
: "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
kubernetes_dir="${script_dir}/../kubernetes"

# Must be exported: a plain shell assignment is not inherited by the eksctl
# child process, so the setting would silently have no effect.
export EKSCTL_ENABLE_CREDENTIAL_CACHE=1
export CLUSTER_NAME=substratus
export REGION=us-west-2
export ARTIFACTS_REPO_NAME="${CLUSTER_NAME}"
export ARTIFACTS_BUCKET_NAME="${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts"

# Point kubectl at the cluster and remove the deployments in the karpenter and
# kube-system namespaces before the cluster itself is deleted. The subshell
# keeps a failure anywhere in the chain from aborting the script.
(aws eks update-kubeconfig \
  --region "${REGION}" \
  --name "${CLUSTER_NAME}" &&
  kubectl delete deployments --namespace=karpenter --all &&
  kubectl delete deployments --namespace=kube-system --all) ||
  true

# Render the eksctl config from its template (envsubst fills in the exported
# variables above) and delete the cluster it describes.
envsubst <"${kubernetes_dir}/aws/eks-cluster.yaml.tpl" >"${kubernetes_dir}/aws/eks-cluster.yaml"
eksctl delete cluster -f "${kubernetes_dir}/aws/eks-cluster.yaml" || true

aws cloudformation delete-stack \
  --stack-name "Karpenter-${CLUSTER_NAME}" \
  --region "${REGION}" || true

# NOTE(review): without --force these two commands refuse to delete a
# repository that still holds images or a bucket that still holds objects, and
# the failure is swallowed by `|| true`, leaving the resource behind. Left
# unchanged because forcing deletion would destroy artifacts - decide
# explicitly whether teardown should do that.
aws ecr delete-repository \
  --repository-name "${ARTIFACTS_REPO_NAME}" \
  --region "${REGION}" >/dev/null || true

aws s3 rb "s3://${ARTIFACTS_BUCKET_NAME}" \
  --region "${REGION}" >/dev/null || true
Loading