Skip to content
This repository has been archived by the owner on Aug 28, 2024. It is now read-only.

adds AWS infra for instantiating and destroying all baseline substratus components #170

Merged
merged 22 commits into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
1222b2f
adds an AWS module for instantiating all substratus components + an o…
brandonjbjelland Aug 6, 2023
7fd3104
paired back terraform install bits. aws-up started
brandonjbjelland Aug 7, 2023
4ef5f84
adding infra via eksctl
brandonjbjelland Aug 8, 2023
c10bb46
updated dockerfile to install eksctl and work with common architectures
brandonjbjelland Aug 8, 2023
03a444d
added a karpenter AWSNodeTemplate
brandonjbjelland Aug 8, 2023
83d2ef0
working with dirs relative to scripts
brandonjbjelland Aug 8, 2023
847ebe7
aws-up and aws-down working in a containerized context via makefile t…
brandonjbjelland Aug 8, 2023
f9cf844
added the nvidia device plugin to get device drivers
brandonjbjelland Aug 8, 2023
9aef30b
needing more resources per node for the daemonsets
brandonjbjelland Aug 9, 2023
daf75d4
moving all aws specific manifests into a dedicated dir
brandonjbjelland Aug 9, 2023
bc9c903
adds provisioners that add standard taint and an accelerator-specific…
brandonjbjelland Aug 9, 2023
e752152
bringing the karpenter config back down to earth
brandonjbjelland Aug 9, 2023
3741018
reverting changes to sci
brandonjbjelland Aug 9, 2023
54e3511
occasional sqs queue left over
brandonjbjelland Aug 9, 2023
e846c81
bugfix
brandonjbjelland Aug 9, 2023
f57fc8b
migrated tools install to a dedicated script that should work on work…
brandonjbjelland Aug 9, 2023
2ede193
migrated to lowercase vars
brandonjbjelland Aug 9, 2023
db6e229
consistent makefile target naming
brandonjbjelland Aug 9, 2023
0431615
dropping probably not needed aws-down steps
brandonjbjelland Aug 9, 2023
5149163
improved caching on docker build. dropping some karpenter configs
brandonjbjelland Aug 9, 2023
76ff2c9
everything is working consistently. shipping it
brandonjbjelland Aug 10, 2023
e7ca133
Merge branch 'main' into feat/add-aws-infra
brandonjbjelland Aug 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,6 @@ gcpmanager-dependencies.yaml
skaffold-dependencies.sh

.ipynb_checkpoints
.vscode/
.vscode/
eks-cluster.yaml
karpenter-provisioner.yaml
77 changes: 49 additions & 28 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ all: build

.PHONY: help
help: ## Display this help.
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)

##@ Development

Expand Down Expand Up @@ -120,40 +120,60 @@ skaffold-dev-gcpmanager: protoc skaffold protogen render-skaffold-manifests ## R
build: manifests generate fmt vet ## Build manager binary.
go build -o bin/manager cmd/controllermanager/main.go

.PHONY: dev-up
dev-up:
docker build ./install -t substratus-installer && \
.PHONY: gcp-dev-up
gcp-dev-up: build-installer
docker run -it \
-v ${HOME}/.kube:/root/.kube \
-e PROJECT=$(shell gcloud config get project) \
-e TOKEN=$(shell gcloud auth print-access-token) \
-e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \
-e INSTALL_OPERATOR=false \
substratus-installer gcp-up.sh
-v ${HOME}/.kube:/root/.kube \
-e PROJECT=$(shell gcloud config get project) \
-e TOKEN=$(shell gcloud auth print-access-token) \
-e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \
-e INSTALL_OPERATOR=false \
substratus-installer gcp-up.sh
mkdir -p secrets
gcloud iam service-accounts keys create --iam-account=substratus-gcp-manager@$(shell gcloud config get project).iam.gserviceaccount.com ./secrets/gcp-manager-key.json

.PHONY: dev-down
dev-down:
.PHONY: gcp-dev-down
gcp-dev-down: build-installer
docker run -it \
-v ${HOME}/.kube:/root/.kube \
-e PROJECT=$(shell gcloud config get project) \
-e TOKEN=$(shell gcloud auth print-access-token) \
-e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \
substratus-installer gcp-down.sh
-v ${HOME}/.kube:/root/.kube \
-e PROJECT=$(shell gcloud config get project) \
-e TOKEN=$(shell gcloud auth print-access-token) \
-e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \
substratus-installer gcp-down.sh
rm ./secrets/gcp-manager-key.json

.PHONY: dev-run
.PHONY: aws-dev-up
# Runs aws-up.sh inside the substratus-installer image (built by the
# build-installer prerequisite). The host kubeconfig directory is mounted so
# the container can write cluster credentials back to the host, and the
# caller's AWS identity is read from the local AWS CLI configuration at recipe
# expansion time and forwarded as environment variables.
# Every forwarded value is quoted so an empty or unusual value (for example
# an absent session token when long-lived keys are used) cannot change how the
# shell splits the docker arguments.
# NOTE(review): INSTALL_OPERATOR=false presumably tells the script to skip
# installing the operator for local development -- confirm against aws-up.sh.
aws-dev-up: build-installer ## Create the AWS development environment.
	docker run -it \
		-v "${HOME}/.kube:/root/.kube" \
		-e AWS_ACCOUNT_ID="$(shell aws sts get-caller-identity --query Account --output text)" \
		-e AWS_ACCESS_KEY_ID="$(shell aws configure get aws_access_key_id)" \
		-e AWS_SECRET_ACCESS_KEY="$(shell aws configure get aws_secret_access_key)" \
		-e AWS_SESSION_TOKEN="$(shell aws configure get aws_session_token)" \
		-e INSTALL_OPERATOR=false \
		substratus-installer aws-up.sh

.PHONY: aws-dev-down
# Runs aws-down.sh inside the substratus-installer image (built by the
# build-installer prerequisite), forwarding the caller's AWS identity read
# from the local AWS CLI configuration and mounting the host kubeconfig
# directory into the container.
# Every forwarded value is quoted so an empty or unusual value (for example
# an absent session token when long-lived keys are used) cannot change how the
# shell splits the docker arguments.
aws-dev-down: build-installer ## Tear down the AWS development environment.
	docker run -it \
		-v "${HOME}/.kube:/root/.kube" \
		-e AWS_ACCOUNT_ID="$(shell aws sts get-caller-identity --query Account --output text)" \
		-e AWS_ACCESS_KEY_ID="$(shell aws configure get aws_access_key_id)" \
		-e AWS_SECRET_ACCESS_KEY="$(shell aws configure get aws_secret_access_key)" \
		-e AWS_SESSION_TOKEN="$(shell aws configure get aws_session_token)" \
		substratus-installer aws-down.sh

.PHONY: gcp-dev-run
# Controller manager configuration #
dev-run: export CLOUD=gcp
dev-run: export GPU_TYPE=nvidia-l4
dev-run: export PROJECT_ID=$(shell gcloud config get project)
dev-run: export CLUSTER_NAME=substratus
dev-run: export CLUSTER_LOCATION=us-central1
gcp-dev-run: export CLOUD=gcp
brandonjbjelland marked this conversation as resolved.
Show resolved Hide resolved
gcp-dev-run: export GPU_TYPE=nvidia-l4
gcp-dev-run: export PROJECT_ID=$(shell gcloud config get project)
gcp-dev-run: export CLUSTER_NAME=substratus
gcp-dev-run: export CLUSTER_LOCATION=us-central1
# Cloud manager configuration #
dev-run: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json
gcp-dev-run: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json
# Run the controller manager and the cloud manager.
dev-run: manifests kustomize install-crds
gcp-dev-run: manifests kustomize install-crds
go run ./cmd/gcpmanager & \
go run ./cmd/controllermanager/main.go \
--sci-address=localhost:10080 \
Expand All @@ -176,16 +196,17 @@ docker-push: ## Push docker image with the manager.

.PHONY: docs
docs: crd-ref-docs embedmd
$(CRD_REF_DOCS) --config=./docs/api/config.yaml \
$(CRD_REF_DOCS) \
--config=./docs/api/config.yaml \
--log-level=INFO \
--output-path=./docs/api/generated.md \
--source-path=./api \
--templates-dir=./docs/api/templates/markdown \
--templates-dir=./docs/api/templates/markdown \
--renderer=markdown
# TODO: Embed YAML examples into the generate API documentation.
# $(EMBEDMD) -w ./docs/api/generated.md

# PLATFORMS defines the target platforms for the manager image be build to provide support to multiple
# PLATFORMS defines the target platforms for the manager image be build to provide support to multiple
# architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to:
# - able to use docker buildx . More info: https://docs.docker.com/build/buildx/
# - have enable BuildKit, More info: https://docs.docker.com/develop/develop-images/build_enhancements/
Expand Down
6 changes: 3 additions & 3 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,19 @@
Create a GCP environment.

```sh
make dev-up
make gcp-dev-up
```

Run Substratus control plane locally.

```sh
make dev-run
make gcp-dev-run
```

Delete GCP infra.

```sh
make dev-down
make gcp-dev-down
```

TODO: Automate the cleanup of PVs... Don't forget to manually clean them up for now.
Expand Down
57 changes: 43 additions & 14 deletions install/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,36 +1,65 @@
FROM ubuntu:23.04

WORKDIR /workspace
# Determine platform and architecture.
# Each downloadable tool names its release artifacts differently, so the
# per-tool spellings are written to /etc/environment:
#   AWSCLI_ARCH     - aarch64 | x86_64
#   TERRAFORM_ARCH  - arm64   | amd64
#   PLATFORM_ARCH   - <lowercased `uname -s`>_arm64 | _amd64
# The build fails fast on any other architecture.
RUN ARCH=$(uname -m) && \
    PLATFORM=$(uname -s | tr '[:upper:]' '[:lower:]') && \
    if [ "$ARCH" = "aarch64" ]; then \
        echo "AWSCLI_ARCH=aarch64" >> /etc/environment; \
        echo "TERRAFORM_ARCH=arm64" >> /etc/environment; \
        echo "PLATFORM_ARCH=${PLATFORM}_arm64" >> /etc/environment; \
    elif [ "$ARCH" = "x86_64" ]; then \
        echo "AWSCLI_ARCH=x86_64" >> /etc/environment; \
        echo "TERRAFORM_ARCH=amd64" >> /etc/environment; \
        echo "PLATFORM_ARCH=${PLATFORM}_amd64" >> /etc/environment; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
    fi

# Run all following RUN instructions under bash so that `source` is available.
# Every RUN starts a fresh shell, so variables do not carry over between
# instructions: each RUN that needs the values above must begin with
# `source /etc/environment && ...`. (A standalone `RUN source /etc/environment`
# would only create an empty layer, so none is issued here.)
SHELL ["/bin/bash", "-c"]

# Common
RUN apt-get update && \
RUN DEBIAN_FRONTEND="noninteractive" \
apt-get update && \
apt-get install -y \
gnupg \
software-properties-common \
unzip \
wget \
curl \
git
git \
gettext-base

# AWS CLI
RUN source /etc/environment && \
curl "https://awscli.amazonaws.com/awscli-exe-linux-${AWSCLI_ARCH}.zip" -o "awscliv2.zip" && \
unzip awscliv2.zip && \
./aws/install

# eksctl
RUN source /etc/environment && \
curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_${PLATFORM_ARCH}.tar.gz" && \
tar -xzf eksctl_${PLATFORM_ARCH}.tar.gz -C /tmp && rm eksctl_${PLATFORM_ARCH}.tar.gz && \
mv /tmp/eksctl /usr/local/bin

# Terraform
RUN wget https://releases.hashicorp.com/terraform/1.4.5/terraform_1.4.5_linux_amd64.zip
RUN unzip terraform_1.4.5_linux_amd64.zip
RUN mv terraform /usr/local/bin/
RUN terraform --version
RUN source /etc/environment && \
wget https://releases.hashicorp.com/terraform/1.4.5/terraform_1.4.5_linux_${TERRAFORM_ARCH}.zip && \
unzip terraform_1.4.5_linux_${TERRAFORM_ARCH}.zip && \
mv terraform /usr/local/bin/ && \
terraform --version

# Google Cloud (gcloud)
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz
RUN mkdir -p /usr/local/gcloud \
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz && \
mkdir -p /usr/local/gcloud \
&& tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \
&& /usr/local/gcloud/google-cloud-sdk/install.sh
ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin
RUN gcloud --version
RUN gcloud components install gke-gcloud-auth-plugin

# Kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
RUN chmod +x ./kubectl
RUN mv ./kubectl /usr/local/bin
RUN gcloud components install gke-gcloud-auth-plugin kubectl

# Helm
RUN curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
Expand Down
102 changes: 102 additions & 0 deletions install/kubernetes/aws/eks-cluster.yaml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# eksctl ClusterConfig template for the cluster and its IAM wiring.
# The ${...} placeholders (CLUSTER_NAME, REGION, AWS_PARTITION, AWS_ACCOUNT_ID,
# ARTIFACTS_BUCKET_NAME, ARTIFACTS_REPO_NAME) must be substituted before this
# file is handed to eksctl.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${REGION}
  version: "1.27"
  tags:
    createdBy: eksctl
    environment: dev
    # Discovery tag; the same key/value is what Karpenter selectors match on.
    karpenter.sh/discovery: ${CLUSTER_NAME}

karpenter:
  createServiceAccount: true
  withSpotInterruptionQueue: true
  # Instance profile attached to Karpenter-launched nodes. The Karpenter
  # AWSNodeTemplate's spec.instanceProfile should reference this same name.
  defaultInstanceProfile: "KarpenterNodeInstanceProfile-${CLUSTER_NAME}"
  version: "v0.29.0"

# if karpenter doesn't suffice: https://github.com/eksctl-io/eksctl/blob/main/examples/23-kubeflow-spot-instance.yaml
managedNodeGroups:
  - name: builder-ng
    privateNetworking: true
    labels: { role: builders }
    amiFamily: Ubuntu2004
    instanceTypes:
      - m6a.large
    volumeSize: 100
    minSize: 0
    maxSize: 3
    desiredCapacity: 1
    iam:
      withAddonPolicies:
        ebs: true
        imageBuilder: true

addons:
  - name: vpc-cni
    attachPolicyARNs:
      - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy
  - name: kube-proxy
  - name: aws-ebs-csi-driver
    wellKnownPolicies:
      ebsCSIController: true
  - name: coredns

# Lets nodes that assume the Karpenter node role join the cluster.
iamIdentityMappings:
  - arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
    username: system:node:{{EC2PrivateDNSName}}
    groups:
      - system:bootstrappers
      - system:nodes

iam:
  withOIDC: true
  serviceAccounts:
    - metadata:
        name: karpenter
        namespace: karpenter
      roleName: ${CLUSTER_NAME}-karpenter
      attachPolicyARNs:
        # Permissions policy for the Karpenter controller role. This is an IAM
        # policy, not the node instance profile.
        - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
      roleOnly: true
    - metadata:
        name: ebs-csi-controller-sa
        namespace: kube-system
      wellKnownPolicies:
        ebsCSIController: true
    - metadata:
        name: aws-manager
        namespace: substratus
      attachPolicy:
        # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html
        Version: "2012-10-17"
        Statement:
          - Sid: "AllowUrlPreSigning"
            Effect: Allow
            Action:
              - "s3:PutObject"
              - "s3:GetObject"
            Resource:
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*"
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}"
          - Sid: "FullSubstratusEcrRepoAccess"
            Effect: Allow
            Action:
              - "ecr:*"
            Resource:
              # ECR repository ARNs have the form
              # arn:<partition>:ecr:<region>:<account-id>:repository/<name>;
              # an ARN without region, account and the repository/ prefix
              # matches no repository.
              # NOTE(review): ecr:GetAuthorizationToken does not support
              # resource-level permissions; if aws-manager needs to log in to
              # the registry, a separate statement with Resource "*" is
              # required -- confirm.
              - "arn:${AWS_PARTITION}:ecr:${REGION}:${AWS_ACCOUNT_ID}:repository/${ARTIFACTS_REPO_NAME}"
          - Sid: "S3AdminSubstratusBucketAccess"
            Effect: Allow
            Action:
              - "s3:*"
              - "s3-object-lambda:*"
            Resource:
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*"
              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}"
          - Sid: "ModifyOwnTrustPolicy"
            Effect: Allow
            Action:
              - "iam:UpdateAssumeRolePolicy"
            Resource:
              # NOTE(review): "$$" is presumably an escape so the templating
              # step leaves the IAM policy variable ${aws:userid} intact;
              # verify the rendered file contains a single "$" and that
              # aws:userid resolves to the intended role name.
              - "arn:aws:iam::${AWS_ACCOUNT_ID}:role/$${aws:userid}"
47 changes: 47 additions & 0 deletions install/kubernetes/aws/karpenter-provisioner.yaml.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Karpenter node template + GPU provisioner.
# ${CLUSTER_NAME} must be substituted before this manifest is applied.
apiVersion: karpenter.k8s.aws/v1alpha1
kind: AWSNodeTemplate
metadata:
  name: default
spec:
  # Must name an EC2 instance profile (not an IAM policy). This matches the
  # karpenter.defaultInstanceProfile value used in the eksctl cluster config.
  instanceProfile: KarpenterNodeInstanceProfile-${CLUSTER_NAME}
  # Subnets and security groups are discovered by the cluster discovery tag.
  subnetSelector:
    karpenter.sh/discovery: ${CLUSTER_NAME}
  securityGroupSelector:
    karpenter.sh/discovery: ${CLUSTER_NAME}
---
apiVersion: karpenter.sh/v1alpha5
kind: Provisioner
metadata:
  name: nvidia-gpu
spec:
  providerRef:
    name: default
  consolidation:
    enabled: true
  # These well-known labels (specifically karpenter.k8s.aws/instance-gpu-name)
  # will guide karpenter in accelerator and instance type selection:
  # https://karpenter.sh/v0.29/concepts/scheduling/#labels
  # Nodes are tainted so only pods tolerating nvidia.com/gpu land on them.
  taints:
    - key: nvidia.com/gpu
      value: "true"
      effect: "NoSchedule"
  requirements:
    - key: karpenter.sh/capacity-type
      operator: In
      values: ["spot"]
    - key: karpenter.k8s.aws/instance-family
      operator: In
      # Values must be exact EC2 instance family names. There is no "g4"
      # family; the NVIDIA x86 family is g4dn.
      values: [
          "p4de",
          "p4d",
          "p3dn",
          "p3",
          "p2",
          "g2",
          "g3",
          "g4dn",
          "g5",
        ]
    - key: "kubernetes.io/arch"
      operator: In
      values: ["amd64"]
8 changes: 8 additions & 0 deletions install/kubernetes/aws/nvidia-eks-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Scheduling constraint: only place pods on nodes whose
# karpenter.k8s.aws/instance-gpu-manufacturer label is "nvidia".
# NOTE(review): presumably consumed as Helm values for the NVIDIA device
# plugin chart (per the file name) -- confirm against the install script.
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: karpenter.k8s.aws/instance-gpu-manufacturer
              operator: In
              values:
                - "nvidia"
Loading
Loading