This repository has been archived by the owner on Aug 28, 2024. It is now read-only.

Commit

pared back terraform install bits. aws-up started
brandonjbjelland committed Aug 7, 2023
1 parent 1222b2f commit 532de8b
Showing 10 changed files with 203 additions and 285 deletions.
1 change: 1 addition & 0 deletions install/scripts/aws-down.sh
@@ -0,0 +1 @@

131 changes: 131 additions & 0 deletions install/scripts/aws-up.sh
@@ -0,0 +1,131 @@
#!/bin/bash

set -e
set -u

# Required env variables:
: "$TOKEN $PROJECT"

# Used by gcloud:
# TODO(bjb): pass AWS creds into script
export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN}
# Used by terraform:
export GOOGLE_OAUTH_ACCESS_TOKEN=${TOKEN}

INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}"
AUTO_APPROVE="${AUTO_APPROVE:-no}"

# Create terraform state bucket if one does not exist.
# TODO(bjb): establish a bucket
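# A possible bootstrap, kept commented out as a sketch; the head-bucket/create-bucket
# flow and the AWS_DEFAULT_REGION handling below are assumptions:
#   if ! aws s3api head-bucket --bucket "${TF_BUCKET}" 2>/dev/null; then
#     aws s3api create-bucket --bucket "${TF_BUCKET}" \
#       --create-bucket-configuration "LocationConstraint=${AWS_DEFAULT_REGION}"
#   fi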

# Apply infrastructure.
cd terraform/aws

# Backend settings cannot be configured via environment variables, so write them
# to a tfvars file (overwrite rather than append so re-runs stay clean).
echo "bucket = \"${TF_BUCKET}\"" >backend.tfvars
terraform init --backend-config=backend.tfvars

export TF_VAR_project_id=${PROJECT}
if [ "${AUTO_APPROVE}" == "yes" ]; then
terraform apply -auto-approve
else
terraform apply
fi
CLUSTER_NAME=$(terraform output --json cluster | jq -r '.name')
CLUSTER_REGION=$(terraform output --json cluster | jq -r '.region')
CLUSTER_ENDPOINT=$(terraform output --json cluster | jq -r '.endpoint')
LOAD_BALANCER_CONTROLLER_ROLE_NAME=$(terraform output --json irsas | jq -r '.load_balancer_controller_irsa_role.iam_role_name')

cd -

# Configure kubectl.
aws eks --region "${CLUSTER_REGION}" update-kubeconfig --name "${CLUSTER_NAME}"
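# Optional sanity check (commented sketch; assumes the caller has access to the new cluster):
# kubectl get nodes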
# Install cluster-level components

# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler
helm repo add eks https://aws.github.io/eks-charts
helm upgrade \
  --install aws-node-termination-handler \
  --namespace kube-system \
  --version 0.21.0 \
  eks/aws-node-termination-handler

# install EBS snapshotter?: https://github.com/kubernetes-csi/external-snapshotter#usage

# TODO(bjb): may not be needed if we can resolve 401 to 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/
# install aws-ebs-csi-driver: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md
helm repo add aws-ebs-csi-driver https://kubernetes-sigs.github.io/aws-ebs-csi-driver
helm repo update
helm upgrade \
  --install aws-ebs-csi-driver \
  --namespace kube-system \
  aws-ebs-csi-driver/aws-ebs-csi-driver

# TODO(bjb): is this needed? Is doing the work here preferred to doing it in terraform?
# install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/
export KARPENTER_VERSION=v0.29.2
export AWS_PARTITION="aws"
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
export TEMPOUT=$(mktemp)
curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >"${TEMPOUT}" &&
  aws cloudformation deploy \
    --stack-name "Karpenter-${CLUSTER_NAME}" \
    --template-file "${TEMPOUT}" \
    --capabilities CAPABILITY_NAMED_IAM \
    --parameter-overrides "ClusterName=${CLUSTER_NAME}"

eksctl create cluster -f - <<EOF
---
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${CLUSTER_REGION}
  version: "1.27"
  tags:
    karpenter.sh/discovery: ${CLUSTER_NAME}
iam:
  withOIDC: true
  serviceAccounts:
    - metadata:
        name: karpenter
        namespace: karpenter
      roleName: ${CLUSTER_NAME}-karpenter
      attachPolicyARNs:
        - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
      roleOnly: true
iamIdentityMappings:
  - arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
    username: system:node:{{EC2PrivateDNSName}}
    groups:
      - system:bootstrappers
      - system:nodes
managedNodeGroups:
  - instanceType: t3a.large
    amiFamily: AmazonLinux2
    name: ${CLUSTER_NAME}-ng
    desiredCapacity: 1
    minSize: 0
    maxSize: 3
EOF

export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"
echo $CLUSTER_ENDPOINT $KARPENTER_IAM_ROLE_ARN
aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true
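# Possible follow-up per the Karpenter getting-started guide, kept commented out as a
# sketch; the chart location and --set values below are assumptions for this version:
#   helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
#     --version "${KARPENTER_VERSION}" \
#     --namespace karpenter --create-namespace \
#     --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="${KARPENTER_IAM_ROLE_ARN}" \
#     --set settings.aws.clusterName="${CLUSTER_NAME}" \
#     --set settings.aws.defaultInstanceProfile="KarpenterNodeInstanceProfile-${CLUSTER_NAME}" \
#     --wait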

# install the load balancer controller: https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html
helm upgrade \
  --install aws-load-balancer-controller eks/aws-load-balancer-controller \
  --namespace kube-system \
  --set clusterName="${CLUSTER_NAME}" \
  --set serviceAccount.create=false \
  --set serviceAccount.name="${LOAD_BALANCER_CONTROLLER_ROLE_NAME}"
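# Note: with serviceAccount.create=false the chart expects the service account to already
# exist. A commented sketch of creating and annotating it (the role ARN construction is
# an assumption):
#   kubectl -n kube-system create serviceaccount "${LOAD_BALANCER_CONTROLLER_ROLE_NAME}"
#   kubectl -n kube-system annotate serviceaccount "${LOAD_BALANCER_CONTROLLER_ROLE_NAME}" \
#     "eks.amazonaws.com/role-arn=arn:aws:iam::${AWS_ACCOUNT_ID}:role/${LOAD_BALANCER_CONTROLLER_ROLE_NAME}"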

# Install the substratus operator.
# if [ "${INSTALL_OPERATOR}" == "yes" ]; then
# kubectl apply -f kubernetes/namespace.yaml
# kubectl apply -f kubernetes/config.yaml
# kubectl apply -f kubernetes/system.yaml
# fi
6 changes: 0 additions & 6 deletions install/terraform/aws/bucket.tf

This file was deleted.

23 changes: 13 additions & 10 deletions install/terraform/aws/common.tf
@@ -4,21 +4,24 @@ locals {
id = var.existing_vpc == null ? module.vpc[0].vpc_id : var.existing_vpc.id
private_subnet_ids = var.existing_vpc == null ? module.vpc[0].private_subnets : var.existing_vpc.private_subnet_ids
intra_subnet_ids = var.existing_vpc == null ? module.vpc[0].intra_subnets : var.existing_vpc.intra_subnet_ids
endpoints = var.existing_vpc == null ? module.endpoints[0] : {}
}

# passed to substratus_irsa_iam_roles.tf and eks_irsa_iam_roles.tf
eks_cluster = {
name = var.existing_eks_cluster == null ? module.eks[0].cluster_name : var.existing_eks_cluster.name
oidc_provider_arn = var.existing_eks_cluster == null ? module.eks[0].oidc_provider_arn : var.existing_eks_cluster.oidc_provider_arn
managed_node_groups = var.existing_eks_cluster == null ? module.eks[0].eks_managed_node_groups : null
certificate_authority_data = var.existing_eks_cluster == null ? module.eks[0].cluster_certificate_authority_data : ""
endpoint = var.existing_eks_cluster == null ? module.eks[0].cluster_endpoint : ""
name = local.create_cluster == 1 ? module.eks[0].cluster_name : var.existing_eks_cluster.name
oidc_provider_arn = local.create_cluster == 1 ? module.eks[0].oidc_provider_arn : var.existing_eks_cluster.oidc_provider_arn
managed_node_groups = local.create_cluster == 1 ? module.eks[0].eks_managed_node_groups : null
certificate_authority_data = local.create_cluster == 1 ? module.eks[0].cluster_certificate_authority_data : ""
endpoint = local.create_cluster == 1 ? module.eks[0].cluster_endpoint : ""
region = var.region
}

artifacts_bucket = {
arn = var.existing_artifacts_bucket == null ? aws_s3_bucket.artifacts[0].arn : var.existing_artifacts_bucket.arn
id = var.existing_artifacts_bucket == null ? aws_s3_bucket.artifacts[0].id : var.existing_artifacts_bucket.id
irsa_outputs = {
ebs_csi_irsa_role = local.create_cluster == 1 ? module.ebs_csi_irsa_role[0] : {}
load_balancer_controller_irsa_role = local.create_cluster == 1 ? module.load_balancer_controller_irsa_role[0] : {}
node_termination_handler_irsa_role = local.create_cluster == 1 ? module.node_termination_handler_irsa_role[0] : {}
substratus_irsa = local.create_cluster == 1 ? module.substratus_irsa[0] : {}
vpc_cni_ipv4_irsa_role = local.create_cluster == 1 ? module.vpc_cni_ipv4_irsa_role[0] : {}
}

ecr_repository_arn = var.existing_ecr_repository_arn == "" ? aws_ecr_repository.main[0].arn : var.existing_ecr_repository_arn
}
8 changes: 0 additions & 8 deletions install/terraform/aws/container_registry.tf

This file was deleted.

29 changes: 27 additions & 2 deletions install/terraform/aws/eks_cluster.tf
@@ -116,7 +116,7 @@ data "aws_ami" "deep_learning" {
module "eks" {
count = local.create_cluster
source = "terraform-aws-modules/eks/aws"
version = "19.15.4"
version = "19.16.0"
cluster_name = var.name_prefix
cluster_version = var.cluster_version
cluster_endpoint_public_access = true
@@ -125,6 +125,17 @@ module "eks" {
subnet_ids = local.vpc.private_subnet_ids
control_plane_subnet_ids = local.vpc.intra_subnet_ids
manage_aws_auth_configmap = true
aws_auth_roles = [
# We need to add in the Karpenter node IAM role for nodes launched by Karpenter
{
rolearn = module.karpenter[0].role_arn
username = "system:node:{{EC2PrivateDNSName}}"
groups = [
"system:bootstrappers",
"system:nodes",
]
},
]

eks_managed_node_group_defaults = {
# We are using the IRSA created below for permissions
@@ -220,7 +231,10 @@
}
}
}
tags = var.tags
tags = merge(var.tags, {
# this same tag should exist on a single security group that karpenter will use
"karpenter.sh/discovery" = var.name_prefix
})
}

# ASG tags are needed for the cluster to work with the labels and taints of the
@@ -235,3 +249,14 @@ resource "aws_autoscaling_group_tag" "cluster_autoscaler_label_tags" {
propagate_at_launch = false
}
}

module "karpenter" {
count = local.create_cluster
source = "terraform-aws-modules/eks/aws//modules/karpenter"
cluster_name = module.eks[0].cluster_name
irsa_oidc_provider_arn = module.eks[0].oidc_provider_arn
policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
}
tags = var.tags
}
@@ -1,20 +1,34 @@
# EKS specific IRSA Roles
data "aws_iam_policy" "eks_cni_policy" {
name = "AmazonEKS_CNI_Policy"
}

# Note: these are currently not used but should be as we install the associated
# add-ons (however we decide to do that)
module "cluster_autoscaler_irsa_role" {
count = local.create_cluster
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
version = "~> 5.28"
data "aws_iam_policy" "iam_full_access" {
name = "IAMFullAccess"
}

role_name_prefix = "cluster-autoscaler"
attach_cluster_autoscaler_policy = true
cluster_autoscaler_cluster_names = [local.eks_cluster.name]
data "aws_iam_policy" "container_registry_full_access" {
name = "AmazonEC2ContainerRegistryFullAccess"
}

data "aws_iam_policy" "s3_full_access" {
name = "AmazonS3FullAccess"
}

module "substratus_irsa" {
count = local.create_cluster
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
version = "~> 5.28"
role_name_prefix = "${var.name_prefix}-substratus-"
role_policy_arns = {
IAMFullAccess = data.aws_iam_policy.iam_full_access.arn
AmazonEC2ContainerRegistryFullAccess = data.aws_iam_policy.container_registry_full_access.arn
AmazonS3FullAccess = data.aws_iam_policy.s3_full_access.arn
}

oidc_providers = {
main = {
provider_arn = local.eks_cluster.oidc_provider_arn
namespace_service_accounts = ["kube-system:cluster-autoscaler"]
namespace_service_accounts = ["substratus:substratus"]
}
}

32 changes: 5 additions & 27 deletions install/terraform/aws/outputs.tf
@@ -1,33 +1,11 @@
output "artifacts_bucket" {
value = {
arn = local.artifacts_bucket.arn
id = local.artifacts_bucket.id
}
}

output "cluster_name" {
value = local.eks_cluster.name
}

output "cluster_region" {
value = var.region
}

output "cluster" {
value = {
name = local.eks_cluster.name
oidc_provider_arn = local.eks_cluster.oidc_provider_arn
}
value = local.eks_cluster
}

output "ecr_repository_arn" {
value = local.ecr_repository_arn
output "vpc" {
value = local.vpc
}

output "vpc" {
value = {
id = local.vpc.id
private_subnet_ids = local.vpc.private_subnet_ids
intra_subnet_ids = local.vpc.intra_subnet_ids
}
output "irsas" {
value = local.irsa_outputs
}
