From 4ef5f846d60bd5d38140d52bde1d277035cfcb61 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 01:23:33 -0700 Subject: [PATCH] adding infra via eksctl --- .gitignore | 3 +- install/kubernetes/eks-cluster.yaml.tpl | 97 +++++++ install/kubernetes/karpenter-provisioner.yaml | 69 +++++ install/scripts/aws-down.sh | 22 ++ install/scripts/aws-up.sh | 136 +++------ install/terraform/aws/backend.tf | 7 - install/terraform/aws/common.tf | 27 -- install/terraform/aws/eks_cluster.tf | 262 ------------------ install/terraform/aws/irsa_iam_roles.tf | 109 -------- install/terraform/aws/outputs.tf | 11 - install/terraform/aws/providers.tf | 15 - install/terraform/aws/variables.tf | 59 ---- install/terraform/aws/vpc.tf | 116 -------- internal/awsmanager/manager.go | 5 + 14 files changed, 232 insertions(+), 706 deletions(-) create mode 100644 install/kubernetes/eks-cluster.yaml.tpl create mode 100644 install/kubernetes/karpenter-provisioner.yaml mode change 100644 => 100755 install/scripts/aws-down.sh mode change 100644 => 100755 install/scripts/aws-up.sh delete mode 100644 install/terraform/aws/backend.tf delete mode 100644 install/terraform/aws/common.tf delete mode 100644 install/terraform/aws/eks_cluster.tf delete mode 100644 install/terraform/aws/irsa_iam_roles.tf delete mode 100644 install/terraform/aws/outputs.tf delete mode 100644 install/terraform/aws/providers.tf delete mode 100644 install/terraform/aws/variables.tf delete mode 100644 install/terraform/aws/vpc.tf create mode 100644 internal/awsmanager/manager.go diff --git a/.gitignore b/.gitignore index 06f85911..b2ee0d14 100644 --- a/.gitignore +++ b/.gitignore @@ -77,4 +77,5 @@ gcpmanager-dependencies.yaml skaffold-dependencies.sh .ipynb_checkpoints -.vscode/ \ No newline at end of file +.vscode/ +eks-cluster.yaml diff --git a/install/kubernetes/eks-cluster.yaml.tpl b/install/kubernetes/eks-cluster.yaml.tpl new file mode 100644 index 00000000..aba1b0fd --- /dev/null +++ b/install/kubernetes/eks-cluster.yaml.tpl @@ -0,0 +1,97 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: substratus + region: us-west-2 + version: "1.27" + tags: + createdBy: eksctl + environment: dev + karpenter.sh/discovery: substratus + +karpenter: + createServiceAccount: true + withSpotInterruptionQueue: true + defaultInstanceProfile: "KarpenterNodeInstanceProfile-substratus" + version: "v0.29.0" + +# TODO(bjb): do we need mngs with karpenter? 
+# if karpenter doesn't suffice: https://github.com/eksctl-io/eksctl/blob/main/examples/23-kubeflow-spot-instance.yaml
+managedNodeGroups:
+  - name: builder-ng
+    privateNetworking: true
+    labels: { role: builders }
+    instanceTypes:
+      - m6a.large
+    volumeSize: 100
+    minSize: 0
+    maxSize: 3
+    desiredCapacity: 1
+    iam:
+      withAddonPolicies:
+        ebs: true
+        imageBuilder: true
+addons:
+  - name: vpc-cni
+    attachPolicyARNs:
+      - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy
+  - name: kube-proxy
+  - name: aws-ebs-csi-driver
+    wellKnownPolicies:
+      ebsCSIController: true
+  - name: coredns
+
+iamIdentityMappings:
+  - arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
+    username: system:node:{{EC2PrivateDNSName}}
+    groups:
+      - system:bootstrappers
+      - system:nodes
+
+iam:
+  withOIDC: true
+  serviceAccounts:
+    - metadata:
+        name: karpenter
+        namespace: karpenter
+      roleName: ${CLUSTER_NAME}-karpenter
+      attachPolicyARNs:
+        - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
+      roleOnly: true
+    - metadata:
+        name: ebs-csi-controller-sa
+        namespace: kube-system
+      wellKnownPolicies:
+        ebsCSIController: true
+    - metadata:
+        name: substratus
+        namespace: substratus
+      attachPolicy:
+        Version: "2012-10-17"
+        Statement:
+          - Effect: Allow
+            Action:
+              - "ecr:*"
+            Resource:
+              - "arn:${AWS_PARTITION}:ecr:${REGION}:${AWS_ACCOUNT_ID}:repository/${ARTIFACTS_REPO_NAME}"
+          - Effect: Allow
+            Action:
+              - "s3:*"
+              - "s3-object-lambda:*"
+            Resource:
+              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*"
+              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}"
+    - metadata:
+        name: aws-manager
+        namespace: substratus
+      attachPolicy:
+        # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html
+        Version: "2012-10-17"
+        Statement:
+          - Effect: Allow
+            Action:
+              - "s3:PutObject"
+              - "s3:GetObject"
+            Resource:
+              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*"
+              - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}"
diff --git a/install/kubernetes/karpenter-provisioner.yaml b/install/kubernetes/karpenter-provisioner.yaml
new file mode 100644
index 00000000..3bc5d391
--- /dev/null
+++ b/install/kubernetes/karpenter-provisioner.yaml
@@ -0,0 +1,69 @@
+# https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/
+apiVersion: karpenter.sh/v1alpha5
+kind: Provisioner
+metadata:
+  name: gpu
+spec:
+  provider:
+    instanceProfile: eksctl-KarpenterNodeInstanceProfile-substratus
+    subnetSelector:
+      karpenter.sh/discovery: substratus
+    securityGroupSelector:
+      karpenter.sh/discovery: substratus
+  # NOTE: ttlSecondsAfterEmpty omitted; v1alpha5 rejects it when consolidation is enabled.
+  consolidation:
+    enabled: true
+  taints:
+    - key: nvidia.com/gpu
+      value: "true"
+      effect: NoSchedule
+  requirements:
+    - key: karpenter.sh/capacity-type
+      operator: In
+      values: ["spot"]
+    - key: node.kubernetes.io/instance-type
+      operator: In
+      values:
+        # aws ec2 describe-instance-types --region us-west-2 --query "InstanceTypes[?GpuInfo!=null].InstanceType" --output json | jq -r '.[]' | sort | grep -v dl1 | grep -v inf | grep -v p5 | grep -v trn1 | awk '{print "\""$1"\","}'
+        [
+          "g2.2xlarge",
+          "g2.8xlarge",
+          "g3.16xlarge",
+          "g3.4xlarge",
+          "g3.8xlarge",
+          "g3s.xlarge",
+          "g4ad.16xlarge",
+          "g4ad.2xlarge",
+          "g4ad.4xlarge",
+          "g4ad.8xlarge",
+          "g4ad.xlarge",
+          "g4dn.12xlarge",
+          "g4dn.16xlarge",
+          "g4dn.2xlarge",
+          "g4dn.4xlarge",
+          "g4dn.8xlarge",
+          "g4dn.metal",
+          "g4dn.xlarge",
+          "g5.12xlarge",
+          "g5.16xlarge",
+          "g5.24xlarge",
+          "g5.2xlarge",
+          "g5.48xlarge",
+          "g5.4xlarge",
+          "g5.8xlarge",
+          "g5.xlarge",
+          "g5g.16xlarge",
+          "g5g.2xlarge",
+          "g5g.4xlarge",
+          "g5g.8xlarge",
+          "g5g.metal",
"g5g.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3dn.24xlarge", + "p4d.24xlarge", + ] diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh old mode 100644 new mode 100755 index 8b137891..d4372572 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -1 +1,23 @@ +#!/bin/bash +set -e +set -u + +# Required env variables: +# : "$TOKEN $PROJECT" + +export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 +export CLUSTER_NAME=substratus +export REGION=us-west-2 +export ARTIFACTS_REPO_NAME=substratus +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" +export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts + +aws s3 rb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} || true +aws ecr delete-repository --repository-name ${ARTIFACTS_REPO_NAME} || true + +aws cloudformation delete-stack \ + --stack-name "Karpenter-${CLUSTER_NAME}" || true + +envsubst <../kubernetes/eks-cluster.yaml.tpl >../kubernetes/eks-cluster.yaml +eksctl delete cluster -f ../kubernetes/eks-cluster.yaml diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh old mode 100644 new mode 100755 index b620cddc..425a01cb --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -4,68 +4,25 @@ set -e set -u # Required env variables: -: "$TOKEN $PROJECT" +# : "$TOKEN $PROJECT" -# Used by gcloud: -# TODO(bjb): pass AWS creds into script -export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN} -# Used by terraform: -export GOOGLE_OAUTH_ACCESS_TOKEN=${TOKEN} +# # TODO(bjb): pass AWS creds into script +# export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN} -INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" -AUTO_APPROVE="${AUTO_APPROVE:-no}" - -# Create terraform state bucket if one does not exist. -# TODO(bjb): establish a bucket - -# Apply infrastructure. -cd terraform/aws - -# Backend variables cannot be configured via env variables. -echo "bucket = \"${TF_BUCKET}\"" >>backend.tfvars -terraform init --backend-config=backend.tfvars - -export TF_VAR_project_id=${PROJECT} -if [ "${AUTO_APPROVE}" == "yes" ]; then - terraform apply -auto-approve -else - terraform apply -fi -CLUSTER_NAME=$(terraform output --json cluster | jq -r '.name') -CLUSTER_REGION=$(terraform output --json cluster | jq -r '.region') -CLUSTER_ENDPOINT=$(terraform output --json cluster | jq -r '.endpoint') -LOAD_BALANCER_CONTROLLER_ROLE_NAME=$(terraform output --json irsas | jq -r '.load_balancer_controller_irsa_role.iam_role_name') - -cd - - -# Configure kubectl. 
-aws eks --region ${CLUSTER_REGION} update-kubeconfig --name ${CLUSTER_NAME}
-# Install cluster-level components
-
-# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler
-helm repo add eks https://aws.github.io/eks-charts
-helm upgrade \
-  --install aws-node-termination-handler \
-  --namespace kube-system \
-  --version 0.21.0 \
-  eks/aws-node-termination-handler
-
-# install EBS snapshotter?: https://github.com/kubernetes-csi/external-snapshotter#usage
+# INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}"
+export EKSCTL_ENABLE_CREDENTIAL_CACHE=1
+export CLUSTER_NAME=substratus
+export REGION=us-west-2
+export ARTIFACTS_REPO_NAME=substratus
+export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
+export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts
 
-# TODO(bjb): may not be needed if we can resolve 401 to 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/
-# install aws-ebs-csi-driver: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md
-helm repo add aws-ebs-csi-driver https://kubernetes-sigs.github.io/aws-ebs-csi-driver
-helm repo update
-helm upgrade \
-  --install aws-ebs-csi-driver \
-  --namespace kube-system \
-  aws-ebs-csi-driver/aws-ebs-csi-driver
+aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} || true
+aws ecr create-repository --repository-name ${ARTIFACTS_REPO_NAME} || true
 
-# TODO(bjb): is this needed? Is doing the work here preferred to doing it in terraform?
 # install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/
 export KARPENTER_VERSION=v0.29.2
 export AWS_PARTITION="aws"
-export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
 export TEMPOUT=$(mktemp)
 curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$TEMPOUT && aws cloudformation deploy \
@@ -74,54 +31,37 @@
   --capabilities CAPABILITY_NAMED_IAM \
   --parameter-overrides "ClusterName=${CLUSTER_NAME}"
 
-eksctl create cluster -f - <../kubernetes/eks-cluster.yaml
+# Render the cluster config from the template; eks-cluster.yaml is gitignored, so it must be generated here.
+envsubst <../kubernetes/eks-cluster.yaml.tpl >../kubernetes/eks-cluster.yaml
+eksctl create cluster -f ../kubernetes/eks-cluster.yaml || eksctl upgrade cluster -f ../kubernetes/eks-cluster.yaml
 
 export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"
-echo $CLUSTER_ENDPOINT $KARPENTER_IAM_ROLE_ARN
 aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true
+aws eks --region ${REGION} update-kubeconfig --name ${CLUSTER_NAME}
+# Logout of helm registry to perform an unauthenticated pull against the public ECR
+helm registry logout public.ecr.aws || true
+
+helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version ${KARPENTER_VERSION} --namespace karpenter --create-namespace \
+  --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \
+  --set settings.aws.clusterName=${CLUSTER_NAME} \
+  --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \
+  --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \
+  --set controller.resources.requests.cpu=1 \
+  --set controller.resources.requests.memory=1Gi \
+  --set controller.resources.limits.cpu=1 \
+  --set controller.resources.limits.memory=1Gi \
+  --wait
+
+kubectl apply -f ../kubernetes/karpenter-provisioner.yaml
 
-# install the load balancer controller: 
https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html -helm install aws-load-balancer-controller eks/aws-load-balancer-controller \ - -n kube-system \ - --set clusterName=${CLUSTER_NAME} \ - --set serviceAccount.create=false \ - --set serviceAccount.name=${LOAD_BALANCER_CONTROLLER_ROLE_NAME} +# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler +helm repo add eks https://aws.github.io/eks-charts +helm upgrade \ + --install aws-node-termination-handler \ + --namespace kube-system \ + --version 0.21.0 \ + eks/aws-node-termination-handler # Install the substratus operator. # if [ "${INSTALL_OPERATOR}" == "yes" ]; then diff --git a/install/terraform/aws/backend.tf b/install/terraform/aws/backend.tf deleted file mode 100644 index 91722a40..00000000 --- a/install/terraform/aws/backend.tf +++ /dev/null @@ -1,7 +0,0 @@ -terraform { - backend "s3" { - # bucket = "243019462621-terraform-state" - # key = "primary/us-west-2/substratus/terraform.tfstate" - # region = "us-west-2" - } -} diff --git a/install/terraform/aws/common.tf b/install/terraform/aws/common.tf deleted file mode 100644 index d73dc747..00000000 --- a/install/terraform/aws/common.tf +++ /dev/null @@ -1,27 +0,0 @@ -locals { - # passed to cluster.tf - vpc = { - id = var.existing_vpc == null ? module.vpc[0].vpc_id : var.existing_vpc.id - private_subnet_ids = var.existing_vpc == null ? module.vpc[0].private_subnets : var.existing_vpc.private_subnet_ids - intra_subnet_ids = var.existing_vpc == null ? module.vpc[0].intra_subnets : var.existing_vpc.intra_subnet_ids - endpoints = var.existing_vpc == null ? module.endpoints[0] : null - } - - # passed to substratus_irsa_iam_roles.tf and eks_irsa_iam_roles.tf - eks_cluster = { - name = local.create_cluster == 1 ? module.eks[0].cluster_name : var.existing_eks_cluster.name - oidc_provider_arn = local.create_cluster == 1 ? module.eks[0].oidc_provider_arn : var.existing_eks_cluster.oidc_provider_arn - managed_node_groups = local.create_cluster == 1 ? module.eks[0].eks_managed_node_groups : null - certificate_authority_data = local.create_cluster == 1 ? module.eks[0].cluster_certificate_authority_data : "" - endpoint = local.create_cluster == 1 ? module.eks[0].cluster_endpoint : "" - region = var.region - } - - irsa_outputs = { - ebs_csi_irsa_role = local.create_cluster == 1 ? module.ebs_csi_irsa_role[0] : {} - load_balancer_controller_irsa_role = local.create_cluster == 1 ? module.load_balancer_controller_irsa_role[0] : {} - node_termination_handler_irsa_role = local.create_cluster == 1 ? module.node_termination_handler_irsa_role[0] : {} - substratus_irsa = local.create_cluster == 1 ? module.substratus_irsa[0] : {} - vpc_cni_ipv4_irsa_role = local.create_cluster == 1 ? module.vpc_cni_ipv4_irsa_role[0] : {} - } -} diff --git a/install/terraform/aws/eks_cluster.tf b/install/terraform/aws/eks_cluster.tf deleted file mode 100644 index d10e45e2..00000000 --- a/install/terraform/aws/eks_cluster.tf +++ /dev/null @@ -1,262 +0,0 @@ -locals { - create_cluster = var.existing_eks_cluster == null ? 1 : 0 - # We need to lookup K8s taint effect from the AWS API value - taint_effects = { - NO_SCHEDULE = "NoSchedule" - NO_EXECUTE = "NoExecute" - PREFER_NO_SCHEDULE = "PreferNoSchedule" - } - - # The following locals are used to configure tags for the EKS cluster's Auto - # Scaling Groups managed by the cluster autoscaler. 
- - # `cluster_autoscaler_label_tags` contains the tags related to the Kubernetes - # labels applied to the nodes in the cluster's managed node groups. - # Each tag has a key formed from the node group's name and label name, and a - # value containing the autoscaling group's name, the corresponding - # Kubernetes label key, and its value. These tags are used by the cluster - # autoscaler to determine how nodes should be scaled based on their labels. - cluster_autoscaler_label_tags = local.eks_cluster.managed_node_groups != null ? merge([ - for name, group in local.eks_cluster.managed_node_groups : { - for label_name, label_value in coalesce(group.node_group_labels, {}) : "${name}|label|${label_name}" => { - autoscaling_group = group.node_group_autoscaling_group_names[0], - key = "k8s.io/cluster-autoscaler/node-template/label/${label_name}", - value = label_value, - } - } - ]...) : {} - - # `cluster_autoscaler_taint_tags` contains tags related to the Kubernetes - # taints applied to the nodes in the cluster's managed node groups. - # Each tag's key includes the node group's name and taint key, and its value - # contains information about the taint, such as its value and effect. - # These tags allow the cluster autoscaler to respect the taints when scaling nodes. - cluster_autoscaler_taint_tags = local.eks_cluster.managed_node_groups != null ? merge([ - for name, group in local.eks_cluster.managed_node_groups : { - for taint in coalesce(group.node_group_taints, []) : "${name}|taint|${taint.key}" => { - autoscaling_group = group.node_group_autoscaling_group_names[0], - key = "k8s.io/cluster-autoscaler/node-template/taint/${taint.key}" - value = "${taint.value}:${local.taint_effects[taint.effect]}" - } - } - ]...) : {} - - # `cluster_autoscaler_asg_tags` combines the above label and taint tags into a - # single map, which is then used to create the actual tags on the AWS ASGs - # through the `aws_autoscaling_group_tag` resource. The tags are only applied - # if `existing_eks_cluster` is `null`, ensuring they are only created for new - # clusters. - cluster_autoscaler_asg_tags = merge( - local.cluster_autoscaler_label_tags, - local.cluster_autoscaler_taint_tags - ) -} - -data "aws_ec2_instance_types" "gpu" { - filter { - name = "instance-type" - # from: aws ec2 describe-instance-types --region us-west-2 --query "InstanceTypes[?GpuInfo!=null].InstanceType" --output json | jq -r '.[]' | awk -F. 
'{print "\"" $1 ".*\","}' | uniq - # non-CUDA supported types added and commented out for now though these have accelerators of some kind - values = [ - # "dl1.*", # no CUDA support - # "inf1.*" # no CUDA support - # "inf2.*" # no CUDA support - "g2.*", - "g3.*", - "g3s.*", - "g4ad.*", - "g4dn.*", - "g5.*", - # "g5g.*", exclude g5g as these are ARM machines - "p2.*", - "p3.*", - "p3dn.*", - "p4d.*", - # "p5.*", # no CUDA support - # "trn1.*", # no CUDA support - # "trn1n32.*", # no CUDA support - ] - } -} - -data "aws_ami" "eks_default" { - most_recent = true - owners = ["amazon"] - - filter { - name = "name" - values = ["amazon-eks-node-${var.cluster_version}-v*"] - } - filter { - name = "architecture" - values = ["x86_64"] - } -} - -data "aws_ami" "deep_learning" { - most_recent = true - owners = ["amazon"] - - filter { - name = "name" - # they don't produce images on any Ubuntu OS newer than this :shrug: - values = ["Deep Learning AMI (Ubuntu 18.04) Version ??.?"] - } - filter { - name = "architecture" - values = ["x86_64"] - } - - filter { - name = "state" - values = ["available"] - } -} - -module "eks" { - count = local.create_cluster - source = "terraform-aws-modules/eks/aws" - version = "19.16.0" - cluster_name = var.name_prefix - cluster_version = var.cluster_version - cluster_endpoint_public_access = true - cluster_ip_family = "ipv4" - vpc_id = local.vpc.id - subnet_ids = local.vpc.private_subnet_ids - control_plane_subnet_ids = local.vpc.intra_subnet_ids - manage_aws_auth_configmap = true - aws_auth_roles = [ - # We need to add in the Karpenter node IAM role for nodes launched by Karpenter - { - rolearn = module.karpenter[0].role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - }, - ] - - eks_managed_node_group_defaults = { - # We are using the IRSA created below for permissions - # However, we have to deploy with the policy attached FIRST (when creating a fresh cluster) - # and then turn this off after the cluster/node group is created. 
Without this initial policy, - # the VPC CNI fails to assign IPs and nodes cannot join the cluster - # See https://github.com/aws/containers-roadmap/issues/1666 for more context - iam_role_attach_cni_policy = true - subnet_ids = local.vpc.private_subnet_ids - labels = var.labels - ebs_optimized = true - disable_api_termination = false - enable_monitoring = true - use_custom_launch_template = false - force_update_version = true - } - - eks_managed_node_groups = { - builder = { - # By default, the module creates a launch template to ensure tags are propagated to instances, etc., - # so we need to disable it to use the default template provided by the AWS EKS managed node group service - name_prefix = "container-builder" - ami_id = data.aws_ami.eks_default.image_id - disk_size = 100 - min_size = 1 - max_size = 3 - desired_size = 1 - instance_types = [ - "t3a.large" - ] - capacity_type = "SPOT" - local_storage_types = ["ssd"] - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - iops = 3000 - throughput = 150 - encrypted = true - delete_on_termination = true - } - } - } - } - - gpu = { - name_prefix = "gpu" - description = "GPU node launch template" - min_size = 0 - max_size = 32 - desired_size = 0 - - ami_id = data.aws_ami.deep_learning.image_id - capacity_type = "SPOT" - instance_types = sort(data.aws_ec2_instance_types.gpu.instance_types) - - update_config = { - max_unavailable_percentage = 100 - } - - local_storage_types = ["ssd"] - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - iops = 3000 - throughput = 150 - encrypted = true - delete_on_termination = true - } - } - } - - metadata_options = { - http_endpoint = "enabled" - http_tokens = "required" - instance_metadata_tags = "disabled" - } - - create_iam_role = true - iam_role_name = "eks-managed-gpu-node-group" - iam_role_use_name_prefix = false - iam_role_description = "EKS managed GPU node group" - iam_role_tags = { - Purpose = "Protector of the kubelet" - } - iam_role_additional_policies = { - AmazonEC2ContainerRegistryReadOnly = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" - } - } - } - tags = merge(var.tags, { - # this same tag should exist on a single security group that karpenter will use - "karpenter.sh/discovery" = var.name_prefix - }) -} - -# ASG tags are needed for the cluster to work with the labels and taints of the -# node groups -resource "aws_autoscaling_group_tag" "cluster_autoscaler_label_tags" { - for_each = var.existing_eks_cluster == null ? 
local.cluster_autoscaler_asg_tags : {} - autoscaling_group_name = each.value.autoscaling_group - - tag { - key = each.value.key - value = each.value.value - propagate_at_launch = false - } -} - -module "karpenter" { - count = local.create_cluster - source = "terraform-aws-modules/eks/aws//modules/karpenter" - cluster_name = module.eks[0].cluster_name - irsa_oidc_provider_arn = module.eks[0].oidc_provider_arn - policies = { - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - tags = var.tags -} diff --git a/install/terraform/aws/irsa_iam_roles.tf b/install/terraform/aws/irsa_iam_roles.tf deleted file mode 100644 index eacc4cb5..00000000 --- a/install/terraform/aws/irsa_iam_roles.tf +++ /dev/null @@ -1,109 +0,0 @@ -data "aws_iam_policy" "eks_cni_policy" { - name = "AmazonEKS_CNI_Policy" -} - -data "aws_iam_policy" "iam_full_access" { - name = "IAMFullAccess" -} - -data "aws_iam_policy" "container_registry_full_access" { - name = "AmazonEC2ContainerRegistryFullAccess" -} - -data "aws_iam_policy" "s3_full_access" { - name = "AmazonS3FullAccess" -} - -module "substratus_irsa" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - role_name_prefix = "${var.name_prefix}-substratus-" - role_policy_arns = { - IAMFullAccess = data.aws_iam_policy.iam_full_access.arn - AmazonEC2ContainerRegistryFullAccess = data.aws_iam_policy.container_registry_full_access.arn - AmazonS3FullAccess = data.aws_iam_policy.s3_full_access.arn - } - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["substratus:substratus"] - } - } - - tags = var.tags -} - -module "ebs_csi_irsa_role" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - - role_name_prefix = "ebs-csi" - attach_ebs_csi_policy = true - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] - } - } - - tags = var.tags -} - -module "load_balancer_controller_irsa_role" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - - role_name_prefix = "load-balancer-controller" - attach_load_balancer_controller_policy = true - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["kube-system:aws-load-balancer-controller"] - } - } - - tags = var.tags -} - -module "node_termination_handler_irsa_role" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - - role_name_prefix = "node-termination-handler" - attach_node_termination_handler_policy = true - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["kube-system:aws-node"] - } - } - - tags = var.tags -} - -module "vpc_cni_ipv4_irsa_role" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - - role_name_prefix = "vpc-cni-ipv4" - attach_vpc_cni_policy = true - vpc_cni_enable_ipv4 = true - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["kube-system:aws-node"] - } - } - - tags = var.tags -} 
diff --git a/install/terraform/aws/outputs.tf b/install/terraform/aws/outputs.tf deleted file mode 100644 index 9df6cfb1..00000000 --- a/install/terraform/aws/outputs.tf +++ /dev/null @@ -1,11 +0,0 @@ -output "cluster" { - value = local.eks_cluster -} - -output "vpc" { - value = local.vpc -} - -output "irsas" { - value = local.irsa_outputs -} diff --git a/install/terraform/aws/providers.tf b/install/terraform/aws/providers.tf deleted file mode 100644 index 808daadc..00000000 --- a/install/terraform/aws/providers.tf +++ /dev/null @@ -1,15 +0,0 @@ -provider "aws" { - region = var.region -} - -provider "kubernetes" { - host = local.eks_cluster.endpoint - cluster_ca_certificate = base64decode(local.eks_cluster.certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", local.eks_cluster.name] - } -} diff --git a/install/terraform/aws/variables.tf b/install/terraform/aws/variables.tf deleted file mode 100644 index 15404b60..00000000 --- a/install/terraform/aws/variables.tf +++ /dev/null @@ -1,59 +0,0 @@ -variable "cluster_version" { - description = "The version of the EKS cluster to deploy (i.e., this is used when var.existing_eks_cluster is null)" - type = string - default = "1.27" -} - -variable "existing_eks_cluster" { - description = "An existing EKS cluster to add substratus components to." - type = object({ - name = string - oidc_provider_arn = string - }) - default = null -} - -variable "existing_vpc" { - description = "An existing VPC to add substratus components to." - type = object({ - id = string - private_subnet_ids = list(string) - intra_subnet_ids = list(string) - }) - default = null -} - -variable "labels" { - type = map(string) - default = { - GithubRepo = "substratus" - GithubOrg = "substratusai" - } -} - -variable "name_prefix" { - description = "Prefix to use for resources" - type = string - default = "substratus-usw2" -} - -variable "region" { - description = "AWS region" - type = string - default = "us-west-2" -} - -# will remove this before pushing to substratus repo -variable "tags" { - type = map(string) - default = { - GithubRepo = "infrastructure" - GithubOrg = "substratusai" - } -} - -variable "vpc_cidr" { - description = "The cidr block of the VPC if created by the module (e.g., used when var.existing_vpc is null)" - type = string - default = "10.0.0.0/16" -} diff --git a/install/terraform/aws/vpc.tf b/install/terraform/aws/vpc.tf deleted file mode 100644 index 32792ccb..00000000 --- a/install/terraform/aws/vpc.tf +++ /dev/null @@ -1,116 +0,0 @@ -data "aws_availability_zones" "available" {} - -locals { - azs = slice(data.aws_availability_zones.available.names, 0, 3) - create_vpc = var.existing_vpc == null ? 
1 : 0
-}
-
-module "vpc" {
-  count   = local.create_vpc
-  source  = "terraform-aws-modules/vpc/aws"
-  version = "5.1.1"
-  name    = var.name_prefix
-  cidr    = var.vpc_cidr
-  azs     = local.azs
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k)]
-  public_subnets  = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k + 4)]
-  intra_subnets   = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k + 20)]
-
-  public_subnet_ipv6_prefixes                    = [0, 1, 2]
-  public_subnet_assign_ipv6_address_on_creation  = true
-  private_subnet_ipv6_prefixes                   = [3, 4, 5]
-  private_subnet_assign_ipv6_address_on_creation = true
-  intra_subnet_ipv6_prefixes                     = [6, 7, 8]
-  intra_subnet_assign_ipv6_address_on_creation   = true
-
-  public_subnet_tags = {
-    "kubernetes.io/role/elb" = 1
-  }
-
-  private_subnet_tags = {
-    "kubernetes.io/role/internal-elb" = 1
-  }
-
-  create_database_subnet_group  = false
-  manage_default_network_acl    = false
-  manage_default_route_table    = false
-  manage_default_security_group = false
-
-  enable_dns_hostnames   = true
-  enable_dns_support     = true
-  enable_nat_gateway     = true
-  single_nat_gateway     = true
-  enable_ipv6            = true
-  create_egress_only_igw = true
-  enable_vpn_gateway     = false
-  enable_dhcp_options    = false
-
-  # VPC Flow Logs (Cloudwatch log group and IAM role will be created)
-  enable_flow_log                      = false
-  create_flow_log_cloudwatch_log_group = true
-  create_flow_log_cloudwatch_iam_role  = true
-  flow_log_max_aggregation_interval    = 60
-  tags = var.tags
-}
-
-
-# VPC Endpoints Module
-
-module "endpoints" {
-  count                      = local.create_vpc
-  source                     = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints"
-  version                    = "5.1.1"
-  vpc_id                     = module.vpc[0].vpc_id
-  create_security_group      = true
-  security_group_name_prefix = "${var.name_prefix}-endpoints-"
-  security_group_description = "VPC endpoint security group"
-  security_group_rules = {
-    ingress_https = {
-      description = "HTTPS from VPC"
-      cidr_blocks = [module.vpc[0].vpc_cidr_block]
    }
-  }
-
-  endpoints = {
-    s3 = {
-      service = "s3"
-      tags    = { Name = "s3-vpc-endpoint" }
-    },
-    ecr_api = {
-      service             = "ecr.api"
-      private_dns_enabled = true
-      subnet_ids          = module.vpc[0].private_subnets
-      policy              = data.aws_iam_policy_document.generic_endpoint_policy[0].json
-    },
-    ecr_dkr = {
-      service             = "ecr.dkr"
-      private_dns_enabled = true
-      subnet_ids          = module.vpc[0].private_subnets
-      policy              = data.aws_iam_policy_document.generic_endpoint_policy[0].json
-    },
-  }
-
-  tags = merge(var.tags, {
-    Endpoint = "true"
-  })
-}
-
-data "aws_iam_policy_document" "generic_endpoint_policy" {
-  count = local.create_vpc
-  statement {
-    effect    = "Deny"
-    actions   = ["*"]
-    resources = ["*"]
-
-    principals {
-      type        = "*"
-      identifiers = ["*"]
-    }
-
-    condition {
-      test     = "StringNotEquals"
-      variable = "aws:SourceVpc"
-      values   = [module.vpc[0].vpc_id]
-    }
-  }
-}
diff --git a/internal/awsmanager/manager.go b/internal/awsmanager/manager.go
new file mode 100644
index 00000000..7a877604
--- /dev/null
+++ b/internal/awsmanager/manager.go
@@ -0,0 +1,5 @@
+// Package awsmanager provides an AWS implementation of the Substratus Cloud Interface (SCI).
+package awsmanager
+
+// examples: https://docs.aws.amazon.com/AmazonS3/latest/userguide/example_s3_Scenario_PresignedUrl_section.html
+// Checking object integrity: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html
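
Post-diff note: internal/awsmanager/manager.go lands as a stub that only links
to the S3 presigned-URL docs. A minimal sketch of the flow those links
describe, assuming the AWS SDK for Go v2; the helper name GeneratePutURL, its
bucket/key parameters, and the 15-minute expiry are illustrative assumptions,
not part of this patch:

    package awsmanager

    import (
        "context"
        "time"

        "github.com/aws/aws-sdk-go-v2/aws"
        "github.com/aws/aws-sdk-go-v2/config"
        "github.com/aws/aws-sdk-go-v2/service/s3"
    )

    // GeneratePutURL returns a presigned URL that lets an unauthenticated
    // caller PUT an object to bucket/key until the URL expires.
    // (Hypothetical helper; not part of this patch.)
    func GeneratePutURL(ctx context.Context, bucket, key string) (string, error) {
        // Load region and credentials from the environment, i.e. the same
        // chain the AWS CLI uses (env vars, shared config, IRSA, etc.).
        cfg, err := config.LoadDefaultConfig(ctx)
        if err != nil {
            return "", err
        }
        presigner := s3.NewPresignClient(s3.NewFromConfig(cfg))
        req, err := presigner.PresignPutObject(ctx, &s3.PutObjectInput{
            Bucket: aws.String(bucket),
            Key:    aws.String(key),
        }, func(o *s3.PresignOptions) { o.Expires = 15 * time.Minute })
        if err != nil {
            return "", err
        }
        return req.URL, nil
    }

A GET variant is symmetric (PresignGetObject with s3.GetObjectInput), and the
s3:PutObject/s3:GetObject actions granted to the aws-manager service account
in eks-cluster.yaml.tpl above are exactly the permissions these presign calls
exercise.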