From 1222b2f4c307bee43127a25deb3ca4d4e329dd0e Mon Sep 17 00:00:00 2001
From: Brandon Bjelland
Date: Sat, 5 Aug 2023 18:23:38 -0700
Subject: [PATCH 01/21] adds an AWS module for instantiating all substratus components + an optional VPC

---
 install/terraform/aws/backend.tf | 7 +
 install/terraform/aws/bucket.tf | 6 +
 install/terraform/aws/common.tf | 24 ++
 install/terraform/aws/container_registry.tf | 8 +
 install/terraform/aws/eks_cluster.tf | 237 ++++++++++++++++++
 install/terraform/aws/eks_irsa_iam_roles.tf | 95 +++++++
 install/terraform/aws/outputs.tf | 33 +++
 install/terraform/aws/providers.tf | 15 ++
 .../aws/substratus_irsa_iam_roles.tf | 193 ++++++++++++++
 install/terraform/aws/variables.tf | 86 +++++++
 install/terraform/aws/vpc.tf | 116 +++++++++
 11 files changed, 820 insertions(+)
 create mode 100644 install/terraform/aws/backend.tf
 create mode 100644 install/terraform/aws/bucket.tf
 create mode 100644 install/terraform/aws/common.tf
 create mode 100644 install/terraform/aws/container_registry.tf
 create mode 100644 install/terraform/aws/eks_cluster.tf
 create mode 100644 install/terraform/aws/eks_irsa_iam_roles.tf
 create mode 100644 install/terraform/aws/outputs.tf
 create mode 100644 install/terraform/aws/providers.tf
 create mode 100644 install/terraform/aws/substratus_irsa_iam_roles.tf
 create mode 100644 install/terraform/aws/variables.tf
 create mode 100644 install/terraform/aws/vpc.tf

diff --git a/install/terraform/aws/backend.tf b/install/terraform/aws/backend.tf
new file mode 100644
index 00000000..91722a40
--- /dev/null
+++ b/install/terraform/aws/backend.tf
@@ -0,0 +1,7 @@
+terraform {
+  backend "s3" {
+    # bucket = "243019462621-terraform-state"
+    # key    = "primary/us-west-2/substratus/terraform.tfstate"
+    # region = "us-west-2"
+  }
+}
diff --git a/install/terraform/aws/bucket.tf b/install/terraform/aws/bucket.tf
new file mode 100644
index 00000000..5c4200f7
--- /dev/null
+++ b/install/terraform/aws/bucket.tf
@@ -0,0 +1,6 @@
+data "aws_caller_identity" "current" {}
+
+resource "aws_s3_bucket" "artifacts" {
+  count  = var.existing_artifacts_bucket == null ? 1 : 0
+  bucket = "${data.aws_caller_identity.current.account_id}-${var.name_prefix}-artifacts"
+}
diff --git a/install/terraform/aws/common.tf b/install/terraform/aws/common.tf
new file mode 100644
index 00000000..89f2f898
--- /dev/null
+++ b/install/terraform/aws/common.tf
@@ -0,0 +1,24 @@
+locals {
+  # passed to cluster.tf
+  vpc = {
+    id                 = var.existing_vpc == null ? module.vpc[0].vpc_id : var.existing_vpc.id
+    private_subnet_ids = var.existing_vpc == null ? module.vpc[0].private_subnets : var.existing_vpc.private_subnet_ids
+    intra_subnet_ids   = var.existing_vpc == null ? module.vpc[0].intra_subnets : var.existing_vpc.intra_subnet_ids
+  }
+
+  # passed to substratus_irsa_iam_roles.tf and eks_irsa_iam_roles.tf
+  eks_cluster = {
+    name                        = var.existing_eks_cluster == null ? module.eks[0].cluster_name : var.existing_eks_cluster.name
+    oidc_provider_arn           = var.existing_eks_cluster == null ? module.eks[0].oidc_provider_arn : var.existing_eks_cluster.oidc_provider_arn
+    managed_node_groups         = var.existing_eks_cluster == null ? module.eks[0].eks_managed_node_groups : null
+    certificate_authority_data  = var.existing_eks_cluster == null ? module.eks[0].cluster_certificate_authority_data : ""
+    endpoint                    = var.existing_eks_cluster == null ? module.eks[0].cluster_endpoint : ""
+  }
+
+  artifacts_bucket = {
+    arn = var.existing_artifacts_bucket == null ?
aws_s3_bucket.artifacts[0].arn : var.existing_artifacts_bucket.arn + id = var.existing_artifacts_bucket == null ? aws_s3_bucket.artifacts[0].id : var.existing_artifacts_bucket.id + } + + ecr_repository_arn = var.existing_ecr_repository_arn == "" ? aws_ecr_repository.main[0].arn : var.existing_ecr_repository_arn +} diff --git a/install/terraform/aws/container_registry.tf b/install/terraform/aws/container_registry.tf new file mode 100644 index 00000000..44e59616 --- /dev/null +++ b/install/terraform/aws/container_registry.tf @@ -0,0 +1,8 @@ +resource "aws_ecr_repository" "main" { + count = var.existing_ecr_repository_arn == "" ? 1 : 0 + name = var.name_prefix + image_tag_mutability = "MUTABLE" + image_scanning_configuration { + scan_on_push = var.image_scan_on_push + } +} diff --git a/install/terraform/aws/eks_cluster.tf b/install/terraform/aws/eks_cluster.tf new file mode 100644 index 00000000..b573b7d4 --- /dev/null +++ b/install/terraform/aws/eks_cluster.tf @@ -0,0 +1,237 @@ +locals { + create_cluster = var.existing_eks_cluster == null ? 1 : 0 + # We need to lookup K8s taint effect from the AWS API value + taint_effects = { + NO_SCHEDULE = "NoSchedule" + NO_EXECUTE = "NoExecute" + PREFER_NO_SCHEDULE = "PreferNoSchedule" + } + + # The following locals are used to configure tags for the EKS cluster's Auto + # Scaling Groups managed by the cluster autoscaler. + + # `cluster_autoscaler_label_tags` contains the tags related to the Kubernetes + # labels applied to the nodes in the cluster's managed node groups. + # Each tag has a key formed from the node group's name and label name, and a + # value containing the autoscaling group's name, the corresponding + # Kubernetes label key, and its value. These tags are used by the cluster + # autoscaler to determine how nodes should be scaled based on their labels. + cluster_autoscaler_label_tags = local.eks_cluster.managed_node_groups != null ? merge([ + for name, group in local.eks_cluster.managed_node_groups : { + for label_name, label_value in coalesce(group.node_group_labels, {}) : "${name}|label|${label_name}" => { + autoscaling_group = group.node_group_autoscaling_group_names[0], + key = "k8s.io/cluster-autoscaler/node-template/label/${label_name}", + value = label_value, + } + } + ]...) : {} + + # `cluster_autoscaler_taint_tags` contains tags related to the Kubernetes + # taints applied to the nodes in the cluster's managed node groups. + # Each tag's key includes the node group's name and taint key, and its value + # contains information about the taint, such as its value and effect. + # These tags allow the cluster autoscaler to respect the taints when scaling nodes. + cluster_autoscaler_taint_tags = local.eks_cluster.managed_node_groups != null ? merge([ + for name, group in local.eks_cluster.managed_node_groups : { + for taint in coalesce(group.node_group_taints, []) : "${name}|taint|${taint.key}" => { + autoscaling_group = group.node_group_autoscaling_group_names[0], + key = "k8s.io/cluster-autoscaler/node-template/taint/${taint.key}" + value = "${taint.value}:${local.taint_effects[taint.effect]}" + } + } + ]...) : {} + + # `cluster_autoscaler_asg_tags` combines the above label and taint tags into a + # single map, which is then used to create the actual tags on the AWS ASGs + # through the `aws_autoscaling_group_tag` resource. The tags are only applied + # if `existing_eks_cluster` is `null`, ensuring they are only created for new + # clusters. 
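+  # For illustration only (the node group, ASG, and label names here are
+  # hypothetical), a single entry of the merged map below looks roughly like:
+  #   "gpu|label|nvidia.com/gpu" = {
+  #     autoscaling_group = "eks-gpu-2023080518233863810000000e"
+  #     key               = "k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu"
+  #     value             = "true"
+  #   }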
+ cluster_autoscaler_asg_tags = merge( + local.cluster_autoscaler_label_tags, + local.cluster_autoscaler_taint_tags + ) +} + +data "aws_ec2_instance_types" "gpu" { + filter { + name = "instance-type" + # from: aws ec2 describe-instance-types --region us-west-2 --query "InstanceTypes[?GpuInfo!=null].InstanceType" --output json | jq -r '.[]' | awk -F. '{print "\"" $1 ".*\","}' | uniq + # non-CUDA supported types added and commented out for now though these have accelerators of some kind + values = [ + # "dl1.*", # no CUDA support + # "inf1.*" # no CUDA support + # "inf2.*" # no CUDA support + "g2.*", + "g3.*", + "g3s.*", + "g4ad.*", + "g4dn.*", + "g5.*", + # "g5g.*", exclude g5g as these are ARM machines + "p2.*", + "p3.*", + "p3dn.*", + "p4d.*", + # "p5.*", # no CUDA support + # "trn1.*", # no CUDA support + # "trn1n32.*", # no CUDA support + ] + } +} + +data "aws_ami" "eks_default" { + most_recent = true + owners = ["amazon"] + + filter { + name = "name" + values = ["amazon-eks-node-${var.cluster_version}-v*"] + } + filter { + name = "architecture" + values = ["x86_64"] + } +} + +data "aws_ami" "deep_learning" { + most_recent = true + owners = ["amazon"] + + filter { + name = "name" + # they don't produce images on any Ubuntu OS newer than this :shrug: + values = ["Deep Learning AMI (Ubuntu 18.04) Version ??.?"] + } + filter { + name = "architecture" + values = ["x86_64"] + } + + filter { + name = "state" + values = ["available"] + } +} + +module "eks" { + count = local.create_cluster + source = "terraform-aws-modules/eks/aws" + version = "19.15.4" + cluster_name = var.name_prefix + cluster_version = var.cluster_version + cluster_endpoint_public_access = true + cluster_ip_family = "ipv4" + vpc_id = local.vpc.id + subnet_ids = local.vpc.private_subnet_ids + control_plane_subnet_ids = local.vpc.intra_subnet_ids + manage_aws_auth_configmap = true + + eks_managed_node_group_defaults = { + # We are using the IRSA created below for permissions + # However, we have to deploy with the policy attached FIRST (when creating a fresh cluster) + # and then turn this off after the cluster/node group is created. 
Without this initial policy, + # the VPC CNI fails to assign IPs and nodes cannot join the cluster + # See https://github.com/aws/containers-roadmap/issues/1666 for more context + iam_role_attach_cni_policy = true + subnet_ids = local.vpc.private_subnet_ids + labels = var.labels + ebs_optimized = true + disable_api_termination = false + enable_monitoring = true + use_custom_launch_template = false + force_update_version = true + } + + eks_managed_node_groups = { + builder = { + # By default, the module creates a launch template to ensure tags are propagated to instances, etc., + # so we need to disable it to use the default template provided by the AWS EKS managed node group service + name_prefix = "container-builder" + ami_id = data.aws_ami.eks_default.image_id + disk_size = 100 + min_size = 1 + max_size = 3 + desired_size = 1 + instance_types = [ + "t3a.large" + ] + capacity_type = "SPOT" + local_storage_types = ["ssd"] + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + iops = 3000 + throughput = 150 + encrypted = true + delete_on_termination = true + } + } + } + } + + gpu = { + name_prefix = "gpu" + description = "GPU node launch template" + min_size = 0 + max_size = 32 + desired_size = 0 + + ami_id = data.aws_ami.deep_learning.image_id + capacity_type = "SPOT" + instance_types = sort(data.aws_ec2_instance_types.gpu.instance_types) + + update_config = { + max_unavailable_percentage = 100 + } + + local_storage_types = ["ssd"] + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + iops = 3000 + throughput = 150 + encrypted = true + delete_on_termination = true + } + } + } + + metadata_options = { + http_endpoint = "enabled" + http_tokens = "required" + instance_metadata_tags = "disabled" + } + + create_iam_role = true + iam_role_name = "eks-managed-gpu-node-group" + iam_role_use_name_prefix = false + iam_role_description = "EKS managed GPU node group" + iam_role_tags = { + Purpose = "Protector of the kubelet" + } + iam_role_additional_policies = { + AmazonEC2ContainerRegistryReadOnly = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + } + } + } + tags = var.tags +} + +# ASG tags are needed for the cluster to work with the labels and taints of the +# node groups +resource "aws_autoscaling_group_tag" "cluster_autoscaler_label_tags" { + for_each = var.existing_eks_cluster == null ? 
local.cluster_autoscaler_asg_tags : {} + autoscaling_group_name = each.value.autoscaling_group + + tag { + key = each.value.key + value = each.value.value + propagate_at_launch = false + } +} diff --git a/install/terraform/aws/eks_irsa_iam_roles.tf b/install/terraform/aws/eks_irsa_iam_roles.tf new file mode 100644 index 00000000..ab788e60 --- /dev/null +++ b/install/terraform/aws/eks_irsa_iam_roles.tf @@ -0,0 +1,95 @@ +# EKS specific IRSA Roles + +# Note: these are currently not used but should be as we install the associated +# add-ons (however we decide to do that) +module "cluster_autoscaler_irsa_role" { + count = local.create_cluster + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "cluster-autoscaler" + attach_cluster_autoscaler_policy = true + cluster_autoscaler_cluster_names = [local.eks_cluster.name] + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["kube-system:cluster-autoscaler"] + } + } + + tags = var.tags +} + +module "ebs_csi_irsa_role" { + count = local.create_cluster + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "ebs-csi" + attach_ebs_csi_policy = true + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + + tags = var.tags +} + +module "load_balancer_controller_irsa_role" { + count = local.create_cluster + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "load-balancer-controller" + attach_load_balancer_controller_policy = true + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["kube-system:aws-load-balancer-controller"] + } + } + + tags = var.tags +} + +module "node_termination_handler_irsa_role" { + count = local.create_cluster + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "node-termination-handler" + attach_node_termination_handler_policy = true + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["kube-system:aws-node"] + } + } + + tags = var.tags +} + +module "vpc_cni_ipv4_irsa_role" { + count = local.create_cluster + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "vpc-cni-ipv4" + attach_vpc_cni_policy = true + vpc_cni_enable_ipv4 = true + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["kube-system:aws-node"] + } + } + + tags = var.tags +} diff --git a/install/terraform/aws/outputs.tf b/install/terraform/aws/outputs.tf new file mode 100644 index 00000000..7846c2d8 --- /dev/null +++ b/install/terraform/aws/outputs.tf @@ -0,0 +1,33 @@ +output "artifacts_bucket" { + value = { + arn = local.artifacts_bucket.arn + id = local.artifacts_bucket.id + } +} + +output "cluster_name" { + value = local.eks_cluster.name +} + +output "cluster_region" { + value = var.region +} + +output "cluster" { + value = { + name = local.eks_cluster.name + oidc_provider_arn = local.eks_cluster.oidc_provider_arn + } +} + +output "ecr_repository_arn" { + value = local.ecr_repository_arn +} + +output "vpc" { + value = 
{ + id = local.vpc.id + private_subnet_ids = local.vpc.private_subnet_ids + intra_subnet_ids = local.vpc.intra_subnet_ids + } +} diff --git a/install/terraform/aws/providers.tf b/install/terraform/aws/providers.tf new file mode 100644 index 00000000..808daadc --- /dev/null +++ b/install/terraform/aws/providers.tf @@ -0,0 +1,15 @@ +provider "aws" { + region = var.region +} + +provider "kubernetes" { + host = local.eks_cluster.endpoint + cluster_ca_certificate = base64decode(local.eks_cluster.certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", local.eks_cluster.name] + } +} diff --git a/install/terraform/aws/substratus_irsa_iam_roles.tf b/install/terraform/aws/substratus_irsa_iam_roles.tf new file mode 100644 index 00000000..e869b1a4 --- /dev/null +++ b/install/terraform/aws/substratus_irsa_iam_roles.tf @@ -0,0 +1,193 @@ +resource "aws_iam_policy" "ecr_writer" { + count = var.create_substratus_irsa_roles == true ? 1 : 0 + name = "${var.name_prefix}-ecr-writer" + description = "A policy allowing full access to the ${local.artifacts_bucket.id} bucket" + + policy = jsonencode({ + "Version" : "2012-10-17", + "Statement" : [ + { + "Effect" : "Allow", + "Action" : [ + "ecr:*" + ], + "Resource" : local.ecr_repository_arn + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_policy" "s3_full_bucket_access" { + count = var.create_substratus_irsa_roles == true ? 1 : 0 + name = "${var.name_prefix}-AmazonS3FullAccess" + description = "A policy allowing full access to the ${local.artifacts_bucket.id} bucket" + + policy = jsonencode({ + "Version" : "2012-10-17", + "Statement" : [ + { + "Effect" : "Allow", + "Action" : [ + "s3:*", + "s3-object-lambda:*" + ], + "Resource" : [ + "${local.artifacts_bucket.arn}", + "${local.artifacts_bucket.arn}/*", + ] + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_policy" "s3_readonly_bucket_access" { + count = var.create_substratus_irsa_roles == true ? 1 : 0 + name = "${var.name_prefix}-AmazonS3ReadOnlyAccess" + description = "A policy allowing read-only access to the ${local.artifacts_bucket.id} bucket" + + policy = jsonencode({ + "Version" : "2012-10-17", + "Statement" : [ + { + "Effect" : "Allow", + "Action" : [ + "s3:Get*", + "s3:List*", + "s3-object-lambda:Get*", + "s3-object-lambda:List*" + ], + "Resource" : [ + "${local.artifacts_bucket.arn}", + "${local.artifacts_bucket.arn}/*", + ] + } + ] + }) + + tags = var.tags +} + +module "container_builder_irsa" { + count = var.create_substratus_irsa_roles == true ? 1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "${var.name_prefix}-container-builder-" + role_policy_arns = { + ECRWriter = aws_iam_policy.ecr_writer[0].arn + SubstratusAmazonS3ReadOnlyAccess = aws_iam_policy.s3_readonly_bucket_access[0].arn + } + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["default:container-builder"] + } + } + + tags = var.tags +} + +module "modeller_irsa" { + count = var.create_substratus_irsa_roles == true ? 
1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "${var.name_prefix}-modeller-" + role_policy_arns = { + SubstratusAmazonS3FullAccess = aws_iam_policy.s3_full_bucket_access[0].arn + } + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["default:modeller"] + } + } + + tags = var.tags +} + +module "model_server_irsa" { + count = var.create_substratus_irsa_roles == true ? 1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "${var.name_prefix}-model-server-" + role_policy_arns = { + SubstratusAmazonS3FullAccess = aws_iam_policy.s3_full_bucket_access[0].arn + } + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["default:model-server"] + } + } + + tags = var.tags +} + +module "notebook_irsa" { + count = var.create_substratus_irsa_roles == true ? 1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "${var.name_prefix}-notebook-" + role_policy_arns = { + SubstratusAmazonS3FullAccess = aws_iam_policy.s3_full_bucket_access[0].arn + } + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["default:notebook"] + } + } + + tags = var.tags +} + +module "data_loader_irsa" { + count = var.create_substratus_irsa_roles == true ? 1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "${var.name_prefix}-data-loader-" + role_policy_arns = { + SubstratusAmazonS3FullAccess = aws_iam_policy.s3_full_bucket_access[0].arn + } + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["default:data-loader"] + } + } + + tags = var.tags +} + +module "aws_manager_irsa" { + count = var.create_substratus_irsa_roles == true ? 1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.28" + + role_name_prefix = "${var.name_prefix}-aws-manager-" + role_policy_arns = { + SubstratusAmazonS3FullAccess = aws_iam_policy.s3_full_bucket_access[0].arn + } + + oidc_providers = { + main = { + provider_arn = local.eks_cluster.oidc_provider_arn + namespace_service_accounts = ["substratus:aws-manager"] + } + } + + tags = var.tags +} diff --git a/install/terraform/aws/variables.tf b/install/terraform/aws/variables.tf new file mode 100644 index 00000000..8fcd7157 --- /dev/null +++ b/install/terraform/aws/variables.tf @@ -0,0 +1,86 @@ +variable "cluster_version" { + description = "The version of the EKS cluster to deploy (i.e., this is used when var.existing_eks_cluster is null)" + type = string + default = "1.27" +} + +variable "create_substratus_irsa_roles" { + description = "A boolean controlling the creation of substratus IRSA roles" + type = bool + default = true +} + +variable "existing_artifacts_bucket" { + description = "An existing artifacts bucket to use for this substratus install." 
+  type = object({
+    id  = string
+    arn = string
+  })
+  default = null
+}
+
+variable "existing_ecr_repository_arn" {
+  description = "The ARN of an existing ECR repository to use instead of creating a new one"
+  type        = string
+  default     = ""
+}
+
+variable "existing_eks_cluster" {
+  description = "An existing EKS cluster to add substratus components to."
+  type = object({
+    name              = string
+    oidc_provider_arn = string
+  })
+  default = null
+}
+
+variable "existing_vpc" {
+  description = "An existing VPC to add substratus components to."
+  type = object({
+    id                 = string
+    private_subnet_ids = list(string)
+    intra_subnet_ids   = list(string)
+  })
+  default = null
+}
+
+variable "image_scan_on_push" {
+  type        = bool
+  default     = false
+  description = "Scan images for vulnerabilities on push to ECR ($0.09 per scan on push)"
+}
+
+variable "labels" {
+  type = map(string)
+  default = {
+    GithubRepo = "substratus"
+    GithubOrg  = "substratusai"
+  }
+}
+
+variable "name_prefix" {
+  description = "Prefix to use for resources"
+  type        = string
+  default     = "substratus-usw2"
+}
+
+variable "region" {
+  description = "AWS region"
+  type        = string
+  default     = "us-west-2"
+}
+
+# will remove this before pushing to substratus repo
+variable "tags" {
+  type = map(string)
+  default = {
+    GithubRepo = "infrastructure"
+    GithubOrg  = "substratusai"
+  }
+}
+
+variable "vpc_cidr" {
+  description = "The cidr block of the VPC if created by the module (e.g., used when var.existing_vpc is null)"
+  type        = string
+  default     = "10.0.0.0/16"
+}
diff --git a/install/terraform/aws/vpc.tf b/install/terraform/aws/vpc.tf
new file mode 100644
index 00000000..32792ccb
--- /dev/null
+++ b/install/terraform/aws/vpc.tf
@@ -0,0 +1,116 @@
+data "aws_availability_zones" "available" {}
+
+locals {
+  azs        = slice(data.aws_availability_zones.available.names, 0, 3)
+  create_vpc = var.existing_vpc == null ?
1 : 0 +} + +module "vpc" { + count = local.create_vpc + source = "terraform-aws-modules/vpc/aws" + version = "5.1.1" + name = var.name_prefix + cidr = var.vpc_cidr + azs = local.azs + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k)] + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k + 4)] + intra_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k + 20)] + + public_subnet_ipv6_prefixes = [0, 1, 2] + public_subnet_assign_ipv6_address_on_creation = true + private_subnet_ipv6_prefixes = [3, 4, 5] + private_subnet_assign_ipv6_address_on_creation = true + intra_subnet_ipv6_prefixes = [6, 7, 8] + intra_subnet_assign_ipv6_address_on_creation = true + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + } + + create_database_subnet_group = false + manage_default_network_acl = false + manage_default_route_table = false + manage_default_security_group = false + + enable_dns_hostnames = true + enable_dns_support = true + enable_nat_gateway = true + single_nat_gateway = true + enable_ipv6 = true + create_egress_only_igw = true + enable_vpn_gateway = false + enable_dhcp_options = false + + # VPC Flow Logs (Cloudwatch log group and IAM role will be created) + enable_flow_log = false + create_flow_log_cloudwatch_log_group = true + create_flow_log_cloudwatch_iam_role = true + flow_log_max_aggregation_interval = 60 + tags = var.tags +} + + +# VPC Endpoints Module + +module "endpoints" { + count = local.create_vpc + source = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints" + version = "5.1.1" + vpc_id = module.vpc[0].vpc_id + create_security_group = true + security_group_name_prefix = "${var.name_prefix}-endpoints-" + security_group_description = "VPC endpoint security group" + security_group_rules = { + ingress_https = { + description = "HTTPS from VPC" + cidr_blocks = [module.vpc[0].vpc_cidr_block] + } + } + + endpoints = { + s3 = { + service = "s3" + tags = { Name = "s3-vpc-endpoint" } + }, + ecr_api = { + service = "ecr.api" + private_dns_enabled = true + subnet_ids = module.vpc[0].private_subnets + policy = data.aws_iam_policy_document.generic_endpoint_policy[0].json + }, + ecr_dkr = { + service = "ecr.dkr" + private_dns_enabled = true + subnet_ids = module.vpc[0].private_subnets + policy = data.aws_iam_policy_document.generic_endpoint_policy[0].json + }, + } + + tags = merge(var.tags, { + Endpoint = "true" + }) +} + +data "aws_iam_policy_document" "generic_endpoint_policy" { + count = local.create_vpc + statement { + effect = "Deny" + actions = ["*"] + resources = ["*"] + + principals { + type = "*" + identifiers = ["*"] + } + + condition { + test = "StringNotEquals" + variable = "aws:SourceVpc" + values = [module.vpc[0].vpc_id] + } + } +} From 7fd31044549e7a8731f2ae11cc06c5403772607f Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Mon, 7 Aug 2023 02:28:31 -0700 Subject: [PATCH 02/21] paired back terraform install bits. 
aws-up started --- install/scripts/aws-down.sh | 1 + install/scripts/aws-up.sh | 131 ++++++++++++ install/terraform/aws/bucket.tf | 6 - install/terraform/aws/common.tf | 23 ++- install/terraform/aws/container_registry.tf | 8 - install/terraform/aws/eks_cluster.tf | 29 ++- ...ks_irsa_iam_roles.tf => irsa_iam_roles.tf} | 36 +++- install/terraform/aws/outputs.tf | 32 +-- .../aws/substratus_irsa_iam_roles.tf | 193 ------------------ install/terraform/aws/variables.tf | 29 +-- 10 files changed, 203 insertions(+), 285 deletions(-) create mode 100644 install/scripts/aws-down.sh create mode 100644 install/scripts/aws-up.sh delete mode 100644 install/terraform/aws/bucket.tf delete mode 100644 install/terraform/aws/container_registry.tf rename install/terraform/aws/{eks_irsa_iam_roles.tf => irsa_iam_roles.tf} (69%) delete mode 100644 install/terraform/aws/substratus_irsa_iam_roles.tf diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/install/scripts/aws-down.sh @@ -0,0 +1 @@ + diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh new file mode 100644 index 00000000..b620cddc --- /dev/null +++ b/install/scripts/aws-up.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +set -e +set -u + +# Required env variables: +: "$TOKEN $PROJECT" + +# Used by gcloud: +# TODO(bjb): pass AWS creds into script +export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN} +# Used by terraform: +export GOOGLE_OAUTH_ACCESS_TOKEN=${TOKEN} + +INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" +AUTO_APPROVE="${AUTO_APPROVE:-no}" + +# Create terraform state bucket if one does not exist. +# TODO(bjb): establish a bucket + +# Apply infrastructure. +cd terraform/aws + +# Backend variables cannot be configured via env variables. +echo "bucket = \"${TF_BUCKET}\"" >>backend.tfvars +terraform init --backend-config=backend.tfvars + +export TF_VAR_project_id=${PROJECT} +if [ "${AUTO_APPROVE}" == "yes" ]; then + terraform apply -auto-approve +else + terraform apply +fi +CLUSTER_NAME=$(terraform output --json cluster | jq -r '.name') +CLUSTER_REGION=$(terraform output --json cluster | jq -r '.region') +CLUSTER_ENDPOINT=$(terraform output --json cluster | jq -r '.endpoint') +LOAD_BALANCER_CONTROLLER_ROLE_NAME=$(terraform output --json irsas | jq -r '.load_balancer_controller_irsa_role.iam_role_name') + +cd - + +# Configure kubectl. +aws eks --region ${CLUSTER_REGION} update-kubeconfig --name ${CLUSTER_NAME} +# Install cluster-level components + +# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler +helm repo add eks https://aws.github.io/eks-charts +helm upgrade \ + --install aws-node-termination-handler \ + --namespace kube-system \ + --version 0.21.0 \ + eks/aws-node-termination-handler + +# install EBS snapshotter?: https://github.com/kubernetes-csi/external-snapshotter#usage + +# TODO(bjb): may not be needed if we can resolve 401 to 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/ +# install aws-ebs-csi-driver: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md +helm repo add aws-ebs-csi-driver https://kubernetes-sigs.github.io/aws-ebs-csi-driver +helm repo update +helm upgrade \ + --install aws-ebs-csi-driver \ + --namespace kube-system \ + aws-ebs-csi-driver/aws-ebs-csi-driver + +# TODO(bjb): is this needed? Is doing the work here preferred to doing it in terraform? 
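+# Optional sanity check before continuing (the deployment name assumes the
+# aws-ebs-csi-driver chart's defaults; adjust or drop if the release is customized):
+kubectl -n kube-system rollout status deployment/ebs-csi-controller --timeout=120s || true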
+# install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ +export KARPENTER_VERSION=v0.29.2 +export AWS_PARTITION="aws" +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" +export TEMPOUT=$(mktemp) +curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$TEMPOUT && + aws cloudformation deploy \ + --stack-name "Karpenter-${CLUSTER_NAME}" \ + --template-file "${TEMPOUT}" \ + --capabilities CAPABILITY_NAMED_IAM \ + --parameter-overrides "ClusterName=${CLUSTER_NAME}" + +eksctl create cluster -f - < Date: Tue, 8 Aug 2023 01:23:33 -0700 Subject: [PATCH 03/21] adding infra via eksctl --- .gitignore | 3 +- install/kubernetes/eks-cluster.yaml.tpl | 97 +++++++ install/kubernetes/karpenter-provisioner.yaml | 69 +++++ install/scripts/aws-down.sh | 22 ++ install/scripts/aws-up.sh | 136 +++------ install/terraform/aws/backend.tf | 7 - install/terraform/aws/common.tf | 27 -- install/terraform/aws/eks_cluster.tf | 262 ------------------ install/terraform/aws/irsa_iam_roles.tf | 109 -------- install/terraform/aws/outputs.tf | 11 - install/terraform/aws/providers.tf | 15 - install/terraform/aws/variables.tf | 59 ---- install/terraform/aws/vpc.tf | 116 -------- internal/awsmanager/manager.go | 5 + 14 files changed, 232 insertions(+), 706 deletions(-) create mode 100644 install/kubernetes/eks-cluster.yaml.tpl create mode 100644 install/kubernetes/karpenter-provisioner.yaml mode change 100644 => 100755 install/scripts/aws-down.sh mode change 100644 => 100755 install/scripts/aws-up.sh delete mode 100644 install/terraform/aws/backend.tf delete mode 100644 install/terraform/aws/common.tf delete mode 100644 install/terraform/aws/eks_cluster.tf delete mode 100644 install/terraform/aws/irsa_iam_roles.tf delete mode 100644 install/terraform/aws/outputs.tf delete mode 100644 install/terraform/aws/providers.tf delete mode 100644 install/terraform/aws/variables.tf delete mode 100644 install/terraform/aws/vpc.tf create mode 100644 internal/awsmanager/manager.go diff --git a/.gitignore b/.gitignore index 06f85911..b2ee0d14 100644 --- a/.gitignore +++ b/.gitignore @@ -77,4 +77,5 @@ gcpmanager-dependencies.yaml skaffold-dependencies.sh .ipynb_checkpoints -.vscode/ \ No newline at end of file +.vscode/ +eks-cluster.yaml diff --git a/install/kubernetes/eks-cluster.yaml.tpl b/install/kubernetes/eks-cluster.yaml.tpl new file mode 100644 index 00000000..aba1b0fd --- /dev/null +++ b/install/kubernetes/eks-cluster.yaml.tpl @@ -0,0 +1,97 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: substratus + region: us-west-2 + version: "1.27" + tags: + createdBy: eksctl + environment: dev + karpenter.sh/discovery: substratus + +karpenter: + createServiceAccount: true + withSpotInterruptionQueue: true + defaultInstanceProfile: "KarpenterNodeInstanceProfile-substratus" + version: "v0.29.0" + +# TODO(bjb): do we need mngs with karpenter? 
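+# Note: the ${...} placeholders in this template (AWS_PARTITION, AWS_ACCOUNT_ID,
+# CLUSTER_NAME, ARTIFACTS_REPO_NAME, ARTIFACTS_BUCKET_NAME) are rendered by the
+# install scripts (aws-up.sh / aws-down.sh) before eksctl consumes the file, roughly:
+#   envsubst <eks-cluster.yaml.tpl >eks-cluster.yaml && eksctl create cluster -f eks-cluster.yaml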
+# if karpenter doesn't suffice: https://github.com/eksctl-io/eksctl/blob/main/examples/23-kubeflow-spot-instance.yaml +managedNodeGroups: + - name: builder-ng + privateNetworking: true + labels: { role: builders } + instanceTypes: + - m6a.large + volumeSize: 100 + minSize: 0 + maxSize: 3 + desiredCapacity: 1 + iam: + withAddonPolicies: + ebs: true + imageBuilder: true +addons: + - name: vpc-cni + attachPolicyARNs: + - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy + - name: kube-proxy + - name: aws-ebs-csi-driver + wellKnownPolicies: + ebsCSIController: true + - name: coredns + +iamIdentityMappings: + - arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}" + username: system:node:{{EC2PrivateDNSName}} + groups: + - system:bootstrappers + - system:nodes + +iam: + withOIDC: true + serviceAccounts: + - metadata: + name: karpenter + namespace: karpenter + roleName: ${CLUSTER_NAME}-karpenter + attachPolicyARNs: + - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME} + roleOnly: true + - metadata: + name: ebs-csi-controller-sa + namespace: kube-system + wellKnownPolicies: + ebsCSIController: true + - metadata: + name: substratus + namespace: substratus + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "ecr:*" + Resource: + - "arn:aws:ecr:::${ARTIFACTS_REPO_NAME}" + - Effect: Allow + Action: + - "s3:*" + - "s3-object-lambda:*" + Resource: + - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*" + - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}" + - metadata: + name: aws-manager + namespace: substratus + attachPolicy: + # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "s3:PutObject" + - "s3:GetObject" + Resource: + - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*" + - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}" diff --git a/install/kubernetes/karpenter-provisioner.yaml b/install/kubernetes/karpenter-provisioner.yaml new file mode 100644 index 00000000..3bc5d391 --- /dev/null +++ b/install/kubernetes/karpenter-provisioner.yaml @@ -0,0 +1,69 @@ +# https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: gpu +spec: + provider: + instanceProfile: eksctl-KarpenterNodeInstanceProfile-substratus + subnetSelector: + karpenter.sh/discovery: substratus + securityGroupSelector: + karpenter.sh/discovery: substratus + ttlSecondsAfterEmpty: 30 + consolidation: + enabled: true + taints: + - key: nvidia.com/gpu + value: "true" + effect: NoSchedule + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + - key: node.kubernetes.io/instance-type + operator: In + values: + # aws ec2 describe-instance-types --region us-west-2 --query "InstanceTypes[?GpuInfo!=null].InstanceType" --output json | jq -r '.[]' | sort | grep -v dl1 | grep -v inf | grep -v p5 | grep -v trn1 | awk '{print "\""$1"\","}' + [ + "g2.2xlarge", + "g2.8xlarge", + "g3.16xlarge", + "g3.4xlarge", + "g3.8xlarge", + "g3s.xlarge", + "g4ad.16xlarge", + "g4ad.2xlarge", + "g4ad.4xlarge", + "g4ad.8xlarge", + "g4ad.xlarge", + "g4dn.12xlarge", + "g4dn.16xlarge", + "g4dn.2xlarge", + "g4dn.4xlarge", + "g4dn.8xlarge", + "g4dn.metal", + "g4dn.xlarge", + "g5.12xlarge", + "g5.16xlarge", + "g5.24xlarge", + "g5.2xlarge", + "g5.48xlarge", + "g5.4xlarge", + "g5.8xlarge", + "g5.xlarge", + "g5g.16xlarge", + "g5g.2xlarge", + "g5g.4xlarge", + "g5g.8xlarge", + "g5g.metal", + 
"g5g.xlarge", + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.16xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3dn.24xlarge", + "p4d.24xlarge", + ] diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh old mode 100644 new mode 100755 index 8b137891..d4372572 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -1 +1,23 @@ +#!/bin/bash +set -e +set -u + +# Required env variables: +# : "$TOKEN $PROJECT" + +export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 +export CLUSTER_NAME=substratus +export REGION=us-west-2 +export ARTIFACTS_REPO_NAME=substratus +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" +export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts + +aws s3 rb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} || true +aws ecr delete-repository --repository-name ${ARTIFACTS_REPO_NAME} || true + +aws cloudformation delete-stack \ + --stack-name "Karpenter-${CLUSTER_NAME}" || true + +envsubst <../kubernetes/eks-cluster.yaml.tpl >../kubernetes/eks-cluster.yaml +eksctl delete cluster -f ../kubernetes/eks-cluster.yaml diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh old mode 100644 new mode 100755 index b620cddc..425a01cb --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -4,68 +4,25 @@ set -e set -u # Required env variables: -: "$TOKEN $PROJECT" +# : "$TOKEN $PROJECT" -# Used by gcloud: -# TODO(bjb): pass AWS creds into script -export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN} -# Used by terraform: -export GOOGLE_OAUTH_ACCESS_TOKEN=${TOKEN} +# # TODO(bjb): pass AWS creds into script +# export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN} -INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" -AUTO_APPROVE="${AUTO_APPROVE:-no}" - -# Create terraform state bucket if one does not exist. -# TODO(bjb): establish a bucket - -# Apply infrastructure. -cd terraform/aws - -# Backend variables cannot be configured via env variables. -echo "bucket = \"${TF_BUCKET}\"" >>backend.tfvars -terraform init --backend-config=backend.tfvars - -export TF_VAR_project_id=${PROJECT} -if [ "${AUTO_APPROVE}" == "yes" ]; then - terraform apply -auto-approve -else - terraform apply -fi -CLUSTER_NAME=$(terraform output --json cluster | jq -r '.name') -CLUSTER_REGION=$(terraform output --json cluster | jq -r '.region') -CLUSTER_ENDPOINT=$(terraform output --json cluster | jq -r '.endpoint') -LOAD_BALANCER_CONTROLLER_ROLE_NAME=$(terraform output --json irsas | jq -r '.load_balancer_controller_irsa_role.iam_role_name') - -cd - - -# Configure kubectl. 
-aws eks --region ${CLUSTER_REGION} update-kubeconfig --name ${CLUSTER_NAME} -# Install cluster-level components - -# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler -helm repo add eks https://aws.github.io/eks-charts -helm upgrade \ - --install aws-node-termination-handler \ - --namespace kube-system \ - --version 0.21.0 \ - eks/aws-node-termination-handler - -# install EBS snapshotter?: https://github.com/kubernetes-csi/external-snapshotter#usage +# INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" +export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 +export CLUSTER_NAME=substratus +export REGION=us-west-2 +export ARTIFACTS_REPO_NAME=substratus +export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" +export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts -# TODO(bjb): may not be needed if we can resolve 401 to 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/ -# install aws-ebs-csi-driver: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md -helm repo add aws-ebs-csi-driver https://kubernetes-sigs.github.io/aws-ebs-csi-driver -helm repo update -helm upgrade \ - --install aws-ebs-csi-driver \ - --namespace kube-system \ - aws-ebs-csi-driver/aws-ebs-csi-driver +aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} || true +aws ecr create-repository --repository-name ${ARTIFACTS_REPO_NAME} || true -# TODO(bjb): is this needed? Is doing the work here preferred to doing it in terraform? # install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ export KARPENTER_VERSION=v0.29.2 export AWS_PARTITION="aws" -export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" export TEMPOUT=$(mktemp) curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$TEMPOUT && aws cloudformation deploy \ @@ -74,54 +31,35 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION} --capabilities CAPABILITY_NAMED_IAM \ --parameter-overrides "ClusterName=${CLUSTER_NAME}" -eksctl create cluster -f - <../kubernetes/eks-cluster.yaml +eksctl create cluster -f ../kubernetes/eks-cluster.yaml || eksctl upgrade cluster -f ../kubernetes/eks-cluster.yaml export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" -echo $CLUSTER_ENDPOINT $KARPENTER_IAM_ROLE_ARN aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true +aws eks --region ${REGION} update-kubeconfig --name ${CLUSTER_NAME} +# Logout of helm registry to perform an unauthenticated pull against the public ECR +helm registry logout public.ecr.aws || true + +helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version ${KARPENTER_VERSION} --namespace karpenter --create-namespace \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \ + --set settings.aws.clusterName=${CLUSTER_NAME} \ + --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ + --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \ + --set controller.resources.requests.cpu=1 \ + --set controller.resources.requests.memory=1Gi \ + --set controller.resources.limits.cpu=1 \ + --set controller.resources.limits.memory=1Gi \ + --wait + +kubectl apply -f ../kubernetes/karpenter-provisioner.yaml -# install the load balancer controller: 
https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html -helm install aws-load-balancer-controller eks/aws-load-balancer-controller \ - -n kube-system \ - --set clusterName=${CLUSTER_NAME} \ - --set serviceAccount.create=false \ - --set serviceAccount.name=${LOAD_BALANCER_CONTROLLER_ROLE_NAME} +# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler +helm repo add eks https://aws.github.io/eks-charts +helm upgrade \ + --install aws-node-termination-handler \ + --namespace kube-system \ + --version 0.21.0 \ + eks/aws-node-termination-handler # Install the substratus operator. # if [ "${INSTALL_OPERATOR}" == "yes" ]; then diff --git a/install/terraform/aws/backend.tf b/install/terraform/aws/backend.tf deleted file mode 100644 index 91722a40..00000000 --- a/install/terraform/aws/backend.tf +++ /dev/null @@ -1,7 +0,0 @@ -terraform { - backend "s3" { - # bucket = "243019462621-terraform-state" - # key = "primary/us-west-2/substratus/terraform.tfstate" - # region = "us-west-2" - } -} diff --git a/install/terraform/aws/common.tf b/install/terraform/aws/common.tf deleted file mode 100644 index d73dc747..00000000 --- a/install/terraform/aws/common.tf +++ /dev/null @@ -1,27 +0,0 @@ -locals { - # passed to cluster.tf - vpc = { - id = var.existing_vpc == null ? module.vpc[0].vpc_id : var.existing_vpc.id - private_subnet_ids = var.existing_vpc == null ? module.vpc[0].private_subnets : var.existing_vpc.private_subnet_ids - intra_subnet_ids = var.existing_vpc == null ? module.vpc[0].intra_subnets : var.existing_vpc.intra_subnet_ids - endpoints = var.existing_vpc == null ? module.endpoints[0] : null - } - - # passed to substratus_irsa_iam_roles.tf and eks_irsa_iam_roles.tf - eks_cluster = { - name = local.create_cluster == 1 ? module.eks[0].cluster_name : var.existing_eks_cluster.name - oidc_provider_arn = local.create_cluster == 1 ? module.eks[0].oidc_provider_arn : var.existing_eks_cluster.oidc_provider_arn - managed_node_groups = local.create_cluster == 1 ? module.eks[0].eks_managed_node_groups : null - certificate_authority_data = local.create_cluster == 1 ? module.eks[0].cluster_certificate_authority_data : "" - endpoint = local.create_cluster == 1 ? module.eks[0].cluster_endpoint : "" - region = var.region - } - - irsa_outputs = { - ebs_csi_irsa_role = local.create_cluster == 1 ? module.ebs_csi_irsa_role[0] : {} - load_balancer_controller_irsa_role = local.create_cluster == 1 ? module.load_balancer_controller_irsa_role[0] : {} - node_termination_handler_irsa_role = local.create_cluster == 1 ? module.node_termination_handler_irsa_role[0] : {} - substratus_irsa = local.create_cluster == 1 ? module.substratus_irsa[0] : {} - vpc_cni_ipv4_irsa_role = local.create_cluster == 1 ? module.vpc_cni_ipv4_irsa_role[0] : {} - } -} diff --git a/install/terraform/aws/eks_cluster.tf b/install/terraform/aws/eks_cluster.tf deleted file mode 100644 index d10e45e2..00000000 --- a/install/terraform/aws/eks_cluster.tf +++ /dev/null @@ -1,262 +0,0 @@ -locals { - create_cluster = var.existing_eks_cluster == null ? 1 : 0 - # We need to lookup K8s taint effect from the AWS API value - taint_effects = { - NO_SCHEDULE = "NoSchedule" - NO_EXECUTE = "NoExecute" - PREFER_NO_SCHEDULE = "PreferNoSchedule" - } - - # The following locals are used to configure tags for the EKS cluster's Auto - # Scaling Groups managed by the cluster autoscaler. 
- - # `cluster_autoscaler_label_tags` contains the tags related to the Kubernetes - # labels applied to the nodes in the cluster's managed node groups. - # Each tag has a key formed from the node group's name and label name, and a - # value containing the autoscaling group's name, the corresponding - # Kubernetes label key, and its value. These tags are used by the cluster - # autoscaler to determine how nodes should be scaled based on their labels. - cluster_autoscaler_label_tags = local.eks_cluster.managed_node_groups != null ? merge([ - for name, group in local.eks_cluster.managed_node_groups : { - for label_name, label_value in coalesce(group.node_group_labels, {}) : "${name}|label|${label_name}" => { - autoscaling_group = group.node_group_autoscaling_group_names[0], - key = "k8s.io/cluster-autoscaler/node-template/label/${label_name}", - value = label_value, - } - } - ]...) : {} - - # `cluster_autoscaler_taint_tags` contains tags related to the Kubernetes - # taints applied to the nodes in the cluster's managed node groups. - # Each tag's key includes the node group's name and taint key, and its value - # contains information about the taint, such as its value and effect. - # These tags allow the cluster autoscaler to respect the taints when scaling nodes. - cluster_autoscaler_taint_tags = local.eks_cluster.managed_node_groups != null ? merge([ - for name, group in local.eks_cluster.managed_node_groups : { - for taint in coalesce(group.node_group_taints, []) : "${name}|taint|${taint.key}" => { - autoscaling_group = group.node_group_autoscaling_group_names[0], - key = "k8s.io/cluster-autoscaler/node-template/taint/${taint.key}" - value = "${taint.value}:${local.taint_effects[taint.effect]}" - } - } - ]...) : {} - - # `cluster_autoscaler_asg_tags` combines the above label and taint tags into a - # single map, which is then used to create the actual tags on the AWS ASGs - # through the `aws_autoscaling_group_tag` resource. The tags are only applied - # if `existing_eks_cluster` is `null`, ensuring they are only created for new - # clusters. - cluster_autoscaler_asg_tags = merge( - local.cluster_autoscaler_label_tags, - local.cluster_autoscaler_taint_tags - ) -} - -data "aws_ec2_instance_types" "gpu" { - filter { - name = "instance-type" - # from: aws ec2 describe-instance-types --region us-west-2 --query "InstanceTypes[?GpuInfo!=null].InstanceType" --output json | jq -r '.[]' | awk -F. 
'{print "\"" $1 ".*\","}' | uniq - # non-CUDA supported types added and commented out for now though these have accelerators of some kind - values = [ - # "dl1.*", # no CUDA support - # "inf1.*" # no CUDA support - # "inf2.*" # no CUDA support - "g2.*", - "g3.*", - "g3s.*", - "g4ad.*", - "g4dn.*", - "g5.*", - # "g5g.*", exclude g5g as these are ARM machines - "p2.*", - "p3.*", - "p3dn.*", - "p4d.*", - # "p5.*", # no CUDA support - # "trn1.*", # no CUDA support - # "trn1n32.*", # no CUDA support - ] - } -} - -data "aws_ami" "eks_default" { - most_recent = true - owners = ["amazon"] - - filter { - name = "name" - values = ["amazon-eks-node-${var.cluster_version}-v*"] - } - filter { - name = "architecture" - values = ["x86_64"] - } -} - -data "aws_ami" "deep_learning" { - most_recent = true - owners = ["amazon"] - - filter { - name = "name" - # they don't produce images on any Ubuntu OS newer than this :shrug: - values = ["Deep Learning AMI (Ubuntu 18.04) Version ??.?"] - } - filter { - name = "architecture" - values = ["x86_64"] - } - - filter { - name = "state" - values = ["available"] - } -} - -module "eks" { - count = local.create_cluster - source = "terraform-aws-modules/eks/aws" - version = "19.16.0" - cluster_name = var.name_prefix - cluster_version = var.cluster_version - cluster_endpoint_public_access = true - cluster_ip_family = "ipv4" - vpc_id = local.vpc.id - subnet_ids = local.vpc.private_subnet_ids - control_plane_subnet_ids = local.vpc.intra_subnet_ids - manage_aws_auth_configmap = true - aws_auth_roles = [ - # We need to add in the Karpenter node IAM role for nodes launched by Karpenter - { - rolearn = module.karpenter[0].role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - }, - ] - - eks_managed_node_group_defaults = { - # We are using the IRSA created below for permissions - # However, we have to deploy with the policy attached FIRST (when creating a fresh cluster) - # and then turn this off after the cluster/node group is created. 
Without this initial policy, - # the VPC CNI fails to assign IPs and nodes cannot join the cluster - # See https://github.com/aws/containers-roadmap/issues/1666 for more context - iam_role_attach_cni_policy = true - subnet_ids = local.vpc.private_subnet_ids - labels = var.labels - ebs_optimized = true - disable_api_termination = false - enable_monitoring = true - use_custom_launch_template = false - force_update_version = true - } - - eks_managed_node_groups = { - builder = { - # By default, the module creates a launch template to ensure tags are propagated to instances, etc., - # so we need to disable it to use the default template provided by the AWS EKS managed node group service - name_prefix = "container-builder" - ami_id = data.aws_ami.eks_default.image_id - disk_size = 100 - min_size = 1 - max_size = 3 - desired_size = 1 - instance_types = [ - "t3a.large" - ] - capacity_type = "SPOT" - local_storage_types = ["ssd"] - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - iops = 3000 - throughput = 150 - encrypted = true - delete_on_termination = true - } - } - } - } - - gpu = { - name_prefix = "gpu" - description = "GPU node launch template" - min_size = 0 - max_size = 32 - desired_size = 0 - - ami_id = data.aws_ami.deep_learning.image_id - capacity_type = "SPOT" - instance_types = sort(data.aws_ec2_instance_types.gpu.instance_types) - - update_config = { - max_unavailable_percentage = 100 - } - - local_storage_types = ["ssd"] - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - iops = 3000 - throughput = 150 - encrypted = true - delete_on_termination = true - } - } - } - - metadata_options = { - http_endpoint = "enabled" - http_tokens = "required" - instance_metadata_tags = "disabled" - } - - create_iam_role = true - iam_role_name = "eks-managed-gpu-node-group" - iam_role_use_name_prefix = false - iam_role_description = "EKS managed GPU node group" - iam_role_tags = { - Purpose = "Protector of the kubelet" - } - iam_role_additional_policies = { - AmazonEC2ContainerRegistryReadOnly = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" - } - } - } - tags = merge(var.tags, { - # this same tag should exist on a single security group that karpenter will use - "karpenter.sh/discovery" = var.name_prefix - }) -} - -# ASG tags are needed for the cluster to work with the labels and taints of the -# node groups -resource "aws_autoscaling_group_tag" "cluster_autoscaler_label_tags" { - for_each = var.existing_eks_cluster == null ? 
local.cluster_autoscaler_asg_tags : {} - autoscaling_group_name = each.value.autoscaling_group - - tag { - key = each.value.key - value = each.value.value - propagate_at_launch = false - } -} - -module "karpenter" { - count = local.create_cluster - source = "terraform-aws-modules/eks/aws//modules/karpenter" - cluster_name = module.eks[0].cluster_name - irsa_oidc_provider_arn = module.eks[0].oidc_provider_arn - policies = { - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - tags = var.tags -} diff --git a/install/terraform/aws/irsa_iam_roles.tf b/install/terraform/aws/irsa_iam_roles.tf deleted file mode 100644 index eacc4cb5..00000000 --- a/install/terraform/aws/irsa_iam_roles.tf +++ /dev/null @@ -1,109 +0,0 @@ -data "aws_iam_policy" "eks_cni_policy" { - name = "AmazonEKS_CNI_Policy" -} - -data "aws_iam_policy" "iam_full_access" { - name = "IAMFullAccess" -} - -data "aws_iam_policy" "container_registry_full_access" { - name = "AmazonEC2ContainerRegistryFullAccess" -} - -data "aws_iam_policy" "s3_full_access" { - name = "AmazonS3FullAccess" -} - -module "substratus_irsa" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - role_name_prefix = "${var.name_prefix}-substratus-" - role_policy_arns = { - IAMFullAccess = data.aws_iam_policy.iam_full_access.arn - AmazonEC2ContainerRegistryFullAccess = data.aws_iam_policy.container_registry_full_access.arn - AmazonS3FullAccess = data.aws_iam_policy.s3_full_access.arn - } - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["substratus:substratus"] - } - } - - tags = var.tags -} - -module "ebs_csi_irsa_role" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - - role_name_prefix = "ebs-csi" - attach_ebs_csi_policy = true - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] - } - } - - tags = var.tags -} - -module "load_balancer_controller_irsa_role" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - - role_name_prefix = "load-balancer-controller" - attach_load_balancer_controller_policy = true - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["kube-system:aws-load-balancer-controller"] - } - } - - tags = var.tags -} - -module "node_termination_handler_irsa_role" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - - role_name_prefix = "node-termination-handler" - attach_node_termination_handler_policy = true - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["kube-system:aws-node"] - } - } - - tags = var.tags -} - -module "vpc_cni_ipv4_irsa_role" { - count = local.create_cluster - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.28" - - role_name_prefix = "vpc-cni-ipv4" - attach_vpc_cni_policy = true - vpc_cni_enable_ipv4 = true - - oidc_providers = { - main = { - provider_arn = local.eks_cluster.oidc_provider_arn - namespace_service_accounts = ["kube-system:aws-node"] - } - } - - tags = var.tags -} 
diff --git a/install/terraform/aws/outputs.tf b/install/terraform/aws/outputs.tf deleted file mode 100644 index 9df6cfb1..00000000 --- a/install/terraform/aws/outputs.tf +++ /dev/null @@ -1,11 +0,0 @@ -output "cluster" { - value = local.eks_cluster -} - -output "vpc" { - value = local.vpc -} - -output "irsas" { - value = local.irsa_outputs -} diff --git a/install/terraform/aws/providers.tf b/install/terraform/aws/providers.tf deleted file mode 100644 index 808daadc..00000000 --- a/install/terraform/aws/providers.tf +++ /dev/null @@ -1,15 +0,0 @@ -provider "aws" { - region = var.region -} - -provider "kubernetes" { - host = local.eks_cluster.endpoint - cluster_ca_certificate = base64decode(local.eks_cluster.certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", local.eks_cluster.name] - } -} diff --git a/install/terraform/aws/variables.tf b/install/terraform/aws/variables.tf deleted file mode 100644 index 15404b60..00000000 --- a/install/terraform/aws/variables.tf +++ /dev/null @@ -1,59 +0,0 @@ -variable "cluster_version" { - description = "The version of the EKS cluster to deploy (i.e., this is used when var.existing_eks_cluster is null)" - type = string - default = "1.27" -} - -variable "existing_eks_cluster" { - description = "An existing EKS cluster to add substratus components to." - type = object({ - name = string - oidc_provider_arn = string - }) - default = null -} - -variable "existing_vpc" { - description = "An existing VPC to add substratus components to." - type = object({ - id = string - private_subnet_ids = list(string) - intra_subnet_ids = list(string) - }) - default = null -} - -variable "labels" { - type = map(string) - default = { - GithubRepo = "substratus" - GithubOrg = "substratusai" - } -} - -variable "name_prefix" { - description = "Prefix to use for resources" - type = string - default = "substratus-usw2" -} - -variable "region" { - description = "AWS region" - type = string - default = "us-west-2" -} - -# will remove this before pushing to substratus repo -variable "tags" { - type = map(string) - default = { - GithubRepo = "infrastructure" - GithubOrg = "substratusai" - } -} - -variable "vpc_cidr" { - description = "The cidr block of the VPC if created by the module (e.g., used when var.existing_vpc is null)" - type = string - default = "10.0.0.0/16" -} diff --git a/install/terraform/aws/vpc.tf b/install/terraform/aws/vpc.tf deleted file mode 100644 index 32792ccb..00000000 --- a/install/terraform/aws/vpc.tf +++ /dev/null @@ -1,116 +0,0 @@ -data "aws_availability_zones" "available" {} - -locals { - azs = slice(data.aws_availability_zones.available.names, 0, 3) - create_vpc = var.existing_vpc == null ? 
1 : 0 -} - -module "vpc" { - count = local.create_vpc - source = "terraform-aws-modules/vpc/aws" - version = "5.1.1" - name = var.name_prefix - cidr = var.vpc_cidr - azs = local.azs - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k)] - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k + 4)] - intra_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 6, k + 20)] - - public_subnet_ipv6_prefixes = [0, 1, 2] - public_subnet_assign_ipv6_address_on_creation = true - private_subnet_ipv6_prefixes = [3, 4, 5] - private_subnet_assign_ipv6_address_on_creation = true - intra_subnet_ipv6_prefixes = [6, 7, 8] - intra_subnet_assign_ipv6_address_on_creation = true - - public_subnet_tags = { - "kubernetes.io/role/elb" = 1 - } - - private_subnet_tags = { - "kubernetes.io/role/internal-elb" = 1 - } - - create_database_subnet_group = false - manage_default_network_acl = false - manage_default_route_table = false - manage_default_security_group = false - - enable_dns_hostnames = true - enable_dns_support = true - enable_nat_gateway = true - single_nat_gateway = true - enable_ipv6 = true - create_egress_only_igw = true - enable_vpn_gateway = false - enable_dhcp_options = false - - # VPC Flow Logs (Cloudwatch log group and IAM role will be created) - enable_flow_log = false - create_flow_log_cloudwatch_log_group = true - create_flow_log_cloudwatch_iam_role = true - flow_log_max_aggregation_interval = 60 - tags = var.tags -} - - -# VPC Endpoints Module - -module "endpoints" { - count = local.create_vpc - source = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints" - version = "5.1.1" - vpc_id = module.vpc[0].vpc_id - create_security_group = true - security_group_name_prefix = "${var.name_prefix}-endpoints-" - security_group_description = "VPC endpoint security group" - security_group_rules = { - ingress_https = { - description = "HTTPS from VPC" - cidr_blocks = [module.vpc[0].vpc_cidr_block] - } - } - - endpoints = { - s3 = { - service = "s3" - tags = { Name = "s3-vpc-endpoint" } - }, - ecr_api = { - service = "ecr.api" - private_dns_enabled = true - subnet_ids = module.vpc[0].private_subnets - policy = data.aws_iam_policy_document.generic_endpoint_policy[0].json - }, - ecr_dkr = { - service = "ecr.dkr" - private_dns_enabled = true - subnet_ids = module.vpc[0].private_subnets - policy = data.aws_iam_policy_document.generic_endpoint_policy[0].json - }, - } - - tags = merge(var.tags, { - Endpoint = "true" - }) -} - -data "aws_iam_policy_document" "generic_endpoint_policy" { - count = local.create_vpc - statement { - effect = "Deny" - actions = ["*"] - resources = ["*"] - - principals { - type = "*" - identifiers = ["*"] - } - - condition { - test = "StringNotEquals" - variable = "aws:SourceVpc" - values = [module.vpc[0].vpc_id] - } - } -} diff --git a/internal/awsmanager/manager.go b/internal/awsmanager/manager.go new file mode 100644 index 00000000..7a877604 --- /dev/null +++ b/internal/awsmanager/manager.go @@ -0,0 +1,5 @@ +// Package gcp provides an AWS implementation of the Substratus Cloud Interface (SCI) +package awsmanager + +// examples: https://docs.aws.amazon.com/AmazonS3/latest/userguide/example_s3_Scenario_PresignedUrl_section.html +// Checking object integrity: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html From c10bb4606d24a187c475ffaaa862d2bc694e1ed4 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 01:59:43 -0700 Subject: [PATCH 04/21] updated dockerfile to install eksctl and work with 
common architectures --- install/Dockerfile | 58 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/install/Dockerfile b/install/Dockerfile index 731a7fc7..8d3c0b2d 100644 --- a/install/Dockerfile +++ b/install/Dockerfile @@ -1,36 +1,66 @@ FROM ubuntu:23.04 WORKDIR /workspace +# Determine platform and architecture +RUN ARCH=$(uname -m) && \ + PLATFORM=$(uname -s | tr '[:upper:]' '[:lower:]') && \ + if [ "$ARCH" = "aarch64" ]; then \ + echo "AWSCLI_ARCH=aarch64" >> /etc/environment; \ + echo "TERRAFORM_ARCH=arm64" >> /etc/environment; \ + echo "PLATFORM_ARCH=${PLATFORM}_arm64" >> /etc/environment; \ + elif [ "$ARCH" = "x86_64" ]; then \ + echo "AWSCLI_ARCH=x86_64" >> /etc/environment; \ + echo "TERRAFORM_ARCH=amd64" >> /etc/environment; \ + echo "PLATFORM_ARCH=${PLATFORM}_amd64" >> /etc/environment; \ + else \ + echo "Unsupported architecture"; \ + exit 1; \ + fi + +# Source the environment file so that the variable is available in the current shell +SHELL ["/bin/bash", "-c"] +RUN source /etc/environment # Common -RUN apt-get update && \ +RUN DEBIAN_FRONTEND="noninteractive" \ + apt-get update && \ apt-get install -y \ gnupg \ software-properties-common \ unzip \ wget \ curl \ - git + git \ + tzdata \ + keyboard-configuration + +# AWS CLI +RUN source /etc/environment && \ + curl "https://awscli.amazonaws.com/awscli-exe-linux-${AWSCLI_ARCH}.zip" -o "awscliv2.zip" && \ + unzip awscliv2.zip && \ + ./aws/install + +# eksctl +RUN source /etc/environment && \ + curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_${PLATFORM_ARCH}.tar.gz" && \ + tar -xzf eksctl_${PLATFORM_ARCH}.tar.gz -C /tmp && rm eksctl_${PLATFORM_ARCH}.tar.gz && \ + mv /tmp/eksctl /usr/local/bin # Terraform -RUN wget https://releases.hashicorp.com/terraform/1.4.5/terraform_1.4.5_linux_amd64.zip -RUN unzip terraform_1.4.5_linux_amd64.zip -RUN mv terraform /usr/local/bin/ -RUN terraform --version +RUN source /etc/environment && \ + wget https://releases.hashicorp.com/terraform/1.4.5/terraform_1.4.5_linux_${TERRAFORM_ARCH}.zip && \ + unzip terraform_1.4.5_linux_${TERRAFORM_ARCH}.zip && \ + mv terraform /usr/local/bin/ && \ + terraform --version # Google Cloud (gcloud) -RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz -RUN mkdir -p /usr/local/gcloud \ +RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz && \ + mkdir -p /usr/local/gcloud \ && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ && /usr/local/gcloud/google-cloud-sdk/install.sh ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin RUN gcloud --version -RUN gcloud components install gke-gcloud-auth-plugin - -# Kubectl -RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl -RUN chmod +x ./kubectl -RUN mv ./kubectl /usr/local/bin +RUN gcloud components install gke-gcloud-auth-plugin kubectl # Helm RUN curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 From 03a444dd9943a31ad88e59d3e79392454afa08b4 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 02:07:02 -0700 Subject: [PATCH 05/21] added a karpenter AWSNodeTemplate --- .gitignore | 1 + ...ioner.yaml => karpenter-provisioner.yaml.tpl} | 16 +++++++++++++--- install/scripts/aws-up.sh | 1 + 3 files changed, 15 insertions(+), 3 
deletions(-) rename install/kubernetes/{karpenter-provisioner.yaml => karpenter-provisioner.yaml.tpl} (81%) diff --git a/.gitignore b/.gitignore index b2ee0d14..e9d5ddaa 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,4 @@ skaffold-dependencies.sh .ipynb_checkpoints .vscode/ eks-cluster.yaml +karpenter-provisioner.yaml diff --git a/install/kubernetes/karpenter-provisioner.yaml b/install/kubernetes/karpenter-provisioner.yaml.tpl similarity index 81% rename from install/kubernetes/karpenter-provisioner.yaml rename to install/kubernetes/karpenter-provisioner.yaml.tpl index 3bc5d391..1fafec36 100644 --- a/install/kubernetes/karpenter-provisioner.yaml +++ b/install/kubernetes/karpenter-provisioner.yaml.tpl @@ -1,3 +1,13 @@ +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: default +spec: + subnetSelector: + karpenter.sh/discovery: ${CLUSTER_NAME} + securityGroupSelector: + karpenter.sh/discovery: ${CLUSTER_NAME} +--- # https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ apiVersion: karpenter.sh/v1alpha5 kind: Provisioner @@ -5,11 +15,11 @@ metadata: name: gpu spec: provider: - instanceProfile: eksctl-KarpenterNodeInstanceProfile-substratus + instanceProfile: eksctl-KarpenterNodeInstanceProfile-${CLUSTER_NAME} subnetSelector: - karpenter.sh/discovery: substratus + karpenter.sh/discovery: ${CLUSTER_NAME} securityGroupSelector: - karpenter.sh/discovery: substratus + karpenter.sh/discovery: ${CLUSTER_NAME} ttlSecondsAfterEmpty: 30 consolidation: enabled: true diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index 425a01cb..6d22f456 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -51,6 +51,7 @@ helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --vers --set controller.resources.limits.memory=1Gi \ --wait +envsubst <../kubernetes/karpenter-provisioner.yaml.tpl >../kubernetes/karpenter-provisioner.yaml.yaml kubectl apply -f ../kubernetes/karpenter-provisioner.yaml # node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler From 83d2ef0184da026a909f7ce8f2cc2adce0c8771a Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 13:45:12 -0700 Subject: [PATCH 06/21] working with dirs relative to scripts --- install/kubernetes/eks-cluster.yaml.tpl | 2 +- install/scripts/aws-down.sh | 11 ++++++----- install/scripts/aws-up.sh | 19 ++++++++++--------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/install/kubernetes/eks-cluster.yaml.tpl b/install/kubernetes/eks-cluster.yaml.tpl index aba1b0fd..9c349cae 100644 --- a/install/kubernetes/eks-cluster.yaml.tpl +++ b/install/kubernetes/eks-cluster.yaml.tpl @@ -26,7 +26,7 @@ managedNodeGroups: volumeSize: 100 minSize: 0 maxSize: 3 - desiredCapacity: 1 + desiredCapacity: 2 iam: withAddonPolicies: ebs: true diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index d4372572..b776f377 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -5,6 +5,8 @@ set -u # Required env variables: # : "$TOKEN $PROJECT" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +KUBERENTES_DIR=${SCRIPT_DIR}/../kubernetes export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 export CLUSTER_NAME=substratus @@ -13,11 +15,10 @@ export ARTIFACTS_REPO_NAME=substratus export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts -aws s3 rb s3://${ARTIFACTS_BUCKET_NAME} --region 
${REGION} || true -aws ecr delete-repository --repository-name ${ARTIFACTS_REPO_NAME} || true - +aws s3 rb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} >/dev/null || true +aws ecr delete-repository --repository-name ${ARTIFACTS_REPO_NAME} >/dev/null || true aws cloudformation delete-stack \ --stack-name "Karpenter-${CLUSTER_NAME}" || true -envsubst <../kubernetes/eks-cluster.yaml.tpl >../kubernetes/eks-cluster.yaml -eksctl delete cluster -f ../kubernetes/eks-cluster.yaml +envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml +eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index 6d22f456..424c4eeb 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -8,7 +8,8 @@ set -u # # TODO(bjb): pass AWS creds into script # export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN} - +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +KUBERENTES_DIR=${SCRIPT_DIR}/../kubernetes # INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 export CLUSTER_NAME=substratus @@ -17,9 +18,8 @@ export ARTIFACTS_REPO_NAME=substratus export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts -aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} || true -aws ecr create-repository --repository-name ${ARTIFACTS_REPO_NAME} || true - +aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} >/dev/null || true +aws ecr create-repository --repository-name ${ARTIFACTS_REPO_NAME} --region ${REGION} >/dev/null || true # install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ export KARPENTER_VERSION=v0.29.2 export AWS_PARTITION="aws" @@ -29,10 +29,11 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION} --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ --capabilities CAPABILITY_NAMED_IAM \ - --parameter-overrides "ClusterName=${CLUSTER_NAME}" + --parameter-overrides "ClusterName=${CLUSTER_NAME}" \ + --region ${REGION} -envsubst <../kubernetes/eks-cluster.yaml.tpl >../kubernetes/eks-cluster.yaml -eksctl create cluster -f ../kubernetes/eks-cluster.yaml || eksctl upgrade cluster -f ../kubernetes/eks-cluster.yaml +envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml +eksctl create cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || eksctl upgrade cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true @@ -51,8 +52,8 @@ helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --vers --set controller.resources.limits.memory=1Gi \ --wait -envsubst <../kubernetes/karpenter-provisioner.yaml.tpl >../kubernetes/karpenter-provisioner.yaml.yaml -kubectl apply -f ../kubernetes/karpenter-provisioner.yaml +envsubst <${KUBERENTES_DIR}/karpenter-provisioner.yaml.tpl >${KUBERENTES_DIR}/karpenter-provisioner.yaml +kubectl apply -f ${KUBERENTES_DIR}/karpenter-provisioner.yaml # node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler helm repo add eks https://aws.github.io/eks-charts From 847ebe7d0ede7c7e4bad09bcc23fe7cdcd1a47a9 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 15:40:02 -0700 Subject: [PATCH 07/21] aws-up and 
aws-down working in a containerized context via makefile targets --- Makefile | 77 ++++++++++++------- docs/development.md | 6 +- install/Dockerfile | 1 + install/kubernetes/eks-cluster.yaml.tpl | 17 ++-- .../kubernetes/karpenter-provisioner.yaml.tpl | 53 +++---------- install/scripts/aws-down.sh | 34 +++++--- install/scripts/aws-up.sh | 61 +++++++++------ 7 files changed, 132 insertions(+), 117 deletions(-) diff --git a/Makefile b/Makefile index 737ebb72..c7a34a6e 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,7 @@ all: build .PHONY: help help: ## Display this help. - @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) ##@ Development @@ -120,40 +120,60 @@ skaffold-dev-gcpmanager: protoc skaffold protogen render-skaffold-manifests ## R build: manifests generate fmt vet ## Build manager binary. go build -o bin/manager cmd/controllermanager/main.go -.PHONY: dev-up -dev-up: - docker build ./install -t substratus-installer && \ +.PHONY: gcp-dev-up +gcp-dev-up: build-installer docker run -it \ - -v ${HOME}/.kube:/root/.kube \ - -e PROJECT=$(shell gcloud config get project) \ - -e TOKEN=$(shell gcloud auth print-access-token) \ - -e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \ - -e INSTALL_OPERATOR=false \ - substratus-installer gcp-up.sh + -v ${HOME}/.kube:/root/.kube \ + -e PROJECT=$(shell gcloud config get project) \ + -e TOKEN=$(shell gcloud auth print-access-token) \ + -e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \ + -e INSTALL_OPERATOR=false \ + substratus-installer gcp-up.sh mkdir -p secrets gcloud iam service-accounts keys create --iam-account=substratus-gcp-manager@$(shell gcloud config get project).iam.gserviceaccount.com ./secrets/gcp-manager-key.json -.PHONY: dev-down -dev-down: +.PHONY: gcp-dev-down +gcp-dev-down: build-installer docker run -it \ - -v ${HOME}/.kube:/root/.kube \ - -e PROJECT=$(shell gcloud config get project) \ - -e TOKEN=$(shell gcloud auth print-access-token) \ - -e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \ - substratus-installer gcp-down.sh + -v ${HOME}/.kube:/root/.kube \ + -e PROJECT=$(shell gcloud config get project) \ + -e TOKEN=$(shell gcloud auth print-access-token) \ + -e TF_VAR_attach_gpu_nodepools=${ATTACH_GPU_NODEPOOLS} \ + substratus-installer gcp-down.sh rm ./secrets/gcp-manager-key.json -.PHONY: dev-run +.PHONY: aws-dev-up +aws-dev-up: build-installer + docker run -it \ + -v ${HOME}/.kube:/root/.kube \ + -e AWS_ACCOUNT_ID="$(shell aws sts get-caller-identity --query Account --output text)" \ + -e AWS_ACCESS_KEY_ID=$(shell aws configure get aws_access_key_id) \ + -e AWS_SECRET_ACCESS_KEY=$(shell aws configure get aws_secret_access_key) \ + -e AWS_SESSION_TOKEN=$(shell aws configure get aws_session_token) \ + -e INSTALL_OPERATOR=false \ + substratus-installer aws-up.sh + +.PHONY: aws-dev-down +aws-dev-down: build-installer + docker run -it \ + -v ${HOME}/.kube:/root/.kube \ + -e AWS_ACCOUNT_ID="$(shell aws sts get-caller-identity --query Account --output text)" \ + -e AWS_ACCESS_KEY_ID=$(shell aws configure get aws_access_key_id) \ + -e AWS_SECRET_ACCESS_KEY=$(shell aws configure get aws_secret_access_key) \ + -e 
AWS_SESSION_TOKEN=$(shell aws configure get aws_session_token) \ + substratus-installer aws-down.sh + +.PHONY: gcp-dev-run # Controller manager configuration # -dev-run: export CLOUD=gcp -dev-run: export GPU_TYPE=nvidia-l4 -dev-run: export PROJECT_ID=$(shell gcloud config get project) -dev-run: export CLUSTER_NAME=substratus -dev-run: export CLUSTER_LOCATION=us-central1 +gcp-dev-run: export CLOUD=gcp +gcp-dev-run: export GPU_TYPE=nvidia-l4 +gcp-dev-run: export PROJECT_ID=$(shell gcloud config get project) +gcp-dev-run: export CLUSTER_NAME=substratus +gcp-dev-run: export CLUSTER_LOCATION=us-central1 # Cloud manager configuration # -dev-run: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json +gcp-dev-run: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json # Run the controller manager and the cloud manager. -dev-run: manifests kustomize install-crds +gcp-dev-run: manifests kustomize install-crds go run ./cmd/gcpmanager & \ go run ./cmd/controllermanager/main.go \ --sci-address=localhost:10080 \ @@ -176,16 +196,17 @@ docker-push: ## Push docker image with the manager. .PHONY: docs docs: crd-ref-docs embedmd - $(CRD_REF_DOCS) --config=./docs/api/config.yaml \ + $(CRD_REF_DOCS) \ + --config=./docs/api/config.yaml \ --log-level=INFO \ --output-path=./docs/api/generated.md \ --source-path=./api \ - --templates-dir=./docs/api/templates/markdown \ + --templates-dir=./docs/api/templates/markdown \ --renderer=markdown # TODO: Embed YAML examples into the generate API documentation. # $(EMBEDMD) -w ./docs/api/generated.md -# PLATFORMS defines the target platforms for the manager image be build to provide support to multiple +# PLATFORMS defines the target platforms for the manager image be build to provide support to multiple # architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to: # - able to use docker buildx . More info: https://docs.docker.com/build/buildx/ # - have enable BuildKit, More info: https://docs.docker.com/develop/develop-images/build_enhancements/ diff --git a/docs/development.md b/docs/development.md index bcdb9c25..b5000690 100644 --- a/docs/development.md +++ b/docs/development.md @@ -5,19 +5,19 @@ Create a GCP environment. ```sh -make dev-up +make gcp-dev-up ``` Run Substratus control plane locally. ```sh -make dev-run +make gcp-dev-run ``` Delete GCP infra. ```sh -make dev-down +make gcp-dev-down ``` TODO: Automate the cleanup of PVs... Don't forget to manually clean them up for now. 
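
The development doc above still only walks through the GCP flow; the AWS targets added to the Makefile in this patch can be driven the same way. A sketch, assuming local AWS credentials are already configured so the Makefile can resolve the account ID, access keys and session token it forwards into the installer container:

```sh
# Build the installer image and stand up the EKS-based dev environment
# (aws-up.sh runs with INSTALL_OPERATOR=false, mirroring gcp-dev-up).
make aws-dev-up

# Tear it back down: cluster, Karpenter stack, ECR repo and artifacts bucket.
make aws-dev-down
```
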
diff --git a/install/Dockerfile b/install/Dockerfile index 8d3c0b2d..30ee639d 100644 --- a/install/Dockerfile +++ b/install/Dockerfile @@ -32,6 +32,7 @@ RUN DEBIAN_FRONTEND="noninteractive" \ curl \ git \ tzdata \ + gettext-base \ keyboard-configuration # AWS CLI diff --git a/install/kubernetes/eks-cluster.yaml.tpl b/install/kubernetes/eks-cluster.yaml.tpl index 9c349cae..2982b105 100644 --- a/install/kubernetes/eks-cluster.yaml.tpl +++ b/install/kubernetes/eks-cluster.yaml.tpl @@ -1,21 +1,20 @@ apiVersion: eksctl.io/v1alpha5 kind: ClusterConfig metadata: - name: substratus - region: us-west-2 + name: ${CLUSTER_NAME} + region: ${REGION} version: "1.27" tags: createdBy: eksctl environment: dev - karpenter.sh/discovery: substratus + karpenter.sh/discovery: ${CLUSTER_NAME} karpenter: createServiceAccount: true withSpotInterruptionQueue: true - defaultInstanceProfile: "KarpenterNodeInstanceProfile-substratus" + defaultInstanceProfile: "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" version: "v0.29.0" -# TODO(bjb): do we need mngs with karpenter? # if karpenter doesn't suffice: https://github.com/eksctl-io/eksctl/blob/main/examples/23-kubeflow-spot-instance.yaml managedNodeGroups: - name: builder-ng @@ -26,7 +25,7 @@ managedNodeGroups: volumeSize: 100 minSize: 0 maxSize: 3 - desiredCapacity: 2 + desiredCapacity: 1 iam: withAddonPolicies: ebs: true @@ -64,8 +63,8 @@ iam: wellKnownPolicies: ebsCSIController: true - metadata: - name: substratus - namespace: substratus + name: ${CLUSTER_NAME} + namespace: ${CLUSTER_NAME} attachPolicy: Version: "2012-10-17" Statement: @@ -83,7 +82,7 @@ iam: - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}" - metadata: name: aws-manager - namespace: substratus + namespace: ${CLUSTER_NAME} attachPolicy: # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html Version: "2012-10-17" diff --git a/install/kubernetes/karpenter-provisioner.yaml.tpl b/install/kubernetes/karpenter-provisioner.yaml.tpl index 1fafec36..f614e8ad 100644 --- a/install/kubernetes/karpenter-provisioner.yaml.tpl +++ b/install/kubernetes/karpenter-provisioner.yaml.tpl @@ -20,7 +20,6 @@ spec: karpenter.sh/discovery: ${CLUSTER_NAME} securityGroupSelector: karpenter.sh/discovery: ${CLUSTER_NAME} - ttlSecondsAfterEmpty: 30 consolidation: enabled: true taints: @@ -34,46 +33,12 @@ spec: - key: node.kubernetes.io/instance-type operator: In values: - # aws ec2 describe-instance-types --region us-west-2 --query "InstanceTypes[?GpuInfo!=null].InstanceType" --output json | jq -r '.[]' | sort | grep -v dl1 | grep -v inf | grep -v p5 | grep -v trn1 | awk '{print "\""$1"\","}' - [ - "g2.2xlarge", - "g2.8xlarge", - "g3.16xlarge", - "g3.4xlarge", - "g3.8xlarge", - "g3s.xlarge", - "g4ad.16xlarge", - "g4ad.2xlarge", - "g4ad.4xlarge", - "g4ad.8xlarge", - "g4ad.xlarge", - "g4dn.12xlarge", - "g4dn.16xlarge", - "g4dn.2xlarge", - "g4dn.4xlarge", - "g4dn.8xlarge", - "g4dn.metal", - "g4dn.xlarge", - "g5.12xlarge", - "g5.16xlarge", - "g5.24xlarge", - "g5.2xlarge", - "g5.48xlarge", - "g5.4xlarge", - "g5.8xlarge", - "g5.xlarge", - "g5g.16xlarge", - "g5g.2xlarge", - "g5g.4xlarge", - "g5g.8xlarge", - "g5g.metal", - "g5g.xlarge", - "p2.16xlarge", - "p2.8xlarge", - "p2.xlarge", - "p3.16xlarge", - "p3.2xlarge", - "p3.8xlarge", - "p3dn.24xlarge", - "p4d.24xlarge", - ] + - key: karpenter.k8s.aws/instance-category + operator: In + values: ["g", "p"] + - key: karpenter.k8s.aws/instance-family + operator: NotIn + values: ["p5"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] diff --git 
a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index b776f377..1facf3f5 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -4,21 +4,37 @@ set -e set -u # Required env variables: -# : "$TOKEN $PROJECT" +: "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" KUBERENTES_DIR=${SCRIPT_DIR}/../kubernetes -export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 +EKSCTL_ENABLE_CREDENTIAL_CACHE=1 export CLUSTER_NAME=substratus export REGION=us-west-2 -export ARTIFACTS_REPO_NAME=substratus -export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" -export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts +export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} +export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts + +aws eks update-kubeconfig \ + --region ${REGION} \ + --name ${CLUSTER_NAME} && + kubectl delete deployments --namespace=karpenter --all && + kubectl delete deployments --namespace=kube-system --all || + true + +aws iam delete-policy \ + --policy-arn arn:aws:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME} || + true -aws s3 rb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} >/dev/null || true -aws ecr delete-repository --repository-name ${ARTIFACTS_REPO_NAME} >/dev/null || true aws cloudformation delete-stack \ - --stack-name "Karpenter-${CLUSTER_NAME}" || true + --stack-name "Karpenter-${CLUSTER_NAME}" \ + --region ${REGION} || true envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml -eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml +eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || true + +aws ecr delete-repository \ + --repository-name ${ARTIFACTS_REPO_NAME} \ + --region ${REGION} >/dev/null || true + +aws s3 rb s3://${ARTIFACTS_BUCKET_NAME} \ + --region ${REGION} >/dev/null || true diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index 424c4eeb..4792de4f 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -4,26 +4,31 @@ set -e set -u # Required env variables: -# : "$TOKEN $PROJECT" +: "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY" + +INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" -# # TODO(bjb): pass AWS creds into script -# export CLOUDSDK_AUTH_ACCESS_TOKEN=${TOKEN} SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" KUBERENTES_DIR=${SCRIPT_DIR}/../kubernetes -# INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" -export EKSCTL_ENABLE_CREDENTIAL_CACHE=1 + +EKSCTL_ENABLE_CREDENTIAL_CACHE=1 export CLUSTER_NAME=substratus export REGION=us-west-2 -export ARTIFACTS_REPO_NAME=substratus -export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" -export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-substratus-artifacts - -aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} --region ${REGION} >/dev/null || true -aws ecr create-repository --repository-name ${ARTIFACTS_REPO_NAME} --region ${REGION} >/dev/null || true -# install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ +export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} +export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts export KARPENTER_VERSION=v0.29.2 export AWS_PARTITION="aws" -export TEMPOUT=$(mktemp) +export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" +TEMPOUT=$(mktemp) + +aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} \ + --region ${REGION} >/dev/null || true + 
+aws ecr create-repository \ + --repository-name ${ARTIFACTS_REPO_NAME} \ + --region ${REGION} >/dev/null || true + +# install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$TEMPOUT && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ @@ -33,15 +38,23 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION} --region ${REGION} envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml -eksctl create cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || eksctl upgrade cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml +eksctl create cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || + eksctl upgrade cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml + +aws iam create-service-linked-role \ + --aws-service-name spot.amazonaws.com || true + +aws eks update-kubeconfig \ + --region ${REGION} \ + --name ${CLUSTER_NAME} -export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" -aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true -aws eks --region ${REGION} update-kubeconfig --name ${CLUSTER_NAME} # Logout of helm registry to perform an unauthenticated pull against the public ECR helm registry logout public.ecr.aws || true - -helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version ${KARPENTER_VERSION} --namespace karpenter --create-namespace \ +helm upgrade \ + --create-namespace \ + --install karpenter oci://public.ecr.aws/karpenter/karpenter \ + --version ${KARPENTER_VERSION} \ + --namespace karpenter \ --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \ --set settings.aws.clusterName=${CLUSTER_NAME} \ --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ @@ -64,8 +77,8 @@ helm upgrade \ eks/aws-node-termination-handler # Install the substratus operator. 
-# if [ "${INSTALL_OPERATOR}" == "yes" ]; then -# kubectl apply -f kubernetes/namespace.yaml -# kubectl apply -f kubernetes/config.yaml -# kubectl apply -f kubernetes/system.yaml -# fi +if [ "${INSTALL_OPERATOR}" == "yes" ]; then + kubectl apply -f kubernetes/namespace.yaml + kubectl apply -f kubernetes/config.yaml + kubectl apply -f kubernetes/system.yaml +fi From f9cf844da994f2d9f620ca197977addf2d2786b4 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 16:44:24 -0700 Subject: [PATCH 08/21] added the nvidia device plugin to get device drivers --- install/kubernetes/eks-cluster.yaml.tpl | 5 +++-- .../kubernetes/karpenter-provisioner.yaml.tpl | 17 +++++++---------- install/scripts/aws-down.sh | 7 ++++--- install/scripts/aws-up.sh | 16 +++++++++++++--- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/install/kubernetes/eks-cluster.yaml.tpl b/install/kubernetes/eks-cluster.yaml.tpl index 2982b105..f51e3402 100644 --- a/install/kubernetes/eks-cluster.yaml.tpl +++ b/install/kubernetes/eks-cluster.yaml.tpl @@ -21,11 +21,12 @@ managedNodeGroups: privateNetworking: true labels: { role: builders } instanceTypes: - - m6a.large + - t3a.small + # - m6a.large volumeSize: 100 minSize: 0 maxSize: 3 - desiredCapacity: 1 + desiredCapacity: 2 iam: withAddonPolicies: ebs: true diff --git a/install/kubernetes/karpenter-provisioner.yaml.tpl b/install/kubernetes/karpenter-provisioner.yaml.tpl index f614e8ad..c860c6ff 100644 --- a/install/kubernetes/karpenter-provisioner.yaml.tpl +++ b/install/kubernetes/karpenter-provisioner.yaml.tpl @@ -25,20 +25,17 @@ spec: taints: - key: nvidia.com/gpu value: "true" - effect: NoSchedule + effect: "NoSchedule" requirements: - key: karpenter.sh/capacity-type operator: In values: ["spot"] - - key: node.kubernetes.io/instance-type - operator: In - values: - key: karpenter.k8s.aws/instance-category - operator: In - values: ["g", "p"] + operator: In + values: ["g", "p"] - key: karpenter.k8s.aws/instance-family - operator: NotIn - values: ["p5"] + operator: NotIn + values: ["p5"] - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] + operator: In + values: ["amd64"] diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index 1facf3f5..5f11a2eb 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -14,11 +14,11 @@ export REGION=us-west-2 export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts -aws eks update-kubeconfig \ +(aws eks update-kubeconfig \ --region ${REGION} \ --name ${CLUSTER_NAME} && - kubectl delete deployments --namespace=karpenter --all && - kubectl delete deployments --namespace=kube-system --all || + kubectl delete deployments --namespace=karpenter --all || + kubectl delete deployments --namespace=kube-system --all) || true aws iam delete-policy \ @@ -30,6 +30,7 @@ aws cloudformation delete-stack \ --region ${REGION} || true envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml +cat ${KUBERENTES_DIR}/eks-cluster.yaml eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || true aws ecr delete-repository \ diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index 4792de4f..c7947fdb 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -38,6 +38,7 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION} --region ${REGION} envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml +cat 
${KUBERENTES_DIR}/eks-cluster.yaml eksctl create cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || eksctl upgrade cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml @@ -66,6 +67,7 @@ helm upgrade \ --wait envsubst <${KUBERENTES_DIR}/karpenter-provisioner.yaml.tpl >${KUBERENTES_DIR}/karpenter-provisioner.yaml +cat ${KUBERENTES_DIR}/karpenter-provisioner.yaml kubectl apply -f ${KUBERENTES_DIR}/karpenter-provisioner.yaml # node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler @@ -76,9 +78,17 @@ helm upgrade \ --version 0.21.0 \ eks/aws-node-termination-handler +# nvidia-device-plugin: https://github.com/NVIDIA/k8s-device-plugin#deployment-via-helm +helm repo add nvdp https://nvidia.github.io/k8s-device-plugin +helm upgrade \ + --install nvdp nvdp/nvidia-device-plugin \ + --namespace nvidia-device-plugin \ + --create-namespace \ + --version 0.14.1 + # Install the substratus operator. if [ "${INSTALL_OPERATOR}" == "yes" ]; then - kubectl apply -f kubernetes/namespace.yaml - kubectl apply -f kubernetes/config.yaml - kubectl apply -f kubernetes/system.yaml + kubectl apply -f ${KUBERENTES_DIR}/namespace.yaml + kubectl apply -f ${KUBERENTES_DIR}/config.yaml + kubectl apply -f ${KUBERENTES_DIR}/system.yaml fi From 9aef30b1d7800b3adaaaa77f53c04aa788d048e1 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 21:15:47 -0700 Subject: [PATCH 09/21] needing more resources per node for the daemonsets --- install/kubernetes/eks-cluster.yaml.tpl | 3 +-- install/scripts/aws-down.sh | 11 +++-------- install/scripts/aws-up.sh | 2 -- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/install/kubernetes/eks-cluster.yaml.tpl b/install/kubernetes/eks-cluster.yaml.tpl index f51e3402..ba41d0c9 100644 --- a/install/kubernetes/eks-cluster.yaml.tpl +++ b/install/kubernetes/eks-cluster.yaml.tpl @@ -21,8 +21,7 @@ managedNodeGroups: privateNetworking: true labels: { role: builders } instanceTypes: - - t3a.small - # - m6a.large + - m6a.large volumeSize: 100 minSize: 0 maxSize: 3 diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index 5f11a2eb..fef7a6fe 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -17,22 +17,17 @@ export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts (aws eks update-kubeconfig \ --region ${REGION} \ --name ${CLUSTER_NAME} && - kubectl delete deployments --namespace=karpenter --all || + kubectl delete deployments --namespace=karpenter --all && kubectl delete deployments --namespace=kube-system --all) || true -aws iam delete-policy \ - --policy-arn arn:aws:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME} || - true +envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml +eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || true aws cloudformation delete-stack \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --region ${REGION} || true -envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml -cat ${KUBERENTES_DIR}/eks-cluster.yaml -eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || true - aws ecr delete-repository \ --repository-name ${ARTIFACTS_REPO_NAME} \ --region ${REGION} >/dev/null || true diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index c7947fdb..3519b280 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -38,7 +38,6 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION} --region ${REGION} envsubst 
<${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml -cat ${KUBERENTES_DIR}/eks-cluster.yaml eksctl create cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || eksctl upgrade cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml @@ -67,7 +66,6 @@ helm upgrade \ --wait envsubst <${KUBERENTES_DIR}/karpenter-provisioner.yaml.tpl >${KUBERENTES_DIR}/karpenter-provisioner.yaml -cat ${KUBERENTES_DIR}/karpenter-provisioner.yaml kubectl apply -f ${KUBERENTES_DIR}/karpenter-provisioner.yaml # node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler From daf75d40a483383335dc68797464c87cb8914e14 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 22:34:24 -0700 Subject: [PATCH 10/21] moving all aws specific manifests into a dedicated dir --- install/kubernetes/{ => aws}/eks-cluster.yaml.tpl | 4 +++- .../{ => aws}/karpenter-provisioner.yaml.tpl | 0 install/kubernetes/aws/nvidia-eks-device-plugin.yaml | 9 +++++++++ install/scripts/aws-down.sh | 4 ++-- install/scripts/aws-up.sh | 11 ++++++----- 5 files changed, 20 insertions(+), 8 deletions(-) rename install/kubernetes/{ => aws}/eks-cluster.yaml.tpl (98%) rename install/kubernetes/{ => aws}/karpenter-provisioner.yaml.tpl (100%) create mode 100644 install/kubernetes/aws/nvidia-eks-device-plugin.yaml diff --git a/install/kubernetes/eks-cluster.yaml.tpl b/install/kubernetes/aws/eks-cluster.yaml.tpl similarity index 98% rename from install/kubernetes/eks-cluster.yaml.tpl rename to install/kubernetes/aws/eks-cluster.yaml.tpl index ba41d0c9..32b235fe 100644 --- a/install/kubernetes/eks-cluster.yaml.tpl +++ b/install/kubernetes/aws/eks-cluster.yaml.tpl @@ -20,16 +20,18 @@ managedNodeGroups: - name: builder-ng privateNetworking: true labels: { role: builders } + amiFamily: Ubuntu2004 instanceTypes: - m6a.large volumeSize: 100 minSize: 0 maxSize: 3 - desiredCapacity: 2 + desiredCapacity: 1 iam: withAddonPolicies: ebs: true imageBuilder: true + addons: - name: vpc-cni attachPolicyARNs: diff --git a/install/kubernetes/karpenter-provisioner.yaml.tpl b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl similarity index 100% rename from install/kubernetes/karpenter-provisioner.yaml.tpl rename to install/kubernetes/aws/karpenter-provisioner.yaml.tpl diff --git a/install/kubernetes/aws/nvidia-eks-device-plugin.yaml b/install/kubernetes/aws/nvidia-eks-device-plugin.yaml new file mode 100644 index 00000000..6b772f43 --- /dev/null +++ b/install/kubernetes/aws/nvidia-eks-device-plugin.yaml @@ -0,0 +1,9 @@ +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: aws.amazon.com/eks-accelerator + operator: Exists + - key: aws.amazon.com/eks-gpu-driver-version + operator: DoesNotExist diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index fef7a6fe..624ba610 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -21,8 +21,8 @@ export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts kubectl delete deployments --namespace=kube-system --all) || true -envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml -eksctl delete cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || true +envsubst <${KUBERENTES_DIR}/aws/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/aws/eks-cluster.yaml +eksctl delete cluster -f ${KUBERENTES_DIR}/aws/eks-cluster.yaml || true aws cloudformation delete-stack \ --stack-name "Karpenter-${CLUSTER_NAME}" \ diff --git a/install/scripts/aws-up.sh 
b/install/scripts/aws-up.sh index 3519b280..b406a374 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -37,9 +37,9 @@ curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION} --parameter-overrides "ClusterName=${CLUSTER_NAME}" \ --region ${REGION} -envsubst <${KUBERENTES_DIR}/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/eks-cluster.yaml -eksctl create cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml || - eksctl upgrade cluster -f ${KUBERENTES_DIR}/eks-cluster.yaml +envsubst <${KUBERENTES_DIR}/aws/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/aws/eks-cluster.yaml +eksctl create cluster -f ${KUBERENTES_DIR}/aws/eks-cluster.yaml || + eksctl upgrade cluster -f ${KUBERENTES_DIR}/aws/eks-cluster.yaml aws iam create-service-linked-role \ --aws-service-name spot.amazonaws.com || true @@ -65,8 +65,8 @@ helm upgrade \ --set controller.resources.limits.memory=1Gi \ --wait -envsubst <${KUBERENTES_DIR}/karpenter-provisioner.yaml.tpl >${KUBERENTES_DIR}/karpenter-provisioner.yaml -kubectl apply -f ${KUBERENTES_DIR}/karpenter-provisioner.yaml +envsubst <${KUBERENTES_DIR}/aws/karpenter-provisioner.yaml.tpl >${KUBERENTES_DIR}/aws/karpenter-provisioner.yaml +kubectl apply -f ${KUBERENTES_DIR}/aws/karpenter-provisioner.yaml # node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler helm repo add eks https://aws.github.io/eks-charts @@ -82,6 +82,7 @@ helm upgrade \ --install nvdp nvdp/nvidia-device-plugin \ --namespace nvidia-device-plugin \ --create-namespace \ + --values ${KUBERENTES_DIR}/aws/nvidia-eks-device-plugin.yaml \ --version 0.14.1 # Install the substratus operator. From bc9c90310e2150cf30fb40990a179a02ce0ccdf9 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Tue, 8 Aug 2023 23:35:04 -0700 Subject: [PATCH 11/21] adds provisioners that add standard taint and an accelerator-specific label --- .../aws/karpenter-provisioner.yaml.tpl | 203 ++++++++++++++++-- .../aws/nvidia-eks-device-plugin.yaml | 2 - internal/awsmanager/manager.go | 9 +- internal/sci/sci.proto | 13 ++ 4 files changed, 212 insertions(+), 15 deletions(-) diff --git a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl index c860c6ff..4a66f907 100644 --- a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl +++ b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl @@ -3,25 +3,23 @@ kind: AWSNodeTemplate metadata: name: default spec: + instanceProfile: eksctl-KarpenterNodeInstanceProfile-${CLUSTER_NAME} subnetSelector: karpenter.sh/discovery: ${CLUSTER_NAME} securityGroupSelector: karpenter.sh/discovery: ${CLUSTER_NAME} --- -# https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ apiVersion: karpenter.sh/v1alpha5 kind: Provisioner metadata: - name: gpu + name: p4-gpu spec: - provider: - instanceProfile: eksctl-KarpenterNodeInstanceProfile-${CLUSTER_NAME} - subnetSelector: - karpenter.sh/discovery: ${CLUSTER_NAME} - securityGroupSelector: - karpenter.sh/discovery: ${CLUSTER_NAME} + providerRef: + name: default consolidation: enabled: true + labels: + aws.amazon.com/eks-accelerator: nvidia-a100 taints: - key: nvidia.com/gpu value: "true" @@ -30,12 +28,193 @@ spec: - key: karpenter.sh/capacity-type operator: In values: ["spot"] - - key: karpenter.k8s.aws/instance-category + - key: karpenter.k8s.aws/instance-family + operator: In + values: ["p4"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: 
p3-gpu +spec: + providerRef: + name: default + consolidation: + enabled: true + labels: + aws.amazon.com/eks-accelerator: nvidia-tesla-v100 + taints: + - key: nvidia.com/gpu + value: "true" + effect: "NoSchedule" + requirements: + - key: karpenter.sh/capacity-type operator: In - values: ["g", "p"] + values: ["spot"] - key: karpenter.k8s.aws/instance-family - operator: NotIn - values: ["p5"] + operator: In + values: ["p3"] - key: "kubernetes.io/arch" operator: In values: ["amd64"] +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: p2-gpu +spec: + providerRef: + name: default + consolidation: + enabled: true + labels: + aws.amazon.com/eks-accelerator: nvidia-tesla-k80 + taints: + - key: nvidia.com/gpu + value: "true" + effect: "NoSchedule" + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + - key: karpenter.k8s.aws/instance-family + operator: In + values: ["p2"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: g5-gpu +spec: + providerRef: + name: default + consolidation: + enabled: true + labels: + aws.amazon.com/eks-accelerator: nvidia-a10g + taints: + - key: nvidia.com/gpu + value: "true" + effect: "NoSchedule" + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + - key: karpenter.k8s.aws/instance-family + operator: In + values: ["g5"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: g4ad-gpu +spec: + providerRef: + name: default + consolidation: + enabled: true + taints: + - key: amd.com/gpu + value: "true" + effect: "NoSchedule" + - key: aws.amazon.com/eks-accelerator + value: "amd-radeon-pro-v520" + effect: "NoSchedule" + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + - key: karpenter.k8s.aws/instance-family + operator: In + values: ["g4ad"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: g4dn-gpu +spec: + providerRef: + name: default + consolidation: + enabled: true + labels: + aws.amazon.com/eks-accelerator: nvidia-t4 + taints: + - key: nvidia.com/gpu + value: "true" + effect: "NoSchedule" + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + - key: karpenter.k8s.aws/instance-family + operator: In + values: ["g4dn"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: g3-gpu +spec: + providerRef: + name: default + consolidation: + enabled: true + labels: + aws.amazon.com/eks-accelerator: nvidia-tesla-m60 + taints: + - key: nvidia.com/gpu + value: "true" + effect: "NoSchedule" + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + - key: karpenter.k8s.aws/instance-family + operator: In + values: ["g3"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: g2-gpu +spec: + providerRef: + name: default + consolidation: + enabled: true + labels: + aws.amazon.com/eks-accelerator: nvidia-grid-k520 + taints: + - key: nvidia.com/gpu + value: "true" + effect: "NoSchedule" + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + - key: karpenter.k8s.aws/instance-family + operator: In + values: ["g2"] + - key: 
"kubernetes.io/arch" + operator: In + values: ["amd64"] +--- diff --git a/install/kubernetes/aws/nvidia-eks-device-plugin.yaml b/install/kubernetes/aws/nvidia-eks-device-plugin.yaml index 6b772f43..5a6df70b 100644 --- a/install/kubernetes/aws/nvidia-eks-device-plugin.yaml +++ b/install/kubernetes/aws/nvidia-eks-device-plugin.yaml @@ -5,5 +5,3 @@ affinity: - matchExpressions: - key: aws.amazon.com/eks-accelerator operator: Exists - - key: aws.amazon.com/eks-gpu-driver-version - operator: DoesNotExist diff --git a/internal/awsmanager/manager.go b/internal/awsmanager/manager.go index 7a877604..cdf3e514 100644 --- a/internal/awsmanager/manager.go +++ b/internal/awsmanager/manager.go @@ -1,5 +1,12 @@ // Package gcp provides an AWS implementation of the Substratus Cloud Interface (SCI) package awsmanager -// examples: https://docs.aws.amazon.com/AmazonS3/latest/userguide/example_s3_Scenario_PresignedUrl_section.html +// Presigned URL example: https://docs.aws.amazon.com/AmazonS3/latest/userguide/example_s3_Scenario_PresignedUrl_section.html // Checking object integrity: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + +// Update policy: https://docs.aws.amazon.com/sdk-for-go/api/service/iam/#IAM.UpdateAssumeRolePolicy +// REST API: https://docs.aws.amazon.com/IAM/latest/APIReference/API_UpdateAssumeRolePolicy.html + +// requires: +// 1. a policy document which is an aws.String having a string encoded JSON blob of the trust policy +// 2. a rolename that we will refer to as the principal here diff --git a/internal/sci/sci.proto b/internal/sci/sci.proto index 2ced6249..fb206dd2 100644 --- a/internal/sci/sci.proto +++ b/internal/sci/sci.proto @@ -6,6 +6,11 @@ option go_package = "github.com/substratusai/substratus/internal/sci"; service Controller { rpc CreateSignedURL(CreateSignedURLRequest) returns (CreateSignedURLResponse) {} rpc GetObjectMd5(GetObjectMd5Request) returns (GetObjectMd5Response) {} + rpc UpdateIamPrincipal(UpdateIamPrincipalRequest) returns (UpdateIamPrincipalResponse) {} +} + +message UpdateIamPrincipalRequest { + string principalId = 1; // the email address, IAM role name or } message CreateSignedURLRequest { @@ -27,3 +32,11 @@ message GetObjectMd5Request { message GetObjectMd5Response { string md5checksum = 1; } + +message Put { + string = 1; +} + +message GetObjectMd5Response { + string md5checksum = 1; +} From e752152b5165418e4e68c73846b3cde8515f8984 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 00:40:59 -0700 Subject: [PATCH 12/21] bringing the karpenter config back down to earth --- install/Dockerfile | 4 +- install/kubernetes/aws/eks-cluster.yaml.tpl | 35 +-- .../aws/karpenter-provisioner.yaml.tpl | 203 ++---------------- .../aws/nvidia-eks-device-plugin.yaml | 5 +- 4 files changed, 38 insertions(+), 209 deletions(-) diff --git a/install/Dockerfile b/install/Dockerfile index 30ee639d..44646101 100644 --- a/install/Dockerfile +++ b/install/Dockerfile @@ -31,9 +31,7 @@ RUN DEBIAN_FRONTEND="noninteractive" \ wget \ curl \ git \ - tzdata \ - gettext-base \ - keyboard-configuration + gettext-base # AWS CLI RUN source /etc/environment && \ diff --git a/install/kubernetes/aws/eks-cluster.yaml.tpl b/install/kubernetes/aws/eks-cluster.yaml.tpl index 32b235fe..3c08a87e 100644 --- a/install/kubernetes/aws/eks-cluster.yaml.tpl +++ b/install/kubernetes/aws/eks-cluster.yaml.tpl @@ -65,34 +65,37 @@ iam: wellKnownPolicies: ebsCSIController: true - metadata: - name: ${CLUSTER_NAME} - namespace: ${CLUSTER_NAME} + name: aws-manager 
+ namespace: substratus attachPolicy: + # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html Version: "2012-10-17" Statement: - - Effect: Allow + - Sid: "AllowUrlPreSigning" + Effect: Allow + Action: + - "s3:PutObject" + - "s3:GetObject" + Resource: + - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*" + - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}" + - Sid: "FullSubstratusEcrRepoAccess" + Effect: Allow Action: - "ecr:*" Resource: - "arn:aws:ecr:::${ARTIFACTS_REPO_NAME}" - - Effect: Allow + - Sid: "S3AdminSubstratusBucketAccess" + Effect: Allow Action: - "s3:*" - "s3-object-lambda:*" Resource: - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*" - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}" - - metadata: - name: aws-manager - namespace: ${CLUSTER_NAME} - attachPolicy: - # https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html - Version: "2012-10-17" - Statement: - - Effect: Allow + - Sid: "ModifyOwnTrustPolicy" + Effect: Allow Action: - - "s3:PutObject" - - "s3:GetObject" + - "iam:UpdateAssumeRolePolicy" Resource: - - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}/*" - - "arn:aws:s3:::${ARTIFACTS_BUCKET_NAME}" + - "arn:aws:iam::${AWS_ACCOUNT_ID}:role/$${aws:userid}" diff --git a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl index 4a66f907..608cef04 100644 --- a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl +++ b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl @@ -12,14 +12,15 @@ spec: apiVersion: karpenter.sh/v1alpha5 kind: Provisioner metadata: - name: p4-gpu + name: nvidia-gpu spec: providerRef: name: default consolidation: enabled: true - labels: - aws.amazon.com/eks-accelerator: nvidia-a100 + # These well-known labels (specifically karpenter.k8s.aws/instance-gpu-name) + # will guide karpenter in accelerator and instance type selection: + # https://karpenter.sh/v0.29/concepts/scheduling/#labels taints: - key: nvidia.com/gpu value: "true" @@ -30,191 +31,17 @@ spec: values: ["spot"] - key: karpenter.k8s.aws/instance-family operator: In - values: ["p4"] + values: [ + "p4de", + "p4d", + "p3dn", + "p3", + "p2", + "g2", + "g3", + "g4", + "g5", + ] - key: "kubernetes.io/arch" operator: In values: ["amd64"] ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: p3-gpu -spec: - providerRef: - name: default - consolidation: - enabled: true - labels: - aws.amazon.com/eks-accelerator: nvidia-tesla-v100 - taints: - - key: nvidia.com/gpu - value: "true" - effect: "NoSchedule" - requirements: - - key: karpenter.sh/capacity-type - operator: In - values: ["spot"] - - key: karpenter.k8s.aws/instance-family - operator: In - values: ["p3"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: p2-gpu -spec: - providerRef: - name: default - consolidation: - enabled: true - labels: - aws.amazon.com/eks-accelerator: nvidia-tesla-k80 - taints: - - key: nvidia.com/gpu - value: "true" - effect: "NoSchedule" - requirements: - - key: karpenter.sh/capacity-type - operator: In - values: ["spot"] - - key: karpenter.k8s.aws/instance-family - operator: In - values: ["p2"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: g5-gpu -spec: - providerRef: - name: default - consolidation: - enabled: true - labels: - aws.amazon.com/eks-accelerator: nvidia-a10g - taints: - - key: nvidia.com/gpu - value: "true" - effect: "NoSchedule" 
- requirements: - - key: karpenter.sh/capacity-type - operator: In - values: ["spot"] - - key: karpenter.k8s.aws/instance-family - operator: In - values: ["g5"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: g4ad-gpu -spec: - providerRef: - name: default - consolidation: - enabled: true - taints: - - key: amd.com/gpu - value: "true" - effect: "NoSchedule" - - key: aws.amazon.com/eks-accelerator - value: "amd-radeon-pro-v520" - effect: "NoSchedule" - requirements: - - key: karpenter.sh/capacity-type - operator: In - values: ["spot"] - - key: karpenter.k8s.aws/instance-family - operator: In - values: ["g4ad"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: g4dn-gpu -spec: - providerRef: - name: default - consolidation: - enabled: true - labels: - aws.amazon.com/eks-accelerator: nvidia-t4 - taints: - - key: nvidia.com/gpu - value: "true" - effect: "NoSchedule" - requirements: - - key: karpenter.sh/capacity-type - operator: In - values: ["spot"] - - key: karpenter.k8s.aws/instance-family - operator: In - values: ["g4dn"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: g3-gpu -spec: - providerRef: - name: default - consolidation: - enabled: true - labels: - aws.amazon.com/eks-accelerator: nvidia-tesla-m60 - taints: - - key: nvidia.com/gpu - value: "true" - effect: "NoSchedule" - requirements: - - key: karpenter.sh/capacity-type - operator: In - values: ["spot"] - - key: karpenter.k8s.aws/instance-family - operator: In - values: ["g3"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: g2-gpu -spec: - providerRef: - name: default - consolidation: - enabled: true - labels: - aws.amazon.com/eks-accelerator: nvidia-grid-k520 - taints: - - key: nvidia.com/gpu - value: "true" - effect: "NoSchedule" - requirements: - - key: karpenter.sh/capacity-type - operator: In - values: ["spot"] - - key: karpenter.k8s.aws/instance-family - operator: In - values: ["g2"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] ---- diff --git a/install/kubernetes/aws/nvidia-eks-device-plugin.yaml b/install/kubernetes/aws/nvidia-eks-device-plugin.yaml index 5a6df70b..4cafa362 100644 --- a/install/kubernetes/aws/nvidia-eks-device-plugin.yaml +++ b/install/kubernetes/aws/nvidia-eks-device-plugin.yaml @@ -3,5 +3,6 @@ affinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: aws.amazon.com/eks-accelerator - operator: Exists + - key: karpenter.k8s.aws/instance-gpu-manufacturer + operator: In + values: ["nvidia"] From 3741018109b94956dd492297c0d3a55c80e89a37 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 00:47:33 -0700 Subject: [PATCH 13/21] reverting changes to sci --- internal/sci/sci.proto | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/internal/sci/sci.proto b/internal/sci/sci.proto index fb206dd2..2ced6249 100644 --- a/internal/sci/sci.proto +++ b/internal/sci/sci.proto @@ -6,11 +6,6 @@ option go_package = "github.com/substratusai/substratus/internal/sci"; service Controller { rpc CreateSignedURL(CreateSignedURLRequest) returns (CreateSignedURLResponse) {} rpc GetObjectMd5(GetObjectMd5Request) returns (GetObjectMd5Response) {} - rpc 
UpdateIamPrincipal(UpdateIamPrincipalRequest) returns (UpdateIamPrincipalResponse) {} -} - -message UpdateIamPrincipalRequest { - string principalId = 1; // the email address, IAM role name or } message CreateSignedURLRequest { @@ -32,11 +27,3 @@ message GetObjectMd5Request { message GetObjectMd5Response { string md5checksum = 1; } - -message Put { - string = 1; -} - -message GetObjectMd5Response { - string md5checksum = 1; -} From 54e35118558f3460db91239fd26470baa6c53e40 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 01:35:57 -0700 Subject: [PATCH 14/21] occasional sqs queue left over --- install/scripts/aws-down.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index 624ba610..e4b3e36d 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -24,6 +24,9 @@ export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts envsubst <${KUBERENTES_DIR}/aws/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/aws/eks-cluster.yaml eksctl delete cluster -f ${KUBERENTES_DIR}/aws/eks-cluster.yaml || true +aws sqs delete-queue \ + --queue-url https://sqs.${REGION}.amazonaws.com/${AWS_ACCOUNT_ID}/substratus \ + --region ${REGION} || true aws cloudformation delete-stack \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --region ${REGION} || true From e846c81daf6a67c0f055109def260e1def473b6c Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 01:46:54 -0700 Subject: [PATCH 15/21] bugfix --- install/kubernetes/aws/eks-cluster.yaml.tpl | 1 + install/kubernetes/aws/karpenter-provisioner.yaml.tpl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/install/kubernetes/aws/eks-cluster.yaml.tpl b/install/kubernetes/aws/eks-cluster.yaml.tpl index 3c08a87e..f5a56eef 100644 --- a/install/kubernetes/aws/eks-cluster.yaml.tpl +++ b/install/kubernetes/aws/eks-cluster.yaml.tpl @@ -57,6 +57,7 @@ iam: namespace: karpenter roleName: ${CLUSTER_NAME}-karpenter attachPolicyARNs: + # this is used as spec.instanceProfile in the karpenter AWSNodeTemplate - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME} roleOnly: true - metadata: diff --git a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl index 608cef04..387bcee5 100644 --- a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl +++ b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl @@ -3,7 +3,7 @@ kind: AWSNodeTemplate metadata: name: default spec: - instanceProfile: eksctl-KarpenterNodeInstanceProfile-${CLUSTER_NAME} + instanceProfile: KarpenterControllerPolicy-${CLUSTER_NAME} subnetSelector: karpenter.sh/discovery: ${CLUSTER_NAME} securityGroupSelector: From f57fc8b3f08f572af79861079cc51bf3ddd1b355 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 10:01:36 -0700 Subject: [PATCH 16/21] migrated tools install to a dedicated script that should work on workstations --- Makefile | 16 +++---- docs/development.md | 2 +- install/Dockerfile | 68 ++------------------------ install/scripts/aws-down.sh | 7 +-- install/scripts/get-tools.sh | 93 ++++++++++++++++++++++++++++++++++++ 5 files changed, 109 insertions(+), 77 deletions(-) create mode 100755 install/scripts/get-tools.sh diff --git a/Makefile b/Makefile index c7a34a6e..c28cfb2f 100644 --- a/Makefile +++ b/Makefile @@ -163,17 +163,17 @@ aws-dev-down: build-installer -e AWS_SESSION_TOKEN=$(shell aws configure get aws_session_token) \ substratus-installer aws-down.sh 
-.PHONY: gcp-dev-run +.PHONY: dev-run-gcp # Controller manager configuration # -gcp-dev-run: export CLOUD=gcp -gcp-dev-run: export GPU_TYPE=nvidia-l4 -gcp-dev-run: export PROJECT_ID=$(shell gcloud config get project) -gcp-dev-run: export CLUSTER_NAME=substratus -gcp-dev-run: export CLUSTER_LOCATION=us-central1 +dev-run-gcp: export CLOUD=gcp +dev-run-gcp: export GPU_TYPE=nvidia-l4 +dev-run-gcp: export PROJECT_ID=$(shell gcloud config get project) +dev-run-gcp: export CLUSTER_NAME=substratus +dev-run-gcp: export CLUSTER_LOCATION=us-central1 # Cloud manager configuration # -gcp-dev-run: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json +dev-run-gcp: export GOOGLE_APPLICATION_CREDENTIALS=./secrets/gcp-manager-key.json # Run the controller manager and the cloud manager. -gcp-dev-run: manifests kustomize install-crds +dev-run-gcp: manifests kustomize install-crds go run ./cmd/gcpmanager & \ go run ./cmd/controllermanager/main.go \ --sci-address=localhost:10080 \ diff --git a/docs/development.md b/docs/development.md index b5000690..d07e0311 100644 --- a/docs/development.md +++ b/docs/development.md @@ -11,7 +11,7 @@ make gcp-dev-up Run Substratus control plane locally. ```sh -make gcp-dev-run +make dev-run-gcp ``` Delete GCP infra. diff --git a/install/Dockerfile b/install/Dockerfile index 44646101..91f6ebf0 100644 --- a/install/Dockerfile +++ b/install/Dockerfile @@ -1,73 +1,11 @@ FROM ubuntu:23.04 +ENV PATH $PATH:/workspace/scripts:/usr/local/gcloud/google-cloud-sdk/bin WORKDIR /workspace -# Determine platform and architecture -RUN ARCH=$(uname -m) && \ - PLATFORM=$(uname -s | tr '[:upper:]' '[:lower:]') && \ - if [ "$ARCH" = "aarch64" ]; then \ - echo "AWSCLI_ARCH=aarch64" >> /etc/environment; \ - echo "TERRAFORM_ARCH=arm64" >> /etc/environment; \ - echo "PLATFORM_ARCH=${PLATFORM}_arm64" >> /etc/environment; \ - elif [ "$ARCH" = "x86_64" ]; then \ - echo "AWSCLI_ARCH=x86_64" >> /etc/environment; \ - echo "TERRAFORM_ARCH=amd64" >> /etc/environment; \ - echo "PLATFORM_ARCH=${PLATFORM}_amd64" >> /etc/environment; \ - else \ - echo "Unsupported architecture"; \ - exit 1; \ - fi -# Source the environment file so that the variable is available in the current shell -SHELL ["/bin/bash", "-c"] -RUN source /etc/environment - -# Common -RUN DEBIAN_FRONTEND="noninteractive" \ - apt-get update && \ - apt-get install -y \ - gnupg \ - software-properties-common \ - unzip \ - wget \ - curl \ - git \ - gettext-base - -# AWS CLI -RUN source /etc/environment && \ - curl "https://awscli.amazonaws.com/awscli-exe-linux-${AWSCLI_ARCH}.zip" -o "awscliv2.zip" && \ - unzip awscliv2.zip && \ - ./aws/install - -# eksctl -RUN source /etc/environment && \ - curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_${PLATFORM_ARCH}.tar.gz" && \ - tar -xzf eksctl_${PLATFORM_ARCH}.tar.gz -C /tmp && rm eksctl_${PLATFORM_ARCH}.tar.gz && \ - mv /tmp/eksctl /usr/local/bin - -# Terraform -RUN source /etc/environment && \ - wget https://releases.hashicorp.com/terraform/1.4.5/terraform_1.4.5_linux_${TERRAFORM_ARCH}.zip && \ - unzip terraform_1.4.5_linux_${TERRAFORM_ARCH}.zip && \ - mv terraform /usr/local/bin/ && \ - terraform --version - -# Google Cloud (gcloud) -RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz && \ - mkdir -p /usr/local/gcloud \ - && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ - && /usr/local/gcloud/google-cloud-sdk/install.sh -ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin -RUN gcloud 
--version -RUN gcloud components install gke-gcloud-auth-plugin kubectl +COPY scripts scripts -# Helm -RUN curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 -RUN chmod 700 /tmp/get_helm.sh -RUN /tmp/get_helm.sh +RUN scripts/get-tools.sh -# Local files COPY terraform terraform COPY kubernetes kubernetes -COPY scripts scripts -ENV PATH $PATH:/workspace/scripts diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index e4b3e36d..3393ec11 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -24,13 +24,14 @@ export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts envsubst <${KUBERENTES_DIR}/aws/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/aws/eks-cluster.yaml eksctl delete cluster -f ${KUBERENTES_DIR}/aws/eks-cluster.yaml || true -aws sqs delete-queue \ - --queue-url https://sqs.${REGION}.amazonaws.com/${AWS_ACCOUNT_ID}/substratus \ - --region ${REGION} || true aws cloudformation delete-stack \ --stack-name "Karpenter-${CLUSTER_NAME}" \ --region ${REGION} || true +aws sqs delete-queue \ + --queue-url https://sqs.${REGION}.amazonaws.com/${AWS_ACCOUNT_ID}/substratus \ + --region ${REGION} || true + aws ecr delete-repository \ --repository-name ${ARTIFACTS_REPO_NAME} \ --region ${REGION} >/dev/null || true diff --git a/install/scripts/get-tools.sh b/install/scripts/get-tools.sh new file mode 100755 index 00000000..1e4f0c4c --- /dev/null +++ b/install/scripts/get-tools.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +terraform_version="1.4.5" + +tempout=$(mktemp -d) + +# Determine platform and architecture +arch=$(uname -m) +platform=$(uname -s | tr '[:upper:]' '[:lower:]') + +if [[ "$arch" = "aarch64" || "$arch" = "arm64" ]]; then + awscli_arch=aarch64 + terraform_arch=arm64 + platform_arch=${platform}_arm64 +elif [ "$arch" = "x86_64" ]; then + awscli_arch=x86_64 + terraform_arch=amd64 + platform_arch=${platform}_amd64 +else + echo "Unsupported architecture" + exit 1 +fi + +# install all our common tools +if [ "${platform}" == "linux" ]; then + DEBIAN_FRONTEND="noninteractive" \ + apt-get update + apt-get install -y \ + gnupg \ + software-properties-common \ + unzip \ + curl \ + git \ + python3-venv \ + gettext-base +elif [ "${platform}" == "darwin" ]; then + brew install \ + gnupg \ + unzip \ + curl \ + git \ + gettext +else + echo "Unsupported platform" + exit 1 +fi + +install_awscli() { + curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "${tempout}/awscli-bundle.zip" + unzip "${tempout}/awscli-bundle.zip" -d ${tempout} + python3 "${tempout}/awscli-bundle/install" -i /usr/local/aws -b /usr/local/bin/aws +} + +install_eksctl() { + curl -sL "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_${platform_arch}.tar.gz" \ + -o ${tempout}/eksctl.tar.gz + tar -xzf ${tempout}/eksctl.tar.gz -C /tmp + (mv /tmp/eksctl /usr/local/bin || sudo mv /tmp/eksctl /usr/local/bin) +} + +install_terraform() { + curl https://releases.hashicorp.com/terraform/${terraform_version}/terraform_${terraform_version}_${platform}_${terraform_arch}.zip \ + -o ${tempout}/terraform.zip + unzip ${tempout}/terraform.zip -d ${tempout} + (mv ${tempout}/terraform /usr/local/bin/ || sudo mv ${tempout}/terraform /usr/local/bin/) +} + +install_gcloud() { + curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz \ + -o ${tempout}/google-cloud-sdk.tar.gz + mkdir -p /usr/local/gcloud + tar -C /usr/local/gcloud -xvf ${tempout}/google-cloud-sdk.tar.gz + /usr/local/gcloud/google-cloud-sdk/install.sh + gcloud 
components install gke-gcloud-auth-plugin kubectl +} + +install_helm() { + curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 \ + -o ${tempout}/get_helm.sh + chmod 700 ${tempout}/get_helm.sh + ${tempout}/get_helm.sh +} + +if ! command -v aws &>/dev/null; then install_awscli; fi +if ! command -v eksctl &>/dev/null; then install_eksctl; fi +if ! command -v terraform &>/dev/null; then install_terraform; fi +if ! command -v gcloud &>/dev/null; then install_gcloud; fi +if ! command -v kubectl &>/dev/null; then install_gcloud; fi +if ! command -v helm &>/dev/null; then install_helm; fi + +rm -r ${tempout} + +echo "Installation complete!" From 2ede19397c1066f0d0fd122a250da5f97b2ef187 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 10:37:43 -0700 Subject: [PATCH 17/21] migrated to lowercase vars --- install/scripts/aws-down.sh | 8 +++--- install/scripts/aws-up.sh | 51 +++++++++++++++---------------------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index 3393ec11..bca62391 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -5,8 +5,8 @@ set -u # Required env variables: : "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -KUBERENTES_DIR=${SCRIPT_DIR}/../kubernetes +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +kubernetes_dir=${script_dir}/../kubernetes EKSCTL_ENABLE_CREDENTIAL_CACHE=1 export CLUSTER_NAME=substratus @@ -21,8 +21,8 @@ export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts kubectl delete deployments --namespace=kube-system --all) || true -envsubst <${KUBERENTES_DIR}/aws/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/aws/eks-cluster.yaml -eksctl delete cluster -f ${KUBERENTES_DIR}/aws/eks-cluster.yaml || true +envsubst <${kubernetes_dir}/aws/eks-cluster.yaml.tpl >${kubernetes_dir}/aws/eks-cluster.yaml +eksctl delete cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml || true aws cloudformation delete-stack \ --stack-name "Karpenter-${CLUSTER_NAME}" \ diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index b406a374..e686be7f 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -6,20 +6,19 @@ set -u # Required env variables: : "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY" -INSTALL_OPERATOR="${INSTALL_OPERATOR:-yes}" +install_operator="${INSTALL_OPERATOR:-yes}" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -KUBERENTES_DIR=${SCRIPT_DIR}/../kubernetes +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +kubernetes_dir=${script_dir}/../kubernetes -EKSCTL_ENABLE_CREDENTIAL_CACHE=1 +eksctl_enable_credential_cache=1 export CLUSTER_NAME=substratus export REGION=us-west-2 export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts -export KARPENTER_VERSION=v0.29.2 -export AWS_PARTITION="aws" -export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" -TEMPOUT=$(mktemp) +export karpenter_version=v0.29.2 +export karpenter_iam_role_arn="arn:aws:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" +tempout=$(mktemp) aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} \ --region ${REGION} >/dev/null || true @@ -29,17 +28,17 @@ aws ecr create-repository \ --region ${REGION} >/dev/null || true # install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ -curl 
-fsSL https://raw.githubusercontent.com/aws/karpenter/"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$TEMPOUT && +curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${karpenter_version}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$tempout && aws cloudformation deploy \ --stack-name "Karpenter-${CLUSTER_NAME}" \ - --template-file "${TEMPOUT}" \ + --template-file "${tempout}" \ --capabilities CAPABILITY_NAMED_IAM \ --parameter-overrides "ClusterName=${CLUSTER_NAME}" \ --region ${REGION} -envsubst <${KUBERENTES_DIR}/aws/eks-cluster.yaml.tpl >${KUBERENTES_DIR}/aws/eks-cluster.yaml -eksctl create cluster -f ${KUBERENTES_DIR}/aws/eks-cluster.yaml || - eksctl upgrade cluster -f ${KUBERENTES_DIR}/aws/eks-cluster.yaml +envsubst <${kubernetes_dir}/aws/eks-cluster.yaml.tpl >${kubernetes_dir}/aws/eks-cluster.yaml +eksctl create cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml || + eksctl upgrade cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml aws iam create-service-linked-role \ --aws-service-name spot.amazonaws.com || true @@ -53,9 +52,9 @@ helm registry logout public.ecr.aws || true helm upgrade \ --create-namespace \ --install karpenter oci://public.ecr.aws/karpenter/karpenter \ - --version ${KARPENTER_VERSION} \ + --version ${karpenter_version} \ --namespace karpenter \ - --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${karpenter_iam_role_arn} \ --set settings.aws.clusterName=${CLUSTER_NAME} \ --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \ @@ -65,16 +64,8 @@ helm upgrade \ --set controller.resources.limits.memory=1Gi \ --wait -envsubst <${KUBERENTES_DIR}/aws/karpenter-provisioner.yaml.tpl >${KUBERENTES_DIR}/aws/karpenter-provisioner.yaml -kubectl apply -f ${KUBERENTES_DIR}/aws/karpenter-provisioner.yaml - -# node-termination-handler: https://artifacthub.io/packages/helm/aws/aws-node-termination-handler -helm repo add eks https://aws.github.io/eks-charts -helm upgrade \ - --install aws-node-termination-handler \ - --namespace kube-system \ - --version 0.21.0 \ - eks/aws-node-termination-handler +envsubst <${kubernetes_dir}/aws/karpenter-provisioner.yaml.tpl >${kubernetes_dir}/aws/karpenter-provisioner.yaml +kubectl apply -f ${kubernetes_dir}/aws/karpenter-provisioner.yaml # nvidia-device-plugin: https://github.com/NVIDIA/k8s-device-plugin#deployment-via-helm helm repo add nvdp https://nvidia.github.io/k8s-device-plugin @@ -82,12 +73,12 @@ helm upgrade \ --install nvdp nvdp/nvidia-device-plugin \ --namespace nvidia-device-plugin \ --create-namespace \ - --values ${KUBERENTES_DIR}/aws/nvidia-eks-device-plugin.yaml \ + --values ${kubernetes_dir}/aws/nvidia-eks-device-plugin.yaml \ --version 0.14.1 # Install the substratus operator. 
-if [ "${INSTALL_OPERATOR}" == "yes" ]; then - kubectl apply -f ${KUBERENTES_DIR}/namespace.yaml - kubectl apply -f ${KUBERENTES_DIR}/config.yaml - kubectl apply -f ${KUBERENTES_DIR}/system.yaml +if [ "${install_operator}" == "yes" ]; then + kubectl apply -f ${kubernetes_dir}/namespace.yaml + kubectl apply -f ${kubernetes_dir}/config.yaml + kubectl apply -f ${kubernetes_dir}/system.yaml fi From db6e22979e6957897c6164ddad1694545f8389cc Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 10:42:42 -0700 Subject: [PATCH 18/21] consistent makefile target naming --- Makefile | 16 ++++++++-------- docs/development.md | 4 ++-- install/scripts/aws-up.sh | 28 ---------------------------- 3 files changed, 10 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index c28cfb2f..ae07a365 100644 --- a/Makefile +++ b/Makefile @@ -120,8 +120,8 @@ skaffold-dev-gcpmanager: protoc skaffold protogen render-skaffold-manifests ## R build: manifests generate fmt vet ## Build manager binary. go build -o bin/manager cmd/controllermanager/main.go -.PHONY: gcp-dev-up -gcp-dev-up: build-installer +.PHONY: dev-up-gcp +dev-up-gcp: build-installer docker run -it \ -v ${HOME}/.kube:/root/.kube \ -e PROJECT=$(shell gcloud config get project) \ @@ -132,8 +132,8 @@ gcp-dev-up: build-installer mkdir -p secrets gcloud iam service-accounts keys create --iam-account=substratus-gcp-manager@$(shell gcloud config get project).iam.gserviceaccount.com ./secrets/gcp-manager-key.json -.PHONY: gcp-dev-down -gcp-dev-down: build-installer +.PHONY: dev-down-gcp +dev-down-gcp: build-installer docker run -it \ -v ${HOME}/.kube:/root/.kube \ -e PROJECT=$(shell gcloud config get project) \ @@ -142,8 +142,8 @@ gcp-dev-down: build-installer substratus-installer gcp-down.sh rm ./secrets/gcp-manager-key.json -.PHONY: aws-dev-up -aws-dev-up: build-installer +.PHONY: dev-up-aws +dev-up-aws: build-installer docker run -it \ -v ${HOME}/.kube:/root/.kube \ -e AWS_ACCOUNT_ID="$(shell aws sts get-caller-identity --query Account --output text)" \ @@ -153,8 +153,8 @@ aws-dev-up: build-installer -e INSTALL_OPERATOR=false \ substratus-installer aws-up.sh -.PHONY: aws-dev-down -aws-dev-down: build-installer +.PHONY: dev-down-aws +dev-down-aws: build-installer docker run -it \ -v ${HOME}/.kube:/root/.kube \ -e AWS_ACCOUNT_ID="$(shell aws sts get-caller-identity --query Account --output text)" \ diff --git a/docs/development.md b/docs/development.md index d07e0311..a4453239 100644 --- a/docs/development.md +++ b/docs/development.md @@ -5,7 +5,7 @@ Create a GCP environment. ```sh -make gcp-dev-up +make dev-up-gcp ``` Run Substratus control plane locally. @@ -17,7 +17,7 @@ make dev-run-gcp Delete GCP infra. ```sh -make gcp-dev-down +make dev-down-gcp ``` TODO: Automate the cleanup of PVs... Don't forget to manually clean them up for now. 
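Until that PV cleanup is automated, the leftover disks can be removed by hand after `make dev-down-gcp`. A minimal sketch, assuming the orphaned disks are the unattached ones GKE created for PersistentVolumeClaims and that the cluster ran in `us-central1` (both assumptions — verify the listing before deleting anything):

```sh
# List unattached disks that GKE provisioned for PersistentVolumeClaims.
# The description filter is an assumption; review the output first.
gcloud compute disks list \
  --filter="-users:* AND description~kubernetes.io/created-for/pvc" \
  --format="table(name,zone,sizeGb)"

# Delete a confirmed leftover disk (zone must match the listing above).
gcloud compute disks delete <disk-name> --zone=us-central1-a --quiet
```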
diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index e686be7f..3b792cdc 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -16,8 +16,6 @@ export CLUSTER_NAME=substratus export REGION=us-west-2 export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts -export karpenter_version=v0.29.2 -export karpenter_iam_role_arn="arn:aws:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" tempout=$(mktemp) aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} \ @@ -27,15 +25,6 @@ aws ecr create-repository \ --repository-name ${ARTIFACTS_REPO_NAME} \ --region ${REGION} >/dev/null || true -# install karpenter: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/ -curl -fsSL https://raw.githubusercontent.com/aws/karpenter/"${karpenter_version}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$tempout && - aws cloudformation deploy \ - --stack-name "Karpenter-${CLUSTER_NAME}" \ - --template-file "${tempout}" \ - --capabilities CAPABILITY_NAMED_IAM \ - --parameter-overrides "ClusterName=${CLUSTER_NAME}" \ - --region ${REGION} - envsubst <${kubernetes_dir}/aws/eks-cluster.yaml.tpl >${kubernetes_dir}/aws/eks-cluster.yaml eksctl create cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml || eksctl upgrade cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml @@ -47,23 +36,6 @@ aws eks update-kubeconfig \ --region ${REGION} \ --name ${CLUSTER_NAME} -# Logout of helm registry to perform an unauthenticated pull against the public ECR -helm registry logout public.ecr.aws || true -helm upgrade \ - --create-namespace \ - --install karpenter oci://public.ecr.aws/karpenter/karpenter \ - --version ${karpenter_version} \ - --namespace karpenter \ - --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${karpenter_iam_role_arn} \ - --set settings.aws.clusterName=${CLUSTER_NAME} \ - --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ - --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \ - --set controller.resources.requests.cpu=1 \ - --set controller.resources.requests.memory=1Gi \ - --set controller.resources.limits.cpu=1 \ - --set controller.resources.limits.memory=1Gi \ - --wait - envsubst <${kubernetes_dir}/aws/karpenter-provisioner.yaml.tpl >${kubernetes_dir}/aws/karpenter-provisioner.yaml kubectl apply -f ${kubernetes_dir}/aws/karpenter-provisioner.yaml From 043161592bbde0ff03ac4f8d0bae434d23c23438 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 10:45:23 -0700 Subject: [PATCH 19/21] dropping probably not needed aws-down steps --- install/scripts/aws-down.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index bca62391..57d4ca5d 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -24,14 +24,6 @@ export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts envsubst <${kubernetes_dir}/aws/eks-cluster.yaml.tpl >${kubernetes_dir}/aws/eks-cluster.yaml eksctl delete cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml || true -aws cloudformation delete-stack \ - --stack-name "Karpenter-${CLUSTER_NAME}" \ - --region ${REGION} || true - -aws sqs delete-queue \ - --queue-url https://sqs.${REGION}.amazonaws.com/${AWS_ACCOUNT_ID}/substratus \ - --region ${REGION} || true - aws ecr delete-repository \ --repository-name ${ARTIFACTS_REPO_NAME} \ --region ${REGION} >/dev/null || true From 
5149163682db3c1084704e1e826a821f6ba9fe7c Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 11:06:31 -0700 Subject: [PATCH 20/21] improved caching on docker build. dropping some karpenter configs --- install/Dockerfile | 3 ++- install/kubernetes/aws/eks-cluster.yaml.tpl | 4 +--- install/kubernetes/aws/karpenter-provisioner.yaml.tpl | 2 +- install/scripts/aws-down.sh | 1 + install/scripts/aws-up.sh | 3 +-- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/install/Dockerfile b/install/Dockerfile index 91f6ebf0..fbe4f3e6 100644 --- a/install/Dockerfile +++ b/install/Dockerfile @@ -3,9 +3,10 @@ FROM ubuntu:23.04 ENV PATH $PATH:/workspace/scripts:/usr/local/gcloud/google-cloud-sdk/bin WORKDIR /workspace -COPY scripts scripts +COPY scripts/get-tools.sh scripts/ RUN scripts/get-tools.sh +COPY scripts scripts COPY terraform terraform COPY kubernetes kubernetes diff --git a/install/kubernetes/aws/eks-cluster.yaml.tpl b/install/kubernetes/aws/eks-cluster.yaml.tpl index f5a56eef..9c5de2ec 100644 --- a/install/kubernetes/aws/eks-cluster.yaml.tpl +++ b/install/kubernetes/aws/eks-cluster.yaml.tpl @@ -1,3 +1,4 @@ +# https://eksctl.io/usage/schema/ apiVersion: eksctl.io/v1alpha5 kind: ClusterConfig metadata: @@ -10,12 +11,9 @@ metadata: karpenter.sh/discovery: ${CLUSTER_NAME} karpenter: - createServiceAccount: true withSpotInterruptionQueue: true - defaultInstanceProfile: "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" version: "v0.29.0" -# if karpenter doesn't suffice: https://github.com/eksctl-io/eksctl/blob/main/examples/23-kubeflow-spot-instance.yaml managedNodeGroups: - name: builder-ng privateNetworking: true diff --git a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl index 387bcee5..608cef04 100644 --- a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl +++ b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl @@ -3,7 +3,7 @@ kind: AWSNodeTemplate metadata: name: default spec: - instanceProfile: KarpenterControllerPolicy-${CLUSTER_NAME} + instanceProfile: eksctl-KarpenterNodeInstanceProfile-${CLUSTER_NAME} subnetSelector: karpenter.sh/discovery: ${CLUSTER_NAME} securityGroupSelector: diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index 57d4ca5d..2d31a3f5 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -5,6 +5,7 @@ set -u # Required env variables: : "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY" + script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" kubernetes_dir=${script_dir}/../kubernetes diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index 3b792cdc..c41d5f46 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -5,13 +5,12 @@ set -u # Required env variables: : "$AWS_ACCOUNT_ID $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY" - install_operator="${INSTALL_OPERATOR:-yes}" script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" kubernetes_dir=${script_dir}/../kubernetes -eksctl_enable_credential_cache=1 +EKSCTL_ENABLE_CREDENTIAL_CACHE=1 export CLUSTER_NAME=substratus export REGION=us-west-2 export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} From 76ff2c9a997c196c7e1bbc7d193c9de2e060fbd0 Mon Sep 17 00:00:00 2001 From: Brandon Bjelland Date: Wed, 9 Aug 2023 22:19:21 -0700 Subject: [PATCH 21/21] everything is working consistently. 
shipping it --- install/kubernetes/aws/eks-cluster.yaml.tpl | 8 ++--- .../aws/karpenter-provisioner.yaml.tpl | 2 +- install/scripts/aws-down.sh | 4 +++ install/scripts/aws-up.sh | 29 +++++++++++++++++-- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/install/kubernetes/aws/eks-cluster.yaml.tpl b/install/kubernetes/aws/eks-cluster.yaml.tpl index 9c5de2ec..f9bee6e5 100644 --- a/install/kubernetes/aws/eks-cluster.yaml.tpl +++ b/install/kubernetes/aws/eks-cluster.yaml.tpl @@ -10,10 +10,6 @@ metadata: environment: dev karpenter.sh/discovery: ${CLUSTER_NAME} -karpenter: - withSpotInterruptionQueue: true - version: "v0.29.0" - managedNodeGroups: - name: builder-ng privateNetworking: true @@ -41,7 +37,7 @@ addons: - name: coredns iamIdentityMappings: - - arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}" + - arn: "arn:aws:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}" username: system:node:{{EC2PrivateDNSName}} groups: - system:bootstrappers @@ -56,7 +52,7 @@ iam: roleName: ${CLUSTER_NAME}-karpenter attachPolicyARNs: # this is used as spec.instanceProfile in the karpenter AWSNodeTemplate - - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME} + - arn:aws:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME} roleOnly: true - metadata: name: ebs-csi-controller-sa diff --git a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl index 608cef04..951e0aca 100644 --- a/install/kubernetes/aws/karpenter-provisioner.yaml.tpl +++ b/install/kubernetes/aws/karpenter-provisioner.yaml.tpl @@ -3,7 +3,7 @@ kind: AWSNodeTemplate metadata: name: default spec: - instanceProfile: eksctl-KarpenterNodeInstanceProfile-${CLUSTER_NAME} + instanceProfile: KarpenterNodeInstanceProfile-${CLUSTER_NAME} subnetSelector: karpenter.sh/discovery: ${CLUSTER_NAME} securityGroupSelector: diff --git a/install/scripts/aws-down.sh b/install/scripts/aws-down.sh index 2d31a3f5..53ca7934 100755 --- a/install/scripts/aws-down.sh +++ b/install/scripts/aws-down.sh @@ -25,6 +25,10 @@ export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts envsubst <${kubernetes_dir}/aws/eks-cluster.yaml.tpl >${kubernetes_dir}/aws/eks-cluster.yaml eksctl delete cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml || true +aws cloudformation delete-stack \ + --stack-name "Karpenter-${CLUSTER_NAME}" \ + --region ${REGION} || true + aws ecr delete-repository \ --repository-name ${ARTIFACTS_REPO_NAME} \ --region ${REGION} >/dev/null || true diff --git a/install/scripts/aws-up.sh b/install/scripts/aws-up.sh index c41d5f46..0fee3e8a 100755 --- a/install/scripts/aws-up.sh +++ b/install/scripts/aws-up.sh @@ -11,10 +11,12 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" kubernetes_dir=${script_dir}/../kubernetes EKSCTL_ENABLE_CREDENTIAL_CACHE=1 +karpenter_version=v0.29.2 export CLUSTER_NAME=substratus export REGION=us-west-2 export ARTIFACTS_REPO_NAME=${CLUSTER_NAME} export ARTIFACTS_BUCKET_NAME=${AWS_ACCOUNT_ID}-${CLUSTER_NAME}-artifacts +karpenter_iam_role_arn="arn:aws:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter" tempout=$(mktemp) aws s3 mb s3://${ARTIFACTS_BUCKET_NAME} \ @@ -24,6 +26,14 @@ aws ecr create-repository \ --repository-name ${ARTIFACTS_REPO_NAME} \ --region ${REGION} >/dev/null || true +curl -fsSL 
https://raw.githubusercontent.com/aws/karpenter/"${karpenter_version}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml >$tempout +aws cloudformation deploy \ + --stack-name "Karpenter-${CLUSTER_NAME}" \ + --template-file "${tempout}" \ + --capabilities CAPABILITY_NAMED_IAM \ + --region ${REGION} \ + --parameter-overrides "ClusterName=${CLUSTER_NAME}" + envsubst <${kubernetes_dir}/aws/eks-cluster.yaml.tpl >${kubernetes_dir}/aws/eks-cluster.yaml eksctl create cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml || eksctl upgrade cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml @@ -31,9 +41,22 @@ eksctl create cluster -f ${kubernetes_dir}/aws/eks-cluster.yaml || aws iam create-service-linked-role \ --aws-service-name spot.amazonaws.com || true -aws eks update-kubeconfig \ - --region ${REGION} \ - --name ${CLUSTER_NAME} +# Logout of helm registry to perform an unauthenticated pull against the public ECR +helm registry logout public.ecr.aws || true +helm upgrade \ + --create-namespace \ + --install karpenter oci://public.ecr.aws/karpenter/karpenter \ + --version ${karpenter_version} \ + --namespace karpenter \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${karpenter_iam_role_arn} \ + --set settings.aws.clusterName=${CLUSTER_NAME} \ + --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ + --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \ + --set controller.resources.requests.cpu=1 \ + --set controller.resources.requests.memory=1Gi \ + --set controller.resources.limits.cpu=1 \ + --set controller.resources.limits.memory=1Gi \ + --wait envsubst <${kubernetes_dir}/aws/karpenter-provisioner.yaml.tpl >${kubernetes_dir}/aws/karpenter-provisioner.yaml kubectl apply -f ${kubernetes_dir}/aws/karpenter-provisioner.yaml
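Once `aws-up.sh` completes, the Karpenter provisioner above can be smoke-tested by scheduling a pod that tolerates the `nvidia.com/gpu` taint and requests a GPU; Karpenter should then launch a node from one of the allowed instance families. A hedged sketch — the pod name and CUDA image tag are illustrative, not part of the installer:

```sh
# A pending GPU pod should trigger Karpenter to provision a GPU node.
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
  containers:
    - name: cuda
      image: nvidia/cuda:12.2.0-base-ubuntu22.04
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF

# Watch a node come up, check the pod output, then clean up.
kubectl get nodes -w
kubectl logs pod/gpu-smoke-test
kubectl delete pod gpu-smoke-test
```

If the pod stays Pending, `kubectl describe pod gpu-smoke-test` and the controller logs (`kubectl logs -n karpenter deploy/karpenter`) usually show which requirement could not be satisfied.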