From 2e3f211b1c4280203dc2cfdc148f95258189cff2 Mon Sep 17 00:00:00 2001 From: Matheus Fidelis Date: Fri, 11 Oct 2024 18:43:38 -0300 Subject: [PATCH] eks upgrade and disruption queue for karpenter --- README.md | 34 +++++++--- aws-auth-config.tf | 5 ++ helm_argo_rollouts.tf | 28 ++++---- helm_karpenter.tf | 149 ++++++++++++++++++++++++++++++++++++++++++ iam_karpenter.tf | 3 +- variables.tf | 14 ++-- 6 files changed, 201 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 3fa7024..a9762e0 100644 --- a/README.md +++ b/README.md @@ -65,11 +65,21 @@ No modules. | Name | Type | |------|------| | [aws_api_gateway_vpc_link.nlb](https://registry.terraform.io/providers/aws/latest/docs/resources/api_gateway_vpc_link) | resource | +| [aws_cloudwatch_event_rule.karpenter_termination_handler_instance_terminate](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_rule.karpenter_termination_handler_rebalance](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_rule.karpenter_termination_handler_scheduled_change](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_rule.karpenter_termination_handler_spot_termination](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_rule.karpenter_termination_handler_state_change](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.node_termination_handler_instance_terminate](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.node_termination_handler_rebalance](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.node_termination_handler_scheduled_change](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.node_termination_handler_spot_termination](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.node_termination_handler_state_change](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.karpenter_termination_handler_instance_terminate](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_event_target.karpenter_termination_handler_rebalance](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_event_target.karpenter_termination_handler_scheduled_change](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_event_target.karpenter_termination_handler_spot_termination](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_event_target.karpenter_termination_handler_state_change](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_event_target.node_termination_handler_instance_terminate](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_event_target.node_termination_handler_rebalance](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_event_target.node_termination_handler_scheduled_change](https://registry.terraform.io/providers/aws/latest/docs/resources/cloudwatch_event_target) | resource | @@ -148,7 +158,9 @@ No modules. | [aws_security_group_rule.nodeport](https://registry.terraform.io/providers/aws/latest/docs/resources/security_group_rule) | resource | | [aws_security_group_rule.nodeport_cluster](https://registry.terraform.io/providers/aws/latest/docs/resources/security_group_rule) | resource | | [aws_security_group_rule.nodeport_cluster_udp](https://registry.terraform.io/providers/aws/latest/docs/resources/security_group_rule) | resource | +| [aws_sqs_queue.karpenter_termination_handler](https://registry.terraform.io/providers/aws/latest/docs/resources/sqs_queue) | resource | | [aws_sqs_queue.node_termination_handler](https://registry.terraform.io/providers/aws/latest/docs/resources/sqs_queue) | resource | +| [aws_sqs_queue_policy.karpenter_termination_handler](https://registry.terraform.io/providers/aws/latest/docs/resources/sqs_queue_policy) | resource | | [aws_sqs_queue_policy.node_termination_handler](https://registry.terraform.io/providers/aws/latest/docs/resources/sqs_queue_policy) | resource | | [aws_subnet.pods_subnet_1a](https://registry.terraform.io/providers/aws/latest/docs/resources/subnet) | resource | | [aws_subnet.pods_subnet_1b](https://registry.terraform.io/providers/aws/latest/docs/resources/subnet) | resource | @@ -184,8 +196,8 @@ No modules. | [kubectl_manifest.istio_target_group_binding_https](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.jaeger_gateway](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.jaeger_virtual_service](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | -| [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | -| [kubectl_manifest.karpenter_template](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.karpenter_node_class](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.karpenter_node_pool](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.kiali_gateway](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.kiali_virtual_service](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.rollouts_gateway](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | @@ -215,13 +227,13 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [addon\_cni\_version](#input\_addon\_cni\_version) | Specifies the version of the AWS VPC CNI (Container Network Interface) plugin to use, which manages the network interfaces for pod networking. | `string` | `"v1.18.3-eksbuild.1"` | no | -| [addon\_coredns\_version](#input\_addon\_coredns\_version) | Defines the version of CoreDNS to use, a DNS server/forwarder that is integral to internal Kubernetes DNS resolution. | `string` | `"v1.11.1-eksbuild.9"` | no | -| [addon\_csi\_version](#input\_addon\_csi\_version) | Indicates the version of the Container Storage Interface (CSI) driver to use for managing storage volumes in Kubernetes. | `string` | `"v1.26.1-eksbuild.1"` | no | -| [addon\_kubeproxy\_version](#input\_addon\_kubeproxy\_version) | Sets the version of Kubeproxy to be used, which handles Kubernetes network services like forwarding the requests to correct containers. | `string` | `"v1.30.0-eksbuild.3"` | no | +| [addon\_cni\_version](#input\_addon\_cni\_version) | Specifies the version of the AWS VPC CNI (Container Network Interface) plugin to use, which manages the network interfaces for pod networking. | `string` | `"v1.18.3-eksbuild.2"` | no | +| [addon\_coredns\_version](#input\_addon\_coredns\_version) | Defines the version of CoreDNS to use, a DNS server/forwarder that is integral to internal Kubernetes DNS resolution. | `string` | `"v1.11.3-eksbuild.1"` | no | +| [addon\_csi\_version](#input\_addon\_csi\_version) | Indicates the version of the Container Storage Interface (CSI) driver to use for managing storage volumes in Kubernetes. | `string` | `"v1.35.0-eksbuild.1"` | no | +| [addon\_kubeproxy\_version](#input\_addon\_kubeproxy\_version) | Sets the version of Kubeproxy to be used, which handles Kubernetes network services like forwarding the requests to correct containers. | `string` | `"v1.31.0-eksbuild.5"` | no | | [argo\_rollouts\_toggle](#input\_argo\_rollouts\_toggle) | Enables the installation of Argo Rollouts, providing advanced deployment strategies like Canary and Blue-Green deployments in Kubernetes. | `bool` | `true` | no | | [argo\_rollouts\_virtual\_service\_host](#input\_argo\_rollouts\_virtual\_service\_host) | The hostname for the Argo Rollouts virtual service, used for advanced deployment capabilities like canary and blue-green deployments in Kubernetes. | `string` | `"argo-rollouts.k8s.raj.ninja"` | no | -| [auto\_scale\_options](#input\_auto\_scale\_options) | Configuration for the EKS cluster auto-scaling. It includes the minimum (min), maximum (max), and desired (desired) number of worker nodes. | `map` |
{
"desired": 6,
"max": 10,
"min": 4
}
| no | +| [auto\_scale\_options](#input\_auto\_scale\_options) | Configuration for the EKS cluster auto-scaling. It includes the minimum (min), maximum (max), and desired (desired) number of worker nodes. | `map` |
{
"desired": 4,
"max": 10,
"min": 4
}
| no | | [aws\_region](#input\_aws\_region) | AWS region where the EKS cluster will be deployed. This should be set to the region where you want your Kubernetes resources to reside. | `string` | `"us-east-1"` | no | | [chaos\_mesh\_toggle](#input\_chaos\_mesh\_toggle) | Determines whether to install Chaos Mesh, a cloud-native Chaos Engineering platform that orchestrates chaos experiments on Kubernetes environments. | `bool` | `false` | no | | [cluster\_autoscaler\_toggle](#input\_cluster\_autoscaler\_toggle) | Enable or disable the Cluster Autoscaler installation. When true, Cluster Autoscaler is installed to automatically adjust the number of nodes in the cluster. | `bool` | `false` | no | @@ -230,16 +242,18 @@ No modules. | [default\_tags](#input\_default\_tags) | A map of default tags to apply to all resources. These tags can help with identifying and organizing resources within the AWS environment. | `map(string)` |
{
"Environment": "prod",
"Foo": "Bar",
"Ping": "Pong"
}
| no | | [descheduler\_toggle](#input\_descheduler\_toggle) | Controls the installation of the Descheduler, a tool to balance and optimize the distribution of Pods across the cluster for improved efficiency. | `bool` | `false` | no | | [enable\_cross\_zone\_load\_balancing](#input\_enable\_cross\_zone\_load\_balancing) | Controls whether cross-zone load balancing is enabled for the Network Load Balancer, allowing even traffic distribution across all zones. | `bool` | `false` | no | +| [enable\_jaeger](#input\_enable\_jaeger) | Flag to create jaeger standalone stack | `bool` | `false` | no | | [enable\_managed\_prometheus](#input\_enable\_managed\_prometheus) | Determines if the managed Prometheus service should be enabled. Managed Prometheus provides a fully managed monitoring service compatible with Prometheus. | `bool` | `false` | no | | [enable\_prometheus\_stack](#input\_enable\_prometheus\_stack) | n/a | `bool` | `true` | no | -| [enable\_vpc\_link](#input\_enable\_vpc\_link) | Create VPC Link associated to Network Load Balancing | `bool` | `true` | no | +| [enable\_vpc\_link](#input\_enable\_vpc\_link) | Create VPC Link associated to Network Load Balancing | `bool` | `false` | no | | [grafana\_virtual\_service\_host](#input\_grafana\_virtual\_service\_host) | The hostname for the Grafana virtual service, used in Istio routing. This host is used to access Grafana dashboards for monitoring metrics. | `string` | `"grafana.k8s.raj.ninja"` | no | | [istio\_ingress\_max\_pods](#input\_istio\_ingress\_max\_pods) | The maximum number of pods to scale up for the Istio ingress gateway. This limits the resources used and manages the scaling behavior. | `number` | `9` | no | | [istio\_ingress\_min\_pods](#input\_istio\_ingress\_min\_pods) | The minimum number of pods to maintain for the Istio ingress gateway. This ensures basic availability and load handling. | `number` | `3` | no | | [jaeger\_virtual\_service\_host](#input\_jaeger\_virtual\_service\_host) | The hostname for the Jaeger virtual service, used for tracing and monitoring microservices within the Istio service mesh. | `string` | `"jaeger.k8s.raj.ninja"` | no | -| [k8s\_version](#input\_k8s\_version) | The version of Kubernetes to use for the EKS cluster. This version should be compatible with the AWS EKS service and other infrastructure components. | `string` | `"1.30"` | no | +| [k8s\_version](#input\_k8s\_version) | The version of Kubernetes to use for the EKS cluster. This version should be compatible with the AWS EKS service and other infrastructure components. | `string` | `"1.31"` | no | | [karpenter\_availability\_zones](#input\_karpenter\_availability\_zones) | A list of AWS availability zones where Karpenter should launch nodes. These zones should be in the same region as the EKS cluster. | `list(any)` |
[
"us-east-1a",
"us-east-1b",
"us-east-1c"
]
| no | | [karpenter\_capacity\_type](#input\_karpenter\_capacity\_type) | Defines the capacity types for provisioning instances in the cluster, such as 'spot' or 'on\_demand', offering cost-saving options or consistent availability respectively. | `list(any)` |
[
"spot"
]
| no | +| [karpenter\_ec2\_node\_family](#input\_karpenter\_ec2\_node\_family) | n/a | `string` | `"Bottlerocket"` | no | | [karpenter\_instance\_family](#input\_karpenter\_instance\_family) | Defines a list of EC2 instance families to be considered by Karpenter for node provisioning. Instance families like 'c6' and 'c5' offer different compute capabilities. | `list(any)` |
[
"c6",
"c6a",
"c5"
]
| no | | [karpenter\_instance\_sizes](#input\_karpenter\_instance\_sizes) | Specifies a list of instance sizes within the chosen instance families to allow diversity in the provisioned nodes by Karpenter. | `list(any)` |
[
"large",
"2xlarge"
]
| no | | [karpenter\_toggle](#input\_karpenter\_toggle) | Determines whether Karpenter is enabled for the EKS cluster. Karpenter is an open-source auto-scaler for Kubernetes clusters. | `bool` | `true` | no | @@ -253,7 +267,7 @@ No modules. | [nlb\_ingress\_enable\_termination\_protection](#input\_nlb\_ingress\_enable\_termination\_protection) | Determines if termination protection is enabled for the Network Load Balancer, preventing accidental deletion. | `bool` | `false` | no | | [nlb\_ingress\_internal](#input\_nlb\_ingress\_internal) | Indicates whether the Network Load Balancer (NLB) for the EKS cluster should be internal, restricting access to within the AWS network. | `bool` | `false` | no | | [nlb\_ingress\_type](#input\_nlb\_ingress\_type) | Specifies the type of ingress to be used, such as 'network', determining how the NLB handles incoming traffic to the EKS cluster. | `string` | `"network"` | no | -| [node\_termination\_handler\_toggle](#input\_node\_termination\_handler\_toggle) | Enables the AWS Node Termination Handler, which ensures that Kubernetes workloads are gracefully handled during EC2 instance terminations or disruptions. | `bool` | `true` | no | +| [node\_termination\_handler\_toggle](#input\_node\_termination\_handler\_toggle) | Enables the AWS Node Termination Handler, which ensures that Kubernetes workloads are gracefully handled during EC2 instance terminations or disruptions. | `bool` | `false` | no | | [nodes\_instances\_sizes](#input\_nodes\_instances\_sizes) | A list of EC2 instance types to use for the EKS worker nodes. These instance types should balance between cost, performance, and resource requirements for your workload. | `list` |
[
"t3.large"
]
| no | | [proxy\_protocol\_v2](#input\_proxy\_protocol\_v2) | Enables or disables Proxy Protocol v2 on the Network Load Balancer, used for preserving client IP addresses and other connection information. | `bool` | `false` | no | diff --git a/aws-auth-config.tf b/aws-auth-config.tf index 6a9e370..5e7f4e8 100644 --- a/aws-auth-config.tf +++ b/aws-auth-config.tf @@ -12,6 +12,11 @@ resource "kubernetes_config_map" "aws-auth" { - system:bootstrappers - system:nodes - system:node-proxier +- username: system:anonymous + groups: + - system:bootstrappers + - system:nodes + - system:node-proxier YAML } diff --git a/helm_argo_rollouts.tf b/helm_argo_rollouts.tf index 989a7a2..d2246b2 100644 --- a/helm_argo_rollouts.tf +++ b/helm_argo_rollouts.tf @@ -20,20 +20,20 @@ resource "helm_release" "argo_rollouts" { value = true } - # set { - # name = "podAnnotations.prometheus\\.io/scrape" - # value = true - # } - - # set { - # name = "podAnnotations.prometheus\\.io/path" - # value = "/metrics" - # } - - # set { - # name = "podAnnotations.prometheus\\.io/port" - # value = "8090" - # } + set { + name = "podAnnotations.prometheus\\.io/scrape" + value = true + } + + set { + name = "podAnnotations.prometheus\\.io/path" + value = "/metrics" + } + + set { + name = "podAnnotations.prometheus\\.io/port" + value = "8090" + } depends_on = [ diff --git a/helm_karpenter.tf b/helm_karpenter.tf index 28edf5f..d52e00f 100644 --- a/helm_karpenter.tf +++ b/helm_karpenter.tf @@ -23,6 +23,11 @@ resource "helm_release" "karpenter" { value = aws_eks_cluster.main.endpoint } + set { + name = "settings.interruptionQueue" + value = aws_sqs_queue.karpenter_termination_handler[count.index].name + } + set { name = "aws.defaultInstanceProfile" value = aws_iam_instance_profile.nodes.name @@ -112,3 +117,147 @@ resource "aws_launch_template" "karpenter" { } } } + + +resource "aws_sqs_queue" "karpenter_termination_handler" { + count = var.karpenter_toggle ? 1 : 0 + name = format("%s-karpenter", var.cluster_name) + delay_seconds = 0 + max_message_size = 2048 + message_retention_seconds = 86400 + receive_wait_time_seconds = 10 + visibility_timeout_seconds = 60 +} + +resource "aws_sqs_queue_policy" "karpenter_termination_handler" { + count = var.karpenter_toggle ? 1 : 0 + queue_url = aws_sqs_queue.karpenter_termination_handler[count.index].id + policy = <