Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(eks/cijenkinsio-agents-2): setup technical addons for taint and toleration + add autoscaler #67

Merged
merged 16 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions ci.jenkins.io.tf
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ resource "aws_instance" "ci_jenkins_io" {

disable_api_termination = true # Protect ourselves from accidental deletion

user_data = templatefile("${path.root}/.shared-tools/terraform/cloudinit.tftpl", { hostname = local.ci_jenkins_io_fqdn, admin_username = "ubuntu" })
user_data = templatefile("${path.root}/.shared-tools/terraform/cloudinit.tftpl", { hostname = local.ci_jenkins_io["controller_vm_fqdn"], admin_username = "ubuntu" })

root_block_device {
delete_on_termination = false # Even if we terminate the machine
Expand Down Expand Up @@ -133,22 +133,22 @@ resource "aws_instance" "ci_jenkins_io" {
### DNS Zone delegated from Azure DNS (jenkins-infra/azure-net)
# `updatecli` maintains sync between the 2 repositories using the infra reports (see outputs.tf)
resource "aws_route53_zone" "aws_ci_jenkins_io" {
name = local.ci_jenkins_io_fqdn
name = local.ci_jenkins_io["controller_vm_fqdn"]

tags = local.common_tags
}

resource "aws_route53_record" "a_aws_ci_jenkins_io" {
zone_id = aws_route53_zone.aws_ci_jenkins_io.zone_id
name = local.ci_jenkins_io_fqdn
name = local.ci_jenkins_io["controller_vm_fqdn"]
type = "A"
ttl = 60
records = [aws_eip.ci_jenkins_io.public_ip]
}

resource "aws_route53_record" "aaaa_aws_ci_jenkins_io" {
zone_id = aws_route53_zone.aws_ci_jenkins_io.zone_id
name = local.ci_jenkins_io_fqdn
name = local.ci_jenkins_io["controller_vm_fqdn"]
type = "AAAA"
ttl = 60
records = aws_instance.ci_jenkins_io.ipv6_addresses
Expand Down
93 changes: 75 additions & 18 deletions eks-cijenkinsio-agents-2.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ module "cijenkinsio-agents-2" {
subnet_ids = slice(module.vpc.private_subnets, 1, 3)

# Required to allow EKS service accounts to authenticate to AWS API through OIDC (and assume IAM roles)
# useful for autoscaler, EKS addons and any AWS APi usage
# useful for autoscaler, EKS addons and any AWS API usage
enable_irsa = true

# Allow the terraform CI IAM user to be co-owner of the cluster
enable_cluster_creator_admin_permissions = true

# avoid using config map to specify admin accesses (decrease attack surface)
# Avoid using config map to specify admin accesses (decrease attack surface)
authentication_mode = "API"

access_entries = {
Expand Down Expand Up @@ -65,40 +65,47 @@ module "cijenkinsio-agents-2" {

create_cluster_primary_security_group_tags = false

# Do not use interpolated values from `local` in either keys or values of provided tags (or `cluster_tags`)
# to avoid having an implicit dependency on a resource not available when parsing the module (infamous error `Error: Invalid for_each argument`)
# Ref. same error as having a `depends_on` in https://github.com/terraform-aws-modules/terraform-aws-eks/issues/2337
tags = merge(local.common_tags, {
GithubRepo = "terraform-aws-sponsorship"
GithubOrg = "jenkins-infra"

associated_service = "eks/cijenkinsio-agents-2"
})

# VPC is defined in vpc.tf
vpc_id = module.vpc.vpc_id

## Manage EKS addons with module - https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_addon
# See new versions with `aws eks describe-addon-versions --kubernetes-version <k8s-version> --addon-name <addon>`
cluster_addons = {
# https://docs.aws.amazon.com/cli/latest/reference/eks/describe-addon-versions.html
coredns = {
# https://docs.aws.amazon.com/cli/latest/reference/eks/describe-addon-versions.html
# TODO: track with updatecli
addon_version = "v1.11.3-eksbuild.2"
configuration_values = jsonencode({
"tolerations" = local.cijenkinsio_agents_2["tolerations"]["applications"],
})
}
# Kube-proxy on an Amazon EKS cluster has the same compatibility and skew policy as Kubernetes
# See https://kubernetes.io/releases/version-skew-policy/#kube-proxy
kube-proxy = {
# https://docs.aws.amazon.com/cli/latest/reference/eks/describe-addon-versions.html
# TODO: track with updatecli
addon_version = "v1.29.10-eksbuild.3"
}
# https://github.com/aws/amazon-vpc-cni-k8s/releases
vpc-cni = {
# https://docs.aws.amazon.com/cli/latest/reference/eks/describe-addon-versions.html
# TODO: track with updatecli
addon_version = "v1.19.0-eksbuild.1"
configuration_values = jsonencode({
"tolerations" = local.cijenkinsio_agents_2["tolerations"]["applications"],
})
}
eks-pod-identity-agent = {
# https://docs.aws.amazon.com/cli/latest/reference/eks/describe-addon-versions.html
# TODO: track with updatecli
addon_version = "v1.3.4-eksbuild.1"
configuration_values = jsonencode({
"tolerations" = local.cijenkinsio_agents_2["tolerations"]["applications"],
})
}
## https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/CHANGELOG.md
# aws-ebs-csi-driver = {
Expand All @@ -107,11 +114,13 @@ module "cijenkinsio-agents-2" {
# # TODO specify service account
# # service_account_role_arn = module.cijenkinsio-agents-2_irsa_ebs.iam_role_arn
# }
# locals: ebs_account_namespace = "kube-system"
# locals: ebs_account_name = "ebs-csi-controller-sa"
}

eks_managed_node_groups = {
# This worker pool is expected to host the "technical" services such as cluster-autoscaler, data cluster-agent, ACP, etc.
tiny_ondemand_linux = {
# This worker pool is expected to host the "technical" services such as pod autoscaler, etc.
name = "tiny-ondemand-linux"

instance_types = ["t4g.large"] # 2vcpu 8Gio
Expand All @@ -122,7 +131,19 @@ module "cijenkinsio-agents-2" {
max_size = 3
desired_size = 1

subnet_ids = slice(module.vpc.private_subnets, 1, 2) # Only 1 subnet in 1 AZ
subnet_ids = slice(module.vpc.private_subnets, 1, 2) # Only 1 subnet in 1 AZ (for EBS)

labels = {
jenkins = local.ci_jenkins_io["service_fqdn"]
role = "applications"
}
taints = { for toleration_key, toleration_value in local.cijenkinsio_agents_2["tolerations"]["applications"] :
toleration_key => {
key = toleration_value["key"],
value = toleration_value.value
effect = local.toleration_taint_effects[toleration_value.effect]
}
}
},
}

Expand All @@ -149,18 +170,55 @@ module "cijenkinsio-agents-2" {
}
}

# Configure the jenkins-infra/kubernetes-management admin service account
# IAM role for the cluster-autoscaler service account, bound to the EKS cluster's OIDC provider (IRSA).
module "autoscaler_irsa_role" {
  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
  # TODO track with updatecli
  version = "5.48.0"

  role_name = "${module.cijenkinsio-agents-2.cluster_name}-cluster-autoscaler"

  # Cluster autoscaler policy provided by the module, given the name of this cluster
  attach_cluster_autoscaler_policy = true
  cluster_autoscaler_cluster_names = [module.cijenkinsio-agents-2.cluster_name]

  oidc_providers = {
    main = {
      provider_arn = module.cijenkinsio-agents-2.oidc_provider_arn
      # Single "<namespace>:<serviceaccount>" entry built from the autoscaler locals
      namespace_service_accounts = [
        join(":", [
          local.cijenkinsio_agents_2["autoscaler"]["namespace"],
          local.cijenkinsio_agents_2["autoscaler"]["serviceaccount"],
        ]),
      ]
    }
  }

  tags = local.common_tags
}

# Authentication token for the `cijenkinsio-agents-2` EKS cluster, obtained with the AWS IAM identity.
# Read as `token` by the kubernetes and helm providers aliased `cijenkinsio-agents-2`.
data "aws_eks_cluster_auth" "cijenkinsio-agents-2" {
name = module.cijenkinsio-agents-2.cluster_name
}

provider "kubernetes" {
alias = "cijenkinsio-agents-2"
host = module.cijenkinsio-agents-2.cluster_endpoint
cluster_ca_certificate = base64decode(module.cijenkinsio-agents-2.cluster_certificate_authority_data)
token = data.aws_eks_cluster_auth.cijenkinsio-agents-2.token
### Install Cluster Autoscaler
# Installs the `cluster-autoscaler` chart through the helm provider aliased `cijenkinsio-agents-2`.
# Chart values are rendered from a template fed with the region, cluster name, IRSA role ARN,
# service account name, and the labels/tolerations of the `tiny_ondemand_linux` node group.
resource "helm_release" "cluster-autoscaler" {
  provider   = helm.cijenkinsio-agents-2
  name       = "cluster-autoscaler"
  repository = "https://kubernetes.github.io/autoscaler"
  chart      = "cluster-autoscaler"
  # TODO: track with updatecli
  version          = "9.43.2"
  create_namespace = true
  namespace        = local.cijenkinsio_agents_2["autoscaler"]["namespace"]

  values = [
    # Anchor the template path on the module directory (instead of the process working directory)
    # so rendering does not depend on where terraform is invoked from.
    templatefile("${path.module}/helm/cluster-autoscaler-values.yaml.tfpl", {
      region             = local.region,
      serviceAccountName = local.cijenkinsio_agents_2["autoscaler"]["serviceaccount"],
      autoscalerRoleArn  = module.autoscaler_irsa_role.iam_role_arn,
      clusterName        = module.cijenkinsio-agents-2.cluster_name,
      # Schedule the autoscaler on the "applications" node group: select its labels and tolerate its taints
      nodeSelectors   = module.cijenkinsio-agents-2.eks_managed_node_groups["tiny_ondemand_linux"].node_group_labels,
      nodeTolerations = local.cijenkinsio_agents_2["tolerations"]["applications"],
    })
  ]
}

### Define admin credential to be used in jenkins-infra/kubernetes-management
module "cijenkinsio-agents-2_admin_sa" {
providers = {
kubernetes = kubernetes.cijenkinsio-agents-2
Expand All @@ -170,7 +228,6 @@ module "cijenkinsio-agents-2_admin_sa" {
cluster_hostname = module.cijenkinsio-agents-2.cluster_endpoint
cluster_ca_certificate_b64 = module.cijenkinsio-agents-2.cluster_certificate_authority_data
}

output "kubeconfig_cijenkinsio-agents-2" {
sensitive = true
value = module.cijenkinsio-agents-2_admin_sa.kubeconfig
Expand Down
25 changes: 25 additions & 0 deletions helm/cluster-autoscaler-values.yaml.tfpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
awsRegion: ${region}
smerle33 marked this conversation as resolved.
Show resolved Hide resolved

nodeSelector:
%{ for label_key, label_value in nodeSelectors ~}
${label_key}: "${label_value}"
%{ endfor ~}
smerle33 marked this conversation as resolved.
Show resolved Hide resolved

tolerations:
${yamlencode(nodeTolerations)}

extraArgs:
balance-similar-node-groups: true
replicaCount: 2

rbac:
create: true
serviceAccount:
name: "${serviceAccountName}"
annotations:
eks.amazonaws.com/role-arn: "${autoscalerRoleArn}"

autoDiscovery:
enabled: true
clusterName: "${clusterName}"
36 changes: 29 additions & 7 deletions locals.tf
Original file line number Diff line number Diff line change
@@ -1,17 +1,39 @@
locals {
aws_account_id = "326712726440"
region = "us-east-2"
autoscaler_account_namespace = "autoscaler"
autoscaler_account_name = "cluster-autoscaler-aws-cluster-autoscaler-chart"
ebs_account_namespace = "kube-system"
ebs_account_name = "ebs-csi-controller-sa"
aws_account_id = "326712726440"
region = "us-east-2"

common_tags = {
"scope" = "terraform-managed"
"repository" = "jenkins-infra/terraform-aws-sponsorship"
}

ci_jenkins_io_fqdn = "aws.ci.jenkins.io"
ci_jenkins_io = {
service_fqdn = "ci.jenkins.io"
controller_vm_fqdn = "aws.ci.jenkins.io"
}

cijenkinsio_agents_2 = {
autoscaler = {
namespace = "autoscaler",
serviceaccount = "cluster-autoscaler-aws-cluster-autoscaler-chart",
smerle33 marked this conversation as resolved.
Show resolved Hide resolved
},
tolerations = {
applications = [
{
"effect" : "NoSchedule",
"key" : "${local.ci_jenkins_io["service_fqdn"]}/applications",
"operator" : "Equal",
"value" : "true"
},
],
},
}

toleration_taint_effects = {
"NoSchedule" = "NO_SCHEDULE",
"NoExecute" = "NO_EXECUTE",
"PreferNoSchedule" = "PREFER_NO_SCHEDULE",
}

#####
## External and outbounds IP used by resources for network restrictions.
Expand Down
5 changes: 3 additions & 2 deletions outputs.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
resource "local_file" "jenkins_infra_data_report" {
content = jsonencode({
"${local.ci_jenkins_io_fqdn}" = {
"${local.ci_jenkins_io["controller_vm_fqdn"]}" = {
"name_servers" = aws_route53_zone.aws_ci_jenkins_io.name_servers,
"outbound_ips" = {
"agents" = module.vpc.nat_public_ips,
Expand All @@ -18,7 +18,8 @@ resource "local_file" "jenkins_infra_data_report" {
},
},
"cijenkinsio-agents-2" = {
"cluster_endpoint" = module.cijenkinsio-agents-2.cluster_endpoint
"cluster_endpoint" = module.cijenkinsio-agents-2.cluster_endpoint,
"tolerations" = local.cijenkinsio_agents_2["tolerations"],
},
})
filename = "${path.module}/jenkins-infra-data-reports/aws-sponsorship.json"
Expand Down
23 changes: 21 additions & 2 deletions providers.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,42 @@ provider "aws" {
provider "local" {
}

# TODO track with updatecli
provider "cloudinit" {
# Required by the EKS module
}

# TODO track with updatecli
provider "null" {
# Required by the EKS module
}

# TODO track with updatecli
provider "time" {
# Required by the EKS module
}

# TODO track with updatecli
provider "tls" {
# Required by the EKS module
}

# Kubernetes provider aliased for the `cijenkinsio-agents-2` EKS cluster.
# It targets the cluster endpoint and CA certificate exposed by the EKS module and
# authenticates with the token read from the `aws_eks_cluster_auth` data source.
# TODO track with updatecli
provider "kubernetes" {
alias = "cijenkinsio-agents-2"

host = module.cijenkinsio-agents-2.cluster_endpoint
cluster_ca_certificate = base64decode(module.cijenkinsio-agents-2.cluster_certificate_authority_data)
token = data.aws_eks_cluster_auth.cijenkinsio-agents-2.token
}

# Helm provider aliased for the `cijenkinsio-agents-2` EKS cluster.
# Uses the endpoint and CA certificate from the EKS module and the token from `aws_eks_cluster_auth`.
# TODO track with updatecli
provider "helm" {
  alias = "cijenkinsio-agents-2"

  kubernetes {
    host                   = module.cijenkinsio-agents-2.cluster_endpoint
    cluster_ca_certificate = base64decode(module.cijenkinsio-agents-2.cluster_certificate_authority_data)
    token                  = data.aws_eks_cluster_auth.cijenkinsio-agents-2.token
  }
}