Skip to content

Commit

Permalink
fix: replace instance_type_spread_required with instance_type_anti_affinity_required to resolve Karpenter scheduling issues
Browse files Browse the repository at this point in the history
  • Loading branch information
fullykubed committed Oct 21, 2024
1 parent 7802e21 commit 644f24d
Show file tree
Hide file tree
Showing 64 changed files with 679 additions and 677 deletions.
100 changes: 50 additions & 50 deletions packages/infrastructure/kube_argo/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,51 +52,51 @@ module "pull_through" {
}

module "util_controller" {
source = "../kube_workload_utility"
workload_name = "argo-controller"
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
az_spread_required = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
source = "../kube_workload_utility"
workload_name = "argo-controller"
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
az_spread_required = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
}

module "util_server" {
source = "../kube_workload_utility"
workload_name = "argo-server"
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
az_spread_required = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
source = "../kube_workload_utility"
workload_name = "argo-server"
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
az_spread_required = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
}

module "util_events_controller" {
source = "../kube_workload_utility"
workload_name = "argo-events-controller"
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
az_spread_required = var.enhanced_ha_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
source = "../kube_workload_utility"
workload_name = "argo-events-controller"
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
az_spread_required = var.enhanced_ha_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
}

module "util_webhook" {
source = "../kube_workload_utility"
workload_name = "argo-webhook"
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
az_spread_required = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
source = "../kube_workload_utility"
workload_name = "argo-webhook"
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
az_spread_required = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
}

module "constants" {
Expand Down Expand Up @@ -247,20 +247,20 @@ resource "kubernetes_config_map" "artifacts" {
module "database" {
source = "../kube_pg_cluster"

eks_cluster_name = var.eks_cluster_name
pg_cluster_namespace = local.namespace
pg_initial_storage_gb = 2
pg_memory_mb = 1000
pg_cpu_millicores = 250
pg_instances = 2
pg_smart_shutdown_timeout = 2
aws_iam_ip_allow_list = var.aws_iam_ip_allow_list
pull_through_cache_enabled = var.pull_through_cache_enabled
burstable_nodes_enabled = true
backups_force_delete = true
monitoring_enabled = var.monitoring_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_spread_required = var.enhanced_ha_enabled
eks_cluster_name = var.eks_cluster_name
pg_cluster_namespace = local.namespace
pg_initial_storage_gb = 2
pg_memory_mb = 1000
pg_cpu_millicores = 250
pg_instances = 2
pg_smart_shutdown_timeout = 2
aws_iam_ip_allow_list = var.aws_iam_ip_allow_list
pull_through_cache_enabled = var.pull_through_cache_enabled
burstable_nodes_enabled = true
backups_force_delete = true
monitoring_enabled = var.monitoring_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_anti_affinity_required = var.enhanced_ha_enabled

pg_recovery_mode_enabled = var.db_recovery_mode_enabled
pg_recovery_directory = var.db_recovery_directory
Expand Down
14 changes: 7 additions & 7 deletions packages/infrastructure/kube_argo_event_bus/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ data "pf_kube_labels" "labels" {
}

module "util" {
source = "../kube_workload_utility"
workload_name = "argo-event-bus"
instance_type_spread_required = var.instance_type_spread_required
burstable_nodes_enabled = true
controller_nodes_enabled = true
az_spread_required = true // stateful workload
extra_labels = data.pf_kube_labels.labels.labels
source = "../kube_workload_utility"
workload_name = "argo-event-bus"
instance_type_anti_affinity_required = var.instance_type_anti_affinity_required
burstable_nodes_enabled = true
controller_nodes_enabled = true
az_spread_required = true // stateful workload
extra_labels = data.pf_kube_labels.labels.labels
}

module "constants" {
Expand Down
4 changes: 2 additions & 2 deletions packages/infrastructure/kube_argo_event_bus/vars.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ variable "event_bus_initial_volume_size" {
default = "1Gi"
}

variable "instance_type_spread_required" {
description = "Whether to enable topology spread constraints to spread pods across instance types (with DoNotSchedule)"
variable "instance_type_anti_affinity_required" {
description = "Whether to enable anti-affinity to prevent pods from being scheduled on the same instance type"
type = bool
default = true
}
6 changes: 3 additions & 3 deletions packages/infrastructure/kube_argo_event_source/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ module "util" {
source = "../kube_workload_utility"
workload_name = var.name

host_anti_affinity_required = var.replicas > 1
instance_type_spread_required = var.replicas > 1 && var.instance_type_spread_required
az_spread_preferred = var.replicas > 1 && var.az_spread_preferred
host_anti_affinity_required = var.replicas > 1
instance_type_anti_affinity_required = var.replicas > 1 && var.instance_type_anti_affinity_required
az_spread_preferred = var.replicas > 1 && var.az_spread_preferred

burstable_nodes_enabled = true
controller_nodes_enabled = true
Expand Down
4 changes: 2 additions & 2 deletions packages/infrastructure/kube_argo_event_source/vars.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ variable "spot_nodes_enabled" {
default = true
}

variable "instance_type_spread_required" {
description = "Whether to enable topology spread constraints to spread pods across instance types (with DoNotSchedule)"
variable "instance_type_anti_affinity_required" {
description = "Whether to enable anti-affinity to prevent pods from being scheduled on the same instance type"
type = bool
default = true
}
Expand Down
10 changes: 5 additions & 5 deletions packages/infrastructure/kube_argo_sensor/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ module "util" {

# HA not needed b/c this can be offline for a minute or two
# without causing any major disruptions
host_anti_affinity_required = false
instance_type_spread_required = false
az_anti_affinity_required = false
az_spread_preferred = false
az_spread_required = false
host_anti_affinity_required = false
instance_type_anti_affinity_required = false
az_anti_affinity_required = false
az_spread_preferred = false
az_spread_required = false

burstable_nodes_enabled = true
controller_nodes_enabled = true
Expand Down
74 changes: 37 additions & 37 deletions packages/infrastructure/kube_authentik/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,25 @@ module "constants" {
module "util_server" {
source = "../kube_workload_utility"

workload_name = "authentik-server"
instance_type_spread_required = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
az_spread_preferred = var.enhanced_ha_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
workload_name = "authentik-server"
instance_type_anti_affinity_required = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
az_spread_preferred = var.enhanced_ha_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
}

module "util_worker" {
source = "../kube_workload_utility"

workload_name = "authentik-worker"
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
workload_name = "authentik-worker"
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
}

module "namespace" {
Expand All @@ -90,20 +90,20 @@ module "namespace" {
module "database" {
source = "../kube_pg_cluster"

eks_cluster_name = var.eks_cluster_name
pg_cluster_namespace = local.namespace
pg_initial_storage_gb = 10
pg_memory_mb = 1000
pg_cpu_millicores = 250
pg_instances = 2
pg_smart_shutdown_timeout = 1
aws_iam_ip_allow_list = var.aws_iam_ip_allow_list
pull_through_cache_enabled = var.pull_through_cache_enabled
pgbouncer_pool_mode = "transaction" // See https://github.com/goauthentik/authentik/issues/9152
burstable_nodes_enabled = true
monitoring_enabled = var.monitoring_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_spread_required = var.enhanced_ha_enabled
eks_cluster_name = var.eks_cluster_name
pg_cluster_namespace = local.namespace
pg_initial_storage_gb = 10
pg_memory_mb = 1000
pg_cpu_millicores = 250
pg_instances = 2
pg_smart_shutdown_timeout = 1
aws_iam_ip_allow_list = var.aws_iam_ip_allow_list
pull_through_cache_enabled = var.pull_through_cache_enabled
pgbouncer_pool_mode = "transaction" // See https://github.com/goauthentik/authentik/issues/9152
burstable_nodes_enabled = true
monitoring_enabled = var.monitoring_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_anti_affinity_required = var.enhanced_ha_enabled

pg_recovery_mode_enabled = var.db_recovery_mode_enabled
pg_recovery_directory = var.db_recovery_directory
Expand All @@ -118,15 +118,15 @@ module "database" {
module "redis" {
source = "../kube_redis_sentinel"

namespace = local.namespace
replica_count = 3
burstable_nodes_enabled = true
controller_nodes_enabled = true
pull_through_cache_enabled = var.pull_through_cache_enabled
vpa_enabled = var.vpa_enabled
monitoring_enabled = var.monitoring_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_spread_required = var.enhanced_ha_enabled
namespace = local.namespace
replica_count = 3
burstable_nodes_enabled = true
controller_nodes_enabled = true
pull_through_cache_enabled = var.pull_through_cache_enabled
vpa_enabled = var.vpa_enabled
monitoring_enabled = var.monitoring_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
instance_type_anti_affinity_required = var.enhanced_ha_enabled
}

/***************************************
Expand Down
14 changes: 7 additions & 7 deletions packages/infrastructure/kube_aws_ebs_csi/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ module "pull_through" {
module "util_controller" {
source = "../kube_workload_utility"

workload_name = "ebs-csi-controller"
burstable_nodes_enabled = true
controller_nodes_enabled = true
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
extra_labels = data.pf_kube_labels.labels.labels
workload_name = "ebs-csi-controller"
burstable_nodes_enabled = true
controller_nodes_enabled = true
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
extra_labels = data.pf_kube_labels.labels.labels
}

module "constants" {
Expand Down
14 changes: 7 additions & 7 deletions packages/infrastructure/kube_bastion/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,13 @@ module "bastion" {
namespace = module.namespace.namespace
name = local.name

replicas = 2
burstable_nodes_enabled = true
controller_nodes_enabled = true
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
priority_class_name = module.constants.cluster_important_priority_class_name
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
replicas = 2
burstable_nodes_enabled = true
controller_nodes_enabled = true
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
priority_class_name = module.constants.cluster_important_priority_class_name
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled

// https://superuser.com/questions/1547888/is-sshd-hard-coded-to-require-root-access
// SSHD requires root to run unfortunately. However, we drop all capability except
Expand Down
6 changes: 3 additions & 3 deletions packages/infrastructure/kube_buildkit/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,9 @@ module "buildkit" {
pull_through_cache_enabled = var.pull_through_cache_enabled

# High availability is not required
instance_type_spread_required = false
az_spread_required = false
az_spread_preferred = false
instance_type_anti_affinity_required = false
az_spread_required = false
az_spread_preferred = false

# Ensure that we are using the appropriate CPU architectures
arm_nodes_enabled = each.key == "arm64"
Expand Down
14 changes: 7 additions & 7 deletions packages/infrastructure/kube_cert_manager/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ module "util_controller" {
module "util_webhook" {
source = "../kube_workload_utility"

workload_name = "cert-manager-webhook"
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
workload_name = "cert-manager-webhook"
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
}

module "util_ca_injector" {
Expand Down
14 changes: 7 additions & 7 deletions packages/infrastructure/kube_cilium/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ module "pull_through" {
module "util_controller" {
source = "../kube_workload_utility"

workload_name = "cilium-operator"
instance_type_spread_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
workload_name = "cilium-operator"
instance_type_anti_affinity_required = var.enhanced_ha_enabled
az_spread_preferred = var.enhanced_ha_enabled
panfactum_scheduler_enabled = var.panfactum_scheduler_enabled
burstable_nodes_enabled = true
controller_nodes_enabled = true
extra_labels = data.pf_kube_labels.labels.labels
}

module "util_agent" {
Expand Down
Loading

0 comments on commit 644f24d

Please sign in to comment.