diff --git a/packages/infrastructure/kube_cilium/main.tf b/packages/infrastructure/kube_cilium/main.tf
index abe21ea5..de3907e5 100644
--- a/packages/infrastructure/kube_cilium/main.tf
+++ b/packages/infrastructure/kube_cilium/main.tf
@@ -259,12 +259,6 @@ resource "helm_release" "cilium" {
{
key = "node.kubernetes.io/not-ready"
operator = "Exists"
- effect = "NoExecute"
- },
- {
- key = "node.kubernetes.io/not-ready"
- operator = "Exists"
- effect = "NoSchedule"
},
{
key = "node.kubernetes.io/unreachable"
diff --git a/packages/infrastructure/kube_monitoring/main.tf b/packages/infrastructure/kube_monitoring/main.tf
index c46f4d77..a1795412 100644
--- a/packages/infrastructure/kube_monitoring/main.tf
+++ b/packages/infrastructure/kube_monitoring/main.tf
@@ -30,6 +30,46 @@ locals {
name = "monitoring"
namespace = module.namespace.namespace
+ # Resource label keys that are always exposed as metric labels by kube-state-metrics.
+ default_tracked_labels = [
+ "panfactum.com/environment",
+ "panfactum.com/module",
+ "panfactum.com/region",
+ "panfactum.com/root-module",
+ "panfactum.com/stack-commit",
+ "panfactum.com/stack-version"
+ ]
+ # Defaults plus user-supplied label keys. toset() removes duplicates (and discards the
+ # declaration order); tolist() converts back to a list so it can be join()'d into the
+ # kube-state-metrics --metric-labels-allowlist argument.
+ labels_to_track = tolist(toset(concat(local.default_tracked_labels, var.additional_tracked_resource_labels)))
+
+ # Kubernetes resource kinds that kube-state-metrics always collects metrics for.
+ default_tracked_resources = [
+ "certificatesigningrequests",
+ "configmaps",
+ "cronjobs",
+ "daemonsets",
+ "deployments",
+ "endpoints",
+ "horizontalpodautoscalers",
+ "ingresses",
+ "jobs",
+ "leases",
+ "limitranges",
+ "mutatingwebhookconfigurations",
+ "namespaces",
+ "networkpolicies",
+ "nodes",
+ "persistentvolumeclaims",
+ "persistentvolumes",
+ "poddisruptionbudgets",
+ "pods",
+ "replicasets",
+ "resourcequotas",
+ "secrets",
+ "services",
+ "statefulsets",
+ "storageclasses",
+ "validatingwebhookconfigurations"
+ ]
+ # Defaults plus user-supplied resource kinds, de-duplicated via toset(); this list is
+ # passed as the kube-state-metrics `collectors` helm value.
+ resources_to_track = tolist(toset(concat(local.default_tracked_resources, var.additional_tracked_resources)))
+
default_resources = {
requests = {
memory = "100Mi"
@@ -99,6 +139,10 @@ locals {
id = random_id.thanos_query_frontend.hex
}
+ alertmanager_match = {
+ id = random_id.alertmanager.hex
+ }
+
thanos_store_gateway_index_config = {
type = "REDIS"
config = {
@@ -206,6 +250,11 @@ resource "random_id" "thanos_bucket_web" {
prefix = "thanos-bucket-web-"
}
+# Unique identifier for Alertmanager. Its hex value becomes the `id` entry of
+# local.alertmanager_match, which is used as the match label for the Alertmanager pods.
+resource "random_id" "alertmanager" {
+  prefix      = "alertmanager-"
+  byte_length = 8
+}
+
module "kube_labels_operator" {
source = "../kube_labels"
@@ -398,6 +447,22 @@ module "kube_labels_thanos_query_frontend" {
extra_tags = merge(var.extra_tags, local.thanos_query_frontend_match)
}
+# Produces the `kube_labels` map applied to the Alertmanager service, pods,
+# PodDisruptionBudget, and VerticalPodAutoscaler in this module.
+module "kube_labels_alertmanager" {
+ source = "../kube_labels"
+
+ # generate: common_vars_no_extra_tags.snippet.txt
+ pf_stack_version = var.pf_stack_version
+ pf_stack_commit = var.pf_stack_commit
+ environment = var.environment
+ region = var.region
+ pf_root_module = var.pf_root_module
+ pf_module = var.pf_module
+ is_local = var.is_local
+ # end-generate
+
+ # Adds the unique `id` from local.alertmanager_match to the caller-provided tags
+ # (presumably so it ends up in kube_labels and selectors can match it; confirm in ../kube_labels)
+ extra_tags = merge(var.extra_tags, local.alertmanager_match)
+}
+
module "constants_operator" {
source = "../constants"
@@ -614,6 +679,24 @@ module "constants_thanos_query_frontend" {
extra_tags = merge(var.extra_tags, local.thanos_query_frontend_match)
}
+# Supplies the scheduling values used by the Alertmanager helm configuration in this module:
+# pod anti-affinity, burstable-node tolerations, zone topology spread constraints,
+# and the priority class name.
+module "constants_alertmanager" {
+ source = "../constants"
+
+ # Labels identifying the Alertmanager pods (the unique `id` from random_id.alertmanager)
+ matching_labels = local.alertmanager_match
+
+ # generate: common_vars_no_extra_tags.snippet.txt
+ pf_stack_version = var.pf_stack_version
+ pf_stack_commit = var.pf_stack_commit
+ environment = var.environment
+ region = var.region
+ pf_root_module = var.pf_root_module
+ pf_module = var.pf_module
+ is_local = var.is_local
+ # end-generate
+
+ extra_tags = merge(var.extra_tags, local.alertmanager_match)
+}
+
/***************************************
* Namespace
@@ -836,6 +919,18 @@ resource "helm_release" "prometheus_stack" {
enabled = true
}
+ defaultRules = {
+ create = true
+ rules = {
+ etcd = var.monitoring_etcd_enabled
+ kubeSchedulerAlerting = false // Not exposed in EKS
+ kubeSchedulerRecording = false // Not exposed in EKS
+ kubernetesSystem = false // Not exposed in EKS
+ kubeControllerManager = false // Not exposed in EKS
+ kubeProxy = false // We do not use kube-proxy
+ }
+ }
+
//////////////////////////////////////////////////////////
// Prometheus Operator
//////////////////////////////////////////////////////////
@@ -942,12 +1037,119 @@ resource "helm_release" "prometheus_stack" {
image = local.default_k8s_image
customLabels = module.kube_labels_kube_state_metrics.kube_labels
extraArgs = [
- "--metric-labels-allowlist=pods=[*]"
+ "--metric-labels-allowlist=*=[${join(",", local.labels_to_track)}]"
]
updateStrategy = "Recreate"
tolerations = module.constants_kube_state_metrics.burstable_node_toleration_helm
+ resources = local.default_resources
+
+ collectors = local.resources_to_track
+
+ prometheus = {
+ monitor = {
+ metricRelabelings = concat(
+ // Removes the panfactum.com/ prefix
+ [for label in [
+ "label_panfactum_com_environment",
+ "label_panfactum_com_region",
+ "label_panfactum_com_stack_version",
+ "label_panfactum_com_stack_commit",
+ "label_panfactum_com_module",
+ "label_panfactum_com_root_module"
+ ] : {
+ sourceLabels = ["__name__", label],
+ regex = "(.*_labels);(.+)"
+ targetLabel = "label_${trimprefix(label, "label_panfactum_com_")}"
+ replacement = "$2"
+ action = "replace"
+ }
+ ],
+ [
+ {
+ regex = ".*panfactum_com.*"
+ action = "labeldrop"
+ },
+
+ // This addresses a bug in a previous version of the stack
+ // where the access mode array contained duplicate entries
+ // for postgres deployments. This causes duplicate samples
+ // to be sent to prometheus which triggers alerts.
+ {
+ action = "drop"
+ regex = "kube_persistentvolumeclaim_access_mode"
+ sourceLabels = ["__name__"]
+ },
+ ]
+ )
+ }
+ }
+ }
- resources = local.default_resources
+ //////////////////////////////////////////////////////////
+ // etcd
+ //////////////////////////////////////////////////////////
+ kubeEtcd = {
+ enabled = var.monitoring_etcd_enabled
+ }
+
+ //////////////////////////////////////////////////////////
+ // Kubernetes API server monitoring
+ //////////////////////////////////////////////////////////
+ kubeApiServer = {
+ enabled = true
+ serviceMonitor = {
+ metricRelabelings = [
+ {
+ action = "drop"
+ regex = "apiserver_request_duration_seconds_.*" # Use apiserver_request_sli_duration_seconds_ instead
+ sourceLabels = ["__name__"]
+ },
+ # These aren't really important to track and they use a lot of space
+ {
+ action = "drop"
+ regex = "apiserver_request_body_size_.*"
+ sourceLabels = ["__name__"]
+ },
+ {
+ action = "drop"
+ regex = "apiserver_response_body_size_.*"
+ sourceLabels = ["__name__"]
+ },
+ {
+ action = "drop"
+ regex = "kubernetes_feature_enabled"
+ sourceLabels = ["__name__"]
+ }
+ ]
+ }
+ }
+
+ //////////////////////////////////////////////////////////
+ // Kubernetes Scheduler
+ //////////////////////////////////////////////////////////
+ kubeScheduler = {
+ enabled = false // not exposed in EKS
+ }
+
+ //////////////////////////////////////////////////////////
+ // kube-proxy
+ //////////////////////////////////////////////////////////
+ kubeProxy = {
+ enabled = false // we do not use kube-proxy
+ }
+
+ //////////////////////////////////////////////////////////
+ // Kubernetes Controller Manager
+ //////////////////////////////////////////////////////////
+ kubeControllerManager = {
+ enabled = false // not exposed in EKS
+ }
+
+ //////////////////////////////////////////////////////////
+ // coreDNS
+ //////////////////////////////////////////////////////////
+ coreDns = {
+ enabled = false // we monitor this in our own module
}
//////////////////////////////////////////////////////////
@@ -987,7 +1189,7 @@ resource "helm_release" "prometheus_stack" {
logLevel = var.prometheus_log_level
logFormat = "json"
scrapeInterval = "${var.prometheus_default_scrape_interval_seconds}s"
- retention = "6h" // This should be 3x the block window (2h) and then data will get shipped to s3 by thanos
+ retention = "1h" // This is only for local retention (before data is shipped to s3 by thanos)
disableCompaction = true
storageSpec = {
@@ -1011,6 +1213,7 @@ resource "helm_release" "prometheus_stack" {
logLevel = var.prometheus_log_level
logFormat = "json"
resources = local.default_resources
+ blockSize = "30m"
objectStorageConfig = {
secret = {
type = "s3"
@@ -1025,6 +1228,47 @@ resource "helm_release" "prometheus_stack" {
}
}
+ //////////////////////////////////////////////////////////
+ // Alert Manager
+ //////////////////////////////////////////////////////////
+ alertmanager = {
+ enabled = true
+ service = {
+ labels = module.kube_labels_alertmanager.kube_labels
+ }
+ alertmanagerSpec = {
+ podMetadata = {
+ labels = module.kube_labels_alertmanager.kube_labels
+ }
+ image = local.default_image
+ logLevel = var.alertmanager_log_level
+ logFormat = "json"
+
+ // Persistent volume claim template for the Alertmanager pods.
+ storage = {
+   volumeClaimTemplate = {
+     // Annotations must be set on the PVC template's metadata: the PVC spec
+     // (PersistentVolumeClaimSpec) has no `annotations` field, so placing them
+     // under `spec` never annotates the generated PVCs.
+     // NOTE(review): Velero documents `velero.io/exclude-from-backup` as a label;
+     // confirm whether this should be a label rather than an annotation.
+     metadata = {
+       annotations = {
+         "velero.io/exclude-from-backup" = "true"
+       }
+     }
+     spec = {
+       storageClassName = var.alertmanager_storage_class_name
+       resources = {
+         requests = {
+           // Initial size only; the variable describes it as the size before autoscaling
+           storage = "${var.alertmanager_local_storage_initial_size_gb}Gi"
+         }
+       }
+     }
+   }
+ }
+
+ replicas = 2
+ resources = local.default_resources
+ affinity = module.constants_alertmanager.pod_anti_affinity_instance_type_helm
+ tolerations = module.constants_alertmanager.burstable_node_toleration_helm
+ topologySpreadConstraints = module.constants_alertmanager.topology_spread_zone_strict
+ priorityClassName = module.constants_alertmanager.cluster_important_priority_class_name
+ }
+ }
+
//////////////////////////////////////////////////////////
// Grafana
//////////////////////////////////////////////////////////
@@ -1655,7 +1899,6 @@ resource "kubernetes_manifest" "pdb_thanos_query_frontend" {
}
resource "kubernetes_manifest" "pdb_thanos_query" {
- count = var.thanos_bucket_web_enable ? 1 : 0
manifest = {
apiVersion = "policy/v1"
kind = "PodDisruptionBudget"
@@ -1674,6 +1917,25 @@ resource "kubernetes_manifest" "pdb_thanos_query" {
depends_on = [helm_release.thanos]
}
+# PodDisruptionBudget for Alertmanager: at most one pod matching local.alertmanager_match
+# may be unavailable at a time (Alertmanager is deployed with 2 replicas in this module).
+# NOTE(review): sibling PDB resources are named `pdb_*` (e.g. pdb_thanos_query); consider
+# `pdb_alertmanager` for consistency before this resource is first applied.
+resource "kubernetes_manifest" "alertmanager" {
+ manifest = {
+ apiVersion = "policy/v1"
+ kind = "PodDisruptionBudget"
+ metadata = {
+ name = "alertmanager"
+ namespace = local.namespace
+ labels = module.kube_labels_alertmanager.kube_labels
+ }
+ spec = {
+ selector = {
+ matchLabels = local.alertmanager_match
+ }
+ maxUnavailable = 1
+ }
+ }
+ # Created only after the helm release that deploys Alertmanager
+ depends_on = [helm_release.prometheus_stack]
+}
+
/***************************************
* Autoscaling
***************************************/
@@ -1935,6 +2197,27 @@ resource "kubernetes_manifest" "vpa_thanos_query" {
depends_on = [helm_release.thanos]
}
+# VerticalPodAutoscaler for Alertmanager; only created when var.vpa_enabled is true.
+# No update or resource policy is specified here, only the target reference.
+resource "kubernetes_manifest" "vpa_alertmanager" {
+ count = var.vpa_enabled ? 1 : 0
+ manifest = {
+ apiVersion = "autoscaling.k8s.io/v1"
+ kind = "VerticalPodAutoscaler"
+ metadata = {
+ name = "alertmanager"
+ namespace = local.namespace
+ labels = module.kube_labels_alertmanager.kube_labels
+ }
+ spec = {
+ # Targets the Alertmanager custom resource rather than a Deployment/StatefulSet.
+ # NOTE(review): assumes the helm chart names the Alertmanager resource "monitoring" — confirm.
+ targetRef = {
+ apiVersion = "monitoring.coreos.com/v1"
+ kind = "Alertmanager"
+ name = "monitoring"
+ }
+ }
+ }
+ depends_on = [helm_release.prometheus_stack]
+}
+
/***************************************
* SSO Login for Grafana
diff --git a/packages/infrastructure/kube_monitoring/vars.tf b/packages/infrastructure/kube_monitoring/vars.tf
index 5d9e5eef..6026d4b8 100644
--- a/packages/infrastructure/kube_monitoring/vars.tf
+++ b/packages/infrastructure/kube_monitoring/vars.tf
@@ -70,6 +70,16 @@ variable "prometheus_log_level" {
}
}
+# Passed to the Alertmanager `logLevel` helm value; restricted to the four levels listed below.
+variable "alertmanager_log_level" {
+ description = "The log level for the alertmanager pods"
+ type = string
+ default = "info"
+ validation {
+ condition = contains(["info", "error", "warn", "debug"], var.alertmanager_log_level)
+ error_message = "Invalid alertmanager_log_level provided."
+ }
+}
+
variable "thanos_log_level" {
description = "The log level for the thanos pods"
type = string
@@ -138,4 +148,34 @@ variable "thanos_bucket_web_enable" {
description = "Whether to enable the web dashboard for the Thanos bucket analyzer which can show debugging information about your metrics data"
type = bool
default = true
+}
+
+# Used as `storageClassName` in the Alertmanager volume claim template.
+variable "alertmanager_storage_class_name" {
+ description = "The storage class to use for local alertmanager storage"
+ type = string
+ default = "ebs-standard"
+}
+
+# Interpolated as "<value>Gi" for the storage request of the Alertmanager volume claim template.
+variable "alertmanager_local_storage_initial_size_gb" {
+ description = "Number of GB to use for the local alertmanager storage (before autoscaled)"
+ type = number
+ default = 2
+}
+
+# Drives both the `kubeEtcd.enabled` monitor and the `defaultRules.rules.etcd` rule group
+# in the prometheus stack helm values.
+variable "monitoring_etcd_enabled" {
+ description = "Whether to monitor the Kubernetes API server's etcd instances. Only enable for debugging purposes as it contains a huge amount of metrics."
+ type = bool
+ default = false
+}
+
+# Merged (and de-duplicated) with local.default_tracked_labels to build the
+# kube-state-metrics --metric-labels-allowlist argument.
+variable "additional_tracked_resource_labels" {
+ description = "Kubernetes resource labels to include in metric labels"
+ type = list(string)
+ default = []
+}
+
+# Merged (and de-duplicated) with local.default_tracked_resources to build the
+# kube-state-metrics `collectors` list.
+variable "additional_tracked_resources" {
+ description = "Additional Kubernetes resources to track in kube-state-metrics"
+ type = list(string)
+ default = []
}
\ No newline at end of file
diff --git a/packages/infrastructure/kube_pg_cluster/main.tf b/packages/infrastructure/kube_pg_cluster/main.tf
index baff5a52..59abc0e5 100644
--- a/packages/infrastructure/kube_pg_cluster/main.tf
+++ b/packages/infrastructure/kube_pg_cluster/main.tf
@@ -434,7 +434,6 @@ resource "kubernetes_manifest" "postgres_cluster" {
storage = {
pvcTemplate = {
- accessModes = ["ReadWriteOnce"]
resources = {
requests = {
storage = "${var.pg_storage_gb}Gi"
diff --git a/packages/reference/environments/production/us-east-2/kube_monitoring/module.yaml b/packages/reference/environments/production/us-east-2/kube_monitoring/module.yaml
index bcccce6f..c4dc5497 100644
--- a/packages/reference/environments/production/us-east-2/kube_monitoring/module.yaml
+++ b/packages/reference/environments/production/us-east-2/kube_monitoring/module.yaml
@@ -3,4 +3,5 @@ providers:
- helm
- kubernetes
- random
- - aws
\ No newline at end of file
+ - aws
+ - vault
\ No newline at end of file
diff --git a/packages/reference/environments/production/us-east-2/pf_website/version.yaml b/packages/reference/environments/production/us-east-2/pf_website/version.yaml
index 4a931009..b927c59f 100644
--- a/packages/reference/environments/production/us-east-2/pf_website/version.yaml
+++ b/packages/reference/environments/production/us-east-2/pf_website/version.yaml
@@ -1 +1 @@
-version: alpha.87
+version: alpha.88
diff --git a/packages/website/src/app/changelog/edge/page.mdx b/packages/website/src/app/changelog/edge/page.mdx
index abbfd660..6c05ebcd 100644
--- a/packages/website/src/app/changelog/edge/page.mdx
+++ b/packages/website/src/app/changelog/edge/page.mdx
@@ -69,6 +69,11 @@ provisions).
* Updates many controller deployments to use the [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment)
deployment strategy to improve timing and efficiency of applying Panfactum upgrades.
+### Fixes
+
+* PVCs for postgres instances were inadvertently created with duplicate `accessModes` entries. This has been fixed,
+  but the fix will not retroactively adjust existing PVCs, since a PVC's `accessModes` cannot be changed after creation.
+
## edge.24-05-15
### Breaking Changes
diff --git a/packages/website/src/app/docs/main/reference/infrastructure-modules/kubernetes/kube_monitoring/page.mdx b/packages/website/src/app/docs/main/reference/infrastructure-modules/kubernetes/kube_monitoring/page.mdx
index bccd15d2..f1a2a8bd 100644
--- a/packages/website/src/app/docs/main/reference/infrastructure-modules/kubernetes/kube_monitoring/page.mdx
+++ b/packages/website/src/app/docs/main/reference/infrastructure-modules/kubernetes/kube_monitoring/page.mdx
@@ -48,6 +48,46 @@ Type: `string`
The following input variables are optional (have default values):
+### [additional\_tracked\_resource\_labels](#input_additional_tracked_resource_labels)
+
+Description: Kubernetes resource labels to include in metric labels
+
+Type: `list(string)`
+
+Default: `[]`
+
+### [additional\_tracked\_resources](#input_additional_tracked_resources)
+
+Description: Additional Kubernetes resources to track in kube-state-metrics
+
+Type: `list(string)`
+
+Default: `[]`
+
+### [alertmanager\_local\_storage\_initial\_size\_gb](#input_alertmanager_local_storage_initial_size_gb)
+
+Description: Number of GB to use for the local alertmanager storage (before autoscaled)
+
+Type: `number`
+
+Default: `2`
+
+### [alertmanager\_log\_level](#input_alertmanager_log_level)
+
+Description: The log level for the alertmanager pods
+
+Type: `string`
+
+Default: `"info"`
+
+### [alertmanager\_storage\_class\_name](#input_alertmanager_storage_class_name)
+
+Description: The storage class to use for local alertmanager storage
+
+Type: `string`
+
+Default: `"ebs-standard"`
+
### [aws\_iam\_ip\_allow\_list](#input_aws_iam_ip_allow_list)
Description: A list of IPs that can use the service account token to authenticate with AWS API
@@ -96,6 +136,14 @@ Type: `number`
Default: `15`
+### [monitoring\_etcd\_enabled](#input_monitoring_etcd_enabled)
+
+Description: Whether to monitor the Kubernetes API server's etcd instances. Only enable for debugging purposes as it contains a huge amount of metrics.
+
+Type: `bool`
+
+Default: `false`
+
### [prometheus\_default\_scrape\_interval\_seconds](#input_prometheus_default_scrape_interval_seconds)
Description: The default interval between prometheus scrapes (in seconds)