diff --git a/src/aks-leonardo/03_monitoring.tf b/src/aks-leonardo/03_monitoring.tf
index 4f6172855d..dd117de217 100644
--- a/src/aks-leonardo/03_monitoring.tf
+++ b/src/aks-leonardo/03_monitoring.tf
@@ -27,5 +27,28 @@ module "elastic_agent" {
 }
 
 
+# Kubernetes Event Exporter
+module "kubernetes_event_exporter" {
+  count     = var.env_short != "p" ? 0 : 1
+  source    = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter?ref=v8.76.0"
+  namespace = "monitoring"
+
+  # Receivers and routes live in the template referenced by custom_config.
+  # custom_variables presumably feeds that template's placeholders - confirm against the module.
+  custom_config = "env/itn-prod/exporter/kubernetes-event-exporter-config.yml.tftpl"
+  custom_variables = {
+    enable_slack           = false
+    enable_opsgenie        = true
+    opsgenie_receiver_name = "opsgenie"
+    opsgenie_api_key       = data.azurerm_key_vault_secret.opsgenie_kubexporter_api_key[0].value
+  }
+}
+
+data "azurerm_key_vault_secret" "opsgenie_kubexporter_api_key" {
+  count        = var.env_short != "p" ? 0 : 1
+  key_vault_id = data.azurerm_key_vault.kv_italy.id
+  name         = "opsgenie-infra-kubexporter-webhook-token"
+}
+
 // TODO mettere nel kv il secret quickstart-es-elastic-user tramite sops
 
diff --git a/src/aks-leonardo/README.md b/src/aks-leonardo/README.md
index d888ccc4cc..8336a7da05 100644
--- a/src/aks-leonardo/README.md
+++ b/src/aks-leonardo/README.md
@@ -45,6 +45,7 @@ Re-enable all the resource, commented before to complete the procedure
 | [aks\_storage\_class](#module\_aks\_storage\_class) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_storage_class | v8.17.1 |
 | [elastic\_agent](#module\_elastic\_agent) | git::https://github.com/pagopa/terraform-azurerm-v3.git//elastic_agent | v8.50.0 |
 | [keda\_pod\_identity](#module\_keda\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.17.1 |
+| [kubernetes\_event\_exporter](#module\_kubernetes\_event\_exporter) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter | v8.76.0 |
 | [nginx\_ingress](#module\_nginx\_ingress) | terraform-module/release/helm | 2.7.0 |
 
 ## Resources
@@ -85,6 +86,7 @@ Re-enable all the resource, commented before to complete the procedure
 | [azurerm_client_config.current](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/client_config) | data source |
 | [azurerm_container_registry.acr](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/container_registry) | data source |
 | [azurerm_key_vault.kv_italy](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault) | data source |
+| [azurerm_key_vault_secret.opsgenie_kubexporter_api_key](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault_secret) | data source |
 | [azurerm_key_vault_secret.vm_debug_ssh_pass](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault_secret) | data source |
 | [azurerm_key_vault_secret.vm_debug_ssh_user](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault_secret) | data source |
 | [azurerm_log_analytics_workspace.log_analytics](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
diff --git a/src/aks-leonardo/env/itn-prod/exporter/kubernetes-event-exporter-config.yml.tftpl b/src/aks-leonardo/env/itn-prod/exporter/kubernetes-event-exporter-config.yml.tftpl
new file mode 100644
index 0000000000..f0940f2b88
--- /dev/null
+++ b/src/aks-leonardo/env/itn-prod/exporter/kubernetes-event-exporter-config.yml.tftpl
@@ -0,0 +1,92 @@
+# kubernetes-event-exporter configuration for the ITN prod cluster (Terraform template).
+# Template variables used: enable_opsgenie, opsgenie_receiver_name, opsgenie_api_key.
+config:
+  logLevel: "info"
+  receivers:
+    # "dump" receiver: file output pointed at /dev/stdout.
+    - file:
+        layout: {}
+        path: /dev/stdout
+      name: dump
+    %{ if enable_opsgenie }
+    # OpsGenie receiver raising priority P3 alerts.
+    - name: "${opsgenie_receiver_name}-warning"
+      opsgenie:
+        apiKey: "${opsgenie_api_key}"
+        priority: "P3"
+        message: "[INFRA-pagoPa][AKS-ITN-PROD][Sev3] {{ .Reason }} for {{ .InvolvedObject.Namespace }}/{{ .InvolvedObject.Name }} on K8s cluster"
+        alias: "{{ .UID }}"
+        description: "{{ toPrettyJson . }}"
+        tags:
+          - "event"
+          - "{{ .Reason }}"
+          - "{{ .InvolvedObject.Kind }}"
+          - "{{ .InvolvedObject.Name }}"
+    # OpsGenie receiver raising priority P1 alerts.
+    - name: "${opsgenie_receiver_name}-critical"
+      opsgenie:
+        apiKey: "${opsgenie_api_key}"
+        priority: "P1"
+        message: "[INFRA-pagoPa][AKS-ITN-PROD][Sev1] {{ .Reason }} for {{ .InvolvedObject.Namespace }}/{{ .InvolvedObject.Name }} on K8s cluster"
+        alias: "{{ .UID }}"
+        description: "{{ toPrettyJson . }}"
+        tags:
+          - "event"
+          - "{{ .Reason }}"
+          - "{{ .InvolvedObject.Kind }}"
+          - "{{ .InvolvedObject.Name }}"
+    %{ endif }
+  route:
+    match:
+      - receiver: dump
+    routes:
+      %{ if enable_opsgenie }
+      # Critical route: events of type "Warning" or reason "Failed", subject to the drop rules of this route.
+      - drop:
+          - reason: "Unhealthy"
+          - kind: "HorizontalPodAutoscaler"
+          - reason: "ScaledObjectCheckFailed"
+          - reason: "FailedToUpdateEndpoint"
+          - reason: "FailedScheduling"
+          - reason: "EgressBlocked"
+          - reason: "OOMKilling"
+          - reason: "RebootScheduled"
+          - reason: "RedeployScheduled"
+          - reason: "FreezeScheduled"
+          - reason: "TerminateScheduled"
+          - reason: "PreemptScheduled"
+        match:
+          - receiver: "${opsgenie_receiver_name}-critical"
+            type: "Warning"
+          - receiver: "${opsgenie_receiver_name}-critical"
+            reason: "Failed"
+      # Warning route: only the reasons listed under match, subject to the drop rules of this route.
+      # NOTE(review): the *Scheduled reasons appear in both drop and match here; if drop is evaluated first they never reach the receiver - confirm intent.
+      - drop:
+          - reason: "Unhealthy"
+          - kind: "HorizontalPodAutoscaler"
+          - reason: "ScaledObjectCheckFailed"
+          - reason: "FailedToUpdateEndpoint"
+          - reason: "FailedScheduling"
+          - reason: "EgressBlocked"
+          - reason: "RebootScheduled"
+          - reason: "RedeployScheduled"
+          - reason: "FreezeScheduled"
+          - reason: "TerminateScheduled"
+          - reason: "PreemptScheduled"
+        match:
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "OOMKilling"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "RebootScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "RedeployScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "FreezeScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "TerminateScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "PreemptScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "NotTriggerScaleUp"
+      %{ endif }
diff --git a/src/aks-platform/05_monitoring.tf b/src/aks-platform/05_monitoring.tf
index 9d3614fb34..e87d1b0f59 100644
--- a/src/aks-platform/05_monitoring.tf
+++ b/src/aks-platform/05_monitoring.tf
@@ -67,17 +67,18 @@ resource "helm_release" "monitoring_reloader" {
 # Kubernetes Event Exporter
 module "kubernetes_event_exporter" {
   count     = var.env_short != "p" ? 0 : 1
-  source    = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter?ref=v8.70.0"
+  source    = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter?ref=v8.76.0"
   namespace = "monitoring"
 
-  # Slack integration
-  enable_slack  = false
-  slack_channel = "#pagopa_status"
-  slack_token   = ""
-
-  # OpsGenie integrations
-  enable_opsgenie  = true
-  opsgenie_api_key = data.azurerm_key_vault_secret.opsgenie_kubexporter_api_key.0.value
+  # Receivers and routes live in the template referenced by custom_config.
+  # custom_variables presumably feeds that template's placeholders - confirm against the module.
+  custom_config = "env/weu-prod/exporter/kubernetes-event-exporter-config.yml.tftpl"
+  custom_variables = {
+    enable_slack           = false
+    enable_opsgenie        = true
+    opsgenie_receiver_name = "opsgenie"
+    opsgenie_api_key       = data.azurerm_key_vault_secret.opsgenie_kubexporter_api_key[0].value
+  }
 }
 
 data "azurerm_key_vault_secret" "opsgenie_kubexporter_api_key" {
@@ -88,7 +89,7 @@ data "azurerm_key_vault_secret" "opsgenie_kubexporter_api_key" {
 
 module "opencosts" {
   enable_opencost = var.env_short == "d" ? true : false
-  source          = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_opencosts?ref=v8.69.0"
+  source          = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_opencosts?ref=v8.71.0"
   aks_name        = module.aks.name
   aks_rg_name     = module.aks.aks_resource_group_name
   env             = var.env
@@ -120,7 +121,7 @@ resource "kubernetes_manifest" "service_monitor" {
       "selector" : {
         "matchLabels" : {
           "app.kubernetes.io/instance" : "prometheus-opencost-exporter"
-          "app.kubernetes.io/name" : "prometheus-opencost-exporter"
+          "app.kubernetes.io/name" : "opencost"
         }
       }
       "endpoints" : [
diff --git a/src/aks-platform/README.md b/src/aks-platform/README.md
index ec6828e275..0d8359b9d6 100644
--- a/src/aks-platform/README.md
+++ b/src/aks-platform/README.md
@@ -18,10 +18,10 @@
 | [aks](#module\_aks) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.69.0 |
 | [aks\_snet](#module\_aks\_snet) | git::https://github.com/pagopa/terraform-azurerm-v3.git//subnet | v8.53.0 |
 | [keda\_pod\_identity](#module\_keda\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.53.0 |
-| [kubernetes\_event\_exporter](#module\_kubernetes\_event\_exporter) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter | v8.70.0 |
+| [kubernetes\_event\_exporter](#module\_kubernetes\_event\_exporter) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter | v8.76.0 |
 | [monitoring\_pod\_identity](#module\_monitoring\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.53.0 |
 | [nginx\_ingress](#module\_nginx\_ingress) | terraform-module/release/helm | 2.8.0 |
-| [opencosts](#module\_opencosts) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_opencosts | v8.69.0 |
+| [opencosts](#module\_opencosts) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_opencosts | v8.71.0 |
 | [tls\_checker](#module\_tls\_checker) | git::https://github.com/pagopa/terraform-azurerm-v3.git//tls_checker | v8.54.0 |
 | [tls\_checker\_workload\_identity\_configuration](#module\_tls\_checker\_workload\_identity\_configuration) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_workload_identity_configuration | v8.54.0 |
 | [tls\_checker\_workload\_identity\_init](#module\_tls\_checker\_workload\_identity\_init) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_workload_identity_init | v8.54.0 |
diff --git a/src/aks-platform/env/weu-prod/exporter/kubernetes-event-exporter-config.yml.tftpl b/src/aks-platform/env/weu-prod/exporter/kubernetes-event-exporter-config.yml.tftpl
new file mode 100644
index 0000000000..09f83c78e2
--- /dev/null
+++ b/src/aks-platform/env/weu-prod/exporter/kubernetes-event-exporter-config.yml.tftpl
@@ -0,0 +1,93 @@
+# kubernetes-event-exporter configuration for the WEU prod cluster (Terraform template).
+# Template variables used: enable_opsgenie, opsgenie_receiver_name, opsgenie_api_key.
+config:
+  # NOTE(review): the itn-prod template sets "info"; confirm "debug" is intended for this cluster.
+  logLevel: "debug"
+  receivers:
+    # "dump" receiver: file output pointed at /dev/stdout.
+    - file:
+        layout: {}
+        path: /dev/stdout
+      name: dump
+    %{ if enable_opsgenie }
+    # OpsGenie receiver raising priority P3 alerts.
+    - name: "${opsgenie_receiver_name}-warning"
+      opsgenie:
+        apiKey: "${opsgenie_api_key}"
+        priority: "P3"
+        message: "[INFRA-pagoPa][AKS-WEU-PROD][Sev3] {{ .Reason }} for {{ .InvolvedObject.Namespace }}/{{ .InvolvedObject.Name }} on K8s cluster"
+        alias: "{{ .UID }}"
+        description: "{{ toPrettyJson . }}"
+        tags:
+          - "event"
+          - "{{ .Reason }}"
+          - "{{ .InvolvedObject.Kind }}"
+          - "{{ .InvolvedObject.Name }}"
+    # OpsGenie receiver raising priority P1 alerts.
+    - name: "${opsgenie_receiver_name}-critical"
+      opsgenie:
+        apiKey: "${opsgenie_api_key}"
+        priority: "P1"
+        message: "[INFRA-pagoPa][AKS-WEU-PROD][Sev1] {{ .Reason }} for {{ .InvolvedObject.Namespace }}/{{ .InvolvedObject.Name }} on K8s cluster"
+        alias: "{{ .UID }}"
+        description: "{{ toPrettyJson . }}"
+        tags:
+          - "event"
+          - "{{ .Reason }}"
+          - "{{ .InvolvedObject.Kind }}"
+          - "{{ .InvolvedObject.Name }}"
+    %{ endif }
+  route:
+    match:
+      - receiver: dump
+    routes:
+      %{ if enable_opsgenie }
+      # Critical route: events of type "Warning" or reason "Failed", subject to the drop rules of this route.
+      - drop:
+          - reason: "Unhealthy"
+          - kind: "HorizontalPodAutoscaler"
+          - reason: "ScaledObjectCheckFailed"
+          - reason: "FailedToUpdateEndpoint"
+          - reason: "FailedScheduling"
+          - reason: "EgressBlocked"
+          - reason: "OOMKilling"
+          - reason: "RebootScheduled"
+          - reason: "RedeployScheduled"
+          - reason: "FreezeScheduled"
+          - reason: "TerminateScheduled"
+          - reason: "PreemptScheduled"
+        match:
+          - receiver: "${opsgenie_receiver_name}-critical"
+            type: "Warning"
+          - receiver: "${opsgenie_receiver_name}-critical"
+            reason: "Failed"
+      # Warning route: only the reasons listed under match, subject to the drop rules of this route.
+      # NOTE(review): the *Scheduled reasons appear in both drop and match here; if drop is evaluated first they never reach the receiver - confirm intent.
+      - drop:
+          - reason: "Unhealthy"
+          - kind: "HorizontalPodAutoscaler"
+          - reason: "ScaledObjectCheckFailed"
+          - reason: "FailedToUpdateEndpoint"
+          - reason: "FailedScheduling"
+          - reason: "EgressBlocked"
+          - reason: "RebootScheduled"
+          - reason: "RedeployScheduled"
+          - reason: "FreezeScheduled"
+          - reason: "TerminateScheduled"
+          - reason: "PreemptScheduled"
+        match:
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "OOMKilling"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "RebootScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "RedeployScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "FreezeScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "TerminateScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "PreemptScheduled"
+          - receiver: "${opsgenie_receiver_name}-warning"
+            reason: "NotTriggerScaleUp"
+      %{ endif }