Skip to content

Commit

Permalink
feat: [PAYMCLOUD-204] Add custom Kubernetes Event Exporter configurat…
Browse files Browse the repository at this point in the history
…ions (#2732)

Add custom Kubernetes Event Exporter configurations

Introduced new configurations for Kubernetes Event Exporter in ITN-PROD and WEU-PROD environments, enabling OpsGenie integration with specific routing and filtering logic. Updated Terraform modules to utilize custom configuration templates and streamlined variables to enhance event management and monitoring workflows.

Signed-off-by: Fabio Felici <[email protected]>
  • Loading branch information
ffppa authored Jan 21, 2025
1 parent 92e62fd commit 584ac67
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 13 deletions.
21 changes: 21 additions & 0 deletions src/aks-leonardo/03_monitoring.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,26 @@ module "elastic_agent" {

}

# Kubernetes Event Exporter
#
# Instantiated only when env_short is "p"; every other value yields count = 0.
# custom_config points at a template file and custom_variables carries the
# values handed to it (presumably rendered by the upstream module — confirm
# against the module's documentation for v8.76.0).
module "kubernetes_event_exporter" {
  source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter?ref=v8.76.0"
  count  = var.env_short == "p" ? 1 : 0

  namespace = "monitoring"

  custom_config = "env/itn-prod/exporter/kubernetes-event-exporter-config.yml.tftpl"
  custom_variables = {
    enable_slack           = false
    enable_opsgenie        = true
    opsgenie_receiver_name = "opsgenie"
    # The key vault secret data source is guarded by the same env_short check,
    # so index 0 exists whenever this module is instantiated.
    opsgenie_api_key = data.azurerm_key_vault_secret.opsgenie_kubexporter_api_key[0].value
  }
}

# OpsGenie token consumed by the Kubernetes Event Exporter module, read from
# the kv_italy key vault. Looked up only when env_short is "p".
data "azurerm_key_vault_secret" "opsgenie_kubexporter_api_key" {
  count = var.env_short == "p" ? 1 : 0

  name         = "opsgenie-infra-kubexporter-webhook-token"
  key_vault_id = data.azurerm_key_vault.kv_italy.id
}

// TODO: store the quickstart-es-elastic-user secret in the key vault via sops

2 changes: 2 additions & 0 deletions src/aks-leonardo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Re-enable all the resource, commented before to complete the procedure
| <a name="module_aks_storage_class"></a> [aks\_storage\_class](#module\_aks\_storage\_class) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_storage_class | v8.17.1 |
| <a name="module_elastic_agent"></a> [elastic\_agent](#module\_elastic\_agent) | git::https://github.com/pagopa/terraform-azurerm-v3.git//elastic_agent | v8.50.0 |
| <a name="module_keda_pod_identity"></a> [keda\_pod\_identity](#module\_keda\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.17.1 |
| <a name="module_kubernetes_event_exporter"></a> [kubernetes\_event\_exporter](#module\_kubernetes\_event\_exporter) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter | v8.76.0 |
| <a name="module_nginx_ingress"></a> [nginx\_ingress](#module\_nginx\_ingress) | terraform-module/release/helm | 2.7.0 |

## Resources
Expand Down Expand Up @@ -85,6 +86,7 @@ Re-enable all the resource, commented before to complete the procedure
| [azurerm_client_config.current](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/client_config) | data source |
| [azurerm_container_registry.acr](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/container_registry) | data source |
| [azurerm_key_vault.kv_italy](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault) | data source |
| [azurerm_key_vault_secret.opsgenie_kubexporter_api_key](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault_secret) | data source |
| [azurerm_key_vault_secret.vm_debug_ssh_pass](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault_secret) | data source |
| [azurerm_key_vault_secret.vm_debug_ssh_user](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/key_vault_secret) | data source |
| [azurerm_log_analytics_workspace.log_analytics](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Kubernetes Event Exporter configuration for AKS ITN PROD.
#
# This file is a Terraform template: the percent-brace directives toggle the
# OpsGenie sections and the dollar-brace placeholders are filled from the
# caller's custom_variables (enable_opsgenie, opsgenie_receiver_name,
# opsgenie_api_key). The double-curly expressions are left untouched by
# Terraform and are evaluated by the exporter for each event.
config:
  logLevel: "info"
  receivers:
    # Always-on receiver that writes events to stdout.
    - name: dump
      file:
        layout: {}
        path: /dev/stdout
%{ if enable_opsgenie }
    # OpsGenie receiver raising P3 alerts.
    - name: "${opsgenie_receiver_name}-warning"
      opsgenie:
        apiKey: "${opsgenie_api_key}"
        priority: "P3"
        message: "[INFRA-pagoPa][AKS-ITN-PROD][Sev3] {{ .Reason }} for {{ .InvolvedObject.Namespace }}/{{ .InvolvedObject.Name }} on K8s cluster"
        alias: "{{ .UID }}"
        description: "<pre>{{ toPrettyJson . }}</pre>"
        tags:
          - "event"
          - "{{ .Reason }}"
          - "{{ .InvolvedObject.Kind }}"
          - "{{ .InvolvedObject.Name }}"
    # OpsGenie receiver raising P1 alerts.
    - name: "${opsgenie_receiver_name}-critical"
      opsgenie:
        apiKey: "${opsgenie_api_key}"
        priority: "P1"
        message: "[INFRA-pagoPa][AKS-ITN-PROD][Sev1] {{ .Reason }} for {{ .InvolvedObject.Namespace }}/{{ .InvolvedObject.Name }} on K8s cluster"
        alias: "{{ .UID }}"
        description: "<pre>{{ toPrettyJson . }}</pre>"
        tags:
          - "event"
          - "{{ .Reason }}"
          - "{{ .InvolvedObject.Kind }}"
          - "{{ .InvolvedObject.Name }}"
%{ endif }
  route:
    # Root rule: every event goes to the stdout dump receiver.
    match:
      - receiver: dump
    routes:
%{ if enable_opsgenie }
      # Critical route (P1): Warning-type events and events with reason
      # "Failed", minus the reasons/kinds listed under drop. The node
      # maintenance reasons and OOMKilling are dropped here because the
      # warning route below handles them at P3.
      - drop:
          - reason: "Unhealthy"
          - kind: "HorizontalPodAutoscaler"
          - reason: "ScaledObjectCheckFailed"
          - reason: "FailedToUpdateEndpoint"
          - reason: "FailedScheduling"
          - reason: "EgressBlocked"
          - reason: "OOMKilling"
          - reason: "RebootScheduled"
          - reason: "RedeployScheduled"
          - reason: "FreezeScheduled"
          - reason: "TerminateScheduled"
          - reason: "PreemptScheduled"
        match:
          - receiver: "${opsgenie_receiver_name}-critical"
            type: "Warning"
          - receiver: "${opsgenie_receiver_name}-critical"
            reason: "Failed"
      # Warning route (P3): only the reasons explicitly matched below.
      # Drop rules are evaluated before match rules, so a reason that is
      # matched here must NOT also appear in this drop list. The *Scheduled
      # reasons were previously listed in both places, which silently
      # discarded them before they could reach the warning receiver.
      - drop:
          - reason: "Unhealthy"
          - kind: "HorizontalPodAutoscaler"
          - reason: "ScaledObjectCheckFailed"
          - reason: "FailedToUpdateEndpoint"
          - reason: "FailedScheduling"
          - reason: "EgressBlocked"
        match:
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "OOMKilling"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "RebootScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "RedeployScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "FreezeScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "TerminateScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "PreemptScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "NotTriggerScaleUp"
%{ endif }
21 changes: 10 additions & 11 deletions src/aks-platform/05_monitoring.tf
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,16 @@ resource "helm_release" "monitoring_reloader" {
# Kubernetes Event Exporter
module "kubernetes_event_exporter" {
count = var.env_short != "p" ? 0 : 1
source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter?ref=v8.70.0"
source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter?ref=v8.76.0"
namespace = "monitoring"

# Slack integration
enable_slack = false
slack_channel = "#pagopa_status"
slack_token = ""

# OpsGenie integrations
enable_opsgenie = true
opsgenie_api_key = data.azurerm_key_vault_secret.opsgenie_kubexporter_api_key.0.value
custom_config = "env/weu-prod/exporter/kubernetes-event-exporter-config.yml.tftpl"
custom_variables = {
enable_slack = false
enable_opsgenie = true
opsgenie_receiver_name = "opsgenie"
opsgenie_api_key = data.azurerm_key_vault_secret.opsgenie_kubexporter_api_key.0.value
}
}

data "azurerm_key_vault_secret" "opsgenie_kubexporter_api_key" {
Expand All @@ -88,7 +87,7 @@ data "azurerm_key_vault_secret" "opsgenie_kubexporter_api_key" {

module "opencosts" {
enable_opencost = var.env_short == "d" ? true : false
source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_opencosts?ref=v8.69.0"
source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_opencosts?ref=v8.71.0"
aks_name = module.aks.name
aks_rg_name = module.aks.aks_resource_group_name
env = var.env
Expand Down Expand Up @@ -120,7 +119,7 @@ resource "kubernetes_manifest" "service_monitor" {
"selector" : {
"matchLabels" : {
"app.kubernetes.io/instance" : "prometheus-opencost-exporter"
"app.kubernetes.io/name" : "prometheus-opencost-exporter"
"app.kubernetes.io/name" : "opencost"
}
}
"endpoints" : [
Expand Down
4 changes: 2 additions & 2 deletions src/aks-platform/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
| <a name="module_aks"></a> [aks](#module\_aks) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.69.0 |
| <a name="module_aks_snet"></a> [aks\_snet](#module\_aks\_snet) | git::https://github.com/pagopa/terraform-azurerm-v3.git//subnet | v8.53.0 |
| <a name="module_keda_pod_identity"></a> [keda\_pod\_identity](#module\_keda\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.53.0 |
| <a name="module_kubernetes_event_exporter"></a> [kubernetes\_event\_exporter](#module\_kubernetes\_event\_exporter) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter | v8.70.0 |
| <a name="module_kubernetes_event_exporter"></a> [kubernetes\_event\_exporter](#module\_kubernetes\_event\_exporter) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_event_exporter | v8.76.0 |
| <a name="module_monitoring_pod_identity"></a> [monitoring\_pod\_identity](#module\_monitoring\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.53.0 |
| <a name="module_nginx_ingress"></a> [nginx\_ingress](#module\_nginx\_ingress) | terraform-module/release/helm | 2.8.0 |
| <a name="module_opencosts"></a> [opencosts](#module\_opencosts) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_opencosts | v8.69.0 |
| <a name="module_opencosts"></a> [opencosts](#module\_opencosts) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_opencosts | v8.71.0 |
| <a name="module_tls_checker"></a> [tls\_checker](#module\_tls\_checker) | git::https://github.com/pagopa/terraform-azurerm-v3.git//tls_checker | v8.54.0 |
| <a name="module_tls_checker_workload_identity_configuration"></a> [tls\_checker\_workload\_identity\_configuration](#module\_tls\_checker\_workload\_identity\_configuration) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_workload_identity_configuration | v8.54.0 |
| <a name="module_tls_checker_workload_identity_init"></a> [tls\_checker\_workload\_identity\_init](#module\_tls\_checker\_workload\_identity\_init) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_workload_identity_init | v8.54.0 |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Kubernetes Event Exporter configuration for AKS WEU PROD.
#
# This file is a Terraform template: the percent-brace directives toggle the
# OpsGenie sections and the dollar-brace placeholders are filled from the
# caller's custom_variables (enable_opsgenie, opsgenie_receiver_name,
# opsgenie_api_key). The double-curly expressions are left untouched by
# Terraform and are evaluated by the exporter for each event.
config:
  # NOTE(review): "debug" is verbose for a production cluster — confirm this
  # level is intentional.
  logLevel: "debug"
  receivers:
    # Always-on receiver that writes events to stdout.
    - name: dump
      file:
        layout: {}
        path: /dev/stdout
%{ if enable_opsgenie }
    # OpsGenie receiver raising P3 alerts.
    - name: "${opsgenie_receiver_name}-warning"
      opsgenie:
        apiKey: "${opsgenie_api_key}"
        priority: "P3"
        message: "[INFRA-pagoPa][AKS-WEU-PROD][Sev3] {{ .Reason }} for {{ .InvolvedObject.Namespace }}/{{ .InvolvedObject.Name }} on K8s cluster"
        alias: "{{ .UID }}"
        description: "<pre>{{ toPrettyJson . }}</pre>"
        tags:
          - "event"
          - "{{ .Reason }}"
          - "{{ .InvolvedObject.Kind }}"
          - "{{ .InvolvedObject.Name }}"
    # OpsGenie receiver raising P1 alerts.
    - name: "${opsgenie_receiver_name}-critical"
      opsgenie:
        apiKey: "${opsgenie_api_key}"
        priority: "P1"
        message: "[INFRA-pagoPa][AKS-WEU-PROD][Sev1] {{ .Reason }} for {{ .InvolvedObject.Namespace }}/{{ .InvolvedObject.Name }} on K8s cluster"
        alias: "{{ .UID }}"
        description: "<pre>{{ toPrettyJson . }}</pre>"
        tags:
          - "event"
          - "{{ .Reason }}"
          - "{{ .InvolvedObject.Kind }}"
          - "{{ .InvolvedObject.Name }}"
%{ endif }
  route:
    # Root rule: every event goes to the stdout dump receiver.
    match:
      - receiver: dump
    routes:
%{ if enable_opsgenie }
      # Critical route (P1): Warning-type events and events with reason
      # "Failed", minus the reasons/kinds listed under drop. The node
      # maintenance reasons and OOMKilling are dropped here because the
      # warning route below handles them at P3.
      - drop:
          - reason: "Unhealthy"
          - kind: "HorizontalPodAutoscaler"
          - reason: "ScaledObjectCheckFailed"
          - reason: "FailedToUpdateEndpoint"
          - reason: "FailedScheduling"
          - reason: "EgressBlocked"
          - reason: "OOMKilling"
          - reason: "RebootScheduled"
          - reason: "RedeployScheduled"
          - reason: "FreezeScheduled"
          - reason: "TerminateScheduled"
          - reason: "PreemptScheduled"
        match:
          - receiver: "${opsgenie_receiver_name}-critical"
            type: "Warning"
          - receiver: "${opsgenie_receiver_name}-critical"
            reason: "Failed"
      # Warning route (P3): only the reasons explicitly matched below.
      # Drop rules are evaluated before match rules, so a reason that is
      # matched here must NOT also appear in this drop list. The *Scheduled
      # reasons were previously listed in both places, which silently
      # discarded them before they could reach the warning receiver.
      - drop:
          - reason: "Unhealthy"
          - kind: "HorizontalPodAutoscaler"
          - reason: "ScaledObjectCheckFailed"
          - reason: "FailedToUpdateEndpoint"
          - reason: "FailedScheduling"
          - reason: "EgressBlocked"
        match:
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "OOMKilling"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "RebootScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "RedeployScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "FreezeScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "TerminateScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "PreemptScheduled"
          - receiver: "${opsgenie_receiver_name}-warning"
            reason: "NotTriggerScaleUp"
%{ endif }

0 comments on commit 584ac67

Please sign in to comment.