diff --git a/85_prometheus_grafana_private_endpoint/ampls.tf b/85_prometheus_grafana_private_endpoint/ampls.tf
index d1fdc4e..4bc8a64 100644
--- a/85_prometheus_grafana_private_endpoint/ampls.tf
+++ b/85_prometheus_grafana_private_endpoint/ampls.tf
@@ -18,17 +18,17 @@ resource "azurerm_monitor_private_link_scoped_service" "ampls-dce-log-analytics"
 }
 
+# # not required
+# # resource "azurerm_monitor_private_link_scoped_service" "prometheus" {
+# #   name                = "ampls-prometheus"
+# #   resource_group_name = azurerm_resource_group.rg_monitoring.name
+# #   scope_name          = azurerm_monitor_private_link_scope.ampls.name
+# #   linked_resource_id  = azurerm_monitor_workspace.prometheus.id
+# # }
 
-# resource "azurerm_monitor_private_link_scoped_service" "prometheus" {
-#   name                = "ampls-prometheus"
+# resource "azurerm_monitor_private_link_scoped_service" "dce-prometheus" {
+#   name                = "ampls-dce-prometheus"
 #   resource_group_name = azurerm_resource_group.rg_monitoring.name
 #   scope_name          = azurerm_monitor_private_link_scope.ampls.name
-#   linked_resource_id  = azurerm_monitor_workspace.prometheus.id
-# }
-
-resource "azurerm_monitor_private_link_scoped_service" "dce-prometheus" {
-  name                = "ampls-dce-prometheus"
-  resource_group_name = azurerm_resource_group.rg_monitoring.name
-  scope_name          = azurerm_monitor_private_link_scope.ampls.name
-  linked_resource_id  = azurerm_monitor_data_collection_endpoint.dce-prometheus.id
-}
\ No newline at end of file
+#   linked_resource_id  = azurerm_monitor_data_collection_endpoint.dce-prometheus.id
+# }
\ No newline at end of file
diff --git a/85_prometheus_grafana_private_endpoint/data_collection_rule.tf b/85_prometheus_grafana_private_endpoint/data_collection_rule.tf
deleted file mode 100644
index b8ad98a..0000000
--- a/85_prometheus_grafana_private_endpoint/data_collection_rule.tf
+++ /dev/null
@@ -1,33 +0,0 @@
-resource "azurerm_monitor_data_collection_rule" "dcr-prometheus" {
-  name                        = "dcr-prometheus"
-  resource_group_name         = azurerm_resource_group.rg_monitoring.name
-  location                    = azurerm_resource_group.rg_monitoring.location
-  data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id
-  kind                        = "Linux"
-
-  data_sources {
-    prometheus_forwarder {
-      name    = "PrometheusDataSource"
-      streams = ["Microsoft-PrometheusMetrics"]
-    }
-  }
-
-  destinations {
-    monitor_account {
-      monitor_account_id = azurerm_monitor_workspace.prometheus.id
-      name               = azurerm_monitor_workspace.prometheus.name
-    }
-  }
-
-  data_flow {
-    streams      = ["Microsoft-PrometheusMetrics"]
-    destinations = [azurerm_monitor_workspace.prometheus.name]
-  }
-}
-
-# associate to a Data Collection Rule
-resource "azurerm_monitor_data_collection_rule_association" "dcr-aks-prometheus" {
-  name                    = "dcr-aks-prometheus"
-  target_resource_id      = azurerm_kubernetes_cluster.aks.id
-  data_collection_rule_id = azurerm_monitor_data_collection_rule.dcr-prometheus.id
-}
\ No newline at end of file
diff --git a/85_prometheus_grafana_private_endpoint/data_collection_endpoint.tf b/85_prometheus_grafana_private_endpoint/dce-log_analytics.tf
similarity index 62%
rename from 85_prometheus_grafana_private_endpoint/data_collection_endpoint.tf
rename to 85_prometheus_grafana_private_endpoint/dce-log_analytics.tf
index e395248..7aacd7e 100644
--- a/85_prometheus_grafana_private_endpoint/data_collection_endpoint.tf
+++ b/85_prometheus_grafana_private_endpoint/dce-log_analytics.tf
@@ -1,14 +1,13 @@
-resource "azurerm_monitor_data_collection_endpoint" "dce-prometheus" {
-  name                          = "dce-prometheus"
+resource "azurerm_monitor_data_collection_endpoint" "dce-log-analytics" {
"azurerm_monitor_data_collection_endpoint" "dce-log-analytics" { + name = "dce-log-analytics" resource_group_name = azurerm_resource_group.rg_monitoring.name location = azurerm_resource_group.rg_monitoring.location - kind = "Linux" - public_network_access_enabled = false # true # false + public_network_access_enabled = false } # associate to a Data Collection Endpoint -resource "azurerm_monitor_data_collection_rule_association" "dce-aks-prometheus" { +resource "azurerm_monitor_data_collection_rule_association" "dcra-dce-log-analytics-aks" { name = "configurationAccessEndpoint" # name is required when data_collection_rule_id is specified. And when data_collection_endpoint_id is specified, the name is populated with configurationAccessEndpoint target_resource_id = azurerm_kubernetes_cluster.aks.id - data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id -} + data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-log-analytics.id +} \ No newline at end of file diff --git a/85_prometheus_grafana_private_endpoint/dce-prometheus.tf b/85_prometheus_grafana_private_endpoint/dce-prometheus.tf new file mode 100644 index 0000000..25da49c --- /dev/null +++ b/85_prometheus_grafana_private_endpoint/dce-prometheus.tf @@ -0,0 +1,14 @@ +# resource "azurerm_monitor_data_collection_endpoint" "dce-prometheus" { +# name = "dce-prometheus" +# resource_group_name = azurerm_resource_group.rg_monitoring.name +# location = azurerm_resource_group.rg_monitoring.location +# kind = "Linux" +# public_network_access_enabled = false # true # false +# } + +# # associate to a Data Collection Endpoint +# resource "azurerm_monitor_data_collection_rule_association" "dcra-dce-prometheus-aks" { +# name = "configurationAccessEndpoint" # "dcra-dce-prometheus-aks" # # name is required when data_collection_rule_id is specified. And when data_collection_endpoint_id is specified, the name is populated with configurationAccessEndpoint +# target_resource_id = azurerm_kubernetes_cluster.aks.id +# data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id +# } diff --git a/85_prometheus_grafana_private_endpoint/dce-dcr-log_analytics.tf b/85_prometheus_grafana_private_endpoint/dcr-log_analytics.tf similarity index 64% rename from 85_prometheus_grafana_private_endpoint/dce-dcr-log_analytics.tf rename to 85_prometheus_grafana_private_endpoint/dcr-log_analytics.tf index e483782..13c0e39 100644 --- a/85_prometheus_grafana_private_endpoint/dce-dcr-log_analytics.tf +++ b/85_prometheus_grafana_private_endpoint/dcr-log_analytics.tf @@ -1,17 +1,3 @@ -resource "azurerm_monitor_data_collection_endpoint" "dce-log-analytics" { - name = "dce-log-analytics" - resource_group_name = azurerm_resource_group.rg_monitoring.name - location = azurerm_resource_group.rg_monitoring.location - public_network_access_enabled = false -} - -# associate to a Data Collection Endpoint -resource "azurerm_monitor_data_collection_rule_association" "dce-aks-log-analytics" { - name = "configurationAccessEndpoint" # name is required when data_collection_rule_id is specified. 
-  target_resource_id          = azurerm_kubernetes_cluster.aks.id
-  data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-log-analytics.id
-}
-
 resource "azurerm_monitor_data_collection_rule" "dcr-log-analytics" {
   name                        = "dcr-log-analytics"
   resource_group_name         = azurerm_resource_group.rg_monitoring.name
@@ -66,8 +52,8 @@ resource "azurerm_monitor_data_collection_rule" "dcr-log-analytics" {
 }
 
 # associate to a Data Collection Rule
-resource "azurerm_monitor_data_collection_rule_association" "dcr-aks-log-analytics" {
-  name                    = "dcr-aks-log-analytics"
+resource "azurerm_monitor_data_collection_rule_association" "dcra-dcr-log-analytics-aks" {
+  name                    = "dcra-dcr-log-analytics-aks"
   target_resource_id      = azurerm_kubernetes_cluster.aks.id
   data_collection_rule_id = azurerm_monitor_data_collection_rule.dcr-log-analytics.id
 }
diff --git a/85_prometheus_grafana_private_endpoint/dcr-prometheus.tf b/85_prometheus_grafana_private_endpoint/dcr-prometheus.tf
new file mode 100644
index 0000000..337bb46
--- /dev/null
+++ b/85_prometheus_grafana_private_endpoint/dcr-prometheus.tf
@@ -0,0 +1,35 @@
+# resource "azurerm_monitor_data_collection_rule" "dcr-prometheus" {
+#   name                        = "dcr-prometheus"
+#   resource_group_name         = azurerm_resource_group.rg_monitoring.name
+#   location                    = azurerm_resource_group.rg_monitoring.location
+#   data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id
+#   kind                        = "Linux"
+#   description                 = "DCR for Azure Monitor Metrics Profile (Managed Prometheus)"
+
+#   data_sources {
+#     prometheus_forwarder {
+#       name    = "PrometheusDataSource"
+#       streams = ["Microsoft-PrometheusMetrics"]
+#     }
+#   }
+
+#   destinations {
+#     monitor_account {
+#       monitor_account_id = azurerm_monitor_workspace.prometheus.id
+#       name               = azurerm_monitor_workspace.prometheus.name
+#     }
+#   }
+
+#   data_flow {
+#     streams      = ["Microsoft-PrometheusMetrics"]
+#     destinations = [azurerm_monitor_workspace.prometheus.name]
+#   }
+# }
+
+# # associate to a Data Collection Rule
+# resource "azurerm_monitor_data_collection_rule_association" "dcra-dcr-prometheus-aks" {
+#   name                    = "dcra-dcr-prometheus-aks"
+#   target_resource_id      = azurerm_kubernetes_cluster.aks.id
+#   data_collection_rule_id = azurerm_monitor_data_collection_rule.dcr-prometheus.id
+#   description             = "Association of data collection rule. Deleting this association will break the data collection for this AKS Cluster."
+# }
diff --git a/85_prometheus_grafana_private_endpoint/grafana.tf b/85_prometheus_grafana_private_endpoint/grafana.tf
index f471757..c56d199 100644
--- a/85_prometheus_grafana_private_endpoint/grafana.tf
+++ b/85_prometheus_grafana_private_endpoint/grafana.tf
@@ -1,48 +1,48 @@
-resource "azurerm_dashboard_grafana" "grafana" {
-  name                              = var.grafana_name
-  resource_group_name               = azurerm_resource_group.rg_monitoring.name
-  location                          = azurerm_resource_group.rg_monitoring.location
-  api_key_enabled                   = true
-  deterministic_outbound_ip_enabled = true
-  public_network_access_enabled     = true
-  sku                               = "Standard"
-  zone_redundancy_enabled           = false
-  grafana_major_version             = "10" # 9
-
-  azure_monitor_workspace_integrations {
-    resource_id = azurerm_monitor_workspace.prometheus.id
-  }
-
-  identity {
-    type = "SystemAssigned" # "UserAssigned" #
-    # identity_ids = [azurerm_user_assigned_identity.identity-grafana.id]
-  }
-}
-
-data "azurerm_client_config" "current" {}
-
-resource "azurerm_role_assignment" "role_grafana_admin" {
-  scope                = azurerm_dashboard_grafana.grafana.id
-  role_definition_name = "Grafana Admin"
-  principal_id         = data.azurerm_client_config.current.object_id
-}
-
-resource "azurerm_role_assignment" "role_monitoring_data_reader" {
-  scope                = azurerm_monitor_workspace.prometheus.id
-  role_definition_name = "Monitoring Data Reader"
-  principal_id         = azurerm_dashboard_grafana.grafana.identity.0.principal_id # azurerm_user_assigned_identity.identity-grafana.principal_id #
-}
-
-data "azurerm_subscription" "current" {}
-
-resource "azurerm_role_assignment" "role_monitoring_reader" {
-  scope                = data.azurerm_subscription.current.id
-  role_definition_name = "Monitoring Reader"
-  principal_id         = azurerm_dashboard_grafana.grafana.identity.0.principal_id # azurerm_user_assigned_identity.identity-grafana.principal_id #
-}
-
-# resource "azurerm_user_assigned_identity" "identity-grafana" {
-#   name                = "identity-grafana"
-#   resource_group_name = azurerm_resource_group.rg_monitoring.name
-#   location            = azurerm_resource_group.rg_monitoring.location
+# resource "azurerm_dashboard_grafana" "grafana" {
+#   name                              = var.grafana_name
+#   resource_group_name               = azurerm_resource_group.rg_monitoring.name
+#   location                          = azurerm_resource_group.rg_monitoring.location
+#   api_key_enabled                   = true
+#   deterministic_outbound_ip_enabled = true
+#   public_network_access_enabled     = true
+#   sku                               = "Standard"
+#   zone_redundancy_enabled           = false
+#   grafana_major_version             = "10" # 9
+
+#   azure_monitor_workspace_integrations {
+#     resource_id = azurerm_monitor_workspace.prometheus.id
+#   }
+
+#   identity {
+#     type = "SystemAssigned" # "UserAssigned" #
+#     # identity_ids = [azurerm_user_assigned_identity.identity-grafana.id]
+#   }
 # }
+
+# data "azurerm_client_config" "current" {}
+
+# resource "azurerm_role_assignment" "role_grafana_admin" {
+#   scope                = azurerm_dashboard_grafana.grafana.id
+#   role_definition_name = "Grafana Admin"
+#   principal_id         = data.azurerm_client_config.current.object_id
+# }
+
+# resource "azurerm_role_assignment" "role_monitoring_data_reader" {
+#   scope                = azurerm_monitor_workspace.prometheus.id
+#   role_definition_name = "Monitoring Data Reader"
+#   principal_id         = azurerm_dashboard_grafana.grafana.identity.0.principal_id # azurerm_user_assigned_identity.identity-grafana.principal_id #
+# }
+
+# data "azurerm_subscription" "current" {}
+
+# resource "azurerm_role_assignment" "role_monitoring_reader" {
+#   scope                = data.azurerm_subscription.current.id
+#   role_definition_name = "Monitoring Reader"
+#   principal_id         = azurerm_dashboard_grafana.grafana.identity.0.principal_id # azurerm_user_assigned_identity.identity-grafana.principal_id #
+# }
+
+# # resource "azurerm_user_assigned_identity" "identity-grafana" {
+# #   name                = "identity-grafana"
+# #   resource_group_name = azurerm_resource_group.rg_monitoring.name
+# #   location            = azurerm_resource_group.rg_monitoring.location
+# # }
diff --git a/85_prometheus_grafana_private_endpoint/output.tf b/85_prometheus_grafana_private_endpoint/output.tf
index 63c6791..1be5dfa 100644
--- a/85_prometheus_grafana_private_endpoint/output.tf
+++ b/85_prometheus_grafana_private_endpoint/output.tf
@@ -1,18 +1,18 @@
-output "query_endpoint" {
-  value = azurerm_monitor_workspace.prometheus.query_endpoint
-}
+# output "query_endpoint" {
+#   value = azurerm_monitor_workspace.prometheus.query_endpoint
+# }
 
-output "garafana_endpoint" {
-  value = azurerm_dashboard_grafana.grafana.endpoint
-}
+# output "garafana_endpoint" {
+#   value = azurerm_dashboard_grafana.grafana.endpoint
+# }
 
-output "grafana_name" {
-  value = azurerm_dashboard_grafana.grafana.name
-}
+# output "grafana_name" {
+#   value = azurerm_dashboard_grafana.grafana.name
+# }
 
-output "grafana_rg_name" {
-  value = azurerm_dashboard_grafana.grafana.resource_group_name
-}
+# output "grafana_rg_name" {
+#   value = azurerm_dashboard_grafana.grafana.resource_group_name
+# }
 
 output "aks_name" {
   value = azurerm_kubernetes_cluster.aks.name
diff --git a/85_prometheus_grafana_private_endpoint/private_endpoint-ampls.tf b/85_prometheus_grafana_private_endpoint/pe-ampls.tf
similarity index 82%
rename from 85_prometheus_grafana_private_endpoint/private_endpoint-ampls.tf
rename to 85_prometheus_grafana_private_endpoint/pe-ampls.tf
index 0e4e996..d44bd07 100644
--- a/85_prometheus_grafana_private_endpoint/private_endpoint-ampls.tf
+++ b/85_prometheus_grafana_private_endpoint/pe-ampls.tf
@@ -1,3 +1,13 @@
+locals {
+  dns_zones_ampls = [
+    "privatelink.monitor.azure.com",
+    "privatelink.oms.opinsights.azure.com",
+    "privatelink.ods.opinsights.azure.com",
+    "privatelink.agentsvc.azure-automation.net",
+    "privatelink.blob.core.windows.net"
+  ]
+}
+
 resource "azurerm_private_endpoint" "pe-ampls" {
   name                = "pe-ampls"
   resource_group_name = azurerm_virtual_network.vnet.resource_group_name
@@ -13,27 +23,12 @@ resource "azurerm_private_endpoint" "pe-ampls" {
 
   private_dns_zone_group {
     name                 = "private-dns-zone"
-    private_dns_zone_ids = [ for zone in azurerm_private_dns_zone.zones : zone.id ]
+    private_dns_zone_ids = [for zone in azurerm_private_dns_zone.zones : zone.id]
   }
 }
 
-output "zone_id" {
-  value = [ for zone in azurerm_private_dns_zone.zones : zone.id ]
-}
-
-variable "dns_zones_ampls" {
-  type = list(string)
-  default = [
-    "privatelink.monitor.azure.com",
-    "privatelink.oms.opinsights.azure.com",
-    "privatelink.ods.opinsights.azure.com",
-    "privatelink.agentsvc.azure-automation.net",
-    "privatelink.blob.core.windows.net"
-  ]
-}
-
 resource "azurerm_private_dns_zone" "zones" {
-  for_each            = toset(var.dns_zones_ampls)
+  for_each            = toset(local.dns_zones_ampls)
   name                = each.value
   resource_group_name = azurerm_resource_group.rg_monitoring.name
 }
diff --git a/85_prometheus_grafana_private_endpoint/prometheus.tf b/85_prometheus_grafana_private_endpoint/prometheus.tf
index 95edde9..483723d 100644
--- a/85_prometheus_grafana_private_endpoint/prometheus.tf
+++ b/85_prometheus_grafana_private_endpoint/prometheus.tf
@@ -1,224 +1,224 @@
-resource "azurerm_monitor_workspace" "prometheus" {
-  name                          = var.prometheus_name
-  resource_group_name           = azurerm_resource_group.rg_monitoring.name
-  location                      = azurerm_resource_group.rg_monitoring.location
-  public_network_access_enabled = false # false # true
-}
-
-resource "azurerm_role_assignment" "role_monitoring_data_reader_me" {
-  scope                = azurerm_monitor_workspace.prometheus.id
-  role_definition_name = "Monitoring Data Reader"
-  principal_id         = data.azurerm_client_config.current.object_id
-}
-
-resource "azurerm_monitor_alert_prometheus_rule_group" "alert-prometheus-nodes" {
-  name                = "NodeRecordingRulesRuleGroup"
-  resource_group_name = azurerm_resource_group.rg_monitoring.name
-  location            = azurerm_resource_group.rg_monitoring.location
-  cluster_name        = azurerm_kubernetes_cluster.aks.name
-  rule_group_enabled  = true
-  interval            = "PT1M"
-  scopes              = [azurerm_monitor_workspace.prometheus.id]
-
-  rule {
-    record     = "instance:node_num_cpu:sum"
-    expression = "count without (cpu, mode) (node_cpu_seconds_total{job=\"node\",mode=\"idle\"})"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance:node_cpu_utilisation:rate5m"
-    expression = "1 - avg without (cpu) (sum without (mode) (rate(node_cpu_seconds_total{job=\"node\", mode=~\"idle|iowait|steal\"}[5m])))"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance:node_load1_per_cpu:ratio"
-    expression = "(node_load1{job=\"node\"}/ instance:node_num_cpu:sum{job=\"node\"})"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance:node_memory_utilisation:ratio"
-    expression = "1 - ((node_memory_MemAvailable_bytes{job=\"node\"} or (node_memory_Buffers_bytes{job=\"node\"} + node_memory_Cached_bytes{job=\"node\"} + node_memory_MemFree_bytes{job=\"node\"} + node_memory_Slab_bytes{job=\"node\"})) / node_memory_MemTotal_bytes{job=\"node\"})"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance:node_vmstat_pgmajfault:rate5m"
-    expression = "rate(node_vmstat_pgmajfault{job=\"node\"}[5m])"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance_device:node_disk_io_time_seconds:rate5m"
-    expression = "rate(node_disk_io_time_seconds_total{job=\"node\", device!=\"\"}[5m])"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance_device:node_disk_io_time_weighted_seconds:rate5m"
-    expression = "rate(node_disk_io_time_weighted_seconds_total{job=\"node\", device!=\"\"}[5m])"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance:node_network_receive_bytes_excluding_lo:rate5m"
-    expression = "sum without (device) (rate(node_network_receive_bytes_total{job=\"node\", device!=\"lo\"}[5m]))"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance:node_network_transmit_bytes_excluding_lo:rate5m"
-    expression = "sum without (device) (rate(node_network_transmit_bytes_total{job=\"node\", device!=\"lo\"}[5m]))"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance:node_network_receive_drop_excluding_lo:rate5m"
-    expression = "sum without (device) (rate(node_network_receive_drop_total{job=\"node\", device!=\"lo\"}[5m]))"
-    enabled    = true
-  }
-
-  rule {
-    record     = "instance:node_network_transmit_drop_excluding_lo:rate5m"
-    expression = "sum without (device) (rate(node_network_transmit_drop_total{job=\"node\", device!=\"lo\"}[5m]))"
-    enabled    = true
-  }
-}
-
-resource "azurerm_monitor_alert_prometheus_rule_group" "alert_prometheus_k8s" {
-  name                = "KubernetesRecordingRulesRuleGroup"
-  resource_group_name = azurerm_resource_group.rg_monitoring.name
-  location            = azurerm_resource_group.rg_monitoring.location
-  cluster_name        = azurerm_kubernetes_cluster.aks.name
-  rule_group_enabled  = true
-  interval            = "PT1M"
-  scopes              = [azurerm_monitor_workspace.prometheus.id]
-
-  rule {
"node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate" - expression = "sum by (cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\"}[5m])) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))" - enabled = true - } - - rule { - record = "node_namespace_pod_container:container_memory_working_set_bytes" - expression = "container_memory_working_set_bytes{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" - enabled = true - } - - rule { - record = "node_namespace_pod_container:container_memory_rss" - expression = "container_memory_rss{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" - enabled = true - } - - rule { - record = "node_namespace_pod_container:container_memory_cache" - expression = "container_memory_cache{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" - enabled = true - } - - rule { - record = "node_namespace_pod_container:container_memory_swap" - expression = "container_memory_swap{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" - enabled = true - } - - rule { - record = "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests" - expression = "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on(namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" - enabled = true - } - - rule { - record = "namespace_memory:kube_pod_container_resource_requests:sum" - expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by (namespace, pod, container, cluster) (kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" - enabled = true - } - - rule { - record = "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests" - expression = "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" - enabled = true - } - - rule { - record = "namespace_cpu:kube_pod_container_resource_requests:sum" - expression = "sum by (namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" - enabled = true - } - - rule { - record = "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits" - expression = "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" - enabled = true - } - - rule { - record = 
"namespace_memory:kube_pod_container_resource_limits:sum" - expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by (namespace, pod, container, cluster) (kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" - enabled = true - } - - rule { - record = "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits" - expression = "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1) )" - enabled = true - } - - rule { - record = "namespace_cpu:kube_pod_container_resource_limits:sum" - expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" - enabled = true - } - - rule { - record = "namespace_workload_pod:kube_pod_owner:relabel" - expression = "max by (cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"}, \"replicaset\", \"$1\", \"owner_name\", \"(.*)\") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by (replicaset, namespace, owner_name) (kube_replicaset_owner{job=\"kube-state-metrics\"})), \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" - labels = { - "workload_type" = "deployment" - } - enabled = true - } - - rule { - record = "namespace_workload_pod:kube_pod_owner:relabel" - expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" - labels = { - "workload_type" = "daemonset" - } - enabled = true - } - - rule { - record = "namespace_workload_pod:kube_pod_owner:relabel" - expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" - labels = { - "workload_type" = "statefulset" - } - enabled = true - } - - rule { - record = "namespace_workload_pod:kube_pod_owner:relabel" - expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"Job\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" - labels = { - "workload_type" = "job" - } - enabled = true - } - - rule { - record = ":node_memory_MemAvailable_bytes:sum" - expression = "sum(node_memory_MemAvailable_bytes{job=\"node\"} or (node_memory_Buffers_bytes{job=\"node\"} + node_memory_Cached_bytes{job=\"node\"} + node_memory_MemFree_bytes{job=\"node\"} + node_memory_Slab_bytes{job=\"node\"})) by (cluster)" - enabled = true - } - - rule { - record = "cluster:node_cpu:ratio_rate5m" - expression = "sum(rate(node_cpu_seconds_total{job=\"node\",mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) by (cluster) /count(sum(node_cpu_seconds_total{job=\"node\"}) by (cluster, instance, cpu)) by (cluster)" - enabled = true - } -} +# resource "azurerm_monitor_workspace" "prometheus" { +# name = var.prometheus_name +# resource_group_name = 
+#   resource_group_name           = azurerm_resource_group.rg_monitoring.name
+#   location                      = azurerm_resource_group.rg_monitoring.location
+#   public_network_access_enabled = false # false # true
+# }
+
+# resource "azurerm_role_assignment" "role_monitoring_data_reader_me" {
+#   scope                = azurerm_monitor_workspace.prometheus.id
+#   role_definition_name = "Monitoring Data Reader"
+#   principal_id         = data.azurerm_client_config.current.object_id
+# }
+
+# resource "azurerm_monitor_alert_prometheus_rule_group" "alert-prometheus-nodes" {
+#   name                = "NodeRecordingRulesRuleGroup"
+#   resource_group_name = azurerm_resource_group.rg_monitoring.name
+#   location            = azurerm_resource_group.rg_monitoring.location
+#   cluster_name        = azurerm_kubernetes_cluster.aks.name
+#   rule_group_enabled  = true
+#   interval            = "PT1M"
+#   scopes              = [azurerm_monitor_workspace.prometheus.id]
+
+#   rule {
+#     record     = "instance:node_num_cpu:sum"
+#     expression = "count without (cpu, mode) (node_cpu_seconds_total{job=\"node\",mode=\"idle\"})"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance:node_cpu_utilisation:rate5m"
+#     expression = "1 - avg without (cpu) (sum without (mode) (rate(node_cpu_seconds_total{job=\"node\", mode=~\"idle|iowait|steal\"}[5m])))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance:node_load1_per_cpu:ratio"
+#     expression = "(node_load1{job=\"node\"}/ instance:node_num_cpu:sum{job=\"node\"})"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance:node_memory_utilisation:ratio"
+#     expression = "1 - ((node_memory_MemAvailable_bytes{job=\"node\"} or (node_memory_Buffers_bytes{job=\"node\"} + node_memory_Cached_bytes{job=\"node\"} + node_memory_MemFree_bytes{job=\"node\"} + node_memory_Slab_bytes{job=\"node\"})) / node_memory_MemTotal_bytes{job=\"node\"})"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance:node_vmstat_pgmajfault:rate5m"
+#     expression = "rate(node_vmstat_pgmajfault{job=\"node\"}[5m])"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance_device:node_disk_io_time_seconds:rate5m"
+#     expression = "rate(node_disk_io_time_seconds_total{job=\"node\", device!=\"\"}[5m])"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance_device:node_disk_io_time_weighted_seconds:rate5m"
+#     expression = "rate(node_disk_io_time_weighted_seconds_total{job=\"node\", device!=\"\"}[5m])"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance:node_network_receive_bytes_excluding_lo:rate5m"
+#     expression = "sum without (device) (rate(node_network_receive_bytes_total{job=\"node\", device!=\"lo\"}[5m]))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance:node_network_transmit_bytes_excluding_lo:rate5m"
+#     expression = "sum without (device) (rate(node_network_transmit_bytes_total{job=\"node\", device!=\"lo\"}[5m]))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance:node_network_receive_drop_excluding_lo:rate5m"
+#     expression = "sum without (device) (rate(node_network_receive_drop_total{job=\"node\", device!=\"lo\"}[5m]))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "instance:node_network_transmit_drop_excluding_lo:rate5m"
+#     expression = "sum without (device) (rate(node_network_transmit_drop_total{job=\"node\", device!=\"lo\"}[5m]))"
+#     enabled    = true
+#   }
+# }
+
+# resource "azurerm_monitor_alert_prometheus_rule_group" "alert-prometheus-k8s" {
+#   name                = "KubernetesRecordingRulesRuleGroup"
+#   resource_group_name = azurerm_resource_group.rg_monitoring.name
+#   location            = azurerm_resource_group.rg_monitoring.location
+#   cluster_name        = azurerm_kubernetes_cluster.aks.name
+#   rule_group_enabled  = true
+#   interval            = "PT1M"
+#   scopes              = [azurerm_monitor_workspace.prometheus.id]
+
+#   rule {
+#     record     = "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate"
+#     expression = "sum by (cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\"}[5m])) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "node_namespace_pod_container:container_memory_working_set_bytes"
+#     expression = "container_memory_working_set_bytes{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "node_namespace_pod_container:container_memory_rss"
+#     expression = "container_memory_rss{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "node_namespace_pod_container:container_memory_cache"
+#     expression = "container_memory_cache{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "node_namespace_pod_container:container_memory_swap"
+#     expression = "container_memory_swap{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests"
+#     expression = "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on(namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "namespace_memory:kube_pod_container_resource_requests:sum"
+#     expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by (namespace, pod, container, cluster) (kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests"
+#     expression = "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "namespace_cpu:kube_pod_container_resource_requests:sum"
+#     expression = "sum by (namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))"
+#     enabled    = true
+#   }
+
+#   rule {
+#     record     = "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits"
((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" +# enabled = true +# } + +# rule { +# record = "namespace_memory:kube_pod_container_resource_limits:sum" +# expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by (namespace, pod, container, cluster) (kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" +# enabled = true +# } + +# rule { +# record = "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits" +# expression = "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1) )" +# enabled = true +# } + +# rule { +# record = "namespace_cpu:kube_pod_container_resource_limits:sum" +# expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" +# enabled = true +# } + +# rule { +# record = "namespace_workload_pod:kube_pod_owner:relabel" +# expression = "max by (cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"}, \"replicaset\", \"$1\", \"owner_name\", \"(.*)\") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by (replicaset, namespace, owner_name) (kube_replicaset_owner{job=\"kube-state-metrics\"})), \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" +# labels = { +# "workload_type" = "deployment" +# } +# enabled = true +# } + +# rule { +# record = "namespace_workload_pod:kube_pod_owner:relabel" +# expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" +# labels = { +# "workload_type" = "daemonset" +# } +# enabled = true +# } + +# rule { +# record = "namespace_workload_pod:kube_pod_owner:relabel" +# expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" +# labels = { +# "workload_type" = "statefulset" +# } +# enabled = true +# } + +# rule { +# record = "namespace_workload_pod:kube_pod_owner:relabel" +# expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"Job\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" +# labels = { +# "workload_type" = "job" +# } +# enabled = true +# } + +# rule { +# record = ":node_memory_MemAvailable_bytes:sum" +# expression = "sum(node_memory_MemAvailable_bytes{job=\"node\"} or (node_memory_Buffers_bytes{job=\"node\"} + node_memory_Cached_bytes{job=\"node\"} + node_memory_MemFree_bytes{job=\"node\"} + node_memory_Slab_bytes{job=\"node\"})) by (cluster)" +# enabled = true +# } + +# rule { +# record = "cluster:node_cpu:ratio_rate5m" +# expression = "sum(rate(node_cpu_seconds_total{job=\"node\",mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) by (cluster) /count(sum(node_cpu_seconds_total{job=\"node\"}) by (cluster, instance, cpu)) by (cluster)" +# 
+#     enabled    = true
+#   }
+# }
diff --git a/85_prometheus_grafana_private_endpoint/windows-vm.tf b/85_prometheus_grafana_private_endpoint/windows-vm.tf
new file mode 100644
index 0000000..968356f
--- /dev/null
+++ b/85_prometheus_grafana_private_endpoint/windows-vm.tf
@@ -0,0 +1,55 @@
+resource "azurerm_network_interface" "nic-vm" {
+  name                = "nic-vm-windows"
+  resource_group_name = azurerm_resource_group.rg_aks_cluster.name
+  location            = azurerm_resource_group.rg_aks_cluster.location
+
+  ip_configuration {
+    name                          = "internal"
+    subnet_id                     = azurerm_subnet.snet-aks.id
+    private_ip_address_allocation = "Dynamic"
+  }
+}
+
+resource "azurerm_windows_virtual_machine" "vm" {
+  name                  = "vm-jumpbox-w11"
+  resource_group_name   = azurerm_resource_group.rg_aks_cluster.name
+  location              = azurerm_resource_group.rg_aks_cluster.location
+  size                  = "Standard_B2ats_v2"
+  admin_username        = "azureuser"
+  admin_password        = "@Aa123456789"
+  network_interface_ids = [azurerm_network_interface.nic-vm.id]
+  priority              = "Spot"
+  eviction_policy       = "Deallocate"
+
+  # custom_data = filebase64("../scripts/install-tools-windows.ps1")
+
+  os_disk {
+    name                 = "os-disk-vm"
+    caching              = "ReadWrite"
+    storage_account_type = "Standard_LRS"
+  }
+
+  source_image_reference {
+    publisher = "MicrosoftWindowsDesktop"
+    offer     = "windows-11"
+    sku       = "win11-23h2-pro"
+    version   = "latest"
+  }
+
+  boot_diagnostics {
+    storage_account_uri = null
+  }
+}
+
+# resource "azurerm_virtual_machine_extension" "cloudinit" {
+#   name                 = "cloudinit"
+#   virtual_machine_id   = azurerm_windows_virtual_machine.vm.id
+#   publisher            = "Microsoft.Compute"
+#   type                 = "CustomScriptExtension"
+#   type_handler_version = "1.10"
+#   settings             = <