diff --git a/85_prometheus_grafana_private_endpoint/dce-prometheus.tf b/85_prometheus_grafana_private_endpoint/dce-prometheus.tf deleted file mode 100644 index 25da49c..0000000 --- a/85_prometheus_grafana_private_endpoint/dce-prometheus.tf +++ /dev/null @@ -1,14 +0,0 @@ -# resource "azurerm_monitor_data_collection_endpoint" "dce-prometheus" { -# name = "dce-prometheus" -# resource_group_name = azurerm_resource_group.rg_monitoring.name -# location = azurerm_resource_group.rg_monitoring.location -# kind = "Linux" -# public_network_access_enabled = false # true # false -# } - -# # associate to a Data Collection Endpoint -# resource "azurerm_monitor_data_collection_rule_association" "dcra-dce-prometheus-aks" { -# name = "configurationAccessEndpoint" # "dcra-dce-prometheus-aks" # # name is required when data_collection_rule_id is specified. And when data_collection_endpoint_id is specified, the name is populated with configurationAccessEndpoint -# target_resource_id = azurerm_kubernetes_cluster.aks.id -# data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id -# } diff --git a/85_prometheus_grafana_private_endpoint/dcr-prometheus.tf b/85_prometheus_grafana_private_endpoint/dcr-prometheus.tf deleted file mode 100644 index 337bb46..0000000 --- a/85_prometheus_grafana_private_endpoint/dcr-prometheus.tf +++ /dev/null @@ -1,35 +0,0 @@ -# resource "azurerm_monitor_data_collection_rule" "dcr-prometheus" { -# name = "dcr-prometheus" -# resource_group_name = azurerm_resource_group.rg_monitoring.name -# location = azurerm_resource_group.rg_monitoring.location -# data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id -# kind = "Linux" -# description = "DCR for Azure Monitor Metrics Profile (Managed Prometheus)" - -# data_sources { -# prometheus_forwarder { -# name = "PrometheusDataSource" -# streams = ["Microsoft-PrometheusMetrics"] -# } -# } - -# destinations { -# monitor_account { -# monitor_account_id = azurerm_monitor_workspace.prometheus.id -# name = azurerm_monitor_workspace.prometheus.name -# } -# } - -# data_flow { -# streams = ["Microsoft-PrometheusMetrics"] -# destinations = [azurerm_monitor_workspace.prometheus.name] -# } -# } - -# # associate to a Data Collection Rule -# resource "azurerm_monitor_data_collection_rule_association" "dcra-dcr-prometheus-aks" { -# name = "dcra-dcr-prometheus-aks" -# target_resource_id = azurerm_kubernetes_cluster.aks.id -# data_collection_rule_id = azurerm_monitor_data_collection_rule.dcr-prometheus.id -# description = "Association of data collection rule. Deleting this association will break the data collection for this AKS Cluster." 
-# } diff --git a/85_prometheus_grafana_private_endpoint/grafana.tf b/85_prometheus_grafana_private_endpoint/grafana.tf deleted file mode 100644 index c56d199..0000000 --- a/85_prometheus_grafana_private_endpoint/grafana.tf +++ /dev/null @@ -1,48 +0,0 @@ -# resource "azurerm_dashboard_grafana" "grafana" { -# name = var.grafana_name -# resource_group_name = azurerm_resource_group.rg_monitoring.name -# location = azurerm_resource_group.rg_monitoring.location -# api_key_enabled = true -# deterministic_outbound_ip_enabled = true -# public_network_access_enabled = true -# sku = "Standard" -# zone_redundancy_enabled = false -# grafana_major_version = "10" # 9 - -# azure_monitor_workspace_integrations { -# resource_id = azurerm_monitor_workspace.prometheus.id -# } - -# identity { -# type = "SystemAssigned" # "UserAssigned" # -# # identity_ids = [azurerm_user_assigned_identity.identity-grafana.id] -# } -# } - -# data "azurerm_client_config" "current" {} - -# resource "azurerm_role_assignment" "role_grafana_admin" { -# scope = azurerm_dashboard_grafana.grafana.id -# role_definition_name = "Grafana Admin" -# principal_id = data.azurerm_client_config.current.object_id -# } - -# resource "azurerm_role_assignment" "role_monitoring_data_reader" { -# scope = azurerm_monitor_workspace.prometheus.id -# role_definition_name = "Monitoring Data Reader" -# principal_id = azurerm_dashboard_grafana.grafana.identity.0.principal_id # azurerm_user_assigned_identity.identity-grafana.principal_id # -# } - -# data "azurerm_subscription" "current" {} - -# resource "azurerm_role_assignment" "role_monitoring_reader" { -# scope = data.azurerm_subscription.current.id -# role_definition_name = "Monitoring Reader" -# principal_id = azurerm_dashboard_grafana.grafana.identity.0.principal_id # azurerm_user_assigned_identity.identity-grafana.principal_id # -# } - -# # resource "azurerm_user_assigned_identity" "identity-grafana" { -# # name = "identity-grafana" -# # resource_group_name = azurerm_resource_group.rg_monitoring.name -# # location = azurerm_resource_group.rg_monitoring.location -# # } diff --git a/85_prometheus_grafana_private_endpoint/prometheus.tf b/85_prometheus_grafana_private_endpoint/prometheus.tf deleted file mode 100644 index 483723d..0000000 --- a/85_prometheus_grafana_private_endpoint/prometheus.tf +++ /dev/null @@ -1,224 +0,0 @@ -# resource "azurerm_monitor_workspace" "prometheus" { -# name = var.prometheus_name -# resource_group_name = azurerm_resource_group.rg_monitoring.name -# location = azurerm_resource_group.rg_monitoring.location -# public_network_access_enabled = false # false # true -# } - -# resource "azurerm_role_assignment" "role_monitoring_data_reader_me" { -# scope = azurerm_monitor_workspace.prometheus.id -# role_definition_name = "Monitoring Data Reader" -# principal_id = data.azurerm_client_config.current.object_id -# } - -# resource "azurerm_monitor_alert_prometheus_rule_group" "alert-prometheus-nodes" { -# name = "NodeRecordingRulesRuleGroup" -# resource_group_name = azurerm_resource_group.rg_monitoring.name -# location = azurerm_resource_group.rg_monitoring.location -# cluster_name = azurerm_kubernetes_cluster.aks.name -# rule_group_enabled = true -# interval = "PT1M" -# scopes = [azurerm_monitor_workspace.prometheus.id] - -# rule { -# record = "instance:node_num_cpu:sum" -# expression = "count without (cpu, mode) (node_cpu_seconds_total{job=\"node\",mode=\"idle\"})" -# enabled = true -# } - -# rule { -# record = "instance:node_cpu_utilisation:rate5m" -# expression = "1 - avg without 
(cpu) (sum without (mode) (rate(node_cpu_seconds_total{job=\"node\", mode=~\"idle|iowait|steal\"}[5m])))" -# enabled = true -# } - -# rule { -# record = "instance:node_load1_per_cpu:ratio" -# expression = "(node_load1{job=\"node\"}/ instance:node_num_cpu:sum{job=\"node\"})" -# enabled = true -# } - -# rule { -# record = "instance:node_memory_utilisation:ratio" -# expression = "1 - ((node_memory_MemAvailable_bytes{job=\"node\"} or (node_memory_Buffers_bytes{job=\"node\"} + node_memory_Cached_bytes{job=\"node\"} + node_memory_MemFree_bytes{job=\"node\"} + node_memory_Slab_bytes{job=\"node\"})) / node_memory_MemTotal_bytes{job=\"node\"})" -# enabled = true -# } - -# rule { -# record = "instance:node_vmstat_pgmajfault:rate5m" -# expression = "rate(node_vmstat_pgmajfault{job=\"node\"}[5m])" -# enabled = true -# } - -# rule { -# record = "instance_device:node_disk_io_time_seconds:rate5m" -# expression = "rate(node_disk_io_time_seconds_total{job=\"node\", device!=\"\"}[5m])" -# enabled = true -# } - -# rule { -# record = "instance_device:node_disk_io_time_weighted_seconds:rate5m" -# expression = "rate(node_disk_io_time_weighted_seconds_total{job=\"node\", device!=\"\"}[5m])" -# enabled = true -# } - -# rule { -# record = "instance:node_network_receive_bytes_excluding_lo:rate5m" -# expression = "sum without (device) (rate(node_network_receive_bytes_total{job=\"node\", device!=\"lo\"}[5m]))" -# enabled = true -# } - -# rule { -# record = "instance:node_network_transmit_bytes_excluding_lo:rate5m" -# expression = "sum without (device) (rate(node_network_transmit_bytes_total{job=\"node\", device!=\"lo\"}[5m]))" -# enabled = true -# } - -# rule { -# record = "instance:node_network_receive_drop_excluding_lo:rate5m" -# expression = "sum without (device) (rate(node_network_receive_drop_total{job=\"node\", device!=\"lo\"}[5m]))" -# enabled = true -# } - -# rule { -# record = "instance:node_network_transmit_drop_excluding_lo:rate5m" -# expression = "sum without (device) (rate(node_network_transmit_drop_total{job=\"node\", device!=\"lo\"}[5m]))" -# enabled = true -# } -# } - -# resource "azurerm_monitor_alert_prometheus_rule_group" "alert-prometheus-k8s" { -# name = "KubernetesRecordingRulesRuleGroup" -# resource_group_name = azurerm_resource_group.rg_monitoring.name -# location = azurerm_resource_group.rg_monitoring.location -# cluster_name = azurerm_kubernetes_cluster.aks.name -# rule_group_enabled = true -# interval = "PT1M" -# scopes = [azurerm_monitor_workspace.prometheus.id] - -# rule { -# record = "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate" -# expression = "sum by (cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\"}[5m])) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))" -# enabled = true -# } - -# rule { -# record = "node_namespace_pod_container:container_memory_working_set_bytes" -# expression = "container_memory_working_set_bytes{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" -# enabled = true -# } - -# rule { -# record = "node_namespace_pod_container:container_memory_rss" -# expression = "container_memory_rss{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" -# enabled = true -# } - -# rule { -# 
record = "node_namespace_pod_container:container_memory_cache" -# expression = "container_memory_cache{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" -# enabled = true -# } - -# rule { -# record = "node_namespace_pod_container:container_memory_swap" -# expression = "container_memory_swap{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" -# enabled = true -# } - -# rule { -# record = "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests" -# expression = "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on(namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" -# enabled = true -# } - -# rule { -# record = "namespace_memory:kube_pod_container_resource_requests:sum" -# expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by (namespace, pod, container, cluster) (kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" -# enabled = true -# } - -# rule { -# record = "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests" -# expression = "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" -# enabled = true -# } - -# rule { -# record = "namespace_cpu:kube_pod_container_resource_requests:sum" -# expression = "sum by (namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" -# enabled = true -# } - -# rule { -# record = "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits" -# expression = "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" -# enabled = true -# } - -# rule { -# record = "namespace_memory:kube_pod_container_resource_limits:sum" -# expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by (namespace, pod, container, cluster) (kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" -# enabled = true -# } - -# rule { -# record = "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits" -# expression = "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1) )" -# enabled = true -# } - -# rule { -# record = "namespace_cpu:kube_pod_container_resource_limits:sum" -# expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by(namespace, pod, container, cluster) 
(kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" -# enabled = true -# } - -# rule { -# record = "namespace_workload_pod:kube_pod_owner:relabel" -# expression = "max by (cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"}, \"replicaset\", \"$1\", \"owner_name\", \"(.*)\") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by (replicaset, namespace, owner_name) (kube_replicaset_owner{job=\"kube-state-metrics\"})), \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" -# labels = { -# "workload_type" = "deployment" -# } -# enabled = true -# } - -# rule { -# record = "namespace_workload_pod:kube_pod_owner:relabel" -# expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" -# labels = { -# "workload_type" = "daemonset" -# } -# enabled = true -# } - -# rule { -# record = "namespace_workload_pod:kube_pod_owner:relabel" -# expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" -# labels = { -# "workload_type" = "statefulset" -# } -# enabled = true -# } - -# rule { -# record = "namespace_workload_pod:kube_pod_owner:relabel" -# expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"Job\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" -# labels = { -# "workload_type" = "job" -# } -# enabled = true -# } - -# rule { -# record = ":node_memory_MemAvailable_bytes:sum" -# expression = "sum(node_memory_MemAvailable_bytes{job=\"node\"} or (node_memory_Buffers_bytes{job=\"node\"} + node_memory_Cached_bytes{job=\"node\"} + node_memory_MemFree_bytes{job=\"node\"} + node_memory_Slab_bytes{job=\"node\"})) by (cluster)" -# enabled = true -# } - -# rule { -# record = "cluster:node_cpu:ratio_rate5m" -# expression = "sum(rate(node_cpu_seconds_total{job=\"node\",mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) by (cluster) /count(sum(node_cpu_seconds_total{job=\"node\"}) by (cluster, instance, cpu)) by (cluster)" -# enabled = true -# } -# } diff --git a/85_prometheus_grafana_private_endpoint/resource_group.tf b/85_prometheus_grafana_private_endpoint/resource_group.tf deleted file mode 100644 index 276e5b7..0000000 --- a/85_prometheus_grafana_private_endpoint/resource_group.tf +++ /dev/null @@ -1,9 +0,0 @@ -resource "azurerm_resource_group" "rg_aks_cluster" { - name = var.rg_aks_cluster - location = var.resources_location -} - -resource "azurerm_resource_group" "rg_monitoring" { - name = var.rg_monitoring - location = var.resources_location -} \ No newline at end of file diff --git a/85_prometheus_grafana_private_endpoint/variables.tf b/85_prometheus_grafana_private_endpoint/variables.tf deleted file mode 100644 index f4d0003..0000000 --- a/85_prometheus_grafana_private_endpoint/variables.tf +++ /dev/null @@ -1,29 +0,0 @@ -variable "resources_location" { - type = string - default = "swedencentral" -} - -variable "rg_aks_cluster" { - type = string - default = "rg-aks-cluster" -} - -variable "rg_monitoring" { - type = string - default = "rg-monitoring" -} - -variable "aks_name" { - type = 
string - default = "aks-cluster" -} - -variable "grafana_name" { - type = string - default = "azure-grafana-17" -} - -variable "prometheus_name" { - type = string - default = "azure-prometheus" -} diff --git a/85_prometheus_grafana_private_endpoint/.infracost/terraform_modules/manifest.json b/87_log_analytics_ampls/.infracost/terraform_modules/manifest.json similarity index 100% rename from 85_prometheus_grafana_private_endpoint/.infracost/terraform_modules/manifest.json rename to 87_log_analytics_ampls/.infracost/terraform_modules/manifest.json diff --git a/85_prometheus_grafana_private_endpoint/Readme.md b/87_log_analytics_ampls/Readme.md similarity index 100% rename from 85_prometheus_grafana_private_endpoint/Readme.md rename to 87_log_analytics_ampls/Readme.md diff --git a/85_prometheus_grafana_private_endpoint/aks.tf b/87_log_analytics_ampls/aks.tf similarity index 81% rename from 85_prometheus_grafana_private_endpoint/aks.tf rename to 87_log_analytics_ampls/aks.tf index bbb3abf..fb23477 100644 --- a/85_prometheus_grafana_private_endpoint/aks.tf +++ b/87_log_analytics_ampls/aks.tf @@ -1,7 +1,7 @@ resource "azurerm_kubernetes_cluster" "aks" { - name = var.aks_name - location = azurerm_resource_group.rg_aks_cluster.location - resource_group_name = azurerm_resource_group.rg_aks_cluster.name + name = "aks-cluster" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name dns_prefix = "aks" kubernetes_version = "1.28.5" # "1.29.0" @@ -13,7 +13,7 @@ resource "azurerm_kubernetes_cluster" "aks" { default_node_pool { name = "systempool" - node_count = "3" + node_count = 3 vm_size = "standard_b2als_v2" vnet_subnet_id = azurerm_subnet.snet-aks.id } diff --git a/85_prometheus_grafana_private_endpoint/ampls.tf b/87_log_analytics_ampls/ampls.tf similarity index 87% rename from 85_prometheus_grafana_private_endpoint/ampls.tf rename to 87_log_analytics_ampls/ampls.tf index 4bc8a64..f63dc33 100644 --- a/85_prometheus_grafana_private_endpoint/ampls.tf +++ b/87_log_analytics_ampls/ampls.tf @@ -1,18 +1,18 @@ resource "azurerm_monitor_private_link_scope" "ampls" { name = "ampls-monitoring" - resource_group_name = azurerm_resource_group.rg_monitoring.name + resource_group_name = azurerm_resource_group.rg.name } resource "azurerm_monitor_private_link_scoped_service" "ampls-log-analytics" { name = "ampls-log-analytics" - resource_group_name = azurerm_resource_group.rg_monitoring.name + resource_group_name = azurerm_resource_group.rg.name scope_name = azurerm_monitor_private_link_scope.ampls.name linked_resource_id = azurerm_log_analytics_workspace.workspace.id } resource "azurerm_monitor_private_link_scoped_service" "ampls-dce-log-analytics" { name = "ampls-dce-log-analytics" - resource_group_name = azurerm_resource_group.rg_monitoring.name + resource_group_name = azurerm_resource_group.rg.name scope_name = azurerm_monitor_private_link_scope.ampls.name linked_resource_id = azurerm_monitor_data_collection_endpoint.dce-log-analytics.id } diff --git a/87_log_analytics_ampls/bastion.tf b/87_log_analytics_ampls/bastion.tf new file mode 100644 index 0000000..8f05360 --- /dev/null +++ b/87_log_analytics_ampls/bastion.tf @@ -0,0 +1,25 @@ +resource "azurerm_public_ip" "pip-bastion" { + name = "pip-bastion" + resource_group_name = azurerm_resource_group.rg-jumpbox.name + location = azurerm_resource_group.rg-jumpbox.location + allocation_method = "Static" + sku = "Standard" +} + +resource "azurerm_bastion_host" "bastion" { + name = "bastion" + resource_group_name = 
azurerm_resource_group.rg-jumpbox.name + location = azurerm_resource_group.rg-jumpbox.location + sku = "Standard" # "Standard" # "Basic", "Developer" + copy_paste_enabled = true + file_copy_enabled = false + shareable_link_enabled = false + tunneling_enabled = false + ip_connect_enabled = false + + ip_configuration { + name = "configuration" + subnet_id = azurerm_subnet.snet-bastion.id + public_ip_address_id = azurerm_public_ip.pip-bastion.id + } +} \ No newline at end of file diff --git a/87_log_analytics_ampls/commands.ps1 b/87_log_analytics_ampls/commands.ps1 new file mode 100644 index 0000000..721ba96 --- /dev/null +++ b/87_log_analytics_ampls/commands.ps1 @@ -0,0 +1,58 @@ +# https://github.com/Azure/prometheus-collector/blob/main/AddonTerraformTemplate/main.tf + +terraform init + +terraform plan -out tfplan + +terraform apply tfplan + +terraform destroy + +$grafana_name=(terraform output grafana_name) +$grafana_rg_name=(terraform output grafana_rg_name) + +$aks_name=(terraform output aks_name) +$aks_rg_name=(terraform output aks_rg_name) + +az grafana dashboard import ` + --name $grafana_name ` + --resource-group $grafana_rg_name ` + --definition "https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/grafana/dashboards/nginx.json" + + +az grafana dashboard import ` + --name $grafana_name ` + --resource-group $grafana_rg_name ` + --definition "https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/grafana/dashboards/request-handling-performance.json" + + + +az aks get-credentials --resource-group $aks_rg_name --name $aks_name --overwrite-existing + +helm install nginx-ingress oci://ghcr.io/nginxinc/charts/nginx-ingress --namespace ingress --create-namespace + +kubectl apply -f deploy-svc-ingress.yaml + +kubectl apply -f container-azm-ms-agentconfig.yaml + +kubectl apply -f ama-metrics-settings-configmap.yaml + +# Kubernetes / API server +az grafana dashboard import --name $grafana_name --resource-group $grafana_rg_name --definition 20331 + +# Kubernetes / ETCD +az grafana dashboard import --name $grafana_name --resource-group $grafana_rg_name --definition 20330 + +# Dashboard for IP consumption +# https://github.com/Azure/azure-container-networking/tree/master/cns/doc/examples/metrics +kubectl -n kube-system get nnc +# NAME ALLOCATED IPS NC MODE NC VERSION +# aks-systempool-96223890-vmss000000 256 static 0 +# aks-systempool-96223890-vmss000001 256 static 0 +# aks-systempool-96223890-vmss000002 256 static 0 + +# https://github.com/grafana/helm-charts/tree/main/charts/grafana +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update + +helm install grafana grafana/grafana --namespace monitoring --create-namespace --set persistence.enabled=true --set persistence.size=10Gi --set adminPassword=admin --set service.type=LoadBalancer \ No newline at end of file diff --git a/85_prometheus_grafana_private_endpoint/commands.sh b/87_log_analytics_ampls/commands.sh similarity index 100% rename from 85_prometheus_grafana_private_endpoint/commands.sh rename to 87_log_analytics_ampls/commands.sh diff --git a/85_prometheus_grafana_private_endpoint/container-azm-ms-agentconfig.yaml b/87_log_analytics_ampls/container-azm-ms-agentconfig.yaml similarity index 100% rename from 85_prometheus_grafana_private_endpoint/container-azm-ms-agentconfig.yaml rename to 87_log_analytics_ampls/container-azm-ms-agentconfig.yaml diff --git a/87_log_analytics_ampls/counter-pod.yaml b/87_log_analytics_ampls/counter-pod.yaml new file mode 100644 index 0000000..b56a76a --- 
/dev/null +++ b/87_log_analytics_ampls/counter-pod.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Pod +metadata: + name: counter +spec: + containers: + - name: count + image: busybox + args: [/bin/sh, -c, 'i=0; while true; do echo "This is demo log $i: $(date)"; i=$((i+1)); sleep 10; done'] \ No newline at end of file diff --git a/85_prometheus_grafana_private_endpoint/dce-log_analytics.tf b/87_log_analytics_ampls/dce-log_analytics.tf similarity index 81% rename from 85_prometheus_grafana_private_endpoint/dce-log_analytics.tf rename to 87_log_analytics_ampls/dce-log_analytics.tf index 7aacd7e..99fc49e 100644 --- a/85_prometheus_grafana_private_endpoint/dce-log_analytics.tf +++ b/87_log_analytics_ampls/dce-log_analytics.tf @@ -1,7 +1,7 @@ resource "azurerm_monitor_data_collection_endpoint" "dce-log-analytics" { name = "dce-log-analytics" - resource_group_name = azurerm_resource_group.rg_monitoring.name - location = azurerm_resource_group.rg_monitoring.location + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location public_network_access_enabled = false } diff --git a/85_prometheus_grafana_private_endpoint/dcr-log_analytics.tf b/87_log_analytics_ampls/dcr-log_analytics.tf similarity index 91% rename from 85_prometheus_grafana_private_endpoint/dcr-log_analytics.tf rename to 87_log_analytics_ampls/dcr-log_analytics.tf index 13c0e39..9127c2a 100644 --- a/85_prometheus_grafana_private_endpoint/dcr-log_analytics.tf +++ b/87_log_analytics_ampls/dcr-log_analytics.tf @@ -1,7 +1,7 @@ resource "azurerm_monitor_data_collection_rule" "dcr-log-analytics" { name = "dcr-log-analytics" - resource_group_name = azurerm_resource_group.rg_monitoring.name - location = azurerm_resource_group.rg_monitoring.location + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-log-analytics.id destinations { diff --git a/87_log_analytics_ampls/diagnostic_setting.tf b/87_log_analytics_ampls/diagnostic_setting.tf new file mode 100644 index 0000000..b662bae --- /dev/null +++ b/87_log_analytics_ampls/diagnostic_setting.tf @@ -0,0 +1,42 @@ +locals { + resources = [ + { + type = "aks" + id = azurerm_kubernetes_cluster.aks.id + } + ] +} + +data "azurerm_monitor_diagnostic_categories" "resources" { + for_each = { for resource in local.resources : resource.type => resource } + + resource_id = each.value.id +} + +resource "azurerm_monitor_diagnostic_setting" "rule" { + for_each = { for resource in local.resources : resource.type => resource } + + name = "diagnostic-setting" + target_resource_id = each.value.id + log_analytics_workspace_id = azurerm_log_analytics_workspace.workspace.id + log_analytics_destination_type = "Dedicated" # "AzureDiagnostics" + + dynamic "enabled_log" { + iterator = entry + for_each = data.azurerm_monitor_diagnostic_categories.resources[each.key].log_category_types + + content { + category = entry.value + } + } + + dynamic "metric" { + iterator = entry + for_each = data.azurerm_monitor_diagnostic_categories.resources[each.key].metrics + + content { + category = entry.value + enabled = true + } + } +} diff --git a/85_prometheus_grafana_private_endpoint/images/85_prometheus_grafana__architecture.png b/87_log_analytics_ampls/images/85_prometheus_grafana__architecture.png similarity index 100% rename from 85_prometheus_grafana_private_endpoint/images/85_prometheus_grafana__architecture.png rename to 
87_log_analytics_ampls/images/85_prometheus_grafana__architecture.png diff --git a/85_prometheus_grafana_private_endpoint/log_analytics.tf b/87_log_analytics_ampls/log_analytics.tf similarity index 84% rename from 85_prometheus_grafana_private_endpoint/log_analytics.tf rename to 87_log_analytics_ampls/log_analytics.tf index 450a3d5..86ff15b 100644 --- a/85_prometheus_grafana_private_endpoint/log_analytics.tf +++ b/87_log_analytics_ampls/log_analytics.tf @@ -1,11 +1,11 @@ resource "azurerm_log_analytics_workspace" "workspace" { name = "log-analytics-workspace" - resource_group_name = azurerm_resource_group.rg_monitoring.name - location = var.resources_location + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location sku = "PerGB2018" # PerGB2018, Free, PerNode, Premium, Standard, Standalone, Unlimited, CapacityReservation retention_in_days = 30 # possible values are either 7 (Free Tier only) or range between 30 and 730 internet_ingestion_enabled = false - internet_query_enabled = true + internet_query_enabled = false } # resource "azurerm_log_analytics_solution" "solution" { diff --git a/87_log_analytics_ampls/output.tf b/87_log_analytics_ampls/output.tf new file mode 100644 index 0000000..63d4ec9 --- /dev/null +++ b/87_log_analytics_ampls/output.tf @@ -0,0 +1,7 @@ +output "aks_name" { + value = azurerm_kubernetes_cluster.aks.name +} + +output "aks_rg_name" { + value = azurerm_kubernetes_cluster.aks.resource_group_name +} \ No newline at end of file diff --git a/85_prometheus_grafana_private_endpoint/pe-ampls.tf b/87_log_analytics_ampls/pe-ampls.tf similarity index 86% rename from 85_prometheus_grafana_private_endpoint/pe-ampls.tf rename to 87_log_analytics_ampls/pe-ampls.tf index d44bd07..47355dc 100644 --- a/85_prometheus_grafana_private_endpoint/pe-ampls.tf +++ b/87_log_analytics_ampls/pe-ampls.tf @@ -10,8 +10,8 @@ locals { resource "azurerm_private_endpoint" "pe-ampls" { name = "pe-ampls" - resource_group_name = azurerm_virtual_network.vnet.resource_group_name - location = azurerm_virtual_network.vnet.location + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location subnet_id = azurerm_subnet.snet-pe.id private_service_connection { @@ -30,7 +30,7 @@ resource "azurerm_private_endpoint" "pe-ampls" { resource "azurerm_private_dns_zone" "zones" { for_each = toset(local.dns_zones_ampls) name = each.value - resource_group_name = azurerm_resource_group.rg_monitoring.name + resource_group_name = azurerm_resource_group.rg.name } resource "azurerm_private_dns_zone_virtual_network_link" "link" { diff --git a/85_prometheus_grafana_private_endpoint/providers.tf b/87_log_analytics_ampls/providers.tf similarity index 74% rename from 85_prometheus_grafana_private_endpoint/providers.tf rename to 87_log_analytics_ampls/providers.tf index 5f23363..b57fead 100644 --- a/85_prometheus_grafana_private_endpoint/providers.tf +++ b/87_log_analytics_ampls/providers.tf @@ -13,11 +13,6 @@ terraform { source = "hashicorp/azuread" version = "= 2.47.0" } - - # azapi = { - # source = "Azure/azapi" - # version = "1.12.1" - # } } } @@ -27,8 +22,4 @@ provider "azurerm" { # Configure the Azure Active Directory Provider provider "azuread" { # default takes current user/identity tenant -} - -# provider "azapi" { -# # Configuration options -# } +} \ No newline at end of file diff --git a/87_log_analytics_ampls/rg.tf b/87_log_analytics_ampls/rg.tf new file mode 100644 index 0000000..a5465f3 --- /dev/null +++ 
b/87_log_analytics_ampls/rg.tf @@ -0,0 +1,10 @@ +resource "azurerm_resource_group" "rg" { + name = "rg-aks-monitoring-${var.prefix}" + location = "swedencentral" +} + +resource "azurerm_resource_group" "rg-jumpbox" { + name = "rg-jumpbox-${var.prefix}" + location = "swedencentral" +} + diff --git a/87_log_analytics_ampls/variables.tf b/87_log_analytics_ampls/variables.tf new file mode 100644 index 0000000..7358604 --- /dev/null +++ b/87_log_analytics_ampls/variables.tf @@ -0,0 +1,3 @@ +variable "prefix" { + default = 900 +} \ No newline at end of file diff --git a/85_prometheus_grafana_private_endpoint/vnet.tf b/87_log_analytics_ampls/vnet.tf similarity index 62% rename from 85_prometheus_grafana_private_endpoint/vnet.tf rename to 87_log_analytics_ampls/vnet.tf index b59b497..2a42a9e 100644 --- a/85_prometheus_grafana_private_endpoint/vnet.tf +++ b/87_log_analytics_ampls/vnet.tf @@ -1,7 +1,7 @@ resource "azurerm_virtual_network" "vnet" { name = "vnet-aks" - resource_group_name = azurerm_resource_group.rg_aks_cluster.name - location = azurerm_resource_group.rg_aks_cluster.location + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location address_space = ["10.10.0.0/16"] } @@ -17,4 +17,11 @@ resource "azurerm_subnet" "snet-pe" { virtual_network_name = azurerm_virtual_network.vnet.name resource_group_name = azurerm_virtual_network.vnet.resource_group_name address_prefixes = ["10.10.1.0/24"] +} + +resource "azurerm_subnet" "snet-bastion" { + name = "AzureBastionSubnet" + virtual_network_name = azurerm_virtual_network.vnet.name + resource_group_name = azurerm_virtual_network.vnet.resource_group_name + address_prefixes = ["10.10.2.0/24"] } \ No newline at end of file diff --git a/87_log_analytics_ampls/windows-vm.tf b/87_log_analytics_ampls/windows-vm.tf new file mode 100644 index 0000000..30deb2f --- /dev/null +++ b/87_log_analytics_ampls/windows-vm.tf @@ -0,0 +1,40 @@ +resource "azurerm_network_interface" "nic-vm" { + name = "nic-vm-windows" + resource_group_name = azurerm_resource_group.rg-jumpbox.name + location = azurerm_resource_group.rg-jumpbox.location + + ip_configuration { + name = "internal" + subnet_id = azurerm_subnet.snet-aks.id + private_ip_address_allocation = "Dynamic" + } +} + +resource "azurerm_windows_virtual_machine" "vm" { + name = "vm-jumpbox-w11" + resource_group_name = azurerm_resource_group.rg-jumpbox.name + location = azurerm_resource_group.rg-jumpbox.location + size = "Standard_B2als_v2" # "Standard_B2ats_v2" + admin_username = "azureuser" + admin_password = "@Aa123456789" + network_interface_ids = [azurerm_network_interface.nic-vm.id] + priority = "Spot" + eviction_policy = "Deallocate" + + os_disk { + name = "os-disk-vm" + caching = "ReadWrite" + storage_account_type = "Standard_LRS" + } + + source_image_reference { + publisher = "MicrosoftWindowsDesktop" + offer = "windows-11" + sku = "win11-23h2-pro" + version = "latest" + } + + boot_diagnostics { + storage_account_uri = null + } +} \ No newline at end of file diff --git a/88_prometheus_grafana_ampls/.infracost/terraform_modules/manifest.json b/88_prometheus_grafana_ampls/.infracost/terraform_modules/manifest.json new file mode 100644 index 0000000..5d95a8b --- /dev/null +++ b/88_prometheus_grafana_ampls/.infracost/terraform_modules/manifest.json @@ -0,0 +1 @@ +{"Path":"d:\\Projects\\docker-kubernetes-course\\85_prometheus_grafana","Version":"2.0","Modules":[]} \ No newline at end of file diff --git a/88_prometheus_grafana_ampls/Readme.md 
b/88_prometheus_grafana_ampls/Readme.md new file mode 100644 index 0000000..4526c6d --- /dev/null +++ b/88_prometheus_grafana_ampls/Readme.md @@ -0,0 +1,42 @@ +# Using Azure Grafana and a Prometheus workspace with AKS using Terraform + +## Introduction + +This lab shows how to use Terraform to provision an AKS cluster, Grafana, and a Monitor Workspace for Prometheus, all configured together to collect metrics from the cluster and expose them through Grafana dashboards. + +![](images/85_prometheus_grafana__architecture.png) + +## Challenges + +Azure Monitor Workspace for Prometheus is a new service (in preview). +It is not yet supported with an ARM template or a native Terraform resource. + +So we'll use the `azapi` Terraform provider to create the Monitor Workspace for Prometheus (a sketch of this pattern is shown at the end of this section). + +And we'll use a `local-exec` provisioner to run a command line to configure AKS with Prometheus. + +AKS, Grafana and Log Analytics are supported with ARM templates and Terraform. + +## Deploying the resources using Terraform + +To deploy the Terraform configuration files, run the following commands: + +```sh +terraform init + +terraform plan -out tfplan + +terraform apply tfplan +``` + +## Cleanup resources + +To delete the created resources, run the following command: + +```sh +terraform destroy +``` + +## More readings + +https://learn.microsoft.com/en-us/azure/azure-monitor/essentials/azure-monitor-workspace-manage?tabs=azure-portal diff --git a/88_prometheus_grafana_ampls/aks.tf b/88_prometheus_grafana_ampls/aks.tf new file mode 100644 index 0000000..a1aa1ca --- /dev/null +++ b/88_prometheus_grafana_ampls/aks.tf @@ -0,0 +1,43 @@ +resource "azurerm_kubernetes_cluster" "aks" { + name = "aks-cluster" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + dns_prefix = "aks" + kubernetes_version = "1.28.5" # "1.29.0" + + network_profile { + network_plugin = "azure" + network_plugin_mode = "overlay" + ebpf_data_plane = "cilium" + } + + default_node_pool { + name = "systempool" + node_count = 3 + vm_size = "standard_b2als_v2" + vnet_subnet_id = azurerm_subnet.snet-aks.id + } + + identity { + type = "UserAssigned" # "SystemAssigned" + identity_ids = [azurerm_user_assigned_identity.identity-aks.id] + } + + oms_agent { + log_analytics_workspace_id = azurerm_log_analytics_workspace.workspace.id + msi_auth_for_monitoring_enabled = true + } + + monitor_metrics { + annotations_allowed = null + labels_allowed = null + } + + lifecycle { + ignore_changes = [ + # monitor_metrics, + default_node_pool.0.upgrade_settings, + # default_node_pool.0.upgrade_settings.0.max_surge, + ] + } +} diff --git a/88_prometheus_grafana_ampls/aks_identity.tf b/88_prometheus_grafana_ampls/aks_identity.tf new file mode 100644 index 0000000..c0007c9 --- /dev/null +++ b/88_prometheus_grafana_ampls/aks_identity.tf @@ -0,0 +1,26 @@ +resource "azurerm_user_assigned_identity" "identity-aks" { + name = "identity-aks" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location +} + +resource "azurerm_role_assignment" "network-contributor" { + scope = azurerm_virtual_network.vnet.id + role_definition_name = "Network Contributor" + principal_id = azurerm_user_assigned_identity.identity-aks.principal_id + skip_service_principal_aad_check = true +} + +# resource "azurerm_role_assignment" "Managed-Identity-Operator" { +# scope = azurerm_user_assigned_identity.identity-kubelet.id +# role_definition_name = "Managed Identity Operator" +# principal_id = 
azurerm_user_assigned_identity.identity-aks.principal_id +# skip_service_principal_aad_check = true +# } + +# resource "azurerm_role_assignment" "role_identity_aks_contributor" { +# scope = azurerm_resource_group.rg.id +# role_definition_name = "Contributor" +# principal_id = azurerm_user_assigned_identity.identity-aks.principal_id +# skip_service_principal_aad_check = true +# } \ No newline at end of file diff --git a/85_prometheus_grafana_private_endpoint/ama-metrics-settings-configmap.yaml b/88_prometheus_grafana_ampls/ama-metrics-settings-configmap.yaml similarity index 100% rename from 85_prometheus_grafana_private_endpoint/ama-metrics-settings-configmap.yaml rename to 88_prometheus_grafana_ampls/ama-metrics-settings-configmap.yaml diff --git a/88_prometheus_grafana_ampls/ampls.tf b/88_prometheus_grafana_ampls/ampls.tf new file mode 100644 index 0000000..ef9cd68 --- /dev/null +++ b/88_prometheus_grafana_ampls/ampls.tf @@ -0,0 +1,33 @@ +resource "azurerm_monitor_private_link_scope" "ampls" { + name = "ampls-monitoring" + resource_group_name = azurerm_resource_group.rg.name +} + +resource "azurerm_monitor_private_link_scoped_service" "ampls-log-analytics" { + name = "ampls-log-analytics" + scope_name = azurerm_monitor_private_link_scope.ampls.name + resource_group_name = azurerm_monitor_private_link_scope.ampls.resource_group_name + linked_resource_id = azurerm_log_analytics_workspace.workspace.id +} + +resource "azurerm_monitor_private_link_scoped_service" "ampls-dce-log-analytics" { + name = "ampls-dce-log-analytics" + scope_name = azurerm_monitor_private_link_scope.ampls.name + resource_group_name = azurerm_monitor_private_link_scope.ampls.resource_group_name + linked_resource_id = azurerm_monitor_data_collection_endpoint.dce-log-analytics.id +} + +# not required +# resource "azurerm_monitor_private_link_scoped_service" "prometheus" { +# name = "ampls-prometheus" +# resource_group_name = azurerm_resource_group.rg.name +# scope_name = azurerm_monitor_private_link_scope.ampls.name +# linked_resource_id = azurerm_monitor_workspace.prometheus.id +# } + +resource "azurerm_monitor_private_link_scoped_service" "ampls-dce-prometheus" { + name = "ampls-dce-prometheus" + scope_name = azurerm_monitor_private_link_scope.ampls.name + resource_group_name = azurerm_monitor_private_link_scope.ampls.resource_group_name + linked_resource_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id +} \ No newline at end of file diff --git a/88_prometheus_grafana_ampls/bastion.tf b/88_prometheus_grafana_ampls/bastion.tf new file mode 100644 index 0000000..8f05360 --- /dev/null +++ b/88_prometheus_grafana_ampls/bastion.tf @@ -0,0 +1,25 @@ +resource "azurerm_public_ip" "pip-bastion" { + name = "pip-bastion" + resource_group_name = azurerm_resource_group.rg-jumpbox.name + location = azurerm_resource_group.rg-jumpbox.location + allocation_method = "Static" + sku = "Standard" +} + +resource "azurerm_bastion_host" "bastion" { + name = "bastion" + resource_group_name = azurerm_resource_group.rg-jumpbox.name + location = azurerm_resource_group.rg-jumpbox.location + sku = "Standard" # "Standard" # "Basic", "Developer" + copy_paste_enabled = true + file_copy_enabled = false + shareable_link_enabled = false + tunneling_enabled = false + ip_connect_enabled = false + + ip_configuration { + name = "configuration" + subnet_id = azurerm_subnet.snet-bastion.id + public_ip_address_id = azurerm_public_ip.pip-bastion.id + } +} \ No newline at end of file diff --git 
a/85_prometheus_grafana_private_endpoint/commands.ps1 b/88_prometheus_grafana_ampls/commands.ps1 similarity index 100% rename from 85_prometheus_grafana_private_endpoint/commands.ps1 rename to 88_prometheus_grafana_ampls/commands.ps1 diff --git a/88_prometheus_grafana_ampls/commands.sh b/88_prometheus_grafana_ampls/commands.sh new file mode 100644 index 0000000..e331d02 --- /dev/null +++ b/88_prometheus_grafana_ampls/commands.sh @@ -0,0 +1,7 @@ +terraform init + +terraform plan -out tfplan + +terraform apply tfplan + +terraform destroy \ No newline at end of file diff --git a/88_prometheus_grafana_ampls/container-azm-ms-agentconfig.yaml b/88_prometheus_grafana_ampls/container-azm-ms-agentconfig.yaml new file mode 100644 index 0000000..9ce03d6 --- /dev/null +++ b/88_prometheus_grafana_ampls/container-azm-ms-agentconfig.yaml @@ -0,0 +1,211 @@ +# src: https://raw.githubusercontent.com/microsoft/Docker-Provider/ci_prod/kubernetes/container-azm-ms-agentconfig.yaml +# doc: https://learn.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-agent-config + +kind: ConfigMap +apiVersion: v1 +metadata: + name: container-azm-ms-agentconfig + namespace: kube-system +data: + schema-version: + #string.used by agent to parse config. supported versions are {v1}. Configs with other schema versions will be rejected by the agent. + v1 + config-version: + #string.used by customer to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated) + ver1 + log-data-collection-settings: |- + # Log data collection settings + # Any errors related to config map settings can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to. + + [log_collection_settings] + [log_collection_settings.stdout] + # In the absence of this configmap, default value for enabled is true + enabled = true + # exclude_namespaces setting takes effect only if enabled is set to true + # kube-system,gatekeeper-system log collection are disabled by default in the absence of 'log_collection_settings.stdout' setting. If you want to enable kube-system,gatekeeper-system, remove them from the following setting. + # If you want to continue to disable kube-system,gatekeeper-system log collection keep the namespaces in the following setting and add any other namespace you want to disable log collection to the array. + # In the absence of this configmap, default value for exclude_namespaces = ["kube-system","gatekeeper-system"] + exclude_namespaces = ["gatekeeper-system"] # ["kube-system","gatekeeper-system"] + + [log_collection_settings.stderr] + # Default value for enabled is true + # exclude_namespaces setting takes effect only if enabled is set to true + # kube-system,gatekeeper-system log collection are disabled by default in the absence of 'log_collection_settings.stderr' setting. If you want to enable kube-system,gatekeeper-system, remove them from the following setting. + # If you want to continue to disable kube-system,gatekeeper-system log collection keep the namespaces in the following setting and add any other namespace you want to disable log collection to the array. 
+ # In the absence of this configmap, default value for exclude_namespaces = ["kube-system","gatekeeper-system"] + exclude_namespaces = ["gatekeeper-system"] # ["kube-system","gatekeeper-system"] + + [log_collection_settings.env_var] + # In the absence of this configmap, default value for enabled is true + enabled = true + [log_collection_settings.enrich_container_logs] + # In the absence of this configmap, default value for enrich_container_logs is false + enabled = true + # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image + [log_collection_settings.collect_all_kube_events] + # In the absence of this configmap, default value for collect_all_kube_events is false + # When the setting is set to false, only the kube events with !normal event type will be collected + enabled = false + # When this is enabled (enabled = true), all kube events including normal events will be collected + [log_collection_settings.schema] + # In the absence of this configmap, default value for containerlog_schema_version is "v1" + # Supported values for this setting are "v1","v2" + # See documentation at https://aka.ms/ContainerLogv2 for benefits of v2 schema over v1 schema before opting for "v2" schema + containerlog_schema_version = "v2" + [log_collection_settings.enable_multiline_logs] + # fluent-bit based multiline log collection for .NET, Go, Java, and Python stacktraces. + # if enabled will also stitch together container logs split by docker/cri due to size limits (16KB per log line) + enabled = "true" + + + prometheus-data-collection-settings: |- + # Custom Prometheus metrics data collection settings + [prometheus_data_collection_settings.cluster] + # Cluster level scrape endpoint(s). These metrics will be scraped from agent's Replicaset (singleton) + # Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to. + + #Interval specifying how often to scrape for metrics. This is a duration of time and can be specified for supporting settings by combining an integer value and time unit as a string value. Valid time units are ns, us (or µs), ms, s, m, h. + interval = "1m" + + ## Uncomment the following settings with valid string arrays for prometheus scraping + #fieldpass = ["metric_to_pass1", "metric_to_pass12"] + + #fielddrop = ["metric_to_drop"] + + # An array of urls to scrape metrics from. + # urls = ["http://myurl:9101/metrics"] + + # An array of Kubernetes services to scrape metrics from. + # kubernetes_services = ["http://my-service-dns.my-namespace:9102/metrics"] + + # When monitor_kubernetes_pods = true, replicaset will scrape Kubernetes pods for the following prometheus annotations: + # - prometheus.io/scrape: Enable scraping for this pod + # - prometheus.io/scheme: Default is http + # - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. + # - prometheus.io/port: If port is not 9102 use this annotation + monitor_kubernetes_pods = false + + ## Restricts Kubernetes monitoring to namespaces for pods that have annotations set and are scraped using the monitor_kubernetes_pods setting. 
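+ ## For example (illustrative values, following the annotation list above), a pod opts in to scraping by carrying in its own metadata: prometheus.io/scrape: "true", prometheus.io/port: "9102", prometheus.io/path: "/metrics" 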
+ ## This will take effect when monitor_kubernetes_pods is set to true + ## ex: monitor_kubernetes_pods_namespaces = ["default1", "default2", "default3"] + # monitor_kubernetes_pods_namespaces = ["default1"] + + ## Label selector to target pods which have the specified label + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + # kubernetes_label_selector = "env=dev,app=nginx" + + ## Field selector to target pods which have the specified field + ## This will take effect when monitor_kubernetes_pods is set to true + ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ + ## eg. To scrape pods on a specific node + # kubernetes_field_selector = "spec.nodeName=$HOSTNAME" + + [prometheus_data_collection_settings.node] + # Node level scrape endpoint(s). These metrics will be scraped from agent's DaemonSet running in every node in the cluster + # Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to. + + #Interval specifying how often to scrape for metrics. This is a duration of time and can be specified for supporting settings by combining an integer value and time unit as a string value. Valid time units are ns, us (or µs), ms, s, m, h. + interval = "1m" + + ## Uncomment the following settings with valid string arrays for prometheus scraping + + # An array of urls to scrape metrics from. $NODE_IP (all upper case) will be substituted with the running Node's IP address + # urls = ["http://$NODE_IP:9103/metrics"] + + #fieldpass = ["metric_to_pass1", "metric_to_pass12"] + + #fielddrop = ["metric_to_drop"] + + metric_collection_settings: |- + # Metrics collection settings for metrics sent to Log Analytics and MDM + [metric_collection_settings.collect_kube_system_pv_metrics] + # In the absence of this configmap, default value for collect_kube_system_pv_metrics is false + # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected + enabled = false + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + + alertable-metrics-configuration-settings: |- + # Alertable metrics configuration settings for container resource utilization + [alertable_metrics_configuration_settings.container_resource_utilization_thresholds] + # The threshold (Type Float) will be rounded off to 2 decimal points + # Threshold for container cpu, metric will be sent only when cpu utilization exceeds or becomes equal to the following percentage + container_cpu_threshold_percentage = 95.0 + # Threshold for container memoryRss, metric will be sent only when memory rss exceeds or becomes equal to the following percentage + container_memory_rss_threshold_percentage = 95.0 + # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage + container_memory_working_set_threshold_percentage = 95.0 + + # Alertable metrics configuration settings for persistent volume utilization + [alertable_metrics_configuration_settings.pv_utilization_thresholds] + # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage + pv_usage_threshold_percentage = 60.0 + + # Alertable metrics configuration settings for completed jobs count + [alertable_metrics_configuration_settings.job_completion_threshold] + # Threshold for completed job count, metric will be sent only for those jobs which were completed earlier than the following threshold + job_completion_threshold_time_minutes = 360 + integrations: |- + [integrations.azure_network_policy_manager] + collect_basic_metrics = false + collect_advanced_metrics = false + [integrations.azure_subnet_ip_usage] + enabled = true + +# Doc - https://github.com/microsoft/Docker-Provider/blob/ci_prod/Documentation/AgentSettings/ReadMe.md + agent-settings: |- + # prometheus scrape fluent bit settings for high scale + # buffer size should be greater than or equal to chunk size else we set it to chunk size. + # settings scoped to prometheus sidecar container. all values in mb + [agent_settings.prometheus_fbit_settings] + tcp_listener_chunk_size = 10 + tcp_listener_buffer_size = 10 + tcp_listener_mem_buf_limit = 200 + + # prometheus scrape fluent bit settings for high scale + # buffer size should be greater than or equal to chunk size else we set it to chunk size. + # settings scoped to daemonset container. all values in mb + # [agent_settings.node_prometheus_fbit_settings] + # tcp_listener_chunk_size = 1 + # tcp_listener_buffer_size = 1 + # tcp_listener_mem_buf_limit = 10 + + # prometheus scrape fluent bit settings for high scale + # buffer size should be greater than or equal to chunk size else we set it to chunk size. + # settings scoped to replicaset container. all values in mb + # [agent_settings.cluster_prometheus_fbit_settings] + # tcp_listener_chunk_size = 1 + # tcp_listener_buffer_size = 1 + # tcp_listener_mem_buf_limit = 10 + + # The following settings are "undocumented", we don't recommend uncommenting them unless directed by Microsoft. + # They increase the maximum stdout/stderr log collection rate but will also cause higher cpu/memory usage. + ## Ref for more details about Ignore_Older - https://docs.fluentbit.io/manual/v/1.7/pipeline/inputs/tail + # [agent_settings.fbit_config] + # log_flush_interval_secs = "1" # default value is 15 + # tail_mem_buf_limit_megabytes = "10" # default value is 10 + # tail_buf_chunksize_megabytes = "1" # default value is 32kb (comment out this line for default) + # tail_buf_maxsize_megabytes = "1" # default value is 32kb (comment out this line for default) + # tail_ignore_older = "5m" # default value same as fluent-bit default i.e. 0m + + # On both AKS & Arc K8s environments, if the Cluster is configured with a Forward Proxy, then the Proxy settings are automatically applied and used by the agent + # In certain configurations, the proxy config should be ignored, for example a Cluster with AMPLS + Proxy + # in such scenarios, use the following config to ignore proxy settings + # [agent_settings.proxy_config] + # ignore_proxy_settings = "true" # if this is not applied, default value is false + + # The following settings are "undocumented", we don't recommend uncommenting them unless directed by Microsoft. + # Configuration settings for the waittime for the network listeners to be available + # [agent_settings.network_listener_waittime] + # tcp_port_25226 = 45 # Port 25226 is used for telegraf to fluent-bit data in ReplicaSet + # tcp_port_25228 = 60 # Port 25228 is used for telegraf to fluentd data + # tcp_port_25229 = 45 # Port 25229 is used for telegraf to fluent-bit data in DaemonSet + + # The following settings are "undocumented", we don't recommend uncommenting them unless directed by Microsoft. 
+ # [agent_settings.mdsd_config] + # monitoring_max_event_rate = "50000" # default 20K eps + # backpressure_memory_threshold_in_mb = "1500" # default 3500MB + # upload_max_size_in_mb = "20" # default 2MB + # upload_frequency_seconds = "1" # default 60 upload_frequency_seconds + # compression_level = "0" # supported levels 0 to 9 and 0 means no compression \ No newline at end of file diff --git a/88_prometheus_grafana_ampls/dce-log_analytics.tf b/88_prometheus_grafana_ampls/dce-log_analytics.tf new file mode 100644 index 0000000..1dc6e2b --- /dev/null +++ b/88_prometheus_grafana_ampls/dce-log_analytics.tf @@ -0,0 +1,14 @@ +resource "azurerm_monitor_data_collection_endpoint" "dce-log-analytics" { + name = "dce-log-analytics" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + public_network_access_enabled = false +} + +# required +# associate to a Data Collection Endpoint +# resource "azurerm_monitor_data_collection_rule_association" "dcra-dce-log-analytics-aks" { +# name = "configurationAccessEndpoint" # name is required when data_collection_rule_id is specified. And when data_collection_endpoint_id is specified, the name is populated with configurationAccessEndpoint +# target_resource_id = azurerm_kubernetes_cluster.aks.id +# data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-log-analytics.id +# } \ No newline at end of file diff --git a/88_prometheus_grafana_ampls/dce-prometheus.tf b/88_prometheus_grafana_ampls/dce-prometheus.tf new file mode 100644 index 0000000..15516e8 --- /dev/null +++ b/88_prometheus_grafana_ampls/dce-prometheus.tf @@ -0,0 +1,14 @@ +resource "azurerm_monitor_data_collection_endpoint" "dce-prometheus" { + name = "dce-prometheus" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + kind = "Linux" + public_network_access_enabled = false # true # false +} + +# associate to a Data Collection Endpoint +resource "azurerm_monitor_data_collection_rule_association" "dcra-dce-prometheus-aks" { +# name = "configurationAccessEndpoint" # "dcra-dce-prometheus-aks" # # name is required when data_collection_rule_id is specified. 
diff --git a/88_prometheus_grafana_ampls/dce-prometheus.tf b/88_prometheus_grafana_ampls/dce-prometheus.tf new file mode 100644 index 0000000..15516e8 --- /dev/null +++ b/88_prometheus_grafana_ampls/dce-prometheus.tf @@ -0,0 +1,14 @@ +resource "azurerm_monitor_data_collection_endpoint" "dce-prometheus" { + name = "dce-prometheus" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + kind = "Linux" + public_network_access_enabled = false # true # false +} + +# associate to a Data Collection Endpoint +resource "azurerm_monitor_data_collection_rule_association" "dcra-dce-prometheus-aks" { +# name = "configurationAccessEndpoint" # "dcra-dce-prometheus-aks" # # name is required only when data_collection_rule_id is specified; when data_collection_endpoint_id is specified, the name is auto-populated with configurationAccessEndpoint + target_resource_id = azurerm_kubernetes_cluster.aks.id + data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id +} diff --git a/88_prometheus_grafana_ampls/dcr-log_analytics.tf b/88_prometheus_grafana_ampls/dcr-log_analytics.tf new file mode 100644 index 0000000..9127c2a --- /dev/null +++ b/88_prometheus_grafana_ampls/dcr-log_analytics.tf @@ -0,0 +1,59 @@ +resource "azurerm_monitor_data_collection_rule" "dcr-log-analytics" { + name = "dcr-log-analytics" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-log-analytics.id + + destinations { + log_analytics { + name = "log_analytics" + workspace_resource_id = azurerm_log_analytics_workspace.workspace.id + } + } + + data_flow { + streams = ["Microsoft-ContainerInsights-Group-Default"] + destinations = ["log_analytics"] + } + + data_sources { + syslog { + name = "example-syslog" + # streams = ["Microsoft-Syslog"] + facility_names = [ + "*" + ] + log_levels = [ + "Debug", + "Info", + "Notice", + "Warning", + "Error", + "Critical", + "Alert", + "Emergency", + ] + } + extension { + extension_name = "ContainerInsights" + name = "ContainerInsightsExtension" + streams = ["Microsoft-ContainerInsights-Group-Default"] + extension_json = jsonencode( + { + dataCollectionSettings = { + enableContainerLogV2 = true + interval = "1m" + namespaceFilteringMode = "Off" + } + } + ) + } + } +} + +# associate to a Data Collection Rule +resource "azurerm_monitor_data_collection_rule_association" "dcra-dcr-log-analytics-aks" { + name = "dcra-dcr-log-analytics-aks" + target_resource_id = azurerm_kubernetes_cluster.aks.id + data_collection_rule_id = azurerm_monitor_data_collection_rule.dcr-log-analytics.id +} diff --git a/88_prometheus_grafana_ampls/dcr-prometheus.tf b/88_prometheus_grafana_ampls/dcr-prometheus.tf new file mode 100644 index 0000000..a107ea0 --- /dev/null +++ b/88_prometheus_grafana_ampls/dcr-prometheus.tf @@ -0,0 +1,35 @@ +resource "azurerm_monitor_data_collection_rule" "dcr-prometheus" { + name = "dcr-prometheus" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + data_collection_endpoint_id = azurerm_monitor_data_collection_endpoint.dce-prometheus.id + kind = "Linux" + description = "DCR for Azure Monitor Metrics Profile (Managed Prometheus)" + + data_sources { + prometheus_forwarder { + name = "PrometheusDataSource" + streams = ["Microsoft-PrometheusMetrics"] + } + } + + destinations { + monitor_account { + monitor_account_id = azurerm_monitor_workspace.prometheus.id + name = azurerm_monitor_workspace.prometheus.name + } + } + + data_flow { + streams = ["Microsoft-PrometheusMetrics"] + destinations = [azurerm_monitor_workspace.prometheus.name] + } +} + +# associate to a Data Collection Rule +resource "azurerm_monitor_data_collection_rule_association" "dcra-dcr-prometheus-aks" { + name = "dcra-dcr-prometheus-aks" + target_resource_id = azurerm_kubernetes_cluster.aks.id + data_collection_rule_id = azurerm_monitor_data_collection_rule.dcr-prometheus.id + description = "Association of DCR. Deleting this association will break the data collection for this AKS Cluster." +}
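Both rule associations target azurerm_kubernetes_cluster.aks, which is defined elsewhere in this example. For metrics and logs to actually flow, the cluster must also enable the managed Prometheus and Container insights add-ons; a hedged sketch of the relevant cluster wiring (names, sizes, and the dns_prefix below are illustrative, not taken from this diff):

```hcl
# monitor_metrics deploys the ama-metrics pods (managed Prometheus scraping);
# oms_agent deploys the Container insights agent against the Log Analytics workspace.
resource "azurerm_kubernetes_cluster" "aks" {
  name                = "aks-cluster"
  resource_group_name = azurerm_resource_group.rg.name
  location            = azurerm_resource_group.rg.location
  dns_prefix          = "aks"

  default_node_pool {
    name           = "default"
    node_count     = 2
    vm_size        = "Standard_B2als_v2"
    vnet_subnet_id = azurerm_subnet.snet-aks.id
  }

  identity {
    type = "SystemAssigned"
  }

  monitor_metrics {}

  oms_agent {
    log_analytics_workspace_id      = azurerm_log_analytics_workspace.workspace.id
    msi_auth_for_monitoring_enabled = true
  }
}
```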
diff --git a/85_prometheus_grafana_private_endpoint/deploy-svc-ingress.yaml b/88_prometheus_grafana_ampls/deploy-svc-ingress.yaml similarity index 100% rename from 85_prometheus_grafana_private_endpoint/deploy-svc-ingress.yaml rename to 88_prometheus_grafana_ampls/deploy-svc-ingress.yaml diff --git a/85_prometheus_grafana_private_endpoint/diagnostic_setting.tf b/88_prometheus_grafana_ampls/diagnostic_setting.tf similarity index 94% rename from 85_prometheus_grafana_private_endpoint/diagnostic_setting.tf rename to 88_prometheus_grafana_ampls/diagnostic_setting.tf index 5507012..7227ecd 100644 --- a/85_prometheus_grafana_private_endpoint/diagnostic_setting.tf +++ b/88_prometheus_grafana_ampls/diagnostic_setting.tf @@ -24,7 +24,7 @@ resource "azurerm_monitor_diagnostic_setting" "rule" { name = "diagnostic-setting" target_resource_id = each.value.id log_analytics_workspace_id = azurerm_log_analytics_workspace.workspace.id - log_analytics_destination_type = "AzureDiagnostics" + log_analytics_destination_type = "Dedicated" # "AzureDiagnostics" dynamic "enabled_log" { iterator = entry diff --git a/88_prometheus_grafana_ampls/grafana.tf b/88_prometheus_grafana_ampls/grafana.tf new file mode 100644 index 0000000..ecfac5d --- /dev/null +++ b/88_prometheus_grafana_ampls/grafana.tf @@ -0,0 +1,48 @@ +resource "azurerm_dashboard_grafana" "grafana" { + name = "azure-grafana-${var.prefix}" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + api_key_enabled = true + deterministic_outbound_ip_enabled = true + sku = "Standard" + zone_redundancy_enabled = false + grafana_major_version = "10" # 9 + public_network_access_enabled = true + + azure_monitor_workspace_integrations { + resource_id = azurerm_monitor_workspace.prometheus.id + } + + identity { + type = "SystemAssigned" # "UserAssigned" # + # identity_ids = [azurerm_user_assigned_identity.identity-grafana.id] + } +} + +data "azurerm_client_config" "current" {} + +resource "azurerm_role_assignment" "role_grafana_admin" { + scope = azurerm_dashboard_grafana.grafana.id + role_definition_name = "Grafana Admin" + principal_id = data.azurerm_client_config.current.object_id +} + +resource "azurerm_role_assignment" "role_monitoring_data_reader" { + scope = azurerm_monitor_workspace.prometheus.id + role_definition_name = "Monitoring Data Reader" + principal_id = azurerm_dashboard_grafana.grafana.identity.0.principal_id # azurerm_user_assigned_identity.identity-grafana.principal_id # +} + +data "azurerm_subscription" "current" {} + +resource "azurerm_role_assignment" "role_monitoring_reader" { + scope = data.azurerm_subscription.current.id + role_definition_name = "Monitoring Reader" + principal_id = azurerm_dashboard_grafana.grafana.identity.0.principal_id # azurerm_user_assigned_identity.identity-grafana.principal_id # +} + +# resource "azurerm_user_assigned_identity" "identity-grafana" { +# name = "identity-grafana" +# resource_group_name = azurerm_resource_group.rg.name +# location = azurerm_resource_group.rg.location +# } diff --git a/88_prometheus_grafana_ampls/images/85_prometheus_grafana__architecture.png b/88_prometheus_grafana_ampls/images/85_prometheus_grafana__architecture.png new file mode 100644 index 0000000..4b45bc4 Binary files /dev/null and b/88_prometheus_grafana_ampls/images/85_prometheus_grafana__architecture.png differ diff --git a/85_prometheus_grafana_private_endpoint/import_grafafana_dashboard.tf b/88_prometheus_grafana_ampls/import_grafafana_dashboard.tf similarity index 100% rename from 85_prometheus_grafana_private_endpoint/import_grafafana_dashboard.tf rename to 88_prometheus_grafana_ampls/import_grafafana_dashboard.tf
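Grafana itself stays on a public endpoint here (public_network_access_enabled = true). To take it private as well, it would need its own private endpoint; a hedged sketch under that assumption (the "grafana" subresource and the privatelink.grafana.azure.com DNS zone are the Private Link targets for Azure Managed Grafana; resource names below are illustrative):

```hcl
# Illustrative private endpoint for Managed Grafana; the DNS zone wiring
# (privatelink.grafana.azure.com) would follow the same pattern as pe-ampls.tf.
resource "azurerm_private_endpoint" "pe-grafana" {
  name                = "pe-grafana"
  resource_group_name = azurerm_resource_group.rg.name
  location            = azurerm_resource_group.rg.location
  subnet_id           = azurerm_subnet.snet-pe.id

  private_service_connection {
    name                           = "connection-grafana"
    is_manual_connection           = false
    subresource_names              = ["grafana"]
    private_connection_resource_id = azurerm_dashboard_grafana.grafana.id
  }
}
```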
diff --git a/88_prometheus_grafana_ampls/log_analytics.tf b/88_prometheus_grafana_ampls/log_analytics.tf new file mode 100644 index 0000000..86ff15b --- /dev/null +++ b/88_prometheus_grafana_ampls/log_analytics.tf @@ -0,0 +1,22 @@ +resource "azurerm_log_analytics_workspace" "workspace" { + name = "log-analytics-workspace" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + sku = "PerGB2018" # PerGB2018, Free, PerNode, Premium, Standard, Standalone, Unlimited, CapacityReservation + retention_in_days = 30 # possible values are either 7 (Free Tier only) or range between 30 and 730 + internet_ingestion_enabled = false + internet_query_enabled = false +} + +# resource "azurerm_log_analytics_solution" "solution" { +# solution_name = "ContainerInsights" +# location = azurerm_log_analytics_workspace.workspace.location +# resource_group_name = azurerm_log_analytics_workspace.workspace.resource_group_name +# workspace_resource_id = azurerm_log_analytics_workspace.workspace.id +# workspace_name = azurerm_log_analytics_workspace.workspace.name + +# plan { +# publisher = "Microsoft" +# product = "OMSGallery/ContainerInsights" +# } +# } diff --git a/88_prometheus_grafana_ampls/logger-pod.yaml b/88_prometheus_grafana_ampls/logger-pod.yaml new file mode 100644 index 0000000..4344a06 --- /dev/null +++ b/88_prometheus_grafana_ampls/logger-pod.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Pod +metadata: + name: logger +spec: + containers: + - name: count + image: busybox + args: [/bin/sh, -c, 'i=0; while true; do echo "This is demo log $i: $(date)"; i=$((i+1)); sleep 10; done'] \ No newline at end of file diff --git a/85_prometheus_grafana_private_endpoint/nginx.tf b/88_prometheus_grafana_ampls/nginx.tf similarity index 100% rename from 85_prometheus_grafana_private_endpoint/nginx.tf rename to 88_prometheus_grafana_ampls/nginx.tf diff --git a/85_prometheus_grafana_private_endpoint/output.tf b/88_prometheus_grafana_ampls/output.tf similarity index 100% rename from 85_prometheus_grafana_private_endpoint/output.tf rename to 88_prometheus_grafana_ampls/output.tf diff --git a/88_prometheus_grafana_ampls/pe-ampls.tf b/88_prometheus_grafana_ampls/pe-ampls.tf new file mode 100644 index 0000000..c260f89 --- /dev/null +++ b/88_prometheus_grafana_ampls/pe-ampls.tf @@ -0,0 +1,42 @@ +locals { + dns_zones_ampls = toset([ + "privatelink.monitor.azure.com", + "privatelink.oms.opinsights.azure.com", + "privatelink.ods.opinsights.azure.com", + "privatelink.agentsvc.azure-automation.net", + "privatelink.blob.core.windows.net", + ]) +} + +resource "azurerm_private_endpoint" "pe-ampls" { + name = "pe-ampls" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + subnet_id = azurerm_subnet.snet-pe.id + + private_service_connection { + name = "connection" + is_manual_connection = false + subresource_names = ["azuremonitor"] + private_connection_resource_id = azurerm_monitor_private_link_scope.ampls.id + } + + private_dns_zone_group { + name = "private-dns-zone" + private_dns_zone_ids = [for zone in azurerm_private_dns_zone.zones : zone.id] + } +} + +resource "azurerm_private_dns_zone" "zones" { + for_each = local.dns_zones_ampls + name = each.value + resource_group_name = azurerm_resource_group.rg.name +} + +resource "azurerm_private_dns_zone_virtual_network_link" "link" { + for_each =
azurerm_private_dns_zone.zones + name = "vnet-link-${each.key}" + private_dns_zone_name = each.value.name + resource_group_name = each.value.resource_group_name + virtual_network_id = azurerm_virtual_network.vnet.id +} diff --git a/88_prometheus_grafana_ampls/prometheus.tf b/88_prometheus_grafana_ampls/prometheus.tf new file mode 100644 index 0000000..1804c89 --- /dev/null +++ b/88_prometheus_grafana_ampls/prometheus.tf @@ -0,0 +1,224 @@ +resource "azurerm_monitor_workspace" "prometheus" { + name = "azure-prometheus" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + public_network_access_enabled = true # false # true +} + +resource "azurerm_role_assignment" "role_monitoring_data_reader_me" { + scope = azurerm_monitor_workspace.prometheus.id + role_definition_name = "Monitoring Data Reader" + principal_id = data.azurerm_client_config.current.object_id +} + +resource "azurerm_monitor_alert_prometheus_rule_group" "alert-prometheus-nodes" { + name = "NodeRecordingRulesRuleGroup" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + cluster_name = azurerm_kubernetes_cluster.aks.name + rule_group_enabled = true + interval = "PT1M" + scopes = [azurerm_monitor_workspace.prometheus.id] + + rule { + record = "instance:node_num_cpu:sum" + expression = "count without (cpu, mode) (node_cpu_seconds_total{job=\"node\",mode=\"idle\"})" + enabled = true + } + + rule { + record = "instance:node_cpu_utilisation:rate5m" + expression = "1 - avg without (cpu) (sum without (mode) (rate(node_cpu_seconds_total{job=\"node\", mode=~\"idle|iowait|steal\"}[5m])))" + enabled = true + } + + rule { + record = "instance:node_load1_per_cpu:ratio" + expression = "(node_load1{job=\"node\"}/ instance:node_num_cpu:sum{job=\"node\"})" + enabled = true + } + + rule { + record = "instance:node_memory_utilisation:ratio" + expression = "1 - ((node_memory_MemAvailable_bytes{job=\"node\"} or (node_memory_Buffers_bytes{job=\"node\"} + node_memory_Cached_bytes{job=\"node\"} + node_memory_MemFree_bytes{job=\"node\"} + node_memory_Slab_bytes{job=\"node\"})) / node_memory_MemTotal_bytes{job=\"node\"})" + enabled = true + } + + rule { + record = "instance:node_vmstat_pgmajfault:rate5m" + expression = "rate(node_vmstat_pgmajfault{job=\"node\"}[5m])" + enabled = true + } + + rule { + record = "instance_device:node_disk_io_time_seconds:rate5m" + expression = "rate(node_disk_io_time_seconds_total{job=\"node\", device!=\"\"}[5m])" + enabled = true + } + + rule { + record = "instance_device:node_disk_io_time_weighted_seconds:rate5m" + expression = "rate(node_disk_io_time_weighted_seconds_total{job=\"node\", device!=\"\"}[5m])" + enabled = true + } + + rule { + record = "instance:node_network_receive_bytes_excluding_lo:rate5m" + expression = "sum without (device) (rate(node_network_receive_bytes_total{job=\"node\", device!=\"lo\"}[5m]))" + enabled = true + } + + rule { + record = "instance:node_network_transmit_bytes_excluding_lo:rate5m" + expression = "sum without (device) (rate(node_network_transmit_bytes_total{job=\"node\", device!=\"lo\"}[5m]))" + enabled = true + } + + rule { + record = "instance:node_network_receive_drop_excluding_lo:rate5m" + expression = "sum without (device) (rate(node_network_receive_drop_total{job=\"node\", device!=\"lo\"}[5m]))" + enabled = true + } + + rule { + record = "instance:node_network_transmit_drop_excluding_lo:rate5m" + expression = "sum without (device) 
(rate(node_network_transmit_drop_total{job=\"node\", device!=\"lo\"}[5m]))" + enabled = true + } +} + +resource "azurerm_monitor_alert_prometheus_rule_group" "alert-prometheus-k8s" { + name = "KubernetesRecordingRulesRuleGroup" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + cluster_name = azurerm_kubernetes_cluster.aks.name + rule_group_enabled = true + interval = "PT1M" + scopes = [azurerm_monitor_workspace.prometheus.id] + + rule { + record = "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate" + expression = "sum by (cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\"}[5m])) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))" + enabled = true + } + + rule { + record = "node_namespace_pod_container:container_memory_working_set_bytes" + expression = "container_memory_working_set_bytes{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" + enabled = true + } + + rule { + record = "node_namespace_pod_container:container_memory_rss" + expression = "container_memory_rss{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" + enabled = true + } + + rule { + record = "node_namespace_pod_container:container_memory_cache" + expression = "container_memory_cache{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" + enabled = true + } + + rule { + record = "node_namespace_pod_container:container_memory_swap" + expression = "container_memory_swap{job=\"cadvisor\", image!=\"\"}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))" + enabled = true + } + + rule { + record = "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests" + expression = "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on(namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" + enabled = true + } + + rule { + record = "namespace_memory:kube_pod_container_resource_requests:sum" + expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by (namespace, pod, container, cluster) (kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" + enabled = true + } + + rule { + record = "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests" + expression = "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" + enabled = true + } + + rule { + record = "namespace_cpu:kube_pod_container_resource_requests:sum" + expression = "sum by (namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) 
group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" + enabled = true + } + + rule { + record = "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits" + expression = "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ((kube_pod_status_phase{phase=~\"Pending|Running\"} == 1))" + enabled = true + } + + rule { + record = "namespace_memory:kube_pod_container_resource_limits:sum" + expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by (namespace, pod, container, cluster) (kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" + enabled = true + } + + rule { + record = "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits" + expression = "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1) )" + enabled = true + } + + rule { + record = "namespace_cpu:kube_pod_container_resource_limits:sum" + expression = "sum by (namespace, cluster) (sum by (namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"}) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))" + enabled = true + } + + rule { + record = "namespace_workload_pod:kube_pod_owner:relabel" + expression = "max by (cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"}, \"replicaset\", \"$1\", \"owner_name\", \"(.*)\") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by (replicaset, namespace, owner_name) (kube_replicaset_owner{job=\"kube-state-metrics\"})), \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" + labels = { + "workload_type" = "deployment" + } + enabled = true + } + + rule { + record = "namespace_workload_pod:kube_pod_owner:relabel" + expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" + labels = { + "workload_type" = "daemonset" + } + enabled = true + } + + rule { + record = "namespace_workload_pod:kube_pod_owner:relabel" + expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" + labels = { + "workload_type" = "statefulset" + } + enabled = true + } + + rule { + record = "namespace_workload_pod:kube_pod_owner:relabel" + expression = "max by (cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"Job\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))" + labels = { + "workload_type" = "job" + } + enabled = true + } + + rule { + record = ":node_memory_MemAvailable_bytes:sum" + expression = "sum(node_memory_MemAvailable_bytes{job=\"node\"} or (node_memory_Buffers_bytes{job=\"node\"} + node_memory_Cached_bytes{job=\"node\"} + node_memory_MemFree_bytes{job=\"node\"} + 
node_memory_Slab_bytes{job=\"node\"})) by (cluster)" + enabled = true + } + + rule { + record = "cluster:node_cpu:ratio_rate5m" + expression = "sum(rate(node_cpu_seconds_total{job=\"node\",mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) by (cluster) /count(sum(node_cpu_seconds_total{job=\"node\"}) by (cluster, instance, cpu)) by (cluster)" + enabled = true + } +} diff --git a/88_prometheus_grafana_ampls/providers.tf b/88_prometheus_grafana_ampls/providers.tf new file mode 100644 index 0000000..df6204a --- /dev/null +++ b/88_prometheus_grafana_ampls/providers.tf @@ -0,0 +1,25 @@ +terraform { + + required_version = ">= 1.2.8" + + required_providers { + + azurerm = { + source = "hashicorp/azurerm" + version = "= 3.94.0" + } + + azuread = { + source = "hashicorp/azuread" + version = "= 2.47.0" + } + } +} + +provider "azurerm" { + features {} +} + +# Configure the Azure Active Directory Provider +provider "azuread" { # default takes current user/identity tenant +} diff --git a/88_prometheus_grafana_ampls/rg.tf b/88_prometheus_grafana_ampls/rg.tf new file mode 100644 index 0000000..a5465f3 --- /dev/null +++ b/88_prometheus_grafana_ampls/rg.tf @@ -0,0 +1,10 @@ +resource "azurerm_resource_group" "rg" { + name = "rg-aks-monitoring-${var.prefix}" + location = "swedencentral" +} + +resource "azurerm_resource_group" "rg-jumpbox" { + name = "rg-jumpbox-${var.prefix}" + location = "swedencentral" +} + diff --git a/88_prometheus_grafana_ampls/variables.tf b/88_prometheus_grafana_ampls/variables.tf new file mode 100644 index 0000000..7358604 --- /dev/null +++ b/88_prometheus_grafana_ampls/variables.tf @@ -0,0 +1,3 @@ +variable "prefix" { + default = 900 +} \ No newline at end of file diff --git a/88_prometheus_grafana_ampls/vnet.tf b/88_prometheus_grafana_ampls/vnet.tf new file mode 100644 index 0000000..09af5b5 --- /dev/null +++ b/88_prometheus_grafana_ampls/vnet.tf @@ -0,0 +1,29 @@ +resource "azurerm_virtual_network" "vnet" { + name = "vnet-aks" + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + address_space = ["10.10.0.0/16"] +} + +resource "azurerm_subnet" "snet-aks" { + name = "snet-aks" + virtual_network_name = azurerm_virtual_network.vnet.name + resource_group_name = azurerm_virtual_network.vnet.resource_group_name + address_prefixes = ["10.10.0.0/24"] +} + +resource "azurerm_subnet" "snet-pe" { + name = "snet-pe" + virtual_network_name = azurerm_virtual_network.vnet.name + resource_group_name = azurerm_virtual_network.vnet.resource_group_name + address_prefixes = ["10.10.1.0/24"] + + private_link_service_network_policies_enabled = false +} + +resource "azurerm_subnet" "snet-bastion" { + name = "AzureBastionSubnet" + virtual_network_name = azurerm_virtual_network.vnet.name + resource_group_name = azurerm_virtual_network.vnet.resource_group_name + address_prefixes = ["10.10.2.0/24"] +} diff --git a/85_prometheus_grafana_private_endpoint/windows-vm.tf b/88_prometheus_grafana_ampls/windows-vm.tf similarity index 58% rename from 85_prometheus_grafana_private_endpoint/windows-vm.tf rename to 88_prometheus_grafana_ampls/windows-vm.tf index 968356f..99dd284 100644 --- a/85_prometheus_grafana_private_endpoint/windows-vm.tf +++ b/88_prometheus_grafana_ampls/windows-vm.tf @@ -1,7 +1,7 @@ resource "azurerm_network_interface" "nic-vm" { - name = "nic-vm-windows" - resource_group_name = azurerm_resource_group.rg_aks_cluster.name - location = azurerm_resource_group.rg_aks_cluster.location + name = "nic-vm-windows" + resource_group_name = 
azurerm_resource_group.rg-jumpbox.name + location = azurerm_resource_group.rg-jumpbox.location ip_configuration { name = "internal" @@ -12,16 +12,16 @@ resource "azurerm_network_interface" "nic-vm" { resource "azurerm_windows_virtual_machine" "vm" { name = "vm-jumpbox-w11" - resource_group_name = azurerm_resource_group.rg_aks_cluster.name - location = azurerm_resource_group.rg_aks_cluster.location - size = "Standard_B2ats_v2" + resource_group_name = azurerm_resource_group.rg-jumpbox.name + location = azurerm_resource_group.rg-jumpbox.location + size = "Standard_B2als_v2" # "Standard_B2ats_v2" admin_username = "azureuser" admin_password = "@Aa123456789" network_interface_ids = [azurerm_network_interface.nic-vm.id] priority = "Spot" eviction_policy = "Deallocate" -# custom_data = filebase64("../scripts/install-tools-windows.ps1") + # custom_data = filebase64("../scripts/install-tools-windows.ps1") os_disk { name = "os-disk-vm" @@ -36,9 +36,9 @@ resource "azurerm_windows_virtual_machine" "vm" { version = "latest" } - boot_diagnostics { - storage_account_uri = null - } + # boot_diagnostics { + # storage_account_uri = null + # } } # resource "azurerm_virtual_machine_extension" "cloudinit" { @@ -53,3 +53,18 @@ resource "azurerm_windows_virtual_machine" "vm" { # } # SETTINGS # } + +data "azurerm_virtual_machine" "vm" { + name = azurerm_windows_virtual_machine.vm.name + resource_group_name = azurerm_windows_virtual_machine.vm.resource_group_name +} + +check "check_vm_state" { + assert { + condition = data.azurerm_virtual_machine.vm.power_state == "running" + error_message = format("Virtual Machine (%s) should be in a 'running' status, instead state is '%s'", + data.azurerm_virtual_machine.vm.id, + data.azurerm_virtual_machine.vm.power_state + ) + } +}
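After `terraform apply`, the `check` block above asserts that the jumpbox is running. A couple of hypothetical outputs (not part of this diff; the attribute names follow the azurerm provider schema) make it easy to grab the endpoints needed to smoke-test the setup from that jumpbox:

```hcl
# Grafana UI URL and the Azure Monitor workspace query endpoint; with AMPLS in
# place, the query endpoint should resolve to a private IP from inside the VNet.
output "grafana_endpoint" {
  value = azurerm_dashboard_grafana.grafana.endpoint
}

output "prometheus_query_endpoint" {
  value = azurerm_monitor_workspace.prometheus.query_endpoint
}
```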