
Commit 48dce5f
add k8s filter on kube_cluster_name (#44)
* Add kube_cluster_name to k8s monitors to identify which cluster a resource (node, pod, pv, ...) belongs to
Aohzan authored Mar 17, 2023
1 parent 1b033a5 commit 48dce5f
Showing 5 changed files with 18 additions and 18 deletions.
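
Every change below follows one pattern: kube_cluster_name is appended to each monitor's group-by, so an alert raised in an account watching several clusters names the cluster it came from. As an illustrative sketch, once Terraform interpolates its variables, the ingress 5xx query in the first file would render roughly as follows (the aggregator, timeframe, env:prod filter, request padding, and threshold are assumed example values, not values taken from this module):

    # hypothetical rendered form of the 5xx monitor query; env:prod stands in
    # for the filters produced by module.filter-tags-5xx / module.filter-tags
    query = <<EOQ
    avg(last_5m): default(
      sum:nginx_ingress.nginx_upstream_responses_total{env:prod} by {upstream,ingress_class,kube_cluster_name}.as_rate() /
      (sum:nginx_ingress.nginx_upstream_requests_total{env:prod} by {upstream,ingress_class,kube_cluster_name}.as_rate() + 5)
      * 100, 0) > 20
    EOQ

Without kube_cluster_name in the by clause, two clusters reporting the same upstream and ingress_class would be aggregated into a single series and alert as one.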
caas/kubernetes/ingress/vts/monitors-ingress.tf (8 changes: 4 additions & 4 deletions)
@@ -6,8 +6,8 @@ resource "datadog_monitor" "nginx_ingress_too_many_5xx" {
 
   query = <<EOQ
 ${var.ingress_5xx_time_aggregator}(${var.ingress_5xx_timeframe}): default(
-sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class}.as_rate() /
-(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count})
+sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class,kube_cluster_name}.as_rate() /
+(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class,kube_cluster_name}.as_rate() + ${var.artificial_requests_count})
 * 100, 0) > ${var.ingress_5xx_threshold_critical}
 EOQ
@@ -37,8 +37,8 @@ resource "datadog_monitor" "nginx_ingress_too_many_4xx" {
 
   query = <<EOQ
 ${var.ingress_4xx_time_aggregator}(${var.ingress_4xx_timeframe}): default(
-sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class}.as_rate() /
-(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count})
+sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class,kube_cluster_name}.as_rate() /
+(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class,kube_cluster_name}.as_rate() + ${var.artificial_requests_count})
 * 100, 0) > ${var.ingress_4xx_threshold_critical}
 EOQ
 
caas/kubernetes/node/monitors-k8s-node.tf (22 changes: 11 additions & 11 deletions)
@@ -5,7 +5,7 @@ resource "datadog_monitor" "disk_pressure" {
   type = "service check"
 
   query = <<EOQ
-"kubernetes_state.node.disk_pressure"${module.filter-tags.service_check}.by("node").last(6).count_by_status()
+"kubernetes_state.node.disk_pressure"${module.filter-tags.service_check}.by("node","kube_cluster_name").last(6).count_by_status()
 EOQ
 
   monitor_thresholds {
@@ -32,7 +32,7 @@ resource "datadog_monitor" "disk_out" {
   type = "service check"
 
   query = <<EOQ
-"kubernetes_state.node.out_of_disk"${module.filter-tags.service_check}.by("node").last(6).count_by_status()
+"kubernetes_state.node.out_of_disk"${module.filter-tags.service_check}.by("node","kube_cluster_name").last(6).count_by_status()
 EOQ
 
   monitor_thresholds {
@@ -59,7 +59,7 @@ resource "datadog_monitor" "memory_pressure" {
   type = "service check"
 
   query = <<EOQ
-"kubernetes_state.node.memory_pressure"${module.filter-tags.service_check}.by("node").last(6).count_by_status()
+"kubernetes_state.node.memory_pressure"${module.filter-tags.service_check}.by("node","kube_cluster_name").last(6).count_by_status()
 EOQ
 
   monitor_thresholds {
@@ -86,7 +86,7 @@ resource "datadog_monitor" "ready" {
   type = "service check"
 
   query = <<EOQ
-"kubernetes_state.node.ready"${module.filter-tags.service_check}.by("node").last(6).count_by_status()
+"kubernetes_state.node.ready"${module.filter-tags.service_check}.by("node","kube_cluster_name").last(6).count_by_status()
 EOQ
 
   monitor_thresholds {
@@ -113,7 +113,7 @@ resource "datadog_monitor" "kubelet_ping" {
   type = "service check"
 
   query = <<EOQ
-"kubernetes.kubelet.check.ping"${module.filter-tags.service_check}.by("name").last(6).count_by_status()
+"kubernetes.kubelet.check.ping"${module.filter-tags.service_check}.by("name","kube_cluster_name").last(6).count_by_status()
 EOQ
 
   monitor_thresholds {
@@ -141,7 +141,7 @@ resource "datadog_monitor" "kubelet_syncloop" {
   type = "service check"
 
   query = <<EOQ
-"kubernetes.kubelet.check.syncloop"${module.filter-tags.service_check}.by("name").last(6).count_by_status()
+"kubernetes.kubelet.check.syncloop"${module.filter-tags.service_check}.by("name","kube_cluster_name").last(6).count_by_status()
 EOQ
 
   monitor_thresholds {
@@ -187,7 +187,7 @@ resource "datadog_monitor" "node_unschedulable" {
 
   query = <<EOQ
 ${var.node_unschedulable_time_aggregator}(${var.node_unschedulable_timeframe}):
-sum:kubernetes_state.node.status${module.filter-tags-unschedulable.query_alert} by {node}
+sum:kubernetes_state.node.status${module.filter-tags-unschedulable.query_alert} by {node,kube_cluster_name}
 > 0
 EOQ
 
@@ -216,8 +216,8 @@ resource "datadog_monitor" "volume_space" {
 
   query = <<EOQ
 ${var.volume_space_time_aggregator}(${var.volume_space_timeframe}):
-avg:kubernetes.kubelet.volume.stats.used_bytes${module.filter-tags.query_alert} by {name,persistentvolumeclaim} /
-avg:kubernetes.kubelet.volume.stats.capacity_bytes${module.filter-tags.query_alert} by {name,persistentvolumeclaim}
+avg:kubernetes.kubelet.volume.stats.used_bytes${module.filter-tags.query_alert} by {name,persistentvolumeclaim,kube_cluster_name} /
+avg:kubernetes.kubelet.volume.stats.capacity_bytes${module.filter-tags.query_alert} by {name,persistentvolumeclaim,kube_cluster_name}
 * 100 > ${var.volume_space_threshold_critical}
 EOQ
 
@@ -247,8 +247,8 @@ resource "datadog_monitor" "volume_inodes" {
 
   query = <<EOQ
 ${var.volume_inodes_time_aggregator}(${var.volume_inodes_timeframe}):
-avg:kubernetes.kubelet.volume.stats.inodes_used${module.filter-tags.query_alert} by {name,persistentvolumeclaim} /
-avg:kubernetes.kubelet.volume.stats.inodes${module.filter-tags.query_alert} by {name,persistentvolumeclaim}
+avg:kubernetes.kubelet.volume.stats.inodes_used${module.filter-tags.query_alert} by {name,persistentvolumeclaim,kube_cluster_name} /
+avg:kubernetes.kubelet.volume.stats.inodes${module.filter-tags.query_alert} by {name,persistentvolumeclaim,kube_cluster_name}
 * 100 > ${var.volume_inodes_threshold_critical}
 EOQ
 
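The node service checks gain the cluster dimension through .by(): grouping on "node" (or "name" for the kubelet checks) together with "kube_cluster_name" keeps one check status per node per cluster instead of collapsing identically named nodes from different clusters. A sketch of one rendered check, assuming module.filter-tags resolves to a plain env:prod filter (an assumption, not this module's actual output):

    # hypothetical rendered service check; the env:prod filter is assumed
    query = <<EOQ
    "kubernetes_state.node.ready"{env:prod}.by("node","kube_cluster_name").last(6).count_by_status()
    EOQ
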
caas/kubernetes/workload/README.md (2 changes: 1 addition & 1 deletion)
@@ -88,7 +88,7 @@ Creates DataDog monitors with the following checks:
 | <a name="input_replica_current_threshold_critical"></a> [replica\_current\_threshold\_critical](#input\_replica\_current\_threshold\_critical) | Current replica critical threshold | `number` | `1` | no |
 | <a name="input_replica_current_time_aggregator"></a> [replica\_current\_time\_aggregator](#input\_replica\_current\_time\_aggregator) | Monitor aggregator for Current replica [available values: min, max or avg] | `string` | `"max"` | no |
 | <a name="input_replica_current_timeframe"></a> [replica\_current\_timeframe](#input\_replica\_current\_timeframe) | Monitor timeframe for Current replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_15m"` | no |
-| <a name="input_replica_group_by"></a> [replica\_group\_by](#input\_replica\_group\_by) | Select group by element on monitors | `list` | <pre>[<br> "namespace",<br> "replicaset"<br>]</pre> | no |
+| <a name="input_replica_group_by"></a> [replica\_group\_by](#input\_replica\_group\_by) | Select group by element on monitors | `list` | <pre>[<br> "namespace",<br> "replicaset",<br> "kube_cluster_name"<br>]</pre> | no |
 | <a name="input_replica_ready_enabled"></a> [replica\_ready\_enabled](#input\_replica\_ready\_enabled) | Flag to enable Ready replica monitor | `string` | `"true"` | no |
 | <a name="input_replica_ready_extra_tags"></a> [replica\_ready\_extra\_tags](#input\_replica\_ready\_extra\_tags) | Extra tags for Ready replica monitor | `list(string)` | `[]` | no |
 | <a name="input_replica_ready_message"></a> [replica\_ready\_message](#input\_replica\_ready\_message) | Custom message for Ready replica monitor | `string` | `""` | no |
caas/kubernetes/workload/inputs.tf (2 changes: 1 addition & 1 deletion)
@@ -214,7 +214,7 @@ variable "replica_current_threshold_critical" {
 }
 
 variable "replica_group_by" {
-  default     = ["namespace", "replicaset"]
+  default     = ["namespace", "replicaset", "kube_cluster_name"]
   description = "Select group by element on monitors"
 }
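
As a usage sketch, a consumer of the workload module can still override the new default, for example to keep the cluster dimension while dropping per-replicaset grouping (the module name, source path, and omitted inputs here are hypothetical):

    module "k8s_workload_monitors" {
      # hypothetical local path to the caas/kubernetes/workload module
      source = "./caas/kubernetes/workload"

      # other required inputs (messages, filter tags, ...) omitted for brevity
      replica_group_by = ["namespace", "kube_cluster_name"]
    }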

caas/kubernetes/workload/monitors-k8s-workload.tf (2 changes: 1 addition & 1 deletion)
@@ -5,7 +5,7 @@ resource "datadog_monitor" "job" {
   type = "service check"
 
   query = <<EOQ
-"kubernetes_state.job.complete"${module.filter-tags.service_check}.by("kube_job").last(6).count_by_status()
+"kubernetes_state.job.complete"${module.filter-tags.service_check}.by("kube_job", "kube_cluster_name").last(6).count_by_status()
 EOQ
 
   monitor_thresholds {
