diff --git a/component/rules.jsonnet b/component/rules.jsonnet index 88ef8bf7..432f00fe 100644 --- a/component/rules.jsonnet +++ b/component/rules.jsonnet @@ -110,7 +110,24 @@ local additionalRules = { severity: 'critical', }, annotations: { - message: '{{ $labels.node }}: Memory usage more than 97% (current value is: {{ $value | humanizePercentage }})%', + message: '{{ $labels.instance }}: Memory usage more than 97% (current value is: {{ $value | humanizePercentage }})%', + }, + }, + { + alert: 'NodeTcpMemoryUtilizationHigh', + expr: 'node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625', + 'for': '30m', + labels: { + severity: 'critical', + }, + annotations: { + message: 'TCP memory usage is high on {{ $labels.instance }}', + description: ||| + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. + |||, + runbook_url: 'https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html', + }, + }, ],
diff --git a/docs/modules/ROOT/pages/runbooks/tcp-memory-usage.adoc b/docs/modules/ROOT/pages/runbooks/tcp-memory-usage.adoc
new file mode 100644
index 00000000..fad49579
--- /dev/null
+++ b/docs/modules/ROOT/pages/runbooks/tcp-memory-usage.adoc
@@ -0,0 +1,112 @@
+= SYN_NodeTcpMemoryUtilizationHigh
+
+== icon:glasses[] Overview
+
+This alert indicates that the node for which it fires has unusually high TCP memory utilization.
+The alert is currently configured to fire when a node's TCP memory usage exceeds the kernel TCP memory "pressure" threshold, which is set to 6.25% of the node's total memory on RHEL8 and RHEL9.
+See this https://access.redhat.com/solutions/6964027[Red Hat solution] for further details.
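+
+To gauge how close a node currently is to this threshold, you can, for example, graph the ratio between the two sides of the alert expression; values above 1 mean the alert condition is met.
+
+[source]
+----
+node_sockstat_TCP_mem_bytes / on(instance) (node_memory_MemTotal_bytes * 0.0625)
+----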
+
+== icon:search[] Investigate
+
+* Investigate the historical TCP memory usage of nodes on the cluster.
+Use the following metric to do so.
++
+[source]
+----
+node_sockstat_TCP_mem_bytes
+----
+
+* Log in to the node and switch to the host namespace
++
+[source,bash]
+----
+oc debug node/<nodename> --as=cluster-admin -n syn-debug-nodes
+# Wait for pod to start
+chroot /host
+----
+
+* Check TCP memory usage directly on the node
++
+[source,shell]
+----
+# cat /proc/net/sockstat
+sockets: used 542
+TCP: inuse 155 orphan 0 tw 260 alloc 1545 mem 0
+UDP: inuse 7 mem 2
+UDPLITE: inuse 0
+RAW: inuse 2
+FRAG: inuse 0 memory 0
+----
++
+NOTE: This file shows memory usage (field `mem`) in 4 KiB pages.
+
+* Check TCP socket stats summary directly on the node
++
+[source,shell]
+----
+# ss -s
+Total: 537
+TCP: 1749 (estab 157, closed 1568, orphaned 0, timewait 231)
+
+Transport Total     IP        IPv6
+RAW       3         2         1
+UDP       11        7         4
+TCP       181       155       26
+INET      195       164       31
+FRAG      0         0         0
+----
+
+* You can try to identify pods with unusually high TCP memory usage by running the following bash snippet on the node.
++
+[source,bash]
+----
+# Iterate through all pods which are running (state SANDBOX_READY) on the node
+for p in $(crictl pods -o json | jq -r '.items[]|select(.state=="SANDBOX_READY").id'); do
+  # Extract the network namespace name (a UUID) from the pod metadata
+  netns=$(crictl inspectp $p | jq -r '.info.runtimeSpec.linux.namespaces[]|select(.type=="network").path|split("/")[-1]')
+  # Only compute and show socket memory usage for pods that don't use the host
+  # network namespace.
+  if [ "$netns" != "" ]; then
+    # Print the pod name
+    crictl inspectp $p | jq '.status.metadata.name'
+    # List active TCP sockets in the network namespace of the pod, and sum up
+    # the amount of TCP memory used by all the sockets. The awk expression
+    # excludes fields rb, tb and d, which indicate the maximum allocatable
+    # buffer sizes and the amount of dropped packets, from the output of ss -tm
+    ss -N $netns -tm | grep skmem | cut -d: -f2 | tr -d 'a-z()' | \
+      awk -F, 'BEGIN { count=0; sum=0 } { count+=1; sum+=$1+$3+$5+$6+$7+$8 } END { printf "%d sockets use %d bytes of TCP memory\n", count, sum }'
+  fi
+done
+----
++
+[NOTE]
+====
+This snippet computes the _current_ TCP memory usage based on the values reported by `ss -tm`.
+So far, we've not been able to conclusively determine that this will actually highlight the root cause for high TCP memory usage on a node.
+However, the snippet is still a useful starting point for digging.
+====
++
+TIP: If you find a better snippet to identify pods with high TCP memory usage, please update this runbook.
+
+* If you don't see any outliers in TCP memory usage, you can try to find processes which have a large discrepancy between open socket file descriptors and active sockets as reported by `ss`.
+You can extract a container's primary process ID with the following command.
++
+[source,bash]
+----
+crictl inspect <container-id> | jq '.info.pid'
+----
++
+To determine the number of socket FDs which are held by a process, you can use the following one-liner.
++
+[source,bash]
+----
+ls -l /proc/<pid>/fd | grep socket | wc -l <1>
+----
+<1> Substitute `<pid>` with the PID of the process you want to look at.
+
+== icon:wrench[] Tune
+
+If this alert isn't actionable, is noisy, or was raised too late, you might want to tune it.
+
+Currently, the alert can be tuned through component-openshift4-monitoring's `patchRules` mechanism.
+Most likely, you'll want to either tune the threshold or the duration for which the threshold must be exceeded for the alert to fire.
diff --git a/docs/modules/ROOT/partials/nav.adoc b/docs/modules/ROOT/partials/nav.adoc
index 7d0a0dc1..3e1ba9da 100644
--- a/docs/modules/ROOT/partials/nav.adoc
+++ b/docs/modules/ROOT/partials/nav.adoc
@@ -19,3 +19,4 @@
 * xref:runbooks/cpucapacity.adoc[CPU Capacity Alert]
 * xref:runbooks/unusedcapacity.adoc[Node Capacity Alert]
 * xref:runbooks/remotewrite.adoc[Prometheus RemoteWrite Alert]
+* xref:runbooks/tcp-memory-usage.adoc[NodeTcpMemoryUtilizationHigh Alert]
diff --git a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 7f949238..63be3e18 100644 --- a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1616,10 +1616,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets.
+ message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 7f949238..63be3e18 100644 --- a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1616,10 +1616,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. + message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 9edd5259..61120e67 100644 --- a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1591,10 +1591,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. 
+ message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index c2a5d682..18ea9a06 100644 --- a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -150,9 +150,9 @@ spec: annotations: description: | Networking control plane is degraded. Networking configuration updates applied to the cluster will not be - implemented while there is no OVN Kubernetes leader. Existing workloads should continue to have connectivity. + implemented while there is no OVN Kubernetes cluster manager leader. Existing workloads should continue to have connectivity. OVN-Kubernetes control plane is not functional. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoOvnMasterLeader.md + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoOvnClusterManagerLeader.md summary: There is no ovn-kubernetes cluster manager leader. syn_component: openshift4-monitoring expr: | @@ -168,8 +168,8 @@ spec: annotations: description: | Networking control plane is degraded. Networking configuration updates applied to the cluster will not be - implemented while there are no OVN Kubernetes pods. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoRunningOvnMaster.md + implemented while there are no OVN Kubernetes control plane pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoRunningOvnControlPlane.md summary: There is no running ovn-kubernetes control plane. syn_component: openshift4-monitoring expr: | @@ -180,44 +180,6 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NorthboundStale - annotations: - description: | - Networking control plane is degraded. Networking configuration updates applied to the cluster will not be - implemented. Existing workloads should continue to have connectivity. OVN-Kubernetes control plane and/or - OVN northbound database may not be functional. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NorthboundStaleAlert.md - summary: ovn-kubernetes has not written anything to the northbound database - for too long. - syn_component: openshift4-monitoring - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
- time() - max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) > 120 - for: 10m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-monitoring - - alert: SYN_SouthboundStale - annotations: - description: | - Networking control plane is degraded. Networking configuration updates may not be applied to the cluster or - taking a long time to apply. This usually means there is a large load on OVN component 'northd' or it is not - functioning. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/SouthboundStaleAlert.md - summary: ovn-northd has not successfully synced any changes to the southbound - DB for too long. - syn_component: openshift4-monitoring - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_controller_sb_e2e_timestamp[5m]) > 120 - for: 10m - labels: - severity: critical - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_V4SubnetAllocationThresholdExceeded annotations: description: More than 80% of IPv4 subnets are used. Insufficient IPv4 @@ -268,6 +230,26 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_NorthboundStale + annotations: + description: | + OVN-Kubernetes controller and/or OVN northbound database may cause a + degraded networking control plane for the affected node. Existing + workloads should continue to have connectivity but new workloads may + be impacted. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NorthboundStaleAlert.md + summary: OVN-Kubernetes controller {{ $labels.instance }} has not successfully + synced any changes to the northbound database for too long. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + time() - max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) > 120 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_OVNKubernetesControllerDisconnectedSouthboundDatabase annotations: description: | @@ -341,6 +323,21 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthdInactive + annotations: + description: | + An inactive OVN northd instance may cause a degraded networking + control plane for the affected node. Existing workloads should + continue to have connectivity but new workloads may be impacted. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/OVNKubernetesNorthdInactive.md + summary: OVN northd {{ $labels.instance }} is not active. + syn_component: openshift4-monitoring + expr: count(ovn_northd_status != 1) BY (instance, name, namespace) > 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_OVNKubernetesResourceRetryFailure annotations: description: | @@ -353,6 +350,26 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_SouthboundStale + annotations: + description: | + OVN-Kubernetes controller and/or OVN northbound database may cause a + degraded networking control plane for the affected node. 
Existing + workloads should continue to have connectivity but new workloads may + be impacted. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/SouthboundStaleAlert.md + summary: OVN northd {{ $labels.instance }} has not successfully synced + any changes to the southbound database for too long. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_controller_sb_e2e_timestamp[5m]) > 120 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-cluster-operators rules: - alert: SYN_CannotEvaluateConditionalUpdates @@ -1750,10 +1767,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. + message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index ede3d6be..1ed76730 100644 --- a/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1588,10 +1588,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. 
+ message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index b82453d0..08627bed 100644 --- a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1593,10 +1593,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. + message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 7f949238..63be3e18 100644 --- a/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1616,10 +1616,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. 
+ message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 7f949238..63be3e18 100644 --- a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1616,10 +1616,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. + message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index d30d2eb4..7eaebac4 100644 --- a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1704,10 +1704,26 @@ spec: syn_team: clumsy-donkeys - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. 
+ message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + syn_team: clumsy-donkeys - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index e6e9c087..e306e457 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1616,10 +1616,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. + message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97 diff --git a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 7abdf082..884c0fa0 100644 --- a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1616,10 +1616,25 @@ spec: syn_component: openshift4-monitoring - name: syn-node-utilization rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. 
+ message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_node_memory_free_percent annotations: - message: '{{ $labels.node }}: Memory usage more than 97% (current value - is: {{ $value | humanizePercentage }})%' + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' syn_component: openshift4-monitoring expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.97