Add alert for high TCP memory utilization on nodes #213

Merged: 4 commits, Aug 15, 2024
19 changes: 18 additions & 1 deletion component/rules.jsonnet
@@ -110,7 +110,24 @@ local additionalRules = {
severity: 'critical',
},
annotations: {
message: '{{ $labels.node }}: Memory usage more than 97% (current value is: {{ $value | humanizePercentage }})%',
message: '{{ $labels.instance }}: Memory usage more than 97% (current value is: {{ $value | humanizePercentage }})%',
},
},
{
alert: 'NodeTcpMemoryUtilizationHigh',
expr: 'node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625',
'for': '30m',
labels: {
severity: 'critical',
},
annotations: {
message: 'TCP memory usage is high on {{ $labels.instance }}',
description: |||
TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}.

Check the node for processes with unusual amounts of TCP sockets.
|||,
runbook_url: 'https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html',
},
},
],
112 changes: 112 additions & 0 deletions docs/modules/ROOT/pages/runbooks/tcp-memory-usage.adoc
@@ -0,0 +1,112 @@
= SYN_NodeTcpMemoryUtilizationHigh

== icon:glasses[] Overview

This alert indicates that the node for which it fires has unusually high TCP memory utilization.
The alert is currently configured to fire when a node's TCP memory usage exceeds the kernel's TCP memory "pressure" threshold, which is set to 6.25% of the node's total memory on RHEL 8 and RHEL 9.
See this https://access.redhat.com/solutions/6964027[Red Hat solution] for further details.
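
To see how the threshold translates into concrete numbers, the following minimal sketch (run directly on a node, for example from the `oc debug` session described below) prints the kernel's configured pressure threshold next to 6.25% of the node's total memory:

[source,bash]
----
# tcp_mem holds three page counts: min, pressure, max
read -r tcp_min tcp_pressure tcp_max < /proc/sys/net/ipv4/tcp_mem
page_size=$(getconf PAGESIZE)
mem_total_kb=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
echo "kernel pressure threshold: $((tcp_pressure * page_size)) bytes"
echo "6.25% of MemTotal:         $((mem_total_kb * 1024 / 16)) bytes"
----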

== icon:search[] Investigate

* Investigate the historical TCP memory usage of nodes on the cluster.
Use the following metric to do so.
+
[source]
----
node_sockstat_TCP_mem_bytes
----

* Log in to the node and switch to the host namespace
+
[source,bash]
----
oc debug node/<nodename> --as=cluster-admin -n syn-debug-nodes
# Wait for pod to start
chroot /host
----

* Check TCP memory usage directly on the node
+
[source,shell]
----
# cat /proc/net/sockstat
sockets: used 542
TCP: inuse 155 orphan 0 tw 260 alloc 1545 mem 0
UDP: inuse 7 mem 2
UDPLITE: inuse 0
RAW: inuse 2
FRAG: inuse 0 memory 0
----
+
NOTE: This file shows memory usage (field `mem`) in memory pages (4 KiB on x86_64 nodes).
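+
As a quick arithmetic check, the page count can be converted into bytes directly on the node (a minimal sketch, using the page size reported by `getconf PAGESIZE`):
+
[source,bash]
----
# Multiply the "mem" page count on the TCP: line of /proc/net/sockstat by the page size
awk -v ps="$(getconf PAGESIZE)" '/^TCP:/ { print $NF * ps, "bytes of TCP memory" }' /proc/net/sockstat
----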

* Check TCP socket stats summary directly on the node
+
[source,shell]
----
# ss -s
Total: 537
TCP: 1749 (estab 157, closed 1568, orphaned 0, timewait 231)

Transport Total IP IPv6
RAW 3 2 1
UDP 11 7 4
TCP 181 155 26
INET 195 164 31
FRAG 0 0 0
----

* You can try to identify pods with unusually high TCP memory usage by running the following bash snippet on the node.
+
[source,bash]
----
# Iterate through all pods which are running (state SANDBOX_READY) on the node
for p in $(crictl pods -o json | jq -r '.items[]|select(.state=="SANDBOX_READY").id'); do
  # Extract the network namespace name (a UUID) from the pod metadata
  netns=$(crictl inspectp $p | jq -r '.info.runtimeSpec.linux.namespaces[]|select(.type=="network").path|split("/")[-1]')
  # Only compute and show socket memory usage for pods that don't use the host
  # network namespace.
  if [ "$netns" != "" ]; then
    # Print the pod name
    crictl inspectp $p | jq '.status.metadata.name'
    # List active TCP sockets in the network namespace of the pod, and sum up
    # the amount of TCP memory used by all the sockets. The awk expression
    # excludes fields rb, tb and d, which indicate the maximum allocatable
    # buffer sizes and the amount of dropped packets, from the output of ss -tm
    ss -N $netns -tm | grep skmem | cut -d: -f2 | tr -d 'a-z()' | \
      awk -F, 'BEGIN { count=0; sum=0 } { count+=1; sum+=$1+$3+$5+$6+$7+$8 } END { printf "%d sockets use %d bytes of TCP memory\n", count, sum }'
  fi
done
----
+
[NOTE]
====
This snippet computes the _current_ TCP memory usage based on the values reported by `ss -tm`.
So far, we haven't been able to conclusively determine that this actually highlights the root cause of high TCP memory usage on a node.
However, the snippet is still a useful place to start digging.
====
+
TIP: If you find a better snippet to identify pods with high TCP memory usage, please update this runbook.

* If you don't see any outliers in TCP memory usage, you can try to find processes with a large discrepancy between open socket file descriptors and active sockets as reported by `ss`.
You can determine the PID of a container's primary process with the following command.
+
[source,bash]
----
crictl inspect <container_id> | jq '.info.pid'
----
+
To determine the number of socket FDs held by a process, you can use the following one-liner.
+
[source,bash]
----
ls -l /proc/<PID>/fd | grep socket | wc -l <1>
----
<1> Substitute `<PID>` with the PID of the process you want to look at.
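+
To compare the two counts for a single process, you can use a rough sketch like the following (it assumes the process runs in its own network namespace, and the FD count includes sockets of all types, so treat a large gap only as a hint):
+
[source,bash]
----
pid=<PID>  # substitute the PID of the container's primary process
fds=$(ls -l /proc/$pid/fd | grep -c socket)                # open socket file descriptors
socks=$(nsenter -t $pid -n ss -tan | tail -n +2 | wc -l)   # TCP sockets in the process's network namespace
echo "PID $pid: $fds socket FDs, $socks TCP sockets reported by ss"
----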

== icon:wrench[] Tune

If this alert isn't actionable, is too noisy, or fires too late, you may want to tune it.

Currently, the alert can be tuned through the openshift4-monitoring component's `patchRules` mechanism.
Most likely, you'll want to tune either the threshold or the duration for which the threshold must be exceeded before the alert fires.
1 change: 1 addition & 0 deletions docs/modules/ROOT/partials/nav.adoc
@@ -19,3 +19,4 @@
* xref:runbooks/cpucapacity.adoc[CPU Capacity Alert]
* xref:runbooks/unusedcapacity.adoc[Node Capacity Alert]
* xref:runbooks/remotewrite.adoc[Prometheus RemoteWrite Alert]
* xref:runbooks/tcp-memory-usage.adoc[NodeTcpMemoryUtilizationHigh Alert]
@@ -1616,10 +1616,25 @@ spec:
syn_component: openshift4-monitoring
- name: syn-node-utilization
rules:
- alert: SYN_NodeTcpMemoryUtilizationHigh
annotations:
description: |
TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}.

Check the node for processes with unusual amounts of TCP sockets.
message: TCP memory usage is high on {{ $labels.instance }}
runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html
syn_component: openshift4-monitoring
expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625
for: 30m
labels:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_node_memory_free_percent
annotations:
message: '{{ $labels.node }}: Memory usage more than 97% (current value
is: {{ $value | humanizePercentage }})%'
message: '{{ $labels.instance }}: Memory usage more than 97% (current
value is: {{ $value | humanizePercentage }})%'
syn_component: openshift4-monitoring
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes
> 0.97
@@ -1616,10 +1616,25 @@ spec:
syn_component: openshift4-monitoring
- name: syn-node-utilization
rules:
- alert: SYN_NodeTcpMemoryUtilizationHigh
annotations:
description: |
TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}.

Check the node for processes with unusual amounts of TCP sockets.
message: TCP memory usage is high on {{ $labels.instance }}
runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html
syn_component: openshift4-monitoring
expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625
for: 30m
labels:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_node_memory_free_percent
annotations:
message: '{{ $labels.node }}: Memory usage more than 97% (current value
is: {{ $value | humanizePercentage }})%'
message: '{{ $labels.instance }}: Memory usage more than 97% (current
value is: {{ $value | humanizePercentage }})%'
syn_component: openshift4-monitoring
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes
> 0.97
@@ -1591,10 +1591,25 @@ spec:
syn_component: openshift4-monitoring
- name: syn-node-utilization
rules:
- alert: SYN_NodeTcpMemoryUtilizationHigh
annotations:
description: |
TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}.

Check the node for processes with unusual amounts of TCP sockets.
message: TCP memory usage is high on {{ $labels.instance }}
runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html
syn_component: openshift4-monitoring
expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625
for: 30m
labels:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_node_memory_free_percent
annotations:
message: '{{ $labels.node }}: Memory usage more than 97% (current value
is: {{ $value | humanizePercentage }})%'
message: '{{ $labels.instance }}: Memory usage more than 97% (current
value is: {{ $value | humanizePercentage }})%'
syn_component: openshift4-monitoring
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes
> 0.97
@@ -150,9 +150,9 @@ spec:
annotations:
description: |
Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
implemented while there is no OVN Kubernetes leader. Existing workloads should continue to have connectivity.
implemented while there is no OVN Kubernetes cluster manager leader. Existing workloads should continue to have connectivity.
OVN-Kubernetes control plane is not functional.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoOvnMasterLeader.md
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoOvnClusterManagerLeader.md
summary: There is no ovn-kubernetes cluster manager leader.
syn_component: openshift4-monitoring
expr: |
@@ -168,8 +168,8 @@ spec:
annotations:
description: |
Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
implemented while there are no OVN Kubernetes pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoRunningOvnMaster.md
implemented while there are no OVN Kubernetes control plane pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoRunningOvnControlPlane.md
summary: There is no running ovn-kubernetes control plane.
syn_component: openshift4-monitoring
expr: |
@@ -180,44 +180,6 @@
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_NorthboundStale
annotations:
description: |
Networking control plane is degraded. Networking configuration updates applied to the cluster will not be
implemented. Existing workloads should continue to have connectivity. OVN-Kubernetes control plane and/or
OVN northbound database may not be functional.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NorthboundStaleAlert.md
summary: ovn-kubernetes has not written anything to the northbound database
for too long.
syn_component: openshift4-monitoring
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
time() - max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) > 120
for: 10m
labels:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_SouthboundStale
annotations:
description: |
Networking control plane is degraded. Networking configuration updates may not be applied to the cluster or
taking a long time to apply. This usually means there is a large load on OVN component 'northd' or it is not
functioning.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/SouthboundStaleAlert.md
summary: ovn-northd has not successfully synced any changes to the southbound
DB for too long.
syn_component: openshift4-monitoring
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_controller_sb_e2e_timestamp[5m]) > 120
for: 10m
labels:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_V4SubnetAllocationThresholdExceeded
annotations:
description: More than 80% of IPv4 subnets are used. Insufficient IPv4
@@ -268,6 +230,26 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_NorthboundStale
annotations:
description: |
OVN-Kubernetes controller and/or OVN northbound database may cause a
degraded networking control plane for the affected node. Existing
workloads should continue to have connectivity but new workloads may
be impacted.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NorthboundStaleAlert.md
summary: OVN-Kubernetes controller {{ $labels.instance }} has not successfully
synced any changes to the northbound database for too long.
syn_component: openshift4-monitoring
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
time() - max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) > 120
for: 10m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_OVNKubernetesControllerDisconnectedSouthboundDatabase
annotations:
description: |
@@ -341,6 +323,21 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_OVNKubernetesNorthdInactive
annotations:
description: |
An inactive OVN northd instance may cause a degraded networking
control plane for the affected node. Existing workloads should
continue to have connectivity but new workloads may be impacted.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/OVNKubernetesNorthdInactive.md
summary: OVN northd {{ $labels.instance }} is not active.
syn_component: openshift4-monitoring
expr: count(ovn_northd_status != 1) BY (instance, name, namespace) > 0
for: 10m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_OVNKubernetesResourceRetryFailure
annotations:
description: |
@@ -353,6 +350,26 @@ spec:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_SouthboundStale
annotations:
description: |
OVN-Kubernetes controller and/or OVN northbound database may cause a
degraded networking control plane for the affected node. Existing
workloads should continue to have connectivity but new workloads may
be impacted.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/SouthboundStaleAlert.md
summary: OVN northd {{ $labels.instance }} has not successfully synced
any changes to the southbound database for too long.
syn_component: openshift4-monitoring
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(ovnkube_controller_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_controller_sb_e2e_timestamp[5m]) > 120
for: 10m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- name: syn-cluster-operators
rules:
- alert: SYN_CannotEvaluateConditionalUpdates
@@ -1750,10 +1767,25 @@ spec:
syn_component: openshift4-monitoring
- name: syn-node-utilization
rules:
- alert: SYN_NodeTcpMemoryUtilizationHigh
annotations:
description: |
TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}.

Check the node for processes with unusual amounts of TCP sockets.
message: TCP memory usage is high on {{ $labels.instance }}
runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html
syn_component: openshift4-monitoring
expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625
for: 30m
labels:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_node_memory_free_percent
annotations:
message: '{{ $labels.node }}: Memory usage more than 97% (current value
is: {{ $value | humanizePercentage }})%'
message: '{{ $labels.instance }}: Memory usage more than 97% (current
value is: {{ $value | humanizePercentage }})%'
syn_component: openshift4-monitoring
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes
> 0.97