From 28f9433e6fdcb8f769881143ea79e20c4ea7a4c1 Mon Sep 17 00:00:00 2001
From: Andrey Arapov
Date: Wed, 4 Sep 2024 14:04:58 +0200
Subject: [PATCH 1/2] Fix handling of zombie processes by replacing xargs with shell loops

Address an issue where using `xargs` caused argument lists to become too
long on systems with many processes. This resulted in errors such as:

    /usr/local/bin/kill_zombie_parents.sh: line 14: /usr/bin/ps: Argument list too long

Changes include:
- Replacing `xargs` with shell loops for reading process IDs in
  `detect_zombies` and `find_zombie_parents`.
- Using `while read` loops to safely build the list of child processes
  without exceeding argument length limits.
- Ensuring more robust zombie process detection and signaling by avoiding
  argument overflows.

This update ensures the script works reliably even in environments with a
large number of processes.

Affected functions:
- Modified zombie process handling logic in the `detect_zombies`,
  `pidtree`, and `find_zombie_parents` functions.
--- .../providers/provider-faq-and-guide/index.md | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/content/Docs/providers/provider-faq-and-guide/index.md b/src/content/Docs/providers/provider-faq-and-guide/index.md index 68964a88..9e4e5fe5 100644 --- a/src/content/Docs/providers/provider-faq-and-guide/index.md +++ b/src/content/Docs/providers/provider-faq-and-guide/index.md @@ -1160,9 +1160,12 @@ Since providers cannot control the internal configuration of tenant containers, local parent=$1 local list="$parent" local children - while : ; do - children=$(ps --ppid "$parent" -o pid= | xargs) + children="" + while read -r child; do + children+="$child " + done < <(ps --ppid "$parent" -o pid=) + children=${children% } # Remove trailing space if [[ -z "$children" ]]; then break fi @@ -1176,23 +1179,27 @@ Since providers cannot control the internal configuration of tenant containers, detect_zombies() { # Find all containerd-shim processes shim_pids=$(pgrep -f containerd-shim) - for shim_pid in $shim_pids; do # Use pidtree function to get the process tree under each containerd-shim - pidtree "$shim_pid" | xargs ps -o pid,ppid,stat,cmd --forest | awk ' - { - if ($3 ~ /^Z/) { - print $1; # print the PID of the zombie process - } - }' + pidtree "$shim_pid" | while read -r pid; do + ps -o pid=,ppid=,stat=,cmd= -p "$pid" --forest | awk ' + { + if ($3 ~ /^Z/) { + print $1; # print the PID of the zombie process + } + }' + done done } # Function to find unique parent PIDs of the zombie processes find_zombie_parents() { - detect_zombies | xargs -I {} ps -o ppid= -p {} | sort -u + detect_zombies | while read -r zombie_pid; do + ps -o ppid= -p "$zombie_pid" | tr -d ' ' + done | sort -u } + # Get the threshold from the script argument or default to 50 if not provided or invalid threshold=${1:-50} @@ -1224,7 +1231,12 @@ Since providers cannot control the internal configuration of tenant containers, for parent in $zombie_parents; do if ps -p 
$parent >/dev/null 2>&1; then # Check if any zombies from the initial detection are still present under their parent PID - persistent_parent_zombies=$(echo "$persistent_zombies" | xargs -I {} ps -o ppid= -p {} | grep "^$parent$") + persistent_parent_zombies="" + while read -r zombie_pid; do + if [[ $(ps -o ppid= -p "$zombie_pid" | tr -d ' ') == $parent ]]; then + persistent_parent_zombies="$persistent_parent_zombies $zombie_pid" + fi + done < <(echo "$persistent_zombies") if [[ -n "$persistent_parent_zombies" ]]; then echo "Persistent zombie found; sending SIGCHLD to parent process $parent." kill -SIGCHLD $parent From 9553925411f684f258e4cfda3c5a58c49c056dd6 Mon Sep 17 00:00:00 2001 From: Andrey Arapov Date: Wed, 4 Sep 2024 15:01:28 +0200 Subject: [PATCH 2/2] Drastically improve zombie process cleanup efficiency This commit overhauls the zombie process cleanup script, addressing a critical performance issue: - Previously, the script would scan all processes and zombies, gathering parent PIDs for EACH zombie. This approach was extremely inefficient, taking up to 20 minutes when the server had 465,276 zombie processes. - Now, the script stops after finding the first zombie process and focuses on terminating its parent. This approach is based on the understanding that multiple zombies are typically linked to a single parent process. Terminating the parent process will make all of its zombie children disappear, drastically improving efficiency. Other improvements include: - Add safety checks to avoid terminating containerd-shim processes - Remove the zombie threshold check, allowing immediate action on all zombies. This prevents the misconception that a small number of zombie processes is acceptable and encourages users to properly address the root cause by integrating process reapers like tini. 
- Increase crontab execution frequency from every 15 minutes to every 5 minutes These changes result in a much more responsive and efficient handling of zombie processes, reducing cleanup time from 20 minutes to just seconds, even with hundreds of thousands of zombies. --- .../providers/provider-faq-and-guide/index.md | 176 ++++++++---------- 1 file changed, 78 insertions(+), 98 deletions(-) diff --git a/src/content/Docs/providers/provider-faq-and-guide/index.md b/src/content/Docs/providers/provider-faq-and-guide/index.md index 9e4e5fe5..9f5867b5 100644 --- a/src/content/Docs/providers/provider-faq-and-guide/index.md +++ b/src/content/Docs/providers/provider-faq-and-guide/index.md @@ -1150,114 +1150,94 @@ Since providers cannot control the internal configuration of tenant containers, ```bash cat > /usr/local/bin/kill_zombie_parents.sh <<'EOF' #!/bin/bash - # This script detects persistent zombie processes that are descendants of containerd-shim processes + # This script detects zombie processes that are descendants of containerd-shim processes # and first attempts to prompt the parent process to reap them by sending a SIGCHLD signal. - # If the number of zombies exceeds the specified threshold and SIGCHLD does not resolve the issue, - # the script will proceed to kill the parent processes. 
- - # Function to print the process tree for a given parent PID - pidtree() { - local parent=$1 - local list="$parent" - local children - while : ; do - children="" - while read -r child; do - children+="$child " - done < <(ps --ppid "$parent" -o pid=) - children=${children% } # Remove trailing space - if [[ -z "$children" ]]; then - break + + find_zombie_and_parents() { + for pid in /proc/[0-9]*; do + if [[ -r $pid/stat ]]; then + read -r proc_pid comm state ppid < <(cut -d' ' -f1,2,3,4 "$pid/stat") + if [[ $state == "Z" ]]; then + echo "$proc_pid $ppid" + return 0 + fi fi - list="$list $children" - parent="$children" done - echo "$list" + return 1 } - # Function to detect zombies under containerd-shim processes - detect_zombies() { - # Find all containerd-shim processes - shim_pids=$(pgrep -f containerd-shim) - for shim_pid in $shim_pids; do - # Use pidtree function to get the process tree under each containerd-shim - pidtree "$shim_pid" | while read -r pid; do - ps -o pid=,ppid=,stat=,cmd= -p "$pid" --forest | awk ' - { - if ($3 ~ /^Z/) { - print $1; # print the PID of the zombie process - } - }' - done - done + get_parent_chain() { + local pid=$1 + local chain="" + while [[ $pid -ne 1 ]]; do + if [[ ! 
-r /proc/$pid/stat ]]; then + break + fi + read -r ppid cmd < <(awk '{print $4, $2}' /proc/$pid/stat) + chain="$pid:$cmd $chain" + pid=$ppid + done + echo "$chain" } - # Function to find unique parent PIDs of the zombie processes - find_zombie_parents() { - detect_zombies | while read -r zombie_pid; do - ps -o ppid= -p "$zombie_pid" | tr -d ' ' - done | sort -u + is_process_zombie() { + local pid=$1 + if [[ -r /proc/$pid/stat ]]; then + read -r state < <(cut -d' ' -f3 /proc/$pid/stat) + [[ $state == "Z" ]] + else + return 1 + fi } + attempt_kill() { + local pid=$1 + local signal=$2 + local wait_time=$3 + local signal_name=${4:-$signal} + + echo "Attempting to send $signal_name to parent process $pid" + kill $signal $pid + sleep $wait_time + + if is_process_zombie $zombie_pid; then + echo "Zombie process $zombie_pid still exists after $signal_name" + return 1 + else + echo "Zombie process $zombie_pid no longer exists after $signal_name" + return 0 + fi + } - # Get the threshold from the script argument or default to 50 if not provided or invalid - threshold=${1:-50} - - # Ensure the threshold is a number - if ! [[ "$threshold" =~ ^[0-9]+$ ]]; then - echo "Invalid threshold specified. Defaulting to 50." 
- threshold=50 - fi - - # First detection of zombie processes and their PIDs under containerd-shim - initial_zombies=$(detect_zombies) - zombie_count=$(echo "$initial_zombies" | wc -w) - - # If the number of zombies is less than or equal to the threshold, exit - if [[ $zombie_count -le $threshold ]]; then - exit 0 - fi - - # Identify the parent PID of the zombie processes - zombie_parents=$(find_zombie_parents) - - # Wait for a short period to see if the zombies persist - sleep 15 - - # Re-check for zombie processes and their PIDs under containerd-shim - persistent_zombies=$(detect_zombies) - - # Compare initial and persistent zombies and take action if necessary - for parent in $zombie_parents; do - if ps -p $parent >/dev/null 2>&1; then - # Check if any zombies from the initial detection are still present under their parent PID - persistent_parent_zombies="" - while read -r zombie_pid; do - if [[ $(ps -o ppid= -p "$zombie_pid" | tr -d ' ') == $parent ]]; then - persistent_parent_zombies="$persistent_parent_zombies $zombie_pid" - fi - done < <(echo "$persistent_zombies") - if [[ -n "$persistent_parent_zombies" ]]; then - echo "Persistent zombie found; sending SIGCHLD to parent process $parent." - kill -SIGCHLD $parent - sleep 15 # Give the parent process time to reap the zombies - - # Re-check if the parent process still has zombies - remaining_zombies=$(ps --ppid $parent -o stat | grep '^Z' | wc -l) - if [[ $remaining_zombies -gt 0 ]]; then - echo "SIGCHLD did not work; killing (SIGTERM) parent process $parent." - kill -TERM $parent - sleep 15 # Give the process a chance to terminate - - # Force kill if it didn't terminate - if ps -p $parent >/dev/null 2>&1; then - echo "Force killing (SIGKILL) parent process $parent." 
- kill -KILL $parent - fi - fi - fi + if zombie_info=$(find_zombie_and_parents); then + zombie_pid=$(echo "$zombie_info" | awk '{print $1}') + parent_pid=$(echo "$zombie_info" | awk '{print $2}') + + echo "Found zombie process $zombie_pid with immediate parent $parent_pid" + + parent_chain=$(get_parent_chain "$parent_pid") + echo "Parent chain: $parent_chain" + + if [[ $parent_chain == *"containerd-shim"* ]]; then + echo "Top-level parent is containerd-shim" + immediate_parent=$(echo "$parent_chain" | awk -F' ' '{print $1}' | cut -d':' -f1) + if [[ $immediate_parent != $parent_pid ]]; then + if attempt_kill $parent_pid -SIGCHLD 15 "SIGCHLD"; then + echo "Zombie process cleaned up after SIGCHLD" + elif attempt_kill $parent_pid -SIGTERM 15 "SIGTERM"; then + echo "Zombie process cleaned up after SIGTERM" + elif attempt_kill $parent_pid -SIGKILL 5 "SIGKILL"; then + echo "Zombie process cleaned up after SIGKILL" + else + echo "Failed to clean up zombie process after all attempts" + fi + else + echo "Immediate parent is containerd-shim. Not killing." fi - done + else + echo "Top-level parent is not containerd-shim. No action taken." + fi + fi EOF ``` @@ -1276,7 +1256,7 @@ Since providers cannot control the internal configuration of tenant containers, PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin SHELL=/bin/bash - */15 * * * * root /usr/local/bin/kill_zombie_parents.sh 50 | logger -t kill_zombie_parents + */5 * * * * root /usr/local/bin/kill_zombie_parents.sh | logger -t kill_zombie_parents EOF ```