Skip to content

Commit

Permalink
Replaced wait_for_pods with python script
Browse files Browse the repository at this point in the history
Signed-off-by: hbelmiro <[email protected]>
  • Loading branch information
hbelmiro committed Sep 3, 2024
1 parent ea23e9a commit 7fa7b63
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 55 deletions.
4 changes: 2 additions & 2 deletions scripts/deploy/github/deploy-kfp-tekton.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ then
exit 1
fi

# Check if all pods are running - allow 40 retries, 30 seconds apart (20 minutes)
wait_for_pods kubeflow 40 30 || EXIT_CODE=$?
# Check if all pods are running - wait up to 10 minutes
wait_for_pods || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
then
echo "Deploy unsuccessful. Not all pods running."
Expand Down
4 changes: 2 additions & 2 deletions scripts/deploy/github/deploy-kfp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ then
exit 1
fi

# Check if all pods are running - allow 40 retries, 30 seconds apart (20 minutes)
wait_for_pods kubeflow 40 30 || EXIT_CODE=$?
# Check if all pods are running - wait up to 10 minutes
wait_for_pods || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
then
echo "Deploy unsuccessful. Not all pods running."
Expand Down
53 changes: 2 additions & 51 deletions scripts/deploy/github/helper-functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,57 +56,8 @@ wait_for_namespace () {
}

#######################################
# Wait until the deployed pods are ready.
#
# Installs the Python dependencies of the readiness checker and then runs
# kfp-readiness/wait_for_pods.py, which performs the actual polling (the
# namespace and timeout are defined by that script).
#
# Both paths are resolved relative to the directory containing this helper
# file, so the function works regardless of the caller's working directory.
#
# Arguments:
#   None required. Any arguments (for example the
#   "namespace max_retries sleep_time" triple accepted by the former
#   pure-shell implementation) are accepted and ignored.
# Returns:
#   1 if the helper directory cannot be resolved, pip's exit status if the
#   dependency installation fails, otherwise the exit status of
#   wait_for_pods.py (0 when all pods became ready).
#######################################
wait_for_pods () {
  local script_dir
  local readiness_dir

  # Declared separately from the assignment so a failing substitution is
  # not masked by the exit status of 'local'.
  script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" || return 1
  readiness_dir="${script_dir}/kfp-readiness"

  # Fail fast: without its dependencies the checker would only die later
  # with a less helpful import error.
  pip install -r "${readiness_dir}/requirements.txt" || return $?

  # The checker's exit status is the function's result.
  python "${readiness_dir}/wait_for_pods.py"
}

deploy_with_retries () {
Expand Down
17 changes: 17 additions & 0 deletions scripts/deploy/github/kfp-readiness/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.3.2
google-auth==2.34.0
idna==3.8
kubernetes==26.1.0
oauthlib==3.2.2
pyasn1==0.6.0
pyasn1_modules==0.4.0
python-dateutil==2.9.0.post0
PyYAML==6.0.2
requests==2.32.3
requests-oauthlib==2.0.0
rsa==4.9
six==1.16.0
urllib3==2.2.2
websocket-client==1.8.0
78 changes: 78 additions & 0 deletions scripts/deploy/github/kfp-readiness/wait_for_pods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import logging
import time
import urllib3
import sys
from kubernetes import client, config

# Silence urllib3's InsecureRequestWarning so it does not clutter the log output.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Timestamped INFO-level logging for every message emitted by this script.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Namespace whose pods are inspected by the functions in this file.
namespace = 'kubeflow'

# Runs at import time: load the Kubernetes client configuration and create the
# CoreV1Api client shared by the functions in this file.
# NOTE(review): load_kube_config() presumably reads the local kubeconfig, so it
# expects one to be available where the script runs - confirm.
config.load_kube_config()
v1 = client.CoreV1Api()


def get_pod_statuses():
    """Take a snapshot of every pod in ``namespace``.

    Uses the module-level ``v1`` client to list the pods.

    Returns:
        dict: pod name -> ``(phase, ready_containers, total_containers)``.
        A pod that reports no container statuses is recorded as ``0`` ready
        out of ``0`` total containers.
    """
    pods = v1.list_namespaced_pod(namespace=namespace)
    statuses = {}
    for pod in pods.items:
        # container_statuses is optional and can be None (e.g. for a pod
        # whose containers have not been created yet); treat that as
        # "no containers" instead of crashing on iteration/len().
        container_statuses = pod.status.container_statuses or []
        ready_containers = sum(1 for status in container_statuses if status.ready)
        total_containers = len(container_statuses)
        statuses[pod.metadata.name] = (pod.status.phase, ready_containers, total_containers)
    return statuses


def all_pods_ready(statuses):
    """Tell whether every pod in ``statuses`` is running with all containers ready.

    Args:
        statuses: dict mapping pod name to
            ``(phase, ready_containers, total_containers)``.

    Returns:
        bool: ``True`` only if there is at least one pod and every pod has
        phase ``'Running'`` with ``ready_containers == total_containers``.
    """
    # all() over an empty mapping is vacuously True; a namespace without any
    # pods has nothing deployed yet and must not be reported as ready.
    if not statuses:
        return False
    return all(pod_status == 'Running' and ready == total
               for pod_status, ready, total in statuses.values())


def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
    """Poll pod statuses until they are stable and ready, then exit the process.

    A poll counts towards the stability streak when its snapshot equals the
    previous one and ``all_pods_ready`` accepts it; any other poll resets the
    streak to zero.

    Args:
        calm_time: seconds to sleep between two polls.
        timeout: maximum number of seconds to keep polling.
        retries_after_ready: streak length required to declare success.

    Exits:
        ``sys.exit(0)`` after logging the final statuses once the streak is
        reached; ``sys.exit(1)`` if ``timeout`` elapses first.
    """
    began_at = time.time()
    streak = 0
    last_seen = {}
    settled = False

    while time.time() - began_at < timeout:
        snapshot = get_pod_statuses()

        logging.info("Checking pod statuses...")
        for name, (phase, ready, total) in snapshot.items():
            logging.info(f"Pod {name} - Status: {phase}, Ready: {ready}/{total}")

        # Readiness is only evaluated when nothing changed since the last poll.
        calm_and_ready = snapshot == last_seen and all_pods_ready(snapshot)
        streak = streak + 1 if calm_and_ready else 0

        if calm_and_ready:
            if streak >= retries_after_ready:
                logging.info("All pods are calm and fully ready.")
                settled = True
                break
            logging.info(
                f"Pods are calm but have only been stable for {streak}/{retries_after_ready} retries.")

        last_seen = snapshot
        logging.info(f"Pods are still stabilizing. Retrying in {calm_time} seconds...")
        time.sleep(calm_time)

    if not settled:
        # The loop ran out of time without ever reaching the required streak.
        logging.info("Pods did not stabilize within the timeout period.")
        sys.exit(1)

    logging.info("Final pod statuses:")
    for name, (phase, ready, total) in last_seen.items():
        if phase == 'Running' and ready == total:
            logging.info(f"Pod {name} is fully ready ({ready}/{total})")
        else:
            logging.info(f"Pod {name} is not ready (Status: {phase}, Ready: {ready}/{total})")

    sys.exit(0)


# Run the readiness check with its default settings when executed as a script;
# check_pods() terminates the process with exit code 0 or 1.
if __name__ == "__main__":
    check_pods()

0 comments on commit 7fa7b63

Please sign in to comment.