Skip to content

Commit

Permalink
Improvements to wait_for_pods function
Browse files Browse the repository at this point in the history
Signed-off-by: hbelmiro <[email protected]>
  • Loading branch information
hbelmiro committed Sep 3, 2024
1 parent 0d098db commit 4b7fa6f
Show file tree
Hide file tree
Showing 12 changed files with 143 additions and 72 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Create KFP cluster
uses: ./.github/actions/kfp-tekton-cluster
- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
Expand All @@ -45,6 +43,8 @@ jobs:
python3 -m venv .venv
. .venv/bin/activate
pip install -e sdk/python
- name: Create KFP cluster
uses: ./.github/actions/kfp-tekton-cluster
- name: "flip coin test"
run: |
. .venv/bin/activate
Expand Down
30 changes: 30 additions & 0 deletions .github/workflows/e2e-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

Expand All @@ -46,6 +51,11 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

Expand All @@ -69,6 +79,11 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

Expand All @@ -92,6 +107,11 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

Expand All @@ -115,6 +135,11 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

Expand Down Expand Up @@ -144,6 +169,11 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

Expand Down
11 changes: 6 additions & 5 deletions .github/workflows/kfp-kubernetes-execution-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
pull_request:
paths:
- '.github/workflows/kfp-kubernetes-execution-tests.yml'
- 'scripts/deploy/github/**'
- 'sdk/python/**'
- 'api/v2alpha1/**'
- 'kubernetes_platform/**'
Expand All @@ -18,17 +19,17 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.9'

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

- name: Forward API port
run: ./scripts/deploy/github/forward-port.sh "kubeflow" "ml-pipeline" 8888 8888

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.9'

- name: apt-get update
run: sudo apt-get update

Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/kfp-samples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
- master
pull_request:
paths:
- 'scripts/deploy/github/**'
- 'samples/**'
- 'backend/src/v2/**'
- '.github/workflows/kfp-samples.yml'
Expand All @@ -21,7 +22,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/kubeflow-pipelines-integration-v2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
pull_request:
paths:
- '.github/workflows/kubeflow-pipelines-integration-v2.yml'
- 'scripts/deploy/github/**'
- 'samples'
- 'core'
- 'backend'
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/periodic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster
- name: Port forward kfp apiserver
Expand Down
11 changes: 6 additions & 5 deletions .github/workflows/sdk-execution.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
pull_request:
paths:
- '.github/workflows/sdk-execution.yml'
- 'scripts/deploy/github/**'
- 'sdk/python/**'
- 'api/v2alpha1/**'

Expand All @@ -17,17 +18,17 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

- name: Forward API port
run: ./scripts/deploy/github/forward-port.sh "kubeflow" "ml-pipeline" 8888 8888

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.8

- name: apt-get update
run: sudo apt-get update

Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/upgrade-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
pull_request:
paths:
- '.github/workflows/upgrade-test.yml'
- 'scripts/deploy/github/**'
- 'backend/**'
- 'manifests/kustomize/**'

Expand All @@ -17,6 +18,11 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Create KFP cluster
uses: ./.github/actions/kfp-cluster

Expand Down
6 changes: 2 additions & 4 deletions scripts/deploy/github/deploy-kfp-tekton.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,14 @@ then
exit 1
fi

# Check if all pods are running - allow 20 retries (10 minutes)
wait_for_pods kubeflow 40 30 || EXIT_CODE=$?
# Check if all pods are running - (10 minutes)
wait_for_pods || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
then
echo "Deploy unsuccessful. Not all pods running."
exit 1
fi

echo "List Kubeflow: "
kubectl get pod -n kubeflow
collect_artifacts kubeflow

echo "List Tekton control plane: "
Expand Down
6 changes: 2 additions & 4 deletions scripts/deploy/github/deploy-kfp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,14 @@ then
exit 1
fi

# Check if all pods are running - allow 20 retries (10 minutes)
wait_for_pods kubeflow 40 30 || EXIT_CODE=$?
# Check if all pods are running - (10 minutes)
wait_for_pods || EXIT_CODE=$?
if [[ $EXIT_CODE -ne 0 ]]
then
echo "Deploy unsuccessful. Not all pods running."
exit 1
fi

echo "List Kubeflow: "
kubectl get pod -n kubeflow
collect_artifacts kubeflow

echo "Finished KFP deployment."
Expand Down
54 changes: 3 additions & 51 deletions scripts/deploy/github/helper-functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,57 +56,9 @@ wait_for_namespace () {
}

wait_for_pods () {
if [[ $# -ne 3 ]]
then
echo "Usage: wait_for_pods namespace max_retries sleep_time"
return 1
fi

local namespace=$1
local max_retries=$2
local sleep_time=$3

local i=0

while [[ $i -lt $max_retries ]]
do
local pods
local statuses
local num_pods
local num_running
pods=$(kubectl get pod -n "$namespace")
# echo "$pods"
# kubectl get pvc -n "$namespace"

if [[ -z $pods ]]
then
echo "no pod is up yet"
else
# Using quotations around variables to keep column format in echo
# Remove 1st line (header line) -> trim whitespace -> cut statuses column (3rd column)
# Might be overkill to parse down to specific columns :).
statuses=$(echo "$pods" | tail -n +2 | tr -s ' ' | cut -d ' ' -f 3)
num_pods=$(echo "$statuses" | wc -l | xargs)
num_running=$(echo "$statuses" | grep -ow "Running\|Completed" | wc -l | xargs)

local msg="${num_running}/${num_pods} pods running in \"${namespace}\"."

if [[ $num_running -ne $num_pods ]]
then
# for debugging
# kubectl get pod -n "$namespace" | grep '0/1' | awk '{print $1}' | xargs kubectl describe pod -n "$namespace"
echo "$msg Checking again in ${sleep_time}s."
else
echo "$msg"
return 0
fi
fi

sleep "$sleep_time"
i=$((i+1))
done

return 1
C_DIR="${BASH_SOURCE%/*}"
pip install -r "${C_DIR}"/../../../sdk/python/requirements.txt
python "${C_DIR}"/kfp-readiness/wait_for_pods.py
}

deploy_with_retries () {
Expand Down
79 changes: 79 additions & 0 deletions scripts/deploy/github/kfp-readiness/wait_for_pods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import logging
import time
import urllib3
import sys
from kubernetes import client, config

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

namespace = 'kubeflow'

config.load_kube_config()
v1 = client.CoreV1Api()


def get_pod_statuses():
pods = v1.list_namespaced_pod(namespace=namespace)
statuses = {}
for pod in pods.items:
pod_name = pod.metadata.name
pod_status = pod.status.phase
container_statuses = pod.status.container_statuses or []
ready_containers = sum(1 for status in container_statuses if status.ready)
total_containers = len(container_statuses)
statuses[pod_name] = (pod_status, ready_containers, total_containers)
return statuses


def all_pods_ready(statuses):
return all(pod_status == 'Running' and ready == total
for pod_status, ready, total in statuses.values())


def check_pods(calm_time=10, timeout=600, retries_after_ready=5):
start_time = time.time()
stable_count = 0
previous_statuses = {}

while time.time() - start_time < timeout:
current_statuses = get_pod_statuses()

logging.info("Checking pod statuses...")
for pod_name, (pod_status, ready, total) in current_statuses.items():
logging.info(f"Pod {pod_name} - Status: {pod_status}, Ready: {ready}/{total}")

if current_statuses == previous_statuses:
if all_pods_ready(current_statuses):
stable_count += 1
if stable_count >= retries_after_ready:
logging.info("All pods are calm and fully ready.")
break
else:
logging.info(
f"Pods are calm but have only been stable for {stable_count}/{retries_after_ready} retries.")
else:
stable_count = 0
else:
stable_count = 0

previous_statuses = current_statuses
logging.info(f"Pods are still stabilizing. Retrying in {calm_time} seconds...")
time.sleep(calm_time)
else:
logging.info("Pods did not stabilize within the timeout period.")
sys.exit(1)

logging.info("Final pod statuses:")
for pod_name, (pod_status, ready, total) in previous_statuses.items():
if pod_status == 'Running' and ready == total:
logging.info(f"Pod {pod_name} is fully ready ({ready}/{total})")
else:
logging.info(f"Pod {pod_name} is not ready (Status: {pod_status}, Ready: {ready}/{total})")

sys.exit(0)


if __name__ == "__main__":
check_pods()

0 comments on commit 4b7fa6f

Please sign in to comment.