Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix E2E failure with pod status check #847

Merged
merged 1 commit into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/call-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ jobs:
environment: ${{ matrix.device }}
env:
E2E_TYPE: ${{ inputs.type }}
HAMI_VERSION: ${{ inputs.ref }}
steps:
- name: checkout code
uses: actions/checkout@v4
Expand Down Expand Up @@ -68,8 +69,6 @@ jobs:
ssh root@$VSPHERE_GPU_VM_IP "nerdctl image ls | grep hami"

- name: deploy hami helm
env:
HAMI_VERSION: ${{ inputs.ref }}
run: |
make helm-deploy

Expand Down
5 changes: 3 additions & 2 deletions hack/deploy-helm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ else
fi

# Set Helm Chart source based on E2E_TYPE.
echo "E2E Type is: ${E2E_TYPE}"

if [ "${E2E_TYPE}" == "pullrequest" ]; then
echo "E2E Type is: ${E2E_TYPE}"
# Ensure the charts directory exists and contains a .tgz file
if [ -d "charts" ] && [ -n "$(ls charts/*.tgz 2>/dev/null)" ]; then
HELM_SOURCE=$(ls charts/*.tgz | head -n 1) # Use the first .tgz file found
Expand Down Expand Up @@ -96,7 +97,7 @@ fi
echo "Checking Pod status..."
kubectl --kubeconfig "${KUBE_CONF}" get po -n "${TARGET_NS}"

if ! util::check_pods_status "${KUBE_CONF}" "${TARGET_NS}"; then
if ! util::check_pods_status "${KUBE_CONF}" ; then
echo "Error: Pods are not running correctly."
exit 1
fi
Expand Down
38 changes: 32 additions & 6 deletions hack/util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,38 @@ function util::wait_ip_reachable {
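# Check Pod status in a namespace (or in all namespaces when none is given), retrying until no unhealthy Pods remain or the retries are exhausted.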
function util::check_pods_status {
local kubeconfig=${1:-""}
local namespace=${2:-"hami-system"}
local namespace=${2:-""}
local retries=${3:-10}
local interval=${4:-30}

local attempt=0
local unhealthy_pods
unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers | awk '!/Running|Succeeded/ {print $1}')

while (( attempt < retries )); do
echo "Checking Pod status (Attempt $(( attempt + 1 ))/$retries)..."

# Check for unhealthy pods in the namespace(s), ignoring pods in Running, Succeeded, or Completed status.
if [[ -z "$namespace" ]]; then
unhealthy_pods=$(kubectl get po -A --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $2}')
else
unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $1}')
fi

if [[ -z "$unhealthy_pods" ]]; then
echo "PASS: All Pods are in Running or Succeeded state."
return 0
fi

echo "Found unhealthy pods:"
echo "$unhealthy_pods"

if (( attempt < retries - 1 )); then
echo "Retrying pod check in ${interval}s..."
sleep "$interval"
fi

(( attempt++ ))
done

if [[ -n "$unhealthy_pods" ]]; then
echo "Found unhealthy pods in namespace $namespace:"
Expand All @@ -134,8 +163,5 @@ function util::check_pods_status {
done

return 1
else
echo "PASS: All Pods are in Running state."
return 0
fi
}
}