diff --git a/scripts/library.sh b/scripts/library.sh
index e7ecbc9bed5..ab9f303fe59 100755
--- a/scripts/library.sh
+++ b/scripts/library.sh
@@ -127,13 +127,60 @@ function wait_until_object_does_not_exist() {
 # Waits until all pods are running in the given namespace.
 # Parameters: $1 - namespace.
+# Polls "kubectl get pods" every 2 seconds, for up to 150 attempts (about 5 minutes).
+# Returns 0 as soon as every non-Terminating pod is Running with all of its containers
+# ready (or is Completed); returns 1 on timeout, after printing the YAML and logs of the
+# last pod that was found in a bad state.
 function wait_until_pods_running() {
-  echo "Waiting until all pods in namespace $1 are up"
-  kubectl wait pod --for=condition=Ready -n "$1" -l '!job-name' --timeout=5m || return 1
-  # Also wait for all the job pods to be completed.
-  # This is mainly for maintaining backward compatibility.
-  if [[ $(kubectl get jobs --ignore-not-found=true -n "$1") ]]; then
-    kubectl wait job --for=condition=Complete --all -n "$1" --timeout=5m || return 1
+  local -r namespace="$1"
+  # READY column of a healthy pod: "<n>/<n>" with n >= 1.
+  local -r ready_re='^([1-9][0-9]*)/([1-9][0-9]*)$'
+  local pods="" not_running_pods="" failed_pod=""
+  local i name ready all_ready
+  echo -n "Waiting until all pods in namespace ${namespace} are up"
+  for i in {1..150}; do  # timeout after 5 minutes
+    # List all pods. Ignore Terminating pods as those have either been replaced through
+    # a deployment or terminated on purpose (through chaosduck for example).
+    # grep exits non-zero when it selects no lines; "|| true" keeps that expected case
+    # from aborting callers that run under "set -e".
+    pods="$(kubectl get pods --no-headers -n "${namespace}" 2>/dev/null | grep -v Terminating || true)"
+    # All pods must be running (ignore ImagePull errors to allow the pod to retry).
+    not_running_pods="$(grep -v -e Running -e Completed -e ErrImagePull -e ImagePullBackOff <<< "${pods}" || true)"
+    if [[ -n "${pods}" && -z "${not_running_pods}" ]]; then
+      # All pods are running or completed. Verify the containers on each pod.
+      all_ready=1
+      while read -r name ready _; do
+        # If every pod is Completed the filtered list is one empty line; there is nothing
+        # to verify for it, so it must not be counted as a failure.
+        [[ -n "${name}" ]] || continue
+        # Treat this pod as the failed one until it passes the check below.
+        failed_pod="${name}"
+        # All containers must be ready, and there must be at least one of them.
+        if [[ ! "${ready}" =~ ${ready_re} || "${BASH_REMATCH[1]}" != "${BASH_REMATCH[2]}" ]]; then
+          all_ready=0
+          break
+        fi
+        # All the tests passed, this is not a failed pod.
+        failed_pod=""
+      done <<< "$(grep -v Completed <<< "${pods}")"
+      if (( all_ready )); then
+        echo -e "\nAll pods are up:\n${pods}"
+        return 0
+      fi
+    elif [[ -n "${not_running_pods}" ]]; then
+      # At least one pod is not running; remember the first one as the failed pod.
+      read -r failed_pod _ <<< "${not_running_pods}"
+    fi
+    echo -n "."
+    sleep 2
+  done
+  echo -e "\n\nERROR: timeout waiting for pods to come up\n${pods}"
+  if [[ -n "${failed_pod}" ]]; then
+    echo -e "\n\nFailed Pod (data in YAML format) - ${failed_pod}\n"
+    kubectl -n "${namespace}" get pods "${failed_pod}" -oyaml
+    echo -e "\n\nPod Logs\n"
+    kubectl -n "${namespace}" logs "${failed_pod}" --all-containers
   fi
+  return 1
 }
 
 # Waits until all batch jobs complete in the given namespace.