Skip to content

Commit ca08a58

Browse files
committed
better detect pod failure due to node termination
Signed-off-by: Tim Ramlot <[email protected]>
1 parent fad19e0 commit ca08a58

File tree

1 file changed

+28
-1
lines changed

1 file changed

+28
-1
lines changed

pkg/plank/reconciler.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
490490
r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
491491
return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
492492
}
493-
} else if pod.Status.Reason == Terminated {
493+
} else if isPodTerminated(pod) {
494494
// Pod was terminated.
495495
if pj.Spec.ErrorOnTermination {
496496
// ErrorOnTermination is enabled, complete the PJ and mark it as
@@ -701,6 +701,33 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
701701
return nil, nil
702702
}
703703

704+
func isPodTerminated(pod *corev1.Pod) bool {
705+
// If there was a Graceful node shutdown, the Pod's status will have a
706+
// reason set to "Terminated":
707+
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
708+
if pod.Status.Reason == Terminated {
709+
return true
710+
}
711+
712+
for _, condition := range pod.Status.Conditions {
713+
// If the node does no longer exist and the pod gets garbage collected,
714+
// this condition will be set:
715+
// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
716+
if condition.Reason == "DeletionByPodGC" {
717+
return true
718+
}
719+
720+
// On GCP, before a new spot instance is started, the old pods are garbage
721+
// collected (if they have not been already by the Kubernetes PodGC):
722+
// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
723+
if condition.Reason == "DeletionByGCPControllerManager" {
724+
return true
725+
}
726+
}
727+
728+
return false
729+
}
730+
704731
// syncTriggeredJob syncs jobs that do not yet have an associated test workload running
705732
func (r *reconciler) syncTriggeredJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
706733
prevPJ := pj.DeepCopy()

0 commit comments

Comments
 (0)