diff --git a/config/prow/cluster/prowjob-crd/prowjob_customresourcedefinition.yaml b/config/prow/cluster/prowjob-crd/prowjob_customresourcedefinition.yaml
index fa741371cf..7358bd3933 100644
--- a/config/prow/cluster/prowjob-crd/prowjob_customresourcedefinition.yaml
+++ b/config/prow/cluster/prowjob-crd/prowjob_customresourcedefinition.yaml
@@ -1584,6 +1584,13 @@ spec:
                 If this field is unspecified or false, a new pod will be created to replace
                 the evicted one.
               type: boolean
+            error_on_termination:
+              description: |-
+                ErrorOnTermination indicates that the ProwJob should be completed and given
+                the ErrorState status if the pod that is executing the job is terminated.
+                If this field is unspecified or false, a new pod will be created to replace
+                the terminated one.
+              type: boolean
             extra_refs:
               description: |-
                 ExtraRefs are auxiliary repositories that
diff --git a/pkg/apis/prowjobs/v1/types.go b/pkg/apis/prowjobs/v1/types.go
index 336a8d6295..31006ba08b 100644
--- a/pkg/apis/prowjobs/v1/types.go
+++ b/pkg/apis/prowjobs/v1/types.go
@@ -182,6 +182,11 @@ type ProwJobSpec struct {
 	// If this field is unspecified or false, a new pod will be created to replace
 	// the evicted one.
 	ErrorOnEviction bool `json:"error_on_eviction,omitempty"`
+	// ErrorOnTermination indicates that the ProwJob should be completed and given
+	// the ErrorState status if the pod that is executing the job is terminated.
+	// If this field is unspecified or false, a new pod will be created to replace
+	// the terminated one.
+	ErrorOnTermination bool `json:"error_on_termination,omitempty"`
 
 	// PodSpec provides the basis for running the test under
 	// a Kubernetes agent
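Taken together, the hunks above introduce the two new knobs: a per-job `error_on_termination` flag and, further below, a controller-wide `termination_condition_reasons` list. As a minimal sketch of how the new fields serialize, using stand-in structs that mirror only the added fields (they are not the real `prowapi.ProwJobSpec` or `config.Plank`):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// jobSpec and plankConfig are illustrative stand-ins that mirror only the
// fields added in this diff.
type jobSpec struct {
	ErrorOnEviction    bool `json:"error_on_eviction,omitempty"`
	ErrorOnTermination bool `json:"error_on_termination,omitempty"`
}

type plankConfig struct {
	TerminationConditionReasons []string `json:"termination_condition_reasons,omitempty"`
}

func main() {
	spec, _ := json.Marshal(jobSpec{ErrorOnTermination: true})
	cfg, _ := json.Marshal(plankConfig{
		TerminationConditionReasons: []string{"DeletionByPodGC", "DeletionByGCPControllerManager"},
	})
	fmt.Println(string(spec)) // {"error_on_termination":true}
	fmt.Println(string(cfg))  // {"termination_condition_reasons":["DeletionByPodGC","DeletionByGCPControllerManager"]}
}
```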
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 400278b5a8..1262130aa0 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -661,6 +661,13 @@ type Plank struct {
 	// Defaults to 3. A value of 0 means no retries.
 	MaxRevivals *int `json:"max_revivals,omitempty"`
 
+	// TerminationConditionReasons is a set of pod status condition reasons that the
+	// controller matches against to determine whether the pod's node is being terminated.
+	// If a node is being terminated, the controller restarts the prowjob, unless the
+	// ErrorOnTermination option is set on the prowjob or the MaxRevivals limit is reached.
+	// Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
+	TerminationConditionReasons []string `json:"termination_condition_reasons,omitempty"`
+
 	// DefaultDecorationConfigs holds the default decoration config for specific values.
 	//
 	// Each entry in the slice specifies Repo and Cluster regexp filter fields to
@@ -2514,6 +2521,19 @@ func parseProwConfig(c *Config) error {
 		c.Plank.MaxRevivals = &maxRetries
 	}
 
+	if c.Plank.TerminationConditionReasons == nil {
+		c.Plank.TerminationConditionReasons = []string{
+			// If the node no longer exists and the pod gets garbage collected,
+			// this condition will be set:
+			// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
+			"DeletionByPodGC",
+			// On GCP, before a new spot instance is started, the old pods are garbage
+			// collected (if they have not been already by the Kubernetes PodGC):
+			// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
+			"DeletionByGCPControllerManager",
+		}
+	}
+
 	if err := c.Gerrit.DefaultAndValidate(); err != nil {
 		return fmt.Errorf("validating gerrit config: %w", err)
 	}
@@ -2913,6 +2933,8 @@ func validateAgent(v JobBase, podNamespace string) error {
 		return fmt.Errorf("decoration requires agent: %s (found %q)", k, agent)
 	case v.ErrorOnEviction && agent != k:
 		return fmt.Errorf("error_on_eviction only applies to agent: %s (found %q)", k, agent)
+	case v.ErrorOnTermination && agent != k:
+		return fmt.Errorf("error_on_termination only applies to agent: %s (found %q)", k, agent)
 	case v.Namespace == nil || *v.Namespace == "":
 		return fmt.Errorf("failed to default namespace")
 	case *v.Namespace != podNamespace && agent != p:
diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go
index dc1e55fc2e..2fa40b0526 100644
--- a/pkg/config/config_test.go
+++ b/pkg/config/config_test.go
@@ -1428,6 +1428,13 @@ func TestValidateAgent(t *testing.T) {
 			},
 			pass: true,
 		},
+		{
+			name: "error_on_termination allowed for kubernetes agent",
+			base: func(j *JobBase) {
+				j.ErrorOnTermination = true
+			},
+			pass: true,
+		},
 	}
 
 	for _, tc := range cases {
@@ -8430,6 +8437,9 @@ plank:
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
+  termination_condition_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
 pod_namespace: default
 prowjob_namespace: default
 push_gateway:
@@ -8515,6 +8525,9 @@ plank:
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
+  termination_condition_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
 pod_namespace: default
 prowjob_namespace: default
 push_gateway:
@@ -8593,6 +8606,9 @@ plank:
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
+  termination_condition_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
 pod_namespace: default
 prowjob_namespace: default
 push_gateway:
@@ -8676,6 +8692,9 @@ plank:
   pod_pending_timeout: 10m0s
   pod_running_timeout: 48h0m0s
   pod_unscheduled_timeout: 5m0s
+  termination_condition_reasons:
+  - DeletionByPodGC
+  - DeletionByGCPControllerManager
 pod_namespace: default
 prowjob_namespace: default
 push_gateway:
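A subtlety of the `== nil` check in `parseProwConfig` above: leaving `termination_condition_reasons` unset yields the two defaults, while an explicitly empty list disables condition matching entirely. A self-contained sketch of that distinction (the `plank` struct below is a stand-in, not the real `config.Plank`; Prow loads YAML through JSON-compatible tags, which `encoding/json` approximates here):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type plank struct {
	TerminationConditionReasons []string `json:"termination_condition_reasons,omitempty"`
}

// defaultReasons mirrors the values assigned in parseProwConfig above.
var defaultReasons = []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}

func parse(raw string) []string {
	var p plank
	if err := json.Unmarshal([]byte(raw), &p); err != nil {
		panic(err)
	}
	if p.TerminationConditionReasons == nil { // only default when the field is absent
		p.TerminationConditionReasons = defaultReasons
	}
	return p.TerminationConditionReasons
}

func main() {
	fmt.Println(parse(`{}`))                                        // [DeletionByPodGC DeletionByGCPControllerManager]
	fmt.Println(parse(`{"termination_condition_reasons":[]}`))      // [] (matching disabled)
	fmt.Println(parse(`{"termination_condition_reasons":["Foo"]}`)) // [Foo]
}
```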
+	// If this field is unspecified or false, a new pod will be created to replace
+	// the terminated one.
+	ErrorOnTermination bool `json:"error_on_termination,omitempty"`
 	// SourcePath contains the path where this job is defined
 	SourcePath string `json:"-"`
 	// Spec is the Kubernetes pod spec used if Agent is kubernetes.
diff --git a/pkg/config/prow-config-documented.yaml b/pkg/config/prow-config-documented.yaml
index 01947fdafb..faf31263ac 100644
--- a/pkg/config/prow-config-documented.yaml
+++ b/pkg/config/prow-config-documented.yaml
@@ -1321,6 +1321,13 @@ plank:
   # Use `org/repo`, `org` or `*` as a key.
   report_templates:
     "": ""
+  # TerminationConditionReasons is a set of pod status condition reasons that the
+  # controller matches against to determine whether the pod's node is being terminated.
+  # If a node is being terminated, the controller restarts the prowjob, unless the
+  # ErrorOnTermination option is set on the prowjob or the MaxRevivals limit is reached.
+  # Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
+  termination_condition_reasons:
+  - ""
 # PodNamespace is the namespace in the cluster that prow
 # components will use for looking up Pods owned by ProwJobs.
 # The namespace needs to exist and will not be created by prow.
diff --git a/pkg/pjutil/pjutil.go b/pkg/pjutil/pjutil.go
index 28ee541f56..c5ccbb9116 100644
--- a/pkg/pjutil/pjutil.go
+++ b/pkg/pjutil/pjutil.go
@@ -228,12 +228,13 @@ func specFromJobBase(jb config.JobBase) prowapi.ProwJobSpec {
 		namespace = *jb.Namespace
 	}
 	return prowapi.ProwJobSpec{
-		Job:             jb.Name,
-		Agent:           prowapi.ProwJobAgent(jb.Agent),
-		Cluster:         jb.Cluster,
-		Namespace:       namespace,
-		MaxConcurrency:  jb.MaxConcurrency,
-		ErrorOnEviction: jb.ErrorOnEviction,
+		Job:                jb.Name,
+		Agent:              prowapi.ProwJobAgent(jb.Agent),
+		Cluster:            jb.Cluster,
+		Namespace:          namespace,
+		MaxConcurrency:     jb.MaxConcurrency,
+		ErrorOnEviction:    jb.ErrorOnEviction,
+		ErrorOnTermination: jb.ErrorOnTermination,
 
 		ExtraRefs:        DecorateExtraRefs(jb.ExtraRefs, jb),
 		DecorationConfig: jb.DecorationConfig,
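The one-line `ErrorOnTermination` copy in `specFromJobBase` above is easy to overlook but load-bearing: without it, a job-level `error_on_termination: true` would silently reach the controller as `false`. A standalone sketch of the propagation it guards, with illustrative stand-in types rather than the real `config.JobBase` and `prowapi.ProwJobSpec`:

```go
package main

import "fmt"

type jobBase struct {
	Name               string
	ErrorOnEviction    bool
	ErrorOnTermination bool
}

type prowJobSpec struct {
	Job                string
	ErrorOnEviction    bool
	ErrorOnTermination bool
}

// specFromJobBase mimics the constructor above: every per-job flag must be
// copied explicitly, or it defaults to its zero value in the spec.
func specFromJobBase(jb jobBase) prowJobSpec {
	return prowJobSpec{
		Job:                jb.Name,
		ErrorOnEviction:    jb.ErrorOnEviction,
		ErrorOnTermination: jb.ErrorOnTermination, // the line this diff adds
	}
}

func main() {
	spec := specFromJobBase(jobBase{Name: "boop", ErrorOnTermination: true})
	fmt.Println(spec.ErrorOnTermination) // true
}
```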
diff --git a/pkg/plank/controller_test.go b/pkg/plank/controller_test.go
index 5192e3b18b..3efbc47255 100644
--- a/pkg/plank/controller_test.go
+++ b/pkg/plank/controller_test.go
@@ -65,7 +65,10 @@ const (
 	podDeletionPreventionFinalizer = "keep-from-vanishing"
 )
 
-var maxRevivals = 3
+var (
+	maxRevivals                 = 3
+	terminationConditionReasons = []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
+)
 
 func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
 	presubmits := []config.Presubmit{
@@ -104,11 +107,12 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[st
 				MaxConcurrency: maxConcurrency,
 				MaxGoroutines:  20,
 			},
-			JobQueueCapacities:    queueCapacities,
-			PodPendingTimeout:     &metav1.Duration{Duration: podPendingTimeout},
-			PodRunningTimeout:     &metav1.Duration{Duration: podRunningTimeout},
-			PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
-			MaxRevivals:           &maxRevivals,
+			JobQueueCapacities:          queueCapacities,
+			PodPendingTimeout:           &metav1.Duration{Duration: podPendingTimeout},
+			PodRunningTimeout:           &metav1.Duration{Duration: podRunningTimeout},
+			PodUnscheduledTimeout:       &metav1.Duration{Duration: podUnscheduledTimeout},
+			MaxRevivals:                 &maxRevivals,
+			TerminationConditionReasons: terminationConditionReasons,
 		},
 	},
 	JobConfig: config.JobConfig{
@@ -1217,10 +1221,7 @@ func TestSyncPendingJob(t *testing.T) {
 			ExpectedURL: "boop-42/error",
 		},
 		{
-			// TODO: this test case tests the current behavior, but the behavior
-			// is non-ideal: the pod execution did not fail, instead the node on which
-			// the pod was running terminated
-			Name: "a terminated pod is handled as-if it failed",
+			Name: "delete terminated pod",
 			PJ: prowapi.ProwJob{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "boop-42",
 					Namespace: "prowjobs",
 				},
 				Spec: prowapi.ProwJobSpec{
 					PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
 				},
 				Status: prowapi.ProwJobStatus{
 					State:   prowapi.PendingState,
 					PodName: "boop-42",
 				},
 			},
 			Pods: []v1.Pod{
 				{
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      "boop-42",
 						Namespace: "pods",
 					},
 					Status: v1.PodStatus{
 						Phase:  v1.PodFailed,
 						Reason: Terminated,
@@ -1246,8 +1247,72 @@ func TestSyncPendingJob(t *testing.T) {
 					},
 				},
 			},
+			ExpectedComplete: false,
+			ExpectedState:    prowapi.PendingState,
+			ExpectedNumPods:  0,
+		},
+		{
+			Name: "delete terminated pod and remove its k8sreporter finalizer",
+			PJ: prowapi.ProwJob{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "boop-42",
+					Namespace: "prowjobs",
+				},
+				Spec: prowapi.ProwJobSpec{
+					PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
+				},
+				Status: prowapi.ProwJobStatus{
+					State:   prowapi.PendingState,
+					PodName: "boop-42",
+				},
+			},
+			Pods: []v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:       "boop-42",
+						Namespace:  "pods",
+						Finalizers: []string{"prow.x-k8s.io/gcsk8sreporter"},
+					},
+					Status: v1.PodStatus{
+						Phase:  v1.PodFailed,
+						Reason: Terminated,
+					},
+				},
+			},
+			ExpectedComplete: false,
+			ExpectedState:    prowapi.PendingState,
+			ExpectedNumPods:  0,
+		},
+		{
+			Name: "don't delete terminated pod w/ error_on_termination, complete PJ instead",
+			PJ: prowapi.ProwJob{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "boop-42",
+					Namespace: "prowjobs",
+				},
+				Spec: prowapi.ProwJobSpec{
+					ErrorOnTermination: true,
+					PodSpec:            &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
+				},
+				Status: prowapi.ProwJobStatus{
+					State:   prowapi.PendingState,
+					PodName: "boop-42",
+				},
+			},
+			Pods: []v1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "boop-42",
+						Namespace: "pods",
+					},
+					Status: v1.PodStatus{
+						Phase:  v1.PodFailed,
+						Reason: Terminated,
+					},
+				},
+			},
 			ExpectedComplete: true,
-			ExpectedState:    prowapi.FailureState,
+			ExpectedState:    prowapi.ErrorState,
 			ExpectedNumPods:  1,
 			ExpectedURL:      "boop-42/error",
 		},
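The three terminated-pod test cases above pin down the intended precedence, which the reconciler change below implements: an opt-in hard error first, then the revival budget, and only then a restart. Condensed into a standalone sketch (names are illustrative; the real switch in `syncPendingJob` also covers eviction, unreachable nodes, and pod re-creation):

```go
package main

import "fmt"

type stopCause string

const (
	causeEvicted    stopCause = "evicted"
	causeTerminated stopCause = "terminated"
)

type job struct {
	ErrorOnEviction    bool
	ErrorOnTermination bool
	RevivalCount       int
}

// decide returns "error: ..." when the job should be completed in ErrorState
// and "restart" when a replacement pod should be created, mirroring the
// switch-case order in the reconciler.
func decide(j job, cause stopCause, maxRevivals int) string {
	switch {
	case cause == causeEvicted && j.ErrorOnEviction:
		return "error: pod was evicted"
	case cause == causeTerminated && j.ErrorOnTermination:
		return "error: pod's node was terminated"
	case j.RevivalCount >= maxRevivals:
		return "error: max revivals reached"
	default:
		return "restart"
	}
}

func main() {
	fmt.Println(decide(job{ErrorOnTermination: true}, causeTerminated, 3)) // error: pod's node was terminated
	fmt.Println(decide(job{}, causeTerminated, 3))                         // restart
	fmt.Println(decide(job{RevivalCount: 3}, causeTerminated, 3))          // error: max revivals reached
}
```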
diff --git a/pkg/plank/reconciler.go b/pkg/plank/reconciler.go
index 10338938ea..a5c748a981 100644
--- a/pkg/plank/reconciler.go
+++ b/pkg/plank/reconciler.go
@@ -21,6 +21,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"slices"
 	"strings"
 	"sync"
 	"time"
@@ -469,7 +470,7 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.Status.PodName = pn
 			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
 		}
-	} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
+	} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod, r.config().Plank.TerminationConditionReasons); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
 		switch {
 		case podUnexpectedStopCause == PodUnexpectedStopCauseEvicted && pj.Spec.ErrorOnEviction:
 			// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
@@ -477,6 +478,12 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
 			pj.SetComplete()
 			pj.Status.State = prowv1.ErrorState
 			pj.Status.Description = "Job pod was evicted by the cluster."
+		case podUnexpectedStopCause == PodUnexpectedStopCauseTerminated && pj.Spec.ErrorOnTermination:
+			// ErrorOnTermination is enabled, complete the PJ and mark it as errored.
+			r.log.WithField("error-on-termination", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pod's node was terminated, failing job.")
+			pj.SetComplete()
+			pj.Status.State = prowv1.ErrorState
+			pj.Status.Description = "Job pod's node was terminated."
 		case pj.Status.PodRevivalCount >= *r.config().Plank.MaxRevivals:
 			// MaxRevivals is reached, complete the PJ and mark it as errored.
 			r.log.WithField("unexpected-stop-cause", podUnexpectedStopCause).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
@@ -661,14 +668,28 @@ const (
 	PodUnexpectedStopCauseNone        PodUnexpectedStopCause = ""
 	PodUnexpectedStopCauseUnknown     PodUnexpectedStopCause = "unknown"
 	PodUnexpectedStopCauseEvicted     PodUnexpectedStopCause = "evicted"
+	PodUnexpectedStopCauseTerminated  PodUnexpectedStopCause = "terminated"
 	PodUnexpectedStopCauseUnreachable PodUnexpectedStopCause = "unreachable"
 )
 
-func getPodUnexpectedStopCause(pod *corev1.Pod) PodUnexpectedStopCause {
+func getPodUnexpectedStopCause(pod *corev1.Pod, terminationConditionReasons []string) PodUnexpectedStopCause {
 	if pod.Status.Reason == Evicted {
 		return PodUnexpectedStopCauseEvicted
 	}
 
+	// If there was a graceful node shutdown, the pod's status will have its
+	// reason set to "Terminated":
+	// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
+	if pod.Status.Reason == Terminated {
+		return PodUnexpectedStopCauseTerminated
+	}
+
+	for _, condition := range pod.Status.Conditions {
+		if slices.Contains(terminationConditionReasons, condition.Reason) {
+			return PodUnexpectedStopCauseTerminated
+		}
+	}
+
 	if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
 		return PodUnexpectedStopCauseUnreachable
 	}
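And the detection half, extracted: a runnable approximation of the two checks that `getPodUnexpectedStopCause` gains above. The local `podStatus` and `podCondition` types stand in for the `corev1` equivalents so the sketch compiles with the standard library alone:

```go
package main

import (
	"fmt"
	"slices"
)

// Minimal stand-ins for the corev1.Pod status fields the check reads;
// the real function takes a *corev1.Pod.
type podCondition struct {
	Type   string
	Reason string
}

type podStatus struct {
	Reason     string
	Conditions []podCondition
}

const Terminated = "Terminated"

// isNodeTermination mirrors the two new checks: a "Terminated" status reason
// (graceful node shutdown) or any pod condition whose reason appears in the
// configured terminationConditionReasons list.
func isNodeTermination(status podStatus, terminationConditionReasons []string) bool {
	if status.Reason == Terminated {
		return true
	}
	for _, condition := range status.Conditions {
		if slices.Contains(terminationConditionReasons, condition.Reason) {
			return true
		}
	}
	return false
}

func main() {
	reasons := []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}

	// Pod GC recorded a DisruptionTarget condition after the node disappeared.
	gcDeleted := podStatus{Conditions: []podCondition{{Type: "DisruptionTarget", Reason: "DeletionByPodGC"}}}
	// Graceful node shutdown sets the top-level status reason instead.
	shutdown := podStatus{Reason: Terminated}

	fmt.Println(isNodeTermination(gcDeleted, reasons))   // true
	fmt.Println(isNodeTermination(shutdown, reasons))    // true
	fmt.Println(isNodeTermination(podStatus{}, reasons)) // false
}
```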