From 871e63a9d9fd25881b18bbd57050e32927112e10 Mon Sep 17 00:00:00 2001
From: Sascha Schwarze <schwarzs@de.ibm.com>
Date: Fri, 19 Jul 2024 14:25:17 +0200
Subject: [PATCH] Add feature flag treat-pod-as-always-schedulable

The feature flag allows to declare that Pods in the system will eventually all get scheduled and Revisions should therefore not be marked unschedulable

Signed-off-by: Sascha Schwarze <schwarzs@de.ibm.com>
---
 config/core/configmaps/features.yaml          | 11 ++++++-
 pkg/apis/config/features.go                   |  3 ++
 pkg/apis/config/features_test.go              | 29 +++++++++++++++++++
 .../revision/reconcile_resources.go           | 12 +++++---
 pkg/reconciler/revision/table_test.go         |  3 ++
 5 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/config/core/configmaps/features.yaml b/config/core/configmaps/features.yaml
index 51830a89b6d5..8874697e35a7 100644
--- a/config/core/configmaps/features.yaml
+++ b/config/core/configmaps/features.yaml
@@ -22,7 +22,7 @@ metadata:
     app.kubernetes.io/component: controller
     app.kubernetes.io/version: devel
   annotations:
-    knative.dev/example-checksum: "63a13754"
+    knative.dev/example-checksum: "0ac06704"
 data:
   _example: |-
     ################################
@@ -242,3 +242,12 @@ data:
 
     # Default queue proxy resource requests and limits to good values for most cases if set.
     queueproxy.resource-defaults: "disabled"
+
+    # treat-pod-as-always-schedulable can be used to define that Pods in the system will always be
+    # scheduled, and a Revision should not be marked unschedulable.
+    # Setting this to `enabled` makes sense if you have cluster-autoscaling set up for you cluster
+    # where unschedulable Pods trigger the addition of a new Node and are therefore a short and
+    # transient state.
+    #
+    # See https://github.com/knative/serving/issues/14862
+    treat-pod-as-always-schedulable: "disabled"
diff --git a/pkg/apis/config/features.go b/pkg/apis/config/features.go
index 56db93419c74..1c514a3511af 100644
--- a/pkg/apis/config/features.go
+++ b/pkg/apis/config/features.go
@@ -109,6 +109,7 @@ func defaultFeaturesConfig() *Features {
 		SecurePodDefaults:                Disabled,
 		TagHeaderBasedRouting:            Disabled,
 		AutoDetectHTTP2:                  Disabled,
+		TreatPodAsAlwaysSchedulable:      Disabled,
 	}
 }
 
@@ -126,6 +127,7 @@ func NewFeaturesConfigFromMap(data map[string]string) (*Features, error) {
 		asFlag("queueproxy.resource-defaults", &nc.QueueProxyResourceDefaults),
 		asFlag("secure-pod-defaults", &nc.SecurePodDefaults),
 		asFlag("tag-header-based-routing", &nc.TagHeaderBasedRouting),
+		asFlag("treat-pod-as-always-schedulable", &nc.TreatPodAsAlwaysSchedulable),
 		asFlag(FeatureContainerSpecAddCapabilities, &nc.ContainerSpecAddCapabilities),
 		asFlag(FeaturePodSpecAffinity, &nc.PodSpecAffinity),
 		asFlag(FeaturePodSpecDNSConfig, &nc.PodSpecDNSConfig),
@@ -191,6 +193,7 @@ type Features struct {
 	SecurePodDefaults                Flag
 	TagHeaderBasedRouting            Flag
 	AutoDetectHTTP2                  Flag
+	TreatPodAsAlwaysSchedulable      Flag
 }
 
 // asFlag parses the value at key as a Flag into the target, if it exists.
diff --git a/pkg/apis/config/features_test.go b/pkg/apis/config/features_test.go
index 0995c2f56061..09838d4b4f8d 100644
--- a/pkg/apis/config/features_test.go
+++ b/pkg/apis/config/features_test.go
@@ -80,6 +80,7 @@ func TestFeaturesConfiguration(t *testing.T) {
 			SecurePodDefaults:                Enabled,
 			QueueProxyResourceDefaults:       Enabled,
 			TagHeaderBasedRouting:            Enabled,
+			TreatPodAsAlwaysSchedulable:      Enabled,
 		}),
 		data: map[string]string{
 			"multi-container":                              "Enabled",
@@ -103,6 +104,7 @@ func TestFeaturesConfiguration(t *testing.T) {
 			"secure-pod-defaults":                          "Enabled",
 			"queueproxy.resource-defaults":                 "Enabled",
 			"tag-header-based-routing":                     "Enabled",
+			"treat-pod-as-always-schedulable":              "Enabled",
 		},
 	}, {
 		name:    "multi-container Allowed",
@@ -672,6 +674,33 @@ func TestFeaturesConfiguration(t *testing.T) {
 			data: map[string]string{
 				"kubernetes.podspec-hostnetwork": "Disabled",
 			},
+		}, {
+			name:    "treat-pod-as-always-schedulable Allowed",
+			wantErr: false,
+			wantFeatures: defaultWith(&Features{
+				TreatPodAsAlwaysSchedulable: Allowed,
+			}),
+			data: map[string]string{
+				"treat-pod-as-always-schedulable": "Allowed",
+			},
+		}, {
+			name:    "treat-pod-as-always-schedulable Enabled",
+			wantErr: false,
+			wantFeatures: defaultWith(&Features{
+				TreatPodAsAlwaysSchedulable: Enabled,
+			}),
+			data: map[string]string{
+				"treat-pod-as-always-schedulable": "Enabled",
+			},
+		}, {
+			name:    "treat-pod-as-always-schedulable Disabled",
+			wantErr: false,
+			wantFeatures: defaultWith(&Features{
+				TreatPodAsAlwaysSchedulable: Disabled,
+			}),
+			data: map[string]string{
+				"treat-pod-as-always-schedulable": "Disabled",
+			},
 		}}
 
 	for _, tt := range configTests {
diff --git a/pkg/reconciler/revision/reconcile_resources.go b/pkg/reconciler/revision/reconcile_resources.go
index 3fa6f07d24fe..d32e686f8a43 100644
--- a/pkg/reconciler/revision/reconcile_resources.go
+++ b/pkg/reconciler/revision/reconcile_resources.go
@@ -36,6 +36,7 @@ import (
 	"knative.dev/pkg/kmp"
 	"knative.dev/pkg/logging"
 	"knative.dev/pkg/logging/logkey"
+	apicfg "knative.dev/serving/pkg/apis/config"
 	v1 "knative.dev/serving/pkg/apis/serving/v1"
 	"knative.dev/serving/pkg/networking"
 	"knative.dev/serving/pkg/reconciler/revision/config"
@@ -87,10 +88,13 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
 
 			// Update the revision status if pod cannot be scheduled (possibly resource constraints)
 			// If pod cannot be scheduled then we expect the container status to be empty.
-			for _, cond := range pod.Status.Conditions {
-				if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse {
-					rev.Status.MarkResourcesAvailableFalse(cond.Reason, cond.Message)
-					break
+			treatPodAsAlwaysSchedulable := config.FromContext(ctx).Features.TreatPodAsAlwaysSchedulable
+			if treatPodAsAlwaysSchedulable == apicfg.Disabled {
+				for _, cond := range pod.Status.Conditions {
+					if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse {
+						rev.Status.MarkResourcesAvailableFalse(cond.Reason, cond.Message)
+						break
+					}
 				}
 			}
 
diff --git a/pkg/reconciler/revision/table_test.go b/pkg/reconciler/revision/table_test.go
index c534b56e7ed8..c03927c5c75e 100644
--- a/pkg/reconciler/revision/table_test.go
+++ b/pkg/reconciler/revision/table_test.go
@@ -615,6 +615,9 @@ func TestReconcile(t *testing.T) {
 			Object: pa("foo", "pod-schedule-error", WithReachabilityUnreachable),
 		}},
 		Key: "foo/pod-schedule-error",
+		Ctx: defaultconfig.ToContext(context.Background(), &defaultconfig.Config{Features: &defaultconfig.Features{
+			TreatPodAsAlwaysSchedulable: defaultconfig.Disabled,
+		}}),
 	}, {
 		Name: "ready steady state",
 		// Test the transition that Reconcile makes when Endpoints become ready on the