Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

koord-descheduler: add arbitration to migration controller #1651

Merged
Merged
71 changes: 22 additions & 49 deletions pkg/descheduler/controllers/migration/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ import (
sev1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
deschedulerconfig "github.com/koordinator-sh/koordinator/pkg/descheduler/apis/config"
"github.com/koordinator-sh/koordinator/pkg/descheduler/apis/config/validation"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/arbitrator"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/controllerfinder"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/evictor"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/reservation"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/util"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/names"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/options"
evictionsutil "github.com/koordinator-sh/koordinator/pkg/descheduler/evictions"
"github.com/koordinator-sh/koordinator/pkg/descheduler/framework"
utilclient "github.com/koordinator-sh/koordinator/pkg/util/client"
)
Expand Down Expand Up @@ -108,7 +108,23 @@ func New(args runtime.Object, handle framework.Handle) (framework.Plugin, error)
return nil, err
}

if err = c.Watch(&source.Kind{Type: &sev1alpha1.PodMigrationJob{}}, &handler.EnqueueRequestForObject{}, &predicate.Funcs{
// New Arbitrator
eahydra marked this conversation as resolved.
Show resolved Hide resolved
var eventHandler handler.EventHandler = &handler.EnqueueRequestForObject{}
if controllerArgs.ArbitrationArgs.Enabled {
eahydra marked this conversation as resolved.
Show resolved Hide resolved
a, err := arbitrator.New(controllerArgs.ArbitrationArgs, arbitrator.Options{
Client: r.Client,
EventRecorder: r.eventRecorder,
RetryableFilter: r.retryablePodFilter,
NonRetryableFilter: r.nonRetryablePodFilter,
Manager: options.Manager,
})
if err != nil {
klog.ErrorS(err, "failed to New Arbitrator")
eahydra marked this conversation as resolved.
Show resolved Hide resolved
}
eventHandler = arbitrator.NewHandler(a, r.Client)
}

if err = c.Watch(&source.Kind{Type: &sev1alpha1.PodMigrationJob{}}, eventHandler, &predicate.Funcs{
DeleteFunc: func(event event.DeleteEvent) bool {
job := event.Object.(*sev1alpha1.PodMigrationJob)
r.assumedCache.delete(job)
Expand Down Expand Up @@ -421,18 +437,11 @@ func (r *Reconciler) preparePendingJob(ctx context.Context, job *sev1alpha1.PodM
}

markPodPrepareMigrating(pod)
if !evictionsutil.HaveEvictAnnotation(job) {
if aborted, err := r.abortJobIfNonRetryablePodFilterFailed(ctx, pod, job); aborted || err != nil {
if err == nil {
err = fmt.Errorf("abort job since failed to non-retryable Pod filter")
}
return reconcile.Result{}, err
}
if requeue, err := r.requeueJobIfRetryablePodFilterFailed(ctx, pod, job); requeue || err != nil {
return reconcile.Result{RequeueAfter: defaultRequeueAfter}, err
}
}

// delete passed arbitration annotation
if job.Annotations != nil {
delete(job.Annotations, arbitrator.AnnotationPassedArbitration)
eahydra marked this conversation as resolved.
Show resolved Hide resolved
}
job.Status.Phase = sev1alpha1.PodMigrationJobRunning
err = r.Client.Status().Update(ctx, job)
return reconcile.Result{}, err
Expand Down Expand Up @@ -487,42 +496,6 @@ func (r *Reconciler) abortJobIfTimeout(ctx context.Context, job *sev1alpha1.PodM
return true, err
}

// requeueJobIfRetryablePodFilterFailed reports whether the job should be
// requeued because the Pod was rejected by the retryable Pod filter.
//
// It returns false when no retryable filter is configured, when pod is nil,
// or when the filter accepts the Pod. When the filter rejects the Pod, a
// Warning event is recorded on the job and true is returned. The returned
// error is always nil.
func (r *Reconciler) requeueJobIfRetryablePodFilterFailed(ctx context.Context, pod *corev1.Pod, job *sev1alpha1.PodMigrationJob) (bool, error) {
	// Nothing to do: no filter installed, no Pod to check, or the Pod passes.
	// The order matters: the filter is only invoked with a non-nil Pod.
	if r.retryablePodFilter == nil || pod == nil || r.retryablePodFilter(pod) {
		return false, nil
	}

	r.eventRecorder.Eventf(job, nil, corev1.EventTypeWarning, "Requeue", "Migrating", "Failed to retriable filter")
	return true, nil
}

// abortJobIfNonRetryablePodFilterFailed fails the job when the Pod is rejected
// by the non-retryable Pod filter.
//
// It returns (false, nil) when no non-retryable filter is configured, when pod
// is nil, or when the filter accepts the Pod. When the filter rejects the Pod,
// the job status is set to PodMigrationJobFailed with reason
// PodMigrationJobReasonForbiddenMigratePod and written back through the status
// client; the function then returns true together with any error from that
// update. A Warning event is recorded only when the status update succeeded.
func (r *Reconciler) abortJobIfNonRetryablePodFilterFailed(ctx context.Context, pod *corev1.Pod, job *sev1alpha1.PodMigrationJob) (bool, error) {
	// Nothing to do: no filter installed, no Pod to check, or the Pod passes.
	// The order matters: the filter is only invoked with a non-nil Pod.
	if r.nonRetryablePodFilter == nil || pod == nil || r.nonRetryablePodFilter(pod) {
		return false, nil
	}

	job.Status.Phase = sev1alpha1.PodMigrationJobFailed
	job.Status.Reason = sev1alpha1.PodMigrationJobReasonForbiddenMigratePod
	job.Status.Message = fmt.Sprintf("Pod %q is forbidden to migrate because it does not meet the requirements", klog.KObj(pod))
	if err := r.Status().Update(ctx, job); err != nil {
		// The job is still considered aborted; the caller sees the update error.
		return true, err
	}

	// Pass the message as an argument to a constant format rather than as the
	// format string itself, so any '%' in the message is emitted verbatim.
	r.eventRecorder.Eventf(job, nil, corev1.EventTypeWarning, sev1alpha1.PodMigrationJobReasonForbiddenMigratePod, "Migrating", "%s", job.Status.Message)
	return true, nil
}

func (r *Reconciler) abortJobByInvalidPodRef(ctx context.Context, job *sev1alpha1.PodMigrationJob) error {
job.Status.Phase = sev1alpha1.PodMigrationJobFailed
job.Status.Reason = "InvalidPodRef"
Expand Down
115 changes: 5 additions & 110 deletions pkg/descheduler/controllers/migration/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import (
sev1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
deschedulerconfig "github.com/koordinator-sh/koordinator/pkg/descheduler/apis/config"
"github.com/koordinator-sh/koordinator/pkg/descheduler/apis/config/v1alpha2"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/arbitrator"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/controllerfinder"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/reservation"
"github.com/koordinator-sh/koordinator/pkg/descheduler/controllers/migration/util"
Expand Down Expand Up @@ -1441,7 +1442,6 @@ func TestEvict(t *testing.T) {
Phase: corev1.PodRunning,
},
}
assert.True(t, reconciler.Filter(pod))

assert.True(t, reconciler.Evict(context.TODO(), pod, framework.EvictOptions{}))
var jobList sev1alpha1.PodMigrationJobList
Expand Down Expand Up @@ -1509,115 +1509,6 @@ func TestAbortJobIfReserveOnSameNode(t *testing.T) {
assert.Equal(t, sev1alpha1.PodMigrationJobReasonForbiddenMigratePod, job.Status.Reason)
}

// TestRequeueJobIfRetryablePodFilterFailed verifies that doMigrate, when the
// retryable Pod filter rejects the Pod, returns no error and a non-zero
// RequeueAfter, and that the stored job keeps an empty phase and reason.
func TestRequeueJobIfRetryablePodFilterFailed(t *testing.T) {
	ctx := context.TODO()
	reconciler := newTestReconciler()

	// The filter always rejects and records that it was reached; by then the
	// Pod must already be marked as prepare-migrating.
	filterCalled := false
	reconciler.retryablePodFilter = func(pod *corev1.Pod) bool {
		filterCalled = true
		assert.True(t, isPodPrepareMigrating(pod))
		return false
	}

	podRef := &corev1.ObjectReference{
		Namespace: "default",
		Name:      "test-pod",
	}
	job := &sev1alpha1.PodMigrationJob{
		ObjectMeta: metav1.ObjectMeta{
			Name:              "test",
			CreationTimestamp: metav1.Time{Time: time.Now()},
		},
		Spec: sev1alpha1.PodMigrationJobSpec{
			PodRef: podRef,
		},
	}
	assert.Nil(t, reconciler.Client.Create(ctx, job))

	owner := metav1.OwnerReference{
		APIVersion: "apps/v1",
		Controller: pointer.Bool(true),
		Kind:       "StatefulSet",
		Name:       "test",
		UID:        "2f96233d-a6b9-4981-b594-7c90c987aed9",
	}
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:       "default",
			Name:            "test-pod",
			OwnerReferences: []metav1.OwnerReference{owner},
		},
		Spec: corev1.PodSpec{
			SchedulerName: "koord-scheduler",
		},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
		},
	}
	assert.Nil(t, reconciler.Client.Create(ctx, pod))

	result, err := reconciler.doMigrate(ctx, job)
	assert.True(t, filterCalled)
	assert.NoError(t, err)
	assert.NotZero(t, result.RequeueAfter)

	// Re-read the job to check what was actually persisted.
	jobKey := types.NamespacedName{Name: job.Name}
	assert.NoError(t, reconciler.Client.Get(ctx, jobKey, job))
	assert.Equal(t, sev1alpha1.PodMigrationJobPhase(""), job.Status.Phase)
	assert.Equal(t, "", job.Status.Reason)
}

// TestAbortJobIfNonRetryablePodFilterFailed verifies that doMigrate, when the
// non-retryable Pod filter rejects the Pod, returns an error with an empty
// reconcile result, and that the stored job is marked Failed with reason
// PodMigrationJobReasonForbiddenMigratePod.
func TestAbortJobIfNonRetryablePodFilterFailed(t *testing.T) {
	ctx := context.TODO()
	reconciler := newTestReconciler()

	// The filter always rejects and records that it was reached; by then the
	// Pod must already be marked as prepare-migrating.
	filterCalled := false
	reconciler.nonRetryablePodFilter = func(pod *corev1.Pod) bool {
		filterCalled = true
		assert.True(t, isPodPrepareMigrating(pod))
		return false
	}

	podRef := &corev1.ObjectReference{
		Namespace: "default",
		Name:      "test-pod",
	}
	job := &sev1alpha1.PodMigrationJob{
		ObjectMeta: metav1.ObjectMeta{
			Name:              "test",
			CreationTimestamp: metav1.Time{Time: time.Now()},
		},
		Spec: sev1alpha1.PodMigrationJobSpec{
			PodRef: podRef,
		},
	}
	assert.Nil(t, reconciler.Client.Create(ctx, job))

	owner := metav1.OwnerReference{
		APIVersion: "apps/v1",
		Controller: pointer.Bool(true),
		Kind:       "StatefulSet",
		Name:       "test",
		UID:        "2f96233d-a6b9-4981-b594-7c90c987aed9",
	}
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:       "default",
			Name:            "test-pod",
			OwnerReferences: []metav1.OwnerReference{owner},
		},
		Spec: corev1.PodSpec{
			SchedulerName: "koord-scheduler",
		},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
		},
	}
	assert.Nil(t, reconciler.Client.Create(ctx, pod))

	result, err := reconciler.doMigrate(ctx, job)
	assert.True(t, filterCalled)
	assert.NotNil(t, err)
	assert.Equal(t, reconcile.Result{}, result)

	// Re-read the job to check what was actually persisted.
	jobKey := types.NamespacedName{Name: job.Name}
	assert.NoError(t, reconciler.Client.Get(ctx, jobKey, job))
	assert.Equal(t, sev1alpha1.PodMigrationJobFailed, job.Status.Phase)
	assert.Equal(t, sev1alpha1.PodMigrationJobReasonForbiddenMigratePod, job.Status.Reason)
}

func TestFilterExistingMigrationJob(t *testing.T) {
reconciler := newTestReconciler()

Expand Down Expand Up @@ -1760,6 +1651,7 @@ func TestFilterMaxMigratingPerNode(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("test-%d", i),
CreationTimestamp: metav1.Time{Time: time.Now()},
Annotations: map[string]string{arbitrator.AnnotationPassedArbitration: "true"},
},
Spec: sev1alpha1.PodMigrationJobSpec{
PodRef: &corev1.ObjectReference{
Expand Down Expand Up @@ -1909,6 +1801,7 @@ func TestFilterMaxMigratingPerNamespace(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("test-%d", i),
CreationTimestamp: metav1.Time{Time: time.Now()},
Annotations: map[string]string{arbitrator.AnnotationPassedArbitration: "true"},
},
Spec: sev1alpha1.PodMigrationJobSpec{
PodRef: &corev1.ObjectReference{
Expand Down Expand Up @@ -2096,6 +1989,7 @@ func TestFilterMaxMigratingPerWorkload(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("test-%d", i),
CreationTimestamp: metav1.Time{Time: time.Now()},
Annotations: map[string]string{arbitrator.AnnotationPassedArbitration: "true"},
},
Spec: sev1alpha1.PodMigrationJobSpec{
PodRef: &corev1.ObjectReference{
Expand Down Expand Up @@ -2326,6 +2220,7 @@ func TestFilterMaxUnavailablePerWorkload(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("test-%d", i),
CreationTimestamp: metav1.Time{Time: time.Now()},
Annotations: map[string]string{arbitrator.AnnotationPassedArbitration: "true"},
},
Spec: sev1alpha1.PodMigrationJobSpec{
PodRef: &corev1.ObjectReference{
Expand Down
5 changes: 0 additions & 5 deletions pkg/descheduler/controllers/migration/evict.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,6 @@ func (r *Reconciler) Evict(ctx context.Context, pod *corev1.Pod, evictOptions fr
return true
}

if !r.Filter(pod) {
klog.Errorf("Pod %q cannot be evicted since failed to filter", klog.KObj(pod))
return false
}

err := CreatePodMigrationJob(ctx, pod, evictOptions, r.Client, r.args)
return err == nil
}
Expand Down
Loading