Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: preserve NVIPAM CM after upgrade #672

Merged
merged 1 commit into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ apiVersion: mellanox.com/v1alpha1
kind: NicClusterPolicy
metadata:
name: nic-cluster-policy
annotations:
helm.sh/hook: post-install,post-upgrade
adrianchiris marked this conversation as resolved.
Show resolved Hide resolved
spec:
{{- if .Values.nodeAffinity }}
nodeAffinity:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,54 +1,59 @@
{{- if .Values.deployCR }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-sa
name: {{ include "network-operator.fullname" . }}-hooks-scale-sa
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-role
name: {{ include "network-operator.fullname" . }}-hooks-scale-role
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
rules:
- apiGroups:
- mellanox.com
- apps
resources:
- nicclusterpolicies
- deployments/scale
verbs:
- delete
- "*"
- apiGroups:
- ""
resources:
- pods
verbs:
- "*"
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-binding
name: {{ include "network-operator.fullname" . }}-hooks-scale-binding
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
subjects:
- kind: ServiceAccount
name: {{ include "network-operator.fullname" . }}-hooks-sa
name: {{ include "network-operator.fullname" . }}-hooks-scale-sa
namespace: {{ .Release.Namespace }}
roleRef:
kind: ClusterRole
name: {{ include "network-operator.fullname" . }}-hooks-role
name: {{ include "network-operator.fullname" . }}-hooks-scale-role
apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: Job
metadata:
name: network-operator-delete-cr
name: network-operator-scale-down
namespace: {{ .Release.Namespace }}
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "1"
"helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
labels:
Expand All @@ -57,7 +62,7 @@ metadata:
spec:
template:
metadata:
name: network-operator-delete-cr
name: network-operator-scale-down
labels:
{{- include "network-operator.labels" . | nindent 8 }}
app.kubernetes.io/component: "network-operator"
Expand All @@ -74,16 +79,16 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "network-operator.fullname" . }}-hooks-sa
serviceAccountName: {{ include "network-operator.fullname" . }}-hooks-scale-sa
imagePullSecrets: {{ include "network-operator.operator.imagePullSecrets" . }}
containers:
- name: delete-cr
- name: scale-down
image: "{{ .Values.operator.repository }}/{{ .Values.operator.image }}:{{ .Values.operator.tag | default .Chart.AppVersion }}"
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
- >
kubectl delete nicclusterpolicies.mellanox.com nic-cluster-policy;
kubectl scale -n {{ .Release.Namespace }} deployment {{ include "network-operator.fullname" . }} --replicas=0 || true;
kubectl wait -n {{ .Release.Namespace }} pod -l control-plane={{ .Release.Name }}-controller --for=delete --timeout=30s;
restartPolicy: OnFailure
{{- end }}
46 changes: 46 additions & 0 deletions pkg/migrate/migrate.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@ package migrate

import (
"context"
"fmt"

"github.com/go-logr/logr"
v1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apiErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/Mellanox/network-operator/pkg/config"
Expand All @@ -46,6 +49,11 @@ func migrate(ctx context.Context, log logr.Logger, c client.Client) error {
// not critical for the operator operation, safer to ignore
log.V(consts.LogLevelWarning).Info("ignore error during whereabouts CronJob removal")
}
if err := removeStateLabelFromNVIpamConfigMap(ctx, log, c); err != nil {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to return error if we ignore it? We've got logging inside removeStateLabelFromNVIpamConfigMap so no need to return an error

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I followed the same pattern as removeWhereaboutsIPReconcileCronJob.
WDYT?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should keep it as is. removeStateLabelFromNVIpamConfigMap function do the action and an error can happen during this action - it is fair to return the error from the function. It is up to caller to decide what to do with the error.

Copy link
Collaborator

@ykulazhenkov ykulazhenkov Nov 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO this should not block the PR

Copy link
Collaborator

@ykulazhenkov ykulazhenkov Nov 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I have one more thing. It is pretty safe to run removeWhereaboutsIPReconcileCronJob in a "best effort" mode and ignore any error if it happens (we will have an unneeded Job in the cluster, but this will not break anything). But with removeStateLabelFromNVIpamConfigMap, we are in a bit different situation - if we fail to remove the label, we will probably lose all nv-ipam allocations. Maybe we should crash in this case and rely on k8s to restart the Pod? WDYT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, returning error to main.

// critical for the operator operation, will fail NVIPAM migration
log.V(consts.LogLevelError).Error(err, "error trying to remove state label on NV IPAM configmap")
return err
}
return nil
}

Expand All @@ -68,3 +76,41 @@ func removeWhereaboutsIPReconcileCronJob(ctx context.Context, log logr.Logger, c
}
return nil
}

// reason: remove state label on NV IPAM config map if exists, to allow migration to IPPool CR
// If the state label is present, the config map will be removed by the NCP Controller as a stale object
func removeStateLabelFromNVIpamConfigMap(ctx context.Context, log logr.Logger, c client.Client) error {
namespace := config.FromEnv().State.NetworkOperatorResourceNamespace
cmName := "nvidia-k8s-ipam-config"
cfg := &corev1.ConfigMap{}
key := types.NamespacedName{
Name: cmName,
Namespace: namespace,
}
err := c.Get(ctx, key, cfg)
if apiErrors.IsNotFound(err) {
log.V(consts.LogLevelDebug).Info("NVIPAM config map not found, skip remove state label")
return nil
} else if err != nil {
log.V(consts.LogLevelError).Error(err, "fail to get NVIPAM configmap")
return err
}
_, ok := cfg.Labels[consts.StateLabel]
if ok {
log.V(consts.LogLevelDebug).Info("clear State label from NVIPAM configmap")
patch := []byte(fmt.Sprintf("[{\"op\": \"remove\", \"path\": \"/metadata/labels/%s\"}]", consts.StateLabel))
err = c.Patch(ctx, &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: cmName,
Namespace: namespace,
},
}, client.RawPatch(types.JSONPatchType, patch))
if err != nil {
log.V(consts.LogLevelError).Error(err, "fail to remove State label from NVIPAM configmap")
return err
}
} else {
log.V(consts.LogLevelDebug).Info("state label not set on NVIPAM configmap")
}
return nil
}