Skip to content

Commit

Permalink
fix: preserve NVIPAM CM after upgrade
Browse files Browse the repository at this point in the history
In order to allow migration from config map
to IP Pool CRs in NV IPAM, the config map needs to be kept.

- Removing the state label will make sure it is not deleted as
a stale object.

- Remove the helm hook annotation on NiClusterPolicy CR template,
  that causes delete and recreate of CR and the delete-cr hook.
  Note that this will prevent CR validation in install stage.

- Make sure to stop existing Network Operator before updating CR,
  so that it does not delete the config map since config is removed.

Signed-off-by: Fred Rolland <[email protected]>
(cherry picked from commit fc6aca4)
  • Loading branch information
rollandf authored and e0ne committed Nov 16, 2023
1 parent fac2376 commit 1295f17
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ apiVersion: mellanox.com/v1alpha1
kind: NicClusterPolicy
metadata:
name: nic-cluster-policy
annotations:
helm.sh/hook: post-install,post-upgrade
spec:
{{- if .Values.nodeAffinity }}
nodeAffinity:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,54 +1,59 @@
{{- if .Values.deployCR }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-sa
name: {{ include "network-operator.fullname" . }}-hooks-scale-sa
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-role
name: {{ include "network-operator.fullname" . }}-hooks-scale-role
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
rules:
- apiGroups:
- mellanox.com
- apps
resources:
- nicclusterpolicies
- deployments/scale
verbs:
- delete
- "*"
- apiGroups:
- ""
resources:
- pods
verbs:
- "*"
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-binding
name: {{ include "network-operator.fullname" . }}-hooks-scale-binding
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
subjects:
- kind: ServiceAccount
name: {{ include "network-operator.fullname" . }}-hooks-sa
name: {{ include "network-operator.fullname" . }}-hooks-scale-sa
namespace: {{ .Release.Namespace }}
roleRef:
kind: ClusterRole
name: {{ include "network-operator.fullname" . }}-hooks-role
name: {{ include "network-operator.fullname" . }}-hooks-scale-role
apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: Job
metadata:
name: network-operator-delete-cr
name: network-operator-scale-down
namespace: {{ .Release.Namespace }}
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "1"
"helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
labels:
Expand All @@ -57,7 +62,7 @@ metadata:
spec:
template:
metadata:
name: network-operator-delete-cr
name: network-operator-scale-down
labels:
{{- include "network-operator.labels" . | nindent 8 }}
app.kubernetes.io/component: "network-operator"
Expand All @@ -74,16 +79,16 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "network-operator.fullname" . }}-hooks-sa
serviceAccountName: {{ include "network-operator.fullname" . }}-hooks-scale-sa
imagePullSecrets: {{ include "network-operator.operator.imagePullSecrets" . }}
containers:
- name: delete-cr
- name: scale-down
image: "{{ .Values.operator.repository }}/{{ .Values.operator.image }}:{{ .Values.operator.tag | default .Chart.AppVersion }}"
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
- >
kubectl delete nicclusterpolicies.mellanox.com nic-cluster-policy;
kubectl scale -n {{ .Release.Namespace }} deployment {{ include "network-operator.fullname" . }} --replicas=0 || true;
kubectl wait -n {{ .Release.Namespace }} pod -l control-plane={{ .Release.Name }}-controller --for=delete --timeout=30s;
restartPolicy: OnFailure
{{- end }}
46 changes: 46 additions & 0 deletions pkg/migrate/migrate.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@ package migrate

import (
"context"
"fmt"

"github.com/go-logr/logr"
v1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apiErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/Mellanox/network-operator/pkg/config"
Expand All @@ -46,6 +49,11 @@ func migrate(ctx context.Context, log logr.Logger, c client.Client) error {
// not critical for the operator operation, safer to ignore
log.V(consts.LogLevelWarning).Info("ignore error during whereabouts CronJob removal")
}
if err := removeStateLabelFromNVIpamConfigMap(ctx, log, c); err != nil {
// critical for the operator operation, will fail NVIPAM migration
log.V(consts.LogLevelError).Error(err, "error trying to remove state label on NV IPAM configmap")
return err
}
return nil
}

Expand All @@ -68,3 +76,41 @@ func removeWhereaboutsIPReconcileCronJob(ctx context.Context, log logr.Logger, c
}
return nil
}

// reason: remove state label on NV IPAM config map if exists, to allow migration to IPPool CR
// If the state label is present, the config map will be removed by the NCP Controller as a stale object
func removeStateLabelFromNVIpamConfigMap(ctx context.Context, log logr.Logger, c client.Client) error {
namespace := config.FromEnv().State.NetworkOperatorResourceNamespace
cmName := "nvidia-k8s-ipam-config"
cfg := &corev1.ConfigMap{}
key := types.NamespacedName{
Name: cmName,
Namespace: namespace,
}
err := c.Get(ctx, key, cfg)
if apiErrors.IsNotFound(err) {
log.V(consts.LogLevelDebug).Info("NVIPAM config map not found, skip remove state label")
return nil
} else if err != nil {
log.V(consts.LogLevelError).Error(err, "fail to get NVIPAM configmap")
return err
}
_, ok := cfg.Labels[consts.StateLabel]
if ok {
log.V(consts.LogLevelDebug).Info("clear State label from NVIPAM configmap")
patch := []byte(fmt.Sprintf("[{\"op\": \"remove\", \"path\": \"/metadata/labels/%s\"}]", consts.StateLabel))
err = c.Patch(ctx, &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: cmName,
Namespace: namespace,
},
}, client.RawPatch(types.JSONPatchType, patch))
if err != nil {
log.V(consts.LogLevelError).Error(err, "fail to remove State label from NVIPAM configmap")
return err
}
} else {
log.V(consts.LogLevelDebug).Info("state label not set on NVIPAM configmap")
}
return nil
}

0 comments on commit 1295f17

Please sign in to comment.