Skip to content

Commit

Permalink
fix: preserve NVIPAM CM after upgrade
Browse files Browse the repository at this point in the history
In order to allow migration from config map
to IP Pool CRs in NV IPAM, the config map needs to be kept.

- Removing the state label will make sure it is not deleted as
a stale object.

- Remove the helm hook annotation on NiClusterPolicy CR template,
  that causes delete and recreate of CR and the delete-cr hook.
  Note that this will prevent CR validation in install stage.

- Make sure to stop existing Network Operator before updating CR,
  so that it does not delete the config map since config is removed.

Signed-off-by: Fred Rolland <[email protected]>
  • Loading branch information
rollandf committed Nov 14, 2023
1 parent 24db15a commit 6dc56e7
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ apiVersion: mellanox.com/v1alpha1
kind: NicClusterPolicy
metadata:
name: nic-cluster-policy
annotations:
helm.sh/hook: post-install,post-upgrade
spec:
{{- if .Values.nodeAffinity }}
nodeAffinity:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,54 +1,59 @@
{{- if .Values.deployCR }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-sa
name: {{ include "network-operator.fullname" . }}-hooks-scale-sa
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-role
name: {{ include "network-operator.fullname" . }}-hooks-scale-role
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
rules:
- apiGroups:
- mellanox.com
- apps
resources:
- nicclusterpolicies
- deployments/scale
verbs:
- delete
- "*"
- apiGroups:
- ""
resources:
- pods
verbs:
- "*"
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{ include "network-operator.fullname" . }}-hooks-binding
name: {{ include "network-operator.fullname" . }}-hooks-scale-binding
annotations:
helm.sh/hook: post-delete
helm.sh/hook: pre-install,pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
subjects:
- kind: ServiceAccount
name: {{ include "network-operator.fullname" . }}-hooks-sa
name: {{ include "network-operator.fullname" . }}-hooks-scale-sa
namespace: {{ .Release.Namespace }}
roleRef:
kind: ClusterRole
name: {{ include "network-operator.fullname" . }}-hooks-role
name: {{ include "network-operator.fullname" . }}-hooks-scale-role
apiGroup: rbac.authorization.k8s.io
---
apiVersion: batch/v1
kind: Job
metadata:
name: network-operator-delete-cr
name: network-operator-scale-down
namespace: {{ .Release.Namespace }}
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "1"
"helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
labels:
Expand All @@ -57,7 +62,7 @@ metadata:
spec:
template:
metadata:
name: network-operator-delete-cr
name: network-operator-scale-down
labels:
{{- include "network-operator.labels" . | nindent 8 }}
app.kubernetes.io/component: "network-operator"
Expand All @@ -74,16 +79,16 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "network-operator.fullname" . }}-hooks-sa
serviceAccountName: {{ include "network-operator.fullname" . }}-hooks-scale-sa
imagePullSecrets: {{ include "network-operator.operator.imagePullSecrets" . }}
containers:
- name: delete-cr
- name: scale-down
image: "{{ .Values.operator.repository }}/{{ .Values.operator.image }}:{{ .Values.operator.tag | default .Chart.AppVersion }}"
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
- >
kubectl delete nicclusterpolicies.mellanox.com nic-cluster-policy;
kubectl scale -n {{ .Release.Namespace }} deployment {{ include "network-operator.fullname" . }} --replicas=0 || true;
kubectl wait -n {{ .Release.Namespace }} pod -l control-plane={{ .Release.Name }}-controller --for=delete --timeout=30s;
restartPolicy: OnFailure
{{- end }}
37 changes: 37 additions & 0 deletions pkg/migrate/migrate.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@ package migrate

import (
"context"
"fmt"

"github.com/go-logr/logr"
v1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apiErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/Mellanox/network-operator/pkg/config"
Expand All @@ -46,6 +49,10 @@ func migrate(ctx context.Context, log logr.Logger, c client.Client) error {
// not critical for the operator operation, safer to ignore
log.V(consts.LogLevelWarning).Info("ignore error during whereabouts CronJob removal")
}
if err := removeStateLabelFromNVIpamConfigMap(ctx, log, c); err != nil {
// not critical for the operator operation, safer to ignore
log.V(consts.LogLevelWarning).Info("ignore error during removing state label on NV IPAM configmap")
}
return nil
}

Expand All @@ -68,3 +75,33 @@ func removeWhereaboutsIPReconcileCronJob(ctx context.Context, log logr.Logger, c
}
return nil
}

// reason: remove state label on NV IPAM config map if exists, to allow migration to IPPool CR
// If the state label is present, the config map will be removed by the NCP Controller as a stale object
func removeStateLabelFromNVIpamConfigMap(ctx context.Context, log logr.Logger, c client.Client) error {
namespace := config.FromEnv().State.NetworkOperatorResourceNamespace
cmName := "nvidia-k8s-ipam-config"
cfg := &corev1.ConfigMap{}
key := types.NamespacedName{
Name: cmName,
Namespace: namespace,
}
err := c.Get(ctx, key, cfg)
if apiErrors.IsNotFound(err) {
log.V(consts.LogLevelDebug).Info("NVIPAM config map not found, skip remove state label")
return nil
}
log.V(consts.LogLevelDebug).Info("clear State label from NVIPAM configmap")
patch := []byte(fmt.Sprintf("[{\"op\": \"remove\", \"path\": \"/metadata/labels/%s\"}]", consts.StateLabel))
err = c.Patch(ctx, &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: cmName,
Namespace: namespace,
},
}, client.RawPatch(types.JSONPatchType, patch))
if err != nil {
log.V(consts.LogLevelError).Error(err, "fail to remove State label from NVIPAM configmap")
return err
}
return nil
}

0 comments on commit 6dc56e7

Please sign in to comment.