Skip to content

Commit

Permalink
Merge pull request #43 from almaslennikov/stop-reboot-loops
Browse files Browse the repository at this point in the history
fix: validate if nv config update applied after reboot
  • Loading branch information
almaslennikov authored Nov 8, 2024
2 parents de7d739 + 158b4f0 commit b5c3f52
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 5 deletions.
5 changes: 4 additions & 1 deletion cmd/nic-configuration-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,10 @@ func main() {
}
}

eventRecorder := mgr.GetEventRecorderFor("NicDeviceReconciler")

hostUtils := host.NewHostUtils()
hostManager := host.NewHostManager(nodeName, hostUtils, mgr.GetEventRecorderFor("NicDeviceReconciler"))
hostManager := host.NewHostManager(nodeName, hostUtils, eventRecorder)
maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace)

if err := initNicFwMap(namespace); err != nil {
Expand All @@ -90,6 +92,7 @@ func main() {
NamespaceName: namespace,
HostManager: hostManager,
MaintenanceManager: maintenanceManager,
EventRecorder: eventRecorder,
}
err = nicDeviceReconciler.SetupWithManager(mgr, true)
if err != nil {
Expand Down
54 changes: 51 additions & 3 deletions internal/controller/nicdevice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@ package controller

import (
"context"
"errors"
"sync"
"time"

v1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"

maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -55,7 +59,10 @@ type NicDeviceReconciler struct {
NamespaceName string

HostManager host.HostManager
HostUtils host.HostUtils
MaintenanceManager maintenance.MaintenanceManager

EventRecorder record.EventRecorder
}

type nicDeviceConfigurationStatuses []*nicDeviceConfigurationStatus
Expand Down Expand Up @@ -440,16 +447,57 @@ func (r *NicDeviceReconciler) handleSpecValidation(ctx context.Context, statuses
}
}
}

status.nvConfigUpdateRequired = nvConfigUpdateRequired
status.rebootRequired = rebootRequired

if nvConfigUpdateRequired {
log.Log.V(2).Info("update started for device", "device", status.device.Name)
err = r.updateDeviceStatusCondition(ctx, status.device, consts.UpdateStartedReason, metav1.ConditionTrue, "")
if err != nil {
status.lastStageError = err
}
}
} else if rebootRequired {
// There might be a case where FW config didn't apply after a reboot because of some error in FW. In this case
// we don't want the node to be kept in a reboot loop (FW configured -> reboot -> Config was not applied -> FW configured -> etc.).
// To break the reboot loop, we should compare the last time the status was changed to PendingReboot to the node's uptime.
// If the node started after the status was changed, we assume the node was rebooted and the config couldn't apply.
// In this case, we indicate the error to the user with the status change and emit an error event.
statusCondition := meta.FindStatusCondition(status.device.Status.Conditions, consts.ConfigUpdateInProgressCondition)
if statusCondition == nil {
return
}

status.nvConfigUpdateRequired = nvConfigUpdateRequired
status.rebootRequired = rebootRequired
switch statusCondition.Reason {
case consts.PendingRebootReason:
// We need to determine, whether a reboot has happened since the PendingReboot status has been set
uptime, err := r.HostUtils.GetHostUptimeSeconds()
if err != nil {
status.lastStageError = err
return
}

sinceStatusUpdate := time.Since(statusCondition.LastTransitionTime.Time)

// If more time has passed since boot than since the status update, the reboot hasn't happened yet
if uptime > sinceStatusUpdate {
return
}

log.Log.Info("nv config failed to update after reboot for device", "device", status.device.Name)
r.EventRecorder.Event(status.device, v1.EventTypeWarning, consts.FirmwareError, consts.FwConfigNotAppliedAfterRebootErrorMsg)
err = r.updateDeviceStatusCondition(ctx, status.device, consts.FirmwareError, metav1.ConditionFalse, consts.FwConfigNotAppliedAfterRebootErrorMsg)
if err != nil {
status.lastStageError = err
}

fallthrough
case consts.FirmwareError:
status.nvConfigUpdateRequired = false
status.rebootRequired = false
status.lastStageError = errors.New(consts.FwConfigNotAppliedAfterRebootErrorMsg)
}
}
}(i)
}

Expand Down
84 changes: 83 additions & 1 deletion internal/controller/nicdevice_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"sync"
"time"

"k8s.io/apimachinery/pkg/api/meta"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/stretchr/testify/mock"
Expand Down Expand Up @@ -52,6 +54,7 @@ var _ = Describe("NicDeviceReconciler", func() {
reconciler *NicDeviceReconciler
hostManager *hostMocks.HostManager
maintenanceManager *maintenanceMocks.MaintenanceManager
hostUtils *hostMocks.HostUtils
nodeName = "test-node"
deviceName = "test-device"
ctx context.Context
Expand Down Expand Up @@ -99,6 +102,7 @@ var _ = Describe("NicDeviceReconciler", func() {
deviceDiscoveryReconcileTime = 1 * time.Second
hostManager = &hostMocks.HostManager{}
maintenanceManager = &maintenanceMocks.MaintenanceManager{}
hostUtils = &hostMocks.HostUtils{}

reconciler = &NicDeviceReconciler{
Client: mgr.GetClient(),
Expand All @@ -107,6 +111,8 @@ var _ = Describe("NicDeviceReconciler", func() {
NamespaceName: namespaceName,
HostManager: hostManager,
MaintenanceManager: maintenanceManager,
HostUtils: hostUtils,
EventRecorder: mgr.GetEventRecorderFor("testReconciler"),
}
Expect(reconciler.SetupWithManager(mgr, false)).To(Succeed())
})
Expand Down Expand Up @@ -212,7 +218,7 @@ var _ = Describe("NicDeviceReconciler", func() {
})

Describe("reconcile a single device", func() {
var createDevice = func(setLastSpecAnnotation bool) {
var createDevice = func(setLastSpecAnnotation bool) *v1alpha1.NicDevice {
device := &v1alpha1.NicDevice{
ObjectMeta: metav1.ObjectMeta{Name: deviceName, Namespace: namespaceName},
Spec: v1alpha1.NicDeviceSpec{
Expand Down Expand Up @@ -245,6 +251,8 @@ var _ = Describe("NicDeviceReconciler", func() {
}},
}
Expect(k8sClient.Status().Update(ctx, device)).To(Succeed())

return device
}

It("Should not reconcile device not from its node", func() {
Expand Down Expand Up @@ -512,6 +520,80 @@ var _ = Describe("NicDeviceReconciler", func() {
hostManager.AssertExpectations(GinkgoT())
maintenanceManager.AssertExpectations(GinkgoT())
})
It("Should not request another reboot if nv config failed to apply after the first one", func() {
hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, true, nil)
hostUtils.On("GetHostUptimeSeconds").Return(time.Second*0, nil)

device := createDevice(false)

cond := metav1.Condition{
Type: consts.ConfigUpdateInProgressCondition,
Status: metav1.ConditionTrue,
ObservedGeneration: device.Generation,
Reason: consts.PendingRebootReason,
Message: "",
}
meta.SetStatusCondition(&device.Status.Conditions, cond)
Expect(k8sClient.Status().Update(ctx, device)).To(Succeed())

startManager()

Eventually(func() []metav1.Condition {
device := &v1alpha1.NicDevice{}
Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed())
return device.Status.Conditions
}, timeout).Should(testutils.MatchCondition(metav1.Condition{
Type: consts.ConfigUpdateInProgressCondition,
Status: metav1.ConditionFalse,
Reason: consts.FirmwareError,
Message: consts.FwConfigNotAppliedAfterRebootErrorMsg,
}))

hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceRuntimeSpec", mock.Anything)
maintenanceManager.AssertNotCalled(GinkgoT(), "ScheduleMaintenance", mock.Anything)
maintenanceManager.AssertNotCalled(GinkgoT(), "MaintenanceAllowed", mock.Anything)
hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceNvSpec", mock.Anything, mock.Anything)
maintenanceManager.AssertNotCalled(GinkgoT(), "Reboot")
maintenanceManager.AssertNotCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything)
hostManager.AssertExpectations(GinkgoT())
maintenanceManager.AssertExpectations(GinkgoT())
})

It("Should not fail on an not applied nv config when reboot hasn't happened yet", func() {
hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, true, nil)
hostUtils.On("GetHostUptimeSeconds").Return(time.Second*1000, nil)

device := createDevice(false)

cond := metav1.Condition{
Type: consts.ConfigUpdateInProgressCondition,
Status: metav1.ConditionTrue,
ObservedGeneration: device.Generation,
Reason: consts.PendingRebootReason,
Message: "",
}
meta.SetStatusCondition(&device.Status.Conditions, cond)
Expect(k8sClient.Status().Update(ctx, device)).To(Succeed())

startManager()

Consistently(func() []metav1.Condition {
device := &v1alpha1.NicDevice{}
Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed())
return device.Status.Conditions
}, timeout).ShouldNot(testutils.MatchCondition(metav1.Condition{
Type: consts.ConfigUpdateInProgressCondition,
Status: metav1.ConditionFalse,
Reason: consts.FirmwareError,
Message: consts.FwConfigNotAppliedAfterRebootErrorMsg,
}))

hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceRuntimeSpec", mock.Anything)
hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceNvSpec", mock.Anything, mock.Anything)
maintenanceManager.AssertNotCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything)
hostManager.AssertExpectations(GinkgoT())
maintenanceManager.AssertExpectations(GinkgoT())
})
})
Describe("reconcile multiple devices", func() {
var (
Expand Down
3 changes: 3 additions & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const (
RuntimeConfigUpdateFailedReason = "RuntimeConfigUpdateFailed"
UpdateSuccessfulReason = "UpdateSuccessful"
SpecValidationFailed = "SpecValidationFailed"
FirmwareError = "FirmwareError"

DeviceConfigSpecEmptyReason = "DeviceConfigSpecEmpty"
DeviceFwMatchReason = "DeviceFirmwareConfigMatch"
Expand Down Expand Up @@ -79,4 +80,6 @@ const (

SupportedNicFirmwareConfigmap = "supported-nic-firmware"
Mlx5ModuleVersionPath = "/sys/bus/pci/drivers/mlx5_core/module/version"

FwConfigNotAppliedAfterRebootErrorMsg = "firmware configuration failed to apply after reboot"
)
30 changes: 30 additions & 0 deletions pkg/host/mocks/HostUtils.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions pkg/host/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"strconv"
"strings"
"syscall"
"time"

"github.com/Mellanox/rdmamap"
"github.com/jaypipes/ghw"
Expand Down Expand Up @@ -80,6 +81,8 @@ type HostUtils interface {
ScheduleReboot() error
// GetOfedVersion retrieves installed OFED version
GetOfedVersion() string
// GetHostUptimeSeconds returns the host uptime in seconds
GetHostUptimeSeconds() (time.Duration, error)
}

type hostUtils struct {
Expand Down Expand Up @@ -630,6 +633,20 @@ func (h *hostUtils) GetOfedVersion() string {
return version
}

// GetHostUptimeSeconds returns the host uptime in seconds
func (h *hostUtils) GetHostUptimeSeconds() (time.Duration, error) {
log.Log.V(2).Info("HostUtils.GetHostUptimeSeconds()")
output, err := os.ReadFile("/proc/uptime")
if err != nil {
log.Log.Error(err, "HostUtils.GetHostUptimeSeconds(): failed to read the system's uptime")
return 0, err
}
uptimeStr := strings.Split(string(output), " ")[0]
uptimeSeconds, _ := strconv.ParseFloat(uptimeStr, 64)

return time.Duration(uptimeSeconds) * time.Second, nil
}

func NewHostUtils() HostUtils {
return &hostUtils{execInterface: execUtils.New()}
}

0 comments on commit b5c3f52

Please sign in to comment.