Skip to content

Commit

Permalink
Merge pull request #808 from ykulazhenkov/pr-systemd-wait-for-devices
Browse files Browse the repository at this point in the history
Add waitForDevicesInitialization to systemd service
  • Loading branch information
e0ne authored Nov 26, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
2 parents 4185aa4 + 8d4ae20 commit 71d2d16
Showing 7 changed files with 131 additions and 0 deletions.
57 changes: 57 additions & 0 deletions cmd/sriov-network-config-daemon/service.go
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@ import (
"errors"
"fmt"
"os"
"time"

"github.com/go-logr/logr"
"github.com/spf13/cobra"
@@ -40,6 +41,12 @@ import (
const (
PhasePre = "pre"
PhasePost = "post"

// InitializationDeviceDiscoveryTimeoutSec constant defines the number of
// seconds to wait for devices to be registered in the system with the expected name.
InitializationDeviceDiscoveryTimeoutSec = 60
// InitializationDeviceUdevProcessingTimeoutSec constant defines the number of seconds to wait for udev rules to process
InitializationDeviceUdevProcessingTimeoutSec = 60
)

var (
@@ -104,6 +111,8 @@ func runServiceCmd(cmd *cobra.Command, args []string) error {
return updateSriovResultErr(setupLog, phaseArg, fmt.Errorf("failed to create hostHelpers: %v", err))
}

waitForDevicesInitialization(setupLog, sriovConf, hostHelpers)

if phaseArg == PhasePre {
err = phasePre(setupLog, sriovConf, hostHelpers)
} else {
@@ -303,3 +312,51 @@ func updateResult(setupLog logr.Logger, result, msg string) error {
setupLog.V(0).Info("result file updated", "SyncStatus", sriovResult.SyncStatus, "LastSyncError", msg)
return nil
}

// waitForDevicesInitialization should be executed in both the pre and post-networking stages.
// This function ensures that the network devices specified in the configuration are registered
// and handled by UDEV. Sometimes, the initialization of network devices might take a significant
// amount of time, and the sriov-config systemd service may start before the devices are fully
// processed, leading to failure.
//
// To address this, we not only check if the devices are registered with the correct name but also
// wait for the udev event queue to empty. This increases the likelihood that the service will start
// only when the devices are fully initialized. It is required to call this function in the
// "post-networking" phase as well because the OS network manager might change device configurations,
// and we need to ensure these changes are fully processed before starting the post-networking part.
//
// The timeouts used in this function are intentionally kept low to avoid blocking the OS loading
// process for too long in case of any issues.
//
// Note: Currently, this function handles only Baremetal clusters. We do not have evidence that
// this logic is required for virtual clusters.
func waitForDevicesInitialization(setupLog logr.Logger, conf *systemd.SriovConfig, hostHelpers helper.HostHelpersInterface) {
if conf.PlatformType != consts.Baremetal {
// skip waiting on virtual cluster
return
}
// wait for devices from the spec to be registered in the system with expected names
devicesToWait := make(map[string]string, len(conf.Spec.Interfaces))
for _, d := range conf.Spec.Interfaces {
devicesToWait[d.PciAddress] = d.Name
}
deadline := time.Now().Add(time.Second * time.Duration(InitializationDeviceDiscoveryTimeoutSec))
for time.Now().Before(deadline) {
for pciAddr, name := range devicesToWait {
if hostHelpers.TryGetInterfaceName(pciAddr) == name {
delete(devicesToWait, pciAddr)
}
}
if len(devicesToWait) == 0 {
break
}
time.Sleep(time.Second)
}
if len(devicesToWait) != 0 {
setupLog.Info("WARNING: some devices were not initialized", "devices", devicesToWait, "timeout", InitializationDeviceDiscoveryTimeoutSec)
}
if err := hostHelpers.WaitUdevEventsProcessed(InitializationDeviceUdevProcessingTimeoutSec); err != nil {
setupLog.Info("WARNING: failed to wait for udev events processing", "reason", err.Error(),
"timeout", InitializationDeviceUdevProcessingTimeoutSec)
}
}
20 changes: 20 additions & 0 deletions cmd/sriov-network-config-daemon/service_test.go
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@ package main
import (
"fmt"

"github.com/go-logr/logr"
"github.com/golang/mock/gomock"
"github.com/spf13/cobra"
"gopkg.in/yaml.v3"
@@ -158,6 +159,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil)
hostHelpers.EXPECT().TryEnableTun().Return()
hostHelpers.EXPECT().TryEnableVhostNet().Return()
@@ -211,6 +214,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": []byte("something"),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().CheckRDMAEnabled().Return(true, nil)
hostHelpers.EXPECT().TryEnableTun().Return()
hostHelpers.EXPECT().TryEnableVhostNet().Return()
@@ -236,6 +241,8 @@ var _ = Describe("Service", func() {
"/etc/sriov-operator/sriov-interface-result.yaml": getTestResultFileContent("InProgress", ""),
},
})
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("enp216s0f0np0")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
hostHelpers.EXPECT().DiscoverSriovDevices(hostHelpers).Return([]sriovnetworkv1.InterfaceExt{{
Name: "enp216s0f0np0",
}}, nil)
@@ -276,4 +283,17 @@ var _ = Describe("Service", func() {
testHelpers.GinkgoAssertFileContentsEquals("/etc/sriov-operator/sriov-interface-result.yaml",
string(getTestResultFileContent("Failed", "post: unexpected result of the pre phase: Failed, syncError: pretest")))
})
It("waitForDevicesInitialization", func() {
cfg := &systemd.SriovConfig{Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{
Interfaces: []sriovnetworkv1.Interface{
{Name: "name1", PciAddress: "0000:d8:00.0"},
{Name: "name2", PciAddress: "0000:d8:00.1"}}}}
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("other")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.0").Return("name1")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("")
hostHelpers.EXPECT().TryGetInterfaceName("0000:d8:00.1").Return("name2")
hostHelpers.EXPECT().WaitUdevEventsProcessed(60).Return(nil)
waitForDevicesInitialization(logr.Discard(), cfg, hostHelpers)
})
})
14 changes: 14 additions & 0 deletions pkg/helper/mock/mock_helper.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pkg/host/internal/udev/udev.go
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ import (
"os"
"path"
"path/filepath"
"strconv"
"strings"

"sigs.k8s.io/controller-runtime/pkg/log"
@@ -126,6 +127,18 @@ func (u *udev) LoadUdevRules() error {
return nil
}

// WaitUdevEventsProcessed calls `udevadm settle“ with provided timeout
// The command watches the udev event queue, and exits if all current events are handled.
func (u *udev) WaitUdevEventsProcessed(timeout int) error {
log.Log.V(2).Info("WaitUdevEventsProcessed()")
_, stderr, err := u.utilsHelper.RunCommand("udevadm", "settle", "-t", strconv.Itoa(timeout))
if err != nil {
log.Log.Error(err, "WaitUdevEventsProcessed(): failed to wait for udev rules to process", "error", stderr, "timeout", timeout)
return err
}
return nil
}

func (u *udev) addUdevRule(pfPciAddress, ruleName, ruleContent string) error {
log.Log.V(2).Info("addUdevRule()", "device", pfPciAddress, "rule", ruleName)
rulePath := u.getRuleFolderPath()
10 changes: 10 additions & 0 deletions pkg/host/internal/udev/udev_test.go
Original file line number Diff line number Diff line change
@@ -210,4 +210,14 @@ var _ = Describe("UDEV", func() {
Expect(s.LoadUdevRules()).To(MatchError(testError))
})
})
Context("WaitUdevEventsProcessed", func() {
It("Succeed", func() {
utilsMock.EXPECT().RunCommand("udevadm", "settle", "-t", "10").Return("", "", nil)
Expect(s.WaitUdevEventsProcessed(10)).NotTo(HaveOccurred())
})
It("Command Failed", func() {
utilsMock.EXPECT().RunCommand("udevadm", "settle", "-t", "20").Return("", "", testError)
Expect(s.WaitUdevEventsProcessed(20)).To(MatchError(testError))
})
})
})
14 changes: 14 additions & 0 deletions pkg/host/mock/mock_host.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pkg/host/types/interfaces.go
Original file line number Diff line number Diff line change
@@ -164,6 +164,9 @@ type UdevInterface interface {
RemoveVfRepresentorUdevRule(pfPciAddress string) error
// LoadUdevRules triggers udev rules for network subsystem
LoadUdevRules() error
// WaitUdevEventsProcessed calls `udevadm settle“ with provided timeout
// The command watches the udev event queue, and exits if all current events are handled.
WaitUdevEventsProcessed(timeout int) error
}

type VdpaInterface interface {

0 comments on commit 71d2d16

Please sign in to comment.