Skip to content

Commit

Permalink
E2E: slim the graceful restart test
Browse files Browse the repository at this point in the history
This e2e test flakes often, and especially when the github actions
are slow. Therefore we reduce the overall peering to make the test
require less resources. What this commit does:
1. GR test will only use peering from single node to ebgp-single-hop
(so we will run 4x less `docker exec` to grab the routes).
2. GR test will restart single frr-k8s pod and will monitor that single
route and wait only that pod to be healthy.
3. Given we target the GR between specific host and specific peer, we
know which logs to look in case of failure.

Signed-off-by: karampok <[email protected]>
  • Loading branch information
karampok committed Oct 21, 2024
1 parent 3fcf491 commit 52ffda7
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 41 deletions.
10 changes: 10 additions & 0 deletions e2etests/pkg/config/from_containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package config

import (
"fmt"
"net"

frrk8sv1beta1 "github.com/metallb/frr-k8s/api/v1beta1"
Expand Down Expand Up @@ -136,3 +137,12 @@ func ContainersForVRF(frrs []*frrcontainer.FRR, vrf string) []*frrcontainer.FRR
}
return res
}

func ContainerByName(frrs []*frrcontainer.FRR, name string) (*frrcontainer.FRR, error) {
for _, f := range frrs {
if f.Name == name {
return f, nil
}
}
return nil, fmt.Errorf("container with name %s not found", name)
}
47 changes: 25 additions & 22 deletions e2etests/pkg/k8s/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@ package k8s
import (
"context"
"fmt"
"slices"
"time"

"errors"

"github.com/onsi/ginkgo/v2"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -42,38 +40,43 @@ func FRRK8sPods(cs clientset.Interface) ([]*corev1.Pod, error) {
return res, nil
}

func RestartFRRK8sPods(cs clientset.Interface) error {
pods, err := FRRK8sPods(cs)
func RestartFRRK8sPod(cs clientset.Interface, nodeName string) error {
podForNode := func() (*corev1.Pod, error) {
pods, err := FRRK8sPods(cs)
if err != nil {
return nil, err
}
for _, p := range pods {
if p.Spec.NodeName == nodeName {
return p, nil
}
}
return nil, fmt.Errorf("no frr-k8s pod found on node %s", nodeName)
}

oldPod, err := podForNode()
if err != nil {
return err
}
oldNames := []string{}
for _, p := range pods {
oldNames = append(oldNames, p.Name)
err := cs.CoreV1().Pods(FRRK8sNamespace).Delete(context.Background(), p.Name, metav1.DeleteOptions{})
if err != nil {
return err
}
if err := cs.CoreV1().Pods(FRRK8sNamespace).Delete(context.Background(), oldPod.Name, metav1.DeleteOptions{}); err != nil {
return err
}

return wait.PollUntilContextTimeout(context.Background(), 10*time.Second,
120*time.Second, false, func(context.Context) (bool, error) {
npods, err := FRRK8sPods(cs)
newPod, err := podForNode()
if err != nil {
return false, err
}
if len(npods) != len(pods) {
return false, nil

if newPod.Name == oldPod.Name {
return false, fmt.Errorf("pod was not deleted")
}
for _, p := range npods {
if slices.Contains(oldNames, p.Name) {
return false, nil
}
if !podIsRunningAndReady(p) {
ginkgo.By(fmt.Sprintf("\t%v pod %s not ready", time.Now(), p.Name))
return false, nil
}

if !podIsRunningAndReady(newPod) {
return false, nil
}

return true, nil
})
}
Expand Down
10 changes: 5 additions & 5 deletions e2etests/pkg/routes/routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,23 +32,23 @@ func PodHasPrefixFromContainer(pod *v1.Pod, frr frrcontainer.FRR, vrf, prefix st
func CheckNeighborHasPrefix(neighbor frrcontainer.FRR, vrf, prefix string, nodes []v1.Node) error {
routesV4, routesV6, err := frr.Routes(neighbor)
if err != nil {
return err
return fmt.Errorf("Routes %w", err)
}

_, cidr, err := net.ParseCIDR(prefix)
if err != nil {
return err
return fmt.Errorf("ParseCIDR %w", err)
}

route, err := routeForCIDR(cidr, routesV4, routesV6)
if err != nil {
return err
return fmt.Errorf("routerForCIDR: %w", err)
}

cidrFamily := ipfamily.ForCIDR(cidr)
err = frr.RoutesMatchNodes(nodes, route, cidrFamily, vrf)
if err != nil {
return err
return fmt.Errorf("RoutesMatchNodes %w", err)
}
return nil
}
Expand All @@ -62,7 +62,7 @@ type RouteNotFoundError struct {
}

func (e RouteNotFoundError) Error() string {
return fmt.Sprintf("route not found %s", e.Route)
return fmt.Sprintf("route not found for dst prefix %s", e.Route)
}

func routeForCIDR(cidr *net.IPNet, routesV4 map[string]frr.Route, routesV6 map[string]frr.Route) (frr.Route, error) {
Expand Down
35 changes: 21 additions & 14 deletions e2etests/tests/graceful_restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/onsi/ginkgo/v2"
"github.com/openshift-kni/k8sreporter"
"go.universe.tf/e2etest/pkg/frr/container"
frrcontainer "go.universe.tf/e2etest/pkg/frr/container"

frrk8sv1beta1 "github.com/metallb/frr-k8s/api/v1beta1"
"github.com/metallb/frrk8stests/pkg/config"
Expand Down Expand Up @@ -54,7 +55,7 @@ var _ = ginkgo.Describe("Establish BGP session with EnableGracefulRestart", func
Expect(err).NotTo(HaveOccurred())

err = cleanup(updater)
Expect(err).NotTo(HaveOccurred(), "cleanup config in API and infra containers")
Expect(err).NotTo(HaveOccurred(), "cleanup config in API and in infra containers")

cs = k8sclient.New()
nodes, err = k8s.Nodes(cs)
Expand All @@ -68,18 +69,20 @@ var _ = ginkgo.Describe("Establish BGP session with EnableGracefulRestart", func
dump.K8sInfo(testName, reporter)
dump.BGPInfo(testName, infra.FRRContainers, cs)
}

err := cleanup(updater)
Expect(err).NotTo(HaveOccurred(), "cleanup config in API and in infra containers")
})

ginkgo.Context("When restarting the frrk8s deamon pods", func() {

ginkgo.DescribeTable("external BGP peer maintains routes", func(ipFam ipfamily.Family, prefix string) {
frrs := config.ContainersForVRF(infra.FRRContainers, "")
for _, c := range frrs {
err := container.PairWithNodes(cs, c, ipFam)
Expect(err).NotTo(HaveOccurred(), "set frr config in infra containers failed")
}
cnt, err := config.ContainerByName(infra.FRRContainers, "ebgp-single-hop")
Expect(err).NotTo(HaveOccurred())
err = container.PairWithNodes(cs, cnt, ipFam)
Expect(err).NotTo(HaveOccurred(), "set frr config in infra containers failed")

peersConfig := config.PeersForContainers(frrs, ipFam,
peersConfig := config.PeersForContainers([]*frrcontainer.FRR{cnt}, ipFam,
config.EnableAllowAll, config.EnableGracefulRestart)

frrConfigCR := frrk8sv1beta1.FRRConfiguration{
Expand All @@ -88,6 +91,11 @@ var _ = ginkgo.Describe("Establish BGP session with EnableGracefulRestart", func
Namespace: k8s.FRRK8sNamespace,
},
Spec: frrk8sv1beta1.FRRConfigurationSpec{
NodeSelector: metav1.LabelSelector{
MatchLabels: map[string]string{
"kubernetes.io/hostname": nodes[0].GetLabels()["kubernetes.io/hostname"],
},
},
BGP: frrk8sv1beta1.BGPConfig{
Routers: []frrk8sv1beta1.Router{
{
Expand All @@ -100,33 +108,32 @@ var _ = ginkgo.Describe("Establish BGP session with EnableGracefulRestart", func
},
}

err := updater.Update(peersConfig.Secrets, frrConfigCR)
err = updater.Update(peersConfig.Secrets, frrConfigCR)
Expect(err).NotTo(HaveOccurred(), "apply the CR in k8s api failed")

check := func() error {
for _, p := range peersConfig.Peers() {
err := routes.CheckNeighborHasPrefix(p.FRR, p.FRR.RouterConfig.VRF, prefix, nodes)
err := routes.CheckNeighborHasPrefix(p.FRR, p.FRR.RouterConfig.VRF, prefix, nodes[:1])
if err != nil {
return fmt.Errorf("Neigh %s does not have prefix %s: %w", p.FRR.Name, prefix, err)
}
}
return nil
}

Eventually(check, time.Minute, time.Second).ShouldNot(HaveOccurred(),
Eventually(check, time.Minute, 5*time.Second).ShouldNot(HaveOccurred(),
"route should exist before we restart frr-k8s")

c := make(chan struct{})
go func() { // go restart frr-k8s while Consistently check that route exists
defer ginkgo.GinkgoRecover()
err := k8s.RestartFRRK8sPods(cs)
err := k8s.RestartFRRK8sPodOnNode(cs, nodes[0].GetName())
Expect(err).NotTo(HaveOccurred(), "frr-k8s pods failed to restart")
close(c)
}()

// 2*time.Minute is important because that is the Graceful Restart timer.
Consistently(check, 2*time.Minute, time.Second).ShouldNot(HaveOccurred())
Eventually(c, time.Minute, time.Second).Should(BeClosed(), "restart FRRK8s pods are not yet ready")
Consistently(check, time.Minute, time.Second).ShouldNot(HaveOccurred(), "route check failed during frr-k8s-pod reboot")
Eventually(c, time.Minute, time.Second).Should(BeClosed(), "restarted frr-k8s pods are not yet ready")
},
ginkgo.Entry("IPV4", ipfamily.IPv4, "192.168.2.0/24"),
ginkgo.Entry("IPV6", ipfamily.IPv6, "fc00:f853:ccd:e799::/64"),
Expand Down

0 comments on commit 52ffda7

Please sign in to comment.