Skip to content

Commit

Permalink
Merge pull request #2212 from tnozicka/fix-ignition
Browse files Browse the repository at this point in the history
Fix accidental ignition and stuck graceful termination
  • Loading branch information
scylla-operator-bot[bot] authored Dec 11, 2024
2 parents d4da9df + 978875b commit 91b1fe0
Show file tree
Hide file tree
Showing 13 changed files with 434 additions and 86 deletions.
105 changes: 68 additions & 37 deletions pkg/controller/ignition/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package ignition

import (
"context"
"errors"
"fmt"
"sync/atomic"
"time"
Expand All @@ -12,6 +13,7 @@ import (
"github.com/scylladb/scylla-operator/pkg/helpers"
"github.com/scylladb/scylla-operator/pkg/internalapi"
"github.com/scylladb/scylla-operator/pkg/naming"
"github.com/scylladb/scylla-operator/pkg/pointer"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
corev1informers "k8s.io/client-go/informers/core/v1"
Expand Down Expand Up @@ -89,18 +91,10 @@ func (c *Controller) IsIgnited() bool {
return c.ignited.Load()
}

func (c *Controller) Sync(ctx context.Context) error {
startTime := time.Now()
klog.V(4).InfoS("Started syncing observer", "Name", c.Observer.Name(), "startTime", startTime)
defer func() {
klog.V(4).InfoS("Finished syncing observer", "Name", c.Observer.Name(), "duration", time.Since(startTime))
}()

ignited := true

func (c *Controller) evaluateIgnitionState() (bool, error) {
svc, err := c.serviceLister.Services(c.namespace).Get(c.serviceName)
if err != nil {
return fmt.Errorf("can't get service %q: %w", c.serviceName, err)
return false, fmt.Errorf("can't get service %q: %w", c.serviceName, err)
}

// TODO: This isn't bound to the lifecycle of the ScyllaDB Pod and should be evaluated in the controller for the resource.
Expand All @@ -112,62 +106,59 @@ func (c *Controller) Sync(ctx context.Context) error {
"Waiting for identity service to have at least one ingress point",
"Service", naming.ManualRef(c.namespace, c.serviceName),
)
ignited = false
} else {
klog.V(2).InfoS(
"Service is available and has an IP address",
"Service", naming.ManualRef(svc.Namespace, svc.Name),
"UID", svc.UID,
)
return false, nil
}
klog.V(2).InfoS(
"Service is available and has an IP address",
"Service", naming.ManualRef(svc.Namespace, svc.Name),
"UID", svc.UID,
)
}

pod, err := c.podLister.Pods(c.namespace).Get(c.serviceName)
if err != nil {
return fmt.Errorf("can't get pod %q: %w", c.serviceName, err)
return false, fmt.Errorf("can't get pod %q: %w", c.serviceName, err)
}

if len(pod.Status.PodIP) == 0 {
klog.V(2).InfoS("PodIP is not yet set", "Pod", klog.KObj(pod), "UID", pod.UID)
ignited = false
} else {
klog.V(2).InfoS("PodIP is present on the Pod", "Pod", klog.KObj(pod), "UID", pod.UID, "IP", pod.Status.PodIP)
return false, nil
}
klog.V(2).InfoS("PodIP is present on the Pod", "Pod", klog.KObj(pod), "UID", pod.UID, "IP", pod.Status.PodIP)

containerID, err := controllerhelpers.GetScyllaContainerID(pod)
if err != nil {
return controllertools.NonRetriable(
return false, controllertools.NonRetriable(
fmt.Errorf("can't get scylla container id in pod %q: %v", naming.ObjRef(pod), err),
)
}

if len(containerID) == 0 {
klog.V(2).InfoS("ScyllaDB ContainerID is not yet set", "Pod", klog.KObj(pod), "UID", pod.UID)
ignited = false
} else {
klog.V(2).InfoS("Pod has ScyllaDB ContainerID set", "Pod", klog.KObj(pod), "UID", pod.UID, "ContainerID", containerID)
return false, nil
}
klog.V(2).InfoS("Pod has ScyllaDB ContainerID set", "Pod", klog.KObj(pod), "UID", pod.UID, "ContainerID", containerID)

cmLabelSelector := labels.Set{
naming.OwnerUIDLabel: string(pod.UID),
naming.ConfigMapTypeLabel: string(naming.NodeConfigDataConfigMapType),
}.AsSelector()
configMaps, err := c.configMapLister.ConfigMaps(c.namespace).List(cmLabelSelector)
if err != nil {
return fmt.Errorf("can't list tuning configmap: %w", err)
return false, fmt.Errorf("can't list tuning configmap: %w", err)
}

switch l := len(configMaps); l {
case 0:
klog.V(2).InfoS("Tuning ConfigMap for pod is not yet available", "Pod", klog.KObj(pod), "UID", pod.UID)
ignited = false
return false, nil

case 1:
cm := configMaps[0]
src := &internalapi.SidecarRuntimeConfig{}
src, err = controllerhelpers.GetSidecarRuntimeConfigFromConfigMap(cm)
if err != nil {
return controllertools.NonRetriable(
return false, controllertools.NonRetriable(
fmt.Errorf("can't get sidecar runtime config from configmap %q: %w", naming.ObjRef(cm), err),
)
}
Expand All @@ -179,39 +170,79 @@ func (c *Controller) Sync(ctx context.Context) error {
"ContainerID", containerID,
"NodeConfig", src.BlockingNodeConfigs,
)
ignited = false
return false, nil
}
} else {
klog.V(2).InfoS("Scylla runtime config is not yet updated with our ContainerID",
"ConfigMap", klog.KObj(cm),
"ConfigContainerID", src.ContainerID,
"SidecarContainerID", containerID,
)
ignited = false
return false, nil
}

default:
return fmt.Errorf("mutiple tuning configmaps are present for pod %q with UID %q", naming.ObjRef(pod), pod.UID)
return false, fmt.Errorf("mutiple tuning configmaps are present for pod %q with UID %q", naming.ObjRef(pod), pod.UID)
}

return true, nil
}
func (c *Controller) Sync(ctx context.Context) error {
startTime := time.Now()
klog.V(4).InfoS("Started syncing observer", "Name", c.Observer.Name(), "startTime", startTime)
defer func() {
klog.V(4).InfoS("Finished syncing observer", "Name", c.Observer.Name(), "duration", time.Since(startTime))
}()

svc, err := c.serviceLister.Services(c.namespace).Get(c.serviceName)
if err != nil {
return fmt.Errorf("can't get service %q: %w", c.serviceName, err)
}

var ignitionOverride *bool
if svc.Annotations != nil {
forceIgnitionString, hasForceIgnitionString := svc.Annotations[naming.ForceIgnitionValueAnnotation]
if hasForceIgnitionString {
switch forceIgnitionString {
case "true":
ignitionOverride = pointer.Ptr(true)
case "false":
ignitionOverride = pointer.Ptr(false)
default:
klog.ErrorS(errors.New("invalid ignition override value"), "Value", forceIgnitionString, "Key", naming.ForceIgnitionValueAnnotation)
ignitionOverride = nil
}
}
}

var ignited bool
if ignitionOverride != nil {
ignited = *ignitionOverride
klog.InfoS("Forcing ignition state", "Ignited", ignited, "Annotation", naming.ForceIgnitionValueAnnotation)
} else {
ignited, err = c.evaluateIgnitionState()
if err != nil {
return fmt.Errorf("can't evaluate ignition state: %w", err)
}
}

if ignited {
klog.V(2).InfoS("Ignition successful", "SignalFile", naming.ScyllaDBIgnitionDonePath)
err = helpers.TouchFile(naming.ScyllaDBIgnitionDonePath)
if err != nil {
return fmt.Errorf("can't touch signal file %q: %w", naming.ScyllaDBIgnitionDonePath, err)
}
} else {
klog.V(2).InfoS("Waiting for ignition to complete.", "SignalFile", naming.ScyllaDBIgnitionDonePath)
}

klog.V(2).InfoS("Updating ignition", "Ignited", ignited, "SignalFile", naming.ScyllaDBIgnitionDonePath)
klog.V(2).InfoS("Updating ignition", "Ignited", ignited)

oldIgnited := c.ignited.Load()
if ignited != oldIgnited {
klog.InfoS("Ignition state has changed", "New", ignited, "Old", oldIgnited)
}
c.ignited.Store(ignited)

err = helpers.TouchFile(naming.ScyllaDBIgnitionDonePath)
if err != nil {
return fmt.Errorf("can't touch signal file %q: %w", naming.ScyllaDBIgnitionDonePath, err)
}

return nil
}
20 changes: 14 additions & 6 deletions pkg/controller/scylladbdatacenter/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -636,9 +636,12 @@ func StatefulSetForRack(rack scyllav1alpha1.RackSpec, sdc *scyllav1alpha1.Scylla
"inherit_errexit",
"-c",
strings.TrimSpace(`
trap 'kill $( jobs -p ); exit 0' TERM
printf 'INFO %s ignition - Waiting for /mnt/shared/ignition.done\n' "$( date '+%Y-%m-%d %H:%M:%S,%3N' )" > /dev/stderr
until [[ -f "/mnt/shared/ignition.done" ]]; do
sleep 1;
sleep 1 &
wait
done
printf 'INFO %s ignition - Ignited. Starting ScyllaDB...\n' "$( date '+%Y-%m-%d %H:%M:%S,%3N' )" > /dev/stderr
Expand Down Expand Up @@ -836,7 +839,9 @@ exec /mnt/shared/scylla-operator sidecar \
"inherit_errexit",
"-c",
strings.TrimSpace(`
trap 'rm /mnt/shared/ignition.done' EXIT
trap 'kill $( jobs -p ); exit 0' TERM
trap 'rm -f /mnt/shared/ignition.done' EXIT
nodetool drain &
sleep ` + strconv.Itoa(minTerminationGracePeriodSeconds) + ` &
wait
Expand Down Expand Up @@ -892,7 +897,7 @@ wait
},
},
{
Name: "scylladb-ignition",
Name: naming.ScyllaDBIgnitionContainerName,
Image: sidecarImage,
ImagePullPolicy: corev1.PullIfNotPresent,
Command: []string{
Expand Down Expand Up @@ -1166,7 +1171,7 @@ func getScyllaDBManagerAgentContainer(r scyllav1alpha1.RackSpec, sdc *scyllav1al
}

cnt := &corev1.Container{
Name: "scylla-manager-agent",
Name: naming.ScyllaManagerAgentContainerName,
Image: *sdc.Spec.ScyllaDBManagerAgent.Image,
ImagePullPolicy: corev1.PullIfNotPresent,
// There is no point in starting scylla-manager before ScyllaDB is tuned and ignited. The manager agent fails after 60 attempts and hits backoff unnecessarily.
Expand All @@ -1178,13 +1183,16 @@ func getScyllaDBManagerAgentContainer(r scyllav1alpha1.RackSpec, sdc *scyllav1al
"inherit_errexit",
"-c",
strings.TrimSpace(`
trap 'kill $( jobs -p ); exit 0' TERM
printf '{"L":"INFO","T":"%s","M":"Waiting for /mnt/shared/ignition.done"}\n' "$( date -u '+%Y-%m-%dT%H:%M:%S,%3NZ' )" > /dev/stderr
until [[ -f "/mnt/shared/ignition.done" ]]; do
sleep 1;
sleep 1 &
wait
done
printf '{"L":"INFO","T":"%s","M":"Ignited. Starting ScyllaDB Manager Agent"}\n' "$( date -u '+%Y-%m-%dT%H:%M:%S,%3NZ' )" > /dev/stderr
scylla-manager-agent \
exec scylla-manager-agent \
-c ` + fmt.Sprintf("%q ", naming.ScyllaAgentConfigDefaultFile) + `\
-c ` + fmt.Sprintf("%q ", path.Join(naming.ScyllaAgentConfigDirName, naming.ScyllaAgentConfigFileName)) + `\
-c ` + fmt.Sprintf("%q ", path.Join(naming.ScyllaAgentConfigDirName, naming.ScyllaAgentAuthTokenFileName)) + `
Expand Down
19 changes: 14 additions & 5 deletions pkg/controller/scylladbdatacenter/resource_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,9 +724,12 @@ func TestStatefulSetForRack(t *testing.T) {
"inherit_errexit",
"-c",
strings.TrimSpace(`
trap 'kill $( jobs -p ); exit 0' TERM
printf 'INFO %s ignition - Waiting for /mnt/shared/ignition.done\n' "$( date '+%Y-%m-%d %H:%M:%S,%3N' )" > /dev/stderr
until [[ -f "/mnt/shared/ignition.done" ]]; do
sleep 1;
sleep 1 &
wait
done
printf 'INFO %s ignition - Ignited. Starting ScyllaDB...\n' "$( date '+%Y-%m-%d %H:%M:%S,%3N' )" > /dev/stderr
Expand Down Expand Up @@ -878,10 +881,13 @@ exec /mnt/shared/scylla-operator sidecar \
"-O",
"inherit_errexit",
"-c",
`trap 'rm /mnt/shared/ignition.done' EXIT
strings.TrimSpace(`
trap 'kill $( jobs -p ); exit 0' TERM
trap 'rm -f /mnt/shared/ignition.done' EXIT
nodetool drain &
sleep 15 &
wait`,
wait`),
},
},
},
Expand Down Expand Up @@ -993,13 +999,16 @@ wait`,
"inherit_errexit",
"-c",
strings.TrimSpace(`
trap 'kill $( jobs -p ); exit 0' TERM
printf '{"L":"INFO","T":"%s","M":"Waiting for /mnt/shared/ignition.done"}\n' "$( date -u '+%Y-%m-%dT%H:%M:%S,%3NZ' )" > /dev/stderr
until [[ -f "/mnt/shared/ignition.done" ]]; do
sleep 1;
sleep 1 &
wait
done
printf '{"L":"INFO","T":"%s","M":"Ignited. Starting ScyllaDB Manager Agent"}\n' "$( date -u '+%Y-%m-%dT%H:%M:%S,%3NZ' )" > /dev/stderr
scylla-manager-agent \
exec scylla-manager-agent \
-c "/etc/scylla-manager-agent/scylla-manager-agent.yaml" \
-c "/mnt/scylla-agent-config/scylla-manager-agent.yaml" \
-c "/mnt/scylla-agent-config/auth-token.yaml"
Expand Down
15 changes: 10 additions & 5 deletions pkg/naming/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ const (
// Readiness check will always fail when this label is added to member service.
NodeMaintenanceLabel = "scylla/node-maintenance"

// ForceIgnitionValueAnnotation allows to force ignition state. The value can be either "true" or "false".
ForceIgnitionValueAnnotation = "internal.scylla-operator.scylladb.com/force-ignition-value"

LabelValueTrue = "true"
LabelValueFalse = "false"
)
Expand Down Expand Up @@ -110,11 +113,13 @@ const (

// Configuration Values
const (
ScyllaContainerName = "scylla"
SidecarInjectorContainerName = "sidecar-injection"
PerftuneContainerName = "perftune"
CleanupContainerName = "cleanup"
RLimitsContainerName = "rlimits"
ScyllaContainerName = "scylla"
ScyllaDBIgnitionContainerName = "scylladb-ignition"
ScyllaManagerAgentContainerName = "scylla-manager-agent"
SidecarInjectorContainerName = "sidecar-injection"
PerftuneContainerName = "perftune"
CleanupContainerName = "cleanup"
RLimitsContainerName = "rlimits"

PVCTemplateName = "data"

Expand Down
4 changes: 4 additions & 0 deletions pkg/naming/names.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ func MemberServiceNameForScyllaCluster(r scyllav1.RackSpec, sc *scyllav1.ScyllaC
return fmt.Sprintf("%s-%d", StatefulSetNameForRackForScyllaCluster(r, sc), idx)
}

func PodNameForScyllaCluster(r scyllav1.RackSpec, sc *scyllav1.ScyllaCluster, idx int) string {
return MemberServiceNameForScyllaCluster(r, sc, idx)
}

func IdentityServiceName(sdc *scyllav1alpha1.ScyllaDBDatacenter) string {
return fmt.Sprintf("%s-client", sdc.Name)
}
Expand Down
Loading

0 comments on commit 91b1fe0

Please sign in to comment.