diff --git a/cluster/defaults.go b/cluster/defaults.go index 3601c45c..1532ba5f 100644 --- a/cluster/defaults.go +++ b/cluster/defaults.go @@ -32,7 +32,7 @@ const ( DefaultIngressController = "nginx" DefaultEtcdBackupCreationPeriod = "12h" - DefaultEtcdBackupRetentionPeriod = "3d" + DefaultEtcdBackupRetentionPeriod = "72h" DefaultEtcdSnapshot = true DefaultMonitoringProvider = "metrics-server" diff --git a/services/etcd.go b/services/etcd.go index 294ebe09..26600913 100644 --- a/services/etcd.go +++ b/services/etcd.go @@ -273,7 +273,19 @@ func RunEtcdSnapshotSave(ctx context.Context, etcdHost *hosts.Host, prsMap map[s } return docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdSnapshotOnceContainerName) } - return docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdSnapshotContainerName, etcdHost.Address, ETCDRole, prsMap) + if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdSnapshotContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil { + return err + } + // check if the container exited with error + snapshotCont, err := docker.InspectContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdSnapshotContainerName) + if err != nil { + return err + } + if snapshotCont.State.Status == "exited" || snapshotCont.State.Restarting { + log.Warnf(ctx, "Etcd rolling snapshot container failed to start correctly") + return docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdSnapshotContainerName) + } + return nil } func RestoreEtcdSnapshot(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdRestoreImage, snapshotName, initCluster string) error {