Skip to content

Commit

Permalink
Add drive repair support
Browse files Browse the repository at this point in the history
Signed-off-by: Bala.FA <[email protected]>
  • Loading branch information
balamurugana committed Jul 25, 2024
1 parent a3471d6 commit feddf0b
Show file tree
Hide file tree
Showing 23 changed files with 872 additions and 77 deletions.
4 changes: 2 additions & 2 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
linters-settings:
gofumpt:
lang-version: "1.22"
run:
go: "1.22"

misspell:
locale: US
Expand Down
1 change: 1 addition & 0 deletions cmd/directpv/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ func init() {
mainCmd.AddCommand(legacyControllerCmd)
mainCmd.AddCommand(legacyNodeServerCmd)
mainCmd.AddCommand(nodeControllerCmd)
mainCmd.AddCommand(repairCmd)
}

func main() {
Expand Down
80 changes: 80 additions & 0 deletions cmd/directpv/repair.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// This file is part of MinIO DirectPV
// Copyright (c) 2024 MinIO, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package main

import (
"context"
"errors"

directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
"github.com/minio/directpv/pkg/client"
drivepkg "github.com/minio/directpv/pkg/drive"
"github.com/minio/directpv/pkg/types"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Option values of the repair command. Each one relies on the bool zero
// value, so every option is off unless its flag is given.
var (
	// forceFlag is bound to the --force flag.
	forceFlag bool
	// disablePrefetchFlag is bound to the --disable-prefetch flag.
	disablePrefetchFlag bool
	// dryRunFlag is bound to the --dry-run flag.
	dryRunFlag bool
)

// repairCmd is the "repair" sub-command. It requires exactly one DRIVE-ID
// argument and hands that ID to startRepair; any other argument count is
// rejected with an error before anything else is done.
var repairCmd = &cobra.Command{
	Use:           "repair <DRIVE-ID>",
	Short:         "Start drive repair.",
	SilenceUsage:  true,
	SilenceErrors: true,
	RunE: func(cmd *cobra.Command, args []string) error {
		if len(args) == 0 {
			return errors.New("DRIVE-ID must be provided")
		}
		if len(args) > 1 {
			return errors.New("only one DRIVE-ID must be provided")
		}
		return startRepair(cmd.Context(), args[0])
	},
}

// init registers the --force, --disable-prefetch and --dry-run persistent
// flags on repairCmd, binding each to its package-level variable.
func init() {
	flags := repairCmd.PersistentFlags()
	flags.BoolVar(&forceFlag, "force", forceFlag, "Force log zeroing")
	flags.BoolVar(&disablePrefetchFlag, "disable-prefetch", disablePrefetchFlag, "Disable prefetching of inode and directory blocks")
	flags.BoolVar(&dryRunFlag, "dry-run", dryRunFlag, "No modify mode")
}

// startRepair flags the drive identified by driveID as being repaired and
// then runs the repair on it.
//
// The drive object is fetched through the drive client, its status is set to
// DriveStatusRepairing and the object is written back. The object returned by
// that update is passed to drivepkg.Repair together with the values of the
// --force, --disable-prefetch and --dry-run flags. Errors from the get, the
// update or the repair are returned unchanged.
func startRepair(ctx context.Context, driveID string) error {
	// Derive a cancellable context so it is always released on return.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	drive, err := client.DriveClient().Get(ctx, driveID, metav1.GetOptions{})
	if err != nil {
		return err
	}

	// Assigning unconditionally yields the same state as assigning only when
	// the current status differs.
	drive.Status.Status = directpvtypes.DriveStatusRepairing

	repairingDrive, err := client.DriveClient().Update(ctx, drive, metav1.UpdateOptions{TypeMeta: types.NewDriveTypeMeta()})
	if err != nil {
		return err
	}

	return drivepkg.Repair(ctx, repairingDrive, forceFlag, disablePrefetchFlag, dryRunFlag)
}
1 change: 1 addition & 0 deletions cmd/kubectl-directpv/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ Use "{{.CommandPath}} [command] --help" for more information about this command.
mainCmd.AddCommand(cleanCmd)
mainCmd.AddCommand(suspendCmd)
mainCmd.AddCommand(resumeCmd)
mainCmd.AddCommand(repairCmd)
mainCmd.AddCommand(removeCmd)
mainCmd.AddCommand(uninstallCmd)
mainCmd.SetHelpCommand(&cobra.Command{
Expand Down
92 changes: 92 additions & 0 deletions cmd/kubectl-directpv/repair.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// This file is part of MinIO DirectPV
// Copyright (c) 2024 MinIO, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package main

import (
"context"
"errors"
"os"
"strings"

"github.com/minio/directpv/pkg/admin"
"github.com/minio/directpv/pkg/consts"
"github.com/spf13/cobra"
)

// Option values of the repair command. Both rely on the bool zero value, so
// each option is off unless its flag is given.
var (
	// forceFlag is bound to the --force flag.
	forceFlag bool
	// disablePrefetchFlag is bound to the --disable-prefetch flag.
	disablePrefetchFlag bool
)

// repairCmd is the "repair" command of the kubectl plugin. It stores the
// positional arguments in driveIDArgs, validates them with validateRepairCmd
// and, when validation succeeds, calls repairMain. On a validation error the
// error is printed through eprintf and the process exits with code -1.
var repairCmd = &cobra.Command{
	Use:           "repair DRIVE ...",
	Short:         "Repair filesystem of drives",
	SilenceUsage:  true,
	SilenceErrors: true,
	Example: strings.ReplaceAll(
		`1. Repair drives
$ kubectl {PLUGIN_NAME} repair 3b562992-f752-4a41-8be4-4e688ae8cd4c`,
		`{PLUGIN_NAME}`,
		consts.AppName,
	),
	Run: func(cmd *cobra.Command, args []string) {
		driveIDArgs = args

		err := validateRepairCmd()
		if err == nil {
			repairMain(cmd.Context())
			return
		}

		eprintf(true, "%v\n", err)
		os.Exit(-1)
	},
}

// init configures repairCmd: it applies the shared flag options, adds the
// dry-run flag and then registers the --force and --disable-prefetch
// persistent flags, in that order.
func init() {
	setFlagOpts(repairCmd)
	addDryRunFlag(repairCmd, "Repair drives with no modify mode")

	flags := repairCmd.PersistentFlags()
	flags.BoolVar(&forceFlag, "force", forceFlag, "Force log zeroing")
	flags.BoolVar(&disablePrefetchFlag, "disable-prefetch", disablePrefetchFlag, "Disable prefetching of inode and directory blocks")
}

// validateRepairCmd checks the arguments of the repair command. It returns
// the error reported by validateDriveIDArgs, if any; otherwise it returns an
// error when driveIDArgs is empty, and nil when at least one drive was given.
func validateRepairCmd() error {
	err := validateDriveIDArgs()
	switch {
	case err != nil:
		return err
	case len(driveIDArgs) == 0:
		return errors.New("no drive provided to repair")
	}
	return nil
}

func repairMain(ctx context.Context) {
_, err := adminClient.Repair(
ctx,
admin.RepairArgs{
DriveIDs: driveIDSelectors,
DryRun: dryRunFlag,
ForceFlag: forceFlag,
DisablePrefetchFlag: disablePrefetchFlag,
},
logFunc,
)
if err != nil {
eprintf(!errors.Is(err, admin.ErrNoMatchingResourcesFound), "%v\n", err)
os.Exit(1)
}
}
22 changes: 22 additions & 0 deletions docs/command-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,28 @@ EXAMPLES:
$ kubectl directpv resume volumes pvc-0700b8c7-85b2-4894-b83a-274484f220d0
```

## `repair` command
```
Repair filesystem of drives
USAGE:
directpv repair DRIVE ... [flags]
FLAGS:
--dry-run Repair drives with no modify mode
--force Force log zeroing
--disable-prefetch Disable prefetching of inode and directory blocks
-h, --help help for repair
GLOBAL FLAGS:
--kubeconfig string Path to the kubeconfig file to use for CLI requests
--quiet Suppress printing error messages
EXAMPLES:
1. Repair drives
$ kubectl directpv repair 3b562992-f752-4a41-8be4-4e688ae8cd4c
```

## `remove` command
```
Remove unused drives from DirectPV
Expand Down
18 changes: 16 additions & 2 deletions docs/drive-management.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,25 @@ Refer [remove command](./command-reference.md#remove-command) for more informati
By Kubernetes design, [StatefulSet](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) workload is active only if all of its pods are in running state. Any faulty drive(s) will prevent the statefulset from starting up. DirectPV provides a workaround to suspend failed drives which will mount the respective volumes on empty `/var/lib/directpv/tmp` directory with read-only access. This can be done by executing the `suspend drives` command. Below is an example:

```sh
> kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
$ kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
```

Suspended drives can be resumed once they are fixed. Upon resuming, the corresponding volumes will resume using the respective allocated drives. This can be done by using the `resume drives` command. Below is an example:

```sh
> kubectl directpv resume drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
$ kubectl directpv resume drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
```

## Repair drives

***CAUTION: THIS IS A DANGEROUS OPERATION WHICH LEADS TO DATA LOSS***

In rare situations, the filesystem on faulty drives can be repaired to make them usable again. As a first step, the faulty drives must be suspended; then the `repair` command should be run for them. The `repair` command creates a one-time Kubernetes `Job` with the pod name `repair-<DRIVE-ID>`; these jobs are automatically removed five minutes after their completion. Progress and status of the drive repair can be viewed using the `kubectl logs` command. Below is an example:

```sh
# Suspend faulty drives
$ kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0

# Run repair command on suspended drives
$ kubectl directpv repair af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
```
5 changes: 2 additions & 3 deletions pkg/admin/installer/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ const (
namespace = consts.AppName
healthZContainerPortName = "healthz"
healthZContainerPort = 9898
volumePathSysDir = "/sys"
volumeNameSocketDir = "socket-dir"
socketDir = "/csi"
csiDirVolumeName = "socket-dir"
csiDirVolumePath = "/csi"
selectorKey = "selector." + consts.GroupName
kubeNodeNameEnvVarName = "KUBE_NODE_NAME"
csiEndpointEnvVarName = "CSI_ENDPOINT"
Expand Down
62 changes: 27 additions & 35 deletions pkg/admin/installer/daemonset.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
"github.com/minio/directpv/pkg/client"
"github.com/minio/directpv/pkg/consts"
"github.com/minio/directpv/pkg/k8s"
legacyclient "github.com/minio/directpv/pkg/legacy/client"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
Expand All @@ -32,20 +33,11 @@ import (
)

const (
volumeNameMountpointDir = "mountpoint-dir"
volumeNameRegistrationDir = "registration-dir"
volumeNamePluginDir = "plugins-dir"
volumeNameAppRootDir = consts.AppName + "-common-root"
volumeNameLegacyAppRootDir = "direct-csi-common-root"
appRootDir = consts.AppRootDir + "/"
legacyAppRootDir = "/var/lib/direct-csi/"
volumeNameSysDir = "sysfs"
volumeNameDevDir = "devfs"
volumePathDevDir = "/dev"
volumeNameRunUdevData = "run-udev-data-dir"
volumePathRunUdevData = consts.UdevDataDir
socketFile = "/csi.sock"
totalDaemonsetSteps = 2
kubeletPodsDirVolumeName = "mountpoint-dir"
registrationDirVolumeName = "registration-dir"
kubeletPluginsDirVolumeName = "plugins-dir"
socketFile = "/csi.sock"
totalDaemonsetSteps = 2
)

type daemonsetTask struct {
Expand Down Expand Up @@ -97,25 +89,25 @@ func newSecurityContext(seccompProfile string) *corev1.SecurityContext {

func getVolumesAndMounts(pluginSocketDir string) (volumes []corev1.Volume, volumeMounts []corev1.VolumeMount) {
volumes = []corev1.Volume{
newHostPathVolume(volumeNameSocketDir, pluginSocketDir),
newHostPathVolume(volumeNameMountpointDir, kubeletDirPath+"/pods"),
newHostPathVolume(volumeNameRegistrationDir, kubeletDirPath+"/plugins_registry"),
newHostPathVolume(volumeNamePluginDir, kubeletDirPath+"/plugins"),
newHostPathVolume(volumeNameAppRootDir, appRootDir),
newHostPathVolume(volumeNameSysDir, volumePathSysDir),
newHostPathVolume(volumeNameDevDir, volumePathDevDir),
newHostPathVolume(volumeNameRunUdevData, volumePathRunUdevData),
newHostPathVolume(volumeNameLegacyAppRootDir, legacyAppRootDir),
k8s.NewHostPathVolume(csiDirVolumeName, pluginSocketDir),
k8s.NewHostPathVolume(kubeletPodsDirVolumeName, kubeletDirPath+"/pods"),
k8s.NewHostPathVolume(registrationDirVolumeName, kubeletDirPath+"/plugins_registry"),
k8s.NewHostPathVolume(kubeletPluginsDirVolumeName, kubeletDirPath+"/plugins"),
k8s.NewHostPathVolume(consts.AppRootDirVolumeName, consts.AppRootDirVolumePath),
k8s.NewHostPathVolume(consts.LegacyAppRootDirVolumeName, consts.LegacyAppRootDirVolumePath),
k8s.NewHostPathVolume(consts.SysDirVolumeName, consts.SysDirVolumePath),
k8s.NewHostPathVolume(consts.DevDirVolumeName, consts.DevDirVolumePath),
k8s.NewHostPathVolume(consts.RunUdevDataVolumeName, consts.RunUdevDataVolumePath),
}
volumeMounts = []corev1.VolumeMount{
newVolumeMount(volumeNameSocketDir, socketDir, corev1.MountPropagationNone, false),
newVolumeMount(volumeNameMountpointDir, kubeletDirPath+"/pods", corev1.MountPropagationBidirectional, false),
newVolumeMount(volumeNamePluginDir, kubeletDirPath+"/plugins", corev1.MountPropagationBidirectional, false),
newVolumeMount(volumeNameAppRootDir, appRootDir, corev1.MountPropagationBidirectional, false),
newVolumeMount(volumeNameSysDir, volumePathSysDir, corev1.MountPropagationBidirectional, false),
newVolumeMount(volumeNameDevDir, volumePathDevDir, corev1.MountPropagationHostToContainer, true),
newVolumeMount(volumeNameRunUdevData, volumePathRunUdevData, corev1.MountPropagationBidirectional, true),
newVolumeMount(volumeNameLegacyAppRootDir, legacyAppRootDir, corev1.MountPropagationBidirectional, false),
k8s.NewVolumeMount(csiDirVolumeName, csiDirVolumePath, corev1.MountPropagationNone, false),
k8s.NewVolumeMount(kubeletPodsDirVolumeName, kubeletDirPath+"/pods", corev1.MountPropagationBidirectional, false),
k8s.NewVolumeMount(kubeletPluginsDirVolumeName, kubeletDirPath+"/plugins", corev1.MountPropagationBidirectional, false),
k8s.NewVolumeMount(consts.AppRootDirVolumeName, consts.AppRootDirVolumePath, corev1.MountPropagationBidirectional, false),
k8s.NewVolumeMount(consts.LegacyAppRootDirVolumeName, consts.LegacyAppRootDirVolumePath, corev1.MountPropagationBidirectional, false),
k8s.NewVolumeMount(consts.SysDirVolumeName, consts.SysDirVolumePath, corev1.MountPropagationBidirectional, false),
k8s.NewVolumeMount(consts.DevDirVolumeName, consts.DevDirVolumePath, corev1.MountPropagationHostToContainer, true),
k8s.NewVolumeMount(consts.RunUdevDataVolumeName, consts.RunUdevDataVolumePath, corev1.MountPropagationBidirectional, true),
}

return
Expand All @@ -132,8 +124,8 @@ func nodeDriverRegistrarContainer(image, pluginSocketDir string) corev1.Containe
},
Env: []corev1.EnvVar{kubeNodeNameEnvVar},
VolumeMounts: []corev1.VolumeMount{
newVolumeMount(volumeNameSocketDir, socketDir, corev1.MountPropagationNone, false),
newVolumeMount(volumeNameRegistrationDir, "/registration", corev1.MountPropagationNone, false),
k8s.NewVolumeMount(csiDirVolumeName, csiDirVolumePath, corev1.MountPropagationNone, false),
k8s.NewVolumeMount(registrationDirVolumeName, "/registration", corev1.MountPropagationNone, false),
},
TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError,
TerminationMessagePath: "/var/log/driver-registrar-termination-log",
Expand Down Expand Up @@ -195,13 +187,13 @@ func livenessProbeContainer(image string) corev1.Container {
Name: "liveness-probe",
Image: image,
Args: []string{
fmt.Sprintf("--csi-address=%v%v", socketDir, socketFile),
fmt.Sprintf("--csi-address=%v%v", csiDirVolumePath, socketFile),
fmt.Sprintf("--health-port=%v", healthZContainerPort),
},
TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError,
TerminationMessagePath: "/var/log/driver-liveness-termination-log",
VolumeMounts: []corev1.VolumeMount{
newVolumeMount(volumeNameSocketDir, socketDir, corev1.MountPropagationNone, false),
k8s.NewVolumeMount(csiDirVolumeName, csiDirVolumePath, corev1.MountPropagationNone, false),
},
}
}
Expand Down
Loading

0 comments on commit feddf0b

Please sign in to comment.