From a592901a9d0ef934ac67568190b157b7a4673a3d Mon Sep 17 00:00:00 2001 From: Vasil Averyanau Date: Tue, 17 Dec 2024 17:40:54 +0100 Subject: [PATCH] feat(backup): extends manifest with info needed for 1-to-1 restore. This adds following data to the backup manifest: General: cluster_id: uuid of the cluster dc: data center name rack: rack from the scylla configuration node_id: id of the scylla node (equals to host id) task_id: uuid of the backup task snapshot_tag: snapshot tag shard_count: number of shard on scylla node cpu_count: number of cpus on scylla node storage_size: total size of the disk in bytes Instance Details: cloud_provider: aws|gcp|azure or empty in case of on-premise instance_type: instance type, e.g. t2.nano or empty when on-premise Fixes: #4130 --- .../v2_config_node_info.golden.json | 5 +- ..._node_info_alternator_disabled.golden.json | 5 +- pkg/service/backup/backupspec/manifest.go | 31 ++++++-- .../backup/service_backup_integration_test.go | 72 +++++++++++++++++++ pkg/service/backup/worker_manifest.go | 42 +++++++++-- 5 files changed, 141 insertions(+), 14 deletions(-) diff --git a/pkg/scyllaclient/testdata/scylla_api/v2_config_node_info.golden.json b/pkg/scyllaclient/testdata/scylla_api/v2_config_node_info.golden.json index bb23b229b4..5118cc942e 100644 --- a/pkg/scyllaclient/testdata/scylla_api/v2_config_node_info.golden.json +++ b/pkg/scyllaclient/testdata/scylla_api/v2_config_node_info.golden.json @@ -18,5 +18,6 @@ "rpc_port":"9160", "sstable_uuid_format":true, "consistent_cluster_management":true, - "enable_tablets":true -} \ No newline at end of file + "enable_tablets":true, + "data_directory": "/var/lib/scylla/data" +} diff --git a/pkg/scyllaclient/testdata/scylla_api/v2_config_node_info_alternator_disabled.golden.json b/pkg/scyllaclient/testdata/scylla_api/v2_config_node_info_alternator_disabled.golden.json index fa9484d0b1..6eb3b53f49 100644 --- a/pkg/scyllaclient/testdata/scylla_api/v2_config_node_info_alternator_disabled.golden.json +++ 
b/pkg/scyllaclient/testdata/scylla_api/v2_config_node_info_alternator_disabled.golden.json @@ -17,5 +17,6 @@ "rpc_address":"192.168.100.101", "rpc_port":"9160", "sstable_uuid_format":false, - "consistent_cluster_management":false -} \ No newline at end of file + "consistent_cluster_management":false, + "data_directory": "/var/lib/scylla/data" +} diff --git a/pkg/service/backup/backupspec/manifest.go b/pkg/service/backup/backupspec/manifest.go index 00aed1906f..9c5c13c6e4 100644 --- a/pkg/service/backup/backupspec/manifest.go +++ b/pkg/service/backup/backupspec/manifest.go @@ -115,12 +115,31 @@ func (m *ManifestInfo) fileNameParser(v string) error { // ManifestContent is structure containing information about the backup. type ManifestContent struct { - Version string `json:"version"` - ClusterName string `json:"cluster_name"` - IP string `json:"ip"` - Size int64 `json:"size"` - Tokens []int64 `json:"tokens"` - Schema string `json:"schema"` + Version string `json:"version"` + ClusterName string `json:"cluster_name"` + IP string `json:"ip"` + Size int64 `json:"size"` + Tokens []int64 `json:"tokens"` + Schema string `json:"schema"` + Rack string `json:"rack"` + ShardCount int `json:"shard_count"` + CPUCount int `json:"cpu_count"` + StorageSize uint64 `json:"storage_size"` + InstanceDetails InstanceDetails `json:"instance_details"` + + // Fields below are also present in the manifest file path. + DC string `json:"dc"` + ClusterID uuid.UUID `json:"cluster_id"` + NodeID string `json:"node_id"` + TaskID uuid.UUID `json:"task_id"` + SnapshotTag string `json:"snapshot_tag"` +} + +// InstanceDetails extends backup manifest with additional instance details. +// Mainly needed for 1-to-1 restore. 
+type InstanceDetails struct { + CloudProvider string `json:"cloud_provider,omitempty"` + InstanceType string `json:"instance_type,omitempty"` } // ManifestContentWithIndex is structure containing information about the backup diff --git a/pkg/service/backup/service_backup_integration_test.go b/pkg/service/backup/service_backup_integration_test.go index 8cd5d4485f..443bb94c67 100644 --- a/pkg/service/backup/service_backup_integration_test.go +++ b/pkg/service/backup/service_backup_integration_test.go @@ -10,6 +10,7 @@ import ( "context" "encoding/json" "fmt" + "io" "math/rand" "net/http" "os" @@ -28,6 +29,7 @@ import ( "github.com/scylladb/gocqlx/v2" "github.com/scylladb/scylla-manager/v3/pkg/service/cluster" "github.com/scylladb/scylla-manager/v3/pkg/util" + "github.com/scylladb/scylla-manager/v3/swagger/gen/agent/models" "go.uber.org/atomic" "go.uber.org/zap/zapcore" @@ -643,6 +645,27 @@ func TestBackupSmokeIntegration(t *testing.T) { t.Fatal(err) } + // Mocking /cloud/metadata endpoint as it's not expected to work reliably on ci. + // But with mock we can at least check that response is used correctly and saved to manifest file. 
+ h.Hrt.SetInterceptor(httpx.RoundTripperFunc(func(req *http.Request) (*http.Response, error) { + if req.URL.Path != "/agent/cloud/metadata" { + return nil, nil + } + + metaMock := models.InstanceMetadata{ + CloudProvider: "test_provider", + InstanceType: "test_instance_type", + } + + metaMockBytes, err := json.Marshal(metaMock) + if err != nil { + t.Fatal(err) + } + resp := httpx.MakeResponse(req, http.StatusOK) + resp.Body = io.NopCloser(bytes.NewReader(metaMockBytes)) + return resp, nil + })) + Print("When: run backup") if err := h.service.Backup(ctx, h.ClusterID, h.TaskID, h.RunID, target); err != nil { t.Fatal(err) } @@ -836,6 +859,55 @@ func assertManifestHasCorrectFormat(t *testing.T, ctx context.Context, h *backup if !strset.New(schemas...).Has(mc.Schema) { t.Errorf("Schema=%s, not found in schemas %s", mc.Schema, schemas) } + + var infoFromPath ManifestInfo + if err := infoFromPath.ParsePath(manifestPath); err != nil { + t.Fatal("manifest file in wrong path", manifestPath) + } + + if mc.DC != infoFromPath.DC { + t.Errorf("DC=%s, expected %s", mc.DC, infoFromPath.DC) + } + + if mc.ClusterID.String() != infoFromPath.ClusterID.String() { + t.Errorf("ClusterID=%s, expected %s", mc.ClusterID, infoFromPath.ClusterID) + } + + if mc.NodeID != infoFromPath.NodeID { + t.Errorf("NodeID=%s, expected %s", mc.NodeID, infoFromPath.NodeID) + } + + if mc.TaskID.String() != infoFromPath.TaskID.String() { + t.Errorf("TaskID=%s, expected %s", mc.TaskID, infoFromPath.TaskID) + } + + if mc.SnapshotTag != infoFromPath.SnapshotTag { + t.Errorf("SnapshotTag=%s, expected %s", mc.SnapshotTag, infoFromPath.SnapshotTag) + } + + if mc.Rack != "rack1" { + t.Errorf("Rack=%s, expected rack1", mc.Rack) + } + + if mc.ShardCount == 0 { + t.Errorf("ShardCount=0, expected > 0") + } + + if mc.CPUCount == 0 { + t.Errorf("CPUCount=0, expected > 0") + } + + if mc.StorageSize == 0 { + t.Errorf("StorageSize=0, expected > 0") + } + + if mc.InstanceDetails.InstanceType != "test_instance_type" { + 
t.Errorf("InstanceDetails.InstanceType=%s, expected test_instance_type", mc.InstanceDetails.InstanceType) + } + + if mc.InstanceDetails.CloudProvider != "test_provider" { + t.Errorf("InstanceDetails.CloudProvider=%s, expected test_provider", mc.InstanceDetails.CloudProvider) + } } func TestBackupWithNodesDownIntegration(t *testing.T) { diff --git a/pkg/service/backup/worker_manifest.go b/pkg/service/backup/worker_manifest.go index 1e121a4b17..ce3d23d0b1 100644 --- a/pkg/service/backup/worker_manifest.go +++ b/pkg/service/backup/worker_manifest.go @@ -45,11 +45,14 @@ func (w *worker) createAndUploadHostManifest(ctx context.Context, h hostInfo) er return err } - m := w.createTemporaryManifest(h, tokens) + m, err := w.createTemporaryManifest(ctx, h, tokens) + if err != nil { + return errors.Wrap(err, "create temp manifest") + } return w.uploadHostManifest(ctx, h, m) } -func (w *worker) createTemporaryManifest(h hostInfo, tokens []int64) ManifestInfoWithContent { +func (w *worker) createTemporaryManifest(ctx context.Context, h hostInfo, tokens []int64) (ManifestInfoWithContent, error) { m := &ManifestInfo{ Location: h.Location, DC: h.DC, @@ -65,9 +68,14 @@ func (w *worker) createTemporaryManifest(h hostInfo, tokens []int64) ManifestInf c := &ManifestContentWithIndex{ ManifestContent: ManifestContent{ Version: "v2", - ClusterName: w.ClusterName, IP: h.IP, Tokens: tokens, + ClusterName: w.ClusterName, + DC: h.DC, + ClusterID: w.ClusterID, + NodeID: h.ID, + TaskID: w.TaskID, + SnapshotTag: w.SnapshotTag, }, Index: make([]FilesMeta, len(dirs)), } @@ -88,10 +96,36 @@ func (w *worker) createTemporaryManifest(h hostInfo, tokens []int64) ManifestInf c.Size += d.Progress.Size } + rack, err := w.Client.HostRack(ctx, h.IP) + if err != nil { + return ManifestInfoWithContent{}, errors.Wrap(err, "client.HostRack") + } + c.Rack = rack + + shardCount, err := w.Client.ShardCount(ctx, h.IP) + if err != nil { + return ManifestInfoWithContent{}, errors.Wrap(err, "client.ShardCount") + } + 
c.ShardCount = int(shardCount) + + // TODO: candidate for #3892 (but only after #4181 gets fixed...). + nodeInfo, err := w.Client.NodeInfo(ctx, h.IP) + if err != nil { + return ManifestInfoWithContent{}, errors.Wrap(err, "client.NodeInfo") + } + c.CPUCount = int(nodeInfo.CPUCount) + c.StorageSize = nodeInfo.StorageSize + + instanceMeta, err := w.Client.CloudMetadata(ctx, h.IP) + if err != nil { + return ManifestInfoWithContent{}, errors.Wrap(err, "client.CloudMetadata") + } + c.InstanceDetails = InstanceDetails(instanceMeta) + return ManifestInfoWithContent{ ManifestInfo: m, ManifestContentWithIndex: c, - } + }, nil } func (w *worker) uploadHostManifest(ctx context.Context, h hostInfo, m ManifestInfoWithContent) error {