Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Mattermost debug data gathering #1071

Merged
merged 3 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions cmd/cloud/cluster_installation.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ package main

import (
"fmt"
"os"
"strings"
"time"

"github.com/mattermost/mattermost-cloud/model"
"github.com/pkg/errors"
Expand All @@ -25,6 +27,7 @@ func newCmdClusterInstallation() *cobra.Command {
cmd.AddCommand(newCmdClusterInstallationStatus())
cmd.AddCommand(newCmdClusterInstallationMMCTL())
cmd.AddCommand(newCmdClusterInstallationMattermostCLI())
cmd.AddCommand(newCmdClusterInstallationPPROF())
cmd.AddCommand(newCmdClusterInstallationMigration())

return cmd
Expand Down Expand Up @@ -293,6 +296,44 @@ func newCmdClusterInstallationMattermostCLI() *cobra.Command {
return cmd
}

// newCmdClusterInstallationPPROF builds the "pprof" subcommand. When run, it
// requests pprof debug data for the cluster installation named by the
// --cluster-installation flag and writes the returned bytes to a timestamped
// "<id>.<time>.prof.zip" file, using a relative path (i.e. the current working
// directory).
func newCmdClusterInstallationPPROF() *cobra.Command {
	var flags clusterInstallationPPROFFlags

	cmd := &cobra.Command{
		Use:   "pprof",
		Short: "Gather pprof data from a cluster installation",
		RunE: func(command *cobra.Command, args []string) error {
			command.SilenceUsage = true

			client := createClient(flags.clusterFlags)

			output, err := client.ExecClusterInstallationPPROF(flags.clusterInstallationID)
			if err != nil {
				return errors.Wrap(err, "failed to gather pprof data")
			}
			// err is nil at this point, and errors.Wrap(nil, ...) returns nil,
			// so a fresh error must be created to actually report the failure.
			if len(output) == 0 {
				return errors.New("no debug data returned")
			}

			filename := fmt.Sprintf("%s.%s.prof.zip", flags.clusterInstallationID, time.Now().Format("2006-01-02.15-04-05.MST"))
			err = os.WriteFile(filename, output, 0644)
			if err != nil {
				return errors.Wrap(err, "failed to save debug zip")
			}

			fmt.Printf("Debug data saved to %s\n", filename)

			return nil
		},
		// The embedded cluster flags are handled in PreRun rather than at
		// construction time; see clusterFlags.addFlags for what that does.
		PreRun: func(cmd *cobra.Command, args []string) {
			flags.clusterFlags.addFlags(cmd)
		},
	}
	flags.addFlags(cmd)

	return cmd
}

func newCmdClusterInstallationMigration() *cobra.Command {
var flags clusterInstallationMigrationFlags

Expand Down
11 changes: 11 additions & 0 deletions cmd/cloud/cluster_installation_flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,17 @@ func (flags *clusterInstallationMattermostCLIFlags) addFlags(command *cobra.Comm
_ = command.MarkFlagRequired("command")
}

// clusterInstallationPPROFFlags holds the flag values used by the cluster
// installation pprof command.
type clusterInstallationPPROFFlags struct {
clusterFlags
// clusterInstallationID is bound to the required --cluster-installation flag.
clusterInstallationID string
}

// addFlags registers the --cluster-installation string flag on command, storing
// its value in flags.clusterInstallationID, and marks that flag as required.
func (flags *clusterInstallationPPROFFlags) addFlags(command *cobra.Command) {
	const flagName = "cluster-installation"

	flagSet := command.Flags()
	flagSet.StringVar(&flags.clusterInstallationID, flagName, "", "The id of the cluster installation.")

	// The result is deliberately discarded, matching the other addFlags
	// helpers in this file; the flag was registered on the line above.
	_ = command.MarkFlagRequired(flagName)
}

type clusterInstallationMigrationFlags struct {
clusterFlags
installation string
Expand Down
94 changes: 94 additions & 0 deletions internal/api/cluster_installation.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
package api

import (
"archive/zip"
"fmt"
"net/http"
"os"
"path"

"github.com/gorilla/mux"
"github.com/mattermost/mattermost-cloud/internal/common"
Expand Down Expand Up @@ -35,6 +38,7 @@ func initClusterInstallation(apiRouter *mux.Router, context *Context) {
clusterInstallationRouter.Handle("/config", addContext(handleSetClusterInstallationConfig)).Methods("PUT")
clusterInstallationRouter.Handle("/exec/{command}", addContext(handleRunClusterInstallationExecCommand)).Methods("POST")
clusterInstallationRouter.Handle("/mattermost_cli", addContext(handleRunClusterInstallationMattermostCLI)).Methods("POST")
clusterInstallationRouter.Handle("/pprof", addContext(handleRunClusterInstallationGetPPROF)).Methods("GET")
clusterInstallationRouter.Handle("/status", addContext(handleGetClusterInstallationStatus)).Methods("GET")
}

Expand Down Expand Up @@ -315,6 +319,96 @@ func handleRunClusterInstallationExecCommand(c *Context, w http.ResponseWriter,
w.Write(output)
}

// handleRunClusterInstallationGetPPROF responds to POST /api/cluster_installation/{cluster_installation}/pprof,
// running pprof commands on all pods and returning the output as a dubug zip file.
func handleRunClusterInstallationGetPPROF(c *Context, w http.ResponseWriter, r *http.Request) {
vars := mux.Vars(r)
clusterInstallationID := vars["cluster_installation"]
c.Logger = c.Logger.WithField("cluster_installation", clusterInstallationID)

clusterInstallation, err := c.Store.GetClusterInstallation(clusterInstallationID)
if err != nil {
c.Logger.WithError(err).Error("failed to query cluster installation")
w.WriteHeader(http.StatusInternalServerError)
return
}
if clusterInstallation == nil {
c.Logger.Error("cluster installation not found")
w.WriteHeader(http.StatusNotFound)
return
}
if clusterInstallation.IsDeleted() {
c.Logger.Error("cluster installation is deleted")
w.WriteHeader(http.StatusGone)
return
}

if clusterInstallation.APISecurityLock {
logSecurityLockConflict("cluster-installation", c.Logger)
w.WriteHeader(http.StatusForbidden)
return
}

cluster, err := c.Store.GetCluster(clusterInstallation.ClusterID)
if err != nil {
c.Logger.WithError(err).Error("failed to query cluster")
w.WriteHeader(http.StatusInternalServerError)
return
}
if cluster == nil {
c.Logger.Errorf("failed to find cluster %s associated with cluster installation", clusterInstallation.ClusterID)
w.WriteHeader(http.StatusInternalServerError)
return
}

gabrieljackson marked this conversation as resolved.
Show resolved Hide resolved
debugData, execErr, err := c.Provisioner.ExecClusterInstallationPPROF(cluster, clusterInstallation)
if err != nil {
c.Logger.WithError(err).Error("failed to prepare command execution")
w.WriteHeader(http.StatusInternalServerError)
return
}
if execErr != nil {
c.Logger.WithError(execErr).Error("failed to execute command")
w.WriteHeader(http.StatusConflict)
return
}

// Create a temporary zipfile which will be cleaned up after being sent.
tempDir, err := os.MkdirTemp("", "pprof-")
if err != nil {
c.Logger.WithError(err).Error("failed to create temporary pprof directory")
w.WriteHeader(http.StatusInternalServerError)
return
}
defer os.RemoveAll(tempDir)

tempZipPath := path.Join(tempDir, fmt.Sprintf("%s.tempprof.zip", clusterInstallationID))
tempZipFile, err := os.Create(tempZipPath)
if err != nil {
c.Logger.WithError(err).Error("failed to create temporary pprof zip file")
w.WriteHeader(http.StatusInternalServerError)
return
}

zipFileWriter := zip.NewWriter(tempZipFile)
err = populateZipfile(zipFileWriter, debugData.ToFileData())
if err != nil {
c.Logger.WithError(err).Error("failed to populate temporary pprof zip file")
w.WriteHeader(http.StatusInternalServerError)
return
}

debugBytes, err := os.ReadFile(tempZipPath)
if err != nil {
c.Logger.WithError(err).Error("failed to read temporary pprof zip file")
w.WriteHeader(http.StatusInternalServerError)
return
}

w.WriteHeader(http.StatusOK)
w.Write(debugBytes)
}

// handleRunClusterInstallationMattermostCLI responds to POST /api/cluster_installation/{cluster_installation}/mattermost_cli, running a Mattermost CLI command and returning any output.
// TODO: deprecate or refactor into /exec/command endpoint
func handleRunClusterInstallationMattermostCLI(c *Context, w http.ResponseWriter, r *http.Request) {
Expand Down
82 changes: 81 additions & 1 deletion internal/api/cluster_installation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,86 @@ func TestRunClusterInstallationMattermostCLI(t *testing.T) {
})
}

// TestRunClusterInstallationGetPPROF drives the cluster installation pprof
// endpoint through the API client against a test server backed by a mock
// provisioner. The subtests share one cluster installation and mutate it
// (lock/unlock, then delete), so their order matters.
func TestRunClusterInstallationGetPPROF(t *testing.T) {
	logger := testlib.MakeLogger(t)
	sqlStore := store.MakeTestSQLStore(t, logger)

	// Debug data for three fake pods, each with random profile payloads.
	provisioner := &mockProvisioner{
		DebugData: model.ClusterInstallationDebugData{
			{
				Name:          "pod1",
				HeapProf:      []byte(model.NewID()),
				GoroutineProf: []byte(model.NewID()),
			},
			{
				Name:          "pod2",
				HeapProf:      []byte(model.NewID()),
				GoroutineProf: []byte(model.NewID()),
			},
			{
				Name:          "pod3",
				HeapProf:      []byte(model.NewID()),
				GoroutineProf: []byte(model.NewID()),
			},
		},
	}

	apiContext := &api.Context{
		Store:       sqlStore,
		Supervisor:  &mockSupervisor{},
		Provisioner: provisioner,
		Metrics:     &mockMetrics{},
		Logger:      logger,
	}
	router := mux.NewRouter()
	api.Register(router, apiContext)

	server := httptest.NewServer(router)
	defer server.Close()

	client := model.NewClient(server.URL)

	cluster := &model.Cluster{}
	require.NoError(t, sqlStore.CreateCluster(cluster, nil))

	clusterInstallation1 := &model.ClusterInstallation{
		ClusterID:      cluster.ID,
		InstallationID: model.NewID(),
	}
	require.NoError(t, sqlStore.CreateClusterInstallation(clusterInstallation1))

	t.Run("success", func(t *testing.T) {
		zipData, execErr := client.ExecClusterInstallationPPROF(clusterInstallation1.ID)
		require.NoError(t, execErr)
		require.NotEmpty(t, zipData)
	})

	t.Run("unknown cluster installation", func(t *testing.T) {
		zipData, execErr := client.ExecClusterInstallationPPROF(model.NewID())
		require.EqualError(t, execErr, "failed with status code 404")
		require.Empty(t, zipData)
	})

	t.Run("while api-security-locked", func(t *testing.T) {
		require.NoError(t, sqlStore.LockClusterInstallationAPI(clusterInstallation1.ID))

		zipData, execErr := client.ExecClusterInstallationPPROF(clusterInstallation1.ID)
		require.EqualError(t, execErr, "failed with status code 403")
		require.Empty(t, zipData)

		// Unlock again so the following subtest sees an unlocked installation.
		require.NoError(t, sqlStore.UnlockClusterInstallationAPI(clusterInstallation1.ID))
	})

	t.Run("cluster installation deleted", func(t *testing.T) {
		require.NoError(t, sqlStore.DeleteClusterInstallation(clusterInstallation1.ID))

		zipData, execErr := client.ExecClusterInstallationPPROF(clusterInstallation1.ID)
		require.Error(t, execErr)
		require.Empty(t, zipData)
	})
}

func TestMigrateClusterInstallations(t *testing.T) {
logger := testlib.MakeLogger(t)
sqlStore := store.MakeTestSQLStore(t, logger)
Expand Down Expand Up @@ -1189,8 +1269,8 @@ func TestMigrateDNSForNonHibernatingInstallation(t *testing.T) {
_, err = client.MigrateDNS(&model.MigrateClusterInstallationRequest{InstallationID: "", SourceClusterID: sourceCluster.ID, TargetClusterID: targetCluster.ID, DNSSwitch: true, LockInstallation: true})
require.EqualError(t, err, "failed with status code 404")
})

}

func TestDeleteInActiveClusterInstallationsByCluster(t *testing.T) {
logger := testlib.MakeLogger(t)
sqlStore := store.MakeTestSQLStore(t, logger)
Expand Down
5 changes: 5 additions & 0 deletions internal/api/common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ func (m *mockMetrics) ObserveAPIEndpointDuration(handler, method string, statusC

// mockProvisioner is a test double for the API provisioner; its methods hand
// back the values stored in these fields.
type mockProvisioner struct {
// Output is the byte output returned by the exec-style methods (e.g. ExecMattermostCLI).
Output []byte
// DebugData is the value returned by ExecClusterInstallationPPROF.
DebugData model.ClusterInstallationDebugData
// ExecError is returned as the second result of ExecClusterInstallationPPROF.
ExecError error
// CommandError is returned as the final error result of the exec-style methods.
CommandError error
}
Expand Down Expand Up @@ -67,6 +68,10 @@ func (s *mockProvisioner) ExecMattermostCLI(*model.Cluster, *model.ClusterInstal
return s.Output, s.CommandError
}

// ExecClusterInstallationPPROF ignores its arguments and returns the mock's
// configured DebugData, ExecError and CommandError, in that order.
func (s *mockProvisioner) ExecClusterInstallationPPROF(*model.Cluster, *model.ClusterInstallation) (model.ClusterInstallationDebugData, error, error) {
return s.DebugData, s.ExecError, s.CommandError
}

func (s *mockProvisioner) GetClusterResources(*model.Cluster, bool, log.FieldLogger) (*k8s.ClusterResources, error) {
return nil, nil
}
1 change: 1 addition & 0 deletions internal/api/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ type Provisioner interface {
ExecClusterInstallationCLI(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation, args ...string) ([]byte, error, error)
ExecMMCTL(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation, args ...string) ([]byte, error)
ExecMattermostCLI(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation, args ...string) ([]byte, error)
ExecClusterInstallationPPROF(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation) (model.ClusterInstallationDebugData, error, error)
GetClusterInstallationStatus(cluster *model.Cluster, clusterInstallation *model.ClusterInstallation) (*model.ClusterInstallationStatus, error)
}

Expand Down
23 changes: 23 additions & 0 deletions internal/api/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
package api

import (
"archive/zip"
"net/url"
"strconv"
"time"

"github.com/mattermost/mattermost-cloud/model"

Expand Down Expand Up @@ -105,3 +107,24 @@ func parseDeletionLocked(u *url.URL) (*bool, error) {

return &locked, nil
}

// populateZipfile writes every entry of fileDatas into w as a deflate-compressed
// file named fd.Filename with contents fd.Body, then closes w.
//
// w is always closed before returning, so callers must not close it again.
// The returned error is the first failure encountered: creating an entry,
// writing its body, or closing the writer. The close error matters because
// Close is what writes the zip central directory; dropping it would let a
// truncated archive be reported as success.
func populateZipfile(w *zip.Writer, fileDatas []model.FileData) (err error) {
	defer func() {
		closeErr := w.Close()
		if err == nil {
			err = closeErr
		}
	}()

	// Use one timestamp so all entries in the archive share a modified time.
	modified := time.Now()

	for _, fd := range fileDatas {
		entry, createErr := w.CreateHeader(&zip.FileHeader{
			Name:     fd.Filename,
			Method:   zip.Deflate,
			Modified: modified,
		})
		if createErr != nil {
			return createErr
		}

		if _, writeErr := entry.Write(fd.Body); writeErr != nil {
			return writeErr
		}
	}

	return nil
}
Loading
Loading