Skip to content

Commit

Permalink
Improve error handling when cannot connect to target(s) and when targ…
Browse files Browse the repository at this point in the history
…et doesn't meet requirements for metrics command (#95)

* handle connection errors

* confirm metric targets have same architecture

* metrics error msg when target not supported due to failure to read events
  • Loading branch information
harp-intel authored Nov 18, 2024
1 parent cea353d commit 6b0fa55
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 81 deletions.
11 changes: 10 additions & 1 deletion cmd/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,13 +207,22 @@ func runCmd(cmd *cobra.Command, args []string) error {
appContext := cmd.Context().Value(common.AppContext{}).(common.AppContext)
localTempDir := appContext.TempDir
// get the targets
myTargets, err := common.GetTargets(cmd, true, true, localTempDir)
myTargets, targetErrs, err := common.GetTargets(cmd, true, true, localTempDir)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
// check for errors in target connections
for _, err := range targetErrs {
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
}
if len(myTargets) == 0 {
err := fmt.Errorf("no targets specified")
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
Expand Down
15 changes: 6 additions & 9 deletions cmd/metrics/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,10 @@ func LoadMetadata(myTarget target.Target, noRoot bool, perfPath string, localTem
var err error
var output string
if metadata.SupportsFixedCycles, output, err = getSupportsFixedEvent(myTarget, "cpu-cycles", cpu.MicroArchitecture, noRoot, perfPath, localTempDir); err != nil {
err = fmt.Errorf("failed to determine if fixed-counter cpu-cycles is supported: %v", err)
err = fmt.Errorf("failed to determine if fixed-counter 'cpu-cycles' is supported: %v", err)
} else {
if !metadata.SupportsFixedCycles {
slog.Warn("Fixed-counter cpu-cycles events not supported", slog.String("output", output))
slog.Warn("Fixed-counter 'cpu-cycles' events not supported", slog.String("output", output))
}
}
slowFuncChannel <- err
Expand All @@ -153,10 +153,10 @@ func LoadMetadata(myTarget target.Target, noRoot bool, perfPath string, localTem
var err error
var output string
if metadata.SupportsFixedInstructions, output, err = getSupportsFixedEvent(myTarget, "instructions", cpu.MicroArchitecture, noRoot, perfPath, localTempDir); err != nil {
err = fmt.Errorf("failed to determine if fixed-counter instructions is supported: %v", err)
err = fmt.Errorf("failed to determine if fixed-counter 'instructions' is supported: %v", err)
} else {
if !metadata.SupportsFixedInstructions {
slog.Warn("Fixed-counter instructions events not supported", slog.String("output", output))
slog.Warn("Fixed-counter 'instructions' events not supported", slog.String("output", output))
}
}
slowFuncChannel <- err
Expand All @@ -170,11 +170,8 @@ func LoadMetadata(myTarget target.Target, noRoot bool, perfPath string, localTem
errs = append(errs, <-slowFuncChannel)
for _, errInside := range errs {
if errInside != nil {
if err == nil {
err = errInside
} else {
err = fmt.Errorf("%v : %v", err, errInside)
}
slog.Error("error loading metadata", slog.String("error", errInside.Error()), slog.String("target", myTarget.GetName()))
err = fmt.Errorf("target not supported, see log for details")
}
}
}()
Expand Down
121 changes: 83 additions & 38 deletions cmd/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -472,16 +472,17 @@ func validateFlags(cmd *cobra.Command, args []string) error {
}

type targetContext struct {
target target.Target
err error
perfPath string
tempDir string
metadata Metadata
nmiDisabled bool
perfMuxIntervals map[string]int
groupDefinitions []GroupDefinition
metricDefinitions []MetricDefinition
printedFiles []string
target target.Target
err error
perfPath string
tempDir string
metadata Metadata
nmiDisabled bool
perfMuxIntervalsSet bool
perfMuxIntervals map[string]int
groupDefinitions []GroupDefinition
metricDefinitions []MetricDefinition
printedFiles []string
}

type targetError struct {
Expand All @@ -494,7 +495,6 @@ func runCmd(cmd *cobra.Command, args []string) error {
appContext := cmd.Context().Value(common.AppContext{}).(common.AppContext)
localTempDir := appContext.TempDir
localOutputDir := appContext.OutputDir

// handle signals
// child processes will exit when the signals are received which will
// allow this app to exit normally
Expand All @@ -505,7 +505,6 @@ func runCmd(cmd *cobra.Command, args []string) error {
setSignalReceived()
slog.Info("received signal", slog.String("signal", sig.String()))
}()

// round up to next perfPrintInterval second (the collection interval used by perf stat)
if flagDuration != 0 {
qf := float64(flagDuration) / float64(flagPerfPrintInterval)
Expand All @@ -514,32 +513,22 @@ func runCmd(cmd *cobra.Command, args []string) error {
flagDuration = (qi + 1) * flagPerfPrintInterval
}
}

// get the targets
myTargets, err := common.GetTargets(cmd, !flagNoRoot, !flagNoRoot, localTempDir)
myTargets, targetErrs, err := common.GetTargets(cmd, !flagNoRoot, !flagNoRoot, localTempDir)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
if len(myTargets) == 0 {
err := fmt.Errorf("no targets specified")
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
targetTempRoot, _ := cmd.Flags().GetString(common.FlagTargetTempDirName)

// check for live mode with multiple targets
if flagLive && len(myTargets) > 1 {
err := fmt.Errorf("live mode is only supported for a single target")
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}

// create progress spinner
multiSpinner := progress.NewMultiSpinner()
for _, myTarget := range myTargets {
Expand All @@ -553,7 +542,48 @@ func runCmd(cmd *cobra.Command, args []string) error {
}
multiSpinner.Start()
defer multiSpinner.Finish()

// check for errors in target creation
for i := range targetErrs {
if targetErrs[i] != nil {
_ = multiSpinner.Status(myTargets[i].GetName(), fmt.Sprintf("Error: %v", targetErrs[i]))
// remove target from targets list
myTargets = append(myTargets[:i], myTargets[i+1:]...)
}
}
// check if any targets remain
if len(myTargets) == 0 {
err := fmt.Errorf("no targets specified")
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
// check if all targets have the same architecture
for _, target := range myTargets {
tArch, err := target.GetArchitecture()
if err != nil {
err = fmt.Errorf("failed to get architecture: %w", err)
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
tArch0, err := myTargets[0].GetArchitecture()
if err != nil {
err = fmt.Errorf("failed to get architecture: %w", err)
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
if tArch != tArch0 {
err := fmt.Errorf("all targets must have the same architecture")
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
}
// extract perf into local temp directory (assumes all targets have the same architecture)
localPerfPath, err := extractPerf(myTargets[0], localTempDir)
if err != nil {
Expand All @@ -562,23 +592,26 @@ func runCmd(cmd *cobra.Command, args []string) error {
cmd.SilenceUsage = true
return err
}

// prepare the targets
channelTargetError := make(chan targetError)
var targetContexts []targetContext
for _, myTarget := range myTargets {
targetContexts = append(targetContexts, targetContext{target: myTarget})
}
targetTempRoot, _ := cmd.Flags().GetString(common.FlagTargetTempDirName)
for i := range targetContexts {
go prepareTarget(&targetContexts[i], targetTempRoot, localTempDir, localPerfPath, channelTargetError, multiSpinner.Status)
}
// wait for all targets to be prepared
numPreparedTargets := 0
for range targetContexts {
targetError := <-channelTargetError
if targetError.err != nil {
slog.Error("failed to prepare target", slog.String("target", targetError.target.GetName()), slog.String("error", targetError.err.Error()))
} else {
numPreparedTargets++
}
}

// schedule temporary directory cleanup
if cmd.Parent().PersistentFlags().Lookup("debug").Value.String() != "true" { // don't remove the directory if we're debugging
defer func() {
Expand All @@ -592,7 +625,6 @@ func runCmd(cmd *cobra.Command, args []string) error {
}
}()
}

// schedule NMI watchdog reset
defer func() {
for _, targetContext := range targetContexts {
Expand All @@ -604,28 +636,44 @@ func runCmd(cmd *cobra.Command, args []string) error {
}
}
}()

// schedule mux interval reset
defer func() {
for _, targetContext := range targetContexts {
err := SetMuxIntervals(targetContext.target, targetContext.perfMuxIntervals, localTempDir)
if err != nil {
slog.Error("failed to reset perf mux intervals", slog.String("target", targetContext.target.GetName()), slog.String("error", err.Error()))
if targetContext.perfMuxIntervalsSet {
err := SetMuxIntervals(targetContext.target, targetContext.perfMuxIntervals, localTempDir)
if err != nil {
slog.Error("failed to reset perf mux intervals", slog.String("target", targetContext.target.GetName()), slog.String("error", err.Error()))
}
}
}
}()

// check if any targets were successfully prepared
if numPreparedTargets == 0 {
err := fmt.Errorf("no targets were successfully prepared")
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
// prepare the metrics for each target
for i := range targetContexts {
go prepareMetrics(&targetContexts[i], localTempDir, channelTargetError, multiSpinner.Status)
}
// wait for all metrics to be prepared
numTargetsWithPreparedMetrics := 0
for range targetContexts {
targetError := <-channelTargetError
if targetError.err != nil {
slog.Error("failed to prepare metrics", slog.String("target", targetError.target.GetName()), slog.String("error", targetError.err.Error()))
} else {
numTargetsWithPreparedMetrics++
}
}

if numTargetsWithPreparedMetrics == 0 {
err := fmt.Errorf("no targets had metrics prepared")
slog.Error(err.Error())
cmd.SilenceUsage = true
return err
}
// show metric names and exit, if requested
if flagShowMetricNames {
// stop the multiSpinner
Expand All @@ -638,7 +686,6 @@ func runCmd(cmd *cobra.Command, args []string) error {
}
return nil
}

// create the local output directory
if !flagLive {
err = common.CreateOutputDir(localOutputDir)
Expand All @@ -649,7 +696,6 @@ func runCmd(cmd *cobra.Command, args []string) error {
return err
}
}

// write metadata to file
if flagWriteEventsToFile {
for _, targetContext := range targetContexts {
Expand All @@ -661,7 +707,6 @@ func runCmd(cmd *cobra.Command, args []string) error {
}
}
}

// start the metric collection
for i := range targetContexts {
if targetContexts[i].err == nil {
Expand Down Expand Up @@ -796,6 +841,7 @@ func prepareTarget(targetContext *targetContext, targetTempRoot string, localTem
channelError <- targetError{target: myTarget, err: err}
return
}
targetContext.perfMuxIntervalsSet = true
}
// get the full path to the perf binary
if targetContext.perfPath, err = getPerfPath(myTarget, localPerfPath); err != nil {
Expand All @@ -819,7 +865,6 @@ func prepareMetrics(targetContext *targetContext, localTempDir string, channelEr
_ = statusUpdate(myTarget.GetName(), "collecting metadata")
var err error
if targetContext.metadata, err = LoadMetadata(myTarget, flagNoRoot, targetContext.perfPath, localTempDir); err != nil {
err = fmt.Errorf("failed to load metadata: %w", err)
_ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error()))
targetContext.err = err
channelError <- targetError{target: myTarget, err: err}
Expand Down
25 changes: 17 additions & 8 deletions internal/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,20 +148,13 @@ func (rc *ReportingCommand) Run() error {
}
}
// get the targets
myTargets, err := GetTargets(rc.Cmd, elevated, false, localTempDir)
myTargets, targetErrs, err := GetTargets(rc.Cmd, elevated, false, localTempDir)
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
rc.Cmd.SilenceUsage = true
return err
}
if len(myTargets) == 0 {
err := fmt.Errorf("no targets specified")
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
rc.Cmd.SilenceUsage = true
return err
}
// setup and start the progress indicator
multiSpinner := progress.NewMultiSpinner()
for _, target := range myTargets {
Expand All @@ -174,6 +167,22 @@ func (rc *ReportingCommand) Run() error {
}
}
multiSpinner.Start()
// check for errors in target creation
for i := range targetErrs {
if targetErrs[i] != nil {
_ = multiSpinner.Status(myTargets[i].GetName(), fmt.Sprintf("Error: %v", targetErrs[i]))
// remove target from targets list
myTargets = append(myTargets[:i], myTargets[i+1:]...)
}
}
// check if we have any targets to run the scripts on
if len(myTargets) == 0 {
err := fmt.Errorf("no targets specified")
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
slog.Error(err.Error())
rc.Cmd.SilenceUsage = true
return err
}
// run the scripts on the targets
channelTargetScriptOutputs := make(chan TargetScriptOutputs)
channelError := make(chan error)
Expand Down
Loading

0 comments on commit 6b0fa55

Please sign in to comment.