Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support for GCP C4 instances #134

Merged
merged 1 commit into from
Dec 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 20 additions & 9 deletions cmd/metrics/event_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func LoadEventGroups(eventDefinitionOverridePath string, metadata Metadata) (gro
uarch := strings.ToLower(strings.Split(metadata.Microarchitecture, "_")[0])
// use alternate events/metrics when TMA fixed counters are not supported
alternate := ""
if (uarch == "icx" || uarch == "spr" || uarch == "emr") && !metadata.SupportsFixedTMA {
if (uarch == "icx" || uarch == "spr" || uarch == "emr") && !metadata.SupportsFixedTMA { // AWS VM instances
alternate = "_nofixedtma"
}
eventFileName := fmt.Sprintf("%s%s.txt", uarch, alternate)
Expand Down Expand Up @@ -132,20 +132,32 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool {
slog.Debug("Fixed counter TMA not supported on target", slog.String("event", event.Name))
return false
}
// short-circuit for cpu events
if event.Device == "cpu" && !strings.HasPrefix(event.Name, "OCR") {
// PEBS events (not supported on GCP c4 VMs)
pebsEventNames := []string{"INT_MISC.UNKNOWN_BRANCH_CYCLES", "UOPS_RETIRED.MS"}
if !metadata.SupportsPEBS && util.StringInList(event.Name, pebsEventNames) {
slog.Debug("PEBS events not supported on target", slog.String("event", event.Name))
return false
}
// short-circuit for cpu events that aren't off-core response events
if event.Device == "cpu" && !(strings.HasPrefix(event.Name, "OCR") || strings.HasPrefix(event.Name, "OFFCORE_REQUESTS_OUTSTANDING")) {
return true
}
// short-circuit off-core response events
if event.Device == "cpu" &&
strings.HasPrefix(event.Name, "OCR") &&
metadata.SupportsUncore {
if flagScope == scopeProcess || flagScope == scopeCgroup {
// off-core response events
if event.Device == "cpu" && (strings.HasPrefix(event.Name, "OCR") || strings.HasPrefix(event.Name, "OFFCORE_REQUESTS_OUTSTANDING")) {
if !metadata.SupportsOCR {
slog.Debug("Off-core response events not supported on target", slog.String("event", event.Name))
return false
} else if flagScope == scopeProcess || flagScope == scopeCgroup {
slog.Debug("Off-core response events not supported in process or cgroup scope", slog.String("event", event.Name))
return false
}
return true
}
// uncore events
if !metadata.SupportsUncore && strings.HasPrefix(event.Name, "UNC") {
slog.Debug("Uncore events not supported on target", slog.String("event", event.Name))
return false
}
// exclude uncore events when
// - their corresponding device is not found
// - not in system-wide collection scope
Expand Down Expand Up @@ -176,7 +188,6 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool {
slog.Debug("ref-cycles not supported on target", slog.String("event", event.Name))
return false
}

// no cstate and power events when collecting at process or cgroup scope
if (flagScope == scopeProcess || flagScope == scopeCgroup) &&
(strings.Contains(event.Name, "cstate_") || strings.Contains(event.Name, "power/energy")) {
Expand Down
22 changes: 17 additions & 5 deletions cmd/metrics/event_frame.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,13 @@ func parseEvents(rawEvents [][]byte, eventGroupDefinitions []GroupDefinition) (e
for _, rawEvent := range rawEvents {
var event Event
if event, err = parseEventJSON(rawEvent); err != nil {
err = fmt.Errorf("failed to parse perf event: %v", err)
return
if strings.Contains(err.Error(), "unrecognized event format") {
slog.Error(err.Error(), slog.String("event", string(rawEvent)))
return
} else {
slog.Warn(err.Error(), slog.String("event", string(rawEvent)))
event.Value = math.NaN()
}
}
if event.Event != previousEvent {
eventIdx++
Expand Down Expand Up @@ -347,10 +352,17 @@ func parseEventJSON(rawEvent []byte) (event Event, err error) {
err = fmt.Errorf("unrecognized event format: \"%s\"", rawEvent)
return
}
if event.CounterValue == "<not supported>" {
err = fmt.Errorf("event not supported: \"%s\"", rawEvent)
return
}
if event.CounterValue == "<not counted>" {
err = fmt.Errorf("event not counted: \"%s\"", rawEvent)
return
}
if event.Value, err = strconv.ParseFloat(event.CounterValue, 64); err != nil {
event.Value = math.NaN()
err = nil
slog.Debug("failed to parse event value", slog.String("event", string(rawEvent)))
err = fmt.Errorf("failed to parse event value as float: \"%s\"", rawEvent)
return
}
return
}
70 changes: 70 additions & 0 deletions cmd/metrics/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ type Metadata struct {
SupportsFixedTMA bool
SupportsRefCycles bool
SupportsUncore bool
SupportsPEBS bool
SupportsOCR bool
ThreadsPerCore int
TSC int
TSCFrequencyHz int
Expand Down Expand Up @@ -161,13 +163,41 @@ func LoadMetadata(myTarget target.Target, noRoot bool, perfPath string, localTem
}
slowFuncChannel <- err
}()
// PEBS
go func() {
var err error
var output string
if metadata.SupportsPEBS, output, err = getSupportsPEBS(myTarget, noRoot, perfPath, localTempDir); err != nil {
err = fmt.Errorf("failed to determine if 'PEBS' is supported: %v", err)
} else {
if !metadata.SupportsPEBS {
slog.Warn("'PEBS' events not supported", slog.String("output", output))
}
}
slowFuncChannel <- err
}()
// Offcore response
go func() {
var err error
var output string
if metadata.SupportsOCR, output, err = getSupportsOCR(myTarget, noRoot, perfPath, localTempDir); err != nil {
err = fmt.Errorf("failed to determine if 'OCR' is supported: %v", err)
} else {
if !metadata.SupportsOCR {
slog.Warn("'OCR' events not supported", slog.String("output", output))
}
}
slowFuncChannel <- err
}()
defer func() {
var errs []error
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
for _, errInside := range errs {
if errInside != nil {
slog.Error("error loading metadata", slog.String("error", errInside.Error()), slog.String("target", myTarget.GetName()))
Expand Down Expand Up @@ -218,6 +248,8 @@ func (md Metadata) String() string {
"Fixed TMA slot supported: %t, "+
"ref-cycles supported: %t, "+
"Uncore supported: %t, "+
"PEBS supported: %t, "+
"OCR supported: %t, "+
"PMU Driver version: %s, "+
"Kernel version: %s, ",
md.ModelName,
Expand All @@ -234,6 +266,8 @@ func (md Metadata) String() string {
md.SupportsFixedTMA,
md.SupportsRefCycles,
md.SupportsUncore,
md.SupportsPEBS,
md.SupportsOCR,
md.PMUDriverVersion,
md.KernelVersion)
for deviceName, deviceIds := range md.UncoreDeviceIDs {
Expand Down Expand Up @@ -355,6 +389,42 @@ func getSupportsRefCycles(myTarget target.Target, noRoot bool, perfPath string,
return
}

// getSupportsPEBS checks if PEBS events are supported on the target.
// On some VMs, e.g. GCP C4, PEBS events are not supported and perf returns '<not supported>'.
// Events that use MSR 0x3F7 are PEBS events. We use the INT_MISC.UNKNOWN_BRANCH_CYCLES event since
// it is a PEBS event that we used in EMR metrics.
//
// Parameters:
//   - myTarget: the target on which the perf probe is run
//   - noRoot: when true, the probe is run without superuser privileges
//   - perfPath: path to the perf binary on the target
//   - localTempDir: local temporary directory handed to the script runner
//
// Returns:
//   - supported: false when perf's stderr contains "<not supported>", true otherwise
//   - output: perf's stderr from the probe, so callers can log why support was not detected
//   - err: non-nil when the probe script could not be run
func getSupportsPEBS(myTarget target.Target, noRoot bool, perfPath string, localTempDir string) (supported bool, output string, err error) {
	scriptDef := script.ScriptDefinition{
		Name:      "perf stat pebs",
		Script:    perfPath + " stat -a -e cpu/event=0xad,umask=0x40,period=1000003,name='INT_MISC.UNKNOWN_BRANCH_CYCLES'/ sleep 1",
		Superuser: !noRoot,
	}
	scriptOutput, err := script.RunScript(myTarget, scriptDef, localTempDir)
	if err != nil {
		err = fmt.Errorf("failed to determine if pebs is supported: %s, %d, %w", scriptOutput.Stderr, scriptOutput.Exitcode, err)
		return
	}
	// perf stat writes its counter report to stderr; hand it back so the
	// caller's "not supported" warning carries the actual perf output
	// instead of an empty string.
	output = scriptOutput.Stderr
	supported = !strings.Contains(output, "<not supported>")
	return
}

// getSupportsOCR checks if offcore response (OCR) events are supported on the target.
// On some VMs, e.g. GCP C4, offcore response events are not supported and perf returns '<not supported>'.
// The probe counts the OCR.READS_TO_CORE.LOCAL_DRAM event system-wide for one second.
//
// Parameters:
//   - myTarget: the target on which the perf probe is run
//   - noRoot: when true, the probe is run without superuser privileges
//   - perfPath: path to the perf binary on the target
//   - localTempDir: local temporary directory handed to the script runner
//
// Returns:
//   - supported: false when perf's stderr contains "<not supported>", true otherwise
//   - output: perf's stderr from the probe, so callers can log why support was not detected
//   - err: non-nil when the probe script could not be run
func getSupportsOCR(myTarget target.Target, noRoot bool, perfPath string, localTempDir string) (supported bool, output string, err error) {
	scriptDef := script.ScriptDefinition{
		Name:      "perf stat ocr",
		Script:    perfPath + " stat -a -e cpu/event=0x2a,umask=0x01,offcore_rsp=0x104004477,name='OCR.READS_TO_CORE.LOCAL_DRAM'/ sleep 1",
		Superuser: !noRoot,
	}
	scriptOutput, err := script.RunScript(myTarget, scriptDef, localTempDir)
	if err != nil {
		err = fmt.Errorf("failed to determine if ocr is supported: %s, %d, %w", scriptOutput.Stderr, scriptOutput.Exitcode, err)
		return
	}
	// perf stat writes its counter report to stderr; hand it back so the
	// caller's "not supported" warning carries the actual perf output
	// instead of an empty string.
	output = scriptOutput.Stderr
	supported = !strings.Contains(output, "<not supported>")
	return
}

// getSupportsFixedTMA - checks if the fixed TMA counter events are
// supported by perf.
//
Expand Down
86 changes: 44 additions & 42 deletions cmd/metrics/metric_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ type MetricDefinition struct {
// definition file. When the override path argument is empty, the function will load metrics from
// the file associated with the platform's architecture found in the provided metadata. When
// a list of metric names is provided, only those metric definitions will be loaded.
func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics []string, uncollectableEvents []string, metadata Metadata) (metrics []MetricDefinition, err error) {
func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics []string, metadata Metadata) (metrics []MetricDefinition, err error) {
var bytes []byte
if metricDefinitionOverridePath != "" {
if bytes, err = os.ReadFile(metricDefinitionOverridePath); err != nil {
Expand All @@ -56,20 +56,6 @@ func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics
if err = json.Unmarshal(bytes, &metricsInFile); err != nil {
return
}
// remove "metric_" prefix from metric names
for i := range metricsInFile {
metricsInFile[i].Name = strings.TrimPrefix(metricsInFile[i].Name, "metric_")
}
// remove metrics from list that use uncollectable events
for _, uncollectableEvent := range uncollectableEvents {
for i := 0; i < len(metricsInFile); i++ {
if strings.Contains(metricsInFile[i].Expression, uncollectableEvent) {
slog.Debug("removing metric that uses uncollectable event", slog.String("metric", metricsInFile[i].Name), slog.String("event", uncollectableEvent))
metricsInFile = append(metricsInFile[:i], metricsInFile[i+1:]...)
i--
}
}
}
// if a list of metric names provided, reduce list to match
if len(selectedMetrics) > 0 {
// confirm provided metric names are valid (included in metrics defined in file)
Expand Down Expand Up @@ -102,7 +88,7 @@ func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics
// ConfigureMetrics prepares metrics for use by the evaluator, by e.g., replacing
// metric constants with known values and aligning metric variables to perf event
// groups
func ConfigureMetrics(metrics []MetricDefinition, evaluatorFunctions map[string]govaluate.ExpressionFunction, metadata Metadata) (err error) {
func ConfigureMetrics(loadedMetrics []MetricDefinition, uncollectableEvents []string, evaluatorFunctions map[string]govaluate.ExpressionFunction, metadata Metadata) (metrics []MetricDefinition, err error) {
// get constants as strings
tscFreq := fmt.Sprintf("%f", float64(metadata.TSCFrequencyHz))
tsc := fmt.Sprintf("%f", float64(metadata.TSC))
Expand All @@ -112,54 +98,70 @@ func ConfigureMetrics(metrics []MetricDefinition, evaluatorFunctions map[string]
hyperThreadingOn := fmt.Sprintf("%t", metadata.ThreadsPerCore > 1)
threadsPerCore := fmt.Sprintf("%f", float64(metadata.ThreadsPerCore))
// configure each metric
for metricIdx := range metrics {
for metricIdx := range loadedMetrics {
tmpMetric := loadedMetrics[metricIdx]
// abbreviate event names in metric expressions to match abbreviations used in uncollectableEvents
tmpMetric.Expression = abbreviateEventName(tmpMetric.Expression)
tmpMetric.ExpressionTxn = abbreviateEventName(tmpMetric.ExpressionTxn)
// skip metrics that use uncollectable events
foundUncollectable := false
for _, uncollectableEvent := range uncollectableEvents {
if strings.Contains(tmpMetric.Expression, uncollectableEvent) {
slog.Warn("removing metric that uses uncollectable event", slog.String("metric", tmpMetric.Name), slog.String("event", uncollectableEvent))
foundUncollectable = true
break
}
}
if foundUncollectable {
continue
}
// swap in per-txn metric definition if transaction rate is provided
if flagTransactionRate != 0 && metrics[metricIdx].ExpressionTxn != "" {
metrics[metricIdx].Expression = metrics[metricIdx].ExpressionTxn
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate))
metrics[metricIdx].Name = metrics[metricIdx].NameTxn
if flagTransactionRate != 0 && tmpMetric.ExpressionTxn != "" {
tmpMetric.Expression = tmpMetric.ExpressionTxn
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate))
tmpMetric.Name = tmpMetric.NameTxn
}
// remove "metric_" prefix from metric names
tmpMetric.Name = strings.TrimPrefix(tmpMetric.Name, "metric_")
// transform if/else to ?/:
var transformed string
if transformed, err = transformConditional(metrics[metricIdx].Expression); err != nil {
if transformed, err = transformConditional(tmpMetric.Expression); err != nil {
return
}
if transformed != metrics[metricIdx].Expression {
slog.Debug("transformed metric", slog.String("original", metrics[metricIdx].Name), slog.String("transformed", transformed))
metrics[metricIdx].Expression = transformed
if transformed != tmpMetric.Expression {
slog.Debug("transformed metric", slog.String("original", tmpMetric.Name), slog.String("transformed", transformed))
tmpMetric.Expression = transformed
}
// replace constants with their values
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[SYSTEM_TSC_FREQ]", tscFreq)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[TSC]", tsc)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CORES_PER_SOCKET]", coresPerSocket)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CHAS_PER_SOCKET]", chasPerSocket)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[SOCKET_COUNT]", socketCount)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[HYPERTHREADING_ON]", hyperThreadingOn)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CONST_THREAD_COUNT]", threadsPerCore)
// abbreviate event names
metrics[metricIdx].Expression = abbreviateEventName(metrics[metricIdx].Expression)
metrics[metricIdx].ExpressionTxn = abbreviateEventName(metrics[metricIdx].ExpressionTxn)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[SYSTEM_TSC_FREQ]", tscFreq)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TSC]", tsc)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CORES_PER_SOCKET]", coresPerSocket)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CHAS_PER_SOCKET]", chasPerSocket)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[SOCKET_COUNT]", socketCount)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[HYPERTHREADING_ON]", hyperThreadingOn)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CONST_THREAD_COUNT]", threadsPerCore)
// get a list of the variables in the expression
metrics[metricIdx].Variables = make(map[string]int)
tmpMetric.Variables = make(map[string]int)
expressionIdx := 0
for {
startVar := strings.IndexRune(metrics[metricIdx].Expression[expressionIdx:], '[')
startVar := strings.IndexRune(tmpMetric.Expression[expressionIdx:], '[')
if startVar == -1 { // no more vars in this expression
break
}
endVar := strings.IndexRune(metrics[metricIdx].Expression[expressionIdx:], ']')
endVar := strings.IndexRune(tmpMetric.Expression[expressionIdx:], ']')
if endVar == -1 {
err = fmt.Errorf("didn't find end of variable indicator (]) in expression: %s", metrics[metricIdx].Expression[expressionIdx:])
err = fmt.Errorf("didn't find end of variable indicator (]) in expression: %s", tmpMetric.Expression[expressionIdx:])
return
}
// add the variable name to the map, set group index to -1 to indicate it has not yet been determined
metrics[metricIdx].Variables[metrics[metricIdx].Expression[expressionIdx:][startVar+1:endVar]] = -1
tmpMetric.Variables[tmpMetric.Expression[expressionIdx:][startVar+1:endVar]] = -1
expressionIdx += endVar + 1
}
if metrics[metricIdx].Evaluable, err = govaluate.NewEvaluableExpressionWithFunctions(metrics[metricIdx].Expression, evaluatorFunctions); err != nil {
slog.Error("failed to create evaluable expression for metric", slog.String("error", err.Error()), slog.String("metric name", metrics[metricIdx].Name), slog.String("metric expression", metrics[metricIdx].Expression))
if tmpMetric.Evaluable, err = govaluate.NewEvaluableExpressionWithFunctions(tmpMetric.Expression, evaluatorFunctions); err != nil {
slog.Error("failed to create evaluable expression for metric", slog.String("error", err.Error()), slog.String("metric name", tmpMetric.Name), slog.String("metric expression", tmpMetric.Expression))
return
}
metrics = append(metrics, tmpMetric)
}
return
}
Expand Down
5 changes: 3 additions & 2 deletions cmd/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -883,15 +883,16 @@ func prepareMetrics(targetContext *targetContext, localTempDir string, channelEr
return
}
// load metric definitions
if targetContext.metricDefinitions, err = LoadMetricDefinitions(flagMetricFilePath, flagMetricsList, uncollectableEvents, targetContext.metadata); err != nil {
var loadedMetrics []MetricDefinition
if loadedMetrics, err = LoadMetricDefinitions(flagMetricFilePath, flagMetricsList, targetContext.metadata); err != nil {
err = fmt.Errorf("failed to load metric definitions: %w", err)
_ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error()))
targetContext.err = err
channelError <- targetError{target: myTarget, err: err}
return
}
// configure metrics
if err = ConfigureMetrics(targetContext.metricDefinitions, GetEvaluatorFunctions(), targetContext.metadata); err != nil {
if targetContext.metricDefinitions, err = ConfigureMetrics(loadedMetrics, uncollectableEvents, GetEvaluatorFunctions(), targetContext.metadata); err != nil {
err = fmt.Errorf("failed to configure metrics: %w", err)
_ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error()))
targetContext.err = err
Expand Down
2 changes: 1 addition & 1 deletion cmd/metrics/summary.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ func (m *metricsFromCSV) getStats() (stats map[string]metricStats, err error) {
sum := 0.0
for _, row := range m.rows {
val := row.metrics[metricName]
if math.IsNaN(val) {
if math.IsNaN(val) || math.IsInf(val, 0) {
continue
}
if math.IsNaN(min) { // min was initialized to NaN
Expand Down
Loading