Skip to content

Commit

Permalink
Add RDMA collector (#182)
Browse files Browse the repository at this point in the history
* Add an RDMA collector that exports selected system-wide counters and per-compute-unit hardware counters such as QP, CQ, MR, etc. Setting the QP mode is retried at each scrape until it has been enabled on all links. The collector ensures that all per-PID modes are disabled before it exits.

* Update docs

---------

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri authored Oct 4, 2024
1 parent aad9126 commit 3a7a132
Show file tree
Hide file tree
Showing 15 changed files with 1,938 additions and 75 deletions.
82 changes: 82 additions & 0 deletions pkg/collector/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/mahendrapaipuri/ceems/internal/osexec"
"github.com/prometheus/procfs"
)

type Device struct {
Expand Down Expand Up @@ -244,6 +245,87 @@ func GetAMDGPUDevices(rocmSmiPath string, logger log.Logger) (map[int]Device, er
return parseAmdSmioutput(string(rocmSmiOutput), logger), nil
}

// cgroupProcs returns a map of active cgroups and processes contained in each cgroup.
//
// The map key is the cgroup ID, taken from the first capture group of idRegex
// on the first cgroup path of the process that matches. A process is left out
// of the result when:
//   - its cgroups cannot be read, or none of its cgroup paths yields an ID,
//   - targetEnvVars is non-empty and its environment cannot be read or none of
//     its environment entries starts with one of targetEnvVars,
//   - procFilter is non-nil and its command line cannot be read, is empty, or
//     procFilter returns true for the space-joined command line.
//
// An error is returned when idRegex is nil or when the list of processes
// cannot be read from fs.
func cgroupProcs(fs procfs.FS, idRegex *regexp.Regexp, targetEnvVars []string, procFilter func(string) bool) (map[string][]procfs.Proc, error) {
	// Without a regex no cgroup ID can ever be extracted. Bail out before
	// enumerating every process on the host.
	if idRegex == nil {
		return nil, errors.New("cgroup IDs cannot be retrieved due to empty regex")
	}

	// Get all active procs
	allProcs, err := fs.AllProcs()
	if err != nil {
		return nil, err
	}

	cgroups := make(map[string][]procfs.Proc)

	for _, proc := range allProcs {
		cgrps, err := proc.Cgroups()
		if err != nil || len(cgrps) == 0 {
			continue
		}

		// Get cgroup ID from regex: first path with a capture group match wins
		var cgroupID string

		for _, cgrp := range cgrps {
			if matches := idRegex.FindStringSubmatch(cgrp.Path); len(matches) > 1 {
				cgroupID = matches[1]

				break
			}
		}

		// If no cgroupID found, ignore
		if cgroupID == "" {
			continue
		}

		// If targetEnvVars is not empty, check that at least one of them is
		// present for the process. We dont check for the value of env var.
		// Presence of env var is enough to trigger the profiling of that process
		if len(targetEnvVars) > 0 {
			environ, err := proc.Environ()
			if err != nil {
				continue
			}

			found := false

		envSearch:
			for _, env := range environ {
				for _, targetEnvVar := range targetEnvVars {
					if strings.HasPrefix(env, targetEnvVar) {
						found = true

						break envSearch
					}
				}
			}

			// If target env var(s) is not found, skip this process
			if !found {
				continue
			}
		}

		// Ignore processes where command line matches the filter
		if procFilter != nil {
			procCmdLine, err := proc.CmdLine()
			if err != nil || len(procCmdLine) == 0 {
				continue
			}

			// Ignore process if matches found
			if procFilter(strings.Join(procCmdLine, " ")) {
				continue
			}
		}

		cgroups[cgroupID] = append(cgroups[cgroupID], proc)
	}

	return cgroups, nil
}

// fileExists checks if given file exists or not.
func fileExists(filename string) bool {
info, err := os.Stat(filename)
Expand Down
74 changes: 3 additions & 71 deletions pkg/collector/perf.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//go:build !perf
// +build !perf
//go:build !noperf
// +build !noperf

package collector

Expand Down Expand Up @@ -1123,79 +1123,11 @@ func discoverer(data interface{}) error {
return security.ErrSecurityCtxDataAssertion
}

allProcs, err := d.procfs.AllProcs()
cgroups, err := cgroupProcs(d.procfs, d.cgroupManager.idRegex, d.targetEnvVars, d.cgroupManager.procFilter)
if err != nil {
return err
}

cgroups := make(map[string][]procfs.Proc)

for _, proc := range allProcs {
// if targetEnvVars is not empty check if this env vars is present for the process
// We dont check for the value of env var. Presence of env var is enough to
// trigger the profiling of that process
if len(d.targetEnvVars) > 0 {
environ, err := proc.Environ()
if err != nil {
continue
}

for _, env := range environ {
for _, targetEnvVar := range d.targetEnvVars {
if strings.HasPrefix(env, targetEnvVar) {
goto check_process
}
}
}

// If target env var(s) is not found, return
continue
}

check_process:

// Ignore processes where command line matches the regex
if d.cgroupManager.procFilter != nil {
procCmdLine, err := proc.CmdLine()
if err != nil || len(procCmdLine) == 0 {
continue
}

// Ignore process if matches found
if d.cgroupManager.procFilter(strings.Join(procCmdLine, " ")) {
continue
}
}

// Get cgroup ID from regex
var cgroupID string

if d.cgroupManager.idRegex != nil {
cgroups, err := proc.Cgroups()
if err != nil || len(cgroups) == 0 {
continue
}

for _, cgroup := range cgroups {
cgroupIDMatches := d.cgroupManager.idRegex.FindStringSubmatch(cgroup.Path)
if len(cgroupIDMatches) <= 1 {
continue
}

cgroupID = cgroupIDMatches[1]

break
}
}

// If no cgroupID found, ignore
if cgroupID == "" {
continue
}

cgroups[cgroupID] = append(cgroups[cgroupID], proc)
}

// Read cgroups proc map into d
d.cgroups = cgroups

Expand Down
4 changes: 2 additions & 2 deletions pkg/collector/perf_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//go:build !perf
// +build !perf
//go:build !noperf
// +build !noperf

package collector

Expand Down
Loading

0 comments on commit 3a7a132

Please sign in to comment.