From 3a7a1321116b41772429d059e1b7f0f1fe8d155b Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri <44365948+mahendrapaipuri@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:50:29 +0200 Subject: [PATCH] Add RDMA collector (#182) * Add RDMA collector which exports selected system wide counters and compute unit's hw counters like QP, CQ, MR, etc. QP mode setting will continue at each scrape until we manage to enable on all links. Ensure to disable all Per-PID modes before exiting collector. * Update docs --------- Signed-off-by: Mahendra Paipuri --- pkg/collector/helper.go | 82 +++ pkg/collector/perf.go | 74 +- pkg/collector/perf_test.go | 4 +- pkg/collector/rdma.go | 667 ++++++++++++++++++ pkg/collector/rdma_test.go | 265 +++++++ pkg/collector/slurm.go | 34 + pkg/collector/slurm_test.go | 2 + ...-test-cgroupsv2-nvidia-ipmiutil-output.txt | 50 ++ pkg/collector/testdata/rdma | 118 ++++ pkg/collector/testdata/sys.ttar | 635 ++++++++++++++++- scripts/e2e-test.sh | 2 + website/cspell.json | 3 +- website/docs/components/ceems-exporter.md | 48 ++ website/docs/components/metrics.md | 26 + website/md-link-check.json | 3 + 15 files changed, 1938 insertions(+), 75 deletions(-) create mode 100644 pkg/collector/rdma.go create mode 100644 pkg/collector/rdma_test.go create mode 100755 pkg/collector/testdata/rdma diff --git a/pkg/collector/helper.go b/pkg/collector/helper.go index 8c30ec91..217e117d 100644 --- a/pkg/collector/helper.go +++ b/pkg/collector/helper.go @@ -14,6 +14,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/mahendrapaipuri/ceems/internal/osexec" + "github.com/prometheus/procfs" ) type Device struct { @@ -244,6 +245,87 @@ func GetAMDGPUDevices(rocmSmiPath string, logger log.Logger) (map[int]Device, er return parseAmdSmioutput(string(rocmSmiOutput), logger), nil } +// cgroupProcs returns a map of active cgroups and processes contained in each cgroup. +func cgroupProcs(fs procfs.FS, idRegex *regexp.Regexp, targetEnvVars []string, procFilter func(string) bool) (map[string][]procfs.Proc, error) { + // Get all active procs + allProcs, err := fs.AllProcs() + if err != nil { + return nil, err + } + + // If no idRegex provided, return empty + if idRegex == nil { + return nil, errors.New("cgroup IDs cannot be retrieved due to empty regex") + } + + cgroups := make(map[string][]procfs.Proc) + + for _, proc := range allProcs { + // Get cgroup ID from regex + var cgroupID string + + cgrps, err := proc.Cgroups() + if err != nil || len(cgrps) == 0 { + continue + } + + for _, cgrp := range cgrps { + cgroupIDMatches := idRegex.FindStringSubmatch(cgrp.Path) + if len(cgroupIDMatches) <= 1 { + continue + } + + cgroupID = cgroupIDMatches[1] + + break + } + + // If no cgroupID found, ignore + if cgroupID == "" { + continue + } + + // if targetEnvVars is not empty check if this env vars is present for the process + // We dont check for the value of env var. 
Presence of env var is enough to + // trigger the profiling of that process + if len(targetEnvVars) > 0 { + environ, err := proc.Environ() + if err != nil { + continue + } + + for _, env := range environ { + for _, targetEnvVar := range targetEnvVars { + if strings.HasPrefix(env, targetEnvVar) { + goto check_process + } + } + } + + // If target env var(s) is not found, return + continue + } + + check_process: + // Ignore processes where command line matches the regex + if procFilter != nil { + procCmdLine, err := proc.CmdLine() + if err != nil || len(procCmdLine) == 0 { + continue + } + + // Ignore process if matches found + if procFilter(strings.Join(procCmdLine, " ")) { + continue + } + } + + cgroups[cgroupID] = append(cgroups[cgroupID], proc) + } + + return cgroups, nil +} + // fileExists checks if given file exists or not. func fileExists(filename string) bool { info, err := os.Stat(filename) diff --git a/pkg/collector/perf.go b/pkg/collector/perf.go index e7115292..b5ab4164 100644 --- a/pkg/collector/perf.go +++ b/pkg/collector/perf.go @@ -1,5 +1,5 @@ -//go:build !perf -// +build !perf +//go:build !noperf +// +build !noperf package collector @@ -1123,79 +1123,11 @@ func discoverer(data interface{}) error { return security.ErrSecurityCtxDataAssertion } - allProcs, err := d.procfs.AllProcs() + cgroups, err := cgroupProcs(d.procfs, d.cgroupManager.idRegex, d.targetEnvVars, d.cgroupManager.procFilter) if err != nil { return err } - cgroups := make(map[string][]procfs.Proc) - - for _, proc := range allProcs { - // if targetEnvVars is not empty check if this env vars is present for the process - // We dont check for the value of env var. Presence of env var is enough to - // trigger the profiling of that process - if len(d.targetEnvVars) > 0 { - environ, err := proc.Environ() - if err != nil { - continue - } - - for _, env := range environ { - for _, targetEnvVar := range d.targetEnvVars { - if strings.HasPrefix(env, targetEnvVar) { - goto check_process - } - } - } - - // If target env var(s) is not found, return - continue - } - - check_process: - - // Ignore processes where command line matches the regex - if d.cgroupManager.procFilter != nil { - procCmdLine, err := proc.CmdLine() - if err != nil || len(procCmdLine) == 0 { - continue - } - - // Ignore process if matches found - if d.cgroupManager.procFilter(strings.Join(procCmdLine, " ")) { - continue - } - } - - // Get cgroup ID from regex - var cgroupID string - - if d.cgroupManager.idRegex != nil { - cgroups, err := proc.Cgroups() - if err != nil || len(cgroups) == 0 { - continue - } - - for _, cgroup := range cgroups { - cgroupIDMatches := d.cgroupManager.idRegex.FindStringSubmatch(cgroup.Path) - if len(cgroupIDMatches) <= 1 { - continue - } - - cgroupID = cgroupIDMatches[1] - - break - } - } - - // If no cgroupID found, ignore - if cgroupID == "" { - continue - } - - cgroups[cgroupID] = append(cgroups[cgroupID], proc) - } - // Read cgroups proc map into d d.cgroups = cgroups diff --git a/pkg/collector/perf_test.go b/pkg/collector/perf_test.go index e94a3246..cd804197 100644 --- a/pkg/collector/perf_test.go +++ b/pkg/collector/perf_test.go @@ -1,5 +1,5 @@ -//go:build !perf -// +build !perf +//go:build !noperf +// +build !noperf package collector diff --git a/pkg/collector/rdma.go b/pkg/collector/rdma.go new file mode 100644 index 00000000..52c51878 --- /dev/null +++ b/pkg/collector/rdma.go @@ -0,0 +1,667 @@ +//go:build !nordma +// +build !nordma + +package collector + +import ( + "context" + "errors" + "fmt" + "os" + "os/exec" + 
"regexp" + "strconv" + "strings" + "sync" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/mahendrapaipuri/ceems/internal/osexec" + "github.com/mahendrapaipuri/ceems/internal/security" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/procfs" + "github.com/prometheus/procfs/sysfs" +) + +const rdmaCollectorSubsystem = "rdma" + +// CLI opts. +var ( + rdmaStatsEnabled = CEEMSExporterApp.Flag( + "collector.rdma.stats", + "Enables collection of RDMA stats (default: disabled)", + ).Default("false").Bool() + + // test related opts. + rdmaCmd = CEEMSExporterApp.Flag( + "collector.rdma.cmd", + "Path to rdma command", + ).Default("").Hidden().String() +) + +type mr struct { + num int + len uint64 + dev string +} + +type cq struct { + num int + len uint64 + dev string +} + +type qp struct { + num int + dev string + port string + hwCounters map[string]uint64 +} + +type rdmaCollector struct { + sysfs sysfs.FS + procfs procfs.FS + logger log.Logger + cgroupManager *cgroupManager + hostname string + isAvailable bool + rdmaCmd string + qpModes map[string]bool + securityContexts map[string]*security.SecurityContext + metricDescs map[string]*prometheus.Desc + hwCounters []string +} + +// Security context names. +const ( + rdmaExecCmdCtx = "rdma_exec_cmd" +) + +// NewRDMACollector returns a new Collector exposing RAPL metrics. +func NewRDMACollector(logger log.Logger, cgManager *cgroupManager) (*rdmaCollector, error) { + sysfs, err := sysfs.NewFS(*sysPath) + if err != nil { + return nil, fmt.Errorf("failed to open sysfs: %w", err) + } + + // Instantiate a new Proc FS + procfs, err := procfs.NewFS(*procfsPath) + if err != nil { + return nil, err + } + + // Setup RDMA command + var rdmaCmdPath string + if *rdmaCmd != "" { + rdmaCmdPath = *rdmaCmd + } else { + if rdmaCmdPath, err = exec.LookPath("rdma"); err != nil { + level.Error(logger). + Log("msg", "rdma command not found. Not all RDMA metrics will be reported.", "err", err) + } + } + + // Check if RDMA devices exist + _, err = sysfs.InfiniBandClass() + if err != nil && errors.Is(err, os.ErrNotExist) { + level.Error(logger). + Log("msg", "RDMA devices do not exist. RDMA collector wont return any data", "err", err) + + return &rdmaCollector{isAvailable: false}, nil + } + + // Get current qp mode + // We cannot turn on per PID counters when link is already being used by a process. + // So we keep a state variable of modes of all links and attempt to turn them on + // on every scrape request if they are not turned on already. + // As this per PID counters are only supported by Mellanox devices, we setup + // this map only for them. This map will be nil for other types of devices + qpModes, err := qpMode(rdmaCmdPath) + if err != nil { + level.Error(logger). + Log("msg", "Failed to get RDMA qp mode", "err", err) + } + + // If per QP counters are enabled, we need to disable them when exporter exits. + // So create a security context with cap_setuid and cap_setgid to be able to + // disable per QP counters + // + // Setup necessary capabilities. 
+	securityContexts := make(map[string]*security.SecurityContext)
+
+	if len(qpModes) > 0 {
+		level.Info(logger).Log("msg", "Per-PID QP stats available")
+
+		caps := setupCollectorCaps(logger, rdmaCollectorSubsystem, []string{"cap_setuid", "cap_setgid"})
+
+		// Setup new security context(s)
+		securityContexts[rdmaExecCmdCtx], err = security.NewSecurityContext(rdmaExecCmdCtx, caps, security.ExecAsUser, logger)
+		if err != nil {
+			level.Error(logger).Log("msg", "Failed to create a security context for RDMA collector", "err", err)
+
+			return nil, err
+		}
+	}
+
+	// Port counters descriptions.
+	portCountersDecs := map[string]string{
+		"port_constraint_errors_received_total":    "Number of packets received on the switch physical port that are discarded",
+		"port_constraint_errors_transmitted_total": "Number of packets not transmitted from the switch physical port",
+		"port_data_received_bytes_total":           "Number of data octets received on all links",
+		"port_data_transmitted_bytes_total":        "Number of data octets transmitted on all links",
+		"port_discards_received_total":             "Number of inbound packets discarded by the port because the port is down or congested",
+		"port_discards_transmitted_total":          "Number of outbound packets discarded by the port because the port is down or congested",
+		"port_errors_received_total":               "Number of packets containing an error that were received on this port",
+		"port_packets_received_total":              "Number of packets received on all VLs by this port (including errors)",
+		"port_packets_transmitted_total":           "Number of packets transmitted on all VLs from this port (including errors)",
+		"state_id":                                 "State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer)",
+	}
+
+	// HW counters descriptions.
+	hwCountersDecs := map[string]string{
+		"rx_write_requests":          "Number of received write requests for the associated QPs",
+		"rx_read_requests":           "Number of received read requests for the associated QPs",
+		"rx_atomic_requests":         "Number of received atomic requests for the associated QPs",
+		"req_cqe_error":              "Number of times requester detected CQEs completed with errors",
+		"req_cqe_flush_error":        "Number of times requester detected CQEs completed with flushed errors",
+		"req_remote_access_errors":   "Number of times requester detected remote access errors",
+		"req_remote_invalid_request": "Number of times requester detected remote invalid request errors",
+		"resp_cqe_error":             "Number of times responder detected CQEs completed with errors",
+		"resp_cqe_flush_error":       "Number of times responder detected CQEs completed with flushed errors",
+		"resp_local_length_error":    "Number of times responder detected local length errors",
+		"resp_remote_access_errors":  "Number of times responder detected remote access errors",
+	}
+
+	// QP/CQ/MR counters descriptions.
+	wpsCountersDecs := map[string]string{
+		"qps_active":     "Number of active QPs",
+		"cqs_active":     "Number of active CQs",
+		"mrs_active":     "Number of active MRs",
+		"cqe_len_active": "Length of active CQs",
+		"mrs_len_active": "Length of active MRs",
+	}
+
+	metricDescs := make(map[string]*prometheus.Desc)
+
+	for metricName, description := range portCountersDecs {
+		metricDescs[metricName] = prometheus.NewDesc(
+			prometheus.BuildFQName(Namespace, rdmaCollectorSubsystem, metricName),
+			description,
+			[]string{"manager", "hostname", "device", "port"},
+			nil,
+		)
+	}
+
+	var hwCounters []string
+	for metricName, description := range hwCountersDecs {
+		hwCounters = append(hwCounters, metricName)
+		metricDescs[metricName] = prometheus.NewDesc(
+			prometheus.BuildFQName(Namespace, rdmaCollectorSubsystem, metricName),
+			description,
+			[]string{"manager", "hostname", "device", "port", "uuid"},
+			nil,
+		)
+	}
+
+	for metricName, description := range wpsCountersDecs {
+		metricDescs[metricName] = prometheus.NewDesc(
+			prometheus.BuildFQName(Namespace, rdmaCollectorSubsystem, metricName),
+			description,
+			[]string{"manager", "hostname", "device", "port", "uuid"},
+			nil,
+		)
+	}
+
+	return &rdmaCollector{
+		sysfs:            sysfs,
+		procfs:           procfs,
+		logger:           logger,
+		cgroupManager:    cgManager,
+		hostname:         hostname,
+		rdmaCmd:          rdmaCmdPath,
+		isAvailable:      true,
+		qpModes:          qpModes,
+		securityContexts: securityContexts,
+		metricDescs:      metricDescs,
+		hwCounters:       hwCounters,
+	}, nil
+}
+
+// Update implements Collector and exposes RDMA-related metrics.
+func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error {
+	if !c.isAvailable {
+		return ErrNoData
+	}
+
+	// Check QP modes and attempt to enable per-PID stats if not already done
+	if err := c.perPIDCounters(true); err != nil {
+		level.Error(c.logger).Log("msg", "Failed to enable Per-PID QP stats", "err", err)
+	}
+
+	return c.update(ch)
+}
+
+// Stop releases system resources used by the collector.
+func (c *rdmaCollector) Stop(_ context.Context) error {
+	level.Debug(c.logger).Log("msg", "Stopping", "collector", rdmaCollectorSubsystem)
+
+	return c.perPIDCounters(false)
+}
+
+// perPIDCounters enables/disables per-PID counters for supported devices.
+func (c *rdmaCollector) perPIDCounters(enable bool) error {
+	// If there are no supported devices, return
+	if c.qpModes == nil {
+		return nil
+	}
+
+	// Return if there is no security context found
+	securityCtx, ok := c.securityContexts[rdmaExecCmdCtx]
+	if !ok {
+		return security.ErrNoSecurityCtx
+	}
+
+	// Toggle per-PID QP counters on all supported links
+	var allErrs error
+
+	for link, mode := range c.qpModes {
+		if mode != enable {
+			var cmd []string
+			if enable {
+				cmd = []string{"rdma", "statistic", "qp", "set", "link", link, "auto", "type,pid", "on"}
+			} else {
+				cmd = []string{"rdma", "statistic", "qp", "set", "link", link, "auto", "off"}
+			}
+
+			// Execute command as root
+			dataPtr := &security.ExecSecurityCtxData{
+				Cmd:    cmd,
+				Logger: c.logger,
+				UID:    0,
+				GID:    0,
+			}
+
+			// If the command didn't return an error, we successfully enabled/disabled the mode
+			if err := securityCtx.Exec(dataPtr); err != nil {
+				allErrs = errors.Join(allErrs, err)
+			} else {
+				c.qpModes[link] = enable
+			}
+		}
+	}
+
+	if allErrs != nil {
+		return allErrs
+	}
+
+	return nil
+}
+
+// update fetches different RDMA stats.
+func (c *rdmaCollector) update(ch chan<- prometheus.Metric) error {
+	// First get cgroups and their associated procs
+	procCgroup, err := c.procCgroups()
+	if err != nil {
+		level.Error(c.logger).Log("msg", "Failed to fetch active cgroups", "err", err)
+
+		return ErrNoData
+	}
+
+	// Initialise a wait group
+	wg := sync.WaitGroup{}
+
+	// Fetch MRs
+	wg.Add(1)
+
+	go func(p map[string]string) {
+		defer wg.Done()
+
+		mrs, err := c.devMR(p)
+		if err != nil {
+			level.Error(c.logger).Log("msg", "Failed to fetch RDMA MR stats", "err", err)
+
+			return
+		}
+
+		for uuid, mr := range mrs {
+			ch <- prometheus.MustNewConstMetric(c.metricDescs["mrs_active"], prometheus.GaugeValue, float64(mr.num), c.cgroupManager.manager, c.hostname, mr.dev, "", uuid)
+			ch <- prometheus.MustNewConstMetric(c.metricDescs["mrs_len_active"], prometheus.GaugeValue, float64(mr.len), c.cgroupManager.manager, c.hostname, mr.dev, "", uuid)
+		}
+	}(procCgroup)
+
+	// Fetch CQs
+	wg.Add(1)
+
+	go func(p map[string]string) {
+		defer wg.Done()
+
+		cqs, err := c.devCQ(p)
+		if err != nil {
+			level.Error(c.logger).Log("msg", "Failed to fetch RDMA CQ stats", "err", err)
+
+			return
+		}
+
+		for uuid, cq := range cqs {
+			ch <- prometheus.MustNewConstMetric(c.metricDescs["cqs_active"], prometheus.GaugeValue, float64(cq.num), c.cgroupManager.manager, c.hostname, cq.dev, "", uuid)
+			ch <- prometheus.MustNewConstMetric(c.metricDescs["cqe_len_active"], prometheus.GaugeValue, float64(cq.len), c.cgroupManager.manager, c.hostname, cq.dev, "", uuid)
+		}
+	}(procCgroup)
+
+	// Fetch QPs
+	wg.Add(1)
+
+	go func(p map[string]string) {
+		defer wg.Done()
+
+		qps, err := c.linkQP(p)
+		if err != nil {
+			level.Error(c.logger).Log("msg", "Failed to fetch RDMA QP stats", "err", err)
+
+			return
+		}
+
+		for uuid, qp := range qps {
+			ch <- prometheus.MustNewConstMetric(c.metricDescs["qps_active"], prometheus.GaugeValue, float64(qp.num), c.cgroupManager.manager, c.hostname, qp.dev, qp.port, uuid)
+
+			for _, hwCounter := range c.hwCounters {
+				if qp.hwCounters[hwCounter] > 0 {
+					ch <- prometheus.MustNewConstMetric(c.metricDescs[hwCounter], prometheus.CounterValue, float64(qp.hwCounters[hwCounter]), c.cgroupManager.manager, c.hostname, qp.dev, qp.port, uuid)
+				}
+			}
+		}
+	}(procCgroup)
+
+	// Fetch system-wide counters
+	wg.Add(1)
+
+	go func() {
+		defer wg.Done()
+
+		counters, err := c.linkCountersSysWide()
+		if err != nil {
+			level.Error(c.logger).Log("msg", "Failed to fetch system wide RDMA counters", "err", err)
+
+			return
+		}
+
+		var vType prometheus.ValueType
+
+		for link, cnts := range counters {
+			l := strings.Split(link, "/")
+			device := l[0]
+			port := l[1]
+
+			for n, v := range cnts {
+				if v > 0 {
+					if n == "state_id" {
+						vType = prometheus.GaugeValue
+					} else {
+						vType = prometheus.CounterValue
+					}
+					ch <- prometheus.MustNewConstMetric(c.metricDescs[n], vType, float64(v), c.cgroupManager.manager, c.hostname, device, port)
+				}
+			}
+		}
+	}()
+
+	// Wait for all go routines
+	wg.Wait()
+
+	return nil
+}
+
+// procCgroups returns the cgroup ID of each relevant process.
+func (c *rdmaCollector) procCgroups() (map[string]string, error) {
+	// First get cgroups and their associated procs
+	cgroups, err := cgroupProcs(c.procfs, c.cgroupManager.idRegex, nil, c.cgroupManager.procFilter)
+	if err != nil {
+		level.Error(c.logger).Log("msg", "Failed to fetch active cgroups", "err", err)
+
+		return nil, err
+	}
+
+	// Build the inverse mapping: PID -> cgroup ID
+	procCgroup := make(map[string]string)
+
+	for cgroupID, procs := range cgroups {
+		for _, proc := range procs {
+			p := strconv.FormatInt(int64(proc.PID), 10)
+			procCgroup[p] = cgroupID
+		}
+	}
+
+	return procCgroup, nil
+}
+
+// devMR returns Memory Regions (MRs) stats of all active cgroups.
+func (c *rdmaCollector) devMR(procCgroup map[string]string) (map[string]*mr, error) {
+	// Arguments to command
+	args := []string{"resource", "show", "mr"}
+
+	// Execute command
+	out, err := osexec.Execute(c.rdmaCmd, args, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	// Define regexes
+	devRegex := regexp.MustCompile(`^dev\s*([a-z0-9_]+)`)
+	pidRegex := regexp.MustCompile(`.+?pid\s*([\d]+)`)
+	mrlenRegex := regexp.MustCompile(`.+?mrlen\s*([\d]+)`)
+
+	// Read line by line and match dev, pid and mrlen
+	mrs := make(map[string]*mr)
+
+	for _, line := range strings.Split(string(out), "\n") {
+		if devMatch := devRegex.FindStringSubmatch(line); len(devMatch) > 1 {
+			if pidMatch := pidRegex.FindStringSubmatch(line); len(pidMatch) > 1 {
+				if uuid, ok := procCgroup[pidMatch[1]]; ok {
+					if mrLenMatch := mrlenRegex.FindStringSubmatch(line); len(mrLenMatch) > 1 {
+						if l, err := strconv.ParseUint(mrLenMatch[1], 10, 64); err == nil {
+							if _, ok := mrs[uuid]; ok {
+								mrs[uuid].num++
+								mrs[uuid].len += l
+							} else {
+								mrs[uuid] = &mr{1, l, devMatch[1]}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	return mrs, nil
+}
+
+// devCQ returns Completion Queues (CQs) stats of all active cgroups.
+func (c *rdmaCollector) devCQ(procCgroup map[string]string) (map[string]*cq, error) {
+	// Arguments to command
+	args := []string{"resource", "show", "cq"}
+
+	// Execute command
+	out, err := osexec.Execute(c.rdmaCmd, args, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	// Define regexes
+	devRegex := regexp.MustCompile(`^dev\s*([a-z0-9_]+)`)
+	pidRegex := regexp.MustCompile(`.+?pid\s*([\d]+)`)
+	cqeRegex := regexp.MustCompile(`.+?cqe\s*([\d]+)`)
+
+	// Read line by line and match dev, pid and cqe
+	cqs := make(map[string]*cq)
+
+	for _, line := range strings.Split(string(out), "\n") {
+		if devMatch := devRegex.FindStringSubmatch(line); len(devMatch) > 1 {
+			if pidMatch := pidRegex.FindStringSubmatch(line); len(pidMatch) > 1 {
+				if uuid, ok := procCgroup[pidMatch[1]]; ok {
+					if cqeMatch := cqeRegex.FindStringSubmatch(line); len(cqeMatch) > 1 {
+						if l, err := strconv.ParseUint(cqeMatch[1], 10, 64); err == nil {
+							if _, ok := cqs[uuid]; ok {
+								cqs[uuid].num++
+								cqs[uuid].len += l
+							} else {
+								cqs[uuid] = &cq{1, l, devMatch[1]}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	return cqs, nil
+}
+
+// linkQP returns Queue Pairs (QPs) stats of all active cgroups.
+func (c *rdmaCollector) linkQP(procCgroup map[string]string) (map[string]*qp, error) {
+	// Arguments to command
+	args := []string{"resource", "show", "qp"}
+
+	// Execute command
+	out, err := osexec.Execute(c.rdmaCmd, args, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	// Define regexes
+	linkRegex := regexp.MustCompile(`^link\s*([a-z0-9_/]+)`)
+	pidRegex := regexp.MustCompile(`.+?pid\s*([\d]+)`)
+
+	// Read line by line and match link and pid
+	qps := make(map[string]*qp)
+
+	for _, line := range strings.Split(string(out), "\n") {
+		if linkMatch := linkRegex.FindStringSubmatch(line); len(linkMatch) > 1 {
+			if pidMatch := pidRegex.FindStringSubmatch(line); len(pidMatch) > 1 {
+				if uuid, ok := procCgroup[pidMatch[1]]; ok {
+					if _, ok := qps[uuid]; ok {
+						qps[uuid].num++
+					} else {
+						link := strings.Split(linkMatch[1], "/")
+						if len(link) == 2 {
+							qps[uuid] = &qp{1, link[0], link[1], make(map[string]uint64)}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	// If per-PID counters are enabled, fetch them
+	if len(c.qpModes) > 0 {
+		// Arguments to command
+		args := []string{"statistic", "qp", "show"}
+
+		// Execute command
+		out, err := osexec.Execute(c.rdmaCmd, args, nil)
+		if err != nil {
+			level.Error(c.logger).Log("msg", "Failed to fetch per PID QP stats", "err", err)
+
+			return qps, nil
+		}
+
+		for _, line := range strings.Split(string(out), "\n") {
+			if linkMatch := linkRegex.FindStringSubmatch(line); len(linkMatch) > 1 {
+				for _, hwCounter := range c.hwCounters {
+					if pidMatch := pidRegex.FindStringSubmatch(line); len(pidMatch) > 1 {
+						if uuid, ok := procCgroup[pidMatch[1]]; ok {
+							counterRegex := regexp.MustCompile(fmt.Sprintf(`.+?%s\s*([\d]+)`, hwCounter))
+							if counterMatch := counterRegex.FindStringSubmatch(line); len(counterMatch) > 1 {
+								if v, err := strconv.ParseUint(counterMatch[1], 10, 64); err == nil {
+									if _, ok := qps[uuid]; !ok {
+										link := strings.Split(linkMatch[1], "/")
+										qps[uuid] = &qp{1, link[0], link[1], make(map[string]uint64)}
+									}
+
+									qps[uuid].hwCounters[hwCounter] = v
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	return qps, nil
+}
+
+// linkCountersSysWide returns system wide counters of all RDMA devices.
+func (c *rdmaCollector) linkCountersSysWide() (map[string]map[string]uint64, error) {
+	devices, err := c.sysfs.InfiniBandClass()
+	if err != nil {
+		return nil, fmt.Errorf("error obtaining InfiniBand class info: %w", err)
+	}
+
+	counters := make(map[string]map[string]uint64)
+
+	for _, device := range devices {
+		for _, port := range device.Ports {
+			link := fmt.Sprintf("%s/%d", device.Name, port.Port)
+			counters[link] = map[string]uint64{
+				"port_constraint_errors_received_total":    sanitizeMetric(port.Counters.PortRcvConstraintErrors),
+				"port_constraint_errors_transmitted_total": sanitizeMetric(port.Counters.PortXmitConstraintErrors),
+				"port_data_received_bytes_total":           sanitizeMetric(port.Counters.PortRcvData),
+				"port_data_transmitted_bytes_total":        sanitizeMetric(port.Counters.PortXmitData),
+				"port_discards_received_total":             sanitizeMetric(port.Counters.PortRcvDiscards),
+				"port_discards_transmitted_total":          sanitizeMetric(port.Counters.PortXmitDiscards),
+				"port_errors_received_total":               sanitizeMetric(port.Counters.PortRcvErrors),
+				"port_packets_received_total":              sanitizeMetric(port.Counters.PortRcvPackets),
+				"port_packets_transmitted_total":           sanitizeMetric(port.Counters.PortXmitPackets),
+				"state_id":                                 uint64(port.StateID),
+			}
+		}
+	}
+
+	return counters, nil
+}
+
+// sanitizeMetric returns 0 if the pointer is nil, else the metric's value.
+func sanitizeMetric(value *uint64) uint64 {
+	if value == nil {
+		return 0
+	}
+
+	return *value
+}
+
+// qpMode returns current QP mode for all links.
+func qpMode(rdmaCmd string) (map[string]bool, error) {
+	args := []string{"statistic", "qp", "mode"}
+
+	// Execute command
+	out, err := osexec.Execute(rdmaCmd, args, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	// Define regexes
+	linkRegex := regexp.MustCompile(`^link\s*([a-z0-9_/]+)`)
+	autoRegex := regexp.MustCompile(`.+?auto\s*([a-z,]+)`)
+
+	// Split output and get mode for each device
+	linkMode := make(map[string]bool)
+
+	for _, line := range strings.Split(string(out), "\n") {
+		if linkMatch := linkRegex.FindStringSubmatch(line); len(linkMatch) > 1 && strings.HasPrefix(linkMatch[1], "mlx") {
+			if autoMatch := autoRegex.FindStringSubmatch(line); len(autoMatch) > 1 {
+				if autoMatch[1] == "off" {
+					linkMode[linkMatch[1]] = false
+				} else {
+					linkMode[linkMatch[1]] = true
+				}
+			}
+		}
+	}
+
+	return linkMode, nil
+}
+
+// rdmaCollectorEnabled returns true if RDMA stats are enabled.
+func rdmaCollectorEnabled() bool {
+	return *rdmaStatsEnabled
+}
diff --git a/pkg/collector/rdma_test.go b/pkg/collector/rdma_test.go
new file mode 100644
index 00000000..b3d1ef3d
--- /dev/null
+++ b/pkg/collector/rdma_test.go
@@ -0,0 +1,265 @@
+//go:build !nordma
+// +build !nordma
+
+package collector
+
+import (
+	"context"
+	"testing"
+
+	"github.com/containerd/cgroups/v3"
+	"github.com/go-kit/log"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/procfs"
+	"github.com/prometheus/procfs/sysfs"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestRDMACollector(t *testing.T) {
+	_, err := CEEMSExporterApp.Parse([]string{
+		"--path.procfs", "testdata/proc",
+		"--path.sysfs", "testdata/sys",
+		"--collector.rdma.stats",
+		"--collector.rdma.cmd", "testdata/rdma",
+	})
+	require.NoError(t, err)
+
+	// cgroup manager
+	cgManager := &cgroupManager{
+		mode:       cgroups.Unified,
+		mountPoint: "testdata/sys/fs/cgroup/system.slice/slurmstepd.scope",
+		idRegex:    slurmCgroupPathRegex,
+		procFilter: func(p string) bool {
+			return slurmIgnoreProcsRegex.MatchString(p)
+		},
+	}
+
+	collector, err := NewRDMACollector(log.NewNopLogger(), cgManager)
+	require.NoError(t, err)
+
+	// Setup background goroutine to capture metrics.
+ metrics := make(chan prometheus.Metric) + defer close(metrics) + + go func() { + i := 0 + for range metrics { + i++ + } + }() + + err = collector.Update(metrics) + require.NoError(t, err) + + err = collector.Stop(context.Background()) + require.NoError(t, err) +} + +func TestDevMR(t *testing.T) { + _, err := CEEMSExporterApp.Parse([]string{ + "--path.procfs", "testdata/proc", + }) + require.NoError(t, err) + + // cgroup manager + cgManager := &cgroupManager{ + mode: cgroups.Unified, + idRegex: slurmCgroupPathRegex, + procFilter: func(p string) bool { + return slurmIgnoreProcsRegex.MatchString(p) + }, + } + + // Instantiate a new Proc FS + procfs, err := procfs.NewFS(*procfsPath) + require.NoError(t, err) + + c := rdmaCollector{ + logger: log.NewNopLogger(), + rdmaCmd: "testdata/rdma", + procfs: procfs, + cgroupManager: cgManager, + } + + // Get cgroup IDs + procCgroup, err := c.procCgroups() + require.NoError(t, err) + + expectedMRs := map[string]*mr{ + "1320003": {2, 4194304, "mlx5_0"}, + "4824887": {2, 4194304, "mlx5_0"}, + } + + // Get MR stats + mrs, err := c.devMR(procCgroup) + require.NoError(t, err) + assert.Equal(t, expectedMRs, mrs) +} + +func TestDevCQ(t *testing.T) { + _, err := CEEMSExporterApp.Parse([]string{ + "--path.procfs", "testdata/proc", + }) + require.NoError(t, err) + + // cgroup manager + cgManager := &cgroupManager{ + mode: cgroups.Unified, + idRegex: slurmCgroupPathRegex, + procFilter: func(p string) bool { + return slurmIgnoreProcsRegex.MatchString(p) + }, + } + + // Instantiate a new Proc FS + procfs, err := procfs.NewFS(*procfsPath) + require.NoError(t, err) + + c := rdmaCollector{ + logger: log.NewNopLogger(), + rdmaCmd: "testdata/rdma", + procfs: procfs, + cgroupManager: cgManager, + } + + // Get cgroup IDs + procCgroup, err := c.procCgroups() + require.NoError(t, err) + + expectedCQs := map[string]*cq{ + "1320003": {2, 8190, "mlx5_0"}, + "4824887": {2, 8190, "mlx5_0"}, + } + + // Get MR stats + cqs, err := c.devCQ(procCgroup) + require.NoError(t, err) + assert.Equal(t, expectedCQs, cqs) +} + +func TestLinkQP(t *testing.T) { + _, err := CEEMSExporterApp.Parse([]string{ + "--path.procfs", "testdata/proc", + }) + require.NoError(t, err) + + // cgroup manager + cgManager := &cgroupManager{ + mode: cgroups.Unified, + idRegex: slurmCgroupPathRegex, + procFilter: func(p string) bool { + return slurmIgnoreProcsRegex.MatchString(p) + }, + } + + // Instantiate a new Proc FS + procfs, err := procfs.NewFS(*procfsPath) + require.NoError(t, err) + + c := rdmaCollector{ + logger: log.NewNopLogger(), + rdmaCmd: "testdata/rdma", + procfs: procfs, + cgroupManager: cgManager, + qpModes: map[string]bool{"mlx5_0": true}, + hwCounters: []string{"rx_write_requests", "rx_read_requests"}, + } + + // Get cgroup IDs + procCgroup, err := c.procCgroups() + require.NoError(t, err) + + expected := map[string]*qp{ + "1320003": {16, "mlx5_0", "1", map[string]uint64{"rx_read_requests": 0, "rx_write_requests": 41988882}}, + "4824887": {16, "mlx5_0", "1", map[string]uint64{"rx_write_requests": 0, "rx_read_requests": 0}}, + } + + // Get MR stats + qps, err := c.linkQP(procCgroup) + require.NoError(t, err) + assert.Equal(t, expected, qps) +} + +func TestLinkCountersSysWide(t *testing.T) { + _, err := CEEMSExporterApp.Parse([]string{ + "--path.sysfs", "testdata/sys", + }) + require.NoError(t, err) + + // cgroup manager + cgManager := &cgroupManager{ + mode: cgroups.Unified, + idRegex: slurmCgroupPathRegex, + procFilter: func(p string) bool { + return slurmIgnoreProcsRegex.MatchString(p) + }, + } 
+ + // Instantiate a new Proc FS + sysfs, err := sysfs.NewFS(*sysPath) + require.NoError(t, err) + + c := rdmaCollector{ + logger: log.NewNopLogger(), + sysfs: sysfs, + cgroupManager: cgManager, + hwCounters: []string{"rx_write_requests", "rx_read_requests"}, + } + + expected := map[string]map[string]uint64{ + "hfi1_0/1": { + "port_constraint_errors_received_total": 0x0, + "port_constraint_errors_transmitted_total": 0x0, + "port_data_received_bytes_total": 0x1416445f428, + "port_data_transmitted_bytes_total": 0xfec563343c, + "port_discards_received_total": 0x0, + "port_discards_transmitted_total": 0x0, + "port_errors_received_total": 0x0, + "port_packets_received_total": 0x2607abd3, + "port_packets_transmitted_total": 0x21dfdb88, + "state_id": 0x4, + }, + "mlx4_0/1": { + "port_constraint_errors_received_total": 0x0, + "port_constraint_errors_transmitted_total": 0x0, + "port_data_received_bytes_total": 0x21194bae4, + "port_data_transmitted_bytes_total": 0x18b043df3c, + "port_discards_received_total": 0x0, + "port_discards_transmitted_total": 0x0, + "port_errors_received_total": 0x0, + "port_packets_received_total": 0x532195c, + "port_packets_transmitted_total": 0x51c32e2, + "state_id": 0x4, + }, + "mlx4_0/2": { + "port_constraint_errors_received_total": 0x0, + "port_constraint_errors_transmitted_total": 0x0, + "port_data_received_bytes_total": 0x24a9d24c0, + "port_data_transmitted_bytes_total": 0x18b7b6d468, + "port_discards_received_total": 0x0, + "port_discards_transmitted_total": 0x0, + "port_errors_received_total": 0x0, + "port_packets_received_total": 0x5531960, + "port_packets_transmitted_total": 0x5484702, + "state_id": 0x4, + }, + "mlx5_0/1": { + "port_constraint_errors_received_total": 0x0, + "port_constraint_errors_transmitted_total": 0x0, + "port_data_received_bytes_total": 0x10e1a85288, + "port_data_transmitted_bytes_total": 0xa7aeb10cfc0, + "port_discards_received_total": 0x0, + "port_discards_transmitted_total": 0x0, + "port_errors_received_total": 0x0, + "port_packets_received_total": 0x204c9520, + "port_packets_transmitted_total": 0x28a29aec4, + "state_id": 0x4, + }, + } + + // Get MR stats + counters, err := c.linkCountersSysWide() + require.NoError(t, err) + assert.Equal(t, expected, counters) +} diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index db389173..427c4e07 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -101,6 +101,7 @@ type slurmCollector struct { cgroupCollector *cgroupCollector perfCollector *perfCollector ebpfCollector *ebpfCollector + rdmaCollector *rdmaCollector hostname string gpuDevs map[int]Device procFS procfs.FS @@ -175,6 +176,18 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { } } + // Start new instance of rdmaCollector + var rdmaCollector *rdmaCollector + + if rdmaCollectorEnabled() { + rdmaCollector, err = NewRDMACollector(logger, cgroupManager) + if err != nil { + level.Info(logger).Log("msg", "Failed to create RDMA collector", "err", err) + + return nil, err + } + } + // Attempt to get GPU devices var gpuTypes []string @@ -220,6 +233,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { cgroupCollector: cgCollector, perfCollector: perfCollector, ebpfCollector: ebpfCollector, + rdmaCollector: rdmaCollector, hostname: hostname, gpuDevs: gpuDevs, procFS: procFS, @@ -300,6 +314,19 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { }() } + if rdmaCollectorEnabled() { + wg.Add(1) + + go func() { + defer wg.Done() + + // Update RDMA metrics + if err := 
c.rdmaCollector.Update(ch); err != nil { + level.Error(c.logger).Log("msg", "Failed to update RDMA stats", "err", err) + } + }() + } + // Wait for all go routines wg.Wait() @@ -330,6 +357,13 @@ func (c *slurmCollector) Stop(ctx context.Context) error { } } + // Stop rdmaCollector + if rdmaCollectorEnabled() { + if err := c.rdmaCollector.Stop(ctx); err != nil { + level.Error(c.logger).Log("msg", "Failed to stop RDMA collector", "err", err) + } + } + return nil } diff --git a/pkg/collector/slurm_test.go b/pkg/collector/slurm_test.go index a7c9ef2f..ffaa5ef6 100644 --- a/pkg/collector/slurm_test.go +++ b/pkg/collector/slurm_test.go @@ -36,10 +36,12 @@ func TestNewSlurmCollector(t *testing.T) { []string{ "--path.cgroupfs", "testdata/sys/fs/cgroup", "--path.procfs", "testdata/proc", + "--path.sysfs", "testdata/sys", "--collector.slurm.gpu-job-map-path", "testdata/gpujobmap", "--collector.slurm.swap-memory-metrics", "--collector.slurm.psi-metrics", "--collector.perf.hardware-events", + "--collector.rdma.stats", "--collector.slurm.nvidia-smi-path", "testdata/nvidia-smi", "--collector.cgroups.force-version", "v2", }, diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt index a9652bab..8d8653c9 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt @@ -102,6 +102,56 @@ ceems_meminfo_MemTotal_bytes{hostname=""} 1.6042172416e+10 # TYPE ceems_rapl_package_joules_total counter ceems_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 258218.293244 ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 130570.505826 +# HELP ceems_rdma_cqe_len_active Length of active CQs +# TYPE ceems_rdma_cqe_len_active gauge +ceems_rdma_cqe_len_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="1320003"} 8190 +ceems_rdma_cqe_len_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="4824887"} 8190 +# HELP ceems_rdma_cqs_active Number of active CQs +# TYPE ceems_rdma_cqs_active gauge +ceems_rdma_cqs_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="1320003"} 2 +ceems_rdma_cqs_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="4824887"} 2 +# HELP ceems_rdma_mrs_active Number of active MRs +# TYPE ceems_rdma_mrs_active gauge +ceems_rdma_mrs_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="1320003"} 2 +ceems_rdma_mrs_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="4824887"} 2 +# HELP ceems_rdma_mrs_len_active Length of active MRs +# TYPE ceems_rdma_mrs_len_active gauge +ceems_rdma_mrs_len_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="1320003"} 4.194304e+06 +ceems_rdma_mrs_len_active{device="mlx5_0",hostname="",manager="slurm",port="",uuid="4824887"} 4.194304e+06 +# HELP ceems_rdma_port_data_received_bytes_total Number of data octets received on all links +# TYPE ceems_rdma_port_data_received_bytes_total counter +ceems_rdma_port_data_received_bytes_total{device="hfi1_0",hostname="",manager="slurm",port="1"} 1.380366808104e+12 +ceems_rdma_port_data_received_bytes_total{device="mlx4_0",hostname="",manager="slurm",port="1"} 8.884894436e+09 +ceems_rdma_port_data_received_bytes_total{device="mlx4_0",hostname="",manager="slurm",port="2"} 9.841747136e+09 
+ceems_rdma_port_data_received_bytes_total{device="mlx5_0",hostname="",manager="slurm",port="1"} 7.2505381512e+10 +# HELP ceems_rdma_port_data_transmitted_bytes_total Number of data octets transmitted on all links +# TYPE ceems_rdma_port_data_transmitted_bytes_total counter +ceems_rdma_port_data_transmitted_bytes_total{device="hfi1_0",hostname="",manager="slurm",port="1"} 1.094233306172e+12 +ceems_rdma_port_data_transmitted_bytes_total{device="mlx4_0",hostname="",manager="slurm",port="1"} 1.0603645318e+11 +ceems_rdma_port_data_transmitted_bytes_total{device="mlx4_0",hostname="",manager="slurm",port="2"} 1.0616142756e+11 +ceems_rdma_port_data_transmitted_bytes_total{device="mlx5_0",hostname="",manager="slurm",port="1"} 1.1523046035392e+13 +# HELP ceems_rdma_port_packets_received_total Number of packets received on all VLs by this port (including errors) +# TYPE ceems_rdma_port_packets_received_total counter +ceems_rdma_port_packets_received_total{device="hfi1_0",hostname="",manager="slurm",port="1"} 6.38036947e+08 +ceems_rdma_port_packets_received_total{device="mlx4_0",hostname="",manager="slurm",port="1"} 8.7169372e+07 +ceems_rdma_port_packets_received_total{device="mlx4_0",hostname="",manager="slurm",port="2"} 8.9332064e+07 +ceems_rdma_port_packets_received_total{device="mlx5_0",hostname="",manager="slurm",port="1"} 5.41889824e+08 +# HELP ceems_rdma_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors) +# TYPE ceems_rdma_port_packets_transmitted_total counter +ceems_rdma_port_packets_transmitted_total{device="hfi1_0",hostname="",manager="slurm",port="1"} 5.68318856e+08 +ceems_rdma_port_packets_transmitted_total{device="mlx4_0",hostname="",manager="slurm",port="1"} 8.5734114e+07 +ceems_rdma_port_packets_transmitted_total{device="mlx4_0",hostname="",manager="slurm",port="2"} 8.862285e+07 +ceems_rdma_port_packets_transmitted_total{device="mlx5_0",hostname="",manager="slurm",port="1"} 1.0907922116e+10 +# HELP ceems_rdma_qps_active Number of active QPs +# TYPE ceems_rdma_qps_active gauge +ceems_rdma_qps_active{device="mlx5_0",hostname="",manager="slurm",port="1",uuid="1320003"} 16 +ceems_rdma_qps_active{device="mlx5_0",hostname="",manager="slurm",port="1",uuid="4824887"} 16 +# HELP ceems_rdma_state_id State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer) +# TYPE ceems_rdma_state_id gauge +ceems_rdma_state_id{device="hfi1_0",hostname="",manager="slurm",port="1"} 4 +ceems_rdma_state_id{device="mlx4_0",hostname="",manager="slurm",port="1"} 4 +ceems_rdma_state_id{device="mlx4_0",hostname="",manager="slurm",port="2"} 4 +ceems_rdma_state_id{device="mlx5_0",hostname="",manager="slurm",port="1"} 4 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. 
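The mock rdma script added below mimics the output of the iproute2 rdma tool so that the parsing paths above can be exercised in tests without RDMA hardware. As a minimal, self-contained sketch of how devMR consumes one line of "rdma resource show mr" output (the regexes are copied from pkg/collector/rdma.go above; the sample line is taken from the fixture below):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// One line of `rdma resource show mr` output, as emitted by the test fixture below.
	line := "dev mlx5_0 mrn 4 mrlen 2097152 pdn 9 pid 46231 comm ib_write_bw"

	// Same regexes devMR uses to extract the device, the owning PID and the MR length.
	devRegex := regexp.MustCompile(`^dev\s*([a-z0-9_]+)`)
	pidRegex := regexp.MustCompile(`.+?pid\s*([\d]+)`)
	mrlenRegex := regexp.MustCompile(`.+?mrlen\s*([\d]+)`)

	// All three patterns match the sample line, so indexing submatch 1 is safe here.
	fmt.Println(devRegex.FindStringSubmatch(line)[1])   // mlx5_0
	fmt.Println(pidRegex.FindStringSubmatch(line)[1])   // 46231
	fmt.Println(mrlenRegex.FindStringSubmatch(line)[1]) // 2097152
}

The extracted PID is then mapped back to a cgroup ID via procCgroups, which is how MRs, CQs and QPs get attributed to a compute unit (the uuid label in the metrics above).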
diff --git a/pkg/collector/testdata/rdma b/pkg/collector/testdata/rdma new file mode 100755 index 00000000..eec4bd86 --- /dev/null +++ b/pkg/collector/testdata/rdma @@ -0,0 +1,118 @@ +#!/bin/sh + +sub_help(){ + echo "Usage: rdma [ OPTIONS ] OBJECT { COMMAND | help }" + echo " rdma [ -f[orce] ] -b[atch] filename" + echo "where OBJECT := { dev | link | resource | system | statistic | help }" + echo " OPTIONS := { -V[ersion] | -d[etails] | -j[son] | -p[retty] -r[aw]}" +} + +print_mr(){ + echo """dev mlx5_0 mrn 4 mrlen 2097152 pdn 9 pid 46231 comm ib_write_bw +dev mlx5_0 mrn 5 mrlen 2097152 pdn 8 pid 46235 comm ib_write_bw +dev mlx5_0 mrn 4 mrlen 2097152 pdn 9 pid 46236 comm ib_write_bw +dev mlx5_0 mrn 5 mrlen 2097152 pdn 8 pid 46281 comm ib_write_bw""" +} + +print_cq(){ + echo """dev mlx5_0 cqn 1 cqe 2047 users 5 poll-ctx WORKQUEUE adaptive-moderation on comm [ib_core] +dev mlx5_0 cqn 2 cqe 255 users 1 poll-ctx DIRECT adaptive-moderation on comm [mlx5_ib] +dev mlx5_0 cqn 3 cqe 255 users 0 poll-ctx DIRECT adaptive-moderation on comm [mlx5_ib] +dev mlx5_0 cqn 8 cqe 4095 users 32 adaptive-moderation off ctxn 4 pid 46231 comm ib_write_bw +dev mlx5_0 cqn 9 cqe 4095 users 32 adaptive-moderation off ctxn 5 pid 46235 comm ib_write_bw +dev mlx5_0 cqn 8 cqe 4095 users 32 adaptive-moderation off ctxn 4 pid 46236 comm ib_write_bw +dev mlx5_0 cqn 9 cqe 4095 users 32 adaptive-moderation off ctxn 5 pid 46281 comm ib_write_bw""" +} + +print_qp(){ + echo """link mlx5_0/- lqpn 0 type SMI state RTS sq-psn 0 comm [ib_core] +link mlx5_0/- lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core] +link mlx5_0/1 lqpn 101 type UD state RTS sq-psn 79 comm [ib_core] +link mlx5_0/1 lqpn 813 rqpn 814 type RC state RTS rq-psn 9940491 sq-psn 2406910 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 814 rqpn 813 type RC state RTR rq-psn 2406926 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 815 rqpn 816 type RC state RTR rq-psn 13129518 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 816 rqpn 815 type RC state RTS rq-psn 5560784 sq-psn 13129534 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 817 rqpn 818 type RC state RTR rq-psn 11593195 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 818 rqpn 817 type RC state RTS rq-psn 9218980 sq-psn 11593210 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 819 rqpn 820 type RC state RTR rq-psn 5734471 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 820 rqpn 819 type RC state RTS rq-psn 16423148 sq-psn 5734486 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 821 rqpn 822 type RC state RTR rq-psn 869801 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 822 rqpn 821 type RC state RTS rq-psn 9391558 sq-psn 869817 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 823 rqpn 824 type RC state RTR rq-psn 5156666 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 824 rqpn 823 type RC state RTS rq-psn 9298810 sq-psn 5156682 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 825 rqpn 826 type RC state RTR rq-psn 15415907 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 826 rqpn 825 type RC state RTS rq-psn 11846939 sq-psn 15415923 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw 
+link mlx5_0/1 lqpn 827 rqpn 828 type RC state RTR rq-psn 6749855 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 828 rqpn 827 type RC state RTS rq-psn 4257602 sq-psn 6749872 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 829 rqpn 830 type RC state RTR rq-psn 4637926 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 830 rqpn 829 type RC state RTS rq-psn 16710024 sq-psn 4637942 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 831 rqpn 832 type RC state RTR rq-psn 15710300 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 832 rqpn 831 type RC state RTS rq-psn 7371059 sq-psn 15710316 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 833 rqpn 834 type RC state RTR rq-psn 9654443 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 834 rqpn 833 type RC state RTS rq-psn 5445009 sq-psn 9654460 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 835 rqpn 836 type RC state RTR rq-psn 14796958 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 836 rqpn 835 type RC state RTS rq-psn 1943687 sq-psn 14796974 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 837 rqpn 838 type RC state RTR rq-psn 1242029 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 838 rqpn 837 type RC state RTS rq-psn 1082929 sq-psn 1242045 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 839 rqpn 840 type RC state RTR rq-psn 15154813 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 840 rqpn 839 type RC state RTS rq-psn 10133331 sq-psn 15154829 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 841 rqpn 842 type RC state RTR rq-psn 9704396 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 842 rqpn 841 type RC state RTS rq-psn 7626827 sq-psn 9704413 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw +link mlx5_0/1 lqpn 843 rqpn 844 type RC state RTR rq-psn 11722317 sq-psn 0 path-mig-state MIGRATED pdn 9 pid 46235 comm ib_write_bw +link mlx5_0/1 lqpn 844 rqpn 843 type RC state RTS rq-psn 16033001 sq-psn 11722333 path-mig-state MIGRATED pdn 8 pid 46231 comm ib_write_bw""" +} + +print_qp_stat(){ + echo """link mlx5_0/1 cntn 4 qp-type RC pid 46231 comm ib_write_bw rx_write_requests 0 rx_read_requests 0 rx_atomic_requests 0 out_of_buffer 0 out_of_sequence 0 duplicate_request 0 rnr_nak_retry_err 0 packet_seq_err 0 implied_nak_seq_err 0 local_ack_timeout_err 0 rp_cnp_ignored 0 rp_cnp_handled 0 np_ecn_marked_roce_packets 0 np_cnp_sent 0 + LQPN: <813,816,818,820,822,824,826,828,830,832,834,836,838,840,842,844> +link mlx5_0/1 cntn 5 qp-type RC pid 46235 comm ib_write_bw rx_write_requests 41988882 rx_read_requests 0 rx_atomic_requests 0 out_of_buffer 0 out_of_sequence 0 duplicate_request 0 rnr_nak_retry_err 0 packet_seq_err 0 implied_nak_seq_err 0 local_ack_timeout_err 0 rp_cnp_ignored 0 rp_cnp_handled 0 np_ecn_marked_roce_packets 0 np_cnp_sent 0 + LQPN: <814,815,817,819,821,823,825,827,829,831,833,835,837,839,841,843>""" +} + +sub_resource(){ + case $2 in + "mr") + print_mr + ;; + "cq") + print_cq + ;; + "qp") + print_qp + ;; + *) + shift + echo "Error: unknown subcommand for resource." 
>&2 + exit 1 + ;; + esac +} + +sub_statistic(){ + case $1 in + "qp") + print_qp_stat + ;; + *) + shift + echo "Error: unknown subcommand for statistic." >&2 + exit 1 + ;; + esac +} + +subcommand=$1 +case $subcommand in + "" | "-h" | "--help") + sub_help + ;; + *) + shift + sub_${subcommand} $@ + if [ $? = 127 ]; then + echo "Error: '$subcommand' is not a known subcommand." >&2 + echo " Run '$ProgName --help' for a list of known subcommands." >&2 + exit 1 + fi + ;; +esac diff --git a/pkg/collector/testdata/sys.ttar b/pkg/collector/testdata/sys.ttar index 323a6a24..a880bdf4 100644 --- a/pkg/collector/testdata/sys.ttar +++ b/pkg/collector/testdata/sys.ttar @@ -1,10 +1,643 @@ -# Archive created by ttar -C pkg/collector/fixtures -c -f pkg/collector/fixtures/sys.ttar sys +# Archive created by ttar -C pkg/collector/testdata -c -f pkg/collector/testdata/sys.ttar sys Directory: sys Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/hfi1_0 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/board_id +Lines: 1 +HPE 100Gb 1-port OP101 QSFP28 x16 PCIe Gen3 with Intel Omni-Path Adapter +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/fw_ver +Lines: 1 +1.27.0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/hfi1_0/ports +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/hfi1_0/ports/1 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/hfi1_0/ports/1/counters +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/VL15_dropped +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/excessive_buffer_overrun_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/link_downed +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/link_error_recovery +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/local_link_integrity_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_rcv_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_rcv_data +Lines: 1 +345091702026 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_rcv_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
+Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_rcv_packets +Lines: 1 +638036947 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_rcv_remote_physical_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_rcv_switch_relay_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_xmit_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_xmit_data +Lines: 1 +273558326543 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_xmit_discards +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_xmit_packets +Lines: 1 +568318856 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/port_xmit_wait +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/counters/symbol_error +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/phys_state +Lines: 1 +5: LinkUp +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/rate +Lines: 1 +100 Gb/sec (4X EDR) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/hfi1_0/ports/1/state +Lines: 1 +4: ACTIVE +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx4_0 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/board_id +Lines: 1 +SM_1141000001000 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/fw_ver +Lines: 1 +2.31.5050 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/hca_type +Lines: 1 +MT4099 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx4_0/ports +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx4_0/ports/1 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx4_0/ports/1/counters +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/VL15_dropped +Lines: 1 +0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/excessive_buffer_overrun_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: 
sys/class/infiniband/mlx4_0/ports/1/counters/link_downed +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/local_link_integrity_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data +Lines: 1 +2221223609 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_packets +Lines: 1 +87169372 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_remote_physical_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_switch_relay_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data +Lines: 1 +26509113295 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_discards +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_packets +Lines: 1 +85734114 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_wait +Lines: 1 +3599 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/symbol_error +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/phys_state +Lines: 1 +5: LinkUp +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/rate +Lines: 1 +40 Gb/sec (4X QDR) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/state +Lines: 1 +4: ACTIVE +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx4_0/ports/2 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx4_0/ports/2/counters +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/VL15_dropped +Lines: 1 +0 +Mode: 664 +# ttar - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/excessive_buffer_overrun_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/link_downed +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/local_link_integrity_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data +Lines: 1 +2460436784 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_packets +Lines: 1 +89332064 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_remote_physical_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_switch_relay_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data +Lines: 1 +26540356890 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_discards +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_packets +Lines: 1 +88622850 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_wait +Lines: 1 +3846 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/counters/symbol_error +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/phys_state +Lines: 1 +5: LinkUp +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/rate +Lines: 1 +40 Gb/sec (4X QDR) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/2/state +Lines: 1 +4: ACTIVE +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx5_0 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - +Path: sys/class/infiniband/mlx5_0/board_id +Lines: 1 +SM_2001000001034 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/fw_ver +Lines: 1 +14.28.2006 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/hca_type +Lines: 1 +MT4118 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx5_0/ports +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx5_0/ports/1 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx5_0/ports/1/counters +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/VL15_dropped +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/excessive_buffer_overrun_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/link_downed +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/link_error_recovery +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/local_link_integrity_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/multicast_rcv_packets +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/multicast_xmit_packets +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_rcv_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_rcv_data +Lines: 1 +18126345378 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_rcv_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_rcv_packets +Lines: 1 +541889824 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_rcv_remote_physical_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_rcv_switch_relay_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_xmit_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_xmit_data +Lines: 1 +2880761508848 +Mode: 644 +# ttar - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_xmit_discards +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_xmit_packets +Lines: 1 +10907922116 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/port_xmit_wait +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/symbol_error +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/unicast_rcv_packets +Lines: 1 +541889824 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/counters/unicast_xmit_packets +Lines: 1 +10907922116 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/mlx5_0/ports/1/hw_counters +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/duplicate_request +Lines: 1 +41 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/implied_nak_seq_err +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/lifespan +Lines: 1 +10 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/local_ack_timeout_err +Lines: 1 +131 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/out_of_buffer +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/out_of_sequence +Lines: 1 +1 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/packet_seq_err +Lines: 1 +1 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/req_cqe_error +Lines: 1 +3481 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/req_cqe_flush_error +Lines: 1 +80 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/req_remote_access_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/req_remote_invalid_request +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/resp_cqe_error +Lines: 1 +8109 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/resp_cqe_flush_error +Lines: 1 +4708 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/resp_local_length_error +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/resp_remote_access_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/rnr_nak_retry_err +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/roce_adp_retrans +Lines: 1 +99 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/roce_adp_retrans_to +Lines: 1 +4 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/roce_slow_restart +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/roce_slow_restart_cnps +Lines: 1 +131 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/roce_slow_restart_trans +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/rx_atomic_requests +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/rx_dct_connect +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/rx_read_requests +Lines: 1 +175528982 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/hw_counters/rx_write_requests +Lines: 1 +742114 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/phys_state +Lines: 1 +4: ACTIVE +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/rate +Lines: 1 +25 Gb/sec (1X EDR) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx5_0/ports/1/state +Lines: 1 +4: ACTIVE +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class/powercap Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index ff88c762..7bab3f0b 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -349,6 +349,8 @@ then --collector.slurm.gpu-type="nvidia" \ --collector.slurm.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ --collector.slurm.gpu-job-map-path="pkg/collector/testdata/gpujobmap" \ + --collector.rdma.stats \ + --collector.rdma.cmd="pkg/collector/testdata/rdma" \ --collector.empty-hostname-label \ --collector.ipmi_dcmi.test-mode \ --web.listen-address "127.0.0.1:${port}" \ diff --git a/website/cspell.json b/website/cspell.json index 1206cf0e..a7e8404c 100644 --- a/website/cspell.json +++ b/website/cspell.json @@ -51,7 +51,8 @@ "ebpf", "cpus", "memsw", - "retrans" + 
"retrans", + "Mellanox" ], // flagWords - list of words to be always considered incorrect // This is useful for offensive words and common spelling errors. diff --git a/website/docs/components/ceems-exporter.md b/website/docs/components/ceems-exporter.md index 07f2cc32..3ff0cd87 100644 --- a/website/docs/components/ceems-exporter.md +++ b/website/docs/components/ceems-exporter.md @@ -26,6 +26,7 @@ metrics like IO, networking, performance _etc_. Currently available sub-collecto - Perf sub-collector: Exports hardware, software and cache performance metrics - eBPF sub-collector: Exports IO and network related metrics +- RDMA sub-collector: Exports selected RDMA stats These sub-collectors are not meant to work alone and they can enabled only when a main collector that monitors resource manager's compute units is activated. @@ -177,6 +178,53 @@ per protocol (TCP/UDP) and per IP family (IPv4/IPv6). - Number of retransmission bytes (only for TCP) - Number of retransmission packets (only for TCP) +### RDMA sub-collector + +Data transfer in RDMA happens directly between RDMA NIC and remote machine memory bypassing +CPU. Thus, it is hard to trace the RDMA's data transfer on a compute unit granularity. However, +the system wide data transfer metrics are readily available at `/sys/class/infiniband` +pseudo-filesystem. Thus, this sub-collector exports important system wide RDMA stats along +with few low-level metrics on a compute unit level. + +#### System wide RDMA stats + +- Number of data octets received on all links +- Number of data octets transmitted on all links +- Number of packets received on all VLs by this port (including errors) +- Number of packets transmitted on all VLs from this port (including errors) +- Number of packets received on the switch physical port that are discarded +- Number of packets not transmitted from the switch physical port +- Number of inbound packets discarded by the port because the port is down or congested +- Number of outbound packets discarded by the port because the port is down or congested +- Number of packets containing an error that were received on this port +- State of the InfiniBand port + +#### Per compute unit RDMA stats + +- Number of active Queue Pairs (QPs) +- Number of active Completion Queues (CQs) +- Number of active Memory Regions (MRs) +- Length of active CQs +- Length of active MRs + +In the case of Mellanox devices, following metrics are available for each compute unit: + +- Number of received write requests for the associated QPs +- Number of received read requests for the associated QPs +- Number of received atomic request for the associated QPs +- Number of times requester detected CQEs completed with errors +- Number of times requester detected CQEs completed with flushed errors +- Number of times requester detected remote access errors +- Number of times requester detected remote invalid request errors +- Number of times responder detected CQEs completed with errors +- Number of times responder detected CQEs completed with flushed errors +- Number of times responder detected local length errors +- Number of times responder detected remote access errors + +In order to interpret these metrics, please take a look at this +[very nice blog](https://cuterwrite.top/en/p/rdma-element/) which explains internals +of RDMA very well. 
+
 ## Collectors
 
 ### Slurm collector
diff --git a/website/docs/components/metrics.md b/website/docs/components/metrics.md
index b322ecf7..af99094b 100644
--- a/website/docs/components/metrics.md
+++ b/website/docs/components/metrics.md
@@ -79,3 +79,29 @@ shows the collector that metric belongs to.
 | ebpf | ceems_ebpf_egress_bytes_total | manager, uuid, proto, family | Total number of egress bytes of protocol `proto` and family `family` by compute unit identified by label `uuid`. |
 | ebpf | ceems_ebpf_retrans_packets_total | manager, uuid, proto, family | Total number of retransmission packets of protocol `proto` and family `family` by compute unit identified by label `uuid` (Only for TCP). |
 | ebpf | ceems_ebpf_retrans_bytes_total | manager, uuid, proto, family | Total number of retransmission bytes of protocol `proto` and family `family` by compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_port_constraint_errors_received_total | manager, device, port | Total number of packets received on the switch physical port that are discarded (system-wide metric). |
+| rdma | ceems_rdma_port_constraint_errors_transmitted_total | manager, device, port | Total number of packets not transmitted from the switch physical port (system-wide metric). |
+| rdma | ceems_rdma_port_data_received_bytes_total | manager, device, port | Total number of data octets received on all links (system-wide metric). |
+| rdma | ceems_rdma_port_data_transmitted_bytes_total | manager, device, port | Total number of data octets transmitted on all links (system-wide metric). |
+| rdma | ceems_rdma_port_discards_received_total | manager, device, port | Total number of inbound packets discarded by the port because the port is down or congested (system-wide metric). |
+| rdma | ceems_rdma_port_discards_transmitted_total | manager, device, port | Total number of outbound packets discarded by the port because the port is down or congested (system-wide metric). |
+| rdma | ceems_rdma_port_errors_received_total | manager, device, port | Total number of packets containing an error that were received on this port (system-wide metric). |
+| rdma | ceems_rdma_port_packets_received_total | manager, device, port | Total number of packets received on all VLs by this port (including errors) (system-wide metric). |
+| rdma | ceems_rdma_port_packets_transmitted_total | manager, device, port | Total number of packets transmitted on all VLs from this port (including errors) (system-wide metric). |
+| rdma | ceems_rdma_state_id | manager, device, port | State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer). |
+| rdma | ceems_rdma_rx_write_requests | manager, uuid, device, port | Total number of received write requests for the associated QPs for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_rx_read_requests | manager, uuid, device, port | Total number of received read requests for the associated QPs for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_rx_atomic_requests | manager, uuid, device, port | Total number of received atomic requests for the associated QPs for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_req_cqe_error | manager, uuid, device, port | Total number of times requester detected CQEs completed with errors for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_req_cqe_flush_error | manager, uuid, device, port | Total number of times requester detected CQEs completed with flushed errors for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_req_remote_access_errors | manager, uuid, device, port | Total number of times requester detected remote access errors for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_req_remote_invalid_request | manager, uuid, device, port | Total number of times requester detected remote invalid request errors for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_resp_cqe_error | manager, uuid, device, port | Total number of times responder detected CQEs completed with errors for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_resp_cqe_flush_error | manager, uuid, device, port | Total number of times responder detected CQEs completed with flushed errors for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_resp_local_length_error | manager, uuid, device, port | Total number of times responder detected local length errors for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_resp_remote_access_errors | manager, uuid, device, port | Total number of times responder detected remote access errors for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_qps_active | manager, uuid, device, port | Total number of active QPs for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_cqs_active | manager, uuid, device, port | Total number of active CQs for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_mrs_active | manager, uuid, device, port | Total number of active MRs for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_cqe_len_active | manager, uuid, device, port | Total length of active CQEs for device `device` and compute unit identified by label `uuid`. |
+| rdma | ceems_rdma_mrs_len_active | manager, uuid, device, port | Total length of active MRs for device `device` and compute unit identified by label `uuid`. |
diff --git a/website/md-link-check.json b/website/md-link-check.json
index 83f30a88..311f8a35 100644
--- a/website/md-link-check.json
+++ b/website/md-link-check.json
@@ -18,6 +18,9 @@
     },
     {
       "pattern": "https://tbhaxor.com/understanding-linux-capabilities/"
+    },
+    {
+      "pattern": "https://cuterwrite.top/en/p/rdma-element/"
     }
   ],
   "replacementPatterns": [