Skip to content

Commit

Permalink
Add libvirt collector (#186)
Browse files Browse the repository at this point in the history
* feat: Add libvirt collector

* feat: Support block IO metrics. Block IO metrics from cgroups can be fetched now.

* Keep a map from cgroup IDs to compute unit UUIDs so that cgroup IDs can be translated to UUIDs.

* Log with sub_collector key when applicable

* docs: Update docs with libvirt sections

---------

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri authored Oct 10, 2024
1 parent 7413fe9 commit 1af0bdb
Show file tree
Hide file tree
Showing 35 changed files with 9,003 additions and 640 deletions.
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ test-e2e: build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc
./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu
./scripts/e2e-test.sh -s exporter-cgroups-v2-procfs
./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics
./scripts/e2e-test.sh -s exporter-cgroups-v1-libvirt
./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt
else
.PHONY: test-e2e
test-e2e: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked
Expand Down Expand Up @@ -204,6 +206,8 @@ test-e2e-update: build pkg/collector/testdata/sys/.unpacked pkg/collector/testda
./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v2-procfs -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v1-libvirt -u || true
./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt -u || true
else
.PHONY: test-e2e-update
test-e2e-update: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked
Expand Down
14 changes: 7 additions & 7 deletions etc/nvidia-dcgm-exporter/counters.csv
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,14 @@ DCGM_FI_DRIVER_VERSION, label, Driver Version

# Profiling metrics. Ref: https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#profiling-metrics
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Fraction of time any portion of the graphics or compute engines were active.
DCGM_FI_PROF_SM_ACTIVE, gauge, Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.
DCGM_FI_PROF_SM_OCCUPANCY, gauge, Fraction of resident warps on a multiprocessor, relative to the maximum number of concurrent warps supported on a multiprocessor.
DCGM_FI_PROF_SM_ACTIVE, gauge, Fraction of time at least one warp was active on a multiprocessor averaged over all multiprocessors.
DCGM_FI_PROF_SM_OCCUPANCY, gauge, Fraction of resident warps on a multiprocessor relative to the maximum number of concurrent warps supported on a multiprocessor.
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Fraction of cycles the tensor (HMMA / IMMA) pipe was active.
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Fraction of cycles the FP64 (double precision) pipe was active.
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Fraction of cycles the FMA (FP32 (single precision), and integer) pipe was active.
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Fraction of cycles the FMA (FP32 (single precision) and integer) pipe was active.
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Fraction of cycles the FP16 (half precision) pipe was active. The value represents an average over a time interval and is not an instantaneous value.
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Fraction of cycles where data was sent to or received from device memory.
DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, Total rate of data transmitted over NVLink, not including protocol headers, in bytes per second.
DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, Total rate of data received over NVLink, not including protocol headers, in bytes per second.
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total rate of data transmitted over PCIE, not including protocol headers, in bytes per second.
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total rate of data received over PCIE, not including protocol headers, in bytes per second.
DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, Total rate of data transmitted over NVLink not including protocol headers in bytes per second.
DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, Total rate of data received over NVLink not including protocol headers in bytes per second.
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total rate of data transmitted over PCIE not including protocol headers in bytes per second.
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total rate of data received over PCIE not including protocol headers in bytes per second.
10 changes: 2 additions & 8 deletions etc/slurm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,5 @@ This directory provides those scripts that should be used with SLURM.
An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/main/init/systemd/ceems_exporter_no_privs.service)
is also provided in the repo that can be used along with these prolog and epilog scripts.

> [!IMPORTANT]
> The CLI argument `--collector.slurm.gpu-job-map-path`
is hidden and cannot be seen in `ceems_exporter --help` output. However, this argument
exists in the exporter and can be used.

Even with such prolog and epilog scripts, operators should grant the user running CEEMS
exporter permissions to run `ipmi-dcmi` command as this command can be executable by only
`root` by default.
Even with such prolog and epilog scripts, operators should grant the CEEMS exporter
process additional privileges for collectors like `ipmi_dcmi`, `ebpf`, _etc_.
147 changes: 145 additions & 2 deletions pkg/collector/cgroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/blockdevice"
)

const (
Expand All @@ -35,6 +36,12 @@ const (
libvirt = "libvirt"
)

// Operation names carried by cgroups v1 block IO entries. The v1 stats
// code compares each entry's op against these to tell reads from writes.
const (
	readOp, writeOp = "Read", "Write"
)

// Regular expressions of cgroup paths for different resource managers.
/*
For v1 possibilities are /cpuacct/slurm/uid_1000/job_211
Expand All @@ -50,8 +57,16 @@ var (
)

// Ref: https://libvirt.org/cgroups.html#legacy-cgroups-layout
// Take escaped unicode characters in regex
/*
For v1 possibilities are /cpuacct/machine.slice/machine-qemu\x2d2\x2dinstance\x2d00000001.scope
/memory/machine.slice/machine-qemu\x2d2\x2dinstance\x2d00000001.scope
For v2 possibilities are /machine.slice/machine-qemu\x2d2\x2dinstance\x2d00000001.scope
/machine.slice/machine-qemu\x2d2\x2dinstance\x2d00000001.scope/libvirt
*/
var (
libvirtCgroupPathRegex = regexp.MustCompile("^.*/(?:.+?)instance-([0-9]+)(?:.*$)")
libvirtCgroupPathRegex = regexp.MustCompile("^.*/(?:.+?)-qemu-(?:[0-9]+)-(instance-[0-9a-f]+)(?:.*$)")
)

// CLI options.
Expand Down Expand Up @@ -206,7 +221,7 @@ func NewCgroupManager(name string) (*cgroupManager, error) {

// Add filter functions
manager.pathFilter = func(p string) bool {
return false
return strings.Contains(p, "/libvirt")
}
manager.procFilter = func(p string) bool {
return false
Expand Down Expand Up @@ -239,6 +254,11 @@ type cgMetric struct {
memswTotal float64
memswFailCount float64
memoryPressure float64
blkioReadBytes map[string]float64
blkioWriteBytes map[string]float64
blkioReadReqs map[string]float64
blkioWriteReqs map[string]float64
blkioPressure float64
rdmaHCAHandles map[string]float64
rdmaHCAObjects map[string]float64
uuid string
Expand All @@ -252,6 +272,7 @@ type cgroupCollector struct {
opts cgroupOpts
hostname string
hostMemTotal float64
blockDevices map[string]string
numCgs *prometheus.Desc
cgCPUUser *prometheus.Desc
cgCPUSystem *prometheus.Desc
Expand All @@ -266,13 +287,19 @@ type cgroupCollector struct {
cgMemswTotal *prometheus.Desc
cgMemswFailCount *prometheus.Desc
cgMemoryPressure *prometheus.Desc
cgBlkioReadBytes *prometheus.Desc
cgBlkioWriteBytes *prometheus.Desc
cgBlkioReadReqs *prometheus.Desc
cgBlkioWriteReqs *prometheus.Desc
cgBlkioPressure *prometheus.Desc
cgRDMAHCAHandles *prometheus.Desc
cgRDMAHCAObjects *prometheus.Desc
collectError *prometheus.Desc
}

// cgroupOpts holds the switches that enable optional groups of cgroup
// metrics in the cgroup collector.
type cgroupOpts struct {
// presumably gates the swap memory (memsw) metrics; its use is not visible
// in this part of the file, so confirm against Update.
collectSwapMemStats bool
// When true, Update emits the per-device block IO metrics.
collectBlockIOStats bool
// When true, Update emits the PSI (pressure) metrics.
collectPSIStats bool
}

Expand All @@ -292,12 +319,29 @@ func NewCgroupCollector(logger log.Logger, cgManager *cgroupManager, opts cgroup

defer file.Close()

// Build a lookup table from "major:minor" device numbers to device names
// out of the host's disk stats. Failures are only logged; the table then
// simply stays empty.
blockDevices := map[string]string{}

if bdFS, fsErr := blockdevice.NewFS(*procfsPath, *sysPath); fsErr != nil {
	level.Error(logger).Log("msg", "Failed to get list of block devices on the host", "err", fsErr)
} else if diskStats, statsErr := bdFS.ProcDiskstats(); statsErr != nil {
	level.Error(logger).Log("msg", "Failed to get stats of block devices on the host", "err", statsErr)
} else {
	for _, entry := range diskStats {
		devID := fmt.Sprintf("%d:%d", entry.Info.MajorNumber, entry.Info.MinorNumber)
		blockDevices[devID] = entry.Info.DeviceName
	}
}

return &cgroupCollector{
logger: logger,
cgroupManager: cgManager,
opts: opts,
hostMemTotal: memTotal,
hostname: hostname,
blockDevices: blockDevices,
numCgs: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "units"),
"Total number of jobs",
Expand Down Expand Up @@ -382,6 +426,36 @@ func NewCgroupCollector(logger log.Logger, cgManager *cgroupManager, opts cgroup
[]string{"manager", "hostname", "uuid"},
nil,
),
cgBlkioReadBytes: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_read_total_bytes"),
"Total block IO read bytes",
[]string{"manager", "hostname", "uuid", "device"},
nil,
),
cgBlkioWriteBytes: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_write_total_bytes"),
"Total block IO write bytes",
[]string{"manager", "hostname", "uuid", "device"},
nil,
),
cgBlkioReadReqs: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_read_total_requests"),
"Total block IO read requests",
[]string{"manager", "hostname", "uuid", "device"},
nil,
),
cgBlkioWriteReqs: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_write_total_requests"),
"Total block IO write requests",
[]string{"manager", "hostname", "uuid", "device"},
nil,
),
cgBlkioPressure: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_psi_seconds"),
"Total block IO PSI in seconds",
[]string{"manager", "hostname", "uuid", "device"},
nil,
),
cgRDMAHCAHandles: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "unit_rdma_hca_handles"),
"Current number of RDMA HCA handles",
Expand Down Expand Up @@ -436,6 +510,27 @@ func (c *cgroupCollector) Update(ch chan<- prometheus.Metric, metrics []cgMetric
ch <- prometheus.MustNewConstMetric(c.cgMemswFailCount, prometheus.GaugeValue, m.memswFailCount, c.cgroupManager.manager, c.hostname, m.uuid)
}

// Block IO stats
if c.opts.collectBlockIOStats {
	// Walk every map on its own instead of keying off the read-bytes map,
	// so a device that is missing from one map is still reported from the
	// others. Ranging over a nil map is a no-op, so unset maps are safe.
	blkioMetrics := []struct {
		desc   *prometheus.Desc
		values map[string]float64
	}{
		{c.cgBlkioReadBytes, m.blkioReadBytes},
		{c.cgBlkioWriteBytes, m.blkioWriteBytes},
		{c.cgBlkioReadReqs, m.blkioReadReqs},
		{c.cgBlkioWriteReqs, m.blkioWriteReqs},
	}

	for _, bm := range blkioMetrics {
		for device, v := range bm.values {
			// Zero-valued entries are not exported.
			if v > 0 {
				ch <- prometheus.MustNewConstMetric(bm.desc, prometheus.GaugeValue, v, c.cgroupManager.manager, c.hostname, m.uuid, device)
			}
		}
	}
}

// PSI stats
if c.opts.collectPSIStats {
ch <- prometheus.MustNewConstMetric(c.cgCPUPressure, prometheus.GaugeValue, m.cpuPressure, c.cgroupManager.manager, c.hostname, m.uuid)
Expand Down Expand Up @@ -646,6 +741,34 @@ func (c *cgroupCollector) statsV1(metric *cgMetric) {
}
}

// Get block IO stats
if blkio := stats.GetBlkio(); blkio != nil {
	metric.blkioReadBytes = make(map[string]float64)
	metric.blkioReadReqs = make(map[string]float64)
	metric.blkioWriteBytes = make(map[string]float64)
	metric.blkioWriteReqs = make(map[string]float64)

	// deviceName maps a "major:minor" ID to the device name. When the ID is
	// unknown it returns the ID itself, so that distinct unknown devices do
	// not collapse into a single entry with an empty name.
	deviceName := func(id string) string {
		if name, ok := c.blockDevices[id]; ok && name != "" {
			return name
		}

		return id
	}

	// Bytes transferred, split by operation.
	for _, stat := range blkio.GetIoServiceBytesRecursive() {
		devName := deviceName(fmt.Sprintf("%d:%d", stat.GetMajor(), stat.GetMinor()))

		switch stat.GetOp() {
		case readOp:
			metric.blkioReadBytes[devName] = float64(stat.GetValue())
		case writeOp:
			metric.blkioWriteBytes[devName] = float64(stat.GetValue())
		}
	}

	// Number of requests, split by operation.
	for _, stat := range blkio.GetIoServicedRecursive() {
		devName := deviceName(fmt.Sprintf("%d:%d", stat.GetMajor(), stat.GetMinor()))

		switch stat.GetOp() {
		case readOp:
			metric.blkioReadReqs[devName] = float64(stat.GetValue())
		case writeOp:
			metric.blkioWriteReqs[devName] = float64(stat.GetValue())
		}
	}
}

// Get RDMA metrics if available
if stats.GetRdma() != nil {
metric.rdmaHCAHandles = make(map[string]float64)
Expand Down Expand Up @@ -739,6 +862,26 @@ func (c *cgroupCollector) statsV2(metric *cgMetric) {
metric.memoryFailCount = float64(stats.GetMemoryEvents().GetOom())
}

// Get block IO stats
if ioStat := stats.GetIo(); ioStat != nil {
	metric.blkioReadBytes = make(map[string]float64)
	metric.blkioReadReqs = make(map[string]float64)
	metric.blkioWriteBytes = make(map[string]float64)
	metric.blkioWriteReqs = make(map[string]float64)

	for _, stat := range ioStat.GetUsage() {
		// Resolve "major:minor" to a device name. Fall back to the ID itself
		// when the device is unknown, so that distinct unknown devices do not
		// collapse into a single entry with an empty name.
		devID := fmt.Sprintf("%d:%d", stat.GetMajor(), stat.GetMinor())

		devName, ok := c.blockDevices[devID]
		if !ok || devName == "" {
			devName = devID
		}

		metric.blkioReadBytes[devName] = float64(stat.GetRbytes())
		metric.blkioReadReqs[devName] = float64(stat.GetRios())
		metric.blkioWriteBytes[devName] = float64(stat.GetWbytes())
		metric.blkioWriteReqs[devName] = float64(stat.GetWios())
	}

	if psi := ioStat.GetPSI(); psi != nil {
		// The "full" PSI total is scaled down by 1e6; presumably this converts
		// microseconds to seconds — confirm against the cgroups PSI units.
		metric.blkioPressure = float64(psi.GetFull().GetTotal()) / 1000000.0
	}
}

// Get RDMA stats
if stats.GetRdma() != nil {
metric.rdmaHCAHandles = make(map[string]float64)
Expand Down
5 changes: 5 additions & 0 deletions pkg/collector/cgroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@ func TestCgroupsV2Metrics(t *testing.T) {
memoryPressure: 0,
rdmaHCAHandles: map[string]float64{"hfi1_0": 479, "hfi1_1": 1479, "hfi1_2": 2479},
rdmaHCAObjects: map[string]float64{"hfi1_0": 340, "hfi1_1": 1340, "hfi1_2": 2340},
blkioReadBytes: map[string]float64{},
blkioWriteBytes: map[string]float64{},
blkioReadReqs: map[string]float64{},
blkioWriteReqs: map[string]float64{},
blkioPressure: 0,
err: false,
}

Expand Down
Loading

0 comments on commit 1af0bdb

Please sign in to comment.