diff --git a/Makefile b/Makefile index c6a4b6ed..4a90239e 100644 --- a/Makefile +++ b/Makefile @@ -155,6 +155,8 @@ test-e2e: build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc ./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu ./scripts/e2e-test.sh -s exporter-cgroups-v2-procfs ./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics + ./scripts/e2e-test.sh -s exporter-cgroups-v1-libvirt + ./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt else .PHONY: test-e2e test-e2e: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked @@ -204,6 +206,8 @@ test-e2e-update: build pkg/collector/testdata/sys/.unpacked pkg/collector/testda ./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-procfs -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics -u || true + ./scripts/e2e-test.sh -s exporter-cgroups-v1-libvirt -u || true + ./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt -u || true else .PHONY: test-e2e-update test-e2e-update: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked diff --git a/etc/nvidia-dcgm-exporter/counters.csv b/etc/nvidia-dcgm-exporter/counters.csv index e03abf76..16528436 100644 --- a/etc/nvidia-dcgm-exporter/counters.csv +++ b/etc/nvidia-dcgm-exporter/counters.csv @@ -78,14 +78,14 @@ DCGM_FI_DRIVER_VERSION, label, Driver Version # Profiling metrics. Ref: https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#profiling-metrics DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Fraction of time any portion of the graphics or compute engines were active. -DCGM_FI_PROF_SM_ACTIVE, gauge, Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. -DCGM_FI_PROF_SM_OCCUPANCY, gauge, Fraction of resident warps on a multiprocessor, relative to the maximum number of concurrent warps supported on a multiprocessor. +DCGM_FI_PROF_SM_ACTIVE, gauge, Fraction of time at least one warp was active on a multiprocessor averaged over all multiprocessors. +DCGM_FI_PROF_SM_OCCUPANCY, gauge, Fraction of resident warps on a multiprocessor relative to the maximum number of concurrent warps supported on a multiprocessor. DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Fraction of cycles the tensor (HMMA / IMMA) pipe was active. DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Fraction of cycles the FP64 (double precision) pipe was active. -DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Fraction of cycles the FMA (FP32 (single precision), and integer) pipe was active. +DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Fraction of cycles the FMA (FP32 (single precision) and integer) pipe was active. DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Fraction of cycles the FP16 (half precision) pipe was active. The value represents an average over a time interval and is not an instantaneous value. DCGM_FI_PROF_DRAM_ACTIVE, gauge, Fraction of cycles where data was sent to or received from device memory. -DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, Total rate of data transmitted over NVLink, not including protocol headers, in bytes per second. -DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, Total rate of data received over NVLink, not including protocol headers, in bytes per second. -DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total rate of data transmitted over PCIE, not including protocol headers, in bytes per second. -DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total rate of data received over PCIE, not including protocol headers, in bytes per second. 
+DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, Total rate of data transmitted over NVLink not including protocol headers in bytes per second. +DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, Total rate of data received over NVLink not including protocol headers in bytes per second. +DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total rate of data transmitted over PCIE not including protocol headers in bytes per second. +DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total rate of data received over PCIE not including protocol headers in bytes per second. diff --git a/etc/slurm/README.md b/etc/slurm/README.md index 54345f7c..92782df9 100644 --- a/etc/slurm/README.md +++ b/etc/slurm/README.md @@ -12,11 +12,5 @@ This directory provides those scripts that should be used with SLURM. An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/main/init/systemd/ceems_exporter_no_privs.service) is also provided in the repo that can be used along with these prolog and epilog scripts. -> [!IMPORTANT] -> The CLI argument `--collector.slurm.gpu-job-map-path` -is hidden and cannot be seen in `ceems_exporter --help` output. However, this argument -exists in the exporter and can be used. - -Even with such prolog and epilog scripts, operators should grant the user running CEEMS -exporter permissions to run `ipmi-dcmi` command as this command can be executable by only -`root` by default. +Even with such prolog and epilog scripts, operators should grant the CEEMS exporter +process additional privileges for collectors like `ipmi_dcmi`, `ebpf`, _etc_. diff --git a/pkg/collector/cgroup.go b/pkg/collector/cgroup.go index 707531b4..7da36e70 100644 --- a/pkg/collector/cgroup.go +++ b/pkg/collector/cgroup.go @@ -19,6 +19,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/procfs/blockdevice" ) const ( @@ -35,6 +36,12 @@ const ( libvirt = "libvirt" ) +// Block IO Op names. +const ( + readOp = "Read" + writeOp = "Write" +) + // Regular expressions of cgroup paths for different resource managers. /* For v1 possibilities are /cpuacct/slurm/uid_1000/job_211 @@ -50,8 +57,16 @@ var ( ) // Ref: https://libvirt.org/cgroups.html#legacy-cgroups-layout +// Take escaped unicode characters in regex +/* + For v1 possibilities are /cpuacct/machine.slice/machine-qemu\x2d2\x2dinstance\x2d00000001.scope + /memory/machine.slice/machine-qemu\x2d2\x2dinstance\x2d00000001.scope + + For v2 possibilities are /machine.slice/machine-qemu\x2d2\x2dinstance\x2d00000001.scope + /machine.slice/machine-qemu\x2d2\x2dinstance\x2d00000001.scope/libvirt +*/ var ( - libvirtCgroupPathRegex = regexp.MustCompile("^.*/(?:.+?)instance-([0-9]+)(?:.*$)") + libvirtCgroupPathRegex = regexp.MustCompile("^.*/(?:.+?)-qemu-(?:[0-9]+)-(instance-[0-9a-f]+)(?:.*$)") ) // CLI options. 
@@ -206,7 +221,7 @@ func NewCgroupManager(name string) (*cgroupManager, error) { // Add filter functions manager.pathFilter = func(p string) bool { - return false + return strings.Contains(p, "/libvirt") } manager.procFilter = func(p string) bool { return false @@ -239,6 +254,11 @@ type cgMetric struct { memswTotal float64 memswFailCount float64 memoryPressure float64 + blkioReadBytes map[string]float64 + blkioWriteBytes map[string]float64 + blkioReadReqs map[string]float64 + blkioWriteReqs map[string]float64 + blkioPressure float64 rdmaHCAHandles map[string]float64 rdmaHCAObjects map[string]float64 uuid string @@ -252,6 +272,7 @@ type cgroupCollector struct { opts cgroupOpts hostname string hostMemTotal float64 + blockDevices map[string]string numCgs *prometheus.Desc cgCPUUser *prometheus.Desc cgCPUSystem *prometheus.Desc @@ -266,6 +287,11 @@ type cgroupCollector struct { cgMemswTotal *prometheus.Desc cgMemswFailCount *prometheus.Desc cgMemoryPressure *prometheus.Desc + cgBlkioReadBytes *prometheus.Desc + cgBlkioWriteBytes *prometheus.Desc + cgBlkioReadReqs *prometheus.Desc + cgBlkioWriteReqs *prometheus.Desc + cgBlkioPressure *prometheus.Desc cgRDMAHCAHandles *prometheus.Desc cgRDMAHCAObjects *prometheus.Desc collectError *prometheus.Desc @@ -273,6 +299,7 @@ type cgroupCollector struct { type cgroupOpts struct { collectSwapMemStats bool + collectBlockIOStats bool collectPSIStats bool } @@ -292,12 +319,29 @@ func NewCgroupCollector(logger log.Logger, cgManager *cgroupManager, opts cgroup defer file.Close() + // Read block IO stats just to get block devices info. + // We construct a map from major:minor to device name using this info + blockDevices := make(map[string]string) + + if blockdevice, err := blockdevice.NewFS(*procfsPath, *sysPath); err == nil { + if stats, err := blockdevice.ProcDiskstats(); err == nil { + for _, s := range stats { + blockDevices[fmt.Sprintf("%d:%d", s.Info.MajorNumber, s.Info.MinorNumber)] = s.Info.DeviceName + } + } else { + level.Error(logger).Log("msg", "Failed to get stats of block devices on the host", "err", err) + } + } else { + level.Error(logger).Log("msg", "Failed to get list of block devices on the host", "err", err) + } + return &cgroupCollector{ logger: logger, cgroupManager: cgManager, opts: opts, hostMemTotal: memTotal, hostname: hostname, + blockDevices: blockDevices, numCgs: prometheus.NewDesc( prometheus.BuildFQName(Namespace, genericSubsystem, "units"), "Total number of jobs", @@ -382,6 +426,36 @@ func NewCgroupCollector(logger log.Logger, cgManager *cgroupManager, opts cgroup []string{"manager", "hostname", "uuid"}, nil, ), + cgBlkioReadBytes: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_read_total_bytes"), + "Total block IO read bytes", + []string{"manager", "hostname", "uuid", "device"}, + nil, + ), + cgBlkioWriteBytes: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_write_total_bytes"), + "Total block IO write bytes", + []string{"manager", "hostname", "uuid", "device"}, + nil, + ), + cgBlkioReadReqs: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_read_total_requests"), + "Total block IO read requests", + []string{"manager", "hostname", "uuid", "device"}, + nil, + ), + cgBlkioWriteReqs: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_write_total_requests"), + "Total block IO write requests", + []string{"manager", "hostname", "uuid", "device"}, + nil, + ), + cgBlkioPressure: 
prometheus.NewDesc( + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_blkio_psi_seconds"), + "Total block IO PSI in seconds", + []string{"manager", "hostname", "uuid", "device"}, + nil, + ), cgRDMAHCAHandles: prometheus.NewDesc( prometheus.BuildFQName(Namespace, genericSubsystem, "unit_rdma_hca_handles"), "Current number of RDMA HCA handles", @@ -436,6 +510,27 @@ func (c *cgroupCollector) Update(ch chan<- prometheus.Metric, metrics []cgMetric ch <- prometheus.MustNewConstMetric(c.cgMemswFailCount, prometheus.GaugeValue, m.memswFailCount, c.cgroupManager.manager, c.hostname, m.uuid) } + // Block IO stats + if c.opts.collectBlockIOStats { + for device := range m.blkioReadBytes { + if v, ok := m.blkioReadBytes[device]; ok && v > 0 { + ch <- prometheus.MustNewConstMetric(c.cgBlkioReadBytes, prometheus.GaugeValue, v, c.cgroupManager.manager, c.hostname, m.uuid, device) + } + + if v, ok := m.blkioWriteBytes[device]; ok && v > 0 { + ch <- prometheus.MustNewConstMetric(c.cgBlkioWriteBytes, prometheus.GaugeValue, v, c.cgroupManager.manager, c.hostname, m.uuid, device) + } + + if v, ok := m.blkioReadReqs[device]; ok && v > 0 { + ch <- prometheus.MustNewConstMetric(c.cgBlkioReadReqs, prometheus.GaugeValue, v, c.cgroupManager.manager, c.hostname, m.uuid, device) + } + + if v, ok := m.blkioWriteReqs[device]; ok && v > 0 { + ch <- prometheus.MustNewConstMetric(c.cgBlkioWriteReqs, prometheus.GaugeValue, v, c.cgroupManager.manager, c.hostname, m.uuid, device) + } + } + } + // PSI stats if c.opts.collectPSIStats { ch <- prometheus.MustNewConstMetric(c.cgCPUPressure, prometheus.GaugeValue, m.cpuPressure, c.cgroupManager.manager, c.hostname, m.uuid) @@ -646,6 +741,34 @@ func (c *cgroupCollector) statsV1(metric *cgMetric) { } } + // Get block IO stats + if stats.GetBlkio() != nil { + metric.blkioReadBytes = make(map[string]float64) + metric.blkioReadReqs = make(map[string]float64) + metric.blkioWriteBytes = make(map[string]float64) + metric.blkioWriteReqs = make(map[string]float64) + + for _, stat := range stats.GetBlkio().GetIoServiceBytesRecursive() { + devName := c.blockDevices[fmt.Sprintf("%d:%d", stat.GetMajor(), stat.GetMinor())] + + if stat.GetOp() == readOp { + metric.blkioReadBytes[devName] = float64(stat.GetValue()) + } else if stat.GetOp() == writeOp { + metric.blkioWriteBytes[devName] = float64(stat.GetValue()) + } + } + + for _, stat := range stats.GetBlkio().GetIoServicedRecursive() { + devName := c.blockDevices[fmt.Sprintf("%d:%d", stat.GetMajor(), stat.GetMinor())] + + if stat.GetOp() == readOp { + metric.blkioReadReqs[devName] = float64(stat.GetValue()) + } else if stat.GetOp() == writeOp { + metric.blkioWriteReqs[devName] = float64(stat.GetValue()) + } + } + } + // Get RDMA metrics if available if stats.GetRdma() != nil { metric.rdmaHCAHandles = make(map[string]float64) @@ -739,6 +862,26 @@ func (c *cgroupCollector) statsV2(metric *cgMetric) { metric.memoryFailCount = float64(stats.GetMemoryEvents().GetOom()) } + // Get block IO stats + if stats.GetIo() != nil { + metric.blkioReadBytes = make(map[string]float64) + metric.blkioReadReqs = make(map[string]float64) + metric.blkioWriteBytes = make(map[string]float64) + metric.blkioWriteReqs = make(map[string]float64) + + for _, stat := range stats.GetIo().GetUsage() { + devName := c.blockDevices[fmt.Sprintf("%d:%d", stat.GetMajor(), stat.GetMinor())] + metric.blkioReadBytes[devName] = float64(stat.GetRbytes()) + metric.blkioReadReqs[devName] = float64(stat.GetRios()) + metric.blkioWriteBytes[devName] = float64(stat.GetWbytes()) + 
metric.blkioWriteReqs[devName] = float64(stat.GetWios()) + } + + if stats.GetIo().GetPSI() != nil { + metric.blkioPressure = float64(stats.GetIo().GetPSI().GetFull().GetTotal()) / 1000000.0 + } + } + // Get RDMA stats if stats.GetRdma() != nil { metric.rdmaHCAHandles = make(map[string]float64) diff --git a/pkg/collector/cgroup_test.go b/pkg/collector/cgroup_test.go index b280d73a..f435f7a7 100644 --- a/pkg/collector/cgroup_test.go +++ b/pkg/collector/cgroup_test.go @@ -102,6 +102,11 @@ func TestCgroupsV2Metrics(t *testing.T) { memoryPressure: 0, rdmaHCAHandles: map[string]float64{"hfi1_0": 479, "hfi1_1": 1479, "hfi1_2": 2479}, rdmaHCAObjects: map[string]float64{"hfi1_0": 340, "hfi1_1": 1340, "hfi1_2": 2340}, + blkioReadBytes: map[string]float64{}, + blkioWriteBytes: map[string]float64{}, + blkioReadReqs: map[string]float64{}, + blkioWriteReqs: map[string]float64{}, + blkioPressure: 0, err: false, } diff --git a/pkg/collector/ebpf.go b/pkg/collector/ebpf.go index cb848218..b80dc684 100644 --- a/pkg/collector/ebpf.go +++ b/pkg/collector/ebpf.go @@ -137,44 +137,44 @@ type aggMetrics struct { // ebpfReadMapsCtxData contains the input/output data for // reading eBPF maps to execute inside security context. type ebpfReadMapsCtxData struct { - opts ebpfOpts - cgroupIDUUIDCache map[uint64]string - activeCgroupIDs []uint64 - netColl *ebpf.Collection - vfsColl *ebpf.Collection - aggMetrics *aggMetrics + opts ebpfOpts + cgroupIDUUIDCache map[uint64]string + activeCgroupInodes []uint64 + netColl *ebpf.Collection + vfsColl *ebpf.Collection + aggMetrics *aggMetrics } type ebpfCollector struct { - logger log.Logger - hostname string - opts ebpfOpts - cgroupManager *cgroupManager - cgroupIDUUIDCache map[uint64]string - cgroupPathIDCache map[string]uint64 - activeCgroupIDs []uint64 - netColl *ebpf.Collection - vfsColl *ebpf.Collection - links map[string]link.Link - securityContexts map[string]*security.SecurityContext - vfsWriteRequests *prometheus.Desc - vfsWriteBytes *prometheus.Desc - vfsWriteErrors *prometheus.Desc - vfsReadRequests *prometheus.Desc - vfsReadBytes *prometheus.Desc - vfsReadErrors *prometheus.Desc - vfsOpenRequests *prometheus.Desc - vfsOpenErrors *prometheus.Desc - vfsCreateRequests *prometheus.Desc - vfsCreateErrors *prometheus.Desc - vfsUnlinkRequests *prometheus.Desc - vfsUnlinkErrors *prometheus.Desc - netIngressPackets *prometheus.Desc - netIngressBytes *prometheus.Desc - netEgressPackets *prometheus.Desc - netEgressBytes *prometheus.Desc - netRetransPackets *prometheus.Desc - netRetransBytes *prometheus.Desc + logger log.Logger + hostname string + opts ebpfOpts + cgroupManager *cgroupManager + cgroupIDUUIDCache map[uint64]string + cgroupPathIDCache map[string]uint64 + activeCgroupInodes []uint64 + netColl *ebpf.Collection + vfsColl *ebpf.Collection + links map[string]link.Link + securityContexts map[string]*security.SecurityContext + vfsWriteRequests *prometheus.Desc + vfsWriteBytes *prometheus.Desc + vfsWriteErrors *prometheus.Desc + vfsReadRequests *prometheus.Desc + vfsReadBytes *prometheus.Desc + vfsReadErrors *prometheus.Desc + vfsOpenRequests *prometheus.Desc + vfsOpenErrors *prometheus.Desc + vfsCreateRequests *prometheus.Desc + vfsCreateErrors *prometheus.Desc + vfsUnlinkRequests *prometheus.Desc + vfsUnlinkErrors *prometheus.Desc + netIngressPackets *prometheus.Desc + netIngressBytes *prometheus.Desc + netEgressPackets *prometheus.Desc + netEgressBytes *prometheus.Desc + netRetransPackets *prometheus.Desc + netRetransBytes *prometheus.Desc } // NewEbpfCollector returns a 
new instance of ebpf collector. @@ -507,9 +507,11 @@ func NewEbpfCollector(logger log.Logger, cgManager *cgroupManager) (*ebpfCollect } // Update implements Collector and update job metrics. -func (c *ebpfCollector) Update(ch chan<- prometheus.Metric) error { +// cgroupIDUUIDMap provides a map to cgroupID to compute unit UUID. If the map is empty, it means +// cgroup ID and compute unit UUID is identical. +func (c *ebpfCollector) Update(ch chan<- prometheus.Metric, cgroupIDUUIDMap map[string]string) error { // Fetch all active cgroups - if err := c.discoverCgroups(); err != nil { + if err := c.discoverCgroups(cgroupIDUUIDMap); err != nil { return err } @@ -814,11 +816,11 @@ func (c *ebpfCollector) updateNetRetrans(ch chan<- prometheus.Metric, aggMetrics // readMaps reads the BPF maps in a security context and returns aggregate metrics. func (c *ebpfCollector) readMaps() (*aggMetrics, error) { dataPtr := &ebpfReadMapsCtxData{ - opts: c.opts, - cgroupIDUUIDCache: c.cgroupIDUUIDCache, - activeCgroupIDs: c.activeCgroupIDs, - vfsColl: c.vfsColl, - netColl: c.netColl, + opts: c.opts, + cgroupIDUUIDCache: c.cgroupIDUUIDCache, + activeCgroupInodes: c.activeCgroupInodes, + vfsColl: c.vfsColl, + netColl: c.netColl, } // Start new profilers within security context @@ -833,14 +835,14 @@ func (c *ebpfCollector) readMaps() (*aggMetrics, error) { // discoverCgroups walks through cgroup file system and discover all relevant cgroups based // on cgroupManager. -func (c *ebpfCollector) discoverCgroups() error { +func (c *ebpfCollector) discoverCgroups(cgroupIDUUIDMap map[string]string) error { // Get currently active uuids and cgroup paths to evict older entries in caches var activeCgroupUUIDs []string var activeCgroupPaths []string // Reset activeCgroups from last scrape - c.activeCgroupIDs = make([]uint64, 0) + c.activeCgroupInodes = make([]uint64, 0) // Walk through all cgroups and get cgroup paths if err := filepath.WalkDir(c.cgroupManager.mountPoint, func(p string, info fs.DirEntry, err error) error { @@ -853,19 +855,35 @@ func (c *ebpfCollector) discoverCgroups() error { return nil } + // Unescape UTF-8 characters in cgroup path + sanitizedPath, err := unescapeString(p) + if err != nil { + level.Error(c.logger).Log("msg", "Failed to sanitize cgroup path", "path", p, "err", err) + + return nil + } + // Get cgroup ID - cgroupIDMatches := c.cgroupManager.idRegex.FindStringSubmatch(p) + cgroupIDMatches := c.cgroupManager.idRegex.FindStringSubmatch(sanitizedPath) if len(cgroupIDMatches) <= 1 { return nil } - uuid := strings.TrimSpace(cgroupIDMatches[1]) - if uuid == "" { - level.Error(c.logger).Log("msg", "Empty UUID", "path", p) + cgroupID := strings.TrimSpace(cgroupIDMatches[1]) + if cgroupID == "" { + level.Error(c.logger).Log("msg", "Empty cgroup ID", "path", p) return nil } + // Get compute unit UUID from cgroup ID + var uuid string + if cgroupIDUUIDMap != nil { + uuid = cgroupIDUUIDMap[cgroupID] + } else { + uuid = cgroupID + } + // Get inode of the cgroup path if not already present in the cache if _, ok := c.cgroupPathIDCache[p]; !ok { if inode, err := inode(p); err == nil { @@ -877,10 +895,10 @@ func (c *ebpfCollector) discoverCgroups() error { c.cgroupIDUUIDCache[c.cgroupPathIDCache[p]] = uuid } - // Populate activeCgroupUUIDs, activeCgroupIDs and activeCgroupPaths + // Populate activeCgroupUUIDs, activeCgroupInodes and activeCgroupPaths activeCgroupPaths = append(activeCgroupPaths, p) activeCgroupUUIDs = append(activeCgroupUUIDs, uuid) - c.activeCgroupIDs = append(c.activeCgroupIDs, 
c.cgroupPathIDCache[p]) + c.activeCgroupInodes = append(c.activeCgroupInodes, c.cgroupPathIDCache[p]) level.Debug(c.logger).Log("msg", "cgroup path", "path", p) @@ -983,7 +1001,7 @@ func aggVFSStats(d *ebpfReadMapsCtxData) { } for entries.Next(&rwKey, &rwValue) { - if slices.Contains(d.activeCgroupIDs, uint64(rwKey.Cid)) { + if slices.Contains(d.activeCgroupInodes, uint64(rwKey.Cid)) { mount := unix.ByteSliceToString(rwKey.Mnt[:]) if !containsMount(mount, d.opts.vfsMountPoints) { continue @@ -1010,7 +1028,7 @@ func aggVFSStats(d *ebpfReadMapsCtxData) { } for entries.Next(&inodeKey, &inodeValue) { - if slices.Contains(d.activeCgroupIDs, uint64(inodeKey)) { + if slices.Contains(d.activeCgroupInodes, uint64(inodeKey)) { uuid := d.cgroupIDUUIDCache[uint64(inodeKey)] if v, ok := d.aggMetrics.inode[mapName][uuid]; ok { d.aggMetrics.inode[mapName][uuid] = bpfVfsInodeEvent{ @@ -1046,7 +1064,7 @@ func aggNetStats(d *ebpfReadMapsCtxData) { } for entries.Next(&key, &value) { - if slices.Contains(d.activeCgroupIDs, uint64(key.Cid)) { + if slices.Contains(d.activeCgroupInodes, uint64(key.Cid)) { promKey := promNetEventKey{ UUID: d.cgroupIDUUIDCache[uint64(key.Cid)], Proto: protoMap[int(key.Proto)], diff --git a/pkg/collector/ebpf_test.go b/pkg/collector/ebpf_test.go index cb060c91..9107cd71 100644 --- a/pkg/collector/ebpf_test.go +++ b/pkg/collector/ebpf_test.go @@ -186,7 +186,7 @@ func TestNewEbpfCollector(t *testing.T) { } }() - err = collector.Update(metrics) + err = collector.Update(metrics, nil) require.NoError(t, err) err = collector.Stop(context.Background()) @@ -224,10 +224,10 @@ func TestActiveCgroupsV2(t *testing.T) { } // Get active cgroups - err = c.discoverCgroups() + err = c.discoverCgroups(nil) require.NoError(t, err) - assert.Len(t, c.activeCgroupIDs, 39) + assert.Len(t, c.activeCgroupInodes, 39) assert.Len(t, c.cgroupIDUUIDCache, 39) assert.Len(t, c.cgroupPathIDCache, 39) @@ -273,10 +273,10 @@ func TestActiveCgroupsV1(t *testing.T) { } // Get active cgroups - err = c.discoverCgroups() + err = c.discoverCgroups(nil) require.NoError(t, err) - assert.Len(t, c.activeCgroupIDs, 6) + assert.Len(t, c.activeCgroupInodes, 6) assert.Len(t, c.cgroupIDUUIDCache, 6) assert.Len(t, c.cgroupPathIDCache, 6) diff --git a/pkg/collector/helper.go b/pkg/collector/helper.go index 217e117d..e2bc47de 100644 --- a/pkg/collector/helper.go +++ b/pkg/collector/helper.go @@ -17,16 +17,75 @@ import ( "github.com/prometheus/procfs" ) +type BusID struct { + domain uint64 + bus uint64 + slot uint64 + function uint64 +} + +// Compare compares the provided bus ID with current bus ID and +// returns true if they match and false in all other cases. +func (b *BusID) Compare(bTest BusID) bool { + // Check equality component per component in ID + if b.domain == bTest.domain && b.bus == bTest.bus && b.slot == bTest.slot && b.function == bTest.function { + return true + } else { + return false + } +} + +// Device contains the details of GPU devices. type Device struct { - index string - name string - uuid string - isMig bool + index string + name string + uuid string + busID BusID + isMig bool + isvGPU bool +} + +// String implements Stringer interface of the Device struct. +func (d Device) String() string { + return fmt.Sprintf( + "name: %s; index: %s; uuid: %s; bus_id: %v; is_mig_instance: %t; is_vgpu_instance: %t", + d.name, d.index, d.uuid, d.busID, d.isMig, d.isvGPU, + ) +} + +// CompareBusID compares the provided bus ID with device bus ID and +// returns true if they match and false in all other cases. 
+func (d *Device) CompareBusID(id string) bool { + // Parse bus id that needs to be compared + busID, err := parseBusID(id) + if err != nil { + return false + } + + // Check equality component per component in ID + return d.busID.Compare(busID) } var ( metricNameRegex = regexp.MustCompile(`_*[^0-9A-Za-z_]+_*`) reParens = regexp.MustCompile(`\((.*)\)`) + pciBusIDRegex = regexp.MustCompile(`(?P<domain>[0-9a-fA-F]+):(?P<bus>[0-9a-fA-F]+):(?P<slot>[0-9a-fA-F]+)\.(?P<function>[0-9a-fA-F]+)`) ) + +// Used for e2e tests. +var ( + gpuType = CEEMSExporterApp.Flag( + "collector.gpu.type", + "GPU device type. Currently only nvidia and amd devices are supported.", + ).Hidden().Enum("nvidia", "amd") + nvidiaSmiPath = CEEMSExporterApp.Flag( + "collector.gpu.nvidia-smi-path", + "Absolute path to nvidia-smi binary. Use only for testing.", + ).Hidden().Default("").String() + rocmSmiPath = CEEMSExporterApp.Flag( + "collector.gpu.rocm-smi-path", + "Absolute path to rocm-smi binary. Use only for testing.", + ).Hidden().Default("").String() ) // SanitizeMetricName sanitize the given metric name by replacing invalid characters by underscores. @@ -44,49 +103,6 @@ func SanitizeMetricName(metricName string) string { return metricNameRegex.ReplaceAllString(metricName, "_") } -// LoadCgroupsV2Metrics returns cgroup metrics from a given path. -func LoadCgroupsV2Metrics( - name string, - cgroupfsPath string, - controllers []string, -) (map[string]float64, error) { - data := make(map[string]float64) - - for _, fName := range controllers { - contents, err := os.ReadFile(filepath.Join(cgroupfsPath, name, fName)) - if err != nil { - return data, err - } - - for _, line := range strings.Split(string(contents), "\n") { - // Some of the above have a single value and others have a "data_name 123" - parts := strings.Fields(line) - indName := fName - indData := 0 - - if len(parts) == 1 || len(parts) == 2 { - if len(parts) == 2 { - indName += "." + parts[0] - indData = 1 - } - - if parts[indData] == "max" { - data[indName] = -1.0 - } else { - f, err := strconv.ParseFloat(parts[indData], 64) - if err == nil { - data[indName] = f - } else { - return data, err - } - } - } - } - } - - return data, nil -} - // GetGPUDevices returns GPU devices. func GetGPUDevices(gpuType string, logger log.Logger) (map[int]Device, error) { if gpuType == "nvidia" { @@ -111,7 +127,7 @@ func parseNvidiaSmiOutput(cmdOutput string, logger log.Logger) map[int]Device { } devDetails := strings.Split(line, ",") - if len(devDetails) < 3 { + if len(devDetails) < 4 { continue } @@ -119,6 +135,13 @@ devIndx := strings.TrimSpace(devDetails[0]) devName := strings.TrimSpace(devDetails[1]) devUUID := strings.TrimSpace(devDetails[2]) + devBusID := strings.TrimSpace(devDetails[3]) + + // Parse bus ID + busID, err := parseBusID(devBusID) + if err != nil { + level.Error(logger).Log("msg", "Failed to parse GPU bus ID", "bus_id", devBusID, "err", err) + } // Check if device is in MiG mode isMig := false @@ -126,10 +149,9 @@ isMig = true } - level.Debug(logger).
- Log("msg", "Found nVIDIA GPU", "name", devName, "UUID", devUUID, "isMig:", isMig) + gpuDevices[devIndxInt] = Device{index: devIndx, name: devName, uuid: devUUID, busID: busID, isMig: isMig} + level.Debug(logger).Log("msg", "Found nVIDIA GPU", "gpu", gpuDevices[devIndxInt]) - gpuDevices[devIndxInt] = Device{index: devIndx, name: devName, uuid: devUUID, isMig: isMig} devIndxInt++ } @@ -147,7 +169,7 @@ func parseNvidiaSmiOutput(cmdOutput string, logger log.Logger) map[int]Device { // nvml go bindings. This way we dont have deps on nvidia stuff and keep // exporter simple. // -// NOTE: Hoping this command returns MIG devices too. +// NOTE: This command does not return MIG devices. func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[int]Device, error) { // Check if nvidia-smi binary exists var nvidiaSmiCmd string @@ -166,7 +188,7 @@ func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[int]Devic } // Execute nvidia-smi command to get available GPUs - args := []string{"--query-gpu=index,name,uuid", "--format=csv"} + args := []string{"--query-gpu=index,name,uuid,gpu_bus_id", "--format=csv"} nvidiaSmiOutput, err := osexec.Execute(nvidiaSmiCmd, args, nil) if err != nil { @@ -187,22 +209,28 @@ func parseAmdSmioutput(cmdOutput string, logger log.Logger) map[int]Device { } devDetails := strings.Split(line, ",") - if len(devDetails) < 6 { + if len(devDetails) < 7 { continue } // Get device index, name and UUID devIndx := strings.TrimPrefix(devDetails[0], "card") devUUID := strings.TrimSpace(devDetails[1]) - devName := strings.TrimSpace(devDetails[2]) + devBusID := strings.TrimSpace(devDetails[2]) + devName := strings.TrimSpace(devDetails[3]) + + // Parse bus ID + busID, err := parseBusID(devBusID) + if err != nil { + level.Error(logger).Log("msg", "Failed to parse GPU bus ID", "bus_id", devBusID, "err", err) + } // Set isMig to false as it does not apply for AMD GPUs isMig := false - level.Debug(logger). - Log("msg", "Found AMD GPU", "name", devName, "UUID", devUUID) + gpuDevices[devIndxInt] = Device{index: devIndx, name: devName, uuid: devUUID, busID: busID, isMig: isMig} + level.Debug(logger).Log("msg", "Found AMD GPU", "gpu", gpuDevices[devIndxInt]) - gpuDevices[devIndxInt] = Device{index: devIndx, name: devName, uuid: devUUID, isMig: isMig} devIndxInt++ } @@ -211,11 +239,11 @@ func parseAmdSmioutput(cmdOutput string, logger log.Logger) map[int]Device { // GetAMDGPUDevices returns all GPU devices using rocm-smi command // Example output: -// bash-4.4$ rocm-smi --showproductname --showserial --csv +// bash-4.4$ rocm-smi --showproductname --showserial --showbus --csv // device,Serial Number,Card series,Card model,Card vendor,Card SKU -// card0,20170000800c,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 -// card1,20170003580c,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 -// card2,20180003050c,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317. +// card0,20170000800c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 +// card1,20170003580c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 +// card2,20180003050c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317. 
func GetAMDGPUDevices(rocmSmiPath string, logger log.Logger) (map[int]Device, error) { // Check if rocm-smi binary exists var rocmSmiCmd string @@ -235,7 +263,7 @@ func GetAMDGPUDevices(rocmSmiPath string, logger log.Logger) (map[int]Device, er } // Execute nvidia-smi command to get available GPUs - args := []string{"--showproductname", "--showserial", "--csv"} + args := []string{"--showproductname", "--showserial", "--showbus", "--csv"} rocmSmiOutput, err := osexec.Execute(rocmSmiCmd, args, nil) if err != nil { @@ -270,7 +298,18 @@ func cgroupProcs(fs procfs.FS, idRegex *regexp.Regexp, targetEnvVars []string, p } for _, cgrp := range cgrps { - cgroupIDMatches := idRegex.FindStringSubmatch(cgrp.Path) + // If cgroup path is root, skip + if cgrp.Path == "/" { + continue + } + + // Unescape UTF-8 characters in cgroup path + sanitizedPath, err := unescapeString(cgrp.Path) + if err != nil { + continue + } + + cgroupIDMatches := idRegex.FindStringSubmatch(sanitizedPath) if len(cgroupIDMatches) <= 1 { continue } @@ -285,7 +324,7 @@ func cgroupProcs(fs procfs.FS, idRegex *regexp.Regexp, targetEnvVars []string, p continue } - // if targetEnvVars is not empty check if this env vars is present for the process + // If targetEnvVars is not empty check if this env vars is present for the process // We dont check for the value of env var. Presence of env var is enough to // trigger the profiling of that process if len(targetEnvVars) > 0 { @@ -355,18 +394,6 @@ func lookPath(f string) (string, error) { return "", errors.New("path does not exist") } -// // Find named matches in regex groups and return a map. -// func findNamedMatches(regex *regexp.Regexp, str string) map[string]string { -// match := regex.FindStringSubmatch(str) - -// results := map[string]string{} -// for i, name := range match { -// results[regex.SubexpNames()[i]] = name -// } - -// return results -// } - // inode returns the inode of a given path. func inode(path string) (uint64, error) { info, err := os.Stat(path) @@ -381,3 +408,38 @@ func inode(path string) (uint64, error) { return stat.Ino, nil } + +// parseBusID parses PCIe bus ID string to BusID struct. +func parseBusID(id string) (BusID, error) { + // Bus ID is in form of ::. + matches := pciBusIDRegex.FindStringSubmatch(id) + + var values []uint64 + + for i, match := range matches { + if i != 0 { + value, err := strconv.ParseUint(match, 16, 16) + if err != nil { + return BusID{}, err + } + + values = append(values, value) + } + } + + if len(values) == 4 { + return BusID{domain: values[0], bus: values[1], slot: values[2], function: values[3]}, nil + } + + return BusID{}, fmt.Errorf("error parsing PCIe bus ID: %s", id) +} + +// unescapeString sanitizes the string by unescaping UTF-8 characters. 
+func unescapeString(s string) (string, error) { + sanitized, err := strconv.Unquote("\"" + s + "\"") + if err != nil { + return "", err + } + + return sanitized, nil +} diff --git a/pkg/collector/helper_test.go b/pkg/collector/helper_test.go index ad6803a9..fa0b3ef4 100644 --- a/pkg/collector/helper_test.go +++ b/pkg/collector/helper_test.go @@ -14,16 +14,16 @@ import ( ) var ( - expectedNvidiaSmiOutput = `index, name, uuid -0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e -1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 -2, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3 -3, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3` - expectedAmdSmiOutput = `device,Serial Number,Card series,Card model,Card vendor,Card SKU -card0,20170000800c,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 -card1,20170003580c,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 -card2,20180003050c,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 -card3,20170005280c,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317` + expectedNvidiaSmiOutput = `index, name, uuid, bus_id +0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e, 00000000:07:00.0 +1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3, 00000000:0B:00.0 +2, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3, 00000000:48:00.0 +3, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3, 00000000:4C:00.0` + expectedAmdSmiOutput = `device,Serial Number,PCI Bus,Card series,Card model,Card vendor,Card SKU +card0,20170000800c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 +card1,20170003580c,0000:C8:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 +card2,20180003050c,0000:8A:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. [AMD/ATI],D16317 +card3,20170005280c,0000:8D:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. 
[AMD/ATI],D16317` ) func getExpectedNvidiaDevs() map[int]Device { @@ -32,24 +32,28 @@ func getExpectedNvidiaDevs() map[int]Device { index: "0", name: "Tesla V100-SXM2-32GB", uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", + busID: BusID{domain: 0x0, bus: 0x7, slot: 0x0, function: 0x0}, isMig: false, } nvidiaDevs[1] = Device{ index: "1", name: "Tesla V100-SXM2-32GB", uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", + busID: BusID{domain: 0x0, bus: 0xb, slot: 0x0, function: 0x0}, isMig: false, } nvidiaDevs[2] = Device{ index: "2", name: "Tesla V100-SXM2-32GB", uuid: "GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3", + busID: BusID{domain: 0x0, bus: 0x48, slot: 0x0, function: 0x0}, isMig: false, } nvidiaDevs[3] = Device{ index: "3", name: "Tesla V100-SXM2-32GB", uuid: "GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3", + busID: BusID{domain: 0x0, bus: 0x4c, slot: 0x0, function: 0x0}, isMig: false, } @@ -62,24 +66,28 @@ func getExpectedAmdDevs() map[int]Device { index: "0", name: "deon Instinct MI50 32GB", uuid: "20170000800c", + busID: BusID{domain: 0x0, bus: 0xc5, slot: 0x0, function: 0x0}, isMig: false, } amdDevs[1] = Device{ index: "1", name: "deon Instinct MI50 32GB", uuid: "20170003580c", + busID: BusID{domain: 0x0, bus: 0xc8, slot: 0x0, function: 0x0}, isMig: false, } amdDevs[2] = Device{ index: "2", name: "deon Instinct MI50 32GB", uuid: "20180003050c", + busID: BusID{domain: 0x0, bus: 0x8a, slot: 0x0, function: 0x0}, isMig: false, } amdDevs[3] = Device{ index: "3", name: "deon Instinct MI50 32GB", uuid: "20170005280c", + busID: BusID{domain: 0x0, bus: 0x8d, slot: 0x0, function: 0x0}, isMig: false, } @@ -109,3 +117,47 @@ echo """%s""" require.NoError(t, err) assert.Equal(t, gpuDevices, getExpectedAmdDevs()) } + +func TestParseBusIDPass(t *testing.T) { + id := "00000000:AD:00.0" + busID, err := parseBusID(id) + require.NoError(t, err) + + expectedID := BusID{domain: 0x0, bus: 0xad, slot: 0x0, function: 0x0} + + assert.Equal(t, expectedID, busID) +} + +func TestParseBusIDFail(t *testing.T) { + // Missing component + id := "00000000:AD:00" + _, err := parseBusID(id) + require.Error(t, err) + + // Malformed ID + id = "00000000:AD:00:4" + _, err = parseBusID(id) + require.Error(t, err) + + // Not Hex + id = "ggggggg:AD:00:0" + _, err = parseBusID(id) + require.Error(t, err) +} + +func TestCompareBusIDs(t *testing.T) { + // Sample Device + d := Device{busID: BusID{domain: 0x0, bus: 0xad, slot: 0x0, function: 0x0}} + + // Test ID - pass + id := "00000000:AD:00.0" + assert.True(t, d.CompareBusID(id)) + + // Test ID - fail + id = "00000000:AD:0A.0" + assert.False(t, d.CompareBusID(id)) + + // Test ID - error fail + id = "00000000:AD:00" + assert.False(t, d.CompareBusID(id)) +} diff --git a/pkg/collector/libvirt.go b/pkg/collector/libvirt.go new file mode 100644 index 00000000..19a0d476 --- /dev/null +++ b/pkg/collector/libvirt.go @@ -0,0 +1,563 @@ +//go:build !nolibvirt +// +build !nolibvirt + +package collector + +import ( + "context" + "encoding/xml" + "fmt" + "io/fs" + "os" + "path/filepath" + "slices" + "strconv" + "strings" + "sync" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/mahendrapaipuri/ceems/internal/security" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + libvirtCollectorSubsystem = "libvirt" +) + +// CLI opts. +var ( + // cgroup opts. 
+ libvirtCollectSwapMemoryStats = CEEMSExporterApp.Flag( + "collector.libvirt.swap-memory-metrics", + "Enables collection of swap memory metrics (default: disabled)", + ).Default("false").Bool() + libvirtCollectBlkIOStats = CEEMSExporterApp.Flag( + "collector.libvirt.blkio-metrics", + "Enables collection of block IO metrics (default: disabled)", + ).Default("false").Bool() + libvirtCollectPSIStats = CEEMSExporterApp.Flag( + "collector.libvirt.psi-metrics", + "Enables collection of PSI metrics (default: disabled)", + ).Default("false").Bool() + + // testing flags. + libvirtXMLDir = CEEMSExporterApp.Flag( + "collector.libvirt.xml-dir", + "Directory containing XML files of instances", + ).Default("/etc/libvirt/qemu").Hidden().String() +) + +// Security context names. +const ( + libvirtReadXMLCtx = "libvirt_read_xml" +) + +// Domain is the top level XML field for libvirt XML schema. +type Domain struct { + Devices Devices `xml:"devices"` + Name string `xml:"name"` + UUID string `xml:"uuid"` +} + +type Devices struct { + HostDevs []HostDev `xml:"hostdev"` +} + +type HostDev struct { + XMLName xml.Name `xml:"hostdev"` + Mode string `xml:"mode,attr"` + Type string `xml:"type,attr"` + Managed string `xml:"managed,attr"` + Model string `xml:"model,attr"` + Display string `xml:"display,attr"` + Source Source `xml:"source"` + Address Address `xml:"address"` +} + +type Source struct { + XMLName xml.Name `xml:"source"` + Address Address `xml:"address"` +} + +type Address struct { + XMLName xml.Name `xml:"address"` + UUID string `xml:"uuid,attr"` + Type string `xml:"type,attr"` + Domain string `xml:"domain,attr"` + Bus string `xml:"bus,attr"` + Slot string `xml:"slot,attr"` + Function string `xml:"function,attr"` +} + +// libvirtReadXMLSecurityCtxData contains the input/output data for +// reading XML files inside a security context. +type libvirtReadXMLSecurityCtxData struct { + xmlPath string + instanceID string + devices map[int]Device + instanceProps instanceProps +} + +// instanceProps contains VM properties. +type instanceProps struct { + uuid string // This is Openstack's specific UUID + gpuOrdinals []string // GPU ordinals bound to instance +} + +type libvirtMetrics struct { + cgMetrics []cgMetric + instanceProps []instanceProps + instanceIDUUIDMap map[string]string +} + +type libvirtCollector struct { + logger log.Logger + cgroupManager *cgroupManager + cgroupCollector *cgroupCollector + perfCollector *perfCollector + ebpfCollector *ebpfCollector + rdmaCollector *rdmaCollector + hostname string + gpuDevs map[int]Device + instanceGpuFlag *prometheus.Desc + collectError *prometheus.Desc + instancePropsCache map[string]instanceProps + securityContexts map[string]*security.SecurityContext +} + +func init() { + RegisterCollector(libvirtCollectorSubsystem, defaultDisabled, NewLibvirtCollector) +} + +// NewLibvirtCollector returns a new libvirt collector exposing a summary of cgroups. 
+func NewLibvirtCollector(logger log.Logger) (Collector, error) { + // Get SLURM's cgroup details + cgroupManager, err := NewCgroupManager("libvirt") + if err != nil { + level.Info(logger).Log("msg", "Failed to create cgroup manager", "err", err) + + return nil, err + } + + level.Info(logger).Log("cgroup", cgroupManager) + + // Set cgroup options + opts := cgroupOpts{ + collectSwapMemStats: *libvirtCollectSwapMemoryStats, + collectBlockIOStats: *libvirtCollectBlkIOStats, + collectPSIStats: *libvirtCollectPSIStats, + } + + // Start new instance of cgroupCollector + cgCollector, err := NewCgroupCollector(log.With(logger, "sub_collector", "cgroup"), cgroupManager, opts) + if err != nil { + level.Info(logger).Log("msg", "Failed to create cgroup collector", "err", err) + + return nil, err + } + + // Start new instance of perfCollector + var perfCollector *perfCollector + + if perfCollectorEnabled() { + perfCollector, err = NewPerfCollector(log.With(logger, "sub_collector", "perf"), cgroupManager) + if err != nil { + level.Info(logger).Log("msg", "Failed to create perf collector", "err", err) + + return nil, err + } + } + + // Start new instance of ebpfCollector + var ebpfCollector *ebpfCollector + + if ebpfCollectorEnabled() { + ebpfCollector, err = NewEbpfCollector(log.With(logger, "sub_collector", "ebpf"), cgroupManager) + if err != nil { + level.Info(logger).Log("msg", "Failed to create ebpf collector", "err", err) + + return nil, err + } + } + + // Start new instance of rdmaCollector + var rdmaCollector *rdmaCollector + + if rdmaCollectorEnabled() { + rdmaCollector, err = NewRDMACollector(log.With(logger, "sub_collector", "rdma"), cgroupManager) + if err != nil { + level.Info(logger).Log("msg", "Failed to create RDMA collector", "err", err) + + return nil, err + } + } + + // Attempt to get GPU devices + var gpuTypes []string + + var gpuDevs map[int]Device + + if *gpuType != "" { + gpuTypes = []string{*gpuType} + } else { + gpuTypes = []string{"nvidia", "amd"} + } + + for _, gpuType := range gpuTypes { + gpuDevs, err = GetGPUDevices(gpuType, logger) + if err == nil { + level.Info(logger).Log("gpu", gpuType) + + break + } + } + + // Setup necessary capabilities. These are the caps we need to read + // XML files in /etc/libvirt/qemu folder that contains GPU devs used by guests. 
+ caps := setupCollectorCaps(logger, libvirtCollectorSubsystem, []string{"cap_dac_read_search"}) + + // Setup new security context(s) + securityCtx, err := security.NewSecurityContext(libvirtReadXMLCtx, caps, readLibvirtXMLFile, logger) + if err != nil { + level.Error(logger).Log("msg", "Failed to create a security context", "err", err) + + return nil, err + } + + return &libvirtCollector{ + cgroupManager: cgroupManager, + cgroupCollector: cgCollector, + perfCollector: perfCollector, + ebpfCollector: ebpfCollector, + rdmaCollector: rdmaCollector, + hostname: hostname, + gpuDevs: gpuDevs, + instancePropsCache: make(map[string]instanceProps), + securityContexts: map[string]*security.SecurityContext{libvirtReadXMLCtx: securityCtx}, + instanceGpuFlag: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_gpu_index_flag"), + "Indicates running instance on GPU, 1=instance running", + []string{ + "manager", + "hostname", + "uuid", + "index", + "hindex", + "gpuuuid", + }, + nil, + ), + collectError: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, genericSubsystem, "collect_error"), + "Indicates collection error, 0=no error, 1=error", + []string{"manager", "hostname", "uuid"}, + nil, + ), + logger: logger, + }, nil +} + +// Update implements Collector and update instance metrics. +func (c *libvirtCollector) Update(ch chan<- prometheus.Metric) error { + // Discover all active cgroups + metrics, err := c.discoverCgroups() + if err != nil { + return fmt.Errorf("%w: %w", ErrNoData, err) + } + + // Start a wait group + wg := sync.WaitGroup{} + wg.Add(1) + + go func() { + defer wg.Done() + + // Update cgroup metrics + if err := c.cgroupCollector.Update(ch, metrics.cgMetrics); err != nil { + level.Error(c.logger).Log("msg", "Failed to update cgroup stats", "err", err) + } + + // Update instance GPU ordinals + if len(c.gpuDevs) > 0 { + c.updateGPUOrdinals(ch, metrics.instanceProps) + } + }() + + if perfCollectorEnabled() { + wg.Add(1) + + go func() { + defer wg.Done() + + // Update perf metrics + if err := c.perfCollector.Update(ch, metrics.instanceIDUUIDMap); err != nil { + level.Error(c.logger).Log("msg", "Failed to update perf stats", "err", err) + } + }() + } + + if ebpfCollectorEnabled() { + wg.Add(1) + + go func() { + defer wg.Done() + + // Update ebpf metrics + if err := c.ebpfCollector.Update(ch, metrics.instanceIDUUIDMap); err != nil { + level.Error(c.logger).Log("msg", "Failed to update IO and/or network stats", "err", err) + } + }() + } + + if rdmaCollectorEnabled() { + wg.Add(1) + + go func() { + defer wg.Done() + + // Update RDMA metrics + if err := c.rdmaCollector.Update(ch, metrics.instanceIDUUIDMap); err != nil { + level.Error(c.logger).Log("msg", "Failed to update RDMA stats", "err", err) + } + }() + } + + // Wait for all go routines + wg.Wait() + + return nil +} + +// Stop releases system resources used by the collector. 
+func (c *libvirtCollector) Stop(ctx context.Context) error { + level.Debug(c.logger).Log("msg", "Stopping", "collector", libvirtCollectorSubsystem) + + // Stop all sub collectors + // Stop cgroupCollector + if err := c.cgroupCollector.Stop(ctx); err != nil { + level.Error(c.logger).Log("msg", "Failed to stop cgroup collector", "err", err) + } + + // Stop perfCollector + if perfCollectorEnabled() { + if err := c.perfCollector.Stop(ctx); err != nil { + level.Error(c.logger).Log("msg", "Failed to stop perf collector", "err", err) + } + } + + // Stop ebpfCollector + if ebpfCollectorEnabled() { + if err := c.ebpfCollector.Stop(ctx); err != nil { + level.Error(c.logger).Log("msg", "Failed to stop ebpf collector", "err", err) + } + } + + // Stop rdmaCollector + if rdmaCollectorEnabled() { + if err := c.rdmaCollector.Stop(ctx); err != nil { + level.Error(c.logger).Log("msg", "Failed to stop RDMA collector", "err", err) + } + } + + return nil +} + +// updateGPUOrdinals updates the metrics channel with GPU ordinals for instance. +func (c *libvirtCollector) updateGPUOrdinals(ch chan<- prometheus.Metric, instanceProps []instanceProps) { + // Update instance properties + for _, p := range instanceProps { + // GPU instance mapping + for _, gpuOrdinal := range p.gpuOrdinals { + var gpuuuid string + // Check the int index of devices where gpuOrdinal == dev.index + for _, dev := range c.gpuDevs { + if gpuOrdinal == dev.index { + gpuuuid = dev.uuid + + break + } + } + ch <- prometheus.MustNewConstMetric(c.instanceGpuFlag, prometheus.GaugeValue, float64(1), c.cgroupManager.manager, c.hostname, p.uuid, gpuOrdinal, fmt.Sprintf("%s-gpu-%s", c.hostname, gpuOrdinal), gpuuuid) + } + } +} + +// discoverCgroups finds active cgroup paths and returns initialised metric structs. 
+func (c *libvirtCollector) discoverCgroups() (libvirtMetrics, error) { + // Get currently active instances and set them in activeInstanceIDs state variable + var activeInstanceIDs []string + + var instnProps []instanceProps + + var cgMetrics []cgMetric + + instanceIDUUIDMap := make(map[string]string) + + // Walk through all cgroups and get cgroup paths + // https://goplay.tools/snippet/coVDkIozuhg + if err := filepath.WalkDir(c.cgroupManager.mountPoint, func(p string, info fs.DirEntry, err error) error { + if err != nil { + return err + } + + // Ignore inner cgroups of instances + if !info.IsDir() || c.cgroupManager.pathFilter(p) { + return nil + } + + // Get relative path of cgroup + rel, err := filepath.Rel(c.cgroupManager.root, p) + if err != nil { + level.Error(c.logger).Log("msg", "Failed to resolve relative path for cgroup", "path", p, "err", err) + + return nil + } + + // Unescape UTF-8 characters in cgroup path + sanitizedPath, err := unescapeString(p) + if err != nil { + level.Error(c.logger).Log("msg", "Failed to sanitize cgroup path", "path", p, "err", err) + + return nil + } + + // Get cgroup ID which is instance ID + cgroupIDMatches := c.cgroupManager.idRegex.FindStringSubmatch(sanitizedPath) + if len(cgroupIDMatches) <= 1 { + return nil + } + + instanceID := strings.TrimSpace(cgroupIDMatches[1]) + if instanceID == "" { + level.Error(c.logger).Log("msg", "Empty instance ID", "path", p) + + return nil + } + + // Check if we already passed through this instance + if slices.Contains(activeInstanceIDs, instanceID) { + return nil + } + + // Get instance details + if iProps, ok := c.instancePropsCache[instanceID]; !ok { + c.instancePropsCache[instanceID] = c.instanceProperties(instanceID) + instnProps = append(instnProps, c.instancePropsCache[instanceID]) + instanceIDUUIDMap[instanceID] = c.instancePropsCache[instanceID].uuid + } else { + instnProps = append(instnProps, iProps) + instanceIDUUIDMap[instanceID] = iProps.uuid + } + + activeInstanceIDs = append(activeInstanceIDs, instanceID) + cgMetrics = append(cgMetrics, cgMetric{uuid: instanceIDUUIDMap[instanceID], path: "/" + rel}) + + level.Debug(c.logger).Log("msg", "cgroup path", "path", p) + + return nil + }); err != nil { + level.Error(c.logger). + Log("msg", "Error walking cgroup subsystem", "path", c.cgroupManager.mountPoint, "err", err) + + return libvirtMetrics{}, err + } + + // Remove terminated instances from instancePropsCache + for uuid := range c.instancePropsCache { + if !slices.Contains(activeInstanceIDs, uuid) { + delete(c.instancePropsCache, uuid) + } + } + + return libvirtMetrics{cgMetrics: cgMetrics, instanceProps: instnProps, instanceIDUUIDMap: instanceIDUUIDMap}, nil +} + +// instanceProperties returns instance properties parsed from XML file. 
+func (c *libvirtCollector) instanceProperties(instanceID string) instanceProps { + // Read XML file in a security context that raises necessary capabilities + dataPtr := &libvirtReadXMLSecurityCtxData{ + xmlPath: *libvirtXMLDir, + devices: c.gpuDevs, + instanceID: instanceID, + } + + if securityCtx, ok := c.securityContexts[libvirtReadXMLCtx]; ok { + if err := securityCtx.Exec(dataPtr); err != nil { + level.Error(c.logger).Log( + "msg", "Failed to run inside security contxt", "instance_id", instanceID, "err", err, + ) + + return instanceProps{} + } + } else { + level.Error(c.logger).Log( + "msg", "Security context not found", "name", libvirtReadXMLCtx, "instance_id", instanceID, + ) + + return instanceProps{} + } + + return dataPtr.instanceProps +} + +// readLibvirtXMLFile reads the libvirt's XML file inside a security context. +func readLibvirtXMLFile(data interface{}) error { + // Assert data + var d *libvirtReadXMLSecurityCtxData + + var ok bool + if d, ok = data.(*libvirtReadXMLSecurityCtxData); !ok { + return security.ErrSecurityCtxDataAssertion + } + + // Get full file path + xmlFilePath := filepath.Join(d.xmlPath, d.instanceID+".xml") + + // If file does not exist return error + if _, err := os.Stat(xmlFilePath); err != nil { + return err + } + + // Read XML file contents + xmlByteArray, err := os.ReadFile(xmlFilePath) + if err != nil { + return err + } + + // Read XML byte array into domain + var domain Domain + if err := xml.Unmarshal(xmlByteArray, &domain); err != nil { + return err + } + + // Loop over hostdevs to get GPU IDs + var gpuOrdinals []string + + for _, hostDev := range domain.Devices.HostDevs { + // PCIe pass through + if hostDev.Type == "pci" { + gpuBusID := fmt.Sprintf( + "%s:%s:%s.%s", + strings.TrimPrefix(hostDev.Address.Domain, "0x"), + strings.TrimPrefix(hostDev.Address.Bus, "0x"), + strings.TrimPrefix(hostDev.Address.Slot, "0x"), + strings.TrimPrefix(hostDev.Address.Function, "0x"), + ) + + // Check if the current Bus ID matches with any existing GPUs + for idx, dev := range d.devices { + if dev.CompareBusID(gpuBusID) { + gpuOrdinals = append(gpuOrdinals, strconv.FormatInt(int64(idx), 10)) + + break + } + } + } + } + + // Read instance properties into dataPointer + d.instanceProps = instanceProps{uuid: domain.UUID, gpuOrdinals: gpuOrdinals} + + return nil +} diff --git a/pkg/collector/libvirt_test.go b/pkg/collector/libvirt_test.go new file mode 100644 index 00000000..73ea5e16 --- /dev/null +++ b/pkg/collector/libvirt_test.go @@ -0,0 +1,231 @@ +//go:build !nolibvirt +// +build !nolibvirt + +package collector + +import ( + "context" + "fmt" + "os" + "strconv" + "strings" + "testing" + + "github.com/containerd/cgroups/v3" + "github.com/go-kit/log" + "github.com/mahendrapaipuri/ceems/internal/security" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewLibvirtCollector(t *testing.T) { + _, err := CEEMSExporterApp.Parse( + []string{ + "--path.cgroupfs", "testdata/sys/fs/cgroup", + "--path.procfs", "testdata/proc", + "--path.sysfs", "testdata/sys", + "--collector.libvirt.swap-memory-metrics", + "--collector.libvirt.psi-metrics", + "--collector.perf.hardware-events", + "--collector.rdma.stats", + "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", + "--collector.cgroups.force-version", "v2", + }, + ) + require.NoError(t, err) + + collector, err := NewLibvirtCollector(log.NewNopLogger()) + require.NoError(t, err) + + // Setup background goroutine to capture metrics. 
+ metrics := make(chan prometheus.Metric) + defer close(metrics) + + go func() { + i := 0 + for range metrics { + i++ + } + }() + + err = collector.Update(metrics) + require.NoError(t, err) + + err = collector.Stop(context.Background()) + require.NoError(t, err) +} + +func TestLibvirtInstanceProps(t *testing.T) { + _, err := CEEMSExporterApp.Parse( + []string{ + "--path.cgroupfs", "testdata/sys/fs/cgroup", + "--collector.libvirt.xml-dir", "testdata/qemu", + "--collector.cgroups.force-version", "v2", + }, + ) + require.NoError(t, err) + + // cgroup Manager + cgManager := &cgroupManager{ + mode: cgroups.Unified, + mountPoint: "testdata/sys/fs/cgroup/machine.slice", + idRegex: libvirtCgroupPathRegex, + pathFilter: func(p string) bool { + return strings.Contains(p, "/libvirt") + }, + } + + c := libvirtCollector{ + gpuDevs: mockGPUDevices(), + logger: log.NewNopLogger(), + cgroupManager: cgManager, + instancePropsCache: make(map[string]instanceProps), + securityContexts: make(map[string]*security.SecurityContext), + } + + // Add dummy security context + c.securityContexts[libvirtReadXMLCtx], err = security.NewSecurityContext( + libvirtReadXMLCtx, + nil, + readLibvirtXMLFile, + c.logger, + ) + require.NoError(t, err) + + expectedProps := instanceProps{ + gpuOrdinals: []string{"0", "1"}, + uuid: "57f2d45e-8ddf-4338-91df-62d0044ff1b5", + } + + metrics, err := c.discoverCgroups() + require.NoError(t, err) + + var gotProps instanceProps + + for _, props := range metrics.instanceProps { + if props.uuid == expectedProps.uuid { + gotProps = props + } + } + + assert.Equal(t, expectedProps, gotProps) +} + +func TestInstancePropsCaching(t *testing.T) { + path := t.TempDir() + + cgroupsPath := path + "/cgroups" + err := os.Mkdir(cgroupsPath, 0o750) + require.NoError(t, err) + + xmlFilePath := path + "/qemu" + err = os.Mkdir(xmlFilePath, 0o750) + require.NoError(t, err) + + _, err = CEEMSExporterApp.Parse( + []string{ + "--path.cgroupfs", cgroupsPath, + "--collector.libvirt.xml-dir", xmlFilePath, + }, + ) + require.NoError(t, err) + + // cgroup Manager + cgManager := &cgroupManager{ + mode: cgroups.Unified, + root: cgroupsPath, + mountPoint: cgroupsPath + "/cpuacct/machine.slice", + idRegex: libvirtCgroupPathRegex, + pathFilter: func(p string) bool { + return strings.Contains(p, "/libvirt") + }, + } + + mockGPUDevs := mockGPUDevices() + c := libvirtCollector{ + cgroupManager: cgManager, + logger: log.NewNopLogger(), + gpuDevs: mockGPUDevs, + instancePropsCache: make(map[string]instanceProps), + securityContexts: make(map[string]*security.SecurityContext), + } + + // Add dummy security context + c.securityContexts[libvirtReadXMLCtx], err = security.NewSecurityContext( + libvirtReadXMLCtx, + nil, + readLibvirtXMLFile, + c.logger, + ) + require.NoError(t, err) + + // Add cgroups + for i := range 20 { + dir := fmt.Sprintf("%s/cpuacct/machine.slice/machine-qemu\x2d1\x2dinstance\x2d0000000%d.scope", cgroupsPath, i) + + err = os.MkdirAll(dir, 0o750) + require.NoError(t, err) + } + + // Binds GPUs to first n jobs + for igpu := range mockGPUDevs { + xmlContentPH := ` +instance-%[1]d +%[1]d + + + +
+ +
+
+
+`
+		xmlContent := fmt.Sprintf(xmlContentPH, igpu, strconv.FormatUint(mockGPUDevs[igpu].busID.bus, 16))
+		err = os.WriteFile(
+			fmt.Sprintf("%s/instance-0000000%d.xml", xmlFilePath, igpu),
+			[]byte(xmlContent),
+			0o600,
+		)
+		require.NoError(t, err)
+	}
+
+	// Now call discoverCgroups, which should populate instancePropsCache
+	_, err = c.discoverCgroups()
+	require.NoError(t, err)
+
+	// Check if instancePropsCache has 20 instances and GPU ordinals are correct
+	assert.Len(t, c.instancePropsCache, 20)
+
+	for igpu := range mockGPUDevs {
+		gpuIDString := strconv.FormatInt(int64(igpu), 10)
+		assert.Equal(t, []string{gpuIDString}, c.instancePropsCache["instance-0000000"+gpuIDString].gpuOrdinals)
+	}
+
+	// Remove the first 10 instances and add instances 19-24 (only 5 of these are new)
+	for i := range 10 {
+		dir := fmt.Sprintf("%s/cpuacct/machine.slice/machine-qemu\x2d1\x2dinstance\x2d0000000%d.scope", cgroupsPath, i)
+
+		err = os.RemoveAll(dir)
+		require.NoError(t, err)
+	}
+
+	for i := 19; i < 25; i++ {
+		dir := fmt.Sprintf("%s/cpuacct/machine.slice/machine-qemu\x2d1\x2dinstance\x2d0000000%d.scope", cgroupsPath, i)
+
+		err = os.MkdirAll(dir, 0o750)
+		require.NoError(t, err)
+	}
+
+	// Call discoverCgroups again, which should refresh instancePropsCache
+	_, err = c.discoverCgroups()
+	require.NoError(t, err)
+
+	// Check if instancePropsCache has only 15 instances and GPU ordinals are empty
+	assert.Len(t, c.instancePropsCache, 15)
+
+	for _, p := range c.instancePropsCache {
+		assert.Empty(t, p.gpuOrdinals)
+	}
+}
diff --git a/pkg/collector/perf.go b/pkg/collector/perf.go
index b5ab4164..3bac0b38 100644
--- a/pkg/collector/perf.go
+++ b/pkg/collector/perf.go
@@ -537,7 +537,9 @@ func NewPerfCollector(logger log.Logger, cgManager *cgroupManager) (*perfCollect
 }
 
 // Update implements the Collector interface and will collect metrics per compute unit.
-func (c *perfCollector) Update(ch chan<- prometheus.Metric) error {
+// cgroupIDUUIDMap maps cgroup IDs to compute unit UUIDs. If the map is nil, the cgroup ID
+// itself is used as the compute unit UUID.
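+// For instance, the SLURM collector passes nil here because its cgroup IDs are already the
+// job IDs it reports, while a resource manager that identifies compute units differently can
+// supply an explicit mapping.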
+func (c *perfCollector) Update(ch chan<- prometheus.Metric, cgroupIDUUIDMap map[string]string) error {
 	// Discover new processes
 	cgroups, err := c.discoverProcess()
 	if err != nil {
@@ -564,21 +566,28 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error {
 
 	// Update metrics in go routines for each cgroup
 	for cgroupID, procs := range cgroups {
-		go func(cid string, ps []procfs.Proc) {
+		var uuid string
+		if cgroupIDUUIDMap != nil {
+			uuid = cgroupIDUUIDMap[cgroupID]
+		} else {
+			uuid = cgroupID
+		}
+
+		go func(u string, ps []procfs.Proc) {
 			defer wg.Done()
 
-			if err := c.updateHardwareCounters(cid, ps, ch); err != nil {
-				level.Error(c.logger).Log("msg", "failed to update hardware counters", "cgroup", cgroupID, "err", err)
+			if err := c.updateHardwareCounters(u, ps, ch); err != nil {
+				level.Error(c.logger).Log("msg", "failed to update hardware counters", "uuid", u, "err", err)
 			}
 
-			if err := c.updateSoftwareCounters(cid, ps, ch); err != nil {
-				level.Error(c.logger).Log("msg", "failed to update software counters", "cgroup", cgroupID, "err", err)
+			if err := c.updateSoftwareCounters(u, ps, ch); err != nil {
+				level.Error(c.logger).Log("msg", "failed to update software counters", "uuid", u, "err", err)
 			}
 
-			if err := c.updateCacheCounters(cid, ps, ch); err != nil {
-				level.Error(c.logger).Log("msg", "failed to update cache counters", "cgroup", cgroupID, "err", err)
+			if err := c.updateCacheCounters(u, ps, ch); err != nil {
+				level.Error(c.logger).Log("msg", "failed to update cache counters", "uuid", u, "err", err)
 			}
-		}(cgroupID, procs)
+		}(uuid, procs)
 	}
 
 	// Wait all go routines
@@ -843,7 +852,11 @@ func (c *perfCollector) discoverProcess() (map[string][]procfs.Proc, error) {
 		}
 	}
 
-	level.Debug(c.logger).Log("msg", "Discovered cgroups for profiling")
+	if len(dataPtr.cgroups) > 0 {
+		level.Debug(c.logger).Log("msg", "Discovered cgroups for profiling")
+	} else {
+		level.Debug(c.logger).Log("msg", "No cgroups found for profiling")
+	}
 
 	return dataPtr.cgroups, nil
 }
diff --git a/pkg/collector/perf_test.go b/pkg/collector/perf_test.go
index cd804197..3d87d541 100644
--- a/pkg/collector/perf_test.go
+++ b/pkg/collector/perf_test.go
@@ -50,7 +50,7 @@ func TestPerfCollector(t *testing.T) {
 		}
 	}()
 
-	err = collector.Update(metrics)
+	err = collector.Update(metrics, nil)
 	require.NoError(t, err)
 
 	err = collector.Stop(context.Background())
diff --git a/pkg/collector/rdma.go b/pkg/collector/rdma.go
index 52c51878..fc13ef7e 100644
--- a/pkg/collector/rdma.go
+++ b/pkg/collector/rdma.go
@@ -228,7 +228,9 @@ func NewRDMACollector(logger log.Logger, cgManager *cgroupManager) (*rdmaCollect
 }
 
 // Update implements Collector and exposes RDMA related metrics.
-func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error {
+// cgroupIDUUIDMap maps cgroup IDs to compute unit UUIDs. If the map is nil, the cgroup ID
+// itself is used as the compute unit UUID.
+func (c *rdmaCollector) Update(ch chan<- prometheus.Metric, cgroupIDUUIDMap map[string]string) error {
 	if !c.isAvailable {
 		return ErrNoData
 	}
@@ -238,7 +240,7 @@ func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error {
 		level.Error(c.logger).Log("msg", "Failed to enable Per-PID QP stats", "err", err)
 	}
 
-	return c.update(ch)
+	return c.update(ch, cgroupIDUUIDMap)
 }
 
 // Stop releases system resources used by the collector.
@@ -251,7 +253,7 @@ func (c *rdmaCollector) Stop(_ context.Context) error {
 
 // perPIDCounters enables/disables per PID counters for supported devices.
func (c *rdmaCollector) perPIDCounters(enable bool) error { // If there no supported devices, return - if c.qpModes == nil { + if len(c.qpModes) == 0 { return nil } @@ -298,9 +300,9 @@ func (c *rdmaCollector) perPIDCounters(enable bool) error { } // update fetches different RDMA stats. -func (c *rdmaCollector) update(ch chan<- prometheus.Metric) error { +func (c *rdmaCollector) update(ch chan<- prometheus.Metric, cgroupIDUUIDMap map[string]string) error { // First get cgroups and their associated procs - procCgroup, err := c.procCgroups() + procCgroup, err := c.procCgroups(cgroupIDUUIDMap) if err != nil { level.Error(c.logger).Log("msg", "Failed to fetch active cgroups", "err", err) @@ -412,7 +414,7 @@ func (c *rdmaCollector) update(ch chan<- prometheus.Metric) error { } // procCgroups returns cgroup ID of all relevant processes. -func (c *rdmaCollector) procCgroups() (map[string]string, error) { +func (c *rdmaCollector) procCgroups(cgroupIDUUIDMap map[string]string) (map[string]string, error) { // First get cgroups and their associated procs cgroups, err := cgroupProcs(c.procfs, c.cgroupManager.idRegex, nil, c.cgroupManager.procFilter) if err != nil { @@ -425,9 +427,16 @@ func (c *rdmaCollector) procCgroups() (map[string]string, error) { procCgroup := make(map[string]string) for cgroupID, procs := range cgroups { + var uuid string + if cgroupIDUUIDMap != nil { + uuid = cgroupIDUUIDMap[cgroupID] + } else { + uuid = cgroupID + } + for _, proc := range procs { p := strconv.FormatInt(int64(proc.PID), 10) - procCgroup[p] = cgroupID + procCgroup[p] = uuid } } diff --git a/pkg/collector/rdma_test.go b/pkg/collector/rdma_test.go index b3d1ef3d..58bc165d 100644 --- a/pkg/collector/rdma_test.go +++ b/pkg/collector/rdma_test.go @@ -35,7 +35,7 @@ func TestRDMACollector(t *testing.T) { }, } - collector, err := NewPerfCollector(log.NewNopLogger(), cgManager) + collector, err := NewRDMACollector(log.NewNopLogger(), cgManager) require.NoError(t, err) // Setup background goroutine to capture metrics. @@ -49,7 +49,7 @@ func TestRDMACollector(t *testing.T) { } }() - err = collector.Update(metrics) + err = collector.Update(metrics, nil) require.NoError(t, err) err = collector.Stop(context.Background()) @@ -83,7 +83,7 @@ func TestDevMR(t *testing.T) { } // Get cgroup IDs - procCgroup, err := c.procCgroups() + procCgroup, err := c.procCgroups(nil) require.NoError(t, err) expectedMRs := map[string]*mr{ @@ -124,7 +124,7 @@ func TestDevCQ(t *testing.T) { } // Get cgroup IDs - procCgroup, err := c.procCgroups() + procCgroup, err := c.procCgroups(nil) require.NoError(t, err) expectedCQs := map[string]*cq{ @@ -167,7 +167,7 @@ func TestLinkQP(t *testing.T) { } // Get cgroup IDs - procCgroup, err := c.procCgroups() + procCgroup, err := c.procCgroups(nil) require.NoError(t, err) expected := map[string]*qp{ diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index 427c4e07..e3638748 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -50,20 +50,6 @@ var ( "collector.slurm.gpu-job-map-path", "Path to file that maps GPU ordinals to job IDs.", ).Default("/run/gpujobmap").String() - - // Used for e2e tests. - gpuType = CEEMSExporterApp.Flag( - "collector.slurm.gpu-type", - "GPU device type. Currently only nvidia and amd devices are supported.", - ).Hidden().Enum("nvidia", "amd") - nvidiaSmiPath = CEEMSExporterApp.Flag( - "collector.slurm.nvidia-smi-path", - "Absolute path to nvidia-smi binary. 
Use only for testing.", - ).Hidden().Default("").String() - rocmSmiPath = CEEMSExporterApp.Flag( - "collector.slurm.rocm-smi-path", - "Absolute path to rocm-smi binary. Use only for testing.", - ).Hidden().Default("").String() ) // Security context names. @@ -79,20 +65,20 @@ type slurmReadProcSecurityCtxData struct { gpuOrdinals []string } -// props contains SLURM job properties. -type props struct { +// jobProps contains SLURM job properties. +type jobProps struct { uuid string // This is SLURM's job ID gpuOrdinals []string // GPU ordinals bound to job } // emptyGPUOrdinals returns true if gpuOrdinals is empty. -func (p *props) emptyGPUOrdinals() bool { +func (p *jobProps) emptyGPUOrdinals() bool { return len(p.gpuOrdinals) == 0 } type slurmMetrics struct { cgMetrics []cgMetric - jobProps []props + jobProps []jobProps } type slurmCollector struct { @@ -107,7 +93,7 @@ type slurmCollector struct { procFS procfs.FS jobGpuFlag *prometheus.Desc collectError *prometheus.Desc - jobPropsCache map[string]props + jobPropsCache map[string]jobProps securityContexts map[string]*security.SecurityContext } @@ -142,10 +128,11 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { opts := cgroupOpts{ collectSwapMemStats: *slurmCollectSwapMemoryStatsDepre || *slurmCollectSwapMemoryStats, collectPSIStats: *slurmCollectPSIStatsDepre || *slurmCollectPSIStats, + collectBlockIOStats: false, // SLURM does not support blkio controller. } // Start new instance of cgroupCollector - cgCollector, err := NewCgroupCollector(logger, cgroupManager, opts) + cgCollector, err := NewCgroupCollector(log.With(logger, "sub_collector", "cgroup"), cgroupManager, opts) if err != nil { level.Info(logger).Log("msg", "Failed to create cgroup collector", "err", err) @@ -156,7 +143,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { var perfCollector *perfCollector if perfCollectorEnabled() { - perfCollector, err = NewPerfCollector(logger, cgroupManager) + perfCollector, err = NewPerfCollector(log.With(logger, "sub_collector", "perf"), cgroupManager) if err != nil { level.Info(logger).Log("msg", "Failed to create perf collector", "err", err) @@ -168,7 +155,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { var ebpfCollector *ebpfCollector if ebpfCollectorEnabled() { - ebpfCollector, err = NewEbpfCollector(logger, cgroupManager) + ebpfCollector, err = NewEbpfCollector(log.With(logger, "sub_collector", "ebpf"), cgroupManager) if err != nil { level.Info(logger).Log("msg", "Failed to create ebpf collector", "err", err) @@ -180,7 +167,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { var rdmaCollector *rdmaCollector if rdmaCollectorEnabled() { - rdmaCollector, err = NewRDMACollector(logger, cgroupManager) + rdmaCollector, err = NewRDMACollector(log.With(logger, "sub_collector", "rdma"), cgroupManager) if err != nil { level.Info(logger).Log("msg", "Failed to create RDMA collector", "err", err) @@ -237,7 +224,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { hostname: hostname, gpuDevs: gpuDevs, procFS: procFS, - jobPropsCache: make(map[string]props), + jobPropsCache: make(map[string]jobProps), securityContexts: map[string]*security.SecurityContext{slurmReadProcCtx: securityCtx}, jobGpuFlag: prometheus.NewDesc( prometheus.BuildFQName(Namespace, genericSubsystem, "unit_gpu_index_flag"), @@ -295,7 +282,7 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { defer wg.Done() // Update perf metrics - if err := c.perfCollector.Update(ch); err != nil { 
+ if err := c.perfCollector.Update(ch, nil); err != nil { level.Error(c.logger).Log("msg", "Failed to update perf stats", "err", err) } }() @@ -308,7 +295,7 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { defer wg.Done() // Update ebpf metrics - if err := c.ebpfCollector.Update(ch); err != nil { + if err := c.ebpfCollector.Update(ch, nil); err != nil { level.Error(c.logger).Log("msg", "Failed to update IO and/or network stats", "err", err) } }() @@ -321,7 +308,7 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { defer wg.Done() // Update RDMA metrics - if err := c.rdmaCollector.Update(ch); err != nil { + if err := c.rdmaCollector.Update(ch, nil); err != nil { level.Error(c.logger).Log("msg", "Failed to update RDMA stats", "err", err) } }() @@ -368,7 +355,7 @@ func (c *slurmCollector) Stop(ctx context.Context) error { } // updateGPUOrdinals updates the metrics channel with GPU ordinals for SLURM job. -func (c *slurmCollector) updateGPUOrdinals(ch chan<- prometheus.Metric, jobProps []props) { +func (c *slurmCollector) updateGPUOrdinals(ch chan<- prometheus.Metric, jobProps []jobProps) { // Update slurm job properties for _, p := range jobProps { // GPU job mapping @@ -392,7 +379,7 @@ func (c *slurmCollector) discoverCgroups() (slurmMetrics, error) { // Get currently active jobs and set them in activeJobs state variable var activeJobUUIDs []string - var jobProps []props + var jProps []jobProps var cgMetrics []cgMetric @@ -437,12 +424,12 @@ func (c *slurmCollector) discoverCgroups() (slurmMetrics, error) { // Get GPU ordinals of the job if len(c.gpuDevs) > 0 { - if jProps, ok := c.jobPropsCache[jobuuid]; !ok || (ok && jProps.emptyGPUOrdinals()) { + if jobPropsCached, ok := c.jobPropsCache[jobuuid]; !ok || (ok && jobPropsCached.emptyGPUOrdinals()) { gpuOrdinals = c.gpuOrdinals(jobuuid) - c.jobPropsCache[jobuuid] = props{uuid: jobuuid, gpuOrdinals: gpuOrdinals} - jobProps = append(jobProps, c.jobPropsCache[jobuuid]) + c.jobPropsCache[jobuuid] = jobProps{uuid: jobuuid, gpuOrdinals: gpuOrdinals} + jProps = append(jProps, c.jobPropsCache[jobuuid]) } else { - jobProps = append(jobProps, jProps) + jProps = append(jProps, jobPropsCached) } } @@ -466,7 +453,7 @@ func (c *slurmCollector) discoverCgroups() (slurmMetrics, error) { } } - return slurmMetrics{cgMetrics: cgMetrics, jobProps: jobProps}, nil + return slurmMetrics{cgMetrics: cgMetrics, jobProps: jProps}, nil } // gpuOrdinalsFromProlog returns GPU ordinals of jobs from prolog generated run time files by SLURM. 
diff --git a/pkg/collector/slurm_test.go b/pkg/collector/slurm_test.go index ffaa5ef6..128815c6 100644 --- a/pkg/collector/slurm_test.go +++ b/pkg/collector/slurm_test.go @@ -23,9 +23,21 @@ import ( func mockGPUDevices() map[int]Device { devs := make(map[int]Device, 4) + busIDs := []BusID{ + {domain: 0, bus: 7, slot: 0, function: 0}, + {domain: 0, bus: 11, slot: 0, function: 0}, + {domain: 0, bus: 72, slot: 0, function: 0}, + {domain: 0, bus: 76, slot: 0, function: 0}, + {domain: 0, bus: 77, slot: 0, function: 0}, + } + for i := 0; i <= 4; i++ { idxString := strconv.Itoa(i) - devs[i] = Device{index: idxString, uuid: fmt.Sprintf("GPU-%d", i)} + devs[i] = Device{ + index: idxString, + uuid: fmt.Sprintf("GPU-%d", i), + busID: busIDs[i], + } } return devs @@ -42,7 +54,7 @@ func TestNewSlurmCollector(t *testing.T) { "--collector.slurm.psi-metrics", "--collector.perf.hardware-events", "--collector.rdma.stats", - "--collector.slurm.nvidia-smi-path", "testdata/nvidia-smi", + "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", "--collector.cgroups.force-version", "v2", }, ) @@ -93,10 +105,10 @@ func TestSlurmJobPropsWithProlog(t *testing.T) { gpuDevs: mockGPUDevices(), logger: log.NewNopLogger(), cgroupManager: cgManager, - jobPropsCache: make(map[string]props), + jobPropsCache: make(map[string]jobProps), } - expectedProps := props{ + expectedProps := jobProps{ gpuOrdinals: []string{"0"}, uuid: "1009249", } @@ -104,7 +116,7 @@ func TestSlurmJobPropsWithProlog(t *testing.T) { metrics, err := c.discoverCgroups() require.NoError(t, err) - var gotProps props + var gotProps jobProps for _, props := range metrics.jobProps { if props.uuid == expectedProps.uuid { @@ -142,7 +154,7 @@ func TestSlurmJobPropsWithProcsFS(t *testing.T) { cgroupManager: cgManager, gpuDevs: mockGPUDevices(), logger: log.NewNopLogger(), - jobPropsCache: make(map[string]props), + jobPropsCache: make(map[string]jobProps), procFS: procFS, securityContexts: make(map[string]*security.SecurityContext), } @@ -156,7 +168,7 @@ func TestSlurmJobPropsWithProcsFS(t *testing.T) { ) require.NoError(t, err) - expectedProps := props{ + expectedProps := jobProps{ uuid: "1009248", gpuOrdinals: []string{"2", "3"}, } @@ -164,7 +176,7 @@ func TestSlurmJobPropsWithProcsFS(t *testing.T) { metrics, err := c.discoverCgroups() require.NoError(t, err) - var gotProps props + var gotProps jobProps for _, props := range metrics.jobProps { if props.uuid == expectedProps.uuid { @@ -210,7 +222,7 @@ func TestJobPropsCaching(t *testing.T) { cgroupManager: cgManager, logger: log.NewNopLogger(), gpuDevs: mockGPUDevs, - jobPropsCache: make(map[string]props), + jobPropsCache: make(map[string]jobProps), } // Add cgroups diff --git a/pkg/collector/testdata/nvidia-smi b/pkg/collector/testdata/nvidia-smi index bb25aa95..bf392714 100755 --- a/pkg/collector/testdata/nvidia-smi +++ b/pkg/collector/testdata/nvidia-smi @@ -1,7 +1,7 @@ #!/bin/bash -printf """index, name, uuid -0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e -1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3 -2, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3 -3, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3""" +printf """index, name, uuid, bus_id +0, Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e, 00000000:07:00.0 +1, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3, 00000000:0B:00.0 +2, Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3, 00000000:48:00.0 +3, Tesla V100-SXM2-32GB, 
GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3, 00000000:4C:00.0""" diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt new file mode 100644 index 00000000..d6b79b13 --- /dev/null +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt @@ -0,0 +1,162 @@ +# HELP ceems_compute_unit_blkio_read_total_bytes Total block IO read bytes +# TYPE ceems_compute_unit_blkio_read_total_bytes gauge +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 3.25280768e+08 +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 3.25280768e+08 +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 3.25280768e+08 +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 3.25280768e+08 +# HELP ceems_compute_unit_blkio_read_total_requests Total block IO read requests +# TYPE ceems_compute_unit_blkio_read_total_requests gauge +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 10957 +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 10957 +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 10957 +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 10957 +# HELP ceems_compute_unit_blkio_write_total_bytes Total block IO write bytes +# TYPE ceems_compute_unit_blkio_write_total_bytes gauge +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 3.088384e+07 +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 3.088384e+07 +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 3.088384e+07 +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 3.088384e+07 +# HELP ceems_compute_unit_blkio_write_total_requests Total block IO write requests +# TYPE ceems_compute_unit_blkio_write_total_requests gauge +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4803 +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4803 +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4803 +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4803 +# HELP ceems_compute_unit_cpu_psi_seconds Total CPU PSI in seconds +# TYPE ceems_compute_unit_cpu_psi_seconds gauge +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 
+ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total counter +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0.45 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0.45 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0.45 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0.45 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total counter +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0.39 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0.39 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0.39 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0.39 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_gpu_index_flag Indicates running instance on GPU, 1=instance running +# TYPE ceems_compute_unit_gpu_index_flag gauge +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 2.1086208e+07 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 2.1086208e+07 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 2.1086208e+07 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 2.1086208e+07 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge 
+ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_memory_psi_seconds Total memory PSI in seconds +# TYPE ceems_compute_unit_memory_psi_seconds gauge +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1.0407936e+07 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 1.0407936e+07 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 1.0407936e+07 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1.0407936e+07 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 2.01362030592e+11 +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 2.01362030592e+11 +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 2.01362030592e+11 +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 2.01362030592e+11 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.0194048e+07 +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.0194048e+07 +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.0194048e+07 +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.0194048e+07 +# HELP ceems_compute_unit_memsw_fail_count Swap fail count +# TYPE ceems_compute_unit_memsw_fail_count gauge +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP 
ceems_compute_unit_memsw_total_bytes Swap total in bytes +# TYPE ceems_compute_unit_memsw_total_bytes gauge +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 9.223372036854772e+18 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 9.223372036854772e+18 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 9.223372036854772e+18 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 9.223372036854772e+18 +# HELP ceems_compute_unit_memsw_used_bytes Swap used in bytes +# TYPE ceems_compute_unit_memsw_used_bytes gauge +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.032512e+07 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.032512e+07 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.032512e+07 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.032512e+07 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="libvirt"} 4 +# HELP ceems_cpu_count Number of CPUs. +# TYPE ceems_cpu_count gauge +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 +# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. +# TYPE ceems_cpu_seconds_total counter +ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 +ceems_cpu_seconds_total{hostname="",mode="iowait"} 35.52 +ceems_cpu_seconds_total{hostname="",mode="irq"} 0.02 +ceems_cpu_seconds_total{hostname="",mode="nice"} 6.12 +ceems_cpu_seconds_total{hostname="",mode="softirq"} 39.44 +ceems_cpu_seconds_total{hostname="",mode="steal"} 0 +ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 +ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 +# HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. +# TYPE ceems_exporter_build_info gauge +# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts +# TYPE ceems_ipmi_dcmi_avg_watts gauge +ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts +# TYPE ceems_ipmi_dcmi_current_watts gauge +ceems_ipmi_dcmi_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts +# TYPE ceems_ipmi_dcmi_max_watts gauge +ceems_ipmi_dcmi_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts +# TYPE ceems_ipmi_dcmi_min_watts gauge +ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. +# TYPE ceems_meminfo_MemAvailable_bytes gauge +ceems_meminfo_MemAvailable_bytes{hostname=""} 0 +# HELP ceems_meminfo_MemFree_bytes Memory information field MemFree_bytes. +# TYPE ceems_meminfo_MemFree_bytes gauge +ceems_meminfo_MemFree_bytes{hostname=""} 4.50891776e+08 +# HELP ceems_meminfo_MemTotal_bytes Memory information field MemTotal_bytes. 
+# TYPE ceems_meminfo_MemTotal_bytes gauge +ceems_meminfo_MemTotal_bytes{hostname=""} 1.6042172416e+10 +# HELP ceems_rapl_package_joules_total Current RAPL package value in joules +# TYPE ceems_rapl_package_joules_total counter +ceems_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 258218.293244 +ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 130570.505826 +# HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. +# TYPE ceems_scrape_collector_duration_seconds gauge +# HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. +# TYPE ceems_scrape_collector_success gauge +ceems_scrape_collector_success{collector="cpu"} 1 +ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="libvirt"} 1 +ceems_scrape_collector_success{collector="meminfo"} 1 +ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt new file mode 100644 index 00000000..c423579b --- /dev/null +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt @@ -0,0 +1,162 @@ +# HELP ceems_compute_unit_blkio_read_total_bytes Total block IO read bytes +# TYPE ceems_compute_unit_blkio_read_total_bytes gauge +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 3.0206976e+07 +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 3.0206976e+07 +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 3.0206976e+07 +ceems_compute_unit_blkio_read_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 3.0206976e+07 +# HELP ceems_compute_unit_blkio_read_total_requests Total block IO read requests +# TYPE ceems_compute_unit_blkio_read_total_requests gauge +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1141 +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 1141 +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 1141 +ceems_compute_unit_blkio_read_total_requests{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1141 +# HELP ceems_compute_unit_blkio_write_total_bytes Total block IO write bytes +# TYPE ceems_compute_unit_blkio_write_total_bytes gauge +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1.00337664e+09 +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 1.00337664e+09 +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 1.00337664e+09 +ceems_compute_unit_blkio_write_total_bytes{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1.00337664e+09 +# HELP ceems_compute_unit_blkio_write_total_requests Total block IO write 
requests +# TYPE ceems_compute_unit_blkio_write_total_requests gauge +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 14997 +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 14997 +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 14997 +ceems_compute_unit_blkio_write_total_requests{device="sdc",hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 14997 +# HELP ceems_compute_unit_cpu_psi_seconds Total CPU PSI in seconds +# TYPE ceems_compute_unit_cpu_psi_seconds gauge +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total counter +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 115.777502 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total counter +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 60375.292848 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 2 +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 2 +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 2 +ceems_compute_unit_cpus{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 2 +# HELP ceems_compute_unit_gpu_index_flag Indicates running instance on GPU, 1=instance running +# TYPE ceems_compute_unit_gpu_index_flag gauge +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1 
+ceems_compute_unit_gpu_index_flag{gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_memory_psi_seconds Total memory PSI in seconds +# TYPE ceems_compute_unit_memory_psi_seconds gauge +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.098592768e+09 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.294967296e+09 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 4.111491072e+09 
+ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 4.111491072e+09 +# HELP ceems_compute_unit_memsw_fail_count Swap fail count +# TYPE ceems_compute_unit_memsw_fail_count gauge +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_unit_memsw_total_bytes Swap total in bytes +# TYPE ceems_compute_unit_memsw_total_bytes gauge +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 1.6042172416e+10 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 1.6042172416e+10 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 1.6042172416e+10 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 1.6042172416e+10 +# HELP ceems_compute_unit_memsw_used_bytes Swap used in bytes +# TYPE ceems_compute_unit_memsw_used_bytes gauge +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="57f2d45e-8ddf-4338-91df-62d0044ff1b5"} 0 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="5f7f6db0-7f7d-4c31-acc6-a03ec4d3ad4e"} 0 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="b674a0a2-c300-4dc6-8c9c-65df16da6d69"} 0 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="libvirt",uuid="bf9ccd0f-4cd7-4ea2-8855-b56467326f61"} 0 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="libvirt"} 4 +# HELP ceems_cpu_count Number of CPUs. +# TYPE ceems_cpu_count gauge +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 +# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. +# TYPE ceems_cpu_seconds_total counter +ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 +ceems_cpu_seconds_total{hostname="",mode="iowait"} 35.52 +ceems_cpu_seconds_total{hostname="",mode="irq"} 0.02 +ceems_cpu_seconds_total{hostname="",mode="nice"} 6.12 +ceems_cpu_seconds_total{hostname="",mode="softirq"} 39.44 +ceems_cpu_seconds_total{hostname="",mode="steal"} 0 +ceems_cpu_seconds_total{hostname="",mode="system"} 1119.22 +ceems_cpu_seconds_total{hostname="",mode="user"} 3018.54 +# HELP ceems_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which ceems_exporter was built, and the goos and goarch for the build. 
+# TYPE ceems_exporter_build_info gauge +# HELP ceems_ipmi_dcmi_avg_watts Average Power consumption in watts +# TYPE ceems_ipmi_dcmi_avg_watts gauge +ceems_ipmi_dcmi_avg_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_current_watts Current Power consumption in watts +# TYPE ceems_ipmi_dcmi_current_watts gauge +ceems_ipmi_dcmi_current_watts{hostname=""} 5942 +# HELP ceems_ipmi_dcmi_max_watts Maximum Power consumption in watts +# TYPE ceems_ipmi_dcmi_max_watts gauge +ceems_ipmi_dcmi_max_watts{hostname=""} 6132 +# HELP ceems_ipmi_dcmi_min_watts Minimum Power consumption in watts +# TYPE ceems_ipmi_dcmi_min_watts gauge +ceems_ipmi_dcmi_min_watts{hostname=""} 5748 +# HELP ceems_meminfo_MemAvailable_bytes Memory information field MemAvailable_bytes. +# TYPE ceems_meminfo_MemAvailable_bytes gauge +ceems_meminfo_MemAvailable_bytes{hostname=""} 0 +# HELP ceems_meminfo_MemFree_bytes Memory information field MemFree_bytes. +# TYPE ceems_meminfo_MemFree_bytes gauge +ceems_meminfo_MemFree_bytes{hostname=""} 4.50891776e+08 +# HELP ceems_meminfo_MemTotal_bytes Memory information field MemTotal_bytes. +# TYPE ceems_meminfo_MemTotal_bytes gauge +ceems_meminfo_MemTotal_bytes{hostname=""} 1.6042172416e+10 +# HELP ceems_rapl_package_joules_total Current RAPL package value in joules +# TYPE ceems_rapl_package_joules_total counter +ceems_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 258218.293244 +ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 130570.505826 +# HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. +# TYPE ceems_scrape_collector_duration_seconds gauge +# HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. +# TYPE ceems_scrape_collector_success gauge +ceems_scrape_collector_success{collector="cpu"} 1 +ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 +ceems_scrape_collector_success{collector="libvirt"} 1 +ceems_scrape_collector_success{collector="meminfo"} 1 +ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/pkg/collector/testdata/qemu/instance-00000001.xml b/pkg/collector/testdata/qemu/instance-00000001.xml new file mode 100644 index 00000000..0477e250 --- /dev/null +++ b/pkg/collector/testdata/qemu/instance-00000001.xml @@ -0,0 +1,129 @@ + + + + instance-00000001 + b674a0a2-c300-4dc6-8c9c-65df16da6d69 + + + + test-1 + 2024-10-04 18:05:09 + + 512 + 1 + 0 + 0 + 1 + + + admin + admin + + + + + + + + + + + 524288 + 524288 + 1 + + + OpenStack Foundation + OpenStack Nova + 30.1.0 + b674a0a2-c300-4dc6-8c9c-65df16da6d69 + b674a0a2-c300-4dc6-8c9c-65df16da6d69 + Virtual Machine + + + + hvm + + + + + + + + + + Nehalem + + + + + + + + destroy + restart + destroy + + /usr/bin/qemu-system-x86_64 + + + + + 4ebacfd6-f8c5-4d99-b6ca-02bf4c49abf5 + +
+ + + + + + + + +
+ + + + + + + + + + + + + + + + +