Skip to content

Commit

Permalink
feat: Add new cpus_per_core metric
Browse files Browse the repository at this point in the history
Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri committed Apr 14, 2024
1 parent 7e7ea22 commit 78b5e49
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 78 deletions.
41 changes: 0 additions & 41 deletions pkg/collector/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"github.com/prometheus/common/version"
"github.com/prometheus/exporter-toolkit/web"
"github.com/prometheus/exporter-toolkit/web/kingpinflag"
"github.com/prometheus/procfs"
)

// CEEMSExporter represents the `ceems_exporter` cli.
Expand All @@ -40,12 +39,6 @@ var CEEMSExporterApp = *kingpin.New(
// Current hostname
var hostname string

// Current host's physical core count
var physicalCores int

// Current host's logical core count
var logicalCores int

// Empty hostname flag (Used only for testing)
var emptyHostnameLabel *bool

Expand Down Expand Up @@ -144,40 +137,6 @@ func (b *CEEMSExporter) Main() error {
}
}

// Get physical and logical core count
fs, err := procfs.NewFS(*procfsPath)
if err != nil {
return fmt.Errorf("failed to open procfs: %w", err)
}

// Get cpu info from /proc/cpuinfo
info, err := fs.CPUInfo()
if err != nil {
return fmt.Errorf("failed to open cpuinfo: %w", err)
}

// Get number of physical cores
var socketCoreMap = make(map[string]int)
for _, cpu := range info {
socketCoreMap[cpu.PhysicalID] = int(cpu.CPUCores)
logicalCores++
}
for _, cores := range socketCoreMap {
physicalCores += cores
}

// On ARM and some other architectures there is no CPUCores variable in the info.
// As HT/SMT is Intel's proprietary stuff, we can safely set
// physicalCores = logicalCores when physicalCores == 0 on other architectures
if physicalCores == 0 {
physicalCores = logicalCores
}

// In tests, the expected output is 4
if *emptyHostnameLabel {
physicalCores = 4
}

runtime.GOMAXPROCS(*maxProcs)
level.Debug(logger).Log("msg", "Go MAXPROCS", "procs", runtime.GOMAXPROCS(0))

Expand Down
49 changes: 42 additions & 7 deletions pkg/collector/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ package collector
import (
"fmt"
"math"
"strconv"
"sync"

"github.com/go-kit/log"
Expand All @@ -18,12 +17,13 @@ import (
type cpuCollector struct {
fs procfs.FS
cpu *prometheus.Desc
ncpu *prometheus.Desc
ncpus *prometheus.Desc
ncpusPerCore *prometheus.Desc
logger log.Logger
cpuStats procfs.CPUStat
cpuStatsMutex sync.Mutex
hostname string
cpusPerCore string
cpusPerCore float64
}

// Idle jump back limit in seconds.
Expand All @@ -49,22 +49,56 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
return nil, fmt.Errorf("failed to open procfs: %w", err)
}

// Get cpu info from /proc/cpuinfo
info, err := fs.CPUInfo()
if err != nil {
return nil, fmt.Errorf("failed to open cpuinfo: %w", err)
}

// Get number of physical cores
var socketCoreMap = make(map[string]int)
var physicalCores, logicalCores int
for _, cpu := range info {
socketCoreMap[cpu.PhysicalID] = int(cpu.CPUCores)
logicalCores++
}
for _, cores := range socketCoreMap {
physicalCores += cores
}

// On ARM and some other architectures there is no CPUCores variable in the info.
// As HT/SMT is Intel's proprietary stuff, we can safely set
// physicalCores = logicalCores when physicalCores == 0 on other architectures
if physicalCores == 0 {
physicalCores = logicalCores
}

// In tests, the expected output is 4
if *emptyHostnameLabel {
physicalCores = 4
}

return &cpuCollector{
fs: fs,
cpu: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, cpuCollectorSubsystem, "seconds_total"),
"Seconds the CPUs spent in each mode.",
[]string{"hostname", "mode"}, nil,
),
ncpu: prometheus.NewDesc(
ncpus: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, cpuCollectorSubsystem, "count"),
"Number of CPUs.",
[]string{"hostname", "cpuspercore"}, nil,
[]string{"hostname"}, nil,
),
ncpusPerCore: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, cpuCollectorSubsystem, "per_core_count"),
"Number of logical CPUs per physical core.",
[]string{"hostname"}, nil,
),
logger: logger,
hostname: hostname,
// Ensure that cpusPerCore is at least 1 in all cases
cpusPerCore: strconv.Itoa(int(math.Max(1, float64(int(math.Max(float64(logicalCores), 1))/int(math.Max(float64(physicalCores), 1)))))),
cpusPerCore: math.Max(1, float64(int(math.Max(float64(logicalCores), 1))/int(math.Max(float64(physicalCores), 1)))),
cpuStats: procfs.CPUStat{},
}, nil
}
Expand All @@ -85,7 +119,8 @@ func (c *cpuCollector) Update(ch chan<- prometheus.Metric) error {
// Acquire a lock to read the stats.
c.cpuStatsMutex.Lock()
defer c.cpuStatsMutex.Unlock()
ch <- prometheus.MustNewConstMetric(c.ncpu, prometheus.GaugeValue, float64(ncpus), c.hostname, c.cpusPerCore)
ch <- prometheus.MustNewConstMetric(c.ncpus, prometheus.GaugeValue, float64(ncpus), c.hostname)
ch <- prometheus.MustNewConstMetric(c.ncpusPerCore, prometheus.GaugeValue, float64(c.cpusPerCore), c.hostname)
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, c.cpuStats.User, c.hostname, "user")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, c.cpuStats.Nice, c.hostname, "nice")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, c.cpuStats.System, c.hostname, "system")
Expand Down
7 changes: 2 additions & 5 deletions pkg/collector/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ type slurmCollector struct {
hostname string
gpuDevs map[int]Device
hostMemTotal float64
cpusPerCore string
numJobs *prometheus.Desc
jobCPUUser *prometheus.Desc
jobCPUSystem *prometheus.Desc
Expand Down Expand Up @@ -218,8 +217,6 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
hostname: hostname,
gpuDevs: gpuDevs,
hostMemTotal: memTotal,
// Ensure that cpusPerCore is at least 1 in all cases
cpusPerCore: strconv.Itoa(int(math.Max(1, float64(int(math.Max(float64(logicalCores), 1))/int(math.Max(float64(physicalCores), 1)))))),
numJobs: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "units"),
"Total number of jobs",
Expand Down Expand Up @@ -247,7 +244,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
jobCPUs: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, genericSubsystem, "unit_cpus"),
"Total number of job CPUs",
[]string{"manager", "hostname", "user", "project", "uuid", "cpuspercore"},
[]string{"manager", "hostname", "user", "project", "uuid"},
nil,
),
jobCPUPressure: prometheus.NewDesc(
Expand Down Expand Up @@ -386,7 +383,7 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error {
cpus = metrics[filepath.Dir(dir)].cpus
}
}
ch <- prometheus.MustNewConstMetric(c.jobCPUs, prometheus.GaugeValue, float64(cpus), c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid, c.cpusPerCore)
ch <- prometheus.MustNewConstMetric(c.jobCPUs, prometheus.GaugeValue, float64(cpus), c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)

// Memory stats
ch <- prometheus.MustNewConstMetric(c.jobMemoryRSS, prometheus.GaugeValue, m.memoryRSS, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
Expand Down
11 changes: 7 additions & 4 deletions pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t
ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0.39
# HELP ceems_compute_unit_cpus Total number of job CPUs
# TYPE ceems_compute_unit_cpus gauge
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0
# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running
# TYPE ceems_compute_unit_gpu_index_flag gauge
ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1
Expand Down Expand Up @@ -61,7 +61,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",
ceems_compute_units{hostname="",manager="slurm"} 3
# HELP ceems_cpu_count Number of CPUs.
# TYPE ceems_cpu_count gauge
ceems_cpu_count{cpuspercore="2",hostname=""} 8
ceems_cpu_count{hostname=""} 8
# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core.
# TYPE ceems_cpu_per_core_count gauge
ceems_cpu_per_core_count{hostname=""} 2
# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE ceems_cpu_seconds_total counter
ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t
ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848
# HELP ceems_compute_unit_cpus Total number of job CPUs
# TYPE ceems_compute_unit_cpus gauge
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2
# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running
# TYPE ceems_compute_unit_gpu_index_flag gauge
ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1
Expand Down Expand Up @@ -86,7 +86,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",
ceems_compute_units{hostname="",manager="slurm"} 3
# HELP ceems_cpu_count Number of CPUs.
# TYPE ceems_cpu_count gauge
ceems_cpu_count{cpuspercore="2",hostname=""} 8
ceems_cpu_count{hostname=""} 8
# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core.
# TYPE ceems_cpu_per_core_count gauge
ceems_cpu_per_core_count{hostname=""} 2
# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE ceems_cpu_seconds_total counter
ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t
ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848
# HELP ceems_compute_unit_cpus Total number of job CPUs
# TYPE ceems_compute_unit_cpus gauge
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2
# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running
# TYPE ceems_compute_unit_gpu_index_flag gauge
ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1
Expand Down Expand Up @@ -61,7 +61,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",
ceems_compute_units{hostname="",manager="slurm"} 3
# HELP ceems_cpu_count Number of CPUs.
# TYPE ceems_cpu_count gauge
ceems_cpu_count{cpuspercore="2",hostname=""} 8
ceems_cpu_count{hostname=""} 8
# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core.
# TYPE ceems_cpu_per_core_count gauge
ceems_cpu_per_core_count{hostname=""} 2
# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE ceems_cpu_seconds_total counter
ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t
ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848
# HELP ceems_compute_unit_cpus Total number of job CPUs
# TYPE ceems_compute_unit_cpus gauge
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2
# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes
# TYPE ceems_compute_unit_memory_cache_bytes gauge
ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0
Expand Down Expand Up @@ -55,7 +55,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",
ceems_compute_units{hostname="",manager="slurm"} 3
# HELP ceems_cpu_count Number of CPUs.
# TYPE ceems_cpu_count gauge
ceems_cpu_count{cpuspercore="2",hostname=""} 8
ceems_cpu_count{hostname=""} 8
# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core.
# TYPE ceems_cpu_per_core_count gauge
ceems_cpu_per_core_count{hostname=""} 2
# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE ceems_cpu_seconds_total counter
ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t
ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 60375.292848
# HELP ceems_compute_unit_cpus Total number of job CPUs
# TYPE ceems_compute_unit_cpus gauge
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 2
# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running
# TYPE ceems_compute_unit_gpu_index_flag gauge
ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="1009248"} 1
Expand Down Expand Up @@ -61,7 +61,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",
ceems_compute_units{hostname="",manager="slurm"} 3
# HELP ceems_cpu_count Number of CPUs.
# TYPE ceems_cpu_count gauge
ceems_cpu_count{cpuspercore="2",hostname=""} 8
ceems_cpu_count{hostname=""} 8
# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core.
# TYPE ceems_cpu_per_core_count gauge
ceems_cpu_per_core_count{hostname=""} 2
# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE ceems_cpu_seconds_total counter
ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t
ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848
# HELP ceems_compute_unit_cpus Total number of job CPUs
# TYPE ceems_compute_unit_cpus gauge
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2
ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2
ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2
# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running
# TYPE ceems_compute_unit_gpu_index_flag gauge
ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1
Expand Down Expand Up @@ -61,7 +61,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",
ceems_compute_units{hostname="",manager="slurm"} 3
# HELP ceems_cpu_count Number of CPUs.
# TYPE ceems_cpu_count gauge
ceems_cpu_count{cpuspercore="2",hostname=""} 8
ceems_cpu_count{hostname=""} 8
# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core.
# TYPE ceems_cpu_per_core_count gauge
ceems_cpu_per_core_count{hostname=""} 2
# HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode.
# TYPE ceems_cpu_seconds_total counter
ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04
Expand Down
2 changes: 1 addition & 1 deletion scripts/checkmetrics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ fi
search_dir="$2"
for entry in "$search_dir"/*
do
lint=$($1 check metrics < "$entry" 2>&1 | grep -v -E "^ceems_compute_unit_(memory_fail_count|memsw_fail_count)|ceems_meminfo_|ceems_cpu_count")
lint=$($1 check metrics < "$entry" 2>&1 | grep -v -E "^ceems_compute_unit_(memory_fail_count|memsw_fail_count)|ceems_meminfo_|ceems_cpu_count|ceems_cpu_per_core_count")

if [[ -n $lint ]]; then
echo -e "Some Prometheus metrics do not follow best practices:\n"
Expand Down

0 comments on commit 78b5e49

Please sign in to comment.