diff --git a/pkg/collector/cli.go b/pkg/collector/cli.go index 573d7ad4..20a32c4b 100644 --- a/pkg/collector/cli.go +++ b/pkg/collector/cli.go @@ -19,7 +19,6 @@ import ( "github.com/prometheus/common/version" "github.com/prometheus/exporter-toolkit/web" "github.com/prometheus/exporter-toolkit/web/kingpinflag" - "github.com/prometheus/procfs" ) // CEEMSExporter represents the `ceems_exporter` cli. @@ -40,12 +39,6 @@ var CEEMSExporterApp = *kingpin.New( // Current hostname var hostname string -// Current host's physical core count -var physicalCores int - -// Current host's logical core count -var logicalCores int - // Empty hostname flag (Used only for testing) var emptyHostnameLabel *bool @@ -144,40 +137,6 @@ func (b *CEEMSExporter) Main() error { } } - // Get physical and logical core count - fs, err := procfs.NewFS(*procfsPath) - if err != nil { - return fmt.Errorf("failed to open procfs: %w", err) - } - - // Get cpu info from /proc/cpuinfo - info, err := fs.CPUInfo() - if err != nil { - return fmt.Errorf("failed to open cpuinfo: %w", err) - } - - // Get number of physical cores - var socketCoreMap = make(map[string]int) - for _, cpu := range info { - socketCoreMap[cpu.PhysicalID] = int(cpu.CPUCores) - logicalCores++ - } - for _, cores := range socketCoreMap { - physicalCores += cores - } - - // On ARM and some other architectures there is no CPUCores variable in the info. 
- // As HT/SMT is Intel's properitiary stuff, we can safely set - // physicalCores = logicalCores when physicalCores == 0 on other architectures - if physicalCores == 0 { - physicalCores = logicalCores - } - - // In tests, the expected output is 4 - if *emptyHostnameLabel { - physicalCores = 4 - } - runtime.GOMAXPROCS(*maxProcs) level.Debug(logger).Log("msg", "Go MAXPROCS", "procs", runtime.GOMAXPROCS(0)) diff --git a/pkg/collector/cpu.go b/pkg/collector/cpu.go index 200f32a3..6277f6a9 100644 --- a/pkg/collector/cpu.go +++ b/pkg/collector/cpu.go @@ -6,7 +6,6 @@ package collector import ( "fmt" "math" - "strconv" "sync" "github.com/go-kit/log" @@ -18,12 +17,13 @@ import ( type cpuCollector struct { fs procfs.FS cpu *prometheus.Desc - ncpu *prometheus.Desc + ncpus *prometheus.Desc + ncpusPerCore *prometheus.Desc logger log.Logger cpuStats procfs.CPUStat cpuStatsMutex sync.Mutex hostname string - cpusPerCore string + cpusPerCore float64 } // Idle jump back limit in seconds. @@ -49,6 +49,35 @@ func NewCPUCollector(logger log.Logger) (Collector, error) { return nil, fmt.Errorf("failed to open procfs: %w", err) } + // Get cpu info from /proc/cpuinfo + info, err := fs.CPUInfo() + if err != nil { + return nil, fmt.Errorf("failed to open cpuinfo: %w", err) + } + + // Get number of physical cores + var socketCoreMap = make(map[string]int) + var physicalCores, logicalCores int + for _, cpu := range info { + socketCoreMap[cpu.PhysicalID] = int(cpu.CPUCores) + logicalCores++ + } + for _, cores := range socketCoreMap { + physicalCores += cores + } + + // On ARM and some other architectures there is no CPUCores variable in the info. 
+ // As HT/SMT is Intel's proprietary stuff, we can safely set + // physicalCores = logicalCores when physicalCores == 0 on other architectures + if physicalCores == 0 { + physicalCores = logicalCores + } + + // In tests, the expected output is 4 + if *emptyHostnameLabel { + physicalCores = 4 + } + return &cpuCollector{ fs: fs, cpu: prometheus.NewDesc( @@ -56,15 +85,20 @@ func NewCPUCollector(logger log.Logger) (Collector, error) { "Seconds the CPUs spent in each mode.", []string{"hostname", "mode"}, nil, ), - ncpu: prometheus.NewDesc( + ncpus: prometheus.NewDesc( prometheus.BuildFQName(Namespace, cpuCollectorSubsystem, "count"), "Number of CPUs.", - []string{"hostname", "cpuspercore"}, nil, + []string{"hostname"}, nil, + ), + ncpusPerCore: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, cpuCollectorSubsystem, "per_core_count"), + "Number of logical CPUs per physical core.", + []string{"hostname"}, nil, ), logger: logger, hostname: hostname, // Ensure that cpusPerCore is at least 1 in all cases - cpusPerCore: strconv.Itoa(int(math.Max(1, float64(int(math.Max(float64(logicalCores), 1))/int(math.Max(float64(physicalCores), 1)))))), + cpusPerCore: math.Max(1, float64(int(math.Max(float64(logicalCores), 1))/int(math.Max(float64(physicalCores), 1)))), cpuStats: procfs.CPUStat{}, }, nil } @@ -85,7 +119,8 @@ func (c *cpuCollector) Update(ch chan<- prometheus.Metric) error { // Acquire a lock to read the stats. 
c.cpuStatsMutex.Lock() defer c.cpuStatsMutex.Unlock() - ch <- prometheus.MustNewConstMetric(c.ncpu, prometheus.GaugeValue, float64(ncpus), c.hostname, c.cpusPerCore) + ch <- prometheus.MustNewConstMetric(c.ncpus, prometheus.GaugeValue, float64(ncpus), c.hostname) + ch <- prometheus.MustNewConstMetric(c.ncpusPerCore, prometheus.GaugeValue, float64(c.cpusPerCore), c.hostname) ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, c.cpuStats.User, c.hostname, "user") ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, c.cpuStats.Nice, c.hostname, "nice") ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, c.cpuStats.System, c.hostname, "system") diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index 2beba1bf..f5fe4791 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -133,7 +133,6 @@ type slurmCollector struct { hostname string gpuDevs map[int]Device hostMemTotal float64 - cpusPerCore string numJobs *prometheus.Desc jobCPUUser *prometheus.Desc jobCPUSystem *prometheus.Desc @@ -218,8 +217,6 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { hostname: hostname, gpuDevs: gpuDevs, hostMemTotal: memTotal, - // Ensure that cpusPerCore is at least 1 in all cases - cpusPerCore: strconv.Itoa(int(math.Max(1, float64(int(math.Max(float64(logicalCores), 1))/int(math.Max(float64(physicalCores), 1)))))), numJobs: prometheus.NewDesc( prometheus.BuildFQName(Namespace, genericSubsystem, "units"), "Total number of jobs", @@ -247,7 +244,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { jobCPUs: prometheus.NewDesc( prometheus.BuildFQName(Namespace, genericSubsystem, "unit_cpus"), "Total number of job CPUs", - []string{"manager", "hostname", "user", "project", "uuid", "cpuspercore"}, + []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobCPUPressure: prometheus.NewDesc( @@ -386,7 +383,7 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { cpus = 
metrics[filepath.Dir(dir)].cpus } } - ch <- prometheus.MustNewConstMetric(c.jobCPUs, prometheus.GaugeValue, float64(cpus), c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid, c.cpusPerCore) + ch <- prometheus.MustNewConstMetric(c.jobCPUs, prometheus.GaugeValue, float64(cpus), c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) // Memory stats ch <- prometheus.MustNewConstMetric(c.jobMemoryRSS, prometheus.GaugeValue, m.memoryRSS, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt index 75554bcd..f39e08a7 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0.39 # HELP ceems_compute_unit_cpus Total number of job CPUs # TYPE ceems_compute_unit_cpus gauge -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 
+ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 # HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running # TYPE ceems_compute_unit_gpu_index_flag gauge ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 @@ -61,7 +61,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm", ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge -ceems_cpu_count{cpuspercore="2",hostname=""} 8 +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 # HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. 
# TYPE ceems_cpu_seconds_total counter ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt index 49feaa5a..ac67c60d 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt @@ -15,9 +15,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 # HELP ceems_compute_unit_cpus Total number of job CPUs # TYPE ceems_compute_unit_cpus gauge -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 # HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running # TYPE ceems_compute_unit_gpu_index_flag gauge ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 @@ -86,7 +86,10 @@ 
ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm", ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge -ceems_cpu_count{cpuspercore="2",hostname=""} 8 +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 # HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. # TYPE ceems_cpu_seconds_total counter ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt index 302147a4..6b381aa9 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 # HELP ceems_compute_unit_cpus Total number of job CPUs # TYPE ceems_compute_unit_cpus gauge -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 
+ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 # HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running # TYPE ceems_compute_unit_gpu_index_flag gauge ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 @@ -61,7 +61,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm", ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge -ceems_cpu_count{cpuspercore="2",hostname=""} 8 +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 # HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. 
# TYPE ceems_cpu_seconds_total counter ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt index 59c58134..91d96705 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 # HELP ceems_compute_unit_cpus Total number of job CPUs # TYPE ceems_compute_unit_cpus gauge -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 # HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes # TYPE ceems_compute_unit_memory_cache_bytes gauge ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 @@ -55,7 +55,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm", 
ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge -ceems_cpu_count{cpuspercore="2",hostname=""} 8 +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 # HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. # TYPE ceems_cpu_seconds_total counter ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt index 563c98b6..278f1253 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 60375.292848 # HELP ceems_compute_unit_cpus Total number of job CPUs # TYPE ceems_compute_unit_cpus gauge -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 2 # HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running # TYPE 
ceems_compute_unit_gpu_index_flag gauge ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="1009248"} 1 @@ -61,7 +61,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm", ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge -ceems_cpu_count{cpuspercore="2",hostname=""} 8 +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 # HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. # TYPE ceems_cpu_seconds_total counter ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt index 3953e028..43251fe3 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt @@ -10,9 +10,9 @@ ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="t ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 # HELP ceems_compute_unit_cpus Total number of job CPUs # TYPE ceems_compute_unit_cpus gauge -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 -ceems_compute_unit_cpus{cpuspercore="2",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 
+ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 # HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running # TYPE ceems_compute_unit_gpu_index_flag gauge ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 @@ -61,7 +61,10 @@ ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm", ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge -ceems_cpu_count{cpuspercore="2",hostname=""} 8 +ceems_cpu_count{hostname=""} 8 +# HELP ceems_cpu_per_core_count Number of logical CPUs per physical core. +# TYPE ceems_cpu_per_core_count gauge +ceems_cpu_per_core_count{hostname=""} 2 # HELP ceems_cpu_seconds_total Seconds the CPUs spent in each mode. 
# TYPE ceems_cpu_seconds_total counter ceems_cpu_seconds_total{hostname="",mode="idle"} 89790.04 diff --git a/scripts/checkmetrics.sh b/scripts/checkmetrics.sh index 06f0742c..57f2c96a 100755 --- a/scripts/checkmetrics.sh +++ b/scripts/checkmetrics.sh @@ -9,7 +9,7 @@ fi search_dir="$2" for entry in "$search_dir"/* do - lint=$($1 check metrics < "$entry" 2>&1 | grep -v -E "^ceems_compute_unit_(memory_fail_count|memsw_fail_count)|ceems_meminfo_|ceems_cpu_count") + lint=$($1 check metrics < "$entry" 2>&1 | grep -v -E "^ceems_compute_unit_(memory_fail_count|memsw_fail_count)|ceems_meminfo_|ceems_cpu_count|ceems_cpu_per_core_count") if [[ -n $lint ]]; then echo -e "Some Prometheus metrics do not follow best practices:\n"