From 4614d31ec5226c341aefa14989cf7d6b47f6ca13 Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Thu, 4 Apr 2024 13:46:47 +0200 Subject: [PATCH] chore: Use generic name in metric labels * This will help to use unified dashboards for different resource managers * We already have inside the metric a label to differentiate different managers * Update test fixture outputs Signed-off-by: Mahendra Paipuri --- pkg/collector/slurm.go | 39 ++-- .../output/e2e-test-cgroupsv1-output.txt | 122 ++++++------- .../e2e-test-cgroupsv2-all-metrics-output.txt | 172 +++++++++--------- ...e2e-test-cgroupsv2-amd-ipmitool-output.txt | 122 ++++++------- .../e2e-test-cgroupsv2-nogpu-output.txt | 110 +++++------ ...-test-cgroupsv2-nvidia-ipmiutil-output.txt | 122 ++++++------- .../e2e-test-cgroupsv2-procfs-output.txt | 122 ++++++------- 7 files changed, 405 insertions(+), 404 deletions(-) diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index 3fd7c4ad..bad380a7 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -27,6 +27,7 @@ import ( const ( slurmCollectorSubsystem = "slurm" + genericSubsystem = "compute" ) var ( @@ -217,109 +218,109 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { gpuDevs: gpuDevs, hostMemTotal: memTotal, numJobs: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "jobs"), + prometheus.BuildFQName(Namespace, genericSubsystem, "units"), "Total number of jobs", []string{"manager", "hostname"}, nil, ), jobCPUUser: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_cpu_user_seconds"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_cpu_user_seconds_total"), "Total job CPU user seconds", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobCPUSystem: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_cpu_system_seconds"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_cpu_system_seconds_total"), "Total job CPU system seconds", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), // cpuTotal: prometheus.NewDesc( - // prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_cpu_total_seconds"), + // prometheus.BuildFQName(Namespace, genericSubsystem, "job_cpu_total_seconds"), // "Total job CPU total seconds", // []string{"manager", "hostname", "user", "project", "uuid"}, // nil, // ), jobCPUs: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_cpus"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_cpus"), "Total number of job CPUs", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobCPUPressure: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_cpu_psi_seconds"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_cpu_psi_seconds"), "Total CPU PSI in seconds", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemoryRSS: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memory_rss_bytes"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memory_rss_bytes"), "Memory RSS used in bytes", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemoryCache: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memory_cache_bytes"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memory_cache_bytes"), "Memory cache used in bytes", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemoryUsed: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memory_used_bytes"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memory_used_bytes"), "Memory used in bytes", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemoryTotal: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memory_total_bytes"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memory_total_bytes"), "Memory total in bytes", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemoryFailCount: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memory_fail_count"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memory_fail_count"), "Memory fail count", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemswUsed: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memsw_used_bytes"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memsw_used_bytes"), "Swap used in bytes", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemswTotal: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memsw_total_bytes"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memsw_total_bytes"), "Swap total in bytes", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemswFailCount: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memsw_fail_count"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memsw_fail_count"), "Swap fail count", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobMemoryPressure: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_memory_psi_seconds"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_memory_psi_seconds"), "Total memory PSI in seconds", []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), jobRDMAHCAHandles: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_rdma_hca_handles"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_rdma_hca_handles"), "Current number of RDMA HCA handles", []string{"manager", "hostname", "user", "project", "uuid", "device"}, nil, ), jobRDMAHCAObjects: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_rdma_hca_objects"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_rdma_hca_objects"), "Current number of RDMA HCA objects", []string{"manager", "hostname", "user", "project", "uuid", "device"}, nil, ), jobGpuFlag: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_gpu_index_flag"), + prometheus.BuildFQName(Namespace, genericSubsystem, "unit_gpu_index_flag"), "Indicates running job on GPU, 1=job running", []string{ "manager", @@ -334,7 +335,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { nil, ), collectError: prometheus.NewDesc( - prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "collect_error"), + prometheus.BuildFQName(Namespace, genericSubsystem, "collect_error"), "Indicates collection error, 0=no error, 1=error", []string{"manager", "hostname", "user", "project", "uuid"}, nil, diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt index e3999ef4..761f7732 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt @@ -1,3 +1,64 @@ +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total gauge +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0.45 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0.45 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0.45 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total gauge +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0.39 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0.39 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0.39 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# TYPE ceems_compute_unit_gpu_index_flag gauge +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc2",gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc3",gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2.1086208e+07 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2.1086208e+07 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2.1086208e+07 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1.0407936e+07 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1.0407936e+07 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1.0407936e+07 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2.01362030592e+11 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2.01362030592e+11 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2.01362030592e+11 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.0194048e+07 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.0194048e+07 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.0194048e+07 +# HELP ceems_compute_unit_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_compute_unit_rdma_hca_handles gauge +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 289 +ceems_compute_unit_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2479 +# HELP ceems_compute_unit_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_compute_unit_rdma_hca_objects gauge +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 289 +ceems_compute_unit_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2479 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge ceems_cpu_count{hostname=""} 8 @@ -47,67 +108,6 @@ ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 -# HELP ceems_slurm_job_cpu_system_seconds Total job CPU system seconds -# TYPE ceems_slurm_job_cpu_system_seconds gauge -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0.45 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0.45 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0.45 -# HELP ceems_slurm_job_cpu_user_seconds Total job CPU user seconds -# TYPE ceems_slurm_job_cpu_user_seconds gauge -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0.39 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0.39 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0.39 -# HELP ceems_slurm_job_cpus Total number of job CPUs -# TYPE ceems_slurm_job_cpus gauge -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_gpu_index_flag Indicates running job on GPU, 1=job running -# TYPE ceems_slurm_job_gpu_index_flag gauge -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc2",gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc3",gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1 -# HELP ceems_slurm_job_memory_cache_bytes Memory cache used in bytes -# TYPE ceems_slurm_job_memory_cache_bytes gauge -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2.1086208e+07 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2.1086208e+07 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2.1086208e+07 -# HELP ceems_slurm_job_memory_fail_count Memory fail count -# TYPE ceems_slurm_job_memory_fail_count gauge -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_rss_bytes Memory RSS used in bytes -# TYPE ceems_slurm_job_memory_rss_bytes gauge -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1.0407936e+07 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1.0407936e+07 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1.0407936e+07 -# HELP ceems_slurm_job_memory_total_bytes Memory total in bytes -# TYPE ceems_slurm_job_memory_total_bytes gauge -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2.01362030592e+11 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2.01362030592e+11 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2.01362030592e+11 -# HELP ceems_slurm_job_memory_used_bytes Memory used in bytes -# TYPE ceems_slurm_job_memory_used_bytes gauge -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.0194048e+07 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.0194048e+07 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.0194048e+07 -# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles -# TYPE ceems_slurm_job_rdma_hca_handles gauge -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 289 -ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2479 -# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects -# TYPE ceems_slurm_job_rdma_hca_objects gauge -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 289 -ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2479 -# HELP ceems_slurm_jobs Total number of jobs -# TYPE ceems_slurm_jobs gauge -ceems_slurm_jobs{hostname="",manager="slurm"} 3 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt index b232e963..90ef9d2d 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt @@ -1,3 +1,89 @@ +# HELP ceems_compute_unit_cpu_psi_seconds Total CPU PSI in seconds +# TYPE ceems_compute_unit_cpu_psi_seconds gauge +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_cpu_psi_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total gauge +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 115.777502 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total gauge +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 +# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# TYPE ceems_compute_unit_gpu_index_flag gauge +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="20180003050c",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc2",gpuuuid="20170000800c",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc3",gpuuuid="20170003580c",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_psi_seconds Total memory PSI in seconds +# TYPE ceems_compute_unit_memory_psi_seconds gauge +ceems_compute_unit_memory_psi_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_psi_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.098592768e+09 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.294967296e+09 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 +# HELP ceems_compute_unit_memsw_fail_count Swap fail count +# TYPE ceems_compute_unit_memsw_fail_count gauge +ceems_compute_unit_memsw_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memsw_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memsw_total_bytes Swap total in bytes +# TYPE ceems_compute_unit_memsw_total_bytes gauge +ceems_compute_unit_memsw_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1.6042172416e+10 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1.6042172416e+10 +ceems_compute_unit_memsw_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1.6042172416e+10 +# HELP ceems_compute_unit_memsw_used_bytes Swap used in bytes +# TYPE ceems_compute_unit_memsw_used_bytes gauge +ceems_compute_unit_memsw_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memsw_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_compute_unit_rdma_hca_handles gauge +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_compute_unit_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_compute_unit_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_compute_unit_rdma_hca_objects gauge +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_compute_unit_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge ceems_cpu_count{hostname=""} 8 @@ -47,92 +133,6 @@ ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 -# HELP ceems_slurm_job_cpu_psi_seconds Total CPU PSI in seconds -# TYPE ceems_slurm_job_cpu_psi_seconds gauge -ceems_slurm_job_cpu_psi_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_cpu_psi_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_cpu_psi_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_cpu_system_seconds Total job CPU system seconds -# TYPE ceems_slurm_job_cpu_system_seconds gauge -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 115.777502 -# HELP ceems_slurm_job_cpu_user_seconds Total job CPU user seconds -# TYPE ceems_slurm_job_cpu_user_seconds gauge -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 -# HELP ceems_slurm_job_cpus Total number of job CPUs -# TYPE ceems_slurm_job_cpus gauge -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 -# HELP ceems_slurm_job_gpu_index_flag Indicates running job on GPU, 1=job running -# TYPE ceems_slurm_job_gpu_index_flag gauge -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="20180003050c",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc2",gpuuuid="20170000800c",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc3",gpuuuid="20170003580c",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1 -# HELP ceems_slurm_job_memory_cache_bytes Memory cache used in bytes -# TYPE ceems_slurm_job_memory_cache_bytes gauge -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_fail_count Memory fail count -# TYPE ceems_slurm_job_memory_fail_count gauge -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_psi_seconds Total memory PSI in seconds -# TYPE ceems_slurm_job_memory_psi_seconds gauge -ceems_slurm_job_memory_psi_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_psi_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_psi_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_rss_bytes Memory RSS used in bytes -# TYPE ceems_slurm_job_memory_rss_bytes gauge -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.098592768e+09 -# HELP ceems_slurm_job_memory_total_bytes Memory total in bytes -# TYPE ceems_slurm_job_memory_total_bytes gauge -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.294967296e+09 -# HELP ceems_slurm_job_memory_used_bytes Memory used in bytes -# TYPE ceems_slurm_job_memory_used_bytes gauge -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 -# HELP ceems_slurm_job_memsw_fail_count Swap fail count -# TYPE ceems_slurm_job_memsw_fail_count gauge -ceems_slurm_job_memsw_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memsw_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memsw_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memsw_total_bytes Swap total in bytes -# TYPE ceems_slurm_job_memsw_total_bytes gauge -ceems_slurm_job_memsw_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1.6042172416e+10 -ceems_slurm_job_memsw_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1.6042172416e+10 -ceems_slurm_job_memsw_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1.6042172416e+10 -# HELP ceems_slurm_job_memsw_used_bytes Swap used in bytes -# TYPE ceems_slurm_job_memsw_used_bytes gauge -ceems_slurm_job_memsw_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memsw_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memsw_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles -# TYPE ceems_slurm_job_rdma_hca_handles gauge -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 -ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 -# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects -# TYPE ceems_slurm_job_rdma_hca_objects gauge -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 -ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 -# HELP ceems_slurm_jobs Total number of jobs -# TYPE ceems_slurm_jobs gauge -ceems_slurm_jobs{hostname="",manager="slurm"} 3 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt index cc08aec8..c5032ffd 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt @@ -1,3 +1,64 @@ +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total gauge +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 115.777502 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total gauge +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 +# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# TYPE ceems_compute_unit_gpu_index_flag gauge +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="20180003050c",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc2",gpuuuid="20170000800c",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc3",gpuuuid="20170003580c",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.098592768e+09 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.294967296e+09 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 +# HELP ceems_compute_unit_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_compute_unit_rdma_hca_handles gauge +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_compute_unit_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_compute_unit_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_compute_unit_rdma_hca_objects gauge +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_compute_unit_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge ceems_cpu_count{hostname=""} 8 @@ -47,67 +108,6 @@ ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 -# HELP ceems_slurm_job_cpu_system_seconds Total job CPU system seconds -# TYPE ceems_slurm_job_cpu_system_seconds gauge -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 115.777502 -# HELP ceems_slurm_job_cpu_user_seconds Total job CPU user seconds -# TYPE ceems_slurm_job_cpu_user_seconds gauge -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 -# HELP ceems_slurm_job_cpus Total number of job CPUs -# TYPE ceems_slurm_job_cpus gauge -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 -# HELP ceems_slurm_job_gpu_index_flag Indicates running job on GPU, 1=job running -# TYPE ceems_slurm_job_gpu_index_flag gauge -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="20170005280c",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="20180003050c",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc2",gpuuuid="20170000800c",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc3",gpuuuid="20170003580c",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1 -# HELP ceems_slurm_job_memory_cache_bytes Memory cache used in bytes -# TYPE ceems_slurm_job_memory_cache_bytes gauge -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_fail_count Memory fail count -# TYPE ceems_slurm_job_memory_fail_count gauge -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_rss_bytes Memory RSS used in bytes -# TYPE ceems_slurm_job_memory_rss_bytes gauge -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.098592768e+09 -# HELP ceems_slurm_job_memory_total_bytes Memory total in bytes -# TYPE ceems_slurm_job_memory_total_bytes gauge -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.294967296e+09 -# HELP ceems_slurm_job_memory_used_bytes Memory used in bytes -# TYPE ceems_slurm_job_memory_used_bytes gauge -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 -# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles -# TYPE ceems_slurm_job_rdma_hca_handles gauge -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 -ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 -# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects -# TYPE ceems_slurm_job_rdma_hca_objects gauge -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 -ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 -# HELP ceems_slurm_jobs Total number of jobs -# TYPE ceems_slurm_jobs gauge -ceems_slurm_jobs{hostname="",manager="slurm"} 3 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt index 573166b4..50d07189 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt @@ -1,3 +1,58 @@ +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total gauge +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 115.777502 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total gauge +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.098592768e+09 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.294967296e+09 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 +# HELP ceems_compute_unit_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_compute_unit_rdma_hca_handles gauge +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_compute_unit_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_compute_unit_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_compute_unit_rdma_hca_objects gauge +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_compute_unit_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge ceems_cpu_count{hostname=""} 8 @@ -47,61 +102,6 @@ ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 -# HELP ceems_slurm_job_cpu_system_seconds Total job CPU system seconds -# TYPE ceems_slurm_job_cpu_system_seconds gauge -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 115.777502 -# HELP ceems_slurm_job_cpu_user_seconds Total job CPU user seconds -# TYPE ceems_slurm_job_cpu_user_seconds gauge -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 -# HELP ceems_slurm_job_cpus Total number of job CPUs -# TYPE ceems_slurm_job_cpus gauge -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 -# HELP ceems_slurm_job_memory_cache_bytes Memory cache used in bytes -# TYPE ceems_slurm_job_memory_cache_bytes gauge -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_fail_count Memory fail count -# TYPE ceems_slurm_job_memory_fail_count gauge -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_rss_bytes Memory RSS used in bytes -# TYPE ceems_slurm_job_memory_rss_bytes gauge -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.098592768e+09 -# HELP ceems_slurm_job_memory_total_bytes Memory total in bytes -# TYPE ceems_slurm_job_memory_total_bytes gauge -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.294967296e+09 -# HELP ceems_slurm_job_memory_used_bytes Memory used in bytes -# TYPE ceems_slurm_job_memory_used_bytes gauge -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 -# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles -# TYPE ceems_slurm_job_rdma_hca_handles gauge -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 -ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 -# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects -# TYPE ceems_slurm_job_rdma_hca_objects gauge -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 -ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 -# HELP ceems_slurm_jobs Total number of jobs -# TYPE ceems_slurm_jobs gauge -ceems_slurm_jobs{hostname="",manager="slurm"} 3 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt index ff3d99f4..9da55051 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt @@ -1,3 +1,64 @@ +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total gauge +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 115.777502 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total gauge +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 60375.292848 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 2 +# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# TYPE ceems_compute_unit_gpu_index_flag gauge +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="1009248"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="1009248"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc2",gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="1009249"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc3",gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="1009250"} 1 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 0 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 4.098592768e+09 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 4.294967296e+09 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 4.111491072e+09 +# HELP ceems_compute_unit_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_compute_unit_rdma_hca_handles gauge +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 289 +ceems_compute_unit_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 1479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2479 +# HELP ceems_compute_unit_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_compute_unit_rdma_hca_objects gauge +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 289 +ceems_compute_unit_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 1479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2479 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge ceems_cpu_count{hostname=""} 8 @@ -47,67 +108,6 @@ ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 -# HELP ceems_slurm_job_cpu_system_seconds Total job CPU system seconds -# TYPE ceems_slurm_job_cpu_system_seconds gauge -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 115.777502 -# HELP ceems_slurm_job_cpu_user_seconds Total job CPU user seconds -# TYPE ceems_slurm_job_cpu_user_seconds gauge -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 60375.292848 -# HELP ceems_slurm_job_cpus Total number of job CPUs -# TYPE ceems_slurm_job_cpus gauge -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 2 -# HELP ceems_slurm_job_gpu_index_flag Indicates running job on GPU, 1=job running -# TYPE ceems_slurm_job_gpu_index_flag gauge -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="1009248"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="1009248"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc2",gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="1009249"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc3",gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="1009250"} 1 -# HELP ceems_slurm_job_memory_cache_bytes Memory cache used in bytes -# TYPE ceems_slurm_job_memory_cache_bytes gauge -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 0 -# HELP ceems_slurm_job_memory_fail_count Memory fail count -# TYPE ceems_slurm_job_memory_fail_count gauge -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 0 -# HELP ceems_slurm_job_memory_rss_bytes Memory RSS used in bytes -# TYPE ceems_slurm_job_memory_rss_bytes gauge -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 4.098592768e+09 -# HELP ceems_slurm_job_memory_total_bytes Memory total in bytes -# TYPE ceems_slurm_job_memory_total_bytes gauge -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 4.294967296e+09 -# HELP ceems_slurm_job_memory_used_bytes Memory used in bytes -# TYPE ceems_slurm_job_memory_used_bytes gauge -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 4.111491072e+09 -# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles -# TYPE ceems_slurm_job_rdma_hca_handles gauge -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 289 -ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 1479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2479 -# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects -# TYPE ceems_slurm_job_rdma_hca_objects gauge -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 289 -ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 1479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2479 -# HELP ceems_slurm_jobs Total number of jobs -# TYPE ceems_slurm_jobs gauge -ceems_slurm_jobs{hostname="",manager="slurm"} 3 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt b/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt index 6dcbbe3e..373b3ad9 100644 --- a/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt +++ b/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt @@ -1,3 +1,64 @@ +# HELP ceems_compute_unit_cpu_system_seconds_total Total job CPU system seconds +# TYPE ceems_compute_unit_cpu_system_seconds_total gauge +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 115.777502 +ceems_compute_unit_cpu_system_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 115.777502 +# HELP ceems_compute_unit_cpu_user_seconds_total Total job CPU user seconds +# TYPE ceems_compute_unit_cpu_user_seconds_total gauge +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 60375.292848 +ceems_compute_unit_cpu_user_seconds_total{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 +# HELP ceems_compute_unit_cpus Total number of job CPUs +# TYPE ceems_compute_unit_cpus gauge +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 +ceems_compute_unit_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 +# HELP ceems_compute_unit_gpu_index_flag Indicates running job on GPU, 1=job running +# TYPE ceems_compute_unit_gpu_index_flag gauge +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc2",gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1 +ceems_compute_unit_gpu_index_flag{account="testacc3",gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1 +# HELP ceems_compute_unit_memory_cache_bytes Memory cache used in bytes +# TYPE ceems_compute_unit_memory_cache_bytes gauge +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_fail_count Memory fail count +# TYPE ceems_compute_unit_memory_fail_count gauge +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 +ceems_compute_unit_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_compute_unit_memory_rss_bytes Memory RSS used in bytes +# TYPE ceems_compute_unit_memory_rss_bytes gauge +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.098592768e+09 +ceems_compute_unit_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.098592768e+09 +# HELP ceems_compute_unit_memory_total_bytes Memory total in bytes +# TYPE ceems_compute_unit_memory_total_bytes gauge +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.294967296e+09 +ceems_compute_unit_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.294967296e+09 +# HELP ceems_compute_unit_memory_used_bytes Memory used in bytes +# TYPE ceems_compute_unit_memory_used_bytes gauge +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 +ceems_compute_unit_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 +# HELP ceems_compute_unit_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_compute_unit_rdma_hca_handles gauge +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_compute_unit_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_compute_unit_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_compute_unit_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_compute_unit_rdma_hca_objects gauge +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_compute_unit_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_compute_unit_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_compute_units Total number of jobs +# TYPE ceems_compute_units gauge +ceems_compute_units{hostname="",manager="slurm"} 3 # HELP ceems_cpu_count Number of CPUs. # TYPE ceems_cpu_count gauge ceems_cpu_count{hostname=""} 8 @@ -47,67 +108,6 @@ ceems_scrape_collector_success{collector="ipmi_dcmi"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 ceems_scrape_collector_success{collector="rapl"} 1 ceems_scrape_collector_success{collector="slurm"} 1 -# HELP ceems_slurm_job_cpu_system_seconds Total job CPU system seconds -# TYPE ceems_slurm_job_cpu_system_seconds gauge -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 115.777502 -ceems_slurm_job_cpu_system_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 115.777502 -# HELP ceems_slurm_job_cpu_user_seconds Total job CPU user seconds -# TYPE ceems_slurm_job_cpu_user_seconds gauge -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 60375.292848 -ceems_slurm_job_cpu_user_seconds{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 60375.292848 -# HELP ceems_slurm_job_cpus Total number of job CPUs -# TYPE ceems_slurm_job_cpus gauge -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2 -ceems_slurm_job_cpus{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 2 -# HELP ceems_slurm_job_gpu_index_flag Indicates running job on GPU, 1=job running -# TYPE ceems_slurm_job_gpu_index_flag gauge -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3",hindex="-gpu-3",hostname="",index="3",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc",gpuuuid="GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3",hindex="-gpu-2",hostname="",index="2",manager="slurm",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc2",gpuuuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e",hindex="-gpu-0",hostname="",index="0",manager="slurm",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1 -ceems_slurm_job_gpu_index_flag{account="testacc3",gpuuuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3",hindex="-gpu-1",hostname="",index="1",manager="slurm",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 1 -# HELP ceems_slurm_job_memory_cache_bytes Memory cache used in bytes -# TYPE ceems_slurm_job_memory_cache_bytes gauge -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_cache_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_fail_count Memory fail count -# TYPE ceems_slurm_job_memory_fail_count gauge -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 -ceems_slurm_job_memory_fail_count{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 -# HELP ceems_slurm_job_memory_rss_bytes Memory RSS used in bytes -# TYPE ceems_slurm_job_memory_rss_bytes gauge -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.098592768e+09 -ceems_slurm_job_memory_rss_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.098592768e+09 -# HELP ceems_slurm_job_memory_total_bytes Memory total in bytes -# TYPE ceems_slurm_job_memory_total_bytes gauge -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.294967296e+09 -ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.294967296e+09 -# HELP ceems_slurm_job_memory_used_bytes Memory used in bytes -# TYPE ceems_slurm_job_memory_used_bytes gauge -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 -ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 -# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles -# TYPE ceems_slurm_job_rdma_hca_handles gauge -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 -ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 -ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 -# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects -# TYPE ceems_slurm_job_rdma_hca_objects gauge -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 -ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 -ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 -# HELP ceems_slurm_jobs Total number of jobs -# TYPE ceems_slurm_jobs gauge -ceems_slurm_jobs{hostname="",manager="slurm"} 3 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist.