Skip to content

Commit

Permalink
refactor: Use consistent CLI flags for exporter
Browse files Browse the repository at this point in the history
* Add deprecate notices for old flags when used

* Update docs with new flags

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri committed Aug 17, 2024
1 parent 7429f2c commit 1dc5ff1
Show file tree
Hide file tree
Showing 11 changed files with 233 additions and 173 deletions.
24 changes: 12 additions & 12 deletions etc/slurm/README.md
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
# SLURM epilog and prolog scripts

CEEMS exporter needs to perform a few privileged actions to collect certain information about
CEEMS exporter needs to perform a few privileged actions to collect certain information about
compute units. An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/main/build/package/ceems_exporter/ceems_exporter.service)
provided in the repo shows the linux capabilities necessary for these privileged actions.
provided in the repo shows the linux capabilities necessary for these privileged actions.

If operators would like to avoid granting privileges to CEEMS exporter and run it fully in
userland, an alternative approach, in the SLURM context, is to use Epilog and Prolog scripts
to write the necessary job information to a file that is readable by CEEMS exporter.
If operators would like to avoid granting privileges to CEEMS exporter and run it fully in
userland, an alternative approach, in the SLURM context, is to use Epilog and Prolog scripts
to write the necessary job information to a file that is readable by CEEMS exporter.
This directory provides those scripts that should be used with SLURM.

An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/main/init/systemd/ceems_exporter_no_privs.service)
is also provided in the repo that can be used along with these prolog and epilog scripts.
An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/main/init/systemd/ceems_exporter_no_privs.service)
is also provided in the repo that can be used along with these prolog and epilog scripts.

> [!IMPORTANT]
> The CLI arguments `--collector.slurm.job.props.path` and `--collector.slurm.gpu.job.map.path`
are hidden and cannot be seen in `ceems_exporter --help` output. However, these arguments
> The CLI arguments `--collector.slurm.job-props-path` and `--collector.slurm.gpu-job-map-path`
are hidden and cannot be seen in `ceems_exporter --help` output. However, these arguments
exist in the exporter and can be used.

Even with such prolog and epilog scripts, operators should grant the user running CEEMS
exporter permission to run the `ipmi-dcmi` command, as this command can be executed only by
`root` by default.
Even with such prolog and epilog scripts, operators should grant the user running CEEMS
exporter permission to run the `ipmi-dcmi` command, as this command can be executed only by
`root` by default.
2 changes: 1 addition & 1 deletion etc/slurm/prolog.d/gpujobmap.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Need to use this path in --collector.nvidia.gpu.job.map.path flag for ceems_exporter
# Need to use this path in --collector.slurm.gpu-job-map-path flag for ceems_exporter
DEST=/run/gpujobmap
[ -e $DEST ] || mkdir -m 755 $DEST

Expand Down
2 changes: 1 addition & 1 deletion etc/slurm/prolog.d/slurmjobprops.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Need to use this path in --collector.slurm.job.props.path flag for ceems_exporter
# Need to use this path in --collector.slurm.job-props-path flag for ceems_exporter
DEST=/run/slurmjobprops
[ -e $DEST ] || mkdir -m 755 $DEST

Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ func (b *CEEMSExporter) Main() error {

// This is hidden flag only used for e2e testing
emptyHostnameLabel = b.App.Flag(
"collector.empty.hostname.label",
"collector.empty-hostname-label",
"Use empty hostname in labels. Only for testing. (default is disabled)",
).Hidden().Default("false").Bool()

Expand Down
18 changes: 15 additions & 3 deletions pkg/collector/ipmi.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,13 @@ type impiCollector struct {
*/

var (
ipmiDcmiCmd = CEEMSExporterApp.Flag(
ipmiDcmiCmdDepr = CEEMSExporterApp.Flag(
"collector.ipmi.dcmi.cmd",
"IPMI DCMI command to get system power statistics. Use full path to executables.",
).Hidden().Default("").String()
ipmiDcmiCmd = CEEMSExporterApp.Flag(
"collector.ipmi_dcmi.cmd",
"IPMI DCMI command to get system power statistics. Use full path to executables.",
).Default("").String()

ipmiDcmiCmds = []string{
Expand Down Expand Up @@ -165,6 +169,10 @@ func init() {

// NewIPMICollector returns a new Collector exposing IPMI DCMI power metrics.
func NewIPMICollector(logger log.Logger) (Collector, error) {
if *ipmiDcmiCmdDepr != "" {
level.Warn(logger).Log("msg", "flag --collector.ipmi.dcmi.cmd has been deprecated. Use --collector.ipmi_dcmi.cmd instead.")
}

var execMode string

// Initialize metricDesc map
Expand All @@ -191,10 +199,14 @@ func NewIPMICollector(logger log.Logger) (Collector, error) {

// If no IPMI command is provided, try to find one
var cmdSlice []string
if *ipmiDcmiCmd == "" {
if *ipmiDcmiCmd == "" && *ipmiDcmiCmdDepr == "" {
cmdSlice = findIPMICmd()
} else {
cmdSlice = strings.Split(*ipmiDcmiCmd, " ")
if *ipmiDcmiCmdDepr != "" {
cmdSlice = strings.Split(*ipmiDcmiCmdDepr, " ")
} else {
cmdSlice = strings.Split(*ipmiDcmiCmd, " ")
}
}

level.Debug(logger).Log(
Expand Down
20 changes: 15 additions & 5 deletions pkg/collector/meminfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,27 @@ type meminfoCollector struct {
hostname string
}

var meminfoAllStatistics = CEEMSExporterApp.Flag(
"collector.meminfo.all.stats",
"Enable collecting all meminfo stats (default is disabled).",
).Default("false").Bool()
var (
meminfoAllStatisticsDepr = CEEMSExporterApp.Flag(
"collector.meminfo.all.stats",
"Enable collecting all meminfo stats (default is disabled).",
).Hidden().Default("false").Bool()
meminfoAllStatistics = CEEMSExporterApp.Flag(
"collector.meminfo.all-stats",
"Enable collecting all meminfo stats (default is disabled).",
).Default("false").Bool()
)

func init() {
RegisterCollector(memInfoSubsystem, defaultEnabled, NewMeminfoCollector)
}

// NewMeminfoCollector returns a new Collector exposing memory stats.
func NewMeminfoCollector(logger log.Logger) (Collector, error) {
if *meminfoAllStatisticsDepr {
level.Warn(logger).Log("msg", "flag --collector.meminfo.all.stats has been deprecated. Use --collector.meminfo.all-stats instead")
}

return &meminfoCollector{
logger: logger,
hostname: hostname,
Expand All @@ -56,7 +66,7 @@ func (c *meminfoCollector) Update(ch chan<- prometheus.Metric) error {

// Export only MemTotal, MemFree and MemAvailable fields if meminfoAllStatistics is false
var memInfoStats map[string]float64
if *meminfoAllStatistics {
if *meminfoAllStatistics || *meminfoAllStatisticsDepr {
memInfoStats = memInfo
} else {
memInfoStats = map[string]float64{
Expand Down
61 changes: 45 additions & 16 deletions pkg/collector/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,43 +31,51 @@ const (
)

var (
metricLock = sync.RWMutex{}
collectSwapMemoryStats = CEEMSExporterApp.Flag(
metricLock = sync.RWMutex{}
collectSwapMemoryStatsDepre = CEEMSExporterApp.Flag(
"collector.slurm.swap.memory.metrics",
"Enables collection of swap memory metrics (default: disabled)",
).Default("false").Hidden().Bool()
collectSwapMemoryStats = CEEMSExporterApp.Flag(
"collector.slurm.swap-memory-metrics",
"Enables collection of swap memory metrics (default: disabled)",
).Default("false").Bool()
collectPSIStats = CEEMSExporterApp.Flag(
collectPSIStatsDepre = CEEMSExporterApp.Flag(
"collector.slurm.psi.metrics",
"Enables collection of PSI metrics (default: disabled)",
).Default("false").Hidden().Bool()
collectPSIStats = CEEMSExporterApp.Flag(
"collector.slurm.psi-metrics",
"Enables collection of PSI metrics (default: disabled)",
).Default("false").Bool()
useJobIDHash = CEEMSExporterApp.Flag(
"collector.slurm.create.unique.jobids",
"collector.slurm.create-unique-jobids",
`Enables calculation of a unique hash based job UUID (default: disabled).
UUID is calculated based on SLURM_JOBID, SLURM_JOB_USER, SLURM_JOB_ACCOUNT, SLURM_JOB_NODELIST.`,
).Default("false").Hidden().Bool()
gpuType = CEEMSExporterApp.Flag(
"collector.slurm.gpu.type",
"collector.slurm.gpu-type",
"GPU device type. Currently only nvidia and amd devices are supported.",
).Enum("nvidia", "amd")
).Hidden().Enum("nvidia", "amd")
jobStatPath = CEEMSExporterApp.Flag(
"collector.slurm.job.props.path",
"collector.slurm.job-props-path",
`Directory containing files with job properties. Files should be named after SLURM_JOBID
with contents as "$SLURM_JOB_USER $SLURM_JOB_ACCOUNT $SLURM_JOB_NODELIST" in the same order.`,
).Default("/run/slurmjobprops").Hidden().String()
gpuStatPath = CEEMSExporterApp.Flag(
"collector.slurm.gpu.job.map.path",
"collector.slurm.gpu-job-map-path",
"Path to file that maps GPU ordinals to job IDs.",
).Default("/run/gpujobmap").Hidden().String()
forceCgroupsVersion = CEEMSExporterApp.Flag(
"collector.slurm.force.cgroups.version",
"collector.slurm.force-cgroups-version",
"Set cgroups version manually. Used only for testing.",
).Hidden().Enum("v1", "v2")
nvidiaSmiPath = CEEMSExporterApp.Flag(
"collector.slurm.nvidia.smi.path",
"collector.slurm.nvidia-smi-path",
"Absolute path to nvidia-smi binary. Use only for testing.",
).Hidden().Default("").String()
rocmSmiPath = CEEMSExporterApp.Flag(
"collector.slurm.rocm.smi.path",
"collector.slurm.rocm-smi-path",
"Absolute path to rocm-smi binary. Use only for testing.",
).Hidden().Default("").String()
)
Expand Down Expand Up @@ -161,6 +169,15 @@ func init() {

// NewSlurmCollector returns a new Collector exposing a summary of cgroups.
func NewSlurmCollector(logger log.Logger) (Collector, error) {
// Log deprecation notices
if *collectPSIStatsDepre {
level.Warn(logger).Log("msg", "flag --collector.slurm.psi.metrics has been deprecated. Use --collector.slurm.psi-metrics instead")
}

if *collectSwapMemoryStatsDepre {
level.Warn(logger).Log("msg", "flag --collector.slurm.swap.memory.metrics has been deprecated. Use --collector.slurm.swap-memory-metrics instead")
}

var cgroupsVersion string

var cgroupsRootPath string
Expand Down Expand Up @@ -196,13 +213,25 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
}

// Attempt to get GPU devices
var gpuTypes []string

var gpuDevs map[int]Device

var err error

gpuDevs, err = GetGPUDevices(*gpuType, logger)
if err == nil {
level.Info(logger).Log("msg", "GPU devices found")
if *gpuType != "" {
gpuTypes = []string{*gpuType}
} else {
gpuTypes = []string{"nvidia", "amd"}
}

for _, gpuType := range gpuTypes {
gpuDevs, err = GetGPUDevices(gpuType, logger)
if err == nil {
level.Info(logger).Log("msg", "GPU devices found", "type", gpuType)

break
}
}

// Get total memory of host
Expand Down Expand Up @@ -405,13 +434,13 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error {
ch <- prometheus.MustNewConstMetric(c.jobMemoryFailCount, prometheus.GaugeValue, m.memoryFailCount, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)

// PSI stats. Push them only if they are available
if *collectSwapMemoryStats {
if *collectSwapMemoryStatsDepre || *collectSwapMemoryStats {
ch <- prometheus.MustNewConstMetric(c.jobMemswUsed, prometheus.GaugeValue, m.memswUsed, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
ch <- prometheus.MustNewConstMetric(c.jobMemswTotal, prometheus.GaugeValue, m.memswTotal, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
ch <- prometheus.MustNewConstMetric(c.jobMemswFailCount, prometheus.GaugeValue, m.memswFailCount, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
}

if *collectPSIStats {
if *collectPSIStatsDepre || *collectPSIStats {
ch <- prometheus.MustNewConstMetric(c.jobCPUPressure, prometheus.GaugeValue, m.cpuPressure, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
ch <- prometheus.MustNewConstMetric(c.jobMemoryPressure, prometheus.GaugeValue, m.memoryPressure, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
}
Expand Down
20 changes: 10 additions & 10 deletions pkg/collector/slurm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ func TestNewSlurmCollector(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.job.props.path", "testdata/slurmjobprops",
"--collector.slurm.gpu.job.map.path", "testdata/gpujobmap",
"--collector.slurm.create-unique-jobids",
"--collector.slurm.job-props-path", "testdata/slurmjobprops",
"--collector.slurm.gpu-job-map-path", "testdata/gpujobmap",
},
)
require.NoError(t, err)
Expand All @@ -45,9 +45,9 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.job.props.path", "testdata/slurmjobprops",
"--collector.slurm.gpu.job.map.path", "testdata/gpujobmap",
"--collector.slurm.create-unique-jobids",
"--collector.slurm.job-props-path", "testdata/slurmjobprops",
"--collector.slurm.gpu-job-map-path", "testdata/gpujobmap",
},
)
require.NoError(t, err)
Expand Down Expand Up @@ -95,7 +95,7 @@ func TestCgroupsV2SlurmJobMetricsWithProcFs(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.create-unique-jobids",
"--path.procfs", "testdata/proc",
},
)
Expand Down Expand Up @@ -144,7 +144,7 @@ func TestCgroupsV2SlurmJobMetricsNoJobProps(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.create-unique-jobids",
},
)
require.NoError(t, err)
Expand Down Expand Up @@ -191,8 +191,8 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) {
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--path.procfs", "testdata/proc",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.job.props.path", "testdata/slurmjobprops",
"--collector.slurm.create-unique-jobids",
"--collector.slurm.job-props-path", "testdata/slurmjobprops",
},
)
require.NoError(t, err)
Expand Down
Loading

0 comments on commit 1dc5ff1

Please sign in to comment.