Skip to content

Commit

Permalink
Use consistent CLI flags for exporter (#144)
Browse files Browse the repository at this point in the history
* refactor: Use consistent CLI flags for exporter

* Add deprecation notices for old flags when used

* Update docs with new flags

* refactor: Make starting debug server configurable

* Run debug server on separate port only on localhost by default

---------

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri authored Aug 18, 2024
1 parent 2a6b79f commit 3a71e26
Show file tree
Hide file tree
Showing 11 changed files with 270 additions and 174 deletions.
24 changes: 12 additions & 12 deletions etc/slurm/README.md
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
# SLURM epilog and prolog scripts

CEEMS exporter needs to perform a few privileged actions to collect certain information of
CEEMS exporter needs to perform a few privileged actions to collect certain information of
compute units. An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/main/build/package/ceems_exporter/ceems_exporter.service)
provided in the repo shows the linux capabilities necessary for these privileged actions.
provided in the repo shows the linux capabilities necessary for these privileged actions.

If the operators would like to avoid privileges on CEEMS exporter and run it fully in
userland an alternative approach, in SLURM context, is to use Epilog and Prolog scripts
to write the necessary job information to a file that is readable by CEEMS exporter.
If the operators would like to avoid privileges on CEEMS exporter and run it fully in
userland an alternative approach, in SLURM context, is to use Epilog and Prolog scripts
to write the necessary job information to a file that is readable by CEEMS exporter.
This directory provides those scripts that should be used with SLURM.

An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/main/init/systemd/ceems_exporter_no_privs.service)
is also provided in the repo that can be used along with these prolog and epilog scripts.
An example [systemd service file](https://github.com/mahendrapaipuri/ceems/blob/main/init/systemd/ceems_exporter_no_privs.service)
is also provided in the repo that can be used along with these prolog and epilog scripts.

> [!IMPORTANT]
> The CLI arguments `--collector.slurm.job.props.path` and `--collector.slurm.gpu.job.map.path`
are hidden and cannot be seen in `ceems_exporter --help` output. However, these arguments
> The CLI arguments `--collector.slurm.job-props-path` and `--collector.slurm.gpu-job-map-path`
are hidden and cannot be seen in `ceems_exporter --help` output. However, these arguments
exist in the exporter and can be used.

Even with such prolog and epilog scripts, operators should grant the user running CEEMS
exporter permissions to run `ipmi-dcmi` command as this command can be executed only by
`root` by default.
Even with such prolog and epilog scripts, operators should grant the user running CEEMS
exporter permissions to run `ipmi-dcmi` command as this command can be executed only by
`root` by default.
2 changes: 1 addition & 1 deletion etc/slurm/prolog.d/gpujobmap.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Need to use this path in --collector.nvidia.gpu.job.map.path flag for ceems_exporter
# Need to use this path in --collector.slurm.gpu-job-map-path flag for ceems_exporter
DEST=/run/gpujobmap
[ -e $DEST ] || mkdir -m 755 $DEST

Expand Down
2 changes: 1 addition & 1 deletion etc/slurm/prolog.d/slurmjobprops.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

# Need to use this path in --collector.slurm.job.props.path flag for ceems_exporter
# Need to use this path in --collector.slurm.job-props-path flag for ceems_exporter
DEST=/run/slurmjobprops
[ -e $DEST ] || mkdir -m 755 $DEST

Expand Down
40 changes: 38 additions & 2 deletions pkg/collector/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ package collector

import (
"fmt"
std_log "log"
"net/http"
_ "net/http/pprof" // #nosec
"net/http/pprof"
"os"
"os/user"
"runtime"
Expand Down Expand Up @@ -97,12 +98,20 @@ func (b *CEEMSExporter) Main() error {
maxProcs = b.App.Flag(
"runtime.gomaxprocs", "The target number of CPUs Go will run on (GOMAXPROCS)",
).Envar("GOMAXPROCS").Default("1").Int()
enableDebugServer = b.App.Flag(
"web.debug-server",
"Enable debug server (default: disabled).",
).Default("false").Bool()
debugServerAddr = b.App.Flag(
"web.debug-server.listen-address",
"Address on which debug server will be exposed. Running debug server on localhost is strongly recommended.",
).Default("localhost:8010").String()
toolkitFlags = kingpinflag.AddFlags(&b.App, ":9010")
)

// This is hidden flag only used for e2e testing
emptyHostnameLabel = b.App.Flag(
"collector.empty.hostname.label",
"collector.empty-hostname-label",
"Use empty hostname in labels. Only for testing. (default is disabled)",
).Hidden().Default("false").Bool()

Expand Down Expand Up @@ -145,6 +154,33 @@ func (b *CEEMSExporter) Main() error {
runtime.GOMAXPROCS(*maxProcs)
level.Debug(logger).Log("msg", "Go MAXPROCS", "procs", runtime.GOMAXPROCS(0))

// Reset default routes (removing access to profiling)
http.DefaultServeMux = http.NewServeMux()

if *enableDebugServer {
// Recreating routes to profiling manually
pprofServeMux := http.NewServeMux()
pprofServeMux.HandleFunc("/debug/pprof/", pprof.Index)
pprofServeMux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
pprofServeMux.HandleFunc("/debug/pprof/profile", pprof.Profile)
pprofServeMux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)

go func() {
debugServer := &http.Server{
// slowloris attack: https://app.deepsource.com/directory/analyzers/go/issues/GO-S2112
ReadHeaderTimeout: 2 * time.Second,
// Only use routes for the profiling interface
Handler: pprofServeMux,
// Exposing them on loopback on a specific port for debugging access
Addr: *debugServerAddr,
}

if err := debugServer.ListenAndServe(); err != nil {
std_log.Println("Failed to start debug server", "err", err)
}
}()
}

http.Handle(*metricsPath, b.newHandler(!*disableExporterMetrics, *maxRequests, logger))

if *metricsPath != "/" {
Expand Down
18 changes: 15 additions & 3 deletions pkg/collector/ipmi.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,13 @@ type impiCollector struct {
*/

var (
ipmiDcmiCmd = CEEMSExporterApp.Flag(
ipmiDcmiCmdDepr = CEEMSExporterApp.Flag(
"collector.ipmi.dcmi.cmd",
"IPMI DCMI command to get system power statistics. Use full path to executables.",
).Hidden().Default("").String()
ipmiDcmiCmd = CEEMSExporterApp.Flag(
"collector.ipmi_dcmi.cmd",
"IPMI DCMI command to get system power statistics. Use full path to executables.",
).Default("").String()

ipmiDcmiCmds = []string{
Expand Down Expand Up @@ -165,6 +169,10 @@ func init() {

// NewIPMICollector returns a new Collector exposing IPMI DCMI power metrics.
func NewIPMICollector(logger log.Logger) (Collector, error) {
if *ipmiDcmiCmdDepr != "" {
level.Warn(logger).Log("msg", "flag --collector.ipmi.dcmi.cmd has been deprecated. Use --collector.ipmi_dcmi.cmd instead.")
}

var execMode string

// Initialize metricDesc map
Expand All @@ -191,10 +199,14 @@ func NewIPMICollector(logger log.Logger) (Collector, error) {

// If no IPMI command is provided, try to find one
var cmdSlice []string
if *ipmiDcmiCmd == "" {
if *ipmiDcmiCmd == "" && *ipmiDcmiCmdDepr == "" {
cmdSlice = findIPMICmd()
} else {
cmdSlice = strings.Split(*ipmiDcmiCmd, " ")
if *ipmiDcmiCmdDepr != "" {
cmdSlice = strings.Split(*ipmiDcmiCmdDepr, " ")
} else {
cmdSlice = strings.Split(*ipmiDcmiCmd, " ")
}
}

level.Debug(logger).Log(
Expand Down
20 changes: 15 additions & 5 deletions pkg/collector/meminfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,27 @@ type meminfoCollector struct {
hostname string
}

var meminfoAllStatistics = CEEMSExporterApp.Flag(
"collector.meminfo.all.stats",
"Enable collecting all meminfo stats (default is disabled).",
).Default("false").Bool()
var (
meminfoAllStatisticsDepr = CEEMSExporterApp.Flag(
"collector.meminfo.all.stats",
"Enable collecting all meminfo stats (default: disabled).",
).Hidden().Default("false").Bool()
meminfoAllStatistics = CEEMSExporterApp.Flag(
"collector.meminfo.all-stats",
"Enable collecting all meminfo stats (default: disabled).",
).Default("false").Bool()
)

func init() {
RegisterCollector(memInfoSubsystem, defaultEnabled, NewMeminfoCollector)
}

// NewMeminfoCollector returns a new Collector exposing memory stats.
func NewMeminfoCollector(logger log.Logger) (Collector, error) {
if *meminfoAllStatisticsDepr {
level.Warn(logger).Log("msg", "flag --collector.meminfo.all.stats has been deprecated. Use --collector.meminfo.all-stats instead")
}

return &meminfoCollector{
logger: logger,
hostname: hostname,
Expand All @@ -56,7 +66,7 @@ func (c *meminfoCollector) Update(ch chan<- prometheus.Metric) error {

// Export only MemTotal, MemFree and MemAvailable fields if meminfoAllStatistics is false
var memInfoStats map[string]float64
if *meminfoAllStatistics {
if *meminfoAllStatistics || *meminfoAllStatisticsDepr {
memInfoStats = memInfo
} else {
memInfoStats = map[string]float64{
Expand Down
61 changes: 45 additions & 16 deletions pkg/collector/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,43 +31,51 @@ const (
)

var (
metricLock = sync.RWMutex{}
collectSwapMemoryStats = CEEMSExporterApp.Flag(
metricLock = sync.RWMutex{}
collectSwapMemoryStatsDepre = CEEMSExporterApp.Flag(
"collector.slurm.swap.memory.metrics",
"Enables collection of swap memory metrics (default: disabled)",
).Default("false").Hidden().Bool()
collectSwapMemoryStats = CEEMSExporterApp.Flag(
"collector.slurm.swap-memory-metrics",
"Enables collection of swap memory metrics (default: disabled)",
).Default("false").Bool()
collectPSIStats = CEEMSExporterApp.Flag(
collectPSIStatsDepre = CEEMSExporterApp.Flag(
"collector.slurm.psi.metrics",
"Enables collection of PSI metrics (default: disabled)",
).Default("false").Hidden().Bool()
collectPSIStats = CEEMSExporterApp.Flag(
"collector.slurm.psi-metrics",
"Enables collection of PSI metrics (default: disabled)",
).Default("false").Bool()
useJobIDHash = CEEMSExporterApp.Flag(
"collector.slurm.create.unique.jobids",
"collector.slurm.create-unique-jobids",
`Enables calculation of a unique hash based job UUID (default: disabled).
UUID is calculated based on SLURM_JOBID, SLURM_JOB_USER, SLURM_JOB_ACCOUNT, SLURM_JOB_NODELIST.`,
).Default("false").Hidden().Bool()
gpuType = CEEMSExporterApp.Flag(
"collector.slurm.gpu.type",
"collector.slurm.gpu-type",
"GPU device type. Currently only nvidia and amd devices are supported.",
).Enum("nvidia", "amd")
).Hidden().Enum("nvidia", "amd")
jobStatPath = CEEMSExporterApp.Flag(
"collector.slurm.job.props.path",
"collector.slurm.job-props-path",
`Directory containing files with job properties. Files should be named after SLURM_JOBID
with contents as "$SLURM_JOB_USER $SLURM_JOB_ACCOUNT $SLURM_JOB_NODELIST" in the same order.`,
).Default("/run/slurmjobprops").Hidden().String()
gpuStatPath = CEEMSExporterApp.Flag(
"collector.slurm.gpu.job.map.path",
"collector.slurm.gpu-job-map-path",
"Path to file that maps GPU ordinals to job IDs.",
).Default("/run/gpujobmap").Hidden().String()
forceCgroupsVersion = CEEMSExporterApp.Flag(
"collector.slurm.force.cgroups.version",
"collector.slurm.force-cgroups-version",
"Set cgroups version manually. Used only for testing.",
).Hidden().Enum("v1", "v2")
nvidiaSmiPath = CEEMSExporterApp.Flag(
"collector.slurm.nvidia.smi.path",
"collector.slurm.nvidia-smi-path",
"Absolute path to nvidia-smi binary. Use only for testing.",
).Hidden().Default("").String()
rocmSmiPath = CEEMSExporterApp.Flag(
"collector.slurm.rocm.smi.path",
"collector.slurm.rocm-smi-path",
"Absolute path to rocm-smi binary. Use only for testing.",
).Hidden().Default("").String()
)
Expand Down Expand Up @@ -161,6 +169,15 @@ func init() {

// NewSlurmCollector returns a new Collector exposing a summary of cgroups.
func NewSlurmCollector(logger log.Logger) (Collector, error) {
// Log deprecation notices
if *collectPSIStatsDepre {
level.Warn(logger).Log("msg", "flag --collector.slurm.psi.metrics has been deprecated. Use --collector.slurm.psi-metrics instead")
}

if *collectSwapMemoryStatsDepre {
level.Warn(logger).Log("msg", "flag --collector.slurm.swap.memory.metrics has been deprecated. Use --collector.slurm.swap-memory-metrics instead")
}

var cgroupsVersion string

var cgroupsRootPath string
Expand Down Expand Up @@ -196,13 +213,25 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
}

// Attempt to get GPU devices
var gpuTypes []string

var gpuDevs map[int]Device

var err error

gpuDevs, err = GetGPUDevices(*gpuType, logger)
if err == nil {
level.Info(logger).Log("msg", "GPU devices found")
if *gpuType != "" {
gpuTypes = []string{*gpuType}
} else {
gpuTypes = []string{"nvidia", "amd"}
}

for _, gpuType := range gpuTypes {
gpuDevs, err = GetGPUDevices(gpuType, logger)
if err == nil {
level.Info(logger).Log("msg", "GPU devices found", "type", gpuType)

break
}
}

// Get total memory of host
Expand Down Expand Up @@ -405,13 +434,13 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error {
ch <- prometheus.MustNewConstMetric(c.jobMemoryFailCount, prometheus.GaugeValue, m.memoryFailCount, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)

// Swap memory and PSI stats. Push them only if their collection is enabled
if *collectSwapMemoryStats {
if *collectSwapMemoryStatsDepre || *collectSwapMemoryStats {
ch <- prometheus.MustNewConstMetric(c.jobMemswUsed, prometheus.GaugeValue, m.memswUsed, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
ch <- prometheus.MustNewConstMetric(c.jobMemswTotal, prometheus.GaugeValue, m.memswTotal, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
ch <- prometheus.MustNewConstMetric(c.jobMemswFailCount, prometheus.GaugeValue, m.memswFailCount, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
}

if *collectPSIStats {
if *collectPSIStatsDepre || *collectPSIStats {
ch <- prometheus.MustNewConstMetric(c.jobCPUPressure, prometheus.GaugeValue, m.cpuPressure, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
ch <- prometheus.MustNewConstMetric(c.jobMemoryPressure, prometheus.GaugeValue, m.memoryPressure, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid)
}
Expand Down
20 changes: 10 additions & 10 deletions pkg/collector/slurm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ func TestNewSlurmCollector(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.job.props.path", "testdata/slurmjobprops",
"--collector.slurm.gpu.job.map.path", "testdata/gpujobmap",
"--collector.slurm.create-unique-jobids",
"--collector.slurm.job-props-path", "testdata/slurmjobprops",
"--collector.slurm.gpu-job-map-path", "testdata/gpujobmap",
},
)
require.NoError(t, err)
Expand All @@ -45,9 +45,9 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.job.props.path", "testdata/slurmjobprops",
"--collector.slurm.gpu.job.map.path", "testdata/gpujobmap",
"--collector.slurm.create-unique-jobids",
"--collector.slurm.job-props-path", "testdata/slurmjobprops",
"--collector.slurm.gpu-job-map-path", "testdata/gpujobmap",
},
)
require.NoError(t, err)
Expand Down Expand Up @@ -95,7 +95,7 @@ func TestCgroupsV2SlurmJobMetricsWithProcFs(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.create-unique-jobids",
"--path.procfs", "testdata/proc",
},
)
Expand Down Expand Up @@ -144,7 +144,7 @@ func TestCgroupsV2SlurmJobMetricsNoJobProps(t *testing.T) {
_, err := CEEMSExporterApp.Parse(
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.create-unique-jobids",
},
)
require.NoError(t, err)
Expand Down Expand Up @@ -191,8 +191,8 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) {
[]string{
"--path.cgroupfs", "testdata/sys/fs/cgroup",
"--path.procfs", "testdata/proc",
"--collector.slurm.create.unique.jobids",
"--collector.slurm.job.props.path", "testdata/slurmjobprops",
"--collector.slurm.create-unique-jobids",
"--collector.slurm.job-props-path", "testdata/slurmjobprops",
},
)
require.NoError(t, err)
Expand Down
Loading

0 comments on commit 3a71e26

Please sign in to comment.