diff --git a/Makefile b/Makefile index 485cb94..e1c7883 100644 --- a/Makefile +++ b/Makefile @@ -109,7 +109,7 @@ $(eval $(call goarch_pair,amd64,386)) $(eval $(call goarch_pair,mips64,mips)) $(eval $(call goarch_pair,mips64el,mipsel)) -all:: vet common-all $(cross-test) $(test-docker) $(checkmetrics) $(checkrules) $(checkbpf) $(test-e2e) +all:: vet common-all $(checkbpf) $(cross-test) $(test-docker) $(checkmetrics) $(checkrules) $(test-e2e) .PHONY: coverage coverage: @@ -158,6 +158,8 @@ test-e2e: build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc ./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics ./scripts/e2e-test.sh -s exporter-cgroups-v1-libvirt ./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt + ./scripts/e2e-test.sh -s discoverer-cgroups-v2-slurm + ./scripts/e2e-test.sh -s discoverer-cgroups-v1-slurm else .PHONY: test-e2e test-e2e: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked @@ -210,6 +212,8 @@ test-e2e-update: build pkg/collector/testdata/sys/.unpacked pkg/collector/testda ./scripts/e2e-test.sh -s exporter-cgroups-v2-all-metrics -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v1-libvirt -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-libvirt -u || true + ./scripts/e2e-test.sh -s discoverer-cgroups-v2-slurm -u || true + ./scripts/e2e-test.sh -s discoverer-cgroups-v1-slurm -u || true else .PHONY: test-e2e-update test-e2e-update: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/collector/testdata/proc/.unpacked @@ -255,7 +259,7 @@ skip-test-e2e: .PHONY: checkmetrics checkmetrics: $(PROMTOOL) @echo ">> checking metrics for correctness" - ./scripts/checkmetrics.sh $(PROMTOOL) $(e2e-out) + ./scripts/checkmetrics.sh $(PROMTOOL) $(e2e-out)/exporter .PHONY: skip-checkmetrics skip-checkmetrics: $(PROMTOOL) diff --git a/pkg/collector/alloy_targets.go b/pkg/collector/alloy_targets.go new file mode 100644 index 0000000..c0a13ec --- /dev/null +++ b/pkg/collector/alloy_targets.go @@ -0,0 +1,317 @@ +package collector + +import ( + "encoding/json" + "fmt" + "net/http" + "strconv" + "strings" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/mahendrapaipuri/ceems/internal/security" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/procfs" +) + +// CLI opts. +var ( + cgManager = CEEMSExporterApp.Flag( + "discoverer.alloy-targets.resource-manager", + "Discover Grafana Alloy targets from this resource manager [supported: slurm and libvirt].", + ).Enum("slurm", "libvirt") + alloyTargetEnvVars = CEEMSExporterApp.Flag( + "discoverer.alloy-targets.env-var", + "Enable continuous profiling by Pyroscope only on the processes having any of these environment variables.", + ).Strings() +) + +const ( + contentTypeHeader = "Content-Type" + contentType = "application/json" +) + +const ( + alloyTargetDiscovererSubSystem = "alloy_targets" +) + +// Security context names. +const ( + alloyTargetDiscovererCtx = "alloy_targets_discoverer" +) + +// alloyTargetDiscovererSecurityCtxData contains the input/output data for +// discoverer function to execute inside security context. +type alloyTargetDiscovererSecurityCtxData = perfDiscovererSecurityCtxData + +type Target struct { + Targets []string `json:"targets"` + Labels map[string]string `json:"labels"` +} + +type alloyTargetOpts struct { + targetEnvVars []string +} + +type CEEMSAlloyTargetDiscoverer struct { + logger log.Logger + cgroupManager *cgroupManager + fs procfs.FS + opts alloyTargetOpts + enabled bool + securityContexts map[string]*security.SecurityContext +} + +// NewAlloyTargetDiscoverer returns a new HTTP alloy discoverer. +func NewAlloyTargetDiscoverer(logger log.Logger) (*CEEMSAlloyTargetDiscoverer, error) { + // If no resource manager is provided, return an instance with enabled set to false + if *cgManager == "" { + level.Warn(logger).Log("msg", "No resource manager selected for discoverer") + + return &CEEMSAlloyTargetDiscoverer{logger: logger, enabled: false}, nil + } + + // Make alloyTargetOpts + opts := alloyTargetOpts{ + targetEnvVars: *alloyTargetEnvVars, + } + + // Instantiate a new Proc FS + fs, err := procfs.NewFS(*procfsPath) + if err != nil { + level.Error(logger).Log("msg", "Unable to open procfs", "path", *procfsPath, "err", err) + + return nil, err + } + + // Get SLURM's cgroup details + cgroupManager, err := NewCgroupManager(*cgManager) + if err != nil { + level.Info(logger).Log("msg", "Failed to create cgroup manager", "err", err) + + return nil, err + } + + level.Info(logger).Log("cgroup", cgroupManager) + + discoverer := &CEEMSAlloyTargetDiscoverer{ + logger: logger, + fs: fs, + cgroupManager: cgroupManager, + opts: opts, + enabled: true, + } + + // Setup new security context(s) + // Security context for openining profilers + discoverer.securityContexts = make(map[string]*security.SecurityContext) + + // If we need to inspect env vars of processes, we will need cap_sys_ptrace and + // cap_dac_read_search caps + if len(discoverer.opts.targetEnvVars) > 0 { + capabilities := []string{"cap_sys_ptrace", "cap_dac_read_search"} + auxCaps := setupCollectorCaps(logger, alloyTargetDiscovererSubSystem, capabilities) + + discoverer.securityContexts[alloyTargetDiscovererCtx], err = security.NewSecurityContext( + alloyTargetDiscovererCtx, + auxCaps, + targetDiscoverer, + logger, + ) + if err != nil { + level.Error(logger).Log("msg", "Failed to create a security context for alloy target discoverer", "err", err) + + return nil, err + } + } + + return discoverer, nil +} + +// Discover targets for Grafana Alloy. +func (d *CEEMSAlloyTargetDiscoverer) Discover() ([]Target, error) { + begin := time.Now() + targets, err := d.discover() + duration := time.Since(begin) + + if err != nil { + level.Debug(d.logger).Log("msg", "discoverer failed", "duration_seconds", duration.Seconds()) + } else { + level.Debug(d.logger).Log("msg", "discoverer succeeded", "duration_seconds", duration.Seconds()) + } + + return targets, err +} + +// discover targets by reading processes and mapping them to cgroups. +func (d *CEEMSAlloyTargetDiscoverer) discover() ([]Target, error) { + // If the discoverer is not enabled, return empty targets + if !d.enabled { + level.Debug(d.logger).Log("msg", "Grafana Alloy targets discoverer not enabled") + + return []Target{}, nil + } + + // Read discovered cgroups into data pointer + dataPtr := &alloyTargetDiscovererSecurityCtxData{ + procfs: d.fs, + cgroupManager: d.cgroupManager, + targetEnvVars: d.opts.targetEnvVars, + } + + // If there is a need to read processes' environ, use security context + // else execute function natively + if len(d.opts.targetEnvVars) > 0 { + if securityCtx, ok := d.securityContexts[alloyTargetDiscovererCtx]; ok { + if err := securityCtx.Exec(dataPtr); err != nil { + return nil, err + } + } else { + return nil, security.ErrNoSecurityCtx + } + } else { + if err := targetDiscoverer(dataPtr); err != nil { + return nil, err + } + } + + if len(dataPtr.cgroups) > 0 { + level.Debug(d.logger).Log("msg", "Discovered targets for Grafana Alloy") + } else { + level.Debug(d.logger).Log("msg", "No targets found for Grafana Alloy") + } + + // Make targets from cgrpoups + var targets []Target + + for uuid, procs := range dataPtr.cgroups { + for _, proc := range procs { + exe, _ := proc.Executable() + comm, _ := proc.CmdLine() + + var realUID, effecUID uint64 + if status, err := proc.NewStatus(); err == nil { + realUID = status.UIDs[0] + effecUID = status.UIDs[1] + } + + target := Target{ + Targets: []string{uuid}, + Labels: map[string]string{ + "__process_pid__": strconv.FormatInt(int64(proc.PID), 10), + "__process_exe": exe, + "__process_commandline": strings.Join(comm, " "), + "__process_real_uid": strconv.FormatUint(realUID, 10), + "__process_effective_uid": strconv.FormatUint(effecUID, 10), + "service_name": uuid, + }, + } + + targets = append(targets, target) + } + } + + return targets, nil +} + +// discoverer returns a map of discovered cgroup ID to procs by looking at each process +// in proc FS. Walking through cgroup fs is not really an option here as cgroups v1 +// wont have all PIDs of cgroup if the PID controller is not turned on. +// The current implementation should work for both cgroups v1 and v2. +// This function might be executed in a security context if targetEnvVars is not +// empty. +func targetDiscoverer(data interface{}) error { + // Assert data is of alloyTargetDiscovererSecurityCtxData + var d *alloyTargetDiscovererSecurityCtxData + + var ok bool + if d, ok = data.(*alloyTargetDiscovererSecurityCtxData); !ok { + return security.ErrSecurityCtxDataAssertion + } + + cgroups, err := cgroupProcs(d.procfs, d.cgroupManager.idRegex, d.targetEnvVars, d.cgroupManager.procFilter) + if err != nil { + return err + } + + // Read cgroups proc map into d + d.cgroups = cgroups + + return nil +} + +// TargetsHandlerFor returns http.Handler for Alloy targets. +func TargetsHandlerFor(discoverer *CEEMSAlloyTargetDiscoverer, opts promhttp.HandlerOpts) http.Handler { + var inFlightSem chan struct{} + + if opts.MaxRequestsInFlight > 0 { + inFlightSem = make(chan struct{}, opts.MaxRequestsInFlight) + } + + h := http.HandlerFunc(func(rsp http.ResponseWriter, req *http.Request) { + if inFlightSem != nil { + select { + case inFlightSem <- struct{}{}: // All good, carry on. + defer func() { <-inFlightSem }() + default: + http.Error(rsp, fmt.Sprintf( + "Limit of concurrent requests reached (%d), try again later.", opts.MaxRequestsInFlight, + ), http.StatusServiceUnavailable) + + return + } + } + + targets, err := discoverer.Discover() + if err != nil { + if opts.ErrorLog != nil { + opts.ErrorLog.Println("error gathering metrics:", err) + } + + switch opts.ErrorHandling { + case promhttp.PanicOnError: + panic(err) + case promhttp.ContinueOnError: + if len(targets) == 0 { + // Still report the error if no targets have been gathered. + httpError(rsp, err) + + return + } + case promhttp.HTTPErrorOnError: + httpError(rsp, err) + + return + } + } + + rsp.Header().Set(contentTypeHeader, contentType) + httpEncode(rsp, targets) + }) + + if opts.Timeout <= 0 { + return h + } + + return http.TimeoutHandler(h, opts.Timeout, fmt.Sprintf( + "Exceeded configured timeout of %v.\n", + opts.Timeout, + )) +} + +// httpEncode encodes response to http.ResponseWriter. +func httpEncode(rsp http.ResponseWriter, response []Target) { + if err := json.NewEncoder(rsp).Encode(&response); err != nil { + rsp.Write([]byte("KO")) + } +} + +// httpError calls http.Error with the provided error and http.StatusInternalServerError. +func httpError(rsp http.ResponseWriter, err error) { + http.Error( + rsp, + "An error has occurred while serving targets:\n\n"+err.Error(), + http.StatusInternalServerError, + ) +} diff --git a/pkg/collector/alloy_targets_test.go b/pkg/collector/alloy_targets_test.go new file mode 100644 index 0000000..826cae9 --- /dev/null +++ b/pkg/collector/alloy_targets_test.go @@ -0,0 +1,90 @@ +package collector + +import ( + "testing" + + "github.com/go-kit/log" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var expectedTargets = []Target{ + { + Targets: []string{"1320003"}, + Labels: map[string]string{ + "__process_commandline": "/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/", + "__process_effective_uid": "1000", + "__process_exe": "/usr/bin/vim", + "__process_pid__": "46236", + "__process_real_uid": "1000", + "service_name": "1320003", + }, + }, + { + Targets: []string{"1320003"}, + Labels: map[string]string{ + "__process_commandline": "/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/", + "__process_effective_uid": "1000", + "__process_exe": "/usr/bin/vim", + "__process_pid__": "46235", + "__process_real_uid": "1000", + "service_name": "1320003", + }, + }, + { + Targets: []string{"4824887"}, + Labels: map[string]string{ + "__process_commandline": "/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/", + "__process_effective_uid": "1000", + "__process_exe": "/usr/bin/vim", + "__process_pid__": "46281", + "__process_real_uid": "1000", + "service_name": "4824887", + }, + }, + { + Targets: []string{"4824887"}, + Labels: map[string]string{ + "__process_commandline": "/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/", + "__process_effective_uid": "1000", + "__process_exe": "/usr/bin/vim", + "__process_pid__": "46231", + "__process_real_uid": "1000", + "service_name": "4824887", + }, + }, +} + +func TestAlloyDiscovererSlurmCgroupsV2(t *testing.T) { + _, err := CEEMSExporterApp.Parse([]string{ + "--path.procfs", "testdata/proc", + "--path.cgroupfs", "testdata/sys/fs/cgroup", + "--discoverer.alloy-targets.resource-manager", "slurm", + "--collector.cgroups.force-version", "v2", + }) + require.NoError(t, err) + + discoverer, err := NewAlloyTargetDiscoverer(log.NewNopLogger()) + require.NoError(t, err) + + targets, err := discoverer.Discover() + require.NoError(t, err) + assert.ElementsMatch(t, expectedTargets, targets) +} + +func TestAlloyDiscovererSlurmCgroupsV1(t *testing.T) { + _, err := CEEMSExporterApp.Parse([]string{ + "--path.procfs", "testdata/proc", + "--path.cgroupfs", "testdata/sys/fs/cgroup", + "--discoverer.alloy-targets.resource-manager", "slurm", + "--collector.cgroups.force-version", "v1", + }) + require.NoError(t, err) + + discoverer, err := NewAlloyTargetDiscoverer(log.NewNopLogger()) + require.NoError(t, err) + + targets, err := discoverer.Discover() + require.NoError(t, err) + assert.ElementsMatch(t, expectedTargets, targets) +} diff --git a/pkg/collector/cli.go b/pkg/collector/cli.go index 06ebc3c..926105d 100644 --- a/pkg/collector/cli.go +++ b/pkg/collector/cli.go @@ -11,6 +11,7 @@ import ( "time" "github.com/alecthomas/kingpin/v2" + "github.com/go-kit/log" "github.com/go-kit/log/level" internal_runtime "github.com/mahendrapaipuri/ceems/internal/runtime" "github.com/mahendrapaipuri/ceems/internal/security" @@ -69,6 +70,10 @@ func (b *CEEMSExporter) Main() error { "web.telemetry-path", "Path under which to expose metrics.", ).Default("/metrics").String() + targetsPath = b.App.Flag( + "web.targets-path", + "Path under which to expose Grafana Alloy targets.", + ).Default("/alloy-targets").String() disableExporterMetrics = b.App.Flag( "web.disable-exporter-metrics", "Exclude metrics about the exporter itself (promhttp_*, process_*, go_*).", @@ -149,6 +154,12 @@ func (b *CEEMSExporter) Main() error { return err } + // Create a new instance of Alloy targets discoverer + discoverer, err := NewAlloyTargetDiscoverer(log.With(logger, "discoverer", "alloy_targets")) + if err != nil { + return err + } + if user, err := user.Current(); err == nil && user.Uid == "0" { level.Info(logger). Log("msg", "CEEMS Exporter is running as root user. Privileges will be dropped and process will be run as unprivileged user") @@ -176,13 +187,15 @@ func (b *CEEMSExporter) Main() error { // Create web server config config := &Config{ - Logger: logger, - Collector: collector, + Logger: logger, + Collector: collector, + Discoverer: discoverer, Web: WebConfig{ Addresses: *webListenAddresses, WebSystemdSocket: *systemdSocket, WebConfigFile: *webConfigFile, MetricsPath: *metricsPath, + TargetsPath: *targetsPath, MaxRequests: *maxRequests, IncludeExporterMetrics: !*disableExporterMetrics, EnableDebugServer: *enableDebugServer, @@ -195,6 +208,10 @@ func (b *CEEMSExporter) Main() error { Address: *metricsPath, Text: "Metrics", }, + { + Address: *targetsPath, + Text: "Grafana Alloy Targets", + }, }, }, }, diff --git a/pkg/collector/cli_test.go b/pkg/collector/cli_test.go index a63b86c..bc85c4f 100644 --- a/pkg/collector/cli_test.go +++ b/pkg/collector/cli_test.go @@ -13,14 +13,16 @@ import ( ) func queryExporter(address string) error { - resp, err := http.Get(fmt.Sprintf("http://%s/metrics", address)) //nolint:noctx - if err != nil { - return err - } - defer resp.Body.Close() + for _, path := range []string{"metrics", "alloy-targets"} { + resp, err := http.Get(fmt.Sprintf("http://%s/%s", address, path)) //nolint:noctx + if err != nil { + return err + } + defer resp.Body.Close() - if want, have := http.StatusOK, resp.StatusCode; want != have { - return fmt.Errorf("want /metrics status code %d, have %d.", want, have) + if want, have := http.StatusOK, resp.StatusCode; want != have { + return fmt.Errorf("want /%s status code %d, have %d.", path, want, have) + } } return nil diff --git a/pkg/collector/perf.go b/pkg/collector/perf.go index 3bac0b3..00bc92c 100644 --- a/pkg/collector/perf.go +++ b/pkg/collector/perf.go @@ -49,7 +49,7 @@ var ( ).Strings() perfProfilersEnvVars = CEEMSExporterApp.Flag( "collector.perf.env-var", - "Processes having any of these environment variables set will be profiled. If empty, all processes will be profiled.", + "Enable profiling only on the processes having any of these environment variables set will be profiled. If empty, all processes will be profiled.", ).Strings() ) diff --git a/pkg/collector/server.go b/pkg/collector/server.go index 57d03fa..50d67f8 100644 --- a/pkg/collector/server.go +++ b/pkg/collector/server.go @@ -25,6 +25,7 @@ type WebConfig struct { WebSystemdSocket bool WebConfigFile string MetricsPath string + TargetsPath string MaxRequests int IncludeExporterMetrics bool EnableDebugServer bool @@ -33,22 +34,24 @@ type WebConfig struct { // Config makes a server config. type Config struct { - Logger log.Logger - Collector *CEEMSCollector - Web WebConfig + Logger log.Logger + Collector *CEEMSCollector + Discoverer *CEEMSAlloyTargetDiscoverer + Web WebConfig } // CEEMSExporterServer struct implements HTTP server for exporter. type CEEMSExporterServer struct { - logger log.Logger - server *http.Server - webConfig *web.FlagConfig - collector *CEEMSCollector - handler *metricsHandler + logger log.Logger + server *http.Server + webConfig *web.FlagConfig + collector *CEEMSCollector + discoverer *CEEMSAlloyTargetDiscoverer + metricsHandler *metricsHandler + targetsHandler *targetsHandler } -// metricsHandler wraps an metrics http.Handler. Create instances with -// newHandler. +// metricsHandler wraps an metrics http.Handler. type metricsHandler struct { handler http.Handler // exporterMetricsRegistry is a separate registry for the metrics about @@ -64,12 +67,24 @@ func (h *metricsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { h.handler.ServeHTTP(w, r) } +// targetsHandler wraps an Alloy targets http.Handler. +type targetsHandler struct { + handler http.Handler + maxRequests int +} + +// ServeHTTP implements http.Handler. +func (h *targetsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + h.handler.ServeHTTP(w, r) +} + // NewCEEMSExporterServer creates new CEEMSExporterServer struct instance. func NewCEEMSExporterServer(c *Config) (*CEEMSExporterServer, error) { router := mux.NewRouter() server := &CEEMSExporterServer{ - logger: c.Logger, - collector: c.Collector, + logger: c.Logger, + collector: c.Collector, + discoverer: c.Discoverer, server: &http.Server{ Addr: c.Web.Addresses[0], Handler: router, @@ -82,26 +97,29 @@ func NewCEEMSExporterServer(c *Config) (*CEEMSExporterServer, error) { WebSystemdSocket: &c.Web.WebSystemdSocket, WebConfigFile: &c.Web.WebConfigFile, }, - handler: &metricsHandler{ + metricsHandler: &metricsHandler{ metricsRegistry: prometheus.NewRegistry(), exporterMetricsRegistry: prometheus.NewRegistry(), includeExporterMetrics: c.Web.IncludeExporterMetrics, maxRequests: c.Web.MaxRequests, }, + targetsHandler: &targetsHandler{ + maxRequests: c.Web.MaxRequests, + }, } // Register exporter metrics when requested if c.Web.IncludeExporterMetrics { - server.handler.exporterMetricsRegistry.MustRegister( + server.metricsHandler.exporterMetricsRegistry.MustRegister( promcollectors.NewProcessCollector(promcollectors.ProcessCollectorOpts{}), promcollectors.NewGoCollector(), ) } // Register metrics collector with Prometheus - server.handler.metricsRegistry.MustRegister(version.NewCollector(CEEMSExporterAppName)) + server.metricsHandler.metricsRegistry.MustRegister(version.NewCollector(CEEMSExporterAppName)) - if err := server.handler.metricsRegistry.Register(server.collector); err != nil { + if err := server.metricsHandler.metricsRegistry.Register(server.collector); err != nil { return nil, fmt.Errorf("couldn't register compute resource collector: %w", err) } @@ -116,7 +134,10 @@ func NewCEEMSExporterServer(c *Config) (*CEEMSExporterServer, error) { } // Handle metrics path - router.Handle(c.Web.MetricsPath, server.metricsHandler()) + router.Handle(c.Web.MetricsPath, server.newMetricsHandler()) + + // Handle targets path + router.Handle(c.Web.TargetsPath, server.newTargetsHandler()) // If EnableDebugServer is true add debug endpoints if c.Web.EnableDebugServer { @@ -166,34 +187,46 @@ func (s *CEEMSExporterServer) Shutdown(ctx context.Context) error { return nil } -// metricsHandler creates a new handler for exporting metrics. -func (s *CEEMSExporterServer) metricsHandler() http.Handler { +// newMetricsHandler creates a new handler for exporting metrics. +func (s *CEEMSExporterServer) newMetricsHandler() http.Handler { var handler http.Handler - if s.handler.includeExporterMetrics { + if s.metricsHandler.includeExporterMetrics { handler = promhttp.HandlerFor( - prometheus.Gatherers{s.handler.exporterMetricsRegistry, s.handler.metricsRegistry}, + prometheus.Gatherers{s.metricsHandler.exporterMetricsRegistry, s.metricsHandler.metricsRegistry}, promhttp.HandlerOpts{ ErrorLog: stdlog.New(log.NewStdlibAdapter(level.Error(s.logger)), "", 0), ErrorHandling: promhttp.ContinueOnError, - MaxRequestsInFlight: s.handler.maxRequests, - Registry: s.handler.exporterMetricsRegistry, + MaxRequestsInFlight: s.metricsHandler.maxRequests, + Registry: s.metricsHandler.exporterMetricsRegistry, }, ) // Note that we have to use h.exporterMetricsRegistry here to // use the same promhttp metrics for all expositions. handler = promhttp.InstrumentMetricHandler( - s.handler.exporterMetricsRegistry, handler, + s.metricsHandler.exporterMetricsRegistry, handler, ) } else { handler = promhttp.HandlerFor( - s.handler.metricsRegistry, + s.metricsHandler.metricsRegistry, promhttp.HandlerOpts{ ErrorLog: stdlog.New(log.NewStdlibAdapter(level.Error(s.logger)), "", 0), ErrorHandling: promhttp.ContinueOnError, - MaxRequestsInFlight: s.handler.maxRequests, + MaxRequestsInFlight: s.metricsHandler.maxRequests, }, ) } return handler } + +// newTargetsHandler creates a new handler for exporting Grafana Alloy targets. +func (s *CEEMSExporterServer) newTargetsHandler() http.Handler { + return TargetsHandlerFor( + s.discoverer, + promhttp.HandlerOpts{ + ErrorLog: stdlog.New(log.NewStdlibAdapter(level.Error(s.logger)), "", 0), + ErrorHandling: promhttp.ContinueOnError, + MaxRequestsInFlight: s.targetsHandler.maxRequests, + }, + ) +} diff --git a/pkg/collector/testdata/output/discoverer/e2e-test-discoverer-cgroupsv1-slurm-output.txt b/pkg/collector/testdata/output/discoverer/e2e-test-discoverer-cgroupsv1-slurm-output.txt new file mode 100644 index 0000000..d44cc4a --- /dev/null +++ b/pkg/collector/testdata/output/discoverer/e2e-test-discoverer-cgroupsv1-slurm-output.txt @@ -0,0 +1 @@ +[{"targets":["1320003"],"labels":{"__process_commandline":"/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/","__process_effective_uid":"1000","__process_exe":"/usr/bin/vim","__process_pid__":"46236","__process_real_uid":"1000","service_name":"1320003"}},{"targets":["1320003"],"labels":{"__process_commandline":"/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/","__process_effective_uid":"1000","__process_exe":"/usr/bin/vim","__process_pid__":"46235","__process_real_uid":"1000","service_name":"1320003"}},{"targets":["4824887"],"labels":{"__process_commandline":"/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/","__process_effective_uid":"1000","__process_exe":"/usr/bin/vim","__process_pid__":"46281","__process_real_uid":"1000","service_name":"4824887"}},{"targets":["4824887"],"labels":{"__process_commandline":"/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/","__process_effective_uid":"1000","__process_exe":"/usr/bin/vim","__process_pid__":"46231","__process_real_uid":"1000","service_name":"4824887"}}] diff --git a/pkg/collector/testdata/output/discoverer/e2e-test-discoverer-cgroupsv2-slurm-output.txt b/pkg/collector/testdata/output/discoverer/e2e-test-discoverer-cgroupsv2-slurm-output.txt new file mode 100644 index 0000000..d44cc4a --- /dev/null +++ b/pkg/collector/testdata/output/discoverer/e2e-test-discoverer-cgroupsv2-slurm-output.txt @@ -0,0 +1 @@ +[{"targets":["1320003"],"labels":{"__process_commandline":"/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/","__process_effective_uid":"1000","__process_exe":"/usr/bin/vim","__process_pid__":"46236","__process_real_uid":"1000","service_name":"1320003"}},{"targets":["1320003"],"labels":{"__process_commandline":"/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/","__process_effective_uid":"1000","__process_exe":"/usr/bin/vim","__process_pid__":"46235","__process_real_uid":"1000","service_name":"1320003"}},{"targets":["4824887"],"labels":{"__process_commandline":"/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/","__process_effective_uid":"1000","__process_exe":"/usr/bin/vim","__process_pid__":"46281","__process_real_uid":"1000","service_name":"4824887"}},{"targets":["4824887"],"labels":{"__process_commandline":"/gpfslocalsup/spack_soft/gromacs/2022.2/gcc-8.4.1-kblhs7pjrcqlgv675gejjjy7n3h6wz2n/bin/gmx_mpi mdrun -ntomp 10 -v -deffnm run10 -multidir 1/ 2/ 3/ 4/","__process_effective_uid":"1000","__process_exe":"/usr/bin/vim","__process_pid__":"46231","__process_real_uid":"1000","service_name":"4824887"}}] diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-libvirt-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-libvirt-output.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-memory-subsystem-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-memory-subsystem-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv1-memory-subsystem-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-memory-subsystem-output.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-output.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-all-metrics-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-all-metrics-output.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-amd-ipmitool-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-amd-ipmitool-output.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-output.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nogpu-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nogpu-output.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt diff --git a/pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt b/pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-procfs-output.txt similarity index 100% rename from pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt rename to pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-procfs-output.txt diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index 0e61ab2..0aa87ba 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -41,7 +41,7 @@ do esac done -if [[ "${scenario}" =~ ^"exporter" ]] +if [[ "${scenario}" =~ ^"exporter" ]] || [[ "${scenario}" =~ ^"discoverer" ]] then # cgroups_mode=$([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] && echo "unified" || ( [ -e /sys/fs/cgroup/unified/ ] && echo "hybrid" || echo "legacy")) # cgroups_mode="legacy" @@ -50,52 +50,62 @@ then then cgroups_mode="legacy" desc="Cgroups V1" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv1-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-output.txt' elif [ "${scenario}" = "exporter-cgroups-v1-memory-subsystem" ] then cgroups_mode="legacy" desc="Cgroups V1 with memory subsystem" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv1-memory-subsystem-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-memory-subsystem-output.txt' elif [ "${scenario}" = "exporter-cgroups-v2-nvidia-ipmiutil" ] then cgroups_mode="unified" desc="Cgroups V2 with nVIDIA GPU and ipmiutil" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt' elif [ "${scenario}" = "exporter-cgroups-v2-nvidia-gpu-reordering" ] then cgroups_mode="unified" desc="Cgroups V2 with nVIDIA GPU reordering" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nvidia-gpu-reordering.txt' elif [ "${scenario}" = "exporter-cgroups-v2-amd-ipmitool" ] then cgroups_mode="unified" desc="Cgroups V2 with AMD GPU and ipmitool" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-amd-ipmitool-output.txt' elif [ "${scenario}" = "exporter-cgroups-v2-nogpu" ] then cgroups_mode="unified" desc="Cgroups V2 when there are no GPUs" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-nogpu-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-nogpu-output.txt' elif [ "${scenario}" = "exporter-cgroups-v2-procfs" ] then cgroups_mode="unified" desc="Cgroups V2 using /proc for fetching job properties" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-procfs-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-procfs-output.txt' elif [ "${scenario}" = "exporter-cgroups-v2-all-metrics" ] then cgroups_mode="unified" desc="Cgroups V2 enabling all available cgroups metrics" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-all-metrics-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-all-metrics-output.txt' elif [ "${scenario}" = "exporter-cgroups-v1-libvirt" ] then cgroups_mode="legacy" desc="Cgroups V1 with libvirt" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv1-libvirt-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv1-libvirt-output.txt' elif [ "${scenario}" = "exporter-cgroups-v2-libvirt" ] then cgroups_mode="unified" desc="Cgroups V2 with libvirt" - fixture='pkg/collector/testdata/output/e2e-test-cgroupsv2-libvirt-output.txt' + fixture='pkg/collector/testdata/output/exporter/e2e-test-cgroupsv2-libvirt-output.txt' + elif [ "${scenario}" = "discoverer-cgroups-v2-slurm" ] + then + cgroups_mode="unified" + desc="Cgroups V2 discoverer for Slurm" + fixture='pkg/collector/testdata/output/discoverer/e2e-test-discoverer-cgroupsv2-slurm-output.txt' + elif [ "${scenario}" = "discoverer-cgroups-v1-slurm" ] + then + cgroups_mode="legacy" + desc="Cgroups V1 discoverer for Slurm" + fixture='pkg/collector/testdata/output/discoverer/e2e-test-discoverer-cgroupsv1-slurm-output.txt' fi logfile="${tmpdir}/ceems_exporter.log" @@ -308,7 +318,7 @@ waitport() { sleep 1 } -if [[ "${scenario}" =~ ^"exporter" ]] +if [[ "${scenario}" =~ ^"exporter" ]] || [[ "${scenario}" =~ ^"discoverer" ]] then if [ ! -x ./bin/ceems_exporter ] then @@ -499,6 +509,36 @@ then --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ --log.level="debug" > "${logfile}" 2>&1 & + elif [ "${scenario}" = "discoverer-cgroups-v2-slurm" ] + then + ./bin/ceems_exporter \ + --path.sysfs="pkg/collector/testdata/sys" \ + --path.cgroupfs="pkg/collector/testdata/sys/fs/cgroup" \ + --path.procfs="pkg/collector/testdata/proc" \ + --discoverer.alloy-targets.resource-manager="slurm" \ + --collector.cgroups.force-version="v2" \ + --collector.slurm \ + --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ + --collector.ipmi_dcmi.test-mode \ + --collector.empty-hostname-label \ + --web.listen-address "127.0.0.1:${port}" \ + --web.disable-exporter-metrics \ + --log.level="debug" > "${logfile}" 2>&1 & + elif [ "${scenario}" = "discoverer-cgroups-v1-slurm" ] + then + ./bin/ceems_exporter \ + --path.sysfs="pkg/collector/testdata/sys" \ + --path.cgroupfs="pkg/collector/testdata/sys/fs/cgroup" \ + --path.procfs="pkg/collector/testdata/proc" \ + --discoverer.alloy-targets.resource-manager="slurm" \ + --collector.slurm \ + --collector.cgroups.force-version="v1" \ + --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ + --collector.ipmi_dcmi.test-mode \ + --collector.empty-hostname-label \ + --web.listen-address "127.0.0.1:${port}" \ + --web.disable-exporter-metrics \ + --log.level="debug" > "${logfile}" 2>&1 & fi echo $! > "${pidfile}" @@ -506,7 +546,12 @@ then # sleep 1 waitport "${port}" - get "127.0.0.1:${port}/metrics" | grep -E -v "${skip_re}" > "${fixture_output}" + if [[ "${scenario}" =~ ^"discoverer" ]] + then + get "127.0.0.1:${port}/alloy-targets" | grep -E -v "${skip_re}" > "${fixture_output}" + else + get "127.0.0.1:${port}/metrics" | grep -E -v "${skip_re}" > "${fixture_output}" + fi elif [[ "${scenario}" =~ ^"api" ]] then if [ ! -x ./bin/ceems_api_server ]