diff --git a/collector/slm.go b/collector/slm.go
index 47e6bd67..f821c75d 100644
--- a/collector/slm.go
+++ b/collector/slm.go
@@ -61,9 +61,6 @@ type SLM struct {
 	client *http.Client
 	url    *url.URL
 
-	up                              prometheus.Gauge
-	totalScrapes, jsonParseFailures prometheus.Counter
-
 	slmMetrics      []*slmMetric
 	policyMetrics   []*policyMetric
 	slmStatusMetric *slmStatusMetric
@@ -75,19 +72,6 @@ func NewSLM(logger log.Logger, client *http.Client, url *url.URL) *SLM {
 		logger: logger,
 		client: client,
 		url:    url,
-
-		up: prometheus.NewGauge(prometheus.GaugeOpts{
-			Name: prometheus.BuildFQName(namespace, "slm_stats", "up"),
-			Help: "Was the last scrape of the Elasticsearch SLM endpoint successful.",
-		}),
-		totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{
-			Name: prometheus.BuildFQName(namespace, "slm_stats", "total_scrapes"),
-			Help: "Current total Elasticsearch SLM scrapes.",
-		}),
-		jsonParseFailures: prometheus.NewCounter(prometheus.CounterOpts{
-			Name: prometheus.BuildFQName(namespace, "slm_stats", "json_parse_failures"),
-			Help: "Number of errors while parsing JSON.",
-		}),
 		slmMetrics: []*slmMetric{
 			{
 				Type: prometheus.CounterValue,
@@ -257,9 +241,6 @@ func (s *SLM) Describe(ch chan<- *prometheus.Desc) {
 		ch <- metric.Desc
 	}
 
-	ch <- s.up.Desc()
-	ch <- s.totalScrapes.Desc()
-	ch <- s.jsonParseFailures.Desc()
 }
 
 func (s *SLM) fetchAndDecodeSLMStats() (SLMStatsResponse, error) {
@@ -289,12 +270,10 @@ func (s *SLM) fetchAndDecodeSLMStats() (SLMStatsResponse, error) {
 
 	bts, err := io.ReadAll(res.Body)
 	if err != nil {
-		s.jsonParseFailures.Inc()
 		return ssr, err
 	}
 
 	if err := json.Unmarshal(bts, &ssr); err != nil {
-		s.jsonParseFailures.Inc()
 		return ssr, err
 	}
 
@@ -328,12 +307,10 @@ func (s *SLM) fetchAndDecodeSLMStatus() (SLMStatusResponse, error) {
 
 	bts, err := io.ReadAll(res.Body)
 	if err != nil {
-		s.jsonParseFailures.Inc()
 		return ssr, err
 	}
 
 	if err := json.Unmarshal(bts, &ssr); err != nil {
-		s.jsonParseFailures.Inc()
 		return ssr, err
 	}
 
@@ -342,16 +319,9 @@ func (s *SLM) fetchAndDecodeSLMStatus() (SLMStatusResponse, error) {
 
 // Collect gets SLM metric values
 func (s *SLM) Collect(ch chan<- prometheus.Metric) {
-	s.totalScrapes.Inc()
-	defer func() {
-		ch <- s.up
-		ch <- s.totalScrapes
-		ch <- s.jsonParseFailures
-	}()
 
 	slmStatusResp, err := s.fetchAndDecodeSLMStatus()
 	if err != nil {
-		s.up.Set(0)
 		level.Warn(s.logger).Log(
 			"msg", "failed to fetch and decode slm status",
 			"err", err,
@@ -361,7 +331,6 @@ func (s *SLM) Collect(ch chan<- prometheus.Metric) {
 
 	slmStatsResp, err := s.fetchAndDecodeSLMStats()
 	if err != nil {
-		s.up.Set(0)
 		level.Warn(s.logger).Log(
 			"msg", "failed to fetch and decode slm stats",
 			"err", err,
@@ -369,8 +338,6 @@ func (s *SLM) Collect(ch chan<- prometheus.Metric) {
 		return
 	}
 
-	s.up.Set(1)
-
 	for _, status := range statuses {
 		ch <- prometheus.MustNewConstMetric(
 			s.slmStatusMetric.Desc,
diff --git a/collector/slm_test.go b/collector/slm_test.go
index 5bbea562..cc028d4c 100644
--- a/collector/slm_test.go
+++ b/collector/slm_test.go
@@ -14,13 +14,17 @@ package collector
 
 import (
-	"fmt"
+	"io"
 	"net/http"
 	"net/http/httptest"
 	"net/url"
+	"os"
+	"path"
+	"strings"
 	"testing"
 
 	"github.com/go-kit/log"
+	"github.com/prometheus/client_golang/prometheus/testutil"
 )
 
 func TestSLM(t *testing.T) {
@@ -31,35 +35,104 @@ func TestSLM(t *testing.T) {
 	// Testcases created using:
 
 	// docker run -d -p 9200:9200 elasticsearch:VERSION-alpine
 	// curl -XPUT http://127.0.0.1:9200/_slm/policy/everything -H 'Content-Type: application/json' -d '{"schedule":"0 */15 * * * ?","name":"","repository":"my_repository","config":{"indices":".*","include_global_state":true,"ignore_unavailable":true},"retention":{"expire_after":"7d"}}'
 	// curl http://127.0.0.1:9200/_slm/stats (Numbers manually tweaked)
 
-	tcs := map[string]string{
-		"7.15.0": `{"retention_runs":9,"retention_failed":0,"retention_timed_out":0,"retention_deletion_time":"1.2m","retention_deletion_time_millis":72491,"total_snapshots_taken":103,"total_snapshots_failed":2,"total_snapshots_deleted":20,"total_snapshot_deletion_failures":0,"policy_stats":[{"policy":"everything","snapshots_taken":50,"snapshots_failed":2,"snapshots_deleted":20,"snapshot_deletion_failures":0}]}`,
+	tests := []struct {
+		name string
+		file string
+		want string
+	}{
+		{
+			name: "7.15.0",
+			file: "7.15.0.json",
+			want: `# HELP elasticsearch_slm_stats_operation_mode Operating status of SLM
+			# TYPE elasticsearch_slm_stats_operation_mode gauge
+			elasticsearch_slm_stats_operation_mode{operation_mode="RUNNING"} 0
+			elasticsearch_slm_stats_operation_mode{operation_mode="STOPPED"} 0
+			elasticsearch_slm_stats_operation_mode{operation_mode="STOPPING"} 0
+			# HELP elasticsearch_slm_stats_retention_deletion_time_seconds Retention run deletion time
+			# TYPE elasticsearch_slm_stats_retention_deletion_time_seconds gauge
+			elasticsearch_slm_stats_retention_deletion_time_seconds 72.491
+			# HELP elasticsearch_slm_stats_retention_failed_total Total failed retention runs
+			# TYPE elasticsearch_slm_stats_retention_failed_total counter
+			elasticsearch_slm_stats_retention_failed_total 0
+			# HELP elasticsearch_slm_stats_retention_runs_total Total retention runs
+			# TYPE elasticsearch_slm_stats_retention_runs_total counter
+			elasticsearch_slm_stats_retention_runs_total 9
+			# HELP elasticsearch_slm_stats_retention_timed_out_total Total timed out retention runs
+			# TYPE elasticsearch_slm_stats_retention_timed_out_total counter
+			elasticsearch_slm_stats_retention_timed_out_total 0
+			# HELP elasticsearch_slm_stats_snapshot_deletion_failures_total Total snapshot deletion failures
+			# TYPE elasticsearch_slm_stats_snapshot_deletion_failures_total counter
+			elasticsearch_slm_stats_snapshot_deletion_failures_total{policy="everything"} 0
+			# HELP elasticsearch_slm_stats_snapshots_deleted_total Total snapshots deleted
+			# TYPE elasticsearch_slm_stats_snapshots_deleted_total counter
+			elasticsearch_slm_stats_snapshots_deleted_total{policy="everything"} 20
+			# HELP elasticsearch_slm_stats_snapshots_failed_total Total snapshots failed
+			# TYPE elasticsearch_slm_stats_snapshots_failed_total counter
+			elasticsearch_slm_stats_snapshots_failed_total{policy="everything"} 2
+			# HELP elasticsearch_slm_stats_snapshots_taken_total Total snapshots taken
+			# TYPE elasticsearch_slm_stats_snapshots_taken_total counter
+			elasticsearch_slm_stats_snapshots_taken_total{policy="everything"} 50
+			# HELP elasticsearch_slm_stats_total_snapshot_deletion_failures_total Total snapshot deletion failures
+			# TYPE elasticsearch_slm_stats_total_snapshot_deletion_failures_total counter
+			elasticsearch_slm_stats_total_snapshot_deletion_failures_total 0
+			# HELP elasticsearch_slm_stats_total_snapshots_deleted_total Total snapshots deleted
+			# TYPE elasticsearch_slm_stats_total_snapshots_deleted_total counter
+			elasticsearch_slm_stats_total_snapshots_deleted_total 20
+			# HELP elasticsearch_slm_stats_total_snapshots_failed_total Total snapshots failed
+			# TYPE elasticsearch_slm_stats_total_snapshots_failed_total counter
+			elasticsearch_slm_stats_total_snapshots_failed_total 2
+			# HELP elasticsearch_slm_stats_total_snapshots_taken_total Total snapshots taken
+			# TYPE elasticsearch_slm_stats_total_snapshots_taken_total counter
+			elasticsearch_slm_stats_total_snapshots_taken_total 103
+			`,
+		},
 	}
 
-	for ver, out := range tcs {
-		ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			fmt.Fprintln(w, out)
-		}))
-		defer ts.Close()
-		u, err := url.Parse(ts.URL)
-		if err != nil {
-			t.Fatalf("Failed to parse URL: %s", err)
-		}
-		s := NewSLM(log.NewNopLogger(), http.DefaultClient, u)
-		stats, err := s.fetchAndDecodeSLMStats()
-		if err != nil {
-			t.Fatalf("Failed to fetch or decode snapshots stats: %s", err)
-		}
-		t.Logf("[%s] SLM Response: %+v", ver, stats)
-		slmStats := stats
-		policyStats := stats.PolicyStats[0]
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			fStatsPath := path.Join("../fixtures/slm/stats/", tt.file)
+			fStats, err := os.Open(fStatsPath)
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer fStats.Close()
 
-		if slmStats.TotalSnapshotsTaken != 103 {
-			t.Errorf("Bad number of total snapshots taken")
-		}
+			fStatusPath := path.Join("../fixtures/slm/status/", tt.file)
+			fStatus, err := os.Open(fStatusPath)
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer fStatus.Close()
+
+			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				switch r.RequestURI {
+				case "/_slm/stats":
+					io.Copy(w, fStats)
+					return
+				case "/_slm/status":
+					io.Copy(w, fStatus)
+					return
+				}
+
+				http.Error(w, "Not Found", http.StatusNotFound)
+			}))
+			defer ts.Close()
+
+			u, err := url.Parse(ts.URL)
+			if err != nil {
+				t.Fatalf("Failed to parse URL: %s", err)
+			}
+
+			s := NewSLM(log.NewNopLogger(), http.DefaultClient, u)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if err := testutil.CollectAndCompare(s, strings.NewReader(tt.want)); err != nil {
+				t.Fatal(err)
+			}
+		})
 
-		if policyStats.SnapshotsTaken != 50 {
-			t.Errorf("Bad number of policy snapshots taken")
-		}
 	}
 }
diff --git a/fixtures/slm/stats/7.15.0.json b/fixtures/slm/stats/7.15.0.json
new file mode 100644
index 00000000..e94f8d7b
--- /dev/null
+++ b/fixtures/slm/stats/7.15.0.json
@@ -0,0 +1,20 @@
+{
+  "retention_runs": 9,
+  "retention_failed": 0,
+  "retention_timed_out": 0,
+  "retention_deletion_time": "1.2m",
+  "retention_deletion_time_millis": 72491,
+  "total_snapshots_taken": 103,
+  "total_snapshots_failed": 2,
+  "total_snapshots_deleted": 20,
+  "total_snapshot_deletion_failures": 0,
+  "policy_stats": [
+    {
+      "policy": "everything",
+      "snapshots_taken": 50,
+      "snapshots_failed": 2,
+      "snapshots_deleted": 20,
+      "snapshot_deletion_failures": 0
+    }
+  ]
+}
diff --git a/fixtures/slm/status/7.15.0.json b/fixtures/slm/status/7.15.0.json
new file mode 100644
index 00000000..0967ef42
--- /dev/null
+++ b/fixtures/slm/status/7.15.0.json
@@ -0,0 +1 @@
+{}