From 0c5c3477c0c5e7c50083c80194c8b6fcaf98e437 Mon Sep 17 00:00:00 2001 From: Michael Colby Date: Thu, 5 Dec 2024 12:05:37 -0800 Subject: [PATCH] Splunkenterprisereceiver add health metric (#1) * Initial commit * Corrected structs to fit json API response * Added to changelog * PR number added to changelog --- .chloggen/changes.yaml | 22 +++++++ .../splunkenterprisereceiver/documentation.md | 15 +++++ .../internal/metadata/generated_config.go | 4 ++ .../metadata/generated_config_test.go | 2 + .../internal/metadata/generated_metrics.go | 60 +++++++++++++++++++ .../metadata/generated_metrics_test.go | 22 +++++++ .../internal/metadata/testdata/config.yaml | 4 ++ .../splunkenterprisereceiver/metadata.yaml | 14 +++++ receiver/splunkenterprisereceiver/scraper.go | 57 +++++++++++++++++- .../splunkenterprisereceiver/search_result.go | 15 +++++ 10 files changed, 213 insertions(+), 2 deletions(-) create mode 100644 .chloggen/changes.yaml diff --git a/.chloggen/changes.yaml b/.chloggen/changes.yaml new file mode 100644 index 000000000000..57810a898fb1 --- /dev/null +++ b/.chloggen/changes.yaml @@ -0,0 +1,22 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: 'enhancement' + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: 'splunkenterprisereceiver' + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: "Added a new `splunk.health` metric." + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [1] + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. 
+# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] diff --git a/receiver/splunkenterprisereceiver/documentation.md b/receiver/splunkenterprisereceiver/documentation.md index dfc2559816a4..e5948f0a5699 100644 --- a/receiver/splunkenterprisereceiver/documentation.md +++ b/receiver/splunkenterprisereceiver/documentation.md @@ -41,6 +41,21 @@ Gauge tracking the number of buckets and their searchable status. *Note:** Searc | splunk.host | The name of the splunk host | Any Str | | splunk.indexer.searchable | The searchability status reported for a specific object | Any Str | +### splunk.health + +The status (color) of the Splunk server. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| {status} | Gauge | Int | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| splunk.feature | The Feature name from the Splunk Health Introspection Endpoint | Any Str | +| splunk.feature.health | The Health (in color form) of a Splunk Feature from the Splunk Health Introspection Endpoint | Any Str | + ### splunk.indexer.avg.rate Gauge tracking the average rate of indexed data. **Note:** Search is best run against a Cluster Manager. 
diff --git a/receiver/splunkenterprisereceiver/internal/metadata/generated_config.go b/receiver/splunkenterprisereceiver/internal/metadata/generated_config.go index 4ea3c87bfe5c..f3b59169f00c 100644 --- a/receiver/splunkenterprisereceiver/internal/metadata/generated_config.go +++ b/receiver/splunkenterprisereceiver/internal/metadata/generated_config.go @@ -36,6 +36,7 @@ type MetricsConfig struct { SplunkDataIndexesExtendedEventCount MetricConfig `mapstructure:"splunk.data.indexes.extended.event.count"` SplunkDataIndexesExtendedRawSize MetricConfig `mapstructure:"splunk.data.indexes.extended.raw.size"` SplunkDataIndexesExtendedTotalSize MetricConfig `mapstructure:"splunk.data.indexes.extended.total.size"` + SplunkHealth MetricConfig `mapstructure:"splunk.health"` SplunkIndexerAvgRate MetricConfig `mapstructure:"splunk.indexer.avg.rate"` SplunkIndexerCPUTime MetricConfig `mapstructure:"splunk.indexer.cpu.time"` SplunkIndexerQueueRatio MetricConfig `mapstructure:"splunk.indexer.queue.ratio"` @@ -98,6 +99,9 @@ func DefaultMetricsConfig() MetricsConfig { SplunkDataIndexesExtendedTotalSize: MetricConfig{ Enabled: false, }, + SplunkHealth: MetricConfig{ + Enabled: true, + }, SplunkIndexerAvgRate: MetricConfig{ Enabled: true, }, diff --git a/receiver/splunkenterprisereceiver/internal/metadata/generated_config_test.go b/receiver/splunkenterprisereceiver/internal/metadata/generated_config_test.go index 6dead898ea26..5376660051a3 100644 --- a/receiver/splunkenterprisereceiver/internal/metadata/generated_config_test.go +++ b/receiver/splunkenterprisereceiver/internal/metadata/generated_config_test.go @@ -34,6 +34,7 @@ func TestMetricsBuilderConfig(t *testing.T) { SplunkDataIndexesExtendedEventCount: MetricConfig{Enabled: true}, SplunkDataIndexesExtendedRawSize: MetricConfig{Enabled: true}, SplunkDataIndexesExtendedTotalSize: MetricConfig{Enabled: true}, + SplunkHealth: MetricConfig{Enabled: true}, SplunkIndexerAvgRate: MetricConfig{Enabled: true}, SplunkIndexerCPUTime: 
MetricConfig{Enabled: true}, SplunkIndexerQueueRatio: MetricConfig{Enabled: true}, @@ -81,6 +82,7 @@ func TestMetricsBuilderConfig(t *testing.T) { SplunkDataIndexesExtendedEventCount: MetricConfig{Enabled: false}, SplunkDataIndexesExtendedRawSize: MetricConfig{Enabled: false}, SplunkDataIndexesExtendedTotalSize: MetricConfig{Enabled: false}, + SplunkHealth: MetricConfig{Enabled: false}, SplunkIndexerAvgRate: MetricConfig{Enabled: false}, SplunkIndexerCPUTime: MetricConfig{Enabled: false}, SplunkIndexerQueueRatio: MetricConfig{Enabled: false}, diff --git a/receiver/splunkenterprisereceiver/internal/metadata/generated_metrics.go b/receiver/splunkenterprisereceiver/internal/metadata/generated_metrics.go index c6d6e18c8d5b..272673d43700 100644 --- a/receiver/splunkenterprisereceiver/internal/metadata/generated_metrics.go +++ b/receiver/splunkenterprisereceiver/internal/metadata/generated_metrics.go @@ -474,6 +474,58 @@ func newMetricSplunkDataIndexesExtendedTotalSize(cfg MetricConfig) metricSplunkD return m } +type metricSplunkHealth struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills splunk.health metric with initial data. 
+func (m *metricSplunkHealth) init() { + m.data.SetName("splunk.health") + m.data.SetDescription("The status (color) of the Splunk server.") + m.data.SetUnit("{status}") + m.data.SetEmptyGauge() + m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricSplunkHealth) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, splunkFeatureAttributeValue string, splunkFeatureHealthAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutStr("splunk.feature", splunkFeatureAttributeValue) + dp.Attributes().PutStr("splunk.feature.health", splunkFeatureHealthAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricSplunkHealth) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricSplunkHealth) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricSplunkHealth(cfg MetricConfig) metricSplunkHealth { + m := metricSplunkHealth{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + type metricSplunkIndexerAvgRate struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. 
@@ -2075,6 +2127,7 @@ type MetricsBuilder struct { metricSplunkDataIndexesExtendedEventCount metricSplunkDataIndexesExtendedEventCount metricSplunkDataIndexesExtendedRawSize metricSplunkDataIndexesExtendedRawSize metricSplunkDataIndexesExtendedTotalSize metricSplunkDataIndexesExtendedTotalSize + metricSplunkHealth metricSplunkHealth metricSplunkIndexerAvgRate metricSplunkIndexerAvgRate metricSplunkIndexerCPUTime metricSplunkIndexerCPUTime metricSplunkIndexerQueueRatio metricSplunkIndexerQueueRatio @@ -2141,6 +2194,7 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.Settings, opt metricSplunkDataIndexesExtendedEventCount: newMetricSplunkDataIndexesExtendedEventCount(mbc.Metrics.SplunkDataIndexesExtendedEventCount), metricSplunkDataIndexesExtendedRawSize: newMetricSplunkDataIndexesExtendedRawSize(mbc.Metrics.SplunkDataIndexesExtendedRawSize), metricSplunkDataIndexesExtendedTotalSize: newMetricSplunkDataIndexesExtendedTotalSize(mbc.Metrics.SplunkDataIndexesExtendedTotalSize), + metricSplunkHealth: newMetricSplunkHealth(mbc.Metrics.SplunkHealth), metricSplunkIndexerAvgRate: newMetricSplunkIndexerAvgRate(mbc.Metrics.SplunkIndexerAvgRate), metricSplunkIndexerCPUTime: newMetricSplunkIndexerCPUTime(mbc.Metrics.SplunkIndexerCPUTime), metricSplunkIndexerQueueRatio: newMetricSplunkIndexerQueueRatio(mbc.Metrics.SplunkIndexerQueueRatio), @@ -2246,6 +2300,7 @@ func (mb *MetricsBuilder) EmitForResource(options ...ResourceMetricsOption) { mb.metricSplunkDataIndexesExtendedEventCount.emit(ils.Metrics()) mb.metricSplunkDataIndexesExtendedRawSize.emit(ils.Metrics()) mb.metricSplunkDataIndexesExtendedTotalSize.emit(ils.Metrics()) + mb.metricSplunkHealth.emit(ils.Metrics()) mb.metricSplunkIndexerAvgRate.emit(ils.Metrics()) mb.metricSplunkIndexerCPUTime.emit(ils.Metrics()) mb.metricSplunkIndexerQueueRatio.emit(ils.Metrics()) @@ -2343,6 +2398,11 @@ func (mb *MetricsBuilder) RecordSplunkDataIndexesExtendedTotalSizeDataPoint(ts p 
mb.metricSplunkDataIndexesExtendedTotalSize.recordDataPoint(mb.startTime, ts, val, splunkIndexNameAttributeValue) } +// RecordSplunkHealthDataPoint adds a data point to splunk.health metric. +func (mb *MetricsBuilder) RecordSplunkHealthDataPoint(ts pcommon.Timestamp, val int64, splunkFeatureAttributeValue string, splunkFeatureHealthAttributeValue string) { + mb.metricSplunkHealth.recordDataPoint(mb.startTime, ts, val, splunkFeatureAttributeValue, splunkFeatureHealthAttributeValue) +} + // RecordSplunkIndexerAvgRateDataPoint adds a data point to splunk.indexer.avg.rate metric. func (mb *MetricsBuilder) RecordSplunkIndexerAvgRateDataPoint(ts pcommon.Timestamp, val float64, splunkHostAttributeValue string) { mb.metricSplunkIndexerAvgRate.recordDataPoint(mb.startTime, ts, val, splunkHostAttributeValue) diff --git a/receiver/splunkenterprisereceiver/internal/metadata/generated_metrics_test.go b/receiver/splunkenterprisereceiver/internal/metadata/generated_metrics_test.go index 76406e0fdbb8..7687d538966d 100644 --- a/receiver/splunkenterprisereceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/splunkenterprisereceiver/internal/metadata/generated_metrics_test.go @@ -88,6 +88,10 @@ func TestMetricsBuilder(t *testing.T) { allMetricsCount++ mb.RecordSplunkDataIndexesExtendedTotalSizeDataPoint(ts, 1, "splunk.index.name-val") + defaultMetricsCount++ + allMetricsCount++ + mb.RecordSplunkHealthDataPoint(ts, 1, "splunk.feature-val", "splunk.feature.health-val") + defaultMetricsCount++ allMetricsCount++ mb.RecordSplunkIndexerAvgRateDataPoint(ts, 1, "splunk.host-val") @@ -367,6 +371,24 @@ func TestMetricsBuilder(t *testing.T) { attrVal, ok := dp.Attributes().Get("splunk.index.name") assert.True(t, ok) assert.EqualValues(t, "splunk.index.name-val", attrVal.Str()) + case "splunk.health": + assert.False(t, validatedMetrics["splunk.health"], "Found a duplicate in the metrics slice: splunk.health") + validatedMetrics["splunk.health"] = true + assert.Equal(t, 
pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "The status (color) of the Splunk server.", ms.At(i).Description()) + assert.Equal(t, "{status}", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("splunk.feature") + assert.True(t, ok) + assert.EqualValues(t, "splunk.feature-val", attrVal.Str()) + attrVal, ok = dp.Attributes().Get("splunk.feature.health") + assert.True(t, ok) + assert.EqualValues(t, "splunk.feature.health-val", attrVal.Str()) case "splunk.indexer.avg.rate": assert.False(t, validatedMetrics["splunk.indexer.avg.rate"], "Found a duplicate in the metrics slice: splunk.indexer.avg.rate") validatedMetrics["splunk.indexer.avg.rate"] = true diff --git a/receiver/splunkenterprisereceiver/internal/metadata/testdata/config.yaml b/receiver/splunkenterprisereceiver/internal/metadata/testdata/config.yaml index 90380c4d00e2..aeb4be01c4b3 100644 --- a/receiver/splunkenterprisereceiver/internal/metadata/testdata/config.yaml +++ b/receiver/splunkenterprisereceiver/internal/metadata/testdata/config.yaml @@ -19,6 +19,8 @@ all_set: enabled: true splunk.data.indexes.extended.total.size: enabled: true + splunk.health: + enabled: true splunk.indexer.avg.rate: enabled: true splunk.indexer.cpu.time: @@ -101,6 +103,8 @@ none_set: enabled: false splunk.data.indexes.extended.total.size: enabled: false + splunk.health: + enabled: false splunk.indexer.avg.rate: enabled: false splunk.indexer.cpu.time: diff --git a/receiver/splunkenterprisereceiver/metadata.yaml b/receiver/splunkenterprisereceiver/metadata.yaml index 6ead19fbe79b..d330a112c6a4 100644 --- a/receiver/splunkenterprisereceiver/metadata.yaml +++ b/receiver/splunkenterprisereceiver/metadata.yaml @@ -39,6 +39,12 
@@ attributes:
   splunk.searchartifacts.cache.type:
     description: The search artifacts cache type
     type: string
+  splunk.feature:
+    description: The Feature name from the Splunk Health Introspection Endpoint
+    type: string
+  splunk.feature.health:
+    description: The Health (in color form) of a Splunk Feature from the Splunk Health Introspection Endpoint
+    type: string
 
 metrics:
   splunk.license.index.usage:
@@ -345,6 +351,14 @@ metrics:
       aggregation_temporality: cumulative
       value_type: int
     attributes: [splunk.host]
+  #`services/server/health/splunkd/details`
+  splunk.health:
+    enabled: True
+    description: The status (color) of the Splunk server.
+    unit: "{status}"
+    gauge:
+      value_type: int
+    attributes: [splunk.feature, splunk.feature.health]
 
 tests:
   config:
diff --git a/receiver/splunkenterprisereceiver/scraper.go b/receiver/splunkenterprisereceiver/scraper.go
index d517a2da07af..04bf4d231e59 100644
--- a/receiver/splunkenterprisereceiver/scraper.go
+++ b/receiver/splunkenterprisereceiver/scraper.go
@@ -101,6 +101,7 @@ func (s *splunkScraper) scrape(ctx context.Context) (pmetric.Metrics, error) {
 		s.scrapeIndexerAvgRate,
 		s.scrapeKVStoreStatus,
 		s.scrapeSearchArtifacts,
+		s.scrapeHealth,
 	}
 
 	errChan := make(chan error, len(metricScrapes))
@@ -1075,12 +1076,12 @@ func unmarshallSearchReq(res *http.Response, sr *searchResponse) error {
 
 	body, err := io.ReadAll(res.Body)
 	if err != nil {
-		return fmt.Errorf("Failed to read response: %w", err)
+		return fmt.Errorf("failed to read response: %w", err)
 	}
 
 	err = xml.Unmarshal(body, &sr)
 	if err != nil {
-		return fmt.Errorf("Failed to unmarshall response: %w", err)
+		return fmt.Errorf("failed to unmarshall response: %w", err)
 	}
 
 	return nil
@@ -1733,3 +1734,64 @@ func (s *splunkScraper) scrapeSearchArtifacts(ctx context.Context, now pcommon.T
 		}
 	}
 }
+
+// scrapeHealth queries the Splunk health introspection endpoint
+// (`services/server/health/splunkd/details`) and records one splunk.health
+// data point per feature found in the response, nested features included.
+// It returns immediately when the metric is disabled; request and decode
+// failures are reported on errs.
+func (s *splunkScraper) scrapeHealth(ctx context.Context, now pcommon.Timestamp, errs chan error) {
+	if !s.conf.MetricsBuilderConfig.Metrics.SplunkHealth.Enabled {
+		return
+	}
+
+	ctx = context.WithValue(ctx, endpointType("type"), typeCm)
+
+	ept := apiDict[`SplunkHealth`]
+	var ha HealthArtifacts
+
+	req, err := s.splunkClient.createAPIRequest(ctx, ept)
+	if err != nil {
+		errs <- err
+		return
+	}
+
+	res, err := s.splunkClient.makeRequest(req)
+	if err != nil {
+		errs <- err
+		return
+	}
+	defer res.Body.Close()
+
+	if err := json.NewDecoder(res.Body).Decode(&ha); err != nil {
+		// encoding/json errors do not say which endpoint produced them.
+		errs <- fmt.Errorf("decoding splunk health response: %w", err)
+		return
+	}
+
+	s.settings.Logger.Debug(fmt.Sprintf("Features: %s", ha.Entries))
+	for _, entry := range ha.Entries {
+		// NOTE(review): entry.Content.Health (the top-level status) is not
+		// recorded, only the features beneath it — confirm this is intended.
+		s.traverseHealthDetailFeatures(entry.Content, now)
+	}
+}
+
+// traverseHealthDetailFeatures walks the features below details depth-first
+// and records a splunk.health data point for each one: 0 when the feature
+// reports "red", 1 for any other value. The reported health string is kept
+// as the splunk.feature.health attribute.
+func (s *splunkScraper) traverseHealthDetailFeatures(details HealthDetails, now pcommon.Timestamp) {
+	// Ranging over a nil or empty map is a no-op, so leaves end the recursion.
+	for name, feature := range details.Features {
+		s.settings.Logger.Debug(feature.Health)
+
+		status := int64(1)
+		if feature.Health == "red" {
+			status = 0
+		}
+		s.mb.RecordSplunkHealthDataPoint(now, status, name, feature.Health)
+
+		s.traverseHealthDetailFeatures(feature, now)
+	}
+}
diff --git a/receiver/splunkenterprisereceiver/search_result.go b/receiver/splunkenterprisereceiver/search_result.go
index bd6c4318b016..17d8d6f8440e 100644
--- a/receiver/splunkenterprisereceiver/search_result.go
+++ b/receiver/splunkenterprisereceiver/search_result.go
@@ -25,6 +25,7 @@ var apiDict = map[string]string{
 	`SplunkIntrospectionQueues`: `/services/server/introspection/queues?output_mode=json&count=-1`,
 	`SplunkKVStoreStatus`:       `/services/kvstore/status?output_mode=json`,
 	`SplunkDispatchArtifacts`:   `/services/server/status/dispatch-artifacts?output_mode=json&count=-1`,
+	`SplunkHealth`:              `/services/server/health/splunkd/details?output_mode=json`,
 }
 
 type searchResponse struct {
@@ -156,3 +157,21 @@ type DispatchArtifactContent struct {
 	StatusCacheSize   string `json:"cached_job_status_status_csv_size_mb"`
 	CacheTotalEntries string `json:"cached_job_status_total_entries"`
 }
+
+// HealthArtifacts is the decoded JSON body returned by
+// `/services/server/health/splunkd/details`.
+type HealthArtifacts struct {
+	Entries []HealthArtifactEntry `json:"entry"`
+}
+
+// HealthArtifactEntry is a single element of the response's "entry" array.
+type HealthArtifactEntry struct {
+	Content HealthDetails `json:"content"`
+}
+
+// HealthDetails is one node of the health tree: its own reported health plus
+// any child features keyed by feature name.
+type HealthDetails struct {
+	Health   string                   `json:"health"`
+	Features map[string]HealthDetails `json:"features,omitempty"`
+}