From c6315dc39dc82714deafd67e3d44233059b27385 Mon Sep 17 00:00:00 2001 From: Hisar Balik Date: Fri, 27 Dec 2024 18:15:29 +0100 Subject: [PATCH] introduce GatewayNoDataDelivered alert rule --- .../config/otelcol_rule_builder.go | 14 ++++ internal/selfmonitor/config/rules.go | 1 + internal/selfmonitor/config/rules_test.go | 48 +++++++++++-------- .../prober/otel_pipeline_prober.go | 3 +- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/internal/selfmonitor/config/otelcol_rule_builder.go b/internal/selfmonitor/config/otelcol_rule_builder.go index ffdb1fcf6..606cb1bd8 100644 --- a/internal/selfmonitor/config/otelcol_rule_builder.go +++ b/internal/selfmonitor/config/otelcol_rule_builder.go @@ -11,6 +11,7 @@ const ( metricOtelCollectorExporterQueueCapacity = "otelcol_exporter_queue_capacity" metricOtelCollectorExporterEnqueueFailed = "otelcol_exporter_enqueue_failed" metricOtelCollectorReceiverRefused = "otelcol_receiver_refused" + metricOtelCollectorReceiverAccepted = "otelcol_receiver_accepted" ) type otelCollectorRuleBuilder struct { @@ -26,6 +27,7 @@ func (rb otelCollectorRuleBuilder) rules() []Rule { rb.exporterQueueAlmostFullRule(), rb.exporterEnqueueFailedRule(), rb.receiverRefusedRule(), + rb.noDataDeliveredRule(), } } @@ -90,3 +92,15 @@ func (rb otelCollectorRuleBuilder) receiverRefusedRule() Rule { build(), } } + +func (rb otelCollectorRuleBuilder) noDataDeliveredRule() Rule { + receivedRule := rate(rb.formatMetricName(metricOtelCollectorReceiverAccepted), selectService(rb.serviceName)).sumBy(labelPipelineName).greaterThan(0).build() + + exportedRule := rate(rb.formatMetricName(metricOtelCollectorExporterSent), selectService(rb.serviceName)).sumBy(labelPipelineName).equal(0).build() + + return Rule{ + Alert: rb.namePrefix + RuleNameGatewayNoDataDelivered, + Expr: and(receivedRule, exportedRule), + For: alertWaitTime, + } +} diff --git a/internal/selfmonitor/config/rules.go b/internal/selfmonitor/config/rules.go index 14779ca31..94b5fc98c 100644 
--- a/internal/selfmonitor/config/rules.go +++ b/internal/selfmonitor/config/rules.go @@ -14,6 +14,7 @@ const ( RuleNameGatewayExporterQueueAlmostFull = "GatewayExporterQueueAlmostFull" RuleNameGatewayExporterEnqueueFailed = "GatewayExporterEnqueueFailed" RuleNameGatewayReceiverRefusedData = "GatewayReceiverRefusedData" + RuleNameGatewayNoDataDelivered = "GatewayNoDataDelivered" // Fluent Bit rule names. Note that the actual full names will be prefixed with Log RuleNameLogAgentExporterSentLogs = "AgentExporterSentLogs" diff --git a/internal/selfmonitor/config/rules_test.go b/internal/selfmonitor/config/rules_test.go index 402dffb75..e22d0f384 100644 --- a/internal/selfmonitor/config/rules_test.go +++ b/internal/selfmonitor/config/rules_test.go @@ -14,7 +14,7 @@ func TestMakeRules(t *testing.T) { ruleGroup := rules.Groups[0] require.Equal(t, "default", ruleGroup.Name) - require.Len(t, ruleGroup.Rules, 15) + require.Len(t, ruleGroup.Rules, 17) require.Equal(t, "MetricGatewayExporterSentData", ruleGroup.Rules[0].Alert) require.Equal(t, "sum by (pipeline_name) (rate(otelcol_exporter_sent_metric_points{service=\"telemetry-metric-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[0].Expr) @@ -30,35 +30,41 @@ func TestMakeRules(t *testing.T) { require.Equal(t, "MetricGatewayReceiverRefusedData", ruleGroup.Rules[4].Alert) require.Equal(t, "sum by (receiver) (rate(otelcol_receiver_refused_metric_points{service=\"telemetry-metric-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[4].Expr) - require.Equal(t, "TraceGatewayExporterSentData", ruleGroup.Rules[5].Alert) - require.Equal(t, "sum by (pipeline_name) (rate(otelcol_exporter_sent_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[5].Expr) + require.Equal(t, "MetricGatewayNoDataDelivered", ruleGroup.Rules[5].Alert) + require.Equal(t, "(sum by (pipeline_name) (rate(otelcol_receiver_accepted_metric_points{service=\"telemetry-metric-gateway-metrics\"}[5m])) > 0) and (sum by (pipeline_name) 
(rate(otelcol_exporter_sent_metric_points{service=\"telemetry-metric-gateway-metrics\"}[5m])) == 0)", ruleGroup.Rules[5].Expr) - require.Equal(t, "TraceGatewayExporterDroppedData", ruleGroup.Rules[6].Alert) - require.Equal(t, "sum by (pipeline_name) (rate(otelcol_exporter_send_failed_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[6].Expr) + require.Equal(t, "TraceGatewayExporterSentData", ruleGroup.Rules[6].Alert) + require.Equal(t, "sum by (pipeline_name) (rate(otelcol_exporter_sent_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[6].Expr) - require.Equal(t, "TraceGatewayExporterQueueAlmostFull", ruleGroup.Rules[7].Alert) - require.Equal(t, "max by (pipeline_name) (otelcol_exporter_queue_size{service=\"telemetry-trace-gateway-metrics\"} / ignoring(data_type) otelcol_exporter_queue_capacity{service=\"telemetry-trace-gateway-metrics\"}) > 0.8", ruleGroup.Rules[7].Expr) + require.Equal(t, "TraceGatewayExporterDroppedData", ruleGroup.Rules[7].Alert) + require.Equal(t, "sum by (pipeline_name) (rate(otelcol_exporter_send_failed_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[7].Expr) - require.Equal(t, "TraceGatewayExporterEnqueueFailed", ruleGroup.Rules[8].Alert) - require.Equal(t, "sum by (pipeline_name) (rate(otelcol_exporter_enqueue_failed_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[8].Expr) + require.Equal(t, "TraceGatewayExporterQueueAlmostFull", ruleGroup.Rules[8].Alert) + require.Equal(t, "max by (pipeline_name) (otelcol_exporter_queue_size{service=\"telemetry-trace-gateway-metrics\"} / ignoring(data_type) otelcol_exporter_queue_capacity{service=\"telemetry-trace-gateway-metrics\"}) > 0.8", ruleGroup.Rules[8].Expr) - require.Equal(t, "TraceGatewayReceiverRefusedData", ruleGroup.Rules[9].Alert) - require.Equal(t, "sum by (receiver) (rate(otelcol_receiver_refused_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0", 
ruleGroup.Rules[9].Expr) + require.Equal(t, "TraceGatewayExporterEnqueueFailed", ruleGroup.Rules[9].Alert) + require.Equal(t, "sum by (pipeline_name) (rate(otelcol_exporter_enqueue_failed_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[9].Expr) - require.Equal(t, "LogAgentExporterSentLogs", ruleGroup.Rules[10].Alert) - require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_proc_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[10].Expr) + require.Equal(t, "TraceGatewayReceiverRefusedData", ruleGroup.Rules[10].Alert) + require.Equal(t, "sum by (receiver) (rate(otelcol_receiver_refused_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0", ruleGroup.Rules[10].Expr) - require.Equal(t, "LogAgentExporterDroppedLogs", ruleGroup.Rules[11].Alert) - require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_dropped_records_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[11].Expr) + require.Equal(t, "TraceGatewayNoDataDelivered", ruleGroup.Rules[11].Alert) + require.Equal(t, "(sum by (pipeline_name) (rate(otelcol_receiver_accepted_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) > 0) and (sum by (pipeline_name) (rate(otelcol_exporter_sent_spans{service=\"telemetry-trace-gateway-metrics\"}[5m])) == 0)", ruleGroup.Rules[11].Expr) - require.Equal(t, "LogAgentBufferInUse", ruleGroup.Rules[12].Alert) - require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 300000000", ruleGroup.Rules[12].Expr) + require.Equal(t, "LogAgentExporterSentLogs", ruleGroup.Rules[12].Alert) + require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_proc_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[12].Expr) - require.Equal(t, "LogAgentBufferFull", ruleGroup.Rules[13].Alert) - require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 900000000", 
ruleGroup.Rules[13].Expr) + require.Equal(t, "LogAgentExporterDroppedLogs", ruleGroup.Rules[13].Alert) + require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_dropped_records_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[13].Expr) - require.Equal(t, "LogAgentNoLogsDelivered", ruleGroup.Rules[14].Alert) - require.Equal(t, "(sum by (pipeline_name) (rate(fluentbit_input_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0) and (sum by (pipeline_name) (rate(fluentbit_output_proc_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) == 0)", ruleGroup.Rules[14].Expr) + require.Equal(t, "LogAgentBufferInUse", ruleGroup.Rules[14].Alert) + require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 300000000", ruleGroup.Rules[14].Expr) + + require.Equal(t, "LogAgentBufferFull", ruleGroup.Rules[15].Alert) + require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 900000000", ruleGroup.Rules[15].Expr) + + require.Equal(t, "LogAgentNoLogsDelivered", ruleGroup.Rules[16].Alert) + require.Equal(t, "(sum by (pipeline_name) (rate(fluentbit_input_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0) and (sum by (pipeline_name) (rate(fluentbit_output_proc_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) == 0)", ruleGroup.Rules[16].Expr) } func TestMatchesLogPipelineRule(t *testing.T) { diff --git a/internal/selfmonitor/prober/otel_pipeline_prober.go b/internal/selfmonitor/prober/otel_pipeline_prober.go index 7e4b17ddd..2425b84b7 100644 --- a/internal/selfmonitor/prober/otel_pipeline_prober.go +++ b/internal/selfmonitor/prober/otel_pipeline_prober.go @@ -88,7 +88,8 @@ func (p *OTelPipelineProber) healthy(alerts []promv1.Alert, pipelineName string) return !(p.isFiring(alerts, config.RuleNameGatewayExporterDroppedData, pipelineName) || p.isFiring(alerts, config.RuleNameGatewayExporterQueueAlmostFull, pipelineName) || 
p.isFiring(alerts, config.RuleNameGatewayExporterEnqueueFailed, pipelineName) || - p.isFiring(alerts, config.RuleNameGatewayReceiverRefusedData, pipelineName)) + p.isFiring(alerts, config.RuleNameGatewayReceiverRefusedData, pipelineName) || + p.isFiring(alerts, config.RuleNameGatewayNoDataDelivered, pipelineName)) } func (p *OTelPipelineProber) isFiring(alerts []promv1.Alert, ruleName, pipelineName string) bool {