From a30cc8a5a3824855b43a70413f3b4c5a18f53fad Mon Sep 17 00:00:00 2001 From: gaantunes Date: Fri, 24 May 2024 13:35:42 -0300 Subject: [PATCH] Alloy-Mixin: allow k8s cluster and alloy cluster disable, add logs dashboard (#808) --- operations/alloy-mixin/alerts.libsonnet | 21 +- .../alloy-mixin/alerts/clustering.libsonnet | 144 ++-- .../alloy-mixin/alerts/controller.libsonnet | 49 +- .../alerts/opentelemetry.libsonnet | 55 +- operations/alloy-mixin/config.libsonnet | 12 + operations/alloy-mixin/dashboards.libsonnet | 28 +- .../dashboards/alloy-logs.libsonnet | 35 + .../dashboards/cluster-node.libsonnet | 124 +-- .../dashboards/cluster-overview.libsonnet | 47 +- .../dashboards/controller.libsonnet | 114 +-- .../dashboards/opentelemetry.libsonnet | 74 +- .../dashboards/prometheus.libsonnet | 771 +++++++++--------- .../dashboards/resources.libsonnet | 74 +- .../dashboards/utils/dashboard.jsonnet | 12 +- operations/alloy-mixin/jsonnetfile.json | 30 +- operations/alloy-mixin/jsonnetfile.lock.json | 56 ++ operations/alloy-mixin/mixin.libsonnet | 3 +- 17 files changed, 965 insertions(+), 684 deletions(-) create mode 100644 operations/alloy-mixin/config.libsonnet create mode 100644 operations/alloy-mixin/dashboards/alloy-logs.libsonnet create mode 100644 operations/alloy-mixin/jsonnetfile.lock.json diff --git a/operations/alloy-mixin/alerts.libsonnet b/operations/alloy-mixin/alerts.libsonnet index d8e247fb13..548e41febd 100644 --- a/operations/alloy-mixin/alerts.libsonnet +++ b/operations/alloy-mixin/alerts.libsonnet @@ -1,9 +1,20 @@ +local clusterAlerts = (import './alerts/clustering.libsonnet'); +local controllerAlerts = (import './alerts/controller.libsonnet'); +local openTelemetryAlerts = (import './alerts/opentelemetry.libsonnet'); + { + local alloyClusterAlerts = [clusterAlerts.newAlloyClusterAlertsGroup($._config.enableK8sCluster)], + + local otherAlerts = [ + controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster), + openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster) + ], + prometheusAlerts+: { - groups+: [ - (import './alerts/clustering.libsonnet'), - (import './alerts/controller.libsonnet'), - (import './alerts/opentelemetry.libsonnet'), - ], + groups+: + if $._config.enableAlloyCluster then + alloyClusterAlerts + otherAlerts + else + otherAlerts }, } diff --git a/operations/alloy-mixin/alerts/clustering.libsonnet b/operations/alloy-mixin/alerts/clustering.libsonnet index 2f4f49e13e..9f2411285b 100644 --- a/operations/alloy-mixin/alerts/clustering.libsonnet +++ b/operations/alloy-mixin/alerts/clustering.libsonnet @@ -1,67 +1,91 @@ local alert = import './utils/alert.jsonnet'; -alert.newGroup( - 'alloy_clustering', - [ - // Cluster not converging. - alert.newRule( - 'ClusterNotConverging', - 'stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0', - 'Cluster is not converging: nodes report different number of peers in the cluster.', - '10m', - ), +{ + newAlloyClusterAlertsGroup(enableK8sCluster=true):: + alert.newGroup( + 'alloy_clustering', + [ + // Cluster not converging. 
+ alert.newRule( + 'ClusterNotConverging', + if enableK8sCluster then + 'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0' + else + 'stddev by (job) (sum without (state) (cluster_node_peers)) != 0', + 'Cluster is not converging: nodes report different number of peers in the cluster.', + '10m', + ), - alert.newRule( - 'ClusterNodeCountMismatch', - // Assert that the number of known peers (regardless of state) reported by each - // Alloy instance matches the number of running Alloy instances in the - // same cluster and namespace as reported by a count of Prometheus - // metrics. - ||| - sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) - |||, - 'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.', - '15m', - ), + alert.newRule( + 'ClusterNodeCountMismatch', + // Assert that the number of known peers (regardless of state) reported by each + // Alloy instance matches the number of running Alloy instances in the + // same cluster and namespace as reported by a count of Prometheus + // metrics. + if enableK8sCluster then ||| + sum without (state) (cluster_node_peers) != + on (cluster, namespace, job) group_left + count by (cluster, namespace, job) (cluster_node_info) + ||| else ||| + sum without (state) (cluster_node_peers) != + on (job) group_left + count by (job) (cluster_node_info) + ||| + , + 'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.', + '15m', + ), - // Nodes health score is not zero. - alert.newRule( - 'ClusterNodeUnhealthy', - ||| - cluster_node_gossip_health_score > 0 - |||, - 'Cluster node is reporting a gossip protocol health score > 0.', - '10m', - ), + // Nodes health score is not zero. + alert.newRule( + 'ClusterNodeUnhealthy', + ||| + cluster_node_gossip_health_score > 0 + |||, + 'Cluster node is reporting a gossip protocol health score > 0.', + '10m', + ), - // Node tried to join the cluster with an already-present node name. - alert.newRule( - 'ClusterNodeNameConflict', - 'sum by (cluster, namespace) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0', - 'A node tried to join the cluster with a name conflicting with an existing peer.', - '10m', - ), + // Node tried to join the cluster with an already-present node name. + alert.newRule( + 'ClusterNodeNameConflict', + if enableK8sCluster then + 'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' + else + 'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' + , + 'A node tried to join the cluster with a name conflicting with an existing peer.', + '10m', + ), - // Node stuck in Terminating state. - alert.newRule( - 'ClusterNodeStuckTerminating', - 'sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0', - 'Cluster node stuck in Terminating state.', - '10m', - ), + // Node stuck in Terminating state. 
+ alert.newRule( + 'ClusterNodeStuckTerminating', + if enableK8sCluster then + 'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0' + else + 'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0' + , + 'Cluster node stuck in Terminating state.', + '10m', + ), - // Nodes are not using the same configuration file. - alert.newRule( - 'ClusterConfigurationDrift', - ||| - count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace) cluster_node_info) - ) > 1 - |||, - 'Cluster nodes are not using the same configuration file.', - '5m', - ), - ] -) + // Nodes are not using the same configuration file. + alert.newRule( + 'ClusterConfigurationDrift', + if enableK8sCluster then ||| + count without (sha256) ( + max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + ) > 1 + ||| else ||| + count without (sha256) ( + max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info) + ) > 1 + ||| + , + 'Cluster nodes are not using the same configuration file.', + '5m', + ), + ] + ) +} diff --git a/operations/alloy-mixin/alerts/controller.libsonnet b/operations/alloy-mixin/alerts/controller.libsonnet index 2d43680b84..175c229d11 100644 --- a/operations/alloy-mixin/alerts/controller.libsonnet +++ b/operations/alloy-mixin/alerts/controller.libsonnet @@ -1,22 +1,33 @@ local alert = import './utils/alert.jsonnet'; -alert.newGroup( - 'alloy_controller', - [ - // Component evaluations are taking too long, which can lead to e.g. stale targets. - alert.newRule( - 'SlowComponentEvaluations', - 'sum by (cluster, namespace, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0', - 'Component evaluations are taking too long.', - '15m', - ), +{ + newControllerAlertsGroup(enableK8sCluster=true): + alert.newGroup( + 'alloy_controller', + [ + // Component evaluations are taking too long, which can lead to e.g. stale targets. + alert.newRule( + 'SlowComponentEvaluations', + if enableK8sCluster then + 'sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0' + else + 'sum by (job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0' + , + 'Component evaluations are taking too long.', + '15m', + ), - // Unhealthy components detected. - alert.newRule( - 'UnhealthyComponents', - 'sum by (cluster, namespace) (alloy_component_controller_running_components{health_type!="healthy"}) > 0', - 'Unhealthy components detected.', - '15m', - ), - ] -) + // Unhealthy components detected. + alert.newRule( + 'UnhealthyComponents', + if enableK8sCluster then + 'sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0' + else + 'sum by (job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0' + , + 'Unhealthy components detected.', + '15m', + ), + ] + ) +} diff --git a/operations/alloy-mixin/alerts/opentelemetry.libsonnet b/operations/alloy-mixin/alerts/opentelemetry.libsonnet index e611545a18..23d23c3ad3 100644 --- a/operations/alloy-mixin/alerts/opentelemetry.libsonnet +++ b/operations/alloy-mixin/alerts/opentelemetry.libsonnet @@ -1,25 +1,36 @@ local alert = import './utils/alert.jsonnet'; -alert.newGroup( - 'alloy_otelcol', - [ - // An otelcol.exporter component rcould not push some spans to the pipeline. 
- // This could be due to reaching a limit such as the ones - // imposed by otelcol.processor.memory_limiter. - alert.newRule( - 'OtelcolReceiverRefusedSpans', - 'sum by (cluster, namespace) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0', - 'The receiver could not push some spans to the pipeline.', - '5m', - ), +{ + newOpenTelemetryAlertsGroup(enableK8sCluster=true): + alert.newGroup( + 'alloy_otelcol', + [ + // An otelcol.exporter component rcould not push some spans to the pipeline. + // This could be due to reaching a limit such as the ones + // imposed by otelcol.processor.memory_limiter. + alert.newRule( + 'OtelcolReceiverRefusedSpans', + if enableK8sCluster then + 'sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0' + else + 'sum by (job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0' + , + 'The receiver could not push some spans to the pipeline.', + '5m', + ), - // The exporter failed to send spans to their destination. - // There could be an issue with the payload or with the destination endpoint. - alert.newRule( - 'OtelcolExporterFailedSpans', - 'sum by (cluster, namespace) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0', - 'The exporter failed to send spans to their destination.', - '5m', - ), - ] -) + // The exporter failed to send spans to their destination. + // There could be an issue with the payload or with the destination endpoint. + alert.newRule( + 'OtelcolExporterFailedSpans', + if enableK8sCluster then + 'sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0' + else + 'sum by (job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0' + , + 'The exporter failed to send spans to their destination.', + '5m', + ), + ] + ) +} diff --git a/operations/alloy-mixin/config.libsonnet b/operations/alloy-mixin/config.libsonnet new file mode 100644 index 0000000000..09d7f31e09 --- /dev/null +++ b/operations/alloy-mixin/config.libsonnet @@ -0,0 +1,12 @@ +{ + _config+:: { + enableK8sCluster: true, + enableAlloyCluster: true, + enableLokiLogs: true, + filterSelector: 'job=~"$job"', + groupSelector: if self.enableK8sCluster then self.k8sClusterSelector + ', ' + self.filterSelector else self.filterSelector, + instanceSelector: self.groupSelector + ', instance=~"$instance"', + k8sClusterSelector: 'cluster=~"$cluster", namespace=~"$namespace"', + dashboardTag: 'alloy-mixin' + } +} \ No newline at end of file diff --git a/operations/alloy-mixin/dashboards.libsonnet b/operations/alloy-mixin/dashboards.libsonnet index 661de183dc..281c48c765 100644 --- a/operations/alloy-mixin/dashboards.libsonnet +++ b/operations/alloy-mixin/dashboards.libsonnet @@ -1,9 +1,21 @@ -{ - grafanaDashboards+: - (import './dashboards/controller.libsonnet') + - (import './dashboards/resources.libsonnet') + - (import './dashboards/prometheus.libsonnet') + - (import './dashboards/cluster-node.libsonnet') + - (import './dashboards/opentelemetry.libsonnet') + - (import './dashboards/cluster-overview.libsonnet'), +local alloyClusterDashboards = + (import './dashboards/cluster-node.libsonnet') + + (import './dashboards/cluster-overview.libsonnet') + + (import './config.libsonnet'); + +local otherDashboards = + (import './dashboards/resources.libsonnet') + + (import './dashboards/controller.libsonnet') + + (import './dashboards/prometheus.libsonnet') + + (import './dashboards/opentelemetry.libsonnet') + + (import './config.libsonnet'); + +(import './dashboards/alloy-logs.libsonnet') + +{ + grafanaDashboards+: + if 
$._config.enableAlloyCluster then + alloyClusterDashboards + + otherDashboards + else + otherDashboards } diff --git a/operations/alloy-mixin/dashboards/alloy-logs.libsonnet b/operations/alloy-mixin/dashboards/alloy-logs.libsonnet new file mode 100644 index 0000000000..e1d4e894bd --- /dev/null +++ b/operations/alloy-mixin/dashboards/alloy-logs.libsonnet @@ -0,0 +1,35 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet'; +local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; + +{ + + local labels = if $._config.enableK8sCluster then ['cluster', 'namespace', 'job', 'instance', 'level'] else ['job', 'instance', 'level'], + + grafanaDashboards+: + if $._config.enableLokiLogs then { + local alloyLogs = + logsDashboard.new( + 'Alloy logs overview', + datasourceName='loki_datasource', + datasourceRegex='', + filterSelector=$._config.filterSelector, + labels=labels, + formatParser=null, + showLogsVolume=true + ) + { + panels+: + { + logs+: + // Alloy logs already have timestamp + g.panel.logs.options.withShowTime(false), + }, + dashboards+: + { + logs+: g.dashboard.withLinksMixin($.grafanaDashboards['alloy-resources.json'].links) + + g.dashboard.withRefresh('10s'), + }, + }, + 'alloy-logs.json': alloyLogs.dashboards.logs, + } else {}, +} diff --git a/operations/alloy-mixin/dashboards/cluster-node.libsonnet b/operations/alloy-mixin/dashboards/cluster-node.libsonnet index 0e6241afdc..4d9f417b80 100644 --- a/operations/alloy-mixin/dashboards/cluster-node.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-node.libsonnet @@ -3,25 +3,29 @@ local panel = import './utils/panel.jsonnet'; local filename = 'alloy-cluster-node.json'; { + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}, instance)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job"}, instance)'), + ], + [filename]: - dashboard.new(name='Alloy / Cluster Node') + + dashboard.new(name='Alloy / Cluster Node', tag=$._config.dashboardTag) + dashboard.withDocsLink( url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode', desc='Clustering documentation', ) + - dashboard.withDashboardsLink() + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - dashboard.newTemplateVariable('instance', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) 
- |||), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), @@ -49,22 +53,30 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 0, y: 1, w: 12, h: 8 }) + panel.withQueries([ panel.newNamedInstantQuery( - expr='sum(cluster_node_lamport_time{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr= ||| + sum(cluster_node_lamport_time{%(instanceSelector)s}) + ||| % $._config, refId='Lamport clock time', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_update_observers{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr= ||| + sum(cluster_node_update_observers{%(instanceSelector)s}) + ||| % $._config, refId='Internal cluster state observers', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_gossip_health_score{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr= ||| + sum(cluster_node_gossip_health_score{%(instanceSelector)s}) + ||| % $._config, refId='Gossip health score', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_gossip_proto_version{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr= ||| + sum(cluster_node_gossip_proto_version{%(instanceSelector)s}) + ||| % $._config, refId='Gossip protocol version', format='table', ), @@ -100,7 +112,9 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 12, y: 1, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_node_gossip_received_events_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr= ||| + rate(cluster_node_gossip_received_events_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{event}}' ), ]) @@ -114,7 +128,9 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 0, y: 9, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='sum(cluster_node_peers{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr= ||| + sum(cluster_node_peers{%(instanceSelector)s}) + ||| % $._config, ), ]) + panel.withUnit('suffix:peers') @@ -128,7 +144,9 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 12, y: 9, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='cluster_node_peers{instance="$instance", cluster="$cluster", namespace="$namespace"}', + expr= ||| + cluster_node_peers{%(instanceSelector)s} + ||| % $._config, legendFormat='{{state}}', ), ]) + @@ -150,11 +168,15 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_transport_rx_bytes_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr= ||| + rate(cluster_transport_rx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='rx', ), panel.newQuery( - expr='-1 * rate(cluster_transport_tx_bytes_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr= ||| + -1 * rate(cluster_transport_tx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='tx', ), ]) + @@ -172,21 +194,21 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - 
expr=||| + expr= ||| 1 - ( - rate(cluster_transport_tx_packets_failed_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) / - rate(cluster_transport_tx_packets_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) + rate(cluster_transport_tx_packets_failed_total{%(instanceSelector)s}[$__rate_interval]) / + rate(cluster_transport_tx_packets_total{%(instanceSelector)s}[$__rate_interval]) ) - |||, + ||| % $._config, legendFormat='Tx success %', ), panel.newQuery( - expr=||| + expr= ||| 1 - ( - rate(cluster_transport_rx_packets_failed_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) / - rate(cluster_transport_rx_packets_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) - ) - |||, + rate(cluster_transport_rx_packets_failed_total{%(instanceSelector)s}[$__rate_interval]) / + rate(cluster_transport_rx_packets_total{%(instanceSelector)s}[$__rate_interval]) + ) + ||| % $._config, legendFormat='Rx success %', ), ]) + @@ -208,11 +230,15 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='cluster_transport_tx_packet_queue_length{instance="$instance", cluster="$cluster", namespace="$namespace"}', + expr= ||| + cluster_transport_tx_packet_queue_length{%(instanceSelector)s} + ||| % $._config, legendFormat='tx queue', ), panel.newQuery( - expr='cluster_transport_rx_packet_queue_length{instance="$instance", cluster="$cluster", namespace="$namespace"}', + expr= ||| + cluster_transport_rx_packet_queue_length{%(instanceSelector)s} + ||| % $._config, legendFormat='rx queue', ), ]) + @@ -229,11 +255,15 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_transport_stream_rx_bytes_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr= ||| + rate(cluster_transport_stream_rx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='rx', ), panel.newQuery( - expr='-1 * rate(cluster_transport_stream_tx_bytes_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr= ||| + -1 * rate(cluster_transport_stream_tx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='tx', ), ]) + @@ -251,21 +281,21 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr=||| + expr= ||| 1 - ( - rate(cluster_transport_stream_tx_packets_failed_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) / - rate(cluster_transport_stream_tx_packets_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) - ) - |||, + rate(cluster_transport_stream_tx_packets_failed_total{%(instanceSelector)s}[$__rate_interval]) / + rate(cluster_transport_stream_tx_packets_total{%(instanceSelector)s}[$__rate_interval]) + ) + ||| % $._config, legendFormat='Tx success %' ), panel.newQuery( - expr=||| + expr= ||| 1 - ( - rate(cluster_transport_stream_rx_packets_failed_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) / - rate(cluster_transport_stream_rx_packets_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) - ) - |||, + rate(cluster_transport_stream_rx_packets_failed_total{%(instanceSelector)s}[$__rate_interval]) / + 
rate(cluster_transport_stream_rx_packets_total{%(instanceSelector)s}[$__rate_interval]) + ) + ||| % $._config, legendFormat='Rx success %' ), ]) + @@ -287,7 +317,9 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='cluster_transport_streams{instance="$instance", cluster="$cluster", namespace="$namespace"}', + expr= ||| + cluster_transport_streams{%(instanceSelector)s} + ||| % $._config, legendFormat='Open streams' ), ]) diff --git a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet index 314828cbe4..361eb6f93e 100644 --- a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet @@ -4,22 +4,27 @@ local filename = 'alloy-cluster-overview.json'; local cluster_node_filename = 'alloy-cluster-node.json'; { + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + ], + [filename]: - dashboard.new(name='Alloy / Cluster Overview') + + dashboard.new(name='Alloy / Cluster Overview', tag=$._config.dashboardTag) + dashboard.withDocsLink( url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode', desc='Clustering documentation', ) + - dashboard.withDashboardsLink() + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. 
dashboard.withAnnotations([ dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), @@ -31,7 +36,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 8, x: 0, y: 0 }) + panel.withQueries([ panel.newInstantQuery( - expr='count(cluster_node_info{cluster="$cluster", namespace="$namespace"})' + expr='count(cluster_node_info{%(groupSelector)s})' ), ]) ), @@ -44,7 +49,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 16, x: 8, y: 0 }) + panel.withQueries([ panel.newInstantQuery( - expr='cluster_node_info{cluster="$cluster", namespace="$namespace"}', + expr='cluster_node_info{%(groupSelector)s}', format='table', ), ]) + @@ -97,7 +102,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; { targetBlank: false, title: 'Detail dashboard for node', - url: '/d/%(uid)s/alloy-cluster-node?var-instance=${__data.fields.instance}&var-datasource=${datasource}&var-loki_datasource=${loki_datasource}&var-cluster=${cluster}&var-namespace=${namespace}' % { uid: std.md5(cluster_node_filename) }, + url: '/d/%(uid)s/alloy-cluster-node?var-instance=${__data.fields.instance}&var-datasource=${datasource}&var-loki_datasource=${loki_datasource}&var-job=${job}&var-cluster=${cluster}&var-namespace=${namespace}' % { uid: std.md5(cluster_node_filename) }, }, ], }, @@ -122,14 +127,14 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 8, x: 0, y: 9 }) + panel.withQueries([ panel.newInstantQuery( - expr=||| + expr= ||| clamp(( - sum(stddev by (state) (cluster_node_peers{cluster="$cluster", namespace="$namespace"}) != 0) or - (sum(abs(sum without (state) (cluster_node_peers{cluster="$cluster", namespace="$namespace"})) - scalar(count(cluster_node_info{cluster="$cluster", namespace="$namespace"})) != 0)) + sum(stddev by (state) (cluster_node_peers{%(groupSelector)s}) != 0) or + (sum(abs(sum without (state) (cluster_node_peers{%(groupSelector)s})) - scalar(count(cluster_node_info{%(groupSelector)s})) != 0)) ), 1, 1 ) - |||, + ||| % $._config, format='time_series' ), ]) + @@ -191,14 +196,14 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 16, x: 8, y: 9 }) + panel.withQueries([ panel.newQuery( - expr=||| + expr= ||| ceil(clamp(( - sum(stddev by (state) (cluster_node_peers{cluster="$cluster", namespace="$namespace"})) or - (sum(abs(sum without (state) (cluster_node_peers{cluster="$cluster", namespace="$namespace"})) - scalar(count(cluster_node_info{cluster="$cluster", namespace="$namespace"})))) + sum(stddev by (state) (cluster_node_peers{%(groupSelector)s})) or + (sum(abs(sum without (state) (cluster_node_peers{%(groupSelector)s})) - scalar(count(cluster_node_info{%(groupSelector)s})))) ), 0, 1 )) - |||, + ||| % $._config, legendFormat='Converged' ), ]) + diff --git a/operations/alloy-mixin/dashboards/controller.libsonnet b/operations/alloy-mixin/dashboards/controller.libsonnet index bd6623e80b..aa5b4ce357 100644 --- a/operations/alloy-mixin/dashboards/controller.libsonnet +++ b/operations/alloy-mixin/dashboards/controller.libsonnet @@ -3,22 +3,28 @@ local panel = import './utils/panel.jsonnet'; local filename = 'alloy-controller.json'; { + + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + 
dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + ], + [filename]: - dashboard.new(name='Alloy / Controller') + + dashboard.new(name='Alloy / Controller', tag=$._config.dashboardTag) + dashboard.withDocsLink( url='https://grafana.com/docs/alloy/latest/concepts/component_controller/', desc='Component controller documentation', ) + - dashboard.withDashboardsLink() + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), @@ -34,7 +40,9 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 0, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr='count(alloy_component_controller_evaluating{cluster="$cluster", namespace="$namespace"})', + expr= ||| + count(alloy_component_controller_evaluating{%(groupSelector)s}) + ||| % $._config, ), ]) ), @@ -49,7 +57,9 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 4, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"})', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s}) + ||| % $._config, ), ]) ), @@ -72,10 +82,10 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 8, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr=||| - sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace",health_type="healthy"}) / - sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}) - |||, + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s,health_type="healthy"}) / + sum(alloy_component_controller_running_components{%(groupSelector)s}) + ||| % $._config, ), ]) ), @@ -157,19 +167,27 @@ local filename = 'alloy-controller.json'; panel.withQueries([ panel.newInstantQuery( legendFormat='Healthy', - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="healthy"}) or vector(0)', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s, health_type="healthy"}) or vector(0) + ||| % $._config, ), panel.newInstantQuery( legendFormat='Unhealthy', - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="unhealthy"}) or vector(0)', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s, health_type="unhealthy"}) or vector(0) + ||| % $._config, ), 
panel.newInstantQuery( legendFormat='Unknown', - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="unknown"}) or vector(0)', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s, health_type="unknown"}) or vector(0) + ||| % $._config, ), panel.newInstantQuery( legendFormat='Exited', - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="exited"}) or vector(0)', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s, health_type="exited"}) or vector(0) + ||| % $._config, ), ]) ), @@ -194,7 +212,9 @@ local filename = 'alloy-controller.json'; panel.withMultiTooltip() + panel.withQueries([ panel.newQuery( - expr='sum by (instance) (rate(alloy_component_evaluation_seconds_count{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))', + expr= ||| + sum by (instance) (rate(alloy_component_evaluation_seconds_count{%(groupSelector)s}[$__rate_interval])) + ||| % $._config, ), ]) ), @@ -218,33 +238,33 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 8, y: 12, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - histogram_quantile(0.99, sum(rate(alloy_component_evaluation_seconds{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) + expr= ||| + histogram_quantile(0.99, sum(rate(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))) or - histogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) - |||, + histogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{%(groupSelector)s}[$__rate_interval]))) + ||| % $._config, legendFormat='99th percentile', ), panel.newQuery( - expr=||| - histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) + expr= ||| + histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))) or - histogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) - |||, + histogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{%(groupSelector)s}[$__rate_interval]))) + ||| % $._config, legendFormat='50th percentile', ), panel.newQuery( - expr=||| + expr= ||| ( - histogram_sum(sum(rate(alloy_component_evaluation_seconds{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) / - histogram_count(sum(rate(alloy_component_evaluation_seconds{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) + histogram_sum(sum(rate(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))) / + histogram_count(sum(rate(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))) ) or ( - sum(rate(alloy_component_evaluation_seconds_sum{cluster="$cluster",namespace="$namespace"}[$__rate_interval])) / - sum(rate(alloy_component_evaluation_seconds_count{cluster="$cluster",namespace="$namespace"}[$__rate_interval])) + sum(rate(alloy_component_evaluation_seconds_sum{%(groupSelector)s}[$__rate_interval])) / + sum(rate(alloy_component_evaluation_seconds_count{%(groupSelector)s}[$__rate_interval])) ) - |||, + ||| % $._config, legendFormat='Average', ), ]) @@ -263,10 +283,10 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) + 
       panel.withQueries([
         panel.newQuery(
-          expr=|||
-            sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))
-            / scalar(sum(rate(alloy_component_evaluation_seconds_sum{cluster="$cluster", namespace="$namespace"}[$__rate_interval])))
-          |||,
+          expr= |||
+            sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{%(groupSelector)s}[$__rate_interval]))
+            / scalar(sum(rate(alloy_component_evaluation_seconds_sum{%(groupSelector)s}[$__rate_interval])))
+          ||| % $._config,
           legendFormat='{{component path}} {{component_id}}',
         ),
       ])
@@ -286,11 +306,11 @@ local filename = 'alloy-controller.json';
       panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) +
       panel.withQueries([
         panel.newQuery(
-          expr=|||
-            sum(increase(alloy_component_evaluation_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))
+          expr= |||
+            sum(increase(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))
             or ignoring (le)
-            sum by (le) (increase(alloy_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))
-          |||,
+            sum by (le) (increase(alloy_component_evaluation_seconds_bucket{%(groupSelector)s}[$__rate_interval]))
+          ||| % $._config,
           format='heatmap',
           legendFormat='{{le}}',
         ),
@@ -311,11 +331,11 @@ local filename = 'alloy-controller.json';
       panel.withPosition({ x: 8, y: 22, w: 8, h: 10 }) +
       panel.withQueries([
         panel.newQuery(
-          expr=|||
-            sum(increase(alloy_component_dependencies_wait_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))
+          expr= |||
+            sum(increase(alloy_component_dependencies_wait_seconds{%(groupSelector)s}[$__rate_interval]))
             or ignoring (le)
-            sum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))
-          |||,
+            sum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{%(groupSelector)s}[$__rate_interval]))
+          ||| % $._config,
           format='heatmap',
           legendFormat='{{le}}',
         ),
diff --git a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet
index 04aa577186..c78d6af468 100644
--- a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet
+++ b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet
@@ -15,21 +15,25 @@ local stackedPanelMixin = {
 };
 
 {
+  local templateVariables =
+    if $._config.enableK8sCluster then
+      [
+        dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'),
+        dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'),
+        dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'),
+        dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}, instance)'),
+      ]
+    else
+      [
+        dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'),
+        dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job"}, instance)'),
+      ],
+
   [filename]:
-    dashboard.new(name='Alloy / OpenTelemetry') +
-    dashboard.withDashboardsLink() +
+    dashboard.new(name='Alloy / OpenTelemetry', tag=$._config.dashboardTag) +
+    
dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - dashboard.newMultiTemplateVariable('instance', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) - |||), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + dashboard.withPanelsMixin([ // "Receivers for traces" row ( @@ -45,9 +49,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(receiver_accepted_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr= ||| + rate(receiver_accepted_spans_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, //TODO: How will the dashboard look if there is more than one receiver component? The legend is not unique enough? legendFormat='{{ pod }} / {{ transport }}', ), @@ -63,9 +67,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(receiver_refused_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr= ||| + rate(receiver_refused_spans_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{ pod }} / {{ transport }}', ), ]) @@ -78,7 +82,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 16, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='sum by (le) (increase(rpc_server_duration_milliseconds_bucket{cluster="$cluster", namespace="$namespace", instance=~"$instance", rpc_service="opentelemetry.proto.collector.trace.v1.TraceService"}[$__rate_interval]))', + expr= ||| + sum by (le) (increase(rpc_server_duration_milliseconds_bucket{%(instanceSelector)s, rpc_service="opentelemetry.proto.collector.trace.v1.TraceService"}[$__rate_interval])) + ||| % $._config, format='heatmap', legendFormat='{{le}}', ), @@ -99,7 +105,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]))', + expr= ||| + sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, format='heatmap', legendFormat='{{le}}', ), @@ -116,9 +124,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - processor_batch_metadata_cardinality_ratio{cluster="$cluster", namespace="$namespace", instance=~"$instance"} - |||, + expr= ||| + processor_batch_metadata_cardinality_ratio{%(instanceSelector)s} + ||| % $._config, legendFormat='{{ pod }}', ), ]) @@ -131,9 +139,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 16, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(processor_batch_timeout_trigger_send_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr= ||| + 
rate(processor_batch_timeout_trigger_send_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{ pod }}', ), ]) @@ -153,9 +161,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 20, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(exporter_sent_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr= ||| + rate(exporter_sent_spans_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{ pod }}', ), ]) @@ -169,9 +177,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 20, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(exporter_send_failed_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr= ||| + rate(exporter_send_failed_spans_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{ pod }}', ), ]) diff --git a/operations/alloy-mixin/dashboards/prometheus.libsonnet b/operations/alloy-mixin/dashboards/prometheus.libsonnet index e54b28bd08..b023dcc264 100644 --- a/operations/alloy-mixin/dashboards/prometheus.libsonnet +++ b/operations/alloy-mixin/dashboards/prometheus.libsonnet @@ -2,428 +2,435 @@ local dashboard = import './utils/dashboard.jsonnet'; local panel = import './utils/panel.jsonnet'; local filename = 'alloy-prometheus-remote-write.json'; -local stackedPanelMixin = { - fieldConfig+: { - defaults+: { - custom+: { - fillOpacity: 20, - gradientMode: 'hue', - stacking: { mode: 'normal' }, +{ + local stackedPanelMixin = { + fieldConfig+: { + defaults+: { + custom+: { + fillOpacity: 20, + gradientMode: 'hue', + stacking: { mode: 'normal' }, + }, }, }, }, -}; -local scrapePanels(y_offset) = [ - panel.newRow(title='prometheus.scrape', y=y_offset), + local scrapePanels(y_offset) = [ + panel.newRow(title='prometheus.scrape', y=y_offset), - // Scrape success rate - ( - panel.new(title='Scrape success rate in $cluster', type='timeseries') + - panel.withUnit('percentunit') + - panel.withDescription(||| - Percentage of targets successfully scraped by prometheus.scrape - components. + // Scrape success rate + ( + panel.new(title='Scrape success rate in $cluster', type='timeseries') + + panel.withUnit('percentunit') + + panel.withDescription(||| + Percentage of targets successfully scraped by prometheus.scrape + components. - This metric is calculated by dividing the number of targets - successfully scraped by the total number of targets scraped, - across all the namespaces in the selected cluster. + This metric is calculated by dividing the number of targets + successfully scraped by the total number of targets scraped, + across all the namespaces in the selected cluster. - Low success rates can indicate a problem with scrape targets, - stale service discovery, or Alloy misconfiguration. - |||) + - panel.withPosition({ x: 0, y: 1 + y_offset, w: 12, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum(up{cluster="$cluster"}) - / - count (up{cluster="$cluster"}) - |||, - legendFormat='% of targets successfully scraped', - ), - ]) - ), + Low success rates can indicate a problem with scrape targets, + stale service discovery, or Alloy misconfiguration. 
+ |||) + + panel.withPosition({ x: 0, y: 1 + y_offset, w: 12, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr=||| + sum(up{job=~"$job", cluster=~"$cluster"}) + / + count (up{job=~"$job", cluster=~"$cluster"}) + |||, + legendFormat='% of targets successfully scraped', + ), + ]) + ), - // Scrape duration - ( - panel.new(title='Scrape duration in $cluster', type='timeseries') + - panel.withUnit('s') + - panel.withDescription(||| - Duration of successful scrapes by prometheus.scrape components, - across all the namespaces in the selected cluster. + // Scrape duration + ( + panel.new(title='Scrape duration in $cluster', type='timeseries') + + panel.withUnit('s') + + panel.withDescription(||| + Duration of successful scrapes by prometheus.scrape components, + across all the namespaces in the selected cluster. - This metric should be below your configured scrape interval. - High durations can indicate a problem with a scrape target or - a performance issue with Alloy. - |||) + - panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - quantile(0.99, scrape_duration_seconds{cluster="$cluster"}) - |||, - legendFormat='p99', - ), - panel.newQuery( - expr=||| - quantile(0.95, scrape_duration_seconds{cluster="$cluster"}) - |||, - legendFormat='p95', - ), - panel.newQuery( - expr=||| - quantile(0.50, scrape_duration_seconds{cluster="$cluster"}) - |||, - legendFormat='p50', - ), + This metric should be below your configured scrape interval. + High durations can indicate a problem with a scrape target or + a performance issue with Alloy. + |||) + + panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr=||| + quantile(0.99, scrape_duration_seconds{job=~"$job", cluster=~"$cluster"}) + |||, + legendFormat='p99', + ), + panel.newQuery( + expr=||| + quantile(0.95, scrape_duration_seconds{job=~"$job", cluster=~"$cluster"}) + |||, + legendFormat='p95', + ), + panel.newQuery( + expr=||| + quantile(0.50, scrape_duration_seconds{job=~"$job", cluster=~"$cluster"}) + |||, + legendFormat='p50', + ), - ]) - ), -]; + ]) + ), + ], -local remoteWritePanels(y_offset) = [ - panel.newRow(title='prometheus.remote_write', y=y_offset), + local remoteWritePanels(y_offset) = [ + panel.newRow(title='prometheus.remote_write', y=y_offset), - // WAL delay - ( - panel.new(title='WAL delay', type='timeseries') + - panel.withUnit('s') + - panel.withDescription(||| - How far behind prometheus.remote_write from samples recently written - to the WAL. + // WAL delay + ( + panel.new(title='WAL delay', type='timeseries') + + panel.withUnit('s') + + panel.withDescription(||| + How far behind prometheus.remote_write from samples recently written + to the WAL. - Each endpoint prometheus.remote_write is configured to send metrics - has its own delay. The time shown here is the sum across all - endpoints for the given component. + Each endpoint prometheus.remote_write is configured to send metrics + has its own delay. The time shown here is the sum across all + endpoints for the given component. - It is normal for the WAL delay to be within 1-3 scrape intervals. If - the WAL delay continues to increase beyond that amount, try - increasing the number of maximum shards. 
- |||) + - panel.withPosition({ x: 0, y: 1 + y_offset, w: 6, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum by (instance, component_path, component_id) ( - prometheus_remote_storage_highest_timestamp_in_seconds{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component"} - - ignoring(url, remote_name) group_right(instance) - prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + It is normal for the WAL delay to be within 1-3 scrape intervals. If + the WAL delay continues to increase beyond that amount, try + increasing the number of maximum shards. + |||) + + panel.withPosition({ x: 0, y: 1 + y_offset, w: 6, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by (instance, component_path, component_id) ( + prometheus_remote_storage_highest_timestamp_in_seconds{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"} + - ignoring(url, remote_name) group_right(instance) + prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ) + ||| % $._config, + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Data write throughput - ( - panel.new(title='Data write throughput', type='timeseries') + - stackedPanelMixin + - panel.withUnit('Bps') + - panel.withDescription(||| - Rate of data containing samples and metadata sent by - prometheus.remote_write. - |||) + - panel.withPosition({ x: 6, y: 1 + y_offset, w: 6, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (remote_name, url) ( - rate(prometheus_remote_storage_bytes_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + - rate(prometheus_remote_storage_metadata_bytes_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + // Data write throughput + ( + panel.new(title='Data write throughput', type='timeseries') + + stackedPanelMixin + + panel.withUnit('Bps') + + panel.withDescription(||| + Rate of data containing samples and metadata sent by + prometheus.remote_write. + |||) + + panel.withPosition({ x: 6, y: 1 + y_offset, w: 6, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum without (remote_name, url) ( + rate(prometheus_remote_storage_bytes_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + + rate(prometheus_remote_storage_metadata_bytes_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ) + ||| % $._config, + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Write latency - ( - panel.new(title='Write latency', type='timeseries') + - panel.withUnit('s') + - panel.withDescription(||| - Latency of writes to the remote system made by - prometheus.remote_write. 
- |||) + - panel.withPosition({ x: 12, y: 1 + y_offset, w: 6, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - histogram_quantile(0.99, sum by (le) ( - rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - )) - |||, - legendFormat='99th percentile', - ), - panel.newQuery( - expr=||| - histogram_quantile(0.50, sum by (le) ( - rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - )) - |||, - legendFormat='50th percentile', - ), - panel.newQuery( - expr=||| - sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) / - sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) - |||, - legendFormat='Average', - ), - ]) - ), + // Write latency + ( + panel.new(title='Write latency', type='timeseries') + + panel.withUnit('s') + + panel.withDescription(||| + Latency of writes to the remote system made by + prometheus.remote_write. + |||) + + panel.withPosition({ x: 12, y: 1 + y_offset, w: 6, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + histogram_quantile(0.99, sum by (le) ( + rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + )) + ||| % $._config, + legendFormat='99th percentile', + ), + panel.newQuery( + expr= ||| + histogram_quantile(0.50, sum by (le) ( + rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + )) + ||| % $._config, + legendFormat='50th percentile', + ), + panel.newQuery( + expr= ||| + sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) / + sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) + ||| % $._config, + legendFormat='Average', + ), + ]) + ), - // Shards - ( - local minMaxOverride = { - properties: [{ - id: 'custom.lineStyle', - value: { - dash: [10, 15], - fill: 'dash', - }, - }, { - id: 'custom.showPoints', - value: 'never', - }, { - id: 'custom.hideFrom', - value: { - legend: true, - tooltip: false, - viz: false, - }, - }], - }; + // Shards + ( + local minMaxOverride = { + properties: [{ + id: 'custom.lineStyle', + value: { + dash: [10, 15], + fill: 'dash', + }, + }, { + id: 'custom.showPoints', + value: 'never', + }, { + id: 'custom.hideFrom', + value: { + legend: true, + tooltip: false, + viz: false, + }, + }], + }; - panel.new(title='Shards', type='timeseries') { - fieldConfig+: { - overrides: [ - minMaxOverride { matcher: { id: 'byName', options: 'Minimum' } }, - minMaxOverride { matcher: { id: 'byName', options: 'Maximum' } }, - ], - }, - } + - panel.withUnit('none') + - 
panel.withDescription(||| - Total number of shards which are concurrently sending samples read - from the Write-Ahead Log. + panel.new(title='Shards', type='timeseries') { + fieldConfig+: { + overrides: [ + minMaxOverride { matcher: { id: 'byName', options: 'Minimum' } }, + minMaxOverride { matcher: { id: 'byName', options: 'Maximum' } }, + ], + }, + } + + panel.withUnit('none') + + panel.withDescription(||| + Total number of shards which are concurrently sending samples read + from the Write-Ahead Log. - Shards are bound to a minimum and maximum, displayed on the graph. - The lowest minimum and the highest maximum across all clients is - shown. + Shards are bound to a minimum and maximum, displayed on the graph. + The lowest minimum and the highest maximum across all clients is + shown. - Each client has its own set of shards, minimum shards, and maximum - shards; filter to a specific URL to display more granular - information. - |||) + - panel.withPosition({ x: 18, y: 1 + y_offset, w: 6, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (remote_name, url) ( - prometheus_remote_storage_shards{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - panel.newQuery( - expr=||| - min ( - prometheus_remote_storage_shards_min{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - ) - |||, - legendFormat='Minimum', - ), - panel.newQuery( - expr=||| - max ( - prometheus_remote_storage_shards_max{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - ) - |||, - legendFormat='Maximum', - ), - ]) - ), + Each client has its own set of shards, minimum shards, and maximum + shards; filter to a specific URL to display more granular + information. + |||) + + panel.withPosition({ x: 18, y: 1 + y_offset, w: 6, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum without (remote_name, url) ( + prometheus_remote_storage_shards{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ) + ||| % $._config, + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + panel.newQuery( + expr= ||| + min ( + prometheus_remote_storage_shards_min{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ) + ||| % $._config, + legendFormat='Minimum', + ), + panel.newQuery( + expr= ||| + max ( + prometheus_remote_storage_shards_max{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ) + ||| % $._config, + legendFormat='Maximum', + ), + ]) + ), - // Sent samples / second - ( - panel.new(title='Sent samples / second', type='timeseries') + - stackedPanelMixin + - panel.withUnit('cps') + - panel.withDescription(||| - Total outgoing samples sent by prometheus.remote_write. 
- |||) + - panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (url, remote_name) ( - rate(prometheus_remote_storage_samples_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + // Sent samples / second + ( + panel.new(title='Sent samples / second', type='timeseries') + + stackedPanelMixin + + panel.withUnit('cps') + + panel.withDescription(||| + Total outgoing samples sent by prometheus.remote_write. + |||) + + panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum without (url, remote_name) ( + rate(prometheus_remote_storage_samples_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ) + ||| % $._config, + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Failed samples / second - ( - panel.new(title='Failed samples / second', type='timeseries') + - stackedPanelMixin + - panel.withUnit('cps') + - panel.withDescription(||| - Rate of samples which prometheus.remote_write could not send due to - non-recoverable errors. - |||) + - panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (url,remote_name) ( - rate(prometheus_remote_storage_samples_failed_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + // Failed samples / second + ( + panel.new(title='Failed samples / second', type='timeseries') + + stackedPanelMixin + + panel.withUnit('cps') + + panel.withDescription(||| + Rate of samples which prometheus.remote_write could not send due to + non-recoverable errors. + |||) + + panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum without (url,remote_name) ( + rate(prometheus_remote_storage_samples_failed_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ) + ||| % $._config, + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Retried samples / second - ( - panel.new(title='Retried samples / second', type='timeseries') + - stackedPanelMixin + - panel.withUnit('cps') + - panel.withDescription(||| - Rate of samples which prometheus.remote_write attempted to resend - after receiving a recoverable error. 
- |||) + - panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (url,remote_name) ( - rate(prometheus_remote_storage_samples_retried_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + // Retried samples / second + ( + panel.new(title='Retried samples / second', type='timeseries') + + stackedPanelMixin + + panel.withUnit('cps') + + panel.withDescription(||| + Rate of samples which prometheus.remote_write attempted to resend + after receiving a recoverable error. + |||) + + panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum without (url,remote_name) ( + rate(prometheus_remote_storage_samples_retried_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ) + ||| % $._config, + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Active series (Total) - ( - panel.new(title='Active series (total)', type='timeseries') { - options+: { - legend+: { - showLegend: false, + // Active series (Total) + ( + panel.new(title='Active series (total)', type='timeseries') { + options+: { + legend+: { + showLegend: false, + }, }, - }, - } + - panel.withUnit('short') + - panel.withDescription(||| - Total number of active series across all components. + } + + panel.withUnit('short') + + panel.withDescription(||| + Total number of active series across all components. - An "active series" is a series that prometheus.remote_write recently - received a sample for. Active series are garbage collected whenever a - truncation of the WAL occurs. - |||) + - panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum(prometheus_remote_write_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}) - |||, - legendFormat='Series', - ), - ]) - ), + An "active series" is a series that prometheus.remote_write recently + received a sample for. Active series are garbage collected whenever a + truncation of the WAL occurs. + |||) + + panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum(prometheus_remote_write_wal_storage_active_series{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}) + ||| % $._config, + legendFormat='Series', + ), + ]) + ), - // Active series (by instance/component) - ( - panel.new(title='Active series (by instance/component)', type='timeseries') + - panel.withUnit('short') + - panel.withDescription(||| - Total number of active series which are currently being tracked by - prometheus.remote_write components, with separate lines for each Alloy instance. + // Active series (by instance/component) + ( + panel.new(title='Active series (by instance/component)', type='timeseries') + + panel.withUnit('short') + + panel.withDescription(||| + Total number of active series which are currently being tracked by + prometheus.remote_write components, with separate lines for each Alloy instance. - An "active series" is a series that prometheus.remote_write recently - received a sample for. 
Active series are garbage collected whenever a - truncation of the WAL occurs. - |||) + - panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - prometheus_remote_write_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + An "active series" is a series that prometheus.remote_write recently + received a sample for. Active series are garbage collected whenever a + truncation of the WAL occurs. + |||) + + panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + prometheus_remote_write_wal_storage_active_series{%(instanceSelector)s, component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ||| % $._config, + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Active series (by component) - ( - panel.new(title='Active series (by component)', type='timeseries') + - panel.withUnit('short') + - panel.withDescription(||| - Total number of active series which are currently being tracked by - prometheus.remote_write components, aggregated across all instances. + // Active series (by component) + ( + panel.new(title='Active series (by component)', type='timeseries') + + panel.withUnit('short') + + panel.withDescription(||| + Total number of active series which are currently being tracked by + prometheus.remote_write components, aggregated across all instances. - An "active series" is a series that prometheus.remote_write recently - received a sample for. Active series are garbage collected whenever a - truncation of the WAL occurs. - |||) + - panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum by (component_path, component_id) (prometheus_remote_write_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"}) - |||, - legendFormat='{{component_path}} {{component_id}}', - ), - ]) - ), -]; + An "active series" is a series that prometheus.remote_write recently + received a sample for. Active series are garbage collected whenever a + truncation of the WAL occurs. + |||) + + panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by (component_path, component_id) (prometheus_remote_write_wal_storage_active_series{%(instanceSelector)s, component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"}) + ||| % $._config, + legendFormat='{{component_path}} {{component_id}}', + ), + ]) + ), + ], + + local panels = + if $._config.enableK8sCluster then + // First row, offset is 0 + scrapePanels(y_offset=0) + + // Scrape panels take 11 units, so offset next row by 11. 
+ remoteWritePanels(y_offset=11) + else + remoteWritePanels(y_offset=0), + + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}, instance)'), + dashboard.newMultiTemplateVariable('component_path', 'label_values(prometheus_remote_write_wal_samples_appended_total{cluster=~"$cluster", namespace=~"$namespace", job=~"$job", instance=~"$instance", component_id=~"prometheus.remote_write.*", component_path=~".*"}, component_path)'), + dashboard.newMultiTemplateVariable('component', 'label_values(prometheus_remote_write_wal_samples_appended_total{cluster=~"$cluster", namespace=~"$namespace", job=~"$job", instance=~"$instance", component_id=~"prometheus.remote_write.*"}, component_id)'), + dashboard.newMultiTemplateVariable('url', 'label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~"$cluster", namespace=~"$namespace", job="$job", instance=~"$instance", component_id=~"$component"}, url)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job"}, instance)'), + dashboard.newMultiTemplateVariable('component_path', 'label_values(prometheus_remote_write_wal_samples_appended_total{job=~"$job", instance=~"$instance", component_id=~"prometheus.remote_write.*", component_path=~".*"}, component_path)'), + dashboard.newMultiTemplateVariable('component', 'label_values(prometheus_remote_write_wal_samples_appended_total{job=~"$job", instance=~"$instance", component_id=~"prometheus.remote_write.*"}, component_id)'), + dashboard.newMultiTemplateVariable('url', 'label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{job=~"$job", instance=~"$instance", component_id=~"$component"}, url)'), + ], -{ [filename]: - dashboard.new(name='Alloy / Prometheus Components') + + dashboard.new(name='Alloy / Prometheus Components', tag=$._config.dashboardTag) + dashboard.withDocsLink( url='https://grafana.com/docs/alloy/latest/reference/components/prometheus.remote_write/', desc='Component documentation', ) + - dashboard.withDashboardsLink() + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - dashboard.newMultiTemplateVariable('instance', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) - |||), - dashboard.newMultiTemplateVariable('component_path', ||| - label_values(prometheus_remote_write_wal_samples_appended_total{cluster="$cluster", namespace="$namespace", 
instance=~"$instance", component_id=~"prometheus\\.remote_write\\..*", component_path=~".*"}, component_path) - |||), - dashboard.newMultiTemplateVariable('component', ||| - label_values(prometheus_remote_write_wal_samples_appended_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"prometheus\\.remote_write\\..*"}, component_id) - |||), - dashboard.newMultiTemplateVariable('url', ||| - label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component"}, url) - |||), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ - dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), + dashboard.newLokiAnnotation('Deployments', '{cluster=~"$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), ]) + dashboard.withPanelsMixin( - // First row, offset is 0 - scrapePanels(y_offset=0) + - // Scrape panels take 11 units, so offset next row by 11. - remoteWritePanels(y_offset=11) + panels ), } diff --git a/operations/alloy-mixin/dashboards/resources.libsonnet b/operations/alloy-mixin/dashboards/resources.libsonnet index 8d38b7c789..7aba016d98 100644 --- a/operations/alloy-mixin/dashboards/resources.libsonnet +++ b/operations/alloy-mixin/dashboards/resources.libsonnet @@ -27,21 +27,25 @@ local stackedPanelMixin = { }; { + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}, instance)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job"}, instance)'), + ], + [filename]: - dashboard.new(name='Alloy / Resources') + - dashboard.withDashboardsLink() + + dashboard.new(name='Alloy / Resources', tag=$._config.dashboardTag) + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - dashboard.newMultiTemplateVariable('instance', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) - |||), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. 
dashboard.withAnnotations([ dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), @@ -59,7 +63,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 0, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='rate(alloy_resources_process_cpu_seconds_total{cluster="$cluster",namespace="$namespace",instance=~"$instance"}[$__rate_interval])', + expr= ||| + rate(alloy_resources_process_cpu_seconds_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -75,7 +81,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 12, y: 0, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='alloy_resources_process_resident_memory_bytes{cluster="$cluster",namespace="$namespace",instance=~"$instance"}', + expr= ||| + alloy_resources_process_resident_memory_bytes{%(instanceSelector)s} + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -95,11 +103,11 @@ local stackedPanelMixin = { // Lots of programs export go_goroutines so we ignore anything that // doesn't also have an Alloy-specific metric (i.e., // alloy_build_info). - expr=||| - rate(go_gc_duration_seconds_count{cluster="$cluster",namespace="$namespace",instance=~"$instance"}[5m]) + expr= ||| + rate(go_gc_duration_seconds_count{%(instanceSelector)s}[5m]) and on(instance) - alloy_build_info{cluster="$cluster",namespace="$namespace",instance=~"$instance"} - |||, + alloy_build_info{%(instanceSelector)s} + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -119,11 +127,11 @@ local stackedPanelMixin = { // Lots of programs export go_goroutines so we ignore anything that // doesn't also have an Alloy-specific metric (i.e., // alloy_build_info). - expr=||| - go_goroutines{cluster="$cluster",namespace="$namespace",instance=~"$instance"} + expr= ||| + go_goroutines{%(instanceSelector)s} and on(instance) - alloy_build_info{cluster="$cluster",namespace="$namespace",instance=~"$instance"} - |||, + alloy_build_info{%(instanceSelector)s} + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -142,11 +150,11 @@ local stackedPanelMixin = { // Lots of programs export go_memstats_heap_inuse_bytes so we ignore // anything that doesn't also have an Alloy-specific metric // (i.e., alloy_build_info). 
- expr=||| - go_memstats_heap_inuse_bytes{cluster="$cluster",namespace="$namespace",instance=~"$instance"} + expr= ||| + go_memstats_heap_inuse_bytes{%(instanceSelector)s} and on(instance) - alloy_build_info{cluster="$cluster",namespace="$namespace",instance=~"$instance"} - |||, + alloy_build_info{%(instanceSelector)s} + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -167,9 +175,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 16, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(alloy_resources_machine_rx_bytes_total{cluster="$cluster",namespace="$namespace",instance=~"$instance"}[$__rate_interval]) - |||, + expr= ||| + rate(alloy_resources_machine_rx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -190,9 +198,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 12, y: 16, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(alloy_resources_machine_tx_bytes_total{cluster="$cluster",namespace="$namespace",instance=~"$instance"}[$__rate_interval]) - |||, + expr= ||| + rate(alloy_resources_machine_tx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{instance}}' ), ]) diff --git a/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet b/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet index 40b5f85ced..09135d023a 100644 --- a/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet +++ b/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet @@ -2,13 +2,13 @@ // schemaVersion present in Grafana 9. { - new(name=''):: { + new(name='', tag='alloy-mixin'):: { title: name, timezone: 'utc', refresh: '10s', schemaVersion: 36, graphTooltip: 1, // shared crosshair for all graphs - tags: ['alloy-mixin'], + tags: [tag], templating: { list: [{ name: 'datasource', @@ -76,7 +76,7 @@ }, datasource: '${datasource}', refresh: 2, - sort: 2, + sort: 2, }, newLokiAnnotation(name, expression, color):: { @@ -90,9 +90,9 @@ }, newMultiTemplateVariable(name, query):: $.newTemplateVariable(name, query) { + multi: true, allValue: '.*', includeAll: true, - multi: true, }, withPanelsMixin(panels):: { panels+: panels }, @@ -114,7 +114,7 @@ }], }, - withDashboardsLink():: { + withDashboardsLink(tag='alloy-mixin'):: { links+: [{ title: 'Dashboards', type: 'dashboards', @@ -122,7 +122,7 @@ icon: 'external link', includeVars: true, keepTime: true, - tags: ['alloy-mixin'], + tags: [tag], targetBlank: false, }], }, diff --git a/operations/alloy-mixin/jsonnetfile.json b/operations/alloy-mixin/jsonnetfile.json index 4388812ca2..fb40c35599 100644 --- a/operations/alloy-mixin/jsonnetfile.json +++ b/operations/alloy-mixin/jsonnetfile.json @@ -1,5 +1,33 @@ { "version": 1, - "dependencies": [], + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib/" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib/logs" + } + }, + "version": "master" + } + ], "legacyImports": true } diff --git a/operations/alloy-mixin/jsonnetfile.lock.json b/operations/alloy-mixin/jsonnetfile.lock.json new file mode 100644 index 0000000000..97201a3be0 --- /dev/null +++ b/operations/alloy-mixin/jsonnetfile.lock.json @@ -0,0 +1,56 @@ +{ + "version": 1, + "dependencies": [ 
+ { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" + } + }, + "version": "1c56af39815c4903e47c27194444456f005f65df", + "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib/" + } + }, + "version": "21526e83f442793d5a0c5969867d123915422b79", + "sum": "IkBo9nj0Qt1eC9w80dO5SI4yvHzmmXcKx5BK8H8U0Mk=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib/logs" + } + }, + "version": "21526e83f442793d5a0c5969867d123915422b79", + "sum": "CemcPbsPzyRUchDLH1TKTxWWgBlg1MRT0jH2X172z6w=" + }, + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/docsonnet.git", + "subdir": "doc-util" + } + }, + "version": "6ac6c69685b8c29c54515448eaca583da2d88150", + "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U=" + }, + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/xtd.git", + "subdir": "" + } + }, + "version": "63d430b69a95741061c2f7fc9d84b1a778511d9c", + "sum": "qiZi3axUSXCVzKUF83zSAxklwrnitMmrDK4XAfjPMdE=" + } + ], + "legacyImports": false +} diff --git a/operations/alloy-mixin/mixin.libsonnet b/operations/alloy-mixin/mixin.libsonnet index 741c943035..cd32e269c6 100644 --- a/operations/alloy-mixin/mixin.libsonnet +++ b/operations/alloy-mixin/mixin.libsonnet @@ -1,3 +1,4 @@ { grafanaDashboardFolder: 'Alloy' } + (import './dashboards.libsonnet') + -(import './alerts.libsonnet') +(import './alerts.libsonnet') + +(import './config.libsonnet')
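
Usage note: the hunks above route every panel query through the new _config object ("%(instanceSelector)s ... ||| % $._config") and gate the cluster/namespace template variables behind $._config.enableK8sCluster, while dashboard tags and the shared "Dashboards" link are driven by $._config.dashboardTag. The sketch below shows how a downstream Jsonnet project might override those knobs when consuming the mixin. It is illustrative only: the file name and import path are assumptions, the real defaults live in the new config.libsonnet (not reproduced in this excerpt), and the instanceSelector value shown is just one selector shape consistent with how the dashboards interpolate it.

// alloy-mixin-custom.libsonnet (hypothetical consumer-side file)
local mixin = import 'operations/alloy-mixin/mixin.libsonnet';

mixin {
  _config+:: {
    // Drop the Kubernetes cluster/namespace template variables and use the
    // job/instance-only variants of the dashboard queries.
    enableK8sCluster: false,

    // Tag stamped on every generated dashboard and used by the shared
    // "Dashboards" link, so several mixin copies can coexist in one Grafana.
    dashboardTag: 'alloy-single-node',

    // Label matchers interpolated into panel queries via %(instanceSelector)s;
    // without the k8s labels only job/instance remain. Override only if the
    // default derived in config.libsonnet does not fit your labels.
    instanceSelector: 'job=~"$job", instance=~"$instance"',
  },
}

Rendering this file with jsonnet -J vendor (assuming the dependencies pinned in jsonnetfile.lock.json have been vendored with jb install) should produce the same dashboard and alert outputs as the stock mixin, just without the Kubernetes-specific selectors and template variables.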