From 828c888442e40990b5e1f26580a099c20c2b4f5e Mon Sep 17 00:00:00 2001 From: Milos Zivkovic Date: Thu, 9 May 2024 19:33:31 +0200 Subject: [PATCH] Add initial telemetry Docker example --- tm2/pkg/telemetry/README.md | 23 +- tm2/pkg/telemetry/docker/Makefile | 11 + .../telemetry/docker/collector/collector.yaml | 22 + tm2/pkg/telemetry/docker/docker-compose.yml | 34 ++ .../telemetry/docker/grafana/dashboards.yaml | 8 + .../telemetry/docker/grafana/datasources.yaml | 13 + .../docker/grafana/gno-dashboards.json | 541 ++++++++++++++++++ .../docker/prometheus/prometheus.yml | 7 + tm2/pkg/telemetry/metrics/metrics.go | 6 +- 9 files changed, 655 insertions(+), 10 deletions(-) create mode 100644 tm2/pkg/telemetry/docker/Makefile create mode 100644 tm2/pkg/telemetry/docker/collector/collector.yaml create mode 100644 tm2/pkg/telemetry/docker/docker-compose.yml create mode 100644 tm2/pkg/telemetry/docker/grafana/dashboards.yaml create mode 100644 tm2/pkg/telemetry/docker/grafana/datasources.yaml create mode 100644 tm2/pkg/telemetry/docker/grafana/gno-dashboards.json create mode 100644 tm2/pkg/telemetry/docker/prometheus/prometheus.yml diff --git a/tm2/pkg/telemetry/README.md b/tm2/pkg/telemetry/README.md index cadaecc89ab..783b00de789 100644 --- a/tm2/pkg/telemetry/README.md +++ b/tm2/pkg/telemetry/README.md @@ -1,6 +1,7 @@ # Telemetry -The purpose of this package is to provide a way to easily integrate OpenTelemetry Protocol (OTLP) metrics collection into a Tendermint 2 node. +The purpose of this package is to provide a way to easily integrate OpenTelemetry Protocol (OTLP) metrics collection +into a Tendermint 2 node. ## Configure Telemetry @@ -8,10 +9,16 @@ Telemetry can be regularly configured within the TM2 node through the `[telemetry]` section. It is disabled by default. ## OTEL configuration -There are many ways configure the OTEL pipeline for exporting metrics. Here is an example of how a local OTEL collector can be configured to send metrics to Grafana Cloud. 
This is an optional step and can be highly customized. + +There are many ways to configure the OTEL pipeline for exporting metrics. Here is an example of how a local OTEL collector +can be configured to send metrics to Grafana Cloud. This is an optional step and can be highly customized. ### OTEL collector -The latest collector releases can be found [here](https://github.com/open-telemetry/opentelemetry-collector-releases/releases). This is an example of the config that can be used to receive metrics from gno.land and publish them to Grafana Cloud. + +The latest collector releases can be +found [here](https://github.com/open-telemetry/opentelemetry-collector-releases/releases). This is an example of the +config that can be used to receive metrics from gno.land and publish them to Grafana Cloud. + ```yaml receivers: otlp: @@ -29,13 +36,15 @@ exporters: service: pipelines: metrics: - receivers: [otlp] - processors: [batch] - exporters: [otlphttp] + receivers: [ otlp ] + processors: [ batch ] + exporters: [ otlphttp ] ``` -Collector exporter environment variables, including those for authentication, can be found [here](https://opentelemetry.io/docs/specs/otel/protocol/exporter/). +Collector exporter environment variables, including those for authentication, can be +found [here](https://opentelemetry.io/docs/specs/otel/protocol/exporter/). 
## Resources + - https://opentelemetry.io/docs/collector/ - https://grafana.com/docs/grafana-cloud/monitor-applications/application-observability/setup/collector/ diff --git a/tm2/pkg/telemetry/docker/Makefile b/tm2/pkg/telemetry/docker/Makefile new file mode 100644 index 00000000000..d9dd7f98243 --- /dev/null +++ b/tm2/pkg/telemetry/docker/Makefile @@ -0,0 +1,11 @@ +.PHONY: up +up: + docker compose up -d + +.PHONY: down +down: + docker compose down + +.PHONY: clean +clean: + docker compose down -v \ No newline at end of file diff --git a/tm2/pkg/telemetry/docker/collector/collector.yaml b/tm2/pkg/telemetry/docker/collector/collector.yaml new file mode 100644 index 00000000000..d258dd025bb --- /dev/null +++ b/tm2/pkg/telemetry/docker/collector/collector.yaml @@ -0,0 +1,22 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +processors: + batch: + +exporters: + prometheus: + endpoint: collector:8090 + +service: + telemetry: + logs: + level: "debug" + pipelines: + metrics: + receivers: [ otlp ] + processors: [ batch ] + exporters: [ prometheus ] diff --git a/tm2/pkg/telemetry/docker/docker-compose.yml b/tm2/pkg/telemetry/docker/docker-compose.yml new file mode 100644 index 00000000000..f36b9120c8d --- /dev/null +++ b/tm2/pkg/telemetry/docker/docker-compose.yml @@ -0,0 +1,34 @@ +services: + collector: + image: otel/opentelemetry-collector-contrib:latest + ports: + - "4317:4317" + - "4318:4318" + - "8090" + volumes: + - ./collector/collector.yaml:/etc/otelcol-contrib/config.yaml + prometheus: + image: prom/prometheus:latest + command: + - "--enable-feature=remote-write-receiver" + - "--config.file=/etc/prometheus/prometheus.yml" + ports: + - "9090:9090" + volumes: + - prometheus:/prometheus + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + grafana: + image: grafana/grafana-enterprise + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml + - 
./grafana/dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml + - ./grafana/gno-dashboards.json:/var/lib/grafana/dashboards/gno-dashboards.json + ports: + - "3000:3000" + +volumes: + prometheus: + driver: local + grafana_data: + driver: local diff --git a/tm2/pkg/telemetry/docker/grafana/dashboards.yaml b/tm2/pkg/telemetry/docker/grafana/dashboards.yaml new file mode 100644 index 00000000000..6a70278b8a1 --- /dev/null +++ b/tm2/pkg/telemetry/docker/grafana/dashboards.yaml @@ -0,0 +1,8 @@ +apiVersion: 1 + +providers: + - name: Gno Node Metrics + folder: Gno + type: file + options: + path: /var/lib/grafana/dashboards \ No newline at end of file diff --git a/tm2/pkg/telemetry/docker/grafana/datasources.yaml b/tm2/pkg/telemetry/docker/grafana/datasources.yaml new file mode 100644 index 00000000000..917b8a544f1 --- /dev/null +++ b/tm2/pkg/telemetry/docker/grafana/datasources.yaml @@ -0,0 +1,13 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + orgId: 1 + url: http://prometheus:9090 + basicAuth: false + isDefault: false + version: 1 + editable: true + uid: prometheus diff --git a/tm2/pkg/telemetry/docker/grafana/gno-dashboards.json b/tm2/pkg/telemetry/docker/grafana/gno-dashboards.json new file mode 100644 index 00000000000..609ee1605ff --- /dev/null +++ b/tm2/pkg/telemetry/docker/grafana/gno-dashboards.json @@ -0,0 +1,541 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "All Gno node metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "title": "General", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "description": "The total number of transactions on the network", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "txs" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "txs_counter_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Total Network Transactions", + "type": "stat" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 9, + "title": "Consensus", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 3, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.2", 
+ "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(build_block_hist_milliseconds_sum[60m])/rate(build_block_hist_milliseconds_count[60m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Average Block Build Time [60min]", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 5000 + }, + { + "color": "red", + "value": 10000 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(block_interval_hist_seconds_sum[60m])/rate(block_interval_hist_seconds_count[60m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Average Block Interval [60min]", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 8, + "panels": [], + "title": "Misc", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(broadcast_tx_hist_milliseconds_sum[60m])/rate(broadcast_tx_hist_milliseconds_count[60m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Average Transaction Broadcast Duration [60min]", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 7, + "panels": [], + "title": "JSON-RPC", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 2000 + }, + { + "color": "red", + "value": 5000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 5, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + 
"showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(http_request_time_hist_milliseconds_sum[10m])/rate(http_request_time_hist_milliseconds_count[10m])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Average HTTP Request Round Trip Time [10min]", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 2000 + }, + { + "color": "red", + "value": 5000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(ws_request_time_hist_milliseconds_sum[10m])/rate(ws_request_time_hist_milliseconds_count[10m])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Average WS Request Round Trip Time [10min]", + "type": "gauge" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [], + "templating": 
{ + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Gno Node Metrics", + "uid": "bdl7d5yogxjb4b", + "version": 6, + "weekStart": "" +} \ No newline at end of file diff --git a/tm2/pkg/telemetry/docker/prometheus/prometheus.yml b/tm2/pkg/telemetry/docker/prometheus/prometheus.yml new file mode 100644 index 00000000000..bf95bc2f5d4 --- /dev/null +++ b/tm2/pkg/telemetry/docker/prometheus/prometheus.yml @@ -0,0 +1,7 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'opentelemetry' + static_configs: + - targets: [ 'collector:8090' ] diff --git a/tm2/pkg/telemetry/metrics/metrics.go b/tm2/pkg/telemetry/metrics/metrics.go index a9dac6a83c2..2eecdfdffc6 100644 --- a/tm2/pkg/telemetry/metrics/metrics.go +++ b/tm2/pkg/telemetry/metrics/metrics.go @@ -48,9 +48,6 @@ var ( // BroadcastTxTimer measures the transaction broadcast duration BroadcastTxTimer metric.Int64Histogram - // BuildBlockTimer measures the block build duration - BuildBlockTimer metric.Int64Histogram - // Networking // // InboundPeers measures the active number of inbound peers @@ -89,6 +86,9 @@ var ( // Consensus // + // BuildBlockTimer measures the block build duration + BuildBlockTimer metric.Int64Histogram + // ValidatorsCount measures the size of the active validator set ValidatorsCount *Int64Gauge