diff --git a/docs/deployment/observability.md b/docs/deployment/observability.md index 344f9d68c..8f5b3d6bb 100644 --- a/docs/deployment/observability.md +++ b/docs/deployment/observability.md @@ -44,13 +44,13 @@ Envoy Control Runner exposes a set of metrics on standard Spring Actuator's `/ac Metric | Description | Labels ----------------------|----------------------------------------------------|------------------------------------ - **grpc.connections** | Number of running gRPC connections of a given type | type (cds/xds/lds/rds/sds/unknown) + **connections** | Number of running gRPC connections of a given type | stream-type (cds/xds/lds/rds/sds/unknown), connection-type (grpc) #### xDS requests Metric | Description | Labels -------------------------|---------------------------------------------------|-------------------------------------------------------------- - **grpc.requests.count** | Counter of received gRPC requests of a given type | type (cds/xds/lds/rds/sds/unknown), metric-type(total/delta) + **requests.total** | Counter of received gRPC requests of a given type | stream-type (cds/xds/lds/rds/sds/unknown), connection-type (grpc), discovery-request-type (total/delta) #### Snapshot @@ -62,4 +62,4 @@ Envoy Control Runner exposes a set of metrics on standard Spring Actuator's `/ac Metric | Description | Labels -------------------------------------------|----------------------------------------------------------------|---------------------------------------------- - **cross-dc-synchronization.errors.total** | Counter of synchronization errors for a given DC and operation | cluster, operation (get-instances/get-state) + **cross.dc.synchronization.errors.total** | Counter of synchronization errors for a given DC and operation | cluster, operation (get-instances/get-state) diff --git a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt 
b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt index 6afc62fe1..92177fd31 100644 --- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt +++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt @@ -36,9 +36,8 @@ class MetricsDiscoveryServerCallbacks(private val meterRegistry: MeterRegistry) .map { type -> type to AtomicInteger(0) } .toMap() - meterRegistry.gauge("grpc.connections", Tags.of("type", "all"), connections) connectionsByType.forEach { (type, typeConnections) -> - meterRegistry.gauge("grpc.connections", Tags.of("type", type.name.lowercase()), typeConnections) + meterRegistry.gauge("connections", Tags.of("connection-type", "grpc", "stream-type", type.name.lowercase()), typeConnections) } } @@ -54,8 +53,8 @@ class MetricsDiscoveryServerCallbacks(private val meterRegistry: MeterRegistry) override fun onV3StreamRequest(streamId: Long, request: V3DiscoveryRequest) { meterRegistry.counter( - "grpc.requests.count", - Tags.of("type", StreamType.fromTypeUrl(request.typeUrl).name.lowercase(), "metric-type", "total") + "requests.total", + Tags.of("connection-type", "grpc", "stream-type", StreamType.fromTypeUrl(request.typeUrl).name.lowercase(), "discovery-request-type", "total") ) .increment() } @@ -65,8 +64,8 @@ class MetricsDiscoveryServerCallbacks(private val meterRegistry: MeterRegistry) request: V3DeltaDiscoveryRequest ) { meterRegistry.counter( - "grpc.requests.count", - Tags.of("type", StreamType.fromTypeUrl(request.typeUrl).name.lowercase(), "metric-type", "delta") + "requests.total", + Tags.of("connection-type", "grpc", "stream-type", StreamType.fromTypeUrl(request.typeUrl).name.lowercase(), "discovery-request-type", "delta") ) .increment() } diff --git 
a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/EnvoySnapshotFactory.kt b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/EnvoySnapshotFactory.kt index 4f1e8900c..bcbb61357 100644 --- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/EnvoySnapshotFactory.kt +++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/EnvoySnapshotFactory.kt @@ -70,7 +70,7 @@ class EnvoySnapshotFactory( ) sample.stop( meterRegistry.timer( - "snapshot-factory.seconds", + "snapshot.factory.seconds", Tags.of("operation", "new-snapshot", "type", "global") ) ) @@ -163,7 +163,7 @@ class EnvoySnapshotFactory( val newSnapshotForGroup = newSnapshotForGroup(group, globalSnapshot) groupSample.stop( meterRegistry.timer( - "snapshot-factory.seconds", + "snapshot.factory.seconds", Tags.of("operation", "new-snapshot", "type", "group") ) ) diff --git a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/SnapshotUpdater.kt b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/SnapshotUpdater.kt index 08020b56e..e9155f298 100644 --- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/SnapshotUpdater.kt +++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/SnapshotUpdater.kt @@ -91,7 +91,7 @@ class SnapshotUpdater( // see GroupChangeWatcher return onGroupAdded .publishOn(globalSnapshotScheduler) - .measureBuffer("snapshot-updater.count.total", meterRegistry) + .measureBuffer("snapshot.updater.count.total", meterRegistry) .checkpoint("snapshot-updater-groups-published") .name("snapshot-updater.count.total") .tag("type", "groups") @@ -101,7 +101,7 @@ class SnapshotUpdater( } .onErrorResume { e -> meterRegistry.counter( - "snapshot-updater.errors.total", + "snapshot.updater.errors.total", Tags.of("type", "groups") ) .increment() 
@@ -112,16 +112,16 @@ class SnapshotUpdater( internal fun services(states: Flux): Flux { return states - .name("snapshot-updater.count.total") + .name("snapshot.updater.count.total") .tag("type", "services") .tag("status", "sampled") .metrics() - .onBackpressureLatestMeasured("snapshot-updater.count.total", meterRegistry) + .onBackpressureLatestMeasured("snapshot.updater.count.total", meterRegistry) // prefetch = 1, instead of default 256, to avoid processing stale states in case of backpressure .publishOn(globalSnapshotScheduler, 1) - .measureBuffer("snapshot-updater.count.total", meterRegistry) // todo + .measureBuffer("snapshot.updater.count.total", meterRegistry) .checkpoint("snapshot-updater-services-published") - .name("snapshot-updater.count.total") + .name("snapshot.updater.count.total") .tag("type", "services") .tag("status", "published") .metrics() @@ -152,7 +152,7 @@ class SnapshotUpdater( .filter { it != emptyUpdateResult } .onErrorResume { e -> meterRegistry.counter( - "snapshot-updater.errors.total", + "snapshot.updater.errors.total", Tags.of("type", "services") ).increment() logger.error("Unable to process service changes", e) @@ -176,14 +176,14 @@ class SnapshotUpdater( } } catch (e: Throwable) { meterRegistry.counter( - "snapshot-updater.errors.total", Tags.of("service", group.serviceName) + "snapshot.updater.errors.total", Tags.of("service", group.serviceName) ).increment() logger.error("Unable to create snapshot for group ${group.serviceName}", e) } } private val updateSnapshotForGroupsTimer = - meterRegistry.timer("snapshot-updater.duration.seconds", Tags.of("type", "groups")) + meterRegistry.timer("snapshot.updater.duration.seconds", Tags.of("type", "groups")) private fun updateSnapshotForGroups( groups: Collection, @@ -198,7 +198,7 @@ class SnapshotUpdater( } else if (result.xdsSnapshot != null && group.communicationMode == XDS) { updateSnapshotForGroup(group, result.xdsSnapshot) } else { - 
meterRegistry.counter("snapshot-updater.errors.total", Tags.of("type", "communication-mode")) + meterRegistry.counter("snapshot.updater.errors.total", Tags.of("type", "communication-mode")) .increment() logger.error( "Requested snapshot for ${group.communicationMode.name} mode, but it is not here. " + diff --git a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteClusterStateChanges.kt b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteClusterStateChanges.kt index 6ef67a998..f177f718c 100644 --- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteClusterStateChanges.kt +++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteClusterStateChanges.kt @@ -14,5 +14,5 @@ class RemoteClusterStateChanges( .getChanges(properties.sync.pollingInterval) .startWith(MultiClusterState.empty()) .distinctUntilChanged() - .name("cross-dc-changes-distinct").metrics() + .name("cross.dc.synchronization.distinct").metrics() } diff --git a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt index c2aebe6f9..6a3492a3e 100644 --- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt +++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt @@ -30,14 +30,14 @@ class RemoteServices( fun getChanges(interval: Long): Flux { val aclFlux: Flux = Flux.create({ sink -> scheduler.scheduleWithFixedDelay({ - meterRegistry.timer("cross-dc-synchronization.seconds", Tags.of("operation", "get-multi-cluster-state")) + meterRegistry.timer("cross.dc.synchronization.seconds", Tags.of("operation", "get-multi-cluster-state")) .recordCallable 
{ getChanges(sink::next, interval) } }, 0, interval, TimeUnit.SECONDS) }, FluxSink.OverflowStrategy.LATEST) return aclFlux.doOnCancel { - meterRegistry.counter("cross-dc-synchronization.cancelled").increment() + meterRegistry.counter("cross.dc.synchronization.cancelled").increment() logger.warn("Cancelling cross dc sync") } } @@ -62,7 +62,7 @@ class RemoteServices( .orTimeout(interval, TimeUnit.SECONDS) .exceptionally { meterRegistry.counter( - "cross-dc-synchronization.errors.total", + "cross.dc.synchronization.errors.total", Tags.of("cluster", cluster, "operation", "get-state") ).increment() logger.warn("Error synchronizing instances ${it.message}", it) @@ -76,7 +76,7 @@ class RemoteServices( cluster to instances } catch (e: Exception) { meterRegistry.counter( - "cross-dc-synchronization.errors.total", + "cross.dc.synchronization.errors.total", Tags.of("cluster", cluster, "operation", "get-instances") ).increment() logger.warn("Failed fetching instances from $cluster", e) @@ -89,7 +89,7 @@ class RemoteServices( state: ServicesState ): ClusterState { meterRegistry.counter( - "cross-dc-synchronization.total", Tags.of("cluster", cluster) + "cross.dc.synchronization.total", Tags.of("cluster", cluster) ) .increment() val clusterState = ClusterState( diff --git a/envoy-control-core/src/test/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/SnapshotUpdaterTest.kt b/envoy-control-core/src/test/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/SnapshotUpdaterTest.kt index 37cc0ac7d..1d3fd3935 100644 --- a/envoy-control-core/src/test/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/SnapshotUpdaterTest.kt +++ b/envoy-control-core/src/test/kotlin/pl/allegro/tech/servicemesh/envoycontrol/snapshot/SnapshotUpdaterTest.kt @@ -469,7 +469,7 @@ class SnapshotUpdaterTest { val snapshot = cache.getSnapshot(servicesGroup) assertThat(snapshot).isEqualTo(null) assertThat( - simpleMeterRegistry.find("snapshot-updater.errors.total") + 
simpleMeterRegistry.find("snapshot.updater.errors.total") .tags(Tags.of("service", "example-service")) .counter()?.count() ).isEqualTo(1.0) diff --git a/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RestTemplateControlPlaneClient.kt b/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RestTemplateControlPlaneClient.kt index 881127cbf..73f6e3d0f 100644 --- a/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RestTemplateControlPlaneClient.kt +++ b/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RestTemplateControlPlaneClient.kt @@ -33,17 +33,17 @@ class RestTemplateControlPlaneClient( } private fun timed(function: () -> T): T { - return meterRegistry.timer("cross-dc-synchronization.seconds", Tags.of("operation", "get-state")) + return meterRegistry.timer("cross.dc.synchronization.seconds", Tags.of("operation", "get-state")) .record(function) } private fun success() { - meterRegistry.counter("cross-dc-synchronization", Tags.of("operation", "get-state", "status", "success")) + meterRegistry.counter("cross.dc.synchronization", Tags.of("operation", "get-state", "status", "success")) .increment() } private fun failure() { - meterRegistry.counter("cross-dc-synchronization", Tags.of("operation", "get-state", "status", "failure")) + meterRegistry.counter("cross.dc.synchronization", Tags.of("operation", "get-state", "status", "failure")) .increment() } } diff --git a/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt b/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt index b03524b6c..ea54ec40f 100644 --- a/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt +++ 
b/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt @@ -4,7 +4,6 @@ import io.micrometer.core.instrument.Tags import org.assertj.core.api.Assertions.assertThat import org.junit.jupiter.api.Test import org.junit.jupiter.api.extension.RegisterExtension -import org.junit.platform.commons.util.Preconditions.condition import pl.allegro.tech.servicemesh.envoycontrol.assertions.untilAsserted import pl.allegro.tech.servicemesh.envoycontrol.config.Ads import pl.allegro.tech.servicemesh.envoycontrol.config.DeltaAds @@ -233,14 +232,15 @@ interface MetricsDiscoveryServerCallbacksTest { // expect untilAsserted { expectedGrpcConnectionsGaugeValues().forEach { (type, value) -> - val metric = "grpc.connections" + val metric = "connections" assertThat( meterRegistry.find(metric) - .tags(Tags.of("type", type.name.lowercase())).gauge() + .tags(Tags.of("stream-type", type.name.lowercase(), "connection-type", "grpc")).gauge() ).isNotNull assertThat( meterRegistry.get(metric) - .tags(Tags.of("type", type.name.lowercase())).gauge().value().toInt() + .tags(Tags.of("stream-type", type.name.lowercase(), "connection-type", "grpc")).gauge().value() + .toInt() ).isEqualTo(value) } } @@ -259,10 +259,10 @@ interface MetricsDiscoveryServerCallbacksTest { } } - private fun assertCondition(type: String, condition: Predicate, metricType: String) { + private fun assertCondition(type: String, condition: Predicate, reqTpe: String) { val counterValue = - envoyControl().app.meterRegistry().find("grpc.requests.count") - .tags(Tags.of("type", type, "metric-type", metricType)) + envoyControl().app.meterRegistry().find("requests.total") + .tags(Tags.of("stream-type", type, "discovery-request-type", reqTpe, "connection-type", "grpc")) .counter()?.count()?.toInt() logger.info("$type $counterValue") assertThat(counterValue).satisfies(Consumer { condition.test(it) })