From 2d6bd1254237d50443d4e448ed24c0fcce6bf5b8 Mon Sep 17 00:00:00 2001 From: Caleb Lloyd Date: Thu, 7 Mar 2024 17:26:05 -0500 Subject: [PATCH 1/3] remove high cardinality labels server_route_id and server_gateway_id Signed-off-by: Caleb Lloyd --- README.md | 131 +----------------- .../dashboards/clusters-dashboard.json | 10 +- .../dashboards/nats-surveyor-dashboard.json | 10 +- surveyor/collector_statz.go | 8 +- surveyor/surveyor_test.go | 7 +- 5 files changed, 18 insertions(+), 148 deletions(-) diff --git a/README.md b/README.md index 3d49d40..0125d8b 100644 --- a/README.md +++ b/README.md @@ -101,9 +101,8 @@ NATS_SURVEYOR_LOG_LEVEL=debug Scrape output is the in form of nats_core_NNNN_metric, where NNN is `server`, `route`, or `gateway`. -To aid filtering, each metric has labels. These include `nats_server_cluster`, -`nats_server_host`, `nats_server_id`. Routes have additional flags, `nats_server_route_id` -and gatways have `nats_server_gateway_id` and `nats_server_gateway_name`. +To aid filtering, each metric has labels. These include `server_cluster`, `server_name`, and `server_id`. +Routes have the additional label `server_route_name` and gateways have the additional label `server_gateway_name`. The info metrics has a nats_server_version label with the current version. @@ -112,132 +111,6 @@ and no additional NATS metrics when there is no connectivity to the NATS system. allows users to differentiate between a problem with the exporter itself connectivity with the NATS system. -### Scrape Output - -``` -nats_core_active_account_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 2 -nats_core_active_account_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 2 -nats_core_active_account_count{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 2 -# HELP nats_core_connection_count Current number of client connections gauge -# TYPE nats_core_connection_count gauge -nats_core_connection_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 0 -nats_core_connection_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 1 -nats_core_connection_count{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 0 -# HELP nats_core_core_count Machine cores gauge -# TYPE nats_core_core_count gauge -nats_core_core_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 8 -nats_core_core_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 8 -nats_core_core_count{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 8 -# HELP nats_core_cpu_percentage Server cpu utilization gauge -# TYPE nats_core_cpu_percentage gauge -nats_core_cpu_percentage{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 0 -nats_core_cpu_percentage{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 0 -nats_core_cpu_percentage{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 0 -# HELP nats_core_gateway_inbound_msg_count Number inbound messages through the gateway gauge -# TYPE nats_core_gateway_inbound_msg_count gauge -nats_core_gateway_inbound_msg_count{nats_server_cluster="region1",nats_server_gateway_id="7",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 0 -nats_core_gateway_inbound_msg_count{nats_server_cluster="region1",nats_server_gateway_id="9",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 1 -nats_core_gateway_inbound_msg_count{nats_server_cluster="region2",nats_server_gateway_id="4",nats_server_gateway_name="region1",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 2 -# HELP nats_core_gateway_recv_bytes Number of messages sent over the gateway gauge -# TYPE nats_core_gateway_recv_bytes gauge -nats_core_gateway_recv_bytes{nats_server_cluster="region1",nats_server_gateway_id="7",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 0 -nats_core_gateway_recv_bytes{nats_server_cluster="region1",nats_server_gateway_id="9",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 852 -nats_core_gateway_recv_bytes{nats_server_cluster="region2",nats_server_gateway_id="4",nats_server_gateway_name="region1",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 4005 -# HELP nats_core_gateway_recv_msg_count Number of messages sent over the gateway gauge -# TYPE nats_core_gateway_recv_msg_count gauge -nats_core_gateway_recv_msg_count{nats_server_cluster="region1",nats_server_gateway_id="7",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 0 -nats_core_gateway_recv_msg_count{nats_server_cluster="region1",nats_server_gateway_id="9",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 1 -nats_core_gateway_recv_msg_count{nats_server_cluster="region2",nats_server_gateway_id="4",nats_server_gateway_name="region1",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 5 -# HELP nats_core_gateway_sent_bytes Number of messages sent over the gateway gauge -# TYPE nats_core_gateway_sent_bytes gauge -nats_core_gateway_sent_bytes{nats_server_cluster="region1",nats_server_gateway_id="7",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 1719 -nats_core_gateway_sent_bytes{nats_server_cluster="region1",nats_server_gateway_id="9",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 2286 -nats_core_gateway_sent_bytes{nats_server_cluster="region2",nats_server_gateway_id="4",nats_server_gateway_name="region1",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 852 -# HELP nats_core_gateway_sent_msgs Number of messages sent over the gateway gauge -# TYPE nats_core_gateway_sent_msgs gauge -nats_core_gateway_sent_msgs{nats_server_cluster="region1",nats_server_gateway_id="7",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 2 -nats_core_gateway_sent_msgs{nats_server_cluster="region1",nats_server_gateway_id="9",nats_server_gateway_name="region2",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 3 -nats_core_gateway_sent_msgs{nats_server_cluster="region2",nats_server_gateway_id="4",nats_server_gateway_name="region1",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 1 -# HELP nats_core_info General Server information Summary gauge -# TYPE nats_core_info gauge -nats_core_info{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW",nats_server_version="2.0.2"} 1 -nats_core_info{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF",nats_server_version="2.0.2"} 1 -nats_core_info{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A",nats_server_version="2.0.2"} 1 -# HELP nats_core_mem_bytes Server memory gauge -# TYPE nats_core_mem_bytes gauge -nats_core_mem_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 1.2685312e+07 -nats_core_mem_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 1.2992512e+07 -nats_core_mem_bytes{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 1.1309056e+07 -# HELP nats_core_nats_up 1 if connected to NATS, 0 otherwise. A gauge. -# TYPE nats_core_nats_up gauge -nats_core_nats_up 1 -# HELP nats_core_recv_bytes Number of messages received gauge -# TYPE nats_core_recv_bytes gauge -nats_core_recv_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 0 -nats_core_recv_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 6528 -nats_core_recv_bytes{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 4005 -# HELP nats_core_recv_msgs_count Number of messages received gauge -# TYPE nats_core_recv_msgs_count gauge -nats_core_recv_msgs_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 7 -nats_core_recv_msgs_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 15 -nats_core_recv_msgs_count{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 5 -# HELP nats_core_route_pending_bytes Number of bytes pending in the route gauge -# TYPE nats_core_route_pending_bytes gauge -nats_core_route_pending_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW",nats_server_route_id="4"} 0 -nats_core_route_pending_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF",nats_server_route_id="4"} 0 -# HELP nats_core_route_recv_bytes Number of bytes received over the route gauge -# TYPE nats_core_route_recv_bytes gauge -nats_core_route_recv_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW",nats_server_route_id="4"} 0 -nats_core_route_recv_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF",nats_server_route_id="4"} 5676 -# HELP nats_core_route_recv_msg_count Number of messages received over the route gauge -# TYPE nats_core_route_recv_msg_count gauge -nats_core_route_recv_msg_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW",nats_server_route_id="4"} 7 -nats_core_route_recv_msg_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF",nats_server_route_id="4"} 7 -# HELP nats_core_route_sent_bytes Number of bytes sent over the route gauge -# TYPE nats_core_route_sent_bytes gauge -nats_core_route_sent_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW",nats_server_route_id="4"} 5676 -nats_core_route_sent_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF",nats_server_route_id="4"} 0 -# HELP nats_core_route_sent_msg_count Number of messages sent over the route gauge -# TYPE nats_core_route_sent_msg_count gauge -nats_core_route_sent_msg_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW",nats_server_route_id="4"} 7 -nats_core_route_sent_msg_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF",nats_server_route_id="4"} 7 -# HELP nats_core_rtt_nanoseconds RTT in nanoseconds gauge -# TYPE nats_core_rtt_nanoseconds gauge -nats_core_rtt_nanoseconds{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 1.8008293e+07 -nats_core_rtt_nanoseconds{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 1.3031788e+07 -nats_core_rtt_nanoseconds{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 1.7976382e+07 -# HELP nats_core_sent_bytes Number of messages sent gauge -# TYPE nats_core_sent_bytes gauge -nats_core_sent_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 7395 -nats_core_sent_bytes{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 13661 -nats_core_sent_bytes{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 852 -# HELP nats_core_sent_msgs_count Number of messages sent gauge -# TYPE nats_core_sent_msgs_count gauge -nats_core_sent_msgs_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 17 -nats_core_sent_msgs_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 32 -nats_core_sent_msgs_count{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 2 -# HELP nats_core_slow_consumer_count Number of slow consumers gauge -# TYPE nats_core_slow_consumer_count gauge -nats_core_slow_consumer_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 0 -nats_core_slow_consumer_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 0 -nats_core_slow_consumer_count{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 0 -# HELP nats_core_start_time Server start time gauge -# TYPE nats_core_start_time gauge -nats_core_start_time{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 1.571110522019796e+18 -nats_core_start_time{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 1.571110522019795e+18 -nats_core_start_time{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 1.571110952301371e+18 -# HELP nats_core_subs_count Current number of subscriptions gauge -# TYPE nats_core_subs_count gauge -nats_core_subs_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 17 -nats_core_subs_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 17 -nats_core_subs_count{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 8 -# HELP nats_core_total_connection_count Total number of client connections serviced gauge -# TYPE nats_core_total_connection_count gauge -nats_core_total_connection_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDGERVW3RX7A6RAJQ34E7HPBFUD35322XRZJNTOMTFI7MHAXL2PS3OVW"} 2 -nats_core_total_connection_count{nats_server_cluster="region1",nats_server_host="localhost",nats_server_id="NDYW2PLO6QVP2VKKUMWGWJXBMPTZKB3UAYME26BTKOGLNN55NSEK3RQF"} 5 -nats_core_total_connection_count{nats_server_cluster="region2",nats_server_host="localhost",nats_server_id="NCBI75V5ASPJAEAR3VPS2YELXP7K6CUXXWAD5PB2SJ4BOIYQHU6JKV7A"} 0 -``` - ## Docker Compose An easy way to start the NATS Surveyor stack (Grafana, Prometheus, and NATS diff --git a/docker-compose/grafana/provisioning/dashboards/clusters-dashboard.json b/docker-compose/grafana/provisioning/dashboards/clusters-dashboard.json index ead0cc9..3f3ec14 100644 --- a/docker-compose/grafana/provisioning/dashboards/clusters-dashboard.json +++ b/docker-compose/grafana/provisioning/dashboards/clusters-dashboard.json @@ -428,7 +428,7 @@ { "expr": "rate(nats_core_route_sent_msg_count{server_cluster=~\"$cluster\"}[1m])", "interval": "", - "legendFormat": "{{server_name}} - ID {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], @@ -527,7 +527,7 @@ { "expr": "rate(nats_core_route_recv_msg_count{server_cluster=~\"$cluster\"}[1m])", "interval": "", - "legendFormat": "{{server_name}} - ID {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], @@ -641,7 +641,7 @@ { "expr": "rate(nats_core_route_sent_bytes{server_cluster=~\"$cluster\"}[1m])", "interval": "", - "legendFormat": "{{server_name}} - ID {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], @@ -739,7 +739,7 @@ "targets": [ { "expr": "rate(nats_core_route_recv_bytes{server_cluster=~\"$cluster\"}[1m])", - "legendFormat": "{{server_name}} - ID {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], @@ -852,7 +852,7 @@ "targets": [ { "expr": "nats_core_route_pending_bytes{server_cluster=~\"$cluster\"}", - "legendFormat": "{{server_name}} - {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], diff --git a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json index 9da482b..13d75ff 100644 --- a/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json +++ b/docker-compose/grafana/provisioning/dashboards/nats-surveyor-dashboard.json @@ -3091,7 +3091,7 @@ "targets": [ { "expr": "rate(nats_core_route_sent_bytes{server_cluster=~\"$cluster\"}[1m])", - "legendFormat": "{{server_name}} - ID {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], @@ -3178,7 +3178,7 @@ "targets": [ { "expr": "rate(nats_core_route_recv_bytes{server_cluster=~\"$cluster\"}[1m])", - "legendFormat": "{{server_name}} - ID {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], @@ -3265,7 +3265,7 @@ "targets": [ { "expr": "rate(nats_core_route_sent_msg_count{server_cluster=~\"$cluster\"}[1m])", - "legendFormat": "{{server_name}} - ID {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], @@ -3352,7 +3352,7 @@ "targets": [ { "expr": "rate(nats_core_route_recv_msg_count{server_cluster=~\"$cluster\"}[1m])", - "legendFormat": "{{server_name}} - ID {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], @@ -3526,7 +3526,7 @@ "targets": [ { "expr": "nats_core_route_pending_bytes{server_cluster=~\"$cluster\"}", - "legendFormat": "{{server_name}} - {{server_route_id}}", + "legendFormat": "Server {{server_name}} - Route {{server_route_name}}", "refId": "A" } ], diff --git a/surveyor/collector_statz.go b/surveyor/collector_statz.go index 3c14b92..c73bb9c 100644 --- a/surveyor/collector_statz.go +++ b/surveyor/collector_statz.go @@ -221,11 +221,11 @@ func (sc *StatzCollector) serverInfoLabelValues(sm *server.ServerStatsMsg) []str } func (sc *StatzCollector) routeLabelValues(sm *server.ServerStatsMsg, rStat *server.RouteStat) []string { - return []string{sm.Server.Cluster, serverName(sm), sm.Server.ID, strconv.FormatUint(rStat.ID, 10)} + return []string{sm.Server.Cluster, serverName(sm), sm.Server.ID, rStat.Name} } func (sc *StatzCollector) gatewayLabelValues(sm *server.ServerStatsMsg, gStat *server.GatewayStat) []string { - return []string{sm.Server.Cluster, serverName(sm), sm.Server.ID, gStat.Name, strconv.FormatUint(gStat.ID, 10)} + return []string{sm.Server.Cluster, serverName(sm), sm.Server.ID, gStat.Name} } // Up/Down on servers - look at discovery mechanisms in Prometheus - aging out, how does it work? @@ -387,8 +387,8 @@ func NewStatzCollector(nc *nats.Conn, logger *logrus.Logger, numServers int, ser // TODO - normalize these if possible. Jetstream varies from the other server labels serverLabels: []string{"server_cluster", "server_name", "server_id"}, serverInfoLabels: []string{"server_cluster", "server_name", "server_id", "server_version"}, - routeLabels: []string{"server_cluster", "server_name", "server_id", "server_route_id"}, - gatewayLabels: []string{"server_cluster", "server_name", "server_id", "server_gateway_name", "server_gateway_id"}, + routeLabels: []string{"server_cluster", "server_name", "server_id", "server_route_name"}, + gatewayLabels: []string{"server_cluster", "server_name", "server_id", "server_gateway_name"}, jsServerLabels: []string{"server_id", "server_name", "cluster_name"}, jsServerInfoLabels: []string{"server_name", "server_host", "server_id", "server_cluster", "server_domain", "server_version", "server_jetstream"}, constLabels: constLabels, diff --git a/surveyor/surveyor_test.go b/surveyor/surveyor_test.go index 34d1eac..aebe57d 100644 --- a/surveyor/surveyor_test.go +++ b/surveyor/surveyor_test.go @@ -150,11 +150,8 @@ func TestSurveyor_Basic(t *testing.T) { if !strings.Contains(output, "server_gateway_name") { t.Fatalf("invalid output, missing 'server_gateway_name': %v\n", output) } - if !strings.Contains(output, "server_gateway_id") { - t.Fatalf("invalid output, missing 'server_gateway_id': %v\n", output) - } - if !strings.Contains(output, "server_route_id") { - t.Fatalf("invalid output, missing 'server_route_id': %v\n", output) + if !strings.Contains(output, "server_route_name") { + t.Fatalf("invalid output, missing 'server_route_name': %v\n", output) } if !strings.Contains(output, "nats_survey_surveyed_count 3") { t.Fatalf("invalid output, missing 'nats_survey_surveyed_count 3': %v\n", output) From c32e1d6bf5f072723098d3a518ae40f9fd106254 Mon Sep 17 00:00:00 2001 From: Caleb Lloyd Date: Thu, 7 Mar 2024 18:43:21 -0500 Subject: [PATCH 2/3] remap route/gateway IDs to indexes Signed-off-by: Caleb Lloyd --- surveyor/collector_statz.go | 92 ++++++++++++++++++++++++++++++-- surveyor/collector_statz_test.go | 49 +++++++++++++++++ surveyor/surveyor_test.go | 6 +++ 3 files changed, 143 insertions(+), 4 deletions(-) create mode 100644 surveyor/collector_statz_test.go diff --git a/surveyor/collector_statz.go b/surveyor/collector_statz.go index c73bb9c..91db7e0 100644 --- a/surveyor/collector_statz.go +++ b/surveyor/collector_statz.go @@ -18,6 +18,7 @@ import ( "encoding/json" "fmt" "io" + "slices" "sort" "strconv" "strings" @@ -29,6 +30,7 @@ import ( "github.com/nats-io/nats.go" "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" + "golang.org/x/exp/maps" "golang.org/x/sync/singleflight" ) @@ -139,6 +141,8 @@ type StatzCollector struct { descs statzDescs collectAccounts bool natsUp *prometheus.Desc + routeIDRemap map[string]map[uint64]int + gatewayIDRemap map[string]map[uint64]int serverLabels []string serverInfoLabels []string @@ -221,11 +225,25 @@ func (sc *StatzCollector) serverInfoLabelValues(sm *server.ServerStatsMsg) []str } func (sc *StatzCollector) routeLabelValues(sm *server.ServerStatsMsg, rStat *server.RouteStat) []string { - return []string{sm.Server.Cluster, serverName(sm), sm.Server.ID, rStat.Name} + idxS := strconv.FormatUint(rStat.ID, 10) + if byName, ok := sc.routeIDRemap[rStat.Name]; ok { + if idx, ok := byName[rStat.ID]; ok { + idxS = strconv.Itoa(idx) + } + } + + return []string{sm.Server.Cluster, serverName(sm), sm.Server.ID, rStat.Name, idxS} } func (sc *StatzCollector) gatewayLabelValues(sm *server.ServerStatsMsg, gStat *server.GatewayStat) []string { - return []string{sm.Server.Cluster, serverName(sm), sm.Server.ID, gStat.Name} + idxS := strconv.FormatUint(gStat.ID, 10) + if byName, ok := sc.gatewayIDRemap[gStat.Name]; ok { + if idx, ok := byName[gStat.ID]; ok { + idxS = strconv.Itoa(idx) + } + } + + return []string{sm.Server.Cluster, serverName(sm), sm.Server.ID, gStat.Name, idxS} } // Up/Down on servers - look at discovery mechanisms in Prometheus - aging out, how does it work? @@ -383,12 +401,14 @@ func NewStatzCollector(nc *nats.Conn, logger *logrus.Logger, numServers int, ser servers: make(map[string]bool), doneCh: make(chan struct{}, 1), collectAccounts: accounts, + routeIDRemap: make(map[string]map[uint64]int), + gatewayIDRemap: make(map[string]map[uint64]int), // TODO - normalize these if possible. Jetstream varies from the other server labels serverLabels: []string{"server_cluster", "server_name", "server_id"}, serverInfoLabels: []string{"server_cluster", "server_name", "server_id", "server_version"}, - routeLabels: []string{"server_cluster", "server_name", "server_id", "server_route_name"}, - gatewayLabels: []string{"server_cluster", "server_name", "server_id", "server_gateway_name"}, + routeLabels: []string{"server_cluster", "server_name", "server_id", "server_route_name", "server_route_name_idx"}, + gatewayLabels: []string{"server_cluster", "server_name", "server_id", "server_gateway_name", "server_gateway_name_idx"}, jsServerLabels: []string{"server_id", "server_name", "cluster_name"}, jsServerInfoLabels: []string{"server_name", "server_host", "server_id", "server_cluster", "server_domain", "server_version", "server_jetstream"}, constLabels: constLabels, @@ -1013,6 +1033,14 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { } } } + + pairs := make([]nameIDPair, len(sm.Stats.Routes)) + for i, rs := range sm.Stats.Routes { + pairs[i].id = rs.ID + pairs[i].name = rs.Name + } + sc.routeIDRemap = remapIdToIdx(pairs, sc.routeIDRemap) + for _, rs := range sm.Stats.Routes { labels = sc.routeLabelValues(sm, rs) metrics.newGaugeMetric(sc.descs.RouteSentMsgs, float64(rs.Sent.Msgs), labels) @@ -1022,6 +1050,13 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { metrics.newGaugeMetric(sc.descs.RoutePending, float64(rs.Pending), labels) } + pairs = make([]nameIDPair, len(sm.Stats.Gateways)) + for i, rs := range sm.Stats.Gateways { + pairs[i].id = rs.ID + pairs[i].name = rs.Name + } + sc.gatewayIDRemap = remapIdToIdx(pairs, sc.gatewayIDRemap) + for _, gw := range sm.Stats.Gateways { labels = sc.gatewayLabelValues(sm, gw) metrics.newGaugeMetric(sc.descs.GatewaySentMsgs, float64(gw.Sent.Msgs), labels) @@ -1184,3 +1219,52 @@ func unmarshalMsg(msg *nats.Msg, v any) error { return json.Unmarshal(data, v) } + +type nameIDPair struct { + name string + id uint64 +} + +func remapIdToIdx(pairs []nameIDPair, existingMapping map[string]map[uint64]int) map[string]map[uint64]int { + newMapping := make(map[string]map[uint64]int) + + // give existing the same idx + for _, rs := range pairs { + newByName, ok := newMapping[rs.name] + if !ok { + newByName = make(map[uint64]int) + newMapping[rs.name] = newByName + } + + existingByName, ok := existingMapping[rs.name] + if !ok { + continue + } + + idx, ok := existingByName[rs.id] + if !ok { + continue + } + + newByName[rs.id] = idx + } + + // assign new ones new idx + for _, path := range pairs { + newByName := newMapping[path.name] + _, ok := newByName[path.id] + if ok { + continue + } + + vals := maps.Values(newByName) + for i := 0; i <= len(vals); i++ { + if !slices.Contains(vals, i) { + newByName[path.id] = i + break + } + } + } + + return newMapping +} diff --git a/surveyor/collector_statz_test.go b/surveyor/collector_statz_test.go new file mode 100644 index 0000000..b3bfef8 --- /dev/null +++ b/surveyor/collector_statz_test.go @@ -0,0 +1,49 @@ +package surveyor + +import ( + "reflect" + "testing" +) + +func TestRemapIdToIdx(t *testing.T) { + existingMapping := map[string]map[uint64]int{ + "a": { + 100: 0, + 200: 2, + }, + "b": { + 100: 0, + }, + } + + pairs := []nameIDPair{ + {name: "a", id: 200}, + {name: "a", id: 100}, + {name: "a", id: 300}, + {name: "a", id: 400}, + {name: "b", id: 200}, + {name: "c", id: 200}, + {name: "c", id: 100}, + } + + newMapping := remapIdToIdx(pairs, existingMapping) + expected := map[string]map[uint64]int{ + "a": { + 100: 0, + 200: 2, + 300: 1, + 400: 3, + }, + "b": { + 200: 0, + }, + "c": { + 200: 0, + 100: 1, + }, + } + + if !reflect.DeepEqual(expected, newMapping) { + t.Fatalf("Invalid mapping config; want: %v; got: %v", expected, newMapping) + } +} diff --git a/surveyor/surveyor_test.go b/surveyor/surveyor_test.go index aebe57d..5cfdcba 100644 --- a/surveyor/surveyor_test.go +++ b/surveyor/surveyor_test.go @@ -150,9 +150,15 @@ func TestSurveyor_Basic(t *testing.T) { if !strings.Contains(output, "server_gateway_name") { t.Fatalf("invalid output, missing 'server_gateway_name': %v\n", output) } + if !strings.Contains(output, "server_gateway_name_idx") { + t.Fatalf("invalid output, missing 'server_gateway_name_idx': %v\n", output) + } if !strings.Contains(output, "server_route_name") { t.Fatalf("invalid output, missing 'server_route_name': %v\n", output) } + if !strings.Contains(output, "server_route_name_idx") { + t.Fatalf("invalid output, missing 'server_route_name_idx': %v\n", output) + } if !strings.Contains(output, "nats_survey_surveyed_count 3") { t.Fatalf("invalid output, missing 'nats_survey_surveyed_count 3': %v\n", output) } From b9672ad1370dceb8bb5b077bf859455a8763dc77 Mon Sep 17 00:00:00 2001 From: Caleb Lloyd Date: Thu, 7 Mar 2024 18:46:18 -0500 Subject: [PATCH 3/3] lint Signed-off-by: Caleb Lloyd --- .github/workflows/go.yaml | 1 - scripts/lint.sh | 1 - surveyor/collector_statz.go | 6 +++--- surveyor/collector_statz_test.go | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/go.yaml b/.github/workflows/go.yaml index 53516e1..ce50592 100644 --- a/.github/workflows/go.yaml +++ b/.github/workflows/go.yaml @@ -35,7 +35,6 @@ jobs: --disable gocritic \ --enable stylecheck \ --enable unconvert \ - --enable gocyclo \ --enable gofmt \ --enable misspell \ --enable unparam \ diff --git a/scripts/lint.sh b/scripts/lint.sh index d0323c9..b9e0d1a 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -9,7 +9,6 @@ $(go env GOPATH)/bin/golangci-lint run \ --enable interfacer \ --enable unconvert \ --enable dupl \ - --enable gocyclo \ --enable gofmt \ --enable goimports \ --enable misspell \ diff --git a/surveyor/collector_statz.go b/surveyor/collector_statz.go index 91db7e0..c1168a8 100644 --- a/surveyor/collector_statz.go +++ b/surveyor/collector_statz.go @@ -1039,7 +1039,7 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { pairs[i].id = rs.ID pairs[i].name = rs.Name } - sc.routeIDRemap = remapIdToIdx(pairs, sc.routeIDRemap) + sc.routeIDRemap = remapIDToIdx(pairs, sc.routeIDRemap) for _, rs := range sm.Stats.Routes { labels = sc.routeLabelValues(sm, rs) @@ -1055,7 +1055,7 @@ func (sc *StatzCollector) Collect(ch chan<- prometheus.Metric) { pairs[i].id = rs.ID pairs[i].name = rs.Name } - sc.gatewayIDRemap = remapIdToIdx(pairs, sc.gatewayIDRemap) + sc.gatewayIDRemap = remapIDToIdx(pairs, sc.gatewayIDRemap) for _, gw := range sm.Stats.Gateways { labels = sc.gatewayLabelValues(sm, gw) @@ -1225,7 +1225,7 @@ type nameIDPair struct { id uint64 } -func remapIdToIdx(pairs []nameIDPair, existingMapping map[string]map[uint64]int) map[string]map[uint64]int { +func remapIDToIdx(pairs []nameIDPair, existingMapping map[string]map[uint64]int) map[string]map[uint64]int { newMapping := make(map[string]map[uint64]int) // give existing the same idx diff --git a/surveyor/collector_statz_test.go b/surveyor/collector_statz_test.go index b3bfef8..d55097b 100644 --- a/surveyor/collector_statz_test.go +++ b/surveyor/collector_statz_test.go @@ -26,7 +26,7 @@ func TestRemapIdToIdx(t *testing.T) { {name: "c", id: 100}, } - newMapping := remapIdToIdx(pairs, existingMapping) + newMapping := remapIDToIdx(pairs, existingMapping) expected := map[string]map[uint64]int{ "a": { 100: 0,