diff --git a/engine/access/ping/engine.go b/engine/access/ping/engine.go index 4cd9da347e0..aa2511b1aad 100644 --- a/engine/access/ping/engine.go +++ b/engine/access/ping/engine.go @@ -107,21 +107,18 @@ func (e *Engine) startPing() { // pingNode pings the given peer and updates the metrics with the result and the additional node information func (e *Engine) pingNode(peer *flow.Identity) { id := peer.ID() - reachable := e.pingAddress(id) - info := e.nodeInfo[id] - e.metrics.NodeReachable(peer, info, reachable) -} -// pingAddress sends a ping request to the given address, and block until either receive -// a ping respond then return true, or hitting a timeout and return false. -// if there is other unknown error, return false -func (e *Engine) pingAddress(target flow.Identifier) bool { - // ignore the ping duration for now - // ping will timeout in libp2p.PingTimeoutSecs seconds - _, err := e.middleware.Ping(target) + // ping the node + rtt, err := e.middleware.Ping(id) // ping will timeout in libp2p.PingTimeoutSecs seconds if err != nil { - e.log.Debug().Err(err).Str("target", target.String()).Msg("failed to ping") - return false + e.log.Debug().Err(err).Str("target", id.String()).Msg("failed to ping") + // report the rtt duration as negative to make it easier to distinguish between pingable and non-pingable nodes + rtt = -1 } - return true + + // get the additional info about the node + info := e.nodeInfo[id] + + // update metric + e.metrics.NodeReachable(peer, info, rtt) } diff --git a/module/metrics.go b/module/metrics.go index 5c70c2c0c22..83ed3707008 100644 --- a/module/metrics.go +++ b/module/metrics.go @@ -328,7 +328,7 @@ type TransactionMetrics interface { } type PingMetrics interface { - // NodeReachable tracks the node availability of the node and reports it as 1 if the node was successfully pinged, 0 - // otherwise. The nodeInfo provides additional information about the node such as the name of the node operator - NodeReachable(node *flow.Identity, nodeInfo string, reachable bool) + // NodeReachable tracks the round trip time in milliseconds taken to ping a node + // The nodeInfo provides additional information about the node such as the name of the node operator + NodeReachable(node *flow.Identity, nodeInfo string, rtt time.Duration) } diff --git a/module/metrics/ping.go b/module/metrics/ping.go index be6fbb0ff40..2261238593e 100644 --- a/module/metrics/ping.go +++ b/module/metrics/ping.go @@ -1,6 +1,8 @@ package metrics import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -23,14 +25,17 @@ func NewPingCollector() *PingCollector { return pc } -func (pc *PingCollector) NodeReachable(node *flow.Identity, nodeInfo string, reachable bool) { - var val float64 - if reachable { - val = 1 +func (pc *PingCollector) NodeReachable(node *flow.Identity, nodeInfo string, rtt time.Duration) { + var rttValue float64 + if rtt > 0 { + rttValue = float64(rtt.Milliseconds()) + } else { + rttValue = -1 } + pc.reachable.With(prometheus.Labels{ LabelNodeID: node.String(), LabelNodeRole: node.Role.String(), LabelNodeInfo: nodeInfo}). - Set(val) + Set(rttValue) } diff --git a/module/mock/ping_metrics.go b/module/mock/ping_metrics.go index 83248d793ba..6447fc74418 100644 --- a/module/mock/ping_metrics.go +++ b/module/mock/ping_metrics.go @@ -5,6 +5,8 @@ package mock import ( flow "github.com/onflow/flow-go/model/flow" mock "github.com/stretchr/testify/mock" + + time "time" ) // PingMetrics is an autogenerated mock type for the PingMetrics type @@ -12,7 +14,7 @@ type PingMetrics struct { mock.Mock } -// NodeReachable provides a mock function with given fields: node, nodeInfo, reachable -func (_m *PingMetrics) NodeReachable(node *flow.Identity, nodeInfo string, reachable bool) { - _m.Called(node, nodeInfo, reachable) +// NodeReachable provides a mock function with given fields: node, nodeInfo, rtt +func (_m *PingMetrics) NodeReachable(node *flow.Identity, nodeInfo string, rtt time.Duration) { + _m.Called(node, nodeInfo, rtt) }