Skip to content
This repository has been archived by the owner on Jan 2, 2025. It is now read-only.

Commit

Permalink
Feat/empty blocks (#89)
Browse files Browse the repository at this point in the history
* feat: add tenderduty_empty_proposed_blocks metric

* feat: add empty_blocks_percentage and consecutive_empty alerting

* chore: update config

* fix: config

* fix: EmptyPercentageAlerts formatting

* feat: rm lib64 symlink
  • Loading branch information
quertc authored Dec 6, 2024
1 parent 6a2899b commit 5524674
Show file tree
Hide file tree
Showing 7 changed files with 161 additions and 13 deletions.
15 changes: 15 additions & 0 deletions example-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,21 @@ chains:
# Percentage Missed alert Pagerduty Severity
percentage_priority: warning

# Empty blocks notification configuration
consecutive_empty_enabled: yes
# How many consecutive empty blocks should trigger a notification?
consecutive_empty: 3
# Consecutive Empty alert Pagerduty Severity
consecutive_empty_priority: critical

# For some Cosmos EVM chains, empty consensus blocks may decrease execution uptime
# since they aren't included in EVM state. Should an alert be sent if empty blocks are detected?
empty_percentage_enabled: yes
# What percentage of proposed blocks being empty should trigger the alert?
empty_percentage: 2
# Percentage Empty alert Pagerduty Severity
empty_percentage_priority: warning

# Should an alert be sent if the validator is not in the active set, i.e., jailed,
# tombstoned, or unbonding?
alert_if_inactive: yes
Expand Down
71 changes: 70 additions & 1 deletion td2/alert.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ func (c *Config) alert(chainName, message, severity string, resolved bool, id *s
// and also updates a few prometheus stats
// FIXME: not watching for nodes that are lagging the head block!
func (cc *ChainConfig) watch() {
var missedAlarm, pctAlarm, noNodes bool
var missedAlarm, pctAlarm, noNodes, emptyBlocksAlarm, emptyPctAlarm bool
inactive := "jailed"
nodeAlarms := make(map[string]bool)

Expand Down Expand Up @@ -572,6 +572,75 @@ func (cc *ChainConfig) watch() {
cc.activeAlerts = alarms.getCount(cc.name)
}

// empty blocks alarm handling
if !emptyBlocksAlarm && cc.Alerts.ConsecutiveEmptyAlerts && int(cc.statConsecutiveEmpty) >= cc.Alerts.ConsecutiveEmpty {
// alert on empty blocks counter!
emptyBlocksAlarm = true
id := cc.valInfo.Valcons + "empty"
td.alert(
cc.name,
fmt.Sprintf("%s has proposed %d consecutive empty blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveEmpty, cc.ChainId),
cc.Alerts.ConsecutiveEmptyPriority,
false,
&id,
)
cc.activeAlerts = alarms.getCount(cc.name)
} else if emptyBlocksAlarm && int(cc.statConsecutiveEmpty) < cc.Alerts.ConsecutiveEmpty {
// clear the alert
emptyBlocksAlarm = false
id := cc.valInfo.Valcons + "empty"
td.alert(
cc.name,
fmt.Sprintf("%s has proposed %d consecutive empty blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveEmpty, cc.ChainId),
"info",
true,
&id,
)
cc.activeAlerts = alarms.getCount(cc.name)
}

// Empty-block percentage alarm.
//
// emptyBlocksPercent is the share of this validator's proposed blocks that
// were empty. It stays at zero until at least one proposal has been counted,
// which also avoids dividing by zero.
var emptyBlocksPercent float64
if cc.statTotalProps > 0 {
	emptyBlocksPercent = 100 * float64(cc.statTotalPropsEmpty) / float64(cc.statTotalProps)
}

// The alarm is raised when the percentage rises above the configured
// threshold and resolved when it falls below it; nothing happens while it is
// exactly at the threshold. emptyPctAlarm remembers whether the alarm is
// currently raised.
//
// The alert text is built from configured values only, so the resolving call
// carries exactly the same text as the firing call (as the consecutive-empty
// alert does). Embedding the live proposal counters made the two texts differ,
// because the counters always change between raising and resolving.
// NOTE(review): this assumes td.alert pairs a resolution with its alert by
// message text — confirm against td.alert.
switch emptyPctThreshold := float64(cc.Alerts.EmptyWindow); {
case !cc.Alerts.EmptyPercentageAlerts:
	// percentage-based empty-block alerting is disabled for this chain
case !emptyPctAlarm && emptyBlocksPercent > emptyPctThreshold:
	// alert on empty block percentage!
	emptyPctAlarm = true
	id := cc.valInfo.Valcons + "empty_percent"
	td.alert(
		cc.name,
		fmt.Sprintf("%s has > %d%% empty blocks on %s", cc.valInfo.Moniker, cc.Alerts.EmptyWindow, cc.ChainId),
		cc.Alerts.EmptyPercentagePriority,
		false,
		&id,
	)
	cc.activeAlerts = alarms.getCount(cc.name)
case emptyPctAlarm && emptyBlocksPercent < emptyPctThreshold:
	// clear the alert
	emptyPctAlarm = false
	id := cc.valInfo.Valcons + "empty_percent"
	td.alert(
		cc.name,
		fmt.Sprintf("%s has > %d%% empty blocks on %s", cc.valInfo.Moniker, cc.Alerts.EmptyWindow, cc.ChainId),
		"info",
		true,
		&id,
	)
	cc.activeAlerts = alarms.getCount(cc.name)
}

// node down alarms
for _, node := range cc.Nodes {
// window percentage missed block alarms
Expand Down
19 changes: 16 additions & 3 deletions td2/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ package tenderduty
import (
"context"
"fmt"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
"log"
"net/http"
"sync"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

var (
Expand All @@ -25,6 +26,8 @@ const (
metricPrevote
metricPrecommit
metricConsecutive
metricEmptyBlocks
metricConsecutiveEmpty
metricWindowMissed
metricWindowSize
metricLastBlockSeconds
Expand Down Expand Up @@ -91,6 +94,14 @@ func prometheusExporter(ctx context.Context, updates chan *promUpdate) {
Name: "tenderduty_consecutive_missed_blocks",
Help: "the current count of consecutively missed blocks regardless of precommit or prevote status",
}, chainLabels)
// emptyBlocks is the gauge exported as tenderduty_empty_proposed_blocks: the
// number of empty blocks (zero transactions) proposed since tenderduty started.
emptyBlocks := promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "tenderduty_empty_proposed_blocks",
Help: "count of empty blocks proposed (blocks with zero transactions) since tenderduty was started",
}, chainLabels)
// consecutiveEmpty is the gauge exported as tenderduty_consecutive_empty_blocks:
// the current streak of consecutively proposed empty blocks.
consecutiveEmpty := promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "tenderduty_consecutive_empty_blocks",
Help: "the current count of consecutively proposed empty blocks",
}, chainLabels)
windowSize := promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "tenderduty_missed_block_window",
Help: "the missed block aka slashing window",
Expand Down Expand Up @@ -135,6 +146,8 @@ func prometheusExporter(ctx context.Context, updates chan *promUpdate) {
metricPrevote: missedPrevote,
metricPrecommit: missedPrecommit,
metricConsecutive: missedConsecutive,
metricEmptyBlocks: emptyBlocks,
metricConsecutiveEmpty: consecutiveEmpty,
metricWindowMissed: missedWindow,
metricWindowSize: windowSize,
metricLastBlockSeconds: lastBlockSec,
Expand Down
16 changes: 16 additions & 0 deletions td2/static/grid.js
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,17 @@ function legend() {

offset += 65 * scale
grad = ctx.createLinearGradient(offset, 0, offset+gridW, gridH)
grad.addColorStop(0, 'rgb(255,215,0)');
grad.addColorStop(0.3, 'rgb(255,235,100)');
grad.addColorStop(0.8, 'rgb(255,223,66)');
ctx.fillStyle = grad
ctx.fillRect(offset, 0, gridW, gridH)
ctx.fillStyle = 'grey'
offset += gridW + gridW/2
ctx.fillText("proposer/empty",offset, gridH/1.2)

offset += 110 * scale
grad = ctx.createLinearGradient(offset, 0, offset+gridW, gridH)
grad.addColorStop(0, 'rgba(0,0,0,0.2)');
ctx.fillStyle = grad
ctx.fillRect(offset, 0, gridW, gridH)
Expand Down Expand Up @@ -148,6 +159,11 @@ function drawSeries(multiStates) {
crossThrough = false
const grad = ctx.createLinearGradient((i*gridW)+gridTextW, (gridH*j), (i * gridW) + gridW +gridTextW, (gridH*j))
switch (multiStates.Status[j].blocks[i]) {
case 5: // empty proposed
grad.addColorStop(0, 'rgb(255,215,0)');
grad.addColorStop(0.3, 'rgb(255,235,100)');
grad.addColorStop(0.8, 'rgb(255,223,66)');
break
case 4: // proposed
grad.addColorStop(0, 'rgb(123,255,66)');
grad.addColorStop(0.3, 'rgb(240,255,128)');
Expand Down
2 changes: 1 addition & 1 deletion td2/static/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
<canvas id="canvas" height="20" width="4735"></canvas>
</div>
<div id="legendContainer" class="uk-nav-center uk-background-secondary uk-padding-remove">
<canvas id="legend" height="32" width="700"></canvas>
<canvas id="legend" height="32" width="900"></canvas>
</div>
</div>

Expand Down
30 changes: 23 additions & 7 deletions td2/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,14 @@ type ChainConfig struct {
lastBlockNum int64
activeAlerts int

statTotalSigns float64
statTotalProps float64
statTotalMiss float64
statPrevoteMiss float64
statPrecommitMiss float64
statConsecutiveMiss float64
statTotalSigns float64
statTotalProps float64
statTotalMiss float64
statPrevoteMiss float64
statPrecommitMiss float64
statConsecutiveMiss float64
statTotalPropsEmpty float64
statConsecutiveEmpty float64

// ChainId is used to ensure any endpoints contacted claim to be on the correct chain. This is a weak verification,
// no light client validation is performed, so caution is advised when using public endpoints.
Expand Down Expand Up @@ -158,6 +160,20 @@ type AlertConfig struct {
// PercentageAlerts is whether to alert on percentage based misses
PercentageAlerts bool `yaml:"percentage_enabled"`

// ConsecutiveEmpty is how many consecutive empty proposed blocks trigger an alert
ConsecutiveEmpty int `yaml:"consecutive_empty"`
// Tag for pagerduty to set the alert priority for empty blocks
ConsecutiveEmptyPriority string `yaml:"consecutive_empty_priority"`
// Whether to alert on consecutive empty blocks
ConsecutiveEmptyAlerts bool `yaml:"consecutive_empty_enabled"`

// EmptyWindow is the alert threshold: the percentage of blocks proposed since tenderduty was started that are empty; exceeding it triggers an alert
EmptyWindow int `yaml:"empty_percentage"`
// EmptyPercentagePriority is a tag for pagerduty to route on priority
EmptyPercentagePriority string `yaml:"empty_percentage_priority"`
// EmptyPercentageAlerts is whether to alert on percentage based empty blocks
EmptyPercentageAlerts bool `yaml:"empty_percentage_enabled"`

// AlertIfInactive decides whether tenderduty sends an alert if the validator is not in the active set.
AlertIfInactive bool `yaml:"alert_if_inactive"`
// AlertIfNoServers: should an alert be sent if no servers are reachable?
Expand Down Expand Up @@ -319,7 +335,7 @@ func validateConfig(c *Config) (fatal bool, problems []string) {
fallthrough
case v.Alerts.Telegram.Enabled && !c.Telegram.Enabled:
problems = append(problems, fmt.Sprintf("warn: %20s is configured for telegram alerts, but it is not enabled", k))
case !v.Alerts.ConsecutiveAlerts && !v.Alerts.PercentageAlerts && !v.Alerts.AlertIfInactive && !v.Alerts.AlertIfNoServers:
case !v.Alerts.ConsecutiveAlerts && !v.Alerts.PercentageAlerts && !v.Alerts.AlertIfInactive && !v.Alerts.AlertIfNoServers && !v.Alerts.ConsecutiveEmptyAlerts && !v.Alerts.EmptyPercentageAlerts:
problems = append(problems, fmt.Sprintf("warn: %20s has no alert types configured", k))
fallthrough
case !v.Alerts.Pagerduty.Enabled && !v.Alerts.Discord.Enabled && !v.Alerts.Telegram.Enabled && !v.Alerts.Slack.Enabled:
Expand Down
21 changes: 20 additions & 1 deletion td2/ws.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const (
StatusPrecommit
StatusSigned
StatusProposed
StatusProposedEmpty
)

// StatusUpdate is passed over a channel from the websocket client indicating the current state, it is immediate in the
Expand All @@ -41,6 +42,7 @@ type StatusUpdate struct {
Height int64
Status StatusType
Final bool
Empty bool
}

// WsReply is a trimmed down version of the JSON sent from a tendermint websocket subscription.
Expand Down Expand Up @@ -150,6 +152,13 @@ func (cc *ChainConfig) WsRun() {
cc.statTotalProps += 1
cc.statTotalSigns += 1
cc.statConsecutiveMiss = 0
cc.statConsecutiveEmpty = 0
case StatusProposedEmpty:
cc.statTotalPropsEmpty += 1
cc.statTotalProps += 1
cc.statTotalSigns += 1
cc.statConsecutiveMiss = 0
cc.statConsecutiveEmpty += 1
}
signState = -1
healthyNodes := 0
Expand Down Expand Up @@ -196,6 +205,8 @@ func (cc *ChainConfig) WsRun() {
td.statsChan <- cc.mkUpdate(metricPrevote, cc.statPrevoteMiss, "")
td.statsChan <- cc.mkUpdate(metricPrecommit, cc.statPrecommitMiss, "")
td.statsChan <- cc.mkUpdate(metricConsecutive, cc.statConsecutiveMiss, "")
td.statsChan <- cc.mkUpdate(metricEmptyBlocks, float64(cc.statTotalPropsEmpty), "")
td.statsChan <- cc.mkUpdate(metricConsecutiveEmpty, float64(cc.statConsecutiveEmpty), "")
td.statsChan <- cc.mkUpdate(metricUnealthyNodes, float64(len(cc.Nodes)-healthyNodes), "")
}
}
Expand Down Expand Up @@ -286,6 +297,9 @@ type rawBlock struct {
LastCommit struct {
Signatures []signature `json:"signatures"`
} `json:"last_commit"`
Data struct {
Txs []json.RawMessage `json:"txs"`
} `json:"data"`
} `json:"block"`
}

Expand Down Expand Up @@ -327,9 +341,14 @@ func handleBlocks(ctx context.Context, blocks chan *WsReply, results chan Status
Height: b.Block.Header.Height.val(),
Status: Statusmissed,
Final: true,
Empty: len(b.Block.Data.Txs) == 0,
}
if b.Block.Header.ProposerAddress == address {
upd.Status = StatusProposed
if upd.Empty {
upd.Status = StatusProposedEmpty
} else {
upd.Status = StatusProposed
}
} else if b.find(address) {
upd.Status = StatusSigned
}
Expand Down

0 comments on commit 5524674

Please sign in to comment.