Skip to content

Commit

Permalink
fix: panic in prometheus (#1427)
Browse files Browse the repository at this point in the history
We witnessed this panic in one of our testnets using the v2 mempool
caused from incorrectly configuring prometheus.

```
panic: inconsistent label cardinality: expected 3 label values but got 2 in prometheus.Labels{"chain_id":"test", "version":"6f334a2"}

goroutine 151 [running]:
github.com/prometheus/client_golang/prometheus.(*CounterVec).With(...)
	/go/pkg/mod/github.com/prometheus/[email protected]/prometheus/counter.go:296
github.com/go-kit/kit/metrics/prometheus.(*Counter).Add(0xc0013035a0, 0x0)
	/go/pkg/mod/github.com/go-kit/[email protected]/metrics/prometheus/prometheus.go:45 +0x131
github.com/tendermint/tendermint/mempool/cat.(*TxPool).CheckToPurgeExpiredTxs(0xc0012f7d40)
```

This PR fixes this problem. To make things safer in the future, it
introduces a new metric "ExpiredTxs", specifically for those
transactions that were kicked from the mempool due to the TTL
  • Loading branch information
cmwaters authored Jul 26, 2024
1 parent dae3ca9 commit 3aea3d1
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 30 deletions.
2 changes: 1 addition & 1 deletion mempool/cat/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -747,7 +747,7 @@ func (txmp *TxPool) purgeExpiredTxs(blockHeight int64) {
for _, tx := range purgedTxs {
txmp.evictedTxCache.Push(tx.key)
}
txmp.metrics.EvictedTxs.Add(float64(numExpired))
txmp.metrics.ExpiredTxs.Add(float64(numExpired))

// purge old evicted and seen transactions
if txmp.config.TTLDuration == 0 {
Expand Down
31 changes: 15 additions & 16 deletions mempool/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,6 @@ const (
// MetricsSubsystem is a subsystem shared by all metrics exposed by this
// package.
MetricsSubsystem = "mempool"

TypeLabel = "type"

FailedPrecheck = "precheck"
FailedAdding = "adding"
FailedRecheck = "recheck"

EvictedNewTxFullMempool = "full-removed-incoming"
EvictedExistingTxFullMempool = "full-removed-existing"
EvictedTxExpiredBlocks = "expired-ttl-blocks"
EvictedTxExpiredTime = "expired-ttl-time"
)

// Metrics contains metrics exposed by this package.
Expand All @@ -42,10 +31,13 @@ type Metrics struct {

// EvictedTxs defines the number of evicted transactions. These are valid
// transactions that passed CheckTx and existed in the mempool but were later
// evicted to make room for higher priority valid transactions that passed
// CheckTx.
// evicted to make room for higher priority valid transactions
EvictedTxs metrics.Counter

// ExpiredTxs defines transactions that were removed from the mempool due
// to a TTL
ExpiredTxs metrics.Counter

// SuccessfulTxs defines the number of transactions that successfully made
// it into a block.
SuccessfulTxs metrics.Counter
Expand Down Expand Up @@ -75,7 +67,6 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
for i := 0; i < len(labelsAndValues); i += 2 {
labels = append(labels, labelsAndValues[i])
}
typedCounterLabels := append(append(make([]string, 0, len(labels)+1), labels...), TypeLabel)
return &Metrics{
Size: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Expand Down Expand Up @@ -104,14 +95,21 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
Subsystem: MetricsSubsystem,
Name: "failed_txs",
Help: "Number of failed transactions.",
}, typedCounterLabels).With(labelsAndValues...),
}, labels).With(labelsAndValues...),

EvictedTxs: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "evicted_txs",
Help: "Number of evicted transactions.",
}, typedCounterLabels).With(labelsAndValues...),
}, labels).With(labelsAndValues...),

ExpiredTxs: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "expired_txs",
Help: "Number of expired transactions.",
}, labels).With(labelsAndValues...),

SuccessfulTxs: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Expand Down Expand Up @@ -158,6 +156,7 @@ func NopMetrics() *Metrics {
TxSizeBytes: discard.NewHistogram(),
FailedTxs: discard.NewCounter(),
EvictedTxs: discard.NewCounter(),
ExpiredTxs: discard.NewCounter(),
SuccessfulTxs: discard.NewCounter(),
RecheckTimes: discard.NewCounter(),
AlreadySeenTxs: discard.NewCounter(),
Expand Down
20 changes: 8 additions & 12 deletions mempool/v1/mempool.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ func (txmp *TxMempool) CheckTx(tx types.Tx, cb func(*abci.Response), txInfo memp
// If a precheck hook is defined, call it before invoking the application.
if txmp.preCheck != nil {
if err := txmp.preCheck(tx); err != nil {
txmp.metrics.FailedTxs.With(mempool.TypeLabel, mempool.FailedPrecheck).Add(1)
txmp.metrics.FailedTxs.Add(1)
return 0, mempool.ErrPreCheck{Reason: err}
}
}
Expand Down Expand Up @@ -500,7 +500,7 @@ func (txmp *TxMempool) addNewTransaction(wtx *WrappedTx, checkTxRes *abci.Respon
"post_check_err", err,
)

txmp.metrics.FailedTxs.With(mempool.TypeLabel, mempool.FailedAdding).Add(1)
txmp.metrics.FailedTxs.Add(1)

// Remove the invalid transaction from the cache, unless the operator has
// instructed us to keep invalid transactions.
Expand Down Expand Up @@ -568,7 +568,7 @@ func (txmp *TxMempool) addNewTransaction(wtx *WrappedTx, checkTxRes *abci.Respon
checkTxRes.MempoolError =
fmt.Sprintf("rejected valid incoming transaction; mempool is full (%X)",
wtx.tx.Hash())
txmp.metrics.EvictedTxs.With(mempool.TypeLabel, mempool.EvictedNewTxFullMempool).Add(1)
txmp.metrics.EvictedTxs.Add(1)
// Add it to evicted transactions cache
txmp.evictedTxs.Push(wtx.tx)
return
Expand Down Expand Up @@ -602,7 +602,7 @@ func (txmp *TxMempool) addNewTransaction(wtx *WrappedTx, checkTxRes *abci.Respon
)
txmp.removeTxByElement(vic)
txmp.cache.Remove(w.tx)
txmp.metrics.EvictedTxs.With(mempool.TypeLabel, mempool.EvictedExistingTxFullMempool).Add(1)
txmp.metrics.EvictedTxs.Add(1)
// Add it to evicted transactions cache
txmp.evictedTxs.Push(w.tx)
// We may not need to evict all the eligible transactions. Bail out
Expand Down Expand Up @@ -681,7 +681,7 @@ func (txmp *TxMempool) handleRecheckResult(tx types.Tx, checkTxRes *abci.Respons
"code", checkTxRes.Code,
)
txmp.removeTxByElement(elt)
txmp.metrics.FailedTxs.With(mempool.TypeLabel, mempool.FailedRecheck).Add(1)
txmp.metrics.FailedTxs.Add(1)
if !txmp.config.KeepInvalidTxsInCache {
txmp.cache.Remove(wtx.tx)
}
Expand Down Expand Up @@ -791,16 +791,12 @@ func (txmp *TxMempool) purgeExpiredTxs(blockHeight int64) {
next := cur.Next()

w := cur.Value.(*WrappedTx)
if txmp.config.TTLNumBlocks > 0 && (blockHeight-w.height) > txmp.config.TTLNumBlocks {
if txmp.config.TTLNumBlocks > 0 && (blockHeight-w.height) > txmp.config.TTLNumBlocks ||
txmp.config.TTLDuration > 0 && now.Sub(w.timestamp) > txmp.config.TTLDuration {
txmp.removeTxByElement(cur)
txmp.cache.Remove(w.tx)
txmp.metrics.EvictedTxs.With(mempool.TypeLabel, mempool.EvictedTxExpiredBlocks).Add(1)
txmp.evictedTxs.Push(w.tx)
} else if txmp.config.TTLDuration > 0 && now.Sub(w.timestamp) > txmp.config.TTLDuration {
txmp.removeTxByElement(cur)
txmp.cache.Remove(w.tx)
txmp.evictedTxs.Push(w.tx)
txmp.metrics.EvictedTxs.With(mempool.TypeLabel, mempool.EvictedTxExpiredTime).Add(1)
txmp.metrics.ExpiredTxs.Add(1)
}
cur = next
}
Expand Down
2 changes: 1 addition & 1 deletion state/mocks/block_store.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 3aea3d1

Please sign in to comment.