From 4dff4fd5d6fee4e773fc41c6461c203cf6abae45 Mon Sep 17 00:00:00 2001 From: Vlad Gusev Date: Thu, 12 Dec 2024 17:43:52 +0200 Subject: [PATCH] Add basic Prometheus metrics for binlog-collector The following metrics have been added: - pxc_binlog_collector_success_total - pxc_binlog_collector_failure_total - pxc_binlog_collector_last_processing_timestamp - pxc_binlog_collector_last_upload_timestamp - pxc_binlog_collector_gap_detected_total --- cmd/pitr/collector/collector.go | 50 ++++++++++++++++++++++ cmd/pitr/main.go | 16 +++++++ pkg/pxc/app/deployment/binlog-collector.go | 10 +++++ 3 files changed, 76 insertions(+) diff --git a/cmd/pitr/collector/collector.go b/cmd/pitr/collector/collector.go index c35cf41a0d..fea808e03d 100644 --- a/cmd/pitr/collector/collector.go +++ b/cmd/pitr/collector/collector.go @@ -16,11 +16,53 @@ import ( "github.com/go-sql-driver/mysql" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" "github.com/percona/percona-xtradb-cluster-operator/cmd/pitr/pxc" "github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/backup/storage" ) +var ( + pxcBinlogCollectorBackupSuccess = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "pxc_binlog_collector_success_total", + Help: "Total number of successful binlog backups", + }, + ) + pxcBinlogCollectorBackupFailure = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "pxc_binlog_collector_failure_total", + Help: "Total number of failed binlog backups", + }, + ) + pxcBinlogCollectorLastProcessingTime = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "pxc_binlog_collector_last_processing_timestamp", + Help: "Timestamp of the last successful binlog processing", + }, + ) + pxcBinlogCollectorLastUploadTime = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "pxc_binlog_collector_last_upload_timestamp", + Help: "Timestamp of the last successful binlog upload", + }, + ) + pxcBinlogCollectorGapDetected = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "pxc_binlog_collector_gap_detected_total", + Help: "Total number of times the gap was detected in binlog", + }, + ) +) + +func init() { + prometheus.MustRegister(pxcBinlogCollectorBackupSuccess) + prometheus.MustRegister(pxcBinlogCollectorBackupFailure) + prometheus.MustRegister(pxcBinlogCollectorLastProcessingTime) + prometheus.MustRegister(pxcBinlogCollectorLastUploadTime) + prometheus.MustRegister(pxcBinlogCollectorGapDetected) +} + type Collector struct { db *pxc.PXC storage storage.Storage @@ -103,6 +145,7 @@ func New(ctx context.Context, c Config) (*Collector, error) { func (c *Collector) Run(ctx context.Context) error { err := c.newDB(ctx) if err != nil { + pxcBinlogCollectorBackupFailure.Inc() return errors.Wrap(err, "new db connection") } defer c.close() @@ -113,9 +156,11 @@ func (c *Collector) Run(ctx context.Context) error { err = c.CollectBinLogs(ctx) if err != nil { + pxcBinlogCollectorBackupFailure.Inc() return errors.Wrap(err, "collect binlog files") } + pxcBinlogCollectorBackupSuccess.Inc() return nil } @@ -369,6 +414,7 @@ func (c *Collector) CollectBinLogs(ctx context.Context) error { if lastUploadedBinlogName == "" { log.Println("ERROR: Couldn't find the binlog that contains GTID set:", c.lastUploadedSet.Raw()) log.Println("ERROR: Gap detected in the binary logs. Binary logs will be uploaded anyway, but full backup needed for consistent recovery.") + pxcBinlogCollectorGapDetected.Inc() if err := createGapFile(c.lastUploadedSet); err != nil { return errors.Wrap(err, "create gap file") } @@ -382,6 +428,7 @@ func (c *Collector) CollectBinLogs(ctx context.Context) error { if len(list) == 0 { log.Println("No binlogs to upload") + pxcBinlogCollectorLastProcessingTime.SetToCurrentTime() return nil } @@ -411,6 +458,9 @@ func (c *Collector) CollectBinLogs(ctx context.Context) error { return errors.Wrap(err, "update timeline file") } } + + pxcBinlogCollectorLastUploadTime.SetToCurrentTime() + pxcBinlogCollectorLastProcessingTime.SetToCurrentTime() return nil } diff --git a/cmd/pitr/main.go b/cmd/pitr/main.go index 51e07e9fe7..689db13daf 100644 --- a/cmd/pitr/main.go +++ b/cmd/pitr/main.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "log" + "net/http" "os" "os/signal" "syscall" @@ -14,6 +15,7 @@ import ( "github.com/percona/percona-xtradb-cluster-operator/cmd/pitr/recoverer" "github.com/caarlos0/env" + "github.com/prometheus/client_golang/prometheus/promhttp" ) func main() { @@ -23,6 +25,13 @@ func main() { } ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGTERM, os.Interrupt) defer stop() + + go func() { + http.Handle("/metrics", promhttp.Handler()) + http.HandleFunc("/health", healthHandler) + log.Fatal(http.ListenAndServe(":8080", nil)) + }() + switch command { case "collect": runCollector(ctx) @@ -34,6 +43,13 @@ func main() { } } +func healthHandler(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + if _, err := w.Write([]byte("ok")); err != nil { + log.Println("ERROR: writing health response:", err) + } +} + func runCollector(ctx context.Context) { config, err := getCollectorConfig() if err != nil { diff --git a/pkg/pxc/app/deployment/binlog-collector.go b/pkg/pxc/app/deployment/binlog-collector.go index b80331bfb9..66fe96fdf2 100644 --- a/pkg/pxc/app/deployment/binlog-collector.go +++ b/pkg/pxc/app/deployment/binlog-collector.go @@ -91,6 +91,16 @@ func GetBinlogCollectorDeployment(cr *api.PerconaXtraDBCluster, initImage string }, }, } + + if cr.CompareVersionWith("1.16.0") >= 0 { + container.Ports = []corev1.ContainerPort{ + { + ContainerPort: 8080, + Name: "metrics", + }, + } + } + replicas := int32(1) var initContainers []corev1.Container