Drop Pending size and count from replication metrics (#12378)

Real-time metrics calculated in memory rely on the initial replication metrics saved with data usage. However, these saved metrics can lag behind the actual state of the cluster at the time of a server restart, leading to inaccurate Pending size and count values being reported to Prometheus. Drop the Pending metrics, since pending replication can be monitored more reliably by applications using replication notifications. Signed-off-by: Poorna Krishnamoorthy <poorna@minio.io>
2026-03-16 17:53:43 +01:00 · 2021-05-31 20:26:52 -07:00
parent ab7410af11
commit 3690de0c6b
4 changed files with 10 additions and 113 deletions
--- a/cmd/metrics.go
+++ b/cmd/metrics.go
@@ -18,6 +18,7 @@
 package cmd

 import (
+	"math"
 	"net/http"
 	"strings"
 	"sync/atomic"
@@ -441,56 +442,23 @@ func getLatestReplicationStats(bucket string, u madmin.BucketUsageInfo) (s Bucke
 	for _, bucketStat := range bucketStats {
 		replStats.FailedCount += bucketStat.ReplicationStats.FailedCount
 		replStats.FailedSize += bucketStat.ReplicationStats.FailedSize
-		replStats.PendingCount += bucketStat.ReplicationStats.PendingCount
-		replStats.PendingSize += bucketStat.ReplicationStats.PendingSize
 		replStats.ReplicaSize += bucketStat.ReplicationStats.ReplicaSize
 		replStats.ReplicatedSize += bucketStat.ReplicationStats.ReplicatedSize
 	}
 	usageStat := globalReplicationStats.GetInitialUsage(bucket)
-	replStats.FailedCount += usageStat.FailedCount
-	replStats.FailedSize += usageStat.FailedSize
-	replStats.PendingCount += usageStat.PendingCount
-	replStats.PendingSize += usageStat.PendingSize
 	replStats.ReplicaSize += usageStat.ReplicaSize
 	replStats.ReplicatedSize += usageStat.ReplicatedSize

 	// use in memory replication stats if it is ahead of usage info.
+	s.ReplicatedSize = u.ReplicatedSize
 	if replStats.ReplicatedSize >= u.ReplicatedSize {
 		s.ReplicatedSize = replStats.ReplicatedSize
-	} else {
-		s.ReplicatedSize = u.ReplicatedSize
 	}
-
-	if replStats.PendingSize > u.ReplicationPendingSize {
-		s.PendingSize = replStats.PendingSize
-	} else {
-		s.PendingSize = u.ReplicationPendingSize
-	}
-
-	if replStats.FailedSize > u.ReplicationFailedSize {
-		s.FailedSize = replStats.FailedSize
-	} else {
-		s.FailedSize = u.ReplicationFailedSize
-	}
-
-	if replStats.ReplicaSize > u.ReplicaSize {
-		s.ReplicaSize = replStats.ReplicaSize
-	} else {
-		s.ReplicaSize = u.ReplicaSize
-	}
-
-	if replStats.PendingCount > u.ReplicationPendingCount {
-		s.PendingCount = replStats.PendingCount
-	} else {
-		s.PendingCount = u.ReplicationPendingCount
-	}
-
-	if replStats.FailedCount > u.ReplicationFailedCount {
-		s.FailedCount = replStats.FailedCount
-	} else {
-		s.FailedCount = u.ReplicationFailedCount
-	}
-
+	// Reset FailedSize and FailedCount to 0 for negative overflows which can
+	// happen since data usage picture can lag behind actual usage state at the time of cluster start
+	s.FailedSize = uint64(math.Max(float64(replStats.FailedSize), 0))
+	s.FailedCount = uint64(math.Max(float64(replStats.FailedCount), 0))
+	s.ReplicaSize = uint64(math.Max(float64(replStats.ReplicaSize), float64(u.ReplicaSize)))
 	return s
 }

@@ -537,15 +505,6 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
 			float64(usageInfo.ObjectsCount),
 			bucket,
 		)
-		ch <- prometheus.MustNewConstMetric(
-			prometheus.NewDesc(
-				prometheus.BuildFQName("bucket", "replication", "pending_size"),
-				"Total capacity pending to be replicated",
-				[]string{"bucket"}, nil),
-			prometheus.GaugeValue,
-			float64(stat.PendingSize),
-			bucket,
-		)
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
 				prometheus.BuildFQName("bucket", "replication", "failed_size"),
@@ -573,15 +532,6 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
 			float64(stat.ReplicaSize),
 			bucket,
 		)
-		ch <- prometheus.MustNewConstMetric(
-			prometheus.NewDesc(
-				prometheus.BuildFQName("bucket", "replication", "pending_count"),
-				"Total replication operations pending",
-				[]string{"bucket"}, nil),
-			prometheus.GaugeValue,
-			float64(stat.PendingCount),
-			bucket,
-		)
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
 				prometheus.BuildFQName("bucket", "replication", "failed_count"),