This repository has been archived by the owner on Apr 19, 2024. It is now read-only.

PIP-1490: Expand Opentracing instrumentation.
Baliedge committed Nov 19, 2021
1 parent 0aac075 commit 702dcd7
Showing 7 changed files with 46 additions and 14 deletions.
12 changes: 6 additions & 6 deletions algorithms.go
@@ -296,7 +296,7 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
Limit: b.Limit,
Remaining: int64(b.Remaining),
Status: Status_UNDER_LIMIT,
ResetTime: now + (b.Limit - int64(b.Remaining)) * int64(rate),
ResetTime: now + (b.Limit-int64(b.Remaining))*int64(rate),
}

if s != nil {
@@ -317,7 +317,7 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
if int64(b.Remaining) == r.Hits {
b.Remaining -= float64(r.Hits)
rl.Remaining = 0
rl.ResetTime = now + (rl.Limit - rl.Remaining) * int64(rate)
rl.ResetTime = now + (rl.Limit-rl.Remaining)*int64(rate)
return rl, nil
}

@@ -335,8 +335,8 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *

b.Remaining -= float64(r.Hits)
rl.Remaining = int64(b.Remaining)
rl.ResetTime = now + (rl.Limit - rl.Remaining) * int64(rate)
c.UpdateExpiration(r.HashKey(), now + duration)
rl.ResetTime = now + (rl.Limit-rl.Remaining)*int64(rate)
c.UpdateExpiration(r.HashKey(), now+duration)
return rl, nil
}

@@ -366,14 +366,14 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
Status: Status_UNDER_LIMIT,
Limit: b.Limit,
Remaining: r.Burst - r.Hits,
ResetTime: now + (b.Limit - (r.Burst - r.Hits)) * int64(rate),
ResetTime: now + (b.Limit-(r.Burst-r.Hits))*int64(rate),
}

// Client could be requesting that we start with the bucket OVER_LIMIT
if r.Hits > r.Burst {
rl.Status = Status_OVER_LIMIT
rl.Remaining = 0
rl.ResetTime = now + (rl.Limit - rl.Remaining) * int64(rate)
rl.ResetTime = now + (rl.Limit-rl.Remaining)*int64(rate)
b.Remaining = 0
}

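The edits in this file are gofmt spacing fixes around one expression: the leaky bucket's reset time is the current time plus the time needed to drain the hits still in the bucket, where rate is the per-hit drain interval in milliseconds. A standalone sketch of that arithmetic (the function, values, and names below are illustrative, not the library's):

```go
package main

import "fmt"

// resetTime mirrors the expression above: a bucket holding limit-remaining
// hits, leaking one hit every rateMs milliseconds, is empty again after
// (limit-remaining)*rateMs milliseconds.
func resetTime(nowMs, limit, remaining, rateMs int64) int64 {
	used := limit - remaining // hits currently in the bucket
	return nowMs + used*rateMs
}

func main() {
	// Example: limit 10, 6 remaining (4 hits in the bucket), 3000 ms per hit.
	fmt.Println(resetTime(1_700_000_000_000, 10, 6, 3000)) // 1700000012000
}
```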
6 changes: 6 additions & 0 deletions cache.go
@@ -21,6 +21,7 @@ package gubernator
import (
"container/list"
"sync"
"sync/atomic"

"github.com/mailgun/holster/v4/clock"
"github.com/mailgun/holster/v4/setter"
@@ -68,6 +69,9 @@ type LRUCache struct {
// Stats
sizeMetric *prometheus.Desc
accessMetric *prometheus.Desc

LockCounter uint64
UnlockCounter uint64
}

type CacheItem struct {
@@ -102,10 +106,12 @@ func NewLRUCache(maxSize int) *LRUCache {
}

func (c *LRUCache) Lock() {
atomic.AddUint64(&c.LockCounter, 1)
c.mutex.Lock()
}

func (c *LRUCache) Unlock() {
atomic.AddUint64(&c.UnlockCounter, 1)
c.mutex.Unlock()
}

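The two new fields turn the cache mutex into a counted lock: Lock() and Unlock() bump their counters atomically before delegating to sync.Mutex, so the difference LockCounter - UnlockCounter at any moment approximates how many Lock() calls are still outstanding (held or waiting). A minimal, self-contained sketch of the same pattern — the type and method names are illustrative, not gubernator's:

```go
// Sketch of the counter-wrapped mutex pattern used above.
// countingMutex is an illustrative stand-in for LRUCache, not the library's API.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type countingMutex struct {
	mu            sync.Mutex
	lockCounter   uint64
	unlockCounter uint64
}

func (m *countingMutex) Lock() {
	atomic.AddUint64(&m.lockCounter, 1) // counted before blocking on the mutex
	m.mu.Lock()
}

func (m *countingMutex) Unlock() {
	atomic.AddUint64(&m.unlockCounter, 1)
	m.mu.Unlock()
}

// pending approximates how many Lock() calls have not yet been matched by Unlock().
func (m *countingMutex) pending() uint64 {
	return atomic.LoadUint64(&m.lockCounter) - atomic.LoadUint64(&m.unlockCounter)
}

func main() {
	var m countingMutex
	m.Lock()
	fmt.Println(m.pending()) // 1 while the lock is held
	m.Unlock()
	fmt.Println(m.pending()) // 0
}
```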
4 changes: 2 additions & 2 deletions functional_test.go
@@ -391,7 +391,7 @@ func TestLeakyBucket(t *testing.T) {
assert.Equal(t, test.Status, rl.Status)
assert.Equal(t, test.Remaining, rl.Remaining)
assert.Equal(t, int64(10), rl.Limit)
assert.Equal(t, clock.Now().Unix() + (rl.Limit - rl.Remaining) * 3, rl.ResetTime/1000)
assert.Equal(t, clock.Now().Unix()+(rl.Limit-rl.Remaining)*3, rl.ResetTime/1000)
clock.Advance(test.Sleep)
})
}
@@ -498,7 +498,7 @@ func TestLeakyBucketWithBurst(t *testing.T) {
assert.Equal(t, test.Status, rl.Status)
assert.Equal(t, test.Remaining, rl.Remaining)
assert.Equal(t, int64(10), rl.Limit)
assert.Equal(t, clock.Now().Unix() + (rl.Limit - rl.Remaining) * 3, rl.ResetTime/1000)
assert.Equal(t, clock.Now().Unix()+(rl.Limit-rl.Remaining)*3, rl.ResetTime/1000)
clock.Advance(test.Sleep)
})
}
22 changes: 21 additions & 1 deletion gubernator.go
@@ -21,6 +21,7 @@ import (
"fmt"
"strings"
"sync"
"sync/atomic"

"github.com/mailgun/gubernator/v2/tracing"
"github.com/mailgun/holster/v4/setter"
@@ -80,6 +81,18 @@ var funcTimeMetric = prometheus.NewSummaryVec(prometheus.SummaryOpts{
var asyncRequestsRetriesCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "baliedge_asyncrequests_retries",
}, []string{"name"})
var queueLengthMetric = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "baliedge_queue_length",
Objectives: map[float64]float64{
0.99: 0.001,
},
}, []string{"peerAddr"})
var lockCounterMetric = prometheus.NewSummary(prometheus.SummaryOpts{
Name: "baliedge_lock_counter",
Objectives: map[float64]float64{
0.99: 0.001,
},
})

// NewV1Instance instantiate a single instance of a gubernator peer and registers this
// instance with the provided GRPCServer.
@@ -514,7 +527,10 @@ func (s *V1Instance) getRateLimit(ctx context.Context, r *RateLimitReq) (*RateLi
s.conf.Cache.Lock()
defer s.conf.Cache.Unlock()
lockTimer.ObserveDuration()
tracing.LogInfo(span, "conf.Cache.Lock()")
lruCache := s.conf.Cache.(*LRUCache)
lockCounter := atomic.LoadUint64(&lruCache.LockCounter) - atomic.LoadUint64(&lruCache.UnlockCounter)
lockCounterMetric.Observe(float64(lockCounter))
tracing.LogInfo(span, "conf.Cache.Lock()", "lockCounter", lockCounter)

if HasBehavior(r.Behavior, Behavior_GLOBAL) {
s.global.QueueUpdate(r)
@@ -682,6 +698,8 @@ func (s *V1Instance) Describe(ch chan<- *prometheus.Desc) {
getPeerRateLimitLockDurationMetric.Describe(ch)
funcTimeMetric.Describe(ch)
asyncRequestsRetriesCounter.Describe(ch)
queueLengthMetric.Describe(ch)
lockCounterMetric.Describe(ch)
}

// Collect fetches metrics from the server for use by prometheus
@@ -693,6 +711,8 @@ func (s *V1Instance) Collect(ch chan<- prometheus.Metric) {
getPeerRateLimitLockDurationMetric.Collect(ch)
funcTimeMetric.Collect(ch)
asyncRequestsRetriesCounter.Collect(ch)
queueLengthMetric.Collect(ch)
lockCounterMetric.Collect(ch)
}

// HasBehavior returns true if the provided behavior is set
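gubernator.go defines the two new summaries (a per-peer queue-length summary and a lock-counter summary, each tracking only the 0.99 quantile), observes the lock counter inside getRateLimit as the difference between the cache's LockCounter and UnlockCounter, and wires both metrics into V1Instance's Describe and Collect so they are actually exported. The sketch below shows that Describe/Collect delegation pattern in isolation; the type and metric names are illustrative, not gubernator's API:

```go
// Standalone sketch of the Collector delegation pattern used by V1Instance:
// the server forwards Describe/Collect to each of its metrics, so a newly
// added metric must be wired into both methods (as this commit does).
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

type server struct {
	queueLength prometheus.Summary
	lockCounter prometheus.Summary
}

func (s *server) Describe(ch chan<- *prometheus.Desc) {
	s.queueLength.Describe(ch)
	s.lockCounter.Describe(ch)
}

func (s *server) Collect(ch chan<- prometheus.Metric) {
	s.queueLength.Collect(ch)
	s.lockCounter.Collect(ch)
}

func main() {
	s := &server{
		queueLength: prometheus.NewSummary(prometheus.SummaryOpts{
			Name:       "example_queue_length",
			Objectives: map[float64]float64{0.99: 0.001}, // track p99 with 0.1% error
		}),
		lockCounter: prometheus.NewSummary(prometheus.SummaryOpts{
			Name:       "example_lock_counter",
			Objectives: map[float64]float64{0.99: 0.001},
		}),
	}

	reg := prometheus.NewRegistry()
	reg.MustRegister(s) // works because *server satisfies prometheus.Collector

	s.lockCounter.Observe(1)
	mfs, _ := reg.Gather()
	fmt.Println(len(mfs)) // 2 metric families exported
}
```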
1 change: 1 addition & 0 deletions kubernetesconfig.go
@@ -1,3 +1,4 @@
//go:build !local
// +build !local

package gubernator
1 change: 1 addition & 0 deletions kubernetesconfig_local.go
@@ -1,3 +1,4 @@
//go:build local
// +build local

package gubernator
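Both kubernetesconfig files gain the //go:build form of their build constraint next to the legacy // +build comment; Go 1.17's gofmt adds the newer syntax automatically and requires the two lines to agree. Illustratively (this header just demonstrates the constraint syntax, it is not a new file in the repo), a file guarded by the local tag is compiled only when that tag is passed:

```go
//go:build local
// +build local

// Selected only with `go build -tags local`; the sibling file carrying
// `//go:build !local` is used for every other build.
package gubernator
```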
14 changes: 9 additions & 5 deletions peer_client.go
@@ -195,7 +195,7 @@ func (c *PeerClient) GetPeerRateLimit(ctx context.Context, r *RateLimitReq) (*Ra
logrus.
WithError(errors.WithStack(err)).
WithFields(logrus.Fields{
"request": r,
"request": r,
"peerAddr": c.conf.Info.GRPCAddress,
}).
Error(errPart)
@@ -330,12 +330,16 @@ func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq
}
req := request{
request: r,
resp: make(chan *response, 1),
ctx: ctx,
resp:    make(chan *response, 1),
ctx:     ctx,
}

// Enqueue the request to be sent
span2, _ := tracing.StartNamedSpan(ctx, "Enqueue request")
span2.SetTag("queueLen", len(c.queue))
srcPeerAddr := c.Info().GRPCAddress
queueLengthMetric.WithLabelValues(srcPeerAddr).Observe(float64(len(c.queue)))

c.queue <- &req
span2.Finish()

@@ -396,7 +400,7 @@ func (c *PeerClient) run() {
if len(queue) == c.conf.Behavior.BatchLimit {
logMsg := "run() reached batch limit"
logrus.WithFields(logrus.Fields{
"queueLen": len(queue),
"queueLen": len(queue),
"batchLimit": c.conf.Behavior.BatchLimit,
}).Info(logMsg)
tracing.LogInfo(reqSpan, logMsg)
@@ -454,7 +458,7 @@ func (c *PeerClient) sendQueue(ctx context.Context, queue []*request) {
logrus.
WithError(err).
WithFields(logrus.Fields{
"queueLen": len(queue),
"queueLen": len(queue),
"batchTimeout": c.conf.Behavior.BatchTimeout.String(),
}).
Error(logPart)
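In getPeerRateLimitsBatch, the depth of the buffered request channel is now recorded just before each enqueue, both as a span tag and as an observation on queueLengthMetric labelled with the peer's gRPC address, so batching back-pressure shows up in traces and in Prometheus. A standalone sketch of that channel-depth observation (the metric name, label value, and types below are illustrative):

```go
// Sketch of observing buffered-channel depth before enqueueing, mirroring the
// queueLengthMetric usage above. Types and names are illustrative, not gubernator's.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

type request struct{ key string }

var queueLength = prometheus.NewSummaryVec(prometheus.SummaryOpts{
	Name:       "example_peer_queue_length",
	Objectives: map[float64]float64{0.99: 0.001},
}, []string{"peerAddr"})

func enqueue(queue chan *request, peerAddr string, r *request) {
	// len() on a buffered channel reports how many items are currently waiting.
	queueLength.WithLabelValues(peerAddr).Observe(float64(len(queue)))
	queue <- r
}

func main() {
	prometheus.MustRegister(queueLength)
	q := make(chan *request, 100)
	enqueue(q, "10.0.0.1:81", &request{key: "a"})
	fmt.Println(len(q)) // 1
}
```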
