From eabc5e6921f461a373c330def642ddbac68846c4 Mon Sep 17 00:00:00 2001
From: Shawn Poulson <92753637+Baliedge@users.noreply.github.com>
Date: Mon, 5 Jun 2023 11:53:09 -0400
Subject: [PATCH] Reduce tracing detail (#177)

* PIP-2499: Update holster.

* PIP-2499: Simplify tracing.

* PIP-2499: Remove unnecessary `AddEvent()` calls.

* PIP-2499: Reduce detail.

---------

Co-authored-by: Shawn Poulson
---
 algorithms.go  | 25 -------------------------
 go.mod         |  2 +-
 go.sum         |  4 ++--
 gubernator.go  | 13 +++++--------
 peer_client.go | 11 ++---------
 5 files changed, 10 insertions(+), 45 deletions(-)

diff --git a/algorithms.go b/algorithms.go
index c1dadf60..43fe4fdd 100644
--- a/algorithms.go
+++ b/algorithms.go
@@ -41,15 +41,12 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
     // Get rate limit from cache.
     hashKey := r.HashKey()
     item, ok := c.GetItem(hashKey)
-    span.AddEvent("c.GetItem()")
 
     if s != nil && !ok {
         // Cache miss.
         // Check our store for the item.
         if item, ok = s.Get(ctx, r); ok {
-            span.AddEvent("Check store for rate limit")
             c.Add(item)
-            span.AddEvent("c.Add()")
         }
     }
 
@@ -78,15 +75,11 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
     if ok {
         // Item found in cache or store.
-        span.AddEvent("Update existing rate limit")
-
         if HasBehavior(r.Behavior, Behavior_RESET_REMAINING) {
             c.Remove(hashKey)
-            span.AddEvent("c.Remove()")
 
             if s != nil {
                 s.Remove(ctx, hashKey)
-                span.AddEvent("s.Remove()")
             }
 
             return &RateLimitResp{
                 Status: Status_UNDER_LIMIT,
@@ -107,18 +100,15 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
             span.AddEvent("Client switched algorithms; perhaps due to a migration?")
 
             c.Remove(hashKey)
-            span.AddEvent("c.Remove()")
 
             if s != nil {
                 s.Remove(ctx, hashKey)
-                span.AddEvent("s.Remove()")
             }
 
             return tokenBucketNewItem(ctx, s, c, r)
         }
 
         // Update the limit if it changed.
-        span.AddEvent("Update the limit if changed")
         if t.Limit != r.Limit {
             // Add difference to remaining.
             t.Remaining += r.Limit - t.Limit
@@ -164,14 +154,12 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
         if s != nil {
             defer func() {
                 s.OnChange(ctx, r, item)
-                span.AddEvent("defer s.OnChange()")
             }()
         }
 
         // Client is only interested in retrieving the current status or
         // updating the rate limit config.
         if r.Hits == 0 {
-            span.AddEvent("Return current status, apply no change")
             return rl, nil
         }
 
@@ -230,7 +218,6 @@ func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq)
     }
 
     // Add a new rate limit to the cache.
-    span.AddEvent("Add a new rate limit to the cache")
     if HasBehavior(r.Behavior, Behavior_DURATION_IS_GREGORIAN) {
         expire, err = GregorianExpiration(clock.Now(), r.Duration)
         if err != nil {
@@ -262,11 +249,9 @@ func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq)
     }
 
     c.Add(item)
-    span.AddEvent("c.Add()")
 
     if s != nil {
         s.OnChange(ctx, r, item)
-        span.AddEvent("s.OnChange()")
     }
 
     return rl, nil
diff --git a/go.mod b/go.mod
index 703011af..9f770346 100644
--- a/go.mod
+++ b/go.mod
@@ -7,7 +7,7 @@ require (
     github.com/davecgh/go-spew v1.1.1
     github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3
     github.com/hashicorp/memberlist v0.5.0
-    github.com/mailgun/holster/v4 v4.12.0
+    github.com/mailgun/holster/v4 v4.12.4
     github.com/miekg/dns v1.1.50
     github.com/pkg/errors v0.9.1
     github.com/prometheus/client_golang v1.13.0
diff --git a/go.sum b/go.sum
index 863e0567..33227ad8 100644
--- a/go.sum
+++ b/go.sum
@@ -296,8 +296,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/mailgun/holster/v4 v4.12.0 h1:SsorK7fkIdkyW7pE1Vg4aCXWiwZEz15cf8FT31IPlZU=
-github.com/mailgun/holster/v4 v4.12.0/go.mod h1:s2H6HX3iMCub1K1Y7qjf0RgpzS2+mRxyj+i3vlb4FK0=
+github.com/mailgun/holster/v4 v4.12.4 h1:U9wHgacx+s1/mPhl9ELMhdnMofg2HZHBmeMWnur3HJ0=
+github.com/mailgun/holster/v4 v4.12.4/go.mod h1:s2H6HX3iMCub1K1Y7qjf0RgpzS2+mRxyj+i3vlb4FK0=
 github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
 github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
 github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU=
diff --git a/gubernator.go b/gubernator.go
index d9e2771e..abbe3eac 100644
--- a/gubernator.go
+++ b/gubernator.go
@@ -193,6 +193,9 @@ func (s *V1Instance) Close() (reterr error) {
 // peer that does.
 func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (retval *GetRateLimitsResp, reterr error) {
     span := trace.SpanFromContext(ctx)
+    span.SetAttributes(
+        attribute.Int("num.items", len(r.Requests)),
+    )
 
     funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.GetRateLimits"))
     defer funcTimer.ObserveDuration()
@@ -217,7 +220,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (re
 
     // For each item in the request body
     for i, req := range r.Requests {
-        _ = tracing.NamedScope(ctx, "Iterate requests", func(ctx context.Context) error {
+        _ = tracing.NamedScopeDebug(ctx, "Iterate requests", func(ctx context.Context) error {
             key := req.Name + "_" + req.UniqueKey
             var peer *PeerClient
             var err error
@@ -325,7 +328,7 @@ func (s *V1Instance) asyncRequests(ctx context.Context, req *AsyncReq) {
     var attempts int
     var err error
 
-    ctx = tracing.StartScope(ctx)
+    ctx = tracing.StartScopeDebug(ctx)
     defer tracing.EndScope(ctx, nil)
     span := trace.SpanFromContext(ctx)
     span.SetAttributes(
@@ -548,7 +551,6 @@ func (s *V1Instance) HealthCheck(ctx context.Context, r *HealthCheckReq) (retval
 
     s.peerMutex.RLock()
     defer s.peerMutex.RUnlock()
-    span.AddEvent("peerMutex.RLock()")
 
     // Iterate through local peers and get their last errors
     localPeers := s.conf.LocalPicker.Peers()
@@ -615,12 +617,10 @@ func (s *V1Instance) getRateLimit(ctx context.Context, r *RateLimitReq) (retval
 
     if HasBehavior(r.Behavior, Behavior_GLOBAL) {
         s.global.QueueUpdate(r)
-        span.AddEvent("s.global.QueueUpdate(r)")
     }
 
     if HasBehavior(r.Behavior, Behavior_MULTI_REGION) {
         s.mutliRegion.QueueHits(r)
-        span.AddEvent("s.mutliRegion.QueueHits(r)")
     }
 
     resp, err := s.gubernatorPool.GetRateLimit(ctx, r)
@@ -719,15 +719,12 @@ func (s *V1Instance) SetPeers(peerInfo []PeerInfo) {
 
 // GetPeer returns a peer client for the hash key provided
 func (s *V1Instance) GetPeer(ctx context.Context, key string) (retval *PeerClient, reterr error) {
-    span := trace.SpanFromContext(ctx)
-
     funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.GetPeer"))
     defer funcTimer.ObserveDuration()
     lockTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.GetPeer_RLock"))
 
     s.peerMutex.RLock()
     defer s.peerMutex.RUnlock()
-    span.AddEvent("peerMutex.RLock()")
     lockTimer.ObserveDuration()
 
     peer, err := s.conf.LocalPicker.Get(key)
diff --git a/peer_client.go b/peer_client.go
index 4768a076..ba658cc5 100644
--- a/peer_client.go
+++ b/peer_client.go
@@ -98,7 +98,6 @@ func (c *PeerClient) connect(ctx context.Context) (reterr error) {
     defer func() {
         tracing.EndScope(ctx, reterr)
     }()
-    span := trace.SpanFromContext(ctx)
 
     // NOTE: To future self, this mutex is used here because we need to know if the peer is disconnecting and
     // handle ErrClosing. Since this mutex MUST be here we take this opportunity to also see if we are connected.
@@ -112,7 +111,6 @@ func (c *PeerClient) connect(ctx context.Context) (reterr error) {
 
     c.mutex.RLock()
     lockTimer.ObserveDuration()
-    span.AddEvent("mutex.RLock()")
 
     if c.status == peerClosing {
         c.mutex.RUnlock()
@@ -128,7 +126,6 @@ func (c *PeerClient) connect(ctx context.Context) (reterr error) {
         c.mutex.RUnlock()
         c.mutex.Lock()
         defer c.mutex.Unlock()
-        span.AddEvent("mutex.Lock()")
 
         // Now that we have the RW lock, ensure no else got here ahead of us.
         if c.status == peerConnected {
@@ -169,7 +166,7 @@ func (c *PeerClient) Info() PeerInfo {
 // GetPeerRateLimit forwards a rate limit request to a peer. If the rate limit has `behavior == BATCHING` configured
 // this method will attempt to batch the rate limits
 func (c *PeerClient) GetPeerRateLimit(ctx context.Context, r *RateLimitReq) (retval *RateLimitResp, reterr error) {
-    ctx = tracing.StartScope(ctx)
+    ctx = tracing.StartScopeDebug(ctx)
     defer func() {
         tracing.EndScope(ctx, reterr)
     }()
@@ -222,7 +219,6 @@ func (c *PeerClient) GetPeerRateLimits(ctx context.Context, r *GetPeerRateLimits
     // a race condition if called within a separate go routine if the internal wg is `0`
     // when Wait() is called then Add(1) is called concurrently.
     c.mutex.RLock()
-    span.AddEvent("mutex.RLock()")
     c.wg.Add(1)
     defer func() {
         c.mutex.RUnlock()
@@ -251,7 +247,6 @@ func (c *PeerClient) UpdatePeerGlobals(ctx context.Context, r *UpdatePeerGlobals
     defer func() {
         tracing.EndScope(ctx, reterr)
     }()
-    span := trace.SpanFromContext(ctx)
 
     if err := c.connect(ctx); err != nil {
         return nil, c.setLastErr(err)
@@ -259,7 +254,6 @@
 
     // See NOTE above about RLock and wg.Add(1)
     c.mutex.RLock()
-    span.AddEvent("mutex.RLock()")
     c.wg.Add(1)
     defer func() {
         c.mutex.RUnlock()
@@ -328,7 +322,6 @@ func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq
 
     // See NOTE above about RLock and wg.Add(1)
     c.mutex.RLock()
-    span.AddEvent("mutex.RLock()")
     if c.status == peerClosing {
         err := &PeerErr{err: errors.New("already disconnecting")}
         return nil, c.setLastErr(err)
@@ -396,7 +389,7 @@ func (c *PeerClient) run() {
                 return
             }
 
-            _ = tracing.CallNamedScope(r.ctx, "Send batch", func(reqCtx context.Context) error {
+            _ = tracing.CallNamedScopeDebug(r.ctx, "Send batch", func(reqCtx context.Context) error {
                 span := trace.SpanFromContext(reqCtx)
                 span.SetAttributes(
                     attribute.String("peer.grpcAddress", c.conf.Info.GRPCAddress),