diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..66a2fbd0
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+gubernator
+gubernator-cli
diff --git a/.gitignore b/.gitignore
index 88f825f3..5f0d8ea4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ gubernator.egg-info/
 .DS_Store
 *.iml
 googleapis/
+coverage.out
+coverage.html
diff --git a/CHANGELOG b/CHANGELOG
index 68fbd4f7..a68d9e31 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,17 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [2.0.0-rc.13] - 2022-01-19
+## Changes
+* Added OpenTracing support in the gRPC service and various critical functions.
+* Added many more useful Prometheus metrics.
+* Refactored rate check calls to use a hash ring for parallel processing,
+instead of locking a shared mutex to process requests sequentially.
+* Rate checks now respect the client's context deadline and will abort processing
+immediately if canceled.
+* Fixed a stack overflow panic in token bucket rate limit checking.
+* Fixed leaky bucket rate limits expiring prematurely.
+
 ## [2.0.0-rc.12] - 2021-12-28
 ## Changes
 * Include s.conf.Behaviors in Config for NewV1Instance
diff --git a/CODEOWNERS b/CODEOWNERS
index 8b4706f2..19af0f81 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -3,3 +3,4 @@
 # These owners will be the default owners for everything in the repo.
 
 * @thrawn01
+* @Baliedge
diff --git a/Dockerfile-cli b/Dockerfile-cli
new file mode 100644
index 00000000..289f9000
--- /dev/null
+++ b/Dockerfile-cli
@@ -0,0 +1,30 @@
+# Build image
+FROM golang:1.17 as build
+
+WORKDIR /go/src
+
+# This should create a cached layer of our dependencies for subsequent builds to use
+COPY go.mod /go/src
+COPY go.sum /go/src
+RUN go mod download
+
+# Copy the local package files to the container
+ADD . /go/src
+
+ARG VERSION
+
+# Build the server inside the container
+RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -installsuffix cgo \
+    -ldflags "-w -s -X main.Version=$VERSION" -o /gubernator-cli /go/src/cmd/gubernator-cli/main.go
+
+# Create our deploy image
+FROM scratch
+
+# Certs for SSL
+COPY --from=build /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
+
+# Copy our static executable.
+COPY --from=build /gubernator-cli /gubernator-cli
+
+# Run the server
+ENTRYPOINT ["/gubernator-cli"]
diff --git a/Makefile b/Makefile
index 356a7649..249a5a86 100644
--- a/Makefile
+++ b/Makefile
@@ -1,24 +1,32 @@
-.PHONY: release docker proto certs
 .DEFAULT_GOAL := release
-
 VERSION=$(shell cat version)
-
 LDFLAGS="-X main.Version=$(VERSION)"
 
+.PHONY: test
 test:
-	go test ./... -v -race -p=1 -count=1
+	(go test -v -race -p=1 -count=1 -coverprofile coverage.out ./...; ret=$$?; \
+	go tool cover -func coverage.out; \
+	go tool cover -html coverage.out -o coverage.html; \
+	exit $$ret)
 
+.PHONY: docker
 docker:
 	docker build --build-arg VERSION=$(VERSION) -t ghcr.io/mailgun/gubernator:$(VERSION) .
 	docker tag ghcr.io/mailgun/gubernator:$(VERSION) ghcr.io/mailgun/gubernator:latest
 
+.PHONY: release
 release:
-	GOOS=darwin GOARCH=amd64 go build -ldflags $(LDFLAGS) -o gubernator.darwin ./cmd/gubernator/main.go
-	GOOS=linux GOARCH=amd64 go build -ldflags $(LDFLAGS) -o gubernator.linux ./cmd/gubernator/main.go
+	go build -v -ldflags $(LDFLAGS) -o gubernator ./cmd/gubernator/main.go
+
+.PHONY: clean
+clean:
+	rm -f gubernator gubernator-cli
 
+.PHONY: proto
 proto:
 	scripts/proto.sh
 
+.PHONY: certs
 certs:
 	rm certs/*.key certs/*.srl certs/*.csr certs/*.pem
 	openssl genrsa -out certs/ca.key 4096
@@ -32,4 +40,3 @@ certs:
 	openssl req -sha1 -key certs/client-auth.key -new -out certs/client-auth.req -subj "/C=US/ST=TX/O=Mailgun Technologies, Inc./CN=client.com/emailAddress=admin@mailgun.com"
 	openssl x509 -req -days 3650 -in certs/client-auth.req -CA certs/client-auth-ca.pem -CAkey certs/client-auth-ca.key -passin pass:test -out certs/client-auth.pem
 	openssl x509 -extfile certs/client-auth.conf -extensions ssl_client -req -days 3650 -in certs/client-auth.req -CA certs/client-auth-ca.pem -CAkey certs/client-auth-ca.key -passin pass:test -out certs/client-auth.pem
-
diff --git a/README.md b/README.md
index 7b3992ea..67e605ee 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,12 @@ Examples when using `Behavior = DURATION_IS_GREGORIAN`
 * If `Duration = 0` (Minutes) then the rate limit will reset to `Current = 0` at the end of the minute the rate limit was created.
 * If `Duration = 4` (Months) then the rate limit will reset to `Current = 0` at the end of the month the rate limit was created.
 
+## Reset Remaining Behavior
+Users may add the behavior `Behavior_RESET_REMAINING` to a rate check request.
+This resets the rate limit as if it had just been created.
+
+When using Reset Remaining, the `Hits` field should be 0.
+
 ## Gubernator as a library
 If you are using golang, you can use Gubernator as a library. This is useful if
 you wish to implement a rate limit service with your own company specific model
@@ -324,3 +330,10 @@ See the `example.conf` for all available config options and their descriptions.
 
 See [architecture.md](/architecture.md) for a full description of the architecture
 and the inner workings of gubernator.
+
+## Monitoring
+Gubernator publishes Prometheus metrics for real-time monitoring. See
+[prometheus.md](prometheus.md) for details.
+
+## Jaeger Tracing
+Gubernator supports distributed tracing using Jaeger. See
+[jaegertracing.md](jaegertracing.md) for details.
diff --git a/algorithms.go b/algorithms.go
index 7ea11d95..aca6458e 100644
--- a/algorithms.go
+++ b/algorithms.go
@@ -17,42 +17,70 @@ limitations under the License.
 package gubernator
 
 import (
+	"context"
+
+	"github.com/mailgun/gubernator/v2/tracing"
 	"github.com/mailgun/holster/v4/clock"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/sirupsen/logrus"
 )
 
 // Implements token bucket algorithm for rate limiting. https://en.wikipedia.org/wiki/Token_bucket
-func tokenBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) {
+func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) {
+	span, ctx := tracing.StartSpan(ctx)
+	defer span.Finish()
+
+	tokenBucketTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("tokenBucket"))
+	defer tokenBucketTimer.ObserveDuration()
+
 	// Get rate limit from cache.
 	hashKey := r.HashKey()
 	item, ok := c.GetItem(hashKey)
+	tracing.LogInfo(span, "c.GetItem()")
 
 	if s != nil && !ok {
 		// Cache miss.
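For illustration, a minimal sketch of the Reset Remaining behavior documented in the README hunk above, written against the client API used elsewhere in this patch (`DialV1Server` returns a `V1Client`). The name, key, limit, and duration values are assumptions, not taken from the patch:

```go
package main

import (
	"context"
	"time"

	guber "github.com/mailgun/gubernator/v2"
)

// resetLimit shows Behavior_RESET_REMAINING as documented above: the bucket
// is reset as if newly created, and Hits stays 0 so nothing is consumed.
// All field values here are illustrative.
func resetLimit(ctx context.Context, client guber.V1Client) error {
	ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond)
	defer cancel()

	_, err := client.GetRateLimits(ctx, &guber.GetRateLimitsReq{
		Requests: []*guber.RateLimitReq{{
			Name:      "requests_per_minute",
			UniqueKey: "account:12345",
			Algorithm: guber.Algorithm_TOKEN_BUCKET,
			Behavior:  guber.Behavior_RESET_REMAINING,
			Limit:     100,
			Duration:  60000, // milliseconds
			Hits:      0,     // per the README: Hits should be 0 with Reset Remaining
		}},
	})
	return err
}
```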
// Check our store for the item. - if item, ok = s.Get(r); ok { + if item, ok = s.Get(ctx, r); ok { + tracing.LogInfo(span, "Check store for rate limit") c.Add(item) + tracing.LogInfo(span, "c.Add()") } } // Sanity checks. if ok { if item.Value == nil { - logrus.Error("tokenBucket: Invalid cache item; Value is nil") + msgPart := "tokenBucket: Invalid cache item; Value is nil" + tracing.LogInfo(span, msgPart, + "hashKey", hashKey, + "key", r.UniqueKey, + "name", r.Name, + ) + logrus.Error(msgPart) ok = false } else if item.Key != hashKey { - logrus.Error("tokenBucket: Invalid cache item; key mismatch") + msgPart := "tokenBucket: Invalid cache item; key mismatch" + tracing.LogInfo(span, msgPart, + "itemKey", item.Key, + "hashKey", hashKey, + "name", r.Name, + ) + logrus.Error(msgPart) ok = false } } if ok { // Item found in cache or store. + tracing.LogInfo(span, "Update existing rate limit") + if HasBehavior(r.Behavior, Behavior_RESET_REMAINING) { c.Remove(hashKey) + tracing.LogInfo(span, "c.Remove()") if s != nil { - s.Remove(hashKey) + s.Remove(ctx, hashKey) + tracing.LogInfo(span, "s.Remove()") } return &RateLimitResp{ Status: Status_UNDER_LIMIT, @@ -70,22 +98,21 @@ func tokenBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er t, ok := item.Value.(*TokenBucketItem) if !ok { // Client switched algorithms; perhaps due to a migration? + tracing.LogInfo(span, "Client switched algorithms; perhaps due to a migration?") + c.Remove(hashKey) + tracing.LogInfo(span, "c.Remove()") if s != nil { - s.Remove(hashKey) + s.Remove(ctx, hashKey) + tracing.LogInfo(span, "s.Remove()") } - return tokenBucketNewItem(s, c, r) - } - - if s != nil { - defer func() { - s.OnChange(r, item) - }() + return tokenBucketNewItem(ctx, s, c, r) } // Update the limit if it changed. + tracing.LogInfo(span, "Update the limit if changed") if t.Limit != r.Limit { // Add difference to remaining. t.Remaining += r.Limit - t.Limit @@ -104,6 +131,7 @@ func tokenBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er // If the duration config changed, update the new ExpireAt. if t.Duration != r.Duration { + tracing.LogInfo(span, "Duration changed") expire := t.CreatedAt + r.Duration if HasBehavior(r.Behavior, Behavior_DURATION_IS_GREGORIAN) { expire, err = GregorianExpiration(clock.Now(), r.Duration) @@ -116,6 +144,7 @@ func tokenBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er now := MillisecondNow() if expire <= now { // Renew item. + tracing.LogInfo(span, "Limit has expired") expire = now + r.Duration t.CreatedAt = now t.Remaining = t.Limit @@ -126,14 +155,24 @@ func tokenBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er rl.ResetTime = expire } + if s != nil { + defer func() { + s.OnChange(ctx, r, item) + tracing.LogInfo(span, "defer s.OnChange()") + }() + } + // Client is only interested in retrieving the current status or // updating the rate limit config. if r.Hits == 0 { + tracing.LogInfo(span, "Return current status, apply no change") return rl, nil } // If we are already at the limit. if rl.Remaining == 0 { + tracing.LogInfo(span, "Already over the limit") + overLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT t.Status = rl.Status return rl, nil @@ -141,6 +180,7 @@ func tokenBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er // If requested hits takes the remainder. 
if t.Remaining == r.Hits { + tracing.LogInfo(span, "At the limit") t.Remaining = 0 rl.Remaining = 0 return rl, nil @@ -149,21 +189,27 @@ func tokenBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er // If requested is more than available, then return over the limit // without updating the cache. if r.Hits > t.Remaining { + tracing.LogInfo(span, "Over the limit") + overLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT return rl, nil } + tracing.LogInfo(span, "Under the limit") t.Remaining -= r.Hits rl.Remaining = t.Remaining return rl, nil } // Item is not found in cache or store, create new. - return tokenBucketNewItem(s, c, r) + return tokenBucketNewItem(ctx, s, c, r) } // Called by tokenBucket() when adding a new item in the store. -func tokenBucketNewItem(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { +func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + now := MillisecondNow() expire := now + r.Duration @@ -181,6 +227,7 @@ func tokenBucketNewItem(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, } // Add a new rate limit to the cache. + tracing.LogInfo(span, "Add a new rate limit to the cache") if HasBehavior(r.Behavior, Behavior_DURATION_IS_GREGORIAN) { expire, err = GregorianExpiration(clock.Now(), r.Duration) if err != nil { @@ -197,22 +244,31 @@ func tokenBucketNewItem(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, // Client could be requesting that we always return OVER_LIMIT. if r.Hits > r.Limit { + tracing.LogInfo(span, "Over the limit") + overLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT rl.Remaining = r.Limit t.Remaining = r.Limit } c.Add(item) + tracing.LogInfo(span, "c.Add()") if s != nil { - s.OnChange(r, item) + s.OnChange(ctx, r, item) + tracing.LogInfo(span, "s.OnChange()") } return rl, nil } // Implements leaky bucket algorithm for rate limiting https://en.wikipedia.org/wiki/Leaky_bucket -func leakyBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { +func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + leakyBucketTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.getRateLimit_leakyBucket")) + defer leakyBucketTimer.ObserveDuration() + if r.Burst == 0 { r.Burst = r.Limit } @@ -222,38 +278,57 @@ func leakyBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er // Get rate limit from cache. hashKey := r.HashKey() item, ok := c.GetItem(hashKey) + tracing.LogInfo(span, "c.GetItem()") if s != nil && !ok { // Cache miss. // Check our store for the item. - if item, ok = s.Get(r); ok { + if item, ok = s.Get(ctx, r); ok { + tracing.LogInfo(span, "Check store for rate limit") c.Add(item) + tracing.LogInfo(span, "c.Add()") } } // Sanity checks. 
if ok { if item.Value == nil { - logrus.Error("leakyBucket: Invalid cache item; Value is nil") + msgPart := "leakyBucket: Invalid cache item; Value is nil" + tracing.LogInfo(span, msgPart, + "hashKey", hashKey, + "key", r.UniqueKey, + "name", r.Name, + ) + logrus.Error(msgPart) ok = false } else if item.Key != hashKey { - logrus.Error("leakyBucket: Invalid cache item; key mismatch") + msgPart := "leakyBucket: Invalid cache item; key mismatch" + tracing.LogInfo(span, msgPart, + "itemKey", item.Key, + "hashKey", hashKey, + "name", r.Name, + ) + logrus.Error(msgPart) ok = false } } if ok { // Item found in cache or store. + tracing.LogInfo(span, "Update existing rate limit") + b, ok := item.Value.(*LeakyBucketItem) if !ok { // Client switched algorithms; perhaps due to a migration? c.Remove(hashKey) + tracing.LogInfo(span, "c.Remove()") if s != nil { - s.Remove(hashKey) + s.Remove(ctx, hashKey) + tracing.LogInfo(span, "s.Remove()") } - return leakyBucketNewItem(s, c, r) + return leakyBucketNewItem(ctx, s, c, r) } if HasBehavior(r.Behavior, Behavior_RESET_REMAINING) { @@ -320,12 +395,14 @@ func leakyBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er if s != nil { defer func() { - s.OnChange(r, item) + s.OnChange(ctx, r, item) + tracing.LogInfo(span, "s.OnChange()") }() } // If we are already at the limit if int64(b.Remaining) == 0 { + overLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT return rl, nil } @@ -341,6 +418,7 @@ func leakyBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er // If requested is more than available, then return over the limit // without updating the bucket. if r.Hits > int64(b.Remaining) { + overLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT return rl, nil } @@ -356,11 +434,14 @@ func leakyBucket(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err er return rl, nil } - return leakyBucketNewItem(s, c, r) + return leakyBucketNewItem(ctx, s, c, r) } // Called by leakyBucket() when adding a new item in the store. 
-func leakyBucketNewItem(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { +func leakyBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + now := MillisecondNow() duration := r.Duration rate := float64(duration) / float64(r.Limit) @@ -393,6 +474,7 @@ func leakyBucketNewItem(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, // Client could be requesting that we start with the bucket OVER_LIMIT if r.Hits > r.Burst { + overLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT rl.Remaining = 0 rl.ResetTime = now + (rl.Limit-rl.Remaining)*int64(rate) @@ -407,9 +489,11 @@ func leakyBucketNewItem(s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, } c.Add(item) + tracing.LogInfo(span, "c.Add()") if s != nil { - s.OnChange(r, item) + s.OnChange(ctx, r, item) + tracing.LogInfo(span, "s.OnChange()") } return &rl, nil diff --git a/benchmark_cache_test.go b/benchmark_cache_test.go new file mode 100644 index 00000000..7e66aef9 --- /dev/null +++ b/benchmark_cache_test.go @@ -0,0 +1,160 @@ +package gubernator_test + +import ( + "strconv" + "sync" + "testing" + "time" + + "github.com/mailgun/gubernator/v2" + "github.com/mailgun/holster/v4/clock" +) + +func BenchmarkCache(b *testing.B) { + testCases := []struct { + Name string + NewTestCache func() gubernator.Cache + LockRequired bool + }{ + { + Name: "LRUCache", + NewTestCache: func() gubernator.Cache { + return gubernator.NewLRUCache(0) + }, + LockRequired: true, + }, + } + + for _, testCase := range testCases { + b.Run(testCase.Name, func(b *testing.B) { + b.Run("Sequential reads", func(b *testing.B) { + cache := testCase.NewTestCache() + expire := clock.Now().Add(time.Hour).UnixMilli() + + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expire, + } + cache.Add(item) + } + + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + _, _ = cache.GetItem(key) + } + }) + + b.Run("Sequential writes", func(b *testing.B) { + cache := testCase.NewTestCache() + expire := clock.Now().Add(time.Hour).UnixMilli() + + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + item := &gubernator.CacheItem{ + Key: strconv.Itoa(i), + Value: i, + ExpireAt: expire, + } + cache.Add(item) + } + }) + + b.Run("Concurrent reads", func(b *testing.B) { + cache := testCase.NewTestCache() + expire := clock.Now().Add(time.Hour).UnixMilli() + + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expire, + } + cache.Add(item) + } + + var wg sync.WaitGroup + var mutex sync.Mutex + var task func(i int) + + if testCase.LockRequired { + task = func(i int) { + mutex.Lock() + defer mutex.Unlock() + key := strconv.Itoa(i) + _, _ = cache.GetItem(key) + wg.Done() + } + } else { + task = func(i int) { + key := strconv.Itoa(i) + _, _ = cache.GetItem(key) + wg.Done() + } + } + + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + wg.Add(1) + go task(i) + } + + wg.Wait() + }) + + b.Run("Concurrent writes", func(b *testing.B) { + cache := testCase.NewTestCache() + expire := clock.Now().Add(time.Hour).UnixMilli() + + var wg sync.WaitGroup + var mutex sync.Mutex + var task func(i int) + + if testCase.LockRequired { + task = func(i int) { + mutex.Lock() + defer mutex.Unlock() + item := &gubernator.CacheItem{ + Key: strconv.Itoa(i), + Value: i, + ExpireAt: expire, 
+ } + cache.Add(item) + wg.Done() + } + } else { + task = func(i int) { + item := &gubernator.CacheItem{ + Key: strconv.Itoa(i), + Value: i, + ExpireAt: expire, + } + cache.Add(item) + wg.Done() + } + } + + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + wg.Add(1) + go task(i) + } + + wg.Wait() + }) + + }) + } +} diff --git a/benchmark_test.go b/benchmark_test.go index 5d151a5a..56d0fe57 100644 --- a/benchmark_test.go +++ b/benchmark_test.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,20 +23,23 @@ import ( guber "github.com/mailgun/gubernator/v2" "github.com/mailgun/gubernator/v2/cluster" "github.com/mailgun/holster/v4/syncutil" + "github.com/stretchr/testify/require" ) -func BenchmarkServer_GetPeerRateLimitNoBatching(b *testing.B) { +func BenchmarkServer(b *testing.B) { + ctx := context.Background() conf := guber.Config{} - if err := conf.SetDefaults(); err != nil { - b.Errorf("SetDefaults err: %s", err) - } + err := conf.SetDefaults() + require.NoError(b, err, "Error in conf.SetDefaults") - client := guber.NewPeerClient(guber.PeerConfig{ - Info: cluster.GetRandomPeer(cluster.DataCenterNone), - Behavior: conf.Behaviors, - }) + b.Run("GetPeerRateLimit() with no batching", func(b *testing.B) { + client := guber.NewPeerClient(guber.PeerConfig{ + Info: cluster.GetRandomPeer(cluster.DataCenterNone), + Behavior: conf.Behaviors, + }) + + b.ResetTimer() - b.Run("GetPeerRateLimitNoBatching", func(b *testing.B) { for n := 0; n < b.N; n++ { _, err := client.GetPeerRateLimit(context.Background(), &guber.RateLimitReq{ Name: "get_peer_rate_limits_benchmark", @@ -47,21 +50,19 @@ func BenchmarkServer_GetPeerRateLimitNoBatching(b *testing.B) { Hits: 1, }) if err != nil { - b.Errorf("client.RateLimit() err: %s", err) + b.Errorf("Error in client.GetPeerRateLimit: %s", err) } } }) -} -func BenchmarkServer_GetRateLimit(b *testing.B) { - client, err := guber.DialV1Server(cluster.GetRandomPeer(cluster.DataCenterNone).GRPCAddress, nil) - if err != nil { - b.Errorf("NewV1Client err: %s", err) - } + b.Run("GetRateLimit()", func(b *testing.B) { + client, err := guber.DialV1Server(cluster.GetRandomPeer(cluster.DataCenterNone).GRPCAddress, nil) + require.NoError(b, err, "Error in guber.DialV1Server") + + b.ResetTimer() - b.Run("GetRateLimit", func(b *testing.B) { for n := 0; n < b.N; n++ { - _, err := client.GetRateLimits(context.Background(), &guber.GetRateLimitsReq{ + _, err := client.GetRateLimits(ctx, &guber.GetRateLimitsReq{ Requests: []*guber.RateLimitReq{ { Name: "get_rate_limit_benchmark", @@ -73,19 +74,17 @@ func BenchmarkServer_GetRateLimit(b *testing.B) { }, }) if err != nil { - b.Errorf("client.RateLimit() err: %s", err) + b.Errorf("Error in client.GetRateLimits(): %s", err) } } }) -} -func BenchmarkServer_GetRateLimitGlobal(b *testing.B) { - client, err := guber.DialV1Server(cluster.GetRandomPeer(cluster.DataCenterNone).GRPCAddress, nil) - if err != nil { - b.Errorf("NewV1Client err: %s", err) - } + b.Run("GetRateLimitGlobal()", func(b *testing.B) { + client, err := guber.DialV1Server(cluster.GetRandomPeer(cluster.DataCenterNone).GRPCAddress, nil) + require.NoError(b, err, "Error in guber.DialV1Server") + + b.ResetTimer() - b.Run("GetRateLimitGlobal", func(b *testing.B) { for n := 0; n < b.N; n++ { _, err := client.GetRateLimits(context.Background(), &guber.GetRateLimitsReq{ Requests: 
[]*guber.RateLimitReq{ @@ -100,48 +99,32 @@ func BenchmarkServer_GetRateLimitGlobal(b *testing.B) { }, }) if err != nil { - b.Errorf("client.RateLimit() err: %s", err) + b.Errorf("Error in client.GetRateLimits: %s", err) } } }) -} -func BenchmarkServer_Ping(b *testing.B) { - client, err := guber.DialV1Server(cluster.GetRandomPeer(cluster.DataCenterNone).GRPCAddress, nil) - if err != nil { - b.Errorf("NewV1Client err: %s", err) - } + b.Run("HealthCheck", func(b *testing.B) { + client, err := guber.DialV1Server(cluster.GetRandomPeer(cluster.DataCenterNone).GRPCAddress, nil) + require.NoError(b, err, "Error in guber.DialV1Server") - //dur := time.Nanosecond * 117728 - //total := time.Second / dur - //fmt.Printf("Total: %d\n", total) + b.ResetTimer() - b.Run("HealthCheck", func(b *testing.B) { for n := 0; n < b.N; n++ { if _, err := client.HealthCheck(context.Background(), &guber.HealthCheckReq{}); err != nil { - b.Errorf("client.HealthCheck() err: %s", err) + b.Errorf("Error in client.HealthCheck: %s", err) } } }) -} -/*func BenchmarkServer_GRPCGateway(b *testing.B) { - for n := 0; n < b.N; n++ { - _, err := http.Get("http://" + cluster.GetHTTPAddress() + "/v1/HealthCheck") - if err != nil { - b.Errorf("GRPCGateway() err: %s", err) - } - } -}*/ + b.Run("Thundering herd", func(b *testing.B) { + client, err := guber.DialV1Server(cluster.GetRandomPeer(cluster.DataCenterNone).GRPCAddress, nil) + require.NoError(b, err, "Error in guber.DialV1Server") -func BenchmarkServer_ThunderingHerd(b *testing.B) { - client, err := guber.DialV1Server(cluster.GetRandomPeer(cluster.DataCenterNone).GRPCAddress, nil) - if err != nil { - b.Errorf("NewV1Client err: %s", err) - } + b.ResetTimer() - b.Run("ThunderingHerd", func(b *testing.B) { fan := syncutil.NewFanOut(100) + for n := 0; n < b.N; n++ { fan.Run(func(o interface{}) error { _, err := client.GetRateLimits(context.Background(), &guber.GetRateLimitsReq{ @@ -156,7 +139,7 @@ func BenchmarkServer_ThunderingHerd(b *testing.B) { }, }) if err != nil { - b.Errorf("client.RateLimit() err: %s", err) + b.Errorf("Error in client.GetRateLimits: %s", err) } return nil }, nil) diff --git a/cache.go b/cache.go index f61951ca..04ef11a0 100644 --- a/cache.go +++ b/cache.go @@ -12,53 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
- -This work is derived from github.com/golang/groupcache/lru */ package gubernator -import ( - "container/list" - "sync" - - "github.com/mailgun/holster/v4/clock" - "github.com/mailgun/holster/v4/setter" - "github.com/prometheus/client_golang/prometheus" -) - // So algorithms can interface with different cache implementations type Cache interface { - // Access methods - Add(*CacheItem) bool - UpdateExpiration(key interface{}, expireAt int64) bool - GetItem(key interface{}) (value *CacheItem, ok bool) + Add(item *CacheItem) bool + UpdateExpiration(key string, expireAt int64) bool + GetItem(key string) (value *CacheItem, ok bool) Each() chan *CacheItem - Remove(key interface{}) - - // If the cache is exclusive, this will control access to the cache - Unlock() - Lock() -} - -// Holds stats collected about the cache -type cachStats struct { - Size int64 - Miss int64 - Hit int64 -} - -// Cache is an thread unsafe LRU cache that supports expiration -type LRUCache struct { - cache map[interface{}]*list.Element - mutex sync.Mutex - ll *list.List - stats cachStats - cacheSize int - - // Stats - sizeMetric *prometheus.Desc - accessMetric *prometheus.Desc + Remove(key string) + Size() int64 + Close() error } type CacheItem struct { @@ -66,7 +32,7 @@ type CacheItem struct { Key string Value interface{} - // Timestamp when rate limit expires + // Timestamp when rate limit expires in epoch milliseconds. ExpireAt int64 // Timestamp when the cache should invalidate this rate limit. This is useful when used in conjunction with // a persistent store to ensure our node has the most up to date info from the store. Ignored if set to `0` @@ -74,145 +40,3 @@ type CacheItem struct { // for the latest rate limit data. InvalidAt int64 } - -var _ Cache = &LRUCache{} - -// New creates a new Cache with a maximum size -func NewLRUCache(maxSize int) *LRUCache { - setter.SetDefault(&maxSize, 50_000) - - return &LRUCache{ - cache: make(map[interface{}]*list.Element), - ll: list.New(), - cacheSize: maxSize, - sizeMetric: prometheus.NewDesc("gubernator_cache_size", - "The number of items in LRU Cache which holds the rate limits.", nil, nil), - accessMetric: prometheus.NewDesc("gubernator_cache_access_count", - "Cache access counts.", []string{"type"}, nil), - } -} - -func (c *LRUCache) Lock() { - c.mutex.Lock() -} - -func (c *LRUCache) Unlock() { - c.mutex.Unlock() -} - -func (c *LRUCache) Each() chan *CacheItem { - out := make(chan *CacheItem) - go func() { - for _, ele := range c.cache { - out <- ele.Value.(*CacheItem) - } - close(out) - }() - return out -} - -// Adds a value to the cache. 
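The `Cache` interface above is now string-keyed and gains `Size()` and `Close()`, while the LRU implementation moves out of this file. As a reference for the new contract, a minimal map-backed sketch (assumed, not part of the patch) with no eviction, size limit, or locking:

```go
package gubernator

// mapCache is an illustrative implementation of the narrowed Cache
// interface. It is a sketch only: a plain map with no max size and
// no expiration handling.
type mapCache struct {
	items map[string]*CacheItem
}

func newMapCache() *mapCache {
	return &mapCache{items: make(map[string]*CacheItem)}
}

// Add stores an item, returning true if an item with the same key existed.
func (m *mapCache) Add(item *CacheItem) bool {
	_, exists := m.items[item.Key]
	m.items[item.Key] = item
	return exists
}

// UpdateExpiration updates the expiration time of an existing key.
func (m *mapCache) UpdateExpiration(key string, expireAt int64) bool {
	if item, ok := m.items[key]; ok {
		item.ExpireAt = expireAt
		return true
	}
	return false
}

// GetItem returns the stored item, if present. A real implementation would
// also honor ExpireAt/InvalidAt, as the removed LRUCache code does.
func (m *mapCache) GetItem(key string) (*CacheItem, bool) {
	item, ok := m.items[key]
	return item, ok
}

// Each streams every item over a channel.
func (m *mapCache) Each() chan *CacheItem {
	out := make(chan *CacheItem)
	go func() {
		defer close(out)
		for _, item := range m.items {
			out <- item
		}
	}()
	return out
}

func (m *mapCache) Remove(key string) { delete(m.items, key) }

func (m *mapCache) Size() int64 { return int64(len(m.items)) }

func (m *mapCache) Close() error {
	m.items = nil
	return nil
}
```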
-func (c *LRUCache) Add(record *CacheItem) bool { - // If the key already exist, set the new value - if ee, ok := c.cache[record.Key]; ok { - c.ll.MoveToFront(ee) - temp := ee.Value.(*CacheItem) - *temp = *record - return true - } - - ele := c.ll.PushFront(record) - c.cache[record.Key] = ele - if c.cacheSize != 0 && c.ll.Len() > c.cacheSize { - c.removeOldest() - } - return false -} - -// Return unix epoch in milliseconds -func MillisecondNow() int64 { - return clock.Now().UnixNano() / 1000000 -} - -// GetItem returns the item stored in the cache -func (c *LRUCache) GetItem(key interface{}) (item *CacheItem, ok bool) { - - if ele, hit := c.cache[key]; hit { - entry := ele.Value.(*CacheItem) - - now := MillisecondNow() - // If the entry is invalidated - if entry.InvalidAt != 0 && entry.InvalidAt < now { - c.removeElement(ele) - c.stats.Miss++ - return - } - - // If the entry has expired, remove it from the cache - if entry.ExpireAt < now { - c.removeElement(ele) - c.stats.Miss++ - return - } - c.stats.Hit++ - c.ll.MoveToFront(ele) - return entry, true - } - c.stats.Miss++ - return -} - -// Remove removes the provided key from the cache. -func (c *LRUCache) Remove(key interface{}) { - if ele, hit := c.cache[key]; hit { - c.removeElement(ele) - } -} - -// RemoveOldest removes the oldest item from the cache. -func (c *LRUCache) removeOldest() { - ele := c.ll.Back() - if ele != nil { - c.removeElement(ele) - } -} - -func (c *LRUCache) removeElement(e *list.Element) { - c.ll.Remove(e) - kv := e.Value.(*CacheItem) - delete(c.cache, kv.Key) -} - -// Len returns the number of items in the cache. -func (c *LRUCache) Size() int { - return c.ll.Len() -} - -func (c *LRUCache) Stats(_ bool) cachStats { - return c.stats -} - -// Update the expiration time for the key -func (c *LRUCache) UpdateExpiration(key interface{}, expireAt int64) bool { - if ele, hit := c.cache[key]; hit { - entry := ele.Value.(*CacheItem) - entry.ExpireAt = expireAt - return true - } - return false -} - -// Describe fetches prometheus metrics to be registered -func (c *LRUCache) Describe(ch chan<- *prometheus.Desc) { - ch <- c.sizeMetric - ch <- c.accessMetric -} - -// Collect fetches metric counts and gauges from the cache -func (c *LRUCache) Collect(ch chan<- prometheus.Metric) { - c.mutex.Lock() - defer c.mutex.Unlock() - ch <- prometheus.MustNewConstMetric(c.accessMetric, prometheus.CounterValue, float64(c.stats.Hit), "hit") - ch <- prometheus.MustNewConstMetric(c.accessMetric, prometheus.CounterValue, float64(c.stats.Miss), "miss") - ch <- prometheus.MustNewConstMetric(c.sizeMetric, prometheus.GaugeValue, float64(len(c.cache))) -} diff --git a/client.go b/client.go index 272b03fe..62ecdfc7 100644 --- a/client.go +++ b/client.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ import ( "time" "github.com/mailgun/holster/v4/clock" + otgrpc "github.com/opentracing-contrib/go-grpc" + "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "google.golang.org/grpc" "google.golang.org/grpc/credentials" @@ -43,9 +45,19 @@ func DialV1Server(server string, tls *tls.Config) (V1Client, error) { return nil, errors.New("server is empty; must provide a server") } - opts := []grpc.DialOption{grpc.WithInsecure()} + // Setup Opentracing interceptor to propagate spans. 
+ tracer := opentracing.GlobalTracer() + tracingUnaryInterceptor := otgrpc.OpenTracingClientInterceptor(tracer) + tracingStreamInterceptor := otgrpc.OpenTracingStreamClientInterceptor(tracer) + + opts := []grpc.DialOption{ + grpc.WithUnaryInterceptor(tracingUnaryInterceptor), + grpc.WithStreamInterceptor(tracingStreamInterceptor), + } if tls != nil { - opts = []grpc.DialOption{grpc.WithTransportCredentials(credentials.NewTLS(tls))} + opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(tls))) + } else { + opts = append(opts, grpc.WithInsecure()) } conn, err := grpc.Dial(server, opts...) diff --git a/cluster/cluster.go b/cluster/cluster.go index 07498d64..6a004958 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ import ( "math/rand" "github.com/mailgun/gubernator/v2" + "github.com/mailgun/gubernator/v2/tracing" "github.com/mailgun/holster/v4/clock" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -99,7 +100,7 @@ func Restart(ctx context.Context) error { // StartWith a local cluster with specific addresses func StartWith(localPeers []gubernator.PeerInfo) error { for _, peer := range localPeers { - ctx, cancel := context.WithTimeout(context.Background(), clock.Second*10) + ctx, cancel := tracing.ContextWithTimeout(context.Background(), clock.Second*10) d, err := gubernator.SpawnDaemon(ctx, gubernator.DaemonConfig{ Logger: logrus.WithField("instance", peer.GRPCAddress), GRPCListenAddress: peer.GRPCAddress, diff --git a/cluster/cluster_test.go b/cluster/cluster_test.go index 6797b2a9..1d9f8edb 100644 --- a/cluster/cluster_test.go +++ b/cluster/cluster_test.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/cmd/gubernator-cli/main.go b/cmd/gubernator-cli/main.go index 89070999..0502970e 100644 --- a/cmd/gubernator-cli/main.go +++ b/cmd/gubernator-cli/main.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,52 +18,70 @@ package main import ( "context" - "errors" "flag" "fmt" "math/rand" "os" + "strings" "github.com/davecgh/go-spew/spew" guber "github.com/mailgun/gubernator/v2" + "github.com/mailgun/gubernator/v2/tracing" "github.com/mailgun/holster/v4/clock" "github.com/mailgun/holster/v4/setter" "github.com/mailgun/holster/v4/syncutil" + "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go/ext" + "github.com/pkg/errors" "github.com/sirupsen/logrus" + jaegerConfig "github.com/uber/jaeger-client-go/config" + "golang.org/x/time/rate" ) var log *logrus.Logger - -func checkErr(err error) { - if err != nil { - log.Errorf(err.Error()) - os.Exit(1) - } -} - -func randInt(min, max int) int64 { - return int64(rand.Intn(max-min) + min) -} +var configFile, grpcAddress string +var concurrency uint64 +var checksPerRequest uint64 +var reqRate float64 +var quiet bool func main() { - var configFile, GRPCAddress string - var err error - log = logrus.StandardLogger() flags := flag.NewFlagSet("gubernator", flag.ContinueOnError) - flags.StringVar(&configFile, "config", "", "environment config file") - flags.StringVar(&GRPCAddress, "e", "", "the gubernator GRPC endpoint address") + flags.StringVar(&configFile, "config", "", "Environment config file") + flags.StringVar(&grpcAddress, "e", "", "Gubernator GRPC endpoint address") + flags.Uint64Var(&concurrency, "concurrency", 1, "Concurrent threads (default 1)") + flags.Uint64Var(&checksPerRequest, "checks", 1, "Rate checks per request (default 1)") + flags.Float64Var(&reqRate, "rate", 0, "Request rate overall, 0 = no rate limit") + flags.BoolVar(&quiet, "q", false, "Quiet logging") checkErr(flags.Parse(os.Args[1:])) + ctx := context.Background() + err := initTracing() + if err != nil { + log.WithError(err).Warn("Error in initTracing") + } + span, _ := tracing.StartSpan(ctx) + + // Print startup message. + argsMsg := fmt.Sprintf("Command line: %s", strings.Join(os.Args[1:], " ")) + log.Info(argsMsg) + tracing.LogInfo(span, argsMsg) + span.Finish() + conf, err := guber.SetupDaemonConfig(log, configFile) checkErr(err) - setter.SetOverride(&conf.GRPCListenAddress, GRPCAddress) + setter.SetOverride(&conf.GRPCListenAddress, grpcAddress) - if configFile == "" && GRPCAddress == "" && os.Getenv("GUBER_GRPC_ADDRESS") == "" { + if configFile == "" && grpcAddress == "" && os.Getenv("GUBER_GRPC_ADDRESS") == "" { checkErr(errors.New("please provide a GRPC endpoint via -e or from a config " + "file via -config or set the env GUBER_GRPC_ADDRESS")) } + if quiet { + log.SetLevel(logrus.ErrorLevel) + } + err = guber.SetupTLS(conf.TLS) checkErr(err) @@ -71,38 +89,131 @@ func main() { client, err := guber.DialV1Server(conf.GRPCListenAddress, conf.ClientTLS()) checkErr(err) - // Generate a selection of rate limits with random limits + // Generate a selection of rate limits with random limits. 
var rateLimits []*guber.RateLimitReq for i := 0; i < 2000; i++ { rateLimits = append(rateLimits, &guber.RateLimitReq{ - Name: fmt.Sprintf("ID-%d", i), + Name: fmt.Sprintf("gubernator-cli-%d", i), UniqueKey: guber.RandomString(10), Hits: 1, - Limit: randInt(1, 10), - Duration: randInt(int(clock.Millisecond*500), int(clock.Second*6)), + Limit: int64(randInt(1, 1000)), + Duration: int64(randInt(int(clock.Millisecond*500), int(clock.Second*6))), + Behavior: guber.Behavior_BATCHING, Algorithm: guber.Algorithm_TOKEN_BUCKET, }) } - fan := syncutil.NewFanOut(10) + fan := syncutil.NewFanOut(int(concurrency)) + var limiter *rate.Limiter + if reqRate > 0 { + l := rate.Limit(reqRate) + log.WithField("reqRate", reqRate).Info("") + limiter = rate.NewLimiter(l, 1) + } + + // Replay requests in endless loop. for { - for _, rateLimit := range rateLimits { + for i := int(0); i < len(rateLimits); i += int(checksPerRequest) { + req := &guber.GetRateLimitsReq{ + Requests: rateLimits[i:min(i+int(checksPerRequest), len(rateLimits))], + } + fan.Run(func(obj interface{}) error { - r := obj.(*guber.RateLimitReq) - ctx, cancel := context.WithTimeout(context.Background(), clock.Millisecond*500) - // Now hit our cluster with the rate limits - resp, err := client.GetRateLimits(ctx, &guber.GetRateLimitsReq{ - Requests: []*guber.RateLimitReq{r}, - }) - checkErr(err) - cancel() - - if resp.Responses[0].Status == guber.Status_OVER_LIMIT { - spew.Dump(resp) + req := obj.(*guber.GetRateLimitsReq) + + if reqRate > 0 { + limiter.Wait(ctx) } + + sendRequest(ctx, client, req) + return nil - }, rateLimit) + }, req) + } + } +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +func checkErr(err error) { + if err != nil { + log.Fatalf(err.Error()) + } +} + +func randInt(min, max int) int { + return rand.Intn(max-min) + min +} + +func sendRequest(ctx context.Context, client guber.V1Client, req *guber.GetRateLimitsReq) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + ctx, cancel := tracing.ContextWithTimeout(ctx, clock.Millisecond*500) + + // Now hit our cluster with the rate limits + resp, err := client.GetRateLimits(ctx, req) + cancel() + if err != nil { + ext.LogError(span, errors.Wrap(err, "Error in client.GetRateLimits")) + log.WithError(err).Error("Error in client.GetRateLimits") + return + } + + // Sanity checks. + if resp == nil { + log.Error("Response object is unexpectedly nil") + return + } + if resp.Responses == nil { + log.Error("Responses array is unexpectedly nil") + return + } + + // Check for overlimit response. + overlimit := false + + for itemNum, resp := range resp.Responses { + if resp.Status == guber.Status_OVER_LIMIT { + overlimit = true + log.WithField("name", req.Requests[itemNum].Name).Info("Overlimit!") } } + + if overlimit { + span.SetTag("overlimit", true) + if !quiet { + log.Info(spew.Sdump(resp)) + } + } +} + +// Configure tracer and set as global tracer. +// Be sure to call closer.Close() on application exit. +func initTracing() error { + // Configure new tracer. + cfg, err := jaegerConfig.FromEnv() + if err != nil { + return errors.Wrap(err, "Error in jaeger.FromEnv()") + } + if cfg.ServiceName == "" { + cfg.ServiceName = "gubernator-cli" + } + + var tracer opentracing.Tracer + + tracer, _, err = cfg.NewTracer() + if err != nil { + return errors.Wrap(err, "Error in cfg.NewTracer") + } + + // Set as global tracer. 
+ opentracing.SetGlobalTracer(tracer) + + return nil } diff --git a/cmd/gubernator-cluster/main.go b/cmd/gubernator-cluster/main.go index b0277ab2..d218154e 100644 --- a/cmd/gubernator-cluster/main.go +++ b/cmd/gubernator-cluster/main.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/cmd/gubernator/main.go b/cmd/gubernator/main.go index faf95c27..ee9ffaae 100644 --- a/cmd/gubernator/main.go +++ b/cmd/gubernator/main.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,23 +19,28 @@ package main import ( "context" "flag" + "io" "os" "os/signal" "runtime" "syscall" "github.com/mailgun/gubernator/v2" + "github.com/mailgun/gubernator/v2/tracing" "github.com/mailgun/holster/v4/clock" + "github.com/opentracing/opentracing-go" + "github.com/pkg/errors" "github.com/sirupsen/logrus" + jaegerConfig "github.com/uber/jaeger-client-go/config" "k8s.io/klog" ) var log = logrus.WithField("category", "gubernator") var Version = "dev-build" +var tracerCloser io.Closer func main() { var configFile string - var err error logrus.Infof("Gubernator %s (%s/%s)", Version, runtime.GOARCH, runtime.GOOS) flags := flag.NewFlagSet("gubernator", flag.ContinueOnError) @@ -50,16 +55,22 @@ func main() { klog.InitFlags(nil) flag.Set("logtostderr", "true") + err := initTracing() + if err != nil { + log.WithError(err).Warn("Error in initTracing") + } + // Read our config from the environment or optional environment config file conf, err := gubernator.SetupDaemonConfig(logrus.StandardLogger(), configFile) checkErr(err, "while getting config") - ctx, cancel := context.WithTimeout(context.Background(), clock.Second*10) + ctx, cancel := tracing.ContextWithTimeout(context.Background(), clock.Second*10) defer cancel() // Start the daemon daemon, err := gubernator.SpawnDaemon(ctx, conf) checkErr(err, "while spawning daemon") + cancel() // Wait here for signals to clean up our mess c := make(chan os.Signal, 1) @@ -67,13 +78,45 @@ func main() { for range c { log.Info("caught signal; shutting down") daemon.Close() - os.Exit(0) + exit(0) } } func checkErr(err error, msg string) { if err != nil { log.WithError(err).Error(msg) - os.Exit(1) + exit(1) } } + +func exit(code int) { + if tracerCloser != nil { + tracerCloser.Close() + } + os.Exit(code) +} + +// Configure tracer and set as global tracer. +// Be sure to call closer.Close() on application exit. +func initTracing() error { + // Configure new tracer. + cfg, err := jaegerConfig.FromEnv() + if err != nil { + return errors.Wrap(err, "Error in jaeger.FromEnv()") + } + if cfg.ServiceName == "" { + cfg.ServiceName = "gubernator" + } + + var tracer opentracing.Tracer + + tracer, tracerCloser, err = cfg.NewTracer() + if err != nil { + return errors.Wrap(err, "Error in cfg.NewTracer") + } + + // Set as global tracer. 
+ opentracing.SetGlobalTracer(tracer) + + return nil +} diff --git a/config.go b/config.go index 652cfc0b..89364690 100644 --- a/config.go +++ b/config.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ import ( "io/ioutil" "net" "os" + "runtime" "strconv" "strings" "time" @@ -72,7 +73,7 @@ type Config struct { Behaviors BehaviorConfig // (Optional) The cache implementation - Cache Cache + CacheFactory func(maxSize int) Cache // (Optional) A persistent store implementation. Allows the implementor the ability to store the rate limits this // instance of gubernator owns. It's up to the implementor to decide what rate limits to persist. @@ -102,6 +103,10 @@ type Config struct { // (Optional) The TLS config used when connecting to gubernator peers PeerTLS *tls.Config + + // (Optional) Number of worker goroutines to launch for request processing in GubernatorPool. + // Default is set to number of CPUs. + PoolWorkers int } func (c *Config) SetDefaults() error { @@ -119,7 +124,15 @@ func (c *Config) SetDefaults() error { setter.SetDefault(&c.LocalPicker, NewReplicatedConsistentHash(nil, defaultReplicas)) setter.SetDefault(&c.RegionPicker, NewRegionPicker(nil)) - setter.SetDefault(&c.Cache, NewLRUCache(0)) + + numCpus := runtime.NumCPU() + setter.SetDefault(&c.PoolWorkers, numCpus) + + if c.CacheFactory == nil { + c.CacheFactory = func(maxSize int) Cache { + return NewLRUCache(maxSize) + } + } if c.Behaviors.BatchLimit > maxBatchSize { return fmt.Errorf("Behaviors.BatchLimit cannot exceed '%d'", maxBatchSize) diff --git a/daemon.go b/daemon.go index 91c006aa..42d5c69b 100644 --- a/daemon.go +++ b/daemon.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2020 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,9 +26,12 @@ import ( "time" "github.com/grpc-ecosystem/grpc-gateway/v2/runtime" + "github.com/mailgun/gubernator/v2/tracing" "github.com/mailgun/holster/v4/etcdutil" "github.com/mailgun/holster/v4/setter" "github.com/mailgun/holster/v4/syncutil" + otgrpc "github.com/opentracing-contrib/go-grpc" + "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" @@ -55,12 +58,16 @@ type Daemon struct { statsHandler *GRPCStatsHandler promRegister *prometheus.Registry gwCancel context.CancelFunc + gubeConfig Config } // SpawnDaemon starts a new gubernator daemon according to the provided DaemonConfig. // This function will block until the daemon responds to connections as specified // by GRPCListenAddress and HTTPListenAddress func SpawnDaemon(ctx context.Context, conf DaemonConfig) (*Daemon, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + s := Daemon{ log: conf.Logger, conf: conf, @@ -76,12 +83,17 @@ func SpawnDaemon(ctx context.Context, conf DaemonConfig) (*Daemon, error) { func (s *Daemon) Start(ctx context.Context) error { var err error - // The LRU cache we store rate limits in - cache := NewLRUCache(s.conf.CacheSize) - - // cache also implements prometheus.Collector interface s.promRegister = prometheus.NewRegistry() - s.promRegister.Register(cache) + + // The LRU cache for storing rate limits. 
+ cacheCollector := NewLRUCacheCollector() + s.promRegister.Register(cacheCollector) + + cacheFactory := func(maxSize int) Cache { + cache := NewLRUCache(maxSize) + cacheCollector.AddCache(cache) + return cache + } // Handler to collect duration and API access metrics for GRPC s.statsHandler = NewGRPCStatsHandler() @@ -103,22 +115,34 @@ func (s *Daemon) Start(ctx context.Context) error { return err } + // Opentracing on gRPC endpoints. + tracer := opentracing.GlobalTracer() + tracingUnaryInterceptor := otgrpc.OpenTracingServerInterceptor(tracer) + tracingStreamInterceptor := otgrpc.OpenTracingStreamServerInterceptor(tracer) + + opts = append(opts, + grpc.UnaryInterceptor(tracingUnaryInterceptor), + grpc.StreamInterceptor(tracingStreamInterceptor), + ) + if s.conf.ServerTLS() != nil { // Create two GRPC server instances, one for TLS and the other for the API Gateway - s.grpcSrvs = append(s.grpcSrvs, grpc.NewServer(append(opts, grpc.Creds(credentials.NewTLS(s.conf.ServerTLS())))...)) + opts2 := append(opts, grpc.Creds(credentials.NewTLS(s.conf.ServerTLS()))) + s.grpcSrvs = append(s.grpcSrvs, grpc.NewServer(opts2...)) } s.grpcSrvs = append(s.grpcSrvs, grpc.NewServer(opts...)) // Registers a new gubernator instance with the GRPC server - s.V1Server, err = NewV1Instance(Config{ - PeerTLS: s.conf.ClientTLS(), - DataCenter: s.conf.DataCenter, - LocalPicker: s.conf.Picker, - GRPCServers: s.grpcSrvs, - Logger: s.log, - Cache: cache, - Behaviors: s.conf.Behaviors, - }) + s.gubeConfig = Config{ + PeerTLS: s.conf.ClientTLS(), + DataCenter: s.conf.DataCenter, + LocalPicker: s.conf.Picker, + GRPCServers: s.grpcSrvs, + Logger: s.log, + CacheFactory: cacheFactory, + Behaviors: s.conf.Behaviors, + } + s.V1Server, err = NewV1Instance(s.gubeConfig) if err != nil { return errors.Wrap(err, "while creating new gubernator instance") } diff --git a/dns.go b/dns.go index cd79a76f..403fcf95 100644 --- a/dns.go +++ b/dns.go @@ -1,3 +1,19 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package gubernator import ( diff --git a/etcd.go b/etcd.go index a99b3cad..c5d619f4 100644 --- a/etcd.go +++ b/etcd.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
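With `Config.Cache` replaced by `Config.CacheFactory`, daemon.go can create caches on demand and register each with a shared Prometheus collector. A hedged sketch of supplying the factory when embedding gubernator as a library; the `PoolWorkers` value is an arbitrary example:

```go
package main

import (
	guber "github.com/mailgun/gubernator/v2"
)

// newInstance sketches the new CacheFactory hook that replaces the old
// Config.Cache field. Illustrative only: error handling and the other
// Config fields are omitted.
func newInstance() (*guber.V1Instance, error) {
	conf := guber.Config{
		CacheFactory: func(maxSize int) guber.Cache {
			// Any Cache implementation can be returned here; the daemon in
			// this patch wraps NewLRUCache with a shared metrics collector.
			return guber.NewLRUCache(maxSize)
		},
		PoolWorkers: 8, // optional; defaults to runtime.NumCPU()
	}
	return guber.NewV1Instance(conf)
}
```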
@@ -20,6 +20,7 @@ import ( "context" "encoding/json" + "github.com/mailgun/gubernator/v2/tracing" "github.com/mailgun/holster/v4/clock" "github.com/mailgun/holster/v4/setter" "github.com/mailgun/holster/v4/syncutil" @@ -94,7 +95,6 @@ func NewEtcdPool(conf EtcdPoolConfig) (*EtcdPool, error) { } func (e *EtcdPool) run(peer PeerInfo) error { - // Register our instance with etcd if err := e.register(peer); err != nil { return err @@ -139,7 +139,7 @@ func (e *EtcdPool) watchPeers() error { } func (e *EtcdPool) collectPeers(revision *int64) error { - ctx, cancel := context.WithTimeout(e.ctx, etcdTimeout) + ctx, cancel := tracing.ContextWithTimeout(e.ctx, etcdTimeout) defer cancel() resp, err := e.conf.Client.Get(ctx, e.conf.KeyPrefix, etcd.WithPrefix()) @@ -232,7 +232,7 @@ func (e *EtcdPool) register(peer PeerInfo) error { var lease *etcd.LeaseGrantResponse register := func() error { - ctx, cancel := context.WithTimeout(e.ctx, etcdTimeout) + ctx, cancel := tracing.ContextWithTimeout(e.ctx, etcdTimeout) defer cancel() var err error @@ -296,7 +296,7 @@ func (e *EtcdPool) register(peer PeerInfo) error { } lastKeepAlive = clock.Now() case <-done: - ctx, cancel := context.WithTimeout(context.Background(), etcdTimeout) + ctx, cancel := tracing.ContextWithTimeout(context.Background(), etcdTimeout) if _, err := e.conf.Client.Delete(ctx, instanceKey); err != nil { e.log.WithError(err). Warn("during etcd delete") diff --git a/flags.go b/flags.go index 3b92073d..25fce44f 100644 --- a/flags.go +++ b/flags.go @@ -1,3 +1,19 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package gubernator import "github.com/sirupsen/logrus" diff --git a/functional_test.go b/functional_test.go index b7911a3e..9d67ad58 100644 --- a/functional_test.go +++ b/functional_test.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/global.go b/global.go index 43709cf3..d9f77c62 100644 --- a/global.go +++ b/global.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
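Throughout this patch, `context.WithTimeout` is swapped for `tracing.ContextWithTimeout`. The helper itself is not shown in the diff; judging from the call sites it is signature-compatible with `context.WithTimeout` and presumably also opens a span. A stand-in sketch under that assumption:

```go
package tracing

import (
	"context"
	"time"
)

// ContextWithTimeout: signature inferred from the call sites in this patch.
// The real helper lives in the new tracing package (not shown in this diff)
// and presumably also starts a span; this stand-in only sets the deadline.
func ContextWithTimeout(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc) {
	return context.WithTimeout(ctx, timeout)
}
```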
@@ -20,6 +20,7 @@ import ( "context" "time" + "github.com/mailgun/gubernator/v2/tracing" "github.com/mailgun/holster/v4/clock" "github.com/mailgun/holster/v4/syncutil" "github.com/prometheus/client_golang/prometheus" @@ -79,6 +80,9 @@ func (gm *globalManager) runAsyncHits() { hits := make(map[string]*RateLimitReq) gm.wg.Until(func(done chan struct{}) bool { + span, ctx := tracing.StartSpan(context.Background()) + defer span.Finish() + select { case r := <-gm.asyncQueue: // Aggregate the hits into a single request @@ -92,7 +96,7 @@ func (gm *globalManager) runAsyncHits() { // Send the hits if we reached our batch limit if len(hits) == gm.conf.GlobalBatchLimit { - gm.sendHits(hits) + gm.sendHits(ctx, hits) hits = make(map[string]*RateLimitReq) return true } @@ -105,7 +109,7 @@ func (gm *globalManager) runAsyncHits() { case <-interval.C: if len(hits) != 0 { - gm.sendHits(hits) + gm.sendHits(ctx, hits) hits = make(map[string]*RateLimitReq) } case <-done: @@ -117,7 +121,7 @@ func (gm *globalManager) runAsyncHits() { // sendHits takes the hits collected by runAsyncHits and sends them to their // owning peers -func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) { +func (gm *globalManager) sendHits(ctx context.Context, hits map[string]*RateLimitReq) { type pair struct { client *PeerClient req GetPeerRateLimitsReq @@ -127,7 +131,7 @@ func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) { // Assign each request to a peer for _, r := range hits { - peer, err := gm.instance.GetPeer(r.HashKey()) + peer, err := gm.instance.GetPeer(ctx, r.HashKey()) if err != nil { gm.log.WithError(err).Errorf("while getting peer for hash key '%s'", r.HashKey()) continue @@ -146,7 +150,7 @@ func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) { // Send the rate limit requests to their respective owning peers. 
for _, p := range peerRequests { - ctx, cancel := context.WithTimeout(context.Background(), gm.conf.GlobalTimeout) + ctx, cancel := tracing.ContextWithTimeout(context.Background(), gm.conf.GlobalTimeout) _, err := p.client.GetPeerRateLimits(ctx, &p.req) cancel() @@ -165,13 +169,16 @@ func (gm *globalManager) runBroadcasts() { updates := make(map[string]*RateLimitReq) gm.wg.Until(func(done chan struct{}) bool { + span, ctx := tracing.StartSpan(context.Background()) + defer span.Finish() + select { case r := <-gm.broadcastQueue: updates[r.HashKey()] = r // Send the hits if we reached our batch limit if len(updates) == gm.conf.GlobalBatchLimit { - gm.broadcastPeers(updates) + gm.broadcastPeers(ctx, updates) updates = make(map[string]*RateLimitReq) return true } @@ -184,7 +191,7 @@ func (gm *globalManager) runBroadcasts() { case <-interval.C: if len(updates) != 0 { - gm.broadcastPeers(updates) + gm.broadcastPeers(ctx, updates) updates = make(map[string]*RateLimitReq) } case <-done: @@ -195,7 +202,7 @@ func (gm *globalManager) runBroadcasts() { } // broadcastPeers broadcasts global rate limit statuses to all other peers -func (gm *globalManager) broadcastPeers(updates map[string]*RateLimitReq) { +func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string]*RateLimitReq) { var req UpdatePeerGlobalsReq start := clock.Now() @@ -207,7 +214,7 @@ func (gm *globalManager) broadcastPeers(updates map[string]*RateLimitReq) { SetBehavior(&rl.Behavior, Behavior_GLOBAL, false) rl.Hits = 0 - status, err := gm.instance.getRateLimit(rl) + status, err := gm.instance.getRateLimit(ctx, rl) if err != nil { gm.log.WithError(err).Errorf("while broadcasting update to peers for: '%s'", rl.HashKey()) continue @@ -226,7 +233,7 @@ func (gm *globalManager) broadcastPeers(updates map[string]*RateLimitReq) { continue } - ctx, cancel := context.WithTimeout(context.Background(), gm.conf.GlobalTimeout) + ctx, cancel := tracing.ContextWithTimeout(context.Background(), gm.conf.GlobalTimeout) _, err := peer.UpdatePeerGlobals(ctx, &req) cancel() diff --git a/go.mod b/go.mod index 21a64c77..b6323765 100644 --- a/go.mod +++ b/go.mod @@ -1,22 +1,26 @@ module github.com/mailgun/gubernator/v2 -go 1.14 +go 1.17 require ( + github.com/OneOfOne/xxhash v1.2.8 github.com/davecgh/go-spew v1.1.1 github.com/grpc-ecosystem/grpc-gateway/v2 v2.5.0 github.com/hashicorp/memberlist v0.2.4 github.com/mailgun/holster/v4 v4.0.0 github.com/miekg/dns v1.1.43 + github.com/opentracing-contrib/go-grpc v0.0.0-20210225150812-73cb765af46e + github.com/opentracing/opentracing-go v1.2.0 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.11.0 github.com/prometheus/common v0.26.0 github.com/segmentio/fasthash v1.0.2 github.com/sirupsen/logrus v1.8.1 github.com/stretchr/testify v1.7.0 + github.com/uber/jaeger-client-go v2.29.1+incompatible go.etcd.io/etcd/client/v3 v3.5.0 golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4 - google.golang.org/api v0.30.0 + golang.org/x/time v0.0.0-20191024005414-555d28b269f0 google.golang.org/genproto v0.0.0-20210617175327-b9e0b3197ced google.golang.org/grpc v1.39.0 google.golang.org/protobuf v1.27.1 @@ -25,3 +29,53 @@ require ( k8s.io/client-go v0.0.0-20190620085101-78d2af792bab k8s.io/klog v0.3.1 ) + +require ( + github.com/HdrHistogram/hdrhistogram-go v1.1.2 // indirect + github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.1.1 // indirect + github.com/coreos/go-semver v0.3.0 // indirect + 
github.com/coreos/go-systemd/v22 v22.3.2 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/google/btree v1.0.0 // indirect + github.com/google/go-cmp v0.5.6 // indirect + github.com/google/gofuzz v1.0.0 // indirect + github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d // indirect + github.com/hashicorp/errwrap v1.0.0 // indirect + github.com/hashicorp/go-immutable-radix v1.0.0 // indirect + github.com/hashicorp/go-msgpack v0.5.3 // indirect + github.com/hashicorp/go-multierror v1.1.0 // indirect + github.com/hashicorp/go-sockaddr v1.0.0 // indirect + github.com/hashicorp/golang-lru v0.5.1 // indirect + github.com/imdario/mergo v0.3.5 // indirect + github.com/json-iterator/go v1.1.11 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.1 // indirect + github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/client_model v0.2.0 // indirect + github.com/prometheus/procfs v0.6.0 // indirect + github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect + github.com/spf13/pflag v1.0.1 // indirect + github.com/stretchr/objx v0.1.1 // indirect + github.com/uber/jaeger-lib v2.4.1+incompatible // indirect + go.etcd.io/etcd/api/v3 v3.5.0 // indirect + go.etcd.io/etcd/client/pkg/v3 v3.5.0 // indirect + go.uber.org/atomic v1.7.0 // indirect + go.uber.org/multierr v1.6.0 // indirect + go.uber.org/zap v1.17.0 // indirect + golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9 // indirect + golang.org/x/oauth2 v0.0.0-20210615190721-d04028783cf1 // indirect + golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40 // indirect + golang.org/x/text v0.3.5 // indirect + google.golang.org/appengine v1.6.6 // indirect + gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect + gopkg.in/inf.v0 v0.9.0 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect + k8s.io/utils v0.0.0-20190221042446-c2654d5206da // indirect + sigs.k8s.io/yaml v1.2.0 // indirect +) diff --git a/go.sum b/go.sum index 34731f1a..a2aa2585 100644 --- a/go.sum +++ b/go.sum @@ -12,7 +12,6 @@ cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bP cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= -cloud.google.com/go v0.65.0 h1:Dg9iHVQfrhq82rUNu9ZxUDrJLaxFUe/HlCVaLyRruq8= cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= @@ -35,10 +34,15 @@ dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7 github.com/Azure/go-autorest v11.1.2+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/HdrHistogram/hdrhistogram-go v1.1.2 h1:5IcZpTvzydCQeHzK4Ef/D5rrSqwxob0t8PQPMybUNFM= 
+github.com/HdrHistogram/hdrhistogram-go v1.1.2/go.mod h1:yDgFjdqOqDEKOvasDdhWNXYg9BVp4O+o5f6V/ehm6Oo= +github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8= +github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= github.com/Shopify/toxiproxy v2.1.4+incompatible h1:TKdv8HiTLgE5wdJuEML90aBgNWsokNbMijUGhmcoBJc= github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI= github.com/ahmetb/go-linq v3.0.0+incompatible h1:qQkjjOXKrKOTy83X8OpRmnKflXKQIL/mC/gMVVDMhOA= github.com/ahmetb/go-linq v3.0.0+incompatible/go.mod h1:PFffvbdbtw+QTB0WKRP0cNht7vnCfnGlEpak/DVg5cY= +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= @@ -69,6 +73,7 @@ github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmf github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -86,6 +91,7 @@ github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7 github.com/evanphx/json-patch v0.0.0-20190203023257-5858425f7550/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= @@ -103,6 +109,7 @@ github.com/gogo/protobuf v0.0.0-20171007142547-342cbe0a0415/go.mod h1:r8qH/GZQm5 github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/glog v0.0.0-20210429001901-424d2337a529 h1:2voWjNECnrZRbfwXxHB1/j8wa6xdKn85B5NzgVL/pTU= github.com/golang/glog v0.0.0-20210429001901-424d2337a529/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= @@ -166,7 +173,6 @@ github.com/google/renameio v0.1.0/go.mod 
h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm4 github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= -github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d h1:7XGaL1e6bYS1yIonGp9761ExpPPV1ui0SAC59Yube9k= github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= @@ -179,6 +185,7 @@ github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/grpc-ecosystem/grpc-gateway/v2 v2.5.0 h1:ajue7SzQMywqRjg2fK7dcpc0QhFGpTR2plWfV4EZWR4= github.com/grpc-ecosystem/grpc-gateway/v2 v2.5.0/go.mod h1:r1hZAcvfFXuYmcKyCJI9wlyOPIZUJl6FCB8Cpca/NLE= +github.com/grpc-ecosystem/grpc-opentracing v0.0.0-20180507213350-8e809c8a8645/go.mod h1:6iZfnjpejD4L/4DwD7NryNaJyCQdzwWwH2MWhCA90Kw= github.com/hashicorp/consul/api v1.8.1/go.mod h1:sDjTOq0yUyv5G4h+BqSea7Fn6BU+XbolEz1952UB+mk= github.com/hashicorp/consul/sdk v0.7.0/go.mod h1:fY08Y9z5SvJqevyZNy6WWPXiG3KwBPAvlcdx16zZ0fM= github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= @@ -223,17 +230,18 @@ github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailgun/holster/v4 v4.0.0 h1:agmjX6skCovLK+2FRk34Dx9L6DzBldTnT7jUMTR/UYA= github.com/mailgun/holster/v4 v4.0.0/go.mod h1:3Gavxi9KJwRAcA7UkZcDl2YOGp4Hyy3Mmdq7UCayCpM= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= @@ -248,7 +256,6 @@ github.com/matttproud/golang_protobuf_extensions v1.0.1 
h1:4hp9jkHxhMHkqkrB3Ix0j github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso= -github.com/miekg/dns v1.1.42 h1:gWGe42RGaIqXQZ+r3WUGEKBEtvPHY2SXo4dqixDNxuY= github.com/miekg/dns v1.1.42/go.mod h1:+evo5L0630/F6ca/Z9+GAqzhjGyn8/c+TBaOyfEl0V4= github.com/miekg/dns v1.1.43 h1:JKfpVSCB84vrAmHzyrsxB5NAr5kLoMXZArPSw7Qlgyg= github.com/miekg/dns v1.1.43/go.mod h1:+evo5L0630/F6ca/Z9+GAqzhjGyn8/c+TBaOyfEl0V4= @@ -266,8 +273,15 @@ github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3Rllmb github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/gomega v0.0.0-20190113212917-5533ce8a0da3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/opentracing-contrib/go-grpc v0.0.0-20210225150812-73cb765af46e h1:4cPxUYdgaGzZIT5/j0IfqOrrXmq6bG8AwvwisMXpdrg= +github.com/opentracing-contrib/go-grpc v0.0.0-20210225150812-73cb765af46e/go.mod h1:DYR5Eij8rJl8h7gblRrOZ8g0kW1umSpKqYIBTgeDtLo= +github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= +github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c h1:Lgl0gzECD8GnQ5QCWA8o6BtfL6mDH5rQgM4/fX3avOs= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= @@ -321,6 +335,10 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/uber/jaeger-client-go v2.29.1+incompatible h1:R9ec3zO3sGpzs0abd43Y+fBZRJ9uiH6lXyR/+u6brW4= +github.com/uber/jaeger-client-go v2.29.1+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= +github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= +github.com/uber/jaeger-lib v2.4.1+incompatible/go.mod h1:ComeNDZlWwrWnDv8aPp0Ba6+uUTzImX/AauajbLI56U= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -336,7 +354,6 @@ go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= 
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.opencensus.io v0.22.4 h1:LYy1Hy3MJdrCdMwwzxA/dRok4ejH+RwNGbuoD9fCjto= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= @@ -355,7 +372,10 @@ golang.org/x/crypto v0.0.0-20190923035154-9ee001bba392/go.mod h1:/lpIB1dKB+9EgE3 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9 h1:psW17arqaxU48Z5kZ0CQnkZWQJsqcURM6tKiBApRjXI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= @@ -365,6 +385,7 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -404,6 +425,7 @@ golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190921015927-1a5e07d1ff72/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -507,8 +529,10 @@ golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxb 
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= @@ -557,6 +581,10 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= @@ -572,7 +600,6 @@ google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/ google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= -google.golang.org/api v0.30.0 h1:yfrXXP61wVuLb0vBcG6qaOoIoqYEzOQS8jum51jkv2w= google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -618,6 +645,7 @@ google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZi google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.23.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.0/go.mod 
h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= @@ -648,8 +676,9 @@ google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/inf.v0 v0.9.0 h1:3zYtXIO92bvsdS3ggAdA8Gb4Azj0YU+TVY1uGYNFA8o= @@ -687,6 +716,7 @@ k8s.io/kube-openapi v0.0.0-20190228160746-b3a7cee44a30/go.mod h1:BXM9ceUBTj2QnfH k8s.io/utils v0.0.0-20190221042446-c2654d5206da h1:ElyM7RPonbKnQqOcw7dG2IK5uvQQn3b/WPHqD5mBvP4= k8s.io/utils v0.0.0-20190221042446-c2654d5206da/go.mod h1:8k8uAuAQ0rXslZKaEWd0c3oVhZz7sSzSiPnVZayjIX0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= diff --git a/grpc_stats.go b/grpc_stats.go index ddcf5a10..39cc662a 100644 --- a/grpc_stats.go +++ b/grpc_stats.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -50,12 +50,15 @@ func NewGRPCStatsHandler() *GRPCStatsHandler { c := &GRPCStatsHandler{ grpcRequestCount: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "gubernator_grpc_request_counts", - Help: "GRPC requests by status.", + Help: "The count of gRPC requests.", }, []string{"status", "method"}), grpcRequestDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{ - Name: "gubernator_grpc_request_duration", - Help: "GRPC request durations in seconds", - Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, + Name: "gubernator_grpc_request_duration", + Help: "The timings of gRPC requests in seconds", + Objectives: map[float64]float64{ + 0.5: 0.05, + 0.99: 0.001, + }, }, []string{"method"}), } c.run() diff --git a/gubernator.go b/gubernator.go index 3c414dcb..6147a021 100644 --- a/gubernator.go +++ b/gubernator.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -21,9 +21,12 @@ import ( "fmt" "strings" "sync" + "sync/atomic" + "github.com/mailgun/gubernator/v2/tracing" "github.com/mailgun/holster/v4/setter" "github.com/mailgun/holster/v4/syncutil" + "github.com/opentracing/opentracing-go/ext" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" @@ -41,17 +44,72 @@ const ( type V1Instance struct { UnimplementedV1Server UnimplementedPeersV1Server - global *globalManager - mutliRegion *mutliRegionManager - peerMutex sync.RWMutex - log logrus.FieldLogger - conf Config - isClosed bool + global *globalManager + mutliRegion *mutliRegionManager + peerMutex sync.RWMutex + log logrus.FieldLogger + conf Config + isClosed bool + getRateLimitsCounter int64 + gubernatorPool *GubernatorPool } +var getRateLimitCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_getratelimit_counter", + Help: "The count of getRateLimit() calls. Label \"calltype\" may be \"local\" for calls handled by the same peer, \"forward\" for calls forwarded to another peer, or \"global\" for global rate limits.", +}, []string{"calltype"}) +var funcTimeMetric = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "gubernator_func_duration", + Help: "The timings of key functions in Gubernator in seconds.", + Objectives: map[float64]float64{ + 0.99: 0.001, + }, +}, []string{"name"}) +var asyncRequestRetriesCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_asyncrequest_retries", + Help: "The count of retries that occurred in asyncRequests() while forwarding a request to another peer.", +}, []string{"name"}) +var queueLengthMetric = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "gubernator_queue_length", + Help: "The getRateLimitsBatch() queue length in PeerClient.", + Objectives: map[float64]float64{ + 0.99: 0.001, + }, +}, []string{"peerAddr"}) +var checkCounter = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gubernator_check_counter", + Help: "The number of rate limits checked.", +}) +var overLimitCounter = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gubernator_over_limit_counter", + Help: "The number of rate limit checks that are over the limit.", +}) +var concurrentChecksMetric = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "gubernator_concurrent_checks_counter", + Help: "The number of concurrent GetRateLimits API calls.", + Objectives: map[float64]float64{ + 0.99: 0.001, + }, +}) +var checkErrorCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_check_error_counter", + Help: "The number of errors while checking rate limits.", +}, []string{"error"}) +var poolWorkerQueueLength = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "gubernator_pool_queue_length", + Help: "The number of GetRateLimit requests queued up in GubernatorPool workers.", + Objectives: map[float64]float64{ + 0.99: 0.001, + }, +}, []string{"method", "worker"}) + // NewV1Instance instantiates a single instance of a gubernator peer and registers this // instance with the provided GRPCServer.
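These vectors are package-level rather than registered via promauto because V1Instance itself implements prometheus.Collector; its Describe and Collect methods, later in this diff, forward each vector. A hedged sketch of that wiring, with placeholder metric and type names:

```go
// Sketch: publishing package-level metric vectors through a custom
// prometheus.Collector, mirroring V1Instance below. Names are placeholders.
package example

import "github.com/prometheus/client_golang/prometheus"

var callCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "example_call_counter",
	Help: "Example calls by type.",
}, []string{"calltype"})

type Service struct{}

// Describe and Collect forward the package-level vectors, so a single
// prometheus.MustRegister(&Service{}) exposes all of them at once.
func (s *Service) Describe(ch chan<- *prometheus.Desc) { callCounter.Describe(ch) }
func (s *Service) Collect(ch chan<- prometheus.Metric) { callCounter.Collect(ch) }
```

Incrementing is then just `callCounter.WithLabelValues("local").Add(1)`, the same shape as the getRateLimitCounter calls throughout this diff.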
func NewV1Instance(conf Config) (*V1Instance, error) { + span, ctx := tracing.StartSpan(context.Background()) + ext.SamplingPriority.Set(span, 1) + defer span.Finish() + if conf.GRPCServers == nil { return nil, errors.New("at least one GRPCServer instance is required") } @@ -65,6 +123,7 @@ func NewV1Instance(conf Config) (*V1Instance, error) { } setter.SetDefault(&s.log, logrus.WithField("category", "gubernator")) + s.gubernatorPool = NewGubernatorPool(&conf, conf.PoolWorkers, 0) s.global = newGlobalManager(conf.Behaviors, &s) s.mutliRegion = newMultiRegionManager(conf.Behaviors, &s) @@ -78,18 +137,20 @@ func NewV1Instance(conf Config) (*V1Instance, error) { return &s, nil } - ch, err := s.conf.Loader.Load() + // Load the cache. + err := s.gubernatorPool.Load(ctx) if err != nil { - return nil, errors.Wrap(err, "while loading persistent from store") + return nil, errors.Wrap(err, "Error in checkHandlerPool.Load") } - for item := range ch { - s.conf.Cache.Add(item) - } return &s, nil } func (s *V1Instance) Close() error { + span, ctx := tracing.StartSpan(context.Background()) + ext.SamplingPriority.Set(span, 1) + defer span.Finish() + if s.isClosed { return nil } @@ -101,22 +162,39 @@ func (s *V1Instance) Close() error { s.global.Close() s.mutliRegion.Close() - out := make(chan *CacheItem, 500) - go func() { - for item := range s.conf.Cache.Each() { - out <- item - } - close(out) - }() + err := s.gubernatorPool.Store(ctx) + if err != nil { + logrus.WithError(err).Error("Error in checkHandlerPool.Store") + return errors.Wrap(err, "Error in checkHandlerPool.Store") + } + + err = s.gubernatorPool.Close() + if err != nil { + logrus.WithError(err).Error("Error in checkHandlerPool.Close") + return errors.Wrap(err, "Error in checkHandlerPool.Close") + } + s.isClosed = true - return s.conf.Loader.Save(out) + return nil } // GetRateLimits is the public interface used by clients to request rate limits from the system. If the // rate limit `Name` and `UniqueKey` is not owned by this instance then we forward the request to the // peer that does. 
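NewV1Instance and Close now delegate cache persistence to gubernatorPool.Load and gubernatorPool.Store rather than draining the Loader channel inline. For orientation, a minimal in-memory Loader that would satisfy the channel-based contract these calls rely on; the interface shape is assumed from its use in this diff (Load returning a channel of items, Save consuming one), not quoted from the library:

```go
// Sketch of a minimal in-memory Loader for the gubernator package,
// assuming the interface is:
//   Load() (chan *CacheItem, error)
//   Save(chan *CacheItem) error
type MemoryLoader struct {
	items []*CacheItem
}

func (m *MemoryLoader) Load() (chan *CacheItem, error) {
	ch := make(chan *CacheItem, len(m.items))
	for _, item := range m.items {
		ch <- item
	}
	close(ch) // closing signals that every item has been sent
	return ch, nil
}

func (m *MemoryLoader) Save(ch chan *CacheItem) error {
	m.items = m.items[:0]
	for item := range ch { // drains until the pool closes the channel
		m.items = append(m.items, item)
	}
	return nil
}
```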
func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*GetRateLimitsResp, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + + funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.GetRateLimits")) + defer funcTimer.ObserveDuration() + + concurrentCounter := atomic.AddInt64(&s.getRateLimitsCounter, 1) + defer atomic.AddInt64(&s.getRateLimitsCounter, -1) + span.SetTag("concurrentCounter", concurrentCounter) + concurrentChecksMetric.Observe(float64(concurrentCounter)) + if len(r.Requests) > maxBatchSize { + checkErrorCounter.WithLabelValues("Request too large").Add(1) return nil, status.Errorf(codes.OutOfRange, "Requests.RateLimits list too large; max size is '%d'", maxBatchSize) } @@ -130,62 +208,82 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G // For each item in the request body for i, req := range r.Requests { - key := req.Name + "_" + req.UniqueKey - var peer *PeerClient - var err error - - if len(req.UniqueKey) == 0 { - resp.Responses[i] = &RateLimitResp{Error: "field 'unique_key' cannot be empty"} - continue - } - - if len(req.Name) == 0 { - resp.Responses[i] = &RateLimitResp{Error: "field 'namespace' cannot be empty"} - continue - } + span2, ctx2 := tracing.StartNamedSpan(ctx, "Iterate requests") + + func() { + defer span2.Finish() + key := req.Name + "_" + req.UniqueKey + var peer *PeerClient + var err error + + if len(req.UniqueKey) == 0 { + checkErrorCounter.WithLabelValues("Invalid request").Add(1) + resp.Responses[i] = &RateLimitResp{Error: "field 'unique_key' cannot be empty"} + return + } - peer, err = s.GetPeer(key) - if err != nil { - resp.Responses[i] = &RateLimitResp{ - Error: fmt.Sprintf("while finding peer that owns rate limit '%s' - '%s'", key, err), + if len(req.Name) == 0 { + checkErrorCounter.WithLabelValues("Invalid request").Add(1) + resp.Responses[i] = &RateLimitResp{Error: "field 'namespace' cannot be empty"} + return } - continue - } - // If our server instance is the owner of this rate limit - if peer.Info().IsOwner { - // Apply our rate limit algorithm to the request - resp.Responses[i], err = s.getRateLimit(req) + peer, err = s.GetPeer(ctx2, key) if err != nil { + countError(err, "Error in GetPeer") + err = errors.Wrapf(err, "Error in GetPeer, looking up peer that owns rate limit '%s'", key) resp.Responses[i] = &RateLimitResp{ - Error: fmt.Sprintf("while applying rate limit for '%s' - '%s'", key, err), + Error: err.Error(), } + return } - } else { - if HasBehavior(req.Behavior, Behavior_GLOBAL) { - resp.Responses[i], err = s.getGlobalRateLimit(req) + + // If our server instance is the owner of this rate limit + if peer.Info().IsOwner { + // Apply our rate limit algorithm to the request + getRateLimitCounter.WithLabelValues("local").Add(1) + funcTimer1 := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.getRateLimit (local)")) + resp.Responses[i], err = s.getRateLimit(ctx2, req) + funcTimer1.ObserveDuration() if err != nil { - resp.Responses[i] = &RateLimitResp{Error: err.Error()} + err2 := errors.Wrapf(err, "Error while applying rate limit for '%s'", key) + ext.LogError(span2, err2) + resp.Responses[i] = &RateLimitResp{Error: err2.Error()} + } + } else { + if HasBehavior(req.Behavior, Behavior_GLOBAL) { + resp.Responses[i], err = s.getGlobalRateLimit(ctx2, req) + if err != nil { + err2 := errors.Wrap(err, "Error in getGlobalRateLimit") + ext.LogError(span2, err2) + resp.Responses[i] = &RateLimitResp{Error: err2.Error()} + } + + // Inform the
client of the owner of the key + resp.Responses[i].Metadata = map[string]string{"owner": peer.Info().GRPCAddress} + return } - // Inform the client of the owner key of the key - resp.Responses[i].Metadata = map[string]string{"owner": peer.Info().GRPCAddress} - continue + // Request must be forwarded to the peer that owns the key. + // Launch remote peer request in goroutine. + wg.Add(1) + go s.asyncRequests(ctx2, &AsyncReq{ + AsyncCh: asyncCh, + Peer: peer, + Req: req, + WG: &wg, + Key: key, + Idx: i, + }) } - wg.Add(1) - go s.asyncRequests(ctx, &AsyncReq{ - AsyncCh: asyncCh, - Peer: peer, - Req: req, - WG: &wg, - Key: key, - Idx: i, - }) - } + }() } // Wait for any async responses if any + span3, _ := tracing.StartNamedSpan(ctx, "Wait for responses") wg.Wait() + span3.Finish() + close(asyncCh) for a := range asyncCh { resp.Responses[a.Idx] = a.Resp @@ -212,50 +310,88 @@ func (s *V1Instance) asyncRequests(ctx context.Context, req *AsyncReq) { var attempts int var err error + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + span.SetTag("request.name", req.Req.Name) + span.SetTag("request.key", req.Req.UniqueKey) + span.SetTag("request.limit", req.Req.Limit) + span.SetTag("request.duration", req.Req.Duration) + span.SetTag("peer.grpcAddress", req.Peer.Info().GRPCAddress) + + funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.asyncRequests")) + defer funcTimer.ObserveDuration() + resp := AsyncResp{ Idx: req.Idx, } for { if attempts > 5 { - resp.Resp = &RateLimitResp{ - Error: fmt.Sprintf("GetPeer() keeps returning peers that are not connected for '%s' - '%s'", req.Key, err), - } + logrus. + WithError(err). + WithField("key", req.Key). + Error("GetPeer() returned peer that is not connected") + err2 := errors.Wrapf(err, "GetPeer() keeps returning peers that are not connected for '%s'", req.Key) + resp.Resp = &RateLimitResp{Error: err2.Error()} + countError(err, "Peer not connected") + ext.LogError(span, err2) break } // If we are attempting again, the owner of this rate limit might have changed to us! if attempts != 0 { if req.Peer.Info().IsOwner { - resp.Resp, err = s.getRateLimit(req.Req) + getRateLimitCounter.WithLabelValues("local").Add(1) + resp.Resp, err = s.getRateLimit(ctx, req.Req) if err != nil { - resp.Resp = &RateLimitResp{ - Error: fmt.Sprintf("while applying rate limit for '%s' - '%s'", req.Key, err), - } + logrus. + WithError(err). + WithField("key", req.Key). + Error("Error applying rate limit") + err2 := errors.Wrapf(err, "Error in getRateLimit for '%s'", req.Key) + resp.Resp = &RateLimitResp{Error: err2.Error()} + ext.LogError(span, err2) } break } } // Make an RPC call to the peer that owns this rate limit + getRateLimitCounter.WithLabelValues("forward").Add(1) r, err := req.Peer.GetPeerRateLimit(ctx, req.Req) if err != nil { if IsNotReady(err) { attempts++ - req.Peer, err = s.GetPeer(req.Key) + asyncRequestRetriesCounter.WithLabelValues(req.Req.Name).Add(1) + req.Peer, err = s.GetPeer(ctx, req.Key) if err != nil { - resp.Resp = &RateLimitResp{ - Error: fmt.Sprintf("while finding peer that owns rate limit '%s' - '%s'", req.Key, err), - } + errPart := fmt.Sprintf("Error finding peer that owns rate limit '%s'", req.Key) + err2 := errors.Wrap(err, errPart) + logrus. + WithError(err). + WithField("key", req.Key).
+ Error(errPart) + countError(err, "Error in GetPeer") + ext.LogError(span, err2) + resp.Resp = &RateLimitResp{Error: err2.Error()} break } continue } - resp.Resp = &RateLimitResp{ - Error: fmt.Sprintf("while fetching rate limit '%s' from peer - '%s'", req.Key, err), - } + + errPart := fmt.Sprintf("Error while fetching rate limit '%s' from peer", req.Key) + err2 := errors.Wrap(err, errPart) + logrus. + WithError(err). + WithField("key", req.Key). + Error("Error fetching rate limit from peer") + // Not calling `countError()` because we expect the remote end to + // report this error. + ext.LogError(span, err2) + resp.Resp = &RateLimitResp{Error: err2.Error()} break } + // Inform the client of the owner of the key resp.Resp = r resp.Resp.Metadata = map[string]string{"owner": req.Peer.Info().GRPCAddress} @@ -264,20 +400,33 @@ func (s *V1Instance) asyncRequests(ctx context.Context, req *AsyncReq) { req.AsyncCh <- resp req.WG.Done() + + if isDeadlineExceeded(ctx.Err()) { + checkErrorCounter.WithLabelValues("Timeout forwarding to peer").Add(1) + } } // getGlobalRateLimit handles rate limits that are marked as `Behavior = GLOBAL`. Rate limit responses // are returned from the local cache and the hits are queued to be sent to the owning peer. -func (s *V1Instance) getGlobalRateLimit(req *RateLimitReq) (*RateLimitResp, error) { +func (s *V1Instance) getGlobalRateLimit(ctx context.Context, req *RateLimitReq) (*RateLimitResp, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + + funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.getGlobalRateLimit")) + defer funcTimer.ObserveDuration() // Queue the hit for async update after we have prepared our response. // NOTE: The defer here avoids a race condition where we queue the req to // be forwarded to the owning peer in a separate goroutine but simultaneously // access and possibly copy the req in this method. defer s.global.QueueHit(req) - s.conf.Cache.Lock() - item, ok := s.conf.Cache.GetItem(req.HashKey()) - s.conf.Cache.Unlock() + item, ok, err := s.gubernatorPool.GetCacheItem(ctx, req.HashKey()) + if err != nil { + countError(err, "Error in checkHandlerPool.GetCacheItem") + err2 := errors.Wrap(err, "Error in checkHandlerPool.GetCacheItem") + ext.LogError(span, err2) + return nil, err2 + } if ok { // Global rate limits are always stored as RateLimitResp regardless of algorithm rl, ok := item.Value.(*RateLimitResp) @@ -287,44 +436,68 @@ func (s *V1Instance) getGlobalRateLimit(req *RateLimitReq) (*RateLimitResp, erro // We get here if the owning node hasn't asynchronously forwarded its updates to us yet and // our cache still holds the rate limit we created on the first hit. } + cpy := proto.Clone(req).(*RateLimitReq) cpy.Behavior = Behavior_NO_BATCHING + // Process the rate limit like we own it - resp, err := s.getRateLimit(cpy) - return resp, err + getRateLimitCounter.WithLabelValues("global").Add(1) + resp, err := s.getRateLimit(ctx, cpy) + if err != nil { + err2 := errors.Wrap(err, "Error in getRateLimit") + ext.LogError(span, err2) + return nil, err2 + } + + return resp, nil } // UpdatePeerGlobals updates the local cache with a list of global rate limits. This method should only // be called by a peer who is the owner of a global rate limit.
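The NOTE inside getGlobalRateLimit deserves emphasis: the request is queued for asynchronous forwarding, so mutating it afterward would race with the sender goroutine; hence the proto.Clone (from google.golang.org/protobuf/proto) before Behavior is changed. The idiom in isolation, where checkLocally is a hypothetical stand-in for the local rate check:

```go
// Sketch of the clone-before-mutate idiom used by getGlobalRateLimit.
// checkLocally is a placeholder for the local rate check.
func globalCheck(g *globalManager, req *RateLimitReq) (*RateLimitResp, error) {
	// Queue the original request only when this function returns, so
	// nothing here races with the goroutine that forwards it.
	defer g.QueueHit(req)

	// Deep-copy before changing Behavior; the queued original must not
	// be mutated after it is handed off.
	cpy := proto.Clone(req).(*RateLimitReq)
	cpy.Behavior = Behavior_NO_BATCHING

	return checkLocally(cpy)
}
```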
func (s *V1Instance) UpdatePeerGlobals(ctx context.Context, r *UpdatePeerGlobalsReq) (*UpdatePeerGlobalsResp, error) { - s.conf.Cache.Lock() - defer s.conf.Cache.Unlock() + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() for _, g := range r.Globals { - s.conf.Cache.Add(&CacheItem{ + item := &CacheItem{ ExpireAt: g.Status.ResetTime, Algorithm: g.Algorithm, Value: g.Status, Key: g.Key, - }) + } + err := s.gubernatorPool.AddCacheItem(ctx, g.Key, item) + if err != nil { + return nil, errors.Wrap(err, "Error in checkHandlerPool.AddCacheItem") + } } + return &UpdatePeerGlobalsResp{}, nil } // GetPeerRateLimits is called by other peers to get the rate limits owned by this peer. func (s *V1Instance) GetPeerRateLimits(ctx context.Context, r *GetPeerRateLimitsReq) (*GetPeerRateLimitsResp, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + + span.SetTag("numRequests", len(r.Requests)) + var resp GetPeerRateLimitsResp if len(r.Requests) > maxBatchSize { - return nil, status.Errorf(codes.OutOfRange, - "'PeerRequest.rate_limits' list too large; max size is '%d'", maxBatchSize) + err2 := fmt.Errorf("'PeerRequest.rate_limits' list too large; max size is '%d'", maxBatchSize) + ext.LogError(span, err2) + checkErrorCounter.WithLabelValues("Request too large").Add(1) + return nil, status.Error(codes.OutOfRange, err2.Error()) } for _, req := range r.Requests { - rl, err := s.getRateLimit(req) + rl, err := s.getRateLimit(ctx, req) if err != nil { // Return the error for this request - rl = &RateLimitResp{Error: err.Error()} + err2 := errors.Wrap(err, "Error in getRateLimit") + ext.LogError(span, err2) + rl = &RateLimitResp{Error: err2.Error()} + // checkErrorCounter is updated within getRateLimit(). } resp.RateLimits = append(resp.RateLimits, rl) } @@ -333,9 +506,14 @@ func (s *V1Instance) GetPeerRateLimits(ctx context.Context, r *GetPeerRateLimits // HealthCheck Returns the health of our instance. 
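GetPeerRateLimits above also follows the error-handling convention that runs through this whole diff: wrap the error with context via pkg/errors, attach it to the active span with ext.LogError, and count it by type. Condensed into one hedged sketch, where doWork is a placeholder:

```go
// Sketch of the error-handling convention used across this change:
// wrap, record on the span, count, then return. doWork is a placeholder;
// tracing.StartSpan, ext.LogError, and countError are the helpers used above.
func doChecked(ctx context.Context) error {
	span, ctx := tracing.StartSpan(ctx)
	defer span.Finish()

	if err := doWork(ctx); err != nil {
		err2 := errors.Wrap(err, "Error in doWork")
		ext.LogError(span, err2)            // attach the error to the trace span
		countError(err2, "Error in doWork") // bump checkErrorCounter by type
		return err2
	}
	return nil
}
```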
func (s *V1Instance) HealthCheck(ctx context.Context, r *HealthCheckReq) (*HealthCheckResp, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + var errs []string s.peerMutex.RLock() + defer s.peerMutex.RUnlock() + tracing.LogInfo(span, "peerMutex.RLock()") // Iterate through local peers and get their last errors localPeers := s.conf.LocalPicker.Peers() @@ -344,7 +522,9 @@ func (s *V1Instance) HealthCheck(ctx context.Context, r *HealthCheckReq) (*Healt if lastErr != nil { for _, err := range lastErr { - errs = append(errs, err) + err2 := fmt.Errorf("Error returned from local peer.GetLastErr: %s", err) + ext.LogError(span, err2) + errs = append(errs, err2.Error()) } } } @@ -356,7 +536,9 @@ func (s *V1Instance) HealthCheck(ctx context.Context, r *HealthCheckReq) (*Healt if lastErr != nil { for _, err := range lastErr { - errs = append(errs, err) + err2 := fmt.Errorf("Error returned from region peer.GetLastErr: %s", err) + ext.LogError(span, err2) + errs = append(errs, err2.Error()) } } } @@ -371,29 +553,40 @@ func (s *V1Instance) HealthCheck(ctx context.Context, r *HealthCheckReq) (*Healt health.Message = strings.Join(errs, "|") } - defer s.peerMutex.RUnlock() + span.SetTag("health.peerCount", health.PeerCount) + span.SetTag("health.status", health.Status) + return &health, nil } -func (s *V1Instance) getRateLimit(r *RateLimitReq) (*RateLimitResp, error) { - s.conf.Cache.Lock() - defer s.conf.Cache.Unlock() +func (s *V1Instance) getRateLimit(ctx context.Context, r *RateLimitReq) (*RateLimitResp, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + span.SetTag("request.name", r.Name) + span.SetTag("request.key", r.UniqueKey) + span.SetTag("request.limit", r.Limit) + span.SetTag("request.duration", r.Duration) + + funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.getRateLimit")) + defer funcTimer.ObserveDuration() + checkCounter.Add(1) if HasBehavior(r.Behavior, Behavior_GLOBAL) { s.global.QueueUpdate(r) + tracing.LogInfo(span, "s.global.QueueUpdate(r)") } if HasBehavior(r.Behavior, Behavior_MULTI_REGION) { s.mutliRegion.QueueHits(r) + tracing.LogInfo(span, "s.mutliRegion.QueueHits(r)") } - switch r.Algorithm { - case Algorithm_TOKEN_BUCKET: - return tokenBucket(s.conf.Store, s.conf.Cache, r) - case Algorithm_LEAKY_BUCKET: - return leakyBucket(s.conf.Store, s.conf.Cache, r) + resp, err := s.gubernatorPool.GetRateLimit(ctx, r) + if isDeadlineExceeded(err) { + checkErrorCounter.WithLabelValues("Timeout").Add(1) } - return nil, errors.Errorf("invalid rate limit algorithm '%d'", r.Algorithm) + + return resp, err } // SetPeers is called by the implementor to indicate the pool of peers has changed @@ -429,6 +622,7 @@ func (s *V1Instance) SetPeers(peerInfo []PeerInfo) { } s.peerMutex.Lock() + // Replace our current pickers oldLocalPicker := s.conf.LocalPicker oldRegionPicker := s.conf.RegionPicker @@ -439,7 +633,7 @@ func (s *V1Instance) SetPeers(peerInfo []PeerInfo) { s.log.WithField("peers", peerInfo).Debug("peers updated") // Shutdown any old peers we no longer need - ctx, cancel := context.WithTimeout(context.Background(), s.conf.Behaviors.BatchTimeout) + ctx, cancel := tracing.ContextWithTimeout(context.Background(), s.conf.Behaviors.BatchTimeout) defer cancel() var shutdownPeers []*PeerClient @@ -480,14 +674,31 @@ func (s *V1Instance) SetPeers(peerInfo []PeerInfo) { } // GetPeer returns a peer client for the hash key provided -func (s *V1Instance) GetPeer(key string) (*PeerClient, error) { +func (s *V1Instance) GetPeer(ctx 
context.Context, key string) (*PeerClient, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + + funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.GetPeer")) + defer funcTimer.ObserveDuration() + lockTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("V1Instance.GetPeer_RLock")) + + if ctx.Err() != nil { + ext.LogError(span, ctx.Err()) + return nil, ctx.Err() + } + s.peerMutex.RLock() + defer s.peerMutex.RUnlock() + tracing.LogInfo(span, "peerMutex.RLock()") + lockTimer.ObserveDuration() + peer, err := s.conf.LocalPicker.Get(key) if err != nil { - s.peerMutex.RUnlock() - return nil, err + err2 := errors.Wrap(err, "Error in conf.LocalPicker.Get") + ext.LogError(span, err2) + return nil, err2 } - s.peerMutex.RUnlock() + return peer, nil } @@ -507,12 +718,30 @@ func (s *V1Instance) GetRegionPickers() map[string]PeerPicker { func (s *V1Instance) Describe(ch chan<- *prometheus.Desc) { ch <- s.global.asyncMetrics.Desc() ch <- s.global.broadcastMetrics.Desc() + getRateLimitCounter.Describe(ch) + funcTimeMetric.Describe(ch) + asyncRequestRetriesCounter.Describe(ch) + queueLengthMetric.Describe(ch) + concurrentChecksMetric.Describe(ch) + checkErrorCounter.Describe(ch) + overLimitCounter.Describe(ch) + checkCounter.Describe(ch) + poolWorkerQueueLength.Describe(ch) } // Collect fetches metrics from the server for use by prometheus func (s *V1Instance) Collect(ch chan<- prometheus.Metric) { ch <- s.global.asyncMetrics ch <- s.global.broadcastMetrics + getRateLimitCounter.Collect(ch) + funcTimeMetric.Collect(ch) + asyncRequestRetriesCounter.Collect(ch) + queueLengthMetric.Collect(ch) + concurrentChecksMetric.Collect(ch) + checkErrorCounter.Collect(ch) + overLimitCounter.Collect(ch) + checkCounter.Collect(ch) + poolWorkerQueueLength.Collect(ch) } // HasBehavior returns true if the provided behavior is set @@ -529,3 +758,33 @@ func SetBehavior(b *Behavior, flag Behavior, set bool) { *b &= mask } } + +// Count an error type in the checkErrorCounter metric. +// Recurse into wrapped errors if necessary. +func countError(err error, defaultType string) { + for { + if err == nil { + checkErrorCounter.WithLabelValues(defaultType).Add(1) + return + } + if errors.Is(err, context.DeadlineExceeded) { + checkErrorCounter.WithLabelValues("Timeout").Add(1) + return + } + + err = errors.Unwrap(err) + } +} + +func isDeadlineExceeded(err error) bool { + for { + if err == nil { + return false + } + if errors.Is(err, context.DeadlineExceeded) { + return true + } + + err = errors.Unwrap(err) + } +} diff --git a/gubernator.pb.go b/gubernator.pb.go index f8795c2e..36576b74 100644 --- a/gubernator.pb.go +++ b/gubernator.pb.go @@ -1,5 +1,5 @@ // -//Copyright 2018-2019 Mailgun Technologies Inc +//Copyright 2018-2022 Mailgun Technologies Inc // //Licensed under the Apache License, Version 2.0 (the "License"); //you may not use this file except in compliance with the License. diff --git a/gubernator_pool.go b/gubernator_pool.go new file mode 100644 index 00000000..2ffe9665 --- /dev/null +++ b/gubernator_pool.go @@ -0,0 +1,652 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gubernator + +// Threadsafe worker pool for handling concurrent Gubernator requests. +// Ensures requests are synchronized to avoid caching conflicts. +// Handle concurrent requests by sharding cache key space across multiple +// workers. +// Uses hash ring design pattern to distribute requests to an assigned worker. +// No mutex locking necessary because each worker has its own data space and +// processes requests sequentially. +// +// Request workflow: +// - A 63-bit hash is generated from an incoming request by its Key/Name +// values. (Actually 64 bit, but we toss out one bit to properly calculate +// the next step.) +// - Workers are assigned equal size hash ranges. The worker is selected by +// choosing the worker index associated with that linear hash value range. +// - The worker has command channels for each method call. The request is +// enqueued to the appropriate channel. +// - The worker pulls the request from the appropriate channel and executes the +// business logic for that method. Then, it sends a response back using the +// requester's provided response channel. + +import ( + "context" + "fmt" + "io" + "strconv" + "sync" + "sync/atomic" + + "github.com/OneOfOne/xxhash" + "github.com/mailgun/gubernator/v2/tracing" + "github.com/mailgun/holster/v4/setter" + "github.com/opentracing/opentracing-go/ext" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +type GubernatorPool struct { + workers []*poolWorker + workerCacheSize int + hasher ipoolHasher + hashRingStep uint64 + conf *Config + done chan struct{} +} + +type poolWorker struct { + name string + conf *Config + cache Cache + getRateLimitRequest chan *request + storeRequest chan poolStoreRequest + loadRequest chan poolLoadRequest + addCacheItemRequest chan poolAddCacheItemRequest + getCacheItemRequest chan poolGetCacheItemRequest +} + +type ipoolHasher interface { + // Return a 63-bit hash derived from input. + ComputeHash63(input string) uint64 +} + +// Standard implementation of ipoolHasher. +type poolHasher struct { +} + +// Method request/response structs. +type poolStoreRequest struct { + ctx context.Context + response chan poolStoreResponse + out chan<- *CacheItem +} + +type poolStoreResponse struct{} + +type poolLoadRequest struct { + ctx context.Context + response chan poolLoadResponse + in <-chan *CacheItem +} + +type poolLoadResponse struct{} + +type poolAddCacheItemRequest struct { + ctx context.Context + response chan poolAddCacheItemResponse + item *CacheItem +} + +type poolAddCacheItemResponse struct { + exists bool +} + +type poolGetCacheItemRequest struct { + ctx context.Context + response chan poolGetCacheItemResponse + key string +} + +type poolGetCacheItemResponse struct { + item *CacheItem + ok bool +} + +var _ io.Closer = &GubernatorPool{} +var _ ipoolHasher = &poolHasher{} + +var poolWorkerCounter int64 + +func NewGubernatorPool(conf *Config, concurrency int, cacheSize int) *GubernatorPool { + setter.SetDefault(&cacheSize, 50_000) + + // Compute hashRingStep as interval between workers' 63-bit hash ranges. 
+ // The 64-bit value 1<<63 sits just past the top of the 63-bit hash space, so dividing it by the worker count yields the step size. + chp := &GubernatorPool{ + workers: make([]*poolWorker, concurrency), + workerCacheSize: cacheSize / concurrency, + hasher: newPoolHasher(), + hashRingStep: uint64(1 << 63) / uint64(concurrency), + conf: conf, + done: make(chan struct{}), + } + + // Create workers. + for i := 0; i < concurrency; i++ { + chp.workers[i] = chp.newWorker() + go chp.worker(chp.workers[i]) + } + + return chp +} + +func newPoolHasher() *poolHasher { + return &poolHasher{} +} + +func (ph *poolHasher) ComputeHash63(input string) uint64 { + return xxhash.ChecksumString64S(input, 0) >> 1 +} + +func (chp *GubernatorPool) Close() error { + close(chp.done) + return nil +} + +// Create a new pool worker instance. +func (chp *GubernatorPool) newWorker() *poolWorker { + const commandChannelSize = 10000 + + worker := &poolWorker{ + cache: chp.conf.CacheFactory(chp.workerCacheSize), + getRateLimitRequest: make(chan *request, commandChannelSize), + storeRequest: make(chan poolStoreRequest, commandChannelSize), + loadRequest: make(chan poolLoadRequest, commandChannelSize), + addCacheItemRequest: make(chan poolAddCacheItemRequest, commandChannelSize), + getCacheItemRequest: make(chan poolGetCacheItemRequest, commandChannelSize), + } + workerNumber := atomic.AddInt64(&poolWorkerCounter, 1) - 1 + worker.name = strconv.FormatInt(workerNumber, 10) + return worker +} + +// Returns the worker assigned to the key. +// Hash the key, then look up the hash ring to find the assigned worker. +func (chp *GubernatorPool) getWorker(key string) *poolWorker { + hash := chp.hasher.ComputeHash63(key) + idx := hash / chp.hashRingStep + return chp.workers[idx] +} + +// Pool worker for processing Gubernator requests. +// Each worker maintains its own state. +// A hash ring will distribute requests to an assigned worker by key. +// See: getWorker() +func (chp *GubernatorPool) worker(worker *poolWorker) { + for { + // Dispatch requests from each channel. + select { + case req, ok := <-worker.getRateLimitRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("checkHandlerPool worker stopped because channel closed") + return + } + + chp.handleGetRateLimit(req, worker.cache) + + case req, ok := <-worker.storeRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("checkHandlerPool worker stopped because channel closed") + return + } + + chp.handleStore(req, worker.cache) + + case req, ok := <-worker.loadRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("checkHandlerPool worker stopped because channel closed") + return + } + + chp.handleLoad(req, worker.cache) + + case req, ok := <-worker.addCacheItemRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("checkHandlerPool worker stopped because channel closed") + return + } + + chp.handleAddCacheItem(req, worker.cache) + + case req, ok := <-worker.getCacheItemRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("checkHandlerPool worker stopped because channel closed") + return + } + + chp.handleGetCacheItem(req, worker.cache) + + case <-chp.done: + // Clean up. + return + } + } +} + +// Send a GetRateLimit request to the worker pool.
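The worker-selection math in getWorker is easy to read past: the 64-bit xxhash checksum is shifted right once to get a 63-bit value, and integer division by the precomputed step maps that value to a worker index with no locking. A standalone sketch of the same scheme; the clamp at the end is a defensive guard (an addition of this sketch, not part of the code above) for the rounding edge where a hash near the top of the range could index one past the last worker when 1<<63 is not evenly divisible by the worker count:

```go
// Standalone sketch of the hash-ring worker selection used by getWorker.
package ring

import "github.com/OneOfOne/xxhash"

type Ring struct {
	workers int
	step    uint64 // width of each worker's hash range: (1 << 63) / workers
}

func New(workers int) *Ring {
	return &Ring{workers: workers, step: uint64(1<<63) / uint64(workers)}
}

// WorkerFor maps a key to a worker index in O(1), with no mutex.
func (r *Ring) WorkerFor(key string) int {
	hash := xxhash.ChecksumString64S(key, 0) >> 1 // drop one bit: 63-bit hash
	idx := int(hash / r.step)
	if idx >= r.workers { // guard the integer-division rounding edge
		idx = r.workers - 1
	}
	return idx
}
```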
+func (chp *GubernatorPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) (*RateLimitResp, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + + // Delegate request to assigned channel based on request key. + worker := chp.getWorker(rlRequest.UniqueKey) + handlerRequest := &request{ + ctx: ctx, + resp: make(chan *response, 1), + request: rlRequest, + } + + // Send request. + tracing.LogInfo(span, "Sending request...", "channelLength", len(worker.getRateLimitRequest)) + select { + case worker.getRateLimitRequest <- handlerRequest: + // Successfully sent request. + case <-ctx.Done(): + ext.LogError(span, ctx.Err()) + return nil, ctx.Err() + } + + poolWorkerQueueLength.WithLabelValues("GetRateLimit", worker.name).Observe(float64(len(worker.getRateLimitRequest))) + + // Wait for response. + tracing.LogInfo(span, "Waiting for response...") + select { + case handlerResponse := <-handlerRequest.resp: + // Successfully read response. + return handlerResponse.rl, handlerResponse.err + case <-ctx.Done(): + ext.LogError(span, ctx.Err()) + return nil, ctx.Err() + } +} + +// Handle request received by worker. +func (chp *GubernatorPool) handleGetRateLimit(handlerRequest *request, cache Cache) { + span, ctx := tracing.StartSpan(handlerRequest.ctx) + defer span.Finish() + + var rlResponse *RateLimitResp + var err error + + switch handlerRequest.request.Algorithm { + case Algorithm_TOKEN_BUCKET: + rlResponse, err = tokenBucket(ctx, chp.conf.Store, cache, handlerRequest.request) + if err != nil { + msg := "Error in tokenBucket" + countError(err, msg) + err = errors.Wrap(err, msg) + ext.LogError(span, err) + } + + case Algorithm_LEAKY_BUCKET: + rlResponse, err = leakyBucket(ctx, chp.conf.Store, cache, handlerRequest.request) + if err != nil { + msg := "Error in leakyBucket" + countError(err, msg) + err = errors.Wrap(err, msg) + ext.LogError(span, err) + } + + default: + err = errors.Errorf("Invalid rate limit algorithm '%d'", handlerRequest.request.Algorithm) + ext.LogError(span, err) + checkErrorCounter.WithLabelValues("Invalid algorithm").Add(1) + } + + handlerResponse := &response{ + rl: rlResponse, + err: err, + } + + select { + case handlerRequest.resp <- handlerResponse: + // Success. + + case <-ctx.Done(): + // Context canceled. + ext.LogError(span, ctx.Err()) + } +} + +// Atomically load cache from persistent storage. +// Read from persistent storage. Load into each appropriate worker's cache. +// Workers are locked during this load operation to prevent race conditions. +func (chp *GubernatorPool) Load(ctx context.Context) error { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + + ch, err := chp.conf.Loader.Load() + if err != nil { + return errors.Wrap(err, "Error in loader.Load") + } + + type loadChannel struct { + ch chan *CacheItem + worker *poolWorker + respChan chan poolLoadResponse + } + + // Map each worker to its load channel. + loadChMap := map[*poolWorker]loadChannel{} + + // Send each item to its assigned worker's cache. +mainloop: + for { + var item *CacheItem + var ok bool + + select { + case item, ok = <-ch: + if !ok { + break mainloop + } + // Successfully received item. + + case <-ctx.Done(): + // Context canceled. + return ctx.Err() + } + + worker := chp.getWorker(item.Key) + + // Initiate a load channel with each worker.
+ loadCh, exist := loadChMap[worker] + if !exist { + loadCh = loadChannel{ + ch: make(chan *CacheItem), + worker: worker, + respChan: make(chan poolLoadResponse), + } + loadChMap[worker] = loadCh + + // Tie up the worker while loading. + worker.loadRequest <- poolLoadRequest{ + ctx: ctx, + response: loadCh.respChan, + in: loadCh.ch, + } + } + + // Send item to worker's load channel. + select { + case loadCh.ch <- item: + // Successfully sent item. + + case <-ctx.Done(): + // Context canceled. + return ctx.Err() + } + } + + // Clean up. + for _, loadCh := range loadChMap { + close(loadCh.ch) + + // Load response confirms all items have been loaded and the worker + // resumes normal operation. + select { + case <-loadCh.respChan: + // Successfully received response. + + case <-ctx.Done(): + // Context canceled. + return ctx.Err() + } + } + + return nil +} + +func (chp *GubernatorPool) handleLoad(request poolLoadRequest, cache Cache) { + span, ctx := tracing.StartSpan(request.ctx) + defer span.Finish() + +mainloop: + for { + var item *CacheItem + var ok bool + + select { + case item, ok = <-request.in: + if !ok { + break mainloop + } + // Successfully received item. + + case <-ctx.Done(): + // Context canceled. + return + } + + cache.Add(item) + } + + response := poolLoadResponse{} + + select { + case request.response <- response: + // Successfully sent response. + + case <-ctx.Done(): + // Context canceled. + ext.LogError(span, ctx.Err()) + } +} + +// Atomically store cache to persistent storage. +// Save all workers' caches to persistent storage. +// Workers are locked during this store operation to prevent race conditions. +func (chp *GubernatorPool) Store(ctx context.Context) error { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + + var wg sync.WaitGroup + out := make(chan *CacheItem, 500) + + // Iterate each worker's cache to `out` channel. + for _, worker := range chp.workers { + wg.Add(1) + + go func(worker *poolWorker) { + span2, ctx2 := tracing.StartNamedSpan(ctx, fmt.Sprintf("%p", worker)) + defer span2.Finish() + defer wg.Done() + + respChan := make(chan poolStoreResponse) + req := poolStoreRequest{ + ctx: ctx2, + response: respChan, + out: out, + } + + select { + case worker.storeRequest <- req: + // Successfully sent request. + select { + case <-respChan: + // Successfully received response. + return + + case <-ctx2.Done(): + // Context canceled. + ext.LogError(span2, ctx2.Err()) + return + } + + case <-ctx2.Done(): + // Context canceled. + ext.LogError(span2, ctx2.Err()) + return + } + }(worker) + } + + // When all iterators are done, close `out` channel. + go func() { + wg.Wait() + close(out) + }() + + if ctx.Err() != nil { + ext.LogError(span, ctx.Err()) + return ctx.Err() + } + + return chp.conf.Loader.Save(out) +} + +func (chp *GubernatorPool) handleStore(request poolStoreRequest, cache Cache) { + span, ctx := tracing.StartSpan(request.ctx) + defer span.Finish() + + for item := range cache.Each() { + select { + case request.out <- item: + // Successfully sent item. + + case <-ctx.Done(): + // Context canceled. + ext.LogError(span, ctx.Err()) + return + } + } + + response := poolStoreResponse{} + + select { + case request.response <- response: + // Successfully sent response. + + case <-ctx.Done(): + // Context canceled. + ext.LogError(span, ctx.Err()) + } +} + +// Add to worker's cache. 
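+// The item is routed to the worker that owns the key's hash slot so that all
+// writes for that key stay on a single goroutine.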
+func (chp *GubernatorPool) AddCacheItem(ctx context.Context, key string, item *CacheItem) error {
+	span, ctx := tracing.StartSpan(ctx)
+	defer span.Finish()
+
+	respChan := make(chan poolAddCacheItemResponse)
+	worker := chp.getWorker(key)
+	req := poolAddCacheItemRequest{
+		ctx:      ctx,
+		response: respChan,
+		item:     item,
+	}
+
+	select {
+	case worker.addCacheItemRequest <- req:
+		// Successfully sent request.
+		poolWorkerQueueLength.WithLabelValues("AddCacheItem", worker.name).Observe(float64(len(worker.addCacheItemRequest)))
+
+		select {
+		case <-respChan:
+			// Successfully received response.
+			return nil
+
+		case <-ctx.Done():
+			// Context canceled.
+			ext.LogError(span, ctx.Err())
+			return ctx.Err()
+		}
+
+	case <-ctx.Done():
+		// Context canceled.
+		ext.LogError(span, ctx.Err())
+		return ctx.Err()
+	}
+}
+
+func (chp *GubernatorPool) handleAddCacheItem(request poolAddCacheItemRequest, cache Cache) {
+	span, ctx := tracing.StartSpan(request.ctx)
+	defer span.Finish()
+
+	exists := cache.Add(request.item)
+	response := poolAddCacheItemResponse{exists}
+
+	select {
+	case request.response <- response:
+		// Successfully sent response.
+
+	case <-ctx.Done():
+		// Context canceled.
+		ext.LogError(span, ctx.Err())
+	}
+}
+
+// Get item from worker's cache.
+func (chp *GubernatorPool) GetCacheItem(ctx context.Context, key string) (*CacheItem, bool, error) {
+	span, ctx := tracing.StartSpan(ctx)
+	defer span.Finish()
+
+	respChan := make(chan poolGetCacheItemResponse)
+	worker := chp.getWorker(key)
+	req := poolGetCacheItemRequest{
+		ctx:      ctx,
+		response: respChan,
+		key:      key,
+	}
+
+	select {
+	case worker.getCacheItemRequest <- req:
+		// Successfully sent request.
+		poolWorkerQueueLength.WithLabelValues("GetCacheItem", worker.name).Observe(float64(len(worker.getCacheItemRequest)))
+
+		select {
+		case resp := <-respChan:
+			// Successfully received response.
+			return resp.item, resp.ok, nil
+
+		case <-ctx.Done():
+			// Context canceled.
+			ext.LogError(span, ctx.Err())
+			return nil, false, ctx.Err()
+		}
+
+	case <-ctx.Done():
+		// Context canceled.
+		ext.LogError(span, ctx.Err())
+		return nil, false, ctx.Err()
+	}
+}
+
+func (chp *GubernatorPool) handleGetCacheItem(request poolGetCacheItemRequest, cache Cache) {
+	span, ctx := tracing.StartSpan(request.ctx)
+	defer span.Finish()
+
+	item, ok := cache.GetItem(request.key)
+	response := poolGetCacheItemResponse{item, ok}
+
+	select {
+	case request.response <- response:
+		// Successfully sent response.
+
+	case <-ctx.Done():
+		// Context canceled.
+		ext.LogError(span, ctx.Err())
+	}
+}
diff --git a/gubernator_pool_internal_test.go b/gubernator_pool_internal_test.go
new file mode 100644
index 00000000..dfa2cd49
--- /dev/null
+++ b/gubernator_pool_internal_test.go
@@ -0,0 +1,83 @@
+/*
+Copyright 2018-2022 Mailgun Technologies Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package gubernator + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" +) + +type MockPoolHasher struct { + mock.Mock +} + +func (m *MockPoolHasher) ComputeHash63(input string) uint64 { + args := m.Called(input) + retval, _ := args.Get(0).(uint64) + return retval +} + +func TestGubernatorPoolInternal(t *testing.T) { + t.Run("getWorker()", func(t *testing.T) { + const concurrency = 32 + const cacheSize = 1000 + conf := &Config{} + conf.SetDefaults() + + // Test that getWorker() interpolates the hash to find the expected worker. + testCases := []struct{ + Name string + Hash uint64 + ExpectedIdx int + }{ + {"Hash 0%", 0, 0}, + {"Hash 50%", 0x3fff_ffff_ffff_ffff, (concurrency / 2) - 1}, + {"Hash 50% + 1", 0x4000_0000_0000_0000, (concurrency / 2)}, + {"Hash 100%", 0x7fff_ffff_ffff_ffff, concurrency - 1}, + } + + for _, testCase := range testCases { + t.Run(testCase.Name, func(t *testing.T) { + pool := NewGubernatorPool(conf, concurrency, cacheSize) + defer pool.Close() + mockHasher := &MockPoolHasher{} + pool.hasher = mockHasher + + // Setup mocks. + mockHasher.On("ComputeHash63", mock.Anything).Once().Return(testCase.Hash) + + // Call code. + worker := pool.getWorker("Foobar") + + // Verify + require.NotNil(t, worker) + + var actualIdx int + for ; actualIdx < len(pool.workers); actualIdx++ { + if pool.workers[actualIdx] == worker { + break + } + } + assert.Equal(t, testCase.ExpectedIdx, actualIdx) + mockHasher.AssertExpectations(t) + }) + } + }) +} diff --git a/gubernator_pool_test.go b/gubernator_pool_test.go new file mode 100644 index 00000000..782a3848 --- /dev/null +++ b/gubernator_pool_test.go @@ -0,0 +1,132 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gubernator_test + +import ( + "context" + "fmt" + "sort" + "testing" + + guber "github.com/mailgun/gubernator/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" +) + +func TestGubernatorPool(t *testing.T) { + ctx := context.Background() + + testCases := []struct { + name string + concurrency int + }{ + {"Single-threaded", 1}, + {"Multi-threaded", 4}, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + // Setup mock data. + const NumCacheItems = 100 + cacheItems := []*guber.CacheItem{} + for i := 0; i < NumCacheItems; i++ { + cacheItems = append(cacheItems, &guber.CacheItem{ + Key: fmt.Sprintf("Foobar%04d", i), + Value: fmt.Sprintf("Stuff%04d", i), + ExpireAt: 4131978658000, + }) + } + + t.Run("Load()", func(t *testing.T) { + mockLoader := &MockLoader2{} + mockCache := &MockCache{} + conf := &guber.Config{ + CacheFactory: func(maxSize int) guber.Cache { + return mockCache + }, + Loader: mockLoader, + } + conf.SetDefaults() + chp := guber.NewGubernatorPool(conf, testCase.concurrency, 0) + + // Mock Loader. 
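+				// Simulate Loader.Load() returning a pre-filled, closed channel of items.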
+				fakeLoadCh := make(chan *guber.CacheItem, NumCacheItems)
+				for _, item := range cacheItems {
+					fakeLoadCh <- item
+				}
+				close(fakeLoadCh)
+				mockLoader.On("Load").Once().Return(fakeLoadCh, nil)
+
+				// Mock Cache.
+				for _, item := range cacheItems {
+					mockCache.On("Add", item).Once().Return(false)
+				}
+
+				// Call code.
+				err := chp.Load(ctx)
+
+				// Verify.
+				require.NoError(t, err, "Error in chp.Load")
+			})
+
+			t.Run("Store()", func(t *testing.T) {
+				mockLoader := &MockLoader2{}
+				mockCache := &MockCache{}
+				conf := &guber.Config{
+					CacheFactory: func(maxSize int) guber.Cache {
+						return mockCache
+					},
+					Loader: mockLoader,
+				}
+				conf.SetDefaults()
+				chp := guber.NewGubernatorPool(conf, testCase.concurrency, 0)
+
+				// Mock Loader.
+				mockLoader.On("Save", mock.Anything).Once().Return(nil).
+					Run(func(args mock.Arguments) {
+						// Verify items sent over the channel passed to Save().
+						saveCh := args.Get(0).(chan *guber.CacheItem)
+						savedItems := []*guber.CacheItem{}
+						for item := range saveCh {
+							savedItems = append(savedItems, item)
+						}
+
+						// Verify saved result.
+						sort.Slice(savedItems, func(a, b int) bool {
+							return savedItems[a].Key < savedItems[b].Key
+						})
+						assert.Equal(t, cacheItems, savedItems)
+					})
+
+				// Mock Cache.
+				eachCh := make(chan *guber.CacheItem, NumCacheItems)
+				for _, item := range cacheItems {
+					eachCh <- item
+				}
+				close(eachCh)
+				mockCache.On("Each").Times(testCase.concurrency).Return(eachCh)
+
+				// Call code.
+				err := chp.Store(ctx)
+
+				// Verify.
+				require.NoError(t, err, "Error in chp.Store")
+			})
+		})
+	}
+}
diff --git a/interval.go b/interval.go
index 6ecc8d40..14868d4c 100644
--- a/interval.go
+++ b/interval.go
@@ -1,5 +1,5 @@
 /*
-Copyright 2018-2019 Mailgun Technologies Inc
+Copyright 2018-2022 Mailgun Technologies Inc
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/interval_test.go b/interval_test.go
index dc9b0ec5..05ab6b36 100644
--- a/interval_test.go
+++ b/interval_test.go
@@ -1,5 +1,5 @@
 /*
-Copyright 2018-2019 Mailgun Technologies Inc
+Copyright 2018-2022 Mailgun Technologies Inc
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/jaegertracing.md b/jaegertracing.md
new file mode 100644
index 00000000..7b67f60e
--- /dev/null
+++ b/jaegertracing.md
@@ -0,0 +1,79 @@
+# Jaeger Tracing
+Gubernator supports [OpenTracing](https://opentracing.io) for generating
+detailed traces of server behavior using [Jaeger
+Tracing](https://www.jaegertracing.io/) tools.
+
+## Enabling Jaeger
+Jaeger is enabled by default and sends traces to localhost port 6831/udp.
+
+Configure with environment variables, such as:
+
+| Name | Description |
+| ---------------------- | ----------- |
+| `JAEGER_SERVICE_NAME` | Service name. |
+| `JAEGER_AGENT_HOST` | Jaeger server hostname or IP. |
+| `JAEGER_SAMPLER_TYPE` | The sampler type: `remote`, `const`, `probabilistic`, or `ratelimiting`. |
+| `JAEGER_SAMPLER_PARAM` | The sampler parameter. |
+| `JAEGER_DISABLED` | Set to `true` to disable sending Jaeger traces. |
+
+See also the [full list of variables](https://github.com/jaegertracing/jaeger-client-go#environment-variables).
+
+## Sampling
+Because Gubernator generates a trace for each request, it is recommended to use
+the `probabilistic` or `ratelimiting` [sampler
+type](https://www.jaegertracing.io/docs/1.30/sampling/) to reduce the volume of
+data sent to your Jaeger server.
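+
+For example, a tracer whose sampler is configured entirely from the `JAEGER_*`
+environment variables can be initialized with the `jaeger-client-go` config
+package. This is a minimal sketch, not Gubernator's own startup code, and the
+`initTracing` helper name is hypothetical:
+
+```go
+	import (
+		"io"
+
+		jaegercfg "github.com/uber/jaeger-client-go/config"
+	)
+
+	// initTracing builds a tracer from the JAEGER_* environment variables,
+	// e.g. JAEGER_SAMPLER_TYPE=probabilistic JAEGER_SAMPLER_PARAM=0.01 keeps
+	// roughly 1% of traces. Close the returned Closer on shutdown to flush
+	// any buffered spans.
+	func initTracing() (io.Closer, error) {
+		cfg, err := jaegercfg.FromEnv()
+		if err != nil {
+			return nil, err
+		}
+		return cfg.InitGlobalTracer("gubernator")
+	}
+```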
+
+## Distributed Traces
+OpenTracing defines capabilities for clients to send trace ids to downstream
+services. The downstream service links the client span with the server span.
+When the client and server both send traces to the same Jaeger server, the
+trace will appear with the two spans linked in the same view.
+
+See `tracing/tracing.go` for usage examples.
+
+### Gubernator Standalone
+When deployed as a standalone daemon, Gubernator's gRPC service will receive
+embedded trace ids in requests from the client's `context` object.
+
+For this to work, the client must be configured to embed tracing ids.
+
+#### gRPC
+If using Gubernator's Golang gRPC client, the client must be created like so:
+
+```go
+	import (
+		"github.com/opentracing/opentracing-go"
+		otgrpc "github.com/opentracing-contrib/go-grpc"
+		"google.golang.org/grpc"
+	)
+
+	// ...
+
+	tracer := opentracing.GlobalTracer()
+	tracingUnaryInterceptor := otgrpc.OpenTracingClientInterceptor(tracer)
+	tracingStreamInterceptor := otgrpc.OpenTracingStreamClientInterceptor(tracer)
+
+	opts := []grpc.DialOption{
+		grpc.WithBlock(),
+		grpc.WithUnaryInterceptor(tracingUnaryInterceptor),
+		grpc.WithStreamInterceptor(tracingStreamInterceptor),
+	}
+
+	endpoint := ""
+	conn, err := grpc.DialContext(ctx, endpoint, opts...)
+```
+
+#### HTTP
+If using HTTP, the tracing ids must be embedded in HTTP headers. This is
+typically done using Jaeger client functionality.
+
+See: https://medium.com/opentracing/distributed-tracing-in-10-minutes-51b378ee40f1
+
+### Gubernator Module
+When embedded into a dependent codebase as a Go module, nearly all Gubernator
+functions create spans linked to the trace ids embedded into the `context`
+object.
+
+Follow the same steps to configure your codebase as the Gubernator standalone,
+above.
diff --git a/kubernetes.go b/kubernetes.go
index 13df2e76..a753bfc5 100644
--- a/kubernetes.go
+++ b/kubernetes.go
@@ -1,5 +1,5 @@
 /*
-Copyright 2018-2019 Mailgun Technologies Inc
+Copyright 2018-2022 Mailgun Technologies Inc
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/kubernetesconfig.go b/kubernetesconfig.go
index bcb8007d..d9f49c3d 100644
--- a/kubernetesconfig.go
+++ b/kubernetesconfig.go
@@ -1,3 +1,4 @@
+//go:build !local
 // +build !local
 
 package gubernator
diff --git a/kubernetesconfig_local.go b/kubernetesconfig_local.go
index ab808108..979c5bb6 100644
--- a/kubernetesconfig_local.go
+++ b/kubernetesconfig_local.go
@@ -1,3 +1,4 @@
+//go:build local
 // +build local
 
 package gubernator
diff --git a/logging/logging.go b/logging/logging.go
index 3bc0a9df..76b25828 100644
--- a/logging/logging.go
+++ b/logging/logging.go
@@ -1,5 +1,5 @@
 /*
-Copyright 2018-2019 Mailgun Technologies Inc
+Copyright 2018-2022 Mailgun Technologies Inc
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ package logging
 
 import (
 	"encoding/json"
+
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
 )
diff --git a/lrucache.go b/lrucache.go
new file mode 100644
index 00000000..c639b5c5
--- /dev/null
+++ b/lrucache.go
@@ -0,0 +1,212 @@
+/*
+Modifications Copyright 2018-2022 Mailgun Technologies Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+This work is derived from github.com/golang/groupcache/lru
+*/
+
+package gubernator
+
+import (
+	"container/list"
+	"sync/atomic"
+
+	"github.com/mailgun/holster/v4/clock"
+	"github.com/mailgun/holster/v4/setter"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// LRUCache is an LRU cache that supports expiration.
+// Not thread-safe. Be sure to use a mutex to prevent concurrent method calls.
+type LRUCache struct {
+	cache     map[string]*list.Element
+	ll        *list.List
+	cacheSize int
+	cacheLen  int64
+}
+
+// LRUCacheCollector is a Prometheus metrics collector for LRUCache.
+// Register only one collector; add one or more caches to this collector.
+type LRUCacheCollector struct {
+	caches []Cache
+}
+
+var _ Cache = &LRUCache{}
+var _ prometheus.Collector = &LRUCacheCollector{}
+
+var sizeMetric = prometheus.NewGauge(prometheus.GaugeOpts{
+	Name: "gubernator_cache_size",
+	Help: "The number of items in LRU Cache which holds the rate limits.",
+})
+var accessMetric = prometheus.NewCounterVec(prometheus.CounterOpts{
+	Name: "gubernator_cache_access_count",
+	Help: "Cache access counts. Label \"type\" = hit|miss.",
+}, []string{"type"})
+
+// NewLRUCache creates a new LRUCache with a maximum size.
+func NewLRUCache(maxSize int) *LRUCache {
+	setter.SetDefault(&maxSize, 50_000)
+
+	return &LRUCache{
+		cache:     make(map[string]*list.Element),
+		ll:        list.New(),
+		cacheSize: maxSize,
+	}
+}
+
+// FIXME: Not threadsafe. Each() maintains a goroutine that iterates.
+// Cannot safely access other Cache methods while iterating.
+// It would be safer if this were done using an iterator or delegate pattern
+// that doesn't require a goroutine.
+// May need to reassess functional requirements.
+func (c *LRUCache) Each() chan *CacheItem {
+	out := make(chan *CacheItem)
+	go func() {
+		for _, ele := range c.cache {
+			out <- ele.Value.(*CacheItem)
+		}
+		close(out)
+	}()
+	return out
+}
+
+// Add adds a value to the cache.
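+// Returns true if the key already existed (its value is replaced and the
+// entry is moved to the front of the LRU list), false if it was newly added.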
+func (c *LRUCache) Add(item *CacheItem) bool {
+	// If the key already exists, set the new value
+	if ee, ok := c.cache[item.Key]; ok {
+		c.ll.MoveToFront(ee)
+		ee.Value = item
+		return true
+	}
+
+	ele := c.ll.PushFront(item)
+	c.cache[item.Key] = ele
+	if c.cacheSize != 0 && c.ll.Len() > c.cacheSize {
+		c.removeOldest()
+	}
+	atomic.StoreInt64(&c.cacheLen, int64(c.ll.Len()))
+	return false
+}
+
+// MillisecondNow returns the current unix epoch in milliseconds.
+func MillisecondNow() int64 {
+	return clock.Now().UnixNano() / 1000000
+}
+
+// GetItem returns the item stored in the cache
+func (c *LRUCache) GetItem(key string) (item *CacheItem, ok bool) {
+	if ele, hit := c.cache[key]; hit {
+		entry := ele.Value.(*CacheItem)
+
+		now := MillisecondNow()
+		// If the entry is invalidated
+		if entry.InvalidAt != 0 && entry.InvalidAt < now {
+			c.removeElement(ele)
+			accessMetric.WithLabelValues("miss").Add(1)
+			return
+		}
+
+		// If the entry has expired, remove it from the cache
+		if entry.ExpireAt < now {
+			c.removeElement(ele)
+			accessMetric.WithLabelValues("miss").Add(1)
+			return
+		}
+
+		accessMetric.WithLabelValues("hit").Add(1)
+		c.ll.MoveToFront(ele)
+		return entry, true
+	}
+
+	accessMetric.WithLabelValues("miss").Add(1)
+	return
+}
+
+// Remove removes the provided key from the cache.
+func (c *LRUCache) Remove(key string) {
+	if ele, hit := c.cache[key]; hit {
+		c.removeElement(ele)
+	}
+}
+
+// removeOldest removes the oldest item from the cache.
+func (c *LRUCache) removeOldest() {
+	ele := c.ll.Back()
+	if ele != nil {
+		c.removeElement(ele)
+	}
+}
+
+func (c *LRUCache) removeElement(e *list.Element) {
+	c.ll.Remove(e)
+	kv := e.Value.(*CacheItem)
+	delete(c.cache, kv.Key)
+	atomic.StoreInt64(&c.cacheLen, int64(c.ll.Len()))
+}
+
+// Size returns the number of items in the cache.
+func (c *LRUCache) Size() int64 {
+	return atomic.LoadInt64(&c.cacheLen)
+}
+
+// UpdateExpiration updates the expiration time for the key.
+func (c *LRUCache) UpdateExpiration(key string, expireAt int64) bool {
+	if ele, hit := c.cache[key]; hit {
+		entry := ele.Value.(*CacheItem)
+		entry.ExpireAt = expireAt
+		return true
+	}
+	return false
+}
+
+func (c *LRUCache) Close() error {
+	c.cache = nil
+	c.ll = nil
+	c.cacheLen = 0
+	return nil
+}
+
+func NewLRUCacheCollector() *LRUCacheCollector {
+	return &LRUCacheCollector{
+		caches: []Cache{},
+	}
+}
+
+// AddCache adds a Cache object to be tracked by the collector.
+func (collector *LRUCacheCollector) AddCache(cache Cache) {
+	collector.caches = append(collector.caches, cache)
+}
+
+// Describe fetches prometheus metrics to be registered
+func (collector *LRUCacheCollector) Describe(ch chan<- *prometheus.Desc) {
+	sizeMetric.Describe(ch)
+	accessMetric.Describe(ch)
+}
+
+// Collect fetches metric counts and gauges from the cache
+func (collector *LRUCacheCollector) Collect(ch chan<- prometheus.Metric) {
+	sizeMetric.Set(collector.getSize())
+	sizeMetric.Collect(ch)
+	accessMetric.Collect(ch)
+}
+
+func (collector *LRUCacheCollector) getSize() float64 {
+	var size float64
+
+	for _, cache := range collector.caches {
+		size += float64(cache.Size())
+	}
+
+	return size
+}
diff --git a/lrucache_test.go b/lrucache_test.go
new file mode 100644
index 00000000..a188e6d7
--- /dev/null
+++ b/lrucache_test.go
@@ -0,0 +1,549 @@
+/*
+Copyright 2018-2022 Mailgun Technologies Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gubernator_test + +import ( + "math/rand" + "strconv" + "sync" + "testing" + "time" + + "github.com/mailgun/gubernator/v2" + "github.com/mailgun/holster/v4/clock" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestLRUCache(t *testing.T) { + const iterations = 1000 + const concurrency = 100 + expireAt := clock.Now().Add(1 * time.Hour).UnixMilli() + var mutex sync.Mutex + + t.Run("Happy path", func(t *testing.T) { + cache := gubernator.NewLRUCache(0) + + // Populate cache. + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + exists := cache.Add(item) + mutex.Unlock() + assert.False(t, exists) + } + + // Validate cache. + assert.Equal(t, int64(iterations), cache.Size()) + + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + mutex.Lock() + item, ok := cache.GetItem(key) + mutex.Unlock() + require.True(t, ok) + require.NotNil(t, item) + assert.Equal(t, item.Value, i) + } + + // Clear cache. + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + mutex.Lock() + cache.Remove(key) + mutex.Unlock() + } + + assert.Zero(t, cache.Size()) + }) + + t.Run("Update an existing key", func(t *testing.T) { + cache := gubernator.NewLRUCache(0) + const key = "foobar" + + // Add key. + item1 := &gubernator.CacheItem{ + Key: key, + Value: "initial value", + ExpireAt: expireAt, + } + exists1 := cache.Add(item1) + require.False(t, exists1) + + // Update same key. + item2 := &gubernator.CacheItem{ + Key: key, + Value: "new value", + ExpireAt: expireAt, + } + exists2 := cache.Add(item2) + require.True(t, exists2) + + // Verify. + verifyItem, ok := cache.GetItem(key) + require.True(t, ok) + assert.Equal(t, item2, verifyItem) + }) + + t.Run("Concurrent reads", func(t *testing.T) { + cache := gubernator.NewLRUCache(0) + + // Populate cache. + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + exists := cache.Add(item) + assert.False(t, exists) + } + + assert.Equal(t, int64(iterations), cache.Size()) + var launchWg, doneWg sync.WaitGroup + launchWg.Add(1) + + for thread := 0; thread < concurrency; thread++ { + doneWg.Add(1) + + go func() { + defer doneWg.Done() + launchWg.Wait() + + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + mutex.Lock() + item, ok := cache.GetItem(key) + mutex.Unlock() + assert.True(t, ok) + require.NotNil(t, item) + assert.Equal(t, item.Value, i) + } + }() + } + + // Wait for goroutines to finish. 
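+		// launchWg.Done() releases every reader goroutine at once so they run
+		// concurrently; doneWg.Wait() then blocks until all of them complete.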
+ launchWg.Done() + doneWg.Wait() + }) + + t.Run("Concurrent writes", func(t *testing.T) { + cache := gubernator.NewLRUCache(0) + expireAt := clock.Now().Add(1 * time.Hour).UnixMilli() + var launchWg, doneWg sync.WaitGroup + launchWg.Add(1) + + for thread := 0; thread < concurrency; thread++ { + doneWg.Add(1) + + go func() { + defer doneWg.Done() + launchWg.Wait() + + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item) + mutex.Unlock() + } + }() + } + + // Wait for goroutines to finish. + launchWg.Done() + doneWg.Wait() + }) + + t.Run("Concurrent reads and writes", func(t *testing.T) { + cache := gubernator.NewLRUCache(0) + + // Populate cache. + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + exists := cache.Add(item) + mutex.Unlock() + assert.False(t, exists) + } + + assert.Equal(t, int64(iterations), cache.Size()) + var launchWg, doneWg sync.WaitGroup + launchWg.Add(1) + + for thread := 0; thread < concurrency; thread++ { + doneWg.Add(2) + + go func() { + defer doneWg.Done() + launchWg.Wait() + + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + mutex.Lock() + item, ok := cache.GetItem(key) + mutex.Unlock() + assert.True(t, ok) + require.NotNil(t, item) + assert.Equal(t, item.Value, i) + } + }() + + go func() { + defer doneWg.Done() + launchWg.Wait() + + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item) + mutex.Unlock() + } + }() + } + + // Wait for goroutines to finish. + launchWg.Done() + doneWg.Wait() + }) + + t.Run("Collect metrics during concurrent reads/writes", func(t *testing.T) { + cache := gubernator.NewLRUCache(0) + + // Populate cache. + for i := 0; i < iterations; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item) + mutex.Unlock() + } + + assert.Equal(t, int64(iterations), cache.Size()) + var launchWg, doneWg sync.WaitGroup + launchWg.Add(1) + + for thread := 0; thread < concurrency; thread++ { + doneWg.Add(3) + + go func() { + defer doneWg.Done() + launchWg.Wait() + + for i := 0; i < iterations; i++ { + // Get, cache hit. + key := strconv.Itoa(i) + mutex.Lock() + _, _ = cache.GetItem(key) + mutex.Unlock() + + // Get, cache miss. + key2 := strconv.Itoa(rand.Intn(1000) + 10000) + mutex.Lock() + _, _ = cache.GetItem(key2) + mutex.Unlock() + } + }() + + go func() { + defer doneWg.Done() + launchWg.Wait() + + for i := 0; i < iterations; i++ { + // Add existing. + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item) + mutex.Unlock() + + // Add new. + key2 := strconv.Itoa(rand.Intn(1000) + 20000) + item2 := &gubernator.CacheItem{ + Key: key2, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item2) + mutex.Unlock() + } + }() + + collector := gubernator.NewLRUCacheCollector() + collector.AddCache(cache) + + go func() { + defer doneWg.Done() + launchWg.Wait() + + for i := 0; i < iterations; i++ { + // Get metrics. + ch := make(chan prometheus.Metric, 10) + collector.Collect(ch) + } + }() + } + + // Wait for goroutines to finish. 
+ launchWg.Done() + doneWg.Wait() + }) +} + +func BenchmarkLRUCache(b *testing.B) { + var mutex sync.Mutex + + b.Run("Sequential reads", func(b *testing.B) { + cache := gubernator.NewLRUCache(b.N) + expireAt := clock.Now().Add(1 * time.Hour).UnixMilli() + + // Populate cache. + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + exists := cache.Add(item) + assert.False(b, exists) + } + + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + mutex.Lock() + _, _ = cache.GetItem(key) + mutex.Unlock() + } + }) + + b.Run("Sequential writes", func(b *testing.B) { + cache := gubernator.NewLRUCache(0) + expireAt := clock.Now().Add(1 * time.Hour).UnixMilli() + + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item) + mutex.Unlock() + } + }) + + b.Run("Concurrent reads", func(b *testing.B) { + cache := gubernator.NewLRUCache(b.N) + expireAt := clock.Now().Add(1 * time.Hour).UnixMilli() + + // Populate cache. + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + exists := cache.Add(item) + assert.False(b, exists) + } + + var launchWg, doneWg sync.WaitGroup + launchWg.Add(1) + + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + doneWg.Add(1) + + go func() { + defer doneWg.Done() + launchWg.Wait() + + mutex.Lock() + _, _ = cache.GetItem(key) + mutex.Unlock() + }() + } + + b.ReportAllocs() + b.ResetTimer() + launchWg.Done() + doneWg.Wait() + }) + + b.Run("Concurrent writes", func(b *testing.B) { + cache := gubernator.NewLRUCache(0) + expireAt := clock.Now().Add(1 * time.Hour).UnixMilli() + var launchWg, doneWg sync.WaitGroup + launchWg.Add(1) + + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + doneWg.Add(1) + + go func(i int) { + defer doneWg.Done() + launchWg.Wait() + + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item) + mutex.Unlock() + }(i) + } + + b.ReportAllocs() + b.ResetTimer() + launchWg.Done() + doneWg.Wait() + }) + + b.Run("Concurrent reads and writes of existing keys", func(b *testing.B) { + cache := gubernator.NewLRUCache(0) + expireAt := clock.Now().Add(1 * time.Hour).UnixMilli() + var launchWg, doneWg sync.WaitGroup + launchWg.Add(1) + + // Populate cache. 
+ for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + exists := cache.Add(item) + assert.False(b, exists) + } + + for i := 0; i < b.N; i++ { + key := strconv.Itoa(i) + doneWg.Add(2) + + go func() { + defer doneWg.Done() + launchWg.Wait() + + mutex.Lock() + _, _ = cache.GetItem(key) + mutex.Unlock() + }() + + go func(i int) { + defer doneWg.Done() + launchWg.Wait() + + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item) + mutex.Unlock() + }(i) + } + + b.ReportAllocs() + b.ResetTimer() + launchWg.Done() + doneWg.Wait() + }) + + b.Run("Concurrent reads and writes of non-existent keys", func(b *testing.B) { + cache := gubernator.NewLRUCache(0) + expireAt := clock.Now().Add(1 * time.Hour).UnixMilli() + var launchWg, doneWg sync.WaitGroup + launchWg.Add(1) + + for i := 0; i < b.N; i++ { + doneWg.Add(2) + + go func(i int) { + defer doneWg.Done() + launchWg.Wait() + + key := strconv.Itoa(i) + mutex.Lock() + _, _ = cache.GetItem(key) + mutex.Unlock() + }(i) + + go func(i int) { + defer doneWg.Done() + launchWg.Wait() + + key := "z" + strconv.Itoa(i) + item := &gubernator.CacheItem{ + Key: key, + Value: i, + ExpireAt: expireAt, + } + mutex.Lock() + cache.Add(item) + mutex.Unlock() + }(i) + } + + b.ReportAllocs() + b.ResetTimer() + launchWg.Done() + doneWg.Wait() + }) +} diff --git a/memberlist.go b/memberlist.go index ec7ca337..8935b16e 100644 --- a/memberlist.go +++ b/memberlist.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2020 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/mock_cache_test.go b/mock_cache_test.go new file mode 100644 index 00000000..d2e02e66 --- /dev/null +++ b/mock_cache_test.go @@ -0,0 +1,66 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gubernator_test + +// Mock implementation of Cache. 
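+// Method calls are recorded via testify/mock; tests configure expected calls
+// and canned return values with On().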
+ +import ( + guber "github.com/mailgun/gubernator/v2" + "github.com/stretchr/testify/mock" +) + +type MockCache struct { + mock.Mock +} + +var _ guber.Cache = &MockCache{} + +func (m *MockCache) Add(item *guber.CacheItem) bool { + args := m.Called(item) + return args.Bool(0) +} + +func (m *MockCache) UpdateExpiration(key string, expireAt int64) bool { + args := m.Called(key, expireAt) + return args.Bool(0) +} + +func (m *MockCache) GetItem(key string) (value *guber.CacheItem, ok bool) { + args := m.Called(key) + retval, _ := args.Get(0).(*guber.CacheItem) + return retval, args.Bool(1) +} + +func (m *MockCache) Each() chan *guber.CacheItem { + args := m.Called() + retval, _ := args.Get(0).(chan *guber.CacheItem) + return retval +} + +func (m *MockCache) Remove(key string) { + m.Called(key) +} + +func (m *MockCache) Size() int64 { + args := m.Called() + return int64(args.Int(0)) +} + +func (m *MockCache) Close() error { + args := m.Called() + return args.Error(0) +} diff --git a/mock_loader_test.go b/mock_loader_test.go new file mode 100644 index 00000000..3a52c686 --- /dev/null +++ b/mock_loader_test.go @@ -0,0 +1,41 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gubernator_test + +// Mock implementation of Loader. + +import ( + guber "github.com/mailgun/gubernator/v2" + "github.com/stretchr/testify/mock" +) + +type MockLoader2 struct { + mock.Mock +} + +var _ guber.Loader = &MockLoader2{} + +func (m *MockLoader2) Load() (chan *guber.CacheItem, error) { + args := m.Called() + retval := args.Get(0).(chan *guber.CacheItem) + return retval, args.Error(1) +} + +func (m *MockLoader2) Save(ch chan *guber.CacheItem) error { + args := m.Called(ch) + return args.Error(0) +} diff --git a/mock_store_test.go b/mock_store_test.go index a5f5fab2..8d3d1bdd 100644 --- a/mock_store_test.go +++ b/mock_store_test.go @@ -19,6 +19,8 @@ package gubernator_test // Mock implementation of Store. 
import ( + "context" + guber "github.com/mailgun/gubernator/v2" "github.com/stretchr/testify/mock" ) @@ -29,19 +31,16 @@ type MockStore2 struct { var _ guber.Store = &MockStore2{} -func (m *MockStore2) OnChange(r *guber.RateLimitReq, item *guber.CacheItem) { - m.Called(r, item) +func (m *MockStore2) OnChange(ctx context.Context, r *guber.RateLimitReq, item *guber.CacheItem) { + m.Called(ctx, r, item) } -func (m *MockStore2) Get(r *guber.RateLimitReq) (*guber.CacheItem, bool) { - args := m.Called(r) - var retval *guber.CacheItem - if retval2, ok := args.Get(0).(*guber.CacheItem); ok { - retval = retval2 - } +func (m *MockStore2) Get(ctx context.Context, r *guber.RateLimitReq) (*guber.CacheItem, bool) { + args := m.Called(ctx, r) + retval, _ := args.Get(0).(*guber.CacheItem) return retval, args.Bool(1) } -func (m *MockStore2) Remove(key string) { - m.Called(key) +func (m *MockStore2) Remove(ctx context.Context, key string) { + m.Called(ctx, key) } diff --git a/multiregion.go b/multiregion.go index c58c41ef..7b32b178 100644 --- a/multiregion.go +++ b/multiregion.go @@ -1,3 +1,19 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package gubernator import ( diff --git a/net.go b/net.go index 3536606e..b20a743a 100644 --- a/net.go +++ b/net.go @@ -1,3 +1,19 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package gubernator import ( diff --git a/peer_client.go b/peer_client.go index 02af0956..4041edef 100644 --- a/peer_client.go +++ b/peer_client.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,9 +22,15 @@ import ( "fmt" "sync" + "github.com/mailgun/gubernator/v2/tracing" "github.com/mailgun/holster/v4/clock" "github.com/mailgun/holster/v4/collections" + otgrpc "github.com/opentracing-contrib/go-grpc" + "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go/ext" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/credentials" ) @@ -66,6 +72,7 @@ type response struct { type request struct { request *RateLimitReq resp chan *response + ctx context.Context } type PeerConfig struct { @@ -84,14 +91,24 @@ func NewPeerClient(conf PeerConfig) *PeerClient { } // Connect establishes a GRPC connection to a peer -func (c *PeerClient) connect() error { +func (c *PeerClient) connect(ctx context.Context) error { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + // NOTE: To future self, this mutex is used here because we need to know if the peer is disconnecting and // handle ErrClosing. Since this mutex MUST be here we take this opportunity to also see if we are connected. // Doing this here encapsulates managing the connected state to the PeerClient struct. Previously a PeerClient // was connected when `NewPeerClient()` was called however, when adding support for multi data centers having a // PeerClient connected to every Peer in every data center continuously is not desirable. + funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("PeerClient.connect")) + defer funcTimer.ObserveDuration() + lockTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("PeerClient.connect_RLock")) + c.mutex.RLock() + lockTimer.ObserveDuration() + tracing.LogInfo(span, "mutex.RLock()") + if c.status == peerClosing { c.mutex.RUnlock() return &PeerErr{err: errors.New("already disconnecting")} @@ -106,16 +123,28 @@ func (c *PeerClient) connect() error { c.mutex.RUnlock() c.mutex.Lock() defer c.mutex.Unlock() + tracing.LogInfo(span, "mutex.Lock()") // Now that we have the RW lock, ensure no else got here ahead of us. if c.status == peerConnected { return nil } + // Setup Opentracing interceptor to propagate spans. + tracer := opentracing.GlobalTracer() + tracingUnaryInterceptor := otgrpc.OpenTracingClientInterceptor(tracer) + tracingStreamInterceptor := otgrpc.OpenTracingStreamClientInterceptor(tracer) + var err error - opts := []grpc.DialOption{grpc.WithInsecure()} + opts := []grpc.DialOption{ + grpc.WithUnaryInterceptor(tracingUnaryInterceptor), + grpc.WithStreamInterceptor(tracingStreamInterceptor), + } + if c.conf.TLS != nil { - opts = []grpc.DialOption{grpc.WithTransportCredentials(credentials.NewTLS(c.conf.TLS))} + opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(c.conf.TLS))) + } else { + opts = append(opts, grpc.WithInsecure()) } c.conn, err = grpc.Dial(c.conf.Info.GRPCAddress, opts...) @@ -139,6 +168,13 @@ func (c *PeerClient) Info() PeerInfo { // GetPeerRateLimit forwards a rate limit request to a peer. 
If the rate limit has `behavior == BATCHING` configured // this method will attempt to batch the rate limits func (c *PeerClient) GetPeerRateLimit(ctx context.Context, r *RateLimitReq) (*RateLimitResp, error) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + span.SetTag("request.name", r.Name) + span.SetTag("request.key", r.UniqueKey) + span.SetTag("request.limit", r.Limit) + span.SetTag("request.duration", r.Duration) + // If config asked for no batching if HasBehavior(r.Behavior, Behavior_NO_BATCHING) { // Send a single low latency rate limit request @@ -146,23 +182,41 @@ func (c *PeerClient) GetPeerRateLimit(ctx context.Context, r *RateLimitReq) (*Ra Requests: []*RateLimitReq{r}, }) if err != nil { - return nil, c.setLastErr(err) + err2 := errors.Wrap(err, "Error in GetPeerRateLimits") + ext.LogError(span, err2) + return nil, c.setLastErr(err2) } return resp.RateLimits[0], nil } - return c.getPeerRateLimitsBatch(ctx, r) + + rateLimitResp, err := c.getPeerRateLimitsBatch(ctx, r) + if err != nil { + err2 := errors.Wrap(err, "Error in getPeerRateLimitsBatch") + ext.LogError(span, err2) + return nil, c.setLastErr(err2) + } + + return rateLimitResp, nil } // GetPeerRateLimits requests a list of rate limit statuses from a peer func (c *PeerClient) GetPeerRateLimits(ctx context.Context, r *GetPeerRateLimitsReq) (*GetPeerRateLimitsResp, error) { - if err := c.connect(); err != nil { - return nil, err + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + span.SetTag("numRequests", len(r.Requests)) + + if err := c.connect(ctx); err != nil { + err2 := errors.Wrap(err, "Error in connect") + ext.LogError(span, err2) + checkErrorCounter.WithLabelValues("Connect error").Add(1) + return nil, c.setLastErr(err2) } // NOTE: This must be done within the RLock since calling Wait() in Shutdown() causes // a race condition if called within a separate go routine if the internal wg is `0` // when Wait() is called then Add(1) is called concurrently. c.mutex.RLock() + tracing.LogInfo(span, "mutex.RLock()") c.wg.Add(1) defer func() { c.mutex.RUnlock() @@ -171,24 +225,34 @@ func (c *PeerClient) GetPeerRateLimits(ctx context.Context, r *GetPeerRateLimits resp, err := c.client.GetPeerRateLimits(ctx, r) if err != nil { - return nil, c.setLastErr(err) + err2 := errors.Wrap(err, "Error in client.GetPeerRateLimits") + ext.LogError(span, err2) + // checkErrorCounter is updated within client.GetPeerRateLimits(). 
+ return nil, c.setLastErr(err2) } // Unlikely, but this avoids a panic if something wonky happens if len(resp.RateLimits) != len(r.Requests) { - return nil, errors.New("number of rate limits in peer response does not match request") + err = errors.New("number of rate limits in peer response does not match request") + ext.LogError(span, err) + checkErrorCounter.WithLabelValues("Item mismatch").Add(1) + return nil, c.setLastErr(err) } return resp, nil } // UpdatePeerGlobals sends global rate limit status updates to a peer func (c *PeerClient) UpdatePeerGlobals(ctx context.Context, r *UpdatePeerGlobalsReq) (*UpdatePeerGlobalsResp, error) { - if err := c.connect(); err != nil { - return nil, err + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + + if err := c.connect(ctx); err != nil { + return nil, c.setLastErr(err) } // See NOTE above about RLock and wg.Add(1) c.mutex.RLock() + tracing.LogInfo(span, "mutex.RLock()") c.wg.Add(1) defer func() { c.mutex.RUnlock() @@ -235,19 +299,49 @@ func (c *PeerClient) GetLastErr() []string { } func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq) (*RateLimitResp, error) { - if err := c.connect(); err != nil { - return nil, err + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + span.SetTag("request.name", r.Name) + span.SetTag("request.key", r.UniqueKey) + span.SetTag("request.limit", r.Limit) + span.SetTag("request.duration", r.Duration) + + funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("PeerClient.getPeerRateLimitsBatch")) + defer funcTimer.ObserveDuration() + + if err := c.connect(ctx); err != nil { + err2 := errors.Wrap(err, "Error in connect") + ext.LogError(span, err2) + return nil, c.setLastErr(err2) } // See NOTE above about RLock and wg.Add(1) c.mutex.RLock() + tracing.LogInfo(span, "mutex.RLock()") if c.status == peerClosing { - return nil, &PeerErr{err: errors.New("already disconnecting")} + err2 := &PeerErr{err: errors.New("already disconnecting")} + ext.LogError(span, err2) + return nil, c.setLastErr(err2) + } + req := request{ + request: r, + resp: make(chan *response, 1), + ctx: ctx, } - req := request{request: r, resp: make(chan *response, 1)} // Enqueue the request to be sent - c.queue <- &req + tracing.LogInfo(span, "Enqueue request", "queueLength", len(c.queue)) + peerAddr := c.Info().GRPCAddress + queueLengthMetric.WithLabelValues(peerAddr).Observe(float64(len(c.queue))) + + select { + case c.queue <- &req: + // Successfully enqueued request. 
+ case <-ctx.Done(): + err := errors.Wrap(ctx.Err(), "Error while enqueuing request") + ext.LogError(span, err) + return nil, err + } c.wg.Add(1) defer func() { @@ -256,14 +350,20 @@ func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq }() // Wait for a response or context cancel + span3, ctx2 := tracing.StartNamedSpan(ctx, "Wait for response") + defer span3.Finish() + select { case resp := <-req.resp: if resp.err != nil { - return nil, c.setLastErr(resp.err) + err2 := errors.Wrap(c.setLastErr(resp.err), "Request error") + ext.LogError(span, err2) + return nil, c.setLastErr(err2) } return resp.rl, nil - case <-ctx.Done(): - return nil, c.setLastErr(ctx.Err()) + case <-ctx2.Done(): + ext.LogError(span, ctx2.Err()) + return nil, ctx2.Err() } } @@ -276,56 +376,96 @@ func (c *PeerClient) run() { var queue []*request for { + ctx := context.Background() + select { case r, ok := <-c.queue: // If the queue has shutdown, we need to send the rest of the queue if !ok { if len(queue) > 0 { - c.sendQueue(queue) + c.sendQueue(ctx, queue) } return } - queue = append(queue, r) - - // Send the queue if we reached our batch limit - if len(queue) == c.conf.Behavior.BatchLimit { - c.sendQueue(queue) - queue = nil - continue - } + // Wrap logic in anon function so we can use defer. + func() { + // Use context of the request for opentracing span. + reqSpan, reqCtx := tracing.StartSpan(r.ctx) + defer reqSpan.Finish() + + queue = append(queue, r) + + // Send the queue if we reached our batch limit + if len(queue) == c.conf.Behavior.BatchLimit { + logMsg := "run() reached batch limit" + logrus.WithFields(logrus.Fields{ + "queueLen": len(queue), + "batchLimit": c.conf.Behavior.BatchLimit, + }).Info(logMsg) + tracing.LogInfo(reqSpan, logMsg) + + c.sendQueue(reqCtx, queue) + queue = nil + return + } - // If this is our first queued item since last send - // queue the next interval - if len(queue) == 1 { - interval.Next() - } + // If this is our first queued item since last send + // queue the next interval + if len(queue) == 1 { + interval.Next() + } + }() case <-interval.C: if len(queue) != 0 { - c.sendQueue(queue) + intervalSpan, ctx2 := tracing.StartSpan(ctx) + intervalSpan.SetTag("queueLen", len(queue)) + intervalSpan.SetTag("batchWait", c.conf.Behavior.BatchWait.String()) + + c.sendQueue(ctx2, queue) queue = nil - } + intervalSpan.Finish() + } } } } // sendQueue sends the queue provided and returns the responses to // waiting go routines -func (c *PeerClient) sendQueue(queue []*request) { +func (c *PeerClient) sendQueue(ctx context.Context, queue []*request) { + span, ctx := tracing.StartSpan(ctx) + defer span.Finish() + span.SetTag("queueLen", len(queue)) + + funcTimer := prometheus.NewTimer(funcTimeMetric.WithLabelValues("PeerClient.sendQueue")) + defer funcTimer.ObserveDuration() + var req GetPeerRateLimitsReq for _, r := range queue { req.Requests = append(req.Requests, r.request) } - ctx, cancel := context.WithTimeout(context.Background(), c.conf.Behavior.BatchTimeout) - resp, err := c.client.GetPeerRateLimits(ctx, &req) - cancel() + ctx2, cancel2 := tracing.ContextWithTimeout(ctx, c.conf.Behavior.BatchTimeout) + resp, err := c.client.GetPeerRateLimits(ctx2, &req) + cancel2() // An error here indicates the entire request failed if err != nil { - c.setLastErr(err) + logPart := "Error in client.GetPeerRateLimits" + err2 := errors.Wrap(err, logPart) + logrus. + WithError(err). 
+			WithFields(logrus.Fields{
+				"queueLen":     len(queue),
+				"batchTimeout": c.conf.Behavior.BatchTimeout.String(),
+			}).
+			Error(logPart)
+		ext.LogError(span, err2)
+		c.setLastErr(err2)
+		// checkErrorCounter is updated within client.GetPeerRateLimits().
+
 		for _, r := range queue {
 			r.resp <- &response{err: err}
 		}
@@ -335,7 +475,10 @@
 	// Unlikely, but this avoids a panic if something wonky happens
 	if len(resp.RateLimits) != len(queue) {
 		err = errors.New("server responded with incorrect rate limit list size")
+		ext.LogError(span, err)
+
 		for _, r := range queue {
+			checkErrorCounter.WithLabelValues("Item mismatch").Add(1)
 			r.resp <- &response{err: err}
 		}
 		return
diff --git a/peer_client_test.go b/peer_client_test.go
index 275f103d..57d5d0a5 100644
--- a/peer_client_test.go
+++ b/peer_client_test.go
@@ -1,3 +1,19 @@
+/*
+Copyright 2018-2022 Mailgun Technologies Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
 package gubernator_test
 
 import (
diff --git a/peers.pb.go b/peers.pb.go
index 1043af24..0b85e54e 100644
--- a/peers.pb.go
+++ b/peers.pb.go
@@ -1,5 +1,5 @@
 //
-//Copyright 2018-2019 Mailgun Technologies Inc
+//Copyright 2018-2022 Mailgun Technologies Inc
 //
 //Licensed under the Apache License, Version 2.0 (the "License");
 //you may not use this file except in compliance with the License.
diff --git a/prometheus.md b/prometheus.md
new file mode 100644
index 00000000..94e3bfd5
--- /dev/null
+++ b/prometheus.md
@@ -0,0 +1,33 @@
+# Prometheus Metrics
+Gubernator can be monitored in real time using [Prometheus](https://prometheus.io/) metrics.
+
+## Enabling Metric Collection
+Metrics are exposed under two possible deployment scenarios:
+
+1. Gubernator deployed as a standalone daemon.
+   * Metrics endpoint published at the HTTP `/metrics` URI.
+2. Gubernator embedded as a Go module.
+   * The dependent codebase is responsible for publishing the HTTP `/metrics` URI.
+   * See `daemon.go` for examples using the `promhttp` module.
+
+Finally, configure a Prometheus job to scrape the server's `/metrics` URI.
+
+## Metrics
+
+| Metric | Type | Description |
+| -------------------------------------- | ------- | ----------- |
+| `gubernator_async_durations` | Summary | The timings of GLOBAL async sends in seconds. |
+| `gubernator_asyncrequest_retries` | Counter | The count of retries that occurred in asyncRequests() forwarding a request to another peer. |
+| `gubernator_broadcast_durations` | Summary | The timings of GLOBAL broadcasts to peers in seconds. |
+| `gubernator_cache_access_count` | Counter | The count of LRUCache accesses during rate checks. |
+| `gubernator_cache_size` | Gauge | The number of items in LRU Cache which holds the rate limits. |
+| `gubernator_check_counter` | Counter | The number of rate limits checked. |
+| `gubernator_check_error_counter` | Counter | The number of errors while checking rate limits. |
+| `gubernator_concurrent_checks_counter` | Summary | 99th quantile of concurrent rate checks.
This includes rate checks processed locally and forwarded to other peers. |
+| `gubernator_func_duration` | Summary | The 99th quantile of key function timings in seconds. |
+| `gubernator_getratelimit_counter` | Counter | The count of getRateLimit() calls. Label \"calltype\" may be \"local\" for calls handled by the same peer, \"forward\" for calls forwarded to another peer, or \"global\" for global rate limits. |
+| `gubernator_grpc_request_counts` | Counter | The count of gRPC requests. |
+| `gubernator_grpc_request_duration` | Summary | The 99th quantile timings of gRPC requests in seconds. |
+| `gubernator_over_limit_counter` | Counter | The number of rate limit checks that are over the limit. |
+| `gubernator_pool_queue_length` | Summary | The 99th quantile of rate check requests queued up in GubernatorPool. This is the work queue for local rate checks. |
+| `gubernator_queue_length` | Summary | The 99th quantile of rate check requests queued up for batching to other peers by getPeerRateLimitsBatch(). This is the work queue for remote rate checks. Label "peerAddr" indicates queued requests to that peer. |
diff --git a/proto/gubernator.proto b/proto/gubernator.proto
index 0d065e8b..5ac0c584 100644
--- a/proto/gubernator.proto
+++ b/proto/gubernator.proto
@@ -1,5 +1,5 @@
 /*
-Copyright 2018-2019 Mailgun Technologies Inc
+Copyright 2018-2022 Mailgun Technologies Inc
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/proto/peers.proto b/proto/peers.proto
index 08b5b638..5caefae4 100644
--- a/proto/peers.proto
+++ b/proto/peers.proto
@@ -1,5 +1,5 @@
 /*
-Copyright 2018-2019 Mailgun Technologies Inc
+Copyright 2018-2022 Mailgun Technologies Inc
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/python/setup.py b/python/setup.py
index 3cedb7ec..eb8126c7 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-# Copyright 2018-2019 Mailgun Technologies Inc
+# Copyright 2018-2022 Mailgun Technologies Inc
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/tests/test_client.py b/python/tests/test_client.py
index ddb8afd5..28efabfb 100644
--- a/python/tests/test_client.py
+++ b/python/tests/test_client.py
@@ -1,4 +1,4 @@
-# Copyright 2018-2019 Mailgun Technologies Inc
+# Copyright 2018-2022 Mailgun Technologies Inc
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/region_picker.go b/region_picker.go
index ec025b9c..f6755fae 100644
--- a/region_picker.go
+++ b/region_picker.go
@@ -1,3 +1,19 @@
+/*
+Copyright 2018-2022 Mailgun Technologies Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + package gubernator import ( diff --git a/replicated_hash.go b/replicated_hash.go index 18d3b04f..c53504e1 100644 --- a/replicated_hash.go +++ b/replicated_hash.go @@ -1,5 +1,5 @@ /* -Copyright 2018-2019 Mailgun Technologies Inc +Copyright 2018-2022 Mailgun Technologies Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/replicated_hash_test.go b/replicated_hash_test.go index c87ad67e..bfd243b8 100644 --- a/replicated_hash_test.go +++ b/replicated_hash_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package gubernator import ( diff --git a/scripts/gubernator-cli.py b/scripts/gubernator-cli.py index 518e9361..aaea52d0 100755 --- a/scripts/gubernator-cli.py +++ b/scripts/gubernator-cli.py @@ -1,6 +1,6 @@ #! /usr/bin/env python -# Copyright 2018-2019 Mailgun Technologies Inc +# Copyright 2018-2022 Mailgun Technologies Inc # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/scripts/proto.sh b/scripts/proto.sh index d1ef1931..e341d3d4 100755 --- a/scripts/proto.sh +++ b/scripts/proto.sh @@ -1,6 +1,6 @@ #! /bin/sh -# Copyright 2018-2019 Mailgun Technologies Inc +# Copyright 2018-2022 Mailgun Technologies Inc # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/scripts/start-cluster.sh b/scripts/start-cluster.sh index 32c33f37..f4e0f615 100755 --- a/scripts/start-cluster.sh +++ b/scripts/start-cluster.sh @@ -1,6 +1,6 @@ #! /bin/sh -# Copyright 2018-2019 Mailgun Technologies Inc +# Copyright 2018-2022 Mailgun Technologies Inc # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/store.go b/store.go index 20a7663f..1c234610 100644 --- a/store.go +++ b/store.go @@ -1,5 +1,23 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package gubernator +import "context" + // PERSISTENT STORE DETAILS // The storage interfaces defined here allow the implementor flexibility in storage options. Depending on the @@ -25,24 +43,25 @@ type TokenBucketItem struct { } // Store interface allows implementors to offload storage of all or a subset of ratelimits to -// some persistent store.
Methods OnChange() and Get() should avoid blocking as much as possible as these -// methods are called on every rate limit request and will effect the performance of gubernator. +// some persistent store. Methods OnChange() and Remove() should avoid blocking where possible +// to maximize performance of gubernator. +// Implementations MUST be threadsafe. type Store interface { // Called by gubernator *after* a rate limit item is updated. It's up to the store to // decide if this rate limit item should be persisted in the store. It's up to the // store to expire old rate limit items. The CacheItem represents the current state of // the rate limit item *after* the RateLimitReq has been applied. - OnChange(r *RateLimitReq, item *CacheItem) + OnChange(ctx context.Context, r *RateLimitReq, item *CacheItem) // Called by gubernator when a rate limit is missing from the cache. It's up to the store // to decide if this request is fulfilled. Should return true if the request is fulfilled // and false if the request is not fulfilled or doesn't exist in the store. - Get(r *RateLimitReq) (*CacheItem, bool) + Get(ctx context.Context, r *RateLimitReq) (*CacheItem, bool) // Called by gubernator when an existing rate limit should be removed from the store. // NOTE: This is NOT called when a rate limit expires from the cache, store implementors // must expire rate limits in the store. - Remove(key string) + Remove(ctx context.Context, key string) } // Loader interface allows implementors to store all or a subset of ratelimits into a persistent @@ -76,18 +95,18 @@ type MockStore struct { var _ Store = &MockStore{} -func (ms *MockStore) OnChange(r *RateLimitReq, item *CacheItem) { +func (ms *MockStore) OnChange(ctx context.Context, r *RateLimitReq, item *CacheItem) { ms.Called["OnChange()"] += 1 ms.CacheItems[item.Key] = item } -func (ms *MockStore) Get(r *RateLimitReq) (*CacheItem, bool) { +func (ms *MockStore) Get(ctx context.Context, r *RateLimitReq) (*CacheItem, bool) { ms.Called["Get()"] += 1 item, ok := ms.CacheItems[r.HashKey()] return item, ok } -func (ms *MockStore) Remove(key string) { +func (ms *MockStore) Remove(ctx context.Context, key string) { ms.Called["Remove()"] += 1 delete(ms.CacheItems, key) } diff --git a/store_test.go b/store_test.go index b48ee325..558c9141 100644 --- a/store_test.go +++ b/store_test.go @@ -125,6 +125,7 @@ func TestLoader(t *testing.T) { } func TestStore(t *testing.T) { + ctx := context.Background() setup := func() (*MockStore2, *v1Server, gubernator.V1Client) { store := &MockStore2{} @@ -157,8 +158,8 @@ func TestStore(t *testing.T) { // Create a mock argument matcher for CacheItem input. // Verify item matches expected algorithm, limit, and duration.
- matchItem := func(algorithm gubernator.Algorithm, req *gubernator.RateLimitReq) interface{} { - switch algorithm { + matchItem := func(req *gubernator.RateLimitReq) interface{} { + switch req.Algorithm { case gubernator.Algorithm_TOKEN_BUCKET: return mock.MatchedBy(func(item *gubernator.CacheItem) bool { titem, ok := item.Value.(*gubernator.TokenBucketItem) @@ -166,7 +167,7 @@ func TestStore(t *testing.T) { return false } - return item.Algorithm == algorithm && + return item.Algorithm == req.Algorithm && item.Key == req.HashKey() && titem.Limit == req.Limit && titem.Duration == req.Duration @@ -179,7 +180,7 @@ func TestStore(t *testing.T) { return false } - return item.Algorithm == algorithm && + return item.Algorithm == req.Algorithm && item.Key == req.HashKey() && litem.Limit == req.Limit && litem.Duration == req.Duration @@ -239,11 +240,11 @@ func TestStore(t *testing.T) { } // Setup mocks. - store.On("Get", matchReq(req)).Once().Return(&gubernator.CacheItem{}, false) - store.On("OnChange", matchReq(req), matchItem(testCase.Algorithm, req)).Once() + store.On("Get", mock.Anything, matchReq(req)).Once().Return(nil, false) + store.On("OnChange", mock.Anything, matchReq(req), matchItem(req)).Once() // Call code. - resp, err := client.GetRateLimits(context.Background(), &gubernator.GetRateLimitsReq{ + resp, err := client.GetRateLimits(ctx, &gubernator.GetRateLimitsReq{ Requests: []*gubernator.RateLimitReq{req}, }) require.NoError(t, err) @@ -255,10 +256,10 @@ func TestStore(t *testing.T) { t.Run("Second rate check pulls from cache", func(t *testing.T) { // Setup mocks. - store.On("OnChange", matchReq(req), matchItem(testCase.Algorithm, req)).Once() + store.On("OnChange", mock.Anything, matchReq(req), matchItem(req)).Once() // Call code. - resp, err := client.GetRateLimits(context.Background(), &gubernator.GetRateLimitsReq{ + resp, err := client.GetRateLimits(ctx, &gubernator.GetRateLimitsReq{ Requests: []*gubernator.RateLimitReq{req}, }) require.NoError(t, err) @@ -293,11 +294,11 @@ func TestStore(t *testing.T) { Value: createBucketItem(req), } - store.On("Get", matchReq(req)).Once().Return(storedItem, true) - store.On("OnChange", matchReq(req), matchItem(testCase.Algorithm, req)).Once() + store.On("Get", mock.Anything, matchReq(req)).Once().Return(storedItem, true) + store.On("OnChange", mock.Anything, matchReq(req), matchItem(req)).Once() // Call code. - resp, err := client.GetRateLimits(context.Background(), &gubernator.GetRateLimitsReq{ + resp, err := client.GetRateLimits(ctx, &gubernator.GetRateLimitsReq{ Requests: []*gubernator.RateLimitReq{req}, }) require.NoError(t, err) @@ -332,12 +333,12 @@ func TestStore(t *testing.T) { Value: &struct{}{}, } - store.On("Get", matchReq(req)).Once().Return(storedItem, true) - store.On("Remove", req.HashKey()).Once() - store.On("OnChange", matchReq(req), matchItem(testCase.Algorithm, req)).Once() + store.On("Get", mock.Anything, matchReq(req)).Once().Return(storedItem, true) + store.On("Remove", mock.Anything, req.HashKey()).Once() + store.On("OnChange", mock.Anything, matchReq(req), matchItem(req)).Once() // Call code. 
- resp, err := client.GetRateLimits(context.Background(), &gubernator.GetRateLimitsReq{ + resp, err := client.GetRateLimits(ctx, &gubernator.GetRateLimitsReq{ Requests: []*gubernator.RateLimitReq{req}, }) require.NoError(t, err) @@ -387,9 +388,10 @@ func TestStore(t *testing.T) { Value: bucketItem, } - store.On("Get", matchReq(req)).Once().Return(storedItem, true) + store.On("Get", mock.Anything, matchReq(req)).Once().Return(storedItem, true) store.On("OnChange", + mock.Anything, matchReq(req), mock.MatchedBy(func(item *gubernator.CacheItem) bool { switch req.Algorithm { @@ -426,7 +428,7 @@ func TestStore(t *testing.T) { Once() // Call code. - resp, err := client.GetRateLimits(context.Background(), &gubernator.GetRateLimitsReq{ + resp, err := client.GetRateLimits(ctx, &gubernator.GetRateLimitsReq{ Requests: []*gubernator.RateLimitReq{req}, }) require.NoError(t, err) @@ -476,9 +478,10 @@ func TestStore(t *testing.T) { Value: bucketItem, } - store.On("Get", matchReq(req)).Once().Return(storedItem, true) + store.On("Get", mock.Anything, matchReq(req)).Once().Return(storedItem, true) store.On("OnChange", + mock.Anything, matchReq(req), mock.MatchedBy(func(item *gubernator.CacheItem) bool { switch req.Algorithm { @@ -515,7 +518,7 @@ func TestStore(t *testing.T) { Once() // Call code. - resp, err := client.GetRateLimits(context.Background(), &gubernator.GetRateLimitsReq{ + resp, err := client.GetRateLimits(ctx, &gubernator.GetRateLimitsReq{ Requests: []*gubernator.RateLimitReq{req}, }) require.NoError(t, err) diff --git a/tls.go b/tls.go index 1515df14..d98749c9 100644 --- a/tls.go +++ b/tls.go @@ -1,3 +1,19 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package gubernator import ( diff --git a/tls_test.go b/tls_test.go index 322540a7..2012518d 100644 --- a/tls_test.go +++ b/tls_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package gubernator_test import ( diff --git a/tracing/tracing.go b/tracing/tracing.go new file mode 100644 index 00000000..d19df368 --- /dev/null +++ b/tracing/tracing.go @@ -0,0 +1,92 @@ +/* +Copyright 2018-2022 Mailgun Technologies Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tracing + +// General-purpose Opentracing functions. + +import ( + "context" + "fmt" + "runtime" + "strconv" + "time" + + "github.com/opentracing/opentracing-go" +) + +// Start a span using the full function name as the operation name. +// When done, be sure to call span.Finish(). +func StartSpan(ctx context.Context) (opentracing.Span, context.Context) { + operationName, fileTag := getCallerInfoForTracing(2) + + span, ctx2 := opentracing.StartSpanFromContext(ctx, operationName) + span.SetTag("file", fileTag) + + return span, ctx2 +} + +// Start a span using the given operation name. +// When done, be sure to call span.Finish(). +func StartNamedSpan(ctx context.Context, operationName string) (opentracing.Span, context.Context) { + _, fileTag := getCallerInfoForTracing(2) + + span, ctx2 := opentracing.StartSpanFromContext(ctx, operationName) + span.SetTag("file", fileTag) + + return span, ctx2 +} + +func getCallerInfoForTracing(stackIndex int) (string, string) { + fileTag := "unknown" + operationName := "unknown" + pc, file, line, callerOk := runtime.Caller(stackIndex) + + if callerOk { + operationName = runtime.FuncForPC(pc).Name() + fileTag = file + ":" + strconv.Itoa(line) + } + + return operationName, fileTag +} + +// Log a message to the span. +// Optionally pass additional key/value pairs. +func LogInfo(span opentracing.Span, message string, keyValues ...interface{}) { + args := append( + []interface{}{ + "event", "info", + "event.message", message, + }, + keyValues..., + ) + span.LogKV(args...) +} + +// Call context.WithTimeout() and log details of the deadline origin. +func ContextWithTimeout(ctx context.Context, duration time.Duration) (context.Context, context.CancelFunc) { + deadline := time.Now().Add(duration) + _, fn, line, _ := runtime.Caller(1) + + if span := opentracing.SpanFromContext(ctx); span != nil { + LogInfo(span, "Set context deadline", + "deadline", deadline.Format(time.RFC3339), + "source", fmt.Sprintf("%s:%d", fn, line), + ) + } + + return context.WithTimeout(ctx, duration) +} diff --git a/version b/version index 0a2d216d..1bf44299 100644 --- a/version +++ b/version @@ -1 +1 @@ -v2.0.0-rc.12 +v2.0.0-rc.13
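The new prometheus.md above notes that when Gubernator is embedded as a Go module, the embedding code must publish the `/metrics` endpoint itself, and points to `daemon.go` for the authoritative wiring. As a rough sketch only, this is roughly what that can look like using the standard `promhttp` handler; it assumes the relevant Gubernator collectors are registered with the default Prometheus registry (as `daemon.go` arranges), and the listen address is an arbitrary placeholder:

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// promhttp.Handler() serves every metric registered with the
	// default Prometheus registry, including Gubernator's collectors
	// if the embedding code has registered them (see daemon.go).
	http.Handle("/metrics", promhttp.Handler())

	// Hypothetical listen address; a Prometheus scrape job would
	// then be pointed at http://<host>:9090/metrics.
	log.Fatal(http.ListenAndServe("0.0.0.0:9090", nil))
}
```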
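The store.go changes add a `context.Context` parameter to every `Store` method and require implementations to be threadsafe. A minimal sketch of a conforming implementation under those new signatures; `MemoryStore` and `NewMemoryStore` are illustrative names, and the `guber` alias uses the `github.com/mailgun/gubernator/v2` module path seen elsewhere in this diff:

```go
package store

import (
	"context"
	"sync"

	guber "github.com/mailgun/gubernator/v2"
)

// MemoryStore is a hypothetical, threadsafe Store implementation.
// It ignores ctx because map access never blocks; a networked
// backend (Redis, SQL, etc.) should instead abort its calls when
// ctx is canceled, honoring the client's deadline.
type MemoryStore struct {
	mu    sync.RWMutex
	items map[string]*guber.CacheItem
}

// Compile-time check that MemoryStore satisfies the new interface.
var _ guber.Store = (*MemoryStore)(nil)

func NewMemoryStore() *MemoryStore {
	return &MemoryStore{items: make(map[string]*guber.CacheItem)}
}

// OnChange persists the updated rate limit item after gubernator
// applies the RateLimitReq.
func (s *MemoryStore) OnChange(ctx context.Context, r *guber.RateLimitReq, item *guber.CacheItem) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.items[item.Key] = item
}

// Get fulfills a cache miss from the store, if the item exists.
func (s *MemoryStore) Get(ctx context.Context, r *guber.RateLimitReq) (*guber.CacheItem, bool) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	item, ok := s.items[r.HashKey()]
	return item, ok
}

// Remove deletes a rate limit from the store.
func (s *MemoryStore) Remove(ctx context.Context, key string) {
	s.mu.Lock()
	defer s.mu.Unlock()
	delete(s.items, key)
}
```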
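The new `tracing` package above is consumed throughout the diff via `StartSpan`, `LogInfo`, and `ContextWithTimeout`. A short usage sketch of the call pattern, with `doWork` as a hypothetical caller:

```go
package main

import (
	"context"
	"time"

	"github.com/mailgun/gubernator/v2/tracing"
)

// doWork is a hypothetical function illustrating the helpers.
func doWork(ctx context.Context) {
	// StartSpan names the span after the calling function (here
	// "main.doWork") via runtime.Caller, and tags it with file:line,
	// so no operation name string needs to be maintained by hand.
	span, ctx := tracing.StartSpan(ctx)
	defer span.Finish()

	// Attach structured key/value details to the span.
	tracing.LogInfo(span, "starting work", "attempt", 1)

	// Derive a deadline and record where it was set, so traces show
	// which caller imposed the timeout.
	ctx, cancel := tracing.ContextWithTimeout(ctx, 500*time.Millisecond)
	defer cancel()

	_ = ctx // pass ctx to downstream calls so they join the trace
}
```

Because the operation name comes from the caller's stack frame, spans stay correctly named across refactors; `StartNamedSpan` remains available when an explicit name is preferred.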