From f9a94257a1d2801dd6ea1efe90fdcc8431b83383 Mon Sep 17 00:00:00 2001 From: kruskall <99559985+kruskall@users.noreply.github.com> Date: Tue, 28 Jan 2025 19:47:46 +0100 Subject: [PATCH 1/2] Translate otel metrics to libbeat monitoring (#15094) * Translate otel metrics to libbeat monitoring * demo: send metrics directly and add another reader * Revert "demo: send metrics directly and add another reader" This reverts commit 166a7179380dd47f394ee90c388346578a3024b5. * lint: fix linter issues * lint: fix linter issues * feat: refactor code to propagate meterprovider and fix tests * lint: fix linter issues * refactor: remove unused files * refactor: remove more global meters * refactor: cleanup more unused code * refactor: remove more unused code * test: account for agentcfg metric in test * test: account for agentcfg metric in test * fix: pass meter provider in main func * Fix LSM metrics tests * test: fix remaining test * lint: fix linter issues * fix: update docappender metrics name * test: update systemtest metric assertions * fix: update grpc interceptor meter path metrics were not being exported because the meter was not recognized as apm-server meter * fix: add back output.type=elasticsearch * test: update remaining systemtest * test: remove debug print * fix: use correct outputRegistry variable fix panic * fix: remove panic on err * fix: propagate meter provider to grpc services * lint: add meterprovider field docs * lint: fix comment typo * feat: pass apmotel gatherer too * refactor: use switch pattern consistently when mapping metrics * Update beater.go * Update beat.go * Update beater.go * fix: solve compile errors and systemtest fix * refactor: reduce diff noise * lint: fix linter issues * lint: fix linter issues * Update x-pack/apm-server/main.go Co-authored-by: Andrew Wilkins * test: use legacy metrics for validating grpc tests * fix: unregister callback if available forgot to push this --------- Co-authored-by: Andrew Wilkins Co-authored-by: Andrew Wilkins 
(cherry picked from commit 378b60c2f9f7cb7ebd91b05d7fe034974459733a) # Conflicts: # internal/beatcmd/beat.go # internal/beater/beater.go # internal/beater/server.go # internal/beater/server_test.go --- internal/agentcfg/elasticsearch.go | 67 ++-- internal/agentcfg/elasticsearch_test.go | 6 +- internal/beatcmd/beat.go | 219 ++++++++++++ internal/beatcmd/reloader.go | 8 +- internal/beater/api/config/agent/handler.go | 6 - internal/beater/api/intake/handler.go | 25 +- internal/beater/api/intake/handler_test.go | 29 +- internal/beater/api/mux.go | 72 ++-- internal/beater/api/mux_config_agent_test.go | 11 +- .../beater/api/mux_intake_backend_test.go | 11 +- internal/beater/api/mux_intake_rum_test.go | 16 +- internal/beater/api/mux_root_test.go | 11 +- internal/beater/api/mux_test.go | 35 +- internal/beater/api/root/handler.go | 7 - internal/beater/beater.go | 26 +- internal/beater/interceptors/metrics.go | 74 ++-- internal/beater/interceptors/metrics_test.go | 317 +++++++----------- .../middleware/monitoring_middleware.go | 48 +-- .../middleware/monitoring_middleware_test.go | 73 ++-- internal/beater/monitoringtest/monitoring.go | 79 ----- .../beater/monitoringtest/opentelemetry.go | 31 +- internal/beater/otlp/common.go | 60 ---- internal/beater/otlp/grpc.go | 53 ++- internal/beater/otlp/grpc_test.go | 91 ++--- internal/beater/otlp/http.go | 41 ++- internal/beater/otlp/http_test.go | 72 ++-- internal/beater/request/result.go | 26 -- internal/beater/request/result_test.go | 21 -- internal/beater/server.go | 21 +- internal/beater/server_test.go | 47 ++- internal/beater/tracing.go | 6 +- internal/model/modelprocessor/eventcounter.go | 51 +-- .../model/modelprocessor/eventcounter_test.go | 23 +- internal/processor/stream/result.go | 77 ----- internal/processor/stream/result_test.go | 68 ---- systemtest/otlp_test.go | 2 +- x-pack/apm-server/main.go | 37 +- x-pack/apm-server/main_test.go | 131 +++++++- x-pack/apm-server/sampling/config.go | 5 + 
x-pack/apm-server/sampling/groups.go | 17 +- x-pack/apm-server/sampling/groups_test.go | 13 +- x-pack/apm-server/sampling/processor.go | 94 ++---- .../sampling/processor_bench_test.go | 3 +- x-pack/apm-server/sampling/processor_test.go | 205 ++++------- 44 files changed, 1096 insertions(+), 1239 deletions(-) delete mode 100644 internal/beater/monitoringtest/monitoring.go delete mode 100644 internal/beater/otlp/common.go delete mode 100644 internal/processor/stream/result.go delete mode 100644 internal/processor/stream/result_test.go diff --git a/internal/agentcfg/elasticsearch.go b/internal/agentcfg/elasticsearch.go index 518646797ed..bf01ddf7114 100644 --- a/internal/agentcfg/elasticsearch.go +++ b/internal/agentcfg/elasticsearch.go @@ -29,11 +29,11 @@ import ( "github.com/pkg/errors" "go.elastic.co/apm/v2" + "go.opentelemetry.io/otel/metric" "github.com/elastic/apm-server/internal/elasticsearch" "github.com/elastic/apm-server/internal/logs" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" "github.com/elastic/go-elasticsearch/v8/esapi" ) @@ -71,14 +71,15 @@ type ElasticsearchFetcher struct { logger, rateLimitedLogger *logp.Logger - tracer *apm.Tracer - metrics fetcherMetrics -} + tracer *apm.Tracer -type fetcherMetrics struct { - fetchES, fetchFallback, fetchFallbackUnavailable, fetchInvalid, - cacheRefreshSuccesses, cacheRefreshFailures, - cacheEntriesCount atomic.Int64 + esCacheEntriesCount metric.Int64Gauge + esFetchCount metric.Int64Counter + esFetchFallbackCount metric.Int64Counter + esFetchUnavailableCount metric.Int64Counter + esFetchInvalidCount metric.Int64Counter + esCacheRefreshSuccesses metric.Int64Counter + esCacheRefreshFailures metric.Int64Counter } func NewElasticsearchFetcher( @@ -86,7 +87,18 @@ func NewElasticsearchFetcher( cacheDuration time.Duration, fetcher Fetcher, tracer *apm.Tracer, + mp metric.MeterProvider, ) *ElasticsearchFetcher { + meter := 
mp.Meter("github.com/elastic/apm-server/internal/agentcfg") + + esCacheEntriesCount, _ := meter.Int64Gauge("apm-server.agentcfg.elasticsearch.cache.entries.count") + esFetchCount, _ := meter.Int64Counter("apm-server.agentcfg.elasticsearch.fetch.es") + esFetchFallbackCount, _ := meter.Int64Counter("apm-server.agentcfg.elasticsearch.fetch.fallback") + esFetchUnavailableCount, _ := meter.Int64Counter("apm-server.agentcfg.elasticsearch.fetch.unavailable") + esFetchInvalidCount, _ := meter.Int64Counter("apm-server.agentcfg.elasticsearch.fetch.invalid") + esCacheRefreshSuccesses, _ := meter.Int64Counter("apm-server.agentcfg.elasticsearch.cache.refresh.successes") + esCacheRefreshFailures, _ := meter.Int64Counter("apm-server.agentcfg.elasticsearch.cache.refresh.failures") + logger := logp.NewLogger("agentcfg") return &ElasticsearchFetcher{ client: client, @@ -96,6 +108,14 @@ func NewElasticsearchFetcher( logger: logger, rateLimitedLogger: logger.WithOptions(logs.WithRateLimit(loggerRateLimit)), tracer: tracer, + + esCacheEntriesCount: esCacheEntriesCount, + esFetchCount: esFetchCount, + esFetchFallbackCount: esFetchFallbackCount, + esFetchUnavailableCount: esFetchUnavailableCount, + esFetchInvalidCount: esFetchInvalidCount, + esCacheRefreshSuccesses: esCacheRefreshSuccesses, + esCacheRefreshFailures: esCacheRefreshFailures, } } @@ -105,22 +125,22 @@ func (f *ElasticsearchFetcher) Fetch(ctx context.Context, query Query) (Result, // Happy path: serve fetch requests using an initialized cache. 
f.mu.RLock() defer f.mu.RUnlock() - f.metrics.fetchES.Add(1) + f.esFetchCount.Add(ctx, 1) return matchAgentConfig(query, f.cache), nil } if f.fallbackFetcher != nil { - f.metrics.fetchFallback.Add(1) + f.esFetchFallbackCount.Add(ctx, 1) return f.fallbackFetcher.Fetch(ctx, query) } if f.invalidESCfg.Load() { - f.metrics.fetchInvalid.Add(1) + f.esFetchInvalidCount.Add(ctx, 1) f.rateLimitedLogger.Errorf("rejecting fetch request: no valid elasticsearch config") return Result{}, errors.New(ErrNoValidElasticsearchConfig) } - f.metrics.fetchFallbackUnavailable.Add(1) + f.esFetchUnavailableCount.Add(ctx, 1) f.rateLimitedLogger.Warnf("rejecting fetch request: infrastructure is not ready") return Result{}, errors.New(ErrInfrastructureNotReady) } @@ -213,9 +233,9 @@ func (f *ElasticsearchFetcher) refreshCache(ctx context.Context) (err error) { defer func() { if err != nil { - f.metrics.cacheRefreshFailures.Add(1) + f.esCacheRefreshFailures.Add(ctx, 1) } else { - f.metrics.cacheRefreshSuccesses.Add(1) + f.esCacheRefreshSuccesses.Add(ctx, 1) } }() @@ -247,7 +267,7 @@ func (f *ElasticsearchFetcher) refreshCache(ctx context.Context) (err error) { f.cache = buffer f.mu.Unlock() f.cacheInitialized.Store(true) - f.metrics.cacheEntriesCount.Store(int64(len(f.cache))) + f.esCacheEntriesCount.Record(ctx, int64(len(f.cache))) f.last = time.Now() return nil } @@ -304,20 +324,3 @@ func (f *ElasticsearchFetcher) singlePageRefresh(ctx context.Context, scrollID s } return result, json.NewDecoder(resp.Body).Decode(&result) } - -// CollectMonitoring may be called to collect monitoring metrics from the -// fetcher. It is intended to be used with libbeat/monitoring.NewFunc. -// -// The metrics should be added to the "apm-server.agentcfg.elasticsearch" registry. 
-func (f *ElasticsearchFetcher) CollectMonitoring(_ monitoring.Mode, V monitoring.Visitor) { - V.OnRegistryStart() - defer V.OnRegistryFinished() - - monitoring.ReportInt(V, "cache.entries.count", f.metrics.cacheEntriesCount.Load()) - monitoring.ReportInt(V, "fetch.es", f.metrics.fetchES.Load()) - monitoring.ReportInt(V, "fetch.fallback", f.metrics.fetchFallback.Load()) - monitoring.ReportInt(V, "fetch.unavailable", f.metrics.fetchFallbackUnavailable.Load()) - monitoring.ReportInt(V, "fetch.invalid", f.metrics.fetchInvalid.Load()) - monitoring.ReportInt(V, "cache.refresh.successes", f.metrics.cacheRefreshSuccesses.Load()) - monitoring.ReportInt(V, "cache.refresh.failures", f.metrics.cacheRefreshFailures.Load()) -} diff --git a/internal/agentcfg/elasticsearch_test.go b/internal/agentcfg/elasticsearch_test.go index ea8391d1756..df86db50e26 100644 --- a/internal/agentcfg/elasticsearch_test.go +++ b/internal/agentcfg/elasticsearch_test.go @@ -30,6 +30,7 @@ import ( "github.com/stretchr/testify/require" "go.elastic.co/apm/v2" "go.elastic.co/apm/v2/apmtest" + "go.opentelemetry.io/otel/metric/noop" "github.com/elastic/apm-server/internal/elasticsearch" ) @@ -104,7 +105,7 @@ func newElasticsearchFetcher( w.WriteHeader(200) w.Write(b) i += searchSize - }), time.Second, nil, rt) + }), time.Second, nil, rt, noop.NewMeterProvider()) fetcher.searchSize = searchSize return fetcher } @@ -193,6 +194,7 @@ func TestFetchUseFallback(t *testing.T) { time.Second, fallbackFetcher, apmtest.NewRecordingTracer().Tracer, + noop.NewMeterProvider(), ) fetcher.refreshCache(context.Background()) @@ -208,6 +210,7 @@ func TestFetchNoFallbackInvalidESCfg(t *testing.T) { time.Second, nil, apmtest.NewRecordingTracer().Tracer, + noop.NewMeterProvider(), ) err := fetcher.refreshCache(context.Background()) @@ -224,6 +227,7 @@ func TestFetchNoFallback(t *testing.T) { time.Second, nil, apmtest.NewRecordingTracer().Tracer, + noop.NewMeterProvider(), ) err := fetcher.refreshCache(context.Background()) diff 
--git a/internal/beatcmd/beat.go b/internal/beatcmd/beat.go index b63d99d8f65..a2d6238c7c6 100644 --- a/internal/beatcmd/beat.go +++ b/internal/beatcmd/beat.go @@ -457,6 +457,225 @@ func (b *Beat) registerMetrics() { monitoring.NewBool(managementRegistry, "enabled").Set(b.Manager.Enabled()) } +<<<<<<< HEAD +======= +func (b *Beat) registerStatsMetrics() { + if b.Config.Output.Name() != "elasticsearch" { + return + } + + libbeatRegistry := monitoring.Default.GetRegistry("libbeat") + monitoring.NewFunc(libbeatRegistry, "output", func(_ monitoring.Mode, v monitoring.Visitor) { + var rm metricdata.ResourceMetrics + if err := b.metricReader.Collect(context.Background(), &rm); err != nil { + return + } + v.OnRegistryStart() + defer v.OnRegistryFinished() + monitoring.ReportString(v, "type", "elasticsearch") + for _, sm := range rm.ScopeMetrics { + switch { + case sm.Scope.Name == "github.com/elastic/go-docappender": + addDocappenderLibbeatOutputMetrics(context.Background(), v, sm) + } + } + }) + monitoring.NewFunc(libbeatRegistry, "pipeline", func(_ monitoring.Mode, v monitoring.Visitor) { + var rm metricdata.ResourceMetrics + if err := b.metricReader.Collect(context.Background(), &rm); err != nil { + return + } + v.OnRegistryStart() + defer v.OnRegistryFinished() + for _, sm := range rm.ScopeMetrics { + switch { + case sm.Scope.Name == "github.com/elastic/go-docappender": + addDocappenderLibbeatPipelineMetrics(context.Background(), v, sm) + } + } + }) + monitoring.NewFunc(monitoring.Default, "output.elasticsearch", func(_ monitoring.Mode, v monitoring.Visitor) { + var rm metricdata.ResourceMetrics + if err := b.metricReader.Collect(context.Background(), &rm); err != nil { + return + } + v.OnRegistryStart() + defer v.OnRegistryFinished() + for _, sm := range rm.ScopeMetrics { + switch { + case sm.Scope.Name == "github.com/elastic/go-docappender": + addDocappenderOutputElasticsearchMetrics(context.Background(), v, sm) + } + } + }) + monitoring.NewFunc(monitoring.Default, 
"apm-server", func(_ monitoring.Mode, v monitoring.Visitor) { + var rm metricdata.ResourceMetrics + if err := b.metricReader.Collect(context.Background(), &rm); err != nil { + return + } + v.OnRegistryStart() + defer v.OnRegistryFinished() + for _, sm := range rm.ScopeMetrics { + switch { + case strings.HasPrefix(sm.Scope.Name, "github.com/elastic/apm-server"): + // All simple scalar metrics that begin with the name "apm-server." + // in github.com/elastic/apm-server/... scopes are mapped directly. + addAPMServerMetrics(v, sm) + } + } + }) +} + +// getScalarInt64 returns a single-value, dimensionless +// gauge or counter integer value, or (0, false) if the +// data does not match these constraints. +func getScalarInt64(data metricdata.Aggregation) (int64, bool) { + switch data := data.(type) { + case metricdata.Sum[int64]: + if len(data.DataPoints) != 1 || data.DataPoints[0].Attributes.Len() != 0 { + break + } + return data.DataPoints[0].Value, true + case metricdata.Gauge[int64]: + if len(data.DataPoints) != 1 || data.DataPoints[0].Attributes.Len() != 0 { + break + } + return data.DataPoints[0].Value, true + } + return 0, false +} + +func addAPMServerMetrics(v monitoring.Visitor, sm metricdata.ScopeMetrics) { + for _, m := range sm.Metrics { + if suffix, ok := strings.CutPrefix(m.Name, "apm-server."); ok { + if value, ok := getScalarInt64(m.Data); ok { + keys := strings.Split(suffix, ".") + for i := 0; i < len(keys)-1; i++ { + v.OnRegistryStart() + v.OnKey(keys[i]) + } + monitoring.ReportInt(v, keys[len(keys)-1], value) + for i := 0; i < len(keys)-1; i++ { + v.OnRegistryFinished() + } + } + } + } +} + +// Adapt go-docappender's OTel metrics to beats stack monitoring metrics, +// with a mixture of libbeat-specific and apm-server specific metric names. 
+func addDocappenderLibbeatOutputMetrics(ctx context.Context, v monitoring.Visitor, sm metricdata.ScopeMetrics) { + var writeBytes int64 + + v.OnRegistryStart() + v.OnKey("events") + for _, m := range sm.Metrics { + switch m.Name { + case "elasticsearch.events.processed": + var acked, toomany, failed int64 + data, _ := m.Data.(metricdata.Sum[int64]) + for _, dp := range data.DataPoints { + status, ok := dp.Attributes.Value(attribute.Key("status")) + if !ok { + continue + } + switch status.AsString() { + case "Success": + acked += dp.Value + case "TooMany": + toomany += dp.Value + fallthrough + default: + failed += dp.Value + } + } + monitoring.ReportInt(v, "acked", acked) + monitoring.ReportInt(v, "failed", failed) + monitoring.ReportInt(v, "toomany", toomany) + case "elasticsearch.events.count": + if value, ok := getScalarInt64(m.Data); ok { + monitoring.ReportInt(v, "total", value) + } + case "elasticsearch.events.queued": + if value, ok := getScalarInt64(m.Data); ok { + monitoring.ReportInt(v, "active", value) + } + case "elasticsearch.flushed.bytes": + if value, ok := getScalarInt64(m.Data); ok { + writeBytes = value + } + case "elasticsearch.bulk_requests.count": + if value, ok := getScalarInt64(m.Data); ok { + monitoring.ReportInt(v, "batches", value) + } + } + } + v.OnRegistryFinished() + + if writeBytes > 0 { + v.OnRegistryStart() + v.OnKey("write") + monitoring.ReportInt(v, "bytes", writeBytes) + v.OnRegistryFinished() + } +} + +func addDocappenderLibbeatPipelineMetrics(ctx context.Context, v monitoring.Visitor, sm metricdata.ScopeMetrics) { + v.OnRegistryStart() + defer v.OnRegistryFinished() + v.OnKey("events") + + for _, m := range sm.Metrics { + switch m.Name { + case "elasticsearch.events.count": + if value, ok := getScalarInt64(m.Data); ok { + monitoring.ReportInt(v, "total", value) + } + } + } +} + +// Add non-libbeat Elasticsearch output metrics under "output.elasticsearch". 
+func addDocappenderOutputElasticsearchMetrics(ctx context.Context, v monitoring.Visitor, sm metricdata.ScopeMetrics) { + var bulkRequestsCount, bulkRequestsAvailable int64 + var indexersCreated, indexersDestroyed int64 + for _, m := range sm.Metrics { + switch m.Name { + case "elasticsearch.bulk_requests.count": + if value, ok := getScalarInt64(m.Data); ok { + bulkRequestsCount = value + } + case "elasticsearch.bulk_requests.available": + if value, ok := getScalarInt64(m.Data); ok { + bulkRequestsAvailable = value + } + case "elasticsearch.indexer.created": + if value, ok := getScalarInt64(m.Data); ok { + indexersCreated = value + } + case "elasticsearch.indexer.destroyed": + if value, ok := getScalarInt64(m.Data); ok { + indexersDestroyed = value + } + } + } + + v.OnRegistryStart() + v.OnKey("bulk_requests") + monitoring.ReportInt(v, "completed", bulkRequestsCount) + monitoring.ReportInt(v, "available", bulkRequestsAvailable) + v.OnRegistryFinished() + + v.OnRegistryStart() + v.OnKey("indexers") + monitoring.ReportInt(v, "created", indexersCreated) + monitoring.ReportInt(v, "destroyed", indexersDestroyed) + monitoring.ReportInt(v, "active", indexersCreated-indexersDestroyed+1) + v.OnRegistryFinished() +} + +>>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) // registerElasticsearchVerfication registers a global callback to make sure // the Elasticsearch instance we are connecting to has a valid license, and is // at least on the same version as APM Server. diff --git a/internal/beatcmd/reloader.go b/internal/beatcmd/reloader.go index 27da683e3dc..d2bd135c527 100644 --- a/internal/beatcmd/reloader.go +++ b/internal/beatcmd/reloader.go @@ -233,9 +233,11 @@ func (r *Reloader) reload(inputConfig, outputConfig, apmTracingConfig *config.C) // allow the runner to perform initialisations that must run // synchronously. 
newRunner, err := r.newRunner(RunnerParams{ - Config: mergedConfig, - Info: r.info, - Logger: r.logger, + Config: mergedConfig, + Info: r.info, + Logger: r.logger, + MeterProvider: r.meterProvider, + MetricsGatherer: r.metricGatherer, }) if err != nil { return err diff --git a/internal/beater/api/config/agent/handler.go b/internal/beater/api/config/agent/handler.go index 1967123a93d..8faddc6a3b3 100644 --- a/internal/beater/api/config/agent/handler.go +++ b/internal/beater/api/config/agent/handler.go @@ -26,8 +26,6 @@ import ( "github.com/pkg/errors" - "github.com/elastic/elastic-agent-libs/monitoring" - "github.com/elastic/apm-server/internal/agentcfg" "github.com/elastic/apm-server/internal/beater/auth" "github.com/elastic/apm-server/internal/beater/headers" @@ -43,10 +41,6 @@ const ( ) var ( - // MonitoringMap holds a mapping for request.IDs to monitoring counters - MonitoringMap = request.DefaultMonitoringMapForRegistry(registry) - registry = monitoring.Default.NewRegistry("apm-server.acm") - errCacheControl = fmt.Sprintf("max-age=%v, must-revalidate", errMaxAgeDuration.Seconds()) ) diff --git a/internal/beater/api/intake/handler.go b/internal/beater/api/intake/handler.go index 4c64d5af54b..edfdcff660c 100644 --- a/internal/beater/api/intake/handler.go +++ b/internal/beater/api/intake/handler.go @@ -18,12 +18,13 @@ package intake import ( + "context" "errors" "fmt" "net/http" "strings" - "github.com/elastic/elastic-agent-libs/monitoring" + "go.opentelemetry.io/otel/metric" "github.com/elastic/apm-data/input/elasticapm" "github.com/elastic/apm-data/model/modelpb" @@ -39,15 +40,6 @@ const ( ) var ( - // MonitoringMap holds a mapping for request.IDs to monitoring counters - MonitoringMap = request.DefaultMonitoringMapForRegistry(registry) - registry = monitoring.Default.NewRegistry("apm-server.server") - - streamRegistry = monitoring.Default.NewRegistry("apm-server.processor.stream") - eventsAccepted = monitoring.NewInt(streamRegistry, "accepted") - eventsInvalid 
= monitoring.NewInt(streamRegistry, "errors.invalid") - eventsTooLarge = monitoring.NewInt(streamRegistry, "errors.toolarge") - errMethodNotAllowed = errors.New("only POST requests are supported") errServerShuttingDown = errors.New("server is shutting down") errInvalidContentType = errors.New("invalid content type") @@ -59,7 +51,12 @@ var ( type RequestMetadataFunc func(*request.Context) *modelpb.APMEvent // Handler returns a request.Handler for managing intake requests for backend and rum events. -func Handler(handler elasticapm.StreamHandler, requestMetadataFunc RequestMetadataFunc, batchProcessor modelpb.BatchProcessor) request.Handler { +func Handler(mp metric.MeterProvider, handler elasticapm.StreamHandler, requestMetadataFunc RequestMetadataFunc, batchProcessor modelpb.BatchProcessor) request.Handler { + meter := mp.Meter("github.com/elastic/apm-server/internal/beater/api/intake") + eventsAccepted, _ := meter.Int64Counter("apm-server.processor.stream.accepted") + eventsInvalid, _ := meter.Int64Counter("apm-server.processor.stream.errors.invalid") + eventsTooLarge, _ := meter.Int64Counter("apm-server.processor.stream.errors.toolarge") + return func(c *request.Context) { if err := validateRequest(c); err != nil { writeError(c, err) @@ -82,9 +79,9 @@ func Handler(handler elasticapm.StreamHandler, requestMetadataFunc RequestMetada batchProcessor, &result, ) - eventsAccepted.Add(int64(result.Accepted)) - eventsInvalid.Add(int64(result.Invalid)) - eventsTooLarge.Add(int64(result.TooLarge)) + eventsAccepted.Add(context.Background(), int64(result.Accepted)) + eventsInvalid.Add(context.Background(), int64(result.Invalid)) + eventsTooLarge.Add(context.Background(), int64(result.TooLarge)) writeStreamResult(c, result, err) } } diff --git a/internal/beater/api/intake/handler_test.go b/internal/beater/api/intake/handler_test.go index 4c52df49e83..3622ff9f6d0 100644 --- a/internal/beater/api/intake/handler_test.go +++ b/internal/beater/api/intake/handler_test.go @@ -32,12 
+32,16 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/metric/noop" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" "golang.org/x/sync/semaphore" "github.com/elastic/apm-data/input/elasticapm" "github.com/elastic/apm-data/model/modelpb" "github.com/elastic/apm-server/internal/beater/config" "github.com/elastic/apm-server/internal/beater/headers" + "github.com/elastic/apm-server/internal/beater/monitoringtest" "github.com/elastic/apm-server/internal/beater/request" "github.com/elastic/apm-server/internal/model/modelprocessor" "github.com/elastic/apm-server/internal/publish" @@ -141,7 +145,7 @@ func TestIntakeHandler(t *testing.T) { tc.setup(t) // call handler - h := Handler(tc.processor, emptyRequestMetadata, tc.batchProcessor) + h := Handler(noop.NewMeterProvider(), tc.processor, emptyRequestMetadata, tc.batchProcessor) h(tc.c) require.Equal(t, string(tc.id), string(tc.c.Result.ID)) @@ -163,10 +167,6 @@ func TestIntakeHandler(t *testing.T) { } func TestIntakeHandlerMonitoring(t *testing.T) { - eventsAccepted.Set(1) - eventsInvalid.Set(2) - eventsTooLarge.Set(3) - streamHandler := streamHandlerFunc(func( ctx context.Context, base *modelpb.APMEvent, @@ -181,15 +181,24 @@ func TestIntakeHandlerMonitoring(t *testing.T) { return errors.New("something bad happened at the end") }) - h := Handler(streamHandler, emptyRequestMetadata, modelprocessor.Nop{}) + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + + h := Handler(mp, streamHandler, emptyRequestMetadata, modelprocessor.Nop{}) req := httptest.NewRequest("POST", "/", nil) c := request.NewContext() c.Reset(httptest.NewRecorder(), req) h(c) - assert.Equal(t, int64(11), eventsAccepted.Get()) - assert.Equal(t, 
int64(102), eventsInvalid.Get()) - assert.Equal(t, int64(1003), eventsTooLarge.Get()) + monitoringtest.ExpectOtelMetrics(t, reader, map[string]any{ + "apm-server.processor.stream.accepted": 10, + "apm-server.processor.stream.errors.invalid": 100, + "apm-server.processor.stream.errors.toolarge": 1000, + }) } func TestIntakeHandlerContentType(t *testing.T) { @@ -206,7 +215,7 @@ func TestIntakeHandlerContentType(t *testing.T) { } tc.setup(t) - h := Handler(tc.processor, emptyRequestMetadata, tc.batchProcessor) + h := Handler(noop.NewMeterProvider(), tc.processor, emptyRequestMetadata, tc.batchProcessor) h(tc.c) assert.Equal(t, tc.code, tc.w.Code, tc.c.Result.Err) } diff --git a/internal/beater/api/mux.go b/internal/beater/api/mux.go index 639b7ead0a9..712bf933a70 100644 --- a/internal/beater/api/mux.go +++ b/internal/beater/api/mux.go @@ -25,10 +25,10 @@ import ( "github.com/gorilla/mux" "github.com/pkg/errors" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" "github.com/elastic/apm-data/input" "github.com/elastic/apm-data/input/elasticapm" @@ -89,6 +89,7 @@ func NewMux( sourcemapFetcher sourcemap.Fetcher, publishReady func() bool, semaphore input.Semaphore, + meterProvider metric.MeterProvider, ) (*mux.Router, error) { pool := request.NewContextPool() logger := logp.NewLogger(logs.Handler) @@ -116,18 +117,18 @@ func NewMux( handlerFn func() (request.Handler, error) } - otlpHandlers := otlp.NewHTTPHandlers(zapLogger, batchProcessor, semaphore) - rumIntakeHandler := builder.rumIntakeHandler() + otlpHandlers := otlp.NewHTTPHandlers(zapLogger, batchProcessor, semaphore, meterProvider) + rumIntakeHandler := builder.rumIntakeHandler(meterProvider) routeMap := []route{ - {RootPath, builder.rootHandler(publishReady)}, - {AgentConfigPath, builder.backendAgentConfigHandler(fetcher)}, - {AgentConfigRUMPath, builder.rumAgentConfigHandler(fetcher)}, + {RootPath, 
builder.rootHandler(publishReady, meterProvider)}, + {AgentConfigPath, builder.backendAgentConfigHandler(fetcher, meterProvider)}, + {AgentConfigRUMPath, builder.rumAgentConfigHandler(fetcher, meterProvider)}, {IntakeRUMPath, rumIntakeHandler}, {IntakeRUMV3Path, rumIntakeHandler}, - {IntakePath, builder.backendIntakeHandler}, - {OTLPTracesIntakePath, builder.otlpHandler(otlpHandlers.HandleTraces, otlp.HTTPTracesMonitoringMap)}, - {OTLPMetricsIntakePath, builder.otlpHandler(otlpHandlers.HandleMetrics, otlp.HTTPMetricsMonitoringMap)}, - {OTLPLogsIntakePath, builder.otlpHandler(otlpHandlers.HandleLogs, otlp.HTTPLogsMonitoringMap)}, + {IntakePath, builder.backendIntakeHandler(meterProvider)}, + {OTLPTracesIntakePath, builder.otlpHandler(otlpHandlers.HandleTraces, "apm-server.otlp.http.traces.", meterProvider)}, + {OTLPMetricsIntakePath, builder.otlpHandler(otlpHandlers.HandleMetrics, "apm-server.otlp.http.metrics.", meterProvider)}, + {OTLPLogsIntakePath, builder.otlpHandler(otlpHandlers.HandleLogs, "apm-server.otlp.http.logs.", meterProvider)}, } for _, route := range routeMap { @@ -170,21 +171,23 @@ type routeBuilder struct { intakeSemaphore input.Semaphore } -func (r *routeBuilder) backendIntakeHandler() (request.Handler, error) { - h := intake.Handler(r.intakeProcessor, backendRequestMetadataFunc(r.cfg), r.batchProcessor) - return middleware.Wrap(h, backendMiddleware(r.cfg, r.authenticator, r.ratelimitStore, intake.MonitoringMap)...) +func (r *routeBuilder) backendIntakeHandler(mp metric.MeterProvider) func() (request.Handler, error) { + return func() (request.Handler, error) { + h := intake.Handler(mp, r.intakeProcessor, backendRequestMetadataFunc(r.cfg), r.batchProcessor) + return middleware.Wrap(h, backendMiddleware(r.cfg, r.authenticator, r.ratelimitStore, "apm-server.server.", mp)...) 
+ } } -func (r *routeBuilder) otlpHandler(handler http.HandlerFunc, monitoringMap map[request.ResultID]*monitoring.Int) func() (request.Handler, error) { +func (r *routeBuilder) otlpHandler(handler http.HandlerFunc, metricsPrefix string, mp metric.MeterProvider) func() (request.Handler, error) { return func() (request.Handler, error) { h := func(c *request.Context) { handler(c.ResponseWriter, c.Request) } - return middleware.Wrap(h, backendMiddleware(r.cfg, r.authenticator, r.ratelimitStore, monitoringMap)...) + return middleware.Wrap(h, backendMiddleware(r.cfg, r.authenticator, r.ratelimitStore, metricsPrefix, mp)...) } } -func (r *routeBuilder) rumIntakeHandler() func() (request.Handler, error) { +func (r *routeBuilder) rumIntakeHandler(mp metric.MeterProvider) func() (request.Handler, error) { return func() (request.Handler, error) { var batchProcessors modelprocessor.Chained // The order of these processors is important. Source mapping must happen before identifying library frames, or @@ -214,34 +217,34 @@ func (r *routeBuilder) rumIntakeHandler() func() (request.Handler, error) { batchProcessors = append(batchProcessors, modelprocessor.SetCulprit{}) } batchProcessors = append(batchProcessors, r.batchProcessor) // r.batchProcessor always goes last - h := intake.Handler(r.intakeProcessor, rumRequestMetadataFunc(r.cfg), batchProcessors) - return middleware.Wrap(h, rumMiddleware(r.cfg, r.authenticator, r.ratelimitStore, intake.MonitoringMap)...) + h := intake.Handler(mp, r.intakeProcessor, rumRequestMetadataFunc(r.cfg), batchProcessors) + return middleware.Wrap(h, rumMiddleware(r.cfg, r.authenticator, r.ratelimitStore, "apm-server.server.", mp)...) 
} } -func (r *routeBuilder) rootHandler(publishReady func() bool) func() (request.Handler, error) { +func (r *routeBuilder) rootHandler(publishReady func() bool, mp metric.MeterProvider) func() (request.Handler, error) { return func() (request.Handler, error) { h := root.Handler(root.HandlerConfig{ Version: version.VersionWithQualifier(), PublishReady: publishReady, }) - return middleware.Wrap(h, rootMiddleware(r.cfg, r.authenticator)...) + return middleware.Wrap(h, rootMiddleware(r.cfg, r.authenticator, mp)...) } } -func (r *routeBuilder) backendAgentConfigHandler(f agentcfg.Fetcher) func() (request.Handler, error) { +func (r *routeBuilder) backendAgentConfigHandler(f agentcfg.Fetcher, mp metric.MeterProvider) func() (request.Handler, error) { return func() (request.Handler, error) { - return agentConfigHandler(r.cfg, r.authenticator, r.ratelimitStore, backendMiddleware, f) + return agentConfigHandler(r.cfg, r.authenticator, r.ratelimitStore, backendMiddleware, f, mp) } } -func (r *routeBuilder) rumAgentConfigHandler(f agentcfg.Fetcher) func() (request.Handler, error) { +func (r *routeBuilder) rumAgentConfigHandler(f agentcfg.Fetcher, mp metric.MeterProvider) func() (request.Handler, error) { return func() (request.Handler, error) { - return agentConfigHandler(r.cfg, r.authenticator, r.ratelimitStore, rumMiddleware, f) + return agentConfigHandler(r.cfg, r.authenticator, r.ratelimitStore, rumMiddleware, f, mp) } } -type middlewareFunc func(*config.Config, *auth.Authenticator, *ratelimit.Store, map[request.ResultID]*monitoring.Int) []middleware.Middleware +type middlewareFunc func(*config.Config, *auth.Authenticator, *ratelimit.Store, string, metric.MeterProvider) []middleware.Middleware func agentConfigHandler( cfg *config.Config, @@ -249,22 +252,23 @@ func agentConfigHandler( ratelimitStore *ratelimit.Store, middlewareFunc middlewareFunc, f agentcfg.Fetcher, + mp metric.MeterProvider, ) (request.Handler, error) { - mw := middlewareFunc(cfg, authenticator, 
ratelimitStore, agent.MonitoringMap) + mw := middlewareFunc(cfg, authenticator, ratelimitStore, "apm-server.acm.", mp) h := agent.NewHandler(f, cfg.AgentConfig.Cache.Expiration, cfg.DefaultServiceEnvironment, cfg.AgentAuth.Anonymous.AllowAgent) return middleware.Wrap(h, mw...) } -func apmMiddleware(m map[request.ResultID]*monitoring.Int) []middleware.Middleware { +func apmMiddleware(mp metric.MeterProvider, metricsPrefix string) []middleware.Middleware { return []middleware.Middleware{ middleware.LogMiddleware(), middleware.RecoverPanicMiddleware(), - middleware.MonitoringMiddleware(m, nil), + middleware.MonitoringMiddleware(metricsPrefix, mp), } } -func backendMiddleware(cfg *config.Config, authenticator *auth.Authenticator, ratelimitStore *ratelimit.Store, m map[request.ResultID]*monitoring.Int) []middleware.Middleware { - backendMiddleware := append(apmMiddleware(m), +func backendMiddleware(cfg *config.Config, authenticator *auth.Authenticator, ratelimitStore *ratelimit.Store, metricsPrefix string, mp metric.MeterProvider) []middleware.Middleware { + backendMiddleware := append(apmMiddleware(mp, metricsPrefix), middleware.ResponseHeadersMiddleware(cfg.ResponseHeaders), middleware.AuthMiddleware(authenticator, true), middleware.AnonymousRateLimitMiddleware(ratelimitStore), @@ -272,11 +276,11 @@ func backendMiddleware(cfg *config.Config, authenticator *auth.Authenticator, ra return backendMiddleware } -func rumMiddleware(cfg *config.Config, authenticator *auth.Authenticator, ratelimitStore *ratelimit.Store, m map[request.ResultID]*monitoring.Int) []middleware.Middleware { +func rumMiddleware(cfg *config.Config, authenticator *auth.Authenticator, ratelimitStore *ratelimit.Store, metricsPrefix string, mp metric.MeterProvider) []middleware.Middleware { msg := "RUM endpoint is disabled. " + "Configure the `apm-server.rum` section in apm-server.yml to enable ingestion of RUM events. " + "If you are not using the RUM agent, you can safely ignore this error." 
- rumMiddleware := append(apmMiddleware(m), + rumMiddleware := append(apmMiddleware(mp, metricsPrefix), middleware.ResponseHeadersMiddleware(cfg.ResponseHeaders), middleware.ResponseHeadersMiddleware(cfg.RumConfig.ResponseHeaders), middleware.CORSMiddleware(cfg.RumConfig.AllowOrigins, cfg.RumConfig.AllowHeaders), @@ -286,8 +290,8 @@ func rumMiddleware(cfg *config.Config, authenticator *auth.Authenticator, rateli return append(rumMiddleware, middleware.KillSwitchMiddleware(cfg.RumConfig.Enabled, msg)) } -func rootMiddleware(cfg *config.Config, authenticator *auth.Authenticator) []middleware.Middleware { - return append(apmMiddleware(root.MonitoringMap), +func rootMiddleware(cfg *config.Config, authenticator *auth.Authenticator, mp metric.MeterProvider) []middleware.Middleware { + return append(apmMiddleware(mp, "apm-server.root."), middleware.ResponseHeadersMiddleware(cfg.ResponseHeaders), middleware.AuthMiddleware(authenticator, false), ) diff --git a/internal/beater/api/mux_config_agent_test.go b/internal/beater/api/mux_config_agent_test.go index 9adfe60661d..3a947c51b7b 100644 --- a/internal/beater/api/mux_config_agent_test.go +++ b/internal/beater/api/mux_config_agent_test.go @@ -24,7 +24,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/elastic/apm-server/internal/beater/api/config/agent" "github.com/elastic/apm-server/internal/beater/config" "github.com/elastic/apm-server/internal/beater/headers" "github.com/elastic/apm-server/internal/beater/request" @@ -61,11 +60,11 @@ func TestConfigAgentHandler_PanicMiddleware(t *testing.T) { } func TestConfigAgentHandler_MonitoringMiddleware(t *testing.T) { - testMonitoringMiddleware(t, "/config/v1/agents", agent.MonitoringMap, map[request.ResultID]int{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsInvalidQuery: 1, + testMonitoringMiddleware(t, "/config/v1/agents", map[string]any{ + 
"http.server." + string(request.IDRequestCount): 1, + "http.server." + string(request.IDResponseCount): 1, + "http.server." + string(request.IDResponseErrorsCount): 1, + "http.server." + string(request.IDResponseErrorsInvalidQuery): 1, }) } diff --git a/internal/beater/api/mux_intake_backend_test.go b/internal/beater/api/mux_intake_backend_test.go index 1707582fa06..a2329bfa49a 100644 --- a/internal/beater/api/mux_intake_backend_test.go +++ b/internal/beater/api/mux_intake_backend_test.go @@ -25,7 +25,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/elastic/apm-server/internal/beater/api/intake" "github.com/elastic/apm-server/internal/beater/config" "github.com/elastic/apm-server/internal/beater/headers" "github.com/elastic/apm-server/internal/beater/request" @@ -66,11 +65,11 @@ func TestIntakeBackendHandler_PanicMiddleware(t *testing.T) { func TestIntakeBackendHandler_MonitoringMiddleware(t *testing.T) { // send GET request resulting in 405 MethodNotAllowed error - testMonitoringMiddleware(t, "/intake/v2/events", intake.MonitoringMap, map[request.ResultID]int{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsMethodNotAllowed: 1, + testMonitoringMiddleware(t, "/intake/v2/events", map[string]any{ + "http.server." + string(request.IDRequestCount): 1, + "http.server." + string(request.IDResponseCount): 1, + "http.server." + string(request.IDResponseErrorsCount): 1, + "http.server." 
+ string(request.IDResponseErrorsMethodNotAllowed): 1, }) } diff --git a/internal/beater/api/mux_intake_rum_test.go b/internal/beater/api/mux_intake_rum_test.go index f50f82d915a..977aea40ae9 100644 --- a/internal/beater/api/mux_intake_rum_test.go +++ b/internal/beater/api/mux_intake_rum_test.go @@ -25,8 +25,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/metric/noop" - "github.com/elastic/apm-server/internal/beater/api/intake" "github.com/elastic/apm-server/internal/beater/auth" "github.com/elastic/apm-server/internal/beater/config" "github.com/elastic/apm-server/internal/beater/headers" @@ -49,7 +49,7 @@ func TestOPTIONS(t *testing.T) { requestTaken <- struct{}{} <-done }, - rumMiddleware(cfg, authenticator, ratelimitStore, intake.MonitoringMap)...) + rumMiddleware(cfg, authenticator, ratelimitStore, "", noop.NewMeterProvider())...) // use this to block the single allowed concurrent requests go func() { @@ -99,7 +99,7 @@ func TestRUMHandler_KillSwitchMiddleware(t *testing.T) { func TestRUMHandler_CORSMiddleware(t *testing.T) { cfg := cfgEnabledRUM() cfg.RumConfig.AllowOrigins = []string{"foo"} - h := newTestMux(t, cfg) + h, _ := newTestMux(t, cfg) for _, path := range []string{"/intake/v2/rum/events", "/intake/v3/rum/events"} { req := httptest.NewRequest(http.MethodPost, path, nil) @@ -118,11 +118,11 @@ func TestIntakeRUMHandler_PanicMiddleware(t *testing.T) { func TestRumHandler_MonitoringMiddleware(t *testing.T) { // send GET request resulting in 403 Forbidden error for _, path := range []string{"/intake/v2/rum/events", "/intake/v3/rum/events"} { - testMonitoringMiddleware(t, path, intake.MonitoringMap, map[request.ResultID]int{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsForbidden: 1, + testMonitoringMiddleware(t, path, map[string]any{ + "http.server." + string(request.IDRequestCount): 1, + "http.server." 
+ string(request.IDResponseCount): 1, + "http.server." + string(request.IDResponseErrorsCount): 1, + "http.server." + string(request.IDResponseErrorsForbidden): 1, }) } } diff --git a/internal/beater/api/mux_root_test.go b/internal/beater/api/mux_root_test.go index b31f58b0c07..2b2af7dcc47 100644 --- a/internal/beater/api/mux_root_test.go +++ b/internal/beater/api/mux_root_test.go @@ -25,7 +25,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/elastic/apm-server/internal/beater/api/root" "github.com/elastic/apm-server/internal/beater/config" "github.com/elastic/apm-server/internal/beater/headers" "github.com/elastic/apm-server/internal/beater/request" @@ -64,10 +63,10 @@ func TestRootHandler_PanicMiddleware(t *testing.T) { } func TestRootHandler_MonitoringMiddleware(t *testing.T) { - testMonitoringMiddleware(t, "/", root.MonitoringMap, map[request.ResultID]int{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 1, - request.IDResponseValidOK: 1, + testMonitoringMiddleware(t, "/", map[string]any{ + "http.server." + string(request.IDRequestCount): 1, + "http.server." + string(request.IDResponseCount): 1, + "http.server." + string(request.IDResponseValidCount): 1, + "http.server." 
+ string(request.IDResponseValidOK): 1, }) } diff --git a/internal/beater/api/mux_test.go b/internal/beater/api/mux_test.go index ce0ca1d8f90..171a443dcdb 100644 --- a/internal/beater/api/mux_test.go +++ b/internal/beater/api/mux_test.go @@ -29,6 +29,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" "golang.org/x/sync/semaphore" "github.com/elastic/apm-data/model/modelpb" @@ -39,7 +41,6 @@ import ( "github.com/elastic/apm-server/internal/beater/ratelimit" "github.com/elastic/apm-server/internal/beater/request" "github.com/elastic/apm-server/internal/sourcemap" - "github.com/elastic/elastic-agent-libs/monitoring" ) func TestBackendRequestMetadata(t *testing.T) { @@ -113,7 +114,7 @@ func requestToMuxerWithHeaderAndQueryString( } func requestToMuxer(cfg *config.Config, r *http.Request) (*httptest.ResponseRecorder, error) { - mux, err := muxBuilder{}.build(cfg) + _, mux, err := muxBuilder{}.build(cfg) if err != nil { return nil, err } @@ -123,7 +124,7 @@ func requestToMuxer(cfg *config.Config, r *http.Request) (*httptest.ResponseReco } func testPanicMiddleware(t *testing.T, urlPath string) { - h := newTestMux(t, config.DefaultConfig()) + h, _ := newTestMux(t, config.DefaultConfig()) req := httptest.NewRequest(http.MethodGet, urlPath, nil) var rec WriterPanicOnce @@ -133,21 +134,18 @@ func testPanicMiddleware(t *testing.T, urlPath string) { assert.JSONEq(t, `{"error":"panic handling request"}`, rec.Body.String()) } -func testMonitoringMiddleware(t *testing.T, urlPath string, monitoringMap map[request.ResultID]*monitoring.Int, expected map[request.ResultID]int) { - monitoringtest.ClearRegistry(monitoringMap) - - h := newTestMux(t, config.DefaultConfig()) +func testMonitoringMiddleware(t *testing.T, urlPath string, expectedMetrics map[string]any) { + h, reader := newTestMux(t, config.DefaultConfig()) req := 
httptest.NewRequest(http.MethodGet, urlPath, nil) h.ServeHTTP(httptest.NewRecorder(), req) - equal, result := monitoringtest.CompareMonitoringInt(expected, monitoringMap) - assert.True(t, equal, result) + monitoringtest.ExpectContainOtelMetrics(t, reader, expectedMetrics) } -func newTestMux(t *testing.T, cfg *config.Config) http.Handler { - mux, err := muxBuilder{}.build(cfg) +func newTestMux(t *testing.T, cfg *config.Config) (http.Handler, sdkmetric.Reader) { + reader, mux, err := muxBuilder{}.build(cfg) require.NoError(t, err) - return mux + return mux, reader } type muxBuilder struct { @@ -155,11 +153,18 @@ type muxBuilder struct { Managed bool } -func (m muxBuilder) build(cfg *config.Config) (http.Handler, error) { +func (m muxBuilder) build(cfg *config.Config) (sdkmetric.Reader, http.Handler, error) { + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + nopBatchProcessor := modelpb.ProcessBatchFunc(func(context.Context, *modelpb.Batch) error { return nil }) ratelimitStore, _ := ratelimit.NewStore(1000, 1000, 1000) authenticator, _ := auth.NewAuthenticator(cfg.AgentAuth) - return NewMux( + r, err := NewMux( cfg, nopBatchProcessor, authenticator, @@ -168,7 +173,9 @@ func (m muxBuilder) build(cfg *config.Config) (http.Handler, error) { m.SourcemapFetcher, func() bool { return true }, semaphore.NewWeighted(1), + mp, ) + return reader, r, err } // WriterPanicOnce implements the http.ResponseWriter interface diff --git a/internal/beater/api/root/handler.go b/internal/beater/api/root/handler.go index ad12cf8ea43..2ef21a67bc4 100644 --- a/internal/beater/api/root/handler.go +++ b/internal/beater/api/root/handler.go @@ -22,18 +22,11 @@ import ( "github.com/elastic/beats/v7/libbeat/version" "github.com/elastic/elastic-agent-libs/mapstr" - 
"github.com/elastic/elastic-agent-libs/monitoring" "github.com/elastic/apm-server/internal/beater/auth" "github.com/elastic/apm-server/internal/beater/request" ) -var ( - // MonitoringMap holds a mapping for request.IDs to monitoring counters - MonitoringMap = request.DefaultMonitoringMapForRegistry(registry) - registry = monitoring.Default.NewRegistry("apm-server.root") -) - // HandlerConfig holds configuration for Handler. type HandlerConfig struct { // PublishReady reports whether or not the server is ready for publishing events. diff --git a/internal/beater/beater.go b/internal/beater/beater.go index fe779bee8fe..c7c826f59aa 100644 --- a/internal/beater/beater.go +++ b/internal/beater/beater.go @@ -73,11 +73,6 @@ import ( "github.com/elastic/apm-server/internal/version" ) -var ( - monitoringRegistry = monitoring.Default.NewRegistry("apm-server.sampling") - transactionsDroppedCounter = monitoring.NewInt(monitoringRegistry, "transactions_dropped") -) - // Runner initialises and runs and orchestrates the APM Server // HTTP and gRPC servers, event processing pipeline, and output. type Runner struct { @@ -157,6 +152,7 @@ func NewRunner(args RunnerParams) (*Runner, error) { func (s *Runner) Run(ctx context.Context) error { defer s.listener.Close() g, ctx := errgroup.WithContext(ctx) + meter := s.meterProvider.Meter("github.com/elastic/apm-server/internal/beater") // backgroundContext is a context to use in operations that should // block until shutdown, and will be cancelled after the shutdown @@ -384,13 +380,17 @@ func (s *Runner) Run(ctx context.Context) error { if err != nil { return err } + transactionsDroppedCounter, err := meter.Int64Counter("apm-server.sampling.transactions_dropped") + if err != nil { + return err + } batchProcessor := srvmodelprocessor.NewTracer("beater.ProcessBatch", modelprocessor.Chained{ // Ensure all events have observer.*, ecs.*, and data_stream.* fields added, // and are counted in metrics. 
This is done in the final processors to ensure // aggregated metrics are also processed. newObserverBatchProcessor(), &modelprocessor.SetDataStream{Namespace: s.config.DataStreams.Namespace}, - srvmodelprocessor.NewEventCounter(monitoring.Default.GetRegistry("apm-server")), + srvmodelprocessor.NewEventCounter(s.meterProvider), // The server always drops non-RUM unsampled transactions. We store RUM unsampled // transactions as they are needed by the User Experience app, which performs @@ -399,7 +399,7 @@ func (s *Runner) Run(ctx context.Context) error { // It is important that this is done just before calling the publisher to // avoid affecting aggregations. modelprocessor.NewDropUnsampled(false /* don't drop RUM unsampled transactions*/, func(i int64) { - transactionsDroppedCounter.Add(i) + transactionsDroppedCounter.Add(context.Background(), i) }), finalBatchProcessor, }) @@ -410,7 +410,8 @@ func (s *Runner) Run(ctx context.Context) error { kibanaClient, newElasticsearchClient, tracer, s.logger, + s.meterProvider, ) if err != nil { return err } @@ -436,6 +440,7 @@ func (s *Runner) Run(ctx context.Context) error { Namespace: s.config.DataStreams.Namespace, Logger: s.logger, Tracer: tracer, + MeterProvider: s.meterProvider, Authenticator: authenticator, RateLimitStore: ratelimitStore, BatchProcessor: batchProcessor, @@ -490,7 +495,7 @@ func (s *Runner) Run(ctx context.Context) error { return runServer(ctx, serverParams) }) if tracerServerListener != nil { - tracerServer, err := newTracerServer(s.config, tracerServerListener, s.logger, serverParams.BatchProcessor, serverParams.Semaphore) + tracerServer, err := newTracerServer(s.config, tracerServerListener, s.logger, serverParams.BatchProcessor, serverParams.Semaphore, serverParams.MeterProvider) if err != nil { return fmt.Errorf("failed to create self-instrumentation server: %w", err) } @@ -680,6 +685,11 @@ func (s 
*Runner) newFinalBatchProcessor( monitoring.Default.Remove("libbeat") libbeatMonitoringRegistry := monitoring.Default.NewRegistry("libbeat") if s.elasticsearchOutputConfig == nil { return s.newLibbeatFinalBatchProcessor(tracer, libbeatMonitoringRegistry) } diff --git a/internal/beater/interceptors/metrics.go b/internal/beater/interceptors/metrics.go index 8addfdddc15..b12c2a0eb6c 100644 --- a/internal/beater/interceptors/metrics.go +++ b/internal/beater/interceptors/metrics.go @@ -21,7 +21,6 @@ import ( "context" "time" - "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/metric" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -29,31 +28,12 @@ import ( "github.com/elastic/apm-server/internal/beater/request" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" ) const ( requestDurationHistogram = "request.duration" ) -var methodUnaryRequestMetrics = make(map[string]map[request.ResultID]*monitoring.Int) - -// RegisterMethodUnaryRequestMetrics registers a UnaryRequestMetrics for the -// given full method name. This can be used when the gRPC service implementation -// is not exensible, such as in the case of the OTLP services. -// -// This function must only be called from package init functions; it is not safe -// for concurrent access. -func RegisterMethodUnaryRequestMetrics(fullMethod string, m map[request.ResultID]*monitoring.Int) { - methodUnaryRequestMetrics[fullMethod] = m -} - -// UnaryRequestMetrics is an interface that gRPC services may implement -// to provide a metrics registry for the Metrics interceptor.
-type UnaryRequestMetrics interface { - RequestMetrics(fullMethod string) map[request.ResultID]*monitoring.Int -} - type metricsInterceptor struct { logger *logp.Logger meter metric.Meter @@ -69,24 +49,24 @@ func (m *metricsInterceptor) Interceptor() grpc.UnaryServerInterceptor { info *grpc.UnaryServerInfo, handler grpc.UnaryHandler, ) (interface{}, error) { - var ints map[request.ResultID]*monitoring.Int - if requestMetrics, ok := info.Server.(UnaryRequestMetrics); ok { - ints = requestMetrics.RequestMetrics(info.FullMethod) - } else { - ints = methodUnaryRequestMetrics[info.FullMethod] - } - if ints == nil { + var legacyMetricsPrefix string + + switch info.FullMethod { + case "/opentelemetry.proto.collector.metrics.v1.MetricsService/Export": + legacyMetricsPrefix = "apm-server.otlp.grpc.metrics." + case "/opentelemetry.proto.collector.trace.v1.TraceService/Export": + legacyMetricsPrefix = "apm-server.otlp.grpc.traces." + case "/opentelemetry.proto.collector.logs.v1.LogsService/Export": + legacyMetricsPrefix = "apm-server.otlp.grpc.logs." 
+ default: m.logger.With( "grpc.request.method", info.FullMethod, ).Warn("metrics registry missing") return handler(ctx, req) } - m.getCounter(string(request.IDRequestCount)).Add(ctx, 1) - defer m.getCounter(string(request.IDResponseCount)).Add(ctx, 1) - - ints[request.IDRequestCount].Inc() - defer ints[request.IDResponseCount].Inc() + m.inc(ctx, legacyMetricsPrefix, request.IDRequestCount) + defer m.inc(ctx, legacyMetricsPrefix, request.IDResponseCount) start := time.Now() resp, err := handler(ctx, req) @@ -99,31 +79,29 @@ func (m *metricsInterceptor) Interceptor() grpc.UnaryServerInterceptor { if s, ok := status.FromError(err); ok { switch s.Code() { case codes.Unauthenticated: - m.getCounter(string(request.IDResponseErrorsUnauthorized)).Add(ctx, 1) - ints[request.IDResponseErrorsUnauthorized].Inc() + m.inc(ctx, legacyMetricsPrefix, request.IDResponseErrorsUnauthorized) case codes.DeadlineExceeded, codes.Canceled: - m.getCounter(string(request.IDResponseErrorsTimeout)).Add(ctx, 1) - ints[request.IDResponseErrorsTimeout].Inc() + m.inc(ctx, legacyMetricsPrefix, request.IDResponseErrorsTimeout) case codes.ResourceExhausted: - m.getCounter(string(request.IDResponseErrorsRateLimit)).Add(ctx, 1) - ints[request.IDResponseErrorsRateLimit].Inc() + m.inc(ctx, legacyMetricsPrefix, request.IDResponseErrorsRateLimit) } } } - - m.getCounter(string(responseID)).Add(ctx, 1) - ints[responseID].Inc() - + m.inc(ctx, legacyMetricsPrefix, responseID) return resp, err } } -func (m *metricsInterceptor) getCounter(n string) metric.Int64Counter { - name := "grpc.server." 
+ n +func (m *metricsInterceptor) inc(ctx context.Context, legacyMetricsPrefix string, id request.ResultID) { + m.getCounter("grpc.server.", string(id)).Add(ctx, 1) + m.getCounter(legacyMetricsPrefix, string(id)).Add(ctx, 1) +} + +func (m *metricsInterceptor) getCounter(prefix, n string) metric.Int64Counter { + name := prefix + n if met, ok := m.counters[name]; ok { return met } - nm, _ := m.meter.Int64Counter(name) m.counters[name] = nm return nm @@ -151,13 +129,9 @@ func (m *metricsInterceptor) getHistogram(n string, opts ...metric.Int64Histogra // if neither of these are available, a warning will be logged and no metrics // will be gathered. func Metrics(logger *logp.Logger, mp metric.MeterProvider) grpc.UnaryServerInterceptor { - if mp == nil { - mp = otel.GetMeterProvider() - } - i := &metricsInterceptor{ logger: logger, - meter: mp.Meter("internal/beater/interceptors"), + meter: mp.Meter("github.com/elastic/apm-server/internal/beater/interceptors"), counters: map[string]metric.Int64Counter{}, histograms: map[string]metric.Int64Histogram{}, diff --git a/internal/beater/interceptors/metrics_test.go b/internal/beater/interceptors/metrics_test.go index 211d14fd7b0..bec190a50b2 100644 --- a/internal/beater/interceptors/metrics_test.go +++ b/internal/beater/interceptors/metrics_test.go @@ -22,7 +22,6 @@ import ( "errors" "testing" - "github.com/stretchr/testify/assert" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/metric/metricdata" "google.golang.org/grpc" @@ -32,210 +31,148 @@ import ( "github.com/elastic/apm-server/internal/beater/monitoringtest" "github.com/elastic/apm-server/internal/beater/request" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" -) - -var monitoringKeys = append( - request.DefaultResultIDs, - request.IDResponseErrorsRateLimit, - request.IDResponseErrorsTimeout, - request.IDResponseErrorsUnauthorized, ) func TestMetrics(t *testing.T) { - reader := 
sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( - func(ik sdkmetric.InstrumentKind) metricdata.Temporality { - return metricdata.DeltaTemporality - }, - )) - mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) - - registry := monitoring.NewRegistry() - - monitoringMap := request.MonitoringMapForRegistry(registry, monitoringKeys) - methodName := "test_method_name" - logger := logp.NewLogger("interceptor.metrics.test") - - interceptor := Metrics(logger, mp) - - ctx := context.Background() - info := &grpc.UnaryServerInfo{ - FullMethod: methodName, - Server: requestMetricsFunc(func(fullMethod string) map[request.ResultID]*monitoring.Int { - assert.Equal(t, methodName, fullMethod) - return monitoringMap - }), - } - - for _, tc := range []struct { - name string - f func(ctx context.Context, req interface{}) (interface{}, error) - monitoringInt map[request.ResultID]int64 - expectedOtel map[string]interface{} + for _, metrics := range []struct { + methodName string + prefix string }{ { - name: "with an error", - f: func(ctx context.Context, req interface{}) (interface{}, error) { - return nil, errors.New("error") - }, - monitoringInt: map[request.ResultID]int64{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 0, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsRateLimit: 0, - request.IDResponseErrorsTimeout: 0, - request.IDResponseErrorsUnauthorized: 0, - }, - expectedOtel: map[string]interface{}{ - "grpc.server." + string(request.IDRequestCount): 1, - "grpc.server." + string(request.IDResponseCount): 1, - "grpc.server." 
+ string(request.IDResponseErrorsCount): 1, - - "grpc.server.request.duration": 1, - }, + methodName: "/opentelemetry.proto.collector.metrics.v1.MetricsService/Export", + prefix: "apm-server.otlp.grpc.metrics.", }, { - name: "with an unauthenticated error", - f: func(ctx context.Context, req interface{}) (interface{}, error) { - return nil, status.Error(codes.Unauthenticated, "error") - }, - monitoringInt: map[request.ResultID]int64{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 0, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsInternal: 0, - request.IDResponseErrorsRateLimit: 0, - request.IDResponseErrorsTimeout: 0, - request.IDResponseErrorsUnauthorized: 1, - }, - expectedOtel: map[string]interface{}{ - "grpc.server." + string(request.IDRequestCount): 1, - "grpc.server." + string(request.IDResponseCount): 1, - "grpc.server." + string(request.IDResponseErrorsCount): 1, - "grpc.server." + string(request.IDResponseErrorsUnauthorized): 1, - - "grpc.server.request.duration": 1, - }, + methodName: "/opentelemetry.proto.collector.trace.v1.TraceService/Export", + prefix: "apm-server.otlp.grpc.traces.", }, { - name: "with a deadline exceeded error", - f: func(ctx context.Context, req interface{}) (interface{}, error) { - return nil, status.Error(codes.DeadlineExceeded, "request timed out") - }, - monitoringInt: map[request.ResultID]int64{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 0, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsInternal: 0, - request.IDResponseErrorsRateLimit: 0, - request.IDResponseErrorsTimeout: 1, - request.IDResponseErrorsUnauthorized: 0, - }, - expectedOtel: map[string]interface{}{ - "grpc.server." + string(request.IDRequestCount): 1, - "grpc.server." + string(request.IDResponseCount): 1, - "grpc.server." + string(request.IDResponseErrorsCount): 1, - "grpc.server." 
+ string(request.IDResponseErrorsTimeout): 1, - - "grpc.server.request.duration": 1, - }, + methodName: "/opentelemetry.proto.collector.logs.v1.LogsService/Export", + prefix: "apm-server.otlp.grpc.logs.", }, - { - name: "with a canceled error", - f: func(ctx context.Context, req interface{}) (interface{}, error) { - return nil, status.Error(codes.Canceled, "request timed out") - }, - monitoringInt: map[request.ResultID]int64{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 0, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsInternal: 0, - request.IDResponseErrorsRateLimit: 0, - request.IDResponseErrorsTimeout: 1, - request.IDResponseErrorsUnauthorized: 0, + } { + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality }, - expectedOtel: map[string]interface{}{ - "grpc.server." + string(request.IDRequestCount): 1, - "grpc.server." + string(request.IDResponseCount): 1, - "grpc.server." + string(request.IDResponseErrorsCount): 1, - "grpc.server." + string(request.IDResponseErrorsTimeout): 1, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) - "grpc.server.request.duration": 1, - }, - }, - { - name: "with a resource exhausted error", - f: func(ctx context.Context, req interface{}) (interface{}, error) { - return nil, status.Error(codes.ResourceExhausted, "rate limit exceeded") - }, - monitoringInt: map[request.ResultID]int64{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 0, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsInternal: 0, - request.IDResponseErrorsRateLimit: 1, - request.IDResponseErrorsTimeout: 0, - request.IDResponseErrorsUnauthorized: 0, - }, - expectedOtel: map[string]interface{}{ - "grpc.server." + string(request.IDRequestCount): 1, - "grpc.server." + string(request.IDResponseCount): 1, - "grpc.server." 
+ string(request.IDResponseErrorsCount): 1, - "grpc.server." + string(request.IDResponseErrorsRateLimit): 1, + logger := logp.NewLogger("interceptor.metrics.test") - "grpc.server.request.duration": 1, - }, - }, - { - name: "with a success", - f: func(ctx context.Context, req interface{}) (interface{}, error) { - return nil, nil - }, - monitoringInt: map[request.ResultID]int64{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 1, - request.IDResponseErrorsCount: 0, - request.IDResponseErrorsInternal: 0, - request.IDResponseErrorsRateLimit: 0, - request.IDResponseErrorsTimeout: 0, - request.IDResponseErrorsUnauthorized: 0, - }, - expectedOtel: map[string]interface{}{ - "grpc.server." + string(request.IDRequestCount): 1, - "grpc.server." + string(request.IDResponseCount): 1, - "grpc.server." + string(request.IDResponseValidCount): 1, + interceptor := Metrics(logger, mp) - "grpc.server.request.duration": 1, - }, - }, - } { - t.Run(tc.name, func(t *testing.T) { - interceptor(ctx, nil, info, tc.f) - assertMonitoring(t, tc.monitoringInt, monitoringMap) - monitoringtest.ClearRegistry(monitoringMap) - monitoringtest.ExpectOtelMetrics(t, reader, tc.expectedOtel) - }) - } -} + ctx := context.Background() + info := &grpc.UnaryServerInfo{ + FullMethod: metrics.methodName, + } -func assertMonitoring(t *testing.T, expected map[request.ResultID]int64, actual map[request.ResultID]*monitoring.Int) { - for _, k := range monitoringKeys { - if val, ok := expected[k]; ok { - assert.Equalf(t, val, actual[k].Get(), "%s mismatch", k) - } else { - assert.Zerof(t, actual[k].Get(), "%s mismatch", k) + for _, tc := range []struct { + name string + f func(ctx context.Context, req interface{}) (interface{}, error) + monitoringInt map[request.ResultID]int64 + expectedOtel map[string]interface{} + }{ + { + name: "with an error", + f: func(ctx context.Context, req interface{}) (interface{}, error) { + return nil, errors.New("error") + }, + expectedOtel: 
map[string]interface{}{ + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseErrorsCount): 1, + + "request.duration": 1, + }, + }, + { + name: "with an unauthenticated error", + f: func(ctx context.Context, req interface{}) (interface{}, error) { + return nil, status.Error(codes.Unauthenticated, "error") + }, + expectedOtel: map[string]interface{}{ + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseErrorsCount): 1, + string(request.IDResponseErrorsUnauthorized): 1, + + "request.duration": 1, + }, + }, + { + name: "with a deadline exceeded error", + f: func(ctx context.Context, req interface{}) (interface{}, error) { + return nil, status.Error(codes.DeadlineExceeded, "request timed out") + }, + expectedOtel: map[string]interface{}{ + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseErrorsCount): 1, + string(request.IDResponseErrorsTimeout): 1, + + "request.duration": 1, + }, + }, + { + name: "with a canceled error", + f: func(ctx context.Context, req interface{}) (interface{}, error) { + return nil, status.Error(codes.Canceled, "request timed out") + }, + expectedOtel: map[string]interface{}{ + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseErrorsCount): 1, + string(request.IDResponseErrorsTimeout): 1, + + "request.duration": 1, + }, + }, + { + name: "with a resource exhausted error", + f: func(ctx context.Context, req interface{}) (interface{}, error) { + return nil, status.Error(codes.ResourceExhausted, "rate limit exceeded") + }, + expectedOtel: map[string]interface{}{ + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseErrorsCount): 1, + string(request.IDResponseErrorsRateLimit): 1, + + "request.duration": 1, + }, + }, + { + name: "with a success", + f: func(ctx context.Context, req interface{}) (interface{}, 
error) { + return nil, nil + }, + expectedOtel: map[string]interface{}{ + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseValidCount): 1, + + "request.duration": 1, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + interceptor(ctx, nil, info, tc.f) + + expectedMetrics := make(map[string]any, 2*len(tc.expectedOtel)) + + for k, v := range tc.expectedOtel { + // add otel metrics + expectedMetrics["grpc.server."+k] = v + + if k != "request.duration" { + // add legacy metrics + expectedMetrics[metrics.prefix+k] = v + } + } + + monitoringtest.ExpectOtelMetrics(t, reader, expectedMetrics) + }) } } } - -type requestMetricsFunc func(fullMethod string) map[request.ResultID]*monitoring.Int - -func (f requestMetricsFunc) RequestMetrics(fullMethod string) map[request.ResultID]*monitoring.Int { - return f(fullMethod) -} diff --git a/internal/beater/middleware/monitoring_middleware.go b/internal/beater/middleware/monitoring_middleware.go index e34b4b52ebc..9dd06eec4d4 100644 --- a/internal/beater/middleware/monitoring_middleware.go +++ b/internal/beater/middleware/monitoring_middleware.go @@ -23,11 +23,9 @@ import ( "sync" "time" - "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/metric" "github.com/elastic/apm-server/internal/beater/request" - "github.com/elastic/elastic-agent-libs/monitoring" ) const ( @@ -35,9 +33,9 @@ const ( ) type monitoringMiddleware struct { - meter metric.Meter + meter metric.Meter + legacyMetricsPrefix string - ints map[request.ResultID]*monitoring.Int counters sync.Map histograms sync.Map } @@ -47,43 +45,35 @@ func (m *monitoringMiddleware) Middleware() Middleware { return func(c *request.Context) { ctx := context.Background() - m.getCounter(string(request.IDRequestCount)).Add(ctx, 1) - m.inc(request.IDRequestCount) + m.inc(ctx, request.IDRequestCount) start := time.Now() h(c) duration := time.Since(start) m.getHistogram(requestDurationHistogram, metric.WithUnit("ms")).Record(ctx, 
duration.Milliseconds()) - m.getCounter(string(request.IDResponseCount)).Add(ctx, 1) - m.inc(request.IDResponseCount) + m.inc(ctx, request.IDResponseCount) if c.Result.StatusCode >= http.StatusBadRequest { - m.getCounter(string(request.IDResponseErrorsCount)).Add(ctx, 1) - m.inc(request.IDResponseErrorsCount) + m.inc(ctx, request.IDResponseErrorsCount) } else { - m.getCounter(string(request.IDResponseValidCount)).Add(ctx, 1) - m.inc(request.IDResponseValidCount) + m.inc(ctx, request.IDResponseValidCount) } - - m.getCounter(string(c.Result.ID)).Add(ctx, 1) - m.inc(c.Result.ID) + m.inc(ctx, c.Result.ID) }, nil } } -func (m *monitoringMiddleware) inc(id request.ResultID) { - if counter, ok := m.ints[id]; ok { - counter.Inc() - } +func (m *monitoringMiddleware) inc(ctx context.Context, id request.ResultID) { + m.getCounter("http.server.", string(id)).Add(ctx, 1) + m.getCounter(m.legacyMetricsPrefix, string(id)).Add(ctx, 1) } -func (m *monitoringMiddleware) getCounter(n string) metric.Int64Counter { - name := "http.server." + n +func (m *monitoringMiddleware) getCounter(prefix, name string) metric.Int64Counter { + name = prefix + name if met, ok := m.counters.Load(name); ok { return met.(metric.Int64Counter) } - nm, _ := m.meter.Int64Counter(name) m.counters.LoadOrStore(name, nm) return nm @@ -102,16 +92,12 @@ func (m *monitoringMiddleware) getHistogram(n string, opts ...metric.Int64Histog // MonitoringMiddleware returns a middleware that increases monitoring counters for collecting metrics // about request processing. As input parameter it takes a map capable of mapping a request.ResultID to a counter. 
-func MonitoringMiddleware(m map[request.ResultID]*monitoring.Int, mp metric.MeterProvider) Middleware { - if mp == nil { - mp = otel.GetMeterProvider() - } - +func MonitoringMiddleware(legacyMetricsPrefix string, mp metric.MeterProvider) Middleware { mid := &monitoringMiddleware{ - meter: mp.Meter("internal/beater/middleware"), - ints: m, - counters: sync.Map{}, - histograms: sync.Map{}, + meter: mp.Meter("github.com/elastic/apm-server/internal/beater/middleware"), + legacyMetricsPrefix: legacyMetricsPrefix, + counters: sync.Map{}, + histograms: sync.Map{}, } return mid.Middleware() diff --git a/internal/beater/middleware/monitoring_middleware_test.go b/internal/beater/middleware/monitoring_middleware_test.go index 7e6e741cecc..9c410e9eb5b 100644 --- a/internal/beater/middleware/monitoring_middleware_test.go +++ b/internal/beater/middleware/monitoring_middleware_test.go @@ -21,28 +21,17 @@ import ( "sync" "testing" - "github.com/stretchr/testify/assert" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/metric/metricdata" - "github.com/elastic/elastic-agent-libs/monitoring" - "github.com/elastic/apm-server/internal/beater/monitoringtest" "github.com/elastic/apm-server/internal/beater/request" ) -var ( - mockMonitoringRegistry = monitoring.Default.NewRegistry("mock.monitoring") - mockMonitoringNil = map[request.ResultID]*monitoring.Int{} - mockMonitoring = request.DefaultMonitoringMapForRegistry(mockMonitoringRegistry) -) - func TestMonitoringHandler(t *testing.T) { checkMonitoring := func(t *testing.T, h func(*request.Context), - expected map[request.ResultID]int, expectedOtel map[string]interface{}, - m map[request.ResultID]*monitoring.Int, ) { reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( func(ik sdkmetric.InstrumentKind) metricdata.Temporality { @@ -51,11 +40,8 @@ func TestMonitoringHandler(t *testing.T) { )) mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) - monitoringtest.ClearRegistry(m) c, _ 
:= DefaultContextWithResponseRecorder() - Apply(MonitoringMiddleware(m, mp), h)(c) - equal, result := monitoringtest.CompareMonitoringInt(expected, m) - assert.True(t, equal, result) + Apply(MonitoringMiddleware("", mp), h)(c) monitoringtest.ExpectOtelMetrics(t, reader, expectedOtel) } @@ -63,99 +49,90 @@ func TestMonitoringHandler(t *testing.T) { t.Run("Error", func(t *testing.T) { checkMonitoring(t, Handler403, - map[request.ResultID]int{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsForbidden: 1, - }, map[string]interface{}{ "http.server." + string(request.IDRequestCount): 1, "http.server." + string(request.IDResponseCount): 1, "http.server." + string(request.IDResponseErrorsCount): 1, "http.server." + string(request.IDResponseErrorsForbidden): 1, + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseErrorsCount): 1, + string(request.IDResponseErrorsForbidden): 1, "http.server.request.duration": 1, }, - mockMonitoring, ) }) t.Run("Accepted", func(t *testing.T) { checkMonitoring(t, Handler202, - map[request.ResultID]int{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 1, - request.IDResponseValidAccepted: 1, - }, map[string]interface{}{ "http.server." + string(request.IDRequestCount): 1, "http.server." + string(request.IDResponseCount): 1, "http.server." + string(request.IDResponseValidCount): 1, "http.server." 
+ string(request.IDResponseValidAccepted): 1, + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseValidCount): 1, + string(request.IDResponseValidAccepted): 1, "http.server.request.duration": 1, }, - mockMonitoring, ) }) t.Run("Idle", func(t *testing.T) { checkMonitoring(t, HandlerIdle, - map[request.ResultID]int{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseValidCount: 1, - request.IDUnset: 1, - }, map[string]interface{}{ "http.server." + string(request.IDRequestCount): 1, "http.server." + string(request.IDResponseCount): 1, "http.server." + string(request.IDResponseValidCount): 1, "http.server." + string(request.IDUnset): 1, + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseValidCount): 1, + string(request.IDUnset): 1, "http.server.request.duration": 1, }, - mockMonitoring, ) }) t.Run("Panic", func(t *testing.T) { checkMonitoring(t, Apply(RecoverPanicMiddleware(), HandlerPanic), - map[request.ResultID]int{ - request.IDRequestCount: 1, - request.IDResponseCount: 1, - request.IDResponseErrorsCount: 1, - request.IDResponseErrorsInternal: 1, - }, map[string]interface{}{ "http.server." + string(request.IDRequestCount): 1, "http.server." + string(request.IDResponseCount): 1, "http.server." + string(request.IDResponseErrorsCount): 1, "http.server." + string(request.IDResponseErrorsInternal): 1, + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseErrorsCount): 1, + string(request.IDResponseErrorsInternal): 1, "http.server.request.duration": 1, }, - mockMonitoring) + ) }) t.Run("Nil", func(t *testing.T) { checkMonitoring(t, HandlerIdle, - map[request.ResultID]int{}, map[string]interface{}{ "http.server." + string(request.IDRequestCount): 1, "http.server." + string(request.IDResponseCount): 1, "http.server." + string(request.IDResponseValidCount): 1, "http.server." 
+ string(request.IDUnset): 1, + string(request.IDRequestCount): 1, + string(request.IDResponseCount): 1, + string(request.IDResponseValidCount): 1, + string(request.IDUnset): 1, "http.server.request.duration": 1, }, - mockMonitoringNil, ) }) @@ -167,7 +144,7 @@ func TestMonitoringHandler(t *testing.T) { }, )) mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) - m := MonitoringMiddleware(mockMonitoringNil, mp) + m := MonitoringMiddleware("", mp) c, _ := DefaultContextWithResponseRecorder() var wg sync.WaitGroup for range i { @@ -183,6 +160,10 @@ func TestMonitoringHandler(t *testing.T) { "http.server." + string(request.IDResponseCount): i, "http.server." + string(request.IDResponseValidCount): i, "http.server." + string(request.IDUnset): i, + string(request.IDRequestCount): i, + string(request.IDResponseCount): i, + string(request.IDResponseValidCount): i, + string(request.IDUnset): i, "http.server.request.duration": i, }) diff --git a/internal/beater/monitoringtest/monitoring.go b/internal/beater/monitoringtest/monitoring.go deleted file mode 100644 index b6ae3fb1517..00000000000 --- a/internal/beater/monitoringtest/monitoring.go +++ /dev/null @@ -1,79 +0,0 @@ -// Licensed to Elasticsearch B.V. under one or more contributor -// license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright -// ownership. Elasticsearch B.V. licenses this file to you under -// the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package monitoringtest - -import ( - "fmt" - - "github.com/elastic/elastic-agent-libs/monitoring" - - "github.com/elastic/apm-server/internal/beater/request" -) - -// TODO(axw) consider moving these to the middleware package, -// and removing their use in other packages. We should only -// need to check specific values in one place, and elsewhere -// we might want to check that the middleware is installed by -// checking that the middleware has done _something_. - -// CompareMonitoringInt matches expected with real monitoring counters and -// returns false and an a string showind diffs if not matching. -// -// The caller is expected to call ClearRegistry before invoking some code -// path that should update monitoring counters. -func CompareMonitoringInt( - expected map[request.ResultID]int, - m map[request.ResultID]*monitoring.Int, -) (bool, string) { - var result string - for _, id := range allRequestResultIDs() { - monitoringIntVal := int64(0) - monitoringInt := m[id] - if monitoringInt != nil { - monitoringIntVal = monitoringInt.Get() - } - expectedVal := int64(0) - if val, included := expected[id]; included { - expectedVal = int64(val) - } - if expectedVal != monitoringIntVal { - result += fmt.Sprintf("[%s] Expected: %d, Received: %d", id, expectedVal, monitoringIntVal) - } - } - return len(result) == 0, result -} - -// allRequestResultIDs returns all registered request.ResultIDs (needs to be manually maintained) -func allRequestResultIDs() []request.ResultID { - var ids []request.ResultID - ids = append(ids, request.DefaultResultIDs...) 
- for k := range request.MapResultIDToStatus { - ids = append(ids, k) - } - return ids -} - -// ClearRegistry sets all counters to 0 and removes all registered counters from the registry -// Only use this in test environments -func ClearRegistry(m map[request.ResultID]*monitoring.Int) { - for _, i := range m { - if i != nil { - i.Set(0) - } - } -} diff --git a/internal/beater/monitoringtest/opentelemetry.go b/internal/beater/monitoringtest/opentelemetry.go index 54d355a78a6..4e6ff2eabf9 100644 --- a/internal/beater/monitoringtest/opentelemetry.go +++ b/internal/beater/monitoringtest/opentelemetry.go @@ -27,6 +27,14 @@ import ( ) func ExpectOtelMetrics(t *testing.T, reader sdkmetric.Reader, expectedMetrics map[string]interface{}) { + assertOtelMetrics(t, reader, expectedMetrics, true) +} + +func ExpectContainOtelMetrics(t *testing.T, reader sdkmetric.Reader, expectedMetrics map[string]interface{}) { + assertOtelMetrics(t, reader, expectedMetrics, false) +} + +func assertOtelMetrics(t *testing.T, reader sdkmetric.Reader, expectedMetrics map[string]interface{}, match bool) { t.Helper() var rm metricdata.ResourceMetrics @@ -38,6 +46,19 @@ func ExpectOtelMetrics(t *testing.T, reader sdkmetric.Reader, expectedMetrics ma for _, m := range sm.Metrics { switch d := m.Data.(type) { + case metricdata.Gauge[int64]: + assert.Equal(t, 1, len(d.DataPoints)) + foundMetrics = append(foundMetrics, m.Name) + + if v, ok := expectedMetrics[m.Name]; ok { + if dp, ok := v.(int); ok { + assert.Equal(t, int64(dp), d.DataPoints[0].Value, m.Name) + } else { + assert.Fail(t, "expected an int value", m.Name) + } + } else if match { + assert.Fail(t, "unexpected metric", m.Name) + } case metricdata.Sum[int64]: assert.Equal(t, 1, len(d.DataPoints)) foundMetrics = append(foundMetrics, m.Name) @@ -48,7 +69,7 @@ func ExpectOtelMetrics(t *testing.T, reader sdkmetric.Reader, expectedMetrics ma } else { assert.Fail(t, "expected an int value", m.Name) } - } else { + } else if match { assert.Fail(t, 
"unexpected metric", m.Name) } case metricdata.Histogram[int64]: @@ -61,7 +82,7 @@ func ExpectOtelMetrics(t *testing.T, reader sdkmetric.Reader, expectedMetrics ma } else { assert.Fail(t, "expected an int value", m.Name) } - } else { + } else if match { assert.Fail(t, "unexpected metric", m.Name) } } @@ -72,5 +93,9 @@ func ExpectOtelMetrics(t *testing.T, reader sdkmetric.Reader, expectedMetrics ma for k := range expectedMetrics { expectedMetricsKeys = append(expectedMetricsKeys, k) } - assert.ElementsMatch(t, expectedMetricsKeys, foundMetrics) + if match { + assert.ElementsMatch(t, expectedMetricsKeys, foundMetrics) + } else { + assert.Subset(t, foundMetrics, expectedMetricsKeys) + } } diff --git a/internal/beater/otlp/common.go b/internal/beater/otlp/common.go deleted file mode 100644 index aab2d588cf9..00000000000 --- a/internal/beater/otlp/common.go +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to Elasticsearch B.V. under one or more contributor -// license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright -// ownership. Elasticsearch B.V. licenses this file to you under -// the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package otlp - -import ( - "sync" - - "github.com/elastic/apm-data/input/otlp" - "github.com/elastic/apm-server/internal/beater/request" - "github.com/elastic/elastic-agent-libs/monitoring" -) - -var ( - monitoringKeys = append(request.DefaultResultIDs, - request.IDResponseErrorsRateLimit, - request.IDResponseErrorsTimeout, - request.IDResponseErrorsUnauthorized, - ) -) - -type monitoredConsumer struct { - mu sync.RWMutex - consumer *otlp.Consumer -} - -func (m *monitoredConsumer) set(c *otlp.Consumer) { - m.mu.Lock() - defer m.mu.Unlock() - m.consumer = c -} - -func (m *monitoredConsumer) collect(mode monitoring.Mode, V monitoring.Visitor) { - V.OnRegistryStart() - defer V.OnRegistryFinished() - - m.mu.RLock() - c := m.consumer - m.mu.RUnlock() - if c == nil { - return - } - - stats := c.Stats() - monitoring.ReportInt(V, "unsupported_dropped", stats.UnsupportedMetricsDropped) -} diff --git a/internal/beater/otlp/grpc.go b/internal/beater/otlp/grpc.go index 2684e1bd398..512c15bfee6 100644 --- a/internal/beater/otlp/grpc.go +++ b/internal/beater/otlp/grpc.go @@ -23,45 +23,16 @@ import ( "go.opentelemetry.io/collector/pdata/plog/plogotlp" "go.opentelemetry.io/collector/pdata/pmetric/pmetricotlp" "go.opentelemetry.io/collector/pdata/ptrace/ptraceotlp" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "google.golang.org/grpc" - "github.com/elastic/elastic-agent-libs/monitoring" - "github.com/elastic/apm-data/input" "github.com/elastic/apm-data/input/otlp" "github.com/elastic/apm-data/model/modelpb" - "github.com/elastic/apm-server/internal/beater/interceptors" - "github.com/elastic/apm-server/internal/beater/request" -) - -var ( - gRPCMetricsRegistry = monitoring.Default.NewRegistry("apm-server.otlp.grpc.metrics") - gRPCMetricsMonitoringMap = request.MonitoringMapForRegistry(gRPCMetricsRegistry, monitoringKeys) - gRPCTracesRegistry = monitoring.Default.NewRegistry("apm-server.otlp.grpc.traces") - gRPCTracesMonitoringMap = 
request.MonitoringMapForRegistry(gRPCTracesRegistry, monitoringKeys) - gRPCLogsRegistry = monitoring.Default.NewRegistry("apm-server.otlp.grpc.logs") - gRPCLogsMonitoringMap = request.MonitoringMapForRegistry(gRPCLogsRegistry, monitoringKeys) - - gRPCMonitoredConsumer monitoredConsumer ) -func init() { - monitoring.NewFunc(gRPCMetricsRegistry, "consumer", gRPCMonitoredConsumer.collect, monitoring.Report) - - interceptors.RegisterMethodUnaryRequestMetrics( - "/opentelemetry.proto.collector.metrics.v1.MetricsService/Export", - gRPCMetricsMonitoringMap, - ) - interceptors.RegisterMethodUnaryRequestMetrics( - "/opentelemetry.proto.collector.trace.v1.TraceService/Export", - gRPCTracesMonitoringMap, - ) - interceptors.RegisterMethodUnaryRequestMetrics( - "/opentelemetry.proto.collector.logs.v1.LogsService/Export", - gRPCLogsMonitoringMap, - ) -} +var unsupportedGRPCMetricRegistration metric.Registration // RegisterGRPCServices registers OTLP consumer services with the given gRPC server. func RegisterGRPCServices( @@ -69,6 +40,7 @@ func RegisterGRPCServices( logger *zap.Logger, processor modelpb.BatchProcessor, semaphore input.Semaphore, + mp metric.MeterProvider, ) { // TODO(axw) stop assuming we have only one OTLP gRPC service running // at any time, and instead aggregate metrics from consumers that are @@ -79,7 +51,24 @@ func RegisterGRPCServices( Semaphore: semaphore, RemapOTelMetrics: true, }) - gRPCMonitoredConsumer.set(consumer) + + meter := mp.Meter("github.com/elastic/apm-server/internal/beater/otlp") + grpcMetricsConsumerUnsupportedDropped, _ := meter.Int64ObservableCounter( + "apm-server.otlp.grpc.metrics.consumer.unsupported_dropped", + ) + + // TODO we should add an otel counter metric directly in the + // apm-data consumer, then we could get rid of the callback. 
+ if unsupportedGRPCMetricRegistration != nil { + unsupportedGRPCMetricRegistration.Unregister() + } + unsupportedGRPCMetricRegistration, _ = meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + stats := consumer.Stats() + if stats.UnsupportedMetricsDropped > 0 { + o.ObserveInt64(grpcMetricsConsumerUnsupportedDropped, stats.UnsupportedMetricsDropped) + } + return nil + }, grpcMetricsConsumerUnsupportedDropped) ptraceotlp.RegisterGRPCServer(grpcServer, &tracesService{consumer: consumer}) pmetricotlp.RegisterGRPCServer(grpcServer, &metricsService{consumer: consumer}) diff --git a/internal/beater/otlp/grpc_test.go b/internal/beater/otlp/grpc_test.go index 5d4e70ad871..2182c035876 100644 --- a/internal/beater/otlp/grpc_test.go +++ b/internal/beater/otlp/grpc_test.go @@ -31,6 +31,9 @@ import ( "go.opentelemetry.io/collector/pdata/pmetric/pmetricotlp" "go.opentelemetry.io/collector/pdata/ptrace" "go.opentelemetry.io/collector/pdata/ptrace/ptraceotlp" + "go.opentelemetry.io/otel/metric" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" "go.uber.org/zap" "golang.org/x/sync/semaphore" "google.golang.org/grpc" @@ -39,9 +42,9 @@ import ( "github.com/elastic/apm-data/model/modelpb" "github.com/elastic/apm-server/internal/beater/interceptors" + "github.com/elastic/apm-server/internal/beater/monitoringtest" "github.com/elastic/apm-server/internal/beater/otlp" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" ) func TestConsumeTracesGRPC(t *testing.T) { @@ -52,7 +55,14 @@ func TestConsumeTracesGRPC(t *testing.T) { return reportError } - conn := newGRPCServer(t, batchProcessor) + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + + conn := newGRPCServer(t, 
batchProcessor, mp) client := ptraceotlp.NewGRPCClient(conn) // Send a minimal trace to verify that everything is connected properly. @@ -77,19 +87,12 @@ func TestConsumeTracesGRPC(t *testing.T) { assert.Len(t, batches[0], 1) assert.Len(t, batches[1], 1) - actual := map[string]interface{}{} - monitoring.GetRegistry("apm-server.otlp.grpc.traces").Do(monitoring.Full, func(key string, value interface{}) { - actual[key] = value + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + "apm-server.otlp.grpc.traces.request.count": 2, + "apm-server.otlp.grpc.traces.response.valid.count": 1, + "apm-server.otlp.grpc.traces.response.count": 2, + "apm-server.otlp.grpc.traces.response.errors.count": 1, }) - assert.Equal(t, map[string]interface{}{ - "request.count": int64(2), - "response.count": int64(2), - "response.errors.count": int64(1), - "response.valid.count": int64(1), - "response.errors.ratelimit": int64(0), - "response.errors.timeout": int64(0), - "response.errors.unauthorized": int64(0), - }, actual) } func TestConsumeMetricsGRPC(t *testing.T) { @@ -98,7 +101,14 @@ func TestConsumeMetricsGRPC(t *testing.T) { return reportError } - conn := newGRPCServer(t, batchProcessor) + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + + conn := newGRPCServer(t, batchProcessor, mp) client := pmetricotlp.NewGRPCClient(conn) // Send a minimal metric to verify that everything is connected properly. 
@@ -122,21 +132,12 @@ func TestConsumeMetricsGRPC(t *testing.T) { errStatus := status.Convert(err) assert.Equal(t, "failed to publish events", errStatus.Message()) - actual := map[string]interface{}{} - monitoring.GetRegistry("apm-server.otlp.grpc.metrics").Do(monitoring.Full, func(key string, value interface{}) { - actual[key] = value + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + "apm-server.otlp.grpc.metrics.request.count": 2, + "apm-server.otlp.grpc.metrics.response.valid.count": 1, + "apm-server.otlp.grpc.metrics.response.count": 2, + "apm-server.otlp.grpc.metrics.response.errors.count": 1, }) - assert.Equal(t, map[string]interface{}{ - "consumer.unsupported_dropped": int64(0), - - "request.count": int64(2), - "response.count": int64(2), - "response.errors.count": int64(1), - "response.valid.count": int64(1), - "response.errors.ratelimit": int64(0), - "response.errors.timeout": int64(0), - "response.errors.unauthorized": int64(0), - }, actual) } func TestConsumeLogsGRPC(t *testing.T) { @@ -147,7 +148,14 @@ func TestConsumeLogsGRPC(t *testing.T) { return reportError } - conn := newGRPCServer(t, batchProcessor) + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + + conn := newGRPCServer(t, batchProcessor, mp) client := plogotlp.NewGRPCClient(conn) // Send a minimal log record to verify that everything is connected properly. 
@@ -171,28 +179,21 @@ func TestConsumeLogsGRPC(t *testing.T) { assert.Len(t, batches[0], 1) assert.Len(t, batches[1], 1) - actual := map[string]interface{}{} - monitoring.GetRegistry("apm-server.otlp.grpc.logs").Do(monitoring.Full, func(key string, value interface{}) { - actual[key] = value + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + "apm-server.otlp.grpc.logs.request.count": 2, + "apm-server.otlp.grpc.logs.response.valid.count": 1, + "apm-server.otlp.grpc.logs.response.count": 2, + "apm-server.otlp.grpc.logs.response.errors.count": 1, }) - assert.Equal(t, map[string]interface{}{ - "request.count": int64(2), - "response.count": int64(2), - "response.errors.count": int64(1), - "response.valid.count": int64(1), - "response.errors.ratelimit": int64(0), - "response.errors.timeout": int64(0), - "response.errors.unauthorized": int64(0), - }, actual) } -func newGRPCServer(t *testing.T, batchProcessor modelpb.BatchProcessor) *grpc.ClientConn { +func newGRPCServer(t *testing.T, batchProcessor modelpb.BatchProcessor, mp metric.MeterProvider) *grpc.ClientConn { lis, err := net.Listen("tcp", "localhost:0") require.NoError(t, err) logger := logp.NewLogger("otlp.grpc.test") - srv := grpc.NewServer(grpc.UnaryInterceptor(interceptors.Metrics(logger, nil))) + srv := grpc.NewServer(grpc.UnaryInterceptor(interceptors.Metrics(logger, mp))) semaphore := semaphore.NewWeighted(1) - otlp.RegisterGRPCServices(srv, zap.NewNop(), batchProcessor, semaphore) + otlp.RegisterGRPCServices(srv, zap.NewNop(), batchProcessor, semaphore, mp) go srv.Serve(lis) t.Cleanup(srv.GracefulStop) diff --git a/internal/beater/otlp/http.go b/internal/beater/otlp/http.go index 5600ea5dd62..597c535448d 100644 --- a/internal/beater/otlp/http.go +++ b/internal/beater/otlp/http.go @@ -18,6 +18,7 @@ package otlp import ( + "context" "fmt" "io" "net/http" @@ -25,6 +26,7 @@ import ( "go.opentelemetry.io/collector/pdata/plog/plogotlp" "go.opentelemetry.io/collector/pdata/pmetric/pmetricotlp" 
"go.opentelemetry.io/collector/pdata/ptrace/ptraceotlp" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -33,26 +35,11 @@ import ( "github.com/elastic/apm-data/input" "github.com/elastic/apm-data/input/otlp" "github.com/elastic/apm-data/model/modelpb" - "github.com/elastic/apm-server/internal/beater/request" - "github.com/elastic/elastic-agent-libs/monitoring" ) -var ( - httpMetricsRegistry = monitoring.Default.NewRegistry("apm-server.otlp.http.metrics") - HTTPMetricsMonitoringMap = request.MonitoringMapForRegistry(httpMetricsRegistry, monitoringKeys) - httpTracesRegistry = monitoring.Default.NewRegistry("apm-server.otlp.http.traces") - HTTPTracesMonitoringMap = request.MonitoringMapForRegistry(httpTracesRegistry, monitoringKeys) - httpLogsRegistry = monitoring.Default.NewRegistry("apm-server.otlp.http.logs") - HTTPLogsMonitoringMap = request.MonitoringMapForRegistry(httpLogsRegistry, monitoringKeys) +var unsupportedHTTPMetricRegistration metric.Registration - httpMonitoredConsumer monitoredConsumer -) - -func init() { - monitoring.NewFunc(httpMetricsRegistry, "consumer", httpMonitoredConsumer.collect, monitoring.Report) -} - -func NewHTTPHandlers(logger *zap.Logger, processor modelpb.BatchProcessor, semaphore input.Semaphore) HTTPHandlers { +func NewHTTPHandlers(logger *zap.Logger, processor modelpb.BatchProcessor, semaphore input.Semaphore, mp metric.MeterProvider) HTTPHandlers { // TODO(axw) stop assuming we have only one OTLP HTTP consumer running // at any time, and instead aggregate metrics from consumers that are // dynamically registered and unregistered. 
@@ -62,7 +49,25 @@ func NewHTTPHandlers(logger *zap.Logger, processor modelpb.BatchProcessor, semap Semaphore: semaphore, RemapOTelMetrics: true, }) - httpMonitoredConsumer.set(consumer) + + meter := mp.Meter("github.com/elastic/apm-server/internal/beater/otlp") + httpMetricsConsumerUnsupportedDropped, _ := meter.Int64ObservableCounter( + "apm-server.otlp.http.metrics.consumer.unsupported_dropped", + ) + + // TODO we should add an otel counter metric directly in the + // apm-data consumer, then we could get rid of the callback. + if unsupportedHTTPMetricRegistration != nil { + unsupportedHTTPMetricRegistration.Unregister() + } + unsupportedHTTPMetricRegistration, _ = meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + stats := consumer.Stats() + if stats.UnsupportedMetricsDropped > 0 { + o.ObserveInt64(httpMetricsConsumerUnsupportedDropped, stats.UnsupportedMetricsDropped) + } + return nil + }, httpMetricsConsumerUnsupportedDropped) + return HTTPHandlers{consumer: consumer} } diff --git a/internal/beater/otlp/http_test.go b/internal/beater/otlp/http_test.go index 93702049759..35895496d02 100644 --- a/internal/beater/otlp/http_test.go +++ b/internal/beater/otlp/http_test.go @@ -33,6 +33,8 @@ import ( "go.opentelemetry.io/collector/pdata/pmetric/pmetricotlp" "go.opentelemetry.io/collector/pdata/ptrace" "go.opentelemetry.io/collector/pdata/ptrace/ptraceotlp" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" "golang.org/x/sync/semaphore" "github.com/elastic/apm-data/model/modelpb" @@ -40,8 +42,8 @@ import ( "github.com/elastic/apm-server/internal/beater/api" "github.com/elastic/apm-server/internal/beater/auth" "github.com/elastic/apm-server/internal/beater/config" + "github.com/elastic/apm-server/internal/beater/monitoringtest" "github.com/elastic/apm-server/internal/beater/ratelimit" - "github.com/elastic/elastic-agent-libs/monitoring" ) func TestConsumeTracesHTTP(t *testing.T) { @@ -52,7 
+54,7 @@ func TestConsumeTracesHTTP(t *testing.T) { return reportError } - addr := newHTTPServer(t, batchProcessor) + addr, reader := newHTTPServer(t, batchProcessor) // Send a minimal trace to verify that everything is connected properly. // @@ -75,19 +77,12 @@ func TestConsumeTracesHTTP(t *testing.T) { require.Len(t, batches, 1) assert.Len(t, batches[0], 1) - actual := map[string]interface{}{} - monitoring.GetRegistry("apm-server.otlp.http.traces").Do(monitoring.Full, func(key string, value interface{}) { - actual[key] = value + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + "http.server.request.count": 1, + "http.server.response.count": 1, + "http.server.response.valid.count": 1, }) - assert.Equal(t, map[string]interface{}{ - "request.count": int64(1), - "response.count": int64(1), - "response.errors.count": int64(0), - "response.valid.count": int64(1), - "response.errors.ratelimit": int64(0), - "response.errors.timeout": int64(0), - "response.errors.unauthorized": int64(0), - }, actual) + } func TestConsumeMetricsHTTP(t *testing.T) { @@ -96,7 +91,7 @@ func TestConsumeMetricsHTTP(t *testing.T) { return reportError } - addr := newHTTPServer(t, batchProcessor) + addr, reader := newHTTPServer(t, batchProcessor) // Send a minimal metric to verify that everything is connected properly. 
// @@ -119,21 +114,11 @@ func TestConsumeMetricsHTTP(t *testing.T) { assert.NoError(t, err) assert.NoError(t, rsp.Body.Close()) - actual := map[string]interface{}{} - monitoring.GetRegistry("apm-server.otlp.http.metrics").Do(monitoring.Full, func(key string, value interface{}) { - actual[key] = value + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + "http.server.request.count": 1, + "http.server.response.count": 1, + "http.server.response.valid.count": 1, }) - assert.Equal(t, map[string]interface{}{ - "consumer.unsupported_dropped": int64(0), - - "request.count": int64(1), - "response.count": int64(1), - "response.errors.count": int64(0), - "response.valid.count": int64(1), - "response.errors.ratelimit": int64(0), - "response.errors.timeout": int64(0), - "response.errors.unauthorized": int64(0), - }, actual) } func TestConsumeLogsHTTP(t *testing.T) { @@ -144,7 +129,7 @@ func TestConsumeLogsHTTP(t *testing.T) { return reportError } - addr := newHTTPServer(t, batchProcessor) + addr, reader := newHTTPServer(t, batchProcessor) // Send a minimal log record to verify that everything is connected properly. 
// @@ -165,24 +150,22 @@ func TestConsumeLogsHTTP(t *testing.T) { assert.NoError(t, rsp.Body.Close()) require.Len(t, batches, 1) - actual := map[string]interface{}{} - monitoring.GetRegistry("apm-server.otlp.http.logs").Do(monitoring.Full, func(key string, value interface{}) { - actual[key] = value + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + "http.server.request.count": 1, + "http.server.response.count": 1, + "http.server.response.valid.count": 1, }) - assert.Equal(t, map[string]interface{}{ - "request.count": int64(1), - "response.count": int64(1), - "response.errors.count": int64(0), - "response.valid.count": int64(1), - "response.errors.ratelimit": int64(0), - "response.errors.timeout": int64(0), - "response.errors.unauthorized": int64(0), - }, actual) } -func newHTTPServer(t *testing.T, batchProcessor modelpb.BatchProcessor) string { +func newHTTPServer(t *testing.T, batchProcessor modelpb.BatchProcessor) (string, sdkmetric.Reader) { lis, err := net.Listen("tcp", "localhost:0") require.NoError(t, err) + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) cfg := &config.Config{} auth, _ := auth.NewAuthenticator(cfg.AgentAuth) ratelimitStore, _ := ratelimit.NewStore(1000, 1000, 1000) @@ -195,6 +178,7 @@ func newHTTPServer(t *testing.T, batchProcessor modelpb.BatchProcessor) string { nil, func() bool { return true }, semaphore.NewWeighted(1), + mp, ) require.NoError(t, err) srv := http.Server{Handler: router} @@ -202,5 +186,5 @@ func newHTTPServer(t *testing.T, batchProcessor modelpb.BatchProcessor) string { require.NoError(t, srv.Close()) }) go srv.Serve(lis) - return lis.Addr().String() + return lis.Addr().String(), reader } diff --git a/internal/beater/request/result.go b/internal/beater/request/result.go index a69117a9cfc..2f03298e2a9 100644 --- 
a/internal/beater/request/result.go +++ b/internal/beater/request/result.go @@ -20,8 +20,6 @@ package request import ( "net/http" - "github.com/elastic/elastic-agent-libs/monitoring" - "github.com/pkg/errors" ) @@ -101,9 +99,6 @@ var ( IDResponseErrorsServiceUnavailable: {Code: http.StatusServiceUnavailable, Keyword: "service unavailable"}, IDResponseErrorsInternal: {Code: http.StatusInternalServerError, Keyword: "internal error"}, } - - // DefaultResultIDs is a list of the default result IDs used by the package. - DefaultResultIDs = []ResultID{IDRequestCount, IDResponseCount, IDResponseErrorsCount, IDResponseValidCount} ) // ResultID unique string identifying a requests Result @@ -125,27 +120,6 @@ type Result struct { Stacktrace string } -// DefaultMonitoringMapForRegistry returns map matching resultIDs to monitoring counters for given registry. -func DefaultMonitoringMapForRegistry(r *monitoring.Registry) map[ResultID]*monitoring.Int { - ids := append(DefaultResultIDs, IDUnset) - for id := range MapResultIDToStatus { - ids = append(ids, id) - } - return MonitoringMapForRegistry(r, ids) -} - -// MonitoringMapForRegistry returns map matching resultIDs to monitoring counters for given registry and keys -func MonitoringMapForRegistry(r *monitoring.Registry, ids []ResultID) map[ResultID]*monitoring.Int { - m := map[ResultID]*monitoring.Int{} - counter := func(s ResultID) *monitoring.Int { - return monitoring.NewInt(r, string(s)) - } - for _, id := range ids { - m[id] = counter(id) - } - return m -} - // Reset sets result to it's empty values func (r *Result) Reset() { r.ID = IDUnset diff --git a/internal/beater/request/result_test.go b/internal/beater/request/result_test.go index 88e05e00189..07de0e29cfc 100644 --- a/internal/beater/request/result_test.go +++ b/internal/beater/request/result_test.go @@ -24,8 +24,6 @@ import ( "github.com/stretchr/testify/assert" - "github.com/elastic/elastic-agent-libs/monitoring" - "github.com/pkg/errors" ) @@ -188,22 +186,3 @@ func 
TestResult_Failure(t *testing.T) { assert.True(t, (&Result{StatusCode: http.StatusBadRequest}).Failure()) assert.True(t, (&Result{StatusCode: http.StatusServiceUnavailable}).Failure()) } - -func TestDefaultMonitoringMapForRegistry(t *testing.T) { - mockRegistry := monitoring.NewRegistry().NewRegistry("mock-default") - m := DefaultMonitoringMapForRegistry(mockRegistry) - assert.Equal(t, 22, len(m)) - for id := range m { - assert.Equal(t, int64(0), m[id].Get()) - } -} - -func TestMonitoringMapForRegistry(t *testing.T) { - keys := []ResultID{IDEventDroppedCount, IDResponseErrorsServiceUnavailable} - mockRegistry := monitoring.NewRegistry().NewRegistry("mock-with-keys") - m := MonitoringMapForRegistry(mockRegistry, keys) - assert.Equal(t, 2, len(m)) - for id := range m { - assert.Equal(t, int64(0), m[id].Get()) - } -} diff --git a/internal/beater/server.go b/internal/beater/server.go index ac4593adeea..8dd59174bb0 100644 --- a/internal/beater/server.go +++ b/internal/beater/server.go @@ -24,13 +24,13 @@ import ( "go.elastic.co/apm/module/apmgorilla/v2" "go.elastic.co/apm/v2" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "golang.org/x/sync/errgroup" "google.golang.org/grpc" "github.com/elastic/beats/v7/libbeat/version" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" "github.com/elastic/apm-data/input" "github.com/elastic/apm-data/model/modelpb" @@ -47,12 +47,15 @@ import ( "github.com/elastic/apm-server/internal/sourcemap" ) +<<<<<<< HEAD var ( agentcfgMonitoringRegistry = monitoring.Default.NewRegistry("apm-server.agentcfg") agentcfgDeprecationNotice = "deprecation notice: support for passing fleet agent configs will be removed in an upcoming version" ) +======= +>>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) // WrapServerFunc is a function for injecting behaviour into ServerParams // and RunServerFunc. // @@ -88,6 +91,9 @@ type ServerParams struct { // for self-instrumentation. 
Tracer *apm.Tracer + // MeterProvider is the MeterProvider + MeterProvider metric.MeterProvider + // Authenticator holds an authenticator that can be used for // authenticating clients, and obtaining authentication details // and an auth.Authorizer for authorizing the client for future @@ -183,6 +189,7 @@ func newServer(args ServerParams, listener net.Listener) (server, error) { args.SourcemapFetcher, publishReady, args.Semaphore, + args.MeterProvider, ) if err != nil { return server{}, err @@ -202,8 +209,12 @@ func newServer(args ServerParams, listener net.Listener) (server, error) { } } zapLogger := zap.New(args.Logger.Core(), zap.WithCaller(true)) +<<<<<<< HEAD otlp.RegisterGRPCServices(args.GRPCServer, zapLogger, otlpBatchProcessor, args.Semaphore) jaeger.RegisterGRPCServices(args.GRPCServer, zapLogger, args.BatchProcessor, args.AgentConfig, args.Semaphore) +======= + otlp.RegisterGRPCServices(args.GRPCServer, zapLogger, otlpBatchProcessor, args.Semaphore, args.MeterProvider) +>>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) return server{ logger: args.Logger, @@ -242,7 +253,11 @@ func newAgentConfigFetcher( kibanaClient *kibana.Client, newElasticsearchClient func(*elasticsearch.Config) (*elasticsearch.Client, error), tracer *apm.Tracer, +<<<<<<< HEAD logger *logp.Logger, +======= + mp metric.MeterProvider, +>>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) ) (agentcfg.Fetcher, func(context.Context) error, error) { // Always use ElasticsearchFetcher, and as a fallback, use: // 1. 
no fallback if Elasticsearch is explicitly configured @@ -272,8 +287,6 @@ func newAgentConfigFetcher( if err != nil { return nil, nil, err } - esFetcher := agentcfg.NewElasticsearchFetcher(esClient, cfg.AgentConfig.Cache.Expiration, fallbackFetcher, tracer) - agentcfgMonitoringRegistry.Remove("elasticsearch") - monitoring.NewFunc(agentcfgMonitoringRegistry, "elasticsearch", esFetcher.CollectMonitoring, monitoring.Report) + esFetcher := agentcfg.NewElasticsearchFetcher(esClient, cfg.AgentConfig.Cache.Expiration, fallbackFetcher, tracer, mp) return agentcfg.SanitizingFetcher{Fetcher: esFetcher}, esFetcher.Run, nil } diff --git a/internal/beater/server_test.go b/internal/beater/server_test.go index 3f757e6ea19..bc413ca4d1d 100644 --- a/internal/beater/server_test.go +++ b/internal/beater/server_test.go @@ -52,12 +52,10 @@ import ( _ "github.com/elastic/beats/v7/libbeat/publisher/queue/memqueue" agentconfig "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" "github.com/elastic/apm-data/model/modelpb" "github.com/elastic/apm-server/internal/beater" "github.com/elastic/apm-server/internal/beater/api" - "github.com/elastic/apm-server/internal/beater/api/intake" "github.com/elastic/apm-server/internal/beater/beatertest" "github.com/elastic/apm-server/internal/beater/config" "github.com/elastic/apm-server/internal/beater/monitoringtest" @@ -471,12 +469,23 @@ func TestServerElasticsearchOutput(t *testing.T) { defer elasticsearchServer.Close() defer close(done) +<<<<<<< HEAD // Pre-create the libbeat registry with some variables that should not // be reported, as we define our own libbeat metrics registry. 
monitoring.Default.Remove("libbeat.whatever") monitoring.NewInt(monitoring.Default, "libbeat.whatever") srv := beatertest.NewServer(t, beatertest.WithConfig(agentconfig.MustNewConfigFrom(map[string]interface{}{ +======= + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + + srv := beatertest.NewServer(t, beatertest.WithMeterProvider(mp), beatertest.WithConfig(agentconfig.MustNewConfigFrom(map[string]interface{}{ +>>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) "output.elasticsearch": map[string]interface{}{ "hosts": []string{elasticsearchServer.URL}, "flush_interval": "1ms", @@ -502,6 +511,7 @@ func TestServerElasticsearchOutput(t *testing.T) { t.Fatal("timed out waiting for bulk request") } +<<<<<<< HEAD snapshot := monitoring.CollectStructSnapshot(monitoring.Default.GetRegistry("libbeat"), monitoring.Full, false) assert.Equal(t, map[string]interface{}{ "output": map[string]interface{}{ @@ -540,6 +550,13 @@ func TestServerElasticsearchOutput(t *testing.T) { }, }, }, snapshot) +======= + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + "elasticsearch.events.count": 5, + "elasticsearch.events.queued": 5, + "elasticsearch.bulk_requests.available": 9, + }) +>>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) } func TestServerPProf(t *testing.T) { @@ -604,9 +621,16 @@ func TestWrapServerAPMInstrumentationTimeout(t *testing.T) { found := make(chan struct{}) reqCtx, reqCancel := context.WithCancel(context.Background()) + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + escfg, _ := beatertest.ElasticsearchOutputConfig(t) 
_ = logp.DevelopmentSetup(logp.ToObserverOutput()) - srv := beatertest.NewServer(t, beatertest.WithConfig(escfg, agentconfig.MustNewConfigFrom( + srv := beatertest.NewServer(t, beatertest.WithMeterProvider(mp), beatertest.WithConfig(escfg, agentconfig.MustNewConfigFrom( map[string]interface{}{ "instrumentation.enabled": true, })), beatertest.WithWrapServer( @@ -635,8 +659,6 @@ func TestWrapServerAPMInstrumentationTimeout(t *testing.T) { }, )) - monitoringtest.ClearRegistry(intake.MonitoringMap) - req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, srv.URL+api.IntakePath, bytes.NewReader(testData)) require.NoError(t, err) req.Header.Add("Content-Type", "application/x-ndjson") @@ -669,15 +691,12 @@ func TestWrapServerAPMInstrumentationTimeout(t *testing.T) { } } // Assert that metrics have expected response values reported. - equal, result := monitoringtest.CompareMonitoringInt(map[request.ResultID]int{ - request.IDRequestCount: 2, - request.IDResponseCount: 2, - request.IDResponseErrorsCount: 1, - request.IDResponseValidCount: 1, - request.IDResponseErrorsTimeout: 1, // test data POST /intake/v2/events - request.IDResponseValidAccepted: 1, // self-instrumentation - }, intake.MonitoringMap) - assert.True(t, equal, result) + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + "http.server." + string(request.IDRequestCount): 1, + "http.server." + string(request.IDResponseCount): 1, + "http.server." + string(request.IDResponseErrorsCount): 1, + "http.server." 
+ string(request.IDResponseErrorsTimeout): 1, // test data POST /intake/v2/events + }) } var testData = func() []byte { diff --git a/internal/beater/tracing.go b/internal/beater/tracing.go index fdc5cc25211..b858fa5069e 100644 --- a/internal/beater/tracing.go +++ b/internal/beater/tracing.go @@ -21,6 +21,9 @@ import ( "net" "net/http" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" + "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/apm-data/input" @@ -32,7 +35,7 @@ import ( "github.com/elastic/apm-server/internal/beater/ratelimit" ) -func newTracerServer(cfg *config.Config, listener net.Listener, logger *logp.Logger, batchProcessor modelpb.BatchProcessor, semaphore input.Semaphore) (*http.Server, error) { +func newTracerServer(cfg *config.Config, listener net.Listener, logger *logp.Logger, batchProcessor modelpb.BatchProcessor, semaphore input.Semaphore, mp metric.MeterProvider) (*http.Server, error) { ratelimitStore, err := ratelimit.NewStore(1, 1, 1) // unused, arbitrary params if err != nil { return nil, err @@ -51,6 +54,7 @@ func newTracerServer(cfg *config.Config, listener net.Listener, logger *logp.Log nil, // no sourcemap store func() bool { return true }, // ready for publishing semaphore, + noop.NewMeterProvider(), ) if err != nil { return nil, err diff --git a/internal/model/modelprocessor/eventcounter.go b/internal/model/modelprocessor/eventcounter.go index 6757fa57778..62bc21d7cb0 100644 --- a/internal/model/modelprocessor/eventcounter.go +++ b/internal/model/modelprocessor/eventcounter.go @@ -20,9 +20,8 @@ package modelprocessor import ( "context" "fmt" - "sync" - "github.com/elastic/elastic-agent-libs/monitoring" + "go.opentelemetry.io/otel/metric" "github.com/elastic/apm-data/model/modelpb" ) @@ -33,46 +32,30 @@ import ( // Metrics are named after the event type: `processor..transformations`. // These metrics are used to populate the "Processed Events" graphs in Stack Monitoring. 
type EventCounter struct { - registry *monitoring.Registry - - mu sync.RWMutex - eventCounters [modelpb.MaxEventType]*monitoring.Int + // Ideally event type would be a dimension, but we can't + // use dimensions when adapting to libbeat monitoring. + eventCounters [modelpb.MaxEventType + 1]metric.Int64Counter } // NewEventCounter returns an EventCounter that counts events processed, recording -// them as `.transformations` under the given registry. -func NewEventCounter(registry *monitoring.Registry) *EventCounter { - return &EventCounter{registry: registry} +// them as `apm-server.processor..transformations` under the given registry. +func NewEventCounter(mp metric.MeterProvider) *EventCounter { + meter := mp.Meter("github.com/elastic/apm-server/internal/model/modelprocessor") + c := &EventCounter{} + for i := range c.eventCounters { + eventType := modelpb.APMEventType(i) + counter, _ := meter.Int64Counter( + fmt.Sprintf("apm-server.processor.%s.transformations", eventType), + ) + c.eventCounters[i] = counter + } + return c } // ProcessBatch counts events in b, grouping by APMEvent.Processor.Event. func (c *EventCounter) ProcessBatch(ctx context.Context, b *modelpb.Batch) error { for _, event := range *b { - eventType := event.Type() - if eventType == modelpb.UndefinedEventType { - continue - } - - c.mu.RLock() - eventCounter := c.eventCounters[eventType-1] - c.mu.RUnlock() - if eventCounter == nil { - c.mu.Lock() - eventCounter = c.eventCounters[eventType-1] - if eventCounter == nil { - // Metric may exist in the registry but not in our map, - // so first check if it exists before attempting to create. 
- name := fmt.Sprintf("processor.%s.transformations", eventType) - var ok bool - eventCounter, ok = c.registry.Get(name).(*monitoring.Int) - if !ok { - eventCounter = monitoring.NewInt(c.registry, name) - } - c.eventCounters[eventType-1] = eventCounter - } - c.mu.Unlock() - } - eventCounter.Inc() + c.eventCounters[event.Type()].Add(ctx, 1) } return nil } diff --git a/internal/model/modelprocessor/eventcounter_test.go b/internal/model/modelprocessor/eventcounter_test.go index 3bf86fc1c82..7a38b68604c 100644 --- a/internal/model/modelprocessor/eventcounter_test.go +++ b/internal/model/modelprocessor/eventcounter_test.go @@ -22,10 +22,11 @@ import ( "testing" "github.com/stretchr/testify/assert" - - "github.com/elastic/elastic-agent-libs/monitoring" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" "github.com/elastic/apm-data/model/modelpb" + "github.com/elastic/apm-server/internal/beater/monitoringtest" "github.com/elastic/apm-server/internal/model/modelprocessor" ) @@ -37,15 +38,19 @@ func TestEventCounter(t *testing.T) { {Transaction: &modelpb.Transaction{Type: "transaction_type"}}, } - expected := monitoring.MakeFlatSnapshot() - expected.Ints["processor.span.transformations"] = 1 - expected.Ints["processor.transaction.transformations"] = 2 + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) - registry := monitoring.NewRegistry() - processor := modelprocessor.NewEventCounter(registry) + processor := modelprocessor.NewEventCounter(mp) err := processor.ProcessBatch(context.Background(), &batch) assert.NoError(t, err) - snapshot := monitoring.CollectFlatSnapshot(registry, monitoring.Full, false) - assert.Equal(t, expected, snapshot) + monitoringtest.ExpectContainOtelMetrics(t, reader, map[string]any{ + 
"apm-server.processor.span.transformations": 1, + "apm-server.processor.transaction.transformations": 2, + }) } diff --git a/internal/processor/stream/result.go b/internal/processor/stream/result.go deleted file mode 100644 index 88bd5ea2868..00000000000 --- a/internal/processor/stream/result.go +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to Elasticsearch B.V. under one or more contributor -// license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright -// ownership. Elasticsearch B.V. licenses this file to you under -// the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package stream - -import ( - "errors" - - "github.com/elastic/elastic-agent-libs/monitoring" -) - -const ( - errorsLimit = 5 -) - -var ( - m = monitoring.Default.NewRegistry("apm-server.processor.stream") - mAccepted = monitoring.NewInt(m, "accepted") - mInvalid = monitoring.NewInt(m, "errors.invalid") - mTooLarge = monitoring.NewInt(m, "errors.toolarge") -) - -type Result struct { - Accepted int - Errors []error -} - -func (r *Result) LimitedAdd(err error) { - r.add(err, len(r.Errors) < errorsLimit) -} - -func (r *Result) Add(err error) { - r.add(err, true) -} - -func (r *Result) AddAccepted(ct int) { - r.Accepted += ct - mAccepted.Add(int64(ct)) -} - -func (r *Result) add(err error, add bool) { - var invalid *InvalidInputError - if errors.As(err, &invalid) { - if invalid.TooLarge { - mTooLarge.Inc() - } else { - mInvalid.Inc() - } - } - if add { - r.Errors = append(r.Errors, err) - } -} - -type InvalidInputError struct { - TooLarge bool - Message string - Document string -} - -func (e *InvalidInputError) Error() string { - return e.Message -} diff --git a/internal/processor/stream/result_test.go b/internal/processor/stream/result_test.go deleted file mode 100644 index b93718295b2..00000000000 --- a/internal/processor/stream/result_test.go +++ /dev/null @@ -1,68 +0,0 @@ -// Licensed to Elasticsearch B.V. under one or more contributor -// license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright -// ownership. Elasticsearch B.V. licenses this file to you under -// the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -package stream - -import ( - "testing" - - "github.com/pkg/errors" - "github.com/stretchr/testify/assert" -) - -func TestResultAdd(t *testing.T) { - err1 := &InvalidInputError{Message: "err1", Document: "buf1"} - err2 := &InvalidInputError{Message: "err2", Document: "buf2"} - err3 := &InvalidInputError{Message: "err3", Document: "buf3"} - err4 := &InvalidInputError{Message: "err4"} - err5 := &InvalidInputError{Message: "err5"} - err6 := &InvalidInputError{Message: "err6"} - err7 := &InvalidInputError{Message: "err7"} - - result := Result{} - result.LimitedAdd(err1) - result.LimitedAdd(err2) - result.LimitedAdd(err3) - result.LimitedAdd(err4) - result.LimitedAdd(err5) - result.LimitedAdd(err5) - result.LimitedAdd(err6) // limited, not added - result.Add(err7) // unconditionally added - - assert.Len(t, result.Errors, 6) - assert.Equal(t, []error{err1, err2, err3, err4, err5, err7}, result.Errors) -} - -func TestMonitoring(t *testing.T) { - initialAccepted := mAccepted.Get() - initialInvalid := mInvalid.Get() - initialTooLarge := mTooLarge.Get() - - var result Result - result.AddAccepted(9) - result.AddAccepted(3) - for i := 0; i < 10; i++ { - result.LimitedAdd(&InvalidInputError{TooLarge: false}) - } - result.LimitedAdd(&InvalidInputError{TooLarge: true}) - result.Add(&InvalidInputError{TooLarge: true}) - result.Add(errors.New("error")) - - assert.Equal(t, int64(12), mAccepted.Get()-initialAccepted) - assert.Equal(t, int64(10), mInvalid.Get()-initialInvalid) - assert.Equal(t, int64(2), mTooLarge.Get()-initialTooLarge) -} diff --git a/systemtest/otlp_test.go b/systemtest/otlp_test.go index 73e8f3f75c3..51d3245782f 100644 --- a/systemtest/otlp_test.go +++ b/systemtest/otlp_test.go @@ -213,7 +213,7 @@ func TestOTLPGRPCMetrics(t *testing.T) { // Make sure we report monitoring for the metrics consumer. Metric values are unit tested. 
doc := getBeatsMonitoringStats(t, srv, nil) - assert.True(t, gjson.GetBytes(doc.RawSource, "beats_stats.metrics.apm-server.otlp.grpc.metrics.consumer").Exists()) + assert.Equal(t, int64(2), gjson.GetBytes(doc.RawSource, "beats_stats.metrics.apm-server.otlp.grpc.metrics.response.count").Int()) } func TestOTLPGRPCMetrics_partialSuccess(t *testing.T) { diff --git a/x-pack/apm-server/main.go b/x-pack/apm-server/main.go index f2e026c8aab..96a12ecdaff 100644 --- a/x-pack/apm-server/main.go +++ b/x-pack/apm-server/main.go @@ -12,6 +12,7 @@ import ( "sync" "github.com/gofrs/uuid/v5" + "go.opentelemetry.io/otel/metric" "golang.org/x/sync/errgroup" "github.com/elastic/beats/v7/libbeat/common/reload" @@ -20,7 +21,6 @@ import ( "github.com/elastic/elastic-agent-client/v7/pkg/proto" "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" "github.com/elastic/elastic-agent-libs/paths" "github.com/elastic/elastic-agent-libs/transport/tlscommon" @@ -37,13 +37,10 @@ const ( ) var ( - // Note: this registry is created in github.com/elastic/apm-server/sampling. That package - // will hopefully disappear in the future, when agents no longer send unsampled transactions. - samplingMonitoringRegistry = monitoring.Default.GetRegistry("apm-server.sampling") - // badgerDB holds the badger database to use when tail-based sampling is configured. 
- badgerMu sync.Mutex - badgerDB *eventstorage.StorageManager + badgerMu sync.Mutex + badgerDB *eventstorage.StorageManager + badgerDBMetricRegistration metric.Registration storageMu sync.Mutex storage *eventstorage.ManagedReadWriter @@ -105,8 +102,6 @@ func newProcessors(args beater.ServerParams) ([]namedProcessor, error) { if err != nil { return nil, fmt.Errorf("error creating %s: %w", name, err) } - samplingMonitoringRegistry.Remove("tail") - monitoring.NewFunc(samplingMonitoringRegistry, "tail", sampler.CollectMonitoring, monitoring.Report) processors = append(processors, namedProcessor{name: name, processor: sampler}) } return processors, nil @@ -120,7 +115,7 @@ func newTailSamplingProcessor(args beater.ServerParams) (*sampling.Processor, er } storageDir := paths.Resolve(paths.Data, tailSamplingStorageDir) - badgerDB, err = getBadgerDB(storageDir) + badgerDB, err = getBadgerDB(storageDir, args.MeterProvider) if err != nil { return nil, fmt.Errorf("failed to get Badger database: %w", err) } @@ -141,6 +136,7 @@ func newTailSamplingProcessor(args beater.ServerParams) (*sampling.Processor, er return sampling.NewProcessor(sampling.Config{ BatchProcessor: args.BatchProcessor, + MeterProvider: args.MeterProvider, LocalSamplingConfig: sampling.LocalSamplingConfig{ FlushInterval: tailSamplingConfig.Interval, MaxDynamicServices: 1000, @@ -169,7 +165,7 @@ func newTailSamplingProcessor(args beater.ServerParams) (*sampling.Processor, er }) } -func getBadgerDB(storageDir string) (*eventstorage.StorageManager, error) { +func getBadgerDB(storageDir string, mp metric.MeterProvider) (*eventstorage.StorageManager, error) { badgerMu.Lock() defer badgerMu.Unlock() if badgerDB == nil { @@ -178,6 +174,17 @@ func getBadgerDB(storageDir string) (*eventstorage.StorageManager, error) { return nil, err } badgerDB = sm + + meter := mp.Meter("github.com/elastic/apm-server/x-pack/apm-server") + lsmSizeGauge, _ := meter.Int64ObservableGauge("apm-server.sampling.tail.storage.lsm_size") + 
valueLogSizeGauge, _ := meter.Int64ObservableGauge("apm-server.sampling.tail.storage.value_log_size") + + badgerDBMetricRegistration, _ = meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + lsmSize, valueLogSize := sm.Size() + o.ObserveInt64(lsmSizeGauge, lsmSize) + o.ObserveInt64(valueLogSizeGauge, valueLogSize) + return nil + }, lsmSizeGauge, valueLogSizeGauge) } return badgerDB, nil } @@ -257,8 +264,14 @@ func wrapServer(args beater.ServerParams, runServer beater.RunServerFunc) (beate // called concurrently with opening badger.DB/accessing the badgerDB global, // so it does not need to hold badgerMu. func closeBadger() error { + if badgerDBMetricRegistration != nil { + badgerDBMetricRegistration.Unregister() + badgerDBMetricRegistration = nil + } if badgerDB != nil { - return badgerDB.Close() + db := badgerDB + badgerDB = nil + return db.Close() } return nil } diff --git a/x-pack/apm-server/main_test.go b/x-pack/apm-server/main_test.go index fad69edc008..981ffb8f631 100644 --- a/x-pack/apm-server/main_test.go +++ b/x-pack/apm-server/main_test.go @@ -8,28 +8,31 @@ package main import ( "context" + "path/filepath" "testing" + "time" + "github.com/gofrs/uuid/v5" "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.elastic.co/apm/v2/apmtest" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" "github.com/elastic/apm-data/model/modelpb" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" "github.com/elastic/elastic-agent-libs/paths" "github.com/elastic/apm-server/internal/beater" "github.com/elastic/apm-server/internal/beater/config" + "github.com/elastic/apm-server/internal/beater/monitoringtest" "github.com/elastic/apm-server/internal/elasticsearch" + "github.com/elastic/apm-server/x-pack/apm-server/sampling" + "github.com/elastic/apm-server/x-pack/apm-server/sampling/pubsub/pubsubtest" ) func 
TestMonitoring(t *testing.T) { - // samplingMonitoringRegistry will be nil, as under normal circumstances - // we rely on apm-server/sampling to create the registry. - samplingMonitoringRegistry = monitoring.NewRegistry() - home := t.TempDir() err := paths.InitPaths(&paths.Path{Home: home}) require.NoError(t, err) @@ -45,25 +48,137 @@ func TestMonitoring(t *testing.T) { cfg.Aggregation.ServiceTransactions.MaxGroups = 10000 cfg.Aggregation.ServiceDestinations.MaxGroups = 10000 + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + // Wrap & run the server twice, to ensure metric registration does not panic. runServerError := errors.New("runServer") for i := 0; i < 2; i++ { - var tailSamplingMonitoringSnapshot monitoring.FlatSnapshot serverParams, runServer, err := wrapServer(beater.ServerParams{ Config: cfg, Logger: logp.NewLogger(""), Tracer: apmtest.DiscardTracer, + MeterProvider: mp, BatchProcessor: modelpb.ProcessBatchFunc(func(ctx context.Context, b *modelpb.Batch) error { return nil }), Namespace: "default", NewElasticsearchClient: elasticsearch.NewClient, }, func(ctx context.Context, args beater.ServerParams) error { - tailSamplingMonitoringSnapshot = monitoring.CollectFlatSnapshot(samplingMonitoringRegistry, monitoring.Full, false) return runServerError }) require.NoError(t, err) err = runServer(context.Background(), serverParams) assert.Equal(t, runServerError, err) - assert.NotEqual(t, monitoring.MakeFlatSnapshot(), tailSamplingMonitoringSnapshot) + monitoringtest.ExpectOtelMetrics(t, reader, map[string]any{ + "apm-server.sampling.tail.storage.lsm_size": 0, + "apm-server.sampling.tail.storage.value_log_size": 0, + }) + } +} + +func TestStorageMonitoring(t *testing.T) { + config, reader := newTempdirConfig(t) + + processor, err := sampling.NewProcessor(config) + 
require.NoError(t, err) + go processor.Run() + defer processor.Stop(context.Background()) + for i := 0; i < 100; i++ { + traceID := uuid.Must(uuid.NewV4()).String() + batch := modelpb.Batch{{ + Trace: &modelpb.Trace{Id: traceID}, + Event: &modelpb.Event{Duration: uint64(123 * time.Millisecond)}, + Transaction: &modelpb.Transaction{ + Type: "type", + Id: traceID, + Sampled: true, + }, + }} + err := processor.ProcessBatch(context.Background(), &batch) + require.NoError(t, err) + assert.Empty(t, batch) + } + + // Stop the processor, flushing pending writes, and reopen storage. + // Reopening storage is necessary to immediately recalculate the + // storage size, otherwise we must wait for a minute (hard-coded in + // badger) for storage metrics to be updated. + err = processor.Stop(context.Background()) + require.NoError(t, err) + require.NoError(t, closeBadger()) + badgerDB, err = getBadgerDB(config.StorageDir, config.MeterProvider) + require.NoError(t, err) + + lsmSize := getGauge(t, reader, "apm-server.sampling.tail.storage.lsm_size") + assert.NotZero(t, lsmSize) + vlogSize := getGauge(t, reader, "apm-server.sampling.tail.storage.value_log_size") + assert.NotZero(t, vlogSize) +} + +func newTempdirConfig(tb testing.TB) (sampling.Config, sdkmetric.Reader) { + tempdir := filepath.Join(tb.TempDir(), "samplingtest") + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + + closeBadger() + badgerDB, err := getBadgerDB(tempdir, mp) + require.NoError(tb, err) + tb.Cleanup(func() { closeBadger() }) + + storage := badgerDB.NewReadWriter() + + return sampling.Config{ + BatchProcessor: modelpb.ProcessBatchFunc(func(context.Context, *modelpb.Batch) error { return nil }), + MeterProvider: mp, + LocalSamplingConfig: sampling.LocalSamplingConfig{ + FlushInterval: time.Second, + 
MaxDynamicServices: 1000, + IngestRateDecayFactor: 0.9, + Policies: []sampling.Policy{ + {SampleRate: 0.1}, + }, + }, + RemoteSamplingConfig: sampling.RemoteSamplingConfig{ + Elasticsearch: pubsubtest.Client(nil, nil), + SampledTracesDataStream: sampling.DataStreamConfig{ + Type: "traces", + Dataset: "sampled", + Namespace: "testing", + }, + UUID: "local-apm-server", + }, + StorageConfig: sampling.StorageConfig{ + DB: badgerDB, + Storage: storage, + StorageDir: tempdir, + StorageGCInterval: time.Second, + TTL: 30 * time.Minute, + StorageLimit: 0, // No storage limit. + }, + }, reader +} + +func getGauge(t testing.TB, reader sdkmetric.Reader, name string) int64 { + var rm metricdata.ResourceMetrics + assert.NoError(t, reader.Collect(context.Background(), &rm)) + + assert.NotEqual(t, 0, len(rm.ScopeMetrics)) + + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name == name { + return m.Data.(metricdata.Gauge[int64]).DataPoints[0].Value + } + } } + + return 0 } diff --git a/x-pack/apm-server/sampling/config.go b/x-pack/apm-server/sampling/config.go index b6d4d6ce252..7f45d8f2a06 100644 --- a/x-pack/apm-server/sampling/config.go +++ b/x-pack/apm-server/sampling/config.go @@ -8,6 +8,7 @@ import ( "time" "github.com/pkg/errors" + "go.opentelemetry.io/otel/metric" "github.com/elastic/apm-data/model/modelpb" "github.com/elastic/apm-server/x-pack/apm-server/sampling/eventstorage" @@ -21,6 +22,10 @@ type Config struct { // tail-sampled trace events. BatchProcessor modelpb.BatchProcessor + // MeterProvider holds a metric.MeterProvider that can be used for + // creating metrics. 
+ MeterProvider metric.MeterProvider + LocalSamplingConfig RemoteSamplingConfig StorageConfig diff --git a/x-pack/apm-server/sampling/groups.go b/x-pack/apm-server/sampling/groups.go index c852ffa36d3..ddef682e279 100644 --- a/x-pack/apm-server/sampling/groups.go +++ b/x-pack/apm-server/sampling/groups.go @@ -5,12 +5,15 @@ package sampling import ( + "context" "errors" "math" "math/rand" "sync" "time" + "go.opentelemetry.io/otel/metric" + "github.com/elastic/apm-data/model/modelpb" ) @@ -32,6 +35,10 @@ type traceGroups struct { // be created, and events may be dropped. maxDynamicServiceGroups int + // numDynamicServiceGroupsCounter is used for reporting the current number + // of dynamic service groups. + numDynamicServiceGroupsCounter metric.Int64UpDownCounter + mu sync.RWMutex policyGroups []policyGroup numDynamicServiceGroups int @@ -60,14 +67,17 @@ func (g *policyGroup) match(transactionEvent *modelpb.APMEvent) bool { } func newTraceGroups( + meter metric.Meter, policies []Policy, maxDynamicServiceGroups int, ingestRateDecayFactor float64, ) *traceGroups { + numDynamicServiceGroupsCounter, _ := meter.Int64UpDownCounter("apm-server.sampling.tail.dynamic_service_groups") groups := &traceGroups{ - ingestRateDecayFactor: ingestRateDecayFactor, - maxDynamicServiceGroups: maxDynamicServiceGroups, - policyGroups: make([]policyGroup, len(policies)), + ingestRateDecayFactor: ingestRateDecayFactor, + maxDynamicServiceGroups: maxDynamicServiceGroups, + numDynamicServiceGroupsCounter: numDynamicServiceGroupsCounter, + policyGroups: make([]policyGroup, len(policies)), } for i, policy := range policies { pg := policyGroup{policy: policy} @@ -150,6 +160,7 @@ func (g *traceGroups) getTraceGroup(transactionEvent *modelpb.APMEvent) (*traceG return nil, errTooManyTraceGroups } g.numDynamicServiceGroups++ + g.numDynamicServiceGroupsCounter.Add(context.Background(), 1) group = newTraceGroup(pg.policy.SampleRate) pg.dynamic[transactionEvent.GetService().GetName()] = group } diff 
--git a/x-pack/apm-server/sampling/groups_test.go b/x-pack/apm-server/sampling/groups_test.go index 1312f806e39..4fbaf313a5e 100644 --- a/x-pack/apm-server/sampling/groups_test.go +++ b/x-pack/apm-server/sampling/groups_test.go @@ -12,6 +12,7 @@ import ( "github.com/gofrs/uuid/v5" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/metric/noop" "github.com/elastic/apm-data/model/modelpb" ) @@ -63,7 +64,7 @@ func TestTraceGroupsPolicies(t *testing.T) { policy.ServiceName = "" policies = append(policies, policy) } - groups := newTraceGroups(policies, 1000, 1.0) + groups := newTraceGroups(noop.Meter{}, policies, 1000, 1.0) assertSampleRate := func(sampleRate float64, serviceName, serviceEnvironment, traceOutcome, traceName string) { tx := makeTransaction(serviceName, serviceEnvironment, traceOutcome, traceName) @@ -91,7 +92,7 @@ func TestTraceGroupsMax(t *testing.T) { ingestRateCoefficient = 1.0 ) policies := []Policy{{SampleRate: 1.0}} - groups := newTraceGroups(policies, maxDynamicServices, ingestRateCoefficient) + groups := newTraceGroups(noop.Meter{}, policies, maxDynamicServices, ingestRateCoefficient) for i := 0; i < maxDynamicServices; i++ { serviceName := fmt.Sprintf("service_group_%d", i) @@ -130,7 +131,7 @@ func TestTraceGroupReservoirResize(t *testing.T) { ingestRateCoefficient = 0.75 ) policies := []Policy{{SampleRate: 0.2}} - groups := newTraceGroups(policies, maxDynamicServices, ingestRateCoefficient) + groups := newTraceGroups(noop.Meter{}, policies, maxDynamicServices, ingestRateCoefficient) sendTransactions := func(n int) { for i := 0; i < n; i++ { @@ -172,7 +173,7 @@ func TestTraceGroupReservoirResizeMinimum(t *testing.T) { ingestRateCoefficient = 1.0 ) policies := []Policy{{SampleRate: 0.1}} - groups := newTraceGroups(policies, maxDynamicServices, ingestRateCoefficient) + groups := newTraceGroups(noop.Meter{}, policies, maxDynamicServices, ingestRateCoefficient) sendTransactions := func(n int) { for 
i := 0; i < n; i++ { @@ -208,7 +209,7 @@ func TestTraceGroupsRemoval(t *testing.T) { {SampleRate: 0.5}, {PolicyCriteria: PolicyCriteria{ServiceName: "defined_later"}, SampleRate: 0.5}, } - groups := newTraceGroups(policies, maxDynamicServices, ingestRateCoefficient) + groups := newTraceGroups(noop.Meter{}, policies, maxDynamicServices, ingestRateCoefficient) for i := 0; i < 10000; i++ { _, err := groups.sampleTrace(&modelpb.APMEvent{ @@ -263,7 +264,7 @@ func BenchmarkTraceGroups(b *testing.B) { ingestRateCoefficient = 1.0 ) policies := []Policy{{SampleRate: 1.0}} - groups := newTraceGroups(policies, maxDynamicServices, ingestRateCoefficient) + groups := newTraceGroups(noop.Meter{}, policies, maxDynamicServices, ingestRateCoefficient) b.RunParallel(func(pb *testing.PB) { // Transaction identifiers are different for each goroutine, simulating diff --git a/x-pack/apm-server/sampling/processor.go b/x-pack/apm-server/sampling/processor.go index 4289a991597..716f5a9e659 100644 --- a/x-pack/apm-server/sampling/processor.go +++ b/x-pack/apm-server/sampling/processor.go @@ -10,10 +10,10 @@ import ( "fmt" "os" "sync" - "sync/atomic" "time" "github.com/pkg/errors" + "go.opentelemetry.io/otel/metric" "golang.org/x/sync/errgroup" "github.com/elastic/apm-data/model/modelpb" @@ -21,7 +21,6 @@ import ( "github.com/elastic/apm-server/x-pack/apm-server/sampling/eventstorage" "github.com/elastic/apm-server/x-pack/apm-server/sampling/pubsub" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" ) const ( @@ -42,7 +41,7 @@ type Processor struct { groups *traceGroups eventStore *wrappedRW - eventMetrics *eventMetrics // heap-allocated for 64-bit alignment + eventMetrics eventMetrics stopMu sync.Mutex stopping chan struct{} @@ -50,12 +49,12 @@ type Processor struct { } type eventMetrics struct { - processed int64 - dropped int64 - stored int64 - sampled int64 - headUnsampled int64 - failedWrites int64 + processed metric.Int64Counter + dropped 
metric.Int64Counter + stored metric.Int64Counter + sampled metric.Int64Counter + headUnsampled metric.Int64Counter + failedWrites metric.Int64Counter } // NewProcessor returns a new Processor, for tail-sampling trace events. @@ -64,54 +63,27 @@ func NewProcessor(config Config) (*Processor, error) { return nil, errors.Wrap(err, "invalid tail-sampling config") } + meter := config.MeterProvider.Meter("github.com/elastic/apm-server/x-pack/apm-server/sampling") + logger := logp.NewLogger(logs.Sampling) p := &Processor{ config: config, logger: logger, rateLimitedLogger: logger.WithOptions(logs.WithRateLimit(loggerRateLimit)), - groups: newTraceGroups(config.Policies, config.MaxDynamicServices, config.IngestRateDecayFactor), + groups: newTraceGroups(meter, config.Policies, config.MaxDynamicServices, config.IngestRateDecayFactor), eventStore: newWrappedRW(config.Storage, config.TTL, int64(config.StorageLimit)), - eventMetrics: &eventMetrics{}, stopping: make(chan struct{}), stopped: make(chan struct{}), } - return p, nil -} -// CollectMonitoring may be called to collect monitoring metrics related to -// tail-sampling. It is intended to be used with libbeat/monitoring.NewFunc. -// -// The metrics should be added to the "apm-server.sampling.tail" registry. 
-func (p *Processor) CollectMonitoring(_ monitoring.Mode, V monitoring.Visitor) { - V.OnRegistryStart() - defer V.OnRegistryFinished() + p.eventMetrics.processed, _ = meter.Int64Counter("apm-server.sampling.tail.events.processed") + p.eventMetrics.dropped, _ = meter.Int64Counter("apm-server.sampling.tail.events.dropped") + p.eventMetrics.stored, _ = meter.Int64Counter("apm-server.sampling.tail.events.stored") + p.eventMetrics.sampled, _ = meter.Int64Counter("apm-server.sampling.tail.events.sampled") + p.eventMetrics.headUnsampled, _ = meter.Int64Counter("apm-server.sampling.tail.events.head_unsampled") + p.eventMetrics.failedWrites, _ = meter.Int64Counter("apm-server.sampling.tail.events.failed_writes") - // TODO(axw) it might be nice to also report some metrics about: - // - // - The time between receiving events and when they are indexed. - // This could be accomplished by recording the time when the - // payload was received in the ECS field `event.created`. The - // final metric would ideally be a distribution, which is not - // currently an option in libbeat/monitoring. 
- - p.groups.mu.RLock() - numDynamicGroups := p.groups.numDynamicServiceGroups - p.groups.mu.RUnlock() - monitoring.ReportInt(V, "dynamic_service_groups", int64(numDynamicGroups)) - - monitoring.ReportNamespace(V, "storage", func() { - lsmSize, valueLogSize := p.config.DB.Size() - monitoring.ReportInt(V, "lsm_size", int64(lsmSize)) - monitoring.ReportInt(V, "value_log_size", int64(valueLogSize)) - }) - monitoring.ReportNamespace(V, "events", func() { - monitoring.ReportInt(V, "processed", atomic.LoadInt64(&p.eventMetrics.processed)) - monitoring.ReportInt(V, "dropped", atomic.LoadInt64(&p.eventMetrics.dropped)) - monitoring.ReportInt(V, "stored", atomic.LoadInt64(&p.eventMetrics.stored)) - monitoring.ReportInt(V, "sampled", atomic.LoadInt64(&p.eventMetrics.sampled)) - monitoring.ReportInt(V, "head_unsampled", atomic.LoadInt64(&p.eventMetrics.headUnsampled)) - monitoring.ReportInt(V, "failed_writes", atomic.LoadInt64(&p.eventMetrics.failedWrites)) - }) + return p, nil } // ProcessBatch tail-samples transactions and spans. 
@@ -133,11 +105,11 @@ func (p *Processor) ProcessBatch(ctx context.Context, batch *modelpb.Batch) erro var err error switch event.Type() { case modelpb.TransactionEventType: - atomic.AddInt64(&p.eventMetrics.processed, 1) - report, stored, err = p.processTransaction(event) + p.eventMetrics.processed.Add(ctx, 1) + report, stored, err = p.processTransaction(ctx, event) case modelpb.SpanEventType: - atomic.AddInt64(&p.eventMetrics.processed, 1) - report, stored, err = p.processSpan(event) + p.eventMetrics.processed.Add(ctx, 1) + report, stored, err = p.processSpan(ctx, event) default: continue } @@ -164,18 +136,18 @@ func (p *Processor) ProcessBatch(ctx context.Context, batch *modelpb.Batch) erro i-- } - p.updateProcessorMetrics(report, stored, failed) + p.updateProcessorMetrics(ctx, report, stored, failed) } *batch = events return nil } -func (p *Processor) updateProcessorMetrics(report, stored, failedWrite bool) { +func (p *Processor) updateProcessorMetrics(ctx context.Context, report, stored, failedWrite bool) { if failedWrite { - atomic.AddInt64(&p.eventMetrics.failedWrites, 1) + p.eventMetrics.failedWrites.Add(ctx, 1) } if stored { - atomic.AddInt64(&p.eventMetrics.stored, 1) + p.eventMetrics.stored.Add(ctx, 1) } else if !report { // We only increment the "dropped" counter if // we neither reported nor stored the event, so @@ -186,15 +158,15 @@ func (p *Processor) updateProcessorMetrics(report, stored, failedWrite bool) { // The counter does not include events that are // implicitly dropped, i.e. stored and never // indexed. - atomic.AddInt64(&p.eventMetrics.dropped, 1) + p.eventMetrics.dropped.Add(ctx, 1) } } -func (p *Processor) processTransaction(event *modelpb.APMEvent) (report, stored bool, _ error) { +func (p *Processor) processTransaction(ctx context.Context, event *modelpb.APMEvent) (report, stored bool, _ error) { if !event.Transaction.Sampled { // (Head-based) unsampled transactions are passed through // by the tail sampler. 
- atomic.AddInt64(&p.eventMetrics.headUnsampled, 1) + p.eventMetrics.headUnsampled.Add(ctx, 1) return true, false, nil } @@ -205,7 +177,7 @@ func (p *Processor) processTransaction(event *modelpb.APMEvent) (report, stored // if it was sampled. report := traceSampled if report { - atomic.AddInt64(&p.eventMetrics.sampled, 1) + p.eventMetrics.sampled.Add(ctx, 1) } return report, false, nil case eventstorage.ErrNotFound: @@ -257,7 +229,7 @@ sampling policies without service name specified. return false, true, p.eventStore.WriteTraceEvent(event.Trace.Id, event.Transaction.Id, event) } -func (p *Processor) processSpan(event *modelpb.APMEvent) (report, stored bool, _ error) { +func (p *Processor) processSpan(ctx context.Context, event *modelpb.APMEvent) (report, stored bool, _ error) { traceSampled, err := p.eventStore.IsTraceSampled(event.Trace.Id) if err != nil { if err == eventstorage.ErrNotFound { @@ -268,7 +240,7 @@ func (p *Processor) processSpan(event *modelpb.APMEvent) (report, stored bool, _ } // Tail-sampling decision has been made, report or drop the event. 
if traceSampled { - atomic.AddInt64(&p.eventMetrics.sampled, 1) + p.eventMetrics.sampled.Add(ctx, 1) } return traceSampled, false, nil } @@ -509,7 +481,7 @@ func (p *Processor) Run() error { } } } - atomic.AddInt64(&p.eventMetrics.sampled, int64(len(events))) + p.eventMetrics.sampled.Add(gracefulContext, int64(len(events))) if err := p.config.BatchProcessor.ProcessBatch(gracefulContext, &events); err != nil { p.logger.With(logp.Error(err)).Warn("failed to report events") } diff --git a/x-pack/apm-server/sampling/processor_bench_test.go b/x-pack/apm-server/sampling/processor_bench_test.go index 196a62f0cd4..85940eff0d2 100644 --- a/x-pack/apm-server/sampling/processor_bench_test.go +++ b/x-pack/apm-server/sampling/processor_bench_test.go @@ -20,7 +20,8 @@ import ( ) func BenchmarkProcess(b *testing.B) { - processor, err := sampling.NewProcessor(newTempdirConfig(b)) + cfg, _ := newTempdirConfig(b) + processor, err := sampling.NewProcessor(cfg) require.NoError(b, err) go processor.Run() b.Cleanup(func() { processor.Stop(context.Background()) }) diff --git a/x-pack/apm-server/sampling/processor_test.go b/x-pack/apm-server/sampling/processor_test.go index 5a301e208cf..2f28edba34d 100644 --- a/x-pack/apm-server/sampling/processor_test.go +++ b/x-pack/apm-server/sampling/processor_test.go @@ -9,7 +9,6 @@ import ( "fmt" "math/rand" "os" - "path" "path/filepath" "runtime" "sort" @@ -22,18 +21,21 @@ import ( "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" "golang.org/x/sync/errgroup" "google.golang.org/protobuf/testing/protocmp" "github.com/elastic/apm-data/model/modelpb" + "github.com/elastic/apm-server/internal/beater/monitoringtest" "github.com/elastic/apm-server/x-pack/apm-server/sampling" "github.com/elastic/apm-server/x-pack/apm-server/sampling/eventstorage" 
"github.com/elastic/apm-server/x-pack/apm-server/sampling/pubsub/pubsubtest" - "github.com/elastic/elastic-agent-libs/monitoring" ) func TestProcessUnsampled(t *testing.T) { - processor, err := sampling.NewProcessor(newTempdirConfig(t)) + cfg, _ := newTempdirConfig(t) + processor, err := sampling.NewProcessor(cfg) require.NoError(t, err) go processor.Run() defer processor.Stop(context.Background()) @@ -57,7 +59,7 @@ func TestProcessUnsampled(t *testing.T) { } func TestProcessAlreadyTailSampled(t *testing.T) { - config := newTempdirConfig(t) + config, metricreader := newTempdirConfig(t) // Seed event storage with a tail-sampling decisions, to show that // subsequent events in the trace will be reported immediately. @@ -129,14 +131,11 @@ func TestProcessAlreadyTailSampled(t *testing.T) { // they were received after the trace sampling entry expired. assert.Equal(t, modelpb.Batch{&transaction1, &span1}, batch) - expectedMonitoring := monitoring.MakeFlatSnapshot() - expectedMonitoring.Ints["sampling.events.processed"] = 4 - expectedMonitoring.Ints["sampling.events.head_unsampled"] = 0 - expectedMonitoring.Ints["sampling.events.stored"] = 2 - expectedMonitoring.Ints["sampling.events.sampled"] = 2 - expectedMonitoring.Ints["sampling.events.dropped"] = 0 - expectedMonitoring.Ints["sampling.events.failed_writes"] = 0 - assertMonitoring(t, processor, expectedMonitoring, `sampling.events.*`) + monitoringtest.ExpectContainOtelMetrics(t, metricreader, map[string]any{ + "apm-server.sampling.tail.events.processed": 4, + "apm-server.sampling.tail.events.stored": 2, + "apm-server.sampling.tail.events.sampled": 2, + }) // Stop the processor and flush global storage so we can access the database. 
assert.NoError(t, processor.Stop(context.Background())) @@ -167,7 +166,7 @@ func TestProcessLocalTailSampling(t *testing.T) { }, } { t.Run(fmt.Sprintf("%f", tc.sampleRate), func(t *testing.T) { - config := newTempdirConfig(t) + config, metricreader := newTempdirConfig(t) config.Policies = []sampling.Policy{{SampleRate: tc.sampleRate}} config.FlushInterval = 10 * time.Millisecond published := make(chan string) @@ -248,14 +247,11 @@ func TestProcessLocalTailSampling(t *testing.T) { sampledTraceEvents = trace2Events } - expectedMonitoring := monitoring.MakeFlatSnapshot() - expectedMonitoring.Ints["sampling.events.processed"] = 4 - expectedMonitoring.Ints["sampling.events.stored"] = 4 - expectedMonitoring.Ints["sampling.events.sampled"] = 2 - expectedMonitoring.Ints["sampling.events.head_unsampled"] = 0 - expectedMonitoring.Ints["sampling.events.dropped"] = 0 - expectedMonitoring.Ints["sampling.events.failed_writes"] = 0 - assertMonitoring(t, processor, expectedMonitoring, `sampling.events.*`) + monitoringtest.ExpectContainOtelMetrics(t, metricreader, map[string]any{ + "apm-server.sampling.tail.events.processed": 4, + "apm-server.sampling.tail.events.stored": 4, + "apm-server.sampling.tail.events.sampled": 2, + }) // Stop the processor and flush global storage so we can access the database. assert.NoError(t, processor.Stop(context.Background())) @@ -289,7 +285,7 @@ func TestProcessLocalTailSampling(t *testing.T) { } func TestProcessLocalTailSamplingUnsampled(t *testing.T) { - config := newTempdirConfig(t) + config, metricreader := newTempdirConfig(t) config.FlushInterval = time.Minute processor, err := sampling.NewProcessor(config) require.NoError(t, err) @@ -315,7 +311,7 @@ func TestProcessLocalTailSamplingUnsampled(t *testing.T) { assert.Empty(t, batch) // break out of the loop as soon as the first one is dropped. 
- droppedEvents := collectProcessorMetrics(processor).Ints["sampling.events.dropped"] + droppedEvents := getSum(t, metricreader, "apm-server.sampling.events.dropped") if droppedEvents != 0 { break } @@ -344,7 +340,7 @@ func TestProcessLocalTailSamplingUnsampled(t *testing.T) { } func TestProcessLocalTailSamplingPolicyOrder(t *testing.T) { - config := newTempdirConfig(t) + config, _ := newTempdirConfig(t) config.Policies = []sampling.Policy{{ PolicyCriteria: sampling.PolicyCriteria{TraceName: "trace_name"}, SampleRate: 0.5, @@ -411,7 +407,7 @@ func TestProcessLocalTailSamplingPolicyOrder(t *testing.T) { } func TestProcessRemoteTailSampling(t *testing.T) { - config := newTempdirConfig(t) + config, metricreader := newTempdirConfig(t) config.Policies = []sampling.Policy{{SampleRate: 0.5}} config.FlushInterval = 10 * time.Millisecond @@ -479,14 +475,11 @@ func TestProcessRemoteTailSampling(t *testing.T) { assert.NoError(t, config.Storage.Flush()) assert.Empty(t, published) // remote decisions don't get republished - expectedMonitoring := monitoring.MakeFlatSnapshot() - expectedMonitoring.Ints["sampling.events.processed"] = 1 - expectedMonitoring.Ints["sampling.events.stored"] = 1 - expectedMonitoring.Ints["sampling.events.sampled"] = 1 - expectedMonitoring.Ints["sampling.events.head_unsampled"] = 0 - expectedMonitoring.Ints["sampling.events.dropped"] = 0 - expectedMonitoring.Ints["sampling.events.failed_writes"] = 0 - assertMonitoring(t, processor, expectedMonitoring, `sampling.events.*`) + monitoringtest.ExpectOtelMetrics(t, metricreader, map[string]any{ + "apm-server.sampling.tail.events.processed": 1, + "apm-server.sampling.tail.events.stored": 1, + "apm-server.sampling.tail.events.sampled": 1, + }) assert.Empty(t, cmp.Diff(trace1Events, events, protocmp.Transform())) @@ -543,7 +536,7 @@ func (m errorRW) Flush() error { func TestProcessDiscardOnWriteFailure(t *testing.T) { for _, discard := range []bool{true, false} { t.Run(fmt.Sprintf("discard=%v", discard), func(t 
*testing.T) { - config := newTempdirConfig(t) + config, _ := newTempdirConfig(t) config.DiscardOnWriteFailure = discard config.Storage = errorRW{err: errors.New("boom")} processor, err := sampling.NewProcessor(config) @@ -576,7 +569,7 @@ func TestProcessDiscardOnWriteFailure(t *testing.T) { } func TestGroupsMonitoring(t *testing.T) { - config := newTempdirConfig(t) + config, reader := newTempdirConfig(t) config.MaxDynamicServices = 5 config.FlushInterval = time.Minute config.Policies[0].SampleRate = 0.99 @@ -600,50 +593,13 @@ func TestGroupsMonitoring(t *testing.T) { require.NoError(t, err) } - expectedMonitoring := monitoring.MakeFlatSnapshot() - expectedMonitoring.Ints["sampling.dynamic_service_groups"] = int64(config.MaxDynamicServices) - expectedMonitoring.Ints["sampling.events.processed"] = int64(config.MaxDynamicServices) + 2 - expectedMonitoring.Ints["sampling.events.stored"] = int64(config.MaxDynamicServices) - expectedMonitoring.Ints["sampling.events.dropped"] = 1 // final event dropped, after service limit reached - expectedMonitoring.Ints["sampling.events.sampled"] = 0 - expectedMonitoring.Ints["sampling.events.head_unsampled"] = 1 - expectedMonitoring.Ints["sampling.events.failed_writes"] = 0 - assertMonitoring(t, processor, expectedMonitoring, `sampling.events.*`, `sampling.dynamic_service_groups`) -} - -func TestStorageMonitoring(t *testing.T) { - config := newTempdirConfig(t) - - processor, err := sampling.NewProcessor(config) - require.NoError(t, err) - go processor.Run() - defer processor.Stop(context.Background()) - for i := 0; i < 100; i++ { - traceID := uuid.Must(uuid.NewV4()).String() - batch := modelpb.Batch{{ - Trace: &modelpb.Trace{Id: traceID}, - Event: &modelpb.Event{Duration: uint64(123 * time.Millisecond)}, - Transaction: &modelpb.Transaction{ - Type: "type", - Id: traceID, - Sampled: true, - }, - }} - err := processor.ProcessBatch(context.Background(), &batch) - require.NoError(t, err) - assert.Empty(t, batch) - } - - // Stop the 
processor and create a new one, which will reopen storage - // and calculate the storage size. Otherwise we must wait for a minute - // (hard-coded in badger) for storage metrics to be updated. - processor.Stop(context.Background()) - processor, err = sampling.NewProcessor(config) - require.NoError(t, err) - - metrics := collectProcessorMetrics(processor) - assert.NotZero(t, metrics.Ints, "sampling.storage.lsm_size") - assert.NotZero(t, metrics.Ints, "sampling.storage.value_log_size") + monitoringtest.ExpectOtelMetrics(t, reader, map[string]any{ + "apm-server.sampling.tail.dynamic_service_groups": config.MaxDynamicServices, + "apm-server.sampling.tail.events.processed": config.MaxDynamicServices + 2, + "apm-server.sampling.tail.events.stored": config.MaxDynamicServices, + "apm-server.sampling.tail.events.dropped": 1, // final event dropped, after service limit reached + "apm-server.sampling.tail.events.head_unsampled": 1, + }) } func TestStorageGC(t *testing.T) { @@ -651,7 +607,7 @@ func TestStorageGC(t *testing.T) { t.Skip("skipping slow test") } - config := newTempdirConfig(t) + config, _ := newTempdirConfig(t) config.TTL = 10 * time.Millisecond config.FlushInterval = 10 * time.Millisecond @@ -707,7 +663,7 @@ func TestStorageGCConcurrency(t *testing.T) { t.Skip("skipping slow test") } - config := newTempdirConfig(t) + config, _ := newTempdirConfig(t) config.TTL = 10 * time.Millisecond config.FlushInterval = 10 * time.Millisecond config.StorageGCInterval = 10 * time.Millisecond @@ -758,7 +714,7 @@ func TestStorageLimit(t *testing.T) { return processor } - config := newTempdirConfig(t) + config, metricreader := newTempdirConfig(t) // Write 5K span events and close the DB to persist to disk the storage // size and assert that none are reported immediately. writeBatch(5000, config, func(b modelpb.Batch) { @@ -786,13 +742,13 @@ func TestStorageLimit(t *testing.T) { // we have, the more sharded writes we'll have, resulting in a greater buffer. 
// To avoid huge test time on large systems do this incrementally for i := 1; i < runtime.NumCPU(); i++ { - processor := writeBatch(150_000*i, config, func(b modelpb.Batch) { + writeBatch(150_000*i, config, func(b modelpb.Batch) { assert.NotEmpty(t, b) }) - failedWrites := collectProcessorMetrics(processor).Ints["sampling.events.failed_writes"] - t.Log(failedWrites) // Ensure that there are some failed writes. + failedWrites := getSum(t, metricreader, "apm-server.sampling.tail.events.failed_writes") + t.Log(failedWrites) if failedWrites >= 1 { return @@ -803,7 +759,7 @@ func TestStorageLimit(t *testing.T) { } func TestProcessRemoteTailSamplingPersistence(t *testing.T) { - config := newTempdirConfig(t) + config, _ := newTempdirConfig(t) config.Policies = []sampling.Policy{{SampleRate: 0.5}} config.FlushInterval = 10 * time.Millisecond @@ -876,7 +832,7 @@ func TestDropLoop(t *testing.T) { } { t.Run(tc.name, func(t *testing.T) { t.Parallel() - config := newTempdirConfig(t) + config, _ := newTempdirConfig(t) config.StorageGCInterval = time.Hour // effectively disable GC config.FlushInterval = 10 * time.Millisecond @@ -952,7 +908,7 @@ func TestDropLoop(t *testing.T) { } func TestGracefulShutdown(t *testing.T) { - config := newTempdirConfig(t) + config, _ := newTempdirConfig(t) sampleRate := 0.5 config.Policies = []sampling.Policy{{SampleRate: sampleRate}} config.FlushInterval = time.Minute // disable finalize @@ -993,7 +949,7 @@ func TestGracefulShutdown(t *testing.T) { assert.Equal(t, int(sampleRate*float64(totalTraces)), count) } -func newTempdirConfig(tb testing.TB) sampling.Config { +func newTempdirConfig(tb testing.TB) (sampling.Config, sdkmetric.Reader) { tempdir, err := os.MkdirTemp("", "samplingtest") require.NoError(tb, err) tb.Cleanup(func() { os.RemoveAll(tempdir) }) @@ -1004,8 +960,16 @@ func newTempdirConfig(tb testing.TB) sampling.Config { storage := badgerDB.NewReadWriter() + reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( + func(ik 
sdkmetric.InstrumentKind) metricdata.Temporality { + return metricdata.DeltaTemporality + }, + )) + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + return sampling.Config{ BatchProcessor: modelpb.ProcessBatchFunc(func(context.Context, *modelpb.Batch) error { return nil }), + MeterProvider: mp, LocalSamplingConfig: sampling.LocalSamplingConfig{ FlushInterval: time.Second, MaxDynamicServices: 1000, @@ -1031,63 +995,24 @@ func newTempdirConfig(tb testing.TB) sampling.Config { TTL: 30 * time.Minute, StorageLimit: 0, // No storage limit. }, - } + }, reader } -func assertMonitoring(t testing.TB, p *sampling.Processor, expected monitoring.FlatSnapshot, matches ...string) { - t.Helper() - actual := collectProcessorMetrics(p) - matchAny := func(k string) bool { return true } - if len(matches) > 0 { - matchAny = func(k string) bool { - for _, pattern := range matches { - matched, err := path.Match(pattern, k) - if err != nil { - panic(err) - } - if matched { - return true - } +func getSum(t testing.TB, reader sdkmetric.Reader, name string) int64 { + var rm metricdata.ResourceMetrics + assert.NoError(t, reader.Collect(context.Background(), &rm)) + + assert.NotEqual(t, 0, len(rm.ScopeMetrics)) + + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name == name { + return m.Data.(metricdata.Sum[int64]).DataPoints[0].Value } - return false - } - } - for k := range actual.Bools { - if !matchAny(k) { - delete(actual.Bools, k) - } - } - for k := range actual.Ints { - if !matchAny(k) { - delete(actual.Ints, k) - } - } - for k := range actual.Floats { - if !matchAny(k) { - delete(actual.Floats, k) - } - } - for k := range actual.Strings { - if !matchAny(k) { - delete(actual.Strings, k) } } - for k := range actual.StringSlices { - if !matchAny(k) { - delete(actual.StringSlices, k) - } - } - assert.Equal(t, expected, actual) -} -func collectProcessorMetrics(p *sampling.Processor) monitoring.FlatSnapshot { - registry := monitoring.NewRegistry() 
- monitoring.NewFunc(registry, "sampling", p.CollectMonitoring) - return monitoring.CollectFlatSnapshot( - registry, - monitoring.Full, - false, // expvar - ) + return 0 } // waitFileModified waits up to 10 seconds for filename to exist and for its From 10d0fd8c2f97df001cec20eea6f6d3185e9f066f Mon Sep 17 00:00:00 2001 From: kruskal <99559985+kruskall@users.noreply.github.com> Date: Fri, 31 Jan 2025 19:25:19 +0100 Subject: [PATCH 2/2] fix: resolve conflicts --- internal/beatcmd/beat.go | 36 ++++++++++++---- internal/beater/beater_test.go | 3 +- internal/beater/interceptors/metrics.go | 4 ++ internal/beater/jaeger/grpc.go | 52 +---------------------- internal/beater/jaeger/grpc_test.go | 3 +- internal/beater/server.go | 14 +----- internal/beater/server_test.go | 6 --- systemtest/jaeger_test.go | 3 -- x-pack/apm-server/sampling/groups_test.go | 2 +- 9 files changed, 40 insertions(+), 83 deletions(-) diff --git a/internal/beatcmd/beat.go b/internal/beatcmd/beat.go index 2eac2938063..177d3b421f5 100644 --- a/internal/beatcmd/beat.go +++ b/internal/beatcmd/beat.go @@ -577,21 +577,41 @@ func getScalarInt64(data metricdata.Aggregation) (int64, bool) { } func addAPMServerMetrics(v monitoring.Visitor, sm metricdata.ScopeMetrics) { + beatsMetrics := make(map[string]any) for _, m := range sm.Metrics { if suffix, ok := strings.CutPrefix(m.Name, "apm-server."); ok { if value, ok := getScalarInt64(m.Data); ok { - keys := strings.Split(suffix, ".") - for i := 0; i < len(keys)-1; i++ { - v.OnRegistryStart() - v.OnKey(keys[i]) - } - monitoring.ReportInt(v, keys[len(keys)-1], value) - for i := 0; i < len(keys)-1; i++ { - v.OnRegistryFinished() + current := beatsMetrics + suffixSlice := strings.Split(suffix, ".") + for i := 0; i < len(suffixSlice)-1; i++ { + k := suffixSlice[i] + if _, ok := current[k]; !ok { + current[k] = make(map[string]any) + } + if currentmap, ok := current[k].(map[string]any); ok { + current = currentmap + } } + current[suffixSlice[len(suffixSlice)-1]] = value } 
} } + + reportOnKey(v, beatsMetrics) +} + +func reportOnKey(v monitoring.Visitor, m map[string]any) { + for key, value := range m { + if valueMap, ok := value.(map[string]any); ok { + v.OnRegistryStart() + v.OnKey(key) + reportOnKey(v, valueMap) + v.OnRegistryFinished() + } + if valueMetric, ok := value.(int64); ok { + monitoring.ReportInt(v, key, valueMetric) + } + } } // Adapt go-docappender's OTel metrics to beats stack monitoring metrics, diff --git a/internal/beater/beater_test.go b/internal/beater/beater_test.go index a07188572c2..f3916f47839 100644 --- a/internal/beater/beater_test.go +++ b/internal/beater/beater_test.go @@ -36,6 +36,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.elastic.co/apm/v2/apmtest" + "go.opentelemetry.io/otel/metric/noop" "go.uber.org/zap" "go.uber.org/zap/zapcore" "go.uber.org/zap/zaptest/observer" @@ -257,7 +258,7 @@ func TestAgentConfigFetcherDeprecation(t *testing.T) { AgentName: "foo", }, }, - }, nil, func(c *elasticsearch.Config) (*elasticsearch.Client, error) { return nil, nil }, nil, logger) + }, nil, func(c *elasticsearch.Config) (*elasticsearch.Client, error) { return nil, nil }, nil, logger, noop.NewMeterProvider()) require.NoError(t, err) all := observed.All() diff --git a/internal/beater/interceptors/metrics.go b/internal/beater/interceptors/metrics.go index b12c2a0eb6c..c49d3039448 100644 --- a/internal/beater/interceptors/metrics.go +++ b/internal/beater/interceptors/metrics.go @@ -58,6 +58,10 @@ func (m *metricsInterceptor) Interceptor() grpc.UnaryServerInterceptor { legacyMetricsPrefix = "apm-server.otlp.grpc.traces." case "/opentelemetry.proto.collector.logs.v1.LogsService/Export": legacyMetricsPrefix = "apm-server.otlp.grpc.logs." + case "/jaeger.api_v2.CollectorService/PostSpans": + legacyMetricsPrefix = "apm-server.jaeger.grpc.collect." + case "/jaeger.api_v2.SamplingManager/GetSamplingStrategy": + legacyMetricsPrefix = "apm-server.jaeger.grpc.sampling." 
default: m.logger.With( "grpc.request.method", info.FullMethod, diff --git a/internal/beater/jaeger/grpc.go b/internal/beater/jaeger/grpc.go index f0c722d56e2..8bc877fe66f 100644 --- a/internal/beater/jaeger/grpc.go +++ b/internal/beater/jaeger/grpc.go @@ -28,31 +28,18 @@ import ( "github.com/jaegertracing/jaeger/proto-gen/api_v2" jaegertranslator "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/jaeger" "go.opentelemetry.io/collector/consumer" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "github.com/elastic/elastic-agent-libs/monitoring" - "github.com/elastic/apm-data/input" "github.com/elastic/apm-data/input/otlp" "github.com/elastic/apm-data/model/modelpb" "github.com/elastic/apm-server/internal/agentcfg" "github.com/elastic/apm-server/internal/beater/auth" "github.com/elastic/apm-server/internal/beater/config" - "github.com/elastic/apm-server/internal/beater/request" -) - -var ( - gRPCCollectorRegistry = monitoring.Default.NewRegistry("apm-server.jaeger.grpc.collect") - gRPCCollectorMonitoringMap monitoringMap = request.MonitoringMapForRegistry( - gRPCCollectorRegistry, append(request.DefaultResultIDs, - request.IDResponseErrorsRateLimit, - request.IDResponseErrorsTimeout, - request.IDResponseErrorsUnauthorized, - ), - ) ) const ( @@ -70,6 +57,7 @@ func RegisterGRPCServices( processor modelpb.BatchProcessor, fetcher agentcfg.Fetcher, semaphore input.Semaphore, + mp metric.MeterProvider, ) { traceConsumer := otlp.NewConsumer(otlp.ConsumerConfig{ Processor: processor, @@ -118,12 +106,6 @@ func (c *grpcCollector) AuthenticateUnaryCall( return authenticator.Authenticate(ctx, kind, token) } -// MonitoringMap returns the request metrics registry for this service, -// to support interceptors.Metrics. 
-func (c *grpcCollector) RequestMetrics(fullMethodName string) map[request.ResultID]*monitoring.Int { - return gRPCCollectorMonitoringMap -} - // PostSpans implements the api_v2/collector.proto. It converts spans received via Jaeger Proto batch to open-telemetry // TraceData and passes them on to the internal Consumer taking care of converting into Elastic APM format. // The implementation of the protobuf contract is based on the open-telemetry implementation at @@ -141,8 +123,6 @@ func (c *grpcCollector) PostSpans(ctx context.Context, r *api_v2.PostSpansReques } func (c *grpcCollector) postSpans(ctx context.Context, batch jaegermodel.Batch) error { - spanCount := int64(len(batch.Spans)) - gRPCCollectorMonitoringMap.add(request.IDEventReceivedCount, spanCount) traces, err := jaegertranslator.ProtoToTraces([]*jaegermodel.Batch{&batch}) if err != nil { return err @@ -151,11 +131,6 @@ func (c *grpcCollector) postSpans(ctx context.Context, batch jaegermodel.Batch) } var ( - gRPCSamplingRegistry = monitoring.Default.NewRegistry("apm-server.jaeger.grpc.sampling") - gRPCSamplingMonitoringMap monitoringMap = request.MonitoringMapForRegistry( - gRPCSamplingRegistry, append(request.DefaultResultIDs, request.IDEventReceivedCount), - ) - jaegerAgentPrefixes = []string{otlp.AgentNameJaeger} ) @@ -206,19 +181,16 @@ func (s *grpcSampler) fetchSamplingRate(ctx context.Context, service string) (fl } result, err := s.fetcher.Fetch(ctx, query) if err != nil { - gRPCSamplingMonitoringMap.inc(request.IDResponseErrorsServiceUnavailable) return 0, fmt.Errorf("fetching sampling rate failed: %w", err) } if sr, ok := result.Source.Settings[agentcfg.TransactionSamplingRateKey]; ok { srFloat64, err := strconv.ParseFloat(sr, 64) if err != nil { - gRPCSamplingMonitoringMap.inc(request.IDResponseErrorsInternal) return 0, fmt.Errorf("parsing error for sampling rate `%v`: %w", sr, err) } return srFloat64, nil } - gRPCSamplingMonitoringMap.inc(request.IDResponseErrorsNotFound) return 0, 
fmt.Errorf("no sampling rate found for %v", service) } @@ -254,23 +226,3 @@ func (s *grpcSampler) AuthenticateUnaryCall( } return anonymousAuthenticator.Authenticate(ctx, "", "") } - -// MonitoringMap returns the request metrics registry for this service, -// to support interceptors.Metrics. -func (s *grpcSampler) RequestMetrics(fullMethodName string) map[request.ResultID]*monitoring.Int { - return gRPCSamplingMonitoringMap -} - -type monitoringMap map[request.ResultID]*monitoring.Int - -func (m monitoringMap) inc(id request.ResultID) { - if counter, ok := m[id]; ok { - counter.Inc() - } -} - -func (m monitoringMap) add(id request.ResultID, n int64) { - if counter, ok := m[id]; ok { - counter.Add(n) - } -} diff --git a/internal/beater/jaeger/grpc_test.go b/internal/beater/jaeger/grpc_test.go index eeae66f12c2..009e5933d39 100644 --- a/internal/beater/jaeger/grpc_test.go +++ b/internal/beater/jaeger/grpc_test.go @@ -29,6 +29,7 @@ import ( "github.com/stretchr/testify/require" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/ptrace" + "go.opentelemetry.io/otel/metric/noop" "go.uber.org/zap" "go.uber.org/zap/zaptest/observer" "golang.org/x/sync/semaphore" @@ -233,7 +234,7 @@ func newServer(t *testing.T, batchProcessor modelpb.BatchProcessor, agentcfgFetc semaphore := semaphore.NewWeighted(1) core, observedLogs := observer.New(zap.DebugLevel) - RegisterGRPCServices(srv, zap.New(core), batchProcessor, agentcfgFetcher, semaphore) + RegisterGRPCServices(srv, zap.New(core), batchProcessor, agentcfgFetcher, semaphore, noop.NewMeterProvider()) go srv.Serve(lis) t.Cleanup(srv.GracefulStop) diff --git a/internal/beater/server.go b/internal/beater/server.go index 8dd59174bb0..8f7109f3a80 100644 --- a/internal/beater/server.go +++ b/internal/beater/server.go @@ -47,15 +47,10 @@ import ( "github.com/elastic/apm-server/internal/sourcemap" ) -<<<<<<< HEAD var ( - agentcfgMonitoringRegistry = monitoring.Default.NewRegistry("apm-server.agentcfg") - 
agentcfgDeprecationNotice = "deprecation notice: support for passing fleet agent configs will be removed in an upcoming version" ) -======= ->>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) // WrapServerFunc is a function for injecting behaviour into ServerParams // and RunServerFunc. // @@ -209,12 +204,8 @@ func newServer(args ServerParams, listener net.Listener) (server, error) { } } zapLogger := zap.New(args.Logger.Core(), zap.WithCaller(true)) -<<<<<<< HEAD - otlp.RegisterGRPCServices(args.GRPCServer, zapLogger, otlpBatchProcessor, args.Semaphore) - jaeger.RegisterGRPCServices(args.GRPCServer, zapLogger, args.BatchProcessor, args.AgentConfig, args.Semaphore) -======= otlp.RegisterGRPCServices(args.GRPCServer, zapLogger, otlpBatchProcessor, args.Semaphore, args.MeterProvider) ->>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) + jaeger.RegisterGRPCServices(args.GRPCServer, zapLogger, args.BatchProcessor, args.AgentConfig, args.Semaphore, args.MeterProvider) return server{ logger: args.Logger, @@ -253,11 +244,8 @@ func newAgentConfigFetcher( kibanaClient *kibana.Client, newElasticsearchClient func(*elasticsearch.Config) (*elasticsearch.Client, error), tracer *apm.Tracer, -<<<<<<< HEAD logger *logp.Logger, -======= mp metric.MeterProvider, ->>>>>>> 378b60c2 ( Translate otel metrics to libbeat monitoring (#15094)) ) (agentcfg.Fetcher, func(context.Context) error, error) { // Always use ElasticsearchFetcher, and as a fallback, use: // 1. 
no fallback if Elasticsearch is explicitly configured diff --git a/internal/beater/server_test.go b/internal/beater/server_test.go index f48e2239f16..e450f861d4d 100644 --- a/internal/beater/server_test.go +++ b/internal/beater/server_test.go @@ -471,12 +471,6 @@ func TestServerElasticsearchOutput(t *testing.T) { defer elasticsearchServer.Close() defer close(done) - // Pre-create the libbeat registry with some variables that should not - // be reported, as we define our own libbeat metrics registry. - monitoring.Default.Remove("libbeat.whatever") - monitoring.NewInt(monitoring.Default, "libbeat.whatever") - - srv := beatertest.NewServer(t, beatertest.WithConfig(agentconfig.MustNewConfigFrom(map[string]interface{}{ reader := sdkmetric.NewManualReader(sdkmetric.WithTemporalitySelector( func(ik sdkmetric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality diff --git a/systemtest/jaeger_test.go b/systemtest/jaeger_test.go index 8d8b5891a07..12dd650a2f8 100644 --- a/systemtest/jaeger_test.go +++ b/systemtest/jaeger_test.go @@ -20,7 +20,6 @@ package systemtest_test import ( "context" "encoding/json" - "fmt" "net/url" "os" "testing" @@ -30,7 +29,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/tidwall/gjson" - "go.opentelemetry.io/otel" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials" @@ -56,7 +54,6 @@ func TestJaeger(t *testing.T) { }) } - fmt.Fprintf(os.Stdout, "%#v\n", otel.GetMeterProvider()) doc := getBeatsMonitoringStats(t, srv, nil) assert.GreaterOrEqual(t, gjson.GetBytes(doc.RawSource, "beats_stats.metrics.apm-server.jaeger.grpc.collect.request.count").Int(), int64(1)) } diff --git a/x-pack/apm-server/sampling/groups_test.go b/x-pack/apm-server/sampling/groups_test.go index c0db9a60ffb..88238bd5577 100644 --- a/x-pack/apm-server/sampling/groups_test.go +++ b/x-pack/apm-server/sampling/groups_test.go @@ -268,7 +268,7 @@ func 
TestTraceGroupsRemovalConcurrent(t *testing.T) { policies := []Policy{ {SampleRate: 1}, } - groups := newTraceGroups(policies, maxDynamicServices, ingestRateCoefficient) + groups := newTraceGroups(noop.Meter{}, policies, maxDynamicServices, ingestRateCoefficient) wg := sync.WaitGroup{} wg.Add(2)