diff --git a/CHANGELOG.md b/CHANGELOG.md index 0eb2c4e6ef..a7c000e400 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,12 @@ v1.11.0-rc.0 - Update the `prometheus.exporter.process` component to get the `remove_empty_groups` option. (@dehaansa) - Remove unnecessary allocations in `stage.static_labels`. (@kalleep) + +- Add `cache_minimum_ttl` argument to `faro.receiver.sourcemaps` block to optionally specify the duration after which the cache map clears itself if sourcemap was not used for the specified duration (@mateagluhak) + +- Add `cache_error_cleanup_interval` argument to `faro.receiver.sourcemaps` block to specify the duration after which the cached sourcemap errors are removed from the cache (@mateagluhak) + +- Add `cache_cleanup_check_interval` argument to `faro.receiver.sourcemaps` block to specify how often to check if any sourcemaps need to be cleared from the cache (@mateagluhak) - Upgrade `beyla.ebpf` from Beyla version v2.2.5 to v2.5.8 The full list of changes can be found in the [Beyla release notes](https://github.com/grafana/beyla/releases/tag/v2.5.2) (@marctc) diff --git a/docs/sources/reference/components/faro/faro.receiver.md b/docs/sources/reference/components/faro/faro.receiver.md index 4fa987477d..4c7142d3dd 100644 --- a/docs/sources/reference/components/faro/faro.receiver.md +++ b/docs/sources/reference/components/faro/faro.receiver.md @@ -134,11 +134,14 @@ Configuring the `rate` argument determines how fast the bucket refills, and conf The `sourcemaps` block configures how to retrieve sourcemaps. Sourcemaps are then used to transform file and line information from minified code into the file and line information from the original source code. -| Name | Type | Description | Default | Required | -|-------------------------|----------------|--------------------------------------------|---------|----------| -| `download` | `bool` | Whether to download sourcemaps. 
| `true` | no |
-| `download_from_origins` | `list(string)` | Which origins to download sourcemaps from. | `["*"]` | no |
-| `download_timeout` | `duration` | Timeout when downloading sourcemaps. | `"1s"` | no |
+| Name                           | Type           | Description                                                                         | Default | Required |
+| ------------------------------ | -------------- | ----------------------------------------------------------------------------------- | ------- | -------- |
+| `cache_cleanup_check_interval` | `duration`     | How often cached sourcemaps are checked for cleanup.                                 | `"30s"` | no       |
+| `cache_error_cleanup_interval` | `duration`     | Duration after which the download of a sourcemap that previously failed is retried.  | `"1h"`  | no       |
+| `cache_minimum_ttl`            | `duration`     | Duration after which a sourcemap is deleted from the cache if not used.              | `inf`   | no       |
+| `download_from_origins`        | `list(string)` | Which origins to download sourcemaps from.                                           | `["*"]` | no       |
+| `download_timeout`             | `duration`     | Timeout when downloading sourcemaps.                                                 | `"1s"`  | no       |
+| `download`                     | `bool`         | Whether to download sourcemaps.                                                      | `true`  | no       |
 
 When exceptions are sent to the `faro.receiver` component, it can download sourcemaps from the web application.
 You can disable this behavior by setting the `download` argument to `false`.
@@ -151,6 +154,15 @@ The `*` character indicates a wildcard.
 
 By default, sourcemap downloads are subject to a timeout of `"1s"`, specified by the `download_timeout` argument.
 Setting `download_timeout` to `"0s"` disables timeouts.
 
+By default, sourcemaps are held in memory indefinitely.
+You can set `cache_minimum_ttl` to clear sourcemaps that aren't used during the specified duration.
+
+By default, if there's an error while downloading or parsing a sourcemap, the error is cached.
+After the duration specified by `cache_error_cleanup_interval`, all errors are cleared from the cache.
+
+By default, cached sourcemaps are checked for cleanup every 30 seconds.
+You can modify the frequency by setting the `cache_cleanup_check_interval` argument. + To retrieve sourcemaps from disk instead of the network, specify one or more [`location` blocks][location]. When `location` blocks are provided, they're checked first for sourcemaps before falling back to downloading. @@ -209,7 +221,7 @@ The template value is replaced with the release value provided by the [Faro Web * `faro_receiver_request_message_bytes` (histogram): Size (in bytes) of HTTP requests received from clients. * `faro_receiver_response_message_bytes` (histogram): Size (in bytes) of HTTP responses sent to clients. * `faro_receiver_inflight_requests` (gauge): Current number of inflight requests. -* `faro_receiver_sourcemap_cache_size` (counter): Number of items in sourcemap cache per origin. +* `faro_receiver_sourcemap_cache_size` (gauge): Number of items in sourcemap cache per origin. * `faro_receiver_sourcemap_downloads_total` (counter): Total number of sourcemap downloads performed per origin and status. * `faro_receiver_sourcemap_file_reads_total` (counter): Total number of sourcemap retrievals using the filesystem per origin and status. diff --git a/internal/component/faro/receiver/arguments.go b/internal/component/faro/receiver/arguments.go index 232f561a94..2dfd413747 100644 --- a/internal/component/faro/receiver/arguments.go +++ b/internal/component/faro/receiver/arguments.go @@ -3,6 +3,7 @@ package receiver import ( "encoding" "fmt" + "math" "time" "github.com/alecthomas/units" @@ -71,17 +72,23 @@ func (r *RateLimitingArguments) SetToDefault() { // SourceMapsArguments configures how app_agent_receiver will retrieve source // maps for transforming stack traces. 
type SourceMapsArguments struct { - Download bool `alloy:"download,attr,optional"` - DownloadFromOrigins []string `alloy:"download_from_origins,attr,optional"` - DownloadTimeout time.Duration `alloy:"download_timeout,attr,optional"` - Locations []LocationArguments `alloy:"location,block,optional"` + Download bool `alloy:"download,attr,optional"` + DownloadFromOrigins []string `alloy:"download_from_origins,attr,optional"` + DownloadTimeout time.Duration `alloy:"download_timeout,attr,optional"` + CacheMinimumTtl time.Duration `alloy:"cache_minimum_ttl,attr,optional"` + CacheErrorCleanupInterval time.Duration `alloy:"cache_error_cleanup_interval,attr,optional"` + CacheCleanupCheckInterval time.Duration `alloy:"cache_cleanup_check_interval,attr,optional"` + Locations []LocationArguments `alloy:"location,block,optional"` } func (s *SourceMapsArguments) SetToDefault() { *s = SourceMapsArguments{ - Download: true, - DownloadFromOrigins: []string{"*"}, - DownloadTimeout: time.Second, + Download: true, + DownloadFromOrigins: []string{"*"}, + DownloadTimeout: time.Second, + CacheErrorCleanupInterval: time.Hour, + CacheMinimumTtl: time.Duration(math.MaxInt64), + CacheCleanupCheckInterval: time.Second * 30, } } diff --git a/internal/component/faro/receiver/receiver.go b/internal/component/faro/receiver/receiver.go index aebab7ebce..a136315f9a 100644 --- a/internal/component/faro/receiver/receiver.go +++ b/internal/component/faro/receiver/receiver.go @@ -25,6 +25,11 @@ func init() { }) } +type cleanupRoutines struct { + cancel context.CancelFunc + wg sync.WaitGroup +} + type Component struct { log log.Logger handler *handler @@ -35,6 +40,9 @@ type Component struct { argsMut sync.RWMutex args Arguments + cleanupMut sync.Mutex + cleanup *cleanupRoutines + metrics *metricsExporter logs *logsExporter traces *tracesExporter @@ -93,6 +101,7 @@ func (c *Component) Run(ctx context.Context) error { if cancelCurrentActor != nil { cancelCurrentActor() } + c.stopCleanup() }() for { @@ 
-131,13 +140,17 @@ func (c *Component) Update(args component.Arguments) error { c.handler.Update(newArgs.Server) - c.lazySourceMaps.SetInner(newSourceMapsStore( + innerStore := newSourceMapsStore( log.With(c.log, "subcomponent", "handler"), newArgs.SourceMaps, c.sourceMapsMetrics, nil, // Use default HTTP client. nil, // Use default FS implementation. - )) + ) + c.lazySourceMaps.SetInner(innerStore) + + c.stopCleanup() + c.startCleanup(newArgs, innerStore) c.logs.SetReceivers(newArgs.Output.Logs) c.traces.SetConsumers(newArgs.Output.Traces) @@ -232,3 +245,59 @@ func (vs *varSourceMapsStore) SetInner(inner sourceMapsStore) { vs.inner = inner } + +func (c *Component) stopCleanup() { + c.cleanupMut.Lock() + defer c.cleanupMut.Unlock() + if c.cleanup != nil { + c.cleanup.cancel() // signal goroutines to exit + c.cleanup.wg.Wait() // wait for them + c.cleanup = nil + } +} + +func (c *Component) startCleanup(args Arguments, s *sourceMapsStoreImpl) { + c.cleanupMut.Lock() + defer c.cleanupMut.Unlock() + + cleanupCtx, cleanupCancel := context.WithCancel(context.Background()) + cr := &cleanupRoutines{cancel: cleanupCancel} + + if d := args.SourceMaps.CacheCleanupCheckInterval; d > 0 { + cr.wg.Add(1) + go func(interval time.Duration) { + defer cr.wg.Done() + s.CleanOldCacheEntries() + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-cleanupCtx.Done(): + return + case <-ticker.C: + s.CleanOldCacheEntries() + } + } + }(d) + } + + if d := args.SourceMaps.CacheErrorCleanupInterval; d > 0 { + cr.wg.Add(1) + go func(interval time.Duration) { + defer cr.wg.Done() + s.CleanCachedErrors() + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-cleanupCtx.Done(): + return + case <-ticker.C: + s.CleanCachedErrors() + } + } + }(d) + } + + c.cleanup = cr +} diff --git a/internal/component/faro/receiver/sourcemaps.go b/internal/component/faro/receiver/sourcemaps.go index 48533a8a3b..293e83f7ff 100644 --- 
a/internal/component/faro/receiver/sourcemaps.go +++ b/internal/component/faro/receiver/sourcemaps.go @@ -13,6 +13,7 @@ import ( "strings" "sync" "text/template" + "time" "github.com/go-kit/log" "github.com/go-sourcemap/sourcemap" @@ -67,14 +68,14 @@ func (fs osFileService) ReadFile(name string) ([]byte, error) { } type sourceMapMetrics struct { - cacheSize *prometheus.CounterVec + cacheSize *prometheus.GaugeVec downloads *prometheus.CounterVec fileReads *prometheus.CounterVec } func newSourceMapMetrics(reg prometheus.Registerer) *sourceMapMetrics { m := &sourceMapMetrics{ - cacheSize: prometheus.NewCounterVec(prometheus.CounterOpts{ + cacheSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "faro_receiver_sourcemap_cache_size", Help: "number of items in source map cache, per origin", }, []string{"origin"}), @@ -88,7 +89,7 @@ func newSourceMapMetrics(reg prometheus.Registerer) *sourceMapMetrics { }, []string{"origin", "status"}), } - m.cacheSize = util.MustRegisterOrGet(reg, m.cacheSize).(*prometheus.CounterVec) + m.cacheSize = util.MustRegisterOrGet(reg, m.cacheSize).(*prometheus.GaugeVec) m.downloads = util.MustRegisterOrGet(reg, m.downloads).(*prometheus.CounterVec) m.fileReads = util.MustRegisterOrGet(reg, m.fileReads).(*prometheus.CounterVec) return m @@ -99,6 +100,16 @@ type sourcemapFileLocation struct { pathTemplate *template.Template } +type timeSource interface { + Now() time.Time +} + +type realTimeSource struct{} + +func (realTimeSource) Now() time.Time { + return time.Now() +} + type sourceMapsStoreImpl struct { log log.Logger cli httpClient @@ -107,8 +118,14 @@ type sourceMapsStoreImpl struct { metrics *sourceMapMetrics locs []*sourcemapFileLocation - cacheMut sync.Mutex - cache map[string]*sourcemap.Consumer + cacheMut sync.Mutex + cache map[string]*cachedSourceMap + timeSource timeSource +} + +type cachedSourceMap struct { + consumer *sourcemap.Consumer + lastUsed time.Time } // newSourceMapStore creates an implementation of sourceMapsStore. 
The returned @@ -141,27 +158,29 @@ func newSourceMapsStore(log log.Logger, args SourceMapsArguments, metrics *sourc } return &sourceMapsStoreImpl{ - log: log, - cli: cli, - fs: fs, - args: args, - cache: make(map[string]*sourcemap.Consumer), - metrics: metrics, - locs: locs, + log: log, + cli: cli, + fs: fs, + args: args, + cache: make(map[string]*cachedSourceMap), + metrics: metrics, + locs: locs, + timeSource: realTimeSource{}, } } func (store *sourceMapsStoreImpl) GetSourceMap(sourceURL string, release string) (*sourcemap.Consumer, error) { - // TODO(rfratto): GetSourceMap is weak to transient errors, since it always - // caches the result, even when there's an error. This means that transient - // errors will be cached forever, preventing source maps from being retrieved. store.cacheMut.Lock() defer store.cacheMut.Unlock() cacheKey := fmt.Sprintf("%s__%s", sourceURL, release) - if sm, ok := store.cache[cacheKey]; ok { - return sm, nil + if cached, ok := store.cache[cacheKey]; ok { + if cached != nil { + cached.lastUsed = store.timeSource.Now() + return cached.consumer, nil + } + return nil, nil } content, sourceMapURL, err := store.getSourceMapContent(sourceURL, release) @@ -177,11 +196,39 @@ func (store *sourceMapsStoreImpl) GetSourceMap(sourceURL string, release string) return nil, err } level.Info(store.log).Log("msg", "successfully parsed source map", "url", sourceMapURL, "release", release) - store.cache[cacheKey] = consumer + store.cache[cacheKey] = &cachedSourceMap{ + consumer: consumer, + lastUsed: store.timeSource.Now(), + } store.metrics.cacheSize.WithLabelValues(getOrigin(sourceURL)).Inc() return consumer, nil } +func (store *sourceMapsStoreImpl) CleanOldCacheEntries() { + store.cacheMut.Lock() + defer store.cacheMut.Unlock() + + for key, cached := range store.cache { + if cached != nil && cached.lastUsed.Before(store.timeSource.Now().Add(-store.args.CacheMinimumTtl)) { + srcUrl := strings.SplitN(key, "__", 2)[0] + origin := getOrigin(srcUrl) + 
store.metrics.cacheSize.WithLabelValues(origin).Dec()
+			delete(store.cache, key)
+		}
+	}
+}
+
+func (store *sourceMapsStoreImpl) CleanCachedErrors() {
+	store.cacheMut.Lock()
+	defer store.cacheMut.Unlock()
+
+	for key, cached := range store.cache {
+		if cached == nil {
+			delete(store.cache, key)
+		}
+	}
+}
+
 func (store *sourceMapsStoreImpl) getSourceMapContent(sourceURL string, release string) (content []byte, sourceMapURL string, err error) {
 	// Attempt to find the source map in the filesystem first.
 	for _, loc := range store.locs {
diff --git a/internal/component/faro/receiver/sourcemaps_test.go b/internal/component/faro/receiver/sourcemaps_test.go
index c4ed78f03e..a1508c12e6 100644
--- a/internal/component/faro/receiver/sourcemaps_test.go
+++ b/internal/component/faro/receiver/sourcemaps_test.go
@@ -9,13 +9,24 @@ import (
 	"os"
 	"path/filepath"
 	"testing"
+	"time"
 
 	"github.com/grafana/alloy/internal/component/faro/receiver/internal/payload"
 	alloyutil "github.com/grafana/alloy/internal/util"
+	util "github.com/grafana/alloy/internal/util"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/stretchr/testify/require"
 )
 
+// mockTimeSource is a test helper for controlling time.
+type mockTimeSource struct { + now time.Time +} + +func (m *mockTimeSource) Now() time.Time { + return m.now +} + func Test_sourceMapsStoreImpl_DownloadSuccess(t *testing.T) { var ( logger = alloyutil.TestLogger(t) @@ -604,6 +615,150 @@ func Test_sourceMapsStoreImpl_RealWorldPathValidation(t *testing.T) { require.Empty(t, fileService.reads, "should not read file when stat fails") } +func TestSourceMapsStoreImpl_CleanCachedErrors(t *testing.T) { + tt := []struct { + name string + cache map[string]*cachedSourceMap + expectedCacheSize int + }{ + { + name: "should remove cached error", + cache: map[string]*cachedSourceMap{ + "http://shouldRemoveCachedErrors.com__v1": nil, + }, + expectedCacheSize: 0, + }, + { + name: "should not remove from map if no errors", + cache: map[string]*cachedSourceMap{ + "http://shouldNotRemoveFromCache.com__v2": {}, + }, + expectedCacheSize: 1, + }, + { + name: "should not remove from map if no errors", + cache: map[string]*cachedSourceMap{ + "http://shouldNotRemoveFromCache.com__v1": {}, + "http://shouldNotRemoveFromCache.com__v2": {}, + }, + expectedCacheSize: 2, + }, + { + name: "should remove only cached errors", + cache: map[string]*cachedSourceMap{ + "http://shouldNotRemoveFromCache.com__v1": nil, + "http://shouldNotRemoveFromCache.com__v2": {}, + }, + expectedCacheSize: 1, + }, + } + + logger := util.TestLogger(t) + + for _, tc := range tt { + + reg := prometheus.NewRegistry() + metrics := newSourceMapMetrics(reg) + + store := &sourceMapsStoreImpl{ + log: logger, + args: SourceMapsArguments{CacheMinimumTtl: 5 * time.Minute}, + metrics: metrics, + cli: &mockHTTPClient{}, + fs: newTestFileService(), + cache: tc.cache, + timeSource: &mockTimeSource{now: time.Now()}, + } + + t.Run(tc.name, func(t *testing.T) { + store.CleanCachedErrors() + require.Equal(t, tc.expectedCacheSize, len(store.cache)) + }) + } +} + +func TestSourceMapsStoreImpl_CleanOldCachedEntries(t *testing.T) { + tt := []struct { + name string + cache 
map[string]*cachedSourceMap + timeSource *mockTimeSource + cacheTimeout time.Duration + expectedCacheSize int + }{ + { + name: "should clear entry from cache if too old", + cache: map[string]*cachedSourceMap{ + "http://shouldRemoveCachedErrors.com__v1": {lastUsed: time.Now()}, + }, + timeSource: &mockTimeSource{now: time.Now().Add(5 * time.Minute)}, + cacheTimeout: 5 * time.Minute, + expectedCacheSize: 0, + }, + { + name: "should not clear entry from cache if not too old", + cache: map[string]*cachedSourceMap{ + "http://shouldRemoveCachedErrors.com__v1": {lastUsed: time.Now()}, + }, + timeSource: &mockTimeSource{now: time.Now().Add(3 * time.Minute)}, + cacheTimeout: 5 * time.Minute, + expectedCacheSize: 1, + }, + { + name: "should clear only old entries from cache", + cache: map[string]*cachedSourceMap{ + "http://shouldRemoveCachedErrors.com__v1": {lastUsed: time.Now()}, + "http://shouldRemoveCachedErrors.com__v2": {lastUsed: time.Now().Add(-5 * time.Minute)}, + }, + timeSource: &mockTimeSource{now: time.Now()}, + cacheTimeout: 5 * time.Minute, + expectedCacheSize: 1, + }, + { + name: "should not clear multiple entries", + cache: map[string]*cachedSourceMap{ + "http://shouldRemoveCachedErrors.com__v1": {lastUsed: time.Now().Add(3 * time.Minute)}, + "http://shouldRemoveCachedErrors.com__v2": {lastUsed: time.Now().Add(4 * time.Minute)}, + }, + timeSource: &mockTimeSource{now: time.Now()}, + cacheTimeout: 5 * time.Minute, + expectedCacheSize: 2, + }, + { + name: "should clear multiple old entries from cache", + cache: map[string]*cachedSourceMap{ + "http://shouldRemoveCachedErrors.com__v1": {lastUsed: time.Now().Add(-10 * time.Minute)}, + "http://shouldRemoveCachedErrors.com__v2": {lastUsed: time.Now().Add(-7 * time.Minute)}, + }, + timeSource: &mockTimeSource{now: time.Now()}, + cacheTimeout: 5 * time.Minute, + expectedCacheSize: 0, + }, + } + + logger := util.TestLogger(t) + + for _, tc := range tt { + + reg := prometheus.NewRegistry() + metrics := 
newSourceMapMetrics(reg) + + store := &sourceMapsStoreImpl{ + log: logger, + args: SourceMapsArguments{CacheMinimumTtl: tc.cacheTimeout}, + metrics: metrics, + cli: &mockHTTPClient{}, + fs: newTestFileService(), + cache: tc.cache, + timeSource: tc.timeSource, + } + + t.Run(tc.name, func(t *testing.T) { + store.CleanOldCacheEntries() + require.Equal(t, tc.expectedCacheSize, len(store.cache)) + }) + } +} + type mockHTTPClient struct { responses []struct { *http.Response @@ -695,4 +850,5 @@ func newTestFileService() *testFileService { stats: make([]string, 0), reads: make([]string, 0), } + }