From f4b0759ead76932948b0e7530245973287e79fe4 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Tue, 26 Apr 2022 15:08:04 +0200 Subject: [PATCH 01/20] Move Data sending goroutine --- apm-lambda-extension/extension/apm_server.go | 23 +++++++++++++++++++- apm-lambda-extension/main.go | 19 +--------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index 5c61806c..f45b96ed 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -63,6 +63,27 @@ var ApmServerTransportState = ApmServerTransportStateType{ ReconnectionCount: -1, } +func StartBackgroundSending(ctx context.Context, agentDataChannel chan AgentData, client *http.Client, config *extensionConfig, funcDone chan struct{}, backgroundDataSendWg *sync.WaitGroup) { + go func() { + defer backgroundDataSendWg.Done() + if !IsTransportStatusHealthyOrPending() { + return + } + for { + select { + case <-funcDone: + Log.Debug("Received signal that function has completed, not processing any more agent data") + return + case agentData := <-agentDataChannel: + if err := PostToApmServer(client, agentData, config, ctx); err != nil { + Log.Errorf("Error sending to APM server, skipping: %v", err) + return + } + } + } + }() +} + // PostToApmServer takes a chunk of APM agent data and posts it to the APM server. // // The function compresses the APM agent data, if it's not already compressed. @@ -135,7 +156,7 @@ func PostToApmServer(client *http.Client, agentData AgentData, config *extension return nil } -// IsTransportStatusHealthyOrPending returns true if the APM server transport status is +// IsTransportStatusHealthyOrPending returns true if the APM server transport status is // healthy or pending, and false otherwise. // // This function is public for use in tests. diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index d5da6429..59a49b3a 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -144,24 +144,7 @@ func main() { // Stop checking for, and sending agent data when the function invocation // has completed, signaled via a channel. backgroundDataSendWg.Add(1) - go func() { - defer backgroundDataSendWg.Done() - if !extension.IsTransportStatusHealthyOrPending() { - return - } - for { - select { - case <-funcDone: - extension.Log.Debug("Received signal that function has completed, not processing any more agent data") - return - case agentData := <-agentDataChannel: - if err := extension.PostToApmServer(client, agentData, config, ctx); err != nil { - extension.Log.Errorf("Error sending to APM server, skipping: %v", err) - return - } - } - } - }() + extension.StartBackgroundSending(ctx, agentDataChannel, client, config, funcDone, &backgroundDataSendWg) // Receive Logs API events // Send to the runtimeDoneSignal channel to signal when a runtimeDone event is received From 43ca2824994e907f710965bd46116c382e2df528 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Tue, 26 Apr 2022 17:04:45 +0200 Subject: [PATCH 02/20] Refactor Transport State Struct --- .../e2e-testing/.e2e_test_config | 2 +- apm-lambda-extension/extension/apm_server.go | 105 +++++----- .../extension/apm_server_test.go | 198 +++++++++--------- apm-lambda-extension/extension/client.go | 4 +- apm-lambda-extension/extension/http_server.go | 11 +- .../extension/http_server_test.go | 29 ++- .../extension/process_events.go | 12 +- .../extension/route_handlers.go | 15 +- apm-lambda-extension/main.go | 19 +- apm-lambda-extension/main_test.go | 5 - 10 files changed, 197 insertions(+), 203 deletions(-) diff --git a/apm-lambda-extension/e2e-testing/.e2e_test_config b/apm-lambda-extension/e2e-testing/.e2e_test_config index 201df541..396d6d47 100644 --- a/apm-lambda-extension/e2e-testing/.e2e_test_config +++ b/apm-lambda-extension/e2e-testing/.e2e_test_config @@ -1,2 +1,2 @@ -RUN_E2E_TESTS=false +RUN_E2E_TESTS=true ELASTIC_APM_LOG_LEVEL=info \ No newline at end of file diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index f45b96ed..a4144f39 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -36,10 +36,10 @@ var bufferPool = sync.Pool{New: func() interface{} { return &bytes.Buffer{} }} -type ApmServerTransportStatusType string - // Constants for the state of the transport used in // the backoff implementation. +type ApmServerTransportStatusType string + const ( Failing ApmServerTransportStatusType = "Failing" Pending ApmServerTransportStatusType = "Pending" @@ -48,25 +48,36 @@ const ( // A struct to track the state and status of sending // to the APM server. Used in the backoff implementation. -type ApmServerTransportStateType struct { +type ApmServerTransport struct { + sync.Pool sync.Mutex + ctx context.Context + config *extensionConfig + DataChannel chan AgentData + Client *http.Client Status ApmServerTransportStatusType ReconnectionCount int GracePeriodTimer *time.Timer } -// The status of transport to the APM server. -// -// This instance of the ApmServerTransportStateType is public for use in tests. -var ApmServerTransportState = ApmServerTransportStateType{ - Status: Healthy, - ReconnectionCount: -1, +func InitApmServerTransport(ctx context.Context, config *extensionConfig) *ApmServerTransport { + var transport ApmServerTransport + transport.DataChannel = make(chan AgentData, 100) + transport.Client = &http.Client{ + Timeout: time.Duration(config.DataForwarderTimeoutSeconds) * time.Second, + Transport: http.DefaultTransport.(*http.Transport).Clone(), + } + transport.config = config + transport.ctx = ctx + transport.Status = Healthy + transport.ReconnectionCount = -1 + return &transport } -func StartBackgroundSending(ctx context.Context, agentDataChannel chan AgentData, client *http.Client, config *extensionConfig, funcDone chan struct{}, backgroundDataSendWg *sync.WaitGroup) { +func StartBackgroundSending(transport *ApmServerTransport, funcDone chan struct{}, backgroundDataSendWg *sync.WaitGroup) { go func() { defer backgroundDataSendWg.Done() - if !IsTransportStatusHealthyOrPending() { + if transport.Status == Failing { return } for { @@ -74,8 +85,8 @@ func StartBackgroundSending(ctx context.Context, agentDataChannel chan AgentData case <-funcDone: Log.Debug("Received signal that function has completed, not processing any more agent data") return - case agentData := <-agentDataChannel: - if err := PostToApmServer(client, agentData, config, ctx); err != nil { + case agentData := <-transport.DataChannel: + if err := PostToApmServer(transport, agentData); err != nil { Log.Errorf("Error sending to APM server, skipping: %v", err) return } @@ -89,10 +100,10 @@ func StartBackgroundSending(ctx context.Context, agentDataChannel chan AgentData // The function compresses the APM agent data, if it's not already compressed. // It sets the APM transport status to failing upon errors, as part of the backoff // strategy. -func PostToApmServer(client *http.Client, agentData AgentData, config *extensionConfig, ctx context.Context) error { +func PostToApmServer(transport *ApmServerTransport, agentData AgentData) error { // todo: can this be a streaming or streaming style call that keeps the // connection open across invocations? - if !IsTransportStatusHealthyOrPending() { + if transport.Status == Failing { return errors.New("transport status is unhealthy") } @@ -122,22 +133,22 @@ func PostToApmServer(client *http.Client, agentData AgentData, config *extension r = buf } - req, err := http.NewRequest("POST", config.apmServerUrl+endpointURI, r) + req, err := http.NewRequest("POST", transport.config.apmServerUrl+endpointURI, r) if err != nil { return fmt.Errorf("failed to create a new request when posting to APM server: %v", err) } req.Header.Add("Content-Encoding", encoding) req.Header.Add("Content-Type", "application/x-ndjson") - if config.apmServerApiKey != "" { - req.Header.Add("Authorization", "ApiKey "+config.apmServerApiKey) - } else if config.apmServerSecretToken != "" { - req.Header.Add("Authorization", "Bearer "+config.apmServerSecretToken) + if transport.config.apmServerApiKey != "" { + req.Header.Add("Authorization", "ApiKey "+transport.config.apmServerApiKey) + } else if transport.config.apmServerSecretToken != "" { + req.Header.Add("Authorization", "Bearer "+transport.config.apmServerSecretToken) } Log.Debug("Sending data chunk to APM Server") - resp, err := client.Do(req) + resp, err := transport.Client.Do(req) if err != nil { - SetApmServerTransportState(Failing, ctx) + SetApmServerTransportState(transport, Failing) return fmt.Errorf("failed to post to APM server: %v", err) } @@ -145,25 +156,17 @@ func PostToApmServer(client *http.Client, agentData AgentData, config *extension defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { - SetApmServerTransportState(Failing, ctx) + SetApmServerTransportState(transport, Failing) return fmt.Errorf("failed to read the response body after posting to the APM server") } - SetApmServerTransportState(Healthy, ctx) + SetApmServerTransportState(transport, Healthy) Log.Debug("Transport status set to healthy") Log.Debugf("APM server response body: %v", string(body)) Log.Debugf("APM server response status code: %v", resp.StatusCode) return nil } -// IsTransportStatusHealthyOrPending returns true if the APM server transport status is -// healthy or pending, and false otherwise. -// -// This function is public for use in tests. -func IsTransportStatusHealthyOrPending() bool { - return ApmServerTransportState.Status != Failing -} - // SetApmServerTransportState takes a state of the APM server transport and updates // the current state of the transport. For a change to a failing state, the grace period // is calculated and a go routine is started that waits for that period to complete @@ -171,31 +174,31 @@ func IsTransportStatusHealthyOrPending() bool { // to the APM server. // // This function is public for use in tests. -func SetApmServerTransportState(status ApmServerTransportStatusType, ctx context.Context) { +func SetApmServerTransportState(transport *ApmServerTransport, status ApmServerTransportStatusType) { switch status { case Healthy: - ApmServerTransportState.Lock() - ApmServerTransportState.Status = status - Log.Debugf("APM Server Transport status set to %s", status) - ApmServerTransportState.ReconnectionCount = -1 - ApmServerTransportState.Unlock() + transport.Lock() + transport.Status = status + Log.Debugf("APM Server Transport status set to %s", transport.Status) + transport.ReconnectionCount = -1 + transport.Unlock() case Failing: - ApmServerTransportState.Lock() - ApmServerTransportState.Status = status - Log.Debugf("APM Server Transport status set to %s", status) - ApmServerTransportState.ReconnectionCount++ - ApmServerTransportState.GracePeriodTimer = time.NewTimer(computeGracePeriod()) - Log.Debugf("Grace period entered, reconnection count : %d", ApmServerTransportState.ReconnectionCount) + transport.Lock() + transport.Status = status + Log.Debugf("APM Server Transport status set to %s", transport.Status) + transport.ReconnectionCount++ + transport.GracePeriodTimer = time.NewTimer(computeGracePeriod(transport)) + Log.Debugf("Grace period entered, reconnection count : %d", transport.ReconnectionCount) go func() { select { - case <-ApmServerTransportState.GracePeriodTimer.C: + case <-transport.GracePeriodTimer.C: Log.Debug("Grace period over - timer timed out") - case <-ctx.Done(): + case <-transport.ctx.Done(): Log.Debug("Grace period over - context done") } - ApmServerTransportState.Status = Pending - Log.Debugf("APM Server Transport status set to %s", status) - ApmServerTransportState.Unlock() + transport.Status = Pending + Log.Debugf("APM Server Transport status set to %s", transport.Status) + transport.Unlock() }() default: Log.Errorf("Cannot set APM Server Transport status to %s", status) @@ -203,8 +206,8 @@ func SetApmServerTransportState(status ApmServerTransportStatusType, ctx context } // ComputeGracePeriod https://github.com/elastic/apm/blob/main/specs/agents/transport.md#transport-errors -func computeGracePeriod() time.Duration { - gracePeriodWithoutJitter := math.Pow(math.Min(float64(ApmServerTransportState.ReconnectionCount), 6), 2) +func computeGracePeriod(transport *ApmServerTransport) time.Duration { + gracePeriodWithoutJitter := math.Pow(math.Min(float64(transport.ReconnectionCount), 6), 2) jitter := rand.Float64()/5 - 0.1 return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second)) } diff --git a/apm-lambda-extension/extension/apm_server_test.go b/apm-lambda-extension/extension/apm_server_test.go index df7b2ac5..7d03c8c1 100644 --- a/apm-lambda-extension/extension/apm_server_test.go +++ b/apm-lambda-extension/extension/apm_server_test.go @@ -29,6 +29,7 @@ import ( ) func TestPostToApmServerDataCompressed(t *testing.T) { + s := "A long time ago in a galaxy far, far away..." // Compress the data @@ -68,8 +69,8 @@ func TestPostToApmServerDataCompressed(t *testing.T) { config := extensionConfig{ apmServerUrl: apmServer.URL + "/", } - - err := PostToApmServer(apmServer.Client(), agentData, &config, context.Background()) + transport := InitApmServerTransport(context.Background(), &config) + err := PostToApmServer(transport, agentData) assert.Equal(t, nil, err) } @@ -113,88 +114,94 @@ func TestPostToApmServerDataNotCompressed(t *testing.T) { config := extensionConfig{ apmServerUrl: apmServer.URL + "/", } - - err := PostToApmServer(apmServer.Client(), agentData, &config, context.Background()) + transport := InitApmServerTransport(context.Background(), &config) + err := PostToApmServer(transport, agentData) assert.Equal(t, nil, err) } func TestGracePeriod(t *testing.T) { - ApmServerTransportState.ReconnectionCount = 0 - val0 := computeGracePeriod().Seconds() + transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + + transport.ReconnectionCount = 0 + val0 := computeGracePeriod(transport).Seconds() assert.Equal(t, val0, float64(0)) - ApmServerTransportState.ReconnectionCount = 1 - val1 := computeGracePeriod().Seconds() + transport.ReconnectionCount = 1 + val1 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val1, float64(1), 0.1*1) - ApmServerTransportState.ReconnectionCount = 2 - val2 := computeGracePeriod().Seconds() + transport.ReconnectionCount = 2 + val2 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val2, float64(4), 0.1*4) - ApmServerTransportState.ReconnectionCount = 3 - val3 := computeGracePeriod().Seconds() + transport.ReconnectionCount = 3 + val3 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val3, float64(9), 0.1*9) - ApmServerTransportState.ReconnectionCount = 4 - val4 := computeGracePeriod().Seconds() + transport.ReconnectionCount = 4 + val4 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val4, float64(16), 0.1*16) - ApmServerTransportState.ReconnectionCount = 5 - val5 := computeGracePeriod().Seconds() + transport.ReconnectionCount = 5 + val5 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val5, float64(25), 0.1*25) - ApmServerTransportState.ReconnectionCount = 6 - val6 := computeGracePeriod().Seconds() + transport.ReconnectionCount = 6 + val6 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val6, float64(36), 0.1*36) - ApmServerTransportState.ReconnectionCount = 7 - val7 := computeGracePeriod().Seconds() + transport.ReconnectionCount = 7 + val7 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val7, float64(36), 0.1*36) } func TestSetHealthyTransport(t *testing.T) { - SetApmServerTransportState(Healthy, context.Background()) - assert.True(t, ApmServerTransportState.Status == Healthy) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, -1) + transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + SetApmServerTransportState(transport, Healthy) + assert.True(t, transport.Status == Healthy) + assert.Equal(t, transport.ReconnectionCount, -1) } func TestSetFailingTransport(t *testing.T) { // By explicitly setting the reconnection count to 0, we ensure that the grace period will not be 0 // and avoid a race between reaching the pending status and the test assertion. - ApmServerTransportState.ReconnectionCount = 0 - SetApmServerTransportState(Failing, context.Background()) - assert.True(t, ApmServerTransportState.Status == Failing) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, 1) + transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + transport.ReconnectionCount = 0 + SetApmServerTransportState(transport, Failing) + assert.True(t, transport.Status == Failing) + assert.Equal(t, transport.ReconnectionCount, 1) } func TestSetPendingTransport(t *testing.T) { - SetApmServerTransportState(Healthy, context.Background()) - SetApmServerTransportState(Failing, context.Background()) + transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + SetApmServerTransportState(transport, Healthy) + SetApmServerTransportState(transport, Failing) for { - if IsTransportStatusHealthyOrPending() { + if transport.Status != Failing { break } } - assert.True(t, ApmServerTransportState.Status == Pending) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, 0) + assert.True(t, transport.Status == Pending) + assert.Equal(t, transport.ReconnectionCount, 0) } func TestSetPendingTransportExplicitly(t *testing.T) { - SetApmServerTransportState(Healthy, context.Background()) - SetApmServerTransportState(Pending, context.Background()) - assert.True(t, ApmServerTransportState.Status == Healthy) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, -1) + transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + SetApmServerTransportState(transport, Healthy) + SetApmServerTransportState(transport, Pending) + assert.True(t, transport.Status == Healthy) + assert.Equal(t, transport.ReconnectionCount, -1) } func TestSetInvalidTransport(t *testing.T) { - SetApmServerTransportState(Healthy, context.Background()) - SetApmServerTransportState("Invalid", context.Background()) - assert.True(t, ApmServerTransportState.Status == Healthy) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, -1) + transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + SetApmServerTransportState(transport, Healthy) + SetApmServerTransportState(transport, "Invalid") + assert.True(t, transport.Status == Healthy) + assert.Equal(t, transport.ReconnectionCount, -1) } func TestEnterBackoffFromHealthy(t *testing.T) { - SetApmServerTransportState(Healthy, context.Background()) // Compress the data pr, pw := io.Pipe() gw, _ := gzip.NewWriterLevel(pw, gzip.BestSpeed) @@ -226,31 +233,24 @@ func TestEnterBackoffFromHealthy(t *testing.T) { return } })) - // Close the APM server early so that POST requests fail and that backoff is enabled - apmServer.Close() - config := extensionConfig{ apmServerUrl: apmServer.URL + "/", } + transport := InitApmServerTransport(context.Background(), &config) + SetApmServerTransportState(transport, Healthy) - if err := PostToApmServer(apmServer.Client(), agentData, &config, context.Background()); err != nil { + // Close the APM server early so that POST requests fail and that backoff is enabled + apmServer.Close() + + if err := PostToApmServer(transport, agentData); err != nil { return } // No way to know for sure if failing or pending (0 sec grace period) - assert.True(t, ApmServerTransportState.Status != Healthy) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, 0) + assert.True(t, transport.Status != Healthy) + assert.Equal(t, transport.ReconnectionCount, 0) } func TestEnterBackoffFromFailing(t *testing.T) { - SetApmServerTransportState(Healthy, context.Background()) - SetApmServerTransportState(Failing, context.Background()) - for { - if IsTransportStatusHealthyOrPending() { - break - } - } - assert.Equal(t, ApmServerTransportState.Status, Pending) - // Compress the data pr, pw := io.Pipe() gw, _ := gzip.NewWriterLevel(pw, gzip.BestSpeed) @@ -290,21 +290,22 @@ func TestEnterBackoffFromFailing(t *testing.T) { apmServerUrl: apmServer.URL + "/", } - assert.Error(t, PostToApmServer(apmServer.Client(), agentData, &config, context.Background())) - assert.Equal(t, ApmServerTransportState.Status, Failing) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, 1) -} - -func TestAPMServerRecovery(t *testing.T) { - SetApmServerTransportState(Healthy, context.Background()) - SetApmServerTransportState(Failing, context.Background()) + transport := InitApmServerTransport(context.Background(), &config) + SetApmServerTransportState(transport, Healthy) + SetApmServerTransportState(transport, Failing) for { - if IsTransportStatusHealthyOrPending() { + if transport.Status != Failing { break } } - assert.Equal(t, ApmServerTransportState.Status, Pending) + assert.Equal(t, transport.Status, Pending) + assert.Error(t, PostToApmServer(transport, agentData)) + assert.Equal(t, transport.Status, Failing) + assert.Equal(t, transport.ReconnectionCount, 1) +} + +func TestAPMServerRecovery(t *testing.T) { // Compress the data pr, pw := io.Pipe() gw, _ := gzip.NewWriterLevel(pw, gzip.BestSpeed) @@ -342,21 +343,22 @@ func TestAPMServerRecovery(t *testing.T) { apmServerUrl: apmServer.URL + "/", } - assert.NoError(t, PostToApmServer(apmServer.Client(), agentData, &config, context.Background())) - assert.Equal(t, ApmServerTransportState.Status, Healthy) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, -1) -} - -func TestContinuedAPMServerFailure(t *testing.T) { - SetApmServerTransportState(Healthy, context.Background()) - SetApmServerTransportState(Failing, context.Background()) + transport := InitApmServerTransport(context.Background(), &config) + SetApmServerTransportState(transport, Healthy) + SetApmServerTransportState(transport, Failing) for { - if IsTransportStatusHealthyOrPending() { + if transport.Status != Failing { break } } - assert.Equal(t, ApmServerTransportState.Status, Pending) + assert.Equal(t, transport.Status, Pending) + assert.NoError(t, PostToApmServer(transport, agentData)) + assert.Equal(t, transport.Status, Healthy) + assert.Equal(t, transport.ReconnectionCount, -1) +} + +func TestContinuedAPMServerFailure(t *testing.T) { // Compress the data pr, pw := io.Pipe() gw, _ := gzip.NewWriterLevel(pw, gzip.BestSpeed) @@ -385,31 +387,30 @@ func TestContinuedAPMServerFailure(t *testing.T) { assert.Equal(t, string(data), string(bytes)) assert.Equal(t, "gzip", r.Header.Get("Content-Encoding")) if _, err := w.Write([]byte(`{"foo": "bar"}`)); err != nil { - t.Fail() return } })) - apmServer.Close() // Close the APM server early so that POST requests fail and that backoff is enabled + apmServer.Close() config := extensionConfig{ apmServerUrl: apmServer.URL + "/", } - assert.Error(t, PostToApmServer(apmServer.Client(), agentData, &config, context.Background())) - assert.Equal(t, ApmServerTransportState.Status, Failing) - assert.Equal(t, ApmServerTransportState.ReconnectionCount, 1) + transport := InitApmServerTransport(context.Background(), &config) + SetApmServerTransportState(transport, Healthy) + SetApmServerTransportState(transport, Failing) + for { + if transport.Status != Failing { + break + } + } + assert.Equal(t, transport.Status, Pending) + assert.Error(t, PostToApmServer(transport, agentData)) + assert.Equal(t, transport.Status, Failing) + assert.Equal(t, transport.ReconnectionCount, 1) } func BenchmarkPostToAPM(b *testing.B) { - // Copied from https://github.com/elastic/apm-server/blob/master/testdata/intake-v2/transactions.ndjson. - benchBody := []byte(`{"metadata": {"service": {"name": "1234_service-12a3","node": {"configured_name": "node-123"},"version": "5.1.3","environment": "staging","language": {"name": "ecmascript","version": "8"},"runtime": {"name": "node","version": "8.0.0"},"framework": {"name": "Express","version": "1.2.3"},"agent": {"name": "elastic-node","version": "3.14.0"}},"user": {"id": "123user", "username": "bar", "email": "bar@user.com"}, "labels": {"tag0": null, "tag1": "one", "tag2": 2}, "process": {"pid": 1234,"ppid": 6789,"title": "node","argv": ["node","server.js"]},"system": {"hostname": "prod1.example.com","architecture": "x64","platform": "darwin", "container": {"id": "container-id"}, "kubernetes": {"namespace": "namespace1", "pod": {"uid": "pod-uid", "name": "pod-name"}, "node": {"name": "node-name"}}},"cloud":{"account":{"id":"account_id","name":"account_name"},"availability_zone":"cloud_availability_zone","instance":{"id":"instance_id","name":"instance_name"},"machine":{"type":"machine_type"},"project":{"id":"project_id","name":"project_name"},"provider":"cloud_provider","region":"cloud_region","service":{"name":"lambda"}}}} -{"transaction": { "id": "945254c567a5417e", "trace_id": "0123456789abcdef0123456789abcdef", "parent_id": "abcdefabcdef01234567", "type": "request", "duration": 32.592981, "span_count": { "started": 43 }}} -{"transaction": {"id": "4340a8e0df1906ecbfa9", "trace_id": "0acd456789abcdef0123456789abcdef", "name": "GET /api/types","type": "request","duration": 32.592981,"outcome":"success", "result": "success", "timestamp": 1496170407154000, "sampled": true, "span_count": {"started": 17},"context": {"service": {"runtime": {"version": "7.0"}},"page":{"referer":"http://localhost:8000/test/e2e/","url":"http://localhost:8000/test/e2e/general-usecase/"}, "request": {"socket": {"remote_address": "12.53.12.1","encrypted": true},"http_version": "1.1","method": "POST","url": {"protocol": "https:","full": "https://www.example.com/p/a/t/h?query=string#hash","hostname": "www.example.com","port": "8080","pathname": "/p/a/t/h","search": "?query=string","hash": "#hash","raw": "/p/a/t/h?query=string#hash"},"headers": {"user-agent":["Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36","Mozilla Chrome Edge"],"content-type": "text/html","cookie": "c1=v1, c2=v2","some-other-header": "foo","array": ["foo","bar","baz"]},"cookies": {"c1": "v1","c2": "v2"},"env": {"SERVER_SOFTWARE": "nginx","GATEWAY_INTERFACE": "CGI/1.1"},"body": {"str": "hello world","additional": { "foo": {},"bar": 123,"req": "additional information"}}},"response": {"status_code": 200,"headers": {"content-type": "application/json"},"headers_sent": true,"finished": true,"transfer_size":25.8,"encoded_body_size":26.90,"decoded_body_size":29.90}, "user": {"domain": "ldap://abc","id": "99","username": "foo"},"tags": {"organization_uuid": "9f0e9d64-c185-4d21-a6f4-4673ed561ec8", "tag2": 12, "tag3": 12.45, "tag4": false, "tag5": null },"custom": {"my_key": 1,"some_other_value": "foo bar","and_objects": {"foo": ["bar","baz"]},"(": "not a valid regex and that is fine"}}}} -{"transaction": { "id": "cdef4340a8e0df19", "trace_id": "0acd456789abcdef0123456789abcdef", "type": "request", "duration": 13.980558, "timestamp": 1532976822281000, "sampled": null, "span_count": { "dropped": 55, "started": 436 }, "marks": {"navigationTiming": {"appBeforeBootstrap": 608.9300000000001,"navigationStart": -21},"another_mark": {"some_long": 10,"some_float": 10.0}, "performance": {}}, "context": { "request": { "socket": { "remote_address": "192.0.1", "encrypted": null }, "method": "POST", "headers": { "user-agent": null, "content-type": null, "cookie": null }, "url": { "protocol": null, "full": null, "hostname": null, "port": null, "pathname": null, "search": null, "hash": null, "raw": null } }, "response": { "headers": { "content-type": null } }, "service": {"environment":"testing","name": "service1","node": {"configured_name": "node-ABC"}, "language": {"version": "2.5", "name": "ruby"}, "agent": {"version": "2.2", "name": "elastic-ruby", "ephemeral_id": "justanid"}, "framework": {"version": "5.0", "name": "Rails"}, "version": "2", "runtime": {"version": "2.5", "name": "cruby"}}},"experience":{"cls":1,"fid":2.0,"tbt":3.4,"longtask":{"count":3,"sum":2.5,"max":1}}}} -{"transaction": { "id": "00xxxxFFaaaa1234", "trace_id": "0123456789abcdef0123456789abcdef", "name": "amqp receive", "parent_id": "abcdefabcdef01234567", "type": "messaging", "duration": 3, "span_count": { "started": 1 }, "context": {"message": {"queue": { "name": "new_users"}, "age":{ "ms": 1577958057123}, "headers": {"user_id": "1ax3", "involved_services": ["user", "auth"]}, "body": "user created", "routing_key": "user-created-transaction"}},"session":{"id":"sunday","sequence":123}}} -{"transaction": { "name": "july-2021-delete-after-july-31", "type": "lambda", "result": "success", "id": "142e61450efb8574", "trace_id": "eb56529a1f461c5e7e2f66ecb075e983", "subtype": null, "action": null, "duration": 38.853, "timestamp": 1631736666365048, "sampled": true, "context": { "cloud": { "origin": { "account": { "id": "abc123" }, "provider": "aws", "region": "us-east-1", "service": { "name": "serviceName" } } }, "service": { "origin": { "id": "abc123", "name": "service-name", "version": "1.0" } }, "user": {}, "tags": {}, "custom": { } }, "sync": true, "span_count": { "started": 0 }, "outcome": "unknown", "faas": { "coldstart": false, "execution": "2e13b309-23e1-417f-8bf7-074fc96bc683", "trigger": { "request_id": "FuH2Cir_vHcEMUA=", "type": "http" } }, "sample_rate": 1 } } -`) - agentData := AgentData{Data: benchBody, ContentEncoding: ""} // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -428,12 +429,21 @@ func BenchmarkPostToAPM(b *testing.B) { apmServerUrl: apmServer.URL + "/", } - client := &http.Client{ - Transport: http.DefaultTransport.(*http.Transport).Clone(), - } + transport := InitApmServerTransport(context.Background(), &config) + + // Copied from https://github.com/elastic/apm-server/blob/master/testdata/intake-v2/transactions.ndjson. + benchBody := []byte(`{"metadata": {"service": {"name": "1234_service-12a3","node": {"configured_name": "node-123"},"version": "5.1.3","environment": "staging","language": {"name": "ecmascript","version": "8"},"runtime": {"name": "node","version": "8.0.0"},"framework": {"name": "Express","version": "1.2.3"},"agent": {"name": "elastic-node","version": "3.14.0"}},"user": {"id": "123user", "username": "bar", "email": "bar@user.com"}, "labels": {"tag0": null, "tag1": "one", "tag2": 2}, "process": {"pid": 1234,"ppid": 6789,"title": "node","argv": ["node","server.js"]},"system": {"hostname": "prod1.example.com","architecture": "x64","platform": "darwin", "container": {"id": "container-id"}, "kubernetes": {"namespace": "namespace1", "pod": {"uid": "pod-uid", "name": "pod-name"}, "node": {"name": "node-name"}}},"cloud":{"account":{"id":"account_id","name":"account_name"},"availability_zone":"cloud_availability_zone","instance":{"id":"instance_id","name":"instance_name"},"machine":{"type":"machine_type"},"project":{"id":"project_id","name":"project_name"},"provider":"cloud_provider","region":"cloud_region","service":{"name":"lambda"}}}} +{"transaction": { "id": "945254c567a5417e", "trace_id": "0123456789abcdef0123456789abcdef", "parent_id": "abcdefabcdef01234567", "type": "request", "duration": 32.592981, "span_count": { "started": 43 }}} +{"transaction": {"id": "4340a8e0df1906ecbfa9", "trace_id": "0acd456789abcdef0123456789abcdef", "name": "GET /api/types","type": "request","duration": 32.592981,"outcome":"success", "result": "success", "timestamp": 1496170407154000, "sampled": true, "span_count": {"started": 17},"context": {"service": {"runtime": {"version": "7.0"}},"page":{"referer":"http://localhost:8000/test/e2e/","url":"http://localhost:8000/test/e2e/general-usecase/"}, "request": {"socket": {"remote_address": "12.53.12.1","encrypted": true},"http_version": "1.1","method": "POST","url": {"protocol": "https:","full": "https://www.example.com/p/a/t/h?query=string#hash","hostname": "www.example.com","port": "8080","pathname": "/p/a/t/h","search": "?query=string","hash": "#hash","raw": "/p/a/t/h?query=string#hash"},"headers": {"user-agent":["Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36","Mozilla Chrome Edge"],"content-type": "text/html","cookie": "c1=v1, c2=v2","some-other-header": "foo","array": ["foo","bar","baz"]},"cookies": {"c1": "v1","c2": "v2"},"env": {"SERVER_SOFTWARE": "nginx","GATEWAY_INTERFACE": "CGI/1.1"},"body": {"str": "hello world","additional": { "foo": {},"bar": 123,"req": "additional information"}}},"response": {"status_code": 200,"headers": {"content-type": "application/json"},"headers_sent": true,"finished": true,"transfer_size":25.8,"encoded_body_size":26.90,"decoded_body_size":29.90}, "user": {"domain": "ldap://abc","id": "99","username": "foo"},"tags": {"organization_uuid": "9f0e9d64-c185-4d21-a6f4-4673ed561ec8", "tag2": 12, "tag3": 12.45, "tag4": false, "tag5": null },"custom": {"my_key": 1,"some_other_value": "foo bar","and_objects": {"foo": ["bar","baz"]},"(": "not a valid regex and that is fine"}}}} +{"transaction": { "id": "cdef4340a8e0df19", "trace_id": "0acd456789abcdef0123456789abcdef", "type": "request", "duration": 13.980558, "timestamp": 1532976822281000, "sampled": null, "span_count": { "dropped": 55, "started": 436 }, "marks": {"navigationTiming": {"appBeforeBootstrap": 608.9300000000001,"navigationStart": -21},"another_mark": {"some_long": 10,"some_float": 10.0}, "performance": {}}, "context": { "request": { "socket": { "remote_address": "192.0.1", "encrypted": null }, "method": "POST", "headers": { "user-agent": null, "content-type": null, "cookie": null }, "url": { "protocol": null, "full": null, "hostname": null, "port": null, "pathname": null, "search": null, "hash": null, "raw": null } }, "response": { "headers": { "content-type": null } }, "service": {"environment":"testing","name": "service1","node": {"configured_name": "node-ABC"}, "language": {"version": "2.5", "name": "ruby"}, "agent": {"version": "2.2", "name": "elastic-ruby", "ephemeral_id": "justanid"}, "framework": {"version": "5.0", "name": "Rails"}, "version": "2", "runtime": {"version": "2.5", "name": "cruby"}}},"experience":{"cls":1,"fid":2.0,"tbt":3.4,"longtask":{"count":3,"sum":2.5,"max":1}}}} +{"transaction": { "id": "00xxxxFFaaaa1234", "trace_id": "0123456789abcdef0123456789abcdef", "name": "amqp receive", "parent_id": "abcdefabcdef01234567", "type": "messaging", "duration": 3, "span_count": { "started": 1 }, "context": {"message": {"queue": { "name": "new_users"}, "age":{ "ms": 1577958057123}, "headers": {"user_id": "1ax3", "involved_services": ["user", "auth"]}, "body": "user created", "routing_key": "user-created-transaction"}},"session":{"id":"sunday","sequence":123}}} +{"transaction": { "name": "july-2021-delete-after-july-31", "type": "lambda", "result": "success", "id": "142e61450efb8574", "trace_id": "eb56529a1f461c5e7e2f66ecb075e983", "subtype": null, "action": null, "duration": 38.853, "timestamp": 1631736666365048, "sampled": true, "context": { "cloud": { "origin": { "account": { "id": "abc123" }, "provider": "aws", "region": "us-east-1", "service": { "name": "serviceName" } } }, "service": { "origin": { "id": "abc123", "name": "service-name", "version": "1.0" } }, "user": {}, "tags": {}, "custom": { } }, "sync": true, "span_count": { "started": 0 }, "outcome": "unknown", "faas": { "coldstart": false, "execution": "2e13b309-23e1-417f-8bf7-074fc96bc683", "trigger": { "request_id": "FuH2Cir_vHcEMUA=", "type": "http" } }, "sample_rate": 1 } } +`) + agentData := AgentData{Data: benchBody, ContentEncoding: ""} + b.ResetTimer() for i := 0; i < b.N; i++ { - if err := PostToApmServer(client, agentData, &config, context.Background()); err != nil { + if err := PostToApmServer(transport, agentData); err != nil { b.Fatal(err) } } diff --git a/apm-lambda-extension/extension/client.go b/apm-lambda-extension/extension/client.go index 1ce3354f..fd8d41d5 100644 --- a/apm-lambda-extension/extension/client.go +++ b/apm-lambda-extension/extension/client.go @@ -67,14 +67,14 @@ const ( extensionErrorType = "Lambda-Extension-Function-Error-Type" ) -// Client is a simple client for the Lambda Extensions API +// Client is a simple Client for the Lambda Extensions API type Client struct { baseURL string httpClient *http.Client ExtensionID string } -// NewClient returns a Lambda Extensions API client +// NewClient returns a Lambda Extensions API Client func NewClient(awsLambdaRuntimeAPI string) *Client { baseURL := fmt.Sprintf("http://%s/2020-01-01/extension", awsLambdaRuntimeAPI) return &Client{ diff --git a/apm-lambda-extension/extension/http_server.go b/apm-lambda-extension/extension/http_server.go index 3415994b..536d7120 100644 --- a/apm-lambda-extension/extension/http_server.go +++ b/apm-lambda-extension/extension/http_server.go @@ -18,7 +18,6 @@ package extension import ( - "context" "net" "net/http" "time" @@ -27,13 +26,13 @@ import ( var agentDataServer *http.Server // StartHttpServer starts the server listening for APM agent data. -func StartHttpServer(ctx context.Context, agentDataChan chan AgentData, config *extensionConfig) (err error) { +func StartHttpServer(transport *ApmServerTransport) (err error) { mux := http.NewServeMux() - mux.HandleFunc("/", handleInfoRequest(ctx, config.apmServerUrl, config)) - mux.HandleFunc("/intake/v2/events", handleIntakeV2Events(agentDataChan)) - timeout := time.Duration(config.dataReceiverTimeoutSeconds) * time.Second + mux.HandleFunc("/", handleInfoRequest(transport)) + mux.HandleFunc("/intake/v2/events", handleIntakeV2Events(transport.DataChannel)) + timeout := time.Duration(transport.config.dataReceiverTimeoutSeconds) * time.Second agentDataServer = &http.Server{ - Addr: config.dataReceiverServerPort, + Addr: transport.config.dataReceiverServerPort, Handler: mux, ReadTimeout: timeout, WriteTimeout: timeout, diff --git a/apm-lambda-extension/extension/http_server_test.go b/apm-lambda-extension/extension/http_server_test.go index 4c08cfec..9a3e2df6 100644 --- a/apm-lambda-extension/extension/http_server_test.go +++ b/apm-lambda-extension/extension/http_server_test.go @@ -51,7 +51,6 @@ func TestInfoProxy(t *testing.T) { defer apmServer.Close() // Create extension config and start the server - dataChannel := make(chan AgentData, 100) config := extensionConfig{ apmServerUrl: apmServer.URL, apmServerSecretToken: "foo", @@ -59,8 +58,8 @@ func TestInfoProxy(t *testing.T) { dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } - - if err := StartHttpServer(context.Background(), dataChannel, &config); err != nil { + transport := InitApmServerTransport(context.Background(), &config) + if err := StartHttpServer(transport); err != nil { t.Fail() return } @@ -100,7 +99,6 @@ func TestInfoProxyErrorStatusCode(t *testing.T) { defer apmServer.Close() // Create extension config and start the server - dataChannel := make(chan AgentData, 100) config := extensionConfig{ apmServerUrl: apmServer.URL, apmServerSecretToken: "foo", @@ -108,8 +106,9 @@ func TestInfoProxyErrorStatusCode(t *testing.T) { dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } + transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(context.Background(), dataChannel, &config); err != nil { + if err := StartHttpServer(transport); err != nil { t.Fail() return } @@ -144,16 +143,16 @@ func Test_handleInfoRequest(t *testing.T) { ` // Create extension config - dataChannel := make(chan AgentData, 100) config := extensionConfig{ apmServerSecretToken: "foo", apmServerApiKey: "bar", dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } + transport := InitApmServerTransport(context.Background(), &config) // Start extension server - if err := StartHttpServer(context.Background(), dataChannel, &config); err != nil { + if err := StartHttpServer(transport); err != nil { t.Fail() return } @@ -211,14 +210,14 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { defer apmServer.Close() // Create extension config and start the server - dataChannel := make(chan AgentData, 100) config := extensionConfig{ apmServerUrl: apmServer.URL, dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } + transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(context.Background(), dataChannel, &config); err != nil { + if err := StartHttpServer(transport); err != nil { t.Fail() return } @@ -247,7 +246,7 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { select { case <-AgentDoneSignal: - <-dataChannel + <-transport.DataChannel case <-timer.C: t.Log("Timed out waiting for server to send FuncDone signal") t.Fail() @@ -263,14 +262,14 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { defer apmServer.Close() // Create extension config and start the server - dataChannel := make(chan AgentData, 100) config := extensionConfig{ apmServerUrl: apmServer.URL, dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } + transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(context.Background(), dataChannel, &config); err != nil { + if err := StartHttpServer(transport); err != nil { t.Fail() return } @@ -292,7 +291,7 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { t.Logf("Error fetching %s, [%v]", agentDataServer.Addr, err) t.Fail() } - <-dataChannel + <-transport.DataChannel assert.Equal(t, 202, resp.StatusCode) } @@ -307,14 +306,14 @@ func Test_handleIntakeV2EventsQueryParamEmptyData(t *testing.T) { defer apmServer.Close() // Create extension config and start the server - dataChannel := make(chan AgentData, 100) config := extensionConfig{ apmServerUrl: apmServer.URL, dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } + transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(context.Background(), dataChannel, &config); err != nil { + if err := StartHttpServer(transport); err != nil { t.Fail() return } diff --git a/apm-lambda-extension/extension/process_events.go b/apm-lambda-extension/extension/process_events.go index c47e5644..3beb4e20 100644 --- a/apm-lambda-extension/extension/process_events.go +++ b/apm-lambda-extension/extension/process_events.go @@ -18,9 +18,7 @@ package extension import ( - "context" "encoding/json" - "net/http" ) // ProcessShutdown processes the Shutdown event received from the @@ -31,17 +29,17 @@ func ProcessShutdown() { } // FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. -func FlushAPMData(client *http.Client, dataChannel chan AgentData, config *extensionConfig, ctx context.Context) { - if !IsTransportStatusHealthyOrPending() { - Log.Debug("Flush skipped - Transport unhealthy") +func FlushAPMData(transport *ApmServerTransport) { + if transport.Status == Failing { + Log.Debug("Flush skipped - Transport failing") return } Log.Debug("Flush started - Checking for agent data") for { select { - case agentData := <-dataChannel: + case agentData := <-transport.DataChannel: Log.Debug("Flush in progress - Processing agent data") - if err := PostToApmServer(client, agentData, config, ctx); err != nil { + if err := PostToApmServer(transport, agentData); err != nil { Log.Errorf("Error sending to APM server, skipping: %v", err) } default: diff --git a/apm-lambda-extension/extension/route_handlers.go b/apm-lambda-extension/extension/route_handlers.go index d813f967..3386cdd5 100644 --- a/apm-lambda-extension/extension/route_handlers.go +++ b/apm-lambda-extension/extension/route_handlers.go @@ -18,7 +18,6 @@ package extension import ( - "context" "io/ioutil" "net/http" "net/http/httputil" @@ -32,17 +31,16 @@ type AgentData struct { } var AgentDoneSignal chan struct{} -var mainExtensionContext context.Context +var currApmServerTransport *ApmServerTransport // URL: http://server/ -func handleInfoRequest(ctx context.Context, apmServerUrl string, config *extensionConfig) func(w http.ResponseWriter, r *http.Request) { +func handleInfoRequest(apmServerTransport *ApmServerTransport) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { Log.Debug("Handling APM Server Info Request") - mainExtensionContext = ctx // Init reverse proxy - parsedApmServerUrl, err := url.Parse(apmServerUrl) + parsedApmServerUrl, err := url.Parse(apmServerTransport.config.apmServerUrl) if err != nil { Log.Errorf("could not parse APM server URL: %v", err) return @@ -50,15 +48,16 @@ func handleInfoRequest(ctx context.Context, apmServerUrl string, config *extensi reverseProxy := httputil.NewSingleHostReverseProxy(parsedApmServerUrl) - reverseProxyTimeout := time.Duration(config.DataForwarderTimeoutSeconds) * time.Second + reverseProxyTimeout := time.Duration(apmServerTransport.config.DataForwarderTimeoutSeconds) * time.Second customTransport := http.DefaultTransport.(*http.Transport).Clone() customTransport.ResponseHeaderTimeout = reverseProxyTimeout reverseProxy.Transport = customTransport + currApmServerTransport = apmServerTransport reverseProxy.ErrorHandler = reverseProxyErrorHandler // Process request (the Golang doc suggests removing any pre-existing X-Forwarded-For header coming - // from the client or an untrusted proxy to prevent IP spoofing : https://pkg.go.dev/net/http/httputil#ReverseProxy + // from the Client or an untrusted proxy to prevent IP spoofing : https://pkg.go.dev/net/http/httputil#ReverseProxy r.Header.Del("X-Forwarded-For") // Update headers to allow for SSL redirection @@ -73,7 +72,7 @@ func handleInfoRequest(ctx context.Context, apmServerUrl string, config *extensi } func reverseProxyErrorHandler(res http.ResponseWriter, req *http.Request, err error) { - SetApmServerTransportState(Failing, mainExtensionContext) + SetApmServerTransportState(currApmServerTransport, Failing) Log.Errorf("Error querying version from the APM Server: %v", err) } diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 59a49b3a..6d412b60 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -19,7 +19,6 @@ package main import ( "context" - "net/http" "os" "os/signal" "path/filepath" @@ -71,22 +70,14 @@ func main() { } extension.Log.Debugf("Register response: %v", extension.PrettyPrint(res)) - // Create a channel to buffer apm agent data - agentDataChannel := make(chan extension.AgentData, 100) + // Init APM Server Transport struct + apmServerTransport := extension.InitApmServerTransport(ctx, config) // Start http server to receive data from agent - if err = extension.StartHttpServer(ctx, agentDataChannel, config); err != nil { + if err = extension.StartHttpServer(apmServerTransport); err != nil { extension.Log.Errorf("Could not start APM data receiver : %v", err) } - // Create a client to use for sending data to the apm server - client := &http.Client{ - Timeout: time.Duration(config.DataForwarderTimeoutSeconds) * time.Second, - Transport: http.DefaultTransport.(*http.Transport).Clone(), - } - - // Create a channel used - // Make channel for collecting logs and create a HTTP server to listen for them logsChannel := make(chan logsapi.LogEvent) @@ -144,7 +135,7 @@ func main() { // Stop checking for, and sending agent data when the function invocation // has completed, signaled via a channel. backgroundDataSendWg.Add(1) - extension.StartBackgroundSending(ctx, agentDataChannel, client, config, funcDone, &backgroundDataSendWg) + extension.StartBackgroundSending(apmServerTransport, funcDone, &backgroundDataSendWg) // Receive Logs API events // Send to the runtimeDoneSignal channel to signal when a runtimeDone event is received @@ -192,7 +183,7 @@ func main() { backgroundDataSendWg.Wait() if config.SendStrategy == extension.SyncFlush { // Flush APM data now that the function invocation has completed - extension.FlushAPMData(client, agentDataChannel, config, ctx) + extension.FlushAPMData(apmServerTransport) } } } diff --git a/apm-lambda-extension/main_test.go b/apm-lambda-extension/main_test.go index 9c88c0b2..0f2d40ae 100644 --- a/apm-lambda-extension/main_test.go +++ b/apm-lambda-extension/main_test.go @@ -345,7 +345,6 @@ func (suite *MainUnitTestsSuite) SetupTest() { http.DefaultServeMux = new(http.ServeMux) suite.eventsChannel = make(chan MockEvent, 100) suite.lambdaServer, suite.apmServer, suite.apmServerInternals, suite.lambdaServerInternals = initMockServers(suite.eventsChannel) - extension.SetApmServerTransportState(extension.Healthy, suite.ctx) } // This function executes after each test case @@ -435,9 +434,6 @@ func (suite *MainUnitTestsSuite) TestAPMServerRecovery() { // TestGracePeriodHangs verifies that the WaitforGracePeriod goroutine ends when main() ends. // This can be checked by asserting that apmTransportStatus is pending after the execution of main. func (suite *MainUnitTestsSuite) TestGracePeriodHangs() { - extension.ApmServerTransportState.Status = extension.Pending - extension.ApmServerTransportState.ReconnectionCount = 100 - eventsChain := []MockEvent{ {Type: InvokeStandard, APMServerBehavior: Hangs, ExecutionDuration: 1, Timeout: 500}, } @@ -446,7 +442,6 @@ func (suite *MainUnitTestsSuite) TestGracePeriodHangs() { time.Sleep(100 * time.Millisecond) suite.apmServerInternals.UnlockSignalChannel <- struct{}{} - defer assert.Equal(suite.T(), extension.IsTransportStatusHealthyOrPending(), true) } // TestAPMServerCrashesDuringExecution tests that main does not panic nor runs indefinitely when the APM server crashes From 83b0ce8b592fba0167d3347532c28393f7d8ef80 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Tue, 26 Apr 2022 18:42:47 +0200 Subject: [PATCH 03/20] Remove state variables --- .../e2e-testing/.e2e_test_config | 2 +- apm-lambda-extension/extension/apm_server.go | 20 +++-- apm-lambda-extension/extension/http_server.go | 16 ++-- .../extension/http_server_test.go | 30 ++++---- .../extension/process_events.go | 4 +- .../extension/route_handlers.go | 7 +- .../logsapi/route_handlers.go | 4 +- apm-lambda-extension/logsapi/subscribe.go | 75 +++++++++++++++---- .../logsapi/subscribe_test.go | 30 +++----- apm-lambda-extension/main.go | 50 +++---------- apm-lambda-extension/main_test.go | 3 +- 11 files changed, 126 insertions(+), 115 deletions(-) diff --git a/apm-lambda-extension/e2e-testing/.e2e_test_config b/apm-lambda-extension/e2e-testing/.e2e_test_config index 396d6d47..201df541 100644 --- a/apm-lambda-extension/e2e-testing/.e2e_test_config +++ b/apm-lambda-extension/e2e-testing/.e2e_test_config @@ -1,2 +1,2 @@ -RUN_E2E_TESTS=true +RUN_E2E_TESTS=false ELASTIC_APM_LOG_LEVEL=info \ No newline at end of file diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index a4144f39..02307e69 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -32,10 +32,6 @@ import ( "time" ) -var bufferPool = sync.Pool{New: func() interface{} { - return &bytes.Buffer{} -}} - // Constants for the state of the transport used in // the backoff implementation. type ApmServerTransportStatusType string @@ -49,10 +45,11 @@ const ( // A struct to track the state and status of sending // to the APM server. Used in the backoff implementation. type ApmServerTransport struct { - sync.Pool sync.Mutex + bufferPool sync.Pool ctx context.Context config *extensionConfig + AgentDoneSignal chan struct{} DataChannel chan AgentData Client *http.Client Status ApmServerTransportStatusType @@ -62,6 +59,10 @@ type ApmServerTransport struct { func InitApmServerTransport(ctx context.Context, config *extensionConfig) *ApmServerTransport { var transport ApmServerTransport + transport.bufferPool = sync.Pool{New: func() interface{} { + return &bytes.Buffer{} + }} + transport.AgentDoneSignal = make(chan struct{}, 1) transport.DataChannel = make(chan AgentData, 100) transport.Client = &http.Client{ Timeout: time.Duration(config.DataForwarderTimeoutSeconds) * time.Second, @@ -74,7 +75,10 @@ func InitApmServerTransport(ctx context.Context, config *extensionConfig) *ApmSe return &transport } -func StartBackgroundSending(transport *ApmServerTransport, funcDone chan struct{}, backgroundDataSendWg *sync.WaitGroup) { +// StartBackgroundApmDataForwarding Receive agent data as it comes in and post it to the APM server. +// Stop checking for, and sending agent data when the function invocation +// has completed, signaled via a channel. +func StartBackgroundApmDataForwarding(transport *ApmServerTransport, funcDone chan struct{}, backgroundDataSendWg *sync.WaitGroup) { go func() { defer backgroundDataSendWg.Done() if transport.Status == Failing { @@ -115,10 +119,10 @@ func PostToApmServer(transport *ApmServerTransport, agentData AgentData) error { r = bytes.NewReader(agentData.Data) } else { encoding = "gzip" - buf := bufferPool.Get().(*bytes.Buffer) + buf := transport.bufferPool.Get().(*bytes.Buffer) defer func() { buf.Reset() - bufferPool.Put(buf) + transport.bufferPool.Put(buf) }() gw, err := gzip.NewWriterLevel(buf, gzip.BestSpeed) if err != nil { diff --git a/apm-lambda-extension/extension/http_server.go b/apm-lambda-extension/extension/http_server.go index 536d7120..1519d25f 100644 --- a/apm-lambda-extension/extension/http_server.go +++ b/apm-lambda-extension/extension/http_server.go @@ -23,15 +23,13 @@ import ( "time" ) -var agentDataServer *http.Server - // StartHttpServer starts the server listening for APM agent data. -func StartHttpServer(transport *ApmServerTransport) (err error) { +func StartHttpServer(transport *ApmServerTransport) (agentDataServer *http.Server, err error) { mux := http.NewServeMux() mux.HandleFunc("/", handleInfoRequest(transport)) - mux.HandleFunc("/intake/v2/events", handleIntakeV2Events(transport.DataChannel)) + mux.HandleFunc("/intake/v2/events", handleIntakeV2Events(transport)) timeout := time.Duration(transport.config.dataReceiverTimeoutSeconds) * time.Second - agentDataServer = &http.Server{ + server := &http.Server{ Addr: transport.config.dataReceiverServerPort, Handler: mux, ReadTimeout: timeout, @@ -39,14 +37,14 @@ func StartHttpServer(transport *ApmServerTransport) (err error) { MaxHeaderBytes: 1 << 20, } - ln, err := net.Listen("tcp", agentDataServer.Addr) + ln, err := net.Listen("tcp", server.Addr) if err != nil { return } go func() { - Log.Infof("Extension listening for apm data on %s", agentDataServer.Addr) - if err = agentDataServer.Serve(ln); err != nil { + Log.Infof("Extension listening for apm data on %s", server.Addr) + if err = server.Serve(ln); err != nil { if err.Error() == "http: Server closed" { Log.Debug(err) } else { @@ -54,5 +52,5 @@ func StartHttpServer(transport *ApmServerTransport) (err error) { } } }() - return nil + return server, nil } diff --git a/apm-lambda-extension/extension/http_server_test.go b/apm-lambda-extension/extension/http_server_test.go index 9a3e2df6..07c00a21 100644 --- a/apm-lambda-extension/extension/http_server_test.go +++ b/apm-lambda-extension/extension/http_server_test.go @@ -59,7 +59,8 @@ func TestInfoProxy(t *testing.T) { dataReceiverTimeoutSeconds: 15, } transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(transport); err != nil { + agentDataServer, err := StartHttpServer(transport) + if err != nil { t.Fail() return } @@ -108,7 +109,8 @@ func TestInfoProxyErrorStatusCode(t *testing.T) { } transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(transport); err != nil { + agentDataServer, err := StartHttpServer(transport) + if err != nil { t.Fail() return } @@ -152,7 +154,8 @@ func Test_handleInfoRequest(t *testing.T) { transport := InitApmServerTransport(context.Background(), &config) // Start extension server - if err := StartHttpServer(transport); err != nil { + agentDataServer, err := StartHttpServer(transport) + if err != nil { t.Fail() return } @@ -188,10 +191,10 @@ func (errReader) Read(_ []byte) (int, error) { } func Test_handleInfoRequestInvalidBody(t *testing.T) { - testChan := make(chan AgentData) + transport := InitApmServerTransport(context.Background(), &extensionConfig{}) mux := http.NewServeMux() urlPath := "/intake/v2/events" - mux.HandleFunc(urlPath, handleIntakeV2Events(testChan)) + mux.HandleFunc(urlPath, handleIntakeV2Events(transport)) req := httptest.NewRequest(http.MethodGet, urlPath, errReader(0)) recorder := httptest.NewRecorder() @@ -202,8 +205,6 @@ func Test_handleInfoRequestInvalidBody(t *testing.T) { func Test_handleIntakeV2EventsQueryParam(t *testing.T) { body := []byte(`{"metadata": {}`) - AgentDoneSignal = make(chan struct{}) - // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { })) @@ -217,7 +218,8 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { } transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(transport); err != nil { + agentDataServer, err := StartHttpServer(transport) + if err != nil { t.Fail() return } @@ -245,7 +247,7 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { defer timer.Stop() select { - case <-AgentDoneSignal: + case <-transport.AgentDoneSignal: <-transport.DataChannel case <-timer.C: t.Log("Timed out waiting for server to send FuncDone signal") @@ -269,7 +271,8 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { } transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(transport); err != nil { + agentDataServer, err := StartHttpServer(transport) + if err != nil { t.Fail() return } @@ -298,8 +301,6 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { func Test_handleIntakeV2EventsQueryParamEmptyData(t *testing.T) { body := []byte(``) - AgentDoneSignal = make(chan struct{}) - // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { })) @@ -313,7 +314,8 @@ func Test_handleIntakeV2EventsQueryParamEmptyData(t *testing.T) { } transport := InitApmServerTransport(context.Background(), &config) - if err := StartHttpServer(transport); err != nil { + agentDataServer, err := StartHttpServer(transport) + if err != nil { t.Fail() return } @@ -341,7 +343,7 @@ func Test_handleIntakeV2EventsQueryParamEmptyData(t *testing.T) { defer timer.Stop() select { - case <-AgentDoneSignal: + case <-transport.AgentDoneSignal: case <-timer.C: t.Log("Timed out waiting for server to send FuncDone signal") t.Fail() diff --git a/apm-lambda-extension/extension/process_events.go b/apm-lambda-extension/extension/process_events.go index 3beb4e20..b18c238f 100644 --- a/apm-lambda-extension/extension/process_events.go +++ b/apm-lambda-extension/extension/process_events.go @@ -19,13 +19,15 @@ package extension import ( "encoding/json" + "net/http" ) // ProcessShutdown processes the Shutdown event received from the // Lambda runtime API. -func ProcessShutdown() { +func ProcessShutdown(agentDataServer *http.Server, logsServer *http.Server) { Log.Info("Received SHUTDOWN event, exiting") agentDataServer.Close() + logsServer.Close() } // FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. diff --git a/apm-lambda-extension/extension/route_handlers.go b/apm-lambda-extension/extension/route_handlers.go index 3386cdd5..c3d43700 100644 --- a/apm-lambda-extension/extension/route_handlers.go +++ b/apm-lambda-extension/extension/route_handlers.go @@ -30,7 +30,6 @@ type AgentData struct { ContentEncoding string } -var AgentDoneSignal chan struct{} var currApmServerTransport *ApmServerTransport // URL: http://server/ @@ -77,7 +76,7 @@ func reverseProxyErrorHandler(res http.ResponseWriter, req *http.Request, err er } // URL: http://server/intake/v2/events -func handleIntakeV2Events(agentDataChan chan AgentData) func(w http.ResponseWriter, r *http.Request) { +func handleIntakeV2Events(transport *ApmServerTransport) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { Log.Debug("Handling APM Data Intake") @@ -95,11 +94,11 @@ func handleIntakeV2Events(agentDataChan chan AgentData) func(w http.ResponseWrit ContentEncoding: r.Header.Get("Content-Encoding"), } - EnqueueAPMData(agentDataChan, agentData) + EnqueueAPMData(transport.DataChannel, agentData) } if len(r.URL.Query()["flushed"]) > 0 && r.URL.Query()["flushed"][0] == "true" { - AgentDoneSignal <- struct{}{} + transport.AgentDoneSignal <- struct{}{} } w.WriteHeader(http.StatusAccepted) diff --git a/apm-lambda-extension/logsapi/route_handlers.go b/apm-lambda-extension/logsapi/route_handlers.go index 31f52cf2..e2857a12 100644 --- a/apm-lambda-extension/logsapi/route_handlers.go +++ b/apm-lambda-extension/logsapi/route_handlers.go @@ -24,7 +24,7 @@ import ( "time" ) -func handleLogEventsRequest(out chan LogEvent) func(w http.ResponseWriter, r *http.Request) { +func handleLogEventsRequest(transport *LogsTransport) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { var logEvents []LogEvent @@ -40,7 +40,7 @@ func handleLogEventsRequest(out chan LogEvent) func(w http.ResponseWriter, r *ht w.WriteHeader(http.StatusInternalServerError) continue } - out <- logEvents[idx] + transport.LogsChannel <- logEvents[idx] } } } diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index d1afb82a..69e1e4e0 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -29,9 +29,26 @@ import ( "github.com/pkg/errors" ) -var ListenerHost = "sandbox" -var Server *http.Server -var Listener net.Listener +// TestListenerAddr For e2e testing purposes +var TestListenerAddr net.Addr + +type LogsTransport struct { + ctx context.Context + LogsChannel chan LogEvent + Listener net.Listener + ListenerHost string + RuntimeDoneSignal chan struct{} + Server *http.Server +} + +func InitLogsTransport(ctx context.Context) *LogsTransport { + var transport LogsTransport + transport.ctx = ctx + transport.ListenerHost = "localhost" + transport.LogsChannel = make(chan LogEvent, 100) + transport.RuntimeDoneSignal = make(chan struct{}, 1) + return &transport +} // LogEvent represents an event received from the Logs API type LogEvent struct { @@ -48,7 +65,7 @@ type LogEventRecord struct { } // Subscribes to the Logs API -func subscribe(extensionID string, eventTypes []EventType) error { +func subscribe(transport *LogsTransport, extensionID string, eventTypes []EventType) error { extensionsAPIAddress, ok := os.LookupEnv("AWS_LAMBDA_RUNTIME_API") if !ok { return errors.New("AWS_LAMBDA_RUNTIME_API is not set") @@ -60,42 +77,43 @@ func subscribe(extensionID string, eventTypes []EventType) error { return err } - _, port, _ := net.SplitHostPort(Listener.Addr().String()) - _, err = logsAPIClient.Subscribe(eventTypes, URI("http://"+ListenerHost+":"+port), extensionID) + _, port, _ := net.SplitHostPort(transport.Listener.Addr().String()) + _, err = logsAPIClient.Subscribe(eventTypes, URI("http://"+transport.ListenerHost+":"+port), extensionID) return err } // Subscribe starts the HTTP server listening for log events and subscribes to the Logs API -func Subscribe(ctx context.Context, extensionID string, eventTypes []EventType, out chan LogEvent) (err error) { +func Subscribe(transport *LogsTransport, extensionID string, eventTypes []EventType) (err error) { if checkAWSSamLocal() { return errors.New("Detected sam local environment") } - if err = startHTTPServer(out); err != nil { + if err = startHTTPServer(transport); err != nil { return } - if err = subscribe(extensionID, eventTypes); err != nil { + if err = subscribe(transport, extensionID, eventTypes); err != nil { return } return nil } -func startHTTPServer(out chan LogEvent) error { +func startHTTPServer(transport *LogsTransport) error { mux := http.NewServeMux() - mux.HandleFunc("/", handleLogEventsRequest(out)) + mux.HandleFunc("/", handleLogEventsRequest(transport)) var err error - Server = &http.Server{ + transport.Server = &http.Server{ Handler: mux, } - if Listener, err = net.Listen("tcp", ListenerHost+":0"); err != nil { + if transport.Listener, err = net.Listen("tcp", transport.ListenerHost+":0"); err != nil { return err } + TestListenerAddr = transport.Listener.Addr() go func() { - extension.Log.Infof("Extension listening for Lambda Logs API events on %s", Listener.Addr().String()) - if err = Server.Serve(Listener); err != nil { + extension.Log.Infof("Extension listening for Lambda Logs API events on %s", transport.Listener.Addr().String()) + if err = transport.Server.Serve(transport.Listener); err != nil { extension.Log.Errorf("Error upon Logs API server start : %v", err) } }() @@ -110,3 +128,30 @@ func checkAWSSamLocal() bool { return false } + +// StartBackgroundLogsProcessing Receive Logs API events +// Send to the runtimeDoneSignal channel to signal when a runtimeDone event is received +func StartBackgroundLogsProcessing(transport *LogsTransport, funcDone chan struct{}, requestID string) { + go func() { + for { + select { + case <-funcDone: + extension.Log.Debug("Received signal that function has completed, not processing any more log events") + return + case logEvent := <-transport.LogsChannel: + extension.Log.Debugf("Received log event %v", logEvent.Type) + // Check the logEvent for runtimeDone and compare the RequestID + // to the id that came in via the Next API + if logEvent.Type == RuntimeDone { + if logEvent.Record.RequestId == requestID { + extension.Log.Info("Received runtimeDone event for this function invocation") + transport.RuntimeDoneSignal <- struct{}{} + return + } else { + extension.Log.Debug("Log API runtimeDone event request id didn't match") + } + } + } + } + }() +} diff --git a/apm-lambda-extension/logsapi/subscribe_test.go b/apm-lambda-extension/logsapi/subscribe_test.go index 8c8f4f0c..238159bb 100644 --- a/apm-lambda-extension/logsapi/subscribe_test.go +++ b/apm-lambda-extension/logsapi/subscribe_test.go @@ -31,8 +31,7 @@ import ( ) func TestSubscribeWithSamLocalEnv(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() + transport := InitLogsTransport(context.Background()) if err := os.Setenv("AWS_SAM_LOCAL", "true"); err != nil { t.Fail() } @@ -41,17 +40,13 @@ func TestSubscribeWithSamLocalEnv(t *testing.T) { t.Fail() } }) - out := make(chan LogEvent) - err := Subscribe(ctx, "testID", []EventType{Platform}, out) + err := Subscribe(transport, "testID", []EventType{Platform}) assert.Error(t, err) } func TestSubscribeAWSRequest(t *testing.T) { - ListenerHost = "localhost" - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - out := make(chan LogEvent, 1) + transport := InitLogsTransport(context.Background()) // For subscription request expectedTypes := []EventType{Platform} expectedBufferingCfg := BufferingCfg{ @@ -77,12 +72,12 @@ func TestSubscribeAWSRequest(t *testing.T) { } // Subscribe to the logs api and start the http server listening for events - if err := Subscribe(ctx, "testID", []EventType{Platform}, out); err != nil { + if err := Subscribe(transport, "testID", []EventType{Platform}); err != nil { t.Logf("Error subscribing, %v", err) t.Fail() return } - defer Server.Close() + defer transport.Server.Close() // Create a request to send to the logs listener platformDoneEvent := `{ @@ -94,7 +89,7 @@ func TestSubscribeAWSRequest(t *testing.T) { } }` body := []byte(`[` + platformDoneEvent + `]`) - url := "http://" + Listener.Addr().String() + url := "http://" + transport.Listener.Addr().String() req, err := http.NewRequest("GET", url, bytes.NewReader(body)) if err != nil { t.Log("Could not create request") @@ -106,15 +101,12 @@ func TestSubscribeAWSRequest(t *testing.T) { t.Logf("Error fetching %s, [%v]", url, err) t.Fail() } - event := <-out + event := <-transport.LogsChannel assert.Equal(t, event.Record.RequestId, "6f7f0961f83442118a7af6fe80b88") } func TestSubscribeWithBadLogsRequest(t *testing.T) { - ListenerHost = "localhost" - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - out := make(chan LogEvent) + transport := InitLogsTransport(context.Background()) // Create aws runtime API server and handler awsRuntimeApiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) @@ -127,17 +119,17 @@ func TestSubscribeWithBadLogsRequest(t *testing.T) { } // Subscribe to the logs api and start the http server listening for events - if err := Subscribe(ctx, "testID", []EventType{Platform}, out); err != nil { + if err := Subscribe(transport, "testID", []EventType{Platform}); err != nil { t.Logf("Error subscribing, %v", err) t.Fail() return } - defer Server.Close() + defer transport.Server.Close() // Create a request to send to the logs listener logEvent := `{"invalid": "json"}` body := []byte(`[` + logEvent + `]`) - url := "http://" + Listener.Addr().String() + url := "http://" + transport.Listener.Addr().String() req, err := http.NewRequest("GET", url, bytes.NewReader(body)) if err != nil { t.Log("Could not create request") diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 6d412b60..812a0f67 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -74,22 +74,23 @@ func main() { apmServerTransport := extension.InitApmServerTransport(ctx, config) // Start http server to receive data from agent - if err = extension.StartHttpServer(apmServerTransport); err != nil { + agentDataServer, err := extension.StartHttpServer(apmServerTransport) + if err != nil { extension.Log.Errorf("Could not start APM data receiver : %v", err) } + // Init APM Server Transport struct // Make channel for collecting logs and create a HTTP server to listen for them - logsChannel := make(chan logsapi.LogEvent) + logsTransport := logsapi.InitLogsTransport(ctx) // Use a wait group to ensure the background go routine sending to the APM server // completes before signaling that the extension is ready for the next invocation. var backgroundDataSendWg sync.WaitGroup if err := logsapi.Subscribe( - ctx, + logsTransport, extensionClient.ExtensionID, []logsapi.EventType{logsapi.Platform}, - logsChannel, ); err != nil { extension.Log.Warnf("Error while subscribing to the Logs API: %v", err) } @@ -116,51 +117,20 @@ func main() { extension.Log.Debug("Received event.") extension.Log.Debugf("%v", extension.PrettyPrint(event)) - // Make a channel for signaling that we received the agent flushed signal - extension.AgentDoneSignal = make(chan struct{}) - // Make a channel for signaling that we received the runtimeDone logs API event - runtimeDoneSignal := make(chan struct{}) // Make a channel for signaling that the function invocation is complete funcDone := make(chan struct{}) // A shutdown event indicates the execution environment is shutting down. // This is usually due to inactivity. if event.EventType == extension.Shutdown { - extension.ProcessShutdown() + extension.ProcessShutdown(agentDataServer, logsTransport.Server) cancel() return } - // Receive agent data as it comes in and post it to the APM server. - // Stop checking for, and sending agent data when the function invocation - // has completed, signaled via a channel. backgroundDataSendWg.Add(1) - extension.StartBackgroundSending(apmServerTransport, funcDone, &backgroundDataSendWg) - - // Receive Logs API events - // Send to the runtimeDoneSignal channel to signal when a runtimeDone event is received - go func() { - for { - select { - case <-funcDone: - extension.Log.Debug("Received signal that function has completed, not processing any more log events") - return - case logEvent := <-logsChannel: - extension.Log.Debugf("Received log event %v", logEvent.Type) - // Check the logEvent for runtimeDone and compare the RequestID - // to the id that came in via the Next API - if logEvent.Type == logsapi.RuntimeDone { - if logEvent.Record.RequestId == event.RequestID { - extension.Log.Info("Received runtimeDone event for this function invocation") - runtimeDoneSignal <- struct{}{} - return - } else { - extension.Log.Debug("Log API runtimeDone event request id didn't match") - } - } - } - } - }() + extension.StartBackgroundApmDataForwarding(apmServerTransport, funcDone, &backgroundDataSendWg) + logsapi.StartBackgroundLogsProcessing(logsTransport, funcDone, event.RequestID) // Calculate how long to wait for a runtimeDoneSignal or AgentDoneSignal signal flushDeadlineMs := event.DeadlineMs - 100 @@ -171,9 +141,9 @@ func main() { defer timer.Stop() select { - case <-extension.AgentDoneSignal: + case <-apmServerTransport.AgentDoneSignal: extension.Log.Debug("Received agent done signal") - case <-runtimeDoneSignal: + case <-logsTransport.RuntimeDoneSignal: extension.Log.Debug("Received runtimeDone signal") case <-timer.C: extension.Log.Info("Time expired waiting for agent signal or runtimeDone event") diff --git a/apm-lambda-extension/main_test.go b/apm-lambda-extension/main_test.go index 0f2d40ae..a5d6997a 100644 --- a/apm-lambda-extension/main_test.go +++ b/apm-lambda-extension/main_test.go @@ -145,7 +145,6 @@ func initMockServers(eventsChannel chan MockEvent) (*httptest.Server, *httptest. } // Mock Lambda Server - logsapi.ListenerHost = "localhost" var lambdaServerInternals MockServerInternals lambdaServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.RequestURI { @@ -305,7 +304,7 @@ func sendLogEvent(requestId string, logEventType logsapi.SubEventType) { extension.Log.Errorf("Could not encode record : %v", err) return } - host, port, _ := net.SplitHostPort(logsapi.Listener.Addr().String()) + host, port, _ := net.SplitHostPort(logsapi.TestListenerAddr.String()) req, _ := http.NewRequest("POST", "http://"+host+":"+port, bufLogEvent) client := http.Client{} if _, err := client.Do(req); err != nil { From 176e264a61310cf61fb20455acd166e9160ee273 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Thu, 5 May 2022 15:16:29 +0200 Subject: [PATCH 04/20] Pass contexts as separate arguments --- apm-lambda-extension/extension/apm_server.go | 20 +++--- .../extension/apm_server_test.go | 70 +++++++++---------- apm-lambda-extension/extension/http_server.go | 5 +- .../extension/http_server_test.go | 26 +++---- .../extension/process_events.go | 5 +- .../extension/route_handlers.go | 7 +- apm-lambda-extension/logsapi/subscribe.go | 16 +++-- .../logsapi/subscribe_test.go | 12 ++-- apm-lambda-extension/main.go | 11 +-- 9 files changed, 90 insertions(+), 82 deletions(-) diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index 02307e69..83ad9063 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -47,7 +47,6 @@ const ( type ApmServerTransport struct { sync.Mutex bufferPool sync.Pool - ctx context.Context config *extensionConfig AgentDoneSignal chan struct{} DataChannel chan AgentData @@ -57,7 +56,7 @@ type ApmServerTransport struct { GracePeriodTimer *time.Timer } -func InitApmServerTransport(ctx context.Context, config *extensionConfig) *ApmServerTransport { +func InitApmServerTransport(config *extensionConfig) *ApmServerTransport { var transport ApmServerTransport transport.bufferPool = sync.Pool{New: func() interface{} { return &bytes.Buffer{} @@ -69,7 +68,6 @@ func InitApmServerTransport(ctx context.Context, config *extensionConfig) *ApmSe Transport: http.DefaultTransport.(*http.Transport).Clone(), } transport.config = config - transport.ctx = ctx transport.Status = Healthy transport.ReconnectionCount = -1 return &transport @@ -78,7 +76,7 @@ func InitApmServerTransport(ctx context.Context, config *extensionConfig) *ApmSe // StartBackgroundApmDataForwarding Receive agent data as it comes in and post it to the APM server. // Stop checking for, and sending agent data when the function invocation // has completed, signaled via a channel. -func StartBackgroundApmDataForwarding(transport *ApmServerTransport, funcDone chan struct{}, backgroundDataSendWg *sync.WaitGroup) { +func StartBackgroundApmDataForwarding(ctx context.Context, transport *ApmServerTransport, funcDone chan struct{}, backgroundDataSendWg *sync.WaitGroup) { go func() { defer backgroundDataSendWg.Done() if transport.Status == Failing { @@ -90,7 +88,7 @@ func StartBackgroundApmDataForwarding(transport *ApmServerTransport, funcDone ch Log.Debug("Received signal that function has completed, not processing any more agent data") return case agentData := <-transport.DataChannel: - if err := PostToApmServer(transport, agentData); err != nil { + if err := PostToApmServer(ctx, transport, agentData); err != nil { Log.Errorf("Error sending to APM server, skipping: %v", err) return } @@ -104,7 +102,7 @@ func StartBackgroundApmDataForwarding(transport *ApmServerTransport, funcDone ch // The function compresses the APM agent data, if it's not already compressed. // It sets the APM transport status to failing upon errors, as part of the backoff // strategy. -func PostToApmServer(transport *ApmServerTransport, agentData AgentData) error { +func PostToApmServer(ctx context.Context, transport *ApmServerTransport, agentData AgentData) error { // todo: can this be a streaming or streaming style call that keeps the // connection open across invocations? if transport.Status == Failing { @@ -152,7 +150,7 @@ func PostToApmServer(transport *ApmServerTransport, agentData AgentData) error { Log.Debug("Sending data chunk to APM Server") resp, err := transport.Client.Do(req) if err != nil { - SetApmServerTransportState(transport, Failing) + SetApmServerTransportState(ctx, transport, Failing) return fmt.Errorf("failed to post to APM server: %v", err) } @@ -160,11 +158,11 @@ func PostToApmServer(transport *ApmServerTransport, agentData AgentData) error { defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { - SetApmServerTransportState(transport, Failing) + SetApmServerTransportState(ctx, transport, Failing) return fmt.Errorf("failed to read the response body after posting to the APM server") } - SetApmServerTransportState(transport, Healthy) + SetApmServerTransportState(ctx, transport, Healthy) Log.Debug("Transport status set to healthy") Log.Debugf("APM server response body: %v", string(body)) Log.Debugf("APM server response status code: %v", resp.StatusCode) @@ -178,7 +176,7 @@ func PostToApmServer(transport *ApmServerTransport, agentData AgentData) error { // to the APM server. // // This function is public for use in tests. -func SetApmServerTransportState(transport *ApmServerTransport, status ApmServerTransportStatusType) { +func SetApmServerTransportState(ctx context.Context, transport *ApmServerTransport, status ApmServerTransportStatusType) { switch status { case Healthy: transport.Lock() @@ -197,7 +195,7 @@ func SetApmServerTransportState(transport *ApmServerTransport, status ApmServerT select { case <-transport.GracePeriodTimer.C: Log.Debug("Grace period over - timer timed out") - case <-transport.ctx.Done(): + case <-ctx.Done(): Log.Debug("Grace period over - context done") } transport.Status = Pending diff --git a/apm-lambda-extension/extension/apm_server_test.go b/apm-lambda-extension/extension/apm_server_test.go index 7d03c8c1..e2e57338 100644 --- a/apm-lambda-extension/extension/apm_server_test.go +++ b/apm-lambda-extension/extension/apm_server_test.go @@ -69,8 +69,8 @@ func TestPostToApmServerDataCompressed(t *testing.T) { config := extensionConfig{ apmServerUrl: apmServer.URL + "/", } - transport := InitApmServerTransport(context.Background(), &config) - err := PostToApmServer(transport, agentData) + transport := InitApmServerTransport(&config) + err := PostToApmServer(context.Background(), transport, agentData) assert.Equal(t, nil, err) } @@ -114,13 +114,13 @@ func TestPostToApmServerDataNotCompressed(t *testing.T) { config := extensionConfig{ apmServerUrl: apmServer.URL + "/", } - transport := InitApmServerTransport(context.Background(), &config) - err := PostToApmServer(transport, agentData) + transport := InitApmServerTransport(&config) + err := PostToApmServer(context.Background(), transport, agentData) assert.Equal(t, nil, err) } func TestGracePeriod(t *testing.T) { - transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + transport := InitApmServerTransport(&extensionConfig{}) transport.ReconnectionCount = 0 val0 := computeGracePeriod(transport).Seconds() @@ -156,8 +156,8 @@ func TestGracePeriod(t *testing.T) { } func TestSetHealthyTransport(t *testing.T) { - transport := InitApmServerTransport(context.Background(), &extensionConfig{}) - SetApmServerTransportState(transport, Healthy) + transport := InitApmServerTransport(&extensionConfig{}) + SetApmServerTransportState(context.Background(), transport, Healthy) assert.True(t, transport.Status == Healthy) assert.Equal(t, transport.ReconnectionCount, -1) } @@ -165,17 +165,17 @@ func TestSetHealthyTransport(t *testing.T) { func TestSetFailingTransport(t *testing.T) { // By explicitly setting the reconnection count to 0, we ensure that the grace period will not be 0 // and avoid a race between reaching the pending status and the test assertion. - transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + transport := InitApmServerTransport(&extensionConfig{}) transport.ReconnectionCount = 0 - SetApmServerTransportState(transport, Failing) + SetApmServerTransportState(context.Background(), transport, Failing) assert.True(t, transport.Status == Failing) assert.Equal(t, transport.ReconnectionCount, 1) } func TestSetPendingTransport(t *testing.T) { - transport := InitApmServerTransport(context.Background(), &extensionConfig{}) - SetApmServerTransportState(transport, Healthy) - SetApmServerTransportState(transport, Failing) + transport := InitApmServerTransport(&extensionConfig{}) + SetApmServerTransportState(context.Background(), transport, Healthy) + SetApmServerTransportState(context.Background(), transport, Failing) for { if transport.Status != Failing { break @@ -186,17 +186,17 @@ func TestSetPendingTransport(t *testing.T) { } func TestSetPendingTransportExplicitly(t *testing.T) { - transport := InitApmServerTransport(context.Background(), &extensionConfig{}) - SetApmServerTransportState(transport, Healthy) - SetApmServerTransportState(transport, Pending) + transport := InitApmServerTransport(&extensionConfig{}) + SetApmServerTransportState(context.Background(), transport, Healthy) + SetApmServerTransportState(context.Background(), transport, Pending) assert.True(t, transport.Status == Healthy) assert.Equal(t, transport.ReconnectionCount, -1) } func TestSetInvalidTransport(t *testing.T) { - transport := InitApmServerTransport(context.Background(), &extensionConfig{}) - SetApmServerTransportState(transport, Healthy) - SetApmServerTransportState(transport, "Invalid") + transport := InitApmServerTransport(&extensionConfig{}) + SetApmServerTransportState(context.Background(), transport, Healthy) + SetApmServerTransportState(context.Background(), transport, "Invalid") assert.True(t, transport.Status == Healthy) assert.Equal(t, transport.ReconnectionCount, -1) } @@ -236,13 +236,13 @@ func TestEnterBackoffFromHealthy(t *testing.T) { config := extensionConfig{ apmServerUrl: apmServer.URL + "/", } - transport := InitApmServerTransport(context.Background(), &config) - SetApmServerTransportState(transport, Healthy) + transport := InitApmServerTransport(&config) + SetApmServerTransportState(context.Background(), transport, Healthy) // Close the APM server early so that POST requests fail and that backoff is enabled apmServer.Close() - if err := PostToApmServer(transport, agentData); err != nil { + if err := PostToApmServer(context.Background(), transport, agentData); err != nil { return } // No way to know for sure if failing or pending (0 sec grace period) @@ -290,9 +290,9 @@ func TestEnterBackoffFromFailing(t *testing.T) { apmServerUrl: apmServer.URL + "/", } - transport := InitApmServerTransport(context.Background(), &config) - SetApmServerTransportState(transport, Healthy) - SetApmServerTransportState(transport, Failing) + transport := InitApmServerTransport(&config) + SetApmServerTransportState(context.Background(), transport, Healthy) + SetApmServerTransportState(context.Background(), transport, Failing) for { if transport.Status != Failing { break @@ -300,7 +300,7 @@ func TestEnterBackoffFromFailing(t *testing.T) { } assert.Equal(t, transport.Status, Pending) - assert.Error(t, PostToApmServer(transport, agentData)) + assert.Error(t, PostToApmServer(context.Background(), transport, agentData)) assert.Equal(t, transport.Status, Failing) assert.Equal(t, transport.ReconnectionCount, 1) } @@ -343,9 +343,9 @@ func TestAPMServerRecovery(t *testing.T) { apmServerUrl: apmServer.URL + "/", } - transport := InitApmServerTransport(context.Background(), &config) - SetApmServerTransportState(transport, Healthy) - SetApmServerTransportState(transport, Failing) + transport := InitApmServerTransport(&config) + SetApmServerTransportState(context.Background(), transport, Healthy) + SetApmServerTransportState(context.Background(), transport, Failing) for { if transport.Status != Failing { break @@ -353,7 +353,7 @@ func TestAPMServerRecovery(t *testing.T) { } assert.Equal(t, transport.Status, Pending) - assert.NoError(t, PostToApmServer(transport, agentData)) + assert.NoError(t, PostToApmServer(context.Background(), transport, agentData)) assert.Equal(t, transport.Status, Healthy) assert.Equal(t, transport.ReconnectionCount, -1) } @@ -396,16 +396,16 @@ func TestContinuedAPMServerFailure(t *testing.T) { apmServerUrl: apmServer.URL + "/", } - transport := InitApmServerTransport(context.Background(), &config) - SetApmServerTransportState(transport, Healthy) - SetApmServerTransportState(transport, Failing) + transport := InitApmServerTransport(&config) + SetApmServerTransportState(context.Background(), transport, Healthy) + SetApmServerTransportState(context.Background(), transport, Failing) for { if transport.Status != Failing { break } } assert.Equal(t, transport.Status, Pending) - assert.Error(t, PostToApmServer(transport, agentData)) + assert.Error(t, PostToApmServer(context.Background(), transport, agentData)) assert.Equal(t, transport.Status, Failing) assert.Equal(t, transport.ReconnectionCount, 1) } @@ -429,7 +429,7 @@ func BenchmarkPostToAPM(b *testing.B) { apmServerUrl: apmServer.URL + "/", } - transport := InitApmServerTransport(context.Background(), &config) + transport := InitApmServerTransport(&config) // Copied from https://github.com/elastic/apm-server/blob/master/testdata/intake-v2/transactions.ndjson. benchBody := []byte(`{"metadata": {"service": {"name": "1234_service-12a3","node": {"configured_name": "node-123"},"version": "5.1.3","environment": "staging","language": {"name": "ecmascript","version": "8"},"runtime": {"name": "node","version": "8.0.0"},"framework": {"name": "Express","version": "1.2.3"},"agent": {"name": "elastic-node","version": "3.14.0"}},"user": {"id": "123user", "username": "bar", "email": "bar@user.com"}, "labels": {"tag0": null, "tag1": "one", "tag2": 2}, "process": {"pid": 1234,"ppid": 6789,"title": "node","argv": ["node","server.js"]},"system": {"hostname": "prod1.example.com","architecture": "x64","platform": "darwin", "container": {"id": "container-id"}, "kubernetes": {"namespace": "namespace1", "pod": {"uid": "pod-uid", "name": "pod-name"}, "node": {"name": "node-name"}}},"cloud":{"account":{"id":"account_id","name":"account_name"},"availability_zone":"cloud_availability_zone","instance":{"id":"instance_id","name":"instance_name"},"machine":{"type":"machine_type"},"project":{"id":"project_id","name":"project_name"},"provider":"cloud_provider","region":"cloud_region","service":{"name":"lambda"}}}} @@ -443,7 +443,7 @@ func BenchmarkPostToAPM(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - if err := PostToApmServer(transport, agentData); err != nil { + if err := PostToApmServer(context.Background(), transport, agentData); err != nil { b.Fatal(err) } } diff --git a/apm-lambda-extension/extension/http_server.go b/apm-lambda-extension/extension/http_server.go index 1519d25f..d961559f 100644 --- a/apm-lambda-extension/extension/http_server.go +++ b/apm-lambda-extension/extension/http_server.go @@ -18,15 +18,16 @@ package extension import ( + "context" "net" "net/http" "time" ) // StartHttpServer starts the server listening for APM agent data. -func StartHttpServer(transport *ApmServerTransport) (agentDataServer *http.Server, err error) { +func StartHttpServer(ctx context.Context, transport *ApmServerTransport) (agentDataServer *http.Server, err error) { mux := http.NewServeMux() - mux.HandleFunc("/", handleInfoRequest(transport)) + mux.HandleFunc("/", handleInfoRequest(ctx, transport)) mux.HandleFunc("/intake/v2/events", handleIntakeV2Events(transport)) timeout := time.Duration(transport.config.dataReceiverTimeoutSeconds) * time.Second server := &http.Server{ diff --git a/apm-lambda-extension/extension/http_server_test.go b/apm-lambda-extension/extension/http_server_test.go index 07c00a21..e9d2236e 100644 --- a/apm-lambda-extension/extension/http_server_test.go +++ b/apm-lambda-extension/extension/http_server_test.go @@ -58,8 +58,8 @@ func TestInfoProxy(t *testing.T) { dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } - transport := InitApmServerTransport(context.Background(), &config) - agentDataServer, err := StartHttpServer(transport) + transport := InitApmServerTransport(&config) + agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() return @@ -107,9 +107,9 @@ func TestInfoProxyErrorStatusCode(t *testing.T) { dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } - transport := InitApmServerTransport(context.Background(), &config) + transport := InitApmServerTransport(&config) - agentDataServer, err := StartHttpServer(transport) + agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() return @@ -151,10 +151,10 @@ func Test_handleInfoRequest(t *testing.T) { dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } - transport := InitApmServerTransport(context.Background(), &config) + transport := InitApmServerTransport(&config) // Start extension server - agentDataServer, err := StartHttpServer(transport) + agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() return @@ -191,7 +191,7 @@ func (errReader) Read(_ []byte) (int, error) { } func Test_handleInfoRequestInvalidBody(t *testing.T) { - transport := InitApmServerTransport(context.Background(), &extensionConfig{}) + transport := InitApmServerTransport(&extensionConfig{}) mux := http.NewServeMux() urlPath := "/intake/v2/events" mux.HandleFunc(urlPath, handleIntakeV2Events(transport)) @@ -216,9 +216,9 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } - transport := InitApmServerTransport(context.Background(), &config) + transport := InitApmServerTransport(&config) - agentDataServer, err := StartHttpServer(transport) + agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() return @@ -269,9 +269,9 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } - transport := InitApmServerTransport(context.Background(), &config) + transport := InitApmServerTransport(&config) - agentDataServer, err := StartHttpServer(transport) + agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() return @@ -312,9 +312,9 @@ func Test_handleIntakeV2EventsQueryParamEmptyData(t *testing.T) { dataReceiverServerPort: ":1234", dataReceiverTimeoutSeconds: 15, } - transport := InitApmServerTransport(context.Background(), &config) + transport := InitApmServerTransport(&config) - agentDataServer, err := StartHttpServer(transport) + agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() return diff --git a/apm-lambda-extension/extension/process_events.go b/apm-lambda-extension/extension/process_events.go index b18c238f..549baf63 100644 --- a/apm-lambda-extension/extension/process_events.go +++ b/apm-lambda-extension/extension/process_events.go @@ -18,6 +18,7 @@ package extension import ( + "context" "encoding/json" "net/http" ) @@ -31,7 +32,7 @@ func ProcessShutdown(agentDataServer *http.Server, logsServer *http.Server) { } // FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. -func FlushAPMData(transport *ApmServerTransport) { +func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { if transport.Status == Failing { Log.Debug("Flush skipped - Transport failing") return @@ -41,7 +42,7 @@ func FlushAPMData(transport *ApmServerTransport) { select { case agentData := <-transport.DataChannel: Log.Debug("Flush in progress - Processing agent data") - if err := PostToApmServer(transport, agentData); err != nil { + if err := PostToApmServer(ctx, transport, agentData); err != nil { Log.Errorf("Error sending to APM server, skipping: %v", err) } default: diff --git a/apm-lambda-extension/extension/route_handlers.go b/apm-lambda-extension/extension/route_handlers.go index c3d43700..b4d5262a 100644 --- a/apm-lambda-extension/extension/route_handlers.go +++ b/apm-lambda-extension/extension/route_handlers.go @@ -18,6 +18,7 @@ package extension import ( + "context" "io/ioutil" "net/http" "net/http/httputil" @@ -31,9 +32,10 @@ type AgentData struct { } var currApmServerTransport *ApmServerTransport +var currContext context.Context // URL: http://server/ -func handleInfoRequest(apmServerTransport *ApmServerTransport) func(w http.ResponseWriter, r *http.Request) { +func handleInfoRequest(ctx context.Context, apmServerTransport *ApmServerTransport) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { Log.Debug("Handling APM Server Info Request") @@ -53,6 +55,7 @@ func handleInfoRequest(apmServerTransport *ApmServerTransport) func(w http.Respo reverseProxy.Transport = customTransport currApmServerTransport = apmServerTransport + currContext = ctx reverseProxy.ErrorHandler = reverseProxyErrorHandler // Process request (the Golang doc suggests removing any pre-existing X-Forwarded-For header coming @@ -71,7 +74,7 @@ func handleInfoRequest(apmServerTransport *ApmServerTransport) func(w http.Respo } func reverseProxyErrorHandler(res http.ResponseWriter, req *http.Request, err error) { - SetApmServerTransportState(currApmServerTransport, Failing) + SetApmServerTransportState(currContext, currApmServerTransport, Failing) Log.Errorf("Error querying version from the APM Server: %v", err) } diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index 69e1e4e0..6fef9c2d 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -33,7 +33,6 @@ import ( var TestListenerAddr net.Addr type LogsTransport struct { - ctx context.Context LogsChannel chan LogEvent Listener net.Listener ListenerHost string @@ -41,9 +40,8 @@ type LogsTransport struct { Server *http.Server } -func InitLogsTransport(ctx context.Context) *LogsTransport { +func InitLogsTransport() *LogsTransport { var transport LogsTransport - transport.ctx = ctx transport.ListenerHost = "localhost" transport.LogsChannel = make(chan LogEvent, 100) transport.RuntimeDoneSignal = make(chan struct{}, 1) @@ -83,11 +81,11 @@ func subscribe(transport *LogsTransport, extensionID string, eventTypes []EventT } // Subscribe starts the HTTP server listening for log events and subscribes to the Logs API -func Subscribe(transport *LogsTransport, extensionID string, eventTypes []EventType) (err error) { +func Subscribe(ctx context.Context, transport *LogsTransport, extensionID string, eventTypes []EventType) (err error) { if checkAWSSamLocal() { return errors.New("Detected sam local environment") } - if err = startHTTPServer(transport); err != nil { + if err = startHTTPServer(ctx, transport); err != nil { return } @@ -97,7 +95,7 @@ func Subscribe(transport *LogsTransport, extensionID string, eventTypes []EventT return nil } -func startHTTPServer(transport *LogsTransport) error { +func startHTTPServer(ctx context.Context, transport *LogsTransport) error { mux := http.NewServeMux() mux.HandleFunc("/", handleLogEventsRequest(transport)) var err error @@ -117,6 +115,12 @@ func startHTTPServer(transport *LogsTransport) error { extension.Log.Errorf("Error upon Logs API server start : %v", err) } }() + + go func() { + <-ctx.Done() + transport.Server.Close() + }() + return nil } diff --git a/apm-lambda-extension/logsapi/subscribe_test.go b/apm-lambda-extension/logsapi/subscribe_test.go index 238159bb..021594a8 100644 --- a/apm-lambda-extension/logsapi/subscribe_test.go +++ b/apm-lambda-extension/logsapi/subscribe_test.go @@ -31,7 +31,7 @@ import ( ) func TestSubscribeWithSamLocalEnv(t *testing.T) { - transport := InitLogsTransport(context.Background()) + transport := InitLogsTransport() if err := os.Setenv("AWS_SAM_LOCAL", "true"); err != nil { t.Fail() } @@ -41,12 +41,12 @@ func TestSubscribeWithSamLocalEnv(t *testing.T) { } }) - err := Subscribe(transport, "testID", []EventType{Platform}) + err := Subscribe(context.Background(), transport, "testID", []EventType{Platform}) assert.Error(t, err) } func TestSubscribeAWSRequest(t *testing.T) { - transport := InitLogsTransport(context.Background()) + transport := InitLogsTransport() // For subscription request expectedTypes := []EventType{Platform} expectedBufferingCfg := BufferingCfg{ @@ -72,7 +72,7 @@ func TestSubscribeAWSRequest(t *testing.T) { } // Subscribe to the logs api and start the http server listening for events - if err := Subscribe(transport, "testID", []EventType{Platform}); err != nil { + if err := Subscribe(context.Background(), transport, "testID", []EventType{Platform}); err != nil { t.Logf("Error subscribing, %v", err) t.Fail() return @@ -106,7 +106,7 @@ func TestSubscribeAWSRequest(t *testing.T) { } func TestSubscribeWithBadLogsRequest(t *testing.T) { - transport := InitLogsTransport(context.Background()) + transport := InitLogsTransport() // Create aws runtime API server and handler awsRuntimeApiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) @@ -119,7 +119,7 @@ func TestSubscribeWithBadLogsRequest(t *testing.T) { } // Subscribe to the logs api and start the http server listening for events - if err := Subscribe(transport, "testID", []EventType{Platform}); err != nil { + if err := Subscribe(context.Background(), transport, "testID", []EventType{Platform}); err != nil { t.Logf("Error subscribing, %v", err) t.Fail() return diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 812a0f67..32b22158 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -71,23 +71,24 @@ func main() { extension.Log.Debugf("Register response: %v", extension.PrettyPrint(res)) // Init APM Server Transport struct - apmServerTransport := extension.InitApmServerTransport(ctx, config) + apmServerTransport := extension.InitApmServerTransport(config) // Start http server to receive data from agent - agentDataServer, err := extension.StartHttpServer(apmServerTransport) + agentDataServer, err := extension.StartHttpServer(ctx, apmServerTransport) if err != nil { extension.Log.Errorf("Could not start APM data receiver : %v", err) } // Init APM Server Transport struct // Make channel for collecting logs and create a HTTP server to listen for them - logsTransport := logsapi.InitLogsTransport(ctx) + logsTransport := logsapi.InitLogsTransport() // Use a wait group to ensure the background go routine sending to the APM server // completes before signaling that the extension is ready for the next invocation. var backgroundDataSendWg sync.WaitGroup if err := logsapi.Subscribe( + ctx, logsTransport, extensionClient.ExtensionID, []logsapi.EventType{logsapi.Platform}, @@ -129,7 +130,7 @@ func main() { } backgroundDataSendWg.Add(1) - extension.StartBackgroundApmDataForwarding(apmServerTransport, funcDone, &backgroundDataSendWg) + extension.StartBackgroundApmDataForwarding(ctx, apmServerTransport, funcDone, &backgroundDataSendWg) logsapi.StartBackgroundLogsProcessing(logsTransport, funcDone, event.RequestID) // Calculate how long to wait for a runtimeDoneSignal or AgentDoneSignal signal @@ -153,7 +154,7 @@ func main() { backgroundDataSendWg.Wait() if config.SendStrategy == extension.SyncFlush { // Flush APM data now that the function invocation has completed - extension.FlushAPMData(apmServerTransport) + extension.FlushAPMData(ctx, apmServerTransport) } } } From 7c2427664f31f1d4fb8c33958727c395c4afc7eb Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Thu, 5 May 2022 15:47:48 +0200 Subject: [PATCH 05/20] Init logsTransport as part of logs API Subscription method --- apm-lambda-extension/extension/process_events.go | 3 +-- apm-lambda-extension/logsapi/subscribe.go | 16 +++++++++++----- apm-lambda-extension/logsapi/subscribe_test.go | 13 ++++++------- apm-lambda-extension/main.go | 14 +++----------- 4 files changed, 21 insertions(+), 25 deletions(-) diff --git a/apm-lambda-extension/extension/process_events.go b/apm-lambda-extension/extension/process_events.go index 549baf63..71a7d36f 100644 --- a/apm-lambda-extension/extension/process_events.go +++ b/apm-lambda-extension/extension/process_events.go @@ -25,10 +25,9 @@ import ( // ProcessShutdown processes the Shutdown event received from the // Lambda runtime API. -func ProcessShutdown(agentDataServer *http.Server, logsServer *http.Server) { +func ProcessShutdown(agentDataServer *http.Server) { Log.Info("Received SHUTDOWN event, exiting") agentDataServer.Close() - logsServer.Close() } // FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index 6fef9c2d..cf8f994d 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -64,6 +64,7 @@ type LogEventRecord struct { // Subscribes to the Logs API func subscribe(transport *LogsTransport, extensionID string, eventTypes []EventType) error { + extensionsAPIAddress, ok := os.LookupEnv("AWS_LAMBDA_RUNTIME_API") if !ok { return errors.New("AWS_LAMBDA_RUNTIME_API is not set") @@ -81,18 +82,23 @@ func subscribe(transport *LogsTransport, extensionID string, eventTypes []EventT } // Subscribe starts the HTTP server listening for log events and subscribes to the Logs API -func Subscribe(ctx context.Context, transport *LogsTransport, extensionID string, eventTypes []EventType) (err error) { +func Subscribe(ctx context.Context, extensionID string, eventTypes []EventType) (transport *LogsTransport, err error) { if checkAWSSamLocal() { - return errors.New("Detected sam local environment") + return nil, errors.New("Detected sam local environment") } + + // Init APM Server Transport struct + // Make channel for collecting logs and create a HTTP server to listen for them + transport = InitLogsTransport() + if err = startHTTPServer(ctx, transport); err != nil { - return + return nil, err } if err = subscribe(transport, extensionID, eventTypes); err != nil { - return + return nil, err } - return nil + return transport, nil } func startHTTPServer(ctx context.Context, transport *LogsTransport) error { diff --git a/apm-lambda-extension/logsapi/subscribe_test.go b/apm-lambda-extension/logsapi/subscribe_test.go index 021594a8..80ad1b9f 100644 --- a/apm-lambda-extension/logsapi/subscribe_test.go +++ b/apm-lambda-extension/logsapi/subscribe_test.go @@ -31,7 +31,6 @@ import ( ) func TestSubscribeWithSamLocalEnv(t *testing.T) { - transport := InitLogsTransport() if err := os.Setenv("AWS_SAM_LOCAL", "true"); err != nil { t.Fail() } @@ -41,12 +40,12 @@ func TestSubscribeWithSamLocalEnv(t *testing.T) { } }) - err := Subscribe(context.Background(), transport, "testID", []EventType{Platform}) + _, err := Subscribe(context.Background(), "testID", []EventType{Platform}) assert.Error(t, err) } func TestSubscribeAWSRequest(t *testing.T) { - transport := InitLogsTransport() + // For subscription request expectedTypes := []EventType{Platform} expectedBufferingCfg := BufferingCfg{ @@ -72,7 +71,8 @@ func TestSubscribeAWSRequest(t *testing.T) { } // Subscribe to the logs api and start the http server listening for events - if err := Subscribe(context.Background(), transport, "testID", []EventType{Platform}); err != nil { + transport, err := Subscribe(context.Background(), "testID", []EventType{Platform}) + if err != nil { t.Logf("Error subscribing, %v", err) t.Fail() return @@ -106,8 +106,6 @@ func TestSubscribeAWSRequest(t *testing.T) { } func TestSubscribeWithBadLogsRequest(t *testing.T) { - transport := InitLogsTransport() - // Create aws runtime API server and handler awsRuntimeApiServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) defer awsRuntimeApiServer.Close() @@ -119,7 +117,8 @@ func TestSubscribeWithBadLogsRequest(t *testing.T) { } // Subscribe to the logs api and start the http server listening for events - if err := Subscribe(context.Background(), transport, "testID", []EventType{Platform}); err != nil { + transport, err := Subscribe(context.Background(), "testID", []EventType{Platform}); + if err != nil { t.Logf("Error subscribing, %v", err) t.Fail() return diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 32b22158..7e729d62 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -79,20 +79,12 @@ func main() { extension.Log.Errorf("Could not start APM data receiver : %v", err) } - // Init APM Server Transport struct - // Make channel for collecting logs and create a HTTP server to listen for them - logsTransport := logsapi.InitLogsTransport() - // Use a wait group to ensure the background go routine sending to the APM server // completes before signaling that the extension is ready for the next invocation. var backgroundDataSendWg sync.WaitGroup - if err := logsapi.Subscribe( - ctx, - logsTransport, - extensionClient.ExtensionID, - []logsapi.EventType{logsapi.Platform}, - ); err != nil { + logsTransport, err := logsapi.Subscribe(ctx, extensionClient.ExtensionID, []logsapi.EventType{logsapi.Platform}) + if err != nil { extension.Log.Warnf("Error while subscribing to the Logs API: %v", err) } @@ -124,7 +116,7 @@ func main() { // A shutdown event indicates the execution environment is shutting down. // This is usually due to inactivity. if event.EventType == extension.Shutdown { - extension.ProcessShutdown(agentDataServer, logsTransport.Server) + extension.ProcessShutdown(agentDataServer) cancel() return } From 2870ee74b9207a7fed02173d07f71d642baf6246 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Thu, 5 May 2022 23:49:38 +0200 Subject: [PATCH 06/20] Implement WaitRuntimeDone and refactor shutdown --- apm-lambda-extension/extension/apm_server.go | 6 +- .../extension/process_events.go | 8 -- apm-lambda-extension/logsapi/subscribe.go | 42 +++--- apm-lambda-extension/main.go | 122 ++++++++++-------- 4 files changed, 89 insertions(+), 89 deletions(-) diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index 83ad9063..ef73a390 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -76,7 +76,7 @@ func InitApmServerTransport(config *extensionConfig) *ApmServerTransport { // StartBackgroundApmDataForwarding Receive agent data as it comes in and post it to the APM server. // Stop checking for, and sending agent data when the function invocation // has completed, signaled via a channel. -func StartBackgroundApmDataForwarding(ctx context.Context, transport *ApmServerTransport, funcDone chan struct{}, backgroundDataSendWg *sync.WaitGroup) { +func StartBackgroundApmDataForwarding(ctx context.Context, transport *ApmServerTransport, backgroundDataSendWg *sync.WaitGroup) { go func() { defer backgroundDataSendWg.Done() if transport.Status == Failing { @@ -84,8 +84,8 @@ func StartBackgroundApmDataForwarding(ctx context.Context, transport *ApmServerT } for { select { - case <-funcDone: - Log.Debug("Received signal that function has completed, not processing any more agent data") + case <-ctx.Done(): + Log.Debug("Invocation context cancelled, not processing any more agent data") return case agentData := <-transport.DataChannel: if err := PostToApmServer(ctx, transport, agentData); err != nil { diff --git a/apm-lambda-extension/extension/process_events.go b/apm-lambda-extension/extension/process_events.go index 71a7d36f..bc5658df 100644 --- a/apm-lambda-extension/extension/process_events.go +++ b/apm-lambda-extension/extension/process_events.go @@ -20,16 +20,8 @@ package extension import ( "context" "encoding/json" - "net/http" ) -// ProcessShutdown processes the Shutdown event received from the -// Lambda runtime API. -func ProcessShutdown(agentDataServer *http.Server) { - Log.Info("Received SHUTDOWN event, exiting") - agentDataServer.Close() -} - // FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { if transport.Status == Failing { diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index cf8f994d..6f578783 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -139,29 +139,27 @@ func checkAWSSamLocal() bool { return false } -// StartBackgroundLogsProcessing Receive Logs API events -// Send to the runtimeDoneSignal channel to signal when a runtimeDone event is received -func StartBackgroundLogsProcessing(transport *LogsTransport, funcDone chan struct{}, requestID string) { - go func() { - for { - select { - case <-funcDone: - extension.Log.Debug("Received signal that function has completed, not processing any more log events") - return - case logEvent := <-transport.LogsChannel: - extension.Log.Debugf("Received log event %v", logEvent.Type) - // Check the logEvent for runtimeDone and compare the RequestID - // to the id that came in via the Next API - if logEvent.Type == RuntimeDone { - if logEvent.Record.RequestId == requestID { - extension.Log.Info("Received runtimeDone event for this function invocation") - transport.RuntimeDoneSignal <- struct{}{} - return - } else { - extension.Log.Debug("Log API runtimeDone event request id didn't match") - } +// WaitRuntimeDone consumes events until a RuntimeDone event corresponding +// to requestID is received, or ctx is cancelled, and then returns. +func WaitRuntimeDone(ctx context.Context, requestID string, transport *LogsTransport) error { + for { + select { + case logEvent := <-transport.LogsChannel: + extension.Log.Debugf("Received log event %v", logEvent.Type) + // Check the logEvent for runtimeDone and compare the RequestID + // to the id that came in via the Next API + if logEvent.Type == RuntimeDone { + if logEvent.Record.RequestId == requestID { + extension.Log.Info("Received runtimeDone event for this function invocation") + transport.RuntimeDoneSignal <- struct{}{} + return nil + } else { + extension.Log.Debug("Log API runtimeDone event request id didn't match") } } + case <-ctx.Done(): + extension.Log.Debug("Current invocation over. Interrupting logs processing goroutine") + return nil } - }() + } } diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 7e729d62..42f4c2a1 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -39,9 +39,8 @@ var ( func main() { + // Global context ctx, cancel := context.WithCancel(context.Background()) - - // Trigger ctx.Done() in all relevant goroutines when main ends defer cancel() sigs := make(chan os.Signal, 1) @@ -70,18 +69,16 @@ func main() { } extension.Log.Debugf("Register response: %v", extension.PrettyPrint(res)) - // Init APM Server Transport struct + // Init APM Server Transport struct and start http server to receive data from agent apmServerTransport := extension.InitApmServerTransport(config) - - // Start http server to receive data from agent agentDataServer, err := extension.StartHttpServer(ctx, apmServerTransport) if err != nil { extension.Log.Errorf("Could not start APM data receiver : %v", err) } + defer agentDataServer.Close() // Use a wait group to ensure the background go routine sending to the APM server // completes before signaling that the extension is ready for the next invocation. - var backgroundDataSendWg sync.WaitGroup logsTransport, err := logsapi.Subscribe(ctx, extensionClient.ExtensionID, []logsapi.EventType{logsapi.Platform}) if err != nil { @@ -93,56 +90,9 @@ func main() { case <-ctx.Done(): return default: - // call Next method of extension API. This long polling HTTP method - // will block until there's an invocation of the function - extension.Log.Infof("Waiting for next event...") - event, err := extensionClient.NextEvent(ctx) - if err != nil { - status, err := extensionClient.ExitError(ctx, err.Error()) - if err != nil { - panic(err) - } - extension.Log.Errorf("Error: %s", err) - extension.Log.Infof("Exit signal sent to runtime : %s", status) - extension.Log.Infof("Exiting") - return - } - extension.Log.Debug("Received event.") - extension.Log.Debugf("%v", extension.PrettyPrint(event)) - - // Make a channel for signaling that the function invocation is complete - funcDone := make(chan struct{}) - - // A shutdown event indicates the execution environment is shutting down. - // This is usually due to inactivity. - if event.EventType == extension.Shutdown { - extension.ProcessShutdown(agentDataServer) - cancel() - return - } - - backgroundDataSendWg.Add(1) - extension.StartBackgroundApmDataForwarding(ctx, apmServerTransport, funcDone, &backgroundDataSendWg) - logsapi.StartBackgroundLogsProcessing(logsTransport, funcDone, event.RequestID) - - // Calculate how long to wait for a runtimeDoneSignal or AgentDoneSignal signal - flushDeadlineMs := event.DeadlineMs - 100 - durationUntilFlushDeadline := time.Until(time.Unix(flushDeadlineMs/1000, 0)) - - // Create a timer that expires after durationUntilFlushDeadline - timer := time.NewTimer(durationUntilFlushDeadline) - defer timer.Stop() - - select { - case <-apmServerTransport.AgentDoneSignal: - extension.Log.Debug("Received agent done signal") - case <-logsTransport.RuntimeDoneSignal: - extension.Log.Debug("Received runtimeDone signal") - case <-timer.C: - extension.Log.Info("Time expired waiting for agent signal or runtimeDone event") - } - - close(funcDone) + var backgroundDataSendWg sync.WaitGroup + processEvent(ctx, cancel, apmServerTransport, logsTransport, &backgroundDataSendWg) + extension.Log.Debug("Waiting for background data send to end") backgroundDataSendWg.Wait() if config.SendStrategy == extension.SyncFlush { // Flush APM data now that the function invocation has completed @@ -151,3 +101,63 @@ func main() { } } } + +func processEvent(ctx context.Context, cancel context.CancelFunc, apmServerTransport *extension.ApmServerTransport, logsTransport *logsapi.LogsTransport, backgroundDataSendWg *sync.WaitGroup) { + // Invocation context + invocationCtx, invocationCancel := context.WithCancel(ctx) + defer invocationCancel() + + // call Next method of extension API. This long polling HTTP method + // will block until there's an invocation of the function + extension.Log.Infof("Waiting for next event...") + event, err := extensionClient.NextEvent(ctx) + if err != nil { + status, err := extensionClient.ExitError(ctx, err.Error()) + if err != nil { + panic(err) + } + extension.Log.Errorf("Error: %s", err) + extension.Log.Infof("Exit signal sent to runtime : %s", status) + extension.Log.Infof("Exiting") + return + } + + extension.Log.Debug("Received event.") + extension.Log.Debugf("%v", extension.PrettyPrint(event)) + + if event.EventType == extension.Shutdown { + cancel() + return + } + + // APM Data Processing + backgroundDataSendWg.Add(1) + extension.StartBackgroundApmDataForwarding(invocationCtx, apmServerTransport, backgroundDataSendWg) + + // Lambda Service Logs Processing + runtimeDone := make(chan struct{}) + go func() { + if err := logsapi.WaitRuntimeDone(invocationCtx, event.RequestID, logsTransport); err != nil { + extension.Log.Errorf("Error while processing Lambda Logs ; %v", err) + } else { + close(runtimeDone) + } + }() + + // Calculate how long to wait for a runtimeDoneSignal or AgentDoneSignal signal + flushDeadlineMs := event.DeadlineMs - 100 + durationUntilFlushDeadline := time.Until(time.Unix(flushDeadlineMs/1000, 0)) + + // Create a timer that expires after durationUntilFlushDeadline + timer := time.NewTimer(durationUntilFlushDeadline) + defer timer.Stop() + + select { + case <-apmServerTransport.AgentDoneSignal: + extension.Log.Debug("Received agent done signal") + case <-logsTransport.RuntimeDoneSignal: + extension.Log.Debug("Received runtimeDone signal") + case <-timer.C: + extension.Log.Info("Time expired waiting for agent signal or runtimeDone event") + } +} From 08559157a8dba3f39fd50259214c32229edb77c7 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 00:17:35 +0200 Subject: [PATCH 07/20] Refactor flush and APM Data Forwarding --- apm-lambda-extension/extension/apm_server.go | 50 +++++++++++++------ .../extension/process_events.go | 22 -------- apm-lambda-extension/main.go | 6 ++- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index ef73a390..2fa5ed93 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -76,25 +76,43 @@ func InitApmServerTransport(config *extensionConfig) *ApmServerTransport { // StartBackgroundApmDataForwarding Receive agent data as it comes in and post it to the APM server. // Stop checking for, and sending agent data when the function invocation // has completed, signaled via a channel. -func StartBackgroundApmDataForwarding(ctx context.Context, transport *ApmServerTransport, backgroundDataSendWg *sync.WaitGroup) { - go func() { - defer backgroundDataSendWg.Done() - if transport.Status == Failing { - return +func (transport *ApmServerTransport) ForwardApmData(ctx context.Context, backgroundDataSendWg *sync.WaitGroup) error { + defer backgroundDataSendWg.Done() + if transport.Status == Failing { + return nil + } + for { + select { + case <-ctx.Done(): + Log.Debug("Invocation context cancelled, not processing any more agent data") + return nil + case agentData := <-transport.DataChannel: + if err := PostToApmServer(ctx, transport, agentData); err != nil { + return errors.New(fmt.Sprintf("Error sending to APM server, skipping: %v", err)) + } } - for { - select { - case <-ctx.Done(): - Log.Debug("Invocation context cancelled, not processing any more agent data") - return - case agentData := <-transport.DataChannel: - if err := PostToApmServer(ctx, transport, agentData); err != nil { - Log.Errorf("Error sending to APM server, skipping: %v", err) - return - } + } +} + +// FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. +func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { + if transport.Status == Failing { + Log.Debug("Flush skipped - Transport failing") + return + } + Log.Debug("Flush started - Checking for agent data") + for { + select { + case agentData := <-transport.DataChannel: + Log.Debug("Flush in progress - Processing agent data") + if err := PostToApmServer(ctx, transport, agentData); err != nil { + Log.Errorf("Error sending to APM server, skipping: %v", err) } + default: + Log.Debug("Flush ended - No agent data on buffer") + return } - }() + } } // PostToApmServer takes a chunk of APM agent data and posts it to the APM server. diff --git a/apm-lambda-extension/extension/process_events.go b/apm-lambda-extension/extension/process_events.go index bc5658df..e2388562 100644 --- a/apm-lambda-extension/extension/process_events.go +++ b/apm-lambda-extension/extension/process_events.go @@ -18,31 +18,9 @@ package extension import ( - "context" "encoding/json" ) -// FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. -func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { - if transport.Status == Failing { - Log.Debug("Flush skipped - Transport failing") - return - } - Log.Debug("Flush started - Checking for agent data") - for { - select { - case agentData := <-transport.DataChannel: - Log.Debug("Flush in progress - Processing agent data") - if err := PostToApmServer(ctx, transport, agentData); err != nil { - Log.Errorf("Error sending to APM server, skipping: %v", err) - } - default: - Log.Debug("Flush ended - No agent data on buffer") - return - } - } -} - // PrettyPrint prints formatted, legible json data. func PrettyPrint(v interface{}) string { data, err := json.MarshalIndent(v, "", "\t") diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 42f4c2a1..992d2fac 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -132,7 +132,11 @@ func processEvent(ctx context.Context, cancel context.CancelFunc, apmServerTrans // APM Data Processing backgroundDataSendWg.Add(1) - extension.StartBackgroundApmDataForwarding(invocationCtx, apmServerTransport, backgroundDataSendWg) + go func() { + if err := apmServerTransport.ForwardApmData(invocationCtx, backgroundDataSendWg); err != nil { + extension.Log.Error(err) + } + }() // Lambda Service Logs Processing runtimeDone := make(chan struct{}) From f7972f04d355f46ada660200159399e12655f667 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 00:23:30 +0200 Subject: [PATCH 08/20] Fix linting --- apm-lambda-extension/extension/apm_server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index 2fa5ed93..a87ca12e 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -88,7 +88,7 @@ func (transport *ApmServerTransport) ForwardApmData(ctx context.Context, backgro return nil case agentData := <-transport.DataChannel: if err := PostToApmServer(ctx, transport, agentData); err != nil { - return errors.New(fmt.Sprintf("Error sending to APM server, skipping: %v", err)) + return fmt.Errorf("error sending to APM server, skipping: %v", err) } } } From 63cd01e2a32772463fa6a72465895db308dd1b4d Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 00:31:26 +0200 Subject: [PATCH 09/20] Refactor unnecessarily exported fields --- apm-lambda-extension/extension/apm_server.go | 58 ++++++++-------- .../extension/apm_server_test.go | 68 +++++++++---------- apm-lambda-extension/extension/http_server.go | 2 +- .../extension/http_server_test.go | 4 +- .../extension/route_handlers.go | 8 +-- .../logsapi/route_handlers.go | 2 +- apm-lambda-extension/logsapi/subscribe.go | 32 ++++----- .../logsapi/subscribe_test.go | 12 ++-- 8 files changed, 93 insertions(+), 93 deletions(-) diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index a87ca12e..07559ed8 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -49,11 +49,11 @@ type ApmServerTransport struct { bufferPool sync.Pool config *extensionConfig AgentDoneSignal chan struct{} - DataChannel chan AgentData - Client *http.Client - Status ApmServerTransportStatusType - ReconnectionCount int - GracePeriodTimer *time.Timer + dataChannel chan AgentData + client *http.Client + status ApmServerTransportStatusType + reconnectionCount int + gracePeriodTimer *time.Timer } func InitApmServerTransport(config *extensionConfig) *ApmServerTransport { @@ -62,14 +62,14 @@ func InitApmServerTransport(config *extensionConfig) *ApmServerTransport { return &bytes.Buffer{} }} transport.AgentDoneSignal = make(chan struct{}, 1) - transport.DataChannel = make(chan AgentData, 100) - transport.Client = &http.Client{ + transport.dataChannel = make(chan AgentData, 100) + transport.client = &http.Client{ Timeout: time.Duration(config.DataForwarderTimeoutSeconds) * time.Second, Transport: http.DefaultTransport.(*http.Transport).Clone(), } transport.config = config - transport.Status = Healthy - transport.ReconnectionCount = -1 + transport.status = Healthy + transport.reconnectionCount = -1 return &transport } @@ -78,7 +78,7 @@ func InitApmServerTransport(config *extensionConfig) *ApmServerTransport { // has completed, signaled via a channel. func (transport *ApmServerTransport) ForwardApmData(ctx context.Context, backgroundDataSendWg *sync.WaitGroup) error { defer backgroundDataSendWg.Done() - if transport.Status == Failing { + if transport.status == Failing { return nil } for { @@ -86,7 +86,7 @@ func (transport *ApmServerTransport) ForwardApmData(ctx context.Context, backgro case <-ctx.Done(): Log.Debug("Invocation context cancelled, not processing any more agent data") return nil - case agentData := <-transport.DataChannel: + case agentData := <-transport.dataChannel: if err := PostToApmServer(ctx, transport, agentData); err != nil { return fmt.Errorf("error sending to APM server, skipping: %v", err) } @@ -96,14 +96,14 @@ func (transport *ApmServerTransport) ForwardApmData(ctx context.Context, backgro // FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { - if transport.Status == Failing { + if transport.status == Failing { Log.Debug("Flush skipped - Transport failing") return } Log.Debug("Flush started - Checking for agent data") for { select { - case agentData := <-transport.DataChannel: + case agentData := <-transport.dataChannel: Log.Debug("Flush in progress - Processing agent data") if err := PostToApmServer(ctx, transport, agentData); err != nil { Log.Errorf("Error sending to APM server, skipping: %v", err) @@ -123,7 +123,7 @@ func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { func PostToApmServer(ctx context.Context, transport *ApmServerTransport, agentData AgentData) error { // todo: can this be a streaming or streaming style call that keeps the // connection open across invocations? - if transport.Status == Failing { + if transport.status == Failing { return errors.New("transport status is unhealthy") } @@ -165,8 +165,8 @@ func PostToApmServer(ctx context.Context, transport *ApmServerTransport, agentDa req.Header.Add("Authorization", "Bearer "+transport.config.apmServerSecretToken) } - Log.Debug("Sending data chunk to APM Server") - resp, err := transport.Client.Do(req) + Log.Debug("Sending data chunk to APM server") + resp, err := transport.client.Do(req) if err != nil { SetApmServerTransportState(ctx, transport, Failing) return fmt.Errorf("failed to post to APM server: %v", err) @@ -198,36 +198,36 @@ func SetApmServerTransportState(ctx context.Context, transport *ApmServerTranspo switch status { case Healthy: transport.Lock() - transport.Status = status - Log.Debugf("APM Server Transport status set to %s", transport.Status) - transport.ReconnectionCount = -1 + transport.status = status + Log.Debugf("APM server Transport status set to %s", transport.status) + transport.reconnectionCount = -1 transport.Unlock() case Failing: transport.Lock() - transport.Status = status - Log.Debugf("APM Server Transport status set to %s", transport.Status) - transport.ReconnectionCount++ - transport.GracePeriodTimer = time.NewTimer(computeGracePeriod(transport)) - Log.Debugf("Grace period entered, reconnection count : %d", transport.ReconnectionCount) + transport.status = status + Log.Debugf("APM server Transport status set to %s", transport.status) + transport.reconnectionCount++ + transport.gracePeriodTimer = time.NewTimer(computeGracePeriod(transport)) + Log.Debugf("Grace period entered, reconnection count : %d", transport.reconnectionCount) go func() { select { - case <-transport.GracePeriodTimer.C: + case <-transport.gracePeriodTimer.C: Log.Debug("Grace period over - timer timed out") case <-ctx.Done(): Log.Debug("Grace period over - context done") } - transport.Status = Pending - Log.Debugf("APM Server Transport status set to %s", transport.Status) + transport.status = Pending + Log.Debugf("APM server Transport status set to %s", transport.status) transport.Unlock() }() default: - Log.Errorf("Cannot set APM Server Transport status to %s", status) + Log.Errorf("Cannot set APM server Transport status to %s", status) } } // ComputeGracePeriod https://github.com/elastic/apm/blob/main/specs/agents/transport.md#transport-errors func computeGracePeriod(transport *ApmServerTransport) time.Duration { - gracePeriodWithoutJitter := math.Pow(math.Min(float64(transport.ReconnectionCount), 6), 2) + gracePeriodWithoutJitter := math.Pow(math.Min(float64(transport.reconnectionCount), 6), 2) jitter := rand.Float64()/5 - 0.1 return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second)) } diff --git a/apm-lambda-extension/extension/apm_server_test.go b/apm-lambda-extension/extension/apm_server_test.go index e2e57338..a347b06c 100644 --- a/apm-lambda-extension/extension/apm_server_test.go +++ b/apm-lambda-extension/extension/apm_server_test.go @@ -122,35 +122,35 @@ func TestPostToApmServerDataNotCompressed(t *testing.T) { func TestGracePeriod(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) - transport.ReconnectionCount = 0 + transport.reconnectionCount = 0 val0 := computeGracePeriod(transport).Seconds() assert.Equal(t, val0, float64(0)) - transport.ReconnectionCount = 1 + transport.reconnectionCount = 1 val1 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val1, float64(1), 0.1*1) - transport.ReconnectionCount = 2 + transport.reconnectionCount = 2 val2 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val2, float64(4), 0.1*4) - transport.ReconnectionCount = 3 + transport.reconnectionCount = 3 val3 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val3, float64(9), 0.1*9) - transport.ReconnectionCount = 4 + transport.reconnectionCount = 4 val4 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val4, float64(16), 0.1*16) - transport.ReconnectionCount = 5 + transport.reconnectionCount = 5 val5 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val5, float64(25), 0.1*25) - transport.ReconnectionCount = 6 + transport.reconnectionCount = 6 val6 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val6, float64(36), 0.1*36) - transport.ReconnectionCount = 7 + transport.reconnectionCount = 7 val7 := computeGracePeriod(transport).Seconds() assert.InDelta(t, val7, float64(36), 0.1*36) } @@ -158,18 +158,18 @@ func TestGracePeriod(t *testing.T) { func TestSetHealthyTransport(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) SetApmServerTransportState(context.Background(), transport, Healthy) - assert.True(t, transport.Status == Healthy) - assert.Equal(t, transport.ReconnectionCount, -1) + assert.True(t, transport.status == Healthy) + assert.Equal(t, transport.reconnectionCount, -1) } func TestSetFailingTransport(t *testing.T) { // By explicitly setting the reconnection count to 0, we ensure that the grace period will not be 0 // and avoid a race between reaching the pending status and the test assertion. transport := InitApmServerTransport(&extensionConfig{}) - transport.ReconnectionCount = 0 + transport.reconnectionCount = 0 SetApmServerTransportState(context.Background(), transport, Failing) - assert.True(t, transport.Status == Failing) - assert.Equal(t, transport.ReconnectionCount, 1) + assert.True(t, transport.status == Failing) + assert.Equal(t, transport.reconnectionCount, 1) } func TestSetPendingTransport(t *testing.T) { @@ -177,28 +177,28 @@ func TestSetPendingTransport(t *testing.T) { SetApmServerTransportState(context.Background(), transport, Healthy) SetApmServerTransportState(context.Background(), transport, Failing) for { - if transport.Status != Failing { + if transport.status != Failing { break } } - assert.True(t, transport.Status == Pending) - assert.Equal(t, transport.ReconnectionCount, 0) + assert.True(t, transport.status == Pending) + assert.Equal(t, transport.reconnectionCount, 0) } func TestSetPendingTransportExplicitly(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) SetApmServerTransportState(context.Background(), transport, Healthy) SetApmServerTransportState(context.Background(), transport, Pending) - assert.True(t, transport.Status == Healthy) - assert.Equal(t, transport.ReconnectionCount, -1) + assert.True(t, transport.status == Healthy) + assert.Equal(t, transport.reconnectionCount, -1) } func TestSetInvalidTransport(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) SetApmServerTransportState(context.Background(), transport, Healthy) SetApmServerTransportState(context.Background(), transport, "Invalid") - assert.True(t, transport.Status == Healthy) - assert.Equal(t, transport.ReconnectionCount, -1) + assert.True(t, transport.status == Healthy) + assert.Equal(t, transport.reconnectionCount, -1) } func TestEnterBackoffFromHealthy(t *testing.T) { @@ -246,8 +246,8 @@ func TestEnterBackoffFromHealthy(t *testing.T) { return } // No way to know for sure if failing or pending (0 sec grace period) - assert.True(t, transport.Status != Healthy) - assert.Equal(t, transport.ReconnectionCount, 0) + assert.True(t, transport.status != Healthy) + assert.Equal(t, transport.reconnectionCount, 0) } func TestEnterBackoffFromFailing(t *testing.T) { @@ -294,15 +294,15 @@ func TestEnterBackoffFromFailing(t *testing.T) { SetApmServerTransportState(context.Background(), transport, Healthy) SetApmServerTransportState(context.Background(), transport, Failing) for { - if transport.Status != Failing { + if transport.status != Failing { break } } - assert.Equal(t, transport.Status, Pending) + assert.Equal(t, transport.status, Pending) assert.Error(t, PostToApmServer(context.Background(), transport, agentData)) - assert.Equal(t, transport.Status, Failing) - assert.Equal(t, transport.ReconnectionCount, 1) + assert.Equal(t, transport.status, Failing) + assert.Equal(t, transport.reconnectionCount, 1) } func TestAPMServerRecovery(t *testing.T) { @@ -347,15 +347,15 @@ func TestAPMServerRecovery(t *testing.T) { SetApmServerTransportState(context.Background(), transport, Healthy) SetApmServerTransportState(context.Background(), transport, Failing) for { - if transport.Status != Failing { + if transport.status != Failing { break } } - assert.Equal(t, transport.Status, Pending) + assert.Equal(t, transport.status, Pending) assert.NoError(t, PostToApmServer(context.Background(), transport, agentData)) - assert.Equal(t, transport.Status, Healthy) - assert.Equal(t, transport.ReconnectionCount, -1) + assert.Equal(t, transport.status, Healthy) + assert.Equal(t, transport.reconnectionCount, -1) } func TestContinuedAPMServerFailure(t *testing.T) { @@ -400,14 +400,14 @@ func TestContinuedAPMServerFailure(t *testing.T) { SetApmServerTransportState(context.Background(), transport, Healthy) SetApmServerTransportState(context.Background(), transport, Failing) for { - if transport.Status != Failing { + if transport.status != Failing { break } } - assert.Equal(t, transport.Status, Pending) + assert.Equal(t, transport.status, Pending) assert.Error(t, PostToApmServer(context.Background(), transport, agentData)) - assert.Equal(t, transport.Status, Failing) - assert.Equal(t, transport.ReconnectionCount, 1) + assert.Equal(t, transport.status, Failing) + assert.Equal(t, transport.reconnectionCount, 1) } func BenchmarkPostToAPM(b *testing.B) { diff --git a/apm-lambda-extension/extension/http_server.go b/apm-lambda-extension/extension/http_server.go index d961559f..6794f5ff 100644 --- a/apm-lambda-extension/extension/http_server.go +++ b/apm-lambda-extension/extension/http_server.go @@ -46,7 +46,7 @@ func StartHttpServer(ctx context.Context, transport *ApmServerTransport) (agentD go func() { Log.Infof("Extension listening for apm data on %s", server.Addr) if err = server.Serve(ln); err != nil { - if err.Error() == "http: Server closed" { + if err.Error() == "http: server closed" { Log.Debug(err) } else { Log.Errorf("Error upon APM data server start : %v", err) diff --git a/apm-lambda-extension/extension/http_server_test.go b/apm-lambda-extension/extension/http_server_test.go index e9d2236e..67f53c6a 100644 --- a/apm-lambda-extension/extension/http_server_test.go +++ b/apm-lambda-extension/extension/http_server_test.go @@ -248,7 +248,7 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { select { case <-transport.AgentDoneSignal: - <-transport.DataChannel + <-transport.dataChannel case <-timer.C: t.Log("Timed out waiting for server to send FuncDone signal") t.Fail() @@ -294,7 +294,7 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { t.Logf("Error fetching %s, [%v]", agentDataServer.Addr, err) t.Fail() } - <-transport.DataChannel + <-transport.dataChannel assert.Equal(t, 202, resp.StatusCode) } diff --git a/apm-lambda-extension/extension/route_handlers.go b/apm-lambda-extension/extension/route_handlers.go index b4d5262a..089ef16d 100644 --- a/apm-lambda-extension/extension/route_handlers.go +++ b/apm-lambda-extension/extension/route_handlers.go @@ -38,7 +38,7 @@ var currContext context.Context func handleInfoRequest(ctx context.Context, apmServerTransport *ApmServerTransport) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { - Log.Debug("Handling APM Server Info Request") + Log.Debug("Handling APM server Info Request") // Init reverse proxy parsedApmServerUrl, err := url.Parse(apmServerTransport.config.apmServerUrl) @@ -59,7 +59,7 @@ func handleInfoRequest(ctx context.Context, apmServerTransport *ApmServerTranspo reverseProxy.ErrorHandler = reverseProxyErrorHandler // Process request (the Golang doc suggests removing any pre-existing X-Forwarded-For header coming - // from the Client or an untrusted proxy to prevent IP spoofing : https://pkg.go.dev/net/http/httputil#ReverseProxy + // from the client or an untrusted proxy to prevent IP spoofing : https://pkg.go.dev/net/http/httputil#ReverseProxy r.Header.Del("X-Forwarded-For") // Update headers to allow for SSL redirection @@ -75,7 +75,7 @@ func handleInfoRequest(ctx context.Context, apmServerTransport *ApmServerTranspo func reverseProxyErrorHandler(res http.ResponseWriter, req *http.Request, err error) { SetApmServerTransportState(currContext, currApmServerTransport, Failing) - Log.Errorf("Error querying version from the APM Server: %v", err) + Log.Errorf("Error querying version from the APM server: %v", err) } // URL: http://server/intake/v2/events @@ -97,7 +97,7 @@ func handleIntakeV2Events(transport *ApmServerTransport) func(w http.ResponseWri ContentEncoding: r.Header.Get("Content-Encoding"), } - EnqueueAPMData(transport.DataChannel, agentData) + EnqueueAPMData(transport.dataChannel, agentData) } if len(r.URL.Query()["flushed"]) > 0 && r.URL.Query()["flushed"][0] == "true" { diff --git a/apm-lambda-extension/logsapi/route_handlers.go b/apm-lambda-extension/logsapi/route_handlers.go index e2857a12..3da4c316 100644 --- a/apm-lambda-extension/logsapi/route_handlers.go +++ b/apm-lambda-extension/logsapi/route_handlers.go @@ -40,7 +40,7 @@ func handleLogEventsRequest(transport *LogsTransport) func(w http.ResponseWriter w.WriteHeader(http.StatusInternalServerError) continue } - transport.LogsChannel <- logEvents[idx] + transport.logsChannel <- logEvents[idx] } } } diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index 6f578783..6aea34ea 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -33,17 +33,17 @@ import ( var TestListenerAddr net.Addr type LogsTransport struct { - LogsChannel chan LogEvent - Listener net.Listener - ListenerHost string + logsChannel chan LogEvent + listener net.Listener + listenerHost string RuntimeDoneSignal chan struct{} - Server *http.Server + server *http.Server } func InitLogsTransport() *LogsTransport { var transport LogsTransport - transport.ListenerHost = "localhost" - transport.LogsChannel = make(chan LogEvent, 100) + transport.listenerHost = "localhost" + transport.logsChannel = make(chan LogEvent, 100) transport.RuntimeDoneSignal = make(chan struct{}, 1) return &transport } @@ -76,8 +76,8 @@ func subscribe(transport *LogsTransport, extensionID string, eventTypes []EventT return err } - _, port, _ := net.SplitHostPort(transport.Listener.Addr().String()) - _, err = logsAPIClient.Subscribe(eventTypes, URI("http://"+transport.ListenerHost+":"+port), extensionID) + _, port, _ := net.SplitHostPort(transport.listener.Addr().String()) + _, err = logsAPIClient.Subscribe(eventTypes, URI("http://"+transport.listenerHost+":"+port), extensionID) return err } @@ -87,7 +87,7 @@ func Subscribe(ctx context.Context, extensionID string, eventTypes []EventType) return nil, errors.New("Detected sam local environment") } - // Init APM Server Transport struct + // Init APM server Transport struct // Make channel for collecting logs and create a HTTP server to listen for them transport = InitLogsTransport() @@ -106,25 +106,25 @@ func startHTTPServer(ctx context.Context, transport *LogsTransport) error { mux.HandleFunc("/", handleLogEventsRequest(transport)) var err error - transport.Server = &http.Server{ + transport.server = &http.Server{ Handler: mux, } - if transport.Listener, err = net.Listen("tcp", transport.ListenerHost+":0"); err != nil { + if transport.listener, err = net.Listen("tcp", transport.listenerHost+":0"); err != nil { return err } - TestListenerAddr = transport.Listener.Addr() + TestListenerAddr = transport.listener.Addr() go func() { - extension.Log.Infof("Extension listening for Lambda Logs API events on %s", transport.Listener.Addr().String()) - if err = transport.Server.Serve(transport.Listener); err != nil { + extension.Log.Infof("Extension listening for Lambda Logs API events on %s", transport.listener.Addr().String()) + if err = transport.server.Serve(transport.listener); err != nil { extension.Log.Errorf("Error upon Logs API server start : %v", err) } }() go func() { <-ctx.Done() - transport.Server.Close() + transport.server.Close() }() return nil @@ -144,7 +144,7 @@ func checkAWSSamLocal() bool { func WaitRuntimeDone(ctx context.Context, requestID string, transport *LogsTransport) error { for { select { - case logEvent := <-transport.LogsChannel: + case logEvent := <-transport.logsChannel: extension.Log.Debugf("Received log event %v", logEvent.Type) // Check the logEvent for runtimeDone and compare the RequestID // to the id that came in via the Next API diff --git a/apm-lambda-extension/logsapi/subscribe_test.go b/apm-lambda-extension/logsapi/subscribe_test.go index 80ad1b9f..450f32f9 100644 --- a/apm-lambda-extension/logsapi/subscribe_test.go +++ b/apm-lambda-extension/logsapi/subscribe_test.go @@ -77,7 +77,7 @@ func TestSubscribeAWSRequest(t *testing.T) { t.Fail() return } - defer transport.Server.Close() + defer transport.server.Close() // Create a request to send to the logs listener platformDoneEvent := `{ @@ -89,7 +89,7 @@ func TestSubscribeAWSRequest(t *testing.T) { } }` body := []byte(`[` + platformDoneEvent + `]`) - url := "http://" + transport.Listener.Addr().String() + url := "http://" + transport.listener.Addr().String() req, err := http.NewRequest("GET", url, bytes.NewReader(body)) if err != nil { t.Log("Could not create request") @@ -101,7 +101,7 @@ func TestSubscribeAWSRequest(t *testing.T) { t.Logf("Error fetching %s, [%v]", url, err) t.Fail() } - event := <-transport.LogsChannel + event := <-transport.logsChannel assert.Equal(t, event.Record.RequestId, "6f7f0961f83442118a7af6fe80b88") } @@ -117,18 +117,18 @@ func TestSubscribeWithBadLogsRequest(t *testing.T) { } // Subscribe to the logs api and start the http server listening for events - transport, err := Subscribe(context.Background(), "testID", []EventType{Platform}); + transport, err := Subscribe(context.Background(), "testID", []EventType{Platform}) if err != nil { t.Logf("Error subscribing, %v", err) t.Fail() return } - defer transport.Server.Close() + defer transport.server.Close() // Create a request to send to the logs listener logEvent := `{"invalid": "json"}` body := []byte(`[` + logEvent + `]`) - url := "http://" + transport.Listener.Addr().String() + url := "http://" + transport.listener.Addr().String() req, err := http.NewRequest("GET", url, bytes.NewReader(body)) if err != nil { t.Log("Could not create request") From 3c198fcc4d058aef633ff2cc23ca46dde06b5942 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 11:28:13 +0200 Subject: [PATCH 10/20] Remove RuntimeDone --- apm-lambda-extension/logsapi/subscribe.go | 14 ++++++-------- apm-lambda-extension/main.go | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index 6aea34ea..fa043a2c 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -33,18 +33,16 @@ import ( var TestListenerAddr net.Addr type LogsTransport struct { - logsChannel chan LogEvent - listener net.Listener - listenerHost string - RuntimeDoneSignal chan struct{} - server *http.Server + logsChannel chan LogEvent + listener net.Listener + listenerHost string + server *http.Server } func InitLogsTransport() *LogsTransport { var transport LogsTransport transport.listenerHost = "localhost" transport.logsChannel = make(chan LogEvent, 100) - transport.RuntimeDoneSignal = make(chan struct{}, 1) return &transport } @@ -141,7 +139,7 @@ func checkAWSSamLocal() bool { // WaitRuntimeDone consumes events until a RuntimeDone event corresponding // to requestID is received, or ctx is cancelled, and then returns. -func WaitRuntimeDone(ctx context.Context, requestID string, transport *LogsTransport) error { +func WaitRuntimeDone(ctx context.Context, requestID string, transport *LogsTransport, runtimeDoneSignal chan struct{}) error { for { select { case logEvent := <-transport.logsChannel: @@ -151,7 +149,7 @@ func WaitRuntimeDone(ctx context.Context, requestID string, transport *LogsTrans if logEvent.Type == RuntimeDone { if logEvent.Record.RequestId == requestID { extension.Log.Info("Received runtimeDone event for this function invocation") - transport.RuntimeDoneSignal <- struct{}{} + runtimeDoneSignal <- struct{}{} return nil } else { extension.Log.Debug("Log API runtimeDone event request id didn't match") diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 992d2fac..2e94a900 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -141,7 +141,7 @@ func processEvent(ctx context.Context, cancel context.CancelFunc, apmServerTrans // Lambda Service Logs Processing runtimeDone := make(chan struct{}) go func() { - if err := logsapi.WaitRuntimeDone(invocationCtx, event.RequestID, logsTransport); err != nil { + if err := logsapi.WaitRuntimeDone(invocationCtx, event.RequestID, logsTransport, runtimeDone); err != nil { extension.Log.Errorf("Error while processing Lambda Logs ; %v", err) } else { close(runtimeDone) From b60383ae1345c322cbea354e1b5f402add960c2f Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 11:29:13 +0200 Subject: [PATCH 11/20] Refactor runtimeDone in main --- apm-lambda-extension/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 2e94a900..8b3297a7 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -159,7 +159,7 @@ func processEvent(ctx context.Context, cancel context.CancelFunc, apmServerTrans select { case <-apmServerTransport.AgentDoneSignal: extension.Log.Debug("Received agent done signal") - case <-logsTransport.RuntimeDoneSignal: + case <-runtimeDone: extension.Log.Debug("Received runtimeDone signal") case <-timer.C: extension.Log.Info("Time expired waiting for agent signal or runtimeDone event") From 2f0825d9b5d315caf808846e04005d7599c86ec5 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 11:34:32 +0200 Subject: [PATCH 12/20] Keep waitgroup interactions in data forwarding goroutine --- apm-lambda-extension/extension/apm_server.go | 3 +-- apm-lambda-extension/main.go | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server.go index 07559ed8..f83e52b6 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server.go @@ -76,8 +76,7 @@ func InitApmServerTransport(config *extensionConfig) *ApmServerTransport { // StartBackgroundApmDataForwarding Receive agent data as it comes in and post it to the APM server. // Stop checking for, and sending agent data when the function invocation // has completed, signaled via a channel. -func (transport *ApmServerTransport) ForwardApmData(ctx context.Context, backgroundDataSendWg *sync.WaitGroup) error { - defer backgroundDataSendWg.Done() +func (transport *ApmServerTransport) ForwardApmData(ctx context.Context) error { if transport.status == Failing { return nil } diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 8b3297a7..6a603040 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -133,7 +133,8 @@ func processEvent(ctx context.Context, cancel context.CancelFunc, apmServerTrans // APM Data Processing backgroundDataSendWg.Add(1) go func() { - if err := apmServerTransport.ForwardApmData(invocationCtx, backgroundDataSendWg); err != nil { + defer backgroundDataSendWg.Done() + if err := apmServerTransport.ForwardApmData(invocationCtx); err != nil { extension.Log.Error(err) } }() From cb0f18929f531b1c9c110fdeddd11638d8082c98 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 11:43:26 +0200 Subject: [PATCH 13/20] Additional comments --- apm-lambda-extension/extension/{process_events.go => util.go} | 0 apm-lambda-extension/logsapi/subscribe.go | 1 + 2 files changed, 1 insertion(+) rename apm-lambda-extension/extension/{process_events.go => util.go} (100%) diff --git a/apm-lambda-extension/extension/process_events.go b/apm-lambda-extension/extension/util.go similarity index 100% rename from apm-lambda-extension/extension/process_events.go rename to apm-lambda-extension/extension/util.go diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index fa043a2c..5a9b9523 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -29,6 +29,7 @@ import ( "github.com/pkg/errors" ) +// TODO: Remove global variable and find another way to retrieve Logs Listener network info when testing main // TestListenerAddr For e2e testing purposes var TestListenerAddr net.Addr From 457e2475aed380cc718a28b3c56f86d081663db7 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 11:50:31 +0200 Subject: [PATCH 14/20] Refactor reverse proxy error handler --- apm-lambda-extension/extension/route_handlers.go | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/apm-lambda-extension/extension/route_handlers.go b/apm-lambda-extension/extension/route_handlers.go index 089ef16d..776a4fe8 100644 --- a/apm-lambda-extension/extension/route_handlers.go +++ b/apm-lambda-extension/extension/route_handlers.go @@ -31,9 +31,6 @@ type AgentData struct { ContentEncoding string } -var currApmServerTransport *ApmServerTransport -var currContext context.Context - // URL: http://server/ func handleInfoRequest(ctx context.Context, apmServerTransport *ApmServerTransport) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { @@ -54,9 +51,10 @@ func handleInfoRequest(ctx context.Context, apmServerTransport *ApmServerTranspo customTransport.ResponseHeaderTimeout = reverseProxyTimeout reverseProxy.Transport = customTransport - currApmServerTransport = apmServerTransport - currContext = ctx - reverseProxy.ErrorHandler = reverseProxyErrorHandler + reverseProxy.ErrorHandler = func(w http.ResponseWriter, r *http.Request, err error) { + SetApmServerTransportState(ctx, apmServerTransport, Failing) + Log.Errorf("Error querying version from the APM server: %v", err) + } // Process request (the Golang doc suggests removing any pre-existing X-Forwarded-For header coming // from the client or an untrusted proxy to prevent IP spoofing : https://pkg.go.dev/net/http/httputil#ReverseProxy @@ -73,11 +71,6 @@ func handleInfoRequest(ctx context.Context, apmServerTransport *ApmServerTranspo } } -func reverseProxyErrorHandler(res http.ResponseWriter, req *http.Request, err error) { - SetApmServerTransportState(currContext, currApmServerTransport, Failing) - Log.Errorf("Error querying version from the APM server: %v", err) -} - // URL: http://server/intake/v2/events func handleIntakeV2Events(transport *ApmServerTransport) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { From d71ccfb98277e9d98fd558d9fd3be9cf796a6515 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 13:07:14 +0200 Subject: [PATCH 15/20] Refactor APM server transport-related functions --- ...{apm_server.go => apm_server_transport.go} | 24 ++++---- ...r_test.go => apm_server_transport_test.go} | 60 +++++++++---------- .../extension/route_handlers.go | 4 +- apm-lambda-extension/main.go | 2 +- 4 files changed, 45 insertions(+), 45 deletions(-) rename apm-lambda-extension/extension/{apm_server.go => apm_server_transport.go} (89%) rename apm-lambda-extension/extension/{apm_server_test.go => apm_server_transport_test.go} (89%) diff --git a/apm-lambda-extension/extension/apm_server.go b/apm-lambda-extension/extension/apm_server_transport.go similarity index 89% rename from apm-lambda-extension/extension/apm_server.go rename to apm-lambda-extension/extension/apm_server_transport.go index f83e52b6..1d569999 100644 --- a/apm-lambda-extension/extension/apm_server.go +++ b/apm-lambda-extension/extension/apm_server_transport.go @@ -86,7 +86,7 @@ func (transport *ApmServerTransport) ForwardApmData(ctx context.Context) error { Log.Debug("Invocation context cancelled, not processing any more agent data") return nil case agentData := <-transport.dataChannel: - if err := PostToApmServer(ctx, transport, agentData); err != nil { + if err := transport.PostToApmServer(ctx, agentData); err != nil { return fmt.Errorf("error sending to APM server, skipping: %v", err) } } @@ -94,7 +94,7 @@ func (transport *ApmServerTransport) ForwardApmData(ctx context.Context) error { } // FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. -func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { +func (transport *ApmServerTransport) FlushAPMData(ctx context.Context) { if transport.status == Failing { Log.Debug("Flush skipped - Transport failing") return @@ -104,7 +104,7 @@ func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { select { case agentData := <-transport.dataChannel: Log.Debug("Flush in progress - Processing agent data") - if err := PostToApmServer(ctx, transport, agentData); err != nil { + if err := transport.PostToApmServer(ctx, agentData); err != nil { Log.Errorf("Error sending to APM server, skipping: %v", err) } default: @@ -119,7 +119,7 @@ func FlushAPMData(ctx context.Context, transport *ApmServerTransport) { // The function compresses the APM agent data, if it's not already compressed. // It sets the APM transport status to failing upon errors, as part of the backoff // strategy. -func PostToApmServer(ctx context.Context, transport *ApmServerTransport, agentData AgentData) error { +func (transport *ApmServerTransport) PostToApmServer(ctx context.Context, agentData AgentData) error { // todo: can this be a streaming or streaming style call that keeps the // connection open across invocations? if transport.status == Failing { @@ -167,7 +167,7 @@ func PostToApmServer(ctx context.Context, transport *ApmServerTransport, agentDa Log.Debug("Sending data chunk to APM server") resp, err := transport.client.Do(req) if err != nil { - SetApmServerTransportState(ctx, transport, Failing) + transport.SetApmServerTransportState(ctx, Failing) return fmt.Errorf("failed to post to APM server: %v", err) } @@ -175,11 +175,11 @@ func PostToApmServer(ctx context.Context, transport *ApmServerTransport, agentDa defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { - SetApmServerTransportState(ctx, transport, Failing) + transport.SetApmServerTransportState(ctx, Failing) return fmt.Errorf("failed to read the response body after posting to the APM server") } - SetApmServerTransportState(ctx, transport, Healthy) + transport.SetApmServerTransportState(ctx, Healthy) Log.Debug("Transport status set to healthy") Log.Debugf("APM server response body: %v", string(body)) Log.Debugf("APM server response status code: %v", resp.StatusCode) @@ -193,7 +193,7 @@ func PostToApmServer(ctx context.Context, transport *ApmServerTransport, agentDa // to the APM server. // // This function is public for use in tests. -func SetApmServerTransportState(ctx context.Context, transport *ApmServerTransport, status ApmServerTransportStatusType) { +func (transport *ApmServerTransport) SetApmServerTransportState(ctx context.Context, status ApmServerTransportStatusType) { switch status { case Healthy: transport.Lock() @@ -206,7 +206,7 @@ func SetApmServerTransportState(ctx context.Context, transport *ApmServerTranspo transport.status = status Log.Debugf("APM server Transport status set to %s", transport.status) transport.reconnectionCount++ - transport.gracePeriodTimer = time.NewTimer(computeGracePeriod(transport)) + transport.gracePeriodTimer = time.NewTimer(transport.computeGracePeriod()) Log.Debugf("Grace period entered, reconnection count : %d", transport.reconnectionCount) go func() { select { @@ -225,7 +225,7 @@ func SetApmServerTransportState(ctx context.Context, transport *ApmServerTranspo } // ComputeGracePeriod https://github.com/elastic/apm/blob/main/specs/agents/transport.md#transport-errors -func computeGracePeriod(transport *ApmServerTransport) time.Duration { +func (transport *ApmServerTransport) computeGracePeriod() time.Duration { gracePeriodWithoutJitter := math.Pow(math.Min(float64(transport.reconnectionCount), 6), 2) jitter := rand.Float64()/5 - 0.1 return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second)) @@ -233,9 +233,9 @@ func computeGracePeriod(transport *ApmServerTransport) time.Duration { // EnqueueAPMData adds a AgentData struct to the agent data channel, effectively queueing for a send // to the APM server. -func EnqueueAPMData(agentDataChannel chan AgentData, agentData AgentData) { +func (transport *ApmServerTransport) EnqueueAPMData(agentData AgentData) { select { - case agentDataChannel <- agentData: + case transport.dataChannel <- agentData: Log.Debug("Adding agent data to buffer to be sent to apm server") default: Log.Warn("Channel full: dropping a subset of agent data") diff --git a/apm-lambda-extension/extension/apm_server_test.go b/apm-lambda-extension/extension/apm_server_transport_test.go similarity index 89% rename from apm-lambda-extension/extension/apm_server_test.go rename to apm-lambda-extension/extension/apm_server_transport_test.go index a347b06c..5362e588 100644 --- a/apm-lambda-extension/extension/apm_server_test.go +++ b/apm-lambda-extension/extension/apm_server_transport_test.go @@ -70,7 +70,7 @@ func TestPostToApmServerDataCompressed(t *testing.T) { apmServerUrl: apmServer.URL + "/", } transport := InitApmServerTransport(&config) - err := PostToApmServer(context.Background(), transport, agentData) + err := transport.PostToApmServer(context.Background(), agentData) assert.Equal(t, nil, err) } @@ -115,7 +115,7 @@ func TestPostToApmServerDataNotCompressed(t *testing.T) { apmServerUrl: apmServer.URL + "/", } transport := InitApmServerTransport(&config) - err := PostToApmServer(context.Background(), transport, agentData) + err := transport.PostToApmServer(context.Background(), agentData) assert.Equal(t, nil, err) } @@ -123,41 +123,41 @@ func TestGracePeriod(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) transport.reconnectionCount = 0 - val0 := computeGracePeriod(transport).Seconds() + val0 := transport.computeGracePeriod().Seconds() assert.Equal(t, val0, float64(0)) transport.reconnectionCount = 1 - val1 := computeGracePeriod(transport).Seconds() + val1 := transport.computeGracePeriod().Seconds() assert.InDelta(t, val1, float64(1), 0.1*1) transport.reconnectionCount = 2 - val2 := computeGracePeriod(transport).Seconds() + val2 := transport.computeGracePeriod().Seconds() assert.InDelta(t, val2, float64(4), 0.1*4) transport.reconnectionCount = 3 - val3 := computeGracePeriod(transport).Seconds() + val3 := transport.computeGracePeriod().Seconds() assert.InDelta(t, val3, float64(9), 0.1*9) transport.reconnectionCount = 4 - val4 := computeGracePeriod(transport).Seconds() + val4 := transport.computeGracePeriod().Seconds() assert.InDelta(t, val4, float64(16), 0.1*16) transport.reconnectionCount = 5 - val5 := computeGracePeriod(transport).Seconds() + val5 := transport.computeGracePeriod().Seconds() assert.InDelta(t, val5, float64(25), 0.1*25) transport.reconnectionCount = 6 - val6 := computeGracePeriod(transport).Seconds() + val6 := transport.computeGracePeriod().Seconds() assert.InDelta(t, val6, float64(36), 0.1*36) transport.reconnectionCount = 7 - val7 := computeGracePeriod(transport).Seconds() + val7 := transport.computeGracePeriod().Seconds() assert.InDelta(t, val7, float64(36), 0.1*36) } func TestSetHealthyTransport(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) - SetApmServerTransportState(context.Background(), transport, Healthy) + transport.SetApmServerTransportState(context.Background(), Healthy) assert.True(t, transport.status == Healthy) assert.Equal(t, transport.reconnectionCount, -1) } @@ -167,15 +167,15 @@ func TestSetFailingTransport(t *testing.T) { // and avoid a race between reaching the pending status and the test assertion. transport := InitApmServerTransport(&extensionConfig{}) transport.reconnectionCount = 0 - SetApmServerTransportState(context.Background(), transport, Failing) + transport.SetApmServerTransportState(context.Background(), Failing) assert.True(t, transport.status == Failing) assert.Equal(t, transport.reconnectionCount, 1) } func TestSetPendingTransport(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) - SetApmServerTransportState(context.Background(), transport, Healthy) - SetApmServerTransportState(context.Background(), transport, Failing) + transport.SetApmServerTransportState(context.Background(), Healthy) + transport.SetApmServerTransportState(context.Background(), Failing) for { if transport.status != Failing { break @@ -187,16 +187,16 @@ func TestSetPendingTransport(t *testing.T) { func TestSetPendingTransportExplicitly(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) - SetApmServerTransportState(context.Background(), transport, Healthy) - SetApmServerTransportState(context.Background(), transport, Pending) + transport.SetApmServerTransportState(context.Background(), Healthy) + transport.SetApmServerTransportState(context.Background(), Pending) assert.True(t, transport.status == Healthy) assert.Equal(t, transport.reconnectionCount, -1) } func TestSetInvalidTransport(t *testing.T) { transport := InitApmServerTransport(&extensionConfig{}) - SetApmServerTransportState(context.Background(), transport, Healthy) - SetApmServerTransportState(context.Background(), transport, "Invalid") + transport.SetApmServerTransportState(context.Background(), Healthy) + transport.SetApmServerTransportState(context.Background(), "Invalid") assert.True(t, transport.status == Healthy) assert.Equal(t, transport.reconnectionCount, -1) } @@ -237,12 +237,12 @@ func TestEnterBackoffFromHealthy(t *testing.T) { apmServerUrl: apmServer.URL + "/", } transport := InitApmServerTransport(&config) - SetApmServerTransportState(context.Background(), transport, Healthy) + transport.SetApmServerTransportState(context.Background(), Healthy) // Close the APM server early so that POST requests fail and that backoff is enabled apmServer.Close() - if err := PostToApmServer(context.Background(), transport, agentData); err != nil { + if err := transport.PostToApmServer(context.Background(), agentData); err != nil { return } // No way to know for sure if failing or pending (0 sec grace period) @@ -291,8 +291,8 @@ func TestEnterBackoffFromFailing(t *testing.T) { } transport := InitApmServerTransport(&config) - SetApmServerTransportState(context.Background(), transport, Healthy) - SetApmServerTransportState(context.Background(), transport, Failing) + transport.SetApmServerTransportState(context.Background(), Healthy) + transport.SetApmServerTransportState(context.Background(), Failing) for { if transport.status != Failing { break @@ -300,7 +300,7 @@ func TestEnterBackoffFromFailing(t *testing.T) { } assert.Equal(t, transport.status, Pending) - assert.Error(t, PostToApmServer(context.Background(), transport, agentData)) + assert.Error(t, transport.PostToApmServer(context.Background(), agentData)) assert.Equal(t, transport.status, Failing) assert.Equal(t, transport.reconnectionCount, 1) } @@ -344,8 +344,8 @@ func TestAPMServerRecovery(t *testing.T) { } transport := InitApmServerTransport(&config) - SetApmServerTransportState(context.Background(), transport, Healthy) - SetApmServerTransportState(context.Background(), transport, Failing) + transport.SetApmServerTransportState(context.Background(), Healthy) + transport.SetApmServerTransportState(context.Background(), Failing) for { if transport.status != Failing { break @@ -353,7 +353,7 @@ func TestAPMServerRecovery(t *testing.T) { } assert.Equal(t, transport.status, Pending) - assert.NoError(t, PostToApmServer(context.Background(), transport, agentData)) + assert.NoError(t, transport.PostToApmServer(context.Background(), agentData)) assert.Equal(t, transport.status, Healthy) assert.Equal(t, transport.reconnectionCount, -1) } @@ -397,15 +397,15 @@ func TestContinuedAPMServerFailure(t *testing.T) { } transport := InitApmServerTransport(&config) - SetApmServerTransportState(context.Background(), transport, Healthy) - SetApmServerTransportState(context.Background(), transport, Failing) + transport.SetApmServerTransportState(context.Background(), Healthy) + transport.SetApmServerTransportState(context.Background(), Failing) for { if transport.status != Failing { break } } assert.Equal(t, transport.status, Pending) - assert.Error(t, PostToApmServer(context.Background(), transport, agentData)) + assert.Error(t, transport.PostToApmServer(context.Background(), agentData)) assert.Equal(t, transport.status, Failing) assert.Equal(t, transport.reconnectionCount, 1) } @@ -443,7 +443,7 @@ func BenchmarkPostToAPM(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - if err := PostToApmServer(context.Background(), transport, agentData); err != nil { + if err := transport.PostToApmServer(context.Background(), agentData); err != nil { b.Fatal(err) } } diff --git a/apm-lambda-extension/extension/route_handlers.go b/apm-lambda-extension/extension/route_handlers.go index 776a4fe8..ab7d9f0f 100644 --- a/apm-lambda-extension/extension/route_handlers.go +++ b/apm-lambda-extension/extension/route_handlers.go @@ -52,7 +52,7 @@ func handleInfoRequest(ctx context.Context, apmServerTransport *ApmServerTranspo reverseProxy.Transport = customTransport reverseProxy.ErrorHandler = func(w http.ResponseWriter, r *http.Request, err error) { - SetApmServerTransportState(ctx, apmServerTransport, Failing) + apmServerTransport.SetApmServerTransportState(ctx, Failing) Log.Errorf("Error querying version from the APM server: %v", err) } @@ -90,7 +90,7 @@ func handleIntakeV2Events(transport *ApmServerTransport) func(w http.ResponseWri ContentEncoding: r.Header.Get("Content-Encoding"), } - EnqueueAPMData(transport.dataChannel, agentData) + transport.EnqueueAPMData(agentData) } if len(r.URL.Query()["flushed"]) > 0 && r.URL.Query()["flushed"][0] == "true" { diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 6a603040..f6b75f86 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -96,7 +96,7 @@ func main() { backgroundDataSendWg.Wait() if config.SendStrategy == extension.SyncFlush { // Flush APM data now that the function invocation has completed - extension.FlushAPMData(ctx, apmServerTransport) + apmServerTransport.FlushAPMData(ctx) } } } From 241bf4acac2f52c40a2afe6222f69d28985a536d Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 13:39:49 +0200 Subject: [PATCH 16/20] Added comments regarding processEvents() signals --- apm-lambda-extension/main.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index f6b75f86..c49a7a79 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -157,6 +157,12 @@ func processEvent(ctx context.Context, cancel context.CancelFunc, apmServerTrans timer := time.NewTimer(durationUntilFlushDeadline) defer timer.Stop() + // The extension relies on 3 independent mechanisms to minimize the time interval between the end of the execution of + // the lambda function and the end of the execution of processEvent() + // 1) AgentDoneSignal is triggered upon reception of a `flushed=true` query from the agent + // 2) [Backup 1] RuntimeDone is triggered upon reception of a Lambda log entry certifying the end of the execution of the current function + // 3) [Backup 2] If all else fails, the extension relies of the timeout of the Lambda function to interrupt itself 100 ms before the specified deadline. + // This time interval is large enough to attempt a last flush attempt (if SendStrategy == syncFlush) before the environment gets shut down. select { case <-apmServerTransport.AgentDoneSignal: extension.Log.Debug("Received agent done signal") From 488627d2616df180e096a897f941e4942e64386a Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Fri, 6 May 2022 16:17:41 +0200 Subject: [PATCH 17/20] Add test for AgentDone and RuntimeDone race --- apm-lambda-extension/main_test.go | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/apm-lambda-extension/main_test.go b/apm-lambda-extension/main_test.go index a5d6997a..d2e181f9 100644 --- a/apm-lambda-extension/main_test.go +++ b/apm-lambda-extension/main_test.go @@ -47,6 +47,7 @@ const ( InvokeStandard MockEventType = "Standard" InvokeStandardInfo MockEventType = "StandardInfo" InvokeStandardFlush MockEventType = "Flush" + InvokeLateFlush MockEventType = "LateFlush" InvokeWaitgroupsRace MockEventType = "InvokeWaitgroupsRace" InvokeMultipleTransactionsOverload MockEventType = "MultipleTransactionsOverload" Shutdown MockEventType = "Shutdown" @@ -205,6 +206,7 @@ func initMockServers(eventsChannel chan MockEvent) (*httptest.Server, *httptest. func processMockEvent(currId string, event MockEvent, extensionPort string, internals *MockServerInternals) { sendLogEvent(currId, "platform.start") client := http.Client{} + sendRuntimeDone := true switch event.Type { case InvokeHang: time.Sleep(time.Duration(event.Timeout) * time.Second) @@ -219,6 +221,14 @@ func processMockEvent(currId string, event MockEvent, extensionPort string, inte if _, err := client.Do(reqData); err != nil { extension.Log.Error(err.Error()) } + case InvokeLateFlush: + time.Sleep(time.Duration(event.ExecutionDuration) * time.Second) + reqData, _ := http.NewRequest("POST", fmt.Sprintf("http://localhost:%s/intake/v2/events?flushed=true", extensionPort), bytes.NewBuffer([]byte(event.APMServerBehavior))) + go func() { + if _, err := client.Do(reqData); err != nil { + extension.Log.Error(err.Error()) + } + }() case InvokeWaitgroupsRace: time.Sleep(time.Duration(event.ExecutionDuration) * time.Second) reqData0, _ := http.NewRequest("POST", fmt.Sprintf("http://localhost:%s/intake/v2/events", extensionPort), bytes.NewBuffer([]byte(event.APMServerBehavior))) @@ -260,7 +270,9 @@ func processMockEvent(currId string, event MockEvent, extensionPort string, inte extension.Log.Debugf("Response seen by the agent : %d", res.StatusCode) case Shutdown: } - sendLogEvent(currId, "platform.runtimeDone") + if sendRuntimeDone { + sendLogEvent(currId, "platform.runtimeDone") + } } func sendNextEventInfo(w http.ResponseWriter, id string, event MockEvent) { @@ -373,6 +385,18 @@ func (suite *MainUnitTestsSuite) TestFlush() { assert.True(suite.T(), strings.Contains(suite.apmServerInternals.Data, string(TimelyResponse))) } +// TestLateFlush checks if there is no race condition between RuntimeDone and AgentDone +// The test is built so that the AgentDone signal is received after RuntimeDone, which causes the next event to be interrupted. +func (suite *MainUnitTestsSuite) TestLateFlush() { + eventsChain := []MockEvent{ + {Type: InvokeLateFlush, APMServerBehavior: TimelyResponse, ExecutionDuration: 1, Timeout: 5}, + {Type: InvokeStandard, APMServerBehavior: TimelyResponse, ExecutionDuration: 1, Timeout: 5}, + } + eventQueueGenerator(eventsChain, suite.eventsChannel) + assert.NotPanics(suite.T(), main) + assert.True(suite.T(), strings.Contains(suite.apmServerInternals.Data, string(TimelyResponse+TimelyResponse))) +} + // TestWaitGroup checks if there is no race condition between the main waitgroups (issue #128) func (suite *MainUnitTestsSuite) TestWaitGroup() { eventsChain := []MockEvent{ From de544ed24e95d5874249c0659045cfb1b1851b29 Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Mon, 9 May 2022 11:36:42 +0200 Subject: [PATCH 18/20] Open and Close Agent flush channel for each invocation --- apm-lambda-extension/extension/apm_server_transport.go | 1 - apm-lambda-extension/extension/http_server_test.go | 6 +++--- apm-lambda-extension/main.go | 2 ++ apm-lambda-extension/main_test.go | 4 ++++ 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/apm-lambda-extension/extension/apm_server_transport.go b/apm-lambda-extension/extension/apm_server_transport.go index 1d569999..9b519098 100644 --- a/apm-lambda-extension/extension/apm_server_transport.go +++ b/apm-lambda-extension/extension/apm_server_transport.go @@ -61,7 +61,6 @@ func InitApmServerTransport(config *extensionConfig) *ApmServerTransport { transport.bufferPool = sync.Pool{New: func() interface{} { return &bytes.Buffer{} }} - transport.AgentDoneSignal = make(chan struct{}, 1) transport.dataChannel = make(chan AgentData, 100) transport.client = &http.Client{ Timeout: time.Duration(config.DataForwarderTimeoutSeconds) * time.Second, diff --git a/apm-lambda-extension/extension/http_server_test.go b/apm-lambda-extension/extension/http_server_test.go index 67f53c6a..190a65fe 100644 --- a/apm-lambda-extension/extension/http_server_test.go +++ b/apm-lambda-extension/extension/http_server_test.go @@ -217,7 +217,7 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { dataReceiverTimeoutSeconds: 15, } transport := InitApmServerTransport(&config) - + transport.AgentDoneSignal = make(chan struct{}, 1) agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() @@ -270,7 +270,7 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { dataReceiverTimeoutSeconds: 15, } transport := InitApmServerTransport(&config) - + transport.AgentDoneSignal = make(chan struct{}, 1) agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() @@ -313,7 +313,7 @@ func Test_handleIntakeV2EventsQueryParamEmptyData(t *testing.T) { dataReceiverTimeoutSeconds: 15, } transport := InitApmServerTransport(&config) - + transport.AgentDoneSignal = make(chan struct{}, 1) agentDataServer, err := StartHttpServer(context.Background(), transport) if err != nil { t.Fail() diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index c49a7a79..23c9df0c 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -131,6 +131,8 @@ func processEvent(ctx context.Context, cancel context.CancelFunc, apmServerTrans } // APM Data Processing + apmServerTransport.AgentDoneSignal = make(chan struct{}) + defer close(apmServerTransport.AgentDoneSignal) backgroundDataSendWg.Add(1) go func() { defer backgroundDataSendWg.Done() diff --git a/apm-lambda-extension/main_test.go b/apm-lambda-extension/main_test.go index d2e181f9..8355f9f3 100644 --- a/apm-lambda-extension/main_test.go +++ b/apm-lambda-extension/main_test.go @@ -57,6 +57,7 @@ type MockServerInternals struct { Data string WaitForUnlockSignal bool UnlockSignalChannel chan struct{} + WaitGroup sync.WaitGroup } type APMServerBehavior string @@ -161,6 +162,7 @@ func initMockServers(eventsChannel chan MockEvent) (*httptest.Server, *httptest. return } case "/2020-01-01/extension/event/next": + lambdaServerInternals.WaitGroup.Wait() currId := uuid.New().String() select { case nextEvent := <-eventsChannel: @@ -224,10 +226,12 @@ func processMockEvent(currId string, event MockEvent, extensionPort string, inte case InvokeLateFlush: time.Sleep(time.Duration(event.ExecutionDuration) * time.Second) reqData, _ := http.NewRequest("POST", fmt.Sprintf("http://localhost:%s/intake/v2/events?flushed=true", extensionPort), bytes.NewBuffer([]byte(event.APMServerBehavior))) + internals.WaitGroup.Add(1) go func() { if _, err := client.Do(reqData); err != nil { extension.Log.Error(err.Error()) } + internals.WaitGroup.Done() }() case InvokeWaitgroupsRace: time.Sleep(time.Duration(event.ExecutionDuration) * time.Second) From 84b261ae937fe783c8ead15d55ea6aaca546261f Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Thu, 19 May 2022 11:42:05 +0200 Subject: [PATCH 19/20] Fix Logs API listener host and handle nil case --- apm-lambda-extension/logsapi/subscribe.go | 24 ++++++++++++++++++++--- apm-lambda-extension/main.go | 20 ++++++++++++------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index 5a9b9523..316c7ee9 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -40,9 +40,9 @@ type LogsTransport struct { server *http.Server } -func InitLogsTransport() *LogsTransport { +func InitLogsTransport(listenerHost string) *LogsTransport { var transport LogsTransport - transport.listenerHost = "localhost" + transport.listenerHost = listenerHost transport.logsChannel = make(chan LogEvent, 100) return &transport } @@ -88,7 +88,11 @@ func Subscribe(ctx context.Context, extensionID string, eventTypes []EventType) // Init APM server Transport struct // Make channel for collecting logs and create a HTTP server to listen for them - transport = InitLogsTransport() + if checkLambdaFunction() { + transport = InitLogsTransport("sandbox") + } else { + transport = InitLogsTransport("localhost") + } if err = startHTTPServer(ctx, transport); err != nil { return nil, err @@ -129,6 +133,8 @@ func startHTTPServer(ctx context.Context, transport *LogsTransport) error { return nil } +// checkAWSSamLocal checks if the extension is running in a SAM CLI container. +// The Logs API is not supported in that scenario. func checkAWSSamLocal() bool { envAWSLocal, ok := os.LookupEnv("AWS_SAM_LOCAL") if ok && envAWSLocal == "true" { @@ -138,6 +144,18 @@ func checkAWSSamLocal() bool { return false } +// checkLambdaFunction checks if the extension is together with an actual Lambda function. +// It is currently used together with checkAWSSamLocal as a best effort solution to determine if +// the extension actually runs in dev (unit tests), SAM, or in a local Lambda environment. +func checkLambdaFunction() bool { + _, ok := os.LookupEnv("AWS_LAMBDA_FUNCTION_NAME") + if ok { + return true + } + + return false +} + // WaitRuntimeDone consumes events until a RuntimeDone event corresponding // to requestID is received, or ctx is cancelled, and then returns. func WaitRuntimeDone(ctx context.Context, requestID string, transport *LogsTransport, runtimeDoneSignal chan struct{}) error { diff --git a/apm-lambda-extension/main.go b/apm-lambda-extension/main.go index 23c9df0c..5e7d9f58 100644 --- a/apm-lambda-extension/main.go +++ b/apm-lambda-extension/main.go @@ -142,14 +142,20 @@ func processEvent(ctx context.Context, cancel context.CancelFunc, apmServerTrans }() // Lambda Service Logs Processing + // This goroutine should not be started if subscription failed runtimeDone := make(chan struct{}) - go func() { - if err := logsapi.WaitRuntimeDone(invocationCtx, event.RequestID, logsTransport, runtimeDone); err != nil { - extension.Log.Errorf("Error while processing Lambda Logs ; %v", err) - } else { - close(runtimeDone) - } - }() + if logsTransport != nil { + go func() { + if err := logsapi.WaitRuntimeDone(invocationCtx, event.RequestID, logsTransport, runtimeDone); err != nil { + extension.Log.Errorf("Error while processing Lambda Logs ; %v", err) + } else { + close(runtimeDone) + } + }() + } else { + extension.Log.Warn("Logs collection not started due to earlier subscription failure") + close(runtimeDone) + } // Calculate how long to wait for a runtimeDoneSignal or AgentDoneSignal signal flushDeadlineMs := event.DeadlineMs - 100 From 4c3e476c8760b1528e8be158e60c3d6b2c751cbd Mon Sep 17 00:00:00 2001 From: Jean-Louis Voiseux <48380853+jlvoiseux@users.noreply.github.com> Date: Thu, 19 May 2022 11:55:14 +0200 Subject: [PATCH 20/20] Add relevant unit test --- apm-lambda-extension/logsapi/subscribe.go | 4 ++-- apm-lambda-extension/logsapi/subscribe_test.go | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/apm-lambda-extension/logsapi/subscribe.go b/apm-lambda-extension/logsapi/subscribe.go index 316c7ee9..3d844bd3 100644 --- a/apm-lambda-extension/logsapi/subscribe.go +++ b/apm-lambda-extension/logsapi/subscribe.go @@ -148,8 +148,8 @@ func checkAWSSamLocal() bool { // It is currently used together with checkAWSSamLocal as a best effort solution to determine if // the extension actually runs in dev (unit tests), SAM, or in a local Lambda environment. func checkLambdaFunction() bool { - _, ok := os.LookupEnv("AWS_LAMBDA_FUNCTION_NAME") - if ok { + lambdaName, ok := os.LookupEnv("AWS_LAMBDA_FUNCTION_NAME") + if ok && lambdaName != "" { return true } diff --git a/apm-lambda-extension/logsapi/subscribe_test.go b/apm-lambda-extension/logsapi/subscribe_test.go index 450f32f9..df1494fe 100644 --- a/apm-lambda-extension/logsapi/subscribe_test.go +++ b/apm-lambda-extension/logsapi/subscribe_test.go @@ -44,6 +44,20 @@ func TestSubscribeWithSamLocalEnv(t *testing.T) { assert.Error(t, err) } +func TestSubscribeWithLambdaFunction(t *testing.T) { + if err := os.Setenv("AWS_LAMBDA_FUNCTION_NAME", "mock"); err != nil { + t.Fail() + } + t.Cleanup(func() { + if err := os.Unsetenv("AWS_LAMBDA_FUNCTION_NAME"); err != nil { + t.Fail() + } + }) + + _, err := Subscribe(context.Background(), "testID", []EventType{Platform}) + assert.Error(t, err, "listen tcp: lookup sandbox: no such host") +} + func TestSubscribeAWSRequest(t *testing.T) { // For subscription request