diff --git a/dev/load-test/test-recipes/README-rapid-burst.md b/dev/load-test/test-recipes/README-rapid-burst.md index ea661b8e5..e8e26ac0b 100644 --- a/dev/load-test/test-recipes/README-rapid-burst.md +++ b/dev/load-test/test-recipes/README-rapid-burst.md @@ -44,12 +44,18 @@ Before running this test, ensure the following prerequisites are met: args: - --leader-elect=true - --extensions + - --enable-pprof-debug + - --enable-tracing + - --zap-log-level=debug + - --zap-encoder=json - --kube-api-qps=1000 - --kube-api-burst=1000 - --sandbox-concurrent-workers=1000 - --sandbox-claim-concurrent-workers=1000 - --sandbox-warm-pool-concurrent-workers=1000 ``` + - If you are using tracing, see [GKE OTLP Metrics](https://docs.cloud.google.com/stackdriver/docs/otlp-metrics/deploy-collector) + for how to deploy the collector. - Apply your modified manifests to your cluster to install the agent-sandbox controller. ```bash cd ~/agent-sandbox diff --git a/dev/load-test/test-recipes/rapid-burst-test.yaml b/dev/load-test/test-recipes/rapid-burst-test.yaml index 17f81d192..2b743a73f 100644 --- a/dev/load-test/test-recipes/rapid-burst-test.yaml +++ b/dev/load-test/test-recipes/rapid-burst-test.yaml @@ -51,6 +51,26 @@ steps: query: histogram_quantile(0.99, sum(rate(agent_sandbox_claim_startup_latency_ms_bucket{}[%v])) by (le)) threshold: 5000 # Equivalent to 5s requireSamples: true + - Identifier: SandboxClaimControllerStartupLatency + Method: GenericPrometheusQuery + Params: + action: start + metricName: Agent Sandbox Claim Controller Startup Latency (ms) + metricVersion: v1 + unit: ms + queries: + - name: ControllerStartupLatency50 + query: histogram_quantile(0.50, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le)) + threshold: 1000 + requireSamples: true + - name: ControllerStartupLatency90 + query: histogram_quantile(0.90, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le)) + threshold: 1000 + requireSamples: true + - 
name: ControllerStartupLatency99 + query: histogram_quantile(0.99, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le)) + threshold: 5000 + requireSamples: true - name: Setup Sandbox Template phases: @@ -154,6 +174,11 @@ steps: Params: action: gather enableViolations: true + - Identifier: SandboxClaimControllerStartupLatency + Method: GenericPrometheusQuery + Params: + action: gather + enableViolations: true - name: Delete Sandbox Claims phases: diff --git a/extensions/controllers/sandboxclaim_controller.go b/extensions/controllers/sandboxclaim_controller.go index 226dfdc2e..dd9dc0899 100644 --- a/extensions/controllers/sandboxclaim_controller.go +++ b/extensions/controllers/sandboxclaim_controller.go @@ -41,6 +41,8 @@ import ( asmetrics "sigs.k8s.io/agent-sandbox/internal/metrics" ) +const observabilityAnnotation = "agent-sandbox.kubernetes.io/controller-first-observed-at" + // ErrTemplateNotFound is a sentinel error indicating a SandboxTemplate was not found. var ErrTemplateNotFound = errors.New("SandboxTemplate not found") @@ -92,15 +94,23 @@ func (r *SandboxClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, nil } - // Initialize trace ID for active resources missing an ID. Inline patch, - // no early return, to avoid forcing a second reconcile cycle. - tc := r.Tracer.GetTraceContext(ctx) - if tc != "" && (claim.Annotations == nil || claim.Annotations[asmetrics.TraceContextAnnotation] == "") { + // Initialize trace ID and observation time for active resources missing them. + // Inline patch, no early return, to avoid forcing a second reconcile cycle. 
+ traceContext := r.Tracer.GetTraceContext(ctx) + needObservabilityPatch := claim.Annotations[observabilityAnnotation] == "" + needTraceContextPatch := traceContext != "" && (claim.Annotations[asmetrics.TraceContextAnnotation] == "") + + if needObservabilityPatch || needTraceContextPatch { patch := client.MergeFrom(claim.DeepCopy()) if claim.Annotations == nil { claim.Annotations = make(map[string]string) } - claim.Annotations[asmetrics.TraceContextAnnotation] = tc + if needObservabilityPatch { + claim.Annotations[observabilityAnnotation] = time.Now().Format(time.RFC3339Nano) + } + if needTraceContextPatch { + claim.Annotations[asmetrics.TraceContextAnnotation] = traceContext + } if err := r.Patch(ctx, claim, patch); err != nil { return ctrl.Result{}, err } @@ -441,8 +451,8 @@ func (r *SandboxClaimReconciler) adoptSandboxFromCandidates(ctx context.Context, if adopted.Annotations == nil { adopted.Annotations = make(map[string]string) } - if tc, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok { - adopted.Annotations[asmetrics.TraceContextAnnotation] = tc + if traceContext, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok { + adopted.Annotations[asmetrics.TraceContextAnnotation] = traceContext } // Add sandbox ID label to pod template for NetworkPolicy targeting @@ -516,8 +526,8 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten if sandbox.Annotations == nil { sandbox.Annotations = make(map[string]string) } - if tc, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok { - sandbox.Annotations[asmetrics.TraceContextAnnotation] = tc + if traceContext, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok { + sandbox.Annotations[asmetrics.TraceContextAnnotation] = traceContext } // Track the sandbox template ref to be used by metrics collector @@ -822,6 +832,19 @@ func (r *SandboxClaimReconciler) recordCreationLatencyMetric( // startup latency when the TemplateRef is updated. 
asmetrics.RecordClaimStartupLatency(claim.CreationTimestamp.Time, launchType, claim.Spec.TemplateRef.Name) + // Record controller startup latency + if claim.Annotations[observabilityAnnotation] != "" { + observedTimeString := claim.Annotations[observabilityAnnotation] + observedTime, err := time.Parse(time.RFC3339Nano, observedTimeString) + if err != nil { + logger.Error(err, "Failed to parse controller observation time", "value", observedTimeString) + } else { + controllerLatency := time.Since(observedTime).Milliseconds() + logger.V(1).Info("Recording controller startup latency", "claim", claim.Name, "latency_ms", controllerLatency) + asmetrics.RecordClaimControllerStartupLatency(observedTime, launchType, claim.Spec.TemplateRef.Name) + } + } + + // For cold launches, also record the time from Sandbox creation to Ready state to capture controller overhead. + if sandbox == nil || sandbox.CreationTimestamp.IsZero() { + return diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 37cb16113..b437085ce 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -36,9 +36,23 @@ var ( ClaimStartupLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "agent_sandbox_claim_startup_latency_ms", - Help: "End-to-end latency from SandboxClaim creation to Pod Ready state in milliseconds.", - // Buckets for latency from 50ms to 4 minutes - Buckets: []float64{50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, 60000, 120000, 240000}, + Help: "End-to-end latency from SandboxClaim creation to Sandbox Ready state in milliseconds.", + // Buckets for latency from 100ms to 4 minutes + Buckets: []float64{100, 250, 500, 750, 1000, 1250, 1500, 2000, 2500, 5000, 10000, 30000, 60000, 120000, 240000}, + }, + []string{"launch_type", "sandbox_template"}, + ) + + // ClaimControllerStartupLatency measures the time from controller first observed timestamp to Sandbox Ready state. 
+ // Labels: + // - launch_type: "warm", "cold", "unknown" + // - sandbox_template: the SandboxTemplateRef + ClaimControllerStartupLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "agent_sandbox_claim_controller_startup_latency_ms", + Help: "Latency from controller first observed SandboxClaim to Sandbox Ready state in milliseconds.", + // Buckets for latency from 100ms to 4 minutes + Buckets: []float64{100, 250, 500, 750, 1000, 1250, 1500, 2000, 2500, 5000, 10000, 30000, 60000, 120000, 240000}, }, []string{"launch_type", "sandbox_template"}, ) @@ -91,6 +105,7 @@ var ( // Init registers custom metrics with the global controller-runtime registry. func init() { metrics.Registry.MustRegister(ClaimStartupLatency) + metrics.Registry.MustRegister(ClaimControllerStartupLatency) metrics.Registry.MustRegister(SandboxCreationLatency) metrics.Registry.MustRegister(SandboxClaimCreationTotal) } @@ -101,6 +116,12 @@ func RecordClaimStartupLatency(startTime time.Time, launchType, templateName str ClaimStartupLatency.WithLabelValues(launchType, templateName).Observe(duration) } +// RecordClaimControllerStartupLatency records the duration since the provided controller start time. +func RecordClaimControllerStartupLatency(startTime time.Time, launchType, templateName string) { + duration := float64(time.Since(startTime).Milliseconds()) + ClaimControllerStartupLatency.WithLabelValues(launchType, templateName).Observe(duration) +} + // RecordSandboxCreationLatency records the measured latency duration for a sandbox creation. func RecordSandboxCreationLatency(duration time.Duration, namespace, launchType, templateName string) { SandboxCreationLatency.WithLabelValues(namespace, launchType, templateName).Observe(float64(duration.Milliseconds()))