Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions dev/load-test/test-recipes/README-rapid-burst.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,18 @@ Before running this test, ensure the following prerequisites are met:
args:
- --leader-elect=true
- --extensions
- --enable-pprof-debug
- --enable-tracing
- --zap-log-level=debug
- --zap-encoder=json
- --kube-api-qps=1000
- --kube-api-burst=1000
- --sandbox-concurrent-workers=1000
- --sandbox-claim-concurrent-workers=1000
- --sandbox-warm-pool-concurrent-workers=1000
```
- If you are using tracing, see [GKE OTLP Metrics](https://docs.cloud.google.com/stackdriver/docs/otlp-metrics/deploy-collector)
for how to deploy the collector.
- Apply your modified manifests to your cluster to install the agent-sandbox controller.
```bash
cd ~/agent-sandbox
Expand Down
25 changes: 25 additions & 0 deletions dev/load-test/test-recipes/rapid-burst-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,26 @@ steps:
query: histogram_quantile(0.99, sum(rate(agent_sandbox_claim_startup_latency_ms_bucket{}[%v])) by (le))
threshold: 5000 # Equivalent to 5s
requireSamples: true
- Identifier: SandboxClaimControllerStartupLatency
Method: GenericPrometheusQuery
Params:
action: start
metricName: Agent Sandbox Claim Controller Startup Latency (ms)
metricVersion: v1
unit: ms
queries:
- name: ControllerStartupLatency50
query: histogram_quantile(0.50, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le))
threshold: 1000
requireSamples: true
- name: ControllerStartupLatency90
query: histogram_quantile(0.90, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le))
threshold: 1000
requireSamples: true
- name: ControllerStartupLatency99
query: histogram_quantile(0.99, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le))
threshold: 5000
requireSamples: true

- name: Setup Sandbox Template
phases:
Expand Down Expand Up @@ -154,6 +174,11 @@ steps:
Params:
action: gather
enableViolations: true
- Identifier: SandboxClaimControllerStartupLatency
Method: GenericPrometheusQuery
Params:
action: gather
enableViolations: true

- name: Delete Sandbox Claims
phases:
Expand Down
41 changes: 32 additions & 9 deletions extensions/controllers/sandboxclaim_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ import (
asmetrics "sigs.k8s.io/agent-sandbox/internal/metrics"
)

// observabilityAnnotation holds the RFC3339Nano timestamp at which this
// controller first observed a SandboxClaim. It is patched onto the claim
// during Reconcile and read back later to compute controller startup latency.
const observabilityAnnotation = "agent-sandbox.kubernetes.io/controller-first-observed-at"

// ErrTemplateNotFound is a sentinel error indicating a SandboxTemplate was not found.
var ErrTemplateNotFound = errors.New("SandboxTemplate not found")

Expand Down Expand Up @@ -92,15 +94,23 @@ func (r *SandboxClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request
return ctrl.Result{}, nil
}

// Initialize trace ID for active resources missing an ID. Inline patch,
// no early return, to avoid forcing a second reconcile cycle.
tc := r.Tracer.GetTraceContext(ctx)
if tc != "" && (claim.Annotations == nil || claim.Annotations[asmetrics.TraceContextAnnotation] == "") {
// Initialize trace ID and observation time for active resources missing them.
// Inline patch, no early return, to avoid forcing a second reconcile cycle.
traceContext := r.Tracer.GetTraceContext(ctx)
needObservabilityPatch := claim.Annotations[observabilityAnnotation] == ""
needTraceContextPatch := traceContext != "" && (claim.Annotations[asmetrics.TraceContextAnnotation] == "")

if needObservabilityPatch || needTraceContextPatch {
patch := client.MergeFrom(claim.DeepCopy())
if claim.Annotations == nil {
claim.Annotations = make(map[string]string)
}
claim.Annotations[asmetrics.TraceContextAnnotation] = tc
if needObservabilityPatch {
claim.Annotations[observabilityAnnotation] = time.Now().Format(time.RFC3339Nano)
}
if needTraceContextPatch {
claim.Annotations[asmetrics.TraceContextAnnotation] = traceContext
}
if err := r.Patch(ctx, claim, patch); err != nil {
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -441,8 +451,8 @@ func (r *SandboxClaimReconciler) adoptSandboxFromCandidates(ctx context.Context,
if adopted.Annotations == nil {
adopted.Annotations = make(map[string]string)
}
if tc, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok {
adopted.Annotations[asmetrics.TraceContextAnnotation] = tc
if traceContext, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok {
adopted.Annotations[asmetrics.TraceContextAnnotation] = traceContext
}

// Add sandbox ID label to pod template for NetworkPolicy targeting
Expand Down Expand Up @@ -516,8 +526,8 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
if sandbox.Annotations == nil {
sandbox.Annotations = make(map[string]string)
}
if tc, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok {
sandbox.Annotations[asmetrics.TraceContextAnnotation] = tc
if traceContext, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok {
sandbox.Annotations[asmetrics.TraceContextAnnotation] = traceContext
}

// Track the sandbox template ref to be used by metrics collector
Expand Down Expand Up @@ -822,6 +832,19 @@ func (r *SandboxClaimReconciler) recordCreationLatencyMetric(
// startup latency when the TemplateRef is updated.
asmetrics.RecordClaimStartupLatency(claim.CreationTimestamp.Time, launchType, claim.Spec.TemplateRef.Name)

// Record controller startup latency
if claim.Annotations[observabilityAnnotation] != "" {
observedTimeString := claim.Annotations[observabilityAnnotation]
observedTime, err := time.Parse(time.RFC3339Nano, observedTimeString)
if err != nil {
logger.Error(err, "Failed to parse controller observation time", "value", observedTimeString)
} else {
controllerLatency := time.Since(observedTime).Milliseconds()
logger.V(1).Info("Recording controller startup latency", "claim", claim.Name, "latency_ms", controllerLatency)
asmetrics.RecordClaimControllerStartupLatency(observedTime, launchType, claim.Spec.TemplateRef.Name)
}
}

// For cold launches, also record the time from Sandbox creation to Ready state to capture controller overhead.
if sandbox == nil || sandbox.CreationTimestamp.IsZero() {
return
Expand Down
27 changes: 24 additions & 3 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,23 @@ var (
ClaimStartupLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "agent_sandbox_claim_startup_latency_ms",
Help: "End-to-end latency from SandboxClaim creation to Pod Ready state in milliseconds.",
// Buckets for latency from 50ms to 4 minutes
Buckets: []float64{50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, 60000, 120000, 240000},
Help: "End-to-end latency from SandboxClaim creation to Sandbox Ready state in milliseconds.",
// Buckets for latency from 100ms to 4 minutes
Buckets: []float64{100, 250, 500, 750, 1000, 1250, 1500, 2000, 2500, 5000, 10000, 30000, 60000, 120000, 240000},
},
[]string{"launch_type", "sandbox_template"},
)

// ClaimControllerStartupLatency measures the time from controller first observed timestamp to SandboxClaim Ready state.
// Labels:
// - launch_type: "warm", "cold", "unknown"
// - sandbox_template: the SandboxTemplateRef
ClaimControllerStartupLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "agent_sandbox_claim_controller_startup_latency_ms",
Help: "Latency from controller first observed SandboxClaim to Sandbox Ready state in milliseconds.",
// Buckets for latency from 100ms to 4 minutes
Buckets: []float64{100, 250, 500, 750, 1000, 1250, 1500, 2000, 2500, 5000, 10000, 30000, 60000, 120000, 240000},
},
[]string{"launch_type", "sandbox_template"},
)
Expand Down Expand Up @@ -91,6 +105,7 @@ var (
// init wires this package's custom collectors into the global
// controller-runtime metrics registry so they are exported alongside the
// built-in controller metrics. MustRegister panics on duplicate or invalid
// registration, which surfaces misconfiguration at startup.
func init() {
	metrics.Registry.MustRegister(
		ClaimStartupLatency,
		ClaimControllerStartupLatency,
		SandboxCreationLatency,
		SandboxClaimCreationTotal,
	)
}
Expand All @@ -101,6 +116,12 @@ func RecordClaimStartupLatency(startTime time.Time, launchType, templateName str
ClaimStartupLatency.WithLabelValues(launchType, templateName).Observe(duration)
}

// RecordClaimControllerStartupLatency observes, in milliseconds, the elapsed
// time from startTime (when the controller first observed the claim) to now,
// labeled with the claim's launch type and sandbox template name.
func RecordClaimControllerStartupLatency(startTime time.Time, launchType, templateName string) {
	elapsedMs := time.Since(startTime).Milliseconds()
	ClaimControllerStartupLatency.WithLabelValues(launchType, templateName).Observe(float64(elapsedMs))
}

// RecordSandboxCreationLatency records the measured latency duration for a sandbox creation.
func RecordSandboxCreationLatency(duration time.Duration, namespace, launchType, templateName string) {
SandboxCreationLatency.WithLabelValues(namespace, launchType, templateName).Observe(float64(duration.Milliseconds()))
Expand Down