Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions dev/load-test/test-recipes/rapid-burst-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,26 @@ steps:
query: histogram_quantile(0.99, sum(rate(agent_sandbox_claim_startup_latency_ms_bucket{}[%v])) by (le))
threshold: 5000 # Equivalent to 5s
requireSamples: true
- Identifier: SandboxClaimControllerStartupLatency
Method: GenericPrometheusQuery
Params:
action: start
metricName: Agent Sandbox Claim Controller Startup Latency (ms)
metricVersion: v1
unit: ms
queries:
- name: ControllerStartupLatency50
query: histogram_quantile(0.50, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le))
threshold: 1000
requireSamples: true
- name: ControllerStartupLatency90
query: histogram_quantile(0.90, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le))
threshold: 1000
requireSamples: true
- name: ControllerStartupLatency99
query: histogram_quantile(0.99, sum(rate(agent_sandbox_claim_controller_startup_latency_ms_bucket{}[%v])) by (le))
threshold: 5000
requireSamples: true

- name: Setup Sandbox Template
phases:
Expand Down Expand Up @@ -154,6 +174,11 @@ steps:
Params:
action: gather
enableViolations: true
- Identifier: SandboxClaimControllerStartupLatency
Method: GenericPrometheusQuery
Params:
action: gather
enableViolations: true

- name: Delete Sandbox Claims
phases:
Expand Down
31 changes: 27 additions & 4 deletions extensions/controllers/sandboxclaim_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,24 @@ func (r *SandboxClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request
return ctrl.Result{}, nil
}

// Initialize trace ID for active resources missing an ID. Inline patch,
// no early return, to avoid forcing a second reconcile cycle.
// Initialize trace ID and observation time for active resources missing them.
// Inline patch, no early return, to avoid forcing a second reconcile cycle.
tc := r.Tracer.GetTraceContext(ctx)
if tc != "" && (claim.Annotations == nil || claim.Annotations[asmetrics.TraceContextAnnotation] == "") {
obsAnnotation := "agent-sandbox.kubernetes.io/controller-first-observed-at"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I would just call the variables traceContext, observabilityAnnotation, needObservabilityPatch, needTraceContextPatch; it makes it easier to read

needObsPatch := claim.Annotations == nil || claim.Annotations[obsAnnotation] == ""
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
needObsPatch := claim.Annotations == nil || claim.Annotations[obsAnnotation] == ""
needObsPatch := claim.Annotations[obsAnnotation] == ""

You can read from a nil map, it is treated as the empty map

needTcPatch := tc != "" && (claim.Annotations == nil || claim.Annotations[asmetrics.TraceContextAnnotation] == "")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
needTcPatch := tc != "" && (claim.Annotations == nil || claim.Annotations[asmetrics.TraceContextAnnotation] == "")
needTcPatch := tc != "" && claim.Annotations[asmetrics.TraceContextAnnotation] == ""


if needObsPatch || needTcPatch {
patch := client.MergeFrom(claim.DeepCopy())
if claim.Annotations == nil {
claim.Annotations = make(map[string]string)
}
claim.Annotations[asmetrics.TraceContextAnnotation] = tc
if needObsPatch {
claim.Annotations[obsAnnotation] = time.Now().Format(time.RFC3339Nano)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might consider just keeping an in memory map, but ... given we're already writing to the apiserver for the trace... sgtm

}
if needTcPatch {
claim.Annotations[asmetrics.TraceContextAnnotation] = tc
}
if err := r.Patch(ctx, claim, patch); err != nil {
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -822,6 +831,20 @@ func (r *SandboxClaimReconciler) recordCreationLatencyMetric(
// startup latency when the TemplateRef is updated.
asmetrics.RecordClaimStartupLatency(claim.CreationTimestamp.Time, launchType, claim.Spec.TemplateRef.Name)

// Record controller startup latency
obsAnnotation := "agent-sandbox.kubernetes.io/controller-first-observed-at"
if claim.Annotations != nil && claim.Annotations[obsAnnotation] != "" {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if claim.Annotations != nil && claim.Annotations[obsAnnotation] != "" {
if obsStr := claim.Annotations[obsAnnotation]; obsStr != "" {

obsStr := claim.Annotations[obsAnnotation]
observedTime, err := time.Parse(time.RFC3339Nano, obsStr)
if err != nil {
logger.Error(err, "Failed to parse controller observation time", "value", obsStr)
} else {
latency := time.Since(observedTime).Milliseconds()
logger.V(1).Info("Recording controller startup latency", "claim", claim.Name, "latency_ms", latency)
asmetrics.RecordClaimControllerStartupLatency(observedTime, launchType, claim.Spec.TemplateRef.Name)
}
}

// For cold launches, also record the time from Sandbox creation to Ready state to capture controller overhead.
if sandbox == nil || sandbox.CreationTimestamp.IsZero() {
return
Expand Down
23 changes: 22 additions & 1 deletion internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,21 @@ var (
ClaimStartupLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "agent_sandbox_claim_startup_latency_ms",
Help: "End-to-end latency from SandboxClaim creation to Pod Ready state in milliseconds.",
Help: "End-to-end latency from SandboxClaim creation to Sandbox Ready state in milliseconds.",
// Buckets for latency from 50ms to 4 minutes
Buckets: []float64{50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, 60000, 120000, 240000},
},
[]string{"launch_type", "sandbox_template"},
)

// ClaimControllerStartupLatency measures the time from controller first observed timestamp to SandboxClaim Ready state.
// Labels:
// - launch_type: "warm", "cold", "unknown"
// - sandbox_template: the SandboxTemplateRef
ClaimControllerStartupLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "agent_sandbox_claim_controller_startup_latency_ms",
Help: "Latency from controller first observed SandboxClaim to Sandbox Ready state in milliseconds.",
// Buckets for latency from 50ms to 4 minutes
Buckets: []float64{50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, 60000, 120000, 240000},
},
Expand Down Expand Up @@ -91,6 +105,7 @@ var (
// init registers the package's custom collectors with the global
// controller-runtime metrics registry so they are exposed on the
// controller's /metrics endpoint. MustRegister panics on duplicate
// registration, which surfaces wiring mistakes at startup.
func init() {
	metrics.Registry.MustRegister(
		ClaimStartupLatency,
		ClaimControllerStartupLatency,
		SandboxCreationLatency,
		SandboxClaimCreationTotal,
	)
}
Expand All @@ -101,6 +116,12 @@ func RecordClaimStartupLatency(startTime time.Time, launchType, templateName str
ClaimStartupLatency.WithLabelValues(launchType, templateName).Observe(duration)
}

// RecordClaimControllerStartupLatency observes, in milliseconds, the elapsed
// time from startTime (when the controller first observed the SandboxClaim)
// until now, labeled by launch type and template name.
func RecordClaimControllerStartupLatency(startTime time.Time, launchType, templateName string) {
	elapsed := time.Since(startTime)
	ClaimControllerStartupLatency.
		WithLabelValues(launchType, templateName).
		Observe(float64(elapsed.Milliseconds()))
}

// RecordSandboxCreationLatency records the measured latency duration for a sandbox creation.
func RecordSandboxCreationLatency(duration time.Duration, namespace, launchType, templateName string) {
SandboxCreationLatency.WithLabelValues(namespace, launchType, templateName).Observe(float64(duration.Milliseconds()))
Expand Down