From 8c9fc2226b15085e5768acd2d874ddcf4498adbc Mon Sep 17 00:00:00 2001
From: Marcus
Date: Tue, 29 Jul 2025 21:18:09 -0700
Subject: [PATCH 1/6] Implement metrics using otel

---
 .../src/memory_awaited_action_db.rs   | 108 ++-
 nativelink-util/BUILD.bazel           |   3 +
 nativelink-util/src/lib.rs            |   1 +
 nativelink-util/src/metrics.rs        | 618 ++++++++++++++++++
 nativelink-util/tests/metrics_test.rs | 114 ++++
 5 files changed, 842 insertions(+), 2 deletions(-)
 create mode 100644 nativelink-util/src/metrics.rs
 create mode 100644 nativelink-util/tests/metrics_test.rs

diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs
index 96bb2b87..04bee355 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -29,6 +29,9 @@ use nativelink_util::action_messages::{
 use nativelink_util::chunked_stream::ChunkedStream;
 use nativelink_util::evicting_map::{EvictingMap, LenEntry};
 use nativelink_util::instant_wrapper::InstantWrapper;
+use nativelink_util::metrics::{
+    EXECUTION_METRICS, ExecutionResult, ExecutionStage, make_execution_attributes,
+};
 use nativelink_util::spawn;
 use nativelink_util::task::JoinHandleDropGuard;
 use tokio::sync::{Notify, mpsc, watch};
@@ -630,6 +633,95 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI
             .is_same_stage(&new_awaited_action.state().stage);

         if !is_same_stage {
+            // Record metrics for stage transitions
+            let metrics = &*EXECUTION_METRICS;
+            let old_stage = &old_awaited_action.state().stage;
+            let new_stage = &new_awaited_action.state().stage;
+
+            // Track stage transitions
+            let base_attrs = make_execution_attributes(
+                "unknown",
+                None,
+                Some(old_awaited_action.action_info().priority),
+            );
+            metrics.execution_stage_transitions.add(1, &base_attrs);
+
+            // Update active count for old stage
+            let old_stage_attrs = match old_stage {
+                ActionStage::Unknown => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Unknown,
+                )],
+                ActionStage::CacheCheck => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::CacheCheck,
+                )],
+                ActionStage::Queued => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Queued,
+                )],
+                ActionStage::Executing => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Executing,
+                )],
+                ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
+                    vec![opentelemetry::KeyValue::new(
+                        nativelink_util::metrics::EXECUTION_STAGE,
+                        ExecutionStage::Completed,
+                    )]
+                }
+            };
+            metrics.execution_active_count.add(-1, &old_stage_attrs);
+
+            // Update active count for new stage
+            let new_stage_attrs = match new_stage {
+                ActionStage::Unknown => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Unknown,
+                )],
+                ActionStage::CacheCheck => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::CacheCheck,
+                )],
+                ActionStage::Queued => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Queued,
+                )],
+                ActionStage::Executing => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Executing,
+                )],
+                ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
+                    vec![opentelemetry::KeyValue::new(
+                        nativelink_util::metrics::EXECUTION_STAGE,
+                        ExecutionStage::Completed,
+                    )]
+                }
+            };
+            metrics.execution_active_count.add(1, &new_stage_attrs);
+
+            // Record completion metrics
+            if let ActionStage::Completed(action_result) = new_stage {
+                let result_attrs = if action_result.exit_code == 0 {
+                    vec![opentelemetry::KeyValue::new(
+                        nativelink_util::metrics::EXECUTION_RESULT,
+                        ExecutionResult::Success,
+                    )]
+                } else {
+                    vec![opentelemetry::KeyValue::new(
+                        nativelink_util::metrics::EXECUTION_RESULT,
+                        ExecutionResult::Failure,
+                    )]
+                };
+                metrics.execution_completed_count.add(1, &result_attrs);
+            } else if let ActionStage::CompletedFromCache(_) = new_stage {
+                let result_attrs = vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_RESULT,
+                    ExecutionResult::CacheHit,
+                )];
+                metrics.execution_completed_count.add(1, &result_attrs);
+            }
+
             self.sorted_action_info_hash_keys
                 .process_state_changes(&old_awaited_action, &new_awaited_action)?;
             Self::process_state_changes_for_hash_key_map(
@@ -695,8 +787,11 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI
             ActionUniqueQualifier::Uncacheable(_unique_key) => None,
         };
         let operation_id = OperationId::default();
-        let awaited_action =
-            AwaitedAction::new(operation_id.clone(), action_info, (self.now_fn)().now());
+        let awaited_action = AwaitedAction::new(
+            operation_id.clone(),
+            action_info.clone(),
+            (self.now_fn)().now(),
+        );
         debug_assert!(
             ActionStage::Queued == awaited_action.state().stage,
             "Expected action to be queued"
         );
@@ -731,6 +826,15 @@
             }
         }

+        // Record metric for new action entering the queue
+        let metrics = &*EXECUTION_METRICS;
+        let _base_attrs = make_execution_attributes("unknown", None, Some(action_info.priority));
+        let queued_attrs = vec![opentelemetry::KeyValue::new(
+            nativelink_util::metrics::EXECUTION_STAGE,
+            ExecutionStage::Queued,
+        )];
+        metrics.execution_active_count.add(1, &queued_attrs);
+
         self.sorted_action_info_hash_keys
             .insert_sort_map_for_stage(
                 &ActionStage::Queued,
diff --git a/nativelink-util/BUILD.bazel b/nativelink-util/BUILD.bazel
index c288c7e6..d1e1e557 100644
--- a/nativelink-util/BUILD.bazel
+++ b/nativelink-util/BUILD.bazel
@@ -24,6 +24,7 @@ rust_library(
         "src/instant_wrapper.rs",
         "src/known_platform_property_provider.rs",
         "src/lib.rs",
+        "src/metrics.rs",
         "src/metrics_utils.rs",
         "src/operation_state_manager.rs",
         "src/origin_event.rs",
@@ -95,6 +96,7 @@ rust_test_suite(
         "tests/evicting_map_test.rs",
         "tests/fastcdc_test.rs",
         "tests/health_utils_test.rs",
+        "tests/metrics_test.rs",
         "tests/operation_id_tests.rs",
         "tests/origin_event_test.rs",
         "tests/proto_stream_utils_test.rs",
@@ -120,6 +122,7 @@ rust_test_suite(
         "@crates//:http-body-util",
         "@crates//:hyper-1.6.0",
         "@crates//:mock_instant",
+        "@crates//:opentelemetry",
         "@crates//:parking_lot",
         "@crates//:pretty_assertions",
         "@crates//:rand",
diff --git a/nativelink-util/src/lib.rs b/nativelink-util/src/lib.rs
index bcab5b23..c29069f6 100644
--- a/nativelink-util/src/lib.rs
+++ b/nativelink-util/src/lib.rs
@@ -25,6 +25,7 @@ pub mod fs;
 pub mod health_utils;
 pub mod instant_wrapper;
 pub mod known_platform_property_provider;
+pub mod metrics;
 pub mod metrics_utils;
 pub mod operation_state_manager;
 pub mod origin_event;
diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs
new file mode 100644
index 00000000..3b8dea07
--- /dev/null
+++ b/nativelink-util/src/metrics.rs
@@ -0,0 +1,618 @@
+// Copyright 2025 The NativeLink Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::LazyLock;
+
+use opentelemetry::{InstrumentationScope, KeyValue, Value, global, metrics};
+
+// Metric attribute keys for cache operations.
+pub const CACHE_TYPE: &str = "cache.type";
+pub const CACHE_OPERATION: &str = "cache.operation.name";
+pub const CACHE_RESULT: &str = "cache.operation.result";
+
+// Metric attribute keys for remote execution operations.
+pub const EXECUTION_STAGE: &str = "execution.stage";
+pub const EXECUTION_RESULT: &str = "execution.result";
+pub const EXECUTION_INSTANCE: &str = "execution.instance";
+pub const EXECUTION_PRIORITY: &str = "execution.priority";
+pub const EXECUTION_WORKER_ID: &str = "execution.worker_id";
+pub const EXECUTION_EXIT_CODE: &str = "execution.exit_code";
+
+/// Cache operation types for metrics classification.
+#[derive(Debug, Clone, Copy)]
+pub enum CacheOperationName {
+    /// Data retrieval operations (get, peek, contains, etc.)
+    Read,
+    /// Data storage operations (insert, update, replace, etc.)
+    Write,
+    /// Explicit data removal operations
+    Delete,
+    /// Automatic cache maintenance (evictions, TTL cleanup, etc.)
+    Evict,
+}
+
+impl From<CacheOperationName> for Value {
+    fn from(op: CacheOperationName) -> Self {
+        match op {
+            CacheOperationName::Read => Self::from("read"),
+            CacheOperationName::Write => Self::from("write"),
+            CacheOperationName::Delete => Self::from("delete"),
+            CacheOperationName::Evict => Self::from("evict"),
+        }
+    }
+}
+
+/// Results of cache operations.
+///
+/// Result semantics vary by operation type:
+/// - Read: Hit/Miss/Expired indicate data availability
+/// - Write/Delete/Evict: Success/Error indicate completion status
+#[derive(Debug, Clone, Copy)]
+pub enum CacheOperationResult {
+    /// Data found and valid (Read operations)
+    Hit,
+    /// Data not found (Read operations)
+    Miss,
+    /// Data found but invalid/expired (Read operations)
+    Expired,
+    /// Operation completed successfully (Write/Delete/Evict operations)
+    Success,
+    /// Operation failed (any operation type)
+    Error,
+}
+
+impl From<CacheOperationResult> for Value {
+    fn from(result: CacheOperationResult) -> Self {
+        match result {
+            CacheOperationResult::Hit => Self::from("hit"),
+            CacheOperationResult::Miss => Self::from("miss"),
+            CacheOperationResult::Expired => Self::from("expired"),
+            CacheOperationResult::Success => Self::from("success"),
+            CacheOperationResult::Error => Self::from("error"),
+        }
+    }
+}
+
+/// Remote execution stages for metrics classification.
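+///
+/// Each stage converts into an `opentelemetry::Value` attribute value through
+/// the `From` impl below. A minimal sketch (illustrative, not part of the
+/// patch):
+///
+/// ```ignore
+/// use opentelemetry::Value;
+/// assert_eq!(Value::from(ExecutionStage::Queued), Value::from("queued"));
+/// ```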
+#[derive(Debug, Clone, Copy)]
+pub enum ExecutionStage {
+    /// Unknown stage
+    Unknown,
+    /// Checking cache for existing results
+    CacheCheck,
+    /// Action is queued waiting for execution
+    Queued,
+    /// Action is being executed by a worker
+    Executing,
+    /// Action execution completed
+    Completed,
+}
+
+impl From<ExecutionStage> for Value {
+    fn from(stage: ExecutionStage) -> Self {
+        match stage {
+            ExecutionStage::Unknown => Self::from("unknown"),
+            ExecutionStage::CacheCheck => Self::from("cache_check"),
+            ExecutionStage::Queued => Self::from("queued"),
+            ExecutionStage::Executing => Self::from("executing"),
+            ExecutionStage::Completed => Self::from("completed"),
+        }
+    }
+}
+
+/// Results of remote execution operations.
+#[derive(Debug, Clone, Copy)]
+pub enum ExecutionResult {
+    /// Execution completed successfully
+    Success,
+    /// Execution failed
+    Failure,
+    /// Execution was cancelled
+    Cancelled,
+    /// Execution timed out
+    Timeout,
+    /// Result was found in cache
+    CacheHit,
+}
+
+impl From<ExecutionResult> for Value {
+    fn from(result: ExecutionResult) -> Self {
+        match result {
+            ExecutionResult::Success => Self::from("success"),
+            ExecutionResult::Failure => Self::from("failure"),
+            ExecutionResult::Cancelled => Self::from("cancelled"),
+            ExecutionResult::Timeout => Self::from("timeout"),
+            ExecutionResult::CacheHit => Self::from("cache_hit"),
+        }
+    }
+}
+
+/// Pre-allocated attribute combinations for efficient cache metrics collection.
+///
+/// Avoids runtime allocation by pre-computing common attribute combinations
+/// for cache operations and results.
+#[derive(Debug)]
+pub struct CacheMetricAttrs {
+    // Read operation attributes
+    read_hit: Vec<KeyValue>,
+    read_miss: Vec<KeyValue>,
+    read_expired: Vec<KeyValue>,
+
+    // Write operation attributes
+    write_success: Vec<KeyValue>,
+    write_error: Vec<KeyValue>,
+
+    // Delete operation attributes
+    delete_success: Vec<KeyValue>,
+    delete_miss: Vec<KeyValue>,
+    delete_error: Vec<KeyValue>,
+
+    // Evict operation attributes
+    evict_success: Vec<KeyValue>,
+    evict_expired: Vec<KeyValue>,
+}
+
+impl CacheMetricAttrs {
+    /// Creates a new set of pre-computed attributes.
+    ///
+    /// The `base_attrs` are included in all attribute combinations (e.g., cache
+    /// type, instance ID).
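+    ///
+    /// A minimal usage sketch (illustrative; the attribute value is hypothetical):
+    ///
+    /// ```ignore
+    /// use opentelemetry::KeyValue;
+    /// let attrs = CacheMetricAttrs::new(&[KeyValue::new(CACHE_TYPE, "memory")]);
+    /// // One base attribute plus the operation and result attributes.
+    /// assert_eq!(attrs.read_hit().len(), 3);
+    /// ```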
+    #[must_use]
+    pub fn new(base_attrs: &[KeyValue]) -> Self {
+        let make_attrs = |op: CacheOperationName, result: CacheOperationResult| {
+            let mut attrs = base_attrs.to_vec();
+            attrs.push(KeyValue::new(CACHE_OPERATION, op));
+            attrs.push(KeyValue::new(CACHE_RESULT, result));
+            attrs
+        };
+
+        Self {
+            read_hit: make_attrs(CacheOperationName::Read, CacheOperationResult::Hit),
+            read_miss: make_attrs(CacheOperationName::Read, CacheOperationResult::Miss),
+            read_expired: make_attrs(CacheOperationName::Read, CacheOperationResult::Expired),
+
+            write_success: make_attrs(CacheOperationName::Write, CacheOperationResult::Success),
+            write_error: make_attrs(CacheOperationName::Write, CacheOperationResult::Error),
+
+            delete_success: make_attrs(CacheOperationName::Delete, CacheOperationResult::Success),
+            delete_miss: make_attrs(CacheOperationName::Delete, CacheOperationResult::Miss),
+            delete_error: make_attrs(CacheOperationName::Delete, CacheOperationResult::Error),
+
+            evict_success: make_attrs(CacheOperationName::Evict, CacheOperationResult::Success),
+            evict_expired: make_attrs(CacheOperationName::Evict, CacheOperationResult::Expired),
+        }
+    }
+
+    // Attribute accessors
+    #[must_use]
+    pub fn read_hit(&self) -> &[KeyValue] {
+        &self.read_hit
+    }
+    #[must_use]
+    pub fn read_miss(&self) -> &[KeyValue] {
+        &self.read_miss
+    }
+    #[must_use]
+    pub fn read_expired(&self) -> &[KeyValue] {
+        &self.read_expired
+    }
+    #[must_use]
+    pub fn write_success(&self) -> &[KeyValue] {
+        &self.write_success
+    }
+    #[must_use]
+    pub fn write_error(&self) -> &[KeyValue] {
+        &self.write_error
+    }
+    #[must_use]
+    pub fn delete_success(&self) -> &[KeyValue] {
+        &self.delete_success
+    }
+    #[must_use]
+    pub fn delete_miss(&self) -> &[KeyValue] {
+        &self.delete_miss
+    }
+    #[must_use]
+    pub fn delete_error(&self) -> &[KeyValue] {
+        &self.delete_error
+    }
+    #[must_use]
+    pub fn evict_success(&self) -> &[KeyValue] {
+        &self.evict_success
+    }
+    #[must_use]
+    pub fn evict_expired(&self) -> &[KeyValue] {
+        &self.evict_expired
+    }
+}
+
+/// Pre-allocated attribute combinations for efficient remote execution metrics collection.
+#[derive(Debug)]
+pub struct ExecutionMetricAttrs {
+    // Stage transition attributes
+    unknown: Vec<KeyValue>,
+    cache_check: Vec<KeyValue>,
+    queued: Vec<KeyValue>,
+    executing: Vec<KeyValue>,
+    completed_success: Vec<KeyValue>,
+    completed_failure: Vec<KeyValue>,
+    completed_cancelled: Vec<KeyValue>,
+    completed_timeout: Vec<KeyValue>,
+    completed_cache_hit: Vec<KeyValue>,
+}
+
+impl ExecutionMetricAttrs {
+    /// Creates a new set of pre-computed attributes.
+    ///
+    /// The `base_attrs` are included in all attribute combinations (e.g., instance
+    /// name, worker ID).
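+    ///
+    /// A minimal usage sketch (illustrative; the attribute value is hypothetical):
+    ///
+    /// ```ignore
+    /// use opentelemetry::KeyValue;
+    /// let attrs = ExecutionMetricAttrs::new(&[KeyValue::new(EXECUTION_INSTANCE, "main")]);
+    /// // Completed stages carry both a stage and a result attribute.
+    /// assert_eq!(attrs.completed_success().len(), 3);
+    /// ```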
+    #[must_use]
+    pub fn new(base_attrs: &[KeyValue]) -> Self {
+        let make_attrs = |stage: ExecutionStage, result: Option<ExecutionResult>| {
+            let mut attrs = base_attrs.to_vec();
+            attrs.push(KeyValue::new(EXECUTION_STAGE, stage));
+            if let Some(result) = result {
+                attrs.push(KeyValue::new(EXECUTION_RESULT, result));
+            }
+            attrs
+        };
+
+        Self {
+            unknown: make_attrs(ExecutionStage::Unknown, None),
+            cache_check: make_attrs(ExecutionStage::CacheCheck, None),
+            queued: make_attrs(ExecutionStage::Queued, None),
+            executing: make_attrs(ExecutionStage::Executing, None),
+            completed_success: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::Success),
+            ),
+            completed_failure: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::Failure),
+            ),
+            completed_cancelled: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::Cancelled),
+            ),
+            completed_timeout: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::Timeout),
+            ),
+            completed_cache_hit: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::CacheHit),
+            ),
+        }
+    }
+
+    // Attribute accessors
+    #[must_use]
+    pub fn unknown(&self) -> &[KeyValue] {
+        &self.unknown
+    }
+    #[must_use]
+    pub fn cache_check(&self) -> &[KeyValue] {
+        &self.cache_check
+    }
+    #[must_use]
+    pub fn queued(&self) -> &[KeyValue] {
+        &self.queued
+    }
+    #[must_use]
+    pub fn executing(&self) -> &[KeyValue] {
+        &self.executing
+    }
+    #[must_use]
+    pub fn completed_success(&self) -> &[KeyValue] {
+        &self.completed_success
+    }
+    #[must_use]
+    pub fn completed_failure(&self) -> &[KeyValue] {
+        &self.completed_failure
+    }
+    #[must_use]
+    pub fn completed_cancelled(&self) -> &[KeyValue] {
+        &self.completed_cancelled
+    }
+    #[must_use]
+    pub fn completed_timeout(&self) -> &[KeyValue] {
+        &self.completed_timeout
+    }
+    #[must_use]
+    pub fn completed_cache_hit(&self) -> &[KeyValue] {
+        &self.completed_cache_hit
+    }
+}
+
+/// Global cache metrics instruments.
+pub static CACHE_METRICS: LazyLock<CacheMetrics> = LazyLock::new(|| {
+    let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build());
+
+    CacheMetrics {
+        cache_operation_duration: meter
+            .f64_histogram("cache.operation.duration")
+            .with_description("Duration of cache operations in milliseconds")
+            .with_unit("ms")
+            // The range of these is quite large as a cache might be backed by
+            // memory, a filesystem, or network storage. The current values were
+            // determined empirically and might need adjustment.
+            .with_boundaries(vec![
+                // Microsecond range
+                0.001, // 1μs
+                0.005, // 5μs
+                0.01,  // 10μs
+                0.05,  // 50μs
+                0.1,   // 100μs
+                // Sub-millisecond range
+                0.2, // 200μs
+                0.5, // 500μs
+                1.0, // 1ms
+                // Low millisecond range
+                2.0,   // 2ms
+                5.0,   // 5ms
+                10.0,  // 10ms
+                20.0,  // 20ms
+                50.0,  // 50ms
+                100.0, // 100ms
+                // Higher latency range
+                200.0,  // 200ms
+                500.0,  // 500ms
+                1000.0, // 1 second
+                2000.0, // 2 seconds
+                5000.0, // 5 seconds
+            ])
+            .build(),
+
+        cache_operations: meter
+            .u64_counter("cache.operations")
+            .with_description("Total cache operations by type and result")
+            .build(),
+
+        cache_io: meter
+            .u64_counter("cache.io")
+            .with_description("Total bytes processed by cache operations")
+            .with_unit("By")
+            .build(),
+
+        cache_size: meter
+            .i64_up_down_counter("cache.size")
+            .with_description("Current total size of cached data")
+            .with_unit("By")
+            .build(),
+
+        cache_entries: meter
+            .i64_up_down_counter("cache.entries")
+            .with_description("Current number of cached entries")
+            .with_unit("{entry}")
+            .build(),
+
+        cache_entry_size: meter
+            .u64_histogram("cache.item.size")
+            .with_description("Size distribution of cached entries")
+            .with_unit("By")
+            .build(),
+    }
+});
+
+/// OpenTelemetry metrics instruments for cache monitoring.
+#[derive(Debug)]
+pub struct CacheMetrics {
+    /// Histogram of cache operation durations in milliseconds
+    pub cache_operation_duration: metrics::Histogram<f64>,
+    /// Counter of cache operations by type and result
+    pub cache_operations: metrics::Counter<u64>,
+    /// Counter of bytes read/written during cache operations
+    pub cache_io: metrics::Counter<u64>,
+    /// Current total size of all cached data in bytes
+    pub cache_size: metrics::UpDownCounter<i64>,
+    /// Current number of entries in cache
+    pub cache_entries: metrics::UpDownCounter<i64>,
+    /// Histogram of individual cache entry sizes in bytes
+    pub cache_entry_size: metrics::Histogram<u64>,
+}
+
+/// Global remote execution metrics instruments.
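+///
+/// Instruments are created on first access and are safe to use from any
+/// thread. A minimal sketch of recording through them (illustrative values):
+///
+/// ```ignore
+/// use nativelink_util::metrics::{EXECUTION_METRICS, make_execution_attributes};
+///
+/// let attrs = make_execution_attributes("main", None, Some(0));
+/// EXECUTION_METRICS.execution_retry_count.add(1, &attrs);
+/// ```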
+pub static EXECUTION_METRICS: LazyLock<ExecutionMetrics> = LazyLock::new(|| {
+    let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build());
+
+    ExecutionMetrics {
+        execution_stage_duration: meter
+            .f64_histogram("execution.stage.duration")
+            .with_description("Duration of each execution stage in seconds")
+            .with_unit("s")
+            .with_boundaries(vec![
+                // Sub-second range
+                0.001, // 1ms
+                0.01,  // 10ms
+                0.1,   // 100ms
+                0.5,   // 500ms
+                1.0,   // 1s
+                // Multi-second range
+                2.0,    // 2s
+                5.0,    // 5s
+                10.0,   // 10s
+                30.0,   // 30s
+                60.0,   // 1 minute
+                120.0,  // 2 minutes
+                300.0,  // 5 minutes
+                600.0,  // 10 minutes
+                1800.0, // 30 minutes
+                3600.0, // 1 hour
+            ])
+            .build(),
+
+        execution_total_duration: meter
+            .f64_histogram("execution.total.duration")
+            .with_description(
+                "Total duration of action execution from submission to completion in seconds",
+            )
+            .with_unit("s")
+            .with_boundaries(vec![
+                // Sub-second range
+                0.01, // 10ms
+                0.1,  // 100ms
+                0.5,  // 500ms
+                1.0,  // 1s
+                // Multi-second range
+                5.0,    // 5s
+                10.0,   // 10s
+                30.0,   // 30s
+                60.0,   // 1 minute
+                300.0,  // 5 minutes
+                600.0,  // 10 minutes
+                1800.0, // 30 minutes
+                3600.0, // 1 hour
+                7200.0, // 2 hours
+            ])
+            .build(),
+
+        execution_queue_time: meter
+            .f64_histogram("execution.queue.time")
+            .with_description("Time spent waiting in queue before execution in seconds")
+            .with_unit("s")
+            .with_boundaries(vec![
+                0.001, // 1ms
+                0.01,  // 10ms
+                0.1,   // 100ms
+                0.5,   // 500ms
+                1.0,   // 1s
+                2.0,   // 2s
+                5.0,   // 5s
+                10.0,  // 10s
+                30.0,  // 30s
+                60.0,  // 1 minute
+                300.0, // 5 minutes
+                600.0, // 10 minutes
+            ])
+            .build(),
+
+        execution_active_count: meter
+            .i64_up_down_counter("execution.active.count")
+            .with_description("Number of actions currently in each stage")
+            .with_unit("{action}")
+            .build(),
+
+        execution_completed_count: meter
+            .u64_counter("execution.completed.count")
+            .with_description("Total number of completed executions by result")
+            .with_unit("{action}")
+            .build(),
+
+        execution_stage_transitions: meter
+            .u64_counter("execution.stage.transitions")
+            .with_description("Number of stage transitions")
+            .with_unit("{transition}")
+            .build(),
+
+        execution_output_size: meter
+            .u64_histogram("execution.output.size")
+            .with_description("Size of execution outputs in bytes")
+            .with_unit("By")
+            .with_boundaries(vec![
+                1_024.0,          // 1KB
+                10_240.0,         // 10KB
+                102_400.0,        // 100KB
+                1_048_576.0,      // 1MB
+                10_485_760.0,     // 10MB
+                104_857_600.0,    // 100MB
+                1_073_741_824.0,  // 1GB
+                10_737_418_240.0, // 10GB
+            ])
+            .build(),
+
+        execution_cpu_time: meter
+            .f64_histogram("execution.cpu.time")
+            .with_description("CPU time consumed by action execution in seconds")
+            .with_unit("s")
+            .with_boundaries(vec![
+                0.01,   // 10ms
+                0.1,    // 100ms
+                1.0,    // 1s
+                10.0,   // 10s
+                60.0,   // 1 minute
+                300.0,  // 5 minutes
+                600.0,  // 10 minutes
+                1800.0, // 30 minutes
+                3600.0, // 1 hour
+            ])
+            .build(),
+
+        execution_memory_usage: meter
+            .u64_histogram("execution.memory.usage")
+            .with_description("Peak memory usage during execution in bytes")
+            .with_unit("By")
+            .with_boundaries(vec![
+                1_048_576.0,      // 1MB
+                10_485_760.0,     // 10MB
+                104_857_600.0,    // 100MB
+                524_288_000.0,    // 500MB
+                1_073_741_824.0,  // 1GB
+                5_368_709_120.0,  // 5GB
+                10_737_418_240.0, // 10GB
+                53_687_091_200.0, // 50GB
+            ])
+            .build(),
+
+        execution_retry_count: meter
+            .u64_counter("execution.retry.count")
+            .with_description("Number of execution retries")
+            .with_unit("{retry}")
+            .build(),
+    }
+});
+
+/// OpenTelemetry metrics instruments for remote execution monitoring.
+#[derive(Debug)]
+pub struct ExecutionMetrics {
+    /// Histogram of stage durations in seconds
+    pub execution_stage_duration: metrics::Histogram<f64>,
+    /// Histogram of total execution durations in seconds
+    pub execution_total_duration: metrics::Histogram<f64>,
+    /// Histogram of queue wait times in seconds
+    pub execution_queue_time: metrics::Histogram<f64>,
+    /// Current number of actions in each stage
+    pub execution_active_count: metrics::UpDownCounter<i64>,
+    /// Total number of completed executions
+    pub execution_completed_count: metrics::Counter<u64>,
+    /// Number of stage transitions
+    pub execution_stage_transitions: metrics::Counter<u64>,
+    /// Histogram of output sizes in bytes
+    pub execution_output_size: metrics::Histogram<u64>,
+    /// Histogram of CPU time in seconds
+    pub execution_cpu_time: metrics::Histogram<f64>,
+    /// Histogram of peak memory usage in bytes
+    pub execution_memory_usage: metrics::Histogram<u64>,
+    /// Counter for execution retries
+    pub execution_retry_count: metrics::Counter<u64>,
+}
+
+/// Helper function to create attributes for execution metrics
+#[must_use]
+pub fn make_execution_attributes(
+    instance_name: &str,
+    worker_id: Option<&str>,
+    priority: Option<i32>,
+) -> Vec<KeyValue> {
+    let mut attrs = vec![KeyValue::new(EXECUTION_INSTANCE, instance_name.to_string())];
+
+    if let Some(worker_id) = worker_id {
+        attrs.push(KeyValue::new(EXECUTION_WORKER_ID, worker_id.to_string()));
+    }
+
+    if let Some(priority) = priority {
+        attrs.push(KeyValue::new(EXECUTION_PRIORITY, i64::from(priority)));
+    }
+
+    attrs
+}
diff --git a/nativelink-util/tests/metrics_test.rs b/nativelink-util/tests/metrics_test.rs
new file mode 100644
index 00000000..0f8548de
--- /dev/null
+++ b/nativelink-util/tests/metrics_test.rs
@@ -0,0 +1,114 @@
+// Copyright 2025 The NativeLink Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use nativelink_util::metrics::{
+    CACHE_METRICS, CacheMetricAttrs, EXECUTION_METRICS, ExecutionMetricAttrs,
+    make_execution_attributes,
+};
+use opentelemetry::KeyValue;
+
+#[test]
+fn test_cache_metric_attrs() {
+    let base_attrs = vec![
+        KeyValue::new("cache.type", "test_cache"),
+        KeyValue::new("instance", "test_instance"),
+    ];
+
+    let attrs = CacheMetricAttrs::new(&base_attrs);
+
+    // Verify that the pre-computed attributes contain the expected values
+    let read_hit_attrs = attrs.read_hit();
+    assert_eq!(read_hit_attrs.len(), 4);
+    assert!(
+        read_hit_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "cache.type" && kv.value.to_string() == "test_cache")
+    );
+    assert!(
+        read_hit_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "cache.operation.name" && kv.value.to_string() == "read")
+    );
+    assert!(
+        read_hit_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "cache.operation.result" && kv.value.to_string() == "hit")
+    );
+}
+
+#[test]
+fn test_execution_metric_attrs() {
+    let base_attrs = vec![
+        KeyValue::new("execution.instance", "test_instance"),
+        KeyValue::new("execution.worker_id", "worker_123"),
+    ];
+
+    let attrs = ExecutionMetricAttrs::new(&base_attrs);
+
+    // Verify that the pre-computed attributes contain the expected values
+    let queued_attrs = attrs.queued();
+    assert_eq!(queued_attrs.len(), 3);
+    assert!(queued_attrs.iter().any(
+        |kv| kv.key.as_str() == "execution.instance" && kv.value.to_string() == "test_instance"
+    ));
+    assert!(
+        queued_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.stage" && kv.value.to_string() == "queued")
+    );
+
+    let completed_success_attrs = attrs.completed_success();
+    assert_eq!(completed_success_attrs.len(), 4);
+    assert!(
+        completed_success_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.stage" && kv.value.to_string() == "completed")
+    );
+    assert!(
+        completed_success_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.result" && kv.value.to_string() == "success")
+    );
+}
+
+#[test]
+fn test_make_execution_attributes() {
+    let attrs = make_execution_attributes("test_instance", Some("worker_456"), Some(100));
+
+    assert_eq!(attrs.len(), 3);
+    assert!(attrs.iter().any(
+        |kv| kv.key.as_str() == "execution.instance" && kv.value.to_string() == "test_instance"
+    ));
+    assert!(
+        attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.worker_id"
+                && kv.value.to_string() == "worker_456")
+    );
+    assert!(
+        attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.priority"
+                && kv.value == opentelemetry::Value::I64(100))
+    );
+}
+
+#[test]
+fn test_metrics_lazy_initialization() {
+    // Verify that the lazy static initialization works
+    let _cache_metrics = &*CACHE_METRICS;
+    let _execution_metrics = &*EXECUTION_METRICS;
+
+    // If we got here without panicking, the metrics were initialized successfully
+}

From 5ae7d49595cac0317c590621d44b8ffd2f7c9f78 Mon Sep 17 00:00:00 2001
From: Marcus
Date: Sat, 6 Sep 2025 17:03:09 +0900
Subject: [PATCH 2/6] from trait

---
 .../src/memory_awaited_action_db.rs | 56 +++----------------
 nativelink-util/src/metrics.rs      | 16 ++++++
 2 files changed, 24 insertions(+), 48 deletions(-)

diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs
index 04bee355..4ffcd2d2 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -647,57 +647,17 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI
             metrics.execution_stage_transitions.add(1, &base_attrs);

             // Update active count for old stage
-            let old_stage_attrs = match old_stage {
-                ActionStage::Unknown => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Unknown,
-                )],
-                ActionStage::CacheCheck => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::CacheCheck,
-                )],
-                ActionStage::Queued => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Queued,
-                )],
-                ActionStage::Executing => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Executing,
-                )],
-                ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
-                    vec![opentelemetry::KeyValue::new(
-                        nativelink_util::metrics::EXECUTION_STAGE,
-                        ExecutionStage::Completed,
-                    )]
-                }
-            };
+            let old_stage_attrs = vec![opentelemetry::KeyValue::new(
+                nativelink_util::metrics::EXECUTION_STAGE,
+                ExecutionStage::from(old_stage.clone()),
+            )];
             metrics.execution_active_count.add(-1, &old_stage_attrs);

             // Update active count for new stage
-            let new_stage_attrs = match new_stage {
-                ActionStage::Unknown => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Unknown,
-                )],
-                ActionStage::CacheCheck => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::CacheCheck,
-                )],
-                ActionStage::Queued => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Queued,
-                )],
-                ActionStage::Executing => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Executing,
-                )],
-                ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
-                    vec![opentelemetry::KeyValue::new(
-                        nativelink_util::metrics::EXECUTION_STAGE,
-                        ExecutionStage::Completed,
-                    )]
-                }
-            };
+            let new_stage_attrs = vec![opentelemetry::KeyValue::new(
+                nativelink_util::metrics::EXECUTION_STAGE,
+                ExecutionStage::from(new_stage.clone()),
+            )];
             metrics.execution_active_count.add(1, &new_stage_attrs);

             // Record completion metrics
diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs
index 3b8dea07..315b5dcb 100644
--- a/nativelink-util/src/metrics.rs
+++ b/nativelink-util/src/metrics.rs
@@ -16,6 +16,8 @@ use std::sync::LazyLock;

 use opentelemetry::{InstrumentationScope, KeyValue, Value, global, metrics};

+use crate::action_messages::ActionStage;
+
 // Metric attribute keys for cache operations.
 pub const CACHE_TYPE: &str = "cache.type";
 pub const CACHE_OPERATION: &str = "cache.operation.name";
 pub const CACHE_RESULT: &str = "cache.operation.result";
@@ -111,6 +113,20 @@ impl From<ExecutionStage> for Value {
     }
 }

+impl From<ActionStage> for ExecutionStage {
+    fn from(stage: ActionStage) -> Self {
+        match stage {
+            ActionStage::Unknown => ExecutionStage::Unknown,
+            ActionStage::CacheCheck => ExecutionStage::CacheCheck,
+            ActionStage::Queued => ExecutionStage::Queued,
+            ActionStage::Executing => ExecutionStage::Executing,
+            ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
+                ExecutionStage::Completed
+            }
+        }
+    }
+}
+
 /// Results of remote execution operations.
 #[derive(Debug, Clone, Copy)]
 pub enum ExecutionResult {

From 2b385341a688db720b5a5b12aae4aed8eeb41c6b Mon Sep 17 00:00:00 2001
From: Marcus
Date: Sat, 6 Sep 2025 17:14:12 +0900
Subject: [PATCH 3/6] add tests and refactor expensive clone

---
 .../src/memory_awaited_action_db.rs   |  4 +-
 nativelink-util/src/metrics.rs        | 16 +++-
 nativelink-util/tests/metrics_test.rs | 86 ++++++++++++++++++-
 3 files changed, 102 insertions(+), 4 deletions(-)

diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs
index 4ffcd2d2..d31e48a4 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -649,14 +649,14 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI
             // Update active count for old stage
             let old_stage_attrs = vec![opentelemetry::KeyValue::new(
                 nativelink_util::metrics::EXECUTION_STAGE,
-                ExecutionStage::from(old_stage.clone()),
+                ExecutionStage::from(old_stage),
             )];
             metrics.execution_active_count.add(-1, &old_stage_attrs);

             // Update active count for new stage
             let new_stage_attrs = vec![opentelemetry::KeyValue::new(
                 nativelink_util::metrics::EXECUTION_STAGE,
-                ExecutionStage::from(new_stage.clone()),
+                ExecutionStage::from(new_stage),
             )];
             metrics.execution_active_count.add(1, &new_stage_attrs);

diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs
index 315b5dcb..63dce9cc 100644
--- a/nativelink-util/src/metrics.rs
+++ b/nativelink-util/src/metrics.rs
@@ -87,7 +87,7 @@ impl From<CacheOperationResult> for Value {
 }

 /// Remote execution stages for metrics classification.
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum ExecutionStage {
     /// Unknown stage
     Unknown,
@@ -127,6 +127,20 @@ impl From<ActionStage> for ExecutionStage {
     }
 }

+impl From<&ActionStage> for ExecutionStage {
+    fn from(stage: &ActionStage) -> Self {
+        match stage {
+            ActionStage::Unknown => ExecutionStage::Unknown,
+            ActionStage::CacheCheck => ExecutionStage::CacheCheck,
+            ActionStage::Queued => ExecutionStage::Queued,
+            ActionStage::Executing => ExecutionStage::Executing,
+            ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
+                ExecutionStage::Completed
+            }
+        }
+    }
+}
+
 /// Results of remote execution operations.
 #[derive(Debug, Clone, Copy)]
 pub enum ExecutionResult {
diff --git a/nativelink-util/tests/metrics_test.rs b/nativelink-util/tests/metrics_test.rs
index 0f8548de..e52bfb2d 100644
--- a/nativelink-util/tests/metrics_test.rs
+++ b/nativelink-util/tests/metrics_test.rs
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use nativelink_util::action_messages::{ActionResult, ActionStage}; use nativelink_util::metrics::{ - CACHE_METRICS, CacheMetricAttrs, EXECUTION_METRICS, ExecutionMetricAttrs, + CACHE_METRICS, CacheMetricAttrs, EXECUTION_METRICS, ExecutionMetricAttrs, ExecutionStage, make_execution_attributes, }; use opentelemetry::KeyValue; @@ -112,3 +113,86 @@ fn test_metrics_lazy_initialization() { // If we got here without panicking, the metrics were initialized successfully } + +#[test] +fn test_action_stage_to_execution_stage_conversion() { + // Test conversion from owned ActionStage values + assert_eq!( + ExecutionStage::from(ActionStage::Unknown), + ExecutionStage::Unknown + ); + assert_eq!( + ExecutionStage::from(ActionStage::CacheCheck), + ExecutionStage::CacheCheck + ); + assert_eq!( + ExecutionStage::from(ActionStage::Queued), + ExecutionStage::Queued + ); + assert_eq!( + ExecutionStage::from(ActionStage::Executing), + ExecutionStage::Executing + ); + + // Test that Completed variants map to ExecutionStage::Completed + let action_result = ActionResult::default(); + assert_eq!( + ExecutionStage::from(ActionStage::Completed(action_result.clone())), + ExecutionStage::Completed + ); + + // Note: We can't easily test CompletedFromCache without creating a ProtoActionResult, + // but the implementation handles it the same as Completed +} + +#[test] +fn test_action_stage_ref_to_execution_stage_conversion() { + // Test conversion from ActionStage references + let unknown = ActionStage::Unknown; + let cache_check = ActionStage::CacheCheck; + let queued = ActionStage::Queued; + let executing = ActionStage::Executing; + let completed = ActionStage::Completed(ActionResult::default()); + + assert_eq!(ExecutionStage::from(&unknown), ExecutionStage::Unknown); + assert_eq!( + ExecutionStage::from(&cache_check), + ExecutionStage::CacheCheck + ); + assert_eq!(ExecutionStage::from(&queued), ExecutionStage::Queued); + assert_eq!(ExecutionStage::from(&executing), ExecutionStage::Executing); + assert_eq!(ExecutionStage::from(&completed), ExecutionStage::Completed); +} + +#[test] +fn test_action_stage_conversion_avoids_clone() { + use nativelink_util::action_messages::{FileInfo, NameOrPath}; + use nativelink_util::common::DigestInfo; + + // This test verifies that using a reference doesn't clone the large ActionResult + let large_file_info = FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::new([0u8; 32], 100), + is_executable: false, + }; + let large_action_result = ActionResult { + output_files: vec![large_file_info; 1000], // Large vector to make clone expensive + ..Default::default() + }; + let completed = ActionStage::Completed(large_action_result); + + // Using a reference should be fast even with large data + let start = std::time::Instant::now(); + for _ in 0..10000 { + let _stage = ExecutionStage::from(&completed); + } + let elapsed = start.elapsed(); + + // This should complete very quickly since we're not cloning + // In practice, 10000 conversions should take less than 1ms + assert!( + elapsed.as_millis() < 100, + "Reference conversion took too long: {:?}", + elapsed + ); +} From 2960aa13c464678fd7e3a13b4c6376d55397e7fd Mon Sep 17 00:00:00 2001 From: Marcus Date: Sat, 6 Sep 2025 17:26:51 +0900 Subject: [PATCH 4/6] moved to the ternary operator --- .../src/memory_awaited_action_db.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs 
b/nativelink-scheduler/src/memory_awaited_action_db.rs
index d31e48a4..38d3ce97 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -662,17 +662,14 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI

             // Record completion metrics
             if let ActionStage::Completed(action_result) = new_stage {
-                let result_attrs = if action_result.exit_code == 0 {
-                    vec![opentelemetry::KeyValue::new(
-                        nativelink_util::metrics::EXECUTION_RESULT,
-                        ExecutionResult::Success,
-                    )]
-                } else {
-                    vec![opentelemetry::KeyValue::new(
-                        nativelink_util::metrics::EXECUTION_RESULT,
-                        ExecutionResult::Failure,
-                    )]
-                };
+                let result_attrs = vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_RESULT,
+                    if action_result.exit_code == 0 {
+                        ExecutionResult::Success
+                    } else {
+                        ExecutionResult::Failure
+                    },
+                )];
                 metrics.execution_completed_count.add(1, &result_attrs);
             } else if let ActionStage::CompletedFromCache(_) = new_stage {
                 let result_attrs = vec![opentelemetry::KeyValue::new(

From a7eb2d96176c891496ba7c173e66db4cc28301b6 Mon Sep 17 00:00:00 2001
From: Marcus
Date: Sat, 6 Sep 2025 22:41:12 +0900
Subject: [PATCH 5/6] add docs wrap otel impl

---
 .../vocabularies/TraceMachina/accept.txt      |   5 +
 deployment-examples/metrics/README.md         | 420 ++++++++++++++++++
 .../metrics/docker-compose.yaml               | 139 ++++++
 .../provisioning/dashboards/dashboard.yaml    |  12 +
 .../provisioning/datasources/prometheus.yaml  |  23 +
 .../metrics/kubernetes/otel-collector.yaml    | 274 ++++++++++++
 .../metrics/kubernetes/prometheus.yaml        | 338 ++++++++++++++
 .../metrics/otel-collector-config.yaml        | 139 ++++++
 .../metrics/prometheus-config.yaml            | 169 +++++++
 .../metrics/prometheus-recording-rules.yml    | 277 ++++++++++++
 .../docs/docs/deployment-examples/metrics.mdx | 420 ++++++++++++++++++
 11 files changed, 2216 insertions(+)
 create mode 100644 deployment-examples/metrics/README.md
 create mode 100644 deployment-examples/metrics/docker-compose.yaml
 create mode 100644 deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml
 create mode 100644 deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml
 create mode 100644 deployment-examples/metrics/kubernetes/otel-collector.yaml
 create mode 100644 deployment-examples/metrics/kubernetes/prometheus.yaml
 create mode 100644 deployment-examples/metrics/otel-collector-config.yaml
 create mode 100644 deployment-examples/metrics/prometheus-config.yaml
 create mode 100644 deployment-examples/metrics/prometheus-recording-rules.yml
 create mode 100644 web/platform/src/content/docs/docs/deployment-examples/metrics.mdx

diff --git a/.github/styles/config/vocabularies/TraceMachina/accept.txt b/.github/styles/config/vocabularies/TraceMachina/accept.txt
index d26ccae5..ce038a45 100644
--- a/.github/styles/config/vocabularies/TraceMachina/accept.txt
+++ b/.github/styles/config/vocabularies/TraceMachina/accept.txt
@@ -16,6 +16,8 @@ FFI
 FFIs
 GPUs
 Goma
+gzip
+[Hh]eatmap
 [Hh]ermeticity
 Istio
 JDK
@@ -106,7 +108,10 @@ Thirdwave
 Norwest
 Databricks
 Datadog
+Downsampling
 Brex
 Citrix
 Menlo
 benchmarked
+Thanos
+Quickwit
diff --git a/deployment-examples/metrics/README.md b/deployment-examples/metrics/README.md
new file mode 100644
index 00000000..142846e8
--- /dev/null
+++ b/deployment-examples/metrics/README.md
@@ -0,0 +1,420 @@
+# NativeLink Metrics with OpenTelemetry
+
+This directory contains configurations and examples for collecting, processing, and visualizing NativeLink metrics using OpenTelemetry (OTEL) and various server systems.
+
+## Overview
+
+NativeLink exposes comprehensive metrics about cache operations and remote execution through OpenTelemetry. These metrics provide insights into:
+
+- **Cache Performance**: Hit rates, operation latencies, eviction rates
+- **Execution Pipeline**: Queue times, stage durations, success rates
+- **System Health**: Worker utilization, throughput, error rates
+
+## Quick Start
+
+### Using Docker Compose (Recommended for Development)
+
+1. Start the metrics stack:
+```bash
+cd deployment-examples/metrics
+docker-compose up -d
+```
+
+2. Configure NativeLink to send metrics to the collector:
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+export OTEL_SERVICE_NAME=nativelink
+export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=dev,nativelink.instance_name=main"
+```
+
+3. Start NativeLink with your configuration:
+```bash
+nativelink /path/to/config.json
+```
+
+4. Access the metrics:
+- Prometheus UI: http://localhost:9091
+- Grafana: http://localhost:3000 (admin/admin)
+- OTEL Collector metrics: http://localhost:8888/metrics
+
+### Using Kubernetes
+
+1. Deploy the OTEL Collector:
+```bash
+kubectl apply -f kubernetes/otel-collector.yaml
+```
+
+2. Deploy Prometheus with OTLP receiver enabled:
+```bash
+kubectl apply -f kubernetes/prometheus.yaml
+```
+
+3. Configure NativeLink deployment to send metrics:
+```yaml
+env:
+  - name: OTEL_EXPORTER_OTLP_ENDPOINT
+    value: "http://otel-collector:4317"
+  - name: OTEL_EXPORTER_OTLP_PROTOCOL
+    value: "grpc"
+  - name: OTEL_RESOURCE_ATTRIBUTES
+    value: "deployment.environment=prod,k8s.cluster.name=main"
+```
+
+## Metrics Catalog
+
+### Cache Metrics
+
+| Metric | Type | Description | Labels |
+|--------|------|-------------|--------|
+| `nativelink_cache_operations` | Counter | Total cache operations | `cache_type`, `cache_operation_name`, `cache_operation_result` |
+| `nativelink_cache_operation_duration` | Histogram | Operation latency in milliseconds | `cache_type`, `cache_operation_name` |
+| `nativelink_cache_io` | Counter | Bytes read/written | `cache_type`, `cache_operation_name` |
+| `nativelink_cache_size` | Gauge | Current cache size in bytes | `cache_type` |
+| `nativelink_cache_entries` | Gauge | Number of cached entries | `cache_type` |
+| `nativelink_cache_item_size` | Histogram | Size distribution of cache entries | `cache_type` |
+
+**Cache Operation Names:**
+- `read`: Data retrieval operations
+- `write`: Data storage operations
+- `delete`: Explicit removal operations
+- `evict`: Automatic evictions (LRU, TTL)
+
+**Cache Operation Results:**
+- `hit`: Data found and valid (reads)
+- `miss`: Data not found (reads)
+- `expired`: Data found but stale (reads)
+- `success`: Operation completed (writes/deletes)
+- `error`: Operation failed
+
+### Execution Metrics
+
+| Metric | Type | Description | Labels |
+|--------|------|-------------|--------|
+| `nativelink_execution_stage_duration` | Histogram | Time spent in each execution stage | `execution_stage` |
+| `nativelink_execution_total_duration` | Histogram | Total execution time from submission to completion | `execution_instance` |
+| `nativelink_execution_queue_time` | Histogram | Time spent waiting in queue | `execution_priority` |
+| `nativelink_execution_active_count` | Gauge | Current actions in each stage | `execution_stage` |
+| `nativelink_execution_completed_count` | Counter | Completed executions | `execution_result` |
+| `nativelink_execution_stage_transitions` | Counter | Stage
transition events | `execution_instance`, `execution_priority` | +| `nativelink_execution_output_size` | Histogram | Size of execution outputs | - | +| `nativelink_execution_retry_count` | Counter | Number of retries | - | + +**Execution Stages:** +- `unknown`: Initial state +- `cache_check`: Checking for cached results +- `queued`: Waiting for available worker +- `executing`: Running on worker +- `completed`: Finished execution + +**Execution Results:** +- `success`: Completed with exit code 0 +- `failure`: Completed with non-zero exit code +- `cancelled`: Execution was cancelled +- `timeout`: Execution timed out +- `cache_hit`: Result found in cache + +## Configuration + +### Environment Variables + +NativeLink uses standard OpenTelemetry environment variables: + +```bash +# OTLP Exporter Configuration +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 # Collector endpoint +OTEL_EXPORTER_OTLP_PROTOCOL=grpc # Protocol (grpc or http/protobuf) +OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token" # Optional auth headers +OTEL_EXPORTER_OTLP_COMPRESSION=gzip # Compression (none, gzip) + +# Resource Attributes +OTEL_SERVICE_NAME=nativelink # Service name (fixed) +OTEL_RESOURCE_ATTRIBUTES="key1=value1,key2=value2" # Custom attributes + +# Metric Export Configuration +OTEL_METRIC_EXPORT_INTERVAL=60000 # Export interval in ms (default: 60s) +OTEL_METRIC_EXPORT_TIMEOUT=30000 # Export timeout in ms (default: 30s) + +# Disable telemetry types +OTEL_TRACES_EXPORTER=none # Disable traces (if only metrics needed) +OTEL_LOGS_EXPORTER=none # Disable logs (if only metrics needed) +``` + +### Collector Configuration + +The OTEL Collector can be configured to: +1. Add resource attributes +2. Batch metrics for efficiency +3. Export to multiple metrics servers +4. Transform metric attributes + +See `otel-collector-config.yaml` for a complete example. + +## Server Options + +### Prometheus (Recommended) + +Prometheus offers native OTLP support and excellent query capabilities. 
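+
+Once metrics are flowing through either of the paths below, a quick sanity check is to list the NativeLink series (an illustrative query; the `nativelink_` prefix assumes the namespace set in this directory's collector config):
+
+```promql
+count by (__name__) ({__name__=~"nativelink_.*"})
+```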
+ +**Direct OTLP Ingestion:** +```bash +prometheus --web.enable-otlp-receiver \ + --storage.tsdb.out-of-order-time-window=30m +``` + +**Via Collector Scraping:** +```yaml +scrape_configs: + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] +``` + +### Grafana Cloud + +For managed metrics: +```yaml +exporters: + otlphttp: + endpoint: https://otlp-gateway-prod-us-central-0.grafana.net/otlp + headers: + Authorization: "Bearer ${GRAFANA_CLOUD_TOKEN}" +``` + +### ClickHouse + +For high-volume metrics storage: +```yaml +exporters: + clickhouse: + endpoint: tcp://clickhouse:9000 + database: metrics + ttl_days: 30 + logs_table: otel_logs + metrics_table: otel_metrics +``` + +### Quickwit + +For unified logs and metrics: +```yaml +exporters: + otlp: + endpoint: quickwit:7281 + headers: + "x-quickwit-index": "nativelink-metrics" +``` + +## Example Queries + +### Prometheus/PromQL + +**Cache hit rate:** +```promql +sum(rate(nativelink_cache_operations{cache_operation_result="hit"}[5m])) by (cache_type) / +sum(rate(nativelink_cache_operations{cache_operation_name="read"}[5m])) by (cache_type) +``` + +**Execution success rate:** +```promql +sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / +sum(rate(nativelink_execution_completed_count[5m])) +``` + +**Queue depth by priority:** +```promql +sum(nativelink_execution_active_count{execution_stage="queued"}) by (execution_priority) +``` + +**P95 cache operation latency:** +```promql +histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket[5m])) by (le, cache_type) +) +``` + +**Worker utilization:** +```promql +count(nativelink_execution_active_count{execution_stage="executing"} > 0) / +count(count by (execution_worker_id) (nativelink_execution_active_count)) +``` + +### Joining with Resource Attributes + +Use `target_info` to join resource attributes: +```promql +rate(nativelink_execution_completed_count[5m]) +* on (job, instance) group_left (k8s_cluster_name, deployment_environment) +target_info +``` + +## Dashboards + +### Grafana Dashboard + +Import the included dashboard for a comprehensive view: +```bash +# Import via API +curl -X POST http://admin:admin@localhost:3000/api/dashboards/db \ + -H "Content-Type: application/json" \ + -d @grafana-dashboard.json + +# Or import via UI at http://localhost:3000 +``` + +Key panels include: +- Execution pipeline overview +- Cache performance metrics +- Worker utilization heatmap +- Error rate tracking +- Queue depth over time +- Stage duration percentiles + +## Alerting + +### Example Alert Rules + +```yaml +groups: + - name: nativelink_alerts + rules: + - alert: HighErrorRate + expr: | + (1 - ( + sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count[5m])) + )) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "High execution error rate ({{ $value | humanizePercentage }})" + + - alert: CacheMissRateHigh + expr: | + (1 - nativelink:cache_hit_rate) > 0.5 + for: 10m + labels: + severity: info + annotations: + summary: "Cache miss rate above 50% for {{ $labels.cache_type }}" + + - alert: QueueBacklog + expr: | + sum(nativelink_execution_active_count{execution_stage="queued"}) > 100 + for: 15m + labels: + severity: warning + annotations: + summary: "Queue backlog above 100 actions" + + - alert: WorkerUtilizationLow + expr: | + nativelink:worker_utilization < 0.3 + for: 30m + labels: + severity: info + annotations: + summary: "Worker 
utilization below 30%" +``` + +## Troubleshooting + +### No Metrics Appearing + +1. Check NativeLink is configured with OTEL environment variables: +```bash +ps aux | grep nativelink | grep OTEL +``` + +2. Verify collector is receiving data: +```bash +curl http://localhost:13133/health +curl http://localhost:8888/metrics | grep otelcol_receiver_accepted_metric_points +``` + +3. Check collector logs: +```bash +docker logs otel-collector +# or +kubectl logs -l app=otel-collector +``` + +### High Memory Usage + +1. Adjust collector batch size: +```yaml +processors: + batch: + send_batch_size: 512 # Reduce from 1024 +``` + +2. Increase memory limits: +```yaml +memory_limiter: + limit_mib: 1024 # Increase from 512 +``` + +3. Reduce metric cardinality by dropping labels: +```yaml +processors: + attributes: + actions: + - key: unnecessary_label + action: delete +``` + +### Out-of-Order Samples + +Enable out-of-order ingestion in Prometheus: +```yaml +storage: + tsdb: + out_of_order_time_window: 1h # Increase from 30m +``` + +### Missing Resource Attributes + +Ensure attributes are promoted in Prometheus: +```yaml +otlp: + promote_resource_attributes: + - your.custom.attribute +``` + +## Performance Tuning + +### Collector Optimization + +1. **Batching**: Adjust batch processor settings based on volume +2. **Compression**: Enable gzip for network efficiency +3. **Sampling**: Use tail sampling for high-volume traces +4. **Filtering**: Drop unnecessary metrics at collector level + +### Prometheus Optimization + +1. **Recording Rules**: Pre-calculate expensive queries +2. **Retention**: Set appropriate retention periods +3. **Downsampling**: Use Thanos or Cortex for long-term storage +4. **Federation**: Split metrics across multiple Prometheus instances + +### NativeLink Optimization + +1. **Export Interval**: Increase `OTEL_METRIC_EXPORT_INTERVAL` to reduce overhead +2. **Resource Attributes**: Minimize cardinality of custom attributes +3. 
**Metric Selection**: Disable unused metric types if needed + +## Additional Resources + +- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) +- [Prometheus Best Practices](https://prometheus.io/docs/practices/) +- [OTEL Collector Configuration](https://opentelemetry.io/docs/collector/configuration/) +- [NativeLink Documentation](https://nativelink.com/docs) +- [Grafana Dashboard Examples](https://grafana.com/grafana/dashboards/) + +## Support + +For issues or questions: +- File an issue: https://github.com/TraceMachina/nativelink/issues +- Join our Discord: https://discord.gg/nativelink +- Check documentation: https://nativelink.com/docs diff --git a/deployment-examples/metrics/docker-compose.yaml b/deployment-examples/metrics/docker-compose.yaml new file mode 100644 index 00000000..9a943b30 --- /dev/null +++ b/deployment-examples/metrics/docker-compose.yaml @@ -0,0 +1,139 @@ +version: '3.8' + +services: + # OpenTelemetry Collector + otel-collector: + image: otel/opentelemetry-collector-contrib:0.98.0 + container_name: otel-collector + restart: unless-stopped + command: ["--config=/etc/otel-collector/config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otel-collector/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + - "9090:9090" # Prometheus metrics exporter + - "8888:8888" # Collector metrics + - "13133:13133" # Health check + environment: + - OTLP_BACKEND_ENDPOINT=${OTLP_BACKEND_ENDPOINT:-otlp-backend:4317} + - OTLP_BACKEND_TOKEN=${OTLP_BACKEND_TOKEN:-} + networks: + - metrics + + # Prometheus with OTLP support + prometheus: + image: prom/prometheus:v2.50.0 + container_name: prometheus + restart: unless-stopped + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-otlp-receiver' # Enable OTLP receiver + - '--storage.tsdb.retention.time=30d' + - '--storage.tsdb.out-of-order-time-window=30m' # Handle out-of-order samples + volumes: + - ./prometheus-config.yaml:/etc/prometheus/prometheus.yml:ro + - ./prometheus-recording-rules.yml:/etc/prometheus/rules/nativelink.yml:ro + - prometheus_data:/prometheus + ports: + - "9091:9090" # Prometheus web UI (different port to avoid conflict with collector) + networks: + - metrics + depends_on: + - otel-collector + + # Grafana for visualization + grafana: + image: grafana/grafana:10.3.0 + container_name: grafana + restart: unless-stopped + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_INSTALL_PLUGINS=grafana-piechart-panel + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - metrics + depends_on: + - prometheus + + # Optional: AlertManager for alerts + alertmanager: + image: prom/alertmanager:v0.27.0 + container_name: alertmanager + restart: unless-stopped + volumes: + - ./alertmanager-config.yml:/etc/alertmanager/config.yml:ro + - alertmanager_data:/alertmanager + ports: + - "9093:9093" + command: + - '--config.file=/etc/alertmanager/config.yml' + - '--storage.path=/alertmanager' + networks: + - metrics + + # Optional: Node exporter for host metrics + node-exporter: + image: prom/node-exporter:v1.7.0 + container_name: node-exporter + restart: 
unless-stopped + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + ports: + - "9100:9100" + networks: + - metrics + + # Optional: Jaeger for trace visualization (if traces are enabled) + jaeger: + image: jaegertracing/all-in-one:1.53 + container_name: jaeger + restart: unless-stopped + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "16686:16686" # Jaeger UI + - "14268:14268" # Jaeger collector HTTP + networks: + - metrics + +volumes: + prometheus_data: + grafana_data: + alertmanager_data: + +networks: + metrics: + driver: bridge + +# Usage Instructions: +# 1. Start the stack: docker-compose up -d +# 2. Configure NativeLink with these environment variables: +# export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +# export OTEL_EXPORTER_OTLP_PROTOCOL=grpc +# export OTEL_SERVICE_NAME=nativelink +# export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=dev" +# 3. Access services: +# - Prometheus: http://localhost:9091 +# - Grafana: http://localhost:3000 (admin/admin) +# - Jaeger: http://localhost:16686 +# - AlertManager: http://localhost:9093 +# - OTEL Collector metrics: http://localhost:8888/metrics diff --git a/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml b/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml new file mode 100644 index 00000000..20e6f666 --- /dev/null +++ b/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'NativeLink Dashboards' + orgId: 1 + folder: 'NativeLink' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards diff --git a/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml b/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 00000000..e553ac28 --- /dev/null +++ b/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,23 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + timeInterval: "15s" + queryTimeout: "60s" + httpMethod: POST + + - name: OTEL-Collector-Prometheus + type: prometheus + access: proxy + url: http://otel-collector:9090 + editable: true + jsonData: + timeInterval: "15s" + queryTimeout: "60s" + httpMethod: POST diff --git a/deployment-examples/metrics/kubernetes/otel-collector.yaml b/deployment-examples/metrics/kubernetes/otel-collector.yaml new file mode 100644 index 00000000..739eecf6 --- /dev/null +++ b/deployment-examples/metrics/kubernetes/otel-collector.yaml @@ -0,0 +1,274 @@ +# OpenTelemetry Collector Deployment for NativeLink Metrics +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: nativelink +data: + collector.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + resource: + attributes: + - key: service.namespace + value: nativelink + action: upsert + - key: k8s.cluster.name + from_attribute: K8S_CLUSTER_NAME + action: insert + - key: deployment.environment + from_attribute: DEPLOYMENT_ENV + action: insert + + transform/nativelink: + metric_statements: + - context: datapoint + 
statements: + - set(attributes["instance_name"], resource.attributes["nativelink.instance_name"]) + where resource.attributes["nativelink.instance_name"] != nil + + batch: + timeout: 10s + send_batch_size: 1024 + + memory_limiter: + check_interval: 1s + limit_mib: 1024 + spike_limit_mib: 256 + + exporters: + prometheus: + endpoint: 0.0.0.0:9090 + namespace: nativelink + resource_to_telemetry_conversion: + enabled: true + enable_open_metrics: true + + otlphttp/prometheus: + endpoint: http://prometheus:9090/api/v1/otlp/v1/metrics + compression: gzip + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 + zpages: + endpoint: 0.0.0.0:55679 + + service: + extensions: [health_check, pprof, zpages] + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [prometheus] + metrics/prometheus_otlp: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [otlphttp/prometheus] + + telemetry: + logs: + level: info + metrics: + level: detailed + address: 0.0.0.0:8888 + +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: nativelink + labels: + app: otel-collector +spec: + type: ClusterIP + selector: + app: otel-collector + ports: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP + - name: otlp-http + port: 4318 + targetPort: 4318 + protocol: TCP + - name: prometheus + port: 9090 + targetPort: 9090 + protocol: TCP + - name: metrics + port: 8888 + targetPort: 8888 + protocol: TCP + - name: health + port: 13133 + targetPort: 13133 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: nativelink + labels: + app: otel-collector +spec: + replicas: 2 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.98.0 + args: + - "--config=/conf/collector.yaml" + ports: + - containerPort: 4317 + name: otlp-grpc + - containerPort: 4318 + name: otlp-http + - containerPort: 9090 + name: prometheus + - containerPort: 8888 + name: metrics + - containerPort: 13133 + name: health + env: + - name: K8S_CLUSTER_NAME + value: "nativelink-cluster" + - name: DEPLOYMENT_ENV + value: "production" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: config + mountPath: /conf + resources: + requests: + memory: "512Mi" + cpu: "200m" + limits: + memory: "1Gi" + cpu: "1000m" + livenessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: otel-collector-config + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: nativelink + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: otel-collector +rules: + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "watch", "list"] + - apiGroups: ["apps"] + resources: ["deployments", "daemonsets", "statefulsets", "replicasets"] + verbs: ["get", "watch", "list"] + - apiGroups: ["batch"] + 
resources: ["jobs", "cronjobs"] + verbs: ["get", "watch", "list"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: otel-collector +subjects: + - kind: ServiceAccount + name: otel-collector + namespace: nativelink + +--- +# HorizontalPodAutoscaler for OTEL Collector +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: otel-collector + namespace: nativelink +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: otel-collector + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + +--- +# PodDisruptionBudget for high availability +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: otel-collector + namespace: nativelink +spec: + minAvailable: 1 + selector: + matchLabels: + app: otel-collector diff --git a/deployment-examples/metrics/kubernetes/prometheus.yaml b/deployment-examples/metrics/kubernetes/prometheus.yaml new file mode 100644 index 00000000..eacc026b --- /dev/null +++ b/deployment-examples/metrics/kubernetes/prometheus.yaml @@ -0,0 +1,338 @@ +# Prometheus Deployment for NativeLink Metrics +apiVersion: v1 +kind: Namespace +metadata: + name: nativelink +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: nativelink +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'nativelink-k8s' + environment: 'production' + + # OTLP configuration (requires --web.enable-otlp-receiver flag) + otlp: + promote_resource_attributes: + - service.instance.id + - service.name + - service.namespace + - service.version + - cloud.availability_zone + - cloud.region + - container.name + - deployment.environment + - k8s.cluster.name + - k8s.container.name + - k8s.deployment.name + - k8s.namespace.name + - k8s.pod.name + - k8s.statefulset.name + - nativelink.instance_name + - nativelink.worker_id + - nativelink.scheduler_name + + keep_identifying_resource_attributes: true + translation_strategy: NoUTF8EscapingWithSuffixes + + storage: + tsdb: + out_of_order_time_window: 30m + retention.time: 30d + + scrape_configs: + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] + metric_relabel_configs: + - source_labels: [__name__] + regex: '(nativelink_.*)' + target_label: __name__ + replacement: '${1}' + + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Kubernetes service discovery for NativeLink pods + - job_name: 'nativelink-pods' + kubernetes_sd_configs: + - role: pod + namespaces: + names: ['nativelink'] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + 
target_label: kubernetes_pod_name + + rule_files: + - /etc/prometheus/rules/*.yml + + alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-rules + namespace: nativelink +data: + nativelink-rules.yml: | + groups: + - name: nativelink_alerts + interval: 30s + rules: + - alert: NativeLinkHighErrorRate + expr: | + (1 - ( + sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count[5m])) + )) > 0.05 + for: 5m + labels: + severity: warning + component: nativelink + annotations: + summary: "High execution error rate ({{ $value | humanizePercentage }})" + description: "NativeLink execution error rate is above 5% for the last 5 minutes" + + - alert: NativeLinkCacheMissRateHigh + expr: | + (1 - ( + sum(rate(nativelink_cache_operations{cache_operation_result="hit"}[5m])) by (cache_type) / + sum(rate(nativelink_cache_operations{cache_operation_name="read"}[5m])) by (cache_type) + )) > 0.5 + for: 10m + labels: + severity: info + component: nativelink + annotations: + summary: "Cache miss rate above 50% for {{ $labels.cache_type }}" + description: "Cache {{ $labels.cache_type }} has a miss rate above 50% for 10 minutes" + + - alert: NativeLinkQueueBacklog + expr: | + sum(nativelink_execution_active_count{execution_stage="queued"}) > 100 + for: 15m + labels: + severity: warning + component: nativelink + annotations: + summary: "Execution queue backlog above 100 actions" + description: "{{ $value }} actions are queued for execution" + + - alert: NativeLinkWorkerUtilizationLow + expr: | + count(nativelink_execution_active_count{execution_stage="executing"} > 0) / + count(count by (execution_worker_id) (nativelink_execution_active_count)) < 0.3 + for: 30m + labels: + severity: info + component: nativelink + annotations: + summary: "Worker utilization below 30%" + description: "Only {{ $value | humanizePercentage }} of workers are active" + + - alert: NativeLinkCacheEvictionRateHigh + expr: | + sum(rate(nativelink_cache_operations{cache_operation_name="evict"}[5m])) by (cache_type) > 10 + for: 10m + labels: + severity: warning + component: nativelink + annotations: + summary: "High cache eviction rate for {{ $labels.cache_type }}" + description: "Cache {{ $labels.cache_type }} is evicting {{ $value }} items per second" + +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: nativelink + labels: + app: prometheus +spec: + type: ClusterIP + selector: + app: prometheus + ports: + - name: web + port: 9090 + targetPort: 9090 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus + namespace: nativelink + labels: + app: prometheus +spec: + serviceName: prometheus + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + serviceAccountName: prometheus + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: prometheus + image: prom/prometheus:v2.50.0 + args: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-otlp-receiver' + - '--storage.tsdb.retention.time=30d' + - '--storage.tsdb.out-of-order-time-window=30m' + ports: + - containerPort: 9090 + name: web + volumeMounts: + - name: 
config + mountPath: /etc/prometheus + - name: rules + mountPath: /etc/prometheus/rules + - name: storage + mountPath: /prometheus + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "2000m" + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: prometheus-config + - name: rules + configMap: + name: prometheus-rules + volumeClaimTemplates: + - metadata: + name: storage + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: nativelink + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: nativelink + +--- +# Ingress for external access (optional) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus + namespace: nativelink + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / +spec: + ingressClassName: nginx + rules: + - host: prometheus.nativelink.local + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 diff --git a/deployment-examples/metrics/otel-collector-config.yaml b/deployment-examples/metrics/otel-collector-config.yaml new file mode 100644 index 00000000..fe90896f --- /dev/null +++ b/deployment-examples/metrics/otel-collector-config.yaml @@ -0,0 +1,139 @@ +# OpenTelemetry Collector Configuration for NativeLink Metrics +# This configuration receives metrics from NativeLink via OTLP and exports them to various backends + +receivers: + # Receive metrics from NativeLink via OTLP gRPC + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + # Add resource attributes for better metric identification + resource: + attributes: + - key: service.namespace + value: nativelink + action: upsert + - key: deployment.environment + from_attribute: deployment_environment + action: insert + - key: deployment.region + from_attribute: deployment_region + action: insert + + # Transform metrics to add NativeLink-specific attributes + transform/nativelink: + metric_statements: + - context: datapoint + statements: + # Add instance name from resource attributes if available + - set(attributes["instance_name"], resource.attributes["nativelink.instance_name"]) + where resource.attributes["nativelink.instance_name"] != nil + + # Batch metrics for efficiency + batch: + timeout: 10s + send_batch_size: 1024 + send_batch_max_size: 2048 + + # Add memory limiter to prevent OOM + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + +exporters: + # Export metrics to Prometheus format + prometheus: + endpoint: 0.0.0.0:9090 + namespace: nativelink + const_labels: + service: nativelink + 
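+    # Copy OTLP resource attributes (service.name, nativelink.instance_name,
+    # and so on) onto each exported series as labels; enable_open_metrics
+    # additionally exposes exemplars in the scrape output.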
resource_to_telemetry_conversion: + enabled: true + enable_open_metrics: true + # Add metric descriptions for NativeLink metrics + metric_expiration: 10m + + # Direct OTLP export to Prometheus (when Prometheus has OTLP receiver enabled) + otlphttp/prometheus: + endpoint: http://prometheus:9090/api/v1/otlp/v1/metrics + compression: gzip + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + # Export to other OTLP backends (e.g., Grafana Cloud, ClickHouse) + otlp/backend: + endpoint: "${OTLP_BACKEND_ENDPOINT}" + compression: gzip + headers: + Authorization: "Bearer ${OTLP_BACKEND_TOKEN}" + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + # Debug exporter for troubleshooting + debug: + verbosity: detailed + sampling_initial: 5 + sampling_thereafter: 200 + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + path: /health + check_collector_pipeline: + enabled: true + interval: 15s + exporter_failure_threshold: 5 + + pprof: + endpoint: 0.0.0.0:1777 + + zpages: + endpoint: 0.0.0.0:55679 + +service: + extensions: [health_check, pprof, zpages] + pipelines: + # Main metrics pipeline - exports to Prometheus scrape endpoint + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [prometheus] + + # Direct to Prometheus OTLP endpoint (if enabled) + metrics/prometheus_otlp: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [otlphttp/prometheus] + + # Optional: Send to additional backend + # Uncomment and configure OTLP_BACKEND_ENDPOINT environment variable + # metrics/backend: + # receivers: [otlp] + # processors: [memory_limiter, resource, transform/nativelink, batch] + # exporters: [otlp/backend] + + # Debug pipeline for development + # metrics/debug: + # receivers: [otlp] + # processors: [memory_limiter] + # exporters: [debug] + + telemetry: + logs: + level: info + initial_fields: + service: otel-collector + metrics: + level: detailed + address: 0.0.0.0:8888 diff --git a/deployment-examples/metrics/prometheus-config.yaml b/deployment-examples/metrics/prometheus-config.yaml new file mode 100644 index 00000000..40e0e952 --- /dev/null +++ b/deployment-examples/metrics/prometheus-config.yaml @@ -0,0 +1,169 @@ +# Prometheus Configuration for NativeLink Metrics +# This configuration sets up Prometheus to receive metrics via OTLP and scrape format + +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'nativelink-cluster' + environment: 'production' + +# Enable OTLP receiver (requires --web.enable-otlp-receiver flag) +otlp: + # Promote NativeLink-specific resource attributes to labels + promote_resource_attributes: + - service.instance.id + - service.name + - service.namespace + - service.version + # Cloud/Infrastructure attributes + - cloud.availability_zone + - cloud.region + - container.name + - deployment.environment + - deployment.environment.name + # Kubernetes attributes + - k8s.cluster.name + - k8s.container.name + - k8s.cronjob.name + - k8s.daemonset.name + - k8s.deployment.name + - k8s.job.name + - k8s.namespace.name + - k8s.pod.name + - k8s.replicaset.name + - k8s.statefulset.name + # NativeLink-specific attributes + - nativelink.instance_name + - nativelink.worker_id + - nativelink.scheduler_name + + # Keep identifying resource attributes in target_info + keep_identifying_resource_attributes: true + + # Use NoTranslation to preserve metric 
names with UTF-8 support + # This keeps OpenTelemetry semantic convention names intact + translation_strategy: NoUTF8EscapingWithSuffixes + +# Storage configuration for handling out-of-order samples +storage: + tsdb: + # Allow 30 minutes of out-of-order samples (for batched OTLP data) + out_of_order_time_window: 30m + # Retention period for metrics + retention.time: 30d + # Maximum number of concurrent queries + max_concurrent_queries: 20 + +# Scrape configurations +scrape_configs: + # Scrape the OTEL Collector's Prometheus endpoint + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] + metric_relabel_configs: + # Add nativelink prefix to all metrics from collector + - source_labels: [__name__] + regex: '(nativelink_.*)' + target_label: __name__ + replacement: '${1}' + + # Scrape Prometheus's own metrics + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Optional: Direct scrape of NativeLink instances (if metrics endpoint is exposed) + # - job_name: 'nativelink-direct' + # static_configs: + # - targets: ['nativelink-cas:8080', 'nativelink-scheduler:8080'] + # metrics_path: '/metrics' + +# Recording rules for common NativeLink queries +rule_files: + - /etc/prometheus/rules/*.yml + +# Alerting configuration +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +# Example recording rules for NativeLink metrics +# Save this as a separate file: rules/nativelink-recording-rules.yml +# rule_files content example: +--- +# Recording Rules for NativeLink Metrics +groups: + - name: nativelink_execution + interval: 30s + rules: + # Execution success rate + - record: nativelink:execution_success_rate + expr: | + sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count[5m])) + + # Average queue time + - record: nativelink:execution_queue_time_avg + expr: | + histogram_quantile(0.5, + sum(rate(nativelink_execution_queue_time_bucket[5m])) by (le, instance_name) + ) + + # Actions per stage + - record: nativelink:execution_active_by_stage + expr: | + sum(nativelink_execution_active_count) by (execution_stage, instance_name) + + # Stage transition rate + - record: nativelink:stage_transition_rate + expr: | + sum(rate(nativelink_execution_stage_transitions[5m])) by (instance_name) + + - name: nativelink_cache + interval: 30s + rules: + # Cache hit rate + - record: nativelink:cache_hit_rate + expr: | + sum(rate(nativelink_cache_operations{cache_operation_result="hit"}[5m])) by (cache_type) / + sum(rate(nativelink_cache_operations{cache_operation_name="read"}[5m])) by (cache_type) + + # Cache operation latency p95 + - record: nativelink:cache_operation_latency_p95 + expr: | + histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket[5m])) by (le, cache_type, cache_operation_name) + ) + + # Cache size utilization + - record: nativelink:cache_size_bytes + expr: | + sum(nativelink_cache_size) by (cache_type, instance_name) + + # Cache eviction rate + - record: nativelink:cache_eviction_rate + expr: | + sum(rate(nativelink_cache_operations{cache_operation_name="evict"}[5m])) by (cache_type) + + - name: nativelink_performance + interval: 60s + rules: + # Overall system throughput + - record: nativelink:system_throughput + expr: | + sum(rate(nativelink_execution_completed_count[5m])) + + # Worker utilization + - record: nativelink:worker_utilization + expr: | + sum(nativelink_execution_active_count{execution_stage="executing"}) by 
(execution_worker_id) / + count(count by (execution_worker_id) (nativelink_execution_active_count)) + + # Action completion time (from queued to completed) + - record: nativelink:action_total_duration_p99 + expr: | + histogram_quantile(0.99, + sum(rate(nativelink_execution_total_duration_bucket[5m])) by (le, instance_name) + ) diff --git a/deployment-examples/metrics/prometheus-recording-rules.yml b/deployment-examples/metrics/prometheus-recording-rules.yml new file mode 100644 index 00000000..665a4e87 --- /dev/null +++ b/deployment-examples/metrics/prometheus-recording-rules.yml @@ -0,0 +1,277 @@ +# Recording Rules for NativeLink Metrics +# These rules pre-calculate common queries for better dashboard performance + +groups: + - name: nativelink_execution + interval: 30s + rules: + # Execution success rate by instance + - record: nativelink:execution_success_rate + expr: | + sum by (instance_name, execution_instance) ( + rate(nativelink_execution_completed_count{execution_result="success"}[5m]) + ) / + sum by (instance_name, execution_instance) ( + rate(nativelink_execution_completed_count[5m]) + ) + + # Cache hit rate from executions + - record: nativelink:execution_cache_hit_rate + expr: | + sum by (instance_name) ( + rate(nativelink_execution_completed_count{execution_result="cache_hit"}[5m]) + ) / + sum by (instance_name) ( + rate(nativelink_execution_completed_count[5m]) + ) + + # Average queue time (median) + - record: nativelink:execution_queue_time_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_queue_time_bucket[5m]) + ) + ) + + # Queue time 95th percentile + - record: nativelink:execution_queue_time_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_queue_time_bucket[5m]) + ) + ) + + # Actions currently in each stage + - record: nativelink:execution_active_by_stage + expr: | + sum by (execution_stage, instance_name, execution_instance) ( + nativelink_execution_active_count + ) + + # Stage transition rate + - record: nativelink:stage_transition_rate + expr: | + sum by (instance_name, execution_instance, execution_priority) ( + rate(nativelink_execution_stage_transitions[5m]) + ) + + # Execution duration by stage (p50, p95, p99) + - record: nativelink:execution_stage_duration_p50 + expr: | + histogram_quantile(0.5, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_stage_duration_p95 + expr: | + histogram_quantile(0.95, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_stage_duration_p99 + expr: | + histogram_quantile(0.99, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + # Total execution time from submission to completion + - record: nativelink:execution_total_duration_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_total_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_total_duration_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_total_duration_bucket[5m]) + ) + ) + + # Execution output size distribution + - record: nativelink:execution_output_size_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name) ( + 
rate(nativelink_execution_output_size_bucket[5m]) + ) + ) + + - record: nativelink:execution_output_size_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name) ( + rate(nativelink_execution_output_size_bucket[5m]) + ) + ) + + - name: nativelink_cache + interval: 30s + rules: + # Cache hit rate by operation and cache type + - record: nativelink:cache_hit_rate + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations{cache_operation_result="hit"}[5m]) + ) / + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations{cache_operation_name="read"}[5m]) + ) + + # Cache operation latency percentiles + - record: nativelink:cache_operation_latency_p50 + expr: | + histogram_quantile(0.5, + sum by (le, cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + - record: nativelink:cache_operation_latency_p95 + expr: | + histogram_quantile(0.95, + sum by (le, cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + - record: nativelink:cache_operation_latency_p99 + expr: | + histogram_quantile(0.99, + sum by (le, cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + # Cache size and entry count + - record: nativelink:cache_size_bytes + expr: | + sum by (cache_type, instance_name) (nativelink_cache_size) + + - record: nativelink:cache_entry_count + expr: | + sum by (cache_type, instance_name) (nativelink_cache_entries) + + # Cache eviction rate + - record: nativelink:cache_eviction_rate + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations{cache_operation_name="evict"}[5m]) + ) + + # Cache throughput (bytes/sec) + - record: nativelink:cache_read_throughput_bytes + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_io{cache_operation_name="read"}[5m]) + ) + + - record: nativelink:cache_write_throughput_bytes + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_io{cache_operation_name="write"}[5m]) + ) + + # Cache error rate + - record: nativelink:cache_error_rate + expr: | + sum by (cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operations{cache_operation_result="error"}[5m]) + ) + + - name: nativelink_performance + interval: 60s + rules: + # Overall system throughput (actions/sec) + - record: nativelink:system_throughput + expr: | + sum(rate(nativelink_execution_completed_count[5m])) + + # System success rate + - record: nativelink:system_success_rate + expr: | + sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count[5m])) + + # Worker utilization (percentage of workers executing) + - record: nativelink:worker_utilization + expr: | + count by (instance_name) ( + nativelink_execution_active_count{execution_stage="executing"} > 0 + ) / + count by (instance_name) ( + nativelink_execution_active_count + ) + + # Queue depth (actions waiting) + - record: nativelink:queue_depth + expr: | + sum by (instance_name, execution_priority) ( + nativelink_execution_active_count{execution_stage="queued"} + ) + + # Average actions per worker + - record: nativelink:actions_per_worker + expr: | + sum by (execution_worker_id) ( + nativelink_execution_active_count{execution_stage="executing"} + ) + + # Memory usage estimation from output sizes + - record: nativelink:estimated_memory_usage_bytes + expr: | + sum by 
(instance_name) ( + nativelink_execution_output_size_sum + ) + + # Retry rate + - record: nativelink:execution_retry_rate + expr: | + sum by (instance_name) ( + rate(nativelink_execution_retry_count[5m]) + ) + + - name: nativelink_slo + interval: 60s + rules: + # SLO: 99% of executions should complete successfully + - record: nativelink:slo_execution_success_rate + expr: | + sum(rate(nativelink_execution_completed_count{execution_result="success"}[1h])) / + sum(rate(nativelink_execution_completed_count[1h])) + + # SLO: 95% of cache reads should be under 100ms + - record: nativelink:slo_cache_read_latency + expr: | + histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket{cache_operation_name="read"}[1h])) by (le) + ) < 0.1 + + # SLO: Queue time should be under 30s for 90% of actions + - record: nativelink:slo_queue_time + expr: | + histogram_quantile(0.9, + sum(rate(nativelink_execution_queue_time_bucket[1h])) by (le) + ) < 30 + + # Error budget remaining (based on 99% success SLO) + - record: nativelink:error_budget_remaining + expr: | + 1 - ( + (1 - 0.99) - + (1 - ( + sum(rate(nativelink_execution_completed_count{execution_result="success"}[30d])) / + sum(rate(nativelink_execution_completed_count[30d])) + )) + ) / (1 - 0.99) diff --git a/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx b/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx new file mode 100644 index 00000000..ee277011 --- /dev/null +++ b/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx @@ -0,0 +1,420 @@ +--- +title: Metrics and Observability +description: 'Configure OpenTelemetry metrics collection for NativeLink' +--- + +import { Tabs, TabItem } from '@astrojs/starlight/components'; + +NativeLink provides comprehensive metrics through OpenTelemetry (OTEL), enabling deep insights into cache performance, remote execution pipelines, and system health. + +## Overview + +NativeLink automatically exports metrics when configured with OTEL environment variables. 
The metrics cover:
+
+- **Cache Operations**: Hit rates, latencies, evictions
+- **Execution Pipeline**: Queue depths, stage durations, success rates
+- **System Health**: Worker utilization, throughput, error rates
+
+## Quick Start
+
+<Tabs>
+<TabItem label="Docker Compose">
+
+```bash
+# Clone the repository
+git clone https://github.com/TraceMachina/nativelink
+cd nativelink/deployment-examples/metrics
+
+# Start the metrics stack
+docker-compose up -d
+
+# Configure NativeLink
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+export OTEL_SERVICE_NAME=nativelink
+export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=dev"
+
+# Run NativeLink
+nativelink /path/to/config.json
+```
+
+Access the services:
+- Prometheus: http://localhost:9091
+- Grafana: http://localhost:3000 (admin/admin)
+- OTEL Collector: http://localhost:8888/metrics
+
+</TabItem>
+<TabItem label="Kubernetes">
+
+```bash
+# Create namespace
+kubectl create namespace nativelink
+
+# Deploy OTEL Collector
+kubectl apply -f deployment-examples/metrics/kubernetes/otel-collector.yaml
+
+# Deploy Prometheus
+kubectl apply -f deployment-examples/metrics/kubernetes/prometheus.yaml
+
+# Configure NativeLink pods
+kubectl set env deployment/nativelink \
+  OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 \
+  OTEL_EXPORTER_OTLP_PROTOCOL=grpc \
+  OTEL_RESOURCE_ATTRIBUTES="k8s.cluster.name=main"
+```
+
+</TabItem>
+<TabItem label="Standalone Prometheus">
+
+```bash
+# Start Prometheus with OTLP receiver
+prometheus \
+  --web.enable-otlp-receiver \
+  --storage.tsdb.out-of-order-time-window=30m \
+  --config.file=prometheus.yml
+
+# Configure NativeLink
+export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
+export OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=http://localhost:9090/api/v1/otlp/v1/metrics
+export OTEL_SERVICE_NAME=nativelink
+export OTEL_RESOURCE_ATTRIBUTES="service.instance.id=$(uuidgen)"
+
+# Disable traces and logs
+export OTEL_TRACES_EXPORTER=none
+export OTEL_LOGS_EXPORTER=none
+```
+
+</TabItem>
+</Tabs>
+
+## Configuration
+
+### Environment Variables
+
+NativeLink uses standard OpenTelemetry environment variables:
+
+```bash
+# Core OTLP Configuration
+OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+OTEL_EXPORTER_OTLP_PROTOCOL=grpc  # or http/protobuf
+OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token"
+OTEL_EXPORTER_OTLP_COMPRESSION=gzip
+
+# Resource Attributes (customize for your deployment)
+OTEL_SERVICE_NAME=nativelink  # Keep fixed so dashboards and alerts match
+OTEL_RESOURCE_ATTRIBUTES="deployment.environment=prod,region=us-east-1"
+
+# Metric Export Intervals
+OTEL_METRIC_EXPORT_INTERVAL=60000  # 60 seconds
+OTEL_METRIC_EXPORT_TIMEOUT=30000   # 30 seconds
+```
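+
+On Kubernetes you can set the same variables declaratively instead of with `kubectl set env`. A minimal sketch, assuming a Deployment named `nativelink` as in the quick start (names and values here are illustrative):
+
+```yaml
+# Fragment of a Deployment manifest; merge into your existing pod spec.
+spec:
+  template:
+    spec:
+      containers:
+        - name: nativelink
+          env:
+            - name: OTEL_EXPORTER_OTLP_ENDPOINT
+              value: http://otel-collector:4317
+            - name: OTEL_EXPORTER_OTLP_PROTOCOL
+              value: grpc
+            - name: OTEL_SERVICE_NAME
+              value: nativelink
+            - name: OTEL_RESOURCE_ATTRIBUTES
+              value: deployment.environment=prod,k8s.cluster.name=main
+```
+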
+### Collector Configuration
+
+The OTEL Collector adds resource attributes and batches metrics:
+
+```yaml
+processors:
+  resource:
+    attributes:
+      - key: service.namespace
+        value: nativelink
+        action: upsert
+  batch:
+    timeout: 10s
+    send_batch_size: 1024
+```
+
+## Metrics Reference
+
+### Cache Metrics
+
+Monitor cache performance and efficiency:
+
+| Metric | Description | Key Labels |
+|--------|-------------|------------|
+| `nativelink_cache_operations` | Operation count by type and result | `cache_type`, `cache_operation_name`, `cache_operation_result` |
+| `nativelink_cache_operation_duration` | Operation latency histogram | `cache_type`, `cache_operation_name` |
+| `nativelink:cache_hit_rate` | Calculated hit rate (recording rule) | `cache_type` |
+| `nativelink_cache_size` | Current cache size in bytes | `cache_type` |
+| `nativelink:cache_eviction_rate` | Evictions per second (recording rule) | `cache_type` |
+
+### Execution Metrics
+
+Track remote execution pipeline performance:
+
+| Metric | Description | Key Labels |
+|--------|-------------|------------|
+| `nativelink_execution_active_count` | Actions in each stage | `execution_stage` |
+| `nativelink_execution_completed_count` | Completed actions | `execution_result` |
+| `nativelink_execution_queue_time` | Queue wait time histogram | `execution_priority` |
+| `nativelink_execution_stage_duration` | Time per stage | `execution_stage` |
+| `nativelink:execution_success_rate` | Success percentage (recording rule) | `instance_name` |
+
+### Execution Stages
+
+Actions progress through these stages:
+
+1. `unknown` - Initial state
+2. `cache_check` - Checking for cached results
+3. `queued` - Waiting for worker
+4. `executing` - Running on worker
+5. `completed` - Finished (success/failure/cache_hit)
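+
+The stage names map directly onto the `execution_stage` label, so latency can be broken down per stage. A sketch using the stage-duration histogram that the recording rules also consume (bucket boundaries are whatever NativeLink exports):
+
+```promql
+# P95 time spent in each execution stage over the last 5 minutes
+histogram_quantile(0.95,
+  sum by (le, execution_stage) (
+    rate(nativelink_execution_stage_duration_bucket[5m])
+  )
+)
+```
+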
+## Example Queries
+
+### Cache Performance
+
+```promql
+# Cache hit rate by type
+sum(rate(nativelink_cache_operations{cache_operation_result="hit"}[5m])) by (cache_type) /
+sum(rate(nativelink_cache_operations{cache_operation_name="read"}[5m])) by (cache_type)
+
+# P95 cache operation latency
+histogram_quantile(0.95,
+  sum(rate(nativelink_cache_operation_duration_bucket[5m])) by (le, cache_type)
+)
+
+# Cache eviction rate
+sum(rate(nativelink_cache_operations{cache_operation_name="evict"}[5m])) by (cache_type)
+```
+
+### Execution Pipeline
+
+```promql
+# Execution success rate
+sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) /
+sum(rate(nativelink_execution_completed_count[5m]))
+
+# Queue depth by priority
+sum(nativelink_execution_active_count{execution_stage="queued"}) by (execution_priority)
+
+# Median queue time
+histogram_quantile(0.5,
+  sum(rate(nativelink_execution_queue_time_bucket[5m])) by (le)
+)
+
+# Worker utilization
+count(nativelink_execution_active_count{execution_stage="executing"} > 0) /
+count(count by (execution_worker_id) (nativelink_execution_active_count))
+```
+
+### System Health
+
+```promql
+# Overall throughput (actions/sec)
+sum(rate(nativelink_execution_completed_count[5m]))
+
+# Error rate
+sum(rate(nativelink_execution_completed_count{execution_result="failure"}[5m])) /
+sum(rate(nativelink_execution_completed_count[5m]))
+
+# Stage transition rate
+sum(rate(nativelink_execution_stage_transitions[5m])) by (instance_name)
+```
+
+## Dashboards
+
+### Grafana Dashboard
+
+Import the pre-built dashboard for comprehensive monitoring:
+
+```json
+{
+  "title": "NativeLink Metrics",
+  "panels": [
+    {
+      "title": "Execution Success Rate",
+      "targets": [{
+        "expr": "nativelink:execution_success_rate"
+      }]
+    },
+    {
+      "title": "Cache Hit Rate",
+      "targets": [{
+        "expr": "nativelink:cache_hit_rate"
+      }]
+    },
+    {
+      "title": "Queue Depth",
+      "targets": [{
+        "expr": "sum(nativelink_execution_active_count{execution_stage=\"queued\"})"
+      }]
+    }
+  ]
+}
+```
+
+### Key Metrics to Monitor
+
+1. **SLI/SLO Metrics**:
+   - Execution success rate > 99%
+   - Cache hit rate > 80%
+   - P95 queue time < 30s
+   - P95 cache latency < 100ms
+
+2. **Capacity Planning**:
+   - Queue depth trends
+   - Worker utilization
+   - Cache size growth
+   - Eviction rates
+
+3. **Performance Optimization**:
+   - Stage duration breakdowns
+   - Cache operation latencies
+   - Output size distributions
+   - Retry rates
+
+## Server Options
+
+### Prometheus (Recommended)
+
+Best for most deployments, with excellent query capabilities:
+
+```yaml
+# Start Prometheus with the OTLP receiver enabled:
+#   prometheus --web.enable-otlp-receiver
+# Then configure out-of-order handling in prometheus.yml:
+storage:
+  tsdb:
+    out_of_order_time_window: 30m
+```
+
+### Grafana Cloud
+
+Managed solution with built-in dashboards:
+
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway.grafana.net/otlp
+export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer ${GRAFANA_TOKEN}"
+```
+
+### ClickHouse
+
+For high-volume metrics with SQL queries:
+
+```yaml
+exporters:
+  clickhouse:
+    endpoint: tcp://clickhouse:9000
+    database: nativelink_metrics
+    ttl_days: 90
+```
+
+### Quickwit
+
+Unified logs and metrics search:
+
+```yaml
+exporters:
+  otlp:
+    endpoint: quickwit:7281
+    headers:
+      x-quickwit-index: nativelink-metrics
+```
+
+## Alerting
+
+### Critical Alerts
+
+```yaml
+- alert: HighErrorRate
+  expr: |
+    (1 - nativelink:execution_success_rate) > 0.05
+  for: 5m
+  annotations:
+    summary: "Execution error rate above 5%"
+
+- alert: QueueBacklog
+  expr: |
+    sum(nativelink_execution_active_count{execution_stage="queued"}) > 100
+  for: 15m
+  annotations:
+    summary: "Queue backlog exceeds 100 actions"
+
+- alert: CacheEvictionHigh
+  expr: |
+    rate(nativelink_cache_operations{cache_operation_name="evict"}[5m]) > 10
+  for: 10m
+  annotations:
+    summary: "Cache eviction rate exceeds threshold"
+```
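+
+These rules fire in Prometheus; routing the notifications is AlertManager's job. The Docker Compose stack mounts `./alertmanager-config.yml`, which is deployment-specific and not shown here, so the following is only a minimal sketch with a hypothetical webhook receiver (point the URL at your paging or chat integration):
+
+```yaml
+# Minimal AlertManager config; the webhook URL is a placeholder.
+route:
+  receiver: default
+  group_by: ['alertname', 'component']
+  group_wait: 30s
+  repeat_interval: 4h
+
+receivers:
+  - name: default
+    webhook_configs:
+      - url: http://alert-webhook.internal/notify  # hypothetical endpoint
+```
+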
+## Troubleshooting
+
+### No Metrics Appearing
+
+1. Verify OTEL environment variables:
+   ```bash
+   env | grep OTEL_
+   ```
+
+2. Check collector health:
+   ```bash
+   curl http://localhost:13133/health
+   ```
+
+3. Verify metrics are being received:
+   ```bash
+   curl http://localhost:8888/metrics | grep otelcol_receiver
+   ```
+
+### High Cardinality
+
+Reduce label dimensions:
+
+```yaml
+processors:
+  attributes:
+    actions:
+      - key: high_cardinality_label
+        action: delete
+```
+
+### Out-of-Order Samples
+
+Increase the Prometheus window:
+
+```yaml
+storage:
+  tsdb:
+    out_of_order_time_window: 1h
+```
+
+## Performance Tuning
+
+### Metric Export Optimization
+
+```bash
+# Increase export interval for lower overhead
+export OTEL_METRIC_EXPORT_INTERVAL=120000  # 2 minutes
+```
+
+Batching happens on the collector side: raise `send_batch_size` (for example 2048) and `timeout` (for example 30s) in the `batch` processor described under Collector Configuration to trade export latency for throughput.
+
+### Recording Rules
+
+Use Prometheus recording rules for expensive queries:
+
+```yaml
+- record: nativelink:hourly_success_rate
+  expr: |
+    avg_over_time(nativelink:execution_success_rate[1h])
+```
+
+### Filtering
+
+The collector's `probabilistic_sampler` processor only applies to traces and logs, so for high-volume deployments drop low-value metrics with the `filter` processor instead:
+
+```yaml
+processors:
+  filter/drop_noisy:
+    error_mode: ignore
+    metrics:
+      metric:
+        # Metric name as received over OTLP; adjust to match your stream
+        - 'name == "nativelink_execution_output_size"'
+```
+
+## Additional Resources
+
+- [OpenTelemetry Documentation](https://opentelemetry.io/docs/)
+- [Prometheus Best Practices](https://prometheus.io/docs/practices/)
+- [Grafana Dashboard Gallery](https://grafana.com/grafana/dashboards/)
+- [NativeLink GitHub](https://github.com/TraceMachina/nativelink)

From c48cc4bf426ec1eb3d84e87adab031eedfbb3cc1 Mon Sep 17 00:00:00 2001
From: Marcus
Date: Sat, 6 Sep 2025 23:08:04 +0900
Subject: [PATCH 6/6] Add comprehensive metrics documentation

---
 nativelink-util/src/metrics.rs                             | 8 +++++---
 .../src/content/docs/docs/deployment-examples/metrics.mdx  | 1 -
 web/platform/starlight.conf.ts                             | 4 ++++
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs
index 63dce9cc..2c916c05 100644
--- a/nativelink-util/src/metrics.rs
+++ b/nativelink-util/src/metrics.rs
@@ -1,10 +1,12 @@
 // Copyright 2025 The NativeLink Authors. All rights reserved.
 //
-// Licensed under the Apache License, Version 2.0 (the "License");
+// Licensed under the Business Source License, Version 1.1 (the "License");
 // you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
+// You may request a copy of the License by emailing contact@nativelink.com.
 //
-//     http://www.apache.org/licenses/LICENSE-2.0
+// Use of this module requires an enterprise license agreement, which can be
+// obtained by emailing contact@nativelink.com or signing up for NativeLink
+// Cloud at app.nativelink.com.
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx b/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx
index ee277011..614eab1b 100644
--- a/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx
+++ b/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx
@@ -2,7 +2,6 @@
 title: Metrics and Observability
 description: 'Configure OpenTelemetry metrics collection for NativeLink'
 ---
-
 import { Tabs, TabItem } from '@astrojs/starlight/components';
 
 NativeLink provides comprehensive metrics through OpenTelemetry (OTEL), enabling deep insights into cache performance, remote execution pipelines, and system health.
diff --git a/web/platform/starlight.conf.ts b/web/platform/starlight.conf.ts
index 38f644d6..931530cf 100644
--- a/web/platform/starlight.conf.ts
+++ b/web/platform/starlight.conf.ts
@@ -146,6 +146,10 @@ export const starlightConfig = {
           label: "Chromium",
           link: `${docsRoot}/deployment-examples/chromium`,
         },
+        {
+          label: "Metrics and Observability",
+          link: `${docsRoot}/deployment-examples/metrics`,
+        },
       ],
    },
    {