From 8c9fc2226b15085e5768acd2d874ddcf4498adbc Mon Sep 17 00:00:00 2001
From: Marcus
Date: Tue, 29 Jul 2025 21:18:09 -0700
Subject: [PATCH 1/6] Implement metrics using otel

---
 .../src/memory_awaited_action_db.rs   | 108 ++-
 nativelink-util/BUILD.bazel           |   3 +
 nativelink-util/src/lib.rs            |   1 +
 nativelink-util/src/metrics.rs        | 618 ++++++++++++++++++
 nativelink-util/tests/metrics_test.rs | 114 ++++
 5 files changed, 842 insertions(+), 2 deletions(-)
 create mode 100644 nativelink-util/src/metrics.rs
 create mode 100644 nativelink-util/tests/metrics_test.rs

diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs
index 96bb2b87..04bee355 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -29,6 +29,9 @@ use nativelink_util::action_messages::{
 use nativelink_util::chunked_stream::ChunkedStream;
 use nativelink_util::evicting_map::{EvictingMap, LenEntry};
 use nativelink_util::instant_wrapper::InstantWrapper;
+use nativelink_util::metrics::{
+    EXECUTION_METRICS, ExecutionResult, ExecutionStage, make_execution_attributes,
+};
 use nativelink_util::spawn;
 use nativelink_util::task::JoinHandleDropGuard;
 use tokio::sync::{Notify, mpsc, watch};
@@ -630,6 +633,95 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI
             .is_same_stage(&new_awaited_action.state().stage);

         if !is_same_stage {
+            // Record metrics for stage transitions
+            let metrics = &*EXECUTION_METRICS;
+            let old_stage = &old_awaited_action.state().stage;
+            let new_stage = &new_awaited_action.state().stage;
+
+            // Track stage transitions
+            let base_attrs = make_execution_attributes(
+                "unknown",
+                None,
+                Some(old_awaited_action.action_info().priority),
+            );
+            metrics.execution_stage_transitions.add(1, &base_attrs);
+
+            // Update active count for old stage
+            let old_stage_attrs = match old_stage {
+                ActionStage::Unknown => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Unknown,
+                )],
+                ActionStage::CacheCheck => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::CacheCheck,
+                )],
+                ActionStage::Queued => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Queued,
+                )],
+                ActionStage::Executing => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Executing,
+                )],
+                ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
+                    vec![opentelemetry::KeyValue::new(
+                        nativelink_util::metrics::EXECUTION_STAGE,
+                        ExecutionStage::Completed,
+                    )]
+                }
+            };
+            metrics.execution_active_count.add(-1, &old_stage_attrs);
+
+            // Update active count for new stage
+            let new_stage_attrs = match new_stage {
+                ActionStage::Unknown => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Unknown,
+                )],
+                ActionStage::CacheCheck => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::CacheCheck,
+                )],
+                ActionStage::Queued => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Queued,
+                )],
+                ActionStage::Executing => vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_STAGE,
+                    ExecutionStage::Executing,
+                )],
+                ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
+                    vec![opentelemetry::KeyValue::new(
+                        nativelink_util::metrics::EXECUTION_STAGE,
+                        ExecutionStage::Completed,
+                    )]
+                }
+            };
+            metrics.execution_active_count.add(1, &new_stage_attrs);
+
+            // Record completion metrics
+            if let ActionStage::Completed(action_result) = new_stage {
+                let result_attrs = if action_result.exit_code == 0 {
+                    vec![opentelemetry::KeyValue::new(
+                        nativelink_util::metrics::EXECUTION_RESULT,
+                        ExecutionResult::Success,
+                    )]
+                } else {
+                    vec![opentelemetry::KeyValue::new(
+                        nativelink_util::metrics::EXECUTION_RESULT,
+                        ExecutionResult::Failure,
+                    )]
+                };
+                metrics.execution_completed_count.add(1, &result_attrs);
+            } else if let ActionStage::CompletedFromCache(_) = new_stage {
+                let result_attrs = vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_RESULT,
+                    ExecutionResult::CacheHit,
+                )];
+                metrics.execution_completed_count.add(1, &result_attrs);
+            }
+
             self.sorted_action_info_hash_keys
                 .process_state_changes(&old_awaited_action, &new_awaited_action)?;
             Self::process_state_changes_for_hash_key_map(
@@ -695,8 +787,11 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI
             ActionUniqueQualifier::Uncacheable(_unique_key) => None,
         };
         let operation_id = OperationId::default();
-        let awaited_action =
-            AwaitedAction::new(operation_id.clone(), action_info, (self.now_fn)().now());
+        let awaited_action = AwaitedAction::new(
+            operation_id.clone(),
+            action_info.clone(),
+            (self.now_fn)().now(),
+        );
         debug_assert!(
             ActionStage::Queued == awaited_action.state().stage,
             "Expected action to be queued"
         );
@@ -731,6 +826,15 @@
             }
         }

+        // Record metric for new action entering the queue
+        let metrics = &*EXECUTION_METRICS;
+        let _base_attrs = make_execution_attributes("unknown", None, Some(action_info.priority));
+        let queued_attrs = vec![opentelemetry::KeyValue::new(
+            nativelink_util::metrics::EXECUTION_STAGE,
+            ExecutionStage::Queued,
+        )];
+        metrics.execution_active_count.add(1, &queued_attrs);
+
         self.sorted_action_info_hash_keys
             .insert_sort_map_for_stage(
                 &ActionStage::Queued,
diff --git a/nativelink-util/BUILD.bazel b/nativelink-util/BUILD.bazel
index c288c7e6..d1e1e557 100644
--- a/nativelink-util/BUILD.bazel
+++ b/nativelink-util/BUILD.bazel
@@ -24,6 +24,7 @@ rust_library(
         "src/instant_wrapper.rs",
         "src/known_platform_property_provider.rs",
         "src/lib.rs",
+        "src/metrics.rs",
         "src/metrics_utils.rs",
         "src/operation_state_manager.rs",
         "src/origin_event.rs",
@@ -95,6 +96,7 @@ rust_test_suite(
         "tests/evicting_map_test.rs",
         "tests/fastcdc_test.rs",
         "tests/health_utils_test.rs",
+        "tests/metrics_test.rs",
         "tests/operation_id_tests.rs",
         "tests/origin_event_test.rs",
         "tests/proto_stream_utils_test.rs",
@@ -120,6 +122,7 @@ rust_test_suite(
         "@crates//:http-body-util",
         "@crates//:hyper-1.6.0",
         "@crates//:mock_instant",
+        "@crates//:opentelemetry",
         "@crates//:parking_lot",
         "@crates//:pretty_assertions",
         "@crates//:rand",
diff --git a/nativelink-util/src/lib.rs b/nativelink-util/src/lib.rs
index bcab5b23..c29069f6 100644
--- a/nativelink-util/src/lib.rs
+++ b/nativelink-util/src/lib.rs
@@ -25,6 +25,7 @@ pub mod fs;
 pub mod health_utils;
 pub mod instant_wrapper;
 pub mod known_platform_property_provider;
+pub mod metrics;
 pub mod metrics_utils;
 pub mod operation_state_manager;
 pub mod origin_event;
diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs
new file mode 100644
index 00000000..3b8dea07
--- /dev/null
+++ b/nativelink-util/src/metrics.rs
@@ -0,0 +1,618 @@
+// Copyright 2025 The NativeLink Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::LazyLock;
+
+use opentelemetry::{InstrumentationScope, KeyValue, Value, global, metrics};
+
+// Metric attribute keys for cache operations.
+pub const CACHE_TYPE: &str = "cache.type";
+pub const CACHE_OPERATION: &str = "cache.operation.name";
+pub const CACHE_RESULT: &str = "cache.operation.result";
+
+// Metric attribute keys for remote execution operations.
+pub const EXECUTION_STAGE: &str = "execution.stage";
+pub const EXECUTION_RESULT: &str = "execution.result";
+pub const EXECUTION_INSTANCE: &str = "execution.instance";
+pub const EXECUTION_PRIORITY: &str = "execution.priority";
+pub const EXECUTION_WORKER_ID: &str = "execution.worker_id";
+pub const EXECUTION_EXIT_CODE: &str = "execution.exit_code";
+
+/// Cache operation types for metrics classification.
+#[derive(Debug, Clone, Copy)]
+pub enum CacheOperationName {
+    /// Data retrieval operations (get, peek, contains, etc.)
+    Read,
+    /// Data storage operations (insert, update, replace, etc.)
+    Write,
+    /// Explicit data removal operations
+    Delete,
+    /// Automatic cache maintenance (evictions, TTL cleanup, etc.)
+    Evict,
+}
+
+impl From<CacheOperationName> for Value {
+    fn from(op: CacheOperationName) -> Self {
+        match op {
+            CacheOperationName::Read => Self::from("read"),
+            CacheOperationName::Write => Self::from("write"),
+            CacheOperationName::Delete => Self::from("delete"),
+            CacheOperationName::Evict => Self::from("evict"),
+        }
+    }
+}
+
+/// Results of cache operations.
+///
+/// Result semantics vary by operation type:
+/// - Read: Hit/Miss/Expired indicate data availability
+/// - Write/Delete/Evict: Success/Error indicate completion status
+#[derive(Debug, Clone, Copy)]
+pub enum CacheOperationResult {
+    /// Data found and valid (Read operations)
+    Hit,
+    /// Data not found (Read operations)
+    Miss,
+    /// Data found but invalid/expired (Read operations)
+    Expired,
+    /// Operation completed successfully (Write/Delete/Evict operations)
+    Success,
+    /// Operation failed (any operation type)
+    Error,
+}
+
+impl From<CacheOperationResult> for Value {
+    fn from(result: CacheOperationResult) -> Self {
+        match result {
+            CacheOperationResult::Hit => Self::from("hit"),
+            CacheOperationResult::Miss => Self::from("miss"),
+            CacheOperationResult::Expired => Self::from("expired"),
+            CacheOperationResult::Success => Self::from("success"),
+            CacheOperationResult::Error => Self::from("error"),
+        }
+    }
+}
+
+/// Remote execution stages for metrics classification.
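+///
+/// Each stage converts into an `opentelemetry::Value` attribute value through
+/// the `From` impl below. A minimal sketch (illustrative, not part of the
+/// patch):
+///
+/// ```ignore
+/// use opentelemetry::Value;
+/// assert_eq!(Value::from(ExecutionStage::Queued), Value::from("queued"));
+/// ```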
+#[derive(Debug, Clone, Copy)]
+pub enum ExecutionStage {
+    /// Unknown stage
+    Unknown,
+    /// Checking cache for existing results
+    CacheCheck,
+    /// Action is queued waiting for execution
+    Queued,
+    /// Action is being executed by a worker
+    Executing,
+    /// Action execution completed
+    Completed,
+}
+
+impl From<ExecutionStage> for Value {
+    fn from(stage: ExecutionStage) -> Self {
+        match stage {
+            ExecutionStage::Unknown => Self::from("unknown"),
+            ExecutionStage::CacheCheck => Self::from("cache_check"),
+            ExecutionStage::Queued => Self::from("queued"),
+            ExecutionStage::Executing => Self::from("executing"),
+            ExecutionStage::Completed => Self::from("completed"),
+        }
+    }
+}
+
+/// Results of remote execution operations.
+#[derive(Debug, Clone, Copy)]
+pub enum ExecutionResult {
+    /// Execution completed successfully
+    Success,
+    /// Execution failed
+    Failure,
+    /// Execution was cancelled
+    Cancelled,
+    /// Execution timed out
+    Timeout,
+    /// Result was found in cache
+    CacheHit,
+}
+
+impl From<ExecutionResult> for Value {
+    fn from(result: ExecutionResult) -> Self {
+        match result {
+            ExecutionResult::Success => Self::from("success"),
+            ExecutionResult::Failure => Self::from("failure"),
+            ExecutionResult::Cancelled => Self::from("cancelled"),
+            ExecutionResult::Timeout => Self::from("timeout"),
+            ExecutionResult::CacheHit => Self::from("cache_hit"),
+        }
+    }
+}
+
+/// Pre-allocated attribute combinations for efficient cache metrics collection.
+///
+/// Avoids runtime allocation by pre-computing common attribute combinations
+/// for cache operations and results.
+#[derive(Debug)]
+pub struct CacheMetricAttrs {
+    // Read operation attributes
+    read_hit: Vec<KeyValue>,
+    read_miss: Vec<KeyValue>,
+    read_expired: Vec<KeyValue>,
+
+    // Write operation attributes
+    write_success: Vec<KeyValue>,
+    write_error: Vec<KeyValue>,
+
+    // Delete operation attributes
+    delete_success: Vec<KeyValue>,
+    delete_miss: Vec<KeyValue>,
+    delete_error: Vec<KeyValue>,
+
+    // Evict operation attributes
+    evict_success: Vec<KeyValue>,
+    evict_expired: Vec<KeyValue>,
+}
+
+impl CacheMetricAttrs {
+    /// Creates a new set of pre-computed attributes.
+    ///
+    /// The `base_attrs` are included in all attribute combinations (e.g., cache
+    /// type, instance ID).
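+    ///
+    /// A minimal usage sketch (illustrative; the attribute value is hypothetical):
+    ///
+    /// ```ignore
+    /// use opentelemetry::KeyValue;
+    /// let attrs = CacheMetricAttrs::new(&[KeyValue::new(CACHE_TYPE, "memory")]);
+    /// // One base attribute plus the operation and result attributes.
+    /// assert_eq!(attrs.read_hit().len(), 3);
+    /// ```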
+    #[must_use]
+    pub fn new(base_attrs: &[KeyValue]) -> Self {
+        let make_attrs = |op: CacheOperationName, result: CacheOperationResult| {
+            let mut attrs = base_attrs.to_vec();
+            attrs.push(KeyValue::new(CACHE_OPERATION, op));
+            attrs.push(KeyValue::new(CACHE_RESULT, result));
+            attrs
+        };
+
+        Self {
+            read_hit: make_attrs(CacheOperationName::Read, CacheOperationResult::Hit),
+            read_miss: make_attrs(CacheOperationName::Read, CacheOperationResult::Miss),
+            read_expired: make_attrs(CacheOperationName::Read, CacheOperationResult::Expired),
+
+            write_success: make_attrs(CacheOperationName::Write, CacheOperationResult::Success),
+            write_error: make_attrs(CacheOperationName::Write, CacheOperationResult::Error),
+
+            delete_success: make_attrs(CacheOperationName::Delete, CacheOperationResult::Success),
+            delete_miss: make_attrs(CacheOperationName::Delete, CacheOperationResult::Miss),
+            delete_error: make_attrs(CacheOperationName::Delete, CacheOperationResult::Error),
+
+            evict_success: make_attrs(CacheOperationName::Evict, CacheOperationResult::Success),
+            evict_expired: make_attrs(CacheOperationName::Evict, CacheOperationResult::Expired),
+        }
+    }
+
+    // Attribute accessors
+    #[must_use]
+    pub fn read_hit(&self) -> &[KeyValue] {
+        &self.read_hit
+    }
+    #[must_use]
+    pub fn read_miss(&self) -> &[KeyValue] {
+        &self.read_miss
+    }
+    #[must_use]
+    pub fn read_expired(&self) -> &[KeyValue] {
+        &self.read_expired
+    }
+    #[must_use]
+    pub fn write_success(&self) -> &[KeyValue] {
+        &self.write_success
+    }
+    #[must_use]
+    pub fn write_error(&self) -> &[KeyValue] {
+        &self.write_error
+    }
+    #[must_use]
+    pub fn delete_success(&self) -> &[KeyValue] {
+        &self.delete_success
+    }
+    #[must_use]
+    pub fn delete_miss(&self) -> &[KeyValue] {
+        &self.delete_miss
+    }
+    #[must_use]
+    pub fn delete_error(&self) -> &[KeyValue] {
+        &self.delete_error
+    }
+    #[must_use]
+    pub fn evict_success(&self) -> &[KeyValue] {
+        &self.evict_success
+    }
+    #[must_use]
+    pub fn evict_expired(&self) -> &[KeyValue] {
+        &self.evict_expired
+    }
+}
+
+/// Pre-allocated attribute combinations for efficient remote execution metrics collection.
+#[derive(Debug)]
+pub struct ExecutionMetricAttrs {
+    // Stage transition attributes
+    unknown: Vec<KeyValue>,
+    cache_check: Vec<KeyValue>,
+    queued: Vec<KeyValue>,
+    executing: Vec<KeyValue>,
+    completed_success: Vec<KeyValue>,
+    completed_failure: Vec<KeyValue>,
+    completed_cancelled: Vec<KeyValue>,
+    completed_timeout: Vec<KeyValue>,
+    completed_cache_hit: Vec<KeyValue>,
+}
+
+impl ExecutionMetricAttrs {
+    /// Creates a new set of pre-computed attributes.
+    ///
+    /// The `base_attrs` are included in all attribute combinations (e.g., instance
+    /// name, worker ID).
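+    ///
+    /// A minimal usage sketch (illustrative; the attribute value is hypothetical):
+    ///
+    /// ```ignore
+    /// use opentelemetry::KeyValue;
+    /// let attrs = ExecutionMetricAttrs::new(&[KeyValue::new(EXECUTION_INSTANCE, "main")]);
+    /// // Completed stages carry both a stage and a result attribute.
+    /// assert_eq!(attrs.completed_success().len(), 3);
+    /// ```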
+    #[must_use]
+    pub fn new(base_attrs: &[KeyValue]) -> Self {
+        let make_attrs = |stage: ExecutionStage, result: Option<ExecutionResult>| {
+            let mut attrs = base_attrs.to_vec();
+            attrs.push(KeyValue::new(EXECUTION_STAGE, stage));
+            if let Some(result) = result {
+                attrs.push(KeyValue::new(EXECUTION_RESULT, result));
+            }
+            attrs
+        };
+
+        Self {
+            unknown: make_attrs(ExecutionStage::Unknown, None),
+            cache_check: make_attrs(ExecutionStage::CacheCheck, None),
+            queued: make_attrs(ExecutionStage::Queued, None),
+            executing: make_attrs(ExecutionStage::Executing, None),
+            completed_success: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::Success),
+            ),
+            completed_failure: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::Failure),
+            ),
+            completed_cancelled: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::Cancelled),
+            ),
+            completed_timeout: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::Timeout),
+            ),
+            completed_cache_hit: make_attrs(
+                ExecutionStage::Completed,
+                Some(ExecutionResult::CacheHit),
+            ),
+        }
+    }
+
+    // Attribute accessors
+    #[must_use]
+    pub fn unknown(&self) -> &[KeyValue] {
+        &self.unknown
+    }
+    #[must_use]
+    pub fn cache_check(&self) -> &[KeyValue] {
+        &self.cache_check
+    }
+    #[must_use]
+    pub fn queued(&self) -> &[KeyValue] {
+        &self.queued
+    }
+    #[must_use]
+    pub fn executing(&self) -> &[KeyValue] {
+        &self.executing
+    }
+    #[must_use]
+    pub fn completed_success(&self) -> &[KeyValue] {
+        &self.completed_success
+    }
+    #[must_use]
+    pub fn completed_failure(&self) -> &[KeyValue] {
+        &self.completed_failure
+    }
+    #[must_use]
+    pub fn completed_cancelled(&self) -> &[KeyValue] {
+        &self.completed_cancelled
+    }
+    #[must_use]
+    pub fn completed_timeout(&self) -> &[KeyValue] {
+        &self.completed_timeout
+    }
+    #[must_use]
+    pub fn completed_cache_hit(&self) -> &[KeyValue] {
+        &self.completed_cache_hit
+    }
+}
+
+/// Global cache metrics instruments.
+pub static CACHE_METRICS: LazyLock<CacheMetrics> = LazyLock::new(|| {
+    let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build());
+
+    CacheMetrics {
+        cache_operation_duration: meter
+            .f64_histogram("cache.operation.duration")
+            .with_description("Duration of cache operations in milliseconds")
+            .with_unit("ms")
+            // The range of these is quite large as a cache might be backed by
+            // memory, a filesystem, or network storage. The current values were
+            // determined empirically and might need adjustment.
+            .with_boundaries(vec![
+                // Microsecond range
+                0.001, // 1μs
+                0.005, // 5μs
+                0.01,  // 10μs
+                0.05,  // 50μs
+                0.1,   // 100μs
+                // Sub-millisecond range
+                0.2, // 200μs
+                0.5, // 500μs
+                1.0, // 1ms
+                // Low millisecond range
+                2.0,   // 2ms
+                5.0,   // 5ms
+                10.0,  // 10ms
+                20.0,  // 20ms
+                50.0,  // 50ms
+                100.0, // 100ms
+                // Higher latency range
+                200.0,  // 200ms
+                500.0,  // 500ms
+                1000.0, // 1 second
+                2000.0, // 2 seconds
+                5000.0, // 5 seconds
+            ])
+            .build(),
+
+        cache_operations: meter
+            .u64_counter("cache.operations")
+            .with_description("Total cache operations by type and result")
+            .build(),
+
+        cache_io: meter
+            .u64_counter("cache.io")
+            .with_description("Total bytes processed by cache operations")
+            .with_unit("By")
+            .build(),
+
+        cache_size: meter
+            .i64_up_down_counter("cache.size")
+            .with_description("Current total size of cached data")
+            .with_unit("By")
+            .build(),
+
+        cache_entries: meter
+            .i64_up_down_counter("cache.entries")
+            .with_description("Current number of cached entries")
+            .with_unit("{entry}")
+            .build(),
+
+        cache_entry_size: meter
+            .u64_histogram("cache.item.size")
+            .with_description("Size distribution of cached entries")
+            .with_unit("By")
+            .build(),
+    }
+});
+
+/// OpenTelemetry metrics instruments for cache monitoring.
+#[derive(Debug)]
+pub struct CacheMetrics {
+    /// Histogram of cache operation durations in milliseconds
+    pub cache_operation_duration: metrics::Histogram<f64>,
+    /// Counter of cache operations by type and result
+    pub cache_operations: metrics::Counter<u64>,
+    /// Counter of bytes read/written during cache operations
+    pub cache_io: metrics::Counter<u64>,
+    /// Current total size of all cached data in bytes
+    pub cache_size: metrics::UpDownCounter<i64>,
+    /// Current number of entries in cache
+    pub cache_entries: metrics::UpDownCounter<i64>,
+    /// Histogram of individual cache entry sizes in bytes
+    pub cache_entry_size: metrics::Histogram<u64>,
+}
+
+/// Global remote execution metrics instruments.
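+///
+/// Instruments are created on first access and are safe to use from any
+/// thread. A minimal sketch of recording through them (illustrative values):
+///
+/// ```ignore
+/// use nativelink_util::metrics::{EXECUTION_METRICS, make_execution_attributes};
+///
+/// let attrs = make_execution_attributes("main", None, Some(0));
+/// EXECUTION_METRICS.execution_retry_count.add(1, &attrs);
+/// ```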
+pub static EXECUTION_METRICS: LazyLock<ExecutionMetrics> = LazyLock::new(|| {
+    let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build());
+
+    ExecutionMetrics {
+        execution_stage_duration: meter
+            .f64_histogram("execution.stage.duration")
+            .with_description("Duration of each execution stage in seconds")
+            .with_unit("s")
+            .with_boundaries(vec![
+                // Sub-second range
+                0.001, // 1ms
+                0.01,  // 10ms
+                0.1,   // 100ms
+                0.5,   // 500ms
+                1.0,   // 1s
+                // Multi-second range
+                2.0,    // 2s
+                5.0,    // 5s
+                10.0,   // 10s
+                30.0,   // 30s
+                60.0,   // 1 minute
+                120.0,  // 2 minutes
+                300.0,  // 5 minutes
+                600.0,  // 10 minutes
+                1800.0, // 30 minutes
+                3600.0, // 1 hour
+            ])
+            .build(),
+
+        execution_total_duration: meter
+            .f64_histogram("execution.total.duration")
+            .with_description(
+                "Total duration of action execution from submission to completion in seconds",
+            )
+            .with_unit("s")
+            .with_boundaries(vec![
+                // Sub-second range
+                0.01, // 10ms
+                0.1,  // 100ms
+                0.5,  // 500ms
+                1.0,  // 1s
+                // Multi-second range
+                5.0,    // 5s
+                10.0,   // 10s
+                30.0,   // 30s
+                60.0,   // 1 minute
+                300.0,  // 5 minutes
+                600.0,  // 10 minutes
+                1800.0, // 30 minutes
+                3600.0, // 1 hour
+                7200.0, // 2 hours
+            ])
+            .build(),
+
+        execution_queue_time: meter
+            .f64_histogram("execution.queue.time")
+            .with_description("Time spent waiting in queue before execution in seconds")
+            .with_unit("s")
+            .with_boundaries(vec![
+                0.001, // 1ms
+                0.01,  // 10ms
+                0.1,   // 100ms
+                0.5,   // 500ms
+                1.0,   // 1s
+                2.0,   // 2s
+                5.0,   // 5s
+                10.0,  // 10s
+                30.0,  // 30s
+                60.0,  // 1 minute
+                300.0, // 5 minutes
+                600.0, // 10 minutes
+            ])
+            .build(),
+
+        execution_active_count: meter
+            .i64_up_down_counter("execution.active.count")
+            .with_description("Number of actions currently in each stage")
+            .with_unit("{action}")
+            .build(),
+
+        execution_completed_count: meter
+            .u64_counter("execution.completed.count")
+            .with_description("Total number of completed executions by result")
+            .with_unit("{action}")
+            .build(),
+
+        execution_stage_transitions: meter
+            .u64_counter("execution.stage.transitions")
+            .with_description("Number of stage transitions")
+            .with_unit("{transition}")
+            .build(),
+
+        execution_output_size: meter
+            .u64_histogram("execution.output.size")
+            .with_description("Size of execution outputs in bytes")
+            .with_unit("By")
+            .with_boundaries(vec![
+                1_024.0,          // 1KB
+                10_240.0,         // 10KB
+                102_400.0,        // 100KB
+                1_048_576.0,      // 1MB
+                10_485_760.0,     // 10MB
+                104_857_600.0,    // 100MB
+                1_073_741_824.0,  // 1GB
+                10_737_418_240.0, // 10GB
+            ])
+            .build(),
+
+        execution_cpu_time: meter
+            .f64_histogram("execution.cpu.time")
+            .with_description("CPU time consumed by action execution in seconds")
+            .with_unit("s")
+            .with_boundaries(vec![
+                0.01,   // 10ms
+                0.1,    // 100ms
+                1.0,    // 1s
+                10.0,   // 10s
+                60.0,   // 1 minute
+                300.0,  // 5 minutes
+                600.0,  // 10 minutes
+                1800.0, // 30 minutes
+                3600.0, // 1 hour
+            ])
+            .build(),
+
+        execution_memory_usage: meter
+            .u64_histogram("execution.memory.usage")
+            .with_description("Peak memory usage during execution in bytes")
+            .with_unit("By")
+            .with_boundaries(vec![
+                1_048_576.0,      // 1MB
+                10_485_760.0,     // 10MB
+                104_857_600.0,    // 100MB
+                524_288_000.0,    // 500MB
+                1_073_741_824.0,  // 1GB
+                5_368_709_120.0,  // 5GB
+                10_737_418_240.0, // 10GB
+                53_687_091_200.0, // 50GB
+            ])
+            .build(),
+
+        execution_retry_count: meter
+            .u64_counter("execution.retry.count")
+            .with_description("Number of execution retries")
+            .with_unit("{retry}")
+            .build(),
+    }
+});
+
+/// OpenTelemetry metrics instruments for remote execution monitoring.
+#[derive(Debug)]
+pub struct ExecutionMetrics {
+    /// Histogram of stage durations in seconds
+    pub execution_stage_duration: metrics::Histogram<f64>,
+    /// Histogram of total execution durations in seconds
+    pub execution_total_duration: metrics::Histogram<f64>,
+    /// Histogram of queue wait times in seconds
+    pub execution_queue_time: metrics::Histogram<f64>,
+    /// Current number of actions in each stage
+    pub execution_active_count: metrics::UpDownCounter<i64>,
+    /// Total number of completed executions
+    pub execution_completed_count: metrics::Counter<u64>,
+    /// Number of stage transitions
+    pub execution_stage_transitions: metrics::Counter<u64>,
+    /// Histogram of output sizes in bytes
+    pub execution_output_size: metrics::Histogram<u64>,
+    /// Histogram of CPU time in seconds
+    pub execution_cpu_time: metrics::Histogram<f64>,
+    /// Histogram of peak memory usage in bytes
+    pub execution_memory_usage: metrics::Histogram<u64>,
+    /// Counter for execution retries
+    pub execution_retry_count: metrics::Counter<u64>,
+}
+
+/// Helper function to create attributes for execution metrics
+#[must_use]
+pub fn make_execution_attributes(
+    instance_name: &str,
+    worker_id: Option<&str>,
+    priority: Option<i32>,
+) -> Vec<KeyValue> {
+    let mut attrs = vec![KeyValue::new(EXECUTION_INSTANCE, instance_name.to_string())];
+
+    if let Some(worker_id) = worker_id {
+        attrs.push(KeyValue::new(EXECUTION_WORKER_ID, worker_id.to_string()));
+    }
+
+    if let Some(priority) = priority {
+        attrs.push(KeyValue::new(EXECUTION_PRIORITY, i64::from(priority)));
+    }
+
+    attrs
+}
diff --git a/nativelink-util/tests/metrics_test.rs b/nativelink-util/tests/metrics_test.rs
new file mode 100644
index 00000000..0f8548de
--- /dev/null
+++ b/nativelink-util/tests/metrics_test.rs
@@ -0,0 +1,114 @@
+// Copyright 2025 The NativeLink Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use nativelink_util::metrics::{
+    CACHE_METRICS, CacheMetricAttrs, EXECUTION_METRICS, ExecutionMetricAttrs,
+    make_execution_attributes,
+};
+use opentelemetry::KeyValue;
+
+#[test]
+fn test_cache_metric_attrs() {
+    let base_attrs = vec![
+        KeyValue::new("cache.type", "test_cache"),
+        KeyValue::new("instance", "test_instance"),
+    ];
+
+    let attrs = CacheMetricAttrs::new(&base_attrs);
+
+    // Verify that the pre-computed attributes contain the expected values
+    let read_hit_attrs = attrs.read_hit();
+    assert_eq!(read_hit_attrs.len(), 4);
+    assert!(
+        read_hit_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "cache.type" && kv.value.to_string() == "test_cache")
+    );
+    assert!(
+        read_hit_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "cache.operation.name" && kv.value.to_string() == "read")
+    );
+    assert!(
+        read_hit_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "cache.operation.result" && kv.value.to_string() == "hit")
+    );
+}
+
+#[test]
+fn test_execution_metric_attrs() {
+    let base_attrs = vec![
+        KeyValue::new("execution.instance", "test_instance"),
+        KeyValue::new("execution.worker_id", "worker_123"),
+    ];
+
+    let attrs = ExecutionMetricAttrs::new(&base_attrs);
+
+    // Verify that the pre-computed attributes contain the expected values
+    let queued_attrs = attrs.queued();
+    assert_eq!(queued_attrs.len(), 3);
+    assert!(queued_attrs.iter().any(
+        |kv| kv.key.as_str() == "execution.instance" && kv.value.to_string() == "test_instance"
+    ));
+    assert!(
+        queued_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.stage" && kv.value.to_string() == "queued")
+    );
+
+    let completed_success_attrs = attrs.completed_success();
+    assert_eq!(completed_success_attrs.len(), 4);
+    assert!(
+        completed_success_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.stage" && kv.value.to_string() == "completed")
+    );
+    assert!(
+        completed_success_attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.result" && kv.value.to_string() == "success")
+    );
+}
+
+#[test]
+fn test_make_execution_attributes() {
+    let attrs = make_execution_attributes("test_instance", Some("worker_456"), Some(100));
+
+    assert_eq!(attrs.len(), 3);
+    assert!(attrs.iter().any(
+        |kv| kv.key.as_str() == "execution.instance" && kv.value.to_string() == "test_instance"
+    ));
+    assert!(
+        attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.worker_id"
+                && kv.value.to_string() == "worker_456")
+    );
+    assert!(
+        attrs
+            .iter()
+            .any(|kv| kv.key.as_str() == "execution.priority"
+                && kv.value == opentelemetry::Value::I64(100))
+    );
+}
+
+#[test]
+fn test_metrics_lazy_initialization() {
+    // Verify that the lazy static initialization works
+    let _cache_metrics = &*CACHE_METRICS;
+    let _execution_metrics = &*EXECUTION_METRICS;
+
+    // If we got here without panicking, the metrics were initialized successfully
+}

From 5ae7d49595cac0317c590621d44b8ffd2f7c9f78 Mon Sep 17 00:00:00 2001
From: Marcus
Date: Sat, 6 Sep 2025 17:03:09 +0900
Subject: [PATCH 2/6] from trait

---
 .../src/memory_awaited_action_db.rs | 56 +++----------------
 nativelink-util/src/metrics.rs      | 16 ++++++
 2 files changed, 24 insertions(+), 48 deletions(-)

diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs
index 04bee355..4ffcd2d2 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -647,57 +647,17 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI
             metrics.execution_stage_transitions.add(1, &base_attrs);

             // Update active count for old stage
-            let old_stage_attrs = match old_stage {
-                ActionStage::Unknown => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Unknown,
-                )],
-                ActionStage::CacheCheck => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::CacheCheck,
-                )],
-                ActionStage::Queued => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Queued,
-                )],
-                ActionStage::Executing => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Executing,
-                )],
-                ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
-                    vec![opentelemetry::KeyValue::new(
-                        nativelink_util::metrics::EXECUTION_STAGE,
-                        ExecutionStage::Completed,
-                    )]
-                }
-            };
+            let old_stage_attrs = vec![opentelemetry::KeyValue::new(
+                nativelink_util::metrics::EXECUTION_STAGE,
+                ExecutionStage::from(old_stage.clone()),
+            )];
             metrics.execution_active_count.add(-1, &old_stage_attrs);

             // Update active count for new stage
-            let new_stage_attrs = match new_stage {
-                ActionStage::Unknown => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Unknown,
-                )],
-                ActionStage::CacheCheck => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::CacheCheck,
-                )],
-                ActionStage::Queued => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Queued,
-                )],
-                ActionStage::Executing => vec![opentelemetry::KeyValue::new(
-                    nativelink_util::metrics::EXECUTION_STAGE,
-                    ExecutionStage::Executing,
-                )],
-                ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
-                    vec![opentelemetry::KeyValue::new(
-                        nativelink_util::metrics::EXECUTION_STAGE,
-                        ExecutionStage::Completed,
-                    )]
-                }
-            };
+            let new_stage_attrs = vec![opentelemetry::KeyValue::new(
+                nativelink_util::metrics::EXECUTION_STAGE,
+                ExecutionStage::from(new_stage.clone()),
+            )];
             metrics.execution_active_count.add(1, &new_stage_attrs);

             // Record completion metrics
diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs
index 3b8dea07..315b5dcb 100644
--- a/nativelink-util/src/metrics.rs
+++ b/nativelink-util/src/metrics.rs
@@ -16,6 +16,8 @@ use std::sync::LazyLock;

 use opentelemetry::{InstrumentationScope, KeyValue, Value, global, metrics};

+use crate::action_messages::ActionStage;
+
 // Metric attribute keys for cache operations.
 pub const CACHE_TYPE: &str = "cache.type";
 pub const CACHE_OPERATION: &str = "cache.operation.name";
 pub const CACHE_RESULT: &str = "cache.operation.result";
@@ -111,6 +113,20 @@ impl From<ExecutionStage> for Value {
     }
 }

+impl From<ActionStage> for ExecutionStage {
+    fn from(stage: ActionStage) -> Self {
+        match stage {
+            ActionStage::Unknown => ExecutionStage::Unknown,
+            ActionStage::CacheCheck => ExecutionStage::CacheCheck,
+            ActionStage::Queued => ExecutionStage::Queued,
+            ActionStage::Executing => ExecutionStage::Executing,
+            ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
+                ExecutionStage::Completed
+            }
+        }
+    }
+}
+
 /// Results of remote execution operations.
 #[derive(Debug, Clone, Copy)]
 pub enum ExecutionResult {

From 2b385341a688db720b5a5b12aae4aed8eeb41c6b Mon Sep 17 00:00:00 2001
From: Marcus
Date: Sat, 6 Sep 2025 17:14:12 +0900
Subject: [PATCH 3/6] add tests and refactor expensive clone

---
 .../src/memory_awaited_action_db.rs   |  4 +-
 nativelink-util/src/metrics.rs        | 16 +++-
 nativelink-util/tests/metrics_test.rs | 86 ++++++++++++++++++-
 3 files changed, 102 insertions(+), 4 deletions(-)

diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs
index 4ffcd2d2..d31e48a4 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -649,14 +649,14 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI
             // Update active count for old stage
             let old_stage_attrs = vec![opentelemetry::KeyValue::new(
                 nativelink_util::metrics::EXECUTION_STAGE,
-                ExecutionStage::from(old_stage.clone()),
+                ExecutionStage::from(old_stage),
             )];
             metrics.execution_active_count.add(-1, &old_stage_attrs);

             // Update active count for new stage
             let new_stage_attrs = vec![opentelemetry::KeyValue::new(
                 nativelink_util::metrics::EXECUTION_STAGE,
-                ExecutionStage::from(new_stage.clone()),
+                ExecutionStage::from(new_stage),
             )];
             metrics.execution_active_count.add(1, &new_stage_attrs);

diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs
index 315b5dcb..63dce9cc 100644
--- a/nativelink-util/src/metrics.rs
+++ b/nativelink-util/src/metrics.rs
@@ -87,7 +87,7 @@ impl From<CacheOperationResult> for Value {
 }

 /// Remote execution stages for metrics classification.
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum ExecutionStage {
     /// Unknown stage
     Unknown,
@@ -127,6 +127,20 @@ impl From<ActionStage> for ExecutionStage {
     }
 }

+impl From<&ActionStage> for ExecutionStage {
+    fn from(stage: &ActionStage) -> Self {
+        match stage {
+            ActionStage::Unknown => ExecutionStage::Unknown,
+            ActionStage::CacheCheck => ExecutionStage::CacheCheck,
+            ActionStage::Queued => ExecutionStage::Queued,
+            ActionStage::Executing => ExecutionStage::Executing,
+            ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
+                ExecutionStage::Completed
+            }
+        }
+    }
+}
+
 /// Results of remote execution operations.
 #[derive(Debug, Clone, Copy)]
 pub enum ExecutionResult {
diff --git a/nativelink-util/tests/metrics_test.rs b/nativelink-util/tests/metrics_test.rs
index 0f8548de..e52bfb2d 100644
--- a/nativelink-util/tests/metrics_test.rs
+++ b/nativelink-util/tests/metrics_test.rs
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use nativelink_util::action_messages::{ActionResult, ActionStage}; use nativelink_util::metrics::{ - CACHE_METRICS, CacheMetricAttrs, EXECUTION_METRICS, ExecutionMetricAttrs, + CACHE_METRICS, CacheMetricAttrs, EXECUTION_METRICS, ExecutionMetricAttrs, ExecutionStage, make_execution_attributes, }; use opentelemetry::KeyValue; @@ -112,3 +113,86 @@ fn test_metrics_lazy_initialization() { // If we got here without panicking, the metrics were initialized successfully } + +#[test] +fn test_action_stage_to_execution_stage_conversion() { + // Test conversion from owned ActionStage values + assert_eq!( + ExecutionStage::from(ActionStage::Unknown), + ExecutionStage::Unknown + ); + assert_eq!( + ExecutionStage::from(ActionStage::CacheCheck), + ExecutionStage::CacheCheck + ); + assert_eq!( + ExecutionStage::from(ActionStage::Queued), + ExecutionStage::Queued + ); + assert_eq!( + ExecutionStage::from(ActionStage::Executing), + ExecutionStage::Executing + ); + + // Test that Completed variants map to ExecutionStage::Completed + let action_result = ActionResult::default(); + assert_eq!( + ExecutionStage::from(ActionStage::Completed(action_result.clone())), + ExecutionStage::Completed + ); + + // Note: We can't easily test CompletedFromCache without creating a ProtoActionResult, + // but the implementation handles it the same as Completed +} + +#[test] +fn test_action_stage_ref_to_execution_stage_conversion() { + // Test conversion from ActionStage references + let unknown = ActionStage::Unknown; + let cache_check = ActionStage::CacheCheck; + let queued = ActionStage::Queued; + let executing = ActionStage::Executing; + let completed = ActionStage::Completed(ActionResult::default()); + + assert_eq!(ExecutionStage::from(&unknown), ExecutionStage::Unknown); + assert_eq!( + ExecutionStage::from(&cache_check), + ExecutionStage::CacheCheck + ); + assert_eq!(ExecutionStage::from(&queued), ExecutionStage::Queued); + assert_eq!(ExecutionStage::from(&executing), ExecutionStage::Executing); + assert_eq!(ExecutionStage::from(&completed), ExecutionStage::Completed); +} + +#[test] +fn test_action_stage_conversion_avoids_clone() { + use nativelink_util::action_messages::{FileInfo, NameOrPath}; + use nativelink_util::common::DigestInfo; + + // This test verifies that using a reference doesn't clone the large ActionResult + let large_file_info = FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::new([0u8; 32], 100), + is_executable: false, + }; + let large_action_result = ActionResult { + output_files: vec![large_file_info; 1000], // Large vector to make clone expensive + ..Default::default() + }; + let completed = ActionStage::Completed(large_action_result); + + // Using a reference should be fast even with large data + let start = std::time::Instant::now(); + for _ in 0..10000 { + let _stage = ExecutionStage::from(&completed); + } + let elapsed = start.elapsed(); + + // This should complete very quickly since we're not cloning + // In practice, 10000 conversions should take less than 1ms + assert!( + elapsed.as_millis() < 100, + "Reference conversion took too long: {:?}", + elapsed + ); +} From 2960aa13c464678fd7e3a13b4c6376d55397e7fd Mon Sep 17 00:00:00 2001 From: Marcus Date: Sat, 6 Sep 2025 17:26:51 +0900 Subject: [PATCH 4/6] moved to the ternary operator --- .../src/memory_awaited_action_db.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/nativelink-scheduler/src/memory_awaited_action_db.rs 
b/nativelink-scheduler/src/memory_awaited_action_db.rs
index d31e48a4..38d3ce97 100644
--- a/nativelink-scheduler/src/memory_awaited_action_db.rs
+++ b/nativelink-scheduler/src/memory_awaited_action_db.rs
@@ -662,17 +662,14 @@ impl<I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Sync> AwaitedActionDbI

             // Record completion metrics
             if let ActionStage::Completed(action_result) = new_stage {
-                let result_attrs = if action_result.exit_code == 0 {
-                    vec![opentelemetry::KeyValue::new(
-                        nativelink_util::metrics::EXECUTION_RESULT,
-                        ExecutionResult::Success,
-                    )]
-                } else {
-                    vec![opentelemetry::KeyValue::new(
-                        nativelink_util::metrics::EXECUTION_RESULT,
-                        ExecutionResult::Failure,
-                    )]
-                };
+                let result_attrs = vec![opentelemetry::KeyValue::new(
+                    nativelink_util::metrics::EXECUTION_RESULT,
+                    if action_result.exit_code == 0 {
+                        ExecutionResult::Success
+                    } else {
+                        ExecutionResult::Failure
+                    },
+                )];
                 metrics.execution_completed_count.add(1, &result_attrs);
             } else if let ActionStage::CompletedFromCache(_) = new_stage {
                 let result_attrs = vec![opentelemetry::KeyValue::new(

From a7eb2d96176c891496ba7c173e66db4cc28301b6 Mon Sep 17 00:00:00 2001
From: Marcus
Date: Sat, 6 Sep 2025 22:41:12 +0900
Subject: [PATCH 5/6] add docs wrap otel impl

---
 .../vocabularies/TraceMachina/accept.txt      |   5 +
 deployment-examples/metrics/README.md         | 420 ++++++++++++++++++
 .../metrics/docker-compose.yaml               | 139 ++++++
 .../provisioning/dashboards/dashboard.yaml    |  12 +
 .../provisioning/datasources/prometheus.yaml  |  23 +
 .../metrics/kubernetes/otel-collector.yaml    | 274 ++++++++++++
 .../metrics/kubernetes/prometheus.yaml        | 338 ++++++++++++++
 .../metrics/otel-collector-config.yaml        | 139 ++++++
 .../metrics/prometheus-config.yaml            | 169 +++++++
 .../metrics/prometheus-recording-rules.yml    | 277 ++++++++++++
 .../docs/docs/deployment-examples/metrics.mdx | 420 ++++++++++++++++++
 11 files changed, 2216 insertions(+)
 create mode 100644 deployment-examples/metrics/README.md
 create mode 100644 deployment-examples/metrics/docker-compose.yaml
 create mode 100644 deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml
 create mode 100644 deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml
 create mode 100644 deployment-examples/metrics/kubernetes/otel-collector.yaml
 create mode 100644 deployment-examples/metrics/kubernetes/prometheus.yaml
 create mode 100644 deployment-examples/metrics/otel-collector-config.yaml
 create mode 100644 deployment-examples/metrics/prometheus-config.yaml
 create mode 100644 deployment-examples/metrics/prometheus-recording-rules.yml
 create mode 100644 web/platform/src/content/docs/docs/deployment-examples/metrics.mdx

diff --git a/.github/styles/config/vocabularies/TraceMachina/accept.txt b/.github/styles/config/vocabularies/TraceMachina/accept.txt
index d26ccae5..ce038a45 100644
--- a/.github/styles/config/vocabularies/TraceMachina/accept.txt
+++ b/.github/styles/config/vocabularies/TraceMachina/accept.txt
@@ -16,6 +16,8 @@ FFI
 FFIs
 GPUs
 Goma
+gzip
+[Hh]eatmap
 [Hh]ermeticity
 Istio
 JDK
@@ -106,7 +108,10 @@ Thirdwave
 Norwest
 Databricks
 Datadog
+Downsampling
 Brex
 Citrix
 Menlo
 benchmarked
+Thanos
+Quickwit
diff --git a/deployment-examples/metrics/README.md b/deployment-examples/metrics/README.md
new file mode 100644
index 00000000..142846e8
--- /dev/null
+++ b/deployment-examples/metrics/README.md
@@ -0,0 +1,420 @@
+# NativeLink Metrics with OpenTelemetry
+
+This directory contains configurations and examples for collecting, processing, and visualizing NativeLink metrics using OpenTelemetry (OTEL) and various server systems.
+
+## Overview
+
+NativeLink exposes comprehensive metrics about cache operations and remote execution through OpenTelemetry. These metrics provide insights into:
+
+- **Cache Performance**: Hit rates, operation latencies, eviction rates
+- **Execution Pipeline**: Queue times, stage durations, success rates
+- **System Health**: Worker utilization, throughput, error rates
+
+## Quick Start
+
+### Using Docker Compose (Recommended for Development)
+
+1. Start the metrics stack:
+```bash
+cd deployment-examples/metrics
+docker-compose up -d
+```
+
+2. Configure NativeLink to send metrics to the collector:
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+export OTEL_SERVICE_NAME=nativelink
+export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=dev,nativelink.instance_name=main"
+```
+
+3. Start NativeLink with your configuration:
+```bash
+nativelink /path/to/config.json
+```
+
+4. Access the metrics:
+- Prometheus UI: http://localhost:9091
+- Grafana: http://localhost:3000 (admin/admin)
+- OTEL Collector metrics: http://localhost:8888/metrics
+
+### Using Kubernetes
+
+1. Deploy the OTEL Collector:
+```bash
+kubectl apply -f kubernetes/otel-collector.yaml
+```
+
+2. Deploy Prometheus with OTLP receiver enabled:
+```bash
+kubectl apply -f kubernetes/prometheus.yaml
+```
+
+3. Configure NativeLink deployment to send metrics:
+```yaml
+env:
+  - name: OTEL_EXPORTER_OTLP_ENDPOINT
+    value: "http://otel-collector:4317"
+  - name: OTEL_EXPORTER_OTLP_PROTOCOL
+    value: "grpc"
+  - name: OTEL_RESOURCE_ATTRIBUTES
+    value: "deployment.environment=prod,k8s.cluster.name=main"
+```
+
+## Metrics Catalog
+
+### Cache Metrics
+
+| Metric | Type | Description | Labels |
+|--------|------|-------------|--------|
+| `nativelink_cache_operations` | Counter | Total cache operations | `cache_type`, `cache_operation_name`, `cache_operation_result` |
+| `nativelink_cache_operation_duration` | Histogram | Operation latency in milliseconds | `cache_type`, `cache_operation_name` |
+| `nativelink_cache_io` | Counter | Bytes read/written | `cache_type`, `cache_operation_name` |
+| `nativelink_cache_size` | Gauge | Current cache size in bytes | `cache_type` |
+| `nativelink_cache_entries` | Gauge | Number of cached entries | `cache_type` |
+| `nativelink_cache_item_size` | Histogram | Size distribution of cache entries | `cache_type` |
+
+**Cache Operation Names:**
+- `read`: Data retrieval operations
+- `write`: Data storage operations
+- `delete`: Explicit removal operations
+- `evict`: Automatic evictions (LRU, TTL)
+
+**Cache Operation Results:**
+- `hit`: Data found and valid (reads)
+- `miss`: Data not found (reads)
+- `expired`: Data found but stale (reads)
+- `success`: Operation completed (writes/deletes)
+- `error`: Operation failed
+
+### Execution Metrics
+
+| Metric | Type | Description | Labels |
+|--------|------|-------------|--------|
+| `nativelink_execution_stage_duration` | Histogram | Time spent in each execution stage | `execution_stage` |
+| `nativelink_execution_total_duration` | Histogram | Total execution time from submission to completion | `execution_instance` |
+| `nativelink_execution_queue_time` | Histogram | Time spent waiting in queue | `execution_priority` |
+| `nativelink_execution_active_count` | Gauge | Current actions in each stage | `execution_stage` |
+| `nativelink_execution_completed_count` | Counter | Completed executions | `execution_result` |
+| `nativelink_execution_stage_transitions` | Counter | Stage
transition events | `execution_instance`, `execution_priority` | +| `nativelink_execution_output_size` | Histogram | Size of execution outputs | - | +| `nativelink_execution_retry_count` | Counter | Number of retries | - | + +**Execution Stages:** +- `unknown`: Initial state +- `cache_check`: Checking for cached results +- `queued`: Waiting for available worker +- `executing`: Running on worker +- `completed`: Finished execution + +**Execution Results:** +- `success`: Completed with exit code 0 +- `failure`: Completed with non-zero exit code +- `cancelled`: Execution was cancelled +- `timeout`: Execution timed out +- `cache_hit`: Result found in cache + +## Configuration + +### Environment Variables + +NativeLink uses standard OpenTelemetry environment variables: + +```bash +# OTLP Exporter Configuration +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 # Collector endpoint +OTEL_EXPORTER_OTLP_PROTOCOL=grpc # Protocol (grpc or http/protobuf) +OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token" # Optional auth headers +OTEL_EXPORTER_OTLP_COMPRESSION=gzip # Compression (none, gzip) + +# Resource Attributes +OTEL_SERVICE_NAME=nativelink # Service name (fixed) +OTEL_RESOURCE_ATTRIBUTES="key1=value1,key2=value2" # Custom attributes + +# Metric Export Configuration +OTEL_METRIC_EXPORT_INTERVAL=60000 # Export interval in ms (default: 60s) +OTEL_METRIC_EXPORT_TIMEOUT=30000 # Export timeout in ms (default: 30s) + +# Disable telemetry types +OTEL_TRACES_EXPORTER=none # Disable traces (if only metrics needed) +OTEL_LOGS_EXPORTER=none # Disable logs (if only metrics needed) +``` + +### Collector Configuration + +The OTEL Collector can be configured to: +1. Add resource attributes +2. Batch metrics for efficiency +3. Export to multiple metrics servers +4. Transform metric attributes + +See `otel-collector-config.yaml` for a complete example. + +## Server Options + +### Prometheus (Recommended) + +Prometheus offers native OTLP support and excellent query capabilities. 
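+
+Once metrics are flowing through either of the paths below, a quick sanity check is to list the NativeLink series (an illustrative query; the `nativelink_` prefix assumes the namespace set in this directory's collector config):
+
+```promql
+count by (__name__) ({__name__=~"nativelink_.*"})
+```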
+ +**Direct OTLP Ingestion:** +```bash +prometheus --web.enable-otlp-receiver \ + --storage.tsdb.out-of-order-time-window=30m +``` + +**Via Collector Scraping:** +```yaml +scrape_configs: + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] +``` + +### Grafana Cloud + +For managed metrics: +```yaml +exporters: + otlphttp: + endpoint: https://otlp-gateway-prod-us-central-0.grafana.net/otlp + headers: + Authorization: "Bearer ${GRAFANA_CLOUD_TOKEN}" +``` + +### ClickHouse + +For high-volume metrics storage: +```yaml +exporters: + clickhouse: + endpoint: tcp://clickhouse:9000 + database: metrics + ttl_days: 30 + logs_table: otel_logs + metrics_table: otel_metrics +``` + +### Quickwit + +For unified logs and metrics: +```yaml +exporters: + otlp: + endpoint: quickwit:7281 + headers: + "x-quickwit-index": "nativelink-metrics" +``` + +## Example Queries + +### Prometheus/PromQL + +**Cache hit rate:** +```promql +sum(rate(nativelink_cache_operations{cache_operation_result="hit"}[5m])) by (cache_type) / +sum(rate(nativelink_cache_operations{cache_operation_name="read"}[5m])) by (cache_type) +``` + +**Execution success rate:** +```promql +sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / +sum(rate(nativelink_execution_completed_count[5m])) +``` + +**Queue depth by priority:** +```promql +sum(nativelink_execution_active_count{execution_stage="queued"}) by (execution_priority) +``` + +**P95 cache operation latency:** +```promql +histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket[5m])) by (le, cache_type) +) +``` + +**Worker utilization:** +```promql +count(nativelink_execution_active_count{execution_stage="executing"} > 0) / +count(count by (execution_worker_id) (nativelink_execution_active_count)) +``` + +### Joining with Resource Attributes + +Use `target_info` to join resource attributes: +```promql +rate(nativelink_execution_completed_count[5m]) +* on (job, instance) group_left (k8s_cluster_name, deployment_environment) +target_info +``` + +## Dashboards + +### Grafana Dashboard + +Import the included dashboard for a comprehensive view: +```bash +# Import via API +curl -X POST http://admin:admin@localhost:3000/api/dashboards/db \ + -H "Content-Type: application/json" \ + -d @grafana-dashboard.json + +# Or import via UI at http://localhost:3000 +``` + +Key panels include: +- Execution pipeline overview +- Cache performance metrics +- Worker utilization heatmap +- Error rate tracking +- Queue depth over time +- Stage duration percentiles + +## Alerting + +### Example Alert Rules + +```yaml +groups: + - name: nativelink_alerts + rules: + - alert: HighErrorRate + expr: | + (1 - ( + sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count[5m])) + )) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "High execution error rate ({{ $value | humanizePercentage }})" + + - alert: CacheMissRateHigh + expr: | + (1 - nativelink:cache_hit_rate) > 0.5 + for: 10m + labels: + severity: info + annotations: + summary: "Cache miss rate above 50% for {{ $labels.cache_type }}" + + - alert: QueueBacklog + expr: | + sum(nativelink_execution_active_count{execution_stage="queued"}) > 100 + for: 15m + labels: + severity: warning + annotations: + summary: "Queue backlog above 100 actions" + + - alert: WorkerUtilizationLow + expr: | + nativelink:worker_utilization < 0.3 + for: 30m + labels: + severity: info + annotations: + summary: "Worker 
utilization below 30%" +``` + +## Troubleshooting + +### No Metrics Appearing + +1. Check NativeLink is configured with OTEL environment variables: +```bash +ps aux | grep nativelink | grep OTEL +``` + +2. Verify collector is receiving data: +```bash +curl http://localhost:13133/health +curl http://localhost:8888/metrics | grep otelcol_receiver_accepted_metric_points +``` + +3. Check collector logs: +```bash +docker logs otel-collector +# or +kubectl logs -l app=otel-collector +``` + +### High Memory Usage + +1. Adjust collector batch size: +```yaml +processors: + batch: + send_batch_size: 512 # Reduce from 1024 +``` + +2. Increase memory limits: +```yaml +memory_limiter: + limit_mib: 1024 # Increase from 512 +``` + +3. Reduce metric cardinality by dropping labels: +```yaml +processors: + attributes: + actions: + - key: unnecessary_label + action: delete +``` + +### Out-of-Order Samples + +Enable out-of-order ingestion in Prometheus: +```yaml +storage: + tsdb: + out_of_order_time_window: 1h # Increase from 30m +``` + +### Missing Resource Attributes + +Ensure attributes are promoted in Prometheus: +```yaml +otlp: + promote_resource_attributes: + - your.custom.attribute +``` + +## Performance Tuning + +### Collector Optimization + +1. **Batching**: Adjust batch processor settings based on volume +2. **Compression**: Enable gzip for network efficiency +3. **Sampling**: Use tail sampling for high-volume traces +4. **Filtering**: Drop unnecessary metrics at collector level + +### Prometheus Optimization + +1. **Recording Rules**: Pre-calculate expensive queries +2. **Retention**: Set appropriate retention periods +3. **Downsampling**: Use Thanos or Cortex for long-term storage +4. **Federation**: Split metrics across multiple Prometheus instances + +### NativeLink Optimization + +1. **Export Interval**: Increase `OTEL_METRIC_EXPORT_INTERVAL` to reduce overhead +2. **Resource Attributes**: Minimize cardinality of custom attributes +3. 
**Metric Selection**: Disable unused metric types if needed + +## Additional Resources + +- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) +- [Prometheus Best Practices](https://prometheus.io/docs/practices/) +- [OTEL Collector Configuration](https://opentelemetry.io/docs/collector/configuration/) +- [NativeLink Documentation](https://nativelink.com/docs) +- [Grafana Dashboard Examples](https://grafana.com/grafana/dashboards/) + +## Support + +For issues or questions: +- File an issue: https://github.com/TraceMachina/nativelink/issues +- Join our Discord: https://discord.gg/nativelink +- Check documentation: https://nativelink.com/docs diff --git a/deployment-examples/metrics/docker-compose.yaml b/deployment-examples/metrics/docker-compose.yaml new file mode 100644 index 00000000..9a943b30 --- /dev/null +++ b/deployment-examples/metrics/docker-compose.yaml @@ -0,0 +1,139 @@ +version: '3.8' + +services: + # OpenTelemetry Collector + otel-collector: + image: otel/opentelemetry-collector-contrib:0.98.0 + container_name: otel-collector + restart: unless-stopped + command: ["--config=/etc/otel-collector/config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otel-collector/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + - "9090:9090" # Prometheus metrics exporter + - "8888:8888" # Collector metrics + - "13133:13133" # Health check + environment: + - OTLP_BACKEND_ENDPOINT=${OTLP_BACKEND_ENDPOINT:-otlp-backend:4317} + - OTLP_BACKEND_TOKEN=${OTLP_BACKEND_TOKEN:-} + networks: + - metrics + + # Prometheus with OTLP support + prometheus: + image: prom/prometheus:v2.50.0 + container_name: prometheus + restart: unless-stopped + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-otlp-receiver' # Enable OTLP receiver + - '--storage.tsdb.retention.time=30d' + - '--storage.tsdb.out-of-order-time-window=30m' # Handle out-of-order samples + volumes: + - ./prometheus-config.yaml:/etc/prometheus/prometheus.yml:ro + - ./prometheus-recording-rules.yml:/etc/prometheus/rules/nativelink.yml:ro + - prometheus_data:/prometheus + ports: + - "9091:9090" # Prometheus web UI (different port to avoid conflict with collector) + networks: + - metrics + depends_on: + - otel-collector + + # Grafana for visualization + grafana: + image: grafana/grafana:10.3.0 + container_name: grafana + restart: unless-stopped + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_INSTALL_PLUGINS=grafana-piechart-panel + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - metrics + depends_on: + - prometheus + + # Optional: AlertManager for alerts + alertmanager: + image: prom/alertmanager:v0.27.0 + container_name: alertmanager + restart: unless-stopped + volumes: + - ./alertmanager-config.yml:/etc/alertmanager/config.yml:ro + - alertmanager_data:/alertmanager + ports: + - "9093:9093" + command: + - '--config.file=/etc/alertmanager/config.yml' + - '--storage.path=/alertmanager' + networks: + - metrics + + # Optional: Node exporter for host metrics + node-exporter: + image: prom/node-exporter:v1.7.0 + container_name: node-exporter + restart: 
unless-stopped + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + ports: + - "9100:9100" + networks: + - metrics + + # Optional: Jaeger for trace visualization (if traces are enabled) + jaeger: + image: jaegertracing/all-in-one:1.53 + container_name: jaeger + restart: unless-stopped + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "16686:16686" # Jaeger UI + - "14268:14268" # Jaeger collector HTTP + networks: + - metrics + +volumes: + prometheus_data: + grafana_data: + alertmanager_data: + +networks: + metrics: + driver: bridge + +# Usage Instructions: +# 1. Start the stack: docker-compose up -d +# 2. Configure NativeLink with these environment variables: +# export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +# export OTEL_EXPORTER_OTLP_PROTOCOL=grpc +# export OTEL_SERVICE_NAME=nativelink +# export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=dev" +# 3. Access services: +# - Prometheus: http://localhost:9091 +# - Grafana: http://localhost:3000 (admin/admin) +# - Jaeger: http://localhost:16686 +# - AlertManager: http://localhost:9093 +# - OTEL Collector metrics: http://localhost:8888/metrics diff --git a/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml b/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml new file mode 100644 index 00000000..20e6f666 --- /dev/null +++ b/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'NativeLink Dashboards' + orgId: 1 + folder: 'NativeLink' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards diff --git a/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml b/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 00000000..e553ac28 --- /dev/null +++ b/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,23 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + timeInterval: "15s" + queryTimeout: "60s" + httpMethod: POST + + - name: OTEL-Collector-Prometheus + type: prometheus + access: proxy + url: http://otel-collector:9090 + editable: true + jsonData: + timeInterval: "15s" + queryTimeout: "60s" + httpMethod: POST diff --git a/deployment-examples/metrics/kubernetes/otel-collector.yaml b/deployment-examples/metrics/kubernetes/otel-collector.yaml new file mode 100644 index 00000000..739eecf6 --- /dev/null +++ b/deployment-examples/metrics/kubernetes/otel-collector.yaml @@ -0,0 +1,274 @@ +# OpenTelemetry Collector Deployment for NativeLink Metrics +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: nativelink +data: + collector.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + resource: + attributes: + - key: service.namespace + value: nativelink + action: upsert + - key: k8s.cluster.name + from_attribute: K8S_CLUSTER_NAME + action: insert + - key: deployment.environment + from_attribute: DEPLOYMENT_ENV + action: insert + + transform/nativelink: + metric_statements: + - context: datapoint + 
statements: + - set(attributes["instance_name"], resource.attributes["nativelink.instance_name"]) + where resource.attributes["nativelink.instance_name"] != nil + + batch: + timeout: 10s + send_batch_size: 1024 + + memory_limiter: + check_interval: 1s + limit_mib: 1024 + spike_limit_mib: 256 + + exporters: + prometheus: + endpoint: 0.0.0.0:9090 + namespace: nativelink + resource_to_telemetry_conversion: + enabled: true + enable_open_metrics: true + + otlphttp/prometheus: + endpoint: http://prometheus:9090/api/v1/otlp/v1/metrics + compression: gzip + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 + zpages: + endpoint: 0.0.0.0:55679 + + service: + extensions: [health_check, pprof, zpages] + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [prometheus] + metrics/prometheus_otlp: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [otlphttp/prometheus] + + telemetry: + logs: + level: info + metrics: + level: detailed + address: 0.0.0.0:8888 + +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: nativelink + labels: + app: otel-collector +spec: + type: ClusterIP + selector: + app: otel-collector + ports: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP + - name: otlp-http + port: 4318 + targetPort: 4318 + protocol: TCP + - name: prometheus + port: 9090 + targetPort: 9090 + protocol: TCP + - name: metrics + port: 8888 + targetPort: 8888 + protocol: TCP + - name: health + port: 13133 + targetPort: 13133 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: nativelink + labels: + app: otel-collector +spec: + replicas: 2 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.98.0 + args: + - "--config=/conf/collector.yaml" + ports: + - containerPort: 4317 + name: otlp-grpc + - containerPort: 4318 + name: otlp-http + - containerPort: 9090 + name: prometheus + - containerPort: 8888 + name: metrics + - containerPort: 13133 + name: health + env: + - name: K8S_CLUSTER_NAME + value: "nativelink-cluster" + - name: DEPLOYMENT_ENV + value: "production" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: config + mountPath: /conf + resources: + requests: + memory: "512Mi" + cpu: "200m" + limits: + memory: "1Gi" + cpu: "1000m" + livenessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: otel-collector-config + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: nativelink + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: otel-collector +rules: + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "watch", "list"] + - apiGroups: ["apps"] + resources: ["deployments", "daemonsets", "statefulsets", "replicasets"] + verbs: ["get", "watch", "list"] + - apiGroups: ["batch"] + 
resources: ["jobs", "cronjobs"] + verbs: ["get", "watch", "list"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: otel-collector +subjects: + - kind: ServiceAccount + name: otel-collector + namespace: nativelink + +--- +# HorizontalPodAutoscaler for OTEL Collector +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: otel-collector + namespace: nativelink +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: otel-collector + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + +--- +# PodDisruptionBudget for high availability +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: otel-collector + namespace: nativelink +spec: + minAvailable: 1 + selector: + matchLabels: + app: otel-collector diff --git a/deployment-examples/metrics/kubernetes/prometheus.yaml b/deployment-examples/metrics/kubernetes/prometheus.yaml new file mode 100644 index 00000000..eacc026b --- /dev/null +++ b/deployment-examples/metrics/kubernetes/prometheus.yaml @@ -0,0 +1,338 @@ +# Prometheus Deployment for NativeLink Metrics +apiVersion: v1 +kind: Namespace +metadata: + name: nativelink +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: nativelink +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'nativelink-k8s' + environment: 'production' + + # OTLP configuration (requires --web.enable-otlp-receiver flag) + otlp: + promote_resource_attributes: + - service.instance.id + - service.name + - service.namespace + - service.version + - cloud.availability_zone + - cloud.region + - container.name + - deployment.environment + - k8s.cluster.name + - k8s.container.name + - k8s.deployment.name + - k8s.namespace.name + - k8s.pod.name + - k8s.statefulset.name + - nativelink.instance_name + - nativelink.worker_id + - nativelink.scheduler_name + + keep_identifying_resource_attributes: true + translation_strategy: NoUTF8EscapingWithSuffixes + + storage: + tsdb: + out_of_order_time_window: 30m + retention.time: 30d + + scrape_configs: + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] + metric_relabel_configs: + - source_labels: [__name__] + regex: '(nativelink_.*)' + target_label: __name__ + replacement: '${1}' + + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Kubernetes service discovery for NativeLink pods + - job_name: 'nativelink-pods' + kubernetes_sd_configs: + - role: pod + namespaces: + names: ['nativelink'] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + 
target_label: kubernetes_pod_name + + rule_files: + - /etc/prometheus/rules/*.yml + + alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-rules + namespace: nativelink +data: + nativelink-rules.yml: | + groups: + - name: nativelink_alerts + interval: 30s + rules: + - alert: NativeLinkHighErrorRate + expr: | + (1 - ( + sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count[5m])) + )) > 0.05 + for: 5m + labels: + severity: warning + component: nativelink + annotations: + summary: "High execution error rate ({{ $value | humanizePercentage }})" + description: "NativeLink execution error rate is above 5% for the last 5 minutes" + + - alert: NativeLinkCacheMissRateHigh + expr: | + (1 - ( + sum(rate(nativelink_cache_operations{cache_operation_result="hit"}[5m])) by (cache_type) / + sum(rate(nativelink_cache_operations{cache_operation_name="read"}[5m])) by (cache_type) + )) > 0.5 + for: 10m + labels: + severity: info + component: nativelink + annotations: + summary: "Cache miss rate above 50% for {{ $labels.cache_type }}" + description: "Cache {{ $labels.cache_type }} has a miss rate above 50% for 10 minutes" + + - alert: NativeLinkQueueBacklog + expr: | + sum(nativelink_execution_active_count{execution_stage="queued"}) > 100 + for: 15m + labels: + severity: warning + component: nativelink + annotations: + summary: "Execution queue backlog above 100 actions" + description: "{{ $value }} actions are queued for execution" + + - alert: NativeLinkWorkerUtilizationLow + expr: | + count(nativelink_execution_active_count{execution_stage="executing"} > 0) / + count(count by (execution_worker_id) (nativelink_execution_active_count)) < 0.3 + for: 30m + labels: + severity: info + component: nativelink + annotations: + summary: "Worker utilization below 30%" + description: "Only {{ $value | humanizePercentage }} of workers are active" + + - alert: NativeLinkCacheEvictionRateHigh + expr: | + sum(rate(nativelink_cache_operations{cache_operation_name="evict"}[5m])) by (cache_type) > 10 + for: 10m + labels: + severity: warning + component: nativelink + annotations: + summary: "High cache eviction rate for {{ $labels.cache_type }}" + description: "Cache {{ $labels.cache_type }} is evicting {{ $value }} items per second" + +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: nativelink + labels: + app: prometheus +spec: + type: ClusterIP + selector: + app: prometheus + ports: + - name: web + port: 9090 + targetPort: 9090 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus + namespace: nativelink + labels: + app: prometheus +spec: + serviceName: prometheus + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + serviceAccountName: prometheus + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: prometheus + image: prom/prometheus:v2.50.0 + args: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-otlp-receiver' + - '--storage.tsdb.retention.time=30d' + - '--storage.tsdb.out-of-order-time-window=30m' + ports: + - containerPort: 9090 + name: web + volumeMounts: + - name: 
config + mountPath: /etc/prometheus + - name: rules + mountPath: /etc/prometheus/rules + - name: storage + mountPath: /prometheus + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "2000m" + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: prometheus-config + - name: rules + configMap: + name: prometheus-rules + volumeClaimTemplates: + - metadata: + name: storage + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: nativelink + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: nativelink + +--- +# Ingress for external access (optional) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus + namespace: nativelink + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / +spec: + ingressClassName: nginx + rules: + - host: prometheus.nativelink.local + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 diff --git a/deployment-examples/metrics/otel-collector-config.yaml b/deployment-examples/metrics/otel-collector-config.yaml new file mode 100644 index 00000000..fe90896f --- /dev/null +++ b/deployment-examples/metrics/otel-collector-config.yaml @@ -0,0 +1,139 @@ +# OpenTelemetry Collector Configuration for NativeLink Metrics +# This configuration receives metrics from NativeLink via OTLP and exports them to various backends + +receivers: + # Receive metrics from NativeLink via OTLP gRPC + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + # Add resource attributes for better metric identification + resource: + attributes: + - key: service.namespace + value: nativelink + action: upsert + - key: deployment.environment + from_attribute: deployment_environment + action: insert + - key: deployment.region + from_attribute: deployment_region + action: insert + + # Transform metrics to add NativeLink-specific attributes + transform/nativelink: + metric_statements: + - context: datapoint + statements: + # Add instance name from resource attributes if available + - set(attributes["instance_name"], resource.attributes["nativelink.instance_name"]) + where resource.attributes["nativelink.instance_name"] != nil + + # Batch metrics for efficiency + batch: + timeout: 10s + send_batch_size: 1024 + send_batch_max_size: 2048 + + # Add memory limiter to prevent OOM + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + +exporters: + # Export metrics to Prometheus format + prometheus: + endpoint: 0.0.0.0:9090 + namespace: nativelink + const_labels: + service: nativelink + 
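+    # Copy OTLP resource attributes (service.name, nativelink.instance_name,
+    # and so on) onto each exported series as labels; enable_open_metrics
+    # additionally exposes exemplars in the scrape output.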
resource_to_telemetry_conversion: + enabled: true + enable_open_metrics: true + # Add metric descriptions for NativeLink metrics + metric_expiration: 10m + + # Direct OTLP export to Prometheus (when Prometheus has OTLP receiver enabled) + otlphttp/prometheus: + endpoint: http://prometheus:9090/api/v1/otlp/v1/metrics + compression: gzip + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + # Export to other OTLP backends (e.g., Grafana Cloud, ClickHouse) + otlp/backend: + endpoint: "${OTLP_BACKEND_ENDPOINT}" + compression: gzip + headers: + Authorization: "Bearer ${OTLP_BACKEND_TOKEN}" + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + # Debug exporter for troubleshooting + debug: + verbosity: detailed + sampling_initial: 5 + sampling_thereafter: 200 + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + path: /health + check_collector_pipeline: + enabled: true + interval: 15s + exporter_failure_threshold: 5 + + pprof: + endpoint: 0.0.0.0:1777 + + zpages: + endpoint: 0.0.0.0:55679 + +service: + extensions: [health_check, pprof, zpages] + pipelines: + # Main metrics pipeline - exports to Prometheus scrape endpoint + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [prometheus] + + # Direct to Prometheus OTLP endpoint (if enabled) + metrics/prometheus_otlp: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [otlphttp/prometheus] + + # Optional: Send to additional backend + # Uncomment and configure OTLP_BACKEND_ENDPOINT environment variable + # metrics/backend: + # receivers: [otlp] + # processors: [memory_limiter, resource, transform/nativelink, batch] + # exporters: [otlp/backend] + + # Debug pipeline for development + # metrics/debug: + # receivers: [otlp] + # processors: [memory_limiter] + # exporters: [debug] + + telemetry: + logs: + level: info + initial_fields: + service: otel-collector + metrics: + level: detailed + address: 0.0.0.0:8888 diff --git a/deployment-examples/metrics/prometheus-config.yaml b/deployment-examples/metrics/prometheus-config.yaml new file mode 100644 index 00000000..40e0e952 --- /dev/null +++ b/deployment-examples/metrics/prometheus-config.yaml @@ -0,0 +1,169 @@ +# Prometheus Configuration for NativeLink Metrics +# This configuration sets up Prometheus to receive metrics via OTLP and scrape format + +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'nativelink-cluster' + environment: 'production' + +# Enable OTLP receiver (requires --web.enable-otlp-receiver flag) +otlp: + # Promote NativeLink-specific resource attributes to labels + promote_resource_attributes: + - service.instance.id + - service.name + - service.namespace + - service.version + # Cloud/Infrastructure attributes + - cloud.availability_zone + - cloud.region + - container.name + - deployment.environment + - deployment.environment.name + # Kubernetes attributes + - k8s.cluster.name + - k8s.container.name + - k8s.cronjob.name + - k8s.daemonset.name + - k8s.deployment.name + - k8s.job.name + - k8s.namespace.name + - k8s.pod.name + - k8s.replicaset.name + - k8s.statefulset.name + # NativeLink-specific attributes + - nativelink.instance_name + - nativelink.worker_id + - nativelink.scheduler_name + + # Keep identifying resource attributes in target_info + keep_identifying_resource_attributes: true + + # Use NoTranslation to preserve metric 
names with UTF-8 support + # This keeps OpenTelemetry semantic convention names intact + translation_strategy: NoUTF8EscapingWithSuffixes + +# Storage configuration for handling out-of-order samples +storage: + tsdb: + # Allow 30 minutes of out-of-order samples (for batched OTLP data) + out_of_order_time_window: 30m + # Retention period for metrics + retention.time: 30d + # Maximum number of concurrent queries + max_concurrent_queries: 20 + +# Scrape configurations +scrape_configs: + # Scrape the OTEL Collector's Prometheus endpoint + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] + metric_relabel_configs: + # Add nativelink prefix to all metrics from collector + - source_labels: [__name__] + regex: '(nativelink_.*)' + target_label: __name__ + replacement: '${1}' + + # Scrape Prometheus's own metrics + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Optional: Direct scrape of NativeLink instances (if metrics endpoint is exposed) + # - job_name: 'nativelink-direct' + # static_configs: + # - targets: ['nativelink-cas:8080', 'nativelink-scheduler:8080'] + # metrics_path: '/metrics' + +# Recording rules for common NativeLink queries +rule_files: + - /etc/prometheus/rules/*.yml + +# Alerting configuration +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +# Example recording rules for NativeLink metrics +# Save this as a separate file: rules/nativelink-recording-rules.yml +# rule_files content example: +--- +# Recording Rules for NativeLink Metrics +groups: + - name: nativelink_execution + interval: 30s + rules: + # Execution success rate + - record: nativelink:execution_success_rate + expr: | + sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count[5m])) + + # Average queue time + - record: nativelink:execution_queue_time_avg + expr: | + histogram_quantile(0.5, + sum(rate(nativelink_execution_queue_time_bucket[5m])) by (le, instance_name) + ) + + # Actions per stage + - record: nativelink:execution_active_by_stage + expr: | + sum(nativelink_execution_active_count) by (execution_stage, instance_name) + + # Stage transition rate + - record: nativelink:stage_transition_rate + expr: | + sum(rate(nativelink_execution_stage_transitions[5m])) by (instance_name) + + - name: nativelink_cache + interval: 30s + rules: + # Cache hit rate + - record: nativelink:cache_hit_rate + expr: | + sum(rate(nativelink_cache_operations{cache_operation_result="hit"}[5m])) by (cache_type) / + sum(rate(nativelink_cache_operations{cache_operation_name="read"}[5m])) by (cache_type) + + # Cache operation latency p95 + - record: nativelink:cache_operation_latency_p95 + expr: | + histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket[5m])) by (le, cache_type, cache_operation_name) + ) + + # Cache size utilization + - record: nativelink:cache_size_bytes + expr: | + sum(nativelink_cache_size) by (cache_type, instance_name) + + # Cache eviction rate + - record: nativelink:cache_eviction_rate + expr: | + sum(rate(nativelink_cache_operations{cache_operation_name="evict"}[5m])) by (cache_type) + + - name: nativelink_performance + interval: 60s + rules: + # Overall system throughput + - record: nativelink:system_throughput + expr: | + sum(rate(nativelink_execution_completed_count[5m])) + + # Worker utilization + - record: nativelink:worker_utilization + expr: | + sum(nativelink_execution_active_count{execution_stage="executing"}) by 
(execution_worker_id) / + count(count by (execution_worker_id) (nativelink_execution_active_count)) + + # Action completion time (from queued to completed) + - record: nativelink:action_total_duration_p99 + expr: | + histogram_quantile(0.99, + sum(rate(nativelink_execution_total_duration_bucket[5m])) by (le, instance_name) + ) diff --git a/deployment-examples/metrics/prometheus-recording-rules.yml b/deployment-examples/metrics/prometheus-recording-rules.yml new file mode 100644 index 00000000..665a4e87 --- /dev/null +++ b/deployment-examples/metrics/prometheus-recording-rules.yml @@ -0,0 +1,277 @@ +# Recording Rules for NativeLink Metrics +# These rules pre-calculate common queries for better dashboard performance + +groups: + - name: nativelink_execution + interval: 30s + rules: + # Execution success rate by instance + - record: nativelink:execution_success_rate + expr: | + sum by (instance_name, execution_instance) ( + rate(nativelink_execution_completed_count{execution_result="success"}[5m]) + ) / + sum by (instance_name, execution_instance) ( + rate(nativelink_execution_completed_count[5m]) + ) + + # Cache hit rate from executions + - record: nativelink:execution_cache_hit_rate + expr: | + sum by (instance_name) ( + rate(nativelink_execution_completed_count{execution_result="cache_hit"}[5m]) + ) / + sum by (instance_name) ( + rate(nativelink_execution_completed_count[5m]) + ) + + # Average queue time (median) + - record: nativelink:execution_queue_time_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_queue_time_bucket[5m]) + ) + ) + + # Queue time 95th percentile + - record: nativelink:execution_queue_time_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_queue_time_bucket[5m]) + ) + ) + + # Actions currently in each stage + - record: nativelink:execution_active_by_stage + expr: | + sum by (execution_stage, instance_name, execution_instance) ( + nativelink_execution_active_count + ) + + # Stage transition rate + - record: nativelink:stage_transition_rate + expr: | + sum by (instance_name, execution_instance, execution_priority) ( + rate(nativelink_execution_stage_transitions[5m]) + ) + + # Execution duration by stage (p50, p95, p99) + - record: nativelink:execution_stage_duration_p50 + expr: | + histogram_quantile(0.5, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_stage_duration_p95 + expr: | + histogram_quantile(0.95, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_stage_duration_p99 + expr: | + histogram_quantile(0.99, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + # Total execution time from submission to completion + - record: nativelink:execution_total_duration_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_total_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_total_duration_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_total_duration_bucket[5m]) + ) + ) + + # Execution output size distribution + - record: nativelink:execution_output_size_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name) ( + 
rate(nativelink_execution_output_size_bucket[5m]) + ) + ) + + - record: nativelink:execution_output_size_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name) ( + rate(nativelink_execution_output_size_bucket[5m]) + ) + ) + + - name: nativelink_cache + interval: 30s + rules: + # Cache hit rate by operation and cache type + - record: nativelink:cache_hit_rate + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations{cache_operation_result="hit"}[5m]) + ) / + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations{cache_operation_name="read"}[5m]) + ) + + # Cache operation latency percentiles + - record: nativelink:cache_operation_latency_p50 + expr: | + histogram_quantile(0.5, + sum by (le, cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + - record: nativelink:cache_operation_latency_p95 + expr: | + histogram_quantile(0.95, + sum by (le, cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + - record: nativelink:cache_operation_latency_p99 + expr: | + histogram_quantile(0.99, + sum by (le, cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + # Cache size and entry count + - record: nativelink:cache_size_bytes + expr: | + sum by (cache_type, instance_name) (nativelink_cache_size) + + - record: nativelink:cache_entry_count + expr: | + sum by (cache_type, instance_name) (nativelink_cache_entries) + + # Cache eviction rate + - record: nativelink:cache_eviction_rate + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations{cache_operation_name="evict"}[5m]) + ) + + # Cache throughput (bytes/sec) + - record: nativelink:cache_read_throughput_bytes + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_io{cache_operation_name="read"}[5m]) + ) + + - record: nativelink:cache_write_throughput_bytes + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_io{cache_operation_name="write"}[5m]) + ) + + # Cache error rate + - record: nativelink:cache_error_rate + expr: | + sum by (cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operations{cache_operation_result="error"}[5m]) + ) + + - name: nativelink_performance + interval: 60s + rules: + # Overall system throughput (actions/sec) + - record: nativelink:system_throughput + expr: | + sum(rate(nativelink_execution_completed_count[5m])) + + # System success rate + - record: nativelink:system_success_rate + expr: | + sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count[5m])) + + # Worker utilization (percentage of workers executing) + - record: nativelink:worker_utilization + expr: | + count by (instance_name) ( + nativelink_execution_active_count{execution_stage="executing"} > 0 + ) / + count by (instance_name) ( + nativelink_execution_active_count + ) + + # Queue depth (actions waiting) + - record: nativelink:queue_depth + expr: | + sum by (instance_name, execution_priority) ( + nativelink_execution_active_count{execution_stage="queued"} + ) + + # Average actions per worker + - record: nativelink:actions_per_worker + expr: | + sum by (execution_worker_id) ( + nativelink_execution_active_count{execution_stage="executing"} + ) + + # Memory usage estimation from output sizes + - record: nativelink:estimated_memory_usage_bytes + expr: | + sum by 
(instance_name) ( + nativelink_execution_output_size_sum + ) + + # Retry rate + - record: nativelink:execution_retry_rate + expr: | + sum by (instance_name) ( + rate(nativelink_execution_retry_count[5m]) + ) + + - name: nativelink_slo + interval: 60s + rules: + # SLO: 99% of executions should complete successfully + - record: nativelink:slo_execution_success_rate + expr: | + sum(rate(nativelink_execution_completed_count{execution_result="success"}[1h])) / + sum(rate(nativelink_execution_completed_count[1h])) + + # SLO: 95% of cache reads should be under 100ms + - record: nativelink:slo_cache_read_latency + expr: | + histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket{cache_operation_name="read"}[1h])) by (le) + ) < 0.1 + + # SLO: Queue time should be under 30s for 90% of actions + - record: nativelink:slo_queue_time + expr: | + histogram_quantile(0.9, + sum(rate(nativelink_execution_queue_time_bucket[1h])) by (le) + ) < 30 + + # Error budget remaining (based on 99% success SLO) + - record: nativelink:error_budget_remaining + expr: | + 1 - ( + (1 - 0.99) - + (1 - ( + sum(rate(nativelink_execution_completed_count{execution_result="success"}[30d])) / + sum(rate(nativelink_execution_completed_count[30d])) + )) + ) / (1 - 0.99) diff --git a/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx b/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx new file mode 100644 index 00000000..ee277011 --- /dev/null +++ b/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx @@ -0,0 +1,420 @@ +--- +title: Metrics and Observability +description: 'Configure OpenTelemetry metrics collection for NativeLink' +--- + +import { Tabs, TabItem } from '@astrojs/starlight/components'; + +NativeLink provides comprehensive metrics through OpenTelemetry (OTEL), enabling deep insights into cache performance, remote execution pipelines, and system health. + +## Overview + +NativeLink automatically exports metrics when configured with OTEL environment variables. 
The metrics cover:
+
+- **Cache Operations**: Hit rates, latencies, evictions
+- **Execution Pipeline**: Queue depths, stage durations, success rates
+- **System Health**: Worker utilization, throughput, error rates
+
+## Quick Start
+
+<Tabs>
+<TabItem label="Docker Compose">
+
+```bash
+# Clone the repository
+git clone https://github.com/TraceMachina/nativelink
+cd nativelink/deployment-examples/metrics
+
+# Start the metrics stack
+docker-compose up -d
+
+# Configure NativeLink
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+export OTEL_SERVICE_NAME=nativelink
+export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=dev"
+
+# Run NativeLink
+nativelink /path/to/config.json
+```
+
+Access the services:
+- Prometheus: http://localhost:9091
+- Grafana: http://localhost:3000 (admin/admin)
+- OTEL Collector: http://localhost:8888/metrics
+
+</TabItem>
+<TabItem label="Kubernetes">
+
+```bash
+# Create namespace
+kubectl create namespace nativelink
+
+# Deploy OTEL Collector
+kubectl apply -f deployment-examples/metrics/kubernetes/otel-collector.yaml
+
+# Deploy Prometheus
+kubectl apply -f deployment-examples/metrics/kubernetes/prometheus.yaml
+
+# Configure NativeLink pods
+kubectl set env deployment/nativelink \
+  OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 \
+  OTEL_EXPORTER_OTLP_PROTOCOL=grpc \
+  OTEL_RESOURCE_ATTRIBUTES="k8s.cluster.name=main"
+```
+
+</TabItem>
+<TabItem label="Standalone Prometheus">
+
+```bash
+# Start Prometheus with OTLP receiver
+prometheus \
+  --web.enable-otlp-receiver \
+  --storage.tsdb.out-of-order-time-window=30m \
+  --config.file=prometheus.yml
+
+# Configure NativeLink
+export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
+export OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=http://localhost:9090/api/v1/otlp/v1/metrics
+export OTEL_SERVICE_NAME=nativelink
+export OTEL_RESOURCE_ATTRIBUTES="service.instance.id=$(uuidgen)"
+
+# Disable traces and logs
+export OTEL_TRACES_EXPORTER=none
+export OTEL_LOGS_EXPORTER=none
+```
+
+</TabItem>
+</Tabs>
+
+## Configuration
+
+### Environment Variables
+
+NativeLink uses standard OpenTelemetry environment variables:
+
+```bash
+# Core OTLP Configuration
+OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+OTEL_EXPORTER_OTLP_PROTOCOL=grpc  # or http/protobuf
+OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token"
+OTEL_EXPORTER_OTLP_COMPRESSION=gzip
+
+# Resource Attributes (customize for your deployment)
+OTEL_SERVICE_NAME=nativelink  # Keep fixed so dashboards and alerts match
+OTEL_RESOURCE_ATTRIBUTES="deployment.environment=prod,region=us-east-1"
+
+# Metric Export Intervals
+OTEL_METRIC_EXPORT_INTERVAL=60000  # 60 seconds
+OTEL_METRIC_EXPORT_TIMEOUT=30000   # 30 seconds
+```
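+
+On Kubernetes you can set the same variables declaratively instead of with `kubectl set env`. A minimal sketch, assuming a Deployment named `nativelink` as in the quick start (names and values here are illustrative):
+
+```yaml
+# Fragment of a Deployment manifest; merge into your existing pod spec.
+spec:
+  template:
+    spec:
+      containers:
+        - name: nativelink
+          env:
+            - name: OTEL_EXPORTER_OTLP_ENDPOINT
+              value: http://otel-collector:4317
+            - name: OTEL_EXPORTER_OTLP_PROTOCOL
+              value: grpc
+            - name: OTEL_SERVICE_NAME
+              value: nativelink
+            - name: OTEL_RESOURCE_ATTRIBUTES
+              value: deployment.environment=prod,k8s.cluster.name=main
+```
+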
+### Collector Configuration
+
+The OTEL Collector adds resource attributes and batches metrics:
+
+```yaml
+processors:
+  resource:
+    attributes:
+      - key: service.namespace
+        value: nativelink
+        action: upsert
+  batch:
+    timeout: 10s
+    send_batch_size: 1024
+```
+
+## Metrics Reference
+
+### Cache Metrics
+
+Monitor cache performance and efficiency:
+
+| Metric | Description | Key Labels |
+|--------|-------------|------------|
+| `nativelink_cache_operations` | Operation count by type and result | `cache_type`, `cache_operation_name`, `cache_operation_result` |
+| `nativelink_cache_operation_duration` | Operation latency histogram | `cache_type`, `cache_operation_name` |
+| `nativelink:cache_hit_rate` | Calculated hit rate (recording rule) | `cache_type` |
+| `nativelink_cache_size` | Current cache size in bytes | `cache_type` |
+| `nativelink:cache_eviction_rate` | Evictions per second (recording rule) | `cache_type` |
+
+### Execution Metrics
+
+Track remote execution pipeline performance:
+
+| Metric | Description | Key Labels |
+|--------|-------------|------------|
+| `nativelink_execution_active_count` | Actions in each stage | `execution_stage` |
+| `nativelink_execution_completed_count` | Completed actions | `execution_result` |
+| `nativelink_execution_queue_time` | Queue wait time histogram | `execution_priority` |
+| `nativelink_execution_stage_duration` | Time per stage | `execution_stage` |
+| `nativelink:execution_success_rate` | Success percentage (recording rule) | `instance_name` |
+
+### Execution Stages
+
+Actions progress through these stages:
+
+1. `unknown` - Initial state
+2. `cache_check` - Checking for cached results
+3. `queued` - Waiting for worker
+4. `executing` - Running on worker
+5. `completed` - Finished (success/failure/cache_hit)
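+
+The stage names map directly onto the `execution_stage` label, so latency can be broken down per stage. A sketch using the stage-duration histogram that the recording rules also consume (bucket boundaries are whatever NativeLink exports):
+
+```promql
+# P95 time spent in each execution stage over the last 5 minutes
+histogram_quantile(0.95,
+  sum by (le, execution_stage) (
+    rate(nativelink_execution_stage_duration_bucket[5m])
+  )
+)
+```
+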
+## Example Queries
+
+### Cache Performance
+
+```promql
+# Cache hit rate by type
+sum(rate(nativelink_cache_operations{cache_operation_result="hit"}[5m])) by (cache_type) /
+sum(rate(nativelink_cache_operations{cache_operation_name="read"}[5m])) by (cache_type)
+
+# P95 cache operation latency
+histogram_quantile(0.95,
+  sum(rate(nativelink_cache_operation_duration_bucket[5m])) by (le, cache_type)
+)
+
+# Cache eviction rate
+sum(rate(nativelink_cache_operations{cache_operation_name="evict"}[5m])) by (cache_type)
+```
+
+### Execution Pipeline
+
+```promql
+# Execution success rate
+sum(rate(nativelink_execution_completed_count{execution_result="success"}[5m])) /
+sum(rate(nativelink_execution_completed_count[5m]))
+
+# Queue depth by priority
+sum(nativelink_execution_active_count{execution_stage="queued"}) by (execution_priority)
+
+# Median queue time
+histogram_quantile(0.5,
+  sum(rate(nativelink_execution_queue_time_bucket[5m])) by (le)
+)
+
+# Worker utilization
+count(nativelink_execution_active_count{execution_stage="executing"} > 0) /
+count(count by (execution_worker_id) (nativelink_execution_active_count))
+```
+
+### System Health
+
+```promql
+# Overall throughput (actions/sec)
+sum(rate(nativelink_execution_completed_count[5m]))
+
+# Error rate
+sum(rate(nativelink_execution_completed_count{execution_result="failure"}[5m])) /
+sum(rate(nativelink_execution_completed_count[5m]))
+
+# Stage transition rate
+sum(rate(nativelink_execution_stage_transitions[5m])) by (instance_name)
+```
+
+## Dashboards
+
+### Grafana Dashboard
+
+Import the pre-built dashboard for comprehensive monitoring:
+
+```json
+{
+  "title": "NativeLink Metrics",
+  "panels": [
+    {
+      "title": "Execution Success Rate",
+      "targets": [{
+        "expr": "nativelink:execution_success_rate"
+      }]
+    },
+    {
+      "title": "Cache Hit Rate",
+      "targets": [{
+        "expr": "nativelink:cache_hit_rate"
+      }]
+    },
+    {
+      "title": "Queue Depth",
+      "targets": [{
+        "expr": "sum(nativelink_execution_active_count{execution_stage=\"queued\"})"
+      }]
+    }
+  ]
+}
+```
+
+### Key Metrics to Monitor
+
+1. **SLI/SLO Metrics**:
+   - Execution success rate > 99%
+   - Cache hit rate > 80%
+   - P95 queue time < 30s
+   - P95 cache latency < 100ms
+
+2. **Capacity Planning**:
+   - Queue depth trends
+   - Worker utilization
+   - Cache size growth
+   - Eviction rates
+
+3. **Performance Optimization**:
+   - Stage duration breakdowns
+   - Cache operation latencies
+   - Output size distributions
+   - Retry rates
+
+## Server Options
+
+### Prometheus (Recommended)
+
+Best for most deployments, with excellent query capabilities:
+
+```yaml
+# Start Prometheus with the OTLP receiver enabled:
+#   prometheus --web.enable-otlp-receiver
+# Then configure out-of-order handling in prometheus.yml:
+storage:
+  tsdb:
+    out_of_order_time_window: 30m
+```
+
+### Grafana Cloud
+
+Managed solution with built-in dashboards:
+
+```bash
+export OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway.grafana.net/otlp
+export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer ${GRAFANA_TOKEN}"
+```
+
+### ClickHouse
+
+For high-volume metrics with SQL queries:
+
+```yaml
+exporters:
+  clickhouse:
+    endpoint: tcp://clickhouse:9000
+    database: nativelink_metrics
+    ttl_days: 90
+```
+
+### Quickwit
+
+Unified logs and metrics search:
+
+```yaml
+exporters:
+  otlp:
+    endpoint: quickwit:7281
+    headers:
+      x-quickwit-index: nativelink-metrics
+```
+
+## Alerting
+
+### Critical Alerts
+
+```yaml
+- alert: HighErrorRate
+  expr: |
+    (1 - nativelink:execution_success_rate) > 0.05
+  for: 5m
+  annotations:
+    summary: "Execution error rate above 5%"
+
+- alert: QueueBacklog
+  expr: |
+    sum(nativelink_execution_active_count{execution_stage="queued"}) > 100
+  for: 15m
+  annotations:
+    summary: "Queue backlog exceeds 100 actions"
+
+- alert: CacheEvictionHigh
+  expr: |
+    rate(nativelink_cache_operations{cache_operation_name="evict"}[5m]) > 10
+  for: 10m
+  annotations:
+    summary: "Cache eviction rate exceeds threshold"
+```
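+
+These rules fire in Prometheus; routing the notifications is AlertManager's job. The Docker Compose stack mounts `./alertmanager-config.yml`, which is deployment-specific and not shown here, so the following is only a minimal sketch with a hypothetical webhook receiver (point the URL at your paging or chat integration):
+
+```yaml
+# Minimal AlertManager config; the webhook URL is a placeholder.
+route:
+  receiver: default
+  group_by: ['alertname', 'component']
+  group_wait: 30s
+  repeat_interval: 4h
+
+receivers:
+  - name: default
+    webhook_configs:
+      - url: http://alert-webhook.internal/notify  # hypothetical endpoint
+```
+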
+## Troubleshooting
+
+### No Metrics Appearing
+
+1. Verify OTEL environment variables:
+   ```bash
+   env | grep OTEL_
+   ```
+
+2. Check collector health:
+   ```bash
+   curl http://localhost:13133/health
+   ```
+
+3. Verify metrics are being received:
+   ```bash
+   curl http://localhost:8888/metrics | grep otelcol_receiver
+   ```
+
+### High Cardinality
+
+Reduce label dimensions:
+
+```yaml
+processors:
+  attributes:
+    actions:
+      - key: high_cardinality_label
+        action: delete
+```
+
+### Out-of-Order Samples
+
+Increase the Prometheus window:
+
+```yaml
+storage:
+  tsdb:
+    out_of_order_time_window: 1h
+```
+
+## Performance Tuning
+
+### Metric Export Optimization
+
+```bash
+# Increase export interval for lower overhead
+export OTEL_METRIC_EXPORT_INTERVAL=120000  # 2 minutes
+```
+
+Batching happens on the collector side: raise `send_batch_size` (for example 2048) and `timeout` (for example 30s) in the `batch` processor described under Collector Configuration to trade export latency for throughput.
+
+### Recording Rules
+
+Use Prometheus recording rules for expensive queries:
+
+```yaml
+- record: nativelink:hourly_success_rate
+  expr: |
+    avg_over_time(nativelink:execution_success_rate[1h])
+```
+
+### Filtering
+
+The collector's `probabilistic_sampler` processor only applies to traces and logs, so for high-volume deployments drop low-value metrics with the `filter` processor instead:
+
+```yaml
+processors:
+  filter/drop_noisy:
+    error_mode: ignore
+    metrics:
+      metric:
+        # Metric name as received over OTLP; adjust to match your stream
+        - 'name == "nativelink_execution_output_size"'
+```
+
+## Additional Resources
+
+- [OpenTelemetry Documentation](https://opentelemetry.io/docs/)
+- [Prometheus Best Practices](https://prometheus.io/docs/practices/)
+- [Grafana Dashboard Gallery](https://grafana.com/grafana/dashboards/)
+- [NativeLink GitHub](https://github.com/TraceMachina/nativelink)

From c48cc4bf426ec1eb3d84e87adab031eedfbb3cc1 Mon Sep 17 00:00:00 2001
From: Marcus
Date: Sat, 6 Sep 2025 23:08:04 +0900
Subject: [PATCH 6/6] Add comprehensive metrics documentation

---
 nativelink-util/src/metrics.rs                             | 8 +++++---
 .../src/content/docs/docs/deployment-examples/metrics.mdx  | 1 -
 web/platform/starlight.conf.ts                             | 4 ++++
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs
index 63dce9cc..2c916c05 100644
--- a/nativelink-util/src/metrics.rs
+++ b/nativelink-util/src/metrics.rs
@@ -1,10 +1,12 @@
 // Copyright 2025 The NativeLink Authors. All rights reserved.
 //
-// Licensed under the Apache License, Version 2.0 (the "License");
+// Licensed under the Business Source License, Version 1.1 (the "License");
 // you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
+// You may request a copy of the License by emailing contact@nativelink.com.
 //
-//     http://www.apache.org/licenses/LICENSE-2.0
+// Use of this module requires an enterprise license agreement, which can be
+// obtained by emailing contact@nativelink.com or signing up for NativeLink
+// Cloud at app.nativelink.com.
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx b/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx
index ee277011..614eab1b 100644
--- a/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx
+++ b/web/platform/src/content/docs/docs/deployment-examples/metrics.mdx
@@ -2,7 +2,6 @@
 title: Metrics and Observability
 description: 'Configure OpenTelemetry metrics collection for NativeLink'
 ---
-
 import { Tabs, TabItem } from '@astrojs/starlight/components';
 
 NativeLink provides comprehensive metrics through OpenTelemetry (OTEL), enabling deep insights into cache performance, remote execution pipelines, and system health.
diff --git a/web/platform/starlight.conf.ts b/web/platform/starlight.conf.ts
index 38f644d6..931530cf 100644
--- a/web/platform/starlight.conf.ts
+++ b/web/platform/starlight.conf.ts
@@ -146,6 +146,10 @@ export const starlightConfig = {
           label: "Chromium",
           link: `${docsRoot}/deployment-examples/chromium`,
         },
+        {
+          label: "Metrics and Observability",
+          link: `${docsRoot}/deployment-examples/metrics`,
+        },
       ],
    },
    {