/build/source/nativelink-util/src/metrics.rs

Source
// Copyright 2025 The NativeLink Authors. All rights reserved.
//
// Licensed under the Business Source License, Version 1.1 (the "License");
// you may not use this file except in compliance with the License.
// You may requested a copy of the License by emailing contact@nativelink.com.
//
// Use of this module requires an enterprise license agreement, which can be
// attained by emailing contact@nativelink.com or signing up for Nativelink
// Cloud at app.nativelink.com.
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::LazyLock;

use opentelemetry::{InstrumentationScope, KeyValue, Value, global, metrics};

use crate::action_messages::ActionStage;

// Metric attribute keys for cache operations.
pub const CACHE_TYPE: &str = "cache.type";
pub const CACHE_OPERATION: &str = "cache.operation.name";
pub const CACHE_RESULT: &str = "cache.operation.result";

// Metric attribute keys for remote execution operations.
pub const EXECUTION_STAGE: &str = "execution.stage";
pub const EXECUTION_RESULT: &str = "execution.result";
pub const EXECUTION_INSTANCE: &str = "execution.instance";
pub const EXECUTION_PRIORITY: &str = "execution.priority";
pub const EXECUTION_WORKER_ID: &str = "execution.worker_id";
pub const EXECUTION_EXIT_CODE: &str = "execution.exit_code";
pub const EXECUTION_ACTION_DIGEST: &str = "execution.action_digest";

/// Cache operation types for metrics classification.
#[derive(Debug, Clone, Copy)]
pub enum CacheOperationName {
    /// Data retrieval operations (get, peek, contains, etc.)
    Read,
    /// Data storage operations (insert, update, replace, etc.)
    Write,
    /// Explicit data removal operations
    Delete,
    /// Automatic cache maintenance (evictions, TTL cleanup, etc.)
    Evict,
}

impl From<CacheOperationName> for Value {
    fn from(op: CacheOperationName) -> Self {
        match op {
            CacheOperationName::Read => Self::from("read"),
            CacheOperationName::Write => Self::from("write"),
            CacheOperationName::Delete => Self::from("delete"),
            CacheOperationName::Evict => Self::from("evict"),
        }
    }
}

/// Results of cache operations.
///
/// Result semantics vary by operation type:
/// - Read: Hit/Miss/Expired indicate data availability
/// - Write/Delete/Evict: Success/Error indicate completion status
#[derive(Debug, Clone, Copy)]
pub enum CacheOperationResult {
    /// Data found and valid (Read operations)
    Hit,
    /// Data not found (Read operations)
    Miss,
    /// Data found but invalid/expired (Read operations)
    Expired,
    /// Operation completed successfully (Write/Delete/Evict operations)
    Success,
    /// Operation failed (any operation type)
    Error,
}

impl From<CacheOperationResult> for Value {
    fn from(result: CacheOperationResult) -> Self {
        match result {
            CacheOperationResult::Hit => Self::from("hit"),
            CacheOperationResult::Miss => Self::from("miss"),
            CacheOperationResult::Expired => Self::from("expired"),
            CacheOperationResult::Success => Self::from("success"),
            CacheOperationResult::Error => Self::from("error"),
        }
    }
}

/// Remote execution stages for metrics classification.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExecutionStage {
    /// Unknown stage
    Unknown,
    /// Checking cache for existing results
    CacheCheck,
    /// Action is queued waiting for execution
    Queued,
    /// Action is being executed by a worker
    Executing,
    /// Action execution completed
    Completed,
}

impl From<ExecutionStage> for Value {
    fn from(stage: ExecutionStage) -> Self {
        match stage {
            ExecutionStage::Unknown => Self::from("unknown"),
            ExecutionStage::CacheCheck => Self::from("cache_check"),
            ExecutionStage::Queued => Self::from("queued"),
            ExecutionStage::Executing => Self::from("executing"),
            ExecutionStage::Completed => Self::from("completed"),
        }
    }
}

impl From<ActionStage> for ExecutionStage {
    fn from(stage: ActionStage) -> Self {
        match stage {
            ActionStage::Unknown => Self::Unknown,
            ActionStage::CacheCheck => Self::CacheCheck,
            ActionStage::Queued => Self::Queued,
            ActionStage::Executing => Self::Executing,
            ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => Self::Completed,
        }
    }
}

impl From<&ActionStage> for ExecutionStage {
    fn from(stage: &ActionStage) -> Self {
        match stage {
            ActionStage::Unknown => Self::Unknown,
            ActionStage::CacheCheck => Self::CacheCheck,
            ActionStage::Queued => Self::Queued,
            ActionStage::Executing => Self::Executing,
            ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => Self::Completed,
        }
    }
}

/// Results of remote execution operations.
#[derive(Debug, Clone, Copy)]
pub enum ExecutionResult {
    /// Execution completed successfully
    Success,
    /// Execution failed
    Failure,
    /// Execution was cancelled
    Cancelled,
    /// Execution timed out
    Timeout,
    /// Result was found in cache
    CacheHit,
}

impl From<ExecutionResult> for Value {
    fn from(result: ExecutionResult) -> Self {
        match result {
            ExecutionResult::Success => Self::from("success"),
            ExecutionResult::Failure => Self::from("failure"),
            ExecutionResult::Cancelled => Self::from("cancelled"),
            ExecutionResult::Timeout => Self::from("timeout"),
            ExecutionResult::CacheHit => Self::from("cache_hit"),
        }
    }
}

/// Pre-allocated attribute combinations for efficient cache metrics collection.
///
/// Avoids runtime allocation by pre-computing common attribute combinations
/// for cache operations and results.
#[derive(Debug)]
pub struct CacheMetricAttrs {
    // Read operation attributes
    read_hit: Vec<KeyValue>,
    read_miss: Vec<KeyValue>,
    read_expired: Vec<KeyValue>,

    // Write operation attributes
    write_success: Vec<KeyValue>,
    write_error: Vec<KeyValue>,

    // Delete operation attributes
    delete_success: Vec<KeyValue>,
    delete_miss: Vec<KeyValue>,
    delete_error: Vec<KeyValue>,

    // Evict operation attributes
    evict_success: Vec<KeyValue>,
    evict_expired: Vec<KeyValue>,
}

impl CacheMetricAttrs {
    /// Creates a new set of pre-computed attributes.
    ///
    /// The `base_attrs` are included in all attribute combinations (e.g., cache
    /// type, instance ID).
    #[must_use]
    pub fn new(base_attrs: &[KeyValue]) -> Self {
        let make_attrs1 = |op: CacheOperationName, result: CacheOperationResult| {
            let mut attrs = base_attrs.to_vec();
            attrs.push(KeyValue::new(CACHE_OPERATION, op));
            attrs.push(KeyValue::new(CACHE_RESULT, result));
            attrs
        };

        Self {
            read_hit: make_attrs(CacheOperationName::Read, CacheOperationResult::Hit),
            read_miss: make_attrs(CacheOperationName::Read, CacheOperationResult::Miss),
            read_expired: make_attrs(CacheOperationName::Read, CacheOperationResult::Expired),

            write_success: make_attrs(CacheOperationName::Write, CacheOperationResult::Success),
            write_error: make_attrs(CacheOperationName::Write, CacheOperationResult::Error),

            delete_success: make_attrs(CacheOperationName::Delete, CacheOperationResult::Success),
            delete_miss: make_attrs(CacheOperationName::Delete, CacheOperationResult::Miss),
            delete_error: make_attrs(CacheOperationName::Delete, CacheOperationResult::Error),

            evict_success: make_attrs(CacheOperationName::Evict, CacheOperationResult::Success),
            evict_expired: make_attrs(CacheOperationName::Evict, CacheOperationResult::Expired),
        }
    }

    // Attribute accessors
    #[must_use]
    pub fn read_hit(&self) -> &[KeyValue] {
        &self.read_hit
    }
    #[must_use]
    pub fn read_miss(&self) -> &[KeyValue] {
        &self.read_miss
    }
    #[must_use]
    pub fn read_expired(&self) -> &[KeyValue] {
        &self.read_expired
    }
    #[must_use]
    pub fn write_success(&self) -> &[KeyValue] {
        &self.write_success
    }
    #[must_use]
    pub fn write_error(&self) -> &[KeyValue] {
        &self.write_error
    }
    #[must_use]
    pub fn delete_success(&self) -> &[KeyValue] {
        &self.delete_success
    }
    #[must_use]
    pub fn delete_miss(&self) -> &[KeyValue] {
        &self.delete_miss
    }
    #[must_use]
    pub fn delete_error(&self) -> &[KeyValue] {
        &self.delete_error
    }
    #[must_use]
    pub fn evict_success(&self) -> &[KeyValue] {
        &self.evict_success
    }
    #[must_use]
    pub fn evict_expired(&self) -> &[KeyValue] {
        &self.evict_expired
    }
}

/// Pre-allocated attribute combinations for efficient remote execution metrics collection.
#[derive(Debug)]
pub struct ExecutionMetricAttrs {
    // Stage transition attributes
    unknown: Vec<KeyValue>,
    cache_check: Vec<KeyValue>,
    queued: Vec<KeyValue>,
    executing: Vec<KeyValue>,
    completed_success: Vec<KeyValue>,
    completed_failure: Vec<KeyValue>,
    completed_cancelled: Vec<KeyValue>,
    completed_timeout: Vec<KeyValue>,
    completed_cache_hit: Vec<KeyValue>,
}

impl ExecutionMetricAttrs {
    /// Creates a new set of pre-computed attributes.
    ///
    /// The `base_attrs` are included in all attribute combinations (e.g., instance
    /// name, worker ID).
    #[must_use]
    pub fn new(base_attrs: &[KeyValue]) -> Self {
        let make_attrs1 = |stage: ExecutionStage, result: Option<ExecutionResult>| {
            let mut attrs = base_attrs.to_vec();
            attrs.push(KeyValue::new(EXECUTION_STAGE, stage));
            if let Some(result5) = result {
                attrs.push(KeyValue::new(EXECUTION_RESULT, result));
            }4
            attrs
        };

        Self {
            unknown: make_attrs(ExecutionStage::Unknown, None),
            cache_check: make_attrs(ExecutionStage::CacheCheck, None),
            queued: make_attrs(ExecutionStage::Queued, None),
            executing: make_attrs(ExecutionStage::Executing, None),
            completed_success: make_attrs(
                ExecutionStage::Completed,
                Some(ExecutionResult::Success),
            ),
            completed_failure: make_attrs(
                ExecutionStage::Completed,
                Some(ExecutionResult::Failure),
            ),
            completed_cancelled: make_attrs(
                ExecutionStage::Completed,
                Some(ExecutionResult::Cancelled),
            ),
            completed_timeout: make_attrs(
                ExecutionStage::Completed,
                Some(ExecutionResult::Timeout),
            ),
            completed_cache_hit: make_attrs(
                ExecutionStage::Completed,
                Some(ExecutionResult::CacheHit),
            ),
        }
    }

    // Attribute accessors
    #[must_use]
    pub fn unknown(&self) -> &[KeyValue] {
        &self.unknown
    }
    #[must_use]
    pub fn cache_check(&self) -> &[KeyValue] {
        &self.cache_check
    }
    #[must_use]
    pub fn queued(&self) -> &[KeyValue] {
        &self.queued
    }
    #[must_use]
    pub fn executing(&self) -> &[KeyValue] {
        &self.executing
    }
    #[must_use]
    pub fn completed_success(&self) -> &[KeyValue] {
        &self.completed_success
    }
    #[must_use]
    pub fn completed_failure(&self) -> &[KeyValue] {
        &self.completed_failure
    }
    #[must_use]
    pub fn completed_cancelled(&self) -> &[KeyValue] {
        &self.completed_cancelled
    }
    #[must_use]
    pub fn completed_timeout(&self) -> &[KeyValue] {
        &self.completed_timeout
    }
    #[must_use]
    pub fn completed_cache_hit(&self) -> &[KeyValue] {
        &self.completed_cache_hit
    }
}

/// Global cache metrics instruments.
pub static CACHE_METRICS: LazyLock<CacheMetrics> = LazyLock::new(|| {
    let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build());

    CacheMetrics {
        cache_operation_duration: meter
            .f64_histogram("cache.operation.duration")
            .with_description("Duration of cache operations in milliseconds")
            .with_unit("ms")
            // The range of these is quite large as a cache might be backed by
            // memory, a filesystem, or network storage. The current values were
            // determined empirically and might need adjustment.
            .with_boundaries(vec![
                // Microsecond range
                0.001, // 1μs
                0.005, // 5μs
                0.01,  // 10μs
                0.05,  // 50μs
                0.1,   // 100μs
                // Sub-millisecond range
                0.2, // 200μs
                0.5, // 500μs
                1.0, // 1ms
                // Low millisecond range
                2.0,   // 2ms
                5.0,   // 5ms
                10.0,  // 10ms
                20.0,  // 20ms
                50.0,  // 50ms
                100.0, // 100ms
                // Higher latency range
                200.0,  // 200ms
                500.0,  // 500ms
                1000.0, // 1 second
                2000.0, // 2 seconds
                5000.0, // 5 seconds
            ])
            .build(),

        cache_operations: meter
            .u64_counter("cache.operations")
            .with_description("Total cache operations by type and result")
            .build(),

        cache_io: meter
            .u64_counter("cache.io")
            .with_description("Total bytes processed by cache operations")
            .with_unit("By")
            .build(),

        cache_size: meter
            .i64_up_down_counter("cache.size")
            .with_description("Current total size of cached data")
            .with_unit("By")
            .build(),

        cache_entries: meter
            .i64_up_down_counter("cache.entries")
            .with_description("Current number of cached entries")
            .with_unit("{entry}")
            .build(),

        cache_entry_size: meter
            .u64_histogram("cache.item.size")
            .with_description("Size distribution of cached entries")
            .with_unit("By")
            .build(),
    }
});

/// OpenTelemetry metrics instruments for cache monitoring.
#[derive(Debug)]
pub struct CacheMetrics {
    /// Histogram of cache operation durations in milliseconds
    pub cache_operation_duration: metrics::Histogram<f64>,
    /// Counter of cache operations by type and result
    pub cache_operations: metrics::Counter<u64>,
    /// Counter of bytes read/written during cache operations
    pub cache_io: metrics::Counter<u64>,
    /// Current total size of all cached data in bytes
    pub cache_size: metrics::UpDownCounter<i64>,
    /// Current number of entries in cache
    pub cache_entries: metrics::UpDownCounter<i64>,
    /// Histogram of individual cache entry sizes in bytes
    pub cache_entry_size: metrics::Histogram<u64>,
}

/// Global remote execution metrics instruments.
pub static EXECUTION_METRICS: LazyLock<ExecutionMetrics> = LazyLock::new(|| {
    let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build());

    ExecutionMetrics {
        execution_stage_duration: meter
            .f64_histogram("execution.stage.duration")
            .with_description("Duration of each execution stage in seconds")
            .with_unit("s")
            .with_boundaries(vec![
                // Sub-second range
                0.001, // 1ms
                0.01,  // 10ms
                0.1,   // 100ms
                0.5,   // 500ms
                1.0,   // 1s
                // Multi-second range
                2.0,    // 2s
                5.0,    // 5s
                10.0,   // 10s
                30.0,   // 30s
                60.0,   // 1 minute
                120.0,  // 2 minutes
                300.0,  // 5 minutes
                600.0,  // 10 minutes
                1800.0, // 30 minutes
                3600.0, // 1 hour
            ])
            .build(),

        execution_total_duration: meter
            .f64_histogram("execution.total.duration")
            .with_description(
                "Total duration of action execution from submission to completion in seconds",
            )
            .with_unit("s")
            .with_boundaries(vec![
                // Sub-second range
                0.01, // 10ms
                0.1,  // 100ms
                0.5,  // 500ms
                1.0,  // 1s
                // Multi-second range
                5.0,    // 5s
                10.0,   // 10s
                30.0,   // 30s
                60.0,   // 1 minute
                300.0,  // 5 minutes
                600.0,  // 10 minutes
                1800.0, // 30 minutes
                3600.0, // 1 hour
                7200.0, // 2 hours
            ])
            .build(),

        execution_queue_time: meter
            .f64_histogram("execution.queue.time")
            .with_description("Time spent waiting in queue before execution in seconds")
            .with_unit("s")
            .with_boundaries(vec![
                0.001, // 1ms
                0.01,  // 10ms
                0.1,   // 100ms
                0.5,   // 500ms
                1.0,   // 1s
                2.0,   // 2s
                5.0,   // 5s
                10.0,  // 10s
                30.0,  // 30s
                60.0,  // 1 minute
                300.0, // 5 minutes
                600.0, // 10 minutes
            ])
            .build(),

        execution_active_count: meter
            .i64_up_down_counter("execution.active.count")
            .with_description("Number of actions currently in each stage")
            .with_unit("{action}")
            .build(),

        execution_completed_count: meter
            .u64_counter("execution.completed.count")
            .with_description("Total number of completed executions by result")
            .with_unit("{action}")
            .build(),

        execution_stage_transitions: meter
            .u64_counter("execution.stage.transitions")
            .with_description("Number of stage transitions")
            .with_unit("{transition}")
            .build(),

        execution_output_size: meter
            .u64_histogram("execution.output.size")
            .with_description("Size of execution outputs in bytes")
            .with_unit("By")
            .with_boundaries(vec![
                1_024.0,          // 1KB
                10_240.0,         // 10KB
                102_400.0,        // 100KB
                1_048_576.0,      // 1MB
                10_485_760.0,     // 10MB
                104_857_600.0,    // 100MB
                1_073_741_824.0,  // 1GB
                10_737_418_240.0, // 10GB
            ])
            .build(),

        execution_cpu_time: meter
            .f64_histogram("execution.cpu.time")
            .with_description("CPU time consumed by action execution in seconds")
            .with_unit("s")
            .with_boundaries(vec![
                0.01,   // 10ms
                0.1,    // 100ms
                1.0,    // 1s
                10.0,   // 10s
                60.0,   // 1 minute
                300.0,  // 5 minutes
                600.0,  // 10 minutes
                1800.0, // 30 minutes
                3600.0, // 1 hour
            ])
            .build(),

        execution_memory_usage: meter
            .u64_histogram("execution.memory.usage")
            .with_description("Peak memory usage during execution in bytes")
            .with_unit("By")
            .with_boundaries(vec![
                1_048_576.0,      // 1MB
                10_485_760.0,     // 10MB
                104_857_600.0,    // 100MB
                524_288_000.0,    // 500MB
                1_073_741_824.0,  // 1GB
                5_368_709_120.0,  // 5GB
                10_737_418_240.0, // 10GB
                53_687_091_200.0, // 50GB
            ])
            .build(),

        execution_retry_count: meter
            .u64_counter("execution.retry.count")
            .with_description("Number of execution retries")
            .with_unit("{retry}")
            .build(),
    }
});

/// OpenTelemetry metrics instruments for remote execution monitoring.
#[derive(Debug)]
pub struct ExecutionMetrics {
    /// Histogram of stage durations in seconds
    pub execution_stage_duration: metrics::Histogram<f64>,
    /// Histogram of total execution durations in seconds
    pub execution_total_duration: metrics::Histogram<f64>,
    /// Histogram of queue wait times in seconds
    pub execution_queue_time: metrics::Histogram<f64>,
    /// Current number of actions in each stage
    pub execution_active_count: metrics::UpDownCounter<i64>,
    /// Total number of completed executions
    pub execution_completed_count: metrics::Counter<u64>,
    /// Number of stage transitions
    pub execution_stage_transitions: metrics::Counter<u64>,
    /// Histogram of output sizes in bytes
    pub execution_output_size: metrics::Histogram<u64>,
    /// Histogram of CPU time in seconds
    pub execution_cpu_time: metrics::Histogram<f64>,
    /// Histogram of peak memory usage in bytes
    pub execution_memory_usage: metrics::Histogram<u64>,
    /// Counter for execution retries
    pub execution_retry_count: metrics::Counter<u64>,
}

/// Helper function to create attributes for execution metrics
#[must_use]
pub fn make_execution_attributes(
    instance_name: &str,
    worker_id: Option<&str>,
    priority: Option<i32>,
) -> Vec<KeyValue> {
    let mut attrs = vec![KeyValue::new(EXECUTION_INSTANCE, instance_name.to_string())];

    if let Some(worker_id38) = worker_id {
        attrs.push(KeyValue::new(EXECUTION_WORKER_ID, worker_id.to_string()));
    }

    if let Some(priority) = priority {
        attrs.push(KeyValue::new(EXECUTION_PRIORITY, i64::from(priority)));
    }0

    attrs
}

Coverage Report

Created: 2025-12-17 22:46

Line	Count	Source
1		// Copyright 2025 The NativeLink Authors. All rights reserved.
2		//
3		// Licensed under the Business Source License, Version 1.1 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may requested a copy of the License by emailing contact@nativelink.com.
6		//
7		// Use of this module requires an enterprise license agreement, which can be
8		// attained by emailing contact@nativelink.com or signing up for Nativelink
9		// Cloud at app.nativelink.com.
10		//
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS,
13		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		// See the License for the specific language governing permissions and
15		// limitations under the License.
16
17		use std::sync::LazyLock;
18
19		use opentelemetry::{InstrumentationScope, KeyValue, Value, global, metrics};
20
21		use crate::action_messages::ActionStage;
22
23		// Metric attribute keys for cache operations.
24		pub const CACHE_TYPE: &str = "cache.type";
25		pub const CACHE_OPERATION: &str = "cache.operation.name";
26		pub const CACHE_RESULT: &str = "cache.operation.result";
27
28		// Metric attribute keys for remote execution operations.
29		pub const EXECUTION_STAGE: &str = "execution.stage";
30		pub const EXECUTION_RESULT: &str = "execution.result";
31		pub const EXECUTION_INSTANCE: &str = "execution.instance";
32		pub const EXECUTION_PRIORITY: &str = "execution.priority";
33		pub const EXECUTION_WORKER_ID: &str = "execution.worker_id";
34		pub const EXECUTION_EXIT_CODE: &str = "execution.exit_code";
35		pub const EXECUTION_ACTION_DIGEST: &str = "execution.action_digest";
36
37		/// Cache operation types for metrics classification.
38		#[derive(Debug, Clone, Copy)]
39		pub enum CacheOperationName {
40		/// Data retrieval operations (get, peek, contains, etc.)
41		Read,
42		/// Data storage operations (insert, update, replace, etc.)
43		Write,
44		/// Explicit data removal operations
45		Delete,
46		/// Automatic cache maintenance (evictions, TTL cleanup, etc.)
47		Evict,
48		}
49
50		impl From<CacheOperationName> for Value {
51	10	fn from(op: CacheOperationName) -> Self {
52	10	match op {
53	3	CacheOperationName::Read => Self::from("read"),
54	2	CacheOperationName::Write => Self::from("write"),
55	3	CacheOperationName::Delete => Self::from("delete"),
56	2	CacheOperationName::Evict => Self::from("evict"),
57		}
58	10	}
59		}
60
61		/// Results of cache operations.
62		///
63		/// Result semantics vary by operation type:
64		/// - Read: Hit/Miss/Expired indicate data availability
65		/// - Write/Delete/Evict: Success/Error indicate completion status
66		#[derive(Debug, Clone, Copy)]
67		pub enum CacheOperationResult {
68		/// Data found and valid (Read operations)
69		Hit,
70		/// Data not found (Read operations)
71		Miss,
72		/// Data found but invalid/expired (Read operations)
73		Expired,
74		/// Operation completed successfully (Write/Delete/Evict operations)
75		Success,
76		/// Operation failed (any operation type)
77		Error,
78		}
79
80		impl From<CacheOperationResult> for Value {
81	10	fn from(result: CacheOperationResult) -> Self {
82	10	match result {
83	1	CacheOperationResult::Hit => Self::from("hit"),
84	2	CacheOperationResult::Miss => Self::from("miss"),
85	2	CacheOperationResult::Expired => Self::from("expired"),
86	3	CacheOperationResult::Success => Self::from("success"),
87	2	CacheOperationResult::Error => Self::from("error"),
88		}
89	10	}
90		}
91
92		/// Remote execution stages for metrics classification.
93		#[derive(Debug, Clone, Copy, PartialEq, Eq)]
94		pub enum ExecutionStage {
95		/// Unknown stage
96		Unknown,
97		/// Checking cache for existing results
98		CacheCheck,
99		/// Action is queued waiting for execution
100		Queued,
101		/// Action is being executed by a worker
102		Executing,
103		/// Action execution completed
104		Completed,
105		}
106
107		impl From<ExecutionStage> for Value {
108	158	fn from(stage: ExecutionStage) -> Self {
109	158	match stage {
110	1	ExecutionStage::Unknown => Self::from("unknown"),
111	1	ExecutionStage::CacheCheck => Self::from("cache_check"),
112	68	ExecutionStage::Queued => Self::from("queued"),
113	70	ExecutionStage::Executing => Self::from("executing"),
114	18	ExecutionStage::Completed => Self::from("completed"),
115		}
116	158	}
117		}
118
119		impl From<ActionStage> for ExecutionStage {
120	5	fn from(stage: ActionStage) -> Self {
121	5	match stage {
122	1	ActionStage::Unknown => Self::Unknown,
123	1	ActionStage::CacheCheck => Self::CacheCheck,
124	1	ActionStage::Queued => Self::Queued,
125	1	ActionStage::Executing => Self::Executing,
126	1	ActionStage::Completed(_) \| ActionStage::CompletedFromCache(_) => Self::Completed,
127		}
128	5	}
129		}
130
131		impl From<&ActionStage> for ExecutionStage {
132	10.1k	fn from(stage: &ActionStage) -> Self {
133	10.1k	match stage {
134	1	ActionStage::Unknown => Self::Unknown,
135	1	ActionStage::CacheCheck => Self::CacheCheck,
136	44	ActionStage::Queued => Self::Queued,
137	70	ActionStage::Executing => Self::Executing,
138	10.0k	ActionStage::Completed(_) \| ActionStage::CompletedFromCache(_) => Self::Completed,
139		}
140	10.1k	}
141		}
142
143		/// Results of remote execution operations.
144		#[derive(Debug, Clone, Copy)]
145		pub enum ExecutionResult {
146		/// Execution completed successfully
147		Success,
148		/// Execution failed
149		Failure,
150		/// Execution was cancelled
151		Cancelled,
152		/// Execution timed out
153		Timeout,
154		/// Result was found in cache
155		CacheHit,
156		}
157
158		impl From<ExecutionResult> for Value {
159	18	fn from(result: ExecutionResult) -> Self {
160	18	match result {
161	11	ExecutionResult::Success => Self::from("success"),
162	4	ExecutionResult::Failure => Self::from("failure"),
163	1	ExecutionResult::Cancelled => Self::from("cancelled"),
164	1	ExecutionResult::Timeout => Self::from("timeout"),
165	1	ExecutionResult::CacheHit => Self::from("cache_hit"),
166		}
167	18	}
168		}
169
170		/// Pre-allocated attribute combinations for efficient cache metrics collection.
171		///
172		/// Avoids runtime allocation by pre-computing common attribute combinations
173		/// for cache operations and results.
174		#[derive(Debug)]
175		pub struct CacheMetricAttrs {
176		// Read operation attributes
177		read_hit: Vec<KeyValue>,
178		read_miss: Vec<KeyValue>,
179		read_expired: Vec<KeyValue>,
180
181		// Write operation attributes
182		write_success: Vec<KeyValue>,
183		write_error: Vec<KeyValue>,
184
185		// Delete operation attributes
186		delete_success: Vec<KeyValue>,
187		delete_miss: Vec<KeyValue>,
188		delete_error: Vec<KeyValue>,
189
190		// Evict operation attributes
191		evict_success: Vec<KeyValue>,
192		evict_expired: Vec<KeyValue>,
193		}
194
195		impl CacheMetricAttrs {
196		/// Creates a new set of pre-computed attributes.
197		///
198		/// The `base_attrs` are included in all attribute combinations (e.g., cache
199		/// type, instance ID).
200		#[must_use]
201	1	pub fn new(base_attrs: &[KeyValue]) -> Self {
202	10	let make_attrs1 = \|op: CacheOperationName, result: CacheOperationResult\| {
203	10	let mut attrs = base_attrs.to_vec();
204	10	attrs.push(KeyValue::new(CACHE_OPERATION, op));
205	10	attrs.push(KeyValue::new(CACHE_RESULT, result));
206	10	attrs
207	10	};
208
209	1	Self {
210	1	read_hit: make_attrs(CacheOperationName::Read, CacheOperationResult::Hit),
211	1	read_miss: make_attrs(CacheOperationName::Read, CacheOperationResult::Miss),
212	1	read_expired: make_attrs(CacheOperationName::Read, CacheOperationResult::Expired),
213	1
214	1	write_success: make_attrs(CacheOperationName::Write, CacheOperationResult::Success),
215	1	write_error: make_attrs(CacheOperationName::Write, CacheOperationResult::Error),
216	1
217	1	delete_success: make_attrs(CacheOperationName::Delete, CacheOperationResult::Success),
218	1	delete_miss: make_attrs(CacheOperationName::Delete, CacheOperationResult::Miss),
219	1	delete_error: make_attrs(CacheOperationName::Delete, CacheOperationResult::Error),
220	1
221	1	evict_success: make_attrs(CacheOperationName::Evict, CacheOperationResult::Success),
222	1	evict_expired: make_attrs(CacheOperationName::Evict, CacheOperationResult::Expired),
223	1	}
224	1	}
225
226		// Attribute accessors
227		#[must_use]
228	0	pub fn read_hit(&self) -> &[KeyValue] {
229	0	&self.read_hit
230	0	}
231		#[must_use]
232	0	pub fn read_miss(&self) -> &[KeyValue] {
233	0	&self.read_miss
234	0	}
235		#[must_use]
236	0	pub fn read_expired(&self) -> &[KeyValue] {
237	0	&self.read_expired
238	0	}
239		#[must_use]
240	0	pub fn write_success(&self) -> &[KeyValue] {
241	0	&self.write_success
242	0	}
243		#[must_use]
244	0	pub fn write_error(&self) -> &[KeyValue] {
245	0	&self.write_error
246	0	}
247		#[must_use]
248	0	pub fn delete_success(&self) -> &[KeyValue] {
249	0	&self.delete_success
250	0	}
251		#[must_use]
252	0	pub fn delete_miss(&self) -> &[KeyValue] {
253	0	&self.delete_miss
254	0	}
255		#[must_use]
256	0	pub fn delete_error(&self) -> &[KeyValue] {
257	0	&self.delete_error
258	0	}
259		#[must_use]
260	0	pub fn evict_success(&self) -> &[KeyValue] {
261	0	&self.evict_success
262	0	}
263		#[must_use]
264	0	pub fn evict_expired(&self) -> &[KeyValue] {
265	0	&self.evict_expired
266	0	}
267		}
268
269		/// Pre-allocated attribute combinations for efficient remote execution metrics collection.
270		#[derive(Debug)]
271		pub struct ExecutionMetricAttrs {
272		// Stage transition attributes
273		unknown: Vec<KeyValue>,
274		cache_check: Vec<KeyValue>,
275		queued: Vec<KeyValue>,
276		executing: Vec<KeyValue>,
277		completed_success: Vec<KeyValue>,
278		completed_failure: Vec<KeyValue>,
279		completed_cancelled: Vec<KeyValue>,
280		completed_timeout: Vec<KeyValue>,
281		completed_cache_hit: Vec<KeyValue>,
282		}
283
284		impl ExecutionMetricAttrs {
285		/// Creates a new set of pre-computed attributes.
286		///
287		/// The `base_attrs` are included in all attribute combinations (e.g., instance
288		/// name, worker ID).
289		#[must_use]
290	1	pub fn new(base_attrs: &[KeyValue]) -> Self {
291	9	let make_attrs1 = \|stage: ExecutionStage, result: Option<ExecutionResult>\| {
292	9	let mut attrs = base_attrs.to_vec();
293	9	attrs.push(KeyValue::new(EXECUTION_STAGE, stage));
294	9	if let Some( result5 ) = result { Branch (294:20): [True: 5, False: 4] Branch (294:20): [Folded - Ignored]
295	5	attrs.push(KeyValue::new(EXECUTION_RESULT, result));
296	5	}4
297	9	attrs
298	9	};
299
300	1	Self {
301	1	unknown: make_attrs(ExecutionStage::Unknown, None),
302	1	cache_check: make_attrs(ExecutionStage::CacheCheck, None),
303	1	queued: make_attrs(ExecutionStage::Queued, None),
304	1	executing: make_attrs(ExecutionStage::Executing, None),
305	1	completed_success: make_attrs(
306	1	ExecutionStage::Completed,
307	1	Some(ExecutionResult::Success),
308	1	),
309	1	completed_failure: make_attrs(
310	1	ExecutionStage::Completed,
311	1	Some(ExecutionResult::Failure),
312	1	),
313	1	completed_cancelled: make_attrs(
314	1	ExecutionStage::Completed,
315	1	Some(ExecutionResult::Cancelled),
316	1	),
317	1	completed_timeout: make_attrs(
318	1	ExecutionStage::Completed,
319	1	Some(ExecutionResult::Timeout),
320	1	),
321	1	completed_cache_hit: make_attrs(
322	1	ExecutionStage::Completed,
323	1	Some(ExecutionResult::CacheHit),
324	1	),
325	1	}
326	1	}
327
328		// Attribute accessors
329		#[must_use]
330	0	pub fn unknown(&self) -> &[KeyValue] {
331	0	&self.unknown
332	0	}
333		#[must_use]
334	0	pub fn cache_check(&self) -> &[KeyValue] {
335	0	&self.cache_check
336	0	}
337		#[must_use]
338	1	pub fn queued(&self) -> &[KeyValue] {
339	1	&self.queued
340	1	}
341		#[must_use]
342	0	pub fn executing(&self) -> &[KeyValue] {
343	0	&self.executing
344	0	}
345		#[must_use]
346	1	pub fn completed_success(&self) -> &[KeyValue] {
347	1	&self.completed_success
348	1	}
349		#[must_use]
350	0	pub fn completed_failure(&self) -> &[KeyValue] {
351	0	&self.completed_failure
352	0	}
353		#[must_use]
354	0	pub fn completed_cancelled(&self) -> &[KeyValue] {
355	0	&self.completed_cancelled
356	0	}
357		#[must_use]
358	0	pub fn completed_timeout(&self) -> &[KeyValue] {
359	0	&self.completed_timeout
360	0	}
361		#[must_use]
362	0	pub fn completed_cache_hit(&self) -> &[KeyValue] {
363	0	&self.completed_cache_hit
364	0	}
365		}
366
367		/// Global cache metrics instruments.
368	1	pub static CACHE_METRICS: LazyLock<CacheMetrics> = LazyLock::new(\|\| {
369	1	let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build());
370
371	1	CacheMetrics {
372	1	cache_operation_duration: meter
373	1	.f64_histogram("cache.operation.duration")
374	1	.with_description("Duration of cache operations in milliseconds")
375	1	.with_unit("ms")
376	1	// The range of these is quite large as a cache might be backed by
377	1	// memory, a filesystem, or network storage. The current values were
378	1	// determined empirically and might need adjustment.
379	1	.with_boundaries(vec![
380	1	// Microsecond range
381	1	0.001, // 1μs
382	1	0.005, // 5μs
383	1	0.01, // 10μs
384	1	0.05, // 50μs
385	1	0.1, // 100μs
386	1	// Sub-millisecond range
387	1	0.2, // 200μs
388	1	0.5, // 500μs
389	1	1.0, // 1ms
390	1	// Low millisecond range
391	1	2.0, // 2ms
392	1	5.0, // 5ms
393	1	10.0, // 10ms
394	1	20.0, // 20ms
395	1	50.0, // 50ms
396	1	100.0, // 100ms
397	1	// Higher latency range
398	1	200.0, // 200ms
399	1	500.0, // 500ms
400	1	1000.0, // 1 second
401	1	2000.0, // 2 seconds
402	1	5000.0, // 5 seconds
403	1	])
404	1	.build(),
405	1
406	1	cache_operations: meter
407	1	.u64_counter("cache.operations")
408	1	.with_description("Total cache operations by type and result")
409	1	.build(),
410	1
411	1	cache_io: meter
412	1	.u64_counter("cache.io")
413	1	.with_description("Total bytes processed by cache operations")
414	1	.with_unit("By")
415	1	.build(),
416	1
417	1	cache_size: meter
418	1	.i64_up_down_counter("cache.size")
419	1	.with_description("Current total size of cached data")
420	1	.with_unit("By")
421	1	.build(),
422	1
423	1	cache_entries: meter
424	1	.i64_up_down_counter("cache.entries")
425	1	.with_description("Current number of cached entries")
426	1	.with_unit("{entry}")
427	1	.build(),
428	1
429	1	cache_entry_size: meter
430	1	.u64_histogram("cache.item.size")
431	1	.with_description("Size distribution of cached entries")
432	1	.with_unit("By")
433	1	.build(),
434	1	}
435	1	});
436
437		/// OpenTelemetry metrics instruments for cache monitoring.
438		#[derive(Debug)]
439		pub struct CacheMetrics {
440		/// Histogram of cache operation durations in milliseconds
441		pub cache_operation_duration: metrics::Histogram<f64>,
442		/// Counter of cache operations by type and result
443		pub cache_operations: metrics::Counter<u64>,
444		/// Counter of bytes read/written during cache operations
445		pub cache_io: metrics::Counter<u64>,
446		/// Current total size of all cached data in bytes
447		pub cache_size: metrics::UpDownCounter<i64>,
448		/// Current number of entries in cache
449		pub cache_entries: metrics::UpDownCounter<i64>,
450		/// Histogram of individual cache entry sizes in bytes
451		pub cache_entry_size: metrics::Histogram<u64>,
452		}
453
454		/// Global remote execution metrics instruments.
455	3	pub static EXECUTION_METRICS: LazyLock<ExecutionMetrics> = LazyLock::new(\|\| {
456	3	let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build());
457
458	3	ExecutionMetrics {
459	3	execution_stage_duration: meter
460	3	.f64_histogram("execution.stage.duration")
461	3	.with_description("Duration of each execution stage in seconds")
462	3	.with_unit("s")
463	3	.with_boundaries(vec![
464	3	// Sub-second range
465	3	0.001, // 1ms
466	3	0.01, // 10ms
467	3	0.1, // 100ms
468	3	0.5, // 500ms
469	3	1.0, // 1s
470	3	// Multi-second range
471	3	2.0, // 2s
472	3	5.0, // 5s
473	3	10.0, // 10s
474	3	30.0, // 30s
475	3	60.0, // 1 minute
476	3	120.0, // 2 minutes
477	3	300.0, // 5 minutes
478	3	600.0, // 10 minutes
479	3	1800.0, // 30 minutes
480	3	3600.0, // 1 hour
481	3	])
482	3	.build(),
483	3
484	3	execution_total_duration: meter
485	3	.f64_histogram("execution.total.duration")
486	3	.with_description(
487	3	"Total duration of action execution from submission to completion in seconds",
488	3	)
489	3	.with_unit("s")
490	3	.with_boundaries(vec![
491	3	// Sub-second range
492	3	0.01, // 10ms
493	3	0.1, // 100ms
494	3	0.5, // 500ms
495	3	1.0, // 1s
496	3	// Multi-second range
497	3	5.0, // 5s
498	3	10.0, // 10s
499	3	30.0, // 30s
500	3	60.0, // 1 minute
501	3	300.0, // 5 minutes
502	3	600.0, // 10 minutes
503	3	1800.0, // 30 minutes
504	3	3600.0, // 1 hour
505	3	7200.0, // 2 hours
506	3	])
507	3	.build(),
508	3
509	3	execution_queue_time: meter
510	3	.f64_histogram("execution.queue.time")
511	3	.with_description("Time spent waiting in queue before execution in seconds")
512	3	.with_unit("s")
513	3	.with_boundaries(vec![
514	3	0.001, // 1ms
515	3	0.01, // 10ms
516	3	0.1, // 100ms
517	3	0.5, // 500ms
518	3	1.0, // 1s
519	3	2.0, // 2s
520	3	5.0, // 5s
521	3	10.0, // 10s
522	3	30.0, // 30s
523	3	60.0, // 1 minute
524	3	300.0, // 5 minutes
525	3	600.0, // 10 minutes
526	3	])
527	3	.build(),
528	3
529	3	execution_active_count: meter
530	3	.i64_up_down_counter("execution.active.count")
531	3	.with_description("Number of actions currently in each stage")
532	3	.with_unit("{action}")
533	3	.build(),
534	3
535	3	execution_completed_count: meter
536	3	.u64_counter("execution.completed.count")
537	3	.with_description("Total number of completed executions by result")
538	3	.with_unit("{action}")
539	3	.build(),
540	3
541	3	execution_stage_transitions: meter
542	3	.u64_counter("execution.stage.transitions")
543	3	.with_description("Number of stage transitions")
544	3	.with_unit("{transition}")
545	3	.build(),
546	3
547	3	execution_output_size: meter
548	3	.u64_histogram("execution.output.size")
549	3	.with_description("Size of execution outputs in bytes")
550	3	.with_unit("By")
551	3	.with_boundaries(vec![
552	3	1_024.0, // 1KB
553	3	10_240.0, // 10KB
554	3	102_400.0, // 100KB
555	3	1_048_576.0, // 1MB
556	3	10_485_760.0, // 10MB
557	3	104_857_600.0, // 100MB
558	3	1_073_741_824.0, // 1GB
559	3	10_737_418_240.0, // 10GB
560	3	])
561	3	.build(),
562	3
563	3	execution_cpu_time: meter
564	3	.f64_histogram("execution.cpu.time")
565	3	.with_description("CPU time consumed by action execution in seconds")
566	3	.with_unit("s")
567	3	.with_boundaries(vec![
568	3	0.01, // 10ms
569	3	0.1, // 100ms
570	3	1.0, // 1s
571	3	10.0, // 10s
572	3	60.0, // 1 minute
573	3	300.0, // 5 minutes
574	3	600.0, // 10 minutes
575	3	1800.0, // 30 minutes
576	3	3600.0, // 1 hour
577	3	])
578	3	.build(),
579	3
580	3	execution_memory_usage: meter
581	3	.u64_histogram("execution.memory.usage")
582	3	.with_description("Peak memory usage during execution in bytes")
583	3	.with_unit("By")
584	3	.with_boundaries(vec![
585	3	1_048_576.0, // 1MB
586	3	10_485_760.0, // 10MB
587	3	104_857_600.0, // 100MB
588	3	524_288_000.0, // 500MB
589	3	1_073_741_824.0, // 1GB
590	3	5_368_709_120.0, // 5GB
591	3	10_737_418_240.0, // 10GB
592	3	53_687_091_200.0, // 50GB
593	3	])
594	3	.build(),
595	3
596	3	execution_retry_count: meter
597	3	.u64_counter("execution.retry.count")
598	3	.with_description("Number of execution retries")
599	3	.with_unit("{retry}")
600	3	.build(),
601	3	}
602	3	});
603
604		/// OpenTelemetry metrics instruments for remote execution monitoring.
605		#[derive(Debug)]
606		pub struct ExecutionMetrics {
607		/// Histogram of stage durations in seconds
608		pub execution_stage_duration: metrics::Histogram<f64>,
609		/// Histogram of total execution durations in seconds
610		pub execution_total_duration: metrics::Histogram<f64>,
611		/// Histogram of queue wait times in seconds
612		pub execution_queue_time: metrics::Histogram<f64>,
613		/// Current number of actions in each stage
614		pub execution_active_count: metrics::UpDownCounter<i64>,
615		/// Total number of completed executions
616		pub execution_completed_count: metrics::Counter<u64>,
617		/// Number of stage transitions
618		pub execution_stage_transitions: metrics::Counter<u64>,
619		/// Histogram of output sizes in bytes
620		pub execution_output_size: metrics::Histogram<u64>,
621		/// Histogram of CPU time in seconds
622		pub execution_cpu_time: metrics::Histogram<f64>,
623		/// Histogram of peak memory usage in bytes
624		pub execution_memory_usage: metrics::Histogram<u64>,
625		/// Counter for execution retries
626		pub execution_retry_count: metrics::Counter<u64>,
627		}
628
629		/// Helper function to create attributes for execution metrics
630		#[must_use]
631	111	pub fn make_execution_attributes(
632	111	instance_name: &str,
633	111	worker_id: Option<&str>,
634	111	priority: Option<i32>,
635	111	) -> Vec<KeyValue> {
636	111	let mut attrs = vec![KeyValue::new(EXECUTION_INSTANCE, instance_name.to_string())];
637
638	111	if let Some( worker_id38 ) = worker_id { Branch (638:12): [True: 38, False: 73] Branch (638:12): [Folded - Ignored]
639	38	attrs.push(KeyValue::new(EXECUTION_WORKER_ID, worker_id.to_string()));
640	73	}
641
642	111	if let Some(priority) = priority { Branch (642:12): [True: 111, False: 0] Branch (642:12): [Folded - Ignored]
643	111	attrs.push(KeyValue::new(EXECUTION_PRIORITY, i64::from(priority)));
644	111	}0
645
646	111	attrs
647	111	}