/build/source/nativelink-util/src/metrics.rs
Line | Count | Source |
1 | | // Copyright 2025 The NativeLink Authors. All rights reserved. |
2 | | // |
3 | | // Licensed under the Business Source License, Version 1.1 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may request a copy of the License by emailing contact@nativelink.com. |
6 | | // |
7 | | // Use of this module requires an enterprise license agreement, which can be |
8 | | // attained by emailing contact@nativelink.com or signing up for Nativelink |
9 | | // Cloud at app.nativelink.com. |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | |
17 | | use std::sync::LazyLock; |
18 | | |
19 | | use opentelemetry::{InstrumentationScope, KeyValue, Value, global, metrics}; |
20 | | |
21 | | use crate::action_messages::ActionStage; |
22 | | |
// Metric attribute keys for cache operations.

/// Attribute key for the cache type.
pub const CACHE_TYPE: &str = "cache.type";
/// Attribute key for the name of the cache operation being measured.
pub const CACHE_OPERATION: &str = "cache.operation.name";
/// Attribute key for the outcome of the cache operation being measured.
pub const CACHE_RESULT: &str = "cache.operation.result";
27 | | |
// Metric attribute keys for remote execution operations.

/// Attribute key for the execution stage.
pub const EXECUTION_STAGE: &str = "execution.stage";
/// Attribute key for the execution result.
pub const EXECUTION_RESULT: &str = "execution.result";
/// Attribute key for the instance name.
pub const EXECUTION_INSTANCE: &str = "execution.instance";
/// Attribute key for the action priority.
pub const EXECUTION_PRIORITY: &str = "execution.priority";
/// Attribute key for the worker identifier.
pub const EXECUTION_WORKER_ID: &str = "execution.worker_id";
/// Attribute key for the exit code.
pub const EXECUTION_EXIT_CODE: &str = "execution.exit_code";
/// Attribute key for the action digest.
pub const EXECUTION_ACTION_DIGEST: &str = "execution.action_digest";
36 | | |
/// Kind of cache operation a measurement is attributed to.
#[derive(Clone, Copy, Debug)]
pub enum CacheOperationName {
    /// Retrieving data (get, peek, contains, ...).
    Read,
    /// Storing data (insert, update, replace, ...).
    Write,
    /// Removing data on explicit request.
    Delete,
    /// Removing data as part of automatic maintenance (evictions, TTL
    /// cleanup, ...).
    Evict,
}
49 | | |
50 | | impl From<CacheOperationName> for Value { |
51 | 33 | fn from(op: CacheOperationName) -> Self { |
52 | 33 | match op { |
53 | 12 | CacheOperationName::Read => Self::from("read"), |
54 | 6 | CacheOperationName::Write => Self::from("write"), |
55 | 9 | CacheOperationName::Delete => Self::from("delete"), |
56 | 6 | CacheOperationName::Evict => Self::from("evict"), |
57 | | } |
58 | 33 | } |
59 | | } |
60 | | |
/// Outcome of a cache operation.
///
/// Which variants apply depends on the operation:
/// - Read: `Hit`, `Miss` and `Expired` describe data availability.
/// - Write/Delete/Evict: `Success` and `Error` describe completion status.
#[derive(Clone, Copy, Debug)]
pub enum CacheOperationResult {
    /// The data was found and is valid (Read operations).
    Hit,
    /// The data was not found (Read operations).
    Miss,
    /// The data was found but is invalid or expired (Read operations).
    Expired,
    /// The operation completed successfully (Write/Delete/Evict operations).
    Success,
    /// The operation failed (any operation type).
    Error,
}
79 | | |
80 | | impl From<CacheOperationResult> for Value { |
81 | 33 | fn from(result: CacheOperationResult) -> Self { |
82 | 33 | match result { |
83 | 3 | CacheOperationResult::Hit => Self::from("hit"), |
84 | 6 | CacheOperationResult::Miss => Self::from("miss"), |
85 | 6 | CacheOperationResult::Expired => Self::from("expired"), |
86 | 9 | CacheOperationResult::Success => Self::from("success"), |
87 | 9 | CacheOperationResult::Error => Self::from("error"), |
88 | | } |
89 | 33 | } |
90 | | } |
91 | | |
/// Stage of a remote execution, as reported in metrics.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExecutionStage {
    /// The stage is not known.
    Unknown,
    /// The cache is being checked for an existing result.
    CacheCheck,
    /// The action is queued and waiting for execution.
    Queued,
    /// A worker is executing the action.
    Executing,
    /// Execution of the action has completed.
    Completed,
}
106 | | |
107 | | impl From<ExecutionStage> for Value { |
108 | 172 | fn from(stage: ExecutionStage) -> Self { |
109 | 172 | match stage { |
110 | 1 | ExecutionStage::Unknown => Self::from("unknown"), |
111 | 1 | ExecutionStage::CacheCheck => Self::from("cache_check"), |
112 | 74 | ExecutionStage::Queued => Self::from("queued"), |
113 | 76 | ExecutionStage::Executing => Self::from("executing"), |
114 | 20 | ExecutionStage::Completed => Self::from("completed"), |
115 | | } |
116 | 172 | } |
117 | | } |
118 | | |
119 | | impl From<ActionStage> for ExecutionStage { |
120 | 5 | fn from(stage: ActionStage) -> Self { |
121 | 5 | match stage { |
122 | 1 | ActionStage::Unknown => Self::Unknown, |
123 | 1 | ActionStage::CacheCheck => Self::CacheCheck, |
124 | 1 | ActionStage::Queued => Self::Queued, |
125 | 1 | ActionStage::Executing => Self::Executing, |
126 | 1 | ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => Self::Completed, |
127 | | } |
128 | 5 | } |
129 | | } |
130 | | |
131 | | impl From<&ActionStage> for ExecutionStage { |
132 | 10.1k | fn from(stage: &ActionStage) -> Self { |
133 | 10.1k | match stage { |
134 | 1 | ActionStage::Unknown => Self::Unknown, |
135 | 1 | ActionStage::CacheCheck => Self::CacheCheck, |
136 | 48 | ActionStage::Queued => Self::Queued, |
137 | 76 | ActionStage::Executing => Self::Executing, |
138 | 10.0k | ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => Self::Completed, |
139 | | } |
140 | 10.1k | } |
141 | | } |
142 | | |
/// Outcome of a remote execution, as reported in metrics.
#[derive(Clone, Copy, Debug)]
pub enum ExecutionResult {
    /// The execution completed successfully.
    Success,
    /// The execution failed.
    Failure,
    /// The execution was cancelled.
    Cancelled,
    /// The execution timed out.
    Timeout,
    /// The result was found in the cache.
    CacheHit,
}
157 | | |
158 | | impl From<ExecutionResult> for Value { |
159 | 20 | fn from(result: ExecutionResult) -> Self { |
160 | 20 | match result { |
161 | 11 | ExecutionResult::Success => Self::from("success"), |
162 | 6 | ExecutionResult::Failure => Self::from("failure"), |
163 | 1 | ExecutionResult::Cancelled => Self::from("cancelled"), |
164 | 1 | ExecutionResult::Timeout => Self::from("timeout"), |
165 | 1 | ExecutionResult::CacheHit => Self::from("cache_hit"), |
166 | | } |
167 | 20 | } |
168 | | } |
169 | | |
170 | | /// Pre-allocated attribute combinations for efficient cache metrics collection. |
171 | | /// |
172 | | /// Avoids runtime allocation by pre-computing common attribute combinations |
173 | | /// for cache operations and results. |
174 | | #[derive(Debug)] |
175 | | pub struct CacheMetricAttrs { |
176 | | // Read operation attributes |
177 | | read_hit: Vec<KeyValue>, |
178 | | read_miss: Vec<KeyValue>, |
179 | | read_expired: Vec<KeyValue>, |
180 | | read_error: Vec<KeyValue>, |
181 | | |
182 | | // Write operation attributes |
183 | | write_success: Vec<KeyValue>, |
184 | | write_error: Vec<KeyValue>, |
185 | | |
186 | | // Delete operation attributes |
187 | | delete_success: Vec<KeyValue>, |
188 | | delete_miss: Vec<KeyValue>, |
189 | | delete_error: Vec<KeyValue>, |
190 | | |
191 | | // Evict operation attributes |
192 | | evict_success: Vec<KeyValue>, |
193 | | evict_expired: Vec<KeyValue>, |
194 | | } |
195 | | |
196 | | impl CacheMetricAttrs { |
197 | | /// Creates a new set of pre-computed attributes. |
198 | | /// |
199 | | /// The `base_attrs` are included in all attribute combinations (e.g., cache |
200 | | /// type, instance ID). |
201 | | #[must_use] |
202 | 3 | pub fn new(base_attrs: &[KeyValue]) -> Self { |
203 | 33 | let make_attrs3 = |op: CacheOperationName, result: CacheOperationResult| { |
204 | 33 | let mut attrs = base_attrs.to_vec(); |
205 | 33 | attrs.push(KeyValue::new(CACHE_OPERATION, op)); |
206 | 33 | attrs.push(KeyValue::new(CACHE_RESULT, result)); |
207 | 33 | attrs |
208 | 33 | }; |
209 | | |
210 | 3 | Self { |
211 | 3 | read_hit: make_attrs(CacheOperationName::Read, CacheOperationResult::Hit), |
212 | 3 | read_miss: make_attrs(CacheOperationName::Read, CacheOperationResult::Miss), |
213 | 3 | read_expired: make_attrs(CacheOperationName::Read, CacheOperationResult::Expired), |
214 | 3 | read_error: make_attrs(CacheOperationName::Read, CacheOperationResult::Error), |
215 | 3 | |
216 | 3 | write_success: make_attrs(CacheOperationName::Write, CacheOperationResult::Success), |
217 | 3 | write_error: make_attrs(CacheOperationName::Write, CacheOperationResult::Error), |
218 | 3 | |
219 | 3 | delete_success: make_attrs(CacheOperationName::Delete, CacheOperationResult::Success), |
220 | 3 | delete_miss: make_attrs(CacheOperationName::Delete, CacheOperationResult::Miss), |
221 | 3 | delete_error: make_attrs(CacheOperationName::Delete, CacheOperationResult::Error), |
222 | 3 | |
223 | 3 | evict_success: make_attrs(CacheOperationName::Evict, CacheOperationResult::Success), |
224 | 3 | evict_expired: make_attrs(CacheOperationName::Evict, CacheOperationResult::Expired), |
225 | 3 | } |
226 | 3 | } |
227 | | |
228 | | // Attribute accessors |
229 | | #[must_use] |
230 | 6 | pub fn read_hit(&self) -> &[KeyValue] { |
231 | 6 | &self.read_hit |
232 | 6 | } |
233 | | #[must_use] |
234 | 2 | pub fn read_miss(&self) -> &[KeyValue] { |
235 | 2 | &self.read_miss |
236 | 2 | } |
237 | | #[must_use] |
238 | 0 | pub fn read_expired(&self) -> &[KeyValue] { |
239 | 0 | &self.read_expired |
240 | 0 | } |
241 | | #[must_use] |
242 | 0 | pub fn read_error(&self) -> &[KeyValue] { |
243 | 0 | &self.read_error |
244 | 0 | } |
245 | | #[must_use] |
246 | 4 | pub fn write_success(&self) -> &[KeyValue] { |
247 | 4 | &self.write_success |
248 | 4 | } |
249 | | #[must_use] |
250 | 0 | pub fn write_error(&self) -> &[KeyValue] { |
251 | 0 | &self.write_error |
252 | 0 | } |
253 | | #[must_use] |
254 | 0 | pub fn delete_success(&self) -> &[KeyValue] { |
255 | 0 | &self.delete_success |
256 | 0 | } |
257 | | #[must_use] |
258 | 0 | pub fn delete_miss(&self) -> &[KeyValue] { |
259 | 0 | &self.delete_miss |
260 | 0 | } |
261 | | #[must_use] |
262 | 0 | pub fn delete_error(&self) -> &[KeyValue] { |
263 | 0 | &self.delete_error |
264 | 0 | } |
265 | | #[must_use] |
266 | 0 | pub fn evict_success(&self) -> &[KeyValue] { |
267 | 0 | &self.evict_success |
268 | 0 | } |
269 | | #[must_use] |
270 | 0 | pub fn evict_expired(&self) -> &[KeyValue] { |
271 | 0 | &self.evict_expired |
272 | 0 | } |
273 | | } |
274 | | |
275 | | /// Pre-allocated attribute combinations for efficient remote execution metrics collection. |
276 | | #[derive(Debug)] |
277 | | pub struct ExecutionMetricAttrs { |
278 | | // Stage transition attributes |
279 | | unknown: Vec<KeyValue>, |
280 | | cache_check: Vec<KeyValue>, |
281 | | queued: Vec<KeyValue>, |
282 | | executing: Vec<KeyValue>, |
283 | | completed_success: Vec<KeyValue>, |
284 | | completed_failure: Vec<KeyValue>, |
285 | | completed_cancelled: Vec<KeyValue>, |
286 | | completed_timeout: Vec<KeyValue>, |
287 | | completed_cache_hit: Vec<KeyValue>, |
288 | | } |
289 | | |
290 | | impl ExecutionMetricAttrs { |
291 | | /// Creates a new set of pre-computed attributes. |
292 | | /// |
293 | | /// The `base_attrs` are included in all attribute combinations (e.g., instance |
294 | | /// name, worker ID). |
295 | | #[must_use] |
296 | 1 | pub fn new(base_attrs: &[KeyValue]) -> Self { |
297 | 9 | let make_attrs1 = |stage: ExecutionStage, result: Option<ExecutionResult>| { |
298 | 9 | let mut attrs = base_attrs.to_vec(); |
299 | 9 | attrs.push(KeyValue::new(EXECUTION_STAGE, stage)); |
300 | 9 | if let Some(result5 ) = result { |
301 | 5 | attrs.push(KeyValue::new(EXECUTION_RESULT, result)); |
302 | 5 | }4 |
303 | 9 | attrs |
304 | 9 | }; |
305 | | |
306 | 1 | Self { |
307 | 1 | unknown: make_attrs(ExecutionStage::Unknown, None), |
308 | 1 | cache_check: make_attrs(ExecutionStage::CacheCheck, None), |
309 | 1 | queued: make_attrs(ExecutionStage::Queued, None), |
310 | 1 | executing: make_attrs(ExecutionStage::Executing, None), |
311 | 1 | completed_success: make_attrs( |
312 | 1 | ExecutionStage::Completed, |
313 | 1 | Some(ExecutionResult::Success), |
314 | 1 | ), |
315 | 1 | completed_failure: make_attrs( |
316 | 1 | ExecutionStage::Completed, |
317 | 1 | Some(ExecutionResult::Failure), |
318 | 1 | ), |
319 | 1 | completed_cancelled: make_attrs( |
320 | 1 | ExecutionStage::Completed, |
321 | 1 | Some(ExecutionResult::Cancelled), |
322 | 1 | ), |
323 | 1 | completed_timeout: make_attrs( |
324 | 1 | ExecutionStage::Completed, |
325 | 1 | Some(ExecutionResult::Timeout), |
326 | 1 | ), |
327 | 1 | completed_cache_hit: make_attrs( |
328 | 1 | ExecutionStage::Completed, |
329 | 1 | Some(ExecutionResult::CacheHit), |
330 | 1 | ), |
331 | 1 | } |
332 | 1 | } |
333 | | |
334 | | // Attribute accessors |
335 | | #[must_use] |
336 | 0 | pub fn unknown(&self) -> &[KeyValue] { |
337 | 0 | &self.unknown |
338 | 0 | } |
339 | | #[must_use] |
340 | 0 | pub fn cache_check(&self) -> &[KeyValue] { |
341 | 0 | &self.cache_check |
342 | 0 | } |
343 | | #[must_use] |
344 | 0 | pub fn queued(&self) -> &[KeyValue] { |
345 | 0 | &self.queued |
346 | 0 | } |
347 | | #[must_use] |
348 | 0 | pub fn executing(&self) -> &[KeyValue] { |
349 | 0 | &self.executing |
350 | 0 | } |
351 | | #[must_use] |
352 | 1 | pub fn completed_success(&self) -> &[KeyValue] { |
353 | 1 | &self.completed_success |
354 | 1 | } |
355 | | #[must_use] |
356 | 0 | pub fn completed_failure(&self) -> &[KeyValue] { |
357 | 0 | &self.completed_failure |
358 | 0 | } |
359 | | #[must_use] |
360 | 0 | pub fn completed_cancelled(&self) -> &[KeyValue] { |
361 | 0 | &self.completed_cancelled |
362 | 0 | } |
363 | | #[must_use] |
364 | 0 | pub fn completed_timeout(&self) -> &[KeyValue] { |
365 | 0 | &self.completed_timeout |
366 | 0 | } |
367 | | #[must_use] |
368 | 0 | pub fn completed_cache_hit(&self) -> &[KeyValue] { |
369 | 0 | &self.completed_cache_hit |
370 | 0 | } |
371 | | } |
372 | | |
373 | | /// Global cache metrics instruments. |
374 | 2 | pub static CACHE_METRICS: LazyLock<CacheMetrics> = LazyLock::new(|| { |
375 | 2 | let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); |
376 | | |
377 | 2 | CacheMetrics { |
378 | 2 | cache_operation_duration: meter |
379 | 2 | .f64_histogram("cache.operation.duration") |
380 | 2 | .with_description("Duration of cache operations in milliseconds") |
381 | 2 | .with_unit("ms") |
382 | 2 | // The range of these is quite large as a cache might be backed by |
383 | 2 | // memory, a filesystem, or network storage. The current values were |
384 | 2 | // determined empirically and might need adjustment. |
385 | 2 | .with_boundaries(vec![ |
386 | 2 | // Microsecond range |
387 | 2 | 0.001, // 1μs |
388 | 2 | 0.005, // 5μs |
389 | 2 | 0.01, // 10μs |
390 | 2 | 0.05, // 50μs |
391 | 2 | 0.1, // 100μs |
392 | 2 | // Sub-millisecond range |
393 | 2 | 0.2, // 200μs |
394 | 2 | 0.5, // 500μs |
395 | 2 | 1.0, // 1ms |
396 | 2 | // Low millisecond range |
397 | 2 | 2.0, // 2ms |
398 | 2 | 5.0, // 5ms |
399 | 2 | 10.0, // 10ms |
400 | 2 | 20.0, // 20ms |
401 | 2 | 50.0, // 50ms |
402 | 2 | 100.0, // 100ms |
403 | 2 | // Higher latency range |
404 | 2 | 200.0, // 200ms |
405 | 2 | 500.0, // 500ms |
406 | 2 | 1000.0, // 1 second |
407 | 2 | 2000.0, // 2 seconds |
408 | 2 | 5000.0, // 5 seconds |
409 | 2 | ]) |
410 | 2 | .build(), |
411 | 2 | |
412 | 2 | cache_operations: meter |
413 | 2 | .u64_counter("cache.operations") |
414 | 2 | .with_description("Total cache operations by type and result") |
415 | 2 | .build(), |
416 | 2 | |
417 | 2 | cache_io: meter |
418 | 2 | .u64_counter("cache.io") |
419 | 2 | .with_description("Total bytes processed by cache operations") |
420 | 2 | .with_unit("By") |
421 | 2 | .build(), |
422 | 2 | |
423 | 2 | cache_size: meter |
424 | 2 | .i64_up_down_counter("cache.size") |
425 | 2 | .with_description("Current total size of cached data") |
426 | 2 | .with_unit("By") |
427 | 2 | .build(), |
428 | 2 | |
429 | 2 | cache_entries: meter |
430 | 2 | .i64_up_down_counter("cache.entries") |
431 | 2 | .with_description("Current number of cached entries") |
432 | 2 | .with_unit("{entry}") |
433 | 2 | .build(), |
434 | 2 | |
435 | 2 | cache_entry_size: meter |
436 | 2 | .u64_histogram("cache.item.size") |
437 | 2 | .with_description("Size distribution of cached entries") |
438 | 2 | .with_unit("By") |
439 | 2 | .build(), |
440 | 2 | } |
441 | 2 | }); |
442 | | |
443 | | /// OpenTelemetry metrics instruments for cache monitoring. |
444 | | #[derive(Debug)] |
445 | | pub struct CacheMetrics { |
446 | | /// Histogram of cache operation durations in milliseconds |
447 | | pub cache_operation_duration: metrics::Histogram<f64>, |
448 | | /// Counter of cache operations by type and result |
449 | | pub cache_operations: metrics::Counter<u64>, |
450 | | /// Counter of bytes read/written during cache operations |
451 | | pub cache_io: metrics::Counter<u64>, |
452 | | /// Current total size of all cached data in bytes |
453 | | pub cache_size: metrics::UpDownCounter<i64>, |
454 | | /// Current number of entries in cache |
455 | | pub cache_entries: metrics::UpDownCounter<i64>, |
456 | | /// Histogram of individual cache entry sizes in bytes |
457 | | pub cache_entry_size: metrics::Histogram<u64>, |
458 | | } |
459 | | |
460 | | /// Global remote execution metrics instruments. |
461 | 3 | pub static EXECUTION_METRICS: LazyLock<ExecutionMetrics> = LazyLock::new(|| { |
462 | 3 | let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); |
463 | | |
464 | 3 | ExecutionMetrics { |
465 | 3 | execution_stage_duration: meter |
466 | 3 | .f64_histogram("execution.stage.duration") |
467 | 3 | .with_description("Duration of each execution stage in seconds") |
468 | 3 | .with_unit("s") |
469 | 3 | .with_boundaries(vec![ |
470 | 3 | // Sub-second range |
471 | 3 | 0.001, // 1ms |
472 | 3 | 0.01, // 10ms |
473 | 3 | 0.1, // 100ms |
474 | 3 | 0.5, // 500ms |
475 | 3 | 1.0, // 1s |
476 | 3 | // Multi-second range |
477 | 3 | 2.0, // 2s |
478 | 3 | 5.0, // 5s |
479 | 3 | 10.0, // 10s |
480 | 3 | 30.0, // 30s |
481 | 3 | 60.0, // 1 minute |
482 | 3 | 120.0, // 2 minutes |
483 | 3 | 300.0, // 5 minutes |
484 | 3 | 600.0, // 10 minutes |
485 | 3 | 1800.0, // 30 minutes |
486 | 3 | 3600.0, // 1 hour |
487 | 3 | ]) |
488 | 3 | .build(), |
489 | 3 | |
490 | 3 | execution_total_duration: meter |
491 | 3 | .f64_histogram("execution.total.duration") |
492 | 3 | .with_description( |
493 | 3 | "Total duration of action execution from submission to completion in seconds", |
494 | 3 | ) |
495 | 3 | .with_unit("s") |
496 | 3 | .with_boundaries(vec![ |
497 | 3 | // Sub-second range |
498 | 3 | 0.01, // 10ms |
499 | 3 | 0.1, // 100ms |
500 | 3 | 0.5, // 500ms |
501 | 3 | 1.0, // 1s |
502 | 3 | // Multi-second range |
503 | 3 | 5.0, // 5s |
504 | 3 | 10.0, // 10s |
505 | 3 | 30.0, // 30s |
506 | 3 | 60.0, // 1 minute |
507 | 3 | 300.0, // 5 minutes |
508 | 3 | 600.0, // 10 minutes |
509 | 3 | 1800.0, // 30 minutes |
510 | 3 | 3600.0, // 1 hour |
511 | 3 | 7200.0, // 2 hours |
512 | 3 | ]) |
513 | 3 | .build(), |
514 | 3 | |
515 | 3 | execution_queue_time: meter |
516 | 3 | .f64_histogram("execution.queue.time") |
517 | 3 | .with_description("Time spent waiting in queue before execution in seconds") |
518 | 3 | .with_unit("s") |
519 | 3 | .with_boundaries(vec![ |
520 | 3 | 0.001, // 1ms |
521 | 3 | 0.01, // 10ms |
522 | 3 | 0.1, // 100ms |
523 | 3 | 0.5, // 500ms |
524 | 3 | 1.0, // 1s |
525 | 3 | 2.0, // 2s |
526 | 3 | 5.0, // 5s |
527 | 3 | 10.0, // 10s |
528 | 3 | 30.0, // 30s |
529 | 3 | 60.0, // 1 minute |
530 | 3 | 300.0, // 5 minutes |
531 | 3 | 600.0, // 10 minutes |
532 | 3 | ]) |
533 | 3 | .build(), |
534 | 3 | |
535 | 3 | execution_active_count: meter |
536 | 3 | .i64_up_down_counter("execution.active.count") |
537 | 3 | .with_description("Number of actions currently in each stage") |
538 | 3 | .with_unit("{action}") |
539 | 3 | .build(), |
540 | 3 | |
541 | 3 | execution_completed_count: meter |
542 | 3 | .u64_counter("execution.completed.count") |
543 | 3 | .with_description("Total number of completed executions by result") |
544 | 3 | .with_unit("{action}") |
545 | 3 | .build(), |
546 | 3 | |
547 | 3 | execution_stage_transitions: meter |
548 | 3 | .u64_counter("execution.stage.transitions") |
549 | 3 | .with_description("Number of stage transitions") |
550 | 3 | .with_unit("{transition}") |
551 | 3 | .build(), |
552 | 3 | |
553 | 3 | execution_output_size: meter |
554 | 3 | .u64_histogram("execution.output.size") |
555 | 3 | .with_description("Size of execution outputs in bytes") |
556 | 3 | .with_unit("By") |
557 | 3 | .with_boundaries(vec![ |
558 | 3 | 1_024.0, // 1KB |
559 | 3 | 10_240.0, // 10KB |
560 | 3 | 102_400.0, // 100KB |
561 | 3 | 1_048_576.0, // 1MB |
562 | 3 | 10_485_760.0, // 10MB |
563 | 3 | 104_857_600.0, // 100MB |
564 | 3 | 1_073_741_824.0, // 1GB |
565 | 3 | 10_737_418_240.0, // 10GB |
566 | 3 | ]) |
567 | 3 | .build(), |
568 | 3 | |
569 | 3 | execution_cpu_time: meter |
570 | 3 | .f64_histogram("execution.cpu.time") |
571 | 3 | .with_description("CPU time consumed by action execution in seconds") |
572 | 3 | .with_unit("s") |
573 | 3 | .with_boundaries(vec![ |
574 | 3 | 0.01, // 10ms |
575 | 3 | 0.1, // 100ms |
576 | 3 | 1.0, // 1s |
577 | 3 | 10.0, // 10s |
578 | 3 | 60.0, // 1 minute |
579 | 3 | 300.0, // 5 minutes |
580 | 3 | 600.0, // 10 minutes |
581 | 3 | 1800.0, // 30 minutes |
582 | 3 | 3600.0, // 1 hour |
583 | 3 | ]) |
584 | 3 | .build(), |
585 | 3 | |
586 | 3 | execution_memory_usage: meter |
587 | 3 | .u64_histogram("execution.memory.usage") |
588 | 3 | .with_description("Peak memory usage during execution in bytes") |
589 | 3 | .with_unit("By") |
590 | 3 | .with_boundaries(vec![ |
591 | 3 | 1_048_576.0, // 1MB |
592 | 3 | 10_485_760.0, // 10MB |
593 | 3 | 104_857_600.0, // 100MB |
594 | 3 | 524_288_000.0, // 500MB |
595 | 3 | 1_073_741_824.0, // 1GB |
596 | 3 | 5_368_709_120.0, // 5GB |
597 | 3 | 10_737_418_240.0, // 10GB |
598 | 3 | 53_687_091_200.0, // 50GB |
599 | 3 | ]) |
600 | 3 | .build(), |
601 | 3 | |
602 | 3 | execution_retry_count: meter |
603 | 3 | .u64_counter("execution.retry.count") |
604 | 3 | .with_description("Number of execution retries") |
605 | 3 | .with_unit("{retry}") |
606 | 3 | .build(), |
607 | 3 | } |
608 | 3 | }); |
609 | | |
610 | | /// OpenTelemetry metrics instruments for remote execution monitoring. |
611 | | #[derive(Debug)] |
612 | | pub struct ExecutionMetrics { |
613 | | /// Histogram of stage durations in seconds |
614 | | pub execution_stage_duration: metrics::Histogram<f64>, |
615 | | /// Histogram of total execution durations in seconds |
616 | | pub execution_total_duration: metrics::Histogram<f64>, |
617 | | /// Histogram of queue wait times in seconds |
618 | | pub execution_queue_time: metrics::Histogram<f64>, |
619 | | /// Current number of actions in each stage |
620 | | pub execution_active_count: metrics::UpDownCounter<i64>, |
621 | | /// Total number of completed executions |
622 | | pub execution_completed_count: metrics::Counter<u64>, |
623 | | /// Number of stage transitions |
624 | | pub execution_stage_transitions: metrics::Counter<u64>, |
625 | | /// Histogram of output sizes in bytes |
626 | | pub execution_output_size: metrics::Histogram<u64>, |
627 | | /// Histogram of CPU time in seconds |
628 | | pub execution_cpu_time: metrics::Histogram<f64>, |
629 | | /// Histogram of peak memory usage in bytes |
630 | | pub execution_memory_usage: metrics::Histogram<u64>, |
631 | | /// Counter for execution retries |
632 | | pub execution_retry_count: metrics::Counter<u64>, |
633 | | } |
634 | | |
635 | | /// Helper function to create attributes for execution metrics |
636 | | #[must_use] |
637 | 121 | pub fn make_execution_attributes( |
638 | 121 | instance_name: &str, |
639 | 121 | worker_id: Option<&str>, |
640 | 121 | priority: Option<i32>, |
641 | 121 | ) -> Vec<KeyValue> { |
642 | 121 | let mut attrs = vec![KeyValue::new(EXECUTION_INSTANCE, instance_name.to_string())]; |
643 | | |
644 | 121 | if let Some(worker_id41 ) = worker_id { |
645 | 41 | attrs.push(KeyValue::new(EXECUTION_WORKER_ID, worker_id.to_string())); |
646 | 80 | } |
647 | | |
648 | 121 | if let Some(priority) = priority { |
649 | 121 | attrs.push(KeyValue::new(EXECUTION_PRIORITY, i64::from(priority))); |
650 | 121 | }0 |
651 | | |
652 | 121 | attrs |
653 | 121 | } |