Coverage Report

Created: 2025-05-30 16:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/build/source/nativelink-scheduler/src/simple_scheduler_state_manager.rs
Line
Count
Source
1
// Copyright 2024 The NativeLink Authors. All rights reserved.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//    http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
use core::ops::Bound;
16
use core::time::Duration;
17
use std::string::ToString;
18
use std::sync::{Arc, Weak};
19
use std::time::SystemTime;
20
21
use async_lock::Mutex;
22
use async_trait::async_trait;
23
use futures::{StreamExt, TryStreamExt, stream};
24
use nativelink_error::{Code, Error, ResultExt, make_err};
25
use nativelink_metric::MetricsComponent;
26
use nativelink_util::action_messages::{
27
    ActionInfo, ActionResult, ActionStage, ActionState, ActionUniqueQualifier, ExecutionMetadata,
28
    OperationId, WorkerId,
29
};
30
use nativelink_util::instant_wrapper::InstantWrapper;
31
use nativelink_util::known_platform_property_provider::KnownPlatformPropertyProvider;
32
use nativelink_util::operation_state_manager::{
33
    ActionStateResult, ActionStateResultStream, ClientStateManager, MatchingEngineStateManager,
34
    OperationFilter, OperationStageFlags, OrderDirection, UpdateOperationType, WorkerStateManager,
35
};
36
use nativelink_util::origin_event::OriginMetadata;
37
use tracing::{info, warn};
38
39
use super::awaited_action_db::{
40
    AwaitedAction, AwaitedActionDb, AwaitedActionSubscriber, SortedAwaitedActionState,
41
};
42
43
/// Maximum number of times an update to the database
44
/// can fail before giving up.
45
const MAX_UPDATE_RETRIES: usize = 5;
46
47
/// Simple struct that implements the `ActionStateResult` trait and always returns an error.
48
struct ErrorActionStateResult(Error);
49
50
#[async_trait]
51
impl ActionStateResult for ErrorActionStateResult {
52
0
    async fn as_state(&self) -> Result<(Arc<ActionState>, Option<OriginMetadata>), Error> {
53
0
        Err(self.0.clone())
54
0
    }
55
56
0
    async fn changed(&mut self) -> Result<(Arc<ActionState>, Option<OriginMetadata>), Error> {
57
0
        Err(self.0.clone())
58
0
    }
59
60
0
    async fn as_action_info(&self) -> Result<(Arc<ActionInfo>, Option<OriginMetadata>), Error> {
61
0
        Err(self.0.clone())
62
0
    }
63
}
64
65
struct ClientActionStateResult<U, T, I, NowFn>
66
where
67
    U: AwaitedActionSubscriber,
68
    T: AwaitedActionDb,
69
    I: InstantWrapper,
70
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
71
{
72
    inner: MatchingEngineActionStateResult<U, T, I, NowFn>,
73
}
74
75
impl<U, T, I, NowFn> ClientActionStateResult<U, T, I, NowFn>
76
where
77
    U: AwaitedActionSubscriber,
78
    T: AwaitedActionDb,
79
    I: InstantWrapper,
80
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
81
{
82
530
    const fn new(
83
530
        sub: U,
84
530
        simple_scheduler_state_manager: Weak<SimpleSchedulerStateManager<T, I, NowFn>>,
85
530
        no_event_action_timeout: Duration,
86
530
        now_fn: NowFn,
87
530
    ) -> Self {
88
530
        Self {
89
530
            inner: MatchingEngineActionStateResult::new(
90
530
                sub,
91
530
                simple_scheduler_state_manager,
92
530
                no_event_action_timeout,
93
530
                now_fn,
94
530
            ),
95
530
        }
96
530
    }
97
}
98
99
#[async_trait]
100
impl<U, T, I, NowFn> ActionStateResult for ClientActionStateResult<U, T, I, NowFn>
101
where
102
    U: AwaitedActionSubscriber,
103
    T: AwaitedActionDb,
104
    I: InstantWrapper,
105
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
106
{
107
6
    async fn as_state(&self) -> Result<(Arc<ActionState>, Option<OriginMetadata>), Error> {
108
3
        self.inner.as_state().await
109
6
    }
110
111
96
    async fn changed(&mut self) -> Result<(Arc<ActionState>, Option<OriginMetadata>), Error> {
112
48
        self.inner.changed().await
113
92
    }
114
115
0
    async fn as_action_info(&self) -> Result<(Arc<ActionInfo>, Option<OriginMetadata>), Error> {
116
0
        self.inner.as_action_info().await
117
0
    }
118
}
119
120
struct MatchingEngineActionStateResult<U, T, I, NowFn>
121
where
122
    U: AwaitedActionSubscriber,
123
    T: AwaitedActionDb,
124
    I: InstantWrapper,
125
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
126
{
127
    awaited_action_sub: U,
128
    simple_scheduler_state_manager: Weak<SimpleSchedulerStateManager<T, I, NowFn>>,
129
    no_event_action_timeout: Duration,
130
    now_fn: NowFn,
131
}
132
impl<U, T, I, NowFn> MatchingEngineActionStateResult<U, T, I, NowFn>
133
where
134
    U: AwaitedActionSubscriber,
135
    T: AwaitedActionDb,
136
    I: InstantWrapper,
137
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
138
{
139
575
    const fn new(
140
575
        awaited_action_sub: U,
141
575
        simple_scheduler_state_manager: Weak<SimpleSchedulerStateManager<T, I, NowFn>>,
142
575
        no_event_action_timeout: Duration,
143
575
        now_fn: NowFn,
144
575
    ) -> Self {
145
575
        Self {
146
575
            awaited_action_sub,
147
575
            simple_scheduler_state_manager,
148
575
            no_event_action_timeout,
149
575
            now_fn,
150
575
        }
151
575
    }
152
}
153
154
#[async_trait]
155
impl<U, T, I, NowFn> ActionStateResult for MatchingEngineActionStateResult<U, T, I, NowFn>
156
where
157
    U: AwaitedActionSubscriber,
158
    T: AwaitedActionDb,
159
    I: InstantWrapper,
160
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
161
{
162
62
    async fn as_state(&self) -> Result<(Arc<ActionState>, Option<OriginMetadata>), Error> {
163
31
        let awaited_action = self
164
31
            .awaited_action_sub
165
31
            .borrow()
166
31
            .await
167
31
            .err_tip(|| "In MatchingEngineActionStateResult::as_state")
?0
;
168
31
        Ok((
169
31
            awaited_action.state().clone(),
170
31
            awaited_action.maybe_origin_metadata().cloned(),
171
31
        ))
172
62
    }
173
174
96
    async fn changed(&mut self) -> Result<(Arc<ActionState>, Option<OriginMetadata>), Error> {
175
48
        let mut timeout_attempts = 0;
176
        loop {
177
58
            tokio::select! {
178
58
                
awaited_action_result44
= self.awaited_action_sub.changed() => {
179
44
                    return awaited_action_result
180
44
                        .err_tip(|| "In MatchingEngineActionStateResult::changed")
181
44
                        .map(|v| (v.state().clone(), v.maybe_origin_metadata().cloned()));
182
                }
183
58
                () = (self.now_fn)().sleep(self.no_event_action_timeout) => {
184
10
                    // Timeout happened, do additional checks below.
185
10
                }
186
            }
187
188
10
            let awaited_action = self
189
10
                .awaited_action_sub
190
10
                .borrow()
191
10
                .await
192
10
                .err_tip(|| "In MatchingEngineActionStateResult::changed")
?0
;
193
194
10
            if 
matches!1
(awaited_action.state().stage, ActionStage::Queued) {
195
                // Actions in queued state do not get periodically updated,
196
                // so we don't need to timeout them.
197
9
                continue;
198
1
            }
199
200
1
            let simple_scheduler_state_manager = self
201
1
                .simple_scheduler_state_manager
202
1
                .upgrade()
203
1
                .err_tip(|| format!(
"Failed to upgrade weak reference to SimpleSchedulerStateManager in MatchingEngineActionStateResult::changed at attempt: {timeout_attempts}"0
))
?0
;
204
205
1
            warn!(
206
                ?awaited_action,
207
1
                "OperationId {} / {} timed out after {} seconds issuing a retry",
208
1
                awaited_action.operation_id(),
209
1
                awaited_action.state().client_operation_id,
210
1
                self.no_event_action_timeout.as_secs_f32(),
211
            );
212
213
1
            simple_scheduler_state_manager
214
1
                .timeout_operation_id(awaited_action.operation_id())
215
1
                .await
216
1
                .err_tip(|| "In MatchingEngineActionStateResult::changed")
?0
;
217
218
1
            if timeout_attempts >= MAX_UPDATE_RETRIES {
  Branch (218:16): [True: 0, False: 0]
  Branch (218:16): [True: 0, False: 0]
  Branch (218:16): [Folded - Ignored]
  Branch (218:16): [True: 0, False: 1]
  Branch (218:16): [True: 0, False: 0]
219
0
                return Err(make_err!(
220
0
                    Code::Internal,
221
0
                    "Failed to update action after {} retries with no error set in MatchingEngineActionStateResult::changed - {} {:?}",
222
0
                    MAX_UPDATE_RETRIES,
223
0
                    awaited_action.operation_id(),
224
0
                    awaited_action.state().stage,
225
0
                ));
226
1
            }
227
1
            timeout_attempts += 1;
228
        }
229
92
    }
230
231
90
    async fn as_action_info(&self) -> Result<(Arc<ActionInfo>, Option<OriginMetadata>), Error> {
232
45
        let awaited_action = self
233
45
            .awaited_action_sub
234
45
            .borrow()
235
45
            .await
236
45
            .err_tip(|| "In MatchingEngineActionStateResult::as_action_info")
?0
;
237
45
        Ok((
238
45
            awaited_action.action_info().clone(),
239
45
            awaited_action.maybe_origin_metadata().cloned(),
240
45
        ))
241
90
    }
242
}
243
244
/// `SimpleSchedulerStateManager` is responsible for maintaining the state of the scheduler.
245
/// Scheduler state includes the actions that are queued, active, and recently completed.
246
/// It also includes the workers that are available to execute actions based on allocation
247
/// strategy.
248
#[derive(MetricsComponent)]
249
pub(crate) struct SimpleSchedulerStateManager<T, I, NowFn>
250
where
251
    T: AwaitedActionDb,
252
    I: InstantWrapper,
253
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
254
{
255
    /// Database for storing the state of all actions.
256
    #[metric(group = "action_db")]
257
    action_db: T,
258
259
    /// Maximum number of times a job can be retried.
260
    // TODO(aaronmondal) This should be a scheduler decorator instead
261
    // of always having it on every SimpleScheduler.
262
    #[metric(help = "Maximum number of times a job can be retried")]
263
    max_job_retries: usize,
264
265
    /// Duration after which an action is considered to be timed out if
266
    /// no event is received.
267
    #[metric(
268
        help = "Duration after which an action is considered to be timed out if no event is received"
269
    )]
270
    no_event_action_timeout: Duration,
271
272
    /// Mark operation as timed out if the worker has not updated in this duration.
273
    /// This is used to prevent operations from being stuck in the queue forever
274
    /// if it is not being processed by any worker.
275
    client_action_timeout: Duration,
276
277
    // A lock to ensure only one timeout operation is running at a time
278
    // on this service.
279
    timeout_operation_mux: Mutex<()>,
280
281
    /// Weak reference to self.
282
    // We use a weak reference to reduce the risk of a memory leak from
283
    // future changes. If this becomes some kind of performance issue,
284
    // we can consider using a strong reference.
285
    weak_self: Weak<Self>,
286
287
    /// Function to get the current time.
288
    now_fn: NowFn,
289
}
290
291
impl<T, I, NowFn> SimpleSchedulerStateManager<T, I, NowFn>
292
where
293
    T: AwaitedActionDb,
294
    I: InstantWrapper,
295
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
296
{
297
22
    pub(crate) fn new(
298
22
        max_job_retries: usize,
299
22
        no_event_action_timeout: Duration,
300
22
        client_action_timeout: Duration,
301
22
        action_db: T,
302
22
        now_fn: NowFn,
303
22
    ) -> Arc<Self> {
304
22
        Arc::new_cyclic(|weak_self| Self {
305
22
            action_db,
306
22
            max_job_retries,
307
22
            no_event_action_timeout,
308
22
            client_action_timeout,
309
22
            timeout_operation_mux: Mutex::new(()),
310
22
            weak_self: weak_self.clone(),
311
22
            now_fn,
312
22
        })
313
22
    }
314
315
548
    async fn apply_filter_predicate(
316
548
        &self,
317
548
        awaited_action: &AwaitedAction,
318
548
        filter: &OperationFilter,
319
548
    ) -> bool {
320
        // Note: The caller must filter `client_operation_id`.
321
322
548
        if awaited_action.last_client_keepalive_timestamp() + self.client_action_timeout
  Branch (322:12): [True: 0, False: 0]
  Branch (322:12): [True: 0, False: 0]
  Branch (322:12): [Folded - Ignored]
  Branch (322:12): [True: 0, False: 546]
  Branch (322:12): [True: 0, False: 2]
323
548
            < (self.now_fn)().now()
324
        {
325
0
            if !awaited_action.state().stage.is_finished() {
  Branch (325:16): [True: 0, False: 0]
  Branch (325:16): [True: 0, False: 0]
  Branch (325:16): [Folded - Ignored]
  Branch (325:16): [True: 0, False: 0]
  Branch (325:16): [True: 0, False: 0]
326
0
                let mut state = awaited_action.state().as_ref().clone();
327
0
                state.stage = ActionStage::Completed(ActionResult {
328
0
                    error: Some(make_err!(
329
0
                        Code::DeadlineExceeded,
330
0
                        "Operation timed out {} seconds of having no more clients listening",
331
0
                        self.client_action_timeout.as_secs_f32(),
332
0
                    )),
333
0
                    ..ActionResult::default()
334
0
                });
335
0
                let mut new_awaited_action = awaited_action.clone();
336
0
                new_awaited_action.worker_set_state(Arc::new(state), (self.now_fn)().now());
337
0
                if let Err(err) = self
  Branch (337:24): [True: 0, False: 0]
  Branch (337:24): [True: 0, False: 0]
  Branch (337:24): [Folded - Ignored]
  Branch (337:24): [True: 0, False: 0]
  Branch (337:24): [True: 0, False: 0]
338
0
                    .action_db
339
0
                    .update_awaited_action(new_awaited_action)
340
0
                    .await
341
                {
342
0
                    warn!(
343
0
                        "Failed to update action to timed out state after client keepalive timeout. This is ok if multiple schedulers tried to set the state at the same time: {err}",
344
                    );
345
0
                }
346
0
            }
347
0
            return false;
348
548
        }
349
350
548
        if let Some(
operation_id0
) = &filter.operation_id {
  Branch (350:16): [True: 0, False: 0]
  Branch (350:16): [True: 0, False: 0]
  Branch (350:16): [Folded - Ignored]
  Branch (350:16): [True: 0, False: 546]
  Branch (350:16): [True: 0, False: 2]
351
0
            if operation_id != awaited_action.operation_id() {
  Branch (351:16): [True: 0, False: 0]
  Branch (351:16): [True: 0, False: 0]
  Branch (351:16): [Folded - Ignored]
  Branch (351:16): [True: 0, False: 0]
  Branch (351:16): [True: 0, False: 0]
352
0
                return false;
353
0
            }
354
548
        }
355
356
548
        if filter.worker_id.is_some() && 
filter.worker_id.as_ref()0
!= awaited_action.worker_id() {
  Branch (356:12): [True: 0, False: 0]
  Branch (356:42): [True: 0, False: 0]
  Branch (356:12): [True: 0, False: 0]
  Branch (356:42): [True: 0, False: 0]
  Branch (356:12): [Folded - Ignored]
  Branch (356:42): [Folded - Ignored]
  Branch (356:12): [True: 0, False: 546]
  Branch (356:42): [True: 0, False: 0]
  Branch (356:12): [True: 0, False: 2]
  Branch (356:42): [True: 0, False: 0]
357
0
            return false;
358
548
        }
359
360
        {
361
548
            if let Some(
filter_unique_key0
) = &filter.unique_key {
  Branch (361:20): [True: 0, False: 0]
  Branch (361:20): [True: 0, False: 0]
  Branch (361:20): [Folded - Ignored]
  Branch (361:20): [True: 0, False: 546]
  Branch (361:20): [True: 0, False: 2]
362
0
                match &awaited_action.action_info().unique_qualifier {
363
0
                    ActionUniqueQualifier::Cacheable(unique_key) => {
364
0
                        if filter_unique_key != unique_key {
  Branch (364:28): [True: 0, False: 0]
  Branch (364:28): [True: 0, False: 0]
  Branch (364:28): [Folded - Ignored]
  Branch (364:28): [True: 0, False: 0]
  Branch (364:28): [True: 0, False: 0]
365
0
                            return false;
366
0
                        }
367
                    }
368
                    ActionUniqueQualifier::Uncacheable(_) => {
369
0
                        return false;
370
                    }
371
                }
372
548
            }
373
548
            if let Some(
action_digest0
) = filter.action_digest {
  Branch (373:20): [True: 0, False: 0]
  Branch (373:20): [True: 0, False: 0]
  Branch (373:20): [Folded - Ignored]
  Branch (373:20): [True: 0, False: 546]
  Branch (373:20): [True: 0, False: 2]
374
0
                if action_digest != awaited_action.action_info().digest() {
  Branch (374:20): [True: 0, False: 0]
  Branch (374:20): [True: 0, False: 0]
  Branch (374:20): [Folded - Ignored]
  Branch (374:20): [True: 0, False: 0]
  Branch (374:20): [True: 0, False: 0]
375
0
                    return false;
376
0
                }
377
548
            }
378
        }
379
380
        {
381
548
            let last_worker_update_timestamp = awaited_action.last_worker_updated_timestamp();
382
548
            if let Some(
worker_update_before0
) = filter.worker_update_before {
  Branch (382:20): [True: 0, False: 0]
  Branch (382:20): [True: 0, False: 0]
  Branch (382:20): [Folded - Ignored]
  Branch (382:20): [True: 0, False: 546]
  Branch (382:20): [True: 0, False: 2]
383
0
                if worker_update_before < last_worker_update_timestamp {
  Branch (383:20): [True: 0, False: 0]
  Branch (383:20): [True: 0, False: 0]
  Branch (383:20): [Folded - Ignored]
  Branch (383:20): [True: 0, False: 0]
  Branch (383:20): [True: 0, False: 0]
384
0
                    return false;
385
0
                }
386
548
            }
387
548
            if let Some(
completed_before0
) = filter.completed_before {
  Branch (387:20): [True: 0, False: 0]
  Branch (387:20): [True: 0, False: 0]
  Branch (387:20): [Folded - Ignored]
  Branch (387:20): [True: 0, False: 546]
  Branch (387:20): [True: 0, False: 2]
388
0
                if awaited_action.state().stage.is_finished()
  Branch (388:20): [True: 0, False: 0]
  Branch (388:20): [True: 0, False: 0]
  Branch (388:20): [Folded - Ignored]
  Branch (388:20): [True: 0, False: 0]
  Branch (388:20): [True: 0, False: 0]
389
0
                    && completed_before < last_worker_update_timestamp
  Branch (389:24): [True: 0, False: 0]
  Branch (389:24): [True: 0, False: 0]
  Branch (389:24): [Folded - Ignored]
  Branch (389:24): [True: 0, False: 0]
  Branch (389:24): [True: 0, False: 0]
390
                {
391
0
                    return false;
392
0
                }
393
548
            }
394
548
            if filter.stages != OperationStageFlags::Any {
  Branch (394:16): [True: 0, False: 0]
  Branch (394:16): [True: 0, False: 0]
  Branch (394:16): [Folded - Ignored]
  Branch (394:16): [True: 543, False: 3]
  Branch (394:16): [True: 2, False: 0]
395
545
                let stage_flag = match awaited_action.state().stage {
396
0
                    ActionStage::Unknown => OperationStageFlags::Any,
397
0
                    ActionStage::CacheCheck => OperationStageFlags::CacheCheck,
398
545
                    ActionStage::Queued => OperationStageFlags::Queued,
399
0
                    ActionStage::Executing => OperationStageFlags::Executing,
400
                    ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => {
401
0
                        OperationStageFlags::Completed
402
                    }
403
                };
404
545
                if !filter.stages.intersects(stage_flag) {
  Branch (404:20): [True: 0, False: 0]
  Branch (404:20): [True: 0, False: 0]
  Branch (404:20): [Folded - Ignored]
  Branch (404:20): [True: 0, False: 543]
  Branch (404:20): [True: 0, False: 2]
405
0
                    return false;
406
545
                }
407
3
            }
408
        }
409
410
548
        true
411
548
    }
412
413
    /// Let the scheduler know that an operation has timed out from
414
    /// the client side (ie: worker has not updated in a while).
415
1
    async fn timeout_operation_id(&self, operation_id: &OperationId) -> Result<(), Error> {
416
        // Ensure that only one timeout operation is running at a time.
417
        // Failing to do this could result in the same operation being
418
        // timed out multiple times at the same time.
419
        // Note: We could implement this on a per-operation_id basis, but it is quite
420
        // complex to manage the locks.
421
1
        let _lock = self.timeout_operation_mux.lock().await;
422
423
1
        let awaited_action_subscriber = self
424
1
            .action_db
425
1
            .get_by_operation_id(operation_id)
426
1
            .await
427
1
            .err_tip(|| "In SimpleSchedulerStateManager::timeout_operation_id")
?0
428
1
            .err_tip(|| 
{0
429
0
                format!("Operation id {operation_id} does not exist in SimpleSchedulerStateManager::timeout_operation_id")
430
0
            })?;
431
432
1
        let awaited_action = awaited_action_subscriber
433
1
            .borrow()
434
1
            .await
435
1
            .err_tip(|| "In SimpleSchedulerStateManager::timeout_operation_id")
?0
;
436
437
        // If the action is not executing, we should not timeout the action.
438
1
        if !
matches!0
(awaited_action.state().stage, ActionStage::Executing) {
  Branch (438:12): [True: 0, False: 0]
  Branch (438:12): [True: 0, False: 0]
  Branch (438:12): [Folded - Ignored]
  Branch (438:12): [True: 0, False: 1]
  Branch (438:12): [True: 0, False: 0]
439
0
            return Ok(());
440
1
        }
441
442
1
        let last_worker_updated = awaited_action
443
1
            .last_worker_updated_timestamp()
444
1
            .duration_since(SystemTime::UNIX_EPOCH)
445
1
            .map_err(|e| 
{0
446
0
                make_err!(
447
0
                    Code::Internal,
448
                    "Failed to convert last_worker_updated to duration since epoch {e:?}"
449
                )
450
0
            })?;
451
1
        let worker_should_update_before = last_worker_updated
452
1
            .checked_add(self.no_event_action_timeout)
453
1
            .err_tip(|| "Timestamp too big in SimpleSchedulerStateManager::timeout_operation_id")
?0
;
454
1
        if worker_should_update_before < (self.now_fn)().elapsed() {
  Branch (454:12): [True: 0, False: 0]
  Branch (454:12): [True: 0, False: 0]
  Branch (454:12): [Folded - Ignored]
  Branch (454:12): [True: 0, False: 1]
  Branch (454:12): [True: 0, False: 0]
455
            // The action was updated recently, we should not timeout the action.
456
            // This is to prevent timing out actions that have recently been updated
457
            // (like multiple clients timeout the same action at the same time).
458
0
            return Ok(());
459
1
        }
460
461
1
        self.assign_operation(
462
1
            operation_id,
463
1
            Err(make_err!(
464
1
                Code::DeadlineExceeded,
465
1
                "Operation timed out after {} seconds",
466
1
                self.no_event_action_timeout.as_secs_f32(),
467
1
            )),
468
1
        )
469
1
        .await
470
1
    }
471
472
41
    async fn inner_update_operation(
473
41
        &self,
474
41
        operation_id: &OperationId,
475
41
        maybe_worker_id: Option<&WorkerId>,
476
41
        update: UpdateOperationType,
477
41
    ) -> Result<(), Error> {
478
41
        let mut last_err = None;
479
42
        for _ in 0..MAX_UPDATE_RETRIES {
480
42
            let maybe_awaited_action_subscriber = self
481
42
                .action_db
482
42
                .get_by_operation_id(operation_id)
483
42
                .await
484
42
                .err_tip(|| "In SimpleSchedulerStateManager::update_operation")
?0
;
485
42
            let Some(
awaited_action_subscriber41
) = maybe_awaited_action_subscriber else {
  Branch (485:17): [True: 0, False: 0]
  Branch (485:17): [True: 0, False: 0]
  Branch (485:17): [Folded - Ignored]
  Branch (485:17): [True: 39, False: 0]
  Branch (485:17): [True: 2, False: 1]
486
                // No action found. It is ok if the action was not found. It
487
                // probably means that the action was dropped, but worker was
488
                // still processing it.
489
1
                return Ok(());
490
            };
491
492
41
            let mut awaited_action = awaited_action_subscriber
493
41
                .borrow()
494
41
                .await
495
41
                .err_tip(|| "In SimpleSchedulerStateManager::update_operation")
?0
;
496
497
            // Make sure the worker id matches the awaited action worker id.
498
            // This might happen if the worker sending the update is not the
499
            // worker that was assigned.
500
41
            if awaited_action.worker_id().is_some()
  Branch (500:16): [True: 0, False: 0]
  Branch (500:16): [True: 0, False: 0]
  Branch (500:16): [Folded - Ignored]
  Branch (500:16): [True: 13, False: 26]
  Branch (500:16): [True: 0, False: 2]
501
13
                && maybe_worker_id.is_some()
  Branch (501:20): [True: 0, False: 0]
  Branch (501:20): [True: 0, False: 0]
  Branch (501:20): [Folded - Ignored]
  Branch (501:20): [True: 12, False: 1]
  Branch (501:20): [True: 0, False: 0]
502
12
                && maybe_worker_id != awaited_action.worker_id()
  Branch (502:20): [True: 0, False: 0]
  Branch (502:20): [True: 0, False: 0]
  Branch (502:20): [Folded - Ignored]
  Branch (502:20): [True: 0, False: 12]
  Branch (502:20): [True: 0, False: 0]
503
            {
504
                // If another worker is already assigned to the action, another
505
                // worker probably picked up the action. We should not update the
506
                // action in this case and abort this operation.
507
0
                let err = make_err!(
508
0
                    Code::Aborted,
509
                    "Worker ids do not match - {:?} != {:?} for {:?}",
510
                    maybe_worker_id,
511
0
                    awaited_action.worker_id(),
512
                    awaited_action,
513
                );
514
0
                info!(
515
0
                    "Worker ids do not match - {:?} != {:?} for {:?}. This is probably due to another worker picking up the action.",
516
                    maybe_worker_id,
517
0
                    awaited_action.worker_id(),
518
                    awaited_action,
519
                );
520
0
                return Err(err);
521
41
            }
522
523
            // Make sure we don't update an action that is already completed.
524
41
            if awaited_action.state().stage.is_finished() {
  Branch (524:16): [True: 0, False: 0]
  Branch (524:16): [True: 0, False: 0]
  Branch (524:16): [Folded - Ignored]
  Branch (524:16): [True: 0, False: 39]
  Branch (524:16): [True: 0, False: 2]
525
0
                return Err(make_err!(
526
0
                    Code::Internal,
527
0
                    "Action {operation_id:?} is already completed with state {:?} - maybe_worker_id: {:?}",
528
0
                    awaited_action.state().stage,
529
0
                    maybe_worker_id,
530
0
                ));
531
41
            }
532
533
41
            let stage = match &update {
534
                UpdateOperationType::KeepAlive => {
535
0
                    awaited_action.worker_keep_alive((self.now_fn)().now());
536
0
                    return self
537
0
                        .action_db
538
0
                        .update_awaited_action(awaited_action)
539
0
                        .await
540
0
                        .err_tip(|| "Failed to send KeepAlive in SimpleSchedulerStateManager::update_operation");
541
                }
542
33
                UpdateOperationType::UpdateWithActionStage(stage) => stage.clone(),
543
8
                UpdateOperationType::UpdateWithError(err) => {
544
                    // Don't count a backpressure failure as an attempt for an action.
545
8
                    let due_to_backpressure = err.code == Code::ResourceExhausted;
546
8
                    if !due_to_backpressure {
  Branch (546:24): [True: 0, False: 0]
  Branch (546:24): [True: 0, False: 0]
  Branch (546:24): [Folded - Ignored]
  Branch (546:24): [True: 8, False: 0]
  Branch (546:24): [True: 0, False: 0]
547
8
                        awaited_action.attempts += 1;
548
8
                    
}0
549
550
8
                    if awaited_action.attempts > self.max_job_retries {
  Branch (550:24): [True: 0, False: 0]
  Branch (550:24): [True: 0, False: 0]
  Branch (550:24): [Folded - Ignored]
  Branch (550:24): [True: 1, False: 7]
  Branch (550:24): [True: 0, False: 0]
551
1
                        ActionStage::Completed(ActionResult {
552
1
                            execution_metadata: ExecutionMetadata {
553
1
                                worker: maybe_worker_id.map_or_else(String::default, ToString::to_string),
554
1
                                ..ExecutionMetadata::default()
555
1
                            },
556
1
                            error: Some(err.clone().merge(make_err!(
557
1
                                Code::Internal,
558
1
                                "Job cancelled because it attempted to execute too many times {} > {} times {}",
559
1
                                awaited_action.attempts,
560
1
                                self.max_job_retries,
561
1
                                format!("for operation_id: {operation_id}, maybe_worker_id: {maybe_worker_id:?}"),
562
1
                            ))),
563
1
                            ..ActionResult::default()
564
1
                        })
565
                    } else {
566
7
                        ActionStage::Queued
567
                    }
568
                }
569
            };
570
41
            let now = (self.now_fn)().now();
571
41
            if 
matches!34
(stage, ActionStage::Queued) {
572
7
                // If the action is queued, we need to unset the worker id regardless of
573
7
                // which worker sent the update.
574
7
                awaited_action.set_worker_id(None, now);
575
34
            } else {
576
34
                awaited_action.set_worker_id(maybe_worker_id.cloned(), now);
577
34
            }
578
41
            awaited_action.worker_set_state(
579
41
                Arc::new(ActionState {
580
41
                    stage,
581
41
                    // Client id is not known here, it is the responsibility of
582
41
                    // the the subscriber impl to replace this with the
583
41
                    // correct client id.
584
41
                    client_operation_id: operation_id.clone(),
585
41
                    action_digest: awaited_action.action_info().digest(),
586
41
                }),
587
41
                now,
588
            );
589
590
41
            let update_action_result = self
591
41
                .action_db
592
41
                .update_awaited_action(awaited_action)
593
41
                .await
594
41
                .err_tip(|| "In SimpleSchedulerStateManager::update_operation");
595
41
            if let Err(
err2
) = update_action_result {
  Branch (595:20): [True: 0, False: 0]
  Branch (595:20): [True: 0, False: 0]
  Branch (595:20): [Folded - Ignored]
  Branch (595:20): [True: 0, False: 39]
  Branch (595:20): [True: 2, False: 0]
596
                // We use Aborted to signal that the action was not
597
                // updated because the data being set was not the latest,
598
                // but can be retried.
599
2
                if err.code == Code::Aborted {
  Branch (599:20): [True: 0, False: 0]
  Branch (599:20): [True: 0, False: 0]
  Branch (599:20): [Folded - Ignored]
  Branch (599:20): [True: 0, False: 0]
  Branch (599:20): [True: 1, False: 1]
600
1
                    last_err = Some(err);
601
1
                    continue;
602
1
                }
603
1
                return Err(err);
604
39
            }
605
39
            return Ok(());
606
        }
607
0
        Err(last_err.unwrap_or_else(|| {
608
0
            make_err!(
609
0
                Code::Internal,
610
                "Failed to update action after {} retries with no error set",
611
                MAX_UPDATE_RETRIES,
612
            )
613
0
        }))
614
41
    }
615
616
27
    async fn inner_add_operation(
617
27
        &self,
618
27
        new_client_operation_id: OperationId,
619
27
        action_info: Arc<ActionInfo>,
620
27
    ) -> Result<T::Subscriber, Error> {
621
27
        self.action_db
622
27
            .add_action(new_client_operation_id, action_info)
623
27
            .await
624
27
            .err_tip(|| "In SimpleSchedulerStateManager::add_operation")
625
27
    }
626
627
597
    async fn inner_filter_operations<'a, F>(
628
597
        &'a self,
629
597
        filter: OperationFilter,
630
597
        to_action_state_result: F,
631
597
    ) -> Result<ActionStateResultStream<'a>, Error>
632
597
    where
633
597
        F: Fn(T::Subscriber) -> Box<dyn ActionStateResult> + Send + Sync + 'a,
634
597
    {
635
594
        const fn sorted_awaited_action_state_for_flags(
636
594
            stage: OperationStageFlags,
637
594
        ) -> Option<SortedAwaitedActionState> {
638
594
            match stage {
639
0
                OperationStageFlags::CacheCheck => Some(SortedAwaitedActionState::CacheCheck),
640
594
                OperationStageFlags::Queued => Some(SortedAwaitedActionState::Queued),
641
0
                OperationStageFlags::Executing => Some(SortedAwaitedActionState::Executing),
642
0
                OperationStageFlags::Completed => Some(SortedAwaitedActionState::Completed),
643
0
                _ => None,
644
            }
645
594
        }
646
647
597
        if let Some(
operation_id0
) = &filter.operation_id {
  Branch (647:16): [True: 0, False: 0]
  Branch (647:16): [True: 0, False: 0]
  Branch (647:16): [True: 0, False: 0]
  Branch (647:16): [True: 0, False: 0]
  Branch (647:16): [Folded - Ignored]
  Branch (647:16): [True: 0, False: 503]
  Branch (647:16): [True: 0, False: 90]
  Branch (647:16): [True: 0, False: 0]
  Branch (647:16): [True: 0, False: 4]
648
0
            let maybe_subscriber = self
649
0
                .action_db
650
0
                .get_by_operation_id(operation_id)
651
0
                .await
652
0
                .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")?;
653
0
            let Some(subscriber) = maybe_subscriber else {
  Branch (653:17): [True: 0, False: 0]
  Branch (653:17): [True: 0, False: 0]
  Branch (653:17): [True: 0, False: 0]
  Branch (653:17): [True: 0, False: 0]
  Branch (653:17): [Folded - Ignored]
  Branch (653:17): [True: 0, False: 0]
  Branch (653:17): [True: 0, False: 0]
  Branch (653:17): [True: 0, False: 0]
  Branch (653:17): [True: 0, False: 0]
654
0
                return Ok(Box::pin(stream::empty()));
655
            };
656
0
            let awaited_action = subscriber
657
0
                .borrow()
658
0
                .await
659
0
                .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")?;
660
0
            if !self.apply_filter_predicate(&awaited_action, &filter).await {
  Branch (660:16): [True: 0, False: 0]
  Branch (660:16): [True: 0, False: 0]
  Branch (660:16): [True: 0, False: 0]
  Branch (660:16): [True: 0, False: 0]
  Branch (660:16): [Folded - Ignored]
  Branch (660:16): [True: 0, False: 0]
  Branch (660:16): [True: 0, False: 0]
  Branch (660:16): [True: 0, False: 0]
  Branch (660:16): [True: 0, False: 0]
661
0
                return Ok(Box::pin(stream::empty()));
662
0
            }
663
0
            return Ok(Box::pin(stream::once(async move {
664
0
                to_action_state_result(subscriber)
665
0
            })));
666
597
        }
667
597
        if let Some(
client_operation_id3
) = &filter.client_operation_id {
  Branch (667:16): [True: 0, False: 0]
  Branch (667:16): [True: 0, False: 0]
  Branch (667:16): [True: 0, False: 0]
  Branch (667:16): [True: 0, False: 0]
  Branch (667:16): [Folded - Ignored]
  Branch (667:16): [True: 3, False: 500]
  Branch (667:16): [True: 0, False: 90]
  Branch (667:16): [True: 0, False: 0]
  Branch (667:16): [True: 0, False: 4]
668
3
            let maybe_subscriber = self
669
3
                .action_db
670
3
                .get_awaited_action_by_id(client_operation_id)
671
3
                .await
672
3
                .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")
?0
;
673
3
            let Some(subscriber) = maybe_subscriber else {
  Branch (673:17): [True: 0, False: 0]
  Branch (673:17): [True: 0, False: 0]
  Branch (673:17): [True: 0, False: 0]
  Branch (673:17): [True: 0, False: 0]
  Branch (673:17): [Folded - Ignored]
  Branch (673:17): [True: 3, False: 0]
  Branch (673:17): [True: 0, False: 0]
  Branch (673:17): [True: 0, False: 0]
  Branch (673:17): [True: 0, False: 0]
674
0
                return Ok(Box::pin(stream::empty()));
675
            };
676
3
            let awaited_action = subscriber
677
3
                .borrow()
678
3
                .await
679
3
                .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")
?0
;
680
3
            if !self.apply_filter_predicate(&awaited_action, &filter).await {
  Branch (680:16): [True: 0, False: 0]
  Branch (680:16): [True: 0, False: 0]
  Branch (680:16): [True: 0, False: 0]
  Branch (680:16): [True: 0, False: 0]
  Branch (680:16): [Folded - Ignored]
  Branch (680:16): [True: 0, False: 3]
  Branch (680:16): [True: 0, False: 0]
  Branch (680:16): [True: 0, False: 0]
  Branch (680:16): [True: 0, False: 0]
681
0
                return Ok(Box::pin(stream::empty()));
682
3
            }
683
3
            return Ok(Box::pin(stream::once(async move {
684
3
                to_action_state_result(subscriber)
685
3
            })));
686
594
        }
687
688
594
        let Some(sorted_awaited_action_state) =
  Branch (688:13): [True: 0, False: 0]
  Branch (688:13): [True: 0, False: 0]
  Branch (688:13): [True: 0, False: 0]
  Branch (688:13): [True: 0, False: 0]
  Branch (688:13): [Folded - Ignored]
  Branch (688:13): [True: 500, False: 0]
  Branch (688:13): [True: 90, False: 0]
  Branch (688:13): [True: 0, False: 0]
  Branch (688:13): [True: 4, False: 0]
689
594
            sorted_awaited_action_state_for_flags(filter.stages)
690
        else {
691
0
            let mut all_items: Vec<_> = self
692
0
                .action_db
693
0
                .get_all_awaited_actions()
694
0
                .await
695
0
                .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")?
696
0
                .and_then(|awaited_action_subscriber| async move {
697
0
                    let awaited_action = awaited_action_subscriber
698
0
                        .borrow()
699
0
                        .await
700
0
                        .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")?;
701
0
                    Ok((awaited_action_subscriber, awaited_action))
702
0
                })
703
0
                .try_filter_map(|(subscriber, awaited_action)| {
704
0
                    let filter = filter.clone();
705
0
                    async move {
706
0
                        if self.apply_filter_predicate(&awaited_action, &filter).await {
  Branch (706:28): [True: 0, False: 0]
  Branch (706:28): [True: 0, False: 0]
  Branch (706:28): [True: 0, False: 0]
  Branch (706:28): [True: 0, False: 0]
  Branch (706:28): [Folded - Ignored]
  Branch (706:28): [True: 0, False: 0]
  Branch (706:28): [True: 0, False: 0]
  Branch (706:28): [True: 0, False: 0]
  Branch (706:28): [True: 0, False: 0]
707
0
                            Ok(Some((subscriber, awaited_action.sort_key())))
708
                        } else {
709
0
                            Ok(None)
710
                        }
711
0
                    }
712
0
                })
713
0
                .try_collect()
714
0
                .await
715
0
                .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")?;
716
0
            match filter.order_by_priority_direction {
717
0
                Some(OrderDirection::Asc) => all_items.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)),
718
0
                Some(OrderDirection::Desc) => all_items.sort_unstable_by(|(_, a), (_, b)| b.cmp(a)),
719
0
                None => {}
720
            }
721
0
            return Ok(Box::pin(stream::iter(
722
0
                all_items
723
0
                    .into_iter()
724
0
                    .map(move |(subscriber, _)| to_action_state_result(subscriber)),
725
            )));
726
        };
727
728
594
        let desc = 
matches!500
(
729
94
            filter.order_by_priority_direction,
730
            Some(OrderDirection::Desc)
731
        );
732
594
        let stream = self
733
594
            .action_db
734
594
            .get_range_of_actions(
735
594
                sorted_awaited_action_state,
736
594
                Bound::Unbounded,
737
594
                Bound::Unbounded,
738
594
                desc,
739
594
            )
740
594
            .await
741
594
            .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")
?0
742
594
            .and_then(|awaited_action_subscriber| async move 
{545
743
545
                let awaited_action = awaited_action_subscriber
744
545
                    .borrow()
745
545
                    .await
746
545
                    .err_tip(|| "In SimpleSchedulerStateManager::filter_operations")
?0
;
747
545
                Ok((awaited_action_subscriber, awaited_action))
748
1.09k
            })
749
594
            .try_filter_map(move |(subscriber, awaited_action)| 
{545
750
545
                let filter = filter.clone();
751
545
                async move {
752
545
                    if self.apply_filter_predicate(&awaited_action, &filter).await {
  Branch (752:24): [True: 0, False: 0]
  Branch (752:24): [True: 0, False: 0]
  Branch (752:24): [True: 0, False: 0]
  Branch (752:24): [True: 0, False: 0]
  Branch (752:24): [Folded - Ignored]
  Branch (752:24): [True: 500, False: 0]
  Branch (752:24): [True: 43, False: 0]
  Branch (752:24): [True: 0, False: 0]
  Branch (752:24): [True: 2, False: 0]
753
545
                        Ok(Some(subscriber))
754
                    } else {
755
0
                        Ok(None)
756
                    }
757
545
                }
758
545
            })
759
594
            .map(move |result| -> Box<dyn ActionStateResult> 
{545
760
545
                result.map_or_else(
761
0
                    |e| -> Box<dyn ActionStateResult> { Box::new(ErrorActionStateResult(e)) },
762
545
                    |v| -> Box<dyn ActionStateResult> { to_action_state_result(v) },
763
                )
764
545
            });
765
594
        Ok(Box::pin(stream))
766
597
    }
767
}
768
769
#[async_trait]
770
impl<T, I, NowFn> ClientStateManager for SimpleSchedulerStateManager<T, I, NowFn>
771
where
772
    T: AwaitedActionDb,
773
    I: InstantWrapper,
774
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
775
{
776
    async fn add_action(
777
        &self,
778
        client_operation_id: OperationId,
779
        action_info: Arc<ActionInfo>,
780
54
    ) -> Result<Box<dyn ActionStateResult>, Error> {
781
27
        let sub = self
782
27
            .inner_add_operation(client_operation_id, action_info.clone())
783
27
            .await
?0
;
784
785
27
        Ok(Box::new(ClientActionStateResult::new(
786
27
            sub,
787
27
            self.weak_self.clone(),
788
27
            self.no_event_action_timeout,
789
27
            self.now_fn.clone(),
790
27
        )))
791
54
    }
792
793
    async fn filter_operations<'a>(
794
        &'a self,
795
        filter: OperationFilter,
796
1.00k
    ) -> Result<ActionStateResultStream<'a>, Error> {
797
503
        self.inner_filter_operations(filter, move |rx| {
798
503
            Box::new(ClientActionStateResult::new(
799
503
                rx,
800
503
                self.weak_self.clone(),
801
503
                self.no_event_action_timeout,
802
503
                self.now_fn.clone(),
803
503
            ))
804
503
        })
805
503
        .await
806
1.00k
    }
807
808
0
    fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> {
809
0
        None
810
0
    }
811
}
812
813
#[async_trait]
814
impl<T, I, NowFn> WorkerStateManager for SimpleSchedulerStateManager<T, I, NowFn>
815
where
816
    T: AwaitedActionDb,
817
    I: InstantWrapper,
818
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
819
{
820
    async fn update_operation(
821
        &self,
822
        operation_id: &OperationId,
823
        worker_id: &WorkerId,
824
        update: UpdateOperationType,
825
24
    ) -> Result<(), Error> {
826
12
        self.inner_update_operation(operation_id, Some(worker_id), update)
827
12
            .await
828
24
    }
829
}
830
831
#[async_trait]
832
impl<T, I, NowFn> MatchingEngineStateManager for SimpleSchedulerStateManager<T, I, NowFn>
833
where
834
    T: AwaitedActionDb,
835
    I: InstantWrapper,
836
    NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static,
837
{
838
    async fn filter_operations<'a>(
839
        &'a self,
840
        filter: OperationFilter,
841
188
    ) -> Result<ActionStateResultStream<'a>, Error> {
842
94
        self.inner_filter_operations(filter, |rx| 
{45
843
45
            Box::new(MatchingEngineActionStateResult::new(
844
45
                rx,
845
45
                self.weak_self.clone(),
846
45
                self.no_event_action_timeout,
847
45
                self.now_fn.clone(),
848
45
            ))
849
45
        })
850
94
        .await
851
188
    }
852
853
    async fn assign_operation(
854
        &self,
855
        operation_id: &OperationId,
856
        worker_id_or_reason_for_unassign: Result<&WorkerId, Error>,
857
58
    ) -> Result<(), Error> {
858
29
        let (maybe_worker_id, update) = match worker_id_or_reason_for_unassign {
859
28
            Ok(worker_id) => (
860
28
                Some(worker_id),
861
28
                UpdateOperationType::UpdateWithActionStage(ActionStage::Executing),
862
28
            ),
863
1
            Err(err) => (None, UpdateOperationType::UpdateWithError(err)),
864
        };
865
29
        self.inner_update_operation(operation_id, maybe_worker_id, update)
866
29
            .await
867
58
    }
868
}