/build/source/nativelink-config/src/schedulers.rs

Source (jump to first uncovered line)
// Copyright 2024 The NativeLink Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;

use serde::Deserialize;

use crate::serde_utils::{convert_duration_with_shellexpand, convert_numeric_with_shellexpand};
use crate::stores::{GrpcEndpoint, Retry, StoreRefName};

#[allow(non_camel_case_types)]
#[derive(Deserialize, Debug)]
pub enum SchedulerSpec {
    simple(SimpleSpec),
    grpc(GrpcSpec),
    cache_lookup(CacheLookupSpec),
    property_modifier(PropertyModifierSpec),
}

/// When the scheduler matches tasks to workers that are capable of running
/// the task, this value will be used to determine how the property is treated.
#[allow(non_camel_case_types)]
#[derive(Deserialize, Debug, Clone, Copy, Hash, Eq, PartialEq)]
pub enum PropertyType {
    /// Requires the platform property to be a u64 and when the scheduler looks
    /// for appropriate worker nodes that are capable of executing the task,
    /// the task will not run on a node that has less than this value.
    minimum,

    /// Requires the platform property to be a string and when the scheduler
    /// looks for appropriate worker nodes that are capable of executing the
    /// task, the task will not run on a node that does not have this property
    /// set to the value with exact string match.
    exact,

    /// Does not restrict on this value and instead will be passed to the worker
    /// as an informational piece.
    /// TODO(allada) In the future this will be used by the scheduler and worker
    /// to cause the scheduler to prefer certain workers over others, but not
    /// restrict them based on these values.
    priority,
}

/// When a worker is being searched for to run a job, this will be used
/// on how to choose which worker should run the job when multiple
/// workers are able to run the task.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Deserialize, Debug, Default)]
pub enum WorkerAllocationStrategy {
    /// Prefer workers that have been least recently used to run a job.
    #[default]
    least_recently_used,
    /// Prefer workers that have been most recently used to run a job.
    most_recently_used,
}

#[derive(Deserialize, Debug, Default)]
#[serde(deny_unknown_fields)]
pub struct SimpleSpec {
    /// A list of supported platform properties mapped to how these properties
    /// are used when the scheduler looks for worker nodes capable of running
    /// the task.
    ///
    /// For example, a value of:
    /// ```json
    /// { "cpu_count": "minimum", "cpu_arch": "exact" }
    /// ```
    /// With a job that contains:
    /// ```json
    /// { "cpu_count": "8", "cpu_arch": "arm" }
    /// ```
    /// Will result in the scheduler filtering out any workers that do not have
    /// `"cpu_arch" = "arm"` and filter out any workers that have less than 8 cpu
    /// cores available.
    ///
    /// The property names here must match the property keys provided by the
    /// worker nodes when they join the pool. In other words, the workers will
    /// publish their capabilities to the scheduler when they join the worker
    /// pool. If the worker fails to notify the scheduler of its (for example)
    /// `"cpu_arch"`, the scheduler will never send any jobs to it, if all jobs
    /// have the `"cpu_arch"` label. There is no special treatment of any platform
    /// property labels other and entirely driven by worker configs and this
    /// config.
    pub supported_platform_properties: Option<HashMap<String, PropertyType>>,

    /// The amount of time to retain completed actions in memory for in case
    /// a `WaitExecution` is called after the action has completed.
    /// Default: 60 (seconds)
    #[serde(default, deserialize_with = "convert_duration_with_shellexpand")]
    pub retain_completed_for_s: u32,

    /// Mark operations as completed with error if no client has updated them
    /// within this duration.
    /// Default: 60 (seconds)
    #[serde(default, deserialize_with = "convert_duration_with_shellexpand")]
    pub client_action_timeout_s: u64,

    /// Remove workers from pool once the worker has not responded in this
    /// amount of time in seconds.
    /// Default: 5 (seconds)
    #[serde(default, deserialize_with = "convert_duration_with_shellexpand")]
    pub worker_timeout_s: u64,

    /// If a job returns an internal error or times out this many times when
    /// attempting to run on a worker the scheduler will return the last error
    /// to the client. Jobs will be retried and this configuration is to help
    /// prevent one rogue job from infinitely retrying and taking up a lot of
    /// resources when the task itself is the one causing the server to go
    /// into a bad state.
    /// Default: 3
    #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")]
    pub max_job_retries: usize,

    /// The strategy used to assign workers jobs.
    #[serde(default)]
    pub allocation_strategy: WorkerAllocationStrategy,

    /// The storage backend to use for the scheduler.
    /// Default: memory
    pub experimental_backend: Option<ExperimentalSimpleSchedulerBackend>,
}

#[allow(non_camel_case_types)]
#[derive(Deserialize, Debug)]
pub enum ExperimentalSimpleSchedulerBackend {
    /// Use an in-memory store for the scheduler.
    memory,
    /// Use a redis store for the scheduler.
    redis(ExperimentalRedisSchedulerBackend),
}

#[derive(Deserialize, Debug, Default)]
#[serde(deny_unknown_fields)]
pub struct ExperimentalRedisSchedulerBackend {
    /// A reference to the redis store to use for the scheduler.
    /// Note: This MUST resolve to a `RedisSpec`.
    pub redis_store: StoreRefName,
}

/// A scheduler that simply forwards requests to an upstream scheduler.  This
/// is useful to use when doing some kind of local action cache or CAS away from
/// the main cluster of workers.  In general, it's more efficient to point the
/// build at the main scheduler directly though.
#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct GrpcSpec {
    /// The upstream scheduler to forward requests to.
    pub endpoint: GrpcEndpoint,

    /// Retry configuration to use when a network request fails.
    #[serde(default)]
    pub retry: Retry,

    /// Limit the number of simultaneous upstream requests to this many.  A
    /// value of zero is treated as unlimited.  If the limit is reached the
    /// request is queued.
    #[serde(default)]
    pub max_concurrent_requests: usize,

    /// The number of connections to make to each specified endpoint to balance
    /// the load over multiple TCP connections.  Default 1.
    #[serde(default)]
    pub connections_per_endpoint: usize,
}

#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct CacheLookupSpec {
    /// The reference to the action cache store used to return cached
    /// actions from rather than running them again.
    /// To prevent unintended issues, this store should probably be a `CompletenessCheckingSpec`.
    pub ac_store: StoreRefName,

    /// The nested scheduler to use if cache lookup fails.
    pub scheduler: Box<SchedulerSpec>,
}

#[derive(Deserialize, Debug, Clone)]
#[serde(deny_unknown_fields)]
pub struct PlatformPropertyAddition {
    /// The name of the property to add.
    pub name: String,
    /// The value to assign to the property.
    pub value: String,
}

#[allow(non_camel_case_types)]
#[derive(Deserialize, Debug, Clone)]
pub enum PropertyModification {
    /// Add a property to the action properties.
    add(PlatformPropertyAddition),
    /// Remove a named property from the action.
    remove(String),
}

#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct PropertyModifierSpec {
    /// A list of modifications to perform to incoming actions for the nested
    /// scheduler.  These are performed in order and blindly, so removing a
    /// property that doesn't exist is fine and overwriting an existing property
    /// is also fine.  If adding properties that do not exist in the nested
    /// scheduler is not supported and will likely cause unexpected behaviour.
    pub modifications: Vec<PropertyModification>,

    /// The nested scheduler to use after modifying the properties.
    pub scheduler: Box<SchedulerSpec>,
}

Coverage Report

Created: 2024-11-20 10:13

Line	Count	Source (jump to first uncovered line)
1		// Copyright 2024 The NativeLink Authors. All rights reserved.
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// http://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		use std::collections::HashMap;
16
17		use serde::Deserialize;
18
19		use crate::serde_utils::{convert_duration_with_shellexpand, convert_numeric_with_shellexpand};
20		use crate::stores::{GrpcEndpoint, Retry, StoreRefName};
21
22		#[allow(non_camel_case_types)]
23	0	#[derive(Deserialize, Debug)]
24		pub enum SchedulerSpec {
25		simple(SimpleSpec),
26		grpc(GrpcSpec),
27		cache_lookup(CacheLookupSpec),
28		property_modifier(PropertyModifierSpec),
29		}
30
31		/// When the scheduler matches tasks to workers that are capable of running
32		/// the task, this value will be used to determine how the property is treated.
33		#[allow(non_camel_case_types)]
34	0	#[derive(Deserialize, Debug, Clone, Copy, Hash, Eq, PartialEq)]
35		pub enum PropertyType {
36		/// Requires the platform property to be a u64 and when the scheduler looks
37		/// for appropriate worker nodes that are capable of executing the task,
38		/// the task will not run on a node that has less than this value.
39		minimum,
40
41		/// Requires the platform property to be a string and when the scheduler
42		/// looks for appropriate worker nodes that are capable of executing the
43		/// task, the task will not run on a node that does not have this property
44		/// set to the value with exact string match.
45		exact,
46
47		/// Does not restrict on this value and instead will be passed to the worker
48		/// as an informational piece.
49		/// TODO(allada) In the future this will be used by the scheduler and worker
50		/// to cause the scheduler to prefer certain workers over others, but not
51		/// restrict them based on these values.
52		priority,
53		}
54
55		/// When a worker is being searched for to run a job, this will be used
56		/// on how to choose which worker should run the job when multiple
57		/// workers are able to run the task.
58		#[allow(non_camel_case_types)]
59	0	#[derive(Copy, Clone, Deserialize, Debug, Default)]
60		pub enum WorkerAllocationStrategy {
61		/// Prefer workers that have been least recently used to run a job.
62		#[default]
63		least_recently_used,
64		/// Prefer workers that have been most recently used to run a job.
65		most_recently_used,
66		}
67
68	0	#[derive(Deserialize, Debug, Default)]
69		#[serde(deny_unknown_fields)]
70		pub struct SimpleSpec {
71		/// A list of supported platform properties mapped to how these properties
72		/// are used when the scheduler looks for worker nodes capable of running
73		/// the task.
74		///
75		/// For example, a value of:
76		/// ```json
77		/// { "cpu_count": "minimum", "cpu_arch": "exact" }
78		/// ```
79		/// With a job that contains:
80		/// ```json
81		/// { "cpu_count": "8", "cpu_arch": "arm" }
82		/// ```
83		/// Will result in the scheduler filtering out any workers that do not have
84		/// `"cpu_arch" = "arm"` and filter out any workers that have less than 8 cpu
85		/// cores available.
86		///
87		/// The property names here must match the property keys provided by the
88		/// worker nodes when they join the pool. In other words, the workers will
89		/// publish their capabilities to the scheduler when they join the worker
90		/// pool. If the worker fails to notify the scheduler of its (for example)
91		/// `"cpu_arch"`, the scheduler will never send any jobs to it, if all jobs
92		/// have the `"cpu_arch"` label. There is no special treatment of any platform
93		/// property labels other and entirely driven by worker configs and this
94		/// config.
95		pub supported_platform_properties: Option<HashMap<String, PropertyType>>,
96
97		/// The amount of time to retain completed actions in memory for in case
98		/// a `WaitExecution` is called after the action has completed.
99		/// Default: 60 (seconds)
100		#[serde(default, deserialize_with = "convert_duration_with_shellexpand")]
101		pub retain_completed_for_s: u32,
102
103		/// Mark operations as completed with error if no client has updated them
104		/// within this duration.
105		/// Default: 60 (seconds)
106		#[serde(default, deserialize_with = "convert_duration_with_shellexpand")]
107		pub client_action_timeout_s: u64,
108
109		/// Remove workers from pool once the worker has not responded in this
110		/// amount of time in seconds.
111		/// Default: 5 (seconds)
112		#[serde(default, deserialize_with = "convert_duration_with_shellexpand")]
113		pub worker_timeout_s: u64,
114
115		/// If a job returns an internal error or times out this many times when
116		/// attempting to run on a worker the scheduler will return the last error
117		/// to the client. Jobs will be retried and this configuration is to help
118		/// prevent one rogue job from infinitely retrying and taking up a lot of
119		/// resources when the task itself is the one causing the server to go
120		/// into a bad state.
121		/// Default: 3
122		#[serde(default, deserialize_with = "convert_numeric_with_shellexpand")]
123		pub max_job_retries: usize,
124
125		/// The strategy used to assign workers jobs.
126		#[serde(default)]
127		pub allocation_strategy: WorkerAllocationStrategy,
128
129		/// The storage backend to use for the scheduler.
130		/// Default: memory
131		pub experimental_backend: Option<ExperimentalSimpleSchedulerBackend>,
132		}
133
134		#[allow(non_camel_case_types)]
135	0	#[derive(Deserialize, Debug)]
136		pub enum ExperimentalSimpleSchedulerBackend {
137		/// Use an in-memory store for the scheduler.
138		memory,
139		/// Use a redis store for the scheduler.
140		redis(ExperimentalRedisSchedulerBackend),
141		}
142
143	0	#[derive(Deserialize, Debug, Default)]
144		#[serde(deny_unknown_fields)]
145		pub struct ExperimentalRedisSchedulerBackend {
146		/// A reference to the redis store to use for the scheduler.
147		/// Note: This MUST resolve to a `RedisSpec`.
148		pub redis_store: StoreRefName,
149		}
150
151		/// A scheduler that simply forwards requests to an upstream scheduler. This
152		/// is useful to use when doing some kind of local action cache or CAS away from
153		/// the main cluster of workers. In general, it's more efficient to point the
154		/// build at the main scheduler directly though.
155	0	#[derive(Deserialize, Debug)]
156		#[serde(deny_unknown_fields)]
157		pub struct GrpcSpec {
158		/// The upstream scheduler to forward requests to.
159		pub endpoint: GrpcEndpoint,
160
161		/// Retry configuration to use when a network request fails.
162		#[serde(default)]
163		pub retry: Retry,
164
165		/// Limit the number of simultaneous upstream requests to this many. A
166		/// value of zero is treated as unlimited. If the limit is reached the
167		/// request is queued.
168		#[serde(default)]
169		pub max_concurrent_requests: usize,
170
171		/// The number of connections to make to each specified endpoint to balance
172		/// the load over multiple TCP connections. Default 1.
173		#[serde(default)]
174		pub connections_per_endpoint: usize,
175		}
176
177	0	#[derive(Deserialize, Debug)]
178		#[serde(deny_unknown_fields)]
179		pub struct CacheLookupSpec {
180		/// The reference to the action cache store used to return cached
181		/// actions from rather than running them again.
182		/// To prevent unintended issues, this store should probably be a `CompletenessCheckingSpec`.
183		pub ac_store: StoreRefName,
184
185		/// The nested scheduler to use if cache lookup fails.
186		pub scheduler: Box<SchedulerSpec>,
187		}
188
189	0	#[derive(Deserialize, Debug, Clone)]
190		#[serde(deny_unknown_fields)]
191		pub struct PlatformPropertyAddition {
192		/// The name of the property to add.
193		pub name: String,
194		/// The value to assign to the property.
195		pub value: String,
196		}
197
198		#[allow(non_camel_case_types)]
199	0	#[derive(Deserialize, Debug, Clone)]
200		pub enum PropertyModification {
201		/// Add a property to the action properties.
202		add(PlatformPropertyAddition),
203		/// Remove a named property from the action.
204		remove(String),
205		}
206
207	0	#[derive(Deserialize, Debug)]
208		#[serde(deny_unknown_fields)]
209		pub struct PropertyModifierSpec {
210		/// A list of modifications to perform to incoming actions for the nested
211		/// scheduler. These are performed in order and blindly, so removing a
212		/// property that doesn't exist is fine and overwriting an existing property
213		/// is also fine. If adding properties that do not exist in the nested
214		/// scheduler is not supported and will likely cause unexpected behaviour.
215		pub modifications: Vec<PropertyModification>,
216
217		/// The nested scheduler to use after modifying the properties.
218		pub scheduler: Box<SchedulerSpec>,
219		}