/build/source/nativelink-service/src/worker_api_server.rs

Source
// Copyright 2024 The NativeLink Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use core::convert::Into;
use core::pin::Pin;
use core::time::Duration;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};

use futures::stream::unfold;
use futures::Stream;
use nativelink_config::cas_server::WorkerApiConfig;
use nativelink_error::{make_err, Code, Error, ResultExt};
use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_server::{
    WorkerApi, WorkerApiServer as Server,
};
use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{
    execute_result, ConnectWorkerRequest, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForWorker
};
use nativelink_scheduler::worker::Worker;
use nativelink_scheduler::worker_scheduler::WorkerScheduler;
use nativelink_util::background_spawn;
use nativelink_util::action_messages::{OperationId, WorkerId};
use nativelink_util::operation_state_manager::UpdateOperationType;
use nativelink_util::platform_properties::PlatformProperties;
use rand::RngCore;
use tokio::sync::mpsc;
use tokio::time::interval;
use tonic::{Request, Response, Status};
use tracing::{debug, error, warn, instrument, Level};
use uuid::Uuid;

/// Boxed, pinned stream of `UpdateForWorker` messages (or a gRPC `Status` error). This is the
/// stream type handed back to a worker by `connect_worker`.
pub type ConnectWorkerStream =
    Pin<Box<dyn Stream<Item = Result<UpdateForWorker, Status>> + Send + Sync + 'static>>;

/// Clock callback. `WorkerApiServer::new` installs one that returns the time elapsed since
/// `UNIX_EPOCH`; `WorkerApiServer::new_with_now_fn` accepts a caller-supplied one.
pub type NowFn = Box<dyn Fn() -> Result<Duration, Error> + Send + Sync>;

/// gRPC service state implementing the `WorkerApi` trait on top of a single `WorkerScheduler`.
pub struct WorkerApiServer {
    /// Scheduler that worker registrations, keep-alives, removals and results are forwarded to.
    scheduler: Arc<dyn WorkerScheduler>,
    /// Clock used to timestamp worker registration and keep-alive events.
    now_fn: NowFn,
    /// Node id passed to `Uuid::now_v6` when generating worker ids.
    node_id: [u8; 6],
}

impl core::fmt::Debug for WorkerApiServer {
    /// Prints only `node_id`; the remaining fields are elided and the output is marked as
    /// non-exhaustive.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut builder = f.debug_struct("WorkerApiServer");
        builder.field("node_id", &self.node_id);
        builder.finish_non_exhaustive()
    }
}

impl WorkerApiServer {
    /// Builds a `WorkerApiServer` that reads the current time from the system clock.
    ///
    /// A random 6-byte node id is generated for worker-id UUIDs. For every scheduler in
    /// `schedulers`, a background task is spawned that calls `remove_timedout_workers` once
    /// per second with the current unix timestamp in seconds.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `new_with_now_fn`, i.e. when `config.scheduler` is not a
    /// key of `schedulers`.
    pub fn new(
        config: &WorkerApiConfig,
        schedulers: &HashMap<String, Arc<dyn WorkerScheduler>>,
    ) -> Result<Self, Error> {
        let node_id = {
            let mut out = [0; 6];
            rand::rng().fill_bytes(&mut out);
            out
        };
        for scheduler in schedulers.values() {
            // Only a weak reference is moved into the task so that the task never keeps the
            // scheduler alive on its own: once every strong `Arc` is gone the upgrade below
            // fails and the task returns.
            let weak_scheduler = Arc::downgrade(scheduler);
            background_spawn!("worker_api_server", async move {
                let mut ticker = interval(Duration::from_secs(1));
                loop {
                    ticker.tick().await;
                    let scheduler = match weak_scheduler.upgrade() {
                        Some(scheduler) => scheduler,
                        // If we fail to upgrade, our service is probably destroyed, so return.
                        None => return,
                    };
                    // A clock that reports a time before the unix epoch must not panic this
                    // task: that would silently stop timed-out worker removal for good. Log
                    // the problem and try again on the next tick instead.
                    let timestamp = match SystemTime::now().duration_since(UNIX_EPOCH) {
                        Ok(timestamp) => timestamp,
                        Err(err) => {
                            error!(?err, "System time is now behind unix epoch");
                            continue;
                        }
                    };
                    if let Err(err) = scheduler
                        .remove_timedout_workers(timestamp.as_secs())
                        .await
                    {
                        error!(?err, "Failed to remove_timedout_workers",);
                    }
                }
            });
        }

        Self::new_with_now_fn(
            config,
            schedulers,
            Box::new(|| {
                SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .map_err(|_| make_err!(Code::Internal, "System time is now behind unix epoch"))
            }),
            node_id,
        )
    }

    /// Same as `new()`, but you can pass a custom `now_fn` that returns a `Duration` since
    /// `UNIX_EPOCH` representing the current time, as well as an explicit `node_id`. No
    /// background tasks are spawned here. Used mostly in unit tests.
    ///
    /// # Errors
    ///
    /// Returns an error if `schedulers` has no entry named `config.scheduler`.
    pub fn new_with_now_fn(
        config: &WorkerApiConfig,
        schedulers: &HashMap<String, Arc<dyn WorkerScheduler>>,
        now_fn: NowFn,
        node_id: [u8; 6],
    ) -> Result<Self, Error> {
        let scheduler = schedulers.get(&config.scheduler).err_tip(|| {
            format!(
                "Scheduler needs config for '{}' because it exists in worker_api",
                config.scheduler
            )
        })?;
        Ok(Self {
            scheduler: Arc::clone(scheduler),
            now_fn,
            node_id,
        })
    }

    /// Consumes this server and wraps it in the generated gRPC `Server` type.
    pub fn into_service(self) -> Server<Self> {
        Server::new(self)
    }

    /// Registers a newly connected worker with the scheduler and returns its update stream.
    ///
    /// The request's platform properties are converted with the scheduler's platform property
    /// manager, a worker id is built from `worker_id_prefix` followed by a freshly generated
    /// v6 UUID, and the resulting `Worker` is handed to the scheduler together with the
    /// sending half of an unbounded channel. The returned stream yields every
    /// `UpdateForWorker` received on that channel and ends once the channel is closed.
    ///
    /// # Errors
    ///
    /// Fails if a property cannot be converted, if `now_fn` fails, or if `add_worker` returns
    /// an error.
    async fn inner_connect_worker(
        &self,
        connect_worker_request: ConnectWorkerRequest,
    ) -> Result<Response<ConnectWorkerStream>, Error> {
        let (tx, rx) = mpsc::unbounded_channel();

        // First convert our proto platform properties into one our scheduler understands.
        let platform_properties = {
            let mut platform_properties = PlatformProperties::default();
            for property in connect_worker_request.properties {
                let platform_property_value = self
                    .scheduler
                    .get_platform_property_manager()
                    .make_prop_value(&property.name, &property.value)
                    .err_tip(|| "Bad Property during connect_worker()")?;
                // `property` is owned by this loop iteration, so the name can be moved into
                // the map instead of being cloned.
                platform_properties
                    .properties
                    .insert(property.name, platform_property_value);
            }
            platform_properties
        };

        // Now register the worker with the scheduler.
        let worker_id = {
            let worker_id = WorkerId(format!(
                "{}{}",
                connect_worker_request.worker_id_prefix,
                Uuid::now_v6(&self.node_id).hyphenated()
            ));
            let worker = Worker::new(
                worker_id.clone(),
                platform_properties,
                tx,
                (self.now_fn)()?.as_secs(),
            );
            self.scheduler
                .add_worker(worker)
                .await
                .err_tip(|| "Failed to add worker in inner_connect_worker()")?;
            worker_id
        };

        // Turn the receiving half of the channel into the response stream. The worker id is
        // threaded through the state only so it can be logged when the channel closes.
        Ok(Response::new(Box::pin(unfold(
            (rx, worker_id),
            move |(mut rx, worker_id)| async move {
                if let Some(update_for_worker) = rx.recv().await {
                    return Some((Ok(update_for_worker), (rx, worker_id)));
                }
                warn!(
                    ?worker_id,
                    "UpdateForWorker channel was closed, thus closing connection to worker node",
                );

                None
            },
        ))))
    }

    async fn inner_keep_alive(
        &self,
        keep_alive_request: KeepAliveRequest,
    ) -> Result<Response<()>, Error> {
        let worker_id: WorkerId = keep_alive_request.worker_id.into();
        self.scheduler
            .worker_keep_alive_received(&worker_id, (self.now_fn)()?0.as_secs())
            .await
            .err_tip(|| "Could not process keep_alive from worker in inner_keep_alive()")?0;
        Ok(Response::new(()))
    }

    /// Asks the scheduler to remove the worker named in the request and replies with an empty
    /// response once that succeeds.
    async fn inner_going_away(
        &self,
        going_away_request: GoingAwayRequest,
    ) -> Result<Response<()>, Error> {
        let departing_worker: WorkerId = going_away_request.worker_id.into();
        let removal = self.scheduler.remove_worker(&departing_worker).await;
        removal
            .err_tip(|| "While calling WorkerApiServer::inner_going_away")
            .map(|_| Response::new(()))
    }

    async fn inner_execution_response(
        &self,
        execute_result: ExecuteResult,
    ) -> Result<Response<()>, Error> {
        let worker_id: WorkerId = execute_result.worker_id.into();
        let operation_id = OperationId::from(execute_result.operation_id);

        match execute_result
            .result
            .err_tip(|| "Expected result to exist in ExecuteResult")?0
        {
            execute_result::Result::ExecuteResponse(finished_result) => {
                let action_stage = finished_result
                    .try_into()
                    .err_tip(|| "Failed to convert ExecuteResponse into an ActionStage")?0;
                self.scheduler
                    .update_action(
                        &worker_id,
                        &operation_id,
                        UpdateOperationType::UpdateWithActionStage(action_stage),
                    )
                    .await
                    .err_tip(|| format!("Failed to operation {operation_id:?}"0))?0;
            }
            execute_result::Result::InternalError(e) => {
                self.scheduler
                    .update_action(
                        &worker_id,
                        &operation_id,
                        UpdateOperationType::UpdateWithError(e.into()),
                    )
                    .await
                    .err_tip(|| format!("Failed to operation {operation_id:?}"))?;
            }
        }
        Ok(Response::new(()))
    }
}

#[tonic::async_trait]
impl WorkerApi for WorkerApiServer {
    type ConnectWorkerStream = ConnectWorkerStream;

    /// gRPC entry point for a worker connecting. Delegates to `inner_connect_worker` and
    /// converts its error into a `Status`.
    #[instrument(
        err,
        level = Level::ERROR,
        skip_all,
        fields(request = ?grpc_request.get_ref())
    )]
    async fn connect_worker(
        &self,
        grpc_request: Request<ConnectWorkerRequest>,
    ) -> Result<Response<Self::ConnectWorkerStream>, Status> {
        let connect_worker_request = grpc_request.into_inner();
        match self.inner_connect_worker(connect_worker_request).await {
            Ok(stream_response) => {
                // Log a fixed placeholder on success rather than the stream value.
                debug!(return = "Ok(<stream>)");
                Ok(stream_response)
            }
            Err(err) => Err(err.into()),
        }
    }

    /// gRPC entry point for worker keep-alives. Delegates to `inner_keep_alive` and converts
    /// its error into a `Status`.
    #[instrument(
        err,
        ret(level = Level::DEBUG),
        level = Level::DEBUG,
        skip_all,
        fields(request = ?grpc_request.get_ref())
    )]
    async fn keep_alive(
        &self,
        grpc_request: Request<KeepAliveRequest>,
    ) -> Result<Response<()>, Status> {
        let keep_alive_request = grpc_request.into_inner();
        let outcome = self.inner_keep_alive(keep_alive_request).await;
        outcome.map_err(Into::into)
    }

    /// gRPC entry point for a worker announcing it is going away. Delegates to
    /// `inner_going_away` and converts its error into a `Status`.
    #[instrument(
        err,
        ret(level = Level::INFO),
        level = Level::ERROR,
        skip_all,
        fields(request = ?grpc_request.get_ref())
    )]
    async fn going_away(
        &self,
        grpc_request: Request<GoingAwayRequest>,
    ) -> Result<Response<()>, Status> {
        let going_away_request = grpc_request.into_inner();
        let outcome = self.inner_going_away(going_away_request).await;
        outcome.map_err(Into::into)
    }

    /// gRPC entry point for a worker reporting an execution result. Delegates to
    /// `inner_execution_response` and converts its error into a `Status`.
    #[instrument(
        err,
        ret(level = Level::DEBUG),
        level = Level::ERROR,
        skip_all,
        fields(request = ?grpc_request.get_ref())
    )]
    async fn execution_response(
        &self,
        grpc_request: Request<ExecuteResult>,
    ) -> Result<Response<()>, Status> {
        let execute_result = grpc_request.into_inner();
        let outcome = self.inner_execution_response(execute_result).await;
        outcome.map_err(Into::into)
    }
}

Coverage Report

Created: 2025-07-10 19:59

Line	Count	Source
1		// Copyright 2024 The NativeLink Authors. All rights reserved.
2		//
3		// Licensed under the Apache License, Version 2.0 (the "License");
4		// you may not use this file except in compliance with the License.
5		// You may obtain a copy of the License at
6		//
7		// http://www.apache.org/licenses/LICENSE-2.0
8		//
9		// Unless required by applicable law or agreed to in writing, software
10		// distributed under the License is distributed on an "AS IS" BASIS,
11		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12		// See the License for the specific language governing permissions and
13		// limitations under the License.
14
15		use core::convert::Into;
16		use core::pin::Pin;
17		use core::time::Duration;
18		use std::collections::HashMap;
19		use std::sync::Arc;
20		use std::time::{SystemTime, UNIX_EPOCH};
21
22		use futures::stream::unfold;
23		use futures::Stream;
24		use nativelink_config::cas_server::WorkerApiConfig;
25		use nativelink_error::{make_err, Code, Error, ResultExt};
26		use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_server::{
27		WorkerApi, WorkerApiServer as Server,
28		};
29		use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{
30		execute_result, ConnectWorkerRequest, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForWorker
31		};
32		use nativelink_scheduler::worker::Worker;
33		use nativelink_scheduler::worker_scheduler::WorkerScheduler;
34		use nativelink_util::background_spawn;
35		use nativelink_util::action_messages::{OperationId, WorkerId};
36		use nativelink_util::operation_state_manager::UpdateOperationType;
37		use nativelink_util::platform_properties::PlatformProperties;
38		use rand::RngCore;
39		use tokio::sync::mpsc;
40		use tokio::time::interval;
41		use tonic::{Request, Response, Status};
42		use tracing::{debug, error, warn, instrument, Level};
43		use uuid::Uuid;
44
45		pub type ConnectWorkerStream =
46		Pin<Box<dyn Stream<Item = Result<UpdateForWorker, Status>> + Send + Sync + 'static>>;
47
48		pub type NowFn = Box<dyn Fn() -> Result<Duration, Error> + Send + Sync>;
49
50		pub struct WorkerApiServer {
51		scheduler: Arc<dyn WorkerScheduler>,
52		now_fn: NowFn,
53		node_id: [u8; 6],
54		}
55
56		impl core::fmt::Debug for WorkerApiServer {
57	0	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
58	0	f.debug_struct("WorkerApiServer")
59	0	.field("node_id", &self.node_id)
60	0	.finish_non_exhaustive()
61	0	}
62		}
63
64		impl WorkerApiServer {
65	0	pub fn new(
66	0	config: &WorkerApiConfig,
67	0	schedulers: &HashMap<String, Arc<dyn WorkerScheduler>>,
68	0	) -> Result<Self, Error> {
69	0	let node_id = {
70	0	let mut out = [0; 6];
71	0	rand::rng().fill_bytes(&mut out);
72	0	out
73		};
74	0	for scheduler in schedulers.values() {
75		// This will protect us from holding a reference to the scheduler forever in the
76		// event our ExecutionServer dies. Our scheduler is a weak ref, so the spawn will
77		// eventually see the Arc went away and return.
78	0	let weak_scheduler = Arc::downgrade(scheduler);
79	0	background_spawn!("worker_api_server", async move {
80	0	let mut ticker = interval(Duration::from_secs(1));
81		loop {
82	0	ticker.tick().await;
83	0	let timestamp = SystemTime::now()
84	0	.duration_since(UNIX_EPOCH)
85	0	.expect("Error: system time is now behind unix epoch");
86	0	match weak_scheduler.upgrade() {
87	0	Some(scheduler) => {
88	0	if let Err(err) = Branch (88:36): [True: 0, False: 0] Branch (88:36): [Folded - Ignored]
89	0	scheduler.remove_timedout_workers(timestamp.as_secs()).await
90		{
91	0	error!(?err, "Failed to remove_timedout_workers",);
92	0	}
93		}
94		// If we fail to upgrade, our service is probably destroyed, so return.
95	0	None => return,
96		}
97		}
98	0	});
99		}
100
101	0	Self::new_with_now_fn(
102	0	config,
103	0	schedulers,
104	0	Box::new(move \|\| {
105	0	SystemTime::now()
106	0	.duration_since(UNIX_EPOCH)
107	0	.map_err(\|_\| make_err!(Code::Internal, "System time is now behind unix epoch"))
108	0	}),
109	0	node_id,
110		)
111	0	}
112
113		/// Same as `new()`, but you can pass a custom `now_fn`, that returns a Duration since `UNIX_EPOCH`
114		/// representing the current time. Used mostly in unit tests.
115	6	pub fn new_with_now_fn(
116	6	config: &WorkerApiConfig,
117	6	schedulers: &HashMap<String, Arc<dyn WorkerScheduler>>,
118	6	now_fn: NowFn,
119	6	node_id: [u8; 6],
120	6	) -> Result<Self, Error> {
121	6	let scheduler = schedulers
122	6	.get(&config.scheduler)
123	6	.err_tip(\|\| {0
124	0	format!(
125	0	"Scheduler needs config for '{}' because it exists in worker_api",
126		config.scheduler
127		)
128	0	})?
129	6	.clone();
130	6	Ok(Self {
131	6	scheduler,
132	6	now_fn,
133	6	node_id,
134	6	})
135	6	}
136
137	0	pub fn into_service(self) -> Server<Self> {
138	0	Server::new(self)
139	0	}
140
141	6	async fn inner_connect_worker(
142	6	&self,
143	6	connect_worker_request: ConnectWorkerRequest,
144	6	) -> Result<Response<ConnectWorkerStream>, Error> {
145	6	let (tx, rx) = mpsc::unbounded_channel();
146
147		// First convert our proto platform properties into one our scheduler understands.
148	6	let platform_properties = {
149	6	let mut platform_properties = PlatformProperties::default();
150	6	for property0 in connect_worker_request.properties {
151	0	let platform_property_value = self
152	0	.scheduler
153	0	.get_platform_property_manager()
154	0	.make_prop_value(&property.name, &property.value)
155	0	.err_tip(\|\| "Bad Property during connect_worker()")?;
156	0	platform_properties
157	0	.properties
158	0	.insert(property.name.clone(), platform_property_value);
159		}
160	6	platform_properties
161		};
162
163		// Now register the worker with the scheduler.
164	6	let worker_id = {
165	6	let worker_id = WorkerId(format!(
166	6	"{}{}",
167	6	connect_worker_request.worker_id_prefix,
168	6	Uuid::now_v6(&self.node_id).hyphenated()
169	6	));
170	6	let worker = Worker::new(
171	6	worker_id.clone(),
172	6	platform_properties,
173	6	tx,
174	6	(self.now_fn)() ?0 .as_secs(),
175		);
176	6	self.scheduler
177	6	.add_worker(worker)
178	6	.await
179	6	.err_tip(\|\| "Failed to add worker in inner_connect_worker()") ?0 ;
180	6	worker_id
181		};
182
183	6	Ok(Response::new(Box::pin(unfold(
184	6	(rx, worker_id),
185	8	move \|state\| async move {
186	8	let (mut rx, worker_id) = state;
187	8	if let Some(update_for_worker) = rx.recv().await { Branch (187:24): [True: 8, False: 0] Branch (187:24): [Folded - Ignored]
188	8	return Some((Ok(update_for_worker), (rx, worker_id)));
189	0	}
190	0	warn!(
191		?worker_id,
192	0	"UpdateForWorker channel was closed, thus closing connection to worker node",
193		);
194
195	0	None
196	16	},
197		))))
198	6	}
199
200	1	async fn inner_keep_alive(
201	1	&self,
202	1	keep_alive_request: KeepAliveRequest,
203	1	) -> Result<Response<()>, Error> {
204	1	let worker_id: WorkerId = keep_alive_request.worker_id.into();
205	1	self.scheduler
206	1	.worker_keep_alive_received(&worker_id, (self.now_fn)() ?0 .as_secs())
207	1	.await
208	1	.err_tip(\|\| "Could not process keep_alive from worker in inner_keep_alive()") ?0 ;
209	1	Ok(Response::new(()))
210	1	}
211
212	0	async fn inner_going_away(
213	0	&self,
214	0	going_away_request: GoingAwayRequest,
215	0	) -> Result<Response<()>, Error> {
216	0	let worker_id: WorkerId = going_away_request.worker_id.into();
217	0	self.scheduler
218	0	.remove_worker(&worker_id)
219	0	.await
220	0	.err_tip(\|\| "While calling WorkerApiServer::inner_going_away")?;
221	0	Ok(Response::new(()))
222	0	}
223
224	1	async fn inner_execution_response(
225	1	&self,
226	1	execute_result: ExecuteResult,
227	1	) -> Result<Response<()>, Error> {
228	1	let worker_id: WorkerId = execute_result.worker_id.into();
229	1	let operation_id = OperationId::from(execute_result.operation_id);
230
231	1	match execute_result
232	1	.result
233	1	.err_tip(\|\| "Expected result to exist in ExecuteResult") ?0
234		{
235	1	execute_result::Result::ExecuteResponse(finished_result) => {
236	1	let action_stage = finished_result
237	1	.try_into()
238	1	.err_tip(\|\| "Failed to convert ExecuteResponse into an ActionStage") ?0 ;
239	1	self.scheduler
240	1	.update_action(
241	1	&worker_id,
242	1	&operation_id,
243	1	UpdateOperationType::UpdateWithActionStage(action_stage),
244	1	)
245	1	.await
246	1	.err_tip(\|\| format!( "Failed to operation {operation_id:?}"0 )) ?0 ;
247		}
248	0	execute_result::Result::InternalError(e) => {
249	0	self.scheduler
250	0	.update_action(
251	0	&worker_id,
252	0	&operation_id,
253	0	UpdateOperationType::UpdateWithError(e.into()),
254	0	)
255	0	.await
256	0	.err_tip(\|\| format!("Failed to operation {operation_id:?}"))?;
257		}
258		}
259	1	Ok(Response::new(()))
260	1	}
261		}
262
263		#[tonic::async_trait]
264		impl WorkerApi for WorkerApiServer {
265		type ConnectWorkerStream = ConnectWorkerStream;
266
267		#[instrument(
268		err,
269		level = Level::ERROR,
270		skip_all,
271		fields(request = ?grpc_request.get_ref())
272		)]
273		async fn connect_worker(
274		&self,
275		grpc_request: Request<ConnectWorkerRequest>,
276	12	) -> Result<Response<Self::ConnectWorkerStream>, Status> {
277	6	let resp = self
278	6	.inner_connect_worker(grpc_request.into_inner())
279	6	.await
280	6	.map_err(Into::into);
281	6	if resp.is_ok() { Branch (281:12): [True: 6, False: 0] Branch (281:12): [Folded - Ignored]
282	6	debug!(return = "Ok(<stream>)");
283	0	}
284	6	resp
285	12	}
286
287		#[instrument(
288		err,
289		ret(level = Level::DEBUG),
290		level = Level::DEBUG,
291		skip_all,
292		fields(request = ?grpc_request.get_ref())
293		)]
294		async fn keep_alive(
295		&self,
296		grpc_request: Request<KeepAliveRequest>,
297	2	) -> Result<Response<()>, Status> {
298	1	self.inner_keep_alive(grpc_request.into_inner())
299	1	.await
300	1	.map_err(Into::into)
301	2	}
302
303		#[instrument(
304		err,
305		ret(level = Level::INFO),
306		level = Level::ERROR,
307		skip_all,
308		fields(request = ?grpc_request.get_ref())
309		)]
310		async fn going_away(
311		&self,
312		grpc_request: Request<GoingAwayRequest>,
313	0	) -> Result<Response<()>, Status> {
314	0	self.inner_going_away(grpc_request.into_inner())
315	0	.await
316	0	.map_err(Into::into)
317	0	}
318
319		#[instrument(
320		err,
321		ret(level = Level::DEBUG),
322		level = Level::ERROR,
323		skip_all,
324		fields(request = ?grpc_request.get_ref())
325		)]
326		async fn execution_response(
327		&self,
328		grpc_request: Request<ExecuteResult>,
329	2	) -> Result<Response<()>, Status> {
330	1	self.inner_execution_response(grpc_request.into_inner())
331	1	.await
332	1	.map_err(Into::into)
333	2	}
334		}