/build/source/src/bin/nativelink.rs
Line | Count | Source |
1 | | // Copyright 2024 The NativeLink Authors. All rights reserved. |
2 | | // |
3 | | // Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // See LICENSE file for details |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | use core::net::SocketAddr; |
16 | | use core::time::Duration; |
17 | | use std::collections::{HashMap, HashSet}; |
18 | | use std::sync::Arc; |
19 | | |
20 | | use async_lock::Mutex as AsyncMutex; |
21 | | use axum::Router; |
22 | | use axum::http::Uri; |
23 | | use clap::Parser; |
24 | | use futures::FutureExt; |
25 | | use futures::future::{BoxFuture, Either, OptionFuture, TryFutureExt, try_join_all}; |
26 | | use hyper::StatusCode; |
27 | | use hyper_util::rt::tokio::TokioIo; |
28 | | use hyper_util::server::conn::auto; |
29 | | use hyper_util::service::TowerToHyperService; |
30 | | use mimalloc::MiMalloc; |
31 | | use nativelink_config::cas_server::{ |
32 | | CasConfig, GlobalConfig, HttpCompressionAlgorithm, ListenerConfig, SchedulerConfig, |
33 | | ServerConfig, StoreConfig, WorkerConfig, |
34 | | }; |
35 | | use nativelink_config::stores::ConfigDigestHashFunction; |
36 | | use nativelink_error::{Code, Error, ResultExt, make_err, make_input_err}; |
37 | | use nativelink_scheduler::default_scheduler_factory::scheduler_factory; |
38 | | use nativelink_service::ac_server::AcServer; |
39 | | use nativelink_service::bep_server::BepServer; |
40 | | use nativelink_service::bytestream_server::ByteStreamServer; |
41 | | use nativelink_service::capabilities_server::CapabilitiesServer; |
42 | | use nativelink_service::cas_server::CasServer; |
43 | | use nativelink_service::execution_server::ExecutionServer; |
44 | | use nativelink_service::fetch_server::FetchServer; |
45 | | use nativelink_service::health_server::HealthServer; |
46 | | use nativelink_service::push_server::PushServer; |
47 | | use nativelink_service::worker_api_server::WorkerApiServer; |
48 | | use nativelink_store::default_store_factory::store_factory; |
49 | | use nativelink_store::store_manager::StoreManager; |
50 | | use nativelink_util::common::fs::set_open_file_limit; |
51 | | use nativelink_util::digest_hasher::{DigestHasherFunc, set_default_digest_hasher_func}; |
52 | | use nativelink_util::health_utils::HealthRegistryBuilder; |
53 | | use nativelink_util::origin_event_publisher::OriginEventPublisher; |
54 | | #[cfg(target_family = "unix")] |
55 | | use nativelink_util::shutdown_guard::Priority; |
56 | | use nativelink_util::shutdown_guard::ShutdownGuard; |
57 | | use nativelink_util::store_trait::{ |
58 | | DEFAULT_DIGEST_SIZE_HEALTH_CHECK_CFG, set_default_digest_size_health_check, |
59 | | }; |
60 | | use nativelink_util::task::TaskExecutor; |
61 | | use nativelink_util::telemetry::init_tracing; |
62 | | use nativelink_util::{background_spawn, fs, spawn}; |
63 | | use nativelink_worker::local_worker::new_local_worker; |
64 | | use rustls_pki_types::pem::PemObject; |
65 | | use rustls_pki_types::{CertificateRevocationListDer, PrivateKeyDer}; |
66 | | use tokio::net::TcpListener; |
67 | | use tokio::select; |
68 | | #[cfg(target_family = "unix")] |
69 | | use tokio::signal::unix::{SignalKind, signal}; |
70 | | use tokio::sync::oneshot::Sender; |
71 | | use tokio::sync::{broadcast, mpsc, oneshot}; |
72 | | use tokio_rustls::TlsAcceptor; |
73 | | use tokio_rustls::rustls::pki_types::CertificateDer; |
74 | | use tokio_rustls::rustls::server::WebPkiClientVerifier; |
75 | | use tokio_rustls::rustls::{RootCertStore, ServerConfig as TlsServerConfig}; |
76 | | use tonic::codec::CompressionEncoding; |
77 | | use tonic::service::Routes; |
78 | | use tracing::{error, error_span, info, trace_span, warn}; |
79 | | |
80 | | #[global_allocator] |
81 | | static GLOBAL: MiMalloc = MiMalloc; |
82 | | |
83 | | /// Note: This must be kept in sync with the documentation in `AdminConfig::path`. |
84 | | const DEFAULT_ADMIN_API_PATH: &str = "/admin"; |
85 | | |
86 | | // Note: This must be kept in sync with the documentation in `HealthConfig::path`. |
87 | | const DEFAULT_HEALTH_STATUS_CHECK_PATH: &str = "/status"; |
88 | | |
89 | | // Note: This must be kept in sync with the documentation in |
90 | | // `OriginEventsConfig::max_event_queue_size`. |
91 | | const DEFAULT_MAX_QUEUE_EVENTS: usize = 0x0001_0000; |
92 | | |
93 | | /// Broadcast Channel Capacity |
94 | | /// Note: The actual capacity may be greater than the provided capacity. |
95 | | const BROADCAST_CAPACITY: usize = 1; |
96 | | |
97 | | /// Backend for bazel remote execution / cache API. |
98 | | #[derive(Parser, Debug)] |
99 | | #[clap( |
100 | | author = "Trace Machina, Inc. <nativelink@tracemachina.com>", |
101 | | version, |
102 | | about, |
103 | | long_about = None |
104 | | )] |
105 | | struct Args { |
106 | | /// Config file to use. |
107 | | #[clap(value_parser)] |
108 | | config_file: String, |
109 | | } |
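
Together with `get_config()` further down, this is the entire CLI surface: one positional argument naming a JSON5 configuration file. A typical invocation is simply `nativelink ./nativelink-config.json5` (the file name here is illustrative; any path to a valid config works).
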
110 | | |
111 | | trait RoutesExt { |
112 | | fn add_optional_service<S>(self, svc: Option<S>) -> Self |
113 | | where |
114 | | S: tower::Service< |
115 | | axum::http::Request<tonic::body::Body>, |
116 | | Error = core::convert::Infallible, |
117 | | > + tonic::server::NamedService |
118 | | + Clone |
119 | | + Send |
120 | | + Sync |
121 | | + 'static, |
122 | | S::Response: axum::response::IntoResponse, |
123 | | S::Future: Send + 'static; |
124 | | } |
125 | | |
126 | | impl RoutesExt for Routes { |
127 | 0 | fn add_optional_service<S>(mut self, svc: Option<S>) -> Self |
128 | 0 | where |
129 | 0 | S: tower::Service< |
130 | 0 | axum::http::Request<tonic::body::Body>, |
131 | 0 | Error = core::convert::Infallible, |
132 | 0 | > + tonic::server::NamedService |
133 | 0 | + Clone |
134 | 0 | + Send |
135 | 0 | + Sync |
136 | 0 | + 'static, |
137 | 0 | S::Response: axum::response::IntoResponse, |
138 | 0 | S::Future: Send + 'static, |
139 | | { |
140 | 0 | if let Some(svc) = svc { |
| | Branch (140:16): [Folded - Ignored] |
141 | 0 | self = self.add_service(svc); |
142 | 0 | } |
143 | 0 | self |
144 | 0 | } |
145 | | } |
146 | | |
147 | | /// If this value changes, update the documentation in the config definition. |
148 | | const DEFAULT_MAX_DECODING_MESSAGE_SIZE: usize = 4 * 1024 * 1024; |
149 | | |
150 | | macro_rules! service_setup { |
151 | | ($v: tt, $http_config: tt) => {{ |
152 | | let mut service = $v.into_service(); |
153 | | let max_decoding_message_size = if $http_config.max_decoding_message_size == 0 { |
154 | | DEFAULT_MAX_DECODING_MESSAGE_SIZE |
155 | | } else { |
156 | | $http_config.max_decoding_message_size |
157 | | }; |
158 | | service = service.max_decoding_message_size(max_decoding_message_size); |
159 | | let send_algo = &$http_config.compression.send_compression_algorithm; |
160 | | if let Some(encoding) = into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) { |
161 | | service = service.send_compressed(encoding); |
162 | | } |
163 | | for encoding in $http_config |
164 | | .compression |
165 | | .accepted_compression_algorithms |
166 | | .iter() |
167 | | // Filter None values. |
168 | 0 | .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) |
169 | | { |
170 | | service = service.accept_compressed(encoding); |
171 | | } |
172 | | service |
173 | | }}; |
174 | | } |
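
For readers who do not want to expand the `macro_rules!` by eye, the per-service wiring it produces is roughly the following sketch (where `server` stands for whichever service value is passed as `$v`; the exact token-for-token expansion differs slightly):

    let mut service = server.into_service();
    // Fall back to the 4 MiB default when the configured limit is 0.
    let max_decoding_message_size = if http_config.max_decoding_message_size == 0 {
        DEFAULT_MAX_DECODING_MESSAGE_SIZE
    } else {
        http_config.max_decoding_message_size
    };
    service = service.max_decoding_message_size(max_decoding_message_size);
    // Outbound compression is applied only when the config asks for it.
    if let Some(encoding) = into_encoding(
        http_config
            .compression
            .send_compression_algorithm
            .unwrap_or(HttpCompressionAlgorithm::None),
    ) {
        service = service.send_compressed(encoding);
    }
    // Every accepted algorithm that maps to a tonic encoding is registered.
    for encoding in http_config
        .compression
        .accepted_compression_algorithms
        .iter()
        .filter_map(|algo| into_encoding(*algo))
    {
        service = service.accept_compressed(encoding);
    }
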
175 | | |
176 | 0 | async fn inner_main( |
177 | 0 | cfg: CasConfig, |
178 | 0 | shutdown_tx: broadcast::Sender<ShutdownGuard>, |
179 | 0 | scheduler_shutdown_tx: Sender<()>, |
180 | 0 | ) -> Result<(), Error> { |
181 | 0 | const fn into_encoding(from: HttpCompressionAlgorithm) -> Option<CompressionEncoding> { |
182 | 0 | match from { |
183 | 0 | HttpCompressionAlgorithm::Gzip => Some(CompressionEncoding::Gzip), |
184 | 0 | HttpCompressionAlgorithm::None => None, |
185 | | } |
186 | 0 | } |
187 | | |
188 | 0 | let health_registry_builder = |
189 | 0 | Arc::new(AsyncMutex::new(HealthRegistryBuilder::new("nativelink"))); |
190 | | |
191 | 0 | let store_manager = Arc::new(StoreManager::new()); |
192 | | { |
193 | 0 | let mut health_registry_lock = health_registry_builder.lock().await; |
194 | | |
195 | 0 | for StoreConfig { name, spec } in cfg.stores { |
196 | 0 | let health_component_name = format!("stores/{name}"); |
197 | 0 | let mut health_register_store = |
198 | 0 | health_registry_lock.sub_builder(&health_component_name); |
199 | 0 | let store = store_factory(&spec, &store_manager, Some(&mut health_register_store)) |
200 | 0 | .await |
201 | 0 | .err_tip(|| format!("Failed to create store '{name}'"))?; |
202 | 0 | store_manager.add_store(&name, store); |
203 | | } |
204 | | } |
205 | | |
206 | 0 | let mut root_futures: Vec<BoxFuture<Result<(), Error>>> = Vec::new(); |
207 | | |
208 | 0 | let maybe_origin_event_tx = cfg |
209 | 0 | .experimental_origin_events |
210 | 0 | .as_ref() |
211 | 0 | .map(|origin_events_cfg| { |
212 | 0 | let mut max_queued_events = origin_events_cfg.max_event_queue_size; |
213 | 0 | if max_queued_events == 0 { |
| | Branch (213:16): [Folded - Ignored] |
214 | 0 | max_queued_events = DEFAULT_MAX_QUEUE_EVENTS; |
215 | 0 | } |
216 | 0 | let (tx, rx) = mpsc::channel(max_queued_events); |
217 | 0 | let store_name = origin_events_cfg.publisher.store.as_str(); |
218 | 0 | let store = store_manager.get_store(store_name).err_tip(|| { |
219 | 0 | format!("Could not get store {store_name} for origin event publisher") |
220 | 0 | })?; |
221 | | |
222 | 0 | root_futures.push(Box::pin( |
223 | 0 | OriginEventPublisher::new(store, rx, shutdown_tx.clone()) |
224 | 0 | .run() |
225 | 0 | .map(Ok), |
226 | 0 | )); |
227 | | |
228 | 0 | Ok::<_, Error>(tx) |
229 | 0 | }) |
230 | 0 | .transpose()?; |
231 | | |
232 | 0 | let mut action_schedulers = HashMap::new(); |
233 | 0 | let mut worker_schedulers = HashMap::new(); |
234 | 0 | for SchedulerConfig { name, spec } in cfg.schedulers.iter().flatten() { |
235 | 0 | let (maybe_action_scheduler, maybe_worker_scheduler) = |
236 | 0 | scheduler_factory(spec, &store_manager, maybe_origin_event_tx.as_ref()) |
237 | 0 | .err_tip(|| format!("Failed to create scheduler '{name}'"))?; |
238 | 0 | if let Some(action_scheduler) = maybe_action_scheduler { |
| | Branch (238:16): [Folded - Ignored] |
239 | 0 | action_schedulers.insert(name.clone(), action_scheduler.clone()); |
240 | 0 | } |
241 | 0 | if let Some(worker_scheduler) = maybe_worker_scheduler { |
| | Branch (241:16): [Folded - Ignored] |
242 | 0 | worker_schedulers.insert(name.clone(), worker_scheduler.clone()); |
243 | 0 | } |
244 | | } |
245 | | |
246 | 0 | let server_cfgs: Vec<ServerConfig> = cfg.servers.into_iter().collect(); |
247 | | |
248 | 0 | for server_cfg in server_cfgs { |
249 | 0 | let services = server_cfg |
250 | 0 | .services |
251 | 0 | .err_tip(|| "'services' must be configured")?; |
252 | | |
253 | | // Currently we only support http as our socket type. |
254 | 0 | let ListenerConfig::Http(http_config) = server_cfg.listener; |
255 | | |
256 | 0 | let tonic_services = Routes::builder() |
257 | 0 | .routes() |
258 | 0 | .add_optional_service( |
259 | 0 | services |
260 | 0 | .ac |
261 | 0 | .map_or(Ok(None), |cfg| { |
262 | 0 | AcServer::new(&cfg, &store_manager) |
263 | 0 | .map(|v| Some(service_setup!(v, http_config))) |
264 | 0 | }) |
265 | 0 | .err_tip(|| "Could not create AC service")?, |
266 | | ) |
267 | 0 | .add_optional_service( |
268 | 0 | services |
269 | 0 | .cas |
270 | 0 | .map_or(Ok(None), |cfg| { |
271 | 0 | CasServer::new(&cfg, &store_manager) |
272 | 0 | .map(|v| Some(service_setup!(v, http_config))) |
273 | 0 | }) |
274 | 0 | .err_tip(|| "Could not create CAS service")?, |
275 | | ) |
276 | 0 | .add_optional_service( |
277 | 0 | services |
278 | 0 | .execution |
279 | 0 | .map_or(Ok(None), |cfg| { |
280 | 0 | ExecutionServer::new(&cfg, &action_schedulers, &store_manager) |
281 | 0 | .map(|v| Some(service_setup!(v, http_config))) |
282 | 0 | }) |
283 | 0 | .err_tip(|| "Could not create Execution service")?, |
284 | | ) |
285 | 0 | .add_optional_service( |
286 | 0 | services |
287 | 0 | .fetch |
288 | 0 | .map_or(Ok(None), |cfg| { |
289 | 0 | FetchServer::new(&cfg, &store_manager) |
290 | 0 | .map(|v| Some(service_setup!(v, http_config))) |
291 | 0 | }) |
292 | 0 | .err_tip(|| "Could not create Fetch service")?, |
293 | | ) |
294 | 0 | .add_optional_service( |
295 | 0 | services |
296 | 0 | .push |
297 | 0 | .map_or(Ok(None), |cfg| { |
298 | 0 | PushServer::new(&cfg, &store_manager) |
299 | 0 | .map(|v| Some(service_setup!(v, http_config))) |
300 | 0 | }) |
301 | 0 | .err_tip(|| "Could not create Push service")?, |
302 | | ) |
303 | 0 | .add_optional_service( |
304 | 0 | services |
305 | 0 | .bytestream |
306 | 0 | .map_or(Ok(None), |cfg| { |
307 | 0 | ByteStreamServer::new(&cfg, &store_manager) |
308 | 0 | .map(|v| Some(service_setup!(v, http_config))) |
309 | 0 | }) |
310 | 0 | .err_tip(|| "Could not create ByteStream service")?, |
311 | | ) |
312 | 0 | .add_optional_service( |
313 | 0 | OptionFuture::from( |
314 | 0 | services |
315 | 0 | .capabilities |
316 | 0 | .as_ref() |
317 | 0 | .map(|cfg| CapabilitiesServer::new(cfg, &action_schedulers)), |
318 | | ) |
319 | 0 | .await |
320 | 0 | .map_or(Ok::<Option<CapabilitiesServer>, Error>(None), |server| { |
321 | 0 | Ok(Some(server?)) |
322 | 0 | }) |
323 | 0 | .err_tip(|| "Could not create Capabilities service")? |
324 | 0 | .map(|v| service_setup!(v, http_config)), |
325 | | ) |
326 | 0 | .add_optional_service( |
327 | 0 | services |
328 | 0 | .worker_api |
329 | 0 | .map_or(Ok(None), |cfg| { |
330 | 0 | WorkerApiServer::new(&cfg, &worker_schedulers) |
331 | 0 | .map(|v| Some(service_setup!(v, http_config))) |
332 | 0 | }) |
333 | 0 | .err_tip(|| "Could not create WorkerApi service")?, |
334 | | ) |
335 | 0 | .add_optional_service( |
336 | 0 | services |
337 | 0 | .experimental_bep |
338 | 0 | .map_or(Ok(None), |cfg| { |
339 | 0 | BepServer::new(&cfg, &store_manager) |
340 | 0 | .map(|v| Some(service_setup!(v, http_config))) |
341 | 0 | }) |
342 | 0 | .err_tip(|| "Could not create BEP service")?, |
343 | | ); |
344 | | |
345 | 0 | let health_registry = health_registry_builder.lock().await.build(); |
346 | | |
347 | 0 | let mut svc = |
348 | 0 | tonic_services |
349 | 0 | .into_axum_router() |
350 | 0 | .layer(nativelink_util::telemetry::OtlpLayer::new( |
351 | 0 | server_cfg.experimental_identity_header.required, |
352 | | )); |
353 | | |
354 | 0 | if let Some(health_cfg) = services.health { |
| | Branch (354:16): [Folded - Ignored] |
355 | 0 | let path = if health_cfg.path.is_empty() { |
| | Branch (355:27): [Folded - Ignored] |
356 | 0 | DEFAULT_HEALTH_STATUS_CHECK_PATH |
357 | | } else { |
358 | 0 | &health_cfg.path |
359 | | }; |
360 | 0 | svc = svc.route_service(path, HealthServer::new(health_registry, &health_cfg)); |
361 | 0 | } |
362 | | |
363 | 0 | if let Some(admin_config) = services.admin { |
| | Branch (363:16): [Folded - Ignored] |
364 | 0 | let path = if admin_config.path.is_empty() { |
| | Branch (364:27): [Folded - Ignored] |
365 | 0 | DEFAULT_ADMIN_API_PATH |
366 | | } else { |
367 | 0 | &admin_config.path |
368 | | }; |
369 | 0 | let worker_schedulers = Arc::new(worker_schedulers.clone()); |
370 | 0 | svc = svc.nest_service( |
371 | 0 | path, |
372 | 0 | Router::new().route( |
373 | 0 | "/scheduler/{instance_name}/set_drain_worker/{worker_id}/{is_draining}", |
374 | 0 | axum::routing::post( |
375 | 0 | move |params: axum::extract::Path<(String, String, String)>| async move { |
376 | 0 | let (instance_name, worker_id, is_draining) = params.0; |
377 | 0 | (async move { |
378 | 0 | let is_draining = match is_draining.as_str() { |
379 | 0 | "0" => false, |
380 | 0 | "1" => true, |
381 | | _ => { |
382 | 0 | return Err(make_err!( |
383 | 0 | Code::Internal, |
384 | 0 | "{} is neither 0 nor 1", |
385 | 0 | is_draining |
386 | 0 | )); |
387 | | } |
388 | | }; |
389 | 0 | worker_schedulers |
390 | 0 | .get(&instance_name) |
391 | 0 | .err_tip(|| { |
392 | 0 | format!( |
393 | 0 | "Can not get an instance with the name of '{}'", |
394 | 0 | &instance_name |
395 | | ) |
396 | 0 | })? |
397 | 0 | .clone() |
398 | 0 | .set_drain_worker(&worker_id.clone().into(), is_draining) |
399 | 0 | .await?; |
400 | 0 | Ok::<_, Error>(format!("Draining worker {worker_id}")) |
401 | | }) |
402 | 0 | .await |
403 | 0 | .map_err(|e| { |
404 | 0 | Err::<String, _>(( |
405 | 0 | StatusCode::INTERNAL_SERVER_ERROR, |
406 | 0 | format!("Error: {e:?}"), |
407 | 0 | )) |
408 | 0 | }) |
409 | 0 | }, |
410 | | ), |
411 | | ), |
412 | | ); |
413 | 0 | } |
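
In concrete terms, with the default admin path this exposes one endpoint: an HTTP POST to `/admin/scheduler/<instance_name>/set_drain_worker/<worker_id>/1` marks the named worker as draining, and posting `0` in the final segment clears the flag again (`<instance_name>` and `<worker_id>` are placeholders for whatever scheduler instance and worker a deployment actually uses).
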
414 | | |
415 | | // This is the default service that executes if no other endpoint matches. |
416 | 0 | svc = svc.fallback(|uri: Uri| async move { |
417 | 0 | warn!("No route for {uri}"); |
418 | 0 | (StatusCode::NOT_FOUND, format!("No route for {uri}")) |
419 | 0 | }); |
420 | | |
421 | | // Configure our TLS acceptor if we have TLS configured. |
422 | 0 | let maybe_tls_acceptor = http_config.tls.map_or(Ok(None), |tls_config| { |
423 | 0 | fn read_cert(cert_file: &str) -> Result<Vec<CertificateDer<'static>>, Error> { |
424 | 0 | let mut cert_reader = std::io::BufReader::new( |
425 | 0 | std::fs::File::open(cert_file) |
426 | 0 | .err_tip(|| format!("Could not open cert file {cert_file}"))?, |
427 | | ); |
428 | 0 | let certs = CertificateDer::pem_reader_iter(&mut cert_reader) |
429 | 0 | .collect::<Result<Vec<CertificateDer<'_>>, _>>() |
430 | 0 | .err_tip(|| format!("Could not extract certs from file {cert_file}"))?; |
431 | 0 | Ok(certs) |
432 | 0 | } |
433 | 0 | let certs = read_cert(&tls_config.cert_file)?; |
434 | 0 | let mut key_reader = std::io::BufReader::new( |
435 | 0 | std::fs::File::open(&tls_config.key_file) |
436 | 0 | .err_tip(|| format!("Could not open key file {}", tls_config.key_file))?, |
437 | | ); |
438 | 0 | let key = match PrivateKeyDer::from_pem_reader(&mut key_reader) |
439 | 0 | .err_tip(|| format!("Could not extract key(s) from file {}", tls_config.key_file))? |
440 | | { |
441 | 0 | PrivateKeyDer::Pkcs8(key) => key.into(), |
442 | 0 | PrivateKeyDer::Sec1(key) => key.into(), |
443 | 0 | PrivateKeyDer::Pkcs1(key) => key.into(), |
444 | | _ => { |
445 | 0 | return Err(make_err!( |
446 | 0 | Code::Internal, |
447 | 0 | "No keys found in file {}", |
448 | 0 | tls_config.key_file |
449 | 0 | )); |
450 | | } |
451 | | }; |
452 | 0 | if PrivateKeyDer::from_pem_reader(&mut key_reader).is_ok() { |
| | Branch (452:16): [Folded - Ignored] |
453 | 0 | return Err(make_err!( |
454 | 0 | Code::InvalidArgument, |
455 | 0 | "Expected 1 key in file {}", |
456 | 0 | tls_config.key_file |
457 | 0 | )); |
458 | 0 | } |
459 | 0 | let verifier = if let Some(client_ca_file) = &tls_config.client_ca_file { |
| | Branch (459:35): [Folded - Ignored] |
460 | 0 | let mut client_auth_roots = RootCertStore::empty(); |
461 | 0 | for cert in read_cert(client_ca_file)? { |
462 | 0 | client_auth_roots.add(cert).map_err(|e| { |
463 | 0 | make_err!(Code::Internal, "Could not read client CA: {e:?}") |
464 | 0 | })?; |
465 | | } |
466 | 0 | let crls = if let Some(client_crl_file) = &tls_config.client_crl_file { |
| | Branch (466:35): [Folded - Ignored] |
467 | 0 | let mut crl_reader = std::io::BufReader::new( |
468 | 0 | std::fs::File::open(client_crl_file) |
469 | 0 | .err_tip(|| format!("Could not open CRL file {client_crl_file}"))?, |
470 | | ); |
471 | 0 | CertificateRevocationListDer::pem_reader_iter(&mut crl_reader) |
472 | 0 | .collect::<Result<_, _>>() |
473 | 0 | .err_tip(|| format!("Could not extract CRLs from file {client_crl_file}"))? |
474 | | } else { |
475 | 0 | Vec::new() |
476 | | }; |
477 | 0 | WebPkiClientVerifier::builder(Arc::new(client_auth_roots)) |
478 | 0 | .with_crls(crls) |
479 | 0 | .build() |
480 | 0 | .map_err(|e| { |
481 | 0 | make_err!( |
482 | 0 | Code::Internal, |
483 | | "Could not create WebPkiClientVerifier: {e:?}" |
484 | | ) |
485 | 0 | })? |
486 | | } else { |
487 | 0 | WebPkiClientVerifier::no_client_auth() |
488 | | }; |
489 | 0 | let mut config = TlsServerConfig::builder() |
490 | 0 | .with_client_cert_verifier(verifier) |
491 | 0 | .with_single_cert(certs, key) |
492 | 0 | .map_err(|e| { |
493 | 0 | make_err!(Code::Internal, "Could not create TlsServerConfig : {e:?}") |
494 | 0 | })?; |
495 | | |
496 | 0 | config.alpn_protocols.push("h2".into()); |
497 | 0 | Ok(Some(TlsAcceptor::from(Arc::new(config)))) |
498 | 0 | })?; |
499 | | |
500 | 0 | let socket_addr = http_config |
501 | 0 | .socket_address |
502 | 0 | .parse::<SocketAddr>() |
503 | 0 | .map_err(|e| { |
504 | 0 | make_input_err!("Invalid address '{}' - {e:?}", http_config.socket_address) |
505 | 0 | })?; |
506 | 0 | let tcp_listener = TcpListener::bind(&socket_addr).await?; |
507 | 0 | let mut http = auto::Builder::new(TaskExecutor::default()); |
508 | | |
509 | 0 | let http_config = &http_config.advanced_http; |
510 | 0 | if let Some(value) = http_config.http2_keep_alive_interval { |
| | Branch (510:16): [Folded - Ignored] |
511 | 0 | http.http2() |
512 | 0 | .keep_alive_interval(Duration::from_secs(u64::from(value))); |
513 | 0 | } |
514 | | |
515 | 0 | if let Some(value) = http_config.experimental_http2_max_pending_accept_reset_streams { |
| | Branch (515:16): [Folded - Ignored] |
516 | 0 | http.http2() |
517 | 0 | .max_pending_accept_reset_streams(usize::try_from(value).err_tip( |
518 | | || "Could not convert experimental_http2_max_pending_accept_reset_streams", |
519 | 0 | )?); |
520 | 0 | } |
521 | 0 | if let Some(value) = http_config.experimental_http2_initial_stream_window_size { |
| | Branch (521:16): [Folded - Ignored] |
522 | 0 | http.http2().initial_stream_window_size(value); |
523 | 0 | } |
524 | 0 | if let Some(value) = http_config.experimental_http2_initial_connection_window_size { |
| | Branch (524:16): [Folded - Ignored] |
525 | 0 | http.http2().initial_connection_window_size(value); |
526 | 0 | } |
527 | 0 | if let Some(value) = http_config.experimental_http2_adaptive_window { |
| | Branch (527:16): [Folded - Ignored] |
528 | 0 | http.http2().adaptive_window(value); |
529 | 0 | } |
530 | 0 | if let Some(value) = http_config.experimental_http2_max_frame_size { |
| | Branch (530:16): [Folded - Ignored] |
531 | 0 | http.http2().max_frame_size(value); |
532 | 0 | } |
533 | 0 | if let Some(value) = http_config.experimental_http2_max_concurrent_streams { |
| | Branch (533:16): [Folded - Ignored] |
534 | 0 | http.http2().max_concurrent_streams(value); |
535 | 0 | } |
536 | 0 | if let Some(value) = http_config.experimental_http2_keep_alive_timeout { |
| | Branch (536:16): [Folded - Ignored] |
537 | 0 | http.http2() |
538 | 0 | .keep_alive_timeout(Duration::from_secs(u64::from(value))); |
539 | 0 | } |
540 | 0 | if let Some(value) = http_config.experimental_http2_max_send_buf_size { |
| | Branch (540:16): [Folded - Ignored] |
541 | 0 | http.http2().max_send_buf_size( |
542 | 0 | usize::try_from(value).err_tip(|| "Could not convert http2_max_send_buf_size")?, |
543 | | ); |
544 | 0 | } |
545 | 0 | if http_config.experimental_http2_enable_connect_protocol == Some(true) { |
| | Branch (545:12): [Folded - Ignored] |
546 | 0 | http.http2().enable_connect_protocol(); |
547 | 0 | } |
548 | 0 | if let Some(value) = http_config.experimental_http2_max_header_list_size { |
| | Branch (548:16): [Folded - Ignored] |
549 | 0 | http.http2().max_header_list_size(value); |
550 | 0 | } |
551 | 0 | info!("Ready, listening on {socket_addr}",); |
552 | 0 | root_futures.push(Box::pin(async move { |
553 | | loop { |
554 | 0 | select! { |
555 | 0 | accept_result = tcp_listener.accept() => { |
556 | 0 | match accept_result { |
557 | 0 | Ok((tcp_stream, remote_addr)) => { |
558 | 0 | info!( |
559 | | target: "nativelink::services", |
560 | | ?remote_addr, |
561 | | ?socket_addr, |
562 | 0 | "Client connected" |
563 | | ); |
564 | | |
565 | 0 | let (http, svc, maybe_tls_acceptor) = |
566 | 0 | (http.clone(), svc.clone(), maybe_tls_acceptor.clone()); |
567 | | |
568 | 0 | background_spawn!( |
569 | | name: "http_connection", |
570 | 0 | fut: error_span!( |
571 | | "http_connection", |
572 | | remote_addr = %remote_addr, |
573 | | socket_addr = %socket_addr, |
574 | 0 | ).in_scope(|| async move { |
575 | 0 | let serve_connection = if let Some(tls_acceptor) = maybe_tls_acceptor { |
| | Branch (575:71): [Folded - Ignored] |
576 | 0 | match tls_acceptor.accept(tcp_stream).await { |
577 | 0 | Ok(tls_stream) => Either::Left(http.serve_connection( |
578 | 0 | TokioIo::new(tls_stream), |
579 | 0 | TowerToHyperService::new(svc), |
580 | 0 | )), |
581 | 0 | Err(err) => { |
582 | 0 | error!(?err, "Failed to accept tls stream"); |
583 | 0 | return; |
584 | | } |
585 | | } |
586 | | } else { |
587 | 0 | Either::Right(http.serve_connection( |
588 | 0 | TokioIo::new(tcp_stream), |
589 | 0 | TowerToHyperService::new(svc), |
590 | 0 | )) |
591 | | }; |
592 | | |
593 | 0 | if let Err(err) = serve_connection.await { |
| | Branch (593:48): [Folded - Ignored] |
594 | 0 | error!( |
595 | | target: "nativelink::services", |
596 | | ?err, |
597 | 0 | "Failed running service" |
598 | | ); |
599 | 0 | } |
600 | 0 | }), |
601 | | target: "nativelink::services", |
602 | | ?remote_addr, |
603 | | ?socket_addr, |
604 | | ); |
605 | | }, |
606 | 0 | Err(err) => { |
607 | 0 | error!(?err, "Failed to accept tcp connection"); |
608 | | } |
609 | | } |
610 | | }, |
611 | | } |
612 | | } |
613 | | // Unreachable |
614 | | })); |
615 | | } |
616 | | |
617 | | { |
618 | | // We start workers after our TcpListener is set up so that if a worker connects to |
619 | | // one of these services the connection will succeed. |
620 | 0 | let worker_cfgs = cfg.workers.unwrap_or_default(); |
621 | 0 | let mut worker_names = HashSet::with_capacity(worker_cfgs.len()); |
622 | 0 | for (i, worker_cfg) in worker_cfgs.into_iter().enumerate() { |
623 | 0 | let spawn_fut = match worker_cfg { |
624 | 0 | WorkerConfig::Local(local_worker_cfg) => { |
625 | 0 | let fast_slow_store = store_manager |
626 | 0 | .get_store(&local_worker_cfg.cas_fast_slow_store) |
627 | 0 | .err_tip(|| { |
628 | 0 | format!( |
629 | 0 | "Failed to find store for cas_store_ref in worker config : {}", |
630 | | local_worker_cfg.cas_fast_slow_store |
631 | | ) |
632 | 0 | })?; |
633 | | |
634 | 0 | let maybe_ac_store = if let Some(ac_store_ref) = |
| | Branch (634:49): [Folded - Ignored] |
635 | 0 | &local_worker_cfg.upload_action_result.ac_store |
636 | | { |
637 | 0 | Some(store_manager.get_store(ac_store_ref).err_tip(|| { |
638 | 0 | format!("Failed to find store for ac_store in worker config : {ac_store_ref}") |
639 | 0 | })?) |
640 | | } else { |
641 | 0 | None |
642 | | }; |
643 | | // Note: Defaults to fast_slow_store if not specified. If this ever changes, it must |
644 | | // be updated in the config documentation for the `historical_results_store` field. |
645 | 0 | let historical_store = if let Some(cas_store_ref) = &local_worker_cfg |
| | Branch (645:51): [Folded - Ignored] |
646 | 0 | .upload_action_result |
647 | 0 | .historical_results_store |
648 | | { |
649 | 0 | store_manager.get_store(cas_store_ref).err_tip(|| { |
650 | 0 | format!( |
651 | 0 | "Failed to find store for historical_results_store in worker config : {cas_store_ref}" |
652 | | ) |
653 | 0 | })? |
654 | | } else { |
655 | 0 | fast_slow_store.clone() |
656 | | }; |
657 | 0 | let local_worker = new_local_worker( |
658 | 0 | Arc::new(local_worker_cfg), |
659 | 0 | fast_slow_store, |
660 | 0 | maybe_ac_store, |
661 | 0 | historical_store, |
662 | 0 | ) |
663 | 0 | .await |
664 | 0 | .err_tip(|| "Could not make LocalWorker")?; |
665 | | |
666 | 0 | let name = if local_worker.name().is_empty() { |
| | Branch (666:35): [Folded - Ignored] |
667 | 0 | format!("worker_{i}") |
668 | | } else { |
669 | 0 | local_worker.name().clone() |
670 | | }; |
671 | | |
672 | 0 | if worker_names.contains(&name) { |
| | Branch (672:24): [Folded - Ignored] |
673 | 0 | Err(make_input_err!( |
674 | 0 | "Duplicate worker name '{}' found in config", |
675 | 0 | name |
676 | 0 | ))?; |
677 | 0 | } |
678 | 0 | worker_names.insert(name.clone()); |
679 | 0 | let shutdown_rx = shutdown_tx.subscribe(); |
680 | 0 | let fut = trace_span!("worker_ctx", worker_name = %name) |
681 | 0 | .in_scope(|| local_worker.run(shutdown_rx)); |
682 | 0 | spawn!("worker", fut, ?name) |
683 | | } |
684 | | }; |
685 | 0 | root_futures.push(Box::pin(spawn_fut.map_ok_or_else(|e| Err(e.into()), |v| v))); |
686 | | } |
687 | | } |
688 | | |
689 | | // Set up a shutdown handler for the worker schedulers. |
690 | 0 | let mut shutdown_rx = shutdown_tx.subscribe(); |
691 | 0 | root_futures.push(Box::pin(async move { |
692 | 0 | if let Ok(shutdown_guard) = shutdown_rx.recv().await { |
| | Branch (692:16): [Folded - Ignored] |
693 | 0 | let _ = scheduler_shutdown_tx.send(()); |
694 | 0 | for (_name, scheduler) in worker_schedulers { |
695 | 0 | scheduler.shutdown(shutdown_guard.clone()).await; |
696 | | } |
697 | 0 | } |
698 | 0 | Ok(()) |
699 | 0 | })); |
700 | | |
701 | 0 | if let Err(e) = try_join_all(root_futures).await { |
| | Branch (701:12): [Folded - Ignored] |
702 | 0 | panic!("{e:?}"); |
703 | 0 | } |
704 | | |
705 | 0 | Ok(()) |
706 | 0 | } |
707 | | |
708 | 0 | fn get_config() -> Result<CasConfig, Error> { |
709 | 0 | let args = Args::parse(); |
710 | 0 | CasConfig::try_from_json5_file(&args.config_file) |
711 | 0 | } |
712 | | |
713 | 0 | fn main() -> Result<(), Box<dyn core::error::Error>> { |
714 | | #[expect(clippy::disallowed_methods, reason = "starting main runtime")] |
715 | 0 | let runtime = tokio::runtime::Builder::new_multi_thread() |
716 | 0 | .enable_all() |
717 | 0 | .build()?; |
718 | | |
719 | | // The OTLP exporters need to run in a Tokio context |
720 | | // Do this first so all the other logging works |
721 | | #[expect(clippy::disallowed_methods, reason = "tracing init on main runtime")] |
722 | 0 | runtime.block_on(async { tokio::spawn(async { init_tracing() }).await? })?; |
723 | | |
724 | 0 | let mut cfg = get_config()?; |
725 | | |
726 | 0 | let global_cfg = if let Some(global_cfg) = &mut cfg.global { |
| | Branch (726:29): [Folded - Ignored] |
727 | 0 | if global_cfg.max_open_files == 0 { |
| | Branch (727:12): [Folded - Ignored] |
728 | 0 | global_cfg.max_open_files = fs::DEFAULT_OPEN_FILE_LIMIT; |
729 | 0 | } |
730 | 0 | if global_cfg.default_digest_size_health_check == 0 { |
| | Branch (730:12): [Folded - Ignored] |
731 | 0 | global_cfg.default_digest_size_health_check = DEFAULT_DIGEST_SIZE_HEALTH_CHECK_CFG; |
732 | 0 | } |
733 | | |
734 | 0 | *global_cfg |
735 | | } else { |
736 | 0 | GlobalConfig { |
737 | 0 | max_open_files: fs::DEFAULT_OPEN_FILE_LIMIT, |
738 | 0 | default_digest_hash_function: None, |
739 | 0 | default_digest_size_health_check: DEFAULT_DIGEST_SIZE_HEALTH_CHECK_CFG, |
740 | 0 | } |
741 | | }; |
742 | 0 | set_open_file_limit(global_cfg.max_open_files); |
743 | 0 | set_default_digest_hasher_func(DigestHasherFunc::from( |
744 | 0 | global_cfg |
745 | 0 | .default_digest_hash_function |
746 | 0 | .unwrap_or(ConfigDigestHashFunction::Sha256), |
747 | 0 | ))?; |
748 | 0 | set_default_digest_size_health_check(global_cfg.default_digest_size_health_check)?; |
749 | | |
750 | | // Initiates the shutdown process by broadcasting the shutdown signal (a `ShutdownGuard`) via the `broadcast::Sender` to all listeners. |
751 | | // Each listener will perform its cleanup and then drop its `ShutdownGuard`, signaling completion. |
752 | | // Once all `ShutdownGuard` clones are dropped, the process knows it can safely terminate. |
753 | 0 | let (shutdown_tx, _) = broadcast::channel::<ShutdownGuard>(BROADCAST_CAPACITY); |
754 | | #[cfg(target_family = "unix")] |
755 | 0 | let shutdown_tx_clone = shutdown_tx.clone(); |
756 | | #[cfg(target_family = "unix")] |
757 | 0 | let mut shutdown_guard = ShutdownGuard::default(); |
758 | | |
759 | | #[expect(clippy::disallowed_methods, reason = "signal handler on main runtime")] |
760 | 0 | runtime.spawn(async move { |
761 | 0 | tokio::signal::ctrl_c() |
762 | 0 | .await |
763 | 0 | .expect("Failed to listen to SIGINT"); |
764 | 0 | eprintln!("User terminated process via SIGINT"); |
765 | 0 | std::process::exit(130); |
766 | | }); |
767 | | |
768 | | #[allow(unused_variables)] |
769 | 0 | let (scheduler_shutdown_tx, scheduler_shutdown_rx) = oneshot::channel(); |
770 | | |
771 | | #[cfg(target_family = "unix")] |
772 | | #[expect(clippy::disallowed_methods, reason = "signal handler on main runtime")] |
773 | 0 | runtime.spawn(async move { |
774 | 0 | signal(SignalKind::terminate()) |
775 | 0 | .expect("Failed to listen to SIGTERM") |
776 | 0 | .recv() |
777 | 0 | .await; |
778 | 0 | warn!("Process terminated via SIGTERM",); |
779 | 0 | drop(shutdown_tx_clone.send(shutdown_guard.clone())); |
780 | 0 | scheduler_shutdown_rx |
781 | 0 | .await |
782 | 0 | .expect("Failed to receive scheduler shutdown"); |
783 | 0 | let () = shutdown_guard.wait_for(Priority::P0).await; |
784 | 0 | warn!("Successfully shut down nativelink.",); |
785 | 0 | std::process::exit(143); |
786 | | }); |
787 | | |
788 | | #[expect(clippy::disallowed_methods, reason = "waiting on everything to finish")] |
789 | 0 | runtime |
790 | 0 | .block_on(async { |
791 | 0 | trace_span!("main") |
792 | 0 | .in_scope(|| async { inner_main(cfg, shutdown_tx, scheduler_shutdown_tx).await }) |
793 | 0 | .await |
794 | 0 | }) |
795 | 0 | .err_tip(|| "main() function failed")?; |
796 | 0 | Ok(()) |
797 | 0 | } |
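
A closing note on the shutdown choreography above: SIGINT exits immediately with status 130, while SIGTERM broadcasts a `ShutdownGuard`, waits for the scheduler handshake and for the guards to drain, and only then exits with status 143. The sketch below is a minimal, self-contained illustration of the same broadcast-then-wait pattern using only plain tokio primitives; it is not the actual implementation (a unit payload stands in for `ShutdownGuard`, and the scheduler oneshot handshake is omitted):

    use std::time::Duration;

    use tokio::signal::unix::{SignalKind, signal};
    use tokio::sync::broadcast;

    #[tokio::main]
    async fn main() {
        // Fan-out channel: every long-running task subscribes before work starts.
        let (shutdown_tx, _) = broadcast::channel::<()>(1);

        // A worker-style task that drains in-flight work when the signal arrives.
        let mut shutdown_rx = shutdown_tx.subscribe();
        let worker = tokio::spawn(async move {
            loop {
                tokio::select! {
                    _ = shutdown_rx.recv() => {
                        // Clean up here, then fall out of the loop.
                        break;
                    }
                    () = tokio::time::sleep(Duration::from_secs(1)) => {
                        // ... normal periodic work ...
                    }
                }
            }
        });

        // Unix-only, mirroring the SIGTERM handler above: broadcast, then wait.
        let mut sigterm = signal(SignalKind::terminate()).expect("failed to install SIGTERM handler");
        sigterm.recv().await;
        let _ = shutdown_tx.send(());

        // Only exit once the subscriber has finished its cleanup.
        let _ = worker.await;
        std::process::exit(143); // 128 + SIGTERM(15), matching the status used above.
    }
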