/build/source/nativelink-worker/src/namespace_utils.rs
Line | Count | Source |
1 | | // Copyright 2026 The NativeLink Authors. All rights reserved. |
2 | | // |
3 | | // Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // See LICENSE file for details |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | /// A wrapper around a Child to send SIGTERM to kill the process instead |
16 | | /// of SIGKILL as it's wrapped by the stub. |
17 | | #[derive(Debug)] |
18 | | pub struct MaybeNamespacedChild { |
19 | | namespaced: bool, |
20 | | child: tokio::process::Child, |
21 | | } |
22 | | |
23 | | impl MaybeNamespacedChild { |
24 | 18 | pub const fn new(namespaced: bool, child: tokio::process::Child) -> Self { |
25 | 18 | Self { namespaced, child } |
26 | 18 | } |
27 | | |
28 | | /// Send SIGTERM if namespaced which sends SIGKILL to the child, otherwise |
29 | | /// send SIGKILL to the child. |
30 | 7 | pub async fn kill(&mut self) -> Result<(), std::io::Error> { |
31 | 7 | if self.namespaced { |
32 | | // It would be safer to call send_signal to use the pidfd to avoid |
33 | | // races, however this is still an experimental API, see: |
34 | | // https://github.com/rust-lang/rust/issues/141975 |
35 | | // self.child.std_child().send_signal(Signal::SIGTERM)?; |
36 | | // return self.child.wait().await.map(|_| ()); |
37 | 6 | if let Some(pid) = self.child.id() { |
38 | | // SAFETY: pid is valid as provided by the wrapper and we are |
39 | | // sending a signal to the namespaced stub. |
40 | 6 | unsafe { libc::kill(pid as libc::pid_t, libc::SIGTERM) }; |
41 | 6 | return self.child.wait().await.map(|_| ()); |
42 | 0 | } |
43 | 1 | } |
44 | 1 | self.child.kill().await |
45 | 7 | } |
46 | | |
47 | 2 | pub fn try_wait(&mut self) -> Result<Option<std::process::ExitStatus>, std::io::Error> { |
48 | 2 | self.child.try_wait() |
49 | 2 | } |
50 | | |
51 | 21 | pub async fn wait(&mut self) -> Result<std::process::ExitStatus, std::io::Error> {18 |
52 | 18 | self.child.wait().await |
53 | 17 | } |
54 | | } |
55 | | |
56 | | /// Determines whether the namespaces provided by this module are supported |
57 | | /// on the currently running system by forking a process and trying to enter |
58 | | /// it into the new namespaces. |
59 | 31 | pub fn namespaces_supported(mount: bool) -> bool { |
60 | | // SAFETY: Posix requires that geteuid is always successful. |
61 | 31 | let uid = unsafe { libc::geteuid() }; |
62 | 31 | let uid_map = format!("{uid} {uid} 1\n"); |
63 | | // SAFETY: We ensure that if pid == 0 we only call async-signal-safe functions. |
64 | 31 | let pid = unsafe { libc::fork() }; |
65 | 31 | match pid { |
66 | | 0 => { |
67 | 0 | let mut flags = |
68 | 0 | libc::CLONE_NEWPID | libc::CLONE_NEWUSER | libc::CLONE_NEWIPC | libc::CLONE_NEWUTS; |
69 | 0 | if mount { |
70 | 0 | flags |= libc::CLONE_NEWNS; |
71 | 0 | } |
72 | | // SAFETY: Unshare does not have any unsafe effects and modifies no |
73 | | // memory, it is also async-signal-safe. |
74 | 0 | if unsafe { libc::unshare(flags) } == 0 |
75 | 0 | && write_signal_safe(c"/proc/self/uid_map", uid_map.as_bytes()).is_ok() |
76 | | { |
77 | | // SAFETY: Mount uses no memory and is async-signal-safe. |
78 | 0 | if !mount |
79 | 0 | || unsafe { |
80 | 0 | libc::mount( |
81 | 0 | core::ptr::null(), |
82 | 0 | c"/".as_ptr(), |
83 | 0 | core::ptr::null(), |
84 | 0 | libc::MS_REC | libc::MS_PRIVATE, |
85 | 0 | core::ptr::null(), |
86 | 0 | ) |
87 | 0 | } == 0 |
88 | | { |
89 | | // SAFETY: It is always safe to _exit. |
90 | 0 | unsafe { libc::_exit(0) }; |
91 | 0 | } |
92 | 0 | } |
93 | | // SAFETY: It is always safe to _exit. |
94 | 0 | unsafe { libc::_exit(1) }; |
95 | | } |
96 | 31 | pid if pid > 0 => { |
97 | 31 | let mut status = 0; |
98 | | // SAFETY: The pid is valid and created by us and the status is our own stack. |
99 | 31 | while unsafe { libc::waitpid(pid, &raw mut status, 0) } == -1 { |
100 | | // SAFETY: We just called a libc function that failed (-1). |
101 | 0 | if unsafe { *libc::__errno_location() } != libc::EINTR { |
102 | 0 | return false; |
103 | 0 | } |
104 | | } |
105 | 31 | libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0 |
106 | | } |
107 | 0 | _ => false, |
108 | | } |
109 | 31 | } |
110 | | |
111 | | /// Writes to a file in an async-signal-safe manner, does the write in a |
112 | | /// single chunk and assumes it will all be consumed, if the whole chunk |
113 | | /// is not written returns Err(EIO). This is expected to be used for |
114 | | /// special files such as /proc which will always accept the whole buffer. |
115 | 0 | fn write_signal_safe(file_name: &core::ffi::CStr, data: &[u8]) -> Result<(), core::ffi::c_int> { |
116 | | // SAFETY: The path is a CStr which is guaranteed to end in a NUL byte |
117 | | // and the returned file descriptor is always closed. |
118 | 0 | let fd = unsafe { libc::open(file_name.as_ptr().cast(), libc::O_WRONLY) }; |
119 | 0 | if fd < 0 { |
120 | | // SAFETY: We just called a libc function that failed (-1). |
121 | 0 | return Err(unsafe { *libc::__errno_location() }); |
122 | 0 | } |
123 | 0 | let fd = OwnedFd(fd); |
124 | | |
125 | | // SAFETY: The data is a known length slice and the file descriptor is |
126 | | // known to be valid as we just opened it. |
127 | 0 | let bytes_written = unsafe { libc::write(fd.0, data.as_ptr().cast(), data.len()) }; |
128 | | |
129 | 0 | if bytes_written == -1 { |
130 | | // SAFETY: We just called a libc function that failed (-1). |
131 | 0 | Err(unsafe { *libc::__errno_location() }) |
132 | 0 | } else if bytes_written as usize != data.len() { |
133 | 0 | Err(libc::EIO) |
134 | | } else { |
135 | 0 | Ok(()) |
136 | | } |
137 | 0 | } |
138 | | |
139 | | /// An async-signal-safe method to close all open file descriptors for the |
140 | | /// current process. This function is unsafe as any existing handles to |
141 | | /// file descriptors will be invalidated. None may be used after calling |
142 | | /// this function. |
143 | 0 | unsafe fn close_all_fds() { |
144 | | // SAFETY: It is safe to call close on all file descriptors as this is |
145 | | // the purpose of the function. |
146 | 0 | if unsafe { libc::syscall(libc::SYS_close_range, 0, libc::INT_MAX, 0) } == 0 { |
147 | 0 | return; |
148 | 0 | } |
149 | | // Since we're <5.9 kernel, we need to get the max FD count. |
150 | 0 | let mut rlim = core::mem::MaybeUninit::<libc::rlimit>::uninit(); |
151 | | // SAFETY: We just allocated the memory for this and getrlimit is async-signal-safe. |
152 | 0 | let max_fd = if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, rlim.as_mut_ptr()) } == 0 { |
153 | | // SAFETY: We just initialised this in getrlimit above that succeeded. |
154 | 0 | let cur = unsafe { rlim.assume_init().rlim_cur }; |
155 | 0 | if cur == libc::RLIM_INFINITY { |
156 | | // Sane fallback for unlimited environments |
157 | 0 | 0x0001_0000 |
158 | | } else { |
159 | 0 | core::ffi::c_int::try_from(cur).unwrap_or(0x0001_0000) |
160 | | } |
161 | | } else { |
162 | | // Fallback for getrlimit failure. |
163 | 0 | 4096 |
164 | | }; |
165 | 0 | for fd in 0..max_fd { |
166 | 0 | // SAFETY: It is safe to close a file descriptor that is not open and |
167 | 0 | // we also want to close all, so there's no issue with closing file |
168 | 0 | // descriptors that others may have handles to. |
169 | 0 | unsafe { libc::close(fd) }; |
170 | 0 | } |
171 | 0 | } |
172 | | |
/// Writes `n` as an ASCII decimal string to the start of `buf` and returns
/// the number of bytes written.
///
/// No allocation is performed. Panics if `buf` is too short to hold every
/// digit (a `u32` needs at most 10 bytes).
fn u32_to_bytes(mut n: u32, buf: &mut [u8]) -> usize {
    let mut len = 0;
    // Emit the least significant digit first; running the body at least once
    // makes zero come out as a single '0'.
    loop {
        buf[len] = b'0' + (n % 10) as u8;
        len += 1;
        n /= 10;
        if n == 0 {
            break;
        }
    }
    // Digits were produced in reverse order, so flip them into place.
    buf[..len].reverse();
    len
}
188 | | |
189 | | /// Create a line in the buffer of the format "{id} {id} 1\n" in an |
190 | | /// async-signal-safe manner. |
191 | 0 | fn create_map_line(id: u32, buffer: &mut [u8; 32]) -> &'_ [u8] { |
192 | 0 | let mut pos = 0; |
193 | 0 | pos += u32_to_bytes(id, &mut buffer[pos..]); |
194 | 0 | buffer[pos] = b' '; |
195 | 0 | pos += 1; |
196 | 0 | pos += u32_to_bytes(id, &mut buffer[pos..]); |
197 | 0 | buffer[pos] = b' '; |
198 | 0 | pos += 1; |
199 | 0 | buffer[pos] = b'1'; |
200 | 0 | pos += 1; |
201 | 0 | buffer[pos] = b'\n'; |
202 | 0 | pos += 1; |
203 | 0 | &buffer[..pos] |
204 | 0 | } |
205 | | |
206 | | /// A simple wrapper around a file descriptor to ensure async-signal-safety |
207 | | /// rather than the std version which may allocate. |
208 | | struct OwnedFd(libc::c_int); |
209 | | |
210 | | impl Drop for OwnedFd { |
211 | 0 | fn drop(&mut self) { |
212 | | // SAFETY: We own the file descriptor, so we can close it. |
213 | 0 | unsafe { |
214 | 0 | libc::close(self.0); |
215 | 0 | } |
216 | 0 | } |
217 | | } |
218 | | |
219 | 0 | fn perform_remount( |
220 | 0 | root_action_directory: &core::ffi::CStr, |
221 | 0 | action_directory: &core::ffi::CStr, |
222 | 0 | ) -> Result<(), std::io::Error> { |
223 | | // Make the mount namespace private to avoid changes propagating back to the host. |
224 | | // SAFETY: mount is async-signal-safe. We pass a null pointer for the source and valid |
225 | | // C-string pointers for the target. The parameters match POSIX requirements. |
226 | 0 | if unsafe { |
227 | 0 | libc::mount( |
228 | 0 | core::ptr::null(), |
229 | 0 | c"/".as_ptr(), |
230 | 0 | core::ptr::null(), |
231 | 0 | libc::MS_REC | libc::MS_PRIVATE, |
232 | 0 | core::ptr::null(), |
233 | 0 | ) |
234 | 0 | } != 0 |
235 | | { |
236 | 0 | return Err(std::io::Error::last_os_error()); |
237 | 0 | } |
238 | | |
239 | | // Bind mount the action directory to itself to "save" its current contents before |
240 | | // we mask its parent. |
241 | | // SAFETY: mount is async-signal-safe. We pass valid C-string pointers for the paths. |
242 | 0 | if unsafe { |
243 | 0 | libc::mount( |
244 | 0 | action_directory.as_ptr(), |
245 | 0 | action_directory.as_ptr(), |
246 | 0 | core::ptr::null(), |
247 | 0 | libc::MS_BIND | libc::MS_REC, |
248 | 0 | core::ptr::null(), |
249 | 0 | ) |
250 | 0 | } != 0 |
251 | | { |
252 | 0 | return Err(std::io::Error::last_os_error()); |
253 | 0 | } |
254 | | |
255 | | // Open the directory with O_PATH so we can find it after masking the parent. |
256 | | // SAFETY: open is async-signal-safe. The path is a valid C-string. |
257 | 0 | let fd = unsafe { libc::open(action_directory.as_ptr(), libc::O_PATH) }; |
258 | 0 | if fd < 0 { |
259 | 0 | return Err(std::io::Error::last_os_error()); |
260 | 0 | } |
261 | 0 | let fd = OwnedFd(fd); |
262 | | |
263 | | // Mask the root action directory with a tmpfs to ensure sibling directories aren't visible. |
264 | | // SAFETY: mount is async-signal-safe. The filesystem type and target are valid C-strings. |
265 | 0 | if unsafe { |
266 | 0 | libc::mount( |
267 | 0 | c"tmpfs".as_ptr(), |
268 | 0 | root_action_directory.as_ptr(), |
269 | 0 | c"tmpfs".as_ptr(), |
270 | 0 | 0, |
271 | 0 | core::ptr::null(), |
272 | 0 | ) |
273 | 0 | } != 0 |
274 | | { |
275 | 0 | return Err(std::io::Error::last_os_error()); |
276 | 0 | } |
277 | | |
278 | | // Recreate the specific operation's directory inside the empty tmpfs. |
279 | | // SAFETY: mkdir is async-signal-safe and the path is a valid C-string. |
280 | 0 | if unsafe { libc::mkdir(action_directory.as_ptr(), 0o777) } != 0 { |
281 | 0 | return Err(std::io::Error::last_os_error()); |
282 | 0 | } |
283 | | |
284 | | // Bind mount the saved directory back from the file descriptor to the new path. |
285 | 0 | let mut proc_path = [0u8; 64]; |
286 | 0 | let mut pos = 0; |
287 | 0 | for &b in b"/proc/self/fd/" { |
288 | 0 | proc_path[pos] = b; |
289 | 0 | pos += 1; |
290 | 0 | } |
291 | 0 | pos += u32_to_bytes(fd.0 as u32, &mut proc_path[pos..]); |
292 | 0 | proc_path[pos] = 0; |
293 | | |
294 | | // SAFETY: mount is async-signal-safe. The target path is a valid C-string and the source |
295 | | // path is correctly formatted using /proc/self/fd/. |
296 | 0 | if unsafe { |
297 | 0 | libc::mount( |
298 | 0 | proc_path.as_ptr().cast(), |
299 | 0 | action_directory.as_ptr(), |
300 | 0 | core::ptr::null(), |
301 | 0 | libc::MS_BIND | libc::MS_REC, |
302 | 0 | core::ptr::null(), |
303 | 0 | ) |
304 | 0 | } != 0 |
305 | | { |
306 | 0 | return Err(std::io::Error::last_os_error()); |
307 | 0 | } |
308 | | |
309 | 0 | Ok(()) |
310 | 0 | } |
311 | | |
312 | | /// A hook for a `Command::spawn` to create the process in a new namespace. |
313 | | /// This creates a stub process that the Command points at which forwards |
314 | | /// SIGKILL to the actual process in the new user, PID, UTS and IPC |
315 | | /// namespaces. Pass this function to `CommandBuilder::pre_exec`. |
316 | | /// |
317 | | /// This function is async-signal-safe and has no external locks or |
318 | | /// memory allocations. |
319 | 0 | pub fn configure_namespace( |
320 | 0 | mount: bool, |
321 | 0 | root_action_directory: &core::ffi::CStr, |
322 | 0 | action_directory: &core::ffi::CStr, |
323 | 0 | ) -> std::io::Result<()> { |
324 | | // SAFETY: It is always safe to call geteuid on Posix. |
325 | 0 | let uid = unsafe { libc::geteuid() }; |
326 | | // SAFETY: It is always safe to call getegid on Posix. |
327 | 0 | let gid = unsafe { libc::getegid() }; |
328 | | |
329 | 0 | let mut flags = |
330 | 0 | libc::CLONE_NEWPID | libc::CLONE_NEWUSER | libc::CLONE_NEWIPC | libc::CLONE_NEWUTS; |
331 | 0 | if mount { |
332 | 0 | flags |= libc::CLONE_NEWNS; |
333 | 0 | } |
334 | | // SAFETY: Unshare does not have any unsafe effects and modifies no |
335 | | // memory, it is also async-signal-safe. |
336 | 0 | if unsafe { libc::unshare(flags) } != 0 { |
337 | 0 | return Err(std::io::Error::last_os_error()); |
338 | 0 | } |
339 | | |
340 | 0 | if let Err(e) = write_signal_safe(c"/proc/self/setgroups", b"deny") { |
341 | | // If we fail to write this it will just make gid_map fail later, |
342 | | // but we may be able to continue anyway. |
343 | 0 | if e != libc::EPERM && e != libc::EACCES && e != libc::ENOENT { |
344 | 0 | return Err(std::io::Error::from_raw_os_error(e)); |
345 | 0 | } |
346 | 0 | } |
347 | | |
348 | 0 | let mut buffer = [0u8; 32]; |
349 | 0 | write_signal_safe(c"/proc/self/uid_map", create_map_line(uid, &mut buffer)) |
350 | 0 | .map_err(std::io::Error::from_raw_os_error)?; |
351 | | |
352 | | // If we can't write to gid_map, we just ignore it. This usually happens if |
353 | | // setgroups was not written to (because of permissions) or if we are in a |
354 | | // restricted environment. |
355 | 0 | if let Err(e) = write_signal_safe(c"/proc/self/gid_map", create_map_line(gid, &mut buffer)) { |
356 | | // If this fails then we can probably continue just fine, it's just |
357 | | // the uid that's important. |
358 | 0 | if e != libc::EPERM && e != libc::EACCES { |
359 | 0 | return Err(std::io::Error::from_raw_os_error(e)); |
360 | 0 | } |
361 | 0 | } |
362 | | |
363 | | // Configure the mount namespace if enabled. |
364 | 0 | if mount { |
365 | 0 | perform_remount(root_action_directory, action_directory).unwrap(); |
366 | 0 | } |
367 | | |
368 | | // Set hostname to "nativelink" to ensure reproducibility. |
369 | 0 | let hostname = b"nativelink"; |
370 | | // SAFETY: We reference the static memory above only and this is |
371 | | // async-signal-safe. |
372 | 0 | if unsafe { libc::sethostname(hostname.as_ptr().cast(), hostname.len()) } != 0 { |
373 | | // SAFETY: We just called a libc function that failed. |
374 | 0 | let err = unsafe { *libc::__errno_location() }; |
375 | 0 | if err != libc::EPERM && err != libc::EACCES { |
376 | 0 | return Err(std::io::Error::from_raw_os_error(err)); |
377 | 0 | } |
378 | 0 | } |
379 | | |
380 | | // Fork to enter the PID namespace. |
381 | | // SAFETY: We are already in a required async-signal-safe environment, we |
382 | | // will continue to ensure that ongoing. |
383 | 0 | match unsafe { libc::fork() } { |
384 | | 0 => { |
385 | | // SAFETY: This function is async-signal-safe and references no memory or resources. |
386 | 0 | if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 { |
387 | | // SAFETY: It's always safe to _exit. |
388 | 0 | unsafe { libc::_exit(1) }; |
389 | 0 | } |
390 | 0 | Ok(()) |
391 | | } |
392 | 0 | pid if pid > 0 => { |
393 | | // Ensure that any children spawned by the action are re-parented to |
394 | | // this process if their parent dies. This is effectively a sub-reaper. |
395 | | // SAFETY: prctl is async-signal-safe. |
396 | 0 | unsafe { libc::prctl(libc::PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) }; |
397 | | |
398 | | // SAFETY: All operations below simply _exit and therefore there |
399 | | // are no issues with dangling file descriptor handles. |
400 | 0 | unsafe { close_all_fds() }; |
401 | | |
402 | 0 | let mut sigset = core::mem::MaybeUninit::<libc::sigset_t>::uninit(); |
403 | | // SAFETY: sigset is on the stack and we are initializing it. |
404 | 0 | unsafe { |
405 | 0 | libc::sigemptyset(sigset.as_mut_ptr()); |
406 | 0 | libc::sigaddset(sigset.as_mut_ptr(), libc::SIGTERM); |
407 | 0 | libc::sigaddset(sigset.as_mut_ptr(), libc::SIGCHLD); |
408 | 0 | libc::sigprocmask(libc::SIG_BLOCK, sigset.as_ptr(), core::ptr::null_mut()); |
409 | 0 | } |
410 | | |
411 | | loop { |
412 | | // Reap all exited children. |
413 | | loop { |
414 | 0 | let mut status = 0; |
415 | | // SAFETY: The status is on the stack and waitpid is otherwise |
416 | | // safe to call. |
417 | 0 | let res = unsafe { libc::waitpid(-1, &raw mut status, libc::WNOHANG) }; |
418 | 0 | if res == pid { |
419 | 0 | if libc::WIFEXITED(status) { |
420 | | // SAFETY: It's always safe to _exit. |
421 | 0 | unsafe { libc::_exit(libc::WEXITSTATUS(status)) }; |
422 | 0 | } else if libc::WIFSIGNALED(status) { |
423 | | // Try to exit with the same signal as the child. |
424 | | // SAFETY: The sigset was previously allocated and used on the stack. |
425 | | unsafe { |
426 | 0 | libc::sigprocmask( |
427 | | libc::SIG_UNBLOCK, |
428 | 0 | sigset.as_ptr(), |
429 | 0 | core::ptr::null_mut(), |
430 | | ) |
431 | | }; |
432 | | // SAFETY: It's always safe to raise and as a fallback we _exit below. |
433 | 0 | unsafe { libc::raise(libc::WTERMSIG(status)) }; |
434 | | // We shouldn't get here, but it's a fallback in case. |
435 | | // SAFETY: It's always safe to _exit. |
436 | 0 | unsafe { libc::_exit(libc::WTERMSIG(status)) }; |
437 | 0 | } |
438 | 0 | } else if res <= 0 { |
439 | | // SAFETY: We just called a libc function that failed. |
440 | 0 | if res == -1 && unsafe { *libc::__errno_location() } != libc::EINTR { |
441 | | // SAFETY: It's always safe to _exit. |
442 | 0 | unsafe { libc::_exit(255) }; |
443 | 0 | } |
444 | | // Break the reaping loop to wait for signals. |
445 | 0 | break; |
446 | 0 | } |
447 | | } |
448 | | |
449 | 0 | let mut siginfo = core::mem::MaybeUninit::<libc::siginfo_t>::uninit(); |
450 | | // SAFETY: sigset is initialized and siginfo is on the stack. |
451 | 0 | let sig = unsafe { libc::sigwaitinfo(sigset.as_ptr(), siginfo.as_mut_ptr()) }; |
452 | | |
453 | 0 | if sig == libc::SIGTERM { |
454 | 0 | // SAFETY: pid is valid and we are sending a signal. |
455 | 0 | unsafe { libc::kill(pid, libc::SIGKILL) }; |
456 | 0 | } |
457 | | } |
458 | | } |
459 | 0 | _ => Err(std::io::Error::last_os_error()), |
460 | | } |
461 | 0 | } |