Coverage Report

Created: 2026-04-07 13:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/build/source/nativelink-worker/src/namespace_utils.rs
Line
Count
Source
1
// Copyright 2026 The NativeLink Authors. All rights reserved.
2
//
3
// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//    See LICENSE file for details
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
/// A wrapper around a Child to send SIGTERM to kill the process instead
16
/// of SIGKILL as it's wrapped by the stub.
17
#[derive(Debug)]
18
pub struct MaybeNamespacedChild {
19
    namespaced: bool,
20
    child: tokio::process::Child,
21
}
22
23
impl MaybeNamespacedChild {
24
18
    pub const fn new(namespaced: bool, child: tokio::process::Child) -> Self {
25
18
        Self { namespaced, child }
26
18
    }
27
28
    /// Send SIGTERM if namespaced which sends SIGKILL to the child, otherwise
29
    /// send SIGKILL to the child.
30
7
    pub async fn kill(&mut self) -> Result<(), std::io::Error> {
31
7
        if self.namespaced {
32
            // It would be safer to call send_signal to use the pidfd to avoid
33
            // races, however this is still an experimental API, see:
34
            // https://github.com/rust-lang/rust/issues/141975
35
            // self.child.std_child().send_signal(Signal::SIGTERM)?;
36
            // return self.child.wait().await.map(|_| ());
37
6
            if let Some(pid) = self.child.id() {
38
                // SAFETY: pid is valid as provided by the wrapper and we are
39
                // sending a signal to the namespaced stub.
40
6
                unsafe { libc::kill(pid as libc::pid_t, libc::SIGTERM) };
41
6
                return self.child.wait().await.map(|_| ());
42
0
            }
43
1
        }
44
1
        self.child.kill().await
45
7
    }
46
47
2
    pub fn try_wait(&mut self) -> Result<Option<std::process::ExitStatus>, std::io::Error> {
48
2
        self.child.try_wait()
49
2
    }
50
51
21
    pub async fn wait(&mut self) -> Result<std::process::ExitStatus, std::io::Error> 
{18
52
18
        self.child.wait().await
53
17
    }
54
}
55
56
/// Determines whether the namespaces provided by this module are supported
57
/// on the currently running system by forking a process and trying to enter
58
/// it into the new namespaces.
59
31
pub fn namespaces_supported(mount: bool) -> bool {
60
    // SAFETY: Posix requires that geteuid is always successful.
61
31
    let uid = unsafe { libc::geteuid() };
62
31
    let uid_map = format!("{uid} {uid} 1\n");
63
    // SAFETY: We ensure that if pid == 0 we only call async-signal-safe functions.
64
31
    let pid = unsafe { libc::fork() };
65
31
    match pid {
66
        0 => {
67
0
            let mut flags =
68
0
                libc::CLONE_NEWPID | libc::CLONE_NEWUSER | libc::CLONE_NEWIPC | libc::CLONE_NEWUTS;
69
0
            if mount {
70
0
                flags |= libc::CLONE_NEWNS;
71
0
            }
72
            // SAFETY: Unshare does not have any unsafe effects and modifies no
73
            // memory, it is also async-signal-safe.
74
0
            if unsafe { libc::unshare(flags) } == 0
75
0
                && write_signal_safe(c"/proc/self/uid_map", uid_map.as_bytes()).is_ok()
76
            {
77
                // SAFETY: Mount uses no memory and is async-signal-safe.
78
0
                if !mount
79
0
                    || unsafe {
80
0
                        libc::mount(
81
0
                            core::ptr::null(),
82
0
                            c"/".as_ptr(),
83
0
                            core::ptr::null(),
84
0
                            libc::MS_REC | libc::MS_PRIVATE,
85
0
                            core::ptr::null(),
86
0
                        )
87
0
                    } == 0
88
                {
89
                    // SAFETY: It is always safe to _exit.
90
0
                    unsafe { libc::_exit(0) };
91
0
                }
92
0
            }
93
            // SAFETY: It is always safe to _exit.
94
0
            unsafe { libc::_exit(1) };
95
        }
96
31
        pid if pid > 0 => {
97
31
            let mut status = 0;
98
            // SAFETY: The pid is valid and created by us and the status is our own stack.
99
31
            while unsafe { libc::waitpid(pid, &raw mut status, 0) } == -1 {
100
                // SAFETY: We just called a libc function that failed (-1).
101
0
                if unsafe { *libc::__errno_location() } != libc::EINTR {
102
0
                    return false;
103
0
                }
104
            }
105
31
            libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0
106
        }
107
0
        _ => false,
108
    }
109
31
}
110
111
/// Writes to a file in an async-signal-safe manner, does the write in a
112
/// single chunk and assumes it will all be consumed, if the whole chunk
113
/// is not written returns Err(EIO).  This is expected to be used for
114
/// special files such as /proc which will always accept the whole buffer.
115
0
fn write_signal_safe(file_name: &core::ffi::CStr, data: &[u8]) -> Result<(), core::ffi::c_int> {
116
    // SAFETY: The path is a CStr which is guaranteed to end in a NUL byte
117
    // and the returned file descriptor is always closed.
118
0
    let fd = unsafe { libc::open(file_name.as_ptr().cast(), libc::O_WRONLY) };
119
0
    if fd < 0 {
120
        // SAFETY: We just called a libc function that failed (-1).
121
0
        return Err(unsafe { *libc::__errno_location() });
122
0
    }
123
0
    let fd = OwnedFd(fd);
124
125
    // SAFETY: The data is a known length slice and the file descriptor is
126
    // known to be valid as we just opened it.
127
0
    let bytes_written = unsafe { libc::write(fd.0, data.as_ptr().cast(), data.len()) };
128
129
0
    if bytes_written == -1 {
130
        // SAFETY: We just called a libc function that failed (-1).
131
0
        Err(unsafe { *libc::__errno_location() })
132
0
    } else if bytes_written as usize != data.len() {
133
0
        Err(libc::EIO)
134
    } else {
135
0
        Ok(())
136
    }
137
0
}
138
139
/// An async-signal-safe method to close all open file descriptors for the
140
/// current process.  This function is unsafe as any existing handles to
141
/// file descriptors will be invalidated.  None may be used after calling
142
/// this function.
143
0
unsafe fn close_all_fds() {
144
    // SAFETY: It is safe to call close on all file descriptors as this is
145
    // the purpose of the function.
146
0
    if unsafe { libc::syscall(libc::SYS_close_range, 0, libc::INT_MAX, 0) } == 0 {
147
0
        return;
148
0
    }
149
    // Since we're <5.9 kernel, we need to get the max FD count.
150
0
    let mut rlim = core::mem::MaybeUninit::<libc::rlimit>::uninit();
151
    // SAFETY: We just allocated the memory for this and getrlimit is async-signal-safe.
152
0
    let max_fd = if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, rlim.as_mut_ptr()) } == 0 {
153
        // SAFETY: We just initialised this in getrlimit above that succeeded.
154
0
        let cur = unsafe { rlim.assume_init().rlim_cur };
155
0
        if cur == libc::RLIM_INFINITY {
156
            // Sane fallback for unlimited environments
157
0
            0x0001_0000
158
        } else {
159
0
            core::ffi::c_int::try_from(cur).unwrap_or(0x0001_0000)
160
        }
161
    } else {
162
        // Fallback for getrlimit failure.
163
0
        4096
164
    };
165
0
    for fd in 0..max_fd {
166
0
        // SAFETY: It is safe to close a file descriptor that is not open and
167
0
        // we also want to close all, so there's no issue with closing file
168
0
        // descriptors that others may have handles to.
169
0
        unsafe { libc::close(fd) };
170
0
    }
171
0
}
172
173
/// Writes `n` as a decimal ASCII string to the start of `buf` and returns
/// the number of bytes written.
///
/// No allocation or formatting machinery is used.  `buf` must be large
/// enough for every digit (at most 10 for a `u32`); indexing panics
/// otherwise.
fn u32_to_bytes(mut n: u32, buf: &mut [u8]) -> usize {
    // Count the digits first; zero still occupies one position.
    let mut digits = 1;
    let mut rest = n / 10;
    while rest > 0 {
        digits += 1;
        rest /= 10;
    }

    // Fill from the least significant digit backwards so that no reversal
    // pass is needed afterwards.
    for slot in buf[..digits].iter_mut().rev() {
        *slot = b'0' + (n % 10) as u8;
        n /= 10;
    }
    digits
}
188
189
/// Create a line in the buffer of the format "{id} {id} 1\n" in an
190
/// async-signal-safe manner.
191
0
fn create_map_line(id: u32, buffer: &mut [u8; 32]) -> &'_ [u8] {
192
0
    let mut pos = 0;
193
0
    pos += u32_to_bytes(id, &mut buffer[pos..]);
194
0
    buffer[pos] = b' ';
195
0
    pos += 1;
196
0
    pos += u32_to_bytes(id, &mut buffer[pos..]);
197
0
    buffer[pos] = b' ';
198
0
    pos += 1;
199
0
    buffer[pos] = b'1';
200
0
    pos += 1;
201
0
    buffer[pos] = b'\n';
202
0
    pos += 1;
203
0
    &buffer[..pos]
204
0
}
205
206
/// A simple wrapper around a file descriptor to ensure async-signal-safety
207
/// rather than the std version which may allocate.
208
struct OwnedFd(libc::c_int);
209
210
impl Drop for OwnedFd {
211
0
    fn drop(&mut self) {
212
        // SAFETY: We own the file descriptor, so we can close it.
213
0
        unsafe {
214
0
            libc::close(self.0);
215
0
        }
216
0
    }
217
}
218
219
0
fn perform_remount(
220
0
    root_action_directory: &core::ffi::CStr,
221
0
    action_directory: &core::ffi::CStr,
222
0
) -> Result<(), std::io::Error> {
223
    // Make the mount namespace private to avoid changes propagating back to the host.
224
    // SAFETY: mount is async-signal-safe. We pass a null pointer for the source and valid
225
    // C-string pointers for the target. The parameters match POSIX requirements.
226
0
    if unsafe {
227
0
        libc::mount(
228
0
            core::ptr::null(),
229
0
            c"/".as_ptr(),
230
0
            core::ptr::null(),
231
0
            libc::MS_REC | libc::MS_PRIVATE,
232
0
            core::ptr::null(),
233
0
        )
234
0
    } != 0
235
    {
236
0
        return Err(std::io::Error::last_os_error());
237
0
    }
238
239
    // Bind mount the action directory to itself to "save" its current contents before
240
    // we mask its parent.
241
    // SAFETY: mount is async-signal-safe. We pass valid C-string pointers for the paths.
242
0
    if unsafe {
243
0
        libc::mount(
244
0
            action_directory.as_ptr(),
245
0
            action_directory.as_ptr(),
246
0
            core::ptr::null(),
247
0
            libc::MS_BIND | libc::MS_REC,
248
0
            core::ptr::null(),
249
0
        )
250
0
    } != 0
251
    {
252
0
        return Err(std::io::Error::last_os_error());
253
0
    }
254
255
    // Open the directory with O_PATH so we can find it after masking the parent.
256
    // SAFETY: open is async-signal-safe. The path is a valid C-string.
257
0
    let fd = unsafe { libc::open(action_directory.as_ptr(), libc::O_PATH) };
258
0
    if fd < 0 {
259
0
        return Err(std::io::Error::last_os_error());
260
0
    }
261
0
    let fd = OwnedFd(fd);
262
263
    // Mask the root action directory with a tmpfs to ensure sibling directories aren't visible.
264
    // SAFETY: mount is async-signal-safe. The filesystem type and target are valid C-strings.
265
0
    if unsafe {
266
0
        libc::mount(
267
0
            c"tmpfs".as_ptr(),
268
0
            root_action_directory.as_ptr(),
269
0
            c"tmpfs".as_ptr(),
270
0
            0,
271
0
            core::ptr::null(),
272
0
        )
273
0
    } != 0
274
    {
275
0
        return Err(std::io::Error::last_os_error());
276
0
    }
277
278
    // Recreate the specific operation's directory inside the empty tmpfs.
279
    // SAFETY: mkdir is async-signal-safe and the path is a valid C-string.
280
0
    if unsafe { libc::mkdir(action_directory.as_ptr(), 0o777) } != 0 {
281
0
        return Err(std::io::Error::last_os_error());
282
0
    }
283
284
    // Bind mount the saved directory back from the file descriptor to the new path.
285
0
    let mut proc_path = [0u8; 64];
286
0
    let mut pos = 0;
287
0
    for &b in b"/proc/self/fd/" {
288
0
        proc_path[pos] = b;
289
0
        pos += 1;
290
0
    }
291
0
    pos += u32_to_bytes(fd.0 as u32, &mut proc_path[pos..]);
292
0
    proc_path[pos] = 0;
293
294
    // SAFETY: mount is async-signal-safe. The target path is a valid C-string and the source
295
    // path is correctly formatted using /proc/self/fd/.
296
0
    if unsafe {
297
0
        libc::mount(
298
0
            proc_path.as_ptr().cast(),
299
0
            action_directory.as_ptr(),
300
0
            core::ptr::null(),
301
0
            libc::MS_BIND | libc::MS_REC,
302
0
            core::ptr::null(),
303
0
        )
304
0
    } != 0
305
    {
306
0
        return Err(std::io::Error::last_os_error());
307
0
    }
308
309
0
    Ok(())
310
0
}
311
312
/// A hook for a `Command::spawn` to create the process in a new namespace.
313
/// This creates a stub process that the Command points at which forwards
314
/// SIGKILL to the actual process in the new user, PID, UTS and IPC
315
/// namespaces.  Pass this function to `CommandBuilder::pre_exec`.
316
///
317
/// This function is async-signal-safe and has no external locks or
318
/// memory allocations.
319
0
pub fn configure_namespace(
320
0
    mount: bool,
321
0
    root_action_directory: &core::ffi::CStr,
322
0
    action_directory: &core::ffi::CStr,
323
0
) -> std::io::Result<()> {
324
    // SAFETY: It is always safe to call geteuid on Posix.
325
0
    let uid = unsafe { libc::geteuid() };
326
    // SAFETY: It is always safe to call getegid on Posix.
327
0
    let gid = unsafe { libc::getegid() };
328
329
0
    let mut flags =
330
0
        libc::CLONE_NEWPID | libc::CLONE_NEWUSER | libc::CLONE_NEWIPC | libc::CLONE_NEWUTS;
331
0
    if mount {
332
0
        flags |= libc::CLONE_NEWNS;
333
0
    }
334
    // SAFETY: Unshare does not have any unsafe effects and modifies no
335
    // memory, it is also async-signal-safe.
336
0
    if unsafe { libc::unshare(flags) } != 0 {
337
0
        return Err(std::io::Error::last_os_error());
338
0
    }
339
340
0
    if let Err(e) = write_signal_safe(c"/proc/self/setgroups", b"deny") {
341
        // If we fail to write this it will just make gid_map fail later,
342
        // but we may be able to continue anyway.
343
0
        if e != libc::EPERM && e != libc::EACCES && e != libc::ENOENT {
344
0
            return Err(std::io::Error::from_raw_os_error(e));
345
0
        }
346
0
    }
347
348
0
    let mut buffer = [0u8; 32];
349
0
    write_signal_safe(c"/proc/self/uid_map", create_map_line(uid, &mut buffer))
350
0
        .map_err(std::io::Error::from_raw_os_error)?;
351
352
    // If we can't write to gid_map, we just ignore it. This usually happens if
353
    // setgroups was not written to (because of permissions) or if we are in a
354
    // restricted environment.
355
0
    if let Err(e) = write_signal_safe(c"/proc/self/gid_map", create_map_line(gid, &mut buffer)) {
356
        // If this fails then we can probably continue just fine, it's just
357
        // the uid that's important.
358
0
        if e != libc::EPERM && e != libc::EACCES {
359
0
            return Err(std::io::Error::from_raw_os_error(e));
360
0
        }
361
0
    }
362
363
    // Configure the mount namespace if enabled.
364
0
    if mount {
365
0
        perform_remount(root_action_directory, action_directory).unwrap();
366
0
    }
367
368
    // Set hostname to "nativelink" to ensure reproducibility.
369
0
    let hostname = b"nativelink";
370
    // SAFETY: We reference the static memory above only and this is
371
    // async-signal-safe.
372
0
    if unsafe { libc::sethostname(hostname.as_ptr().cast(), hostname.len()) } != 0 {
373
        // SAFETY: We just called a libc function that failed.
374
0
        let err = unsafe { *libc::__errno_location() };
375
0
        if err != libc::EPERM && err != libc::EACCES {
376
0
            return Err(std::io::Error::from_raw_os_error(err));
377
0
        }
378
0
    }
379
380
    // Fork to enter the PID namespace.
381
    // SAFETY: We are already in a required async-signal-safe environment, we
382
    // will continue to ensure that ongoing.
383
0
    match unsafe { libc::fork() } {
384
        0 => {
385
            // SAFETY: This function is async-signal-safe and references no memory or resources.
386
0
            if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
387
                // SAFETY: It's always safe to _exit.
388
0
                unsafe { libc::_exit(1) };
389
0
            }
390
0
            Ok(())
391
        }
392
0
        pid if pid > 0 => {
393
            // Ensure that any children spawned by the action are re-parented to
394
            // this process if their parent dies. This is effectively a sub-reaper.
395
            // SAFETY: prctl is async-signal-safe.
396
0
            unsafe { libc::prctl(libc::PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) };
397
398
            // SAFETY: All operations below simply _exit and therefore there
399
            // are no issues with dangling file descriptor handles.
400
0
            unsafe { close_all_fds() };
401
402
0
            let mut sigset = core::mem::MaybeUninit::<libc::sigset_t>::uninit();
403
            // SAFETY: sigset is on the stack and we are initializing it.
404
0
            unsafe {
405
0
                libc::sigemptyset(sigset.as_mut_ptr());
406
0
                libc::sigaddset(sigset.as_mut_ptr(), libc::SIGTERM);
407
0
                libc::sigaddset(sigset.as_mut_ptr(), libc::SIGCHLD);
408
0
                libc::sigprocmask(libc::SIG_BLOCK, sigset.as_ptr(), core::ptr::null_mut());
409
0
            }
410
411
            loop {
412
                // Reap all exited children.
413
                loop {
414
0
                    let mut status = 0;
415
                    // SAFETY: The status is on the stack and waitpid is otherwise
416
                    // safe to call.
417
0
                    let res = unsafe { libc::waitpid(-1, &raw mut status, libc::WNOHANG) };
418
0
                    if res == pid {
419
0
                        if libc::WIFEXITED(status) {
420
                            // SAFETY: It's always safe to _exit.
421
0
                            unsafe { libc::_exit(libc::WEXITSTATUS(status)) };
422
0
                        } else if libc::WIFSIGNALED(status) {
423
                            // Try to exit with the same signal as the child.
424
                            // SAFETY: The sigset was previously allocated and used on the stack.
425
                            unsafe {
426
0
                                libc::sigprocmask(
427
                                    libc::SIG_UNBLOCK,
428
0
                                    sigset.as_ptr(),
429
0
                                    core::ptr::null_mut(),
430
                                )
431
                            };
432
                            // SAFETY: It's always safe to raise and as a fallback we _exit below.
433
0
                            unsafe { libc::raise(libc::WTERMSIG(status)) };
434
                            // We shouldn't get here, but it's a fallback in case.
435
                            // SAFETY: It's always safe to _exit.
436
0
                            unsafe { libc::_exit(libc::WTERMSIG(status)) };
437
0
                        }
438
0
                    } else if res <= 0 {
439
                        // SAFETY: We just called a libc function that failed.
440
0
                        if res == -1 && unsafe { *libc::__errno_location() } != libc::EINTR {
441
                            // SAFETY: It's always safe to _exit.
442
0
                            unsafe { libc::_exit(255) };
443
0
                        }
444
                        // Break the reaping loop to wait for signals.
445
0
                        break;
446
0
                    }
447
                }
448
449
0
                let mut siginfo = core::mem::MaybeUninit::<libc::siginfo_t>::uninit();
450
                // SAFETY: sigset is initialized and siginfo is on the stack.
451
0
                let sig = unsafe { libc::sigwaitinfo(sigset.as_ptr(), siginfo.as_mut_ptr()) };
452
453
0
                if sig == libc::SIGTERM {
454
0
                    // SAFETY: pid is valid and we are sending a signal.
455
0
                    unsafe { libc::kill(pid, libc::SIGKILL) };
456
0
                }
457
            }
458
        }
459
0
        _ => Err(std::io::Error::last_os_error()),
460
    }
461
0
}