Coverage Report

Created: 2026-05-23 21:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/build/source/nativelink-worker/src/namespace_utils.rs
Line
Count
Source
1
// Copyright 2026 The NativeLink Authors. All rights reserved.
2
//
3
// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//    See LICENSE file for details
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
use std::io::Error;
16
17
use tracing::error;
18
19
/// A wrapper around a Child to send SIGTERM to kill the process instead
20
/// of SIGKILL as it's wrapped by the stub.
21
#[derive(Debug)]
22
pub struct MaybeNamespacedChild {
23
    namespaced: bool,
24
    child: tokio::process::Child,
25
}
26
27
impl MaybeNamespacedChild {
28
0
    pub const fn new(namespaced: bool, child: tokio::process::Child) -> Self {
29
0
        Self { namespaced, child }
30
0
    }
31
32
    /// Send SIGTERM if namespaced which sends SIGKILL to the child, otherwise
33
    /// send SIGKILL to the child.
34
7
    
pub async fn kill(&mut self) -> Result<(), Error>0
{
35
7
        if self.namespaced {
36
            // It would be safer to call send_signal to use the pidfd to avoid
37
            // races, however this is still an experimental API, see:
38
            // https://github.com/rust-lang/rust/issues/141975
39
            // self.child.std_child().send_signal(Signal::SIGTERM)?;
40
            // return self.child.wait().await.map(|_| ());
41
6
            if let Some(pid) = self.child.id() {
42
                // SAFETY: pid is valid as provided by the wrapper and we are
43
                // sending a signal to the namespaced stub.
44
6
                unsafe { libc::kill(pid as libc::pid_t, libc::SIGTERM) };
45
6
                return self.child.wait().await.map(|_| ());
46
0
            }
47
1
        }
48
1
        self.child.kill().await
49
7
    }
50
51
2
    pub fn try_wait(&mut self) -> Result<Option<std::process::ExitStatus>, Error> {
52
2
        self.child.try_wait()
53
2
    }
54
55
24
    pub async fn wait(&mut self) -> Result<std::process::ExitStatus, Error> 
{22
56
22
        self.child.wait().await
57
20
    }
58
}
59
60
0
fn exit(status: i32) -> ! {
61
    // SAFETY: It is always safe to _exit.
62
0
    unsafe { libc::_exit(status) };
63
}
64
65
/// The step of the namespace probe that failed, as encoded in the low bits
/// of the probing child's exit status.  Zero is deliberately unused so that a
/// failure can never be mistaken for a successful exit status of `0`.
enum NamespaceErrorType {
    /// `unshare` failed.
    Unshare = 1,
    /// Writing to a `/proc` file failed.
    WriteSignalSafe = 2,
    /// `mount` failed.
    Mount = 3,
}

/// Number of low bits of the exit status that hold the error type.  Two bits
/// are enough because the highest value (`NamespaceErrorType::Mount`) is 3.
const NS_ERROR_TYPE_BITS: u8 = 2;
/// Mask selecting the `NS_ERROR_TYPE_BITS` lowest bits (0b11).
const NS_ERROR_TYPE_MASK: i32 = (1_i32 << NS_ERROR_TYPE_BITS) - 1;
73
74
/// Determines whether the namespaces provided by this module are supported
75
/// on the currently running system by forking a process and trying to enter
76
/// it into the new namespaces.
77
36
pub fn namespaces_supported(mount: bool) -> bool {
78
    // SAFETY: Posix requires that geteuid is always successful.
79
36
    let uid = unsafe { libc::geteuid() };
80
36
    let uid_map = format!("{uid} {uid} 1\n");
81
    // SAFETY: We ensure that if pid == 0 we only call async-signal-safe functions.
82
36
    let pid = unsafe { libc::fork() };
83
36
    match pid {
84
        0 => {
85
0
            let mut flags =
86
0
                libc::CLONE_NEWPID | libc::CLONE_NEWUSER | libc::CLONE_NEWIPC | libc::CLONE_NEWUTS;
87
0
            if mount {
88
0
                flags |= libc::CLONE_NEWNS;
89
0
            }
90
            // SAFETY: Unshare does not have any unsafe effects and modifies no
91
            // memory, it is also async-signal-safe.
92
0
            if unsafe { libc::unshare(flags) } == 0 {
93
0
                match write_signal_safe(c"/proc/self/uid_map", uid_map.as_bytes()) {
94
                    Ok(()) => {
95
0
                        if !mount {
96
0
                            exit(0);
97
0
                        }
98
                        // SAFETY: Mount uses no memory and is async-signal-safe.
99
0
                        if unsafe {
100
0
                            libc::mount(
101
0
                                core::ptr::null(),
102
0
                                c"/".as_ptr(),
103
0
                                core::ptr::null(),
104
0
                                libc::MS_REC | libc::MS_PRIVATE,
105
0
                                core::ptr::null(),
106
0
                            )
107
0
                        } == 0
108
                        {
109
0
                            exit(0);
110
                        } else {
111
                            // SAFETY: We just called a libc function that failed (-1).
112
0
                            let errno = unsafe { *libc::__errno_location() };
113
0
                            exit(
114
0
                                (NamespaceErrorType::Mount as i32) | (errno << NS_ERROR_TYPE_BITS),
115
                            );
116
                        }
117
                    }
118
0
                    Err(uid_map_err) => {
119
0
                        exit(
120
0
                            (NamespaceErrorType::WriteSignalSafe as i32)
121
0
                                | (uid_map_err << NS_ERROR_TYPE_BITS),
122
                        );
123
                    }
124
                }
125
            } else {
126
                // SAFETY: We just called a libc function that failed (-1).
127
0
                let errno = unsafe { *libc::__errno_location() };
128
0
                exit((NamespaceErrorType::Unshare as i32) | (errno << NS_ERROR_TYPE_BITS));
129
            }
130
        }
131
36
        pid if pid > 0 => {
132
36
            let mut status = 0;
133
            // SAFETY: The pid is valid and created by us and the status is our own stack.
134
36
            while unsafe { libc::waitpid(pid, &raw mut status, 0) } == -1 {
135
                // SAFETY: We just called a libc function that failed (-1).
136
0
                let errno = unsafe { *libc::__errno_location() };
137
0
                if errno != libc::EINTR {
138
0
                    error!(errno = errno, "Namespaces: Failure in waitpid");
139
0
                    return false;
140
0
                }
141
            }
142
36
            if libc::WIFEXITED(status) {
143
36
                match libc::WEXITSTATUS(status) {
144
                    0 => {
145
36
                        return true;
146
                    }
147
0
                    s if s & NS_ERROR_TYPE_MASK == NamespaceErrorType::Unshare as i32 => {
148
0
                        let errno = s >> NS_ERROR_TYPE_BITS;
149
0
                        error!(errno, "Namespaces: Error during unshare");
150
0
                        if errno == libc::EPERM {
151
0
                            error!(
152
                                "If the worker is inside Docker, namespaces don't work unless it's a privileged container"
153
                            );
154
0
                        }
155
                    }
156
0
                    s if s & NS_ERROR_TYPE_MASK == NamespaceErrorType::WriteSignalSafe as i32 => {
157
0
                        error!(
158
0
                            errno = s >> NS_ERROR_TYPE_BITS,
159
                            "Namespaces: Error while writing to /proc/self/uid_map"
160
                        );
161
                    }
162
0
                    s if s & NS_ERROR_TYPE_MASK == NamespaceErrorType::Mount as i32 => {
163
0
                        error!(
164
0
                            errno = s >> NS_ERROR_TYPE_BITS,
165
                            "Failure to mount during namespace checking"
166
                        );
167
                    }
168
0
                    other => {
169
0
                        error!(
170
                            exit_code = other,
171
                            "Namespace check failure with unknown exit code"
172
                        );
173
                    }
174
                }
175
            } else {
176
0
                error!(
177
                    exit_code = status,
178
                    "Namespaces: waitpid exit with non-exit code"
179
                );
180
            }
181
0
            false
182
        }
183
0
        _ => false,
184
    }
185
36
}
186
187
/// Writes to a file in an async-signal-safe manner, does the write in a
188
/// single chunk and assumes it will all be consumed, if the whole chunk
189
/// is not written returns Err(EIO).  This is expected to be used for
190
/// special files such as /proc which will always accept the whole buffer.
191
0
fn write_signal_safe(file_name: &core::ffi::CStr, data: &[u8]) -> Result<(), core::ffi::c_int> {
192
    // SAFETY: The path is a CStr which is guaranteed to end in a NUL byte
193
    // and the returned file descriptor is always closed.
194
0
    let fd = unsafe { libc::open(file_name.as_ptr().cast(), libc::O_WRONLY) };
195
0
    if fd < 0 {
196
        // SAFETY: We just called a libc function that failed (-1).
197
0
        return Err(unsafe { *libc::__errno_location() });
198
0
    }
199
0
    let fd = OwnedFd(fd);
200
201
    // SAFETY: The data is a known length slice and the file descriptor is
202
    // known to be valid as we just opened it.
203
0
    let bytes_written = unsafe { libc::write(fd.0, data.as_ptr().cast(), data.len()) };
204
205
0
    if bytes_written == -1 {
206
        // SAFETY: We just called a libc function that failed (-1).
207
0
        Err(unsafe { *libc::__errno_location() })
208
0
    } else if bytes_written as usize != data.len() {
209
0
        Err(libc::EIO)
210
    } else {
211
0
        Ok(())
212
    }
213
0
}
214
215
/// An async-signal-safe method to close all open file descriptors for the
216
/// current process.  This function is unsafe as any existing handles to
217
/// file descriptors will be invalidated.  None may be used after calling
218
/// this function.
219
0
unsafe fn close_all_fds() {
220
    // SAFETY: It is safe to call close on all file descriptors as this is
221
    // the purpose of the function.
222
0
    if unsafe { libc::syscall(libc::SYS_close_range, 0, libc::INT_MAX, 0) } == 0 {
223
0
        return;
224
0
    }
225
    // Since we're <5.9 kernel, we need to get the max FD count.
226
0
    let mut rlim = core::mem::MaybeUninit::<libc::rlimit>::uninit();
227
    // SAFETY: We just allocated the memory for this and getrlimit is async-signal-safe.
228
0
    let max_fd = if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, rlim.as_mut_ptr()) } == 0 {
229
        // SAFETY: We just initialised this in getrlimit above that succeeded.
230
0
        let cur = unsafe { rlim.assume_init().rlim_cur };
231
0
        if cur == libc::RLIM_INFINITY {
232
            // Sane fallback for unlimited environments
233
0
            0x0001_0000
234
        } else {
235
0
            core::ffi::c_int::try_from(cur).unwrap_or(0x0001_0000)
236
        }
237
    } else {
238
        // Fallback for getrlimit failure.
239
0
        4096
240
    };
241
0
    for fd in 0..max_fd {
242
0
        // SAFETY: It is safe to close a file descriptor that is not open and
243
0
        // we also want to close all, so there's no issue with closing file
244
0
        // descriptors that others may have handles to.
245
0
        unsafe { libc::close(fd) };
246
0
    }
247
0
}
248
249
/// Writes `n` as an ASCII decimal string to the start of `buf` and returns
/// the number of bytes written.
///
/// No allocation is performed.  Panics if `buf` is too short to hold all the
/// digits (a `u32` needs at most 10).
fn u32_to_bytes(mut n: u32, buf: &mut [u8]) -> usize {
    let mut len = 0;
    // Emit the least significant digit first; running the body at least once
    // makes zero come out as a single '0'.
    loop {
        buf[len] = b'0' + (n % 10) as u8;
        len += 1;
        n /= 10;
        if n == 0 {
            break;
        }
    }
    // The digits were produced backwards, so flip them into reading order.
    buf[..len].reverse();
    len
}
264
265
/// Create a line in the buffer of the format "{id} {id} 1\n" in an
266
/// async-signal-safe manner.
267
0
fn create_map_line(id: u32, buffer: &mut [u8; 32]) -> &'_ [u8] {
268
0
    let mut pos = 0;
269
0
    pos += u32_to_bytes(id, &mut buffer[pos..]);
270
0
    buffer[pos] = b' ';
271
0
    pos += 1;
272
0
    pos += u32_to_bytes(id, &mut buffer[pos..]);
273
0
    buffer[pos] = b' ';
274
0
    pos += 1;
275
0
    buffer[pos] = b'1';
276
0
    pos += 1;
277
0
    buffer[pos] = b'\n';
278
0
    pos += 1;
279
0
    &buffer[..pos]
280
0
}
281
282
/// A simple wrapper around a file descriptor to ensure async-signal-safety
283
/// rather than the std version which may allocate.
284
struct OwnedFd(libc::c_int);
285
286
impl Drop for OwnedFd {
287
0
    fn drop(&mut self) {
288
        // SAFETY: We own the file descriptor, so we can close it.
289
0
        unsafe {
290
0
            libc::close(self.0);
291
0
        }
292
0
    }
293
}
294
295
0
fn perform_remount(
296
0
    root_action_directory: &core::ffi::CStr,
297
0
    action_directory: &core::ffi::CStr,
298
0
) -> Result<(), Error> {
299
    // Make the mount namespace private to avoid changes propagating back to the host.
300
    // SAFETY: mount is async-signal-safe. We pass a null pointer for the source and valid
301
    // C-string pointers for the target. The parameters match POSIX requirements.
302
0
    if unsafe {
303
0
        libc::mount(
304
0
            core::ptr::null(),
305
0
            c"/".as_ptr(),
306
0
            core::ptr::null(),
307
0
            libc::MS_REC | libc::MS_PRIVATE,
308
0
            core::ptr::null(),
309
0
        )
310
0
    } != 0
311
    {
312
0
        return Err(Error::last_os_error());
313
0
    }
314
315
    // Bind mount the action directory to itself to "save" its current contents before
316
    // we mask its parent.
317
    // SAFETY: mount is async-signal-safe. We pass valid C-string pointers for the paths.
318
0
    if unsafe {
319
0
        libc::mount(
320
0
            action_directory.as_ptr(),
321
0
            action_directory.as_ptr(),
322
0
            core::ptr::null(),
323
0
            libc::MS_BIND | libc::MS_REC,
324
0
            core::ptr::null(),
325
0
        )
326
0
    } != 0
327
    {
328
0
        return Err(Error::last_os_error());
329
0
    }
330
331
    // Open the directory with O_PATH so we can find it after masking the parent.
332
    // SAFETY: open is async-signal-safe. The path is a valid C-string.
333
0
    let fd = unsafe { libc::open(action_directory.as_ptr(), libc::O_PATH) };
334
0
    if fd < 0 {
335
0
        return Err(Error::last_os_error());
336
0
    }
337
0
    let fd = OwnedFd(fd);
338
339
    // Mask the root action directory with a tmpfs to ensure sibling directories aren't visible.
340
    // SAFETY: mount is async-signal-safe. The filesystem type and target are valid C-strings.
341
0
    if unsafe {
342
0
        libc::mount(
343
0
            c"tmpfs".as_ptr(),
344
0
            root_action_directory.as_ptr(),
345
0
            c"tmpfs".as_ptr(),
346
0
            0,
347
0
            core::ptr::null(),
348
0
        )
349
0
    } != 0
350
    {
351
0
        return Err(Error::last_os_error());
352
0
    }
353
354
    // Recreate the specific operation's directory inside the empty tmpfs.
355
    // SAFETY: mkdir is async-signal-safe and the path is a valid C-string.
356
0
    if unsafe { libc::mkdir(action_directory.as_ptr(), 0o777) } != 0 {
357
0
        return Err(Error::last_os_error());
358
0
    }
359
360
    // Bind mount the saved directory back from the file descriptor to the new path.
361
0
    let mut proc_path = [0u8; 64];
362
0
    let mut pos = 0;
363
0
    for &b in b"/proc/self/fd/" {
364
0
        proc_path[pos] = b;
365
0
        pos += 1;
366
0
    }
367
0
    pos += u32_to_bytes(fd.0 as u32, &mut proc_path[pos..]);
368
0
    proc_path[pos] = 0;
369
370
    // SAFETY: mount is async-signal-safe. The target path is a valid C-string and the source
371
    // path is correctly formatted using /proc/self/fd/.
372
0
    if unsafe {
373
0
        libc::mount(
374
0
            proc_path.as_ptr().cast(),
375
0
            action_directory.as_ptr(),
376
0
            core::ptr::null(),
377
0
            libc::MS_BIND | libc::MS_REC,
378
0
            core::ptr::null(),
379
0
        )
380
0
    } != 0
381
    {
382
0
        return Err(Error::last_os_error());
383
0
    }
384
385
0
    Ok(())
386
0
}
387
388
/// A hook for a `Command::spawn` to create the process in a new namespace.
389
/// This creates a stub process that the Command points at which forwards
390
/// SIGKILL to the actual process in the new user, PID, UTS and IPC
391
/// namespaces.  Pass this function to `CommandBuilder::pre_exec`.
392
///
393
/// This function is async-signal-safe and has no external locks or
394
/// memory allocations.
395
0
pub fn configure_namespace(
396
0
    mount: bool,
397
0
    root_action_directory: &core::ffi::CStr,
398
0
    action_directory: &core::ffi::CStr,
399
0
) -> std::io::Result<()> {
400
    // SAFETY: It is always safe to call geteuid on Posix.
401
0
    let uid = unsafe { libc::geteuid() };
402
    // SAFETY: It is always safe to call getegid on Posix.
403
0
    let gid = unsafe { libc::getegid() };
404
405
0
    let mut flags =
406
0
        libc::CLONE_NEWPID | libc::CLONE_NEWUSER | libc::CLONE_NEWIPC | libc::CLONE_NEWUTS;
407
0
    if mount {
408
0
        flags |= libc::CLONE_NEWNS;
409
0
    }
410
    // SAFETY: Unshare does not have any unsafe effects and modifies no
411
    // memory, it is also async-signal-safe.
412
0
    if unsafe { libc::unshare(flags) } != 0 {
413
0
        return Err(Error::last_os_error());
414
0
    }
415
416
0
    if let Err(e) = write_signal_safe(c"/proc/self/setgroups", b"deny") {
417
        // If we fail to write this it will just make gid_map fail later,
418
        // but we may be able to continue anyway.
419
0
        if e != libc::EPERM && e != libc::EACCES && e != libc::ENOENT {
420
0
            return Err(Error::from_raw_os_error(e));
421
0
        }
422
0
    }
423
424
0
    let mut buffer = [0u8; 32];
425
0
    write_signal_safe(c"/proc/self/uid_map", create_map_line(uid, &mut buffer))
426
0
        .map_err(Error::from_raw_os_error)?;
427
428
    // If we can't write to gid_map, we just ignore it. This usually happens if
429
    // setgroups was not written to (because of permissions) or if we are in a
430
    // restricted environment.
431
0
    if let Err(e) = write_signal_safe(c"/proc/self/gid_map", create_map_line(gid, &mut buffer)) {
432
        // If this fails then we can probably continue just fine, it's just
433
        // the uid that's important.
434
0
        if e != libc::EPERM && e != libc::EACCES {
435
0
            return Err(Error::from_raw_os_error(e));
436
0
        }
437
0
    }
438
439
    // Configure the mount namespace if enabled.
440
0
    if mount {
441
0
        perform_remount(root_action_directory, action_directory).unwrap();
442
0
    }
443
444
    // Set hostname to "nativelink" to ensure reproducibility.
445
0
    let hostname = b"nativelink";
446
    // SAFETY: We reference the static memory above only and this is
447
    // async-signal-safe.
448
0
    if unsafe { libc::sethostname(hostname.as_ptr().cast(), hostname.len()) } != 0 {
449
        // SAFETY: We just called a libc function that failed.
450
0
        let err = unsafe { *libc::__errno_location() };
451
0
        if err != libc::EPERM && err != libc::EACCES {
452
0
            return Err(Error::from_raw_os_error(err));
453
0
        }
454
0
    }
455
456
    // Fork to enter the PID namespace.
457
    // SAFETY: We are already in a required async-signal-safe environment, we
458
    // will continue to ensure that ongoing.
459
0
    match unsafe { libc::fork() } {
460
        0 => {
461
            // SAFETY: This function is async-signal-safe and references no memory or resources.
462
0
            if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
463
0
                exit(1);
464
0
            }
465
0
            Ok(())
466
        }
467
0
        pid if pid > 0 => {
468
            // Ensure that any children spawned by the action are re-parented to
469
            // this process if their parent dies. This is effectively a sub-reaper.
470
            // SAFETY: prctl is async-signal-safe.
471
0
            unsafe { libc::prctl(libc::PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) };
472
473
            // SAFETY: All operations below simply _exit and therefore there
474
            // are no issues with dangling file descriptor handles.
475
0
            unsafe { close_all_fds() };
476
477
0
            let mut sigset = core::mem::MaybeUninit::<libc::sigset_t>::uninit();
478
            // SAFETY: sigset is on the stack and we are initializing it.
479
0
            unsafe {
480
0
                libc::sigemptyset(sigset.as_mut_ptr());
481
0
                libc::sigaddset(sigset.as_mut_ptr(), libc::SIGTERM);
482
0
                libc::sigaddset(sigset.as_mut_ptr(), libc::SIGCHLD);
483
0
                libc::sigprocmask(libc::SIG_BLOCK, sigset.as_ptr(), core::ptr::null_mut());
484
0
            }
485
486
            loop {
487
                // Reap all exited children.
488
                loop {
489
0
                    let mut status = 0;
490
                    // SAFETY: The status is on the stack and waitpid is otherwise
491
                    // safe to call.
492
0
                    let res = unsafe { libc::waitpid(-1, &raw mut status, libc::WNOHANG) };
493
0
                    if res == pid {
494
0
                        if libc::WIFEXITED(status) {
495
0
                            exit(libc::WEXITSTATUS(status));
496
0
                        } else if libc::WIFSIGNALED(status) {
497
                            // Try to exit with the same signal as the child.
498
                            // SAFETY: The sigset was previously allocated and used on the stack.
499
                            unsafe {
500
0
                                libc::sigprocmask(
501
                                    libc::SIG_UNBLOCK,
502
0
                                    sigset.as_ptr(),
503
0
                                    core::ptr::null_mut(),
504
                                )
505
                            };
506
                            // SAFETY: It's always safe to raise and as a fallback we _exit below.
507
0
                            unsafe { libc::raise(libc::WTERMSIG(status)) };
508
                            // We shouldn't get here, but it's a fallback in case.
509
0
                            exit(libc::WTERMSIG(status));
510
0
                        }
511
0
                    } else if res <= 0 {
512
                        // SAFETY: We just called a libc function that failed.
513
0
                        if res == -1 && unsafe { *libc::__errno_location() } != libc::EINTR {
514
0
                            exit(255);
515
0
                        }
516
                        // Break the reaping loop to wait for signals.
517
0
                        break;
518
0
                    }
519
                }
520
521
0
                let mut siginfo = core::mem::MaybeUninit::<libc::siginfo_t>::uninit();
522
                // SAFETY: sigset is initialized and siginfo is on the stack.
523
0
                let sig = unsafe { libc::sigwaitinfo(sigset.as_ptr(), siginfo.as_mut_ptr()) };
524
525
0
                if sig == libc::SIGTERM {
526
0
                    // SAFETY: pid is valid and we are sending a signal.
527
0
                    unsafe { libc::kill(pid, libc::SIGKILL) };
528
0
                }
529
            }
530
        }
531
0
        _ => Err(Error::last_os_error()),
532
    }
533
0
}