//
// Syd: rock-solid application kernel
// src/wordexp.rs: Interface for libc's wordexp(3).
//
// Copyright (c) 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    borrow::Cow,
    env,
    ffi::{CStr, CString, OsStr},
    fmt,
    fs::File,
    io::{Read, Write},
    marker::PhantomData,
    os::{
        fd::{AsFd, AsRawFd, BorrowedFd, FromRawFd},
        unix::ffi::OsStrExt,
    },
    time::Instant,
};

use bitflags::bitflags;
use data_encoding::HEXLOWER;
use dur::Duration;
use libseccomp::{ScmpAction, ScmpFilterContext, ScmpSyscall};
use memchr::memchr3;
use nix::{
    errno::Errno,
    fcntl::{open, OFlag},
    libc::{_exit, c_char, size_t, CLONE_FILES, ENOSYS, SIGCHLD, SIGKILL, SIGSYS},
    mount::MsFlags,
    sched::{unshare, CloneFlags},
    sys::{
        resource::Resource,
        signal::{sigprocmask, SigSet, SigmaskHow, Signal},
        stat::Mode,
        wait::{Id, WaitPidFlag},
    },
    unistd::{chdir, Gid, Pid, Uid},
};

use crate::{
    compat::{pipe2_raw, set_pdeathsig, waitid, MFdFlags, WaitStatus},
    config::{MINI_STACK_SIZE, *},
    confine::{confine_mdwe, confine_rlimit, secure_getenv, CLONE_NEWTIME},
    cookie::safe_memfd_create,
    debug,
    err::err2no,
    fd::{close, pidfd_send_signal, seal_memfd_all, set_cloexec, set_nonblock},
    fs::safe_clone,
    get_user_home, get_user_name,
    hash::SydHashSet,
    landlock::RulesetStatus,
    landlock_policy::LandlockPolicy,
    log::contains_ascii_unprintable,
    lookup::safe_copy_if_exists,
    mount::{
        api::MountAttrFlags,
        util::{mount_bind, mount_fs, set_root_mount_propagation},
    },
    path::PATH_MAX,
    proc::{proc_map_user, proc_open},
    xpath, XPathBuf,
};

bitflags! {
    /// Flags accepted by wordexp(3).
    ///
    /// The bit set is stored as an `i32`; `expand_word` hands the raw
    /// bits to libc unchanged.
    #[derive(Clone, Copy, Debug, Eq, PartialEq)]
    pub struct WordExpFlags: i32 {
        /// Don't do command substitution.
        const WRDE_NOCMD = 1 << 2;
        /// Normally during command substitution stderr is redirected to
        /// /dev/null. This flag specifies that stderr is not to be
        /// redirected.
        const WRDE_SHOWERR = 1 << 4;
        /// Consider it an error if an undefined shell variable is expanded.
        /// Note, this is not supported by musl.
        const WRDE_UNDEF = 1 << 5;
    }
}

impl Default for WordExpFlags {
    fn default() -> Self {
        Self::WRDE_NOCMD
    }
}

/// Represents error conditions from wordexp(3).
///
/// The first five variants correspond to the `WRDE_*` codes returned by
/// libc; the remaining ones describe failures of the helper process
/// that runs wordexp(3) on our behalf.
#[derive(Debug, Eq, PartialEq)]
pub enum WordExpError {
    /// Illegal occurrence of newline or one of |, &, ;, <, >, (, ), {, }.
    BadCharacter,
    /// An undefined shell variable was referenced, and the WRDE_UNDEF
    /// flag told us to consider this an error.
    ///
    /// Also returned by `WordExp::expand` when the expansion is empty.
    BadValue,
    /// Command substitution requested, but the WRDE_NOCMD flag told us
    /// to consider this an error.
    CommandSubstitution,
    /// Out of memory.
    OutOfMemory,
    /// /bin/sh returned syntax error.
    Syntax,
    /// System error during pipe or fork.
    ///
    /// The payload is the errno describing the failure.
    SystemError(Errno),
    /// Invalid system call.
    ///
    /// Reported when the helper process is terminated by SIGSYS.
    SeccompError,
    /// Process was aborted unexpectedly with signal.
    ///
    /// The payload is the signal number.
    ProcessError(i32),
    /// Timeout error.
    ///
    /// The payload is the exceeded timeout in seconds.
    TimeoutError(u128),
}

/// Out of memory.
pub const WRDE_NOSPACE: i32 = 1;
/// Illegal occurrence of newline or one of |, &, ;, <, >, (, ), {, }.
pub const WRDE_BADCHAR: i32 = 2;
/// An undefined shell variable was referenced, and the WRDE_UNDEF
/// flag told us to consider this an error.
pub const WRDE_BADVAL: i32 = 3;
/// Command substitution requested, but the WRDE_NOCMD flag told us
/// to consider this an error.
pub const WRDE_CMDSUB: i32 = 4;
/// /bin/sh returned syntax error.
pub const WRDE_SYNTAX: i32 = 5;

// The codes below are our own additions to the wordexp(3) codes above.
// 128 is the errno/signal sentinel: the `i32` conversions add 128 to an
// errno or signal number, so our additions are kept below that value.

/// Invalid system call.
pub const WRDE_SECCOMP: i32 = 127;
/// Timeout error.
pub const WRDE_TIMEOUT: i32 = 126;

impl From<std::io::Error> for WordExpError {
    fn from(io_err: std::io::Error) -> Self {
        Self::SystemError(err2no(&io_err))
    }
}

impl From<Errno> for WordExpError {
    fn from(err: Errno) -> Self {
        Self::SystemError(err)
    }
}

impl From<i32> for WordExpError {
    /// Decodes a wordexp(3) return value or a helper-process exit code.
    ///
    /// * The `WRDE_*` codes map to their dedicated variants.
    /// * Codes above 128 are `128 + errno`, as produced by the pipe
    ///   writer in the confined process and by `i32::from(WordExpError)`;
    ///   the offset is removed before the errno is decoded.
    /// * Anything else (including `WRDE_TIMEOUT`, which has no dedicated
    ///   arm here) is passed through the same `code - 128` conversion and
    ///   ends up as a `SystemError`.
    fn from(code: i32) -> Self {
        if code > 128 {
            // Used by pipe writer in the confined process:
            // strip the 128 offset to recover the original errno.
            return Self::SystemError(Errno::from_raw(code.saturating_sub(128)));
        }
        match code {
            WRDE_BADCHAR => Self::BadCharacter,
            WRDE_BADVAL => Self::BadValue,
            WRDE_CMDSUB => Self::CommandSubstitution,
            WRDE_NOSPACE => Self::OutOfMemory,
            WRDE_SYNTAX => Self::Syntax,
            // custom errors we invented.
            WRDE_SECCOMP => Self::SeccompError,
            // Saturating subtraction keeps the conversion total:
            // plain `code - 128` overflows for values near `i32::MIN`.
            _ => Self::SystemError(Errno::from_raw(code.saturating_sub(128))),
        }
    }
}

impl From<WordExpError> for i32 {
    /// Encodes an error as an integer code.
    ///
    /// wordexp(3) errors and our own additions use their `WRDE_*`
    /// constant; errno and signal payloads are offset by 128.
    fn from(val: WordExpError) -> Self {
        // Offset added to errno and signal numbers.
        const BASE: i32 = 128;

        #[expect(clippy::arithmetic_side_effects)]
        match val {
            // payload-carrying errors.
            WordExpError::SystemError(errno) => BASE + errno as i32,
            WordExpError::ProcessError(sig) => BASE + sig,
            // custom errors we invented.
            WordExpError::TimeoutError(_) => WRDE_TIMEOUT,
            WordExpError::SeccompError => WRDE_SECCOMP,
            // errors defined by wordexp(3).
            WordExpError::Syntax => WRDE_SYNTAX,
            WordExpError::OutOfMemory => WRDE_NOSPACE,
            WordExpError::CommandSubstitution => WRDE_CMDSUB,
            WordExpError::BadValue => WRDE_BADVAL,
            WordExpError::BadCharacter => WRDE_BADCHAR,
        }
    }
}

impl fmt::Display for WordExpError {
    /// Writes a human-readable description of the error.
    ///
    /// `SystemError(EINVAL)` gets a dedicated message and must be
    /// matched before the generic `SystemError` arm.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::SystemError(Errno::EINVAL) => {
                f.write_str("environment expansion is not permitted, enable with config/expand")
            }
            Self::SystemError(e) => write!(f, "system error: {e}"),
            Self::CommandSubstitution => {
                f.write_str("command substitution is not permitted, enable with config/expand_cmd")
            }
            Self::BadValue => f.write_str("empty replacement is not permitted"),
            Self::BadCharacter => write!(
                f,
                "illegal occurrence of newline or one of |, &, ;, <, >, (, ), {{, }}"
            ),
            Self::OutOfMemory => f.write_str("out of memory"),
            Self::Syntax => f.write_str("shell syntax error"),
            Self::SeccompError => f.write_str("seccomp error: invalid system call"),
            Self::ProcessError(sig) => {
                // Fall back to a placeholder for unknown signal numbers.
                let sig = match Signal::try_from(*sig) {
                    Ok(signal) => signal.as_str(),
                    Err(_) => "SIGUNKNOWN",
                };
                write!(f, "process error: received signal {sig}")
            }
            Self::TimeoutError(t) => {
                let s = if *t > 1 { "s" } else { "" };
                write!(f, "timeout error: runtime exceeded {t} second{s}")
            }
        }
    }
}

// Rust-side declaration of the structure filled in by wordexp(3) and
// released by wordfree(3). `repr(C)` keeps the field order and layout
// as declared so the struct can be passed across the FFI boundary.
#[repr(C)]
struct wordexp_t {
    // Count of words matched
    we_wordc: size_t,
    // List of words
    we_wordv: *mut *mut c_char,
    // Slots to reserve at the beginning.
    we_offs: size_t,
}

// libc entry points used by this module.
extern "C" {
    // Expands the NUL-terminated string `s` into `p` according to
    // `flags`; a non-zero return value is treated as an error code by
    // `WordExp::expand_word`.
    fn wordexp(s: *const c_char, p: *mut wordexp_t, flags: i32) -> i32;
    // Releases the storage referenced by `p`.
    fn wordfree(p: *mut wordexp_t);
}

/// `WordExp` wraps wordfree(3) and provides an `Iterator`.
///
/// The wrapped `wordexp_t` is released with wordfree(3) when the value
/// is dropped.
pub struct WordExp<'a> {
    // Result structure filled in by wordexp(3).
    p: wordexp_t,
    // Index of the next word the iterator will yield.
    i: usize,
    // Carries the lifetime `'a` of the yielded words; holds no data.
    _m: PhantomData<&'a ()>,
}

impl Drop for WordExp<'_> {
    fn drop(&mut self) {
        // SAFETY: In libc we trust.
        unsafe { wordfree(std::ptr::addr_of_mut!(self.p)) };
    }
}

impl<'a> Iterator for WordExp<'a> {
    type Item = &'a OsStr;

    /// Yields the next expanded word, or `None` once `we_wordc` words
    /// have been returned.
    ///
    /// Iteration also stops if the word vector or the next entry in it
    /// is NULL, so a NULL pointer is never dereferenced.
    ///
    /// NOTE(review): the yielded `&'a OsStr` points into memory that is
    /// released by wordfree(3) when the `WordExp` is dropped, and `'a`
    /// is not tied to the borrow of `self`; callers must not keep items
    /// past the iterator's lifetime.
    fn next(&mut self) -> Option<Self::Item> {
        if self.i >= self.p.we_wordc || self.p.we_wordv.is_null() {
            return None;
        }

        // SAFETY: `we_wordv` is non-NULL and, trusting libc, holds at
        // least `we_wordc` entries; `self.i < we_wordc` was checked above.
        let word = unsafe { *self.p.we_wordv.add(self.i) };
        if word.is_null() {
            // Check the word pointer itself, not the address of its slot:
            // the latter can never be NULL for a non-NULL vector.
            return None;
        }

        // SAFETY: In libc, we trust: `word` is a non-NULL,
        // NUL-terminated string owned by the `wordexp_t`.
        let bytes = unsafe { CStr::from_ptr(word) }.to_bytes();

        if let Some(i) = self.i.checked_add(1) {
            self.i = i;
        }
        Some(OsStr::from_bytes(bytes))
    }
}

impl WordExp<'_> {
    /// Performs shell-like word expansion.
    ///
    /// This is only a thin wrapper around libc's wordexp(3).
    /// Use `WordExp::expand` for safety.
    pub fn expand_word(s: &str, flags: WordExpFlags) -> Result<Self, i32> {
        let c_s = CString::new(s).or(Err(WRDE_BADCHAR))?;

        // SAFETY: init a `wordexp_t' structure.
        let mut p: wordexp_t = unsafe { std::mem::zeroed() };

        // SAFETY: call into libc wordexp(3).
        let ret = unsafe { wordexp(c_s.as_ptr(), std::ptr::addr_of_mut!(p), flags.bits()) };
        if ret != 0 {
            return Err(ret);
        }

        // SAFETY: return iterator for safe access.
        Ok(Self {
            p,
            i: 0,
            _m: PhantomData,
        })
    }

    /// Perform environment/tilde expansion and command substitution on
    /// `input`, with the `esyd` helper and init scripts available.
    ///
    /// The input is written into a sealed memory file, after the `esyd`
    /// function definition, `/etc/syd/init.sh` and
    /// `$HOME/.config/syd/init.sh` (each copied only if it exists). The
    /// memory file is then sourced through `WordExp::expand` with
    /// command substitution enabled.
    ///
    /// Input that is empty or contains none of `$`, a backtick or `(` is
    /// returned unchanged. A zero `timeout` yields
    /// `SystemError(EINVAL)`.
    #[expect(clippy::cognitive_complexity)]
    pub fn expand_full(input: &str, timeout: Duration) -> Result<Cow<'_, str>, WordExpError> {
        const NL: &[u8] = b"\n";

        // Quick return: nothing that could trigger an expansion.
        let has_special = memchr3(b'$', b'`', b'(', input.as_bytes()).is_some();
        if input.is_empty() || !has_special {
            return Ok(Cow::Borrowed(input));
        }
        // Zero timeout prevents evaluation.
        if timeout.is_zero() {
            return Err(WordExpError::SystemError(Errno::EINVAL));
        }

        // The script lives in a memory fd which is later passed to the
        // internal /bin/sh invoked by wordexp(3).
        let memfd = safe_memfd_create(
            c"syd-wordexp",
            MFdFlags::MFD_ALLOW_SEALING | MFdFlags::MFD_CLOEXEC,
        )?;
        let mut script = File::from(memfd);
        let fd = script.as_raw_fd();
        debug!("ctx": "expand",
            "msg": format!("created memory-file {} with close-on-exec flag set",
                fd));

        // 1. The `esyd` function.
        script.write_all(ESYD_SH.as_bytes())?;
        script.write_all(NL)?;

        // 2. System-wide configuration.
        safe_copy_if_exists(&mut script, "/etc/syd/init.sh")?;
        script.write_all(NL)?;

        // 3. User-specific configuration, if HOME is set.
        if let Some(home) = env::var_os("HOME") {
            let home = XPathBuf::from(home);
            let user_init = home.join(b".config/syd/init.sh");
            safe_copy_if_exists(&mut script, &user_init)?;
            script.write_all(NL)?;
        }

        // 4. The input itself, evaluated and printed word by word.
        script.write_all(b"eval set -- x ")?;
        script.write_all(input.as_bytes())?;
        script.write_all(b"\nshift\nprintf '%s ' \"$@\"\n")?;

        // No more writes from here on.
        seal_memfd_all(&script)?;
        debug!("ctx": "expand",
            "msg": format!("sealed memory-file {} against grows, shrinks and writes",
                fd));

        // The shell spawned by wordexp(3) must inherit the fd.
        set_cloexec(&script, false)?;
        debug!("ctx": "expand",
            "msg": format!("set close-on-exec flag to off for memory-file {}",
                fd));

        let shell = format!("`. /proc/thread-self/fd/{}`", fd);
        debug!("ctx": "expand",
            "msg": format!("passing memory file {} to wordexp(3) with {} seconds timeout...",
                fd, timeout.as_secs()));

        // `script` stays open until the expansion has finished.
        let expanded = Self::expand(&shell, true, timeout)?;
        Ok(Cow::Owned(expanded.into_owned()))
    }

    /// Perform environment/tilde expansion and optionally command substitution.
    ///
    /// wordexp(3) runs in a separate process which confines itself
    /// (see `WordExp::confine`) and sends the expanded words,
    /// space-separated, back through a pipe.
    ///
    /// * `input` - string to expand; returned unchanged if it is empty or
    ///   contains none of `$`, a backtick or `(`.
    /// * `cmd_subs` - whether command substitution is permitted.
    /// * `timeout` - how long to wait before the helper is killed; a zero
    ///   timeout yields `SystemError(EINVAL)`.
    ///
    /// The result is hex-encoded when it is not valid UTF-8 or contains
    /// unprintable characters.
    ///
    /// # Errors
    ///
    /// * `BadValue` if the expansion is empty.
    /// * `TimeoutError` if the helper had to be killed after `timeout`.
    /// * `OutOfMemory` / `SystemError` if collecting the output failed.
    /// * `SeccompError` / `ProcessError` if the helper died of a signal.
    /// * Otherwise the helper's non-zero exit code decoded with
    ///   `WordExpError::from`.
    pub fn expand(
        input: &str,
        cmd_subs: bool,
        timeout: Duration,
    ) -> Result<Cow<'_, str>, WordExpError> {
        // Quick returns:
        // Empty string or no special characters present.
        if input.is_empty() || memchr3(b'$', b'`', b'(', input.as_bytes()).is_none() {
            return Ok(Cow::Borrowed(input));
        }
        // Zero timeout prevents evaluation.
        if timeout.is_zero() {
            return Err(WordExpError::SystemError(Errno::EINVAL));
        }

        // Command substitution is optional.
        let mut flags = WordExpFlags::WRDE_SHOWERR;
        if !cmd_subs {
            flags |= WordExpFlags::WRDE_NOCMD;
        }

        // set up pipe to transfer wordexp(3) return string.
        let (pipe_rd, pipe_wr) = pipe2_raw(OFlag::O_CLOEXEC)?;

        // SAFETY: set read end of the pipe as non-blocking.
        let pipe_rd_ref = unsafe { BorrowedFd::borrow_raw(pipe_rd) };
        if let Err(errno) = set_nonblock(pipe_rd_ref, true) {
            // The pipe fds are still raw at this point:
            // close them explicitly so they do not leak.
            let _ = close(pipe_rd);
            let _ = close(pipe_wr);
            return Err(errno.into());
        }

        // SAFETY: Fork and confine before running wordexp(3)!
        let mut stack = [0u8; MINI_STACK_SIZE];
        let epoch = Instant::now();
        let pid_fd = match safe_clone(
            Box::new(move || -> isize {
                let _ = close(pipe_rd);
                // SAFETY: acquire a safe File handle to the pipe.
                let mut pipe = unsafe { File::from_raw_fd(pipe_wr) };
                // SAFETY: confine or panic!
                Self::confine();
                debug!("ctx": "expand",
                    "msg": format!("calling wordexp(3), good luck!"));
                // SAFETY: call into libc wordexp(3).
                for word in match Self::expand_word(input, flags) {
                    Ok(iter) => iter,
                    Err(err) =>
                    // SAFETY: In libc we trust.
                    unsafe { _exit(err) },
                } {
                    if word.is_empty() {
                        continue;
                    }
                    if let Err(ref error) = pipe.write_all(word.as_bytes()) {
                        let err = err2no(error) as i32;
                        // SAFETY: In libc we trust.
                        #[expect(clippy::arithmetic_side_effects)]
                        unsafe {
                            _exit(128 + err)
                        };
                    }
                    if let Err(ref error) = pipe.write_all(b" ") {
                        let err = err2no(error) as i32;
                        // SAFETY: In libc we trust.
                        #[expect(clippy::arithmetic_side_effects)]
                        unsafe {
                            _exit(128 + err)
                        };
                    }
                }
                // SAFETY: In libc we trust.
                unsafe { _exit(0) };
            }),
            &mut stack[..],
            0,
            Some(SIGCHLD),
        ) {
            Ok(pid_fd) => pid_fd,
            Err(error) => {
                // No helper was spawned: release both pipe ends
                // before reporting the failure.
                let _ = close(pipe_rd);
                let _ = close(pipe_wr);
                return Err(error.into());
            }
        };

        // The parent only reads; from here on the read end is owned by
        // `pipe` and closed automatically on every return path.
        let _ = close(pipe_wr);
        // SAFETY: pipe_rd is a valid FD.
        let mut pipe = unsafe { File::from_raw_fd(pipe_rd) };

        let mut eof = false;
        let mut sig = false;
        let mut err = Errno::UnknownErrno;

        let mut buf = [0u8; PATH_MAX];
        let mut ret = Vec::new();

        loop {
            if !sig && (err as i32 != 0 || epoch.elapsed() >= timeout.into()) {
                // a. Out of memory condition
                // b. Timeout exceeded
                // Send SIGKILL once, and fall-through to wait.
                sig = true;
                let _ = pidfd_send_signal(&pid_fd, SIGKILL);
            } else if !eof {
                // read one batch from pipe.
                match pipe.read(&mut buf) {
                    Ok(0) => {
                        // EOF, fall-through to wait.
                        eof = true;
                    }
                    Ok(n) => {
                        // child started writing to the pipe.
                        // this means wordexp(3) is done
                        // executing, so we no longer need
                        // to keep track of timeout.
                        if ret.try_reserve(n).is_err() {
                            err = Errno::ENOMEM;
                        } else {
                            ret.extend(&buf[..n]);
                        }
                        continue;
                    }
                    Err(ref e) if matches!(err2no(e), Errno::EAGAIN | Errno::EINTR) => {
                        std::thread::sleep(Duration::from_millis(100).into());
                        continue;
                    }
                    Err(ref e) => {
                        err = err2no(e);
                        continue;
                    }
                };
            }

            // wait for process without blocking.
            match waitid(
                Id::PIDFd(pid_fd.as_fd()),
                WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG,
            ) {
                Ok(WaitStatus::Exited(_, 0)) if eof => break,
                Ok(WaitStatus::Exited(_, 0)) => {
                    // Helper exited cleanly before we saw EOF:
                    // drain whatever is left in the pipe.
                    let mut end = Vec::new();
                    if end.try_reserve(16).is_err() {
                        return Err(WordExpError::OutOfMemory);
                    }
                    if let Err(e) = set_nonblock(&pipe, false) {
                        return Err(WordExpError::SystemError(e));
                    }
                    match pipe.read_to_end(&mut end) {
                        Ok(0) => break,
                        Ok(n) => {
                            if ret.try_reserve(n).is_err() {
                                return Err(WordExpError::OutOfMemory);
                            }
                            ret.extend(&end[..n]);
                            break;
                        }
                        Err(ref e) => return Err(WordExpError::SystemError(err2no(e))),
                    }
                }
                Ok(WaitStatus::Exited(_, n)) => return Err(WordExpError::from(n)),
                Ok(WaitStatus::Signaled(_, SIGSYS, _)) => return Err(WordExpError::SeccompError),
                Ok(WaitStatus::Signaled(_, SIGKILL, _)) if err == Errno::ENOMEM => {
                    return Err(WordExpError::OutOfMemory)
                }
                Ok(WaitStatus::Signaled(_, SIGKILL, _)) if err as i32 != 0 => {
                    return Err(WordExpError::SystemError(err))
                }
                Ok(WaitStatus::Signaled(_, SIGKILL, _)) => {
                    return Err(WordExpError::TimeoutError(timeout.as_secs()))
                }
                Ok(WaitStatus::Signaled(_, sig, _)) => return Err(WordExpError::ProcessError(sig)),
                // Helper is still running (or the wait failed): try again.
                _ => {}
            };
        }

        // SAFETY: do not allow empty replacement.
        if ret.is_empty() {
            return Err(WordExpError::BadValue);
        }
        ret.pop(); // pop the trailing word separator.

        // SAFETY: hex-encode if expansion is invalid UTF-8.
        let ret = match std::str::from_utf8(&ret) {
            Ok(ret) => ret.to_string(),
            Err(_) => return Ok(HEXLOWER.encode(&ret).into()),
        };

        // SAFETY: do not allow empty replacement.
        if ret.is_empty() {
            return Err(WordExpError::BadValue);
        }

        // SAFETY: hex-encode if string has non-printables.
        if contains_ascii_unprintable(ret.as_bytes()) {
            Ok(HEXLOWER.encode(ret.as_bytes()).into())
        } else {
            Ok(ret.into())
        }
    }

    /// Transit the wordexp(3) fork process into a confined state,
    /// with read-only access to the filesystem.
    ///
    /// Steps, in order: change directory to the user's home, enter new
    /// namespaces, apply the Landlock policy, set the
    /// Memory-Deny-Write-Execute attribute, restrict `RLIMIT_FSIZE` and
    /// finally load the seccomp filter.
    ///
    /// Does nothing if the `ENV_SKIP_SCMP` environment variable is set.
    ///
    /// # Safety
    ///
    /// Panics on all errors except Landlock, namespaces and
    /// Memory-Deny-Write-Execute, which are optional as they may not be
    /// available.
    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::disallowed_methods)]
    pub fn confine() {
        if secure_getenv(ENV_SKIP_SCMP).is_some() {
            return;
        }

        // SAFETY: Determine user HOME directory.
        // This will be confined by Landlock.
        let uid = Uid::current();
        let gid = Gid::current();
        let name = get_user_name(uid);
        let home = get_user_home(&name);
        debug!("ctx": "expand",
            "msg": format!("started confining wordexp process {} running as user {name}",
                Pid::this().as_raw()));

        // SAFETY: ensure safe working directory.
        chdir(&home).expect("change dir to home");
        debug!("ctx": "expand",
            "msg": format!("changed directory to {home}"));

        // SAFETY: set up namespace isolation.
        // continue on errors as unprivileged userns may not be supported.
        let _ = Self::setup_namespaces(uid, gid);

        // SAFETY: Landlock: confine filesystem as read-only.
        // continue on errors as Landlock may not be supported.
        let mut path_ro = SydHashSet::default();
        let mut path_rw = SydHashSet::default();
        for ro in [
            "/bin",
            "/dev",
            "/lib",
            "/lib64",
            "/libexec",
            "/opt",
            "/proc",
            "/run",
            "/sbin",
            "/usr",
            "/var",
            "/etc/ld.so.conf",
            "/etc/ld.so.cache",
            "/etc/ld.so.conf.d",
            "/etc/ld-x86_64-pc-linux-musl.path",
            "/etc/ld-musl-aarch64.path",
            "/etc/ld-musl-aarch64.d",
            "/etc/hostname",
            "/etc/motd",
            "/etc/os-release",
            "/etc/machine-id",
            "/etc/passwd",
            "/etc/group",
            "/etc/group-",
            "/etc/securetty",
            "/etc/shells",
            "/etc/sysctl.conf",
            "/etc/sysctl.d",
            "/etc/xdg",
            "/etc/networks",
            "/etc/protocols",
            "/etc/services",
            "/etc/environment",
            "/etc/login.defs",
            "/etc/mime.types",
            "/etc/profile",
            "/etc/profile.env",
            "/etc/profile.d",
            "/etc/profile.csh",
            "/etc/bash",
            "/etc/zsh",
            "/etc/zshenv",
            "/etc/zshrc",
            "/etc/zlogin",
            "/etc/zprofile",
            "/etc/syd",
        ] {
            path_ro.insert(XPathBuf::from(ro));
        }
        for home_ro in [
            ".profile",
            ".bashrc",
            ".bash_login",
            ".bash_profile",
            ".zshenv",
            ".zshrc",
            ".zlogin",
            ".zprofile",
            ".config/syd",
            ".local/share/syd",
        ] {
            path_ro.insert(xpath!("{home}/{home_ro}"));
        }
        for rw in ["/dev/null", "/dev/tty"] {
            path_rw.insert(XPathBuf::from(rw));
        }

        // SAFETY: RW implies RO for simplicity.
        // Every writable path is also readable. The reverse must NOT
        // hold: adding the read-only paths to the writable set would
        // grant write access to everything listed above.
        path_ro.extend(path_rw.clone());

        let policy = LandlockPolicy {
            read_pathset: Some(path_ro.clone()),
            readdir_pathset: Some(path_ro.clone()),
            exec_pathset: Some(path_ro.clone()),

            write_pathset: Some(path_rw.clone()),
            ioctl_pathset: Some(path_rw.clone()),
            create_pathset: Some(path_rw.clone()),
            delete_pathset: Some(path_rw.clone()),
            rename_pathset: Some(path_rw.clone()),
            symlink_pathset: Some(path_rw.clone()),
            truncate_pathset: Some(path_rw.clone()),
            mkdir_pathset: Some(path_rw.clone()),
            rmdir_pathset: Some(path_rw.clone()),
            // SAFETY: Deny MakeChar for added hardening.
            // mkdev_pathset: None,
            mkfifo_pathset: Some(path_rw.clone()),
            bind_pathset: Some(path_rw.clone()),

            // Note we don't use scoped signals of Landlock ABI 6 here,
            // because we want the wordexp process to signal the init
            // process with the parent death signal.
            scoped_abs: true,

            ..Default::default()
        };

        let abi = crate::landlock::ABI::new_current();
        match policy.restrict_self(abi) {
            Ok(status) => match status.ruleset {
                RulesetStatus::FullyEnforced => {
                    debug!("ctx": "expand",
                        "msg": format!("Landlock ABI {} is fully enforced",
                            abi as i32),
                        "abi": abi as i32);
                }
                RulesetStatus::PartiallyEnforced => {
                    debug!("ctx": "expand",
                        "msg": format!("Landlock ABI {} is partially enforced",
                            abi as i32),
                        "abi": abi as i32);
                }
                RulesetStatus::NotEnforced => {
                    debug!("ctx": "expand",
                        "msg": format!("Landlock ABI {} is not enforced",
                            abi as i32),
                        "abi": abi as i32);
                }
            },
            Err(error) => {
                debug!("ctx": "expand",
                    "msg": format!("Landlock ABI {} is unsupported: {error}",
                        abi as i32),
                    "abi": abi as i32);
            }
        }

        #[cfg(not(any(
            target_arch = "mips",
            target_arch = "mips32r6",
            target_arch = "mips64",
            target_arch = "mips64r6"
        )))]
        // Set Memory-Deny-Write-Execute attribute.
        // continue on errors as MDWE may not be supported.
        match confine_mdwe(false) {
            Ok(_) => {
                debug!("ctx": "expand",
                    "msg": "set Memory-Deny-Write-Execute attribute to deny W^X memory");
            }
            Err(Errno::EINVAL) => {
                debug!("ctx": "expand",
                    "msg": "Memory-Deny-Write-Execute attribute requires Linux-6.3 or newer");
            }
            Err(Errno::EPERM) => {
                debug!("ctx": "expand",
                    "msg": "Memory-Deny-Write-Execute attribute was set already");
            }
            Err(errno) => {
                debug!("ctx": "expand",
                    "msg": format!("failed to enable Memory-Deny-Write-Execute attribute: {errno}"));
            }
        }

        // Set file size rlimits to zero, panic on errors.
        #[expect(clippy::disallowed_methods)]
        confine_rlimit(Resource::RLIMIT_FSIZE, None).expect("setrlimit(RLIMIT_FSIZE,0)");

        // SAFETY: confine with seccomp, panics on errors.
        Self::confine_seccomp();
    }

    /// Builds and loads the seccomp filter for the wordexp process.
    ///
    /// System calls named in `WORDEXP_SYSCALLS`, `FUTEX_SYSCALLS`,
    /// `GETID_SYSCALLS` and `VDSO_SYSCALLS` are allowed; everything
    /// else, including calls from a foreign architecture, fails with
    /// `ENOSYS`. Names the seccomp library cannot resolve are skipped.
    ///
    /// Panics if the filter cannot be created, configured or loaded.
    #[expect(clippy::disallowed_methods)]
    fn confine_seccomp() {
        // Default action: fail with ENOSYS.
        let mut ctx = ScmpFilterContext::new(ScmpAction::Errno(ENOSYS)).expect("create filter");

        // NO_NEW_PRIVS must be enforced before
        // the filter is loaded into the kernel.
        ctx.set_ctl_nnp(true).expect("enforce no-new-privs");

        // Requests with bad architecture are denied as well.
        ctx.set_act_badarch(ScmpAction::Errno(ENOSYS))
            .expect("set bad architecture action");

        // Best effort: binary tree sorted by syscall number.
        let _ = ctx.set_ctl_optimize(2);

        let allowed = WORDEXP_SYSCALLS
            .iter()
            .chain(FUTEX_SYSCALLS)
            .chain(GETID_SYSCALLS)
            .chain(VDSO_SYSCALLS);
        for name in allowed {
            // Skip names the seccomp library does not know about.
            let Ok(syscall) = ScmpSyscall::from_name(name) else {
                continue;
            };
            ctx.add_rule(ScmpAction::Allow, syscall)
                .expect("filter syscall");
        }

        ctx.load().expect("load filter");
        debug!("ctx": "expand",
            "msg": "loaded seccomp filter");
    }

    /// Moves the calling process into new user, cgroup, ipc, network,
    /// mount, pid, uts and time namespaces and locks down the mount tree.
    ///
    /// After unsharing, the given `uid`/`gid` are mapped in the new user
    /// namespace, root mount propagation is made private, the root is
    /// bind-remounted with readonly, nosuid, nodev and nosymfollow, and
    /// `mount_proc` is called to spawn the init process that mounts a
    /// private procfs.
    ///
    /// # Errors
    ///
    /// Returns the errno of the first failing step. The caller
    /// (`confine`) ignores the result, so a failure part-way through
    /// leaves the earlier steps in effect.
    ///
    /// # Panics
    ///
    /// Panics inside `mount_proc` if the init process cannot be spawned.
    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::disallowed_methods)]
    fn setup_namespaces(uid: Uid, gid: Gid) -> Result<(), Errno> {
        unshare(
            CloneFlags::CLONE_NEWUSER
                | CloneFlags::CLONE_NEWCGROUP
                | CloneFlags::CLONE_NEWIPC
                | CloneFlags::CLONE_NEWNET
                | CloneFlags::CLONE_NEWNS
                | CloneFlags::CLONE_NEWPID
                | CloneFlags::CLONE_NEWUTS
                | CLONE_NEWTIME,
        )?;
        debug!("ctx": "expand",
            "msg": "created and entered into new user, mount, pid, network, cgroup, ipc, uts, and time namespaces");

        // Set up UID/GID mapping in new user namespace.
        proc_map_user(proc_open()?, uid, gid, false /*map_root*/)?;

        // SAFETY: Remount rootfs as readonly,nosuid,nodev,nosymfollow.
        // `flags` is reused (with adjustments) for the procfs mount below.
        let mut flags = MountAttrFlags::MOUNT_ATTR_RDONLY
            | MountAttrFlags::MOUNT_ATTR_NOSUID
            | MountAttrFlags::MOUNT_ATTR_NODEV
            | MountAttrFlags::MOUNT_ATTR_NOSYMFOLLOW;

        // Set mount propagation to private.
        set_root_mount_propagation(MsFlags::MS_PRIVATE)?;
        debug!("ctx": "expand",
            "msg": "set mount propagation to private in new mount namespace");

        // Remount root: bind "/" onto itself with the flags above.
        open(
            "/",
            OFlag::O_CLOEXEC | OFlag::O_PATH | OFlag::O_DIRECTORY | OFlag::O_NOFOLLOW,
            Mode::empty(),
        )
        .and_then(|root| mount_bind(&root, &root, flags))?;
        debug!("ctx": "expand",
            "msg": "remounted root with readonly, nosuid, nodev, and nosymfollow options in new mount namespace");

        // SAFETY: Mount private procfs.
        // pid=1 is required to exist before this.
        // For procfs, nosymfollow is dropped and noexec is added.
        flags.remove(MountAttrFlags::MOUNT_ATTR_NOSYMFOLLOW);
        flags.insert(MountAttrFlags::MOUNT_ATTR_NOEXEC);
        Self::mount_proc(flags);

        Ok(())
    }

    /// Spawns the init process (pid 1) of the new pid namespace and lets
    /// it mount a private procfs on `/proc` with the given `flags`.
    ///
    /// The spawned process sets SIGKILL as its parent-death signal,
    /// blocks all signals, attempts the procfs mount (failures are only
    /// logged) and then sleeps until it is killed. If the parent-death
    /// signal cannot be set it returns immediately instead.
    ///
    /// # Panics
    ///
    /// Panics if the init process cannot be spawned. Inside the spawned
    /// process, failing to block signals panics as well.
    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::disallowed_methods)]
    fn mount_proc(flags: MountAttrFlags) {
        let mut stack = [0u8; MINI_STACK_SIZE];
        safe_clone(
            Box::new(move || -> isize {
                // pid=1 here.
                debug!("ctx": "expand",
                    "msg": "started init process in new pid namespace");

                // SAFETY: set parent-death signal to SIGKILL
                if set_pdeathsig(Some(Signal::SIGKILL)).is_err() {
                    return 0; // tear down the pid-ns.
                }
                debug!("ctx": "expand",
                    "msg": "set parent-death signal to SIGKILL for the init process");

                // SAFETY: block all signals
                sigprocmask(SigmaskHow::SIG_BLOCK, Some(&SigSet::all()), None)
                    .expect("block signals");

                // SAFETY: mount private procfs, continue on errors.
                match open(
                    "/proc",
                    OFlag::O_CLOEXEC | OFlag::O_PATH | OFlag::O_DIRECTORY,
                    Mode::empty(),
                )
                .and_then(|proc| {
                    mount_fs(
                        OsStr::new("proc"),
                        proc,
                        flags,
                        Some("hidepid=4,subset=pid"),
                    )
                }) {
                    Ok(_) => {
                        debug!("ctx": "expand",
                            "msg": "mounted proc with hidepid=4,subset=pid in new mount namespace");
                    }
                    Err(errno) => {
                        debug!("ctx": "expand",
                            "msg": format!("failed to mount private procfs: {errno}"));
                    }
                };

                // SAFETY: block until the parent-death signal kills us.
                std::thread::sleep(std::time::Duration::MAX);

                unreachable!();
            }),
            &mut stack[..],
            // SAFETY: do not copy pipe-fds into this process.
            // if write end of the pipe remains open unintentionally,
            // the read end will block forever which we absolutely
            // don't want. parent-death signal also helps with this
            // otherwise but better safe than sorry.
            // NOTE(review): CLONE_FILES shares the caller's file
            // descriptor table with the new process instead of copying
            // it -- confirm this gives the intended pipe behaviour.
            CLONE_FILES,
            Some(SIGCHLD),
        )
        // The returned handle is not needed; drop it right away.
        .map(drop)
        .expect("spawn pid1");
    }
}

/// Table of Linux system call names associated with word expansion.
///
/// Entries are plain syscall names, one per line, kept in byte-wise
/// ascending order (`_` sorts before lowercase letters, digits before
/// letters) with no duplicates; keep that order when adding entries.
///
/// The table mixes names from several architectures and ABIs (for
/// example the `arm`-tagged entries and the `*_time64` variants), so
/// not every name exists on every target.
//
// NOTE(review): the consumer of this table is outside this chunk.
// Presumably these names are resolved with libseccomp and allowed in
// the seccomp filter of the confined wordexp(3) process, and names
// unknown on the running architecture are skipped -- confirm against
// the code that iterates this slice.
const WORDEXP_SYSCALLS: &[&str] = &[
    "_llseek",
    "_newselect",
    "access",
    "alarm",
    "arch_prctl", // Used during platform-specific initialization by ld-linux.so.
    "arm_fadvise64_64",
    "arm_sync_file_range",
    "breakpoint", // arm
    "brk",
    "cacheflush", // arm
    "capget",
    "chdir",
    "clock_nanosleep",
    "clock_nanosleep_time64",
    "clone",
    "clone3",
    "close",
    "close_range",
    "copy_file_range",
    "dup",
    "dup2",
    "dup3",
    "epoll_create",
    "epoll_create1",
    "epoll_ctl",
    "epoll_ctl_old",
    "epoll_pwait",
    "epoll_pwait2",
    "epoll_wait",
    "epoll_wait_old",
    "eventfd",
    "eventfd2",
    "execve",
    "execveat",
    "exit",
    "exit_group",
    "faccessat",
    "faccessat2",
    "fadvise64",
    "fadvise64_64",
    "fchdir",
    "fcntl",
    "fcntl64",
    "fdatasync",
    "fgetxattr",
    "flistxattr",
    "flock",
    "fork",
    "fstat",
    "fstat64",
    "fstatfs",
    "fstatfs64",
    "fsync",
    "futex",
    "futex_time64",
    "futex_waitv",
    "get_mempolicy",
    "get_robust_list",
    "get_thread_area",
    "getcwd",
    "getitimer",
    "getpeername",
    "getpgid",
    "getpgrp",
    "getpid",
    "getpmsg",
    "getppid",
    "getpriority",
    "getrlimit",
    "getrusage",
    "getsid",
    "getsockopt",
    "gettid",
    "getxattr",
    "io_cancel",
    "io_destroy",
    "io_getevents",
    "io_pgetevents",
    "io_pgetevents_time64",
    "io_setup",
    "io_submit",
    "ioprio_get",
    "ioprio_set",
    "kcmp",
    "kill",
    "landlock_add_rule",
    "landlock_create_ruleset",
    "landlock_restrict_self",
    "lgetxattr",
    "listxattr",
    "llistxattr",
    "lseek",
    "lstat",
    "madvise", // TODO: confine advice (no-op!).
    "membarrier",
    "mlock",
    "mlock2",
    "mlockall",
    "mmap",
    "mmap2",
    "mprotect",
    "mq_getsetattr",
    "mq_notify",
    "mq_open",
    "mq_timedreceive",
    "mq_timedreceive_time64",
    "mq_timedsend",
    "mq_timedsend_time64",
    "mq_unlink",
    "mremap",
    "msgctl",
    "msgget",
    "msgrcv",
    "msync",
    "munlock",
    "munlockall",
    "munmap",
    "nanosleep",
    "newfstatat",
    "oldfstat",
    "oldolduname",
    "olduname",
    "open",
    "openat",
    "openat2",
    "pause",
    "pipe",
    "pipe2",
    "poll",
    "ppoll",
    "ppoll_time64",
    "prctl",
    "pread64",
    "preadv",
    "preadv2",
    "prlimit64",
    "process_madvise",
    "process_mrelease",
    "pselect6",
    "pselect6_time64",
    "pwrite64",
    "pwritev",
    "pwritev2",
    "read",
    "readahead",
    "readlink",
    "readlinkat",
    "readv",
    "remap_file_pages",
    "restart_syscall",
    "riscv_flush_icache",
    "rseq",
    "rt_sigaction",
    "rt_sigpending",
    "rt_sigprocmask",
    "rt_sigqueueinfo",
    "rt_sigreturn",
    "rt_sigsuspend",
    "rt_sigtimedwait",
    "rt_sigtimedwait_time64",
    "rt_tgsigqueueinfo",
    "s390_pci_mmio_read",
    "s390_pci_mmio_write",
    "s390_runtime_instr",
    "sched_get_priority_max",
    "sched_get_priority_min",
    "sched_getaffinity",
    "sched_getattr",
    "sched_getparam",
    "sched_getscheduler",
    "sched_rr_get_interval",
    "sched_rr_get_interval_time64",
    "sched_setaffinity",
    "sched_setattr",
    "sched_setparam",
    "sched_setscheduler",
    "sched_yield",
    "seccomp",
    "select",
    "semctl",
    "semget",
    "semop",
    "semtimedop",
    "semtimedop_time64",
    "set_robust_list",
    "set_thread_area",
    "set_tid_address",
    "set_tls", // arm
    "setitimer",
    "setpgid",
    "setpriority",
    "setrlimit",
    "setsid",
    "setsockopt",
    "shmat",
    "shmctl",
    "shmdt",
    "shmget",
    "sigaction",
    "sigaltstack",
    "signal",
    "signalfd",
    "signalfd4",
    "sigpending",
    "sigprocmask",
    "sigreturn",
    "sigsuspend",
    "splice",
    "stat",
    "stat64",
    "statx",
    "sync_file_range",
    "tee",
    "tgkill",
    "timer_create",
    "timer_delete",
    "timer_getoverrun",
    "timer_gettime",
    "timer_gettime64",
    "timer_settime",
    "timer_settime64",
    "timerfd_create",
    "timerfd_gettime",
    "timerfd_gettime64",
    "timerfd_settime",
    "timerfd_settime64",
    "times",
    "tkill",
    "ugetrlimit",
    "umask",
    "uname",
    "vfork",
    "wait4",
    "waitid",
    "waitpid",
    "write",
    "writev",
];
