//
// Syd: rock-solid application kernel
// src/kernel/memfd.rs: memfd_create(2) handler
//
// Copyright (c) 2023, 2024, 2025 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::ffi::CString;

use libseccomp::ScmpNotifResp;
use memchr::{arch::all::is_prefix, memchr};
use nix::errno::Errno;

use crate::{
    compat::MFdFlags,
    config::{HAVE_MFD_NOEXEC_SEAL, MFD_HUGETLB_NAME_PREFIX, MFD_NAME_PREFIX, MMAP_MIN_ADDR},
    cookie::safe_memfd_create,
    kernel::sandbox_path,
    path::XPathBuf,
    req::UNotifyEventRequest,
    sandbox::Capability,
};

// Handle memfd_create(2) on behalf of the sandbox process.
//
// The flags argument is validated first, then the name is read from the
// memory of the sandbox process. Unless trace/allow_unsafe_memfd:1 is
// set, and provided MFD_NOEXEC_SEAL is available, the memory fd is
// sealed as non-executable. When sandboxing is enabled for any of the
// relevant capabilities, access is checked against the name with the
// memfd prefix prepended. On success the memory fd is created on our
// side and sent to the sandbox process.
//
// Errors:
// - EINVAL: invalid flags, or no terminating null byte was found
//   within the maximum name length.
// - EFAULT: name pointer is below MMAP_MIN_ADDR.
// - EACCES: the access check failed.
// - Errors from reading the name, creating the memory fd, or sending
//   it to the sandbox process are propagated as is.
pub(crate) fn sys_memfd_create(request: UNotifyEventRequest) -> ScmpNotifResp {
    syscall_handler!(request, |request: UNotifyEventRequest| {
        const NAME_MAX: usize = 255;
        // One byte of our prefix is not counted towards the limit.
        // NOTE(review): presumably the leading marker byte of the
        // prefix (e.g. `!' in `!memfd:'); confirm against config.
        const MFD_NAME_PREFIX_LEN: usize = MFD_NAME_PREFIX.len() - 1;
        // Maximum length of the name, excluding the terminating null byte.
        const MFD_NAME_MAX_LEN: usize = NAME_MAX - MFD_NAME_PREFIX_LEN;
        // Size of the read buffer: The longest permitted name plus its
        // terminating null byte. Reading only MFD_NAME_MAX_LEN bytes
        // would reject a name of exactly the maximum length, because
        // its null byte would fall outside of the buffer.
        const MFD_NAME_BUF_LEN: usize = MFD_NAME_MAX_LEN + 1;

        let req = request.scmpreq;
        let addr = req.data.args[0];
        let flags = req.data.args[1];

        // Validate flags argument first.
        let mut flags = to_mfdflags(flags)?;

        // Validate name argument next.
        if addr < *MMAP_MIN_ADDR {
            // Return EFAULT for invalid pointer.
            return Err(Errno::EFAULT);
        }

        // Acquire the sandbox and query the options which affect the
        // emulation. These are copied out here because they're needed
        // after the sandbox lock is released below.
        let sandbox = request.get_sandbox();
        let force_cloexec = sandbox.flags.force_cloexec();
        let force_rand_fd = sandbox.flags.force_rand_fd();
        let restrict_memfd = !sandbox.flags.allow_unsafe_memfd();

        // SAFETY: Drop the executable flag and seal as nonexecutable,
        // unless trace/allow_unsafe_memfd:1 is set.
        if restrict_memfd && *HAVE_MFD_NOEXEC_SEAL {
            flags.remove(MFdFlags::MFD_EXEC);
            flags.insert(MFdFlags::MFD_NOEXEC_SEAL);
        }

        // Read the name including room for the terminating null byte.
        let mut buf = request.read_vec(addr, MFD_NAME_BUF_LEN)?;

        // If buffer has no null byte,
        // return EINVAL as the name is too long for us to handle.
        let null = memchr(0, &buf)
            .ok_or(Errno::EINVAL)?
            .checked_add(1)
            .ok_or(Errno::EINVAL)?;
        // Keep everything up to and including the first null byte.
        buf.truncate(null);
        buf.shrink_to_fit();

        // SAFETY:
        // If name starts with `syd', turn into `Syd'.
        // The `syd' prefix is used internally.
        // We don't return EINVAL here for stealth.
        if is_prefix(&buf, b"syd") {
            // The prefix matched, so the buffer is not empty.
            buf[0] = b'S';
        }

        // SAFETY:
        // Buffer was truncated right after its first null byte,
        // therefore it has exactly one null byte which is its last
        // element and there're no interior null bytes.
        let name = unsafe { CString::from_vec_with_nul_unchecked(buf) };

        // Determine sandbox capabilities:
        // Creation is always checked, execution is checked in addition
        // unless the memory fd is going to be sealed as nonexecutable.
        let mut caps = Capability::CAP_CREATE;
        if !flags.contains(MFdFlags::MFD_NOEXEC_SEAL) {
            caps.insert(Capability::CAP_EXEC);
        }

        if sandbox.enabled_any(caps) {
            // Check for access by appending the name to the memfd prefix.
            let mut path = XPathBuf::from(if flags.contains(MFdFlags::MFD_HUGETLB) {
                // !memfd-hugetlb:
                MFD_HUGETLB_NAME_PREFIX
            } else {
                // !memfd:
                MFD_NAME_PREFIX
            });
            // The terminating null byte is not part of the path.
            path.append_bytes(name.as_bytes());

            // Any failure of the access check is reported as EACCES.
            sandbox_path(
                Some(&request),
                &sandbox,
                request.scmpreq.pid(), // Unused when request.is_some()
                &path,
                caps,
                false,
                "memfd_create",
            )
            .or(Err(Errno::EACCES))?;
        }
        drop(sandbox); // release the read-lock.

        // Set CLOEXEC for our fd always, and
        // Set CLOEXEC for remote fd as necessary.
        // The remote setting must be computed before the flag is
        // forced on for our own fd.
        let cloexec = force_cloexec || flags.contains(MFdFlags::MFD_CLOEXEC);
        flags.insert(MFdFlags::MFD_CLOEXEC);

        // Access granted, emulate call.
        let fd = safe_memfd_create(name.as_c_str(), flags)?;

        // Return the fd to the sandbox process.
        request.send_fd(fd, cloexec, force_rand_fd)
    })
}

// Turn the raw flags argument of the system call into MFdFlags.
//
// Returns EINVAL if the argument does not fit into the integer type of
// the flags, has bits that MFdFlags does not define, or has an
// unacceptable combination of MFD_EXEC and MFD_NOEXEC_SEAL.
#[inline]
fn to_mfdflags(arg: u64) -> Result<MFdFlags, Errno> {
    // SAFETY: Reject values which are out of range, and
    // reject undefined flags for future safety!
    let flags = arg
        .try_into()
        .ok()
        .and_then(MFdFlags::from_bits)
        .ok_or(Errno::EINVAL)?;

    let exec_flags = MFdFlags::MFD_EXEC | MFdFlags::MFD_NOEXEC_SEAL;

    // SAFETY:
    // 1. Linux>=6.3: Reject when both flags are specified together.
    // 2. Linux<6.3: Reject if either one of the flags is specified.
    let rejected = if *HAVE_MFD_NOEXEC_SEAL {
        flags.contains(exec_flags)
    } else {
        flags.intersects(exec_flags)
    };

    if rejected {
        Err(Errno::EINVAL)
    } else {
        Ok(flags)
    }
}
