// Syd: rock-solid application kernel
// src/kernel/net/sendmsg.rs: sendmsg(2) and sendmmsg(2) handlers
//
// Copyright (c) 2023, 2024, 2025 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    io::IoSlice,
    os::fd::{AsRawFd, OwnedFd, RawFd},
};

use libseccomp::ScmpNotifResp;
use nix::{
    errno::Errno,
    sys::socket::{sendmsg, ControlMessage, MsgFlags, SockaddrStorage, UnixCredentials},
    unistd::Pid,
};
use zeroize::Zeroizing;

use crate::{
    compat::{
        addr_family, cmsg_len_32, cmsg_space_32, msghdr, msghdr32, PF_ALG, PF_INET, PF_INET6,
        PF_NETLINK, PF_UNIX,
    },
    confine::scmp_arch_bits,
    fs::{file_type, get_nonblock, has_recv_timeout, is_same_vm, FileType},
    hook::UNotifyEventRequest,
    kernel::{
        net::{canon_addr, get_addr, sandbox_addr, to_msgflags},
        sandbox_path,
    },
    path::XPath,
    sandbox::Capability,
};

#[expect(clippy::cognitive_complexity)]
pub(crate) fn handle_sendmsg(
    fd: OwnedFd,
    request: &UNotifyEventRequest,
    args: &[u64; 6],
    allow_unsupp_socket: bool,
    restrict_oob: bool,
    restrict_mkbdev: bool,
) -> Result<ScmpNotifResp, Errno> {
    // SAFETY: Reject undefined/invalid flags.
    let flags = to_msgflags(args[2])?;

    // SAFETY: Reject MSG_OOB as necessary.
    if restrict_oob && flags.contains(MsgFlags::MSG_OOB) {
        // Signal no support to let the sandbox process
        // handle the error gracefully. This is consistent
        // with the Linux kernel.
        return Err(Errno::EOPNOTSUPP);
    }

    let req = request.scmpreq;
    let is32 = scmp_arch_bits(req.data.arch) == 32;
    let size = if is32 {
        // Note, socketcall is the same number on all:
        // x86, mips, mipsel, ppc, ppc64, ppc64le, s390 and s390x.
        size_of::<msghdr32>()
    } else {
        size_of::<msghdr>()
    };
    let mut buf = Zeroizing::new(Vec::new());
    buf.try_reserve(size).or(Err(Errno::ENOMEM))?;
    buf.resize(size, 0);
    request.read_mem(&mut buf, args[1])?;

    let msg = if is32 {
        // SAFETY: See below.
        let msg: msghdr32 = unsafe { std::ptr::read_unaligned(buf.as_ptr() as *const _) };
        crate::compat::msghdr::from(msg)
    } else {
        // SAFETY: The following unsafe block assumes that:
        // 1. The memory layout of open_how in our Rust environment
        //    matches that of the target process.
        // 2. The request.process.read_mem call has populated buf with valid data
        //    of the appropriate size (ensured by the size check above).
        // 3. The buffer is appropriately aligned for reading an
        //    open_how struct. If the remote process's representation of
        //    open_how was correctly aligned, our local buffer should be
        //    too, since it's an array on the stack.
        unsafe { std::ptr::read_unaligned(buf.as_ptr() as *const _) }
    };

    let addr_remote = msg.msg_name;
    let addr_len = msg.msg_namelen;

    // Step 1: Handle the address.
    let sandbox = request.get_sandbox();
    let addr = if !addr_remote.is_null() && addr_len > 0 {
        let addr = get_addr(request, addr_remote as u64, addr_len)?;
        let (addr, root) = canon_addr(request, &sandbox, &addr, Capability::CAP_NET_CONNECT)?;
        match addr_family(&addr) {
            PF_UNIX | PF_INET | PF_INET6 => {
                // Check for access.
                sandbox_addr(
                    request,
                    &sandbox,
                    &addr,
                    &root,
                    0x10,
                    Capability::CAP_NET_CONNECT,
                )?;
            }
            PF_ALG | PF_NETLINK => {
                // SAFETY: We do not check AF_ALG or AF_NETLINK for access.
            }
            _ if allow_unsupp_socket => {
                // SAFETY: We do not check unsupported sockets for access.
            }
            _ => return Err(Errno::EAFNOSUPPORT),
        };

        Some((addr, root))
    } else {
        // Connection-mode socket.
        // SAFETY: We cannot continue here due to the added level of
        // pointer indirection.
        None
    };

    // Step 2: Handle control messages.
    let mut control_messages = Vec::new();
    let control_data = if !msg.msg_control.is_null() && msg.msg_controllen > 0 {
        #[expect(clippy::useless_conversion)]
        let cmsg_len = usize::try_from(msg.msg_controllen)
            .or(Err(Errno::EINVAL))?
            .min(1000000); // SAFETY: Cap at 1mio.
        let mut cmsg_buf = Vec::new();
        cmsg_buf.try_reserve(cmsg_len).or(Err(Errno::ENOMEM))?;
        cmsg_buf.resize(cmsg_len, 0);
        request.read_mem(&mut cmsg_buf, msg.msg_control as u64)?;
        Some(parse_control_messages(request, &cmsg_buf)?)
    } else {
        None
    };

    if let Some((
        ref control_fds,
        ref control_creds,
        ref control_ivs,
        ref control_ops,
        ref control_aead_assoclens,
        ref control_udp_gso_segments,
        ref control_ipv4_packet_infos,
        ref control_ipv6_packet_infos,
        ref control_rxq_ovfls,
        ref control_tx_times,
        ref control_ipv4_toses,
        ref control_ipv6_tclasses,
    )) = control_data
    {
        // Check for sendfd access as necessary.
        if !control_fds.is_empty() {
            if let Some((ref addr, ref root)) = addr {
                sandbox_addr(
                    request,
                    &sandbox,
                    addr,
                    root,
                    0x10,
                    Capability::CAP_NET_SENDFD,
                )?;
            } else {
                // SAFETY: For cases where address is not available, we
                // perform an access check with a dummy path so as to
                // enable user to practically confine this case.
                sandbox_path(
                    Some(request),
                    &sandbox,
                    request.scmpreq.pid(), // Unused when request.is_some()
                    XPath::from_bytes(b"!unnamed"),
                    Capability::CAP_NET_SENDFD,
                    false,
                    "sendmsg",
                )?;
            }
        }

        for fds in control_fds {
            // SAFETY: Deny sending file descriptors referring to
            // 1. Block devices unless trace/allow_unsafe_mkbdev:1 is set.
            // 2. Directories
            // 3. Symbolic links
            //
            // Note, we do allow files of unknown type such as epoll
            // fds and event fds as some programs such as pipewire
            // depend on this. See test-pw-filter test of pipewire
            // for more information about this.
            for fd in fds {
                // TODO: Log this deny!
                match file_type(fd, None, false)? {
                    FileType::Dir | FileType::Lnk => return Err(Errno::EACCES),
                    FileType::Blk if restrict_mkbdev => return Err(Errno::EACCES),
                    _ => {}
                }
            }

            // SAFETY: OwnedFd is repr(transparent) over RawFd, so
            // the pointer cast & the slice length are correct.
            let raw_fds: &[RawFd] =
                unsafe { std::slice::from_raw_parts(fds.as_ptr() as *const RawFd, fds.len()) };

            control_messages.push(ControlMessage::ScmRights(raw_fds));
        }

        for creds in control_creds {
            control_messages.push(ControlMessage::ScmCredentials(creds));
        }

        for iv_data in control_ivs {
            control_messages.push(ControlMessage::AlgSetIv(iv_data.as_slice()));
        }

        for op in control_ops {
            control_messages.push(ControlMessage::AlgSetOp(op));
        }

        for assoclen in control_aead_assoclens {
            control_messages.push(ControlMessage::AlgSetAeadAssoclen(assoclen));
        }

        for gso_segments in control_udp_gso_segments {
            control_messages.push(ControlMessage::UdpGsoSegments(gso_segments));
        }

        for pktinfo in control_ipv4_packet_infos {
            control_messages.push(ControlMessage::Ipv4PacketInfo(pktinfo));
        }

        for pktinfo in control_ipv6_packet_infos {
            control_messages.push(ControlMessage::Ipv6PacketInfo(pktinfo));
        }

        for rxq_ovfl in control_rxq_ovfls {
            control_messages.push(ControlMessage::RxqOvfl(rxq_ovfl));
        }

        for tx_time in control_tx_times {
            control_messages.push(ControlMessage::TxTime(tx_time));
        }

        for tos in control_ipv4_toses {
            control_messages.push(ControlMessage::Ipv4Tos(tos));
        }

        for tclass in control_ipv6_tclasses {
            control_messages.push(ControlMessage::Ipv6TClass(tclass));
        }
    }
    drop(sandbox); // release the read-lock.

    // Step 3: Handle the payload which is an array of struct iovecs.
    let mut io_buffers: Vec<Vec<u8>> = Vec::new();
    let mut io_slices: Vec<IoSlice> = Vec::new();
    if !(msg.msg_iov.is_null() || msg.msg_iovlen == 0) {
        // SAFETY: The msg_iovlen member of the msghdr struct
        // must not be fully trusted, it can be overly large,
        // and allocating a Vector of that capacity may overflow.
        #[expect(clippy::useless_conversion)]
        let len = usize::try_from(msg.msg_iovlen)
            .or(Err(Errno::EINVAL))?
            .min(1000000); // Cap count at 1mio
        let size = if is32 {
            len.checked_mul(size_of::<crate::compat::iovec32>())
        } else {
            len.checked_mul(size_of::<libc::iovec>())
        }
        .ok_or(Errno::EINVAL)?;
        let mut buf = Zeroizing::new(Vec::new());
        buf.try_reserve(size).or(Err(Errno::ENOMEM))?;
        buf.resize(size, 0);
        request.read_mem(&mut buf, msg.msg_iov as u64)?;

        // SAFETY: This operation assumes that the buffer (`buf`) contains a valid sequence of bytes
        // that correctly represent an array of `iovec` structures. This is ensured by the preceding
        // code that reads memory into `buf` with proper length calculation. The length `len` is
        // derived from `msg.msg_iovlen` and capped to prevent overflow, ensuring that we do not
        // exceed the allocation size of `buf`. The conversion to a pointer and then to a slice
        // of `iovec` is safe under these conditions, assuming the memory layout of `iovec` is
        // correct and `buf` is correctly sized and aligned.
        let mut iovecs: Vec<libc::iovec> = Vec::new();
        if is32 {
            for chunk in buf.chunks(size_of::<crate::compat::iovec32>()) {
                // SAFETY: See above.
                let iov32: crate::compat::iovec32 =
                    unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const _) };
                iovecs.push(iov32.into());
            }
        } else {
            for chunk in buf.chunks(size_of::<libc::iovec>()) {
                // SAFETY: See above.
                iovecs.push(unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const _) });
            }
        };

        for iov in iovecs {
            if iov.iov_base.is_null() || iov.iov_len == 0 {
                // XXX: This happens with socketcall on x86, why?
                continue;
            }

            // Cap the length to a maximum value to avoid large allocations.
            // SAFETY: The maximum length cap prevents excessive memory
            // allocation based on untrusted `iov_len`.
            let iov_len = iov.iov_len.min(1000000); // Cap count at 1mio

            // Allocate a buffer to read into. This buffer size is now capped.
            let mut data_buf = Vec::new();
            data_buf.try_reserve(iov_len).or(Err(Errno::ENOMEM))?;
            data_buf.resize(iov_len, 0);

            // Read the memory from the remote process into our buffer.
            // SAFETY: This operation relies on the correctness of
            // `iov_base` as a pointer into the remote process's memory and
            // the capped `iov_len`.
            request.read_mem(&mut data_buf, iov.iov_base as u64)?;

            // Keep the pointer accessible, IoSlice needs a valid reference.
            io_buffers.try_reserve(1).or(Err(Errno::ENOMEM))?;
            io_buffers.push(data_buf);
        }
        io_slices
            .try_reserve(io_buffers.len())
            .or(Err(Errno::ENOMEM))?;
        for buffer in &io_buffers {
            io_slices.push(IoSlice::new(buffer));
        }
    }

    // SAFETY: Record blocking call so it can get invalidated.
    let is_blocking = if !flags.contains(MsgFlags::MSG_DONTWAIT) && !get_nonblock(&fd)? {
        let req = request.scmpreq;
        let ignore_restart = has_recv_timeout(&fd)?;

        // Record the blocking call.
        request.cache.add_sys_block(req, ignore_restart)?;

        true
    } else {
        false
    };

    let result = if let Some((addr, _)) = addr {
        // UNIX domain/abstract socket.
        sendmsg(
            fd.as_raw_fd(),
            &io_slices,
            &control_messages,
            flags,
            Some(&addr),
        )
    } else {
        // Connection-mode socket.
        sendmsg::<SockaddrStorage>(fd.as_raw_fd(), &io_slices, &control_messages, flags, None)
    };

    // Remove invalidation record unless interrupted.
    if is_blocking {
        request
            .cache
            .del_sys_block(req.id, matches!(result, Err(Errno::EINTR)))?;
    }

    // Send SIGPIPE for EPIPE unless MSG_NOSIGNAL is set.
    #[expect(clippy::cast_possible_wrap)]
    Ok(match result {
        Ok(n) => request.return_syscall(n as i64),
        Err(Errno::EPIPE) if !flags.contains(MsgFlags::MSG_NOSIGNAL) => {
            request.pidfd_kill(libc::SIGPIPE)?;
            request.fail_syscall(Errno::EPIPE)
        }
        Err(errno) => request.fail_syscall(errno),
    })
}

/// Emulates `sendmmsg(2)` on `fd` on behalf of the process that issued `request`.
///
/// `args[1]` is the address of the `mmsghdr` array in the calling process's
/// memory, `args[2]` is the number of entries (`vlen`) and `args[3]` carries
/// the message flags. Each entry is processed like a single `sendmsg(2)`:
/// the optional destination address, the control messages and the payload
/// are copied out of the calling process, checked against the sandbox policy
/// and sent with a local `sendmsg` call on `fd`. The number of bytes sent is
/// stored in the entry's `msg_len`, and the whole array is written back to
/// the calling process once every message has been sent.
///
/// A message without payload (NULL `msg_iov` or zero `msg_iovlen`) is still
/// sent, with an empty payload, exactly as `handle_sendmsg` does.
///
/// # Parameters
///
/// * `allow_unsupp_socket` - when `true`, destination addresses whose family
///   is not UNIX, INET, INET6, ALG or NETLINK are passed through unchecked
///   instead of being rejected.
/// * `restrict_oob` - when `true`, `MSG_OOB` is refused.
/// * `restrict_mkbdev` - when `true`, passing block device file descriptors
///   via `SCM_RIGHTS` is refused.
///
/// # Errors
///
/// * `EOPNOTSUPP` if `restrict_oob` is set and the flags contain `MSG_OOB`.
/// * `EAFNOSUPPORT` for an unsupported address family unless
///   `allow_unsupp_socket` is set.
/// * `EACCES` when a passed file descriptor refers to a directory, a symbolic
///   link, or (with `restrict_mkbdev`) a block device.
/// * `EINVAL` / `ENOMEM` for unrepresentable lengths and failed allocations.
/// * The first error returned by any emulated `sendmsg` call. For `EPIPE`
///   without `MSG_NOSIGNAL`, `SIGPIPE` is delivered through
///   `request.pidfd_kill` first.
///
/// Partial success is not reported: if a later message fails, the error is
/// returned even though earlier messages have already been sent, and nothing
/// is written back to the calling process.
#[expect(clippy::cognitive_complexity)]
pub(crate) fn handle_sendmmsg(
    fd: OwnedFd,
    request: &UNotifyEventRequest,
    args: &[u64; 6],
    allow_unsupp_socket: bool,
    restrict_oob: bool,
    restrict_mkbdev: bool,
) -> Result<ScmpNotifResp, Errno> {
    // SAFETY: Reject undefined/invalid flags.
    let msgflags = to_msgflags(args[3])?;

    // SAFETY: Reject MSG_OOB as necessary.
    if restrict_oob && msgflags.contains(MsgFlags::MSG_OOB) {
        // Signal no support to let the sandbox process
        // handle the error gracefully. This is consistent
        // with the Linux kernel.
        return Err(Errno::EOPNOTSUPP);
    }

    // NULL check was performed already.
    let addr = args[1];
    let vlen = usize::try_from(args[2]).or(Err(Errno::EINVAL))?;
    if vlen == 0 {
        return Ok(request.return_syscall(0));
    }
    let vlen = vlen.min(1024); // Cap at IOV_MAX

    let req = request.scmpreq;
    let is32 = scmp_arch_bits(req.data.arch) == 32;

    let size = if is32 {
        vlen.checked_mul(size_of::<crate::compat::mmsghdr32>())
    } else {
        vlen.checked_mul(size_of::<crate::compat::mmsghdr>())
    }
    .ok_or(Errno::EINVAL)?;

    // Read mmsghdr structures from remote process memory
    let mut buf = Zeroizing::new(Vec::new());
    buf.try_reserve(size).or(Err(Errno::ENOMEM))?;
    buf.resize(size, 0);
    request.read_mem(&mut buf, addr)?;

    let mut mmsghdrs: Vec<crate::compat::mmsghdr> = Vec::new();
    if is32 {
        for chunk in buf.chunks(size_of::<crate::compat::mmsghdr32>()) {
            // SAFETY: `buf` is exactly `vlen` entries long, so every
            // chunk is a full entry; `read_unaligned` places no
            // alignment requirement on the byte buffer.
            let mmsghdr: crate::compat::mmsghdr32 =
                unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const _) };
            mmsghdrs.try_reserve(1).or(Err(Errno::ENOMEM))?;
            mmsghdrs.push(mmsghdr.into());
        }
    } else {
        for chunk in buf.chunks(size_of::<crate::compat::mmsghdr>()) {
            mmsghdrs.try_reserve(1).or(Err(Errno::ENOMEM))?;
            // SAFETY: See above.
            mmsghdrs.push(unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const _) });
        }
    };

    // Check if the call is a blocking call which we need to invalidate as necessary.
    let (is_blocking, ignore_restart) =
        if !msgflags.contains(MsgFlags::MSG_DONTWAIT) && !get_nonblock(&fd)? {
            (true, has_recv_timeout(&fd)?)
        } else {
            (false, false)
        };
    // Check if we want to send SIGPIPE on EPIPE.
    let must_signal = !msgflags.contains(MsgFlags::MSG_NOSIGNAL);

    // Prepare a series of sendmsg calls.
    for mmsg in &mut mmsghdrs {
        let msg = &mut mmsg.msg_hdr;

        // Step 1: Handle the address.
        let addr_remote = msg.msg_name;
        let addr_len = msg.msg_namelen;
        let sandbox = request.get_sandbox();
        let addr_root = if !addr_remote.is_null() && addr_len > 0 {
            let addr = get_addr(request, addr_remote as u64, addr_len)?;
            let (addr, root) = canon_addr(request, &sandbox, &addr, Capability::CAP_NET_CONNECT)?;
            match addr_family(&addr) {
                PF_UNIX | PF_INET | PF_INET6 => {
                    // Check for access.
                    sandbox_addr(
                        request,
                        &sandbox,
                        &addr,
                        &root,
                        0x14,
                        Capability::CAP_NET_CONNECT,
                    )?;
                }
                PF_ALG | PF_NETLINK => {
                    // SAFETY: We do not check AF_ALG and AF_NETLINK for access.
                }
                _ if allow_unsupp_socket => {
                    // SAFETY: We do not check unsupported sockets for access.
                }
                _ => return Err(Errno::EAFNOSUPPORT),
            };

            Some((addr, root))
        } else {
            // Connection-mode socket.
            // SAFETY: We cannot continue here due to the added level of
            // pointer indirection.
            None
        };

        // Step 2: Handle control messages.
        let mut control_data = Vec::new();
        let control_datum = if !msg.msg_control.is_null() && msg.msg_controllen > 0 {
            #[expect(clippy::useless_conversion)]
            let cmsg_len = usize::try_from(msg.msg_controllen)
                .or(Err(Errno::EINVAL))?
                .min(1000000); // SAFETY: Cap at 1mio.
            let mut cmsg_buf = Vec::new();
            cmsg_buf.try_reserve(cmsg_len).or(Err(Errno::ENOMEM))?;
            cmsg_buf.resize(cmsg_len, 0);
            request.read_mem(&mut cmsg_buf, msg.msg_control as u64)?;
            Some(parse_control_messages(request, &cmsg_buf)?)
        } else {
            None
        };
        control_data.try_reserve(1).or(Err(Errno::ENOMEM))?;
        control_data.push(control_datum); // Keep OwnedFd alive!

        let mut control_messages = Vec::new();
        for control_datum in &control_data {
            if let Some((
                ref control_fds,
                ref control_creds,
                ref control_ivs,
                ref control_ops,
                ref control_aead_assoclens,
                ref control_udp_gso_segments,
                ref control_ipv4_packet_infos,
                ref control_ipv6_packet_infos,
                ref control_rxq_ovfls,
                ref control_tx_times,
                ref control_ipv4_toses,
                ref control_ipv6_tclasses,
            )) = &control_datum
            {
                // Check for sendfd access as necessary.
                if !control_fds.is_empty() {
                    if let Some((ref addr, ref root)) = addr_root {
                        // NOTE(review): the connect check above passes
                        // 0x14 while this one passes 0x10 -- confirm
                        // against sandbox_addr whether that is intended.
                        sandbox_addr(
                            request,
                            &sandbox,
                            addr,
                            root,
                            0x10,
                            Capability::CAP_NET_SENDFD,
                        )?;
                    } else {
                        // SAFETY: For cases where address is not available, we
                        // perform an access check with a dummy path so as to
                        // enable user to practically confine this case.
                        sandbox_path(
                            Some(request),
                            &sandbox,
                            request.scmpreq.pid(), // Unused when request.is_some()
                            XPath::from_bytes(b"!unnamed"),
                            Capability::CAP_NET_SENDFD,
                            false,
                            "sendmmsg",
                        )?;
                    }
                }

                for fds in control_fds {
                    // SAFETY: Deny sending file descriptors referring to
                    // 1. Block devices unless trace/allow_unsafe_mkbdev:1 is set.
                    // 2. Directories
                    // 3. Symbolic links
                    //
                    // Note, we do allow files of unknown type such as epoll
                    // fds and event fds as some programs such as pipewire
                    // depend on this. See test-pw-filter test of pipewire
                    // for more information about this.
                    for fd in fds {
                        // TODO: Log this deny!
                        match file_type(fd, None, false)? {
                            FileType::Dir | FileType::Lnk => return Err(Errno::EACCES),
                            FileType::Blk if restrict_mkbdev => return Err(Errno::EACCES),
                            _ => {}
                        }
                    }

                    // SAFETY: OwnedFd is repr(transparent) over RawFd, so
                    // the pointer cast & the slice length are correct.
                    let raw_fds: &[RawFd] = unsafe {
                        std::slice::from_raw_parts(fds.as_ptr() as *const RawFd, fds.len())
                    };

                    control_messages.push(ControlMessage::ScmRights(raw_fds));
                }

                for creds in control_creds {
                    control_messages.push(ControlMessage::ScmCredentials(creds));
                }

                for iv_data in control_ivs {
                    control_messages.push(ControlMessage::AlgSetIv(iv_data.as_slice()));
                }

                for op in control_ops {
                    control_messages.push(ControlMessage::AlgSetOp(op));
                }

                for assoclen in control_aead_assoclens {
                    control_messages.push(ControlMessage::AlgSetAeadAssoclen(assoclen));
                }

                for gso_segments in control_udp_gso_segments {
                    control_messages.push(ControlMessage::UdpGsoSegments(gso_segments));
                }

                for pktinfo in control_ipv4_packet_infos {
                    control_messages.push(ControlMessage::Ipv4PacketInfo(pktinfo));
                }

                for pktinfo in control_ipv6_packet_infos {
                    control_messages.push(ControlMessage::Ipv6PacketInfo(pktinfo));
                }

                for rxq_ovfl in control_rxq_ovfls {
                    control_messages.push(ControlMessage::RxqOvfl(rxq_ovfl));
                }

                for tx_time in control_tx_times {
                    control_messages.push(ControlMessage::TxTime(tx_time));
                }

                for tos in control_ipv4_toses {
                    control_messages.push(ControlMessage::Ipv4Tos(tos));
                }

                for tclass in control_ipv6_tclasses {
                    control_messages.push(ControlMessage::Ipv6TClass(tclass));
                }
            }
        }
        drop(sandbox); // release the read-lock before emulation.

        // Step 3: Handle the payload which is an array of struct iovecs.
        let mut io_buffers: Vec<Vec<u8>> = Vec::new();
        let mut io_slices: Vec<IoSlice> = Vec::new();
        if !(msg.msg_iov.is_null() || msg.msg_iovlen == 0) {
            // SAFETY: The msg_iovlen member of the msghdr struct
            // must not be fully trusted, it can be overly large,
            // and allocating a Vector of that capacity may overflow.
            #[expect(clippy::useless_conversion)]
            let len = usize::try_from(msg.msg_iovlen)
                .or(Err(Errno::EINVAL))?
                .min(1000000); // Cap count at 1mio.
            let size = if is32 {
                len.checked_mul(size_of::<crate::compat::iovec32>())
            } else {
                len.checked_mul(size_of::<libc::iovec>())
            }
            .ok_or(Errno::EINVAL)?;
            let mut buf = Zeroizing::new(Vec::new());
            buf.try_reserve(size).or(Err(Errno::ENOMEM))?;
            buf.resize(size, 0);
            request.read_mem(&mut buf, msg.msg_iov as u64)?;

            let mut iovecs: Vec<libc::iovec> = Vec::new();
            if is32 {
                for chunk in buf.chunks(size_of::<crate::compat::iovec32>()) {
                    // SAFETY: `buf` is exactly `len` entries long, so
                    // every chunk is a full entry; `read_unaligned`
                    // places no alignment requirement on the buffer.
                    let iov32: crate::compat::iovec32 =
                        unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const _) };
                    iovecs.try_reserve(1).or(Err(Errno::ENOMEM))?;
                    iovecs.push(iov32.into());
                }
            } else {
                for chunk in buf.chunks(size_of::<libc::iovec>()) {
                    iovecs.try_reserve(1).or(Err(Errno::ENOMEM))?;
                    // SAFETY: See above.
                    iovecs.push(unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const _) });
                }
            };

            for iov in iovecs {
                // Skip entries that carry no data, the same way
                // handle_sendmsg does, rather than attempting a
                // zero-length read from the remote process.
                if iov.iov_base.is_null() || iov.iov_len == 0 {
                    continue;
                }

                // Cap the length to a maximum value to avoid large allocations.
                // SAFETY: The maximum length cap prevents excessive memory
                // allocation based on untrusted `iov_len`.
                let iov_len = iov.iov_len.min(1000000); // Cap count at 1mio

                // Allocate a buffer to read into. This buffer size is now capped.
                let mut data_buf = Vec::new();
                data_buf.try_reserve(iov_len).or(Err(Errno::ENOMEM))?;
                data_buf.resize(iov_len, 0);

                // Read the memory from the remote process into our buffer.
                // SAFETY: This operation relies on the correctness of
                // `iov_base` as a pointer into the remote process's memory and
                // the capped `iov_len`.
                request.read_mem(&mut data_buf, iov.iov_base as u64)?;

                // Keep the pointer accessible, IoSlice needs a valid reference.
                io_buffers.try_reserve(1).or(Err(Errno::ENOMEM))?;
                io_buffers.push(data_buf);
            }
            io_slices
                .try_reserve(io_buffers.len())
                .or(Err(Errno::ENOMEM))?;
            for buffer in &io_buffers {
                io_slices.push(IoSlice::new(buffer));
            }
        }

        // The send below is deliberately outside the payload branch
        // above: a message without iovecs must still be sent (with an
        // empty payload) and have its msg_len updated, otherwise it
        // would be counted as sent without ever reaching the socket.

        // SAFETY: Record blocking call so it can get invalidated.
        if is_blocking {
            request.cache.add_sys_block(req, ignore_restart)?;
        }

        // Make the sendmsg call.
        let result = if let Some((addr, _)) = addr_root {
            // Connection-less socket.
            sendmsg(
                fd.as_raw_fd(),
                &io_slices,
                &control_messages,
                msgflags,
                Some(&addr),
            )
        } else {
            // Connection-mode socket.
            sendmsg::<SockaddrStorage>(
                fd.as_raw_fd(),
                &io_slices,
                &control_messages,
                msgflags,
                None,
            )
        };

        // Remove invalidation record unless interrupted.
        if is_blocking {
            request
                .cache
                .del_sys_block(req.id, matches!(result, Err(Errno::EINTR)))?;
        }

        // Send SIGPIPE for EPIPE unless MSG_NOSIGNAL is set.
        mmsg.msg_len = match result {
            Ok(n) => n.try_into().or(Err(Errno::EINVAL))?,
            Err(Errno::EPIPE) if must_signal => {
                request.pidfd_kill(libc::SIGPIPE)?;
                return Err(Errno::EPIPE);
            }
            Err(errno) => return Err(errno),
        };
    }

    // Write back mmsghdr structures to remote process memory
    let mut buf: Zeroizing<Vec<u8>> = Zeroizing::new(Vec::new());
    if is32 {
        for mmsghdr in &mmsghdrs {
            let mmsghdr32: crate::compat::mmsghdr32 = (*mmsghdr).into();
            // SAFETY: Convert each mmsghdr (or mmsghdr32 within the
            // conversion logic) back to its byte representation.
            let bytes: [u8; size_of::<crate::compat::mmsghdr32>()] =
                unsafe { std::mem::transmute(mmsghdr32) };
            buf.try_reserve(bytes.len()).or(Err(Errno::ENOMEM))?;
            buf.extend_from_slice(&bytes);
        }
    } else {
        for mmsghdr in &mmsghdrs {
            // SAFETY: See above.
            let bytes: [u8; size_of::<crate::compat::mmsghdr>()] =
                unsafe { std::mem::transmute(*mmsghdr) };
            buf.try_reserve(bytes.len()).or(Err(Errno::ENOMEM))?;
            buf.extend_from_slice(&bytes);
        }
    }
    request.write_mem(&buf, addr)?;

    // FIXME: We do not handle partial success.
    #[expect(clippy::cast_possible_wrap)]
    Ok(request.return_syscall(mmsghdrs.len() as i64))
}

/// Parses a raw control message (ancillary data) buffer into typed values.
///
/// `cmsg_buf` is a byte buffer holding a sequence of control messages. The
/// header layout is the 32-bit compat `cmsghdr32` when the architecture of
/// `request` is 32-bit and the native `cmsghdr` otherwise. The buffer is
/// treated as untrusted: every read is bounds-checked and no alignment is
/// assumed.
///
/// On success returns one vector per supported control message kind, in this
/// order: `SCM_RIGHTS` descriptors (one inner vector per message, each
/// descriptor resolved with `request.get_fd()`), `SCM_CREDENTIALS`,
/// `ALG_SET_IV`, `ALG_SET_OP`, `ALG_SET_AEAD_ASSOCLEN`, `UDP_SEGMENT`,
/// `IP_PKTINFO`, `IPV6_PKTINFO`, `SO_RXQ_OVFL`, `SCM_TXTIME`, `IP_TOS` and
/// `IPV6_TCLASS`.
///
/// Trailing bytes that are too short to hold another header end the walk,
/// like the kernel's `CMSG_NXTHDR` iteration does.
///
/// # Errors
///
/// - `EINVAL`: a header length is shorter than a header or reaches past the
///   buffer, a payload has the wrong size for its type, or the message
///   level/type pair is not supported.
/// - `EPERM`: `SCM_CREDENTIALS` names a pid that is neither the requesting
///   process nor a process for which `is_same_vm` reports the same VM.
/// - `ENOMEM`: allocating the descriptor vector failed.
/// - Any error returned by `request.get_fd()`.
#[expect(clippy::type_complexity)]
fn parse_control_messages(
    request: &UNotifyEventRequest,
    cmsg_buf: &[u8],
) -> Result<
    (
        Vec<Vec<OwnedFd>>,
        Vec<UnixCredentials>,
        Vec<Zeroizing<Vec<u8>>>, // ivs
        Vec<libc::c_int>,        // ops
        Vec<u32>,                // aead_assoclens
        Vec<u16>,                // udp_gso_segments
        Vec<libc::in_pktinfo>,
        Vec<libc::in6_pktinfo>,
        Vec<u32>, // rxq_ovfls
        Vec<u64>, // tx_times
        Vec<u8>,  // ipv4 tos
        Vec<i32>, // ipv6 tclass
    ),
    Errno,
> {
    let mut control_fds = Vec::new();
    let mut control_creds = Vec::new();
    let mut control_ivs = Vec::new();
    let mut control_ops = Vec::new();
    let mut control_aead_assoclens = Vec::new();
    let mut control_udp_gso_segments = Vec::new();
    let mut control_ipv4_packet_infos = Vec::new();
    let mut control_ipv6_packet_infos = Vec::new();
    let mut control_rxq_ovfls = Vec::new();
    let mut control_tx_times = Vec::new();
    let mut control_ipv4_toses = Vec::new();
    let mut control_ipv6_tclasses = Vec::new();

    let mut offset = 0;
    let req = request.scmpreq;
    let is32 = scmp_arch_bits(req.data.arch) == 32;
    while offset < cmsg_buf.len() {
        let remaining = &cmsg_buf[offset..];

        // Decode the header. `cmsg_len0` is the length of a message with an
        // empty payload, i.e. the offset of the payload within the message.
        let (cmsg_header, cmsg_len0): (crate::compat::cmsghdr, usize) = if is32 {
            if remaining.len() < size_of::<crate::compat::cmsghdr32>() {
                // Not enough room for another header: end of messages.
                break;
            }
            // SAFETY: `remaining` holds at least `size_of::<cmsghdr32>()`
            // readable bytes (checked above), and `read_unaligned` places
            // no alignment requirement on the source pointer.
            let cmsg_header_32: crate::compat::cmsghdr32 = unsafe {
                std::ptr::read_unaligned(remaining.as_ptr() as *const crate::compat::cmsghdr32)
            };
            (cmsg_header_32.into(), cmsg_len_32(0))
        } else {
            if remaining.len() < size_of::<crate::compat::cmsghdr>() {
                // Not enough room for another header: end of messages.
                break;
            }
            (
                // SAFETY: `remaining` holds at least `size_of::<cmsghdr>()`
                // readable bytes (checked above), and `read_unaligned`
                // places no alignment requirement on the source pointer.
                unsafe {
                    std::ptr::read_unaligned(remaining.as_ptr() as *const crate::compat::cmsghdr)
                },
                // SAFETY: `CMSG_LEN` only performs arithmetic on its argument.
                unsafe { libc::CMSG_LEN(0) } as usize,
            )
        };
        if cmsg_header.cmsg_len < cmsg_len0 {
            return Err(Errno::EINVAL); // Invalid header length
        }
        #[expect(clippy::useless_conversion)]
        let data_len: usize = cmsg_header.cmsg_len.try_into().or(Err(Errno::EINVAL))?;
        let data_len = data_len.checked_sub(cmsg_len0).ok_or(Errno::EINVAL)?;

        let data_off = offset.checked_add(cmsg_len0).ok_or(Errno::EINVAL)?;
        let data_end = data_off.checked_add(data_len).ok_or(Errno::EINVAL)?;
        if data_end > cmsg_buf.len() {
            return Err(Errno::EINVAL); // Data goes beyond buffer.
        }
        // `data` is exactly `data_len` bytes long from here on.
        let data = &cmsg_buf[data_off..data_end];

        match (cmsg_header.cmsg_level, cmsg_header.cmsg_type) {
            (libc::SOL_SOCKET, libc::SCM_RIGHTS) => {
                // The payload is an array of descriptors. A trailing partial
                // descriptor is ignored, which matches the truncating
                // division the kernel uses to count the descriptors.
                let raw_fds = data.chunks_exact(size_of::<RawFd>());

                let mut fds = Vec::new();
                fds.try_reserve(raw_fds.len()).or(Err(Errno::ENOMEM))?;

                for chunk in raw_fds {
                    let fd = RawFd::from_ne_bytes(chunk.try_into().or(Err(Errno::EINVAL))?);
                    let fd = request.get_fd(fd)?;
                    fds.push(fd);
                }
                control_fds.push(fds);
            }
            (libc::SOL_SOCKET, libc::SCM_CREDENTIALS) => {
                // The payload must be exactly one `struct ucred`.
                if data_len != size_of::<libc::ucred>() {
                    return Err(Errno::EINVAL); // Data length mismatch
                }
                // SAFETY: `data` is exactly `size_of::<libc::ucred>()` bytes
                // long (checked above), `ucred` consists solely of integers
                // so every bit pattern is valid, and `read_unaligned` places
                // no alignment requirement on the source pointer.
                let mut creds =
                    unsafe { std::ptr::read_unaligned(data.as_ptr() as *const libc::ucred) };
                // SAFETY: The sender must specify its own pid (unless it has the capability
                // CAP_SYS_ADMIN, in which case the PID of any existing process may be specified.)
                //
                // Quoting unix(7):
                // EPERM: The sender passed invalid credentials in the struct ucred.
                // `is_same_vm` check is necessary to make dbus tests of vala work, see: #78
                if creds.pid != request.scmpreq.pid().as_raw()
                    && !is_same_vm(Pid::from_raw(creds.pid), request.scmpreq.pid())
                        .unwrap_or(false)
                {
                    return Err(Errno::EPERM);
                }
                // The message is sent on behalf of the sandbox process by
                // this process, so substitute our own pid.
                creds.pid = Pid::this().as_raw();
                let unix_creds = UnixCredentials::from(creds);
                control_creds.push(unix_creds); // Keep a ref to the UnixCredentials.
            }
            (libc::SOL_ALG, libc::ALG_SET_IV) => {
                // The payload is a `u32` IV length followed by the IV itself.
                let len_bytes: [u8; 4] = data
                    .get(..4)
                    .and_then(|bytes| bytes.try_into().ok())
                    .ok_or(Errno::EINVAL)?; // Data length mismatch
                let iv_end = (u32::from_ne_bytes(len_bytes) as usize)
                    .checked_add(4)
                    .ok_or(Errno::EINVAL)?;
                // Extract the IV while respecting the indicated size;
                // a size that reaches past the payload is rejected.
                let iv = data.get(4..iv_end).ok_or(Errno::EINVAL)?;
                control_ivs.push(Zeroizing::new(iv.to_vec())); // Store the IV data.
            }
            (libc::SOL_ALG, libc::ALG_SET_OP) => {
                // The payload is exactly one `c_int` operation code;
                // the array conversion fails on any other length.
                let op = libc::c_int::from_ne_bytes(data.try_into().or(Err(Errno::EINVAL))?);
                control_ops.push(op); // Store the operation code
            }
            (libc::SOL_ALG, libc::ALG_SET_AEAD_ASSOCLEN) => {
                // The payload is exactly one `u32`.
                let assoclen = u32::from_ne_bytes(data.try_into().or(Err(Errno::EINVAL))?);
                control_aead_assoclens.push(assoclen); // Store the AEAD assoclen
            }
            (libc::SOL_UDP, libc::UDP_SEGMENT) => {
                // The payload is exactly one `u16`.
                let gso_segments = u16::from_ne_bytes(data.try_into().or(Err(Errno::EINVAL))?);
                control_udp_gso_segments.push(gso_segments); // Store the GSO segment count
            }
            (libc::IPPROTO_IP, libc::IP_PKTINFO) => {
                if data_len != size_of::<libc::in_pktinfo>() {
                    return Err(Errno::EINVAL); // Data length mismatch
                }
                // SAFETY: `data` is exactly `size_of::<libc::in_pktinfo>()`
                // bytes long (checked above), the struct consists solely of
                // integers so every bit pattern is valid, and
                // `read_unaligned` places no alignment requirement on the
                // source pointer.
                let pktinfo =
                    unsafe { std::ptr::read_unaligned(data.as_ptr() as *const libc::in_pktinfo) };
                control_ipv4_packet_infos.push(pktinfo); // Store the IPv4 packet info
            }
            (libc::IPPROTO_IPV6, libc::IPV6_PKTINFO) => {
                if data_len != size_of::<libc::in6_pktinfo>() {
                    return Err(Errno::EINVAL); // Data length mismatch
                }
                // SAFETY: `data` is exactly `size_of::<libc::in6_pktinfo>()`
                // bytes long (checked above), the struct consists solely of
                // integers so every bit pattern is valid, and
                // `read_unaligned` places no alignment requirement on the
                // source pointer.
                let pktinfo =
                    unsafe { std::ptr::read_unaligned(data.as_ptr() as *const libc::in6_pktinfo) };
                control_ipv6_packet_infos.push(pktinfo); // Store the IPv6 packet info
            }
            (libc::SOL_SOCKET, libc::SO_RXQ_OVFL) => {
                // The payload is exactly one `u32`.
                let rxq_ovfl = u32::from_ne_bytes(data.try_into().or(Err(Errno::EINVAL))?);
                control_rxq_ovfls.push(rxq_ovfl); // Store the Rx queue overflow count
            }
            (libc::SOL_SOCKET, libc::SCM_TXTIME) => {
                // The payload is exactly one `u64`.
                let tx_time = u64::from_ne_bytes(data.try_into().or(Err(Errno::EINVAL))?);
                control_tx_times.push(tx_time); // Store the Tx time
            }
            (libc::IPPROTO_IP, libc::IP_TOS) => {
                // Linux accepts IP_TOS ancillary data either as a single
                // byte or as a C int whose value is within 0..=255.
                let val = match *data {
                    [tos] => tos,
                    [b0, b1, b2, b3] => {
                        u8::try_from(libc::c_int::from_ne_bytes([b0, b1, b2, b3]))
                            .or(Err(Errno::EINVAL))?
                    }
                    _ => return Err(Errno::EINVAL),
                };
                control_ipv4_toses.push(val);
            }
            (libc::IPPROTO_IPV6, libc::IPV6_TCLASS) => {
                // The payload is exactly one `i32`.
                let val = i32::from_ne_bytes(data.try_into().or(Err(Errno::EINVAL))?);
                control_ipv6_tclasses.push(val);
            }
            _ => return Err(Errno::EINVAL),
        }

        // Advance to the next header: the space taken by a message is its
        // header plus payload, padded to the control message alignment of
        // the sandbox process' ABI.
        let payload_len = u32::try_from(data_len).or(Err(Errno::EINVAL))?;
        let space = if is32 {
            cmsg_space_32(payload_len)
        } else {
            // SAFETY: `CMSG_SPACE` only performs arithmetic on its argument.
            unsafe { libc::CMSG_SPACE(payload_len) as usize }
        };
        offset = offset.checked_add(space).ok_or(Errno::EINVAL)?;
    }

    Ok((
        control_fds,
        control_creds,
        control_ivs,
        control_ops,
        control_aead_assoclens,
        control_udp_gso_segments,
        control_ipv4_packet_infos,
        control_ipv6_packet_infos,
        control_rxq_ovfls,
        control_tx_times,
        control_ipv4_toses,
        control_ipv6_tclasses,
    ))
}
