diff --git a/Cargo.toml b/Cargo.toml index 1393a6d5c8..137ccbdd7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,12 +8,14 @@ lto = true panic = 'abort' [dependencies] +io_jail = { path = "io_jail" } kvm = { path = "kvm" } sys_util = { path = "sys_util" } x86_64 = { path = "x86_64" } kernel_loader = { path = "kernel_loader" } libc = "0.2.21" byteorder = "1" +syscall_defines = { path = "syscall_defines" } [dependencies.clap] version = "*" diff --git a/block_device.policy b/block_device.policy new file mode 100644 index 0000000000..40097a5292 --- /dev/null +++ b/block_device.policy @@ -0,0 +1,20 @@ +close: 1 +exit_group: 1 +futex: 1 +lseek: 1 +# Disallow mmap with PROT_EXEC set. The syntax here doesn't allow bit +# negation, thus the manually negated mask constant. +mmap: arg2 in 0xfffffffb +mprotect: arg2 in 0xfffffffb +munmap: 1 +read: 1 +recvfrom: 1 +sched_getaffinity: 1 +set_robust_list: 1 +sigaltstack: 1 +# Disallow clones other than new threads. +clone: arg0 & 0x00010000 +write: 1 +eventfd2: 1 +dup: 1 +poll: 1 diff --git a/src/hw/proxy.rs b/src/hw/proxy.rs index 628c0af6e7..efd9734715 100644 --- a/src/hw/proxy.rs +++ b/src/hw/proxy.rs @@ -9,11 +9,13 @@ use std::io::{Error, Result}; use std::os::unix::net::UnixDatagram; use std::time::Duration; -use libc::fork; +use libc; +use libc::pid_t; use byteorder::{NativeEndian, ByteOrder}; use hw::BusDevice; +use syscall_defines::linux::LinuxSyscall::SYS_clone; const SOCKET_TIMEOUT_MS: u64 = 2000; const MSG_SIZE: usize = 24; @@ -78,6 +80,20 @@ fn child_proc(sock: UnixDatagram, device: &mut BusDevice) -> ! { process::exit(0); } +unsafe fn do_clone() -> Result { + // Forking is unsafe, this function must be unsafe as there is no way to + // guarantee safety without more context about the state of the program. 
+ let pid = libc::syscall(SYS_clone as i64, + libc::CLONE_NEWUSER | libc::CLONE_NEWPID | + libc::SIGCHLD as i32, + 0); + if pid < 0 { + Err(Error::last_os_error()) + } else { + Ok(pid as pid_t) + } +} + /// Wraps an inner `hw::BusDevice` that is run inside a child process via fork. /// /// Because forks are very unfriendly to destructors and all memory mappings and file descriptors @@ -91,15 +107,19 @@ impl ProxyDevice { /// /// The forked process will automatically be terminated when this is dropped, so be sure to keep /// a reference. - pub fn new(mut device: D) -> Result { + /// `post_clone_cb` - Called after forking the child process, passed the + /// child end of the pipe that must be kept open. + pub fn new(mut device: D, post_clone_cb: F) -> Result + where F: FnOnce(&UnixDatagram) { let (child_sock, parent_sock) = UnixDatagram::pair()?; - let ret = unsafe { fork() }; + // Forking a new process is unsafe, we must ensure no resources required + // by the other side are freed after the two processes start. + let ret = unsafe { do_clone()? }; if ret == 0 { + post_clone_cb(&child_sock); // ! 
Never returns child_proc(child_sock, &mut device); - } else if ret == -1 { - return Err(Error::last_os_error()); } let mut buf = [0; MSG_SIZE]; diff --git a/src/main.rs b/src/main.rs index 1e1d60150f..603369db18 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,22 +6,27 @@ extern crate clap; extern crate libc; +extern crate io_jail; extern crate kvm; extern crate x86_64; extern crate kernel_loader; extern crate byteorder; #[macro_use] extern crate sys_util; +extern crate syscall_defines; use std::ffi::{CString, CStr}; use std::fmt; use std::fs::File; use std::io::{stdin, stdout}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::path::Path; use std::string::String; use std::sync::{Arc, Mutex, Barrier}; use std::thread::{spawn, JoinHandle}; use clap::{Arg, App, SubCommand}; +use io_jail::Minijail; use kvm::*; use sys_util::{GuestAddress, GuestMemory, EventFd, Terminal, Poller, Pollable, register_signal_handler, Killable}; @@ -36,6 +41,8 @@ enum Error { Socket(std::io::Error), Disk(std::io::Error), BlockDeviceNew(sys_util::Error), + BlockDeviceJail(io_jail::Error), + BlockDevicePivotRoot(io_jail::Error), Cmdline(kernel_cmdline::Error), ProxyDeviceCreation(std::io::Error), RegisterIoevent(sys_util::Error), @@ -73,6 +80,10 @@ impl fmt::Display for Error { &Error::Socket(ref e) => write!(f, "failed to create socket: {}", e), &Error::Disk(ref e) => write!(f, "failed to load disk image: {}", e), &Error::BlockDeviceNew(ref e) => write!(f, "failed to create block device: {:?}", e), + &Error::BlockDeviceJail(ref e) => write!(f, "failed to jail block device: {:?}", e), + &Error::BlockDevicePivotRoot(ref e) => { + write!(f, "failed to pivot root block device: {:?}", e) + } &Error::Cmdline(ref e) => write!(f, "the given kernel command line was invalid: {}", e), &Error::ProxyDeviceCreation(ref e) => write!(f, "failed to create proxy device: {}", e), &Error::RegisterIoevent(ref e) => write!(f, "error registering ioevent: {:?}", e), @@ -110,6 +121,26 @@ const KERNEL_START_OFFSET: 
usize = 0x200000; const CMDLINE_OFFSET: usize = 0x20000; const CMDLINE_MAX_SIZE: usize = KERNEL_START_OFFSET - CMDLINE_OFFSET; +fn create_block_device_jail() -> Result { + // All child jails run in a new user namespace without any users mapped, + // they run as nobody unless otherwise configured. + let mut j = Minijail::new().map_err(|e| Error::BlockDeviceJail(e))?; + // Don't need any capabilities. + j.use_caps(0); + // Create a new mount namespace with an empty root FS. + j.namespace_vfs(); + j.enter_pivot_root(Path::new("/run/asdf")) + .map_err(|e| Error::BlockDevicePivotRoot(e))?; + // Run in an empty network namespace. + j.namespace_net(); + // Apply the block device seccomp policy. + j.no_new_privs(); + j.parse_seccomp_filters(Path::new("block_device.policy")) + .map_err(|e| Error::BlockDeviceJail(e))?; + j.use_seccomp_filter(); + Ok(j) +} + fn run_config(cfg: Config) -> Result<()> { let socket = if let Some(ref socket_path) = cfg.socket_path { Some(ControlSocketRecv::new(socket_path) @@ -136,7 +167,11 @@ fn run_config(cfg: Config) -> Result<()> { let mut irq: u32 = 5; if let Some(ref disk_path) = cfg.disk_path { + // List of FDs to keep open in the child after it forks. 
+ let mut keep_fds: Vec = Vec::new(); + let disk_image = File::open(disk_path).map_err(|e| Error::Disk(e))?; + keep_fds.push(disk_image.as_raw_fd()); let block_box = Box::new(hw::virtio::Block::new(disk_image) .map_err(|e| Error::BlockDeviceNew(e))?); @@ -144,16 +179,27 @@ fn run_config(cfg: Config) -> Result<()> { for (i, queue_evt) in block_mmio.queue_evts().iter().enumerate() { let io_addr = IoeventAddress::Mmio(mmio_base + hw::virtio::NOITFY_REG_OFFSET as u64); vm_requests.push(VmRequest::RegisterIoevent(queue_evt.try_clone()?, io_addr, i as u32)); + keep_fds.push(queue_evt.as_raw_fd()); } if let Some(interrupt_evt) = block_mmio.interrupt_evt() { vm_requests.push(VmRequest::RegisterIrqfd(interrupt_evt.try_clone()?, irq)); + keep_fds.push(interrupt_evt.as_raw_fd()); } if cfg.multiprocess { - bus.insert(Arc::new(Mutex::new(hw::ProxyDevice::new(block_mmio).unwrap())), - mmio_base, - mmio_len) + let jail = create_block_device_jail()?; + let proxy_dev = hw::ProxyDevice::new(block_mmio, move |keep_pipe| { + keep_fds.push(keep_pipe.as_raw_fd()); + // Need to panic here as there isn't a way to recover from a + // partly-jailed process. + unsafe { + // This is OK as we have whitelisted all the FDs we need open. + jail.enter(Some(&keep_fds)).unwrap(); + } + }) + .map_err(|e| Error::ProxyDeviceCreation(e))?; + bus.insert(Arc::new(Mutex::new(proxy_dev)), mmio_base, mmio_len) .unwrap(); } else { bus.insert(Arc::new(Mutex::new(block_mmio)), mmio_base, mmio_len)