mirror of
https://chromium.googlesource.com/crosvm/crosvm
synced 2024-12-26 13:10:56 +00:00
crosvm: Put block device process in a minijail
Run with the new seccomp filter and drop all capabilities. In addition enter a new user, mount, network, and ipc namespace. Leave the mount namespace empty after pivot-rooting to an empty directory. Change-Id: Iee583cf260ede8ca13f005836684eb80c2c3ac3e Signed-off-by: Dylan Reid <dgreid@chromium.org> Reviewed-on: https://chromium-review.googlesource.com/515603
This commit is contained in:
parent
f2164a18bf
commit
61edbbff53
4 changed files with 96 additions and 8 deletions
|
@ -8,12 +8,14 @@ lto = true
|
|||
panic = 'abort'
|
||||
|
||||
[dependencies]
|
||||
io_jail = { path = "io_jail" }
|
||||
kvm = { path = "kvm" }
|
||||
sys_util = { path = "sys_util" }
|
||||
x86_64 = { path = "x86_64" }
|
||||
kernel_loader = { path = "kernel_loader" }
|
||||
libc = "0.2.21"
|
||||
byteorder = "1"
|
||||
syscall_defines = { path = "syscall_defines" }
|
||||
|
||||
[dependencies.clap]
|
||||
version = "*"
|
||||
|
|
20
block_device.policy
Normal file
20
block_device.policy
Normal file
|
@ -0,0 +1,20 @@
|
|||
close: 1
|
||||
exit_group: 1
|
||||
futex: 1
|
||||
lseek: 1
|
||||
# Disallow mmap with PROT_EXEC set. The syntax here doesn't allow bit
|
||||
# negation, thus the manually negated mask constant.
|
||||
mmap: arg2 in 0xfffffffb
|
||||
mprotect: arg2 in 0xfffffffb
|
||||
munmap: 1
|
||||
read: 1
|
||||
recvfrom: 1
|
||||
sched_getaffinity: 1
|
||||
set_robust_list: 1
|
||||
sigaltstack: 1
|
||||
# Disallow clones other than new threads.
|
||||
clone: arg0 & 0x00010000
|
||||
write: 1
|
||||
eventfd2: 1
|
||||
dup: 1
|
||||
poll: 1
|
|
@ -9,11 +9,13 @@ use std::io::{Error, Result};
|
|||
use std::os::unix::net::UnixDatagram;
|
||||
use std::time::Duration;
|
||||
|
||||
use libc::fork;
|
||||
use libc;
|
||||
use libc::pid_t;
|
||||
|
||||
use byteorder::{NativeEndian, ByteOrder};
|
||||
|
||||
use hw::BusDevice;
|
||||
use syscall_defines::linux::LinuxSyscall::SYS_clone;
|
||||
|
||||
const SOCKET_TIMEOUT_MS: u64 = 2000;
|
||||
const MSG_SIZE: usize = 24;
|
||||
|
@ -78,6 +80,20 @@ fn child_proc(sock: UnixDatagram, device: &mut BusDevice) -> ! {
|
|||
process::exit(0);
|
||||
}
|
||||
|
||||
unsafe fn do_clone() -> Result<pid_t> {
|
||||
// Forking is unsafe, this function must be unsafe as there is no way to
|
||||
// guarantee saftey without more context about the state of the program.
|
||||
let pid = libc::syscall(SYS_clone as i64,
|
||||
libc::CLONE_NEWUSER | libc::CLONE_NEWPID |
|
||||
libc::SIGCHLD as i32,
|
||||
0);
|
||||
if pid < 0 {
|
||||
Err(Error::last_os_error())
|
||||
} else {
|
||||
Ok(pid as pid_t)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps an inner `hw::BusDevice` that is run inside a child process via fork.
|
||||
///
|
||||
/// Because forks are very unfriendly to destructors and all memory mappings and file descriptors
|
||||
|
@ -91,15 +107,19 @@ impl ProxyDevice {
|
|||
///
|
||||
/// The forked process will automatically be terminated when this is dropped, so be sure to keep
|
||||
/// a reference.
|
||||
pub fn new<D: BusDevice>(mut device: D) -> Result<ProxyDevice> {
|
||||
/// `post_clone_cb` - Called after forking the child process, passed the
|
||||
/// child end of the pipe that must be kept open.
|
||||
pub fn new<D: BusDevice, F>(mut device: D, post_clone_cb: F) -> Result<ProxyDevice>
|
||||
where F: FnOnce(&UnixDatagram) {
|
||||
let (child_sock, parent_sock) = UnixDatagram::pair()?;
|
||||
|
||||
let ret = unsafe { fork() };
|
||||
// Forking a new process is unsafe, we must ensure no resources required
|
||||
// by the other side are freed after the two processes start.
|
||||
let ret = unsafe { do_clone()? };
|
||||
if ret == 0 {
|
||||
post_clone_cb(&child_sock);
|
||||
// ! Never returns
|
||||
child_proc(child_sock, &mut device);
|
||||
} else if ret == -1 {
|
||||
return Err(Error::last_os_error());
|
||||
}
|
||||
|
||||
let mut buf = [0; MSG_SIZE];
|
||||
|
|
52
src/main.rs
52
src/main.rs
|
@ -6,22 +6,27 @@
|
|||
|
||||
extern crate clap;
|
||||
extern crate libc;
|
||||
extern crate io_jail;
|
||||
extern crate kvm;
|
||||
extern crate x86_64;
|
||||
extern crate kernel_loader;
|
||||
extern crate byteorder;
|
||||
#[macro_use] extern crate sys_util;
|
||||
extern crate syscall_defines;
|
||||
|
||||
use std::ffi::{CString, CStr};
|
||||
use std::fmt;
|
||||
use std::fs::File;
|
||||
use std::io::{stdin, stdout};
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use std::path::Path;
|
||||
use std::string::String;
|
||||
use std::sync::{Arc, Mutex, Barrier};
|
||||
use std::thread::{spawn, JoinHandle};
|
||||
|
||||
use clap::{Arg, App, SubCommand};
|
||||
|
||||
use io_jail::Minijail;
|
||||
use kvm::*;
|
||||
use sys_util::{GuestAddress, GuestMemory, EventFd, Terminal, Poller, Pollable,
|
||||
register_signal_handler, Killable};
|
||||
|
@ -36,6 +41,8 @@ enum Error {
|
|||
Socket(std::io::Error),
|
||||
Disk(std::io::Error),
|
||||
BlockDeviceNew(sys_util::Error),
|
||||
BlockDeviceJail(io_jail::Error),
|
||||
BlockDevicePivotRoot(io_jail::Error),
|
||||
Cmdline(kernel_cmdline::Error),
|
||||
ProxyDeviceCreation(std::io::Error),
|
||||
RegisterIoevent(sys_util::Error),
|
||||
|
@ -73,6 +80,10 @@ impl fmt::Display for Error {
|
|||
&Error::Socket(ref e) => write!(f, "failed to create socket: {}", e),
|
||||
&Error::Disk(ref e) => write!(f, "failed to load disk image: {}", e),
|
||||
&Error::BlockDeviceNew(ref e) => write!(f, "failed to create block device: {:?}", e),
|
||||
&Error::BlockDeviceJail(ref e) => write!(f, "failed to jail block device: {:?}", e),
|
||||
&Error::BlockDevicePivotRoot(ref e) => {
|
||||
write!(f, "failed to pivot root block device: {:?}", e)
|
||||
}
|
||||
&Error::Cmdline(ref e) => write!(f, "the given kernel command line was invalid: {}", e),
|
||||
&Error::ProxyDeviceCreation(ref e) => write!(f, "failed to create proxy device: {}", e),
|
||||
&Error::RegisterIoevent(ref e) => write!(f, "error registering ioevent: {:?}", e),
|
||||
|
@ -110,6 +121,26 @@ const KERNEL_START_OFFSET: usize = 0x200000;
|
|||
const CMDLINE_OFFSET: usize = 0x20000;
|
||||
const CMDLINE_MAX_SIZE: usize = KERNEL_START_OFFSET - CMDLINE_OFFSET;
|
||||
|
||||
fn create_block_device_jail() -> Result<Minijail> {
|
||||
// All child jails run in a new user namespace without any users mapped,
|
||||
// they run as nobody unless otherwise configured.
|
||||
let mut j = Minijail::new().map_err(|e| Error::BlockDeviceJail(e))?;
|
||||
// Don't need any capabilities.
|
||||
j.use_caps(0);
|
||||
// Create a new mount namespace with an empty root FS.
|
||||
j.namespace_vfs();
|
||||
j.enter_pivot_root(Path::new("/run/asdf"))
|
||||
.map_err(|e| Error::BlockDevicePivotRoot(e))?;
|
||||
// Run in an empty network namespace.
|
||||
j.namespace_net();
|
||||
// Apply the block device seccomp policy.
|
||||
j.no_new_privs();
|
||||
j.parse_seccomp_filters(Path::new("block_device.policy"))
|
||||
.map_err(|e| Error::BlockDeviceJail(e))?;
|
||||
j.use_seccomp_filter();
|
||||
Ok(j)
|
||||
}
|
||||
|
||||
fn run_config(cfg: Config) -> Result<()> {
|
||||
let socket = if let Some(ref socket_path) = cfg.socket_path {
|
||||
Some(ControlSocketRecv::new(socket_path)
|
||||
|
@ -136,7 +167,11 @@ fn run_config(cfg: Config) -> Result<()> {
|
|||
let mut irq: u32 = 5;
|
||||
|
||||
if let Some(ref disk_path) = cfg.disk_path {
|
||||
// List of FDs to keep open in the child after it forks.
|
||||
let mut keep_fds: Vec<RawFd> = Vec::new();
|
||||
|
||||
let disk_image = File::open(disk_path).map_err(|e| Error::Disk(e))?;
|
||||
keep_fds.push(disk_image.as_raw_fd());
|
||||
|
||||
let block_box = Box::new(hw::virtio::Block::new(disk_image)
|
||||
.map_err(|e| Error::BlockDeviceNew(e))?);
|
||||
|
@ -144,16 +179,27 @@ fn run_config(cfg: Config) -> Result<()> {
|
|||
for (i, queue_evt) in block_mmio.queue_evts().iter().enumerate() {
|
||||
let io_addr = IoeventAddress::Mmio(mmio_base + hw::virtio::NOITFY_REG_OFFSET as u64);
|
||||
vm_requests.push(VmRequest::RegisterIoevent(queue_evt.try_clone()?, io_addr, i as u32));
|
||||
keep_fds.push(queue_evt.as_raw_fd());
|
||||
}
|
||||
|
||||
if let Some(interrupt_evt) = block_mmio.interrupt_evt() {
|
||||
vm_requests.push(VmRequest::RegisterIrqfd(interrupt_evt.try_clone()?, irq));
|
||||
keep_fds.push(interrupt_evt.as_raw_fd());
|
||||
}
|
||||
|
||||
if cfg.multiprocess {
|
||||
bus.insert(Arc::new(Mutex::new(hw::ProxyDevice::new(block_mmio).unwrap())),
|
||||
mmio_base,
|
||||
mmio_len)
|
||||
let jail = create_block_device_jail()?;
|
||||
let proxy_dev = hw::ProxyDevice::new(block_mmio, move |keep_pipe| {
|
||||
keep_fds.push(keep_pipe.as_raw_fd());
|
||||
// Need to panic here as there isn't a way to recover from a
|
||||
// partly-jailed process.
|
||||
unsafe {
|
||||
// This is OK as we have whitelisted all the FDs we need open.
|
||||
jail.enter(Some(&keep_fds)).unwrap();
|
||||
}
|
||||
})
|
||||
.map_err(|e| Error::ProxyDeviceCreation(e))?;
|
||||
bus.insert(Arc::new(Mutex::new(proxy_dev)), mmio_base, mmio_len)
|
||||
.unwrap();
|
||||
} else {
|
||||
bus.insert(Arc::new(Mutex::new(block_mmio)), mmio_base, mmio_len)
|
||||
|
|
Loading…
Reference in a new issue