From 88f9cba448ff7f1cd61c8bf66e34772132a8663f Mon Sep 17 00:00:00 2001 From: Chirantan Ekbote Date: Mon, 28 Aug 2017 09:51:18 -0700 Subject: [PATCH] Implement virtio-vsock Implement the virtual sockets device using vhost subsystem of the host kernel to handle data transfer. BUG=chromium:708267 TEST=build and run maitred in guest VM without issue Change-Id: I35b542c0fc7e0fd9296f7ba3e1dfce60bf524d15 Signed-off-by: Chirantan Ekbote Reviewed-on: https://chromium-review.googlesource.com/638838 Reviewed-by: Stephen Barber --- seccomp/x86_64/vhost_vsock_device.policy | 43 ++++ src/hw/virtio/mod.rs | 1 + src/hw/virtio/vhost/mod.rs | 6 + src/hw/virtio/vhost/vsock.rs | 280 +++++++++++++++++++++++ src/main.rs | 39 ++++ vhost/src/lib.rs | 3 + vhost/src/net.rs | 25 +- vhost/src/vsock.rs | 82 +++++++ virtio_sys/src/lib.rs | 2 + 9 files changed, 465 insertions(+), 16 deletions(-) create mode 100644 seccomp/x86_64/vhost_vsock_device.policy create mode 100644 src/hw/virtio/vhost/vsock.rs create mode 100644 vhost/src/vsock.rs diff --git a/seccomp/x86_64/vhost_vsock_device.policy b/seccomp/x86_64/vhost_vsock_device.policy new file mode 100644 index 0000000000..0310470d62 --- /dev/null +++ b/seccomp/x86_64/vhost_vsock_device.policy @@ -0,0 +1,43 @@ +# Copyright 2017 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +close: 1 +exit_group: 1 +futex: 1 +# Whitelist vhost_vsock ioctls only. 
+# arg1 == VHOST_GET_FEATURES || +# arg1 == VHOST_SET_FEATURES || +# arg1 == VHOST_SET_OWNER || +# arg1 == VHOST_RESET_OWNER || +# arg1 == VHOST_SET_MEM_TABLE || +# arg1 == VHOST_SET_LOG_BASE || +# arg1 == VHOST_SET_LOG_FD || +# arg1 == VHOST_SET_VRING_NUM || +# arg1 == VHOST_SET_VRING_ADDR || +# arg1 == VHOST_SET_VRING_BASE || +# arg1 == VHOST_GET_VRING_BASE || +# arg1 == VHOST_SET_VRING_KICK || +# arg1 == VHOST_SET_VRING_CALL || +# arg1 == VHOST_SET_VRING_ERR || +# arg1 == VHOST_VSOCK_SET_GUEST_CID || +# arg1 == VHOST_VSOCK_SET_RUNNING +ioctl: arg1 == 0x8008af00 || arg1 == 0x4008af00 || arg1 == 0x0000af01 || arg1 == 0x0000af02 || arg1 == 0x4008af03 || arg1 == 0x4008af04 || arg1 == 0x4004af07 || arg1 == 0x4008af10 || arg1 == 0x4028af11 || arg1 == 0x4008af12 || arg1 == 0xc008af12 || arg1 == 0x4008af20 || arg1 == 0x4008af21 || arg1 == 0x4008af22 || arg1 == 0x4008af60 || arg1 == 0x4004af61 +# Disallow mmap with PROT_EXEC set. The syntax here doesn't allow bit +# negation, thus the manually negated mask constant. +mmap: arg2 in 0xfffffffb +mprotect: arg2 in 0xfffffffb +munmap: 1 +poll: 1 +read: 1 +connect: 1 +sendto: 1 +recvfrom: 1 +sched_getaffinity: 1 +set_robust_list: 1 +sigaltstack: 1 +# Disallow clones other than new threads. +# arg0 is flags. Because kernel. 
+clone: arg0 & 0x00010000 +write: 1 +getpid: 1 diff --git a/src/hw/virtio/mod.rs b/src/hw/virtio/mod.rs index 012d7523f6..7c1bf1c404 100644 --- a/src/hw/virtio/mod.rs +++ b/src/hw/virtio/mod.rs @@ -30,6 +30,7 @@ const DEVICE_FAILED: u32 = 0x80; const TYPE_NET: u32 = 1; const TYPE_BLOCK: u32 = 2; const TYPE_RNG: u32 = 4; +const TYPE_VSOCK: u32 = 19; const TYPE_WL: u32 = 30; const INTERRUPT_STATUS_USED_RING: u32 = 0x1; diff --git a/src/hw/virtio/vhost/mod.rs b/src/hw/virtio/vhost/mod.rs index 3ed087429e..1a45c5b23f 100644 --- a/src/hw/virtio/vhost/mod.rs +++ b/src/hw/virtio/vhost/mod.rs @@ -11,9 +11,11 @@ use sys_util::Error as SysError; use vhost::Error as VhostError; mod net; +mod vsock; mod worker; pub use self::net::Net; +pub use self::vsock::Vsock; #[derive(Debug)] pub enum Error { @@ -57,6 +59,10 @@ pub enum Error { VhostSetVringKick(VhostError), /// Net set backend failed. VhostNetSetBackend(VhostError), + /// Failed to set CID for guest. + VhostVsockSetCid(VhostError), + /// Failed to start vhost-vsock driver. + VhostVsockStart(VhostError), /// Failed to create vhost eventfd. VhostIrqCreate(SysError), /// Failed to read vhost eventfd. diff --git a/src/hw/virtio/vhost/vsock.rs b/src/hw/virtio/vhost/vsock.rs new file mode 100644 index 0000000000..a3c24aa30f --- /dev/null +++ b/src/hw/virtio/vhost/vsock.rs @@ -0,0 +1,280 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::thread::spawn; + +use byteorder::{ByteOrder, LittleEndian}; + +use sys_util::{EventFd, GuestMemory}; +use vhost::Vsock as VhostVsockHandle; +use virtio_sys::vhost; + +use super::{Error, Result}; +use super::super::{Queue, VirtioDevice, TYPE_VSOCK}; +use super::worker::Worker; + +const QUEUE_SIZE: u16 = 256; +const NUM_QUEUES: usize = 3; +const QUEUE_SIZES: &'static [u16] = &[QUEUE_SIZE; NUM_QUEUES]; + +pub struct Vsock { + worker_kill_evt: Option, + kill_evt: Option, + vhost_handle: Option, + cid: u64, + interrupt: Option, + avail_features: u64, + acked_features: u64, +} + +impl Vsock { + /// Create a new virtio-vsock device with the given VM cid. + pub fn new(cid: u64, mem: &GuestMemory) -> Result { + let kill_evt = EventFd::new().map_err(Error::CreateKillEventFd)?; + let handle = VhostVsockHandle::new(mem).map_err(Error::VhostOpen)?; + + let avail_features = + 1 << vhost::VIRTIO_F_NOTIFY_ON_EMPTY | 1 << vhost::VIRTIO_RING_F_INDIRECT_DESC | + 1 << vhost::VIRTIO_RING_F_EVENT_IDX | 1 << vhost::VHOST_F_LOG_ALL | + 1 << vhost::VIRTIO_F_ANY_LAYOUT | 1 << vhost::VIRTIO_F_VERSION_1; + + Ok(Vsock { + worker_kill_evt: Some(kill_evt.try_clone().map_err(Error::CloneKillEventFd)?), + kill_evt: Some(kill_evt), + vhost_handle: Some(handle), + cid: cid, + interrupt: Some(EventFd::new().map_err(Error::VhostIrqCreate)?), + avail_features: avail_features, + acked_features: 0, + }) + } + + pub fn new_for_testing(cid: u64, features: u64) -> Vsock { + Vsock { + worker_kill_evt: None, + kill_evt: None, + vhost_handle: None, + cid: cid, + interrupt: None, + avail_features: features, + acked_features: 0, + } + } + + pub fn acked_features(&self) -> u64 { + self.acked_features + } +} + +impl Drop for Vsock { + fn drop(&mut self) { + // Only kill the child if it claimed its eventfd. 
+ if self.worker_kill_evt.is_none() { + if let Some(ref kill_evt) = self.kill_evt { + // Ignore the result because there is nothing we can do about it. + let _ = kill_evt.write(1); + } + } + } +} + +impl VirtioDevice for Vsock { + fn keep_fds(&self) -> Vec { + let mut keep_fds = Vec::new(); + + if let Some(ref handle) = self.vhost_handle { + keep_fds.push(handle.as_raw_fd()); + } + + if let Some(ref interrupt) = self.interrupt { + keep_fds.push(interrupt.as_raw_fd()); + } + + if let Some(ref worker_kill_evt) = self.worker_kill_evt { + keep_fds.push(worker_kill_evt.as_raw_fd()); + } + + keep_fds + } + + fn device_type(&self) -> u32 { + TYPE_VSOCK + } + + fn queue_max_sizes(&self) -> &[u16] { + QUEUE_SIZES + } + + fn features(&self, page: u32) -> u32 { + match page { + // Get the lower 32-bits of the features bitfield. + 0 => self.avail_features as u32, + // Get the upper 32-bits of the features bitfield. + 1 => (self.avail_features >> 32) as u32, + _ => { + warn!( + "vsock: virtio-vsock got request for features page: {}", + page + ); + 0u32 + }, + } + } + + fn read_config(&self, offset: u64, data: &mut [u8]) { + match offset { + 0 if data.len() == 8 => LittleEndian::write_u64(data, self.cid), + 0 if data.len() == 4 => LittleEndian::write_u32(data, (self.cid & 0xffffffff) as u32), + 4 if data.len() == 4 => { + LittleEndian::write_u32(data, ((self.cid >> 32) & 0xffffffff) as u32) + }, + _ => warn!( + "vsock: virtio-vsock received invalid read request of {} bytes at offset {}", + data.len(), + offset + ), + } + } + + fn ack_features(&mut self, page: u32, value: u32) { + let mut v = match page { + 0 => value as u64, + 1 => (value as u64) << 32, + _ => { + warn!( + "vsock: virtio-vsock device cannot ack unknown feature page: {}", + page + ); + 0u64 + }, + }; + + // Check if the guest is ACK'ing a feature that we didn't claim to have. 
+ let unrequested_features = v & !self.avail_features; + if unrequested_features != 0 { + warn!("vsock: virtio-vsock got unknown feature ack: {:x}", v); + + // Don't count these features as acked. + v &= !unrequested_features; + } + self.acked_features |= v; + } + + fn activate( + &mut self, + _: GuestMemory, + interrupt_evt: EventFd, + status: Arc, + queues: Vec, + queue_evts: Vec, + ) { + if queues.len() != NUM_QUEUES || queue_evts.len() != NUM_QUEUES { + error!("vsock: expected {} queues, got {}", NUM_QUEUES, queues.len()); + return; + } + + if let Some(vhost_handle) = self.vhost_handle.take() { + if let Some(interrupt) = self.interrupt.take() { + if let Some(kill_evt) = self.worker_kill_evt.take() { + let acked_features = self.acked_features; + let cid = self.cid; + spawn(move || { + // The third vq is an event-only vq that is not handled by the vhost + // subsystem (but still needs to exist). Split it off here. + let vhost_queues = queues[..2].to_vec(); + let mut worker = Worker::new( + vhost_queues, + vhost_handle, + interrupt, + status, + interrupt_evt, + acked_features, + ); + let activate_vqs = |handle: &VhostVsockHandle| -> Result<()> { + handle.set_cid(cid).map_err(Error::VhostVsockSetCid)?; + handle.start().map_err(Error::VhostVsockStart)?; + Ok(()) + }; + let result = worker.run(queue_evts, QUEUE_SIZES, kill_evt, activate_vqs); + if let Err(e) = result { + error!("vsock worker thread exited with error: {:?}", e); + } + }); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use byteorder::{ByteOrder, LittleEndian}; + #[test] + fn ack_features() { + let cid = 5; + let features: u64 = (1 << 20) | (1 << 49) | (1 << 2) | (1 << 19); + let mut acked_features: u64 = 0; + let mut unavailable_features: u64 = 0; + + let mut vsock = Vsock::new_for_testing(cid, features); + assert_eq!(acked_features, vsock.acked_features()); + + acked_features |= 1 << 2; + vsock.ack_features(0, (acked_features & 0xffffffff) as u32); + assert_eq!(acked_features, 
vsock.acked_features()); + + acked_features |= 1 << 49; + vsock.ack_features(1, (acked_features >> 32) as u32); + assert_eq!(acked_features, vsock.acked_features()); + + acked_features |= 1 << 60; + unavailable_features |= 1 << 60; + vsock.ack_features(1, (acked_features >> 32) as u32); + assert_eq!(acked_features & !unavailable_features, vsock.acked_features()); + + acked_features |= 1 << 1; + unavailable_features |= 1 << 1; + vsock.ack_features(0, (acked_features & 0xffffffff) as u32); + assert_eq!(acked_features & !unavailable_features, vsock.acked_features()); + } + + #[test] + fn read_config() { + let cid = 0xfca9a559fdcb9756; + let vsock = Vsock::new_for_testing(cid, 0); + + let mut buf = [0 as u8; 8]; + vsock.read_config(0, &mut buf); + assert_eq!(cid, LittleEndian::read_u64(&buf)); + + vsock.read_config(0, &mut buf[..4]); + assert_eq!((cid & 0xffffffff) as u32, LittleEndian::read_u32(&buf[..4])); + + vsock.read_config(4, &mut buf[..4]); + assert_eq!((cid >> 32) as u32, LittleEndian::read_u32(&buf[..4])); + + let data: [u8; 8] = [8, 226, 5, 46, 159, 59, 89, 77]; + buf.copy_from_slice(&data); + + vsock.read_config(12, &mut buf); + assert_eq!(&buf, &data); + } + + #[test] + fn features() { + let cid = 5; + let features: u64 = 0xfc195ae8db88cff9; + + let vsock = Vsock::new_for_testing(cid, features); + assert_eq!((features & 0xffffffff) as u32, vsock.features(0)); + assert_eq!((features >> 32) as u32, vsock.features(1)); + assert_eq!(0, vsock.features(559)); + assert_eq!(0, vsock.features(3)); + } +} diff --git a/src/main.rs b/src/main.rs index 273b4bbd13..e3f4da32b6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -60,11 +60,14 @@ enum Error { VhostNetDeviceNew(hw::virtio::vhost::Error), NetDeviceNew(hw::virtio::NetError), NetDeviceRootSetup(sys_util::Error), + VhostVsockDeviceNew(hw::virtio::vhost::Error), + VsockDeviceRootSetup(sys_util::Error), DeviceJail(io_jail::Error), DevicePivotRoot(io_jail::Error), RegisterBlock(device_manager::Error), 
RegisterNet(device_manager::Error), RegisterWayland(device_manager::Error), + RegisterVsock(device_manager::Error), Cmdline(kernel_cmdline::Error), MissingWayland(PathBuf), RegisterIrqfd(sys_util::Error), @@ -114,12 +117,17 @@ impl fmt::Display for Error { } &Error::RegisterBlock(ref e) => write!(f, "error registering block device: {:?}", e), &Error::VhostNetDeviceNew(ref e) => write!(f, "failed to set up vhost networking: {:?}", e), + &Error::RegisterVsock(ref e) => write!(f, "error registering virtual socket device: {:?}", e), &Error::NetDeviceNew(ref e) => write!(f, "failed to set up virtio networking: {:?}", e), &Error::NetDeviceRootSetup(ref e) => { write!(f, "failed to create root directory for a net device: {:?}", e) } &Error::DeviceJail(ref e) => write!(f, "failed to jail device: {}", e), &Error::DevicePivotRoot(ref e) => write!(f, "failed to pivot root device: {}", e), + &Error::VhostVsockDeviceNew(ref e) => write!(f, "failed to set up virtual socket device: {:?}", e), + &Error::VsockDeviceRootSetup(ref e) => { + write!(f, "failed to create root directory for a vsock device: {:?}", e) + } &Error::RegisterNet(ref e) => write!(f, "error registering net device: {:?}", e), &Error::RegisterRng(ref e) => write!(f, "error registering rng device: {:?}", e), &Error::RngDeviceNew(ref e) => write!(f, "failed to set up rng: {:?}", e), @@ -190,6 +198,7 @@ struct Config { socket_path: Option, multiprocess: bool, warn_unknown_ports: bool, + cid: Option, } const KERNEL_START_OFFSET: usize = 0x200000; @@ -383,6 +392,24 @@ fn run_config(cfg: Config) -> Result<()> { } } + let vsock_root = TempDir::new(&PathBuf::from("/tmp/vsock_root")) + .map_err(Error::VsockDeviceRootSetup)?; + if let Some(cid) = cfg.cid { + let vsock_box = Box::new(hw::virtio::vhost::Vsock::new(cid, &guest_mem) + .map_err(|e| Error::VhostVsockDeviceNew(e))?); + + let jail = if cfg.multiprocess { + let root_path = vsock_root.as_path().unwrap(); + let policy_path = Path::new("vhost_vsock_device.policy"); + + 
Some(create_base_minijail(root_path, policy_path)?) + } else { + None + }; + + device_manager.register_mmio(vsock_box, jail, &mut cmdline).map_err(Error::RegisterVsock)?; + } + if !cfg.params.is_empty() { cmdline .insert_str(cfg.params) @@ -856,6 +883,17 @@ fn set_argument(cfg: &mut Config, name: &str, value: Option<&str>) -> argument:: "multiprocess" => { cfg.multiprocess = true; } + "cid" => { + if cfg.cid.is_some() { + return Err(argument::Error::TooManyArguments("`cid` already given".to_owned())); + } + cfg.cid = Some(value.unwrap().parse().map_err(|_| { + argument::Error::InvalidValue { + value: value.unwrap().to_owned(), + expected: "this value for `cid` must be an unsigned integer", + } + })?); + } "help" => return Err(argument::Error::PrintHelp), _ => unreachable!(), } @@ -892,6 +930,7 @@ fn run_vm(args: std::env::Args) { "PATH", "Path to put the control socket. If PATH is a directory, a name will be generated."), Argument::short_flag('u', "multiprocess", "Run each device in a child process."), + Argument::value("cid", "CID", "Context ID for virtual sockets"), Argument::short_flag('h', "help", "Print help message.")]; let mut cfg = Config::default(); diff --git a/vhost/src/lib.rs b/vhost/src/lib.rs index b4282d2471..4a103a2312 100644 --- a/vhost/src/lib.rs +++ b/vhost/src/lib.rs @@ -8,7 +8,10 @@ extern crate sys_util; extern crate virtio_sys; pub mod net; +mod vsock; + pub use net::Net; +pub use vsock::Vsock; use std::io::Error as IoError; use std::mem; diff --git a/vhost/src/net.rs b/vhost/src/net.rs index f7c5dfd7a2..0c6eaadcab 100644 --- a/vhost/src/net.rs +++ b/vhost/src/net.rs @@ -4,10 +4,9 @@ use libc; use net_util; -use std::ffi::CString; -use std::fs::File; -use std::io::Error as IoError; -use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use std::fs::{File, OpenOptions}; +use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::{AsRawFd, RawFd}; use virtio_sys; use sys_util::{ioctl_with_ref, GuestMemory}; @@ -43,19 +42,13 @@ impl Net { /// 
# Arguments /// * `mem` - Guest memory mapping. pub fn new(mem: &GuestMemory) -> Result { - // Open calls are safe because we give a constant nul-terminated - // string and verify the result. The CString unwrap is safe because - // DEVICE does not have any embedded '\0' characters. - let fd = unsafe { - libc::open(CString::new(DEVICE).unwrap().as_ptr(), - libc::O_RDWR | libc::O_NONBLOCK | libc::O_CLOEXEC) - }; - if fd < 0 { - return Err(Error::VhostOpen(IoError::last_os_error())); - } Ok(Net { - // There are no other users of this fd, so this is safe. - fd: unsafe { File::from_raw_fd(fd) }, + fd: OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) + .open(DEVICE) + .map_err(Error::VhostOpen)?, mem: mem.clone(), }) } diff --git a/vhost/src/vsock.rs b/vhost/src/vsock.rs new file mode 100644 index 0000000000..8c96b9f107 --- /dev/null +++ b/vhost/src/vsock.rs @@ -0,0 +1,82 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use libc; +use std::fs::{File, OpenOptions}; +use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::{AsRawFd, RawFd}; + +use sys_util::{ioctl_with_ref, GuestMemory}; +use virtio_sys::{VHOST_VSOCK_SET_GUEST_CID, VHOST_VSOCK_SET_RUNNING}; + +use super::{ioctl_result, Error, Result, Vhost}; + +static DEVICE: &'static str = "/dev/vhost-vsock"; + +/// Handle for running VHOST_VSOCK ioctls. +pub struct Vsock { + fd: File, + mem: GuestMemory, +} + +impl Vsock { + /// Open a handle to a new VHOST_VSOCK instance. + pub fn new(mem: &GuestMemory) -> Result { + Ok(Vsock { + fd: OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK) + .open(DEVICE) + .map_err(Error::VhostOpen)?, + mem: mem.clone(), + }) + } + + /// Set the CID for the guest. This number is used for routing all data destined for + /// programs + /// running in the guest. 
+ /// + /// # Arguments + /// * `cid` - CID to assign to the guest + pub fn set_cid(&self, cid: u64) -> Result<()> { + let ret = unsafe { ioctl_with_ref(&self.fd, VHOST_VSOCK_SET_GUEST_CID(), &cid) }; + if ret < 0 { + return ioctl_result(); + } + Ok(()) + } + + /// Tell the VHOST driver to start performing data transfer. + pub fn start(&self) -> Result<()> { + self.set_running(true) + } + + /// Tell the VHOST driver to stop performing data transfer. + pub fn stop(&self) -> Result<()> { + self.set_running(false) + } + + fn set_running(&self, running: bool) -> Result<()> { + let on: ::std::os::raw::c_int = if running { 1 } else { 0 }; + let ret = unsafe { ioctl_with_ref(&self.fd, VHOST_VSOCK_SET_RUNNING(), &on) }; + + if ret < 0 { + return ioctl_result(); + } + Ok(()) + } +} + +impl Vhost for Vsock { + fn mem(&self) -> &GuestMemory { + &self.mem + } +} + +impl AsRawFd for Vsock { + fn as_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } +} diff --git a/virtio_sys/src/lib.rs b/virtio_sys/src/lib.rs index 3921c4002d..299bc448e9 100644 --- a/virtio_sys/src/lib.rs +++ b/virtio_sys/src/lib.rs @@ -41,3 +41,5 @@ ioctl_iow_nr!(VHOST_SCSI_CLEAR_ENDPOINT, VHOST, 0x41, vhost_scsi_target); ioctl_iow_nr!(VHOST_SCSI_GET_ABI_VERSION, VHOST, 0x42, ::std::os::raw::c_int); ioctl_iow_nr!(VHOST_SCSI_SET_EVENTS_MISSED, VHOST, 0x43, ::std::os::raw::c_uint); ioctl_iow_nr!(VHOST_SCSI_GET_EVENTS_MISSED, VHOST, 0x44, ::std::os::raw::c_uint); +ioctl_iow_nr!(VHOST_VSOCK_SET_GUEST_CID, VHOST, 0x60, ::std::os::raw::c_ulonglong); +ioctl_iow_nr!(VHOST_VSOCK_SET_RUNNING, VHOST, 0x61, ::std::os::raw::c_int);