diff --git a/Cargo.toml b/Cargo.toml index b24346d842..1c1e60b9d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,11 @@ x86_64 = { path = "x86_64" } kernel_loader = { path = "kernel_loader" } libc = "0.2.21" byteorder = "1" +syscall_defines = { path = "syscall_defines" } +net_sys = { path = "net_sys" } +net_util = { path = "net_util" } +vhost = { path = "vhost" } +virtio_sys = { path = "virtio_sys" } [dependencies.clap] version = "*" diff --git a/net_device.policy b/net_device.policy new file mode 100644 index 0000000000..f1e051587a --- /dev/null +++ b/net_device.policy @@ -0,0 +1,35 @@ +close: 1 +exit_group: 1 +futex: 1 +# Whitelist vhost_net ioctls only. +# arg1 == VHOST_GET_FEATURES || +# arg1 == VHOST_SET_FEATURES || +# arg1 == VHOST_SET_OWNER || +# arg1 == VHOST_RESET_OWNER || +# arg1 == VHOST_SET_MEM_TABLE || +# arg1 == VHOST_SET_LOG_BASE || +# arg1 == VHOST_SET_LOG_FD || +# arg1 == VHOST_SET_VRING_NUM || +# arg1 == VHOST_SET_VRING_ADDR || +# arg1 == VHOST_SET_VRING_BASE || +# arg1 == VHOST_GET_VRING_BASE || +# arg1 == VHOST_SET_VRING_KICK || +# arg1 == VHOST_SET_VRING_CALL || +# arg1 == VHOST_SET_VRING_ERR || +# arg1 == VHOST_NET_SET_BACKEND +ioctl: arg1 == 0x8008af00 || arg1 == 0x4008af00 || arg1 == 0x0000af01 || arg1 == 0x0000af02 || arg1 == 0x4008af03 || arg1 == 0x4008af04 || arg1 == 0x4004af07 || arg1 == 0x4008af10 || arg1 == 0x4028af11 || arg1 == 0x4008af12 || arg1 == 0xc008af12 || arg1 == 0x4008af20 || arg1 == 0x4008af21 || arg1 == 0x4008af22 || arg1 == 0x4008af30 +# Disallow mmap with PROT_EXEC set. The syntax here doesn't allow bit +# negation, thus the manually negated mask constant. +mmap: arg2 in 0xfffffffb +mprotect: arg2 in 0xfffffffb +munmap: 1 +poll: 1 +read: 1 +recvfrom: 1 +sched_getaffinity: 1 +set_robust_list: 1 +sigaltstack: 1 +# Disallow clones other than new threads. +# arg0 is flags (raw syscall argument order, not the libc wrapper's). 
+clone: arg0 & 0x00010000 +write: 1 diff --git a/src/hw/virtio/mod.rs b/src/hw/virtio/mod.rs index 7335ac1fce..a7298bb379 100644 --- a/src/hw/virtio/mod.rs +++ b/src/hw/virtio/mod.rs @@ -7,10 +7,12 @@ mod queue; mod mmio; mod block; +mod net; pub use self::queue::*; pub use self::mmio::*; pub use self::block::*; +pub use self::net::*; const DEVICE_ACKNOWLEDGE: u32 = 0x01; const DEVICE_DRIVER: u32 = 0x02; @@ -18,6 +20,7 @@ const DEVICE_DRIVER_OK: u32 = 0x04; const DEVICE_FEATURES_OK: u32 = 0x08; const DEVICE_FAILED: u32 = 0x80; +const TYPE_NET: u32 = 1; const TYPE_BLOCK: u32 = 2; const INTERRUPT_STATUS_USED_RING: u32 = 0x1; diff --git a/src/hw/virtio/net.rs b/src/hw/virtio/net.rs new file mode 100644 index 0000000000..731d579f3e --- /dev/null +++ b/src/hw/virtio/net.rs @@ -0,0 +1,340 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::mem; +use std::net::Ipv4Addr; +use std::os::raw::*; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::thread::spawn; + +use net_sys; +use net_util::{Tap, Error as TapError}; +use sys_util::{Error as SysError, EventFd, GuestMemory, Poller}; +use vhost::{VhostNet, Error as VhostError}; +use virtio_sys::{vhost, virtio_net}; +use virtio_sys::virtio_net::virtio_net_hdr_mrg_rxbuf; + +use super::{VirtioDevice, Queue, INTERRUPT_STATUS_USED_RING, TYPE_NET}; + +const QUEUE_SIZE: u16 = 256; +const QUEUE_SIZES: &'static [u16] = &[QUEUE_SIZE, QUEUE_SIZE]; + +#[derive(Debug)] +pub enum NetError { + /// Creating kill eventfd failed. + CreateKillEventFd(SysError), + /// Cloning kill eventfd failed. + CloneKillEventFd(SysError), + /// Open tap device failed. + TapOpen(TapError), + /// Setting tap IP failed. + TapSetIp(TapError), + /// Setting tap netmask failed. + TapSetNetmask(TapError), + /// Setting tap interface offload flags failed. 
+ TapSetOffload(TapError), + /// Setting vnet header size failed. + TapSetVnetHdrSize(TapError), + /// Enabling tap interface failed. + TapEnable(TapError), + /// Open vhost-net device failed. + VhostOpen(VhostError), + /// Set owner failed. + VhostSetOwner(VhostError), + /// Get features failed. + VhostGetFeatures(VhostError), + /// Set features failed. + VhostSetFeatures(VhostError), + /// Set mem table failed. + VhostSetMemTable(VhostError), + /// Set vring num failed. + VhostSetVringNum(VhostError), + /// Set vring addr failed. + VhostSetVringAddr(VhostError), + /// Set vring base failed. + VhostSetVringBase(VhostError), + /// Set vring call failed. + VhostSetVringCall(VhostError), + /// Set vring kick failed. + VhostSetVringKick(VhostError), + /// Net set backend failed. + VhostNetSetBackend(VhostError), + /// Failed to create vhost eventfd. + VhostIrqCreate(SysError), + /// Failed to read vhost eventfd. + VhostIrqRead(SysError), + /// Error while polling for events. + PollError(SysError), +} + +struct Worker { + queues: Vec, + tap: Tap, + vhost_net: VhostNet, + vhost_interrupt: EventFd, + interrupt_status: Arc, + interrupt_evt: EventFd, + acked_features: u64, +} + +impl Worker { + fn signal_used_queue(&self) { + self.interrupt_status + .fetch_or(INTERRUPT_STATUS_USED_RING as usize, Ordering::SeqCst); + self.interrupt_evt.write(1).unwrap(); + } + + fn run(&mut self, queue_evts: Vec, kill_evt: EventFd) -> Result<(), NetError> { + // Preliminary setup for vhost net. 
+ self.vhost_net.set_owner().map_err(NetError::VhostSetOwner)?; + + let avail_features = self.vhost_net + .get_features() + .map_err(NetError::VhostGetFeatures)?; + + let features: c_ulonglong = self.acked_features & avail_features; + self.vhost_net + .set_features(features) + .map_err(NetError::VhostSetFeatures)?; + + self.vhost_net + .set_mem_table() + .map_err(NetError::VhostSetMemTable)?; + + for (queue_index, ref queue) in self.queues.iter().enumerate() { + self.vhost_net + .set_vring_num(queue_index, queue.max_size) + .map_err(NetError::VhostSetVringNum)?; + + self.vhost_net + .set_vring_addr(QUEUE_SIZES[queue_index], + queue.actual_size(), + queue_index, + 0, + queue.desc_table, + queue.used_ring, + queue.avail_ring, + None) + .map_err(NetError::VhostSetVringAddr)?; + self.vhost_net + .set_vring_base(queue_index, 0) + .map_err(NetError::VhostSetVringBase)?; + self.vhost_net + .set_vring_call(queue_index, &self.vhost_interrupt) + .map_err(NetError::VhostSetVringCall)?; + self.vhost_net + .set_vring_kick(queue_index, &queue_evts[queue_index]) + .map_err(NetError::VhostSetVringKick)?; + self.vhost_net + .net_set_backend(queue_index, &self.tap) + .map_err(NetError::VhostNetSetBackend)?; + } + + const VHOST_IRQ: u32 = 1; + const KILL: u32 = 2; + + let mut poller = Poller::new(2); + + 'poll: loop { + let tokens = + match poller.poll(&[(VHOST_IRQ, &self.vhost_interrupt), (KILL, &kill_evt)]) { + Ok(v) => v, + Err(e) => return Err(NetError::PollError(e)) + }; + + let mut needs_interrupt = false; + for &token in tokens { + match token { + VHOST_IRQ => { + needs_interrupt = true; + self.vhost_interrupt.read().map_err(NetError::VhostIrqRead)?; + } + KILL => break 'poll, + _ => unreachable!(), + } + } + if needs_interrupt { + self.signal_used_queue(); + } + } + Ok(()) + } +} + +pub struct Net { + workers_kill_evt: Option, + kill_evt: EventFd, + tap: Option, + vhost_net: Option, + vhost_interrupt: Option, + avail_features: u64, + acked_features: u64, +} + +impl Net { + 
/// Create a new virtio network device with the given IP address and + /// netmask. + pub fn new(ip_addr: Ipv4Addr, + netmask: Ipv4Addr, + mem: &GuestMemory) -> Result { + let kill_evt = EventFd::new().map_err(NetError::CreateKillEventFd)?; + + let tap = Tap::new().map_err(NetError::TapOpen)?; + tap.set_ip_addr(ip_addr).map_err(NetError::TapSetIp)?; + tap.set_netmask(netmask).map_err(NetError::TapSetNetmask)?; + + // Set offload flags to match the virtio features below. + tap.set_offload(net_sys::TUN_F_CSUM | + net_sys::TUN_F_UFO | + net_sys::TUN_F_TSO4 | + net_sys::TUN_F_TSO6) + .map_err(NetError::TapSetOffload)?; + + // We declare VIRTIO_NET_F_MRG_RXBUF, so set the vnet hdr size to match. + let vnet_hdr_size = mem::size_of::() as i32; + tap.set_vnet_hdr_size(vnet_hdr_size) + .map_err(NetError::TapSetVnetHdrSize)?; + + tap.enable().map_err(NetError::TapEnable)?; + let vhost_net = VhostNet::new(mem).map_err(NetError::VhostOpen)?; + + let avail_features = + 1 << virtio_net::VIRTIO_NET_F_GUEST_CSUM | + 1 << virtio_net::VIRTIO_NET_F_CSUM | + 1 << virtio_net::VIRTIO_NET_F_GUEST_TSO4 | + 1 << virtio_net::VIRTIO_NET_F_GUEST_UFO | + 1 << virtio_net::VIRTIO_NET_F_HOST_TSO4 | + 1 << virtio_net::VIRTIO_NET_F_HOST_UFO | + 1 << virtio_net::VIRTIO_NET_F_MRG_RXBUF | + 1 << vhost::VIRTIO_RING_F_INDIRECT_DESC | + 1 << vhost::VIRTIO_RING_F_EVENT_IDX | + 1 << vhost::VIRTIO_F_NOTIFY_ON_EMPTY | + 1 << vhost::VIRTIO_F_VERSION_1; + + Ok(Net { + workers_kill_evt: Some(kill_evt.try_clone().map_err(NetError::CloneKillEventFd)?), + kill_evt: kill_evt, + tap: Some(tap), + vhost_net: Some(vhost_net), + vhost_interrupt: Some(EventFd::new().map_err(NetError::VhostIrqCreate)?), + avail_features: avail_features, + acked_features: 0u64, + }) + } +} + +impl Drop for Net { + fn drop(&mut self) { + // Only kill the child if it claimed its eventfd. + if self.workers_kill_evt.is_none() { + // Ignore the result because there is nothing we can do about it. 
+ let _ = self.kill_evt.write(1); + } + } +} + +impl VirtioDevice for Net { + fn keep_fds(&self) -> Vec { + let mut keep_fds = Vec::new(); + + if let Some(ref tap) = self.tap { + keep_fds.push(tap.as_raw_fd()); + } + + if let Some(ref vhost_net) = self.vhost_net { + keep_fds.push(vhost_net.as_raw_fd()); + } + + if let Some(ref vhost_interrupt) = self.vhost_interrupt { + keep_fds.push(vhost_interrupt.as_raw_fd()); + } + + if let Some(ref workers_kill_evt) = self.workers_kill_evt { + keep_fds.push(workers_kill_evt.as_raw_fd()); + } + + keep_fds + } + + fn device_type(&self) -> u32 { + TYPE_NET + } + + fn queue_max_sizes(&self) -> &[u16] { + QUEUE_SIZES + } + + fn features(&self, page: u32) -> u32 { + match page { + 0 => self.avail_features as u32, + 1 => (self.avail_features >> 32) as u32, + _ => { + warn!("net: virtio net got request for features page: {}", page); + 0u32 + } + } + } + + fn ack_features(&mut self, page: u32, value: u32) { + let mut v = match page { + 0 => value as u64, + 1 => (value as u64) << 32, + _ => { + warn!("net: virtio net device cannot ack unknown feature page: {}", + page); + 0u64 + } + }; + + // Check if the guest is ACK'ing a feature that we didn't claim to have. + let unrequested_features = v & !self.avail_features; + if unrequested_features != 0 { + warn!("net: virtio net got unknown feature ack: {:x}", v); + + // Don't count these features as acked. 
+ v &= !unrequested_features; + } + self.acked_features |= v; + } + + fn activate(&mut self, + _: GuestMemory, + interrupt_evt: EventFd, + status: Arc, + queues: Vec, + queue_evts: Vec) { + if queues.len() != 2 || queue_evts.len() != 2 { + error!("net: expected 2 queues, got {}", queues.len()); + return; + } + + if let Some(vhost_net) = self.vhost_net.take() { + if let Some(tap) = self.tap.take() { + if let Some(vhost_interrupt) = self.vhost_interrupt.take() { + if let Some(kill_evt) = self.workers_kill_evt.take() { + let acked_features = self.acked_features; + spawn(move || { + let mut worker = Worker { + queues: queues, + tap: tap, + vhost_net: vhost_net, + vhost_interrupt: vhost_interrupt, + interrupt_status: status, + interrupt_evt: interrupt_evt, + acked_features: acked_features, + }; + let result = worker.run(queue_evts, kill_evt); + if let Err(e) = result { + error!("net worker thread exited with error: {:?}", + e); + } + }); + } + } + } + } + } +} diff --git a/src/hw/virtio/queue.rs b/src/hw/virtio/queue.rs index 1cdb9c00de..e1f22072cf 100644 --- a/src/hw/virtio/queue.rs +++ b/src/hw/virtio/queue.rs @@ -198,7 +198,9 @@ impl Queue { } } - fn actual_size(&self) -> u16 { + /// Return the actual size of the queue, as the driver may not set up a + /// queue as big as the device allows. 
+ pub fn actual_size(&self) -> u16 { min(self.size, self.max_size) } diff --git a/src/main.rs b/src/main.rs index b7c6ccae21..1e95a198ef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,11 +12,16 @@ extern crate x86_64; extern crate kernel_loader; extern crate byteorder; #[macro_use] extern crate sys_util; +extern crate net_sys; +extern crate net_util; +extern crate vhost; +extern crate virtio_sys; use std::ffi::{CString, CStr}; use std::fmt; use std::fs::File; use std::io::{stdin, stdout}; +use std::net; use std::path::{Path, PathBuf}; use std::ptr; use std::string::String; @@ -45,9 +50,14 @@ enum Error { Disk(std::io::Error), BlockDeviceNew(sys_util::Error), BlockDeviceRootSetup(sys_util::Error), + NetDeviceNew(hw::virtio::NetError), + NetDeviceRootSetup(sys_util::Error), + MacAddressNeedsNetConfig, + NetMissingConfig, DeviceJail(io_jail::Error), DevicePivotRoot(io_jail::Error), RegisterBlock(device_manager::Error), + RegisterNet(device_manager::Error), Cmdline(kernel_cmdline::Error), RegisterIoevent(sys_util::Error), RegisterIrqfd(sys_util::Error), @@ -89,8 +99,15 @@ impl fmt::Display for Error { write!(f, "failed to create root directory for a block device: {:?}", e) } &Error::RegisterBlock(ref e) => write!(f, "error registering block device: {:?}", e), + &Error::NetDeviceNew(ref e) => write!(f, "failed to set up networking: {:?}", e), + &Error::NetDeviceRootSetup(ref e) => { + write!(f, "failed to create root directory for a net device: {:?}", e) + } + &Error::MacAddressNeedsNetConfig => write!(f, "MAC address can only be specified when host IP and netmask are provided"), + &Error::NetMissingConfig => write!(f, "networking requires both host IP and netmask specified"), &Error::DeviceJail(ref e) => write!(f, "failed to jail device: {:?}", e), &Error::DevicePivotRoot(ref e) => write!(f, "failed to pivot root device: {:?}", e), + &Error::RegisterNet(ref e) => write!(f, "error registering net device: {:?}", e), &Error::Cmdline(ref e) => write!(f, "the given 
kernel command line was invalid: {}", e), &Error::RegisterIoevent(ref e) => write!(f, "error registering ioevent: {:?}", e), &Error::RegisterIrqfd(ref e) => write!(f, "error registering irqfd: {:?}", e), @@ -114,6 +131,9 @@ struct Config { memory: Option, kernel_image: File, params: Option, + host_ip: Option, + netmask: Option, + mac_address: Option, socket_path: Option, multiprocess: bool, warn_unknown_ports: bool, @@ -185,6 +205,15 @@ fn wait_all_children() -> bool { } fn run_config(cfg: Config) -> Result<()> { + if cfg.mac_address.is_some() && + (cfg.netmask.is_none() || cfg.host_ip.is_none()) { + return Err(Error::MacAddressNeedsNetConfig); + } + + if cfg.netmask.is_some() != cfg.host_ip.is_some() { + return Err(Error::NetMissingConfig); + } + let socket = if let Some(ref socket_path) = cfg.socket_path { Some(ControlSocketRecv::new(socket_path) .map_err(|e| Error::Socket(e))?) @@ -226,6 +255,26 @@ fn run_config(cfg: Config) -> Result<()> { .map_err(Error::RegisterBlock)?; } + // We checked above that if the IP is defined, then the netmask is, too. + let net_root = TempDir::new(&PathBuf::from("/tmp/net_root")) + .map_err(Error::NetDeviceRootSetup)?; + if let Some(host_ip) = cfg.host_ip { + if let Some(netmask) = cfg.netmask { + let net_box = Box::new(hw::virtio::Net::new(host_ip, netmask, &guest_mem) + .map_err(|e| Error::NetDeviceNew(e))?); + let jail = if cfg.multiprocess { + let net_root_path = net_root.as_path().unwrap(); // Won't fail if new succeeded. + + Some(create_base_minijail(net_root_path, Path::new("net_device.policy"))?) 
+ } + else { + None + }; + + device_manager.register_mmio(net_box, jail, &mut cmdline).map_err(Error::RegisterNet)?; + } + } + if let Some(params) = cfg.params { cmdline .insert_str(params) @@ -586,6 +635,18 @@ fn main() { .short("u") .long("multiprocess") .help("run the devices in a child process")) + .arg(Arg::with_name("host_ip") + .long("host_ip") + .value_name("HOST_IP") + .help("IP address to assign to host tap interface")) + .arg(Arg::with_name("netmask") + .long("netmask") + .value_name("NETMASK") + .help("netmask for VM subnet")) + .arg(Arg::with_name("mac") + .long("mac") + .value_name("MAC") + .help("mac address for VM")) .arg(Arg::with_name("socket") .short("s") .long("socket") @@ -624,6 +685,9 @@ fn main() { .expect("Expected kernel image path to be valid"), params: matches.value_of("params").map(|s| s.to_string()), multiprocess: matches.is_present("multiprocess"), + host_ip: matches.value_of("host_ip").and_then(|v| v.parse().ok()), + netmask: matches.value_of("netmask").and_then(|v| v.parse().ok()), + mac_address: matches.value_of("mac").map(|s| s.to_string()), socket_path: matches.value_of("socket").map(|s| s.to_string()), warn_unknown_ports: matches.is_present("warn-unknown-ports"), };