From eefe7fb19e503e021502c2d8311b85a128d0e054 Mon Sep 17 00:00:00 2001 From: Keiichi Watanabe Date: Tue, 17 Nov 2020 17:58:35 +0900 Subject: [PATCH] devices: virtio: fs: DAX based shared memory support Support virtio-fs's DAX (direct memory access) operation which allows the guest to directly access file pages. Specifically, FUSE_SETUP_MAPPING and FUSE_REMOVE_MAPPING operations are supported. This option can be used by specifying the `dax` option when mounting a file system in the guest. The DAX option improved file I/O performance in most cases. In Fio tests, both read and write scores were improved by 1.3-14x depending on test cases. In Blogbench tests, which create many small files, DAX improved the write score by 1.5x while the read score was reduced to ~25% (20391 -> 4593). Here is an excerpt of results: Fio * seq_read: 10.2x (143528 -> 1464911) * seq_write: 3.3x (61253 -> 896791) * rand_read: 11.6x (138753 -> 1612739) * rand_write: 14.6x (61253 -> 896791) * surfing_read: 1.3x (98473 -> 127907) * surfing_write: 1.3x (83309 -> 108089) Blogbench * read: 0.23x (20391 -> 4593) * write: 1.50x (248 -> 373) BUG=b:147341783 TEST=Run vm.{Blogbench, Fio} with CL:2291856 Change-Id: I4a47c601412ed32d926de6304337e1594252d258 Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/2108315 Tested-by: kokoro Tested-by: Keiichi Watanabe Commit-Queue: Keiichi Watanabe Reviewed-by: Chirantan Ekbote --- Cargo.lock | 1 + devices/src/virtio/fs/mod.rs | 83 +++++++++++++- devices/src/virtio/fs/passthrough.rs | 38 ++++++- devices/src/virtio/fs/worker.rs | 95 +++++++++++++++- fuse/src/filesystem.rs | 38 ++++++- fuse/src/lib.rs | 2 +- fuse/src/server.rs | 163 ++++++++++++++++++++++++++- fuse/src/sys.rs | 53 +++++++++ fuse/src/worker.rs | 31 ++++- hypervisor/src/kvm/mod.rs | 35 +++++- hypervisor/src/lib.rs | 26 ++++- src/crosvm.rs | 1 + src/linux.rs | 63 +++++++++-- sys_util/src/mmap.rs | 47 ++++++++ vm_control/src/lib.rs | 116 ++++++++++++++++++- 15 files changed, 
761 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3179b43f5c..4e460fdcca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -534,6 +534,7 @@ version = "0.1.0" dependencies = [ "data_model", "futures", + "getopts", "intrusive-collections", "libc", "log", diff --git a/devices/src/virtio/fs/mod.rs b/devices/src/virtio/fs/mod.rs index 6c4a3b4bff..a7a3deb087 100644 --- a/devices/src/virtio/fs/mod.rs +++ b/devices/src/virtio/fs/mod.rs @@ -5,14 +5,23 @@ use std::fmt; use std::io; use std::mem; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::thread; -use base::{error, warn, Error as SysError, Event, RawDescriptor}; +use base::{error, warn, AsRawDescriptor, Error as SysError, Event, RawDescriptor}; use data_model::{DataInit, Le32}; +use msg_socket::{MsgReceiver, MsgSender}; +use resources::Alloc; +use vm_control::{FsMappingRequest, FsMappingRequestSocket, VmResponse}; use vm_memory::GuestMemory; -use crate::virtio::{copy_config, DescriptorError, Interrupt, Queue, VirtioDevice, TYPE_FS}; +use crate::pci::{ + PciAddress, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, +}; +use crate::virtio::{ + copy_config, DescriptorError, Interrupt, PciCapabilityType, Queue, VirtioDevice, + VirtioPciShmCap, TYPE_FS, +}; mod multikey; pub mod passthrough; @@ -26,6 +35,13 @@ use worker::Worker; // The fs device does not have a fixed number of queues. const QUEUE_SIZE: u16 = 1024; +const FS_BAR_NUM: u8 = 4; +const FS_BAR_OFFSET: u64 = 0; +const FS_BAR_SIZE: u64 = 1 << 33; + +/// Defined in kernel/include/uapi/linux/virtio_fs.h. +const VIRTIO_FS_SHMCAP_ID_CACHE: u8 = 0; + /// The maximum allowable length of the tag used to identify a specific virtio-fs device. 
pub const FS_MAX_TAG_LEN: usize = 36; @@ -105,6 +121,8 @@ pub struct Fs { queue_sizes: Box<[u16]>, avail_features: u64, acked_features: u64, + pci_bar: Option, + socket: Option, workers: Vec<(Event, thread::JoinHandle>)>, } @@ -114,6 +132,7 @@ impl Fs { tag: &str, num_workers: usize, fs_cfg: passthrough::Config, + socket: FsMappingRequestSocket, ) -> Result { if tag.len() > FS_MAX_TAG_LEN { return Err(Error::TagTooLong(tag.len())); @@ -138,6 +157,8 @@ impl Fs { queue_sizes: vec![QUEUE_SIZE; num_queues].into_boxed_slice(), avail_features: base_features, acked_features: 0, + pci_bar: None, + socket: Some(socket), workers: Vec::with_capacity(num_workers + 1), }) } @@ -164,10 +185,16 @@ impl Fs { impl VirtioDevice for Fs { fn keep_rds(&self) -> Vec { - self.fs + let mut fds = self + .fs .as_ref() .map(PassthroughFs::keep_rds) - .unwrap_or_else(Vec::new) + .unwrap_or_else(Vec::new); + if let Some(rd) = self.socket.as_ref().map(|s| s.as_raw_descriptor()) { + fds.push(rd); + } + + fds } fn device_type(&self) -> u32 { @@ -213,7 +240,24 @@ impl VirtioDevice for Fs { let server = Arc::new(Server::new(fs)); let irq = Arc::new(interrupt); + let socket = self.socket.take().expect("missing mapping socket"); + // Create the shared memory region now before we start processing requests. 
+ let request = FsMappingRequest::AllocateSharedMemoryRegion( + self.pci_bar.as_ref().cloned().expect("No pci_bar"), + ); + socket + .send(&request) + .expect("failed to send allocation message"); + let slot = match socket.recv() { + Ok(VmResponse::RegisterMemory { pfn: _, slot }) => slot, + Ok(VmResponse::Err(e)) => panic!("failed to allocate shared memory region: {}", e), + r => panic!( + "unexpected response to allocate shared memory region: {:?}", + r + ), + }; + let socket = Arc::new(Mutex::new(socket)); let mut watch_resample_event = true; for (idx, (queue, evt)) in queues.into_iter().zip(queue_evts.into_iter()).enumerate() { let (self_kill_evt, kill_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) @@ -229,11 +273,12 @@ impl VirtioDevice for Fs { let mem = guest_mem.clone(); let server = server.clone(); let irq = irq.clone(); + let socket = Arc::clone(&socket); let worker_result = thread::Builder::new() .name(format!("virtio-fs worker {}", idx)) .spawn(move || { - let mut worker = Worker::new(mem, queue, server, irq); + let mut worker = Worker::new(mem, queue, server, irq, socket, slot); worker.run(evt, kill_evt, watch_resample_event) }); @@ -251,6 +296,32 @@ impl VirtioDevice for Fs { } } } + + fn get_device_bars(&mut self, address: PciAddress) -> Vec { + self.pci_bar = Some(Alloc::PciBar { + bus: address.bus, + dev: address.dev, + func: address.func, + bar: FS_BAR_NUM, + }); + + vec![PciBarConfiguration::new( + FS_BAR_NUM as usize, + FS_BAR_SIZE, + PciBarRegionType::Memory64BitRegion, + PciBarPrefetchable::NotPrefetchable, + )] + } + + fn get_device_caps(&self) -> Vec> { + vec![Box::new(VirtioPciShmCap::new( + PciCapabilityType::SharedMemoryConfig, + FS_BAR_NUM, + FS_BAR_OFFSET, + FS_BAR_SIZE, + VIRTIO_FS_SHMCAP_ID_CACHE, + ))] + } } impl Drop for Fs { diff --git a/devices/src/virtio/fs/passthrough.rs b/devices/src/virtio/fs/passthrough.rs index 5f89dc20c7..c05687b582 100644 --- a/devices/src/virtio/fs/passthrough.rs +++ 
b/devices/src/virtio/fs/passthrough.rs @@ -25,8 +25,10 @@ use base::{ use data_model::DataInit; use fuse::filesystem::{ Context, DirectoryIterator, Entry, FileSystem, FsOptions, GetxattrReply, IoctlFlags, - IoctlReply, ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, ROOT_ID, + IoctlReply, ListxattrReply, OpenOptions, RemoveMappingOne, SetattrValid, ZeroCopyReader, + ZeroCopyWriter, ROOT_ID, }; +use fuse::Mapper; use rand_ish::SimpleRng; use sync::Mutex; @@ -2306,6 +2308,40 @@ impl FileSystem for PassthroughFs { Err(io::Error::last_os_error()) } } + + fn set_up_mapping( + &self, + _ctx: Context, + inode: Self::Inode, + _handle: Self::Handle, + file_offset: u64, + mem_offset: u64, + size: usize, + prot: u32, + mapper: M, + ) -> io::Result<()> { + let read = prot & libc::PROT_READ as u32 != 0; + let write = prot & libc::PROT_WRITE as u32 != 0; + + let flags = match (read, write) { + (true, true) => libc::O_RDWR, + (true, false) => libc::O_RDONLY, + (false, true) => libc::O_WRONLY, + (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)), + }; + let data = self.find_inode(inode)?; + + let file = self.open_inode(&data, flags | libc::O_NONBLOCK)?; + + mapper.map(mem_offset, size, &file, file_offset, prot) + } + + fn remove_mapping(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> { + for RemoveMappingOne { moffset, len } in msgs { + mapper.unmap(*moffset, *len)?; + } + Ok(()) + } } #[cfg(test)] diff --git a/devices/src/virtio/fs/worker.rs b/devices/src/virtio/fs/worker.rs index 9a4da6cf44..ded93b2c74 100644 --- a/devices/src/virtio/fs/worker.rs +++ b/devices/src/virtio/fs/worker.rs @@ -2,12 +2,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
+use std::convert::TryInto; use std::fs::File; use std::io; -use std::sync::Arc; +use std::os::unix::io::AsRawFd; +use std::sync::{Arc, Mutex}; use base::{error, Event, PollToken, WaitContext}; use fuse::filesystem::{FileSystem, ZeroCopyReader, ZeroCopyWriter}; +use msg_socket::{MsgReceiver, MsgSender}; +use vm_control::{FsMappingRequest, FsMappingRequestSocket, MaybeOwnedDescriptor, VmResponse}; use vm_memory::GuestMemory; use crate::virtio::fs::{Error, Result}; @@ -40,11 +44,92 @@ impl ZeroCopyWriter for Writer { self.write_from_at(f, count, off) } } + +struct Mapper { + socket: Arc>, + slot: u32, +} + +impl Mapper { + fn new(socket: Arc>, slot: u32) -> Self { + Self { socket, slot } + } + + fn process_request(&self, request: &FsMappingRequest) -> io::Result<()> { + let socket = self.socket.lock().map_err(|e| { + error!("failed to lock socket: {}", e); + io::Error::from_raw_os_error(libc::EINVAL) + })?; + + socket.send(request).map_err(|e| { + error!("failed to send request {:?}: {}", request, e); + io::Error::from_raw_os_error(libc::EINVAL) + })?; + + match socket.recv() { + Ok(VmResponse::Ok) => Ok(()), + Ok(VmResponse::Err(e)) => Err(e.into()), + r => { + error!("failed to process {:?}: {:?}", request, r); + Err(io::Error::from_raw_os_error(libc::EIO)) + } + } + } +} + +impl fuse::Mapper for Mapper { + fn map( + &self, + mem_offset: u64, + size: usize, + fd: &dyn AsRawFd, + file_offset: u64, + prot: u32, + ) -> io::Result<()> { + let mem_offset: usize = mem_offset.try_into().map_err(|e| { + error!("mem_offset {} is too big: {}", mem_offset, e); + io::Error::from_raw_os_error(libc::EINVAL) + })?; + + let request = FsMappingRequest::CreateMemoryMapping { + slot: self.slot, + fd: MaybeOwnedDescriptor::Borrowed(fd.as_raw_fd()), + size, + file_offset, + prot, + mem_offset, + }; + + self.process_request(&request) + } + + fn unmap(&self, offset: u64, size: u64) -> io::Result<()> { + let offset: usize = offset.try_into().map_err(|e| { + error!("offset {} is too big: 
{}", offset, e); + io::Error::from_raw_os_error(libc::EINVAL) + })?; + let size: usize = size.try_into().map_err(|e| { + error!("size {} is too big: {}", size, e); + io::Error::from_raw_os_error(libc::EINVAL) + })?; + + let request = FsMappingRequest::RemoveMemoryMapping { + slot: self.slot, + offset, + size, + }; + + self.process_request(&request) + } +} + pub struct Worker { mem: GuestMemory, queue: Queue, server: Arc>, irq: Arc, + socket: Arc>, + slot: u32, } impl Worker { @@ -53,24 +138,30 @@ impl Worker { queue: Queue, server: Arc>, irq: Arc, + socket: Arc>, + slot: u32, ) -> Worker { Worker { mem, queue, server, irq, + socket, + slot, } } fn process_queue(&mut self) -> Result<()> { let mut needs_interrupt = false; + + let mapper = Mapper::new(Arc::clone(&self.socket), self.slot); while let Some(avail_desc) = self.queue.pop(&self.mem) { let reader = Reader::new(self.mem.clone(), avail_desc.clone()) .map_err(Error::InvalidDescriptorChain)?; let writer = Writer::new(self.mem.clone(), avail_desc.clone()) .map_err(Error::InvalidDescriptorChain)?; - let total = self.server.handle_message(reader, writer)?; + let total = self.server.handle_message(reader, writer, &mapper)?; self.queue .add_used(&self.mem, avail_desc.index, total as u32); diff --git a/fuse/src/filesystem.rs b/fuse/src/filesystem.rs index 400abb89da..ff88628022 100644 --- a/fuse/src/filesystem.rs +++ b/fuse/src/filesystem.rs @@ -11,7 +11,10 @@ use std::time::Duration; use crate::sys; -pub use crate::sys::{FsOptions, IoctlFlags, IoctlIovec, OpenOptions, SetattrValid, ROOT_ID}; +use crate::server::Mapper; +pub use crate::sys::{ + FsOptions, IoctlFlags, IoctlIovec, OpenOptions, RemoveMappingOne, SetattrValid, ROOT_ID, +}; const MAX_BUFFER_SIZE: u32 = 1 << 20; @@ -1158,4 +1161,37 @@ pub trait FileSystem { ) -> io::Result { Err(io::Error::from_raw_os_error(libc::ENOSYS)) } + + /// Set up memory mappings. + /// + /// Used to set up file mappings in DAX window. 
+ /// + /// # Arguments + /// + /// * `file_offset` - Offset into the file to start the mapping. + /// * `mem_offset` - Offset in Memory Window. + /// * `size` - Length of mapping required. + /// * `flags` - Bit field of `FUSE_SETUPMAPPING_FLAGS_*`. + /// * `mapper` - Mapper object which performs the mapping. + fn set_up_mapping( + &self, + ctx: Context, + inode: Self::Inode, + handle: Self::Handle, + file_offset: u64, + mem_offset: u64, + size: usize, + flags: u32, + mapper: M, + ) -> io::Result<()> { + Err(io::Error::from_raw_os_error(libc::ENOSYS)) + } + + /// Remove memory mappings. + /// + /// Used to tear down file mappings in DAX window. This method must be supported when + /// `set_up_mapping` is supported. + fn remove_mapping(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> { + Err(io::Error::from_raw_os_error(libc::ENOSYS)) + } } diff --git a/fuse/src/lib.rs b/fuse/src/lib.rs index 8da2265de6..f7ac2e74a0 100644 --- a/fuse/src/lib.rs +++ b/fuse/src/lib.rs @@ -17,7 +17,7 @@ pub mod sys; pub mod worker; pub use mount::mount; -pub use server::{Reader, Server, Writer}; +pub use server::{Mapper, Reader, Server, Writer}; /// Errors that may occur during the creation or operation of an Fs device. #[derive(ThisError, Debug)] diff --git a/fuse/src/server.rs b/fuse/src/server.rs index e31621542a..28c4682322 100644 --- a/fuse/src/server.rs +++ b/fuse/src/server.rs @@ -2,12 +2,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +use std::convert::TryInto; use std::ffi::CStr; use std::io; use std::mem::{size_of, MaybeUninit}; +use std::os::unix::io::AsRawFd; use std::time::Duration; -use base::error; +use base::{error, pagesize}; use data_model::DataInit; use crate::filesystem::{ @@ -45,6 +47,58 @@ pub trait Writer: io::Write { fn has_sufficient_buffer(&self, size: u32) -> bool; } +/// A trait for memory mapping for DAX. 
+/// +/// For some transports (like virtio) it may be possible to share a region of memory with the +/// FUSE kernel driver so that it can access file contents directly without issuing read or +/// write requests. In this case the driver will instead send requests to map a section of a +/// file into the shared memory region. +pub trait Mapper { + /// Maps `size` bytes starting at `file_offset` bytes from within the given `fd` at `mem_offset` + /// bytes from the start of the memory region with `prot` protections. `mem_offset` must be + /// page aligned. + /// + /// # Arguments + /// * `mem_offset` - Page aligned offset into the memory region in bytes. + /// * `size` - Size of memory region in bytes. + /// * `fd` - File descriptor to mmap from. + /// * `file_offset` - Offset in bytes from the beginning of `fd` to start the mmap. + /// * `prot` - Protection (e.g. `libc::PROT_READ`) of the memory region. + fn map( + &self, + mem_offset: u64, + size: usize, + fd: &dyn AsRawFd, + file_offset: u64, + prot: u32, + ) -> io::Result<()>; + + /// Unmaps `size` bytes at `offset` bytes from the start of the memory region. `offset` must be + /// page aligned. + /// + /// # Arguments + /// * `offset` - Page aligned offset into the arena in bytes. + /// * `size` - Size of memory region in bytes. 
+ fn unmap(&self, offset: u64, size: u64) -> io::Result<()>; +} + +impl<'a, M: Mapper> Mapper for &'a M { + fn map( + &self, + mem_offset: u64, + size: usize, + fd: &dyn AsRawFd, + file_offset: u64, + prot: u32, + ) -> io::Result<()> { + (**self).map(mem_offset, size, fd, file_offset, prot) + } + + fn unmap(&self, offset: u64, size: u64) -> io::Result<()> { + (**self).unmap(offset, size) + } +} + pub struct Server { fs: F, } @@ -54,13 +108,13 @@ impl Server { Server { fs } } - pub fn handle_message( + pub fn handle_message( &self, mut r: R, w: W, + mapper: M, ) -> Result { let in_header = InHeader::from_reader(&mut r).map_err(Error::DecodeMessage)?; - if in_header.len > self.fs.max_buffer_size() { return reply_error( io::Error::from_raw_os_error(libc::ENOMEM), @@ -115,7 +169,9 @@ impl Server { Some(Opcode::Lseek) => self.lseek(in_header, r, w), Some(Opcode::CopyFileRange) => self.copy_file_range(in_header, r, w), Some(Opcode::ChromeOsTmpfile) => self.chromeos_tmpfile(in_header, r, w), - Some(Opcode::SetUpMapping) | Some(Opcode::RemoveMapping) | None => reply_error( + Some(Opcode::SetUpMapping) => self.set_up_mapping(in_header, r, w, mapper), + Some(Opcode::RemoveMapping) => self.remove_mapping(in_header, r, w, mapper), + None => reply_error( io::Error::from_raw_os_error(libc::ENOSYS), in_header.unique, w, @@ -905,7 +961,8 @@ impl Server { | FsOptions::HAS_IOCTL_DIR | FsOptions::DO_READDIRPLUS | FsOptions::READDIRPLUS_AUTO - | FsOptions::ATOMIC_O_TRUNC; + | FsOptions::ATOMIC_O_TRUNC + | FsOptions::MAP_ALIGNMENT; let capable = FsOptions::from_bits_truncate(flags); @@ -928,6 +985,7 @@ impl Server { congestion_threshold: (::std::u16::MAX / 4) * 3, max_write: self.fs.max_buffer_size(), time_gran: 1, // nanoseconds + map_alignment: pagesize().trailing_zeros() as u16, ..Default::default() }; @@ -1452,6 +1510,101 @@ impl Server { Err(e) => reply_error(e, in_header.unique, w), } } + + fn set_up_mapping( + &self, + in_header: InHeader, + mut r: R, + w: W, + mapper: M, + ) -> 
Result + where + R: Reader, + W: Writer, + M: Mapper, + { + let SetUpMappingIn { + fh, + foffset, + len, + flags, + moffset, + } = SetUpMappingIn::from_reader(&mut r).map_err(Error::DecodeMessage)?; + let flags = SetUpMappingFlags::from_bits_truncate(flags); + + let mut prot = 0; + if flags.contains(SetUpMappingFlags::READ) { + prot |= libc::PROT_READ as u32; + } + if flags.contains(SetUpMappingFlags::WRITE) { + prot |= libc::PROT_WRITE as u32; + } + + let size = if let Ok(s) = len.try_into() { + s + } else { + return reply_error( + io::Error::from_raw_os_error(libc::EOVERFLOW), + in_header.unique, + w, + ); + }; + + match self.fs.set_up_mapping( + Context::from(in_header), + in_header.nodeid.into(), + fh.into(), + foffset, + moffset, + size, + prot, + mapper, + ) { + Ok(()) => reply_ok(None::, None, in_header.unique, w), + Err(e) => { + error!("set_up_mapping failed: {}", e); + reply_error(e, in_header.unique, w) + } + } + } + + fn remove_mapping( + &self, + in_header: InHeader, + mut r: R, + w: W, + mapper: M, + ) -> Result + where + R: Reader, + W: Writer, + M: Mapper, + { + let RemoveMappingIn { count } = + RemoveMappingIn::from_reader(&mut r).map_err(Error::DecodeMessage)?; + + // `FUSE_REMOVEMAPPING_MAX_ENTRY` is defined as + // `PAGE_SIZE / sizeof(struct fuse_removemapping_one)` in /kernel/include/uapi/linux/fuse.h. 
+ let max_entry = pagesize() / std::mem::size_of::(); + + if max_entry < count as usize { + return reply_error( + io::Error::from_raw_os_error(libc::EINVAL), + in_header.unique, + w, + ); + } + + let mut msgs = Vec::with_capacity(count as usize); + for _ in 0..(count as usize) { + msgs.push(RemoveMappingOne::from_reader(&mut r).map_err(Error::DecodeMessage)?); + } + + match self.fs.remove_mapping(&msgs, mapper) { + Ok(()) => reply_ok(None::, None, in_header.unique, w), + Err(e) => reply_error(e, in_header.unique, w), + } + } } fn retry_ioctl( diff --git a/fuse/src/sys.rs b/fuse/src/sys.rs index 1f4ba564ba..2b2fe8f55e 100644 --- a/fuse/src/sys.rs +++ b/fuse/src/sys.rs @@ -362,6 +362,13 @@ bitflags! { const EXPLICIT_INVAL_DATA = EXPLICIT_INVAL_DATA; const SECURITY_CONTEXT = SECURITY_CONTEXT; + + /// Indicates that the `map_alignment` field of the `InitOut` struct is valid. + /// + /// The `MAP_ALIGNMENT` field is used by the FUSE kernel driver to ensure that its DAX + /// mapping requests are pagesize-aligned. This field is automatically set by the server and + /// this feature is enabled by default. + const MAP_ALIGNMENT = MAP_ALIGNMENT; } } @@ -448,6 +455,18 @@ pub const FUSE_COMPAT_STATFS_SIZE: u32 = 48; pub const FUSE_COMPAT_INIT_OUT_SIZE: u32 = 8; pub const FUSE_COMPAT_22_INIT_OUT_SIZE: u32 = 24; +const SETUPMAPPING_FLAG_WRITE: u64 = 1; +const SETUPMAPPING_FLAG_READ: u64 = 2; + +bitflags! { + pub struct SetUpMappingFlags: u64 { + /// Create writable mapping. + const WRITE = SETUPMAPPING_FLAG_WRITE; + /// Create readable mapping. + const READ = SETUPMAPPING_FLAG_READ; + } +} + // Message definitions follow. It is safe to implement DataInit for all of these // because they are POD types. 
@@ -1151,3 +1170,37 @@ pub struct CopyFileRangeIn { pub flags: u64, } unsafe impl DataInit for CopyFileRangeIn {} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct SetUpMappingIn { + /* An already open handle */ + pub fh: u64, + /* Offset into the file to start the mapping */ + pub foffset: u64, + /* Length of mapping required */ + pub len: u64, + /* Flags, FUSE_SETUPMAPPING_FLAG_* */ + pub flags: u64, + /* Offset in Memory Window */ + pub moffset: u64, +} +unsafe impl DataInit for SetUpMappingIn {} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct RemoveMappingIn { + /* number of fuse_removemapping_one follows */ + pub count: u32, +} +unsafe impl DataInit for RemoveMappingIn {} + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone)] +pub struct RemoveMappingOne { + /* Offset into the dax window start the unmapping */ + pub moffset: u64, + /* Length of mapping required */ + pub len: u64, +} +unsafe impl DataInit for RemoveMappingOne {} diff --git a/fuse/src/worker.rs b/fuse/src/worker.rs index 334738a34e..378e5eee38 100644 --- a/fuse/src/worker.rs +++ b/fuse/src/worker.rs @@ -6,9 +6,10 @@ use std::fs::File; use std::io::{self, BufRead, BufReader, Cursor, Read, Write}; use std::mem::size_of; use std::os::unix::fs::FileExt; +use std::os::unix::io::AsRawFd; use crate::filesystem::{FileSystem, ZeroCopyReader, ZeroCopyWriter}; -use crate::server::{Reader, Server, Writer}; +use crate::server::{Mapper, Reader, Server, Writer}; use crate::sys; use crate::{Error, Result}; @@ -111,6 +112,31 @@ impl ZeroCopyWriter for DevFuseWriter<'_> { } } +struct DevFuseMapper; + +impl DevFuseMapper { + fn new() -> Self { + Self {} + } +} + +impl Mapper for DevFuseMapper { + fn map( + &self, + _mem_offset: u64, + _size: usize, + _fd: &dyn AsRawFd, + _file_offset: u64, + _prot: u32, + ) -> io::Result<()> { + Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)) + } + + fn unmap(&self, _offset: u64, _size: u64) -> io::Result<()> { + 
Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)) + } +} + /// Start the FUSE message handling loop. Returns when an error happens. pub fn start_message_loop( dev_fuse: File, @@ -129,8 +155,9 @@ pub fn start_message_loop( loop { let dev_fuse_reader = DevFuseReader::new(&mut buf_reader); let dev_fuse_writer = DevFuseWriter::new(&mut wfile, &mut write_buf); + let dev_fuse_mapper = DevFuseMapper::new(); - if let Err(e) = server.handle_message(dev_fuse_reader, dev_fuse_writer) { + if let Err(e) = server.handle_message(dev_fuse_reader, dev_fuse_writer, &dev_fuse_mapper) { return Err(e); } } diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 65cc7b96b2..e195c1556b 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -18,6 +18,7 @@ use std::collections::{BTreeMap, BinaryHeap}; use std::convert::TryFrom; use std::mem::{size_of, ManuallyDrop}; use std::os::raw::{c_char, c_int, c_ulong, c_void}; +use std::os::unix::io::AsRawFd; use std::ptr::copy_nonoverlapping; use std::sync::atomic::AtomicU64; use std::sync::Arc; @@ -29,8 +30,8 @@ use libc::{ use base::{ block_signal, errno_result, error, ioctl, ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val, pagesize, signal, unblock_signal, AsRawDescriptor, Error, Event, FromRawDescriptor, - MappedRegion, MemoryMapping, MemoryMappingBuilder, MmapError, RawDescriptor, Result, - SafeDescriptor, + MappedRegion, MemoryMapping, MemoryMappingBuilder, MmapError, Protection, RawDescriptor, + Result, SafeDescriptor, }; use data_model::vec_with_array_field; use kvm_sys::*; @@ -560,6 +561,36 @@ impl Vm for KvmVm { fn set_pvclock(&self, state: &ClockState) -> Result<()> { self.set_pvclock_arch(state) } + + fn add_fd_mapping( + &mut self, + slot: u32, + offset: usize, + size: usize, + fd: &dyn AsRawFd, + fd_offset: u64, + prot: Protection, + ) -> Result<()> { + let mut regions = self.mem_regions.lock(); + let region = regions.get_mut(&slot).ok_or(Error::new(EINVAL))?; + + match 
region.add_fd_mapping(offset, size, fd, fd_offset, prot) { + Ok(()) => Ok(()), + Err(MmapError::SystemCallFailed(e)) => Err(e), + Err(_) => Err(Error::new(EIO)), + } + } + + fn remove_mapping(&mut self, slot: u32, offset: usize, size: usize) -> Result<()> { + let mut regions = self.mem_regions.lock(); + let region = regions.get_mut(&slot).ok_or(Error::new(EINVAL))?; + + match region.remove_mapping(offset, size) { + Ok(()) => Ok(()), + Err(MmapError::SystemCallFailed(e)) => Err(e), + Err(_) => Err(Error::new(EIO)), + } + } } impl AsRawDescriptor for KvmVm { diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index c6cf45a6e8..eb21598d2b 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -11,8 +11,9 @@ pub mod kvm; pub mod x86_64; use std::os::raw::c_int; +use std::os::unix::io::AsRawFd; -use base::{Event, MappedRegion, RawDescriptor, Result, SafeDescriptor}; +use base::{Event, MappedRegion, Protection, RawDescriptor, Result, SafeDescriptor}; use msg_socket::MsgOnSocket; use vm_memory::{GuestAddress, GuestMemory}; @@ -139,6 +140,29 @@ pub trait Vm: Send { /// Sets the current timestamp of the paravirtual clock as seen by the current guest. /// Only works on VMs that support `VmCap::PvClock`. fn set_pvclock(&self, state: &ClockState) -> Result<()>; + + /// Maps `size` bytes starting at `fd_offset` bytes from within the given `fd` + /// at `offset` bytes from the start of the arena with `prot` protections. + /// `offset` must be page aligned. + /// + /// # Arguments + /// * `offset` - Page aligned offset into the arena in bytes. + /// * `size` - Size of memory region in bytes. + /// * `fd` - File descriptor to mmap from. + /// * `fd_offset` - Offset in bytes from the beginning of `fd` to start the mmap. + /// * `prot` - Protection (e.g. readable/writable) of the memory region. 
+ fn add_fd_mapping( + &mut self, + slot: u32, + offset: usize, + size: usize, + fd: &dyn AsRawFd, + fd_offset: u64, + prot: Protection, + ) -> Result<()>; + + /// Remove `size`-byte mapping starting at `offset`. + fn remove_mapping(&mut self, slot: u32, offset: usize, size: usize) -> Result<()>; } /// A unique fingerprint for a particular `VcpuRunHandle`, used in `Vcpu` impls to ensure the diff --git a/src/crosvm.rs b/src/crosvm.rs index 8c68b92592..1074a6e253 100644 --- a/src/crosvm.rs +++ b/src/crosvm.rs @@ -122,6 +122,7 @@ impl TouchDeviceOption { } } +#[derive(Eq, PartialEq)] pub enum SharedDirKind { FS, P9, diff --git a/src/linux.rs b/src/linux.rs index 57f9a8b04f..b7fecb5409 100644 --- a/src/linux.rs +++ b/src/linux.rs @@ -62,13 +62,14 @@ use base::{ use vm_control::{ BalloonControlCommand, BalloonControlRequestSocket, BalloonControlResponseSocket, BalloonControlResult, DiskControlCommand, DiskControlRequestSocket, DiskControlResponseSocket, - DiskControlResult, IrqSetup, UsbControlSocket, VcpuControl, VmControlResponseSocket, - VmIrqRequest, VmIrqRequestSocket, VmIrqResponse, VmIrqResponseSocket, - VmMemoryControlRequestSocket, VmMemoryControlResponseSocket, VmMemoryRequest, VmMemoryResponse, - VmMsyncRequest, VmMsyncRequestSocket, VmMsyncResponse, VmMsyncResponseSocket, VmRunMode, + DiskControlResult, FsMappingRequest, FsMappingRequestSocket, FsMappingResponseSocket, IrqSetup, + UsbControlSocket, VcpuControl, VmControlResponseSocket, VmIrqRequest, VmIrqRequestSocket, + VmIrqResponse, VmIrqResponseSocket, VmMemoryControlRequestSocket, + VmMemoryControlResponseSocket, VmMemoryRequest, VmMemoryResponse, VmMsyncRequest, + VmMsyncRequestSocket, VmMsyncResponse, VmMsyncResponseSocket, VmResponse, VmRunMode, }; #[cfg(all(target_arch = "x86_64", feature = "gdb"))] -use vm_control::{VcpuDebug, VcpuDebugStatus, VcpuDebugStatusMessage, VmRequest, VmResponse}; +use vm_control::{VcpuDebug, VcpuDebugStatus, VcpuDebugStatusMessage, VmRequest}; use 
vm_memory::{GuestAddress, GuestMemory}; #[cfg(all(target_arch = "x86_64", feature = "gdb"))] @@ -310,6 +311,7 @@ impl std::error::Error for Error {} type Result = std::result::Result; enum TaggedControlSocket { + Fs(FsMappingResponseSocket), Vm(VmControlResponseSocket), VmMemory(VmMemoryControlResponseSocket), VmIrq(VmIrqResponseSocket), @@ -320,6 +322,7 @@ impl AsRef for TaggedControlSocket { fn as_ref(&self) -> &UnixSeqpacket { use self::TaggedControlSocket::*; match &self { + Fs(ref socket) => socket.as_ref(), Vm(ref socket) => socket.as_ref(), VmMemory(ref socket) => socket.as_ref(), VmIrq(ref socket) => socket.as_ref(), @@ -1014,6 +1017,7 @@ fn create_fs_device( src: &Path, tag: &str, fs_cfg: virtio::fs::passthrough::Config, + device_socket: FsMappingRequestSocket, ) -> DeviceResult { let max_open_files = get_max_open_files()?; let j = if cfg.sandbox { @@ -1038,7 +1042,8 @@ fn create_fs_device( let features = virtio::base_features(cfg.protected_vm); // TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic // when num_queues > 1. - let dev = virtio::fs::Fs::new(features, tag, 1, fs_cfg).map_err(Error::FsDeviceNew)?; + let dev = + virtio::fs::Fs::new(features, tag, 1, fs_cfg, device_socket).map_err(Error::FsDeviceNew)?; Ok(VirtioDeviceStub { dev: Box::new(dev), @@ -1227,6 +1232,7 @@ fn create_virtio_devices( disk_device_sockets: &mut Vec, pmem_device_sockets: &mut Vec, map_request: Arc>>, + fs_device_sockets: &mut Vec, ) -> DeviceResult> { let mut devs = Vec::new(); @@ -1415,7 +1421,18 @@ fn create_virtio_devices( } = shared_dir; let dev = match kind { - SharedDirKind::FS => create_fs_device(cfg, uid_map, gid_map, src, tag, fs_cfg.clone())?, + SharedDirKind::FS => { + let device_socket = fs_device_sockets.remove(0); + create_fs_device( + cfg, + uid_map, + gid_map, + src, + tag, + fs_cfg.clone(), + device_socket, + )? 
+ } SharedDirKind::P9 => create_9p_device(cfg, uid_map, gid_map, src, tag, p9_cfg.clone())?, }; devs.push(dev); @@ -1436,6 +1453,7 @@ fn create_devices( balloon_device_socket: BalloonControlResponseSocket, disk_device_sockets: &mut Vec, pmem_device_sockets: &mut Vec, + fs_device_sockets: &mut Vec, usb_provider: HostBackendDeviceProvider, map_request: Arc>>, ) -> DeviceResult, Option)>> { @@ -1451,6 +1469,7 @@ fn create_devices( disk_device_sockets, pmem_device_sockets, map_request, + fs_device_sockets, )?; let mut pci_devices = Vec::new(); @@ -2264,6 +2283,19 @@ where let map_request: Arc>> = Arc::new(Mutex::new(None)); + let fs_count = cfg + .shared_dirs + .iter() + .filter(|sd| sd.kind == SharedDirKind::FS) + .count(); + let mut fs_device_sockets = Vec::with_capacity(fs_count); + for _ in 0..fs_count { + let (fs_host_socket, fs_device_socket) = + msg_socket::pair::().map_err(Error::CreateSocket)?; + control_sockets.push(TaggedControlSocket::Fs(fs_host_socket)); + fs_device_sockets.push(fs_device_socket); + } + let linux: RunnableLinuxVm<_, Vcpu, _> = Arch::build_vm( components, &cfg.serial_parameters, @@ -2282,6 +2314,7 @@ where balloon_device_socket, &mut disk_device_sockets, &mut pmem_device_sockets, + &mut fs_device_sockets, usb_provider, Arc::clone(&map_request), ) @@ -2747,6 +2780,22 @@ fn run_control match socket.recv() { + Ok(request) => { + let response = + request.execute(&mut linux.vm, &mut linux.resources); + if let Err(e) = socket.send(&response) { + error!("failed to send VmResponse: {}", e); + } + } + Err(e) => { + if let MsgError::BadRecvSize { actual: 0, .. } = e { + vm_control_indices_to_remove.push(index); + } else { + error!("failed to recv VmResponse: {}", e); + } + } + }, } } } diff --git a/sys_util/src/mmap.rs b/sys_util/src/mmap.rs index b703fa1ee1..e7f1239d3b 100644 --- a/sys_util/src/mmap.rs +++ b/sys_util/src/mmap.rs @@ -21,6 +21,8 @@ use crate::{errno, pagesize}; #[derive(Debug)] pub enum Error { + /// `add_fd_mapping` is not supported. 
+ AddFdMappingIsUnsupported, /// Requested memory out of range. InvalidAddress, /// Invalid argument provided when building mmap. @@ -35,6 +37,8 @@ pub enum Error { SystemCallFailed(errno::Error), /// Writing to memory failed ReadToMemory(io::Error), + /// `remove_mapping` is not supported + RemoveMappingIsUnsupported, /// Reading from memory failed WriteFromMemory(io::Error), } @@ -45,6 +49,7 @@ impl Display for Error { use self::Error::*; match self { + AddFdMappingIsUnsupported => write!(f, "`add_fd_mapping` is unsupported"), InvalidAddress => write!(f, "requested memory out of range"), InvalidArgument => write!(f, "invalid argument provided when creating mapping"), InvalidOffset => write!(f, "requested offset is out of range of off_t"), @@ -56,6 +61,7 @@ impl Display for Error { ), SystemCallFailed(e) => write!(f, "mmap system call failed: {}", e), ReadToMemory(e) => write!(f, "failed to read from file to memory: {}", e), + RemoveMappingIsUnsupported => write!(f, "`remove_mapping` is unsupported"), WriteFromMemory(e) => write!(f, "failed to write from memory to file: {}", e), } } @@ -134,6 +140,32 @@ pub unsafe trait MappedRegion: Send + Sync { /// Returns the size of the memory region in bytes. fn size(&self) -> usize; + + /// Maps `size` bytes starting at `fd_offset` bytes from within the given `fd` + /// at `offset` bytes from the start of the region with `prot` protections. + /// `offset` must be page aligned. + /// + /// # Arguments + /// * `offset` - Page aligned offset into the arena in bytes. + /// * `size` - Size of memory region in bytes. + /// * `fd` - File descriptor to mmap from. + /// * `fd_offset` - Offset in bytes from the beginning of `fd` to start the mmap. + /// * `prot` - Protection (e.g. readable/writable) of the memory region. 
+    fn add_fd_mapping(
+        &mut self,
+        _offset: usize,
+        _size: usize,
+        _fd: &dyn AsRawFd,
+        _fd_offset: u64,
+        _prot: Protection,
+    ) -> Result<()> {
+        // Default implementation: every argument is ignored (hence the leading underscores)
+        // and the call is reported as unsupported.
+        Err(Error::AddFdMappingIsUnsupported)
+    }
+
+    /// Removes the `size`-byte mapping starting at `offset`.
+    ///
+    /// The default implementation ignores its arguments and always fails with
+    /// `Error::RemoveMappingIsUnsupported`.
+    fn remove_mapping(&mut self, _offset: usize, _size: usize) -> Result<()> {
+        Err(Error::RemoveMappingIsUnsupported)
+    }
 }
 impl dyn MappedRegion {
@@ -844,6 +876,21 @@ unsafe impl MappedRegion for MemoryMappingArena {
     fn size(&self) -> usize {
         self.size
     }
+
+    /// Maps `fd` into this arena by forwarding all arguments, unchanged and in the same
+    /// order, to `add_fd_offset_protection`.
+    fn add_fd_mapping(
+        &mut self,
+        offset: usize,
+        size: usize,
+        fd: &dyn AsRawFd,
+        fd_offset: u64,
+        prot: Protection,
+    ) -> Result<()> {
+        self.add_fd_offset_protection(offset, size, fd, fd_offset, prot)
+    }
+
+    /// Removes a mapping from this arena by forwarding `offset` and `size` to `remove`.
+    fn remove_mapping(&mut self, offset: usize, size: usize) -> Result<()> {
+        self.remove(offset, size)
+    }
 }
 impl From for MemoryMappingArena {
diff --git a/vm_control/src/lib.rs b/vm_control/src/lib.rs
index a4cd152c54..7cc59baab0 100644
--- a/vm_control/src/lib.rs
+++ b/vm_control/src/lib.rs
@@ -17,6 +17,7 @@ use std::fmt::{self, Display};
 use std::fs::File;
 use std::io::{Seek, SeekFrom};
 use std::mem::ManuallyDrop;
+use std::os::raw::c_int;
 use std::result::Result as StdResult;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -24,9 +25,9 @@ use std::sync::Arc;
 use libc::{EINVAL, EIO, ENODEV};
 use base::{
-    error, AsRawDescriptor, Error as SysError, Event, ExternalMapping, FromRawDescriptor,
-    IntoRawDescriptor, MappedRegion, MemoryMappingBuilder, MmapError, RawDescriptor, Result,
-    SafeDescriptor,
+    error, AsRawDescriptor, Error as SysError, Event, ExternalMapping, Fd, FromRawDescriptor,
+    IntoRawDescriptor, MappedRegion, MemoryMappingArena, MemoryMappingBuilder, MmapError,
+    Protection, RawDescriptor, Result, SafeDescriptor,
 };
 use hypervisor::{IrqRoute, IrqSource, Vm};
 use msg_socket::{MsgError, MsgOnSocket, MsgReceiver, MsgResult, MsgSender, MsgSocket};
@@ -766,6 +767,112 @@ pub struct BatControl {
     pub control_socket: BatControlRequestSocket,
 }
+/// Request to set up or tear down memory mappings inside a shared memory region of the
+/// guest. A request is carried out by `FsMappingRequest::execute`, which answers with a
+/// `VmResponse`.
+#[derive(MsgOnSocket, Debug)]
+pub enum FsMappingRequest {
+    /// Create an anonymous memory mapping that spans the entire region described by `Alloc`.
+    ///
+    /// `execute` only handles `Alloc::PciBar`; any other `Alloc` variant is answered with
+    /// `EINVAL`.
+    AllocateSharedMemoryRegion(Alloc),
+    /// Create a memory mapping.
+    CreateMemoryMapping {
+        /// The slot for a MemoryMappingArena, previously returned by a response to an
+        /// `AllocateSharedMemoryRegion` request.
+        slot: u32,
+        /// The file descriptor that should be mapped.
+        fd: MaybeOwnedDescriptor,
+        /// The size of the mapping.
+        size: usize,
+        /// The offset into the file from where the mapping should start.
+        file_offset: u64,
+        /// The memory protection to be used for the mapping. Protections other than readable and
+        /// writable will be silently dropped.
+        prot: u32,
+        /// The offset into the shared memory region where the mapping should be placed.
+        mem_offset: usize,
+    },
+    /// Remove a memory mapping.
+    RemoveMemoryMapping {
+        /// The slot for a MemoryMappingArena.
+        slot: u32,
+        /// The offset into the shared memory region.
+        offset: usize,
+        /// The size of the mapping.
+        size: usize,
+    },
+}
+
+impl FsMappingRequest {
+    /// Carries out this request against `vm`, using `allocator` to resolve PCI BAR
+    /// allocations, and returns the response for the requester.
+    ///
+    /// * `AllocateSharedMemoryRegion(Alloc::PciBar { .. })` - builds a `MemoryMappingArena`
+    ///   as large as the BAR's allocation, registers it with the VM at the BAR's address and
+    ///   returns `VmResponse::RegisterMemory` with the page frame number and the slot.
+    /// * `CreateMemoryMapping` - maps the descriptor into the given slot via
+    ///   `vm.add_fd_mapping` and returns `VmResponse::Ok`.
+    /// * `RemoveMemoryMapping` - unmaps the range via `vm.remove_mapping` and returns
+    ///   `VmResponse::Ok`.
+    ///
+    /// Every failure is reported as `VmResponse::Err`; nothing is propagated by panicking
+    /// or by a `Result`.
+    pub fn execute(&self, vm: &mut dyn Vm, allocator: &mut SystemAllocator) -> VmResponse {
+        use self::FsMappingRequest::*;
+        match *self {
+            AllocateSharedMemoryRegion(Alloc::PciBar {
+                bus,
+                dev,
+                func,
+                bar,
+            }) => {
+                // Look up the allocation recorded for this BAR in the high-MMIO allocator.
+                // A BAR that the allocator does not know about is rejected with EINVAL.
+                match allocator
+                    .mmio_allocator(MmioType::High)
+                    .get(&Alloc::PciBar {
+                        bus,
+                        dev,
+                        func,
+                        bar,
+                    }) {
+                    Some((addr, length, _)) => {
+                        // Forward the underlying error of a failed system call as-is; every
+                        // other arena construction failure is reported as EINVAL.
+                        let arena = match MemoryMappingArena::new(*length as usize) {
+                            Ok(a) => a,
+                            Err(MmapError::SystemCallFailed(e)) => return VmResponse::Err(e),
+                            _ => return VmResponse::Err(SysError::new(EINVAL)),
+                        };
+
+                        // NOTE(review): the two `false` arguments are presumably the
+                        // read-only and dirty-logging flags - confirm against
+                        // `Vm::add_memory_region`.
+                        match vm.add_memory_region(
+                            GuestAddress(*addr),
+                            Box::new(arena),
+                            false,
+                            false,
+                        ) {
+                            Ok(slot) => VmResponse::RegisterMemory {
+                                // Shifting by 12 turns the address into a page frame number
+                                // for 4 KiB pages.
+                                pfn: addr >> 12,
+                                slot,
+                            },
+                            Err(e) => VmResponse::Err(e),
+                        }
+                    }
+                    None => VmResponse::Err(SysError::new(EINVAL)),
+                }
+            }
+            CreateMemoryMapping {
+                slot,
+                ref fd,
+                size,
+                file_offset,
+                prot,
+                mem_offset,
+            } => {
+                // Wrap the raw descriptor value in `Fd` so that it can be handed to
+                // `add_fd_mapping` by reference.
+                let raw_fd: Fd = Fd(fd.as_raw_descriptor());
+
+                match vm.add_fd_mapping(
+                    slot,
+                    mem_offset,
+                    size,
+                    &raw_fd,
+                    file_offset,
+                    // Keep only the read/write bits; any other requested protection bit is
+                    // dropped without an error, as documented on the `prot` field.
+                    Protection::from(prot as c_int & (libc::PROT_READ | libc::PROT_WRITE)),
+                ) {
+                    Ok(()) => VmResponse::Ok,
+                    Err(e) => VmResponse::Err(e),
+                }
+            }
+            RemoveMemoryMapping { slot, offset, size } => {
+                match vm.remove_mapping(slot, offset, size) {
+                    Ok(()) => VmResponse::Ok,
+                    Err(e) => VmResponse::Err(e),
+                }
+            }
+            // Reached only by `AllocateSharedMemoryRegion` carrying an `Alloc` that is not a
+            // `PciBar`; the other variants are matched above.
+            _ => VmResponse::Err(SysError::new(EINVAL)),
+        }
+    }
+}
+
 pub type BalloonControlRequestSocket = MsgSocket;
 pub type BalloonControlResponseSocket = MsgSocket;
@@ -775,6 +882,9 @@ pub type BatControlResponseSocket = MsgSocket;
 pub type DiskControlResponseSocket = MsgSocket;
+pub type FsMappingRequestSocket = MsgSocket;
+pub type FsMappingResponseSocket = MsgSocket;
+
 pub type UsbControlSocket = MsgSocket;
 pub type VmMemoryControlRequestSocket = MsgSocket;