devices: virtio: fs: DAX based shared memory support

Support virtio-fs's DAX (direct access) operation, which allows the guest
to directly access file pages.
Specifically, FUSE_SETUP_MAPPING and FUSE_REMOVE_MAPPING operations are
supported.

This feature can be used by specifying the `dax` option when mounting a file
system in the guest.

The DAX option improved file I/O performance in most cases.
In Fio tests, both read and write scores were improved by 1.3-14x depending on
the test case.
In Blogbench tests, which create many small files, DAX improved the write score
by 1.5x while the read score was reduced to ~23% (20391 -> 4593).

Here is an excerpt of results:
Fio
* seq_read:     10.2x (143528 -> 1464911)
* seq_write:     3.3x (61253 -> 896791)
* rand_read:    11.6x (138753 -> 1612739)
* rand_write:   14.6x (61253 -> 896791)
* surfing_read:  1.3x (98473 -> 127907)
* surfing_write: 1.3x (83309 -> 108089)

Blogbench
* read:  0.23x (20391 -> 4593)
* write: 1.50x (248 -> 373)

BUG=b:147341783
TEST=Run vm.{Blogbench, Fio} with CL:2291856

Change-Id: I4a47c601412ed32d926de6304337e1594252d258
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/2108315
Tested-by: kokoro <noreply+kokoro@google.com>
Tested-by: Keiichi Watanabe <keiichiw@chromium.org>
Commit-Queue: Keiichi Watanabe <keiichiw@chromium.org>
Reviewed-by: Chirantan Ekbote <chirantan@chromium.org>
This commit is contained in:
Keiichi Watanabe 2020-11-17 17:58:35 +09:00 committed by Commit Bot
parent 76ab3a4eb0
commit eefe7fb19e
15 changed files with 761 additions and 31 deletions

1
Cargo.lock generated
View file

@ -534,6 +534,7 @@ version = "0.1.0"
dependencies = [
"data_model",
"futures",
"getopts",
"intrusive-collections",
"libc",
"log",

View file

@ -5,14 +5,23 @@
use std::fmt;
use std::io;
use std::mem;
use std::sync::Arc;
use std::sync::{Arc, Mutex};
use std::thread;
use base::{error, warn, Error as SysError, Event, RawDescriptor};
use base::{error, warn, AsRawDescriptor, Error as SysError, Event, RawDescriptor};
use data_model::{DataInit, Le32};
use msg_socket::{MsgReceiver, MsgSender};
use resources::Alloc;
use vm_control::{FsMappingRequest, FsMappingRequestSocket, VmResponse};
use vm_memory::GuestMemory;
use crate::virtio::{copy_config, DescriptorError, Interrupt, Queue, VirtioDevice, TYPE_FS};
use crate::pci::{
PciAddress, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability,
};
use crate::virtio::{
copy_config, DescriptorError, Interrupt, PciCapabilityType, Queue, VirtioDevice,
VirtioPciShmCap, TYPE_FS,
};
mod multikey;
pub mod passthrough;
@ -26,6 +35,13 @@ use worker::Worker;
// The fs device does not have a fixed number of queues.
const QUEUE_SIZE: u16 = 1024;
/// Index of the PCI BAR used for the device's shared memory region.
const FS_BAR_NUM: u8 = 4;
/// Offset handed to the shared memory capability (`VirtioPciShmCap`); presumably the offset of
/// the region within the BAR.
const FS_BAR_OFFSET: u64 = 0;
/// Size in bytes of the BAR and of the shared memory capability: 2^33 (8 GiB).
const FS_BAR_SIZE: u64 = 1 << 33;
/// Defined in kernel/include/uapi/linux/virtio_fs.h.
const VIRTIO_FS_SHMCAP_ID_CACHE: u8 = 0;
/// The maximum allowable length of the tag used to identify a specific virtio-fs device.
pub const FS_MAX_TAG_LEN: usize = 36;
@ -105,6 +121,8 @@ pub struct Fs {
queue_sizes: Box<[u16]>,
avail_features: u64,
acked_features: u64,
pci_bar: Option<Alloc>,
socket: Option<FsMappingRequestSocket>,
workers: Vec<(Event, thread::JoinHandle<Result<()>>)>,
}
@ -114,6 +132,7 @@ impl Fs {
tag: &str,
num_workers: usize,
fs_cfg: passthrough::Config,
socket: FsMappingRequestSocket,
) -> Result<Fs> {
if tag.len() > FS_MAX_TAG_LEN {
return Err(Error::TagTooLong(tag.len()));
@ -138,6 +157,8 @@ impl Fs {
queue_sizes: vec![QUEUE_SIZE; num_queues].into_boxed_slice(),
avail_features: base_features,
acked_features: 0,
pci_bar: None,
socket: Some(socket),
workers: Vec::with_capacity(num_workers + 1),
})
}
@ -164,10 +185,16 @@ impl Fs {
impl VirtioDevice for Fs {
fn keep_rds(&self) -> Vec<RawDescriptor> {
self.fs
let mut fds = self
.fs
.as_ref()
.map(PassthroughFs::keep_rds)
.unwrap_or_else(Vec::new)
.unwrap_or_else(Vec::new);
if let Some(rd) = self.socket.as_ref().map(|s| s.as_raw_descriptor()) {
fds.push(rd);
}
fds
}
fn device_type(&self) -> u32 {
@ -213,7 +240,24 @@ impl VirtioDevice for Fs {
let server = Arc::new(Server::new(fs));
let irq = Arc::new(interrupt);
let socket = self.socket.take().expect("missing mapping socket");
// Create the shared memory region now before we start processing requests.
let request = FsMappingRequest::AllocateSharedMemoryRegion(
self.pci_bar.as_ref().cloned().expect("No pci_bar"),
);
socket
.send(&request)
.expect("failed to send allocation message");
let slot = match socket.recv() {
Ok(VmResponse::RegisterMemory { pfn: _, slot }) => slot,
Ok(VmResponse::Err(e)) => panic!("failed to allocate shared memory region: {}", e),
r => panic!(
"unexpected response to allocate shared memory region: {:?}",
r
),
};
let socket = Arc::new(Mutex::new(socket));
let mut watch_resample_event = true;
for (idx, (queue, evt)) in queues.into_iter().zip(queue_evts.into_iter()).enumerate() {
let (self_kill_evt, kill_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e)))
@ -229,11 +273,12 @@ impl VirtioDevice for Fs {
let mem = guest_mem.clone();
let server = server.clone();
let irq = irq.clone();
let socket = Arc::clone(&socket);
let worker_result = thread::Builder::new()
.name(format!("virtio-fs worker {}", idx))
.spawn(move || {
let mut worker = Worker::new(mem, queue, server, irq);
let mut worker = Worker::new(mem, queue, server, irq, socket, slot);
worker.run(evt, kill_evt, watch_resample_event)
});
@ -251,6 +296,32 @@ impl VirtioDevice for Fs {
}
}
}
/// Describes the PCI BAR that backs the shared memory region.
///
/// As a side effect, remembers the matching `Alloc::PciBar` for `address` in `self.pci_bar`.
fn get_device_bars(&mut self, address: PciAddress) -> Vec<PciBarConfiguration> {
    let shm_bar = Alloc::PciBar {
        bus: address.bus,
        dev: address.dev,
        func: address.func,
        bar: FS_BAR_NUM,
    };
    self.pci_bar = Some(shm_bar);

    let bar_config = PciBarConfiguration::new(
        usize::from(FS_BAR_NUM),
        FS_BAR_SIZE,
        PciBarRegionType::Memory64BitRegion,
        PciBarPrefetchable::NotPrefetchable,
    );
    vec![bar_config]
}
/// Returns the single shared memory capability, built from the `FS_BAR_*` constants and
/// `VIRTIO_FS_SHMCAP_ID_CACHE`.
fn get_device_caps(&self) -> Vec<Box<dyn PciCapability>> {
    let shm_cap = VirtioPciShmCap::new(
        PciCapabilityType::SharedMemoryConfig,
        FS_BAR_NUM,
        FS_BAR_OFFSET,
        FS_BAR_SIZE,
        VIRTIO_FS_SHMCAP_ID_CACHE,
    );
    let boxed_cap: Box<dyn PciCapability> = Box::new(shm_cap);
    vec![boxed_cap]
}
}
impl Drop for Fs {

View file

@ -25,8 +25,10 @@ use base::{
use data_model::DataInit;
use fuse::filesystem::{
Context, DirectoryIterator, Entry, FileSystem, FsOptions, GetxattrReply, IoctlFlags,
IoctlReply, ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, ROOT_ID,
IoctlReply, ListxattrReply, OpenOptions, RemoveMappingOne, SetattrValid, ZeroCopyReader,
ZeroCopyWriter, ROOT_ID,
};
use fuse::Mapper;
use rand_ish::SimpleRng;
use sync::Mutex;
@ -2306,6 +2308,40 @@ impl FileSystem for PassthroughFs {
Err(io::Error::last_os_error())
}
}
/// Maps part of the file behind `inode` into the shared memory window through `mapper`.
///
/// # Arguments
///
/// * `inode` - Inode whose file should be mapped; it is re-opened for the mapping.
/// * `file_offset` - Offset into the file at which the mapping starts.
/// * `mem_offset` - Offset into the memory window at which the file is mapped.
/// * `size` - Length of the mapping in bytes.
/// * `prot` - Protection bits (`libc::PROT_READ` and/or `libc::PROT_WRITE`).
/// * `mapper` - Object that performs the actual mapping.
///
/// # Errors
///
/// Returns `EINVAL` when `prot` requests neither read nor write access. Errors from looking up
/// or opening the inode and from `mapper.map` are returned unchanged.
fn set_up_mapping<M: Mapper>(
    &self,
    _ctx: Context,
    inode: Self::Inode,
    _handle: Self::Handle,
    file_offset: u64,
    mem_offset: u64,
    size: usize,
    prot: u32,
    mapper: M,
) -> io::Result<()> {
    let read = prot & libc::PROT_READ as u32 != 0;
    let write = prot & libc::PROT_WRITE as u32 != 0;
    let flags = match (read, write) {
        (true, true) => libc::O_RDWR,
        (true, false) => libc::O_RDONLY,
        // mmap(2) requires the descriptor to be open for reading even for a write-only
        // mapping, so a descriptor opened with O_WRONLY could never be mapped.
        (false, true) => libc::O_RDWR,
        (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
    };

    let data = self.find_inode(inode)?;
    let file = self.open_inode(&data, flags | libc::O_NONBLOCK)?;
    mapper.map(mem_offset, size, &file, file_offset, prot)
}
/// Unmaps every range listed in `msgs`, in order, stopping at the first failure.
fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
    msgs.iter()
        .try_for_each(|entry| mapper.unmap(entry.moffset, entry.len))
}
}
#[cfg(test)]

View file

@ -2,12 +2,16 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use std::convert::TryInto;
use std::fs::File;
use std::io;
use std::sync::Arc;
use std::os::unix::io::AsRawFd;
use std::sync::{Arc, Mutex};
use base::{error, Event, PollToken, WaitContext};
use fuse::filesystem::{FileSystem, ZeroCopyReader, ZeroCopyWriter};
use msg_socket::{MsgReceiver, MsgSender};
use vm_control::{FsMappingRequest, FsMappingRequestSocket, MaybeOwnedDescriptor, VmResponse};
use vm_memory::GuestMemory;
use crate::virtio::fs::{Error, Result};
@ -40,11 +44,92 @@ impl ZeroCopyWriter for Writer {
self.write_from_at(f, count, off)
}
}
/// Forwards DAX map/unmap requests over a shared `FsMappingRequestSocket`.
struct Mapper {
// Request socket; wrapped in `Arc<Mutex<..>>` so it can be shared and locked per request.
socket: Arc<Mutex<FsMappingRequestSocket>>,
// Slot value placed in every mapping request built by this mapper.
slot: u32,
}
impl Mapper {
fn new(socket: Arc<Mutex<FsMappingRequestSocket>>, slot: u32) -> Self {
Self { socket, slot }
}
fn process_request(&self, request: &FsMappingRequest) -> io::Result<()> {
let socket = self.socket.lock().map_err(|e| {
error!("failed to lock socket: {}", e);
io::Error::from_raw_os_error(libc::EINVAL)
})?;
socket.send(request).map_err(|e| {
error!("failed to send request {:?}: {}", request, e);
io::Error::from_raw_os_error(libc::EINVAL)
})?;
match socket.recv() {
Ok(VmResponse::Ok) => Ok(()),
Ok(VmResponse::Err(e)) => Err(e.into()),
r => {
error!("failed to process {:?}: {:?}", request, r);
Err(io::Error::from_raw_os_error(libc::EIO))
}
}
}
}
impl fuse::Mapper for Mapper {
fn map(
&self,
mem_offset: u64,
size: usize,
fd: &dyn AsRawFd,
file_offset: u64,
prot: u32,
) -> io::Result<()> {
let mem_offset: usize = mem_offset.try_into().map_err(|e| {
error!("mem_offset {} is too big: {}", mem_offset, e);
io::Error::from_raw_os_error(libc::EINVAL)
})?;
let request = FsMappingRequest::CreateMemoryMapping {
slot: self.slot,
fd: MaybeOwnedDescriptor::Borrowed(fd.as_raw_fd()),
size,
file_offset,
prot,
mem_offset,
};
self.process_request(&request)
}
fn unmap(&self, offset: u64, size: u64) -> io::Result<()> {
let offset: usize = offset.try_into().map_err(|e| {
error!("offset {} is too big: {}", offset, e);
io::Error::from_raw_os_error(libc::EINVAL)
})?;
let size: usize = size.try_into().map_err(|e| {
error!("size {} is too big: {}", size, e);
io::Error::from_raw_os_error(libc::EINVAL)
})?;
let request = FsMappingRequest::RemoveMemoryMapping {
slot: self.slot,
offset,
size,
};
self.process_request(&request)
}
}
/// State needed to service one virtio-fs queue.
pub struct Worker<F: FileSystem + Sync> {
mem: GuestMemory,
queue: Queue,
server: Arc<fuse::Server<F>>,
irq: Arc<Interrupt>,
// Socket and slot used to build the `Mapper` that is passed to the server for each message.
socket: Arc<Mutex<FsMappingRequestSocket>>,
slot: u32,
}
impl<F: FileSystem + Sync> Worker<F> {
@ -53,24 +138,30 @@ impl<F: FileSystem + Sync> Worker<F> {
queue: Queue,
server: Arc<fuse::Server<F>>,
irq: Arc<Interrupt>,
socket: Arc<Mutex<FsMappingRequestSocket>>,
slot: u32,
) -> Worker<F> {
Worker {
mem,
queue,
server,
irq,
socket,
slot,
}
}
fn process_queue(&mut self) -> Result<()> {
let mut needs_interrupt = false;
let mapper = Mapper::new(Arc::clone(&self.socket), self.slot);
while let Some(avail_desc) = self.queue.pop(&self.mem) {
let reader = Reader::new(self.mem.clone(), avail_desc.clone())
.map_err(Error::InvalidDescriptorChain)?;
let writer = Writer::new(self.mem.clone(), avail_desc.clone())
.map_err(Error::InvalidDescriptorChain)?;
let total = self.server.handle_message(reader, writer)?;
let total = self.server.handle_message(reader, writer, &mapper)?;
self.queue
.add_used(&self.mem, avail_desc.index, total as u32);

View file

@ -11,7 +11,10 @@ use std::time::Duration;
use crate::sys;
pub use crate::sys::{FsOptions, IoctlFlags, IoctlIovec, OpenOptions, SetattrValid, ROOT_ID};
use crate::server::Mapper;
pub use crate::sys::{
FsOptions, IoctlFlags, IoctlIovec, OpenOptions, RemoveMappingOne, SetattrValid, ROOT_ID,
};
const MAX_BUFFER_SIZE: u32 = 1 << 20;
@ -1158,4 +1161,37 @@ pub trait FileSystem {
) -> io::Result<usize> {
Err(io::Error::from_raw_os_error(libc::ENOSYS))
}
/// Set up memory mappings.
///
/// Used to set up file mappings in DAX window. The default implementation returns `ENOSYS`.
///
/// # Arguments
///
/// * `file_offset` - Offset into the file to start the mapping.
/// * `mem_offset` - Offset in Memory Window.
/// * `size` - Length of mapping required.
/// * `flags` - Protection bits for the mapping. The server in this crate translates the
///   request's `FUSE_SETUPMAPPING_FLAG_*` bits into `libc::PROT_READ` / `libc::PROT_WRITE`
///   before calling this method.
/// * `mapper` - Mapper object which performs the mapping.
fn set_up_mapping<M: Mapper>(
&self,
ctx: Context,
inode: Self::Inode,
handle: Self::Handle,
file_offset: u64,
mem_offset: u64,
size: usize,
flags: u32,
mapper: M,
) -> io::Result<()> {
Err(io::Error::from_raw_os_error(libc::ENOSYS))
}
/// Remove memory mappings.
///
/// Used to tear down file mappings in DAX window. This method must be supported when
/// `set_up_mapping` is supported. The default implementation returns `ENOSYS`.
///
/// # Arguments
///
/// * `msgs` - Ranges of the memory window (offset and length) to unmap.
/// * `mapper` - Mapper object which performs the unmapping.
fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
Err(io::Error::from_raw_os_error(libc::ENOSYS))
}
}

View file

@ -17,7 +17,7 @@ pub mod sys;
pub mod worker;
pub use mount::mount;
pub use server::{Reader, Server, Writer};
pub use server::{Mapper, Reader, Server, Writer};
/// Errors that may occur during the creation or operation of an Fs device.
#[derive(ThisError, Debug)]

View file

@ -2,12 +2,14 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use std::convert::TryInto;
use std::ffi::CStr;
use std::io;
use std::mem::{size_of, MaybeUninit};
use std::os::unix::io::AsRawFd;
use std::time::Duration;
use base::error;
use base::{error, pagesize};
use data_model::DataInit;
use crate::filesystem::{
@ -45,6 +47,58 @@ pub trait Writer: io::Write {
fn has_sufficient_buffer(&self, size: u32) -> bool;
}
/// A trait for memory mapping for DAX.
///
/// For some transports (like virtio) it may be possible to share a region of memory with the
/// FUSE kernel driver so that it can access file contents directly without issuing read or
/// write requests. In this case the driver will instead send requests to map a section of a
/// file into the shared memory region.
pub trait Mapper {
/// Maps `size` bytes starting at `file_offset` bytes from within the given `fd` at `mem_offset`
/// bytes from the start of the memory region with `prot` protections. `mem_offset` must be
/// page aligned.
///
/// # Arguments
/// * `mem_offset` - Page aligned offset into the memory region in bytes.
/// * `size` - Size of memory region in bytes.
/// * `fd` - File descriptor to mmap from.
/// * `file_offset` - Offset in bytes from the beginning of `fd` to start the mmap.
/// * `prot` - Protection (e.g. `libc::PROT_READ`) of the memory region.
fn map(
&self,
mem_offset: u64,
size: usize,
fd: &dyn AsRawFd,
file_offset: u64,
prot: u32,
) -> io::Result<()>;
/// Unmaps `size` bytes at `offset` bytes from the start of the memory region. `offset` must be
/// page aligned.
///
/// # Arguments
/// * `offset` - Page aligned offset into the memory region in bytes.
/// * `size` - Size of the range to unmap in bytes.
fn unmap(&self, offset: u64, size: u64) -> io::Result<()>;
}
/// Lets a shared reference to a mapper be used wherever a mapper is expected; both methods
/// forward to the referenced value.
impl<M: Mapper> Mapper for &M {
    fn map(
        &self,
        mem_offset: u64,
        size: usize,
        fd: &dyn AsRawFd,
        file_offset: u64,
        prot: u32,
    ) -> io::Result<()> {
        M::map(*self, mem_offset, size, fd, file_offset, prot)
    }

    fn unmap(&self, offset: u64, size: u64) -> io::Result<()> {
        M::unmap(*self, offset, size)
    }
}
/// FUSE request server: decodes incoming messages and dispatches them to the wrapped
/// `FileSystem` implementation.
pub struct Server<F: FileSystem + Sync> {
fs: F,
}
@ -54,13 +108,13 @@ impl<F: FileSystem + Sync> Server<F> {
Server { fs }
}
pub fn handle_message<R: Reader + ZeroCopyReader, W: Writer + ZeroCopyWriter>(
pub fn handle_message<R: Reader + ZeroCopyReader, W: Writer + ZeroCopyWriter, M: Mapper>(
&self,
mut r: R,
w: W,
mapper: M,
) -> Result<usize> {
let in_header = InHeader::from_reader(&mut r).map_err(Error::DecodeMessage)?;
if in_header.len > self.fs.max_buffer_size() {
return reply_error(
io::Error::from_raw_os_error(libc::ENOMEM),
@ -115,7 +169,9 @@ impl<F: FileSystem + Sync> Server<F> {
Some(Opcode::Lseek) => self.lseek(in_header, r, w),
Some(Opcode::CopyFileRange) => self.copy_file_range(in_header, r, w),
Some(Opcode::ChromeOsTmpfile) => self.chromeos_tmpfile(in_header, r, w),
Some(Opcode::SetUpMapping) | Some(Opcode::RemoveMapping) | None => reply_error(
Some(Opcode::SetUpMapping) => self.set_up_mapping(in_header, r, w, mapper),
Some(Opcode::RemoveMapping) => self.remove_mapping(in_header, r, w, mapper),
None => reply_error(
io::Error::from_raw_os_error(libc::ENOSYS),
in_header.unique,
w,
@ -905,7 +961,8 @@ impl<F: FileSystem + Sync> Server<F> {
| FsOptions::HAS_IOCTL_DIR
| FsOptions::DO_READDIRPLUS
| FsOptions::READDIRPLUS_AUTO
| FsOptions::ATOMIC_O_TRUNC;
| FsOptions::ATOMIC_O_TRUNC
| FsOptions::MAP_ALIGNMENT;
let capable = FsOptions::from_bits_truncate(flags);
@ -928,6 +985,7 @@ impl<F: FileSystem + Sync> Server<F> {
congestion_threshold: (::std::u16::MAX / 4) * 3,
max_write: self.fs.max_buffer_size(),
time_gran: 1, // nanoseconds
map_alignment: pagesize().trailing_zeros() as u16,
..Default::default()
};
@ -1452,6 +1510,101 @@ impl<F: FileSystem + Sync> Server<F> {
Err(e) => reply_error(e, in_header.unique, w),
}
}
fn set_up_mapping<R, W, M>(
&self,
in_header: InHeader,
mut r: R,
w: W,
mapper: M,
) -> Result<usize>
where
R: Reader,
W: Writer,
M: Mapper,
{
let SetUpMappingIn {
fh,
foffset,
len,
flags,
moffset,
} = SetUpMappingIn::from_reader(&mut r).map_err(Error::DecodeMessage)?;
let flags = SetUpMappingFlags::from_bits_truncate(flags);
let mut prot = 0;
if flags.contains(SetUpMappingFlags::READ) {
prot |= libc::PROT_READ as u32;
}
if flags.contains(SetUpMappingFlags::WRITE) {
prot |= libc::PROT_WRITE as u32;
}
let size = if let Ok(s) = len.try_into() {
s
} else {
return reply_error(
io::Error::from_raw_os_error(libc::EOVERFLOW),
in_header.unique,
w,
);
};
match self.fs.set_up_mapping(
Context::from(in_header),
in_header.nodeid.into(),
fh.into(),
foffset,
moffset,
size,
prot,
mapper,
) {
Ok(()) => reply_ok(None::<u8>, None, in_header.unique, w),
Err(e) => {
error!("set_up_mapping failed: {}", e);
reply_error(e, in_header.unique, w)
}
}
}
fn remove_mapping<R, W, M>(
&self,
in_header: InHeader,
mut r: R,
w: W,
mapper: M,
) -> Result<usize>
where
R: Reader,
W: Writer,
M: Mapper,
{
let RemoveMappingIn { count } =
RemoveMappingIn::from_reader(&mut r).map_err(Error::DecodeMessage)?;
// `FUSE_REMOVEMAPPING_MAX_ENTRY` is defined as
// `PAGE_SIZE / sizeof(struct fuse_removemapping_one)` in /kernel/include/uapi/linux/fuse.h.
let max_entry = pagesize() / std::mem::size_of::<RemoveMappingOne>();
if max_entry < count as usize {
return reply_error(
io::Error::from_raw_os_error(libc::EINVAL),
in_header.unique,
w,
);
}
let mut msgs = Vec::with_capacity(count as usize);
for _ in 0..(count as usize) {
msgs.push(RemoveMappingOne::from_reader(&mut r).map_err(Error::DecodeMessage)?);
}
match self.fs.remove_mapping(&msgs, mapper) {
Ok(()) => reply_ok(None::<u8>, None, in_header.unique, w),
Err(e) => reply_error(e, in_header.unique, w),
}
}
}
fn retry_ioctl<W: Writer>(

View file

@ -362,6 +362,13 @@ bitflags! {
const EXPLICIT_INVAL_DATA = EXPLICIT_INVAL_DATA;
const SECURITY_CONTEXT = SECURITY_CONTEXT;
/// Indicates that the `map_alignment` field of the `InitOut` struct is valid.
///
/// The `MAP_ALIGNMENT` field is used by the FUSE kernel driver to ensure that its DAX
/// mapping requests are pagesize-aligned. This field is automatically set by the server and
/// this feature is enabled by default.
const MAP_ALIGNMENT = MAP_ALIGNMENT;
}
}
@ -448,6 +455,18 @@ pub const FUSE_COMPAT_STATFS_SIZE: u32 = 48;
pub const FUSE_COMPAT_INIT_OUT_SIZE: u32 = 8;
pub const FUSE_COMPAT_22_INIT_OUT_SIZE: u32 = 24;
// Bit values understood in the `flags` field of a set-up-mapping request.
const SETUPMAPPING_FLAG_WRITE: u64 = 1;
const SETUPMAPPING_FLAG_READ: u64 = 2;
bitflags! {
/// Access flags of a set-up-mapping request (`SetUpMappingIn::flags`).
pub struct SetUpMappingFlags: u64 {
/// Create writable mapping.
const WRITE = SETUPMAPPING_FLAG_WRITE;
/// Create readable mapping.
const READ = SETUPMAPPING_FLAG_READ;
}
}
// Message definitions follow. It is safe to implement DataInit for all of these
// because they are POD types.
@ -1151,3 +1170,37 @@ pub struct CopyFileRangeIn {
pub flags: u64,
}
unsafe impl DataInit for CopyFileRangeIn {}
/// Body of a set-up-mapping request.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct SetUpMappingIn {
/// An already open handle.
pub fh: u64,
/// Offset into the file to start the mapping.
pub foffset: u64,
/// Length of mapping required.
pub len: u64,
/// Flags, FUSE_SETUPMAPPING_FLAG_*.
pub flags: u64,
/// Offset in Memory Window.
pub moffset: u64,
}
// The struct is `repr(C)` and consists only of `u64` fields.
unsafe impl DataInit for SetUpMappingIn {}
/// Header of a remove-mapping request.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct RemoveMappingIn {
/// Number of `fuse_removemapping_one` entries that follow.
pub count: u32,
}
// The struct is `repr(C)` and consists of a single `u32` field.
unsafe impl DataInit for RemoveMappingIn {}
/// One range to unmap in a remove-mapping request.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone)]
pub struct RemoveMappingOne {
/// Offset into the DAX window at which to start the unmapping.
pub moffset: u64,
/// Length of the range in bytes.
pub len: u64,
}
// The struct is `repr(C)` and consists only of `u64` fields.
unsafe impl DataInit for RemoveMappingOne {}

View file

@ -6,9 +6,10 @@ use std::fs::File;
use std::io::{self, BufRead, BufReader, Cursor, Read, Write};
use std::mem::size_of;
use std::os::unix::fs::FileExt;
use std::os::unix::io::AsRawFd;
use crate::filesystem::{FileSystem, ZeroCopyReader, ZeroCopyWriter};
use crate::server::{Reader, Server, Writer};
use crate::server::{Mapper, Reader, Server, Writer};
use crate::sys;
use crate::{Error, Result};
@ -111,6 +112,31 @@ impl ZeroCopyWriter for DevFuseWriter<'_> {
}
}
/// Stateless mapper handed to the server by the /dev/fuse message loop.
struct DevFuseMapper;

impl DevFuseMapper {
    /// Creates the (fieldless) mapper.
    fn new() -> Self {
        DevFuseMapper
    }
}
/// Both operations always fail with `EOPNOTSUPP` for this mapper.
impl Mapper for DevFuseMapper {
// Always returns `EOPNOTSUPP`; all arguments are ignored.
fn map(
&self,
_mem_offset: u64,
_size: usize,
_fd: &dyn AsRawFd,
_file_offset: u64,
_prot: u32,
) -> io::Result<()> {
Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP))
}
// Always returns `EOPNOTSUPP`; all arguments are ignored.
fn unmap(&self, _offset: u64, _size: u64) -> io::Result<()> {
Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP))
}
}
/// Start the FUSE message handling loop. Returns when an error happens.
pub fn start_message_loop<F: FileSystem + Sync>(
dev_fuse: File,
@ -129,8 +155,9 @@ pub fn start_message_loop<F: FileSystem + Sync>(
loop {
let dev_fuse_reader = DevFuseReader::new(&mut buf_reader);
let dev_fuse_writer = DevFuseWriter::new(&mut wfile, &mut write_buf);
let dev_fuse_mapper = DevFuseMapper::new();
if let Err(e) = server.handle_message(dev_fuse_reader, dev_fuse_writer) {
if let Err(e) = server.handle_message(dev_fuse_reader, dev_fuse_writer, &dev_fuse_mapper) {
return Err(e);
}
}

View file

@ -18,6 +18,7 @@ use std::collections::{BTreeMap, BinaryHeap};
use std::convert::TryFrom;
use std::mem::{size_of, ManuallyDrop};
use std::os::raw::{c_char, c_int, c_ulong, c_void};
use std::os::unix::io::AsRawFd;
use std::ptr::copy_nonoverlapping;
use std::sync::atomic::AtomicU64;
use std::sync::Arc;
@ -29,8 +30,8 @@ use libc::{
use base::{
block_signal, errno_result, error, ioctl, ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val,
pagesize, signal, unblock_signal, AsRawDescriptor, Error, Event, FromRawDescriptor,
MappedRegion, MemoryMapping, MemoryMappingBuilder, MmapError, RawDescriptor, Result,
SafeDescriptor,
MappedRegion, MemoryMapping, MemoryMappingBuilder, MmapError, Protection, RawDescriptor,
Result, SafeDescriptor,
};
use data_model::vec_with_array_field;
use kvm_sys::*;
@ -560,6 +561,36 @@ impl Vm for KvmVm {
fn set_pvclock(&self, state: &ClockState) -> Result<()> {
self.set_pvclock_arch(state)
}
fn add_fd_mapping(
&mut self,
slot: u32,
offset: usize,
size: usize,
fd: &dyn AsRawFd,
fd_offset: u64,
prot: Protection,
) -> Result<()> {
let mut regions = self.mem_regions.lock();
let region = regions.get_mut(&slot).ok_or(Error::new(EINVAL))?;
match region.add_fd_mapping(offset, size, fd, fd_offset, prot) {
Ok(()) => Ok(()),
Err(MmapError::SystemCallFailed(e)) => Err(e),
Err(_) => Err(Error::new(EIO)),
}
}
fn remove_mapping(&mut self, slot: u32, offset: usize, size: usize) -> Result<()> {
let mut regions = self.mem_regions.lock();
let region = regions.get_mut(&slot).ok_or(Error::new(EINVAL))?;
match region.remove_mapping(offset, size) {
Ok(()) => Ok(()),
Err(MmapError::SystemCallFailed(e)) => Err(e),
Err(_) => Err(Error::new(EIO)),
}
}
}
impl AsRawDescriptor for KvmVm {

View file

@ -11,8 +11,9 @@ pub mod kvm;
pub mod x86_64;
use std::os::raw::c_int;
use std::os::unix::io::AsRawFd;
use base::{Event, MappedRegion, RawDescriptor, Result, SafeDescriptor};
use base::{Event, MappedRegion, Protection, RawDescriptor, Result, SafeDescriptor};
use msg_socket::MsgOnSocket;
use vm_memory::{GuestAddress, GuestMemory};
@ -139,6 +140,29 @@ pub trait Vm: Send {
/// Sets the current timestamp of the paravirtual clock as seen by the current guest.
/// Only works on VMs that support `VmCap::PvClock`.
fn set_pvclock(&self, state: &ClockState) -> Result<()>;
/// Maps `size` bytes starting at `fd_offset` bytes from within the given `fd`
/// at `offset` bytes from the start of the arena with `prot` protections.
/// `offset` must be page aligned.
///
/// # Arguments
/// * `slot` - Identifies the memory region (arena) to map into.
/// * `offset` - Page aligned offset into the arena in bytes.
/// * `size` - Size of memory region in bytes.
/// * `fd` - File descriptor to mmap from.
/// * `fd_offset` - Offset in bytes from the beginning of `fd` to start the mmap.
/// * `prot` - Protection (e.g. readable/writable) of the memory region.
fn add_fd_mapping(
&mut self,
slot: u32,
offset: usize,
size: usize,
fd: &dyn AsRawFd,
fd_offset: u64,
prot: Protection,
) -> Result<()>;
/// Remove `size`-byte mapping starting at `offset`.
///
/// # Arguments
/// * `slot` - Identifies the memory region to remove the mapping from.
/// * `offset` - Offset into that region in bytes.
/// * `size` - Size of the mapping to remove in bytes.
fn remove_mapping(&mut self, slot: u32, offset: usize, size: usize) -> Result<()>;
}
/// A unique fingerprint for a particular `VcpuRunHandle`, used in `Vcpu` impls to ensure the

View file

@ -122,6 +122,7 @@ impl TouchDeviceOption {
}
}
#[derive(Eq, PartialEq)]
pub enum SharedDirKind {
FS,
P9,

View file

@ -62,13 +62,14 @@ use base::{
use vm_control::{
BalloonControlCommand, BalloonControlRequestSocket, BalloonControlResponseSocket,
BalloonControlResult, DiskControlCommand, DiskControlRequestSocket, DiskControlResponseSocket,
DiskControlResult, IrqSetup, UsbControlSocket, VcpuControl, VmControlResponseSocket,
VmIrqRequest, VmIrqRequestSocket, VmIrqResponse, VmIrqResponseSocket,
VmMemoryControlRequestSocket, VmMemoryControlResponseSocket, VmMemoryRequest, VmMemoryResponse,
VmMsyncRequest, VmMsyncRequestSocket, VmMsyncResponse, VmMsyncResponseSocket, VmRunMode,
DiskControlResult, FsMappingRequest, FsMappingRequestSocket, FsMappingResponseSocket, IrqSetup,
UsbControlSocket, VcpuControl, VmControlResponseSocket, VmIrqRequest, VmIrqRequestSocket,
VmIrqResponse, VmIrqResponseSocket, VmMemoryControlRequestSocket,
VmMemoryControlResponseSocket, VmMemoryRequest, VmMemoryResponse, VmMsyncRequest,
VmMsyncRequestSocket, VmMsyncResponse, VmMsyncResponseSocket, VmResponse, VmRunMode,
};
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
use vm_control::{VcpuDebug, VcpuDebugStatus, VcpuDebugStatusMessage, VmRequest, VmResponse};
use vm_control::{VcpuDebug, VcpuDebugStatus, VcpuDebugStatusMessage, VmRequest};
use vm_memory::{GuestAddress, GuestMemory};
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
@ -310,6 +311,7 @@ impl std::error::Error for Error {}
type Result<T> = std::result::Result<T, Error>;
enum TaggedControlSocket {
Fs(FsMappingResponseSocket),
Vm(VmControlResponseSocket),
VmMemory(VmMemoryControlResponseSocket),
VmIrq(VmIrqResponseSocket),
@ -320,6 +322,7 @@ impl AsRef<UnixSeqpacket> for TaggedControlSocket {
fn as_ref(&self) -> &UnixSeqpacket {
use self::TaggedControlSocket::*;
match &self {
Fs(ref socket) => socket.as_ref(),
Vm(ref socket) => socket.as_ref(),
VmMemory(ref socket) => socket.as_ref(),
VmIrq(ref socket) => socket.as_ref(),
@ -1014,6 +1017,7 @@ fn create_fs_device(
src: &Path,
tag: &str,
fs_cfg: virtio::fs::passthrough::Config,
device_socket: FsMappingRequestSocket,
) -> DeviceResult {
let max_open_files = get_max_open_files()?;
let j = if cfg.sandbox {
@ -1038,7 +1042,8 @@ fn create_fs_device(
let features = virtio::base_features(cfg.protected_vm);
// TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic
// when num_queues > 1.
let dev = virtio::fs::Fs::new(features, tag, 1, fs_cfg).map_err(Error::FsDeviceNew)?;
let dev =
virtio::fs::Fs::new(features, tag, 1, fs_cfg, device_socket).map_err(Error::FsDeviceNew)?;
Ok(VirtioDeviceStub {
dev: Box::new(dev),
@ -1227,6 +1232,7 @@ fn create_virtio_devices(
disk_device_sockets: &mut Vec<DiskControlResponseSocket>,
pmem_device_sockets: &mut Vec<VmMsyncRequestSocket>,
map_request: Arc<Mutex<Option<ExternalMapping>>>,
fs_device_sockets: &mut Vec<FsMappingRequestSocket>,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
let mut devs = Vec::new();
@ -1415,7 +1421,18 @@ fn create_virtio_devices(
} = shared_dir;
let dev = match kind {
SharedDirKind::FS => create_fs_device(cfg, uid_map, gid_map, src, tag, fs_cfg.clone())?,
SharedDirKind::FS => {
let device_socket = fs_device_sockets.remove(0);
create_fs_device(
cfg,
uid_map,
gid_map,
src,
tag,
fs_cfg.clone(),
device_socket,
)?
}
SharedDirKind::P9 => create_9p_device(cfg, uid_map, gid_map, src, tag, p9_cfg.clone())?,
};
devs.push(dev);
@ -1436,6 +1453,7 @@ fn create_devices(
balloon_device_socket: BalloonControlResponseSocket,
disk_device_sockets: &mut Vec<DiskControlResponseSocket>,
pmem_device_sockets: &mut Vec<VmMsyncRequestSocket>,
fs_device_sockets: &mut Vec<FsMappingRequestSocket>,
usb_provider: HostBackendDeviceProvider,
map_request: Arc<Mutex<Option<ExternalMapping>>>,
) -> DeviceResult<Vec<(Box<dyn PciDevice>, Option<Minijail>)>> {
@ -1451,6 +1469,7 @@ fn create_devices(
disk_device_sockets,
pmem_device_sockets,
map_request,
fs_device_sockets,
)?;
let mut pci_devices = Vec::new();
@ -2264,6 +2283,19 @@ where
let map_request: Arc<Mutex<Option<ExternalMapping>>> = Arc::new(Mutex::new(None));
let fs_count = cfg
.shared_dirs
.iter()
.filter(|sd| sd.kind == SharedDirKind::FS)
.count();
let mut fs_device_sockets = Vec::with_capacity(fs_count);
for _ in 0..fs_count {
let (fs_host_socket, fs_device_socket) =
msg_socket::pair::<VmResponse, FsMappingRequest>().map_err(Error::CreateSocket)?;
control_sockets.push(TaggedControlSocket::Fs(fs_host_socket));
fs_device_sockets.push(fs_device_socket);
}
let linux: RunnableLinuxVm<_, Vcpu, _> = Arch::build_vm(
components,
&cfg.serial_parameters,
@ -2282,6 +2314,7 @@ where
balloon_device_socket,
&mut disk_device_sockets,
&mut pmem_device_sockets,
&mut fs_device_sockets,
usb_provider,
Arc::clone(&map_request),
)
@ -2747,6 +2780,22 @@ fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static, I: IrqChipArch + '
}
}
},
TaggedControlSocket::Fs(socket) => match socket.recv() {
Ok(request) => {
let response =
request.execute(&mut linux.vm, &mut linux.resources);
if let Err(e) = socket.send(&response) {
error!("failed to send VmResponse: {}", e);
}
}
Err(e) => {
if let MsgError::BadRecvSize { actual: 0, .. } = e {
vm_control_indices_to_remove.push(index);
} else {
error!("failed to recv VmResponse: {}", e);
}
}
},
}
}
}

View file

@ -21,6 +21,8 @@ use crate::{errno, pagesize};
#[derive(Debug)]
pub enum Error {
/// `add_fd_mapping` is not supported.
AddFdMappingIsUnsupported,
/// Requested memory out of range.
InvalidAddress,
/// Invalid argument provided when building mmap.
@ -35,6 +37,8 @@ pub enum Error {
SystemCallFailed(errno::Error),
/// Writing to memory failed
ReadToMemory(io::Error),
/// `remove_mapping` is not supported
RemoveMappingIsUnsupported,
/// Reading from memory failed
WriteFromMemory(io::Error),
}
@ -45,6 +49,7 @@ impl Display for Error {
use self::Error::*;
match self {
AddFdMappingIsUnsupported => write!(f, "`add_fd_mapping` is unsupported"),
InvalidAddress => write!(f, "requested memory out of range"),
InvalidArgument => write!(f, "invalid argument provided when creating mapping"),
InvalidOffset => write!(f, "requested offset is out of range of off_t"),
@ -56,6 +61,7 @@ impl Display for Error {
),
SystemCallFailed(e) => write!(f, "mmap system call failed: {}", e),
ReadToMemory(e) => write!(f, "failed to read from file to memory: {}", e),
RemoveMappingIsUnsupported => write!(f, "`remove_mapping` is unsupported"),
WriteFromMemory(e) => write!(f, "failed to write from memory to file: {}", e),
}
}
@ -134,6 +140,32 @@ pub unsafe trait MappedRegion: Send + Sync {
/// Returns the size of the memory region in bytes.
fn size(&self) -> usize;
/// Maps `size` bytes starting at `fd_offset` bytes from within the given `fd`
/// at `offset` bytes from the start of the region with `prot` protections.
/// `offset` must be page aligned.
///
/// The default implementation does not map anything and returns
/// `Error::AddFdMappingIsUnsupported`.
///
/// # Arguments
/// * `offset` - Page aligned offset into the region in bytes.
/// * `size` - Size of memory region in bytes.
/// * `fd` - File descriptor to mmap from.
/// * `fd_offset` - Offset in bytes from the beginning of `fd` to start the mmap.
/// * `prot` - Protection (e.g. readable/writable) of the memory region.
fn add_fd_mapping(
&mut self,
_offset: usize,
_size: usize,
_fd: &dyn AsRawFd,
_fd_offset: u64,
_prot: Protection,
) -> Result<()> {
Err(Error::AddFdMappingIsUnsupported)
}
/// Remove `size`-byte mapping starting at `offset`.
///
/// # Errors
/// This default implementation ignores its arguments, removes nothing, and always
/// returns `Error::RemoveMappingIsUnsupported`. Region types that support removing
/// mappings are expected to override it.
fn remove_mapping(&mut self, _offset: usize, _size: usize) -> Result<()> {
Err(Error::RemoveMappingIsUnsupported)
}
}
impl dyn MappedRegion {
@ -844,6 +876,21 @@ unsafe impl MappedRegion for MemoryMappingArena {
// Reports the value held in the arena's `size` field.
fn size(&self) -> usize {
self.size
}
// Overrides the trait's "unsupported" default: forwards every argument unchanged to the
// arena's own `add_fd_offset_protection` and returns its result.
fn add_fd_mapping(
&mut self,
offset: usize,
size: usize,
fd: &dyn AsRawFd,
fd_offset: u64,
prot: Protection,
) -> Result<()> {
self.add_fd_offset_protection(offset, size, fd, fd_offset, prot)
}
// Overrides the trait's "unsupported" default: forwards `offset` and `size` unchanged to
// the arena's own `remove` and returns its result.
fn remove_mapping(&mut self, offset: usize, size: usize) -> Result<()> {
self.remove(offset, size)
}
}
impl From<MemoryMapping> for MemoryMappingArena {

View file

@ -17,6 +17,7 @@ use std::fmt::{self, Display};
use std::fs::File;
use std::io::{Seek, SeekFrom};
use std::mem::ManuallyDrop;
use std::os::raw::c_int;
use std::result::Result as StdResult;
use std::str::FromStr;
use std::sync::Arc;
@ -24,9 +25,9 @@ use std::sync::Arc;
use libc::{EINVAL, EIO, ENODEV};
use base::{
error, AsRawDescriptor, Error as SysError, Event, ExternalMapping, FromRawDescriptor,
IntoRawDescriptor, MappedRegion, MemoryMappingBuilder, MmapError, RawDescriptor, Result,
SafeDescriptor,
error, AsRawDescriptor, Error as SysError, Event, ExternalMapping, Fd, FromRawDescriptor,
IntoRawDescriptor, MappedRegion, MemoryMappingArena, MemoryMappingBuilder, MmapError,
Protection, RawDescriptor, Result, SafeDescriptor,
};
use hypervisor::{IrqRoute, IrqSource, Vm};
use msg_socket::{MsgError, MsgOnSocket, MsgReceiver, MsgResult, MsgSender, MsgSocket};
@ -766,6 +767,112 @@ pub struct BatControl {
pub control_socket: BatControlRequestSocket,
}
/// Requests for creating a shared memory region and for adding or removing file mappings
/// inside it.
///
/// Each request is handled by `FsMappingRequest::execute`, which answers with a
/// `VmResponse`.
#[derive(MsgOnSocket, Debug)]
pub enum FsMappingRequest {
/// Create an anonymous memory mapping that spans the entire region described by `Alloc`.
///
/// `execute` only services `Alloc::PciBar` values; any other `Alloc` is answered with
/// an `EINVAL` error response.
AllocateSharedMemoryRegion(Alloc),
/// Create a memory mapping.
CreateMemoryMapping {
/// The slot for a MemoryMappingArena, previously returned by a response to an
/// `AllocateSharedMemoryRegion` request.
slot: u32,
/// The file descriptor that should be mapped.
fd: MaybeOwnedDescriptor,
/// The size of the mapping.
size: usize,
/// The offset into the file from where the mapping should start.
file_offset: u64,
/// The memory protection to be used for the mapping. Protections other than readable and
/// writable will be silently dropped.
prot: u32,
/// The offset into the shared memory region where the mapping should be placed.
mem_offset: usize,
},
/// Remove a memory mapping.
RemoveMemoryMapping {
/// The slot for a MemoryMappingArena.
slot: u32,
/// The offset into the shared memory region at which the mapping to remove starts.
offset: usize,
/// The size of the mapping.
size: usize,
},
}
impl FsMappingRequest {
pub fn execute(&self, vm: &mut dyn Vm, allocator: &mut SystemAllocator) -> VmResponse {
use self::FsMappingRequest::*;
match *self {
AllocateSharedMemoryRegion(Alloc::PciBar {
bus,
dev,
func,
bar,
}) => {
match allocator
.mmio_allocator(MmioType::High)
.get(&Alloc::PciBar {
bus,
dev,
func,
bar,
}) {
Some((addr, length, _)) => {
let arena = match MemoryMappingArena::new(*length as usize) {
Ok(a) => a,
Err(MmapError::SystemCallFailed(e)) => return VmResponse::Err(e),
_ => return VmResponse::Err(SysError::new(EINVAL)),
};
match vm.add_memory_region(
GuestAddress(*addr),
Box::new(arena),
false,
false,
) {
Ok(slot) => VmResponse::RegisterMemory {
pfn: addr >> 12,
slot,
},
Err(e) => VmResponse::Err(e),
}
}
None => VmResponse::Err(SysError::new(EINVAL)),
}
}
CreateMemoryMapping {
slot,
ref fd,
size,
file_offset,
prot,
mem_offset,
} => {
let raw_fd: Fd = Fd(fd.as_raw_descriptor());
match vm.add_fd_mapping(
slot,
mem_offset,
size,
&raw_fd,
file_offset,
Protection::from(prot as c_int & (libc::PROT_READ | libc::PROT_WRITE)),
) {
Ok(()) => VmResponse::Ok,
Err(e) => VmResponse::Err(e),
}
}
RemoveMemoryMapping { slot, offset, size } => {
match vm.remove_mapping(slot, offset, size) {
Ok(()) => VmResponse::Ok,
Err(e) => VmResponse::Err(e),
}
}
_ => VmResponse::Err(SysError::new(EINVAL)),
}
}
}
pub type BalloonControlRequestSocket = MsgSocket<BalloonControlCommand, BalloonControlResult>;
pub type BalloonControlResponseSocket = MsgSocket<BalloonControlResult, BalloonControlCommand>;
@ -775,6 +882,9 @@ pub type BatControlResponseSocket = MsgSocket<BatControlResult, BatControlComman
pub type DiskControlRequestSocket = MsgSocket<DiskControlCommand, DiskControlResult>;
pub type DiskControlResponseSocket = MsgSocket<DiskControlResult, DiskControlCommand>;
// Socket pair for `FsMappingRequest` traffic; the two aliases mirror each other's type
// parameters, pairing each `FsMappingRequest` with a `VmResponse`.
// NOTE(review): presumably the `Request` alias is the end that sends `FsMappingRequest`s and
// the `Response` alias is the end that answers them — confirm against `MsgSocket`.
pub type FsMappingRequestSocket = MsgSocket<FsMappingRequest, VmResponse>;
pub type FsMappingResponseSocket = MsgSocket<VmResponse, FsMappingRequest>;
pub type UsbControlSocket = MsgSocket<UsbControlCommand, UsbControlResult>;
pub type VmMemoryControlRequestSocket = MsgSocket<VmMemoryRequest, VmMemoryResponse>;